(PREPARE_LOOP): Return __GCONV_EMPTY_INPUT only if input is really empty.
[kopensolaris-gnu/glibc.git] / iconvdata / utf-16.c
index 8348ce2..b816508 100644 (file)
@@ -1,24 +1,25 @@
 /* Conversion module for UTF-16.
-   Copyright (C) 1999, 2000 Free Software Foundation, Inc.
+   Copyright (C) 1999, 2000-2002 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
 
    The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Library General Public License as
-   published by the Free Software Foundation; either version 2 of the
-   License, or (at your option) any later version.
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
 
    The GNU C Library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Library General Public License for more details.
+   Lesser General Public License for more details.
 
-   You should have received a copy of the GNU Library General Public
-   License along with the GNU C Library; see the file COPYING.LIB.  If not,
-   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-   Boston, MA 02111-1307, USA.  */
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, write to the Free
+   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+   02111-1307 USA.  */
 
 #include <byteswap.h>
+#include <dlfcn.h>
 #include <gconv.h>
 #include <stddef.h>
 #include <stdint.h>
@@ -27,6 +28,8 @@
 
 /* This is the Byte Order Mark character (BOM).  */
 #define BOM    0xfeff
+/* And in the other byte order.  */
+#define BOM_OE 0xfffe
 
 
 /* Definitions used in the body of the `gconv' function.  */
 #define PREPARE_LOOP \
   enum direction dir = ((struct utf16_data *) step->__data)->dir;            \
   enum variant var = ((struct utf16_data *) step->__data)->var;                      \
-  if (!FROM_DIRECTION && var == UTF_16 && !data->__internal_use                      \
-      && data->__invocation_counter == 0)                                    \
+  int swap;                                                                  \
+  if (FROM_DIRECTION && var == UTF_16)                                       \
+    {                                                                        \
+      if (data->__invocation_counter == 0)                                   \
+       {                                                                     \
+         /* We have to find out which byte order the file is encoded in.  */ \
+         if (inptr + 2 > inend)                                              \
+           return (inptr == inend                                            \
+                   ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);        \
+                                                                             \
+         if (get16u (inptr) == BOM)                                          \
+           /* Simply ignore the BOM character.  */                           \
+           *inptrp = inptr += 2;                                             \
+         else if (get16u (inptr) == BOM_OE)                                  \
+           {                                                                 \
+             ((struct utf16_data *) step->__data)->swap = 1;                 \
+             *inptrp = inptr += 2;                                           \
+           }                                                                 \
+       }                                                                     \
+    }                                                                        \
+  else if (!FROM_DIRECTION && var == UTF_16 && !data->__internal_use         \
+          && data->__invocation_counter == 0)                                \
     {                                                                        \
       /* Emit the Byte Order Mark.  */                                       \
-      if (outbuf + 2 > outend)                                               \
+      if (__builtin_expect (outbuf + 2 > outend, 0))                         \
        return __GCONV_FULL_OUTPUT;                                           \
                                                                              \
-      *(uint16_t *) outbuf = BOM;                                            \
+      put16u (outbuf, BOM);                                                  \
       outbuf += 2;                                                           \
-    }
-#define EXTRA_LOOP_ARGS                , var, data
+    }                                                                        \
+  swap = ((struct utf16_data *) step->__data)->swap;
+#define EXTRA_LOOP_ARGS                , var, swap
 
 
 /* Direction of the transformation.  */
@@ -74,9 +98,11 @@ struct utf16_data
 {
   enum direction dir;
   enum variant var;
+  int swap;
 };
 
 
+extern int gconv_init (struct __gconv_step *step);
 int
 gconv_init (struct __gconv_step *step)
 {
@@ -86,39 +112,39 @@ gconv_init (struct __gconv_step *step)
   enum variant var = illegal_var;
   int result;
 
-  if (__strcasecmp (step->__from_name, "UTF-16") == 0)
+  if (__strcasecmp (step->__from_name, "UTF-16//") == 0)
     {
       dir = from_utf16;
       var = UTF_16;
     }
-  else if (__strcasecmp (step->__to_name, "UTF-16") == 0)
+  else if (__strcasecmp (step->__to_name, "UTF-16//") == 0)
     {
       dir = to_utf16;
       var = UTF_16;
     }
-  else if (__strcasecmp (step->__from_name, "UTF-16BE") == 0)
+  else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0)
     {
       dir = from_utf16;
       var = UTF_16BE;
     }
-  else if (__strcasecmp (step->__to_name, "UTF-16BE") == 0)
+  else if (__strcasecmp (step->__to_name, "UTF-16BE//") == 0)
     {
       dir = to_utf16;
       var = UTF_16BE;
     }
-  else if (__strcasecmp (step->__from_name, "UTF-16LE") == 0)
+  else if (__strcasecmp (step->__from_name, "UTF-16LE//") == 0)
     {
       dir = from_utf16;
       var = UTF_16LE;
     }
-  else if (__strcasecmp (step->__to_name, "UTF-16LE") == 0)
+  else if (__strcasecmp (step->__to_name, "UTF-16LE//") == 0)
     {
       dir = to_utf16;
       var = UTF_16LE;
     }
 
   result = __GCONV_NOCONV;
-  if (dir != illegal_dir)
+  if (__builtin_expect (dir, to_utf16) != illegal_dir)
     {
       new_data = (struct utf16_data *) malloc (sizeof (struct utf16_data));
 
@@ -127,12 +153,15 @@ gconv_init (struct __gconv_step *step)
        {
          new_data->dir = dir;
          new_data->var = var;
+         new_data->swap = ((var == UTF_16LE && BYTE_ORDER == BIG_ENDIAN)
+                           || (var == UTF_16BE
+                               && BYTE_ORDER == LITTLE_ENDIAN));
          step->__data = new_data;
 
-         if (var == from_utf16)
+         if (dir == from_utf16)
            {
              step->__min_needed_from = MIN_NEEDED_FROM;
-             step->__max_needed_from = MIN_NEEDED_FROM;
+             step->__max_needed_from = MAX_NEEDED_FROM;
              step->__min_needed_to = MIN_NEEDED_TO;
              step->__max_needed_to = MIN_NEEDED_TO;
            }
@@ -141,7 +170,7 @@ gconv_init (struct __gconv_step *step)
              step->__min_needed_from = MIN_NEEDED_TO;
              step->__max_needed_from = MIN_NEEDED_TO;
              step->__min_needed_to = MIN_NEEDED_FROM;
-             step->__max_needed_to = MIN_NEEDED_FROM;
+             step->__max_needed_to = MAX_NEEDED_FROM;
            }
 
          step->__stateful = 0;
@@ -154,6 +183,7 @@ gconv_init (struct __gconv_step *step)
 }
 
 
+extern void gconv_end (struct __gconv_step *data);
 void
 gconv_end (struct __gconv_step *data)
 {
@@ -168,64 +198,76 @@ gconv_end (struct __gconv_step *data)
 #define LOOPFCT                        TO_LOOP
 #define BODY \
   {                                                                          \
-    uint32_t c = *((uint32_t *) inptr);                                              \
+    uint32_t c = get32 (inptr);                                                      \
                                                                              \
-    if ((__BYTE_ORDER == __LITTLE_ENDIAN && var == UTF_16BE)                 \
-        || (__BYTE_ORDER == __BIG_ENDIAN && var == UTF_16LE))                \
+    if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0))                     \
       {                                                                              \
-       if (c >= 0x10000)                                                     \
+       /* Surrogate characters in UCS-4 input are not valid.                 \
+          We must catch this.  If we let surrogates pass through,            \
+          attackers could make a security hole exploit by                    \
+          synthesizing any desired plane 1-16 character.  */                 \
+       result = __GCONV_ILLEGAL_INPUT;                                       \
+       if (! ignore_errors_p ())                                             \
+         break;                                                              \
+       inptr += 4;                                                           \
+       ++*irreversible;                                                      \
+       continue;                                                             \
+      }                                                                              \
+                                                                             \
+    if (swap)                                                                \
+      {                                                                              \
+       if (__builtin_expect (c >= 0x10000, 0))                               \
          {                                                                   \
-           if (c >= 0x110000)                                                \
+           if (__builtin_expect (c >= 0x110000, 0))                          \
              {                                                               \
-               result = __GCONV_ILLEGAL_INPUT;                               \
-               break;                                                        \
+               STANDARD_TO_LOOP_ERR_HANDLER (4);                             \
              }                                                               \
                                                                              \
            /* Generate a surrogate character.  */                            \
-           if (NEED_LENGTH_TEST && outptr + 4 > outend)                      \
+           if (__builtin_expect (outptr + 4 > outend, 0))                    \
              {                                                               \
                /* Overflow in the output buffer.  */                         \
                result = __GCONV_FULL_OUTPUT;                                 \
                break;                                                        \
              }                                                               \
                                                                              \
-           *((uint16_t *) outptr) = bswap_16 (0xd7c0 + (c >> 10));           \
+           put16 (outptr, bswap_16 (0xd7c0 + (c >> 10)));                    \
            outptr += 2;                                                      \
-           *((uint16_t *) outptr) = bswap_16 (0xdc00 + (c & 0x3ff));         \
+           put16 (outptr, bswap_16 (0xdc00 + (c & 0x3ff)));                  \
          }                                                                   \
        else                                                                  \
-         *((uint16_t *) outptr) = bswap_16 (c);                              \
+         put16 (outptr, bswap_16 (c));                                       \
       }                                                                              \
     else                                                                     \
       {                                                                              \
-       if (c >= 0x10000)                                                     \
+       if (__builtin_expect (c >= 0x10000, 0))                               \
          {                                                                   \
-           if (c >= 0x110000)                                                \
+           if (__builtin_expect (c >= 0x110000, 0))                          \
              {                                                               \
-               result = __GCONV_ILLEGAL_INPUT;                               \
-               break;                                                        \
+               STANDARD_TO_LOOP_ERR_HANDLER (4);                             \
              }                                                               \
                                                                              \
            /* Generate a surrogate character.  */                            \
-           if (NEED_LENGTH_TEST && outptr + 4 > outend)                      \
+           if (__builtin_expect (outptr + 4 > outend, 0))                    \
              {                                                               \
                /* Overflow in the output buffer.  */                         \
                result = __GCONV_FULL_OUTPUT;                                 \
                break;                                                        \
              }                                                               \
                                                                              \
-           *((uint16_t *) outptr) = 0xd7c0 + (c >> 10);                      \
+           put16 (outptr, 0xd7c0 + (c >> 10));                               \
            outptr += 2;                                                      \
-           *((uint16_t *) outptr) = 0xdc00 + (c & 0x3ff);                    \
+           put16 (outptr, 0xdc00 + (c & 0x3ff));                             \
          }                                                                   \
        else                                                                  \
-         *((uint16_t *) outptr) = c;                                         \
+         put16 (outptr, c);                                                  \
       }                                                                              \
     outptr += 2;                                                             \
     inptr += 4;                                                                      \
   }
+#define LOOP_NEED_FLAGS
 #define EXTRA_LOOP_DECLS \
-       , enum variant var, struct __gconv_step_data *step_data
+       , enum variant var, int swap
 #include <iconv/loop.c>
 
 
@@ -236,17 +278,16 @@ gconv_end (struct __gconv_step *data)
 #define LOOPFCT                        FROM_LOOP
 #define BODY \
   {                                                                          \
-    uint16_t u1 = *(uint16_t *) inptr;                                       \
+    uint16_t u1 = get16 (inptr);                                             \
                                                                              \
-    if ((__BYTE_ORDER == __LITTLE_ENDIAN && var == UTF_16BE)                 \
-        || (__BYTE_ORDER == __BIG_ENDIAN && var == UTF_16LE))                \
+    if (swap)                                                                \
       {                                                                              \
        u1 = bswap_16 (u1);                                                   \
                                                                              \
-       if (u1 < 0xd800 || u1 > 0xdfff)                                       \
+       if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff)                 \
          {                                                                   \
            /* No surrogate.  */                                              \
-           *((uint32_t *) outptr) = u1;                                      \
+           put32 (outptr, u1);                                               \
            inptr += 2;                                                       \
          }                                                                   \
        else                                                                  \
@@ -255,7 +296,7 @@ gconv_end (struct __gconv_step *data)
                                                                              \
            /* It's a surrogate character.  At least the first word says      \
               it is.  */                                                     \
-           if (NEED_LENGTH_TEST && inptr + 4 > inend)                        \
+           if (__builtin_expect (inptr + 4 > inend, 0))                      \
              {                                                               \
                /* We don't have enough input for another complete input      \
                   character.  */                                             \
@@ -263,34 +304,26 @@ gconv_end (struct __gconv_step *data)
                break;                                                        \
              }                                                               \
                                                                              \
-           u2 = bswap_16 (((uint16_t *) inptr)[1]);                          \
-           if (u2 < 0xdc00 || u2 >= 0xdfff)                                  \
+           inptr += 2;                                                       \
+           u2 = bswap_16 (get16 (inptr));                                    \
+           if (__builtin_expect (u2 < 0xdc00, 0)                             \
+               || __builtin_expect (u2 > 0xdfff, 0))                         \
              {                                                               \
                /* This is no valid second word for a surrogate.  */          \
-               result = __GCONV_ILLEGAL_INPUT;                               \
-               break;                                                        \
+               inptr -= 2;                                                   \
+               STANDARD_FROM_LOOP_ERR_HANDLER (2);                           \
              }                                                               \
                                                                              \
-           *((uint32_t *) outptr) = ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00);   \
-           inptr += 4;                                                       \
+           put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00));            \
+           inptr += 2;                                                       \
          }                                                                   \
       }                                                                              \
     else                                                                     \
       {                                                                              \
-       if (u1 == BOM && var == UTF_16 && !step_data->__internal_use          \
-           && step_data->__invocation_counter == 0 && inptr == *inptrp)      \
-         {                                                                   \
-           /* This is the first word in the file and it is the BOM and       \
-              we are converting a file without specified byte order.         \
-              Simply sack the BOM.  */                                       \
-           inptr += 2;                                                       \
-           continue;                                                         \
-         }                                                                   \
-                                                                             \
-       if (u1 < 0xd800 || u1 > 0xdfff)                                       \
+       if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff)                 \
          {                                                                   \
            /* No surrogate.  */                                              \
-           *((uint32_t *) outptr) = u1;                                      \
+           put32 (outptr, u1);                                               \
            inptr += 2;                                                       \
          }                                                                   \
        else                                                                  \
@@ -299,7 +332,7 @@ gconv_end (struct __gconv_step *data)
                                                                              \
            /* It's a surrogate character.  At least the first word says      \
               it is.  */                                                     \
-           if (NEED_LENGTH_TEST && inptr + 4 > inend)                        \
+           if (__builtin_expect (inptr + 4 > inend, 0))                      \
              {                                                               \
                /* We don't have enough input for another complete input      \
                   character.  */                                             \
@@ -307,22 +340,25 @@ gconv_end (struct __gconv_step *data)
                break;                                                        \
              }                                                               \
                                                                              \
-           u2 = ((uint16_t *) inptr)[1];                                     \
-           if (u2 < 0xdc00 || u2 >= 0xdfff)                                  \
+           inptr += 2;                                                       \
+           u2 = get16 (inptr);                                               \
+           if (__builtin_expect (u2 < 0xdc00, 0)                             \
+               || __builtin_expect (u2 > 0xdfff, 0))                         \
              {                                                               \
                /* This is no valid second word for a surrogate.  */          \
-               result = __GCONV_ILLEGAL_INPUT;                               \
-               break;                                                        \
+               inptr -= 2;                                                   \
+               STANDARD_FROM_LOOP_ERR_HANDLER (2);                           \
              }                                                               \
                                                                              \
-           *((uint32_t *) outptr) = ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00);   \
-           inptr += 4;                                                       \
+           put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00));            \
+           inptr += 2;                                                       \
          }                                                                   \
       }                                                                              \
     outptr += 4;                                                             \
   }
+#define LOOP_NEED_FLAGS
 #define EXTRA_LOOP_DECLS \
-       , enum variant var, struct __gconv_step_data *step_data
+       , enum variant var, int swap
 #include <iconv/loop.c>