b8165088e137e6f70cf548e3a4a3f916e7cff93e
[kopensolaris-gnu/glibc.git] / iconvdata / utf-16.c
1 /* Conversion module for UTF-16.
2    Copyright (C) 1999, 2000-2002 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
5
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, write to the Free
18    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19    02111-1307 USA.  */
20
21 #include <byteswap.h>
22 #include <dlfcn.h>
23 #include <gconv.h>
24 #include <stddef.h>
25 #include <stdint.h>
26 #include <stdlib.h>
27 #include <string.h>
28
29 /* This is the Byte Order Mark character (BOM).  */
30 #define BOM     0xfeff
31 /* And in the other byte order.  */
32 #define BOM_OE  0xfffe
33
34
/* Parameters and hooks for the `gconv' function body.  The function itself
   is not written out in this file; the macros below are picked up by the
   <iconv/loop.c> and <iconv/skeleton.c> includes further down.  */
#define FROM_LOOP               from_utf16_loop
#define TO_LOOP                 to_utf16_loop
/* This file supplies gconv_init and gconv_end by hand (presumably the
   reason both generators are switched off here).  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
/* A UTF-16 character occupies 2 or 4 bytes; the internal form always 4.  */
#define MIN_NEEDED_FROM         2
#define MAX_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          (dir == from_utf16)
/* Code run before the conversion loops.  It declares `dir', `var' and
   `swap' from the per-step data.  Byte Order Mark handling happens only
   for the plain UTF_16 variant and only while the invocation counter is
   still zero:
     - decoding: a leading BOM is skipped; if it reads as BOM_OE the step
       data is additionally marked as needing byte swapping.  Anything
       else is left in the input untouched.
     - encoding: a BOM is written to the output first, unless the step is
       flagged for internal use.
   The macro may `return' early when there is not enough input to look
   at the BOM or not enough room to write one.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct utf16_data *) step->__data)->dir;             \
  enum variant var = ((struct utf16_data *) step->__data)->var;               \
  int swap;                                                                   \
  if (var == UTF_16 && data->__invocation_counter == 0)                       \
    {                                                                         \
      if (FROM_DIRECTION)                                                     \
        {                                                                     \
          /* The byte order of the input has to be detected; that needs      \
             one complete 16-bit unit.  */                                    \
          if (inptr + 2 > inend)                                              \
            return (inptr == inend                                            \
                    ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);        \
                                                                              \
          if (get16u (inptr) == BOM)                                          \
            /* Mark read as-is: drop it, no swapping required.  */            \
            *inptrp = inptr += 2;                                             \
          else if (get16u (inptr) == BOM_OE)                                  \
            {                                                                 \
              /* Mark read byte-reversed: drop it, swap from now on.  */      \
              ((struct utf16_data *) step->__data)->swap = 1;                 \
              *inptrp = inptr += 2;                                           \
            }                                                                 \
        }                                                                     \
      else if (!data->__internal_use)                                         \
        {                                                                     \
          /* Start the output with the Byte Order Mark.  */                   \
          if (__builtin_expect (outbuf + 2 > outend, 0))                      \
            return __GCONV_FULL_OUTPUT;                                       \
                                                                              \
          put16u (outbuf, BOM);                                               \
          outbuf += 2;                                                        \
        }                                                                     \
    }                                                                         \
  swap = ((struct utf16_data *) step->__data)->swap;
/* Extra arguments handed to the conversion loops; they must line up with
   the EXTRA_LOOP_DECLS definitions further down.  */
#define EXTRA_LOOP_ARGS         , var, swap
79
80
/* Which way a conversion step runs relative to UTF-16.  */
enum direction
{
  illegal_dir = 0,      /* Neither charset name was recognized.  */
  to_utf16 = 1,         /* Internal form -> UTF-16.  */
  from_utf16 = 2        /* UTF-16 -> internal form.  */
};

/* Flavor of UTF-16 selected by the charset name.  */
enum variant
{
  illegal_var = 0,      /* Nothing selected.  */
  UTF_16 = 1,           /* "UTF-16": byte order settled through the BOM.  */
  UTF_16LE = 2,         /* "UTF-16LE".  */
  UTF_16BE = 3          /* "UTF-16BE".  */
};

/* Per-step private data, allocated in gconv_init and released in
   gconv_end.  */
struct utf16_data
{
  /* Direction of this conversion step.  */
  enum direction dir;
  /* UTF-16 flavor of this conversion step.  */
  enum variant var;
  /* Nonzero if every 16-bit unit has to be byte-swapped.  */
  int swap;
};
103
104
105 extern int gconv_init (struct __gconv_step *step);
106 int
107 gconv_init (struct __gconv_step *step)
108 {
109   /* Determine which direction.  */
110   struct utf16_data *new_data;
111   enum direction dir = illegal_dir;
112   enum variant var = illegal_var;
113   int result;
114
115   if (__strcasecmp (step->__from_name, "UTF-16//") == 0)
116     {
117       dir = from_utf16;
118       var = UTF_16;
119     }
120   else if (__strcasecmp (step->__to_name, "UTF-16//") == 0)
121     {
122       dir = to_utf16;
123       var = UTF_16;
124     }
125   else if (__strcasecmp (step->__from_name, "UTF-16BE//") == 0)
126     {
127       dir = from_utf16;
128       var = UTF_16BE;
129     }
130   else if (__strcasecmp (step->__to_name, "UTF-16BE//") == 0)
131     {
132       dir = to_utf16;
133       var = UTF_16BE;
134     }
135   else if (__strcasecmp (step->__from_name, "UTF-16LE//") == 0)
136     {
137       dir = from_utf16;
138       var = UTF_16LE;
139     }
140   else if (__strcasecmp (step->__to_name, "UTF-16LE//") == 0)
141     {
142       dir = to_utf16;
143       var = UTF_16LE;
144     }
145
146   result = __GCONV_NOCONV;
147   if (__builtin_expect (dir, to_utf16) != illegal_dir)
148     {
149       new_data = (struct utf16_data *) malloc (sizeof (struct utf16_data));
150
151       result = __GCONV_NOMEM;
152       if (new_data != NULL)
153         {
154           new_data->dir = dir;
155           new_data->var = var;
156           new_data->swap = ((var == UTF_16LE && BYTE_ORDER == BIG_ENDIAN)
157                             || (var == UTF_16BE
158                                 && BYTE_ORDER == LITTLE_ENDIAN));
159           step->__data = new_data;
160
161           if (dir == from_utf16)
162             {
163               step->__min_needed_from = MIN_NEEDED_FROM;
164               step->__max_needed_from = MAX_NEEDED_FROM;
165               step->__min_needed_to = MIN_NEEDED_TO;
166               step->__max_needed_to = MIN_NEEDED_TO;
167             }
168           else
169             {
170               step->__min_needed_from = MIN_NEEDED_TO;
171               step->__max_needed_from = MIN_NEEDED_TO;
172               step->__min_needed_to = MIN_NEEDED_FROM;
173               step->__max_needed_to = MAX_NEEDED_FROM;
174             }
175
176           step->__stateful = 0;
177
178           result = __GCONV_OK;
179         }
180     }
181
182   return result;
183 }
184
185
186 extern void gconv_end (struct __gconv_step *data);
187 void
188 gconv_end (struct __gconv_step *data)
189 {
190   free (data->__data);
191 }
192
193
194 /* Convert from the internal (UCS4-like) format to UTF-16.  */
195 #define MIN_NEEDED_INPUT        MIN_NEEDED_TO
196 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
197 #define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
198 #define LOOPFCT                 TO_LOOP
199 #define BODY \
200   {                                                                           \
201     uint32_t c = get32 (inptr);                                               \
202                                                                               \
203     if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0))                      \
204       {                                                                       \
205         /* Surrogate characters in UCS-4 input are not valid.                 \
206            We must catch this.  If we let surrogates pass through,            \
207            attackers could make a security hole exploit by                    \
208            synthesizing any desired plane 1-16 character.  */                 \
209         result = __GCONV_ILLEGAL_INPUT;                                       \
210         if (! ignore_errors_p ())                                             \
211           break;                                                              \
212         inptr += 4;                                                           \
213         ++*irreversible;                                                      \
214         continue;                                                             \
215       }                                                                       \
216                                                                               \
217     if (swap)                                                                 \
218       {                                                                       \
219         if (__builtin_expect (c >= 0x10000, 0))                               \
220           {                                                                   \
221             if (__builtin_expect (c >= 0x110000, 0))                          \
222               {                                                               \
223                 STANDARD_TO_LOOP_ERR_HANDLER (4);                             \
224               }                                                               \
225                                                                               \
226             /* Generate a surrogate character.  */                            \
227             if (__builtin_expect (outptr + 4 > outend, 0))                    \
228               {                                                               \
229                 /* Overflow in the output buffer.  */                         \
230                 result = __GCONV_FULL_OUTPUT;                                 \
231                 break;                                                        \
232               }                                                               \
233                                                                               \
234             put16 (outptr, bswap_16 (0xd7c0 + (c >> 10)));                    \
235             outptr += 2;                                                      \
236             put16 (outptr, bswap_16 (0xdc00 + (c & 0x3ff)));                  \
237           }                                                                   \
238         else                                                                  \
239           put16 (outptr, bswap_16 (c));                                       \
240       }                                                                       \
241     else                                                                      \
242       {                                                                       \
243         if (__builtin_expect (c >= 0x10000, 0))                               \
244           {                                                                   \
245             if (__builtin_expect (c >= 0x110000, 0))                          \
246               {                                                               \
247                 STANDARD_TO_LOOP_ERR_HANDLER (4);                             \
248               }                                                               \
249                                                                               \
250             /* Generate a surrogate character.  */                            \
251             if (__builtin_expect (outptr + 4 > outend, 0))                    \
252               {                                                               \
253                 /* Overflow in the output buffer.  */                         \
254                 result = __GCONV_FULL_OUTPUT;                                 \
255                 break;                                                        \
256               }                                                               \
257                                                                               \
258             put16 (outptr, 0xd7c0 + (c >> 10));                               \
259             outptr += 2;                                                      \
260             put16 (outptr, 0xdc00 + (c & 0x3ff));                             \
261           }                                                                   \
262         else                                                                  \
263           put16 (outptr, c);                                                  \
264       }                                                                       \
265     outptr += 2;                                                              \
266     inptr += 4;                                                               \
267   }
268 #define LOOP_NEED_FLAGS
269 #define EXTRA_LOOP_DECLS \
270         , enum variant var, int swap
271 #include <iconv/loop.c>
272
273
274 /* Convert from UTF-16 to the internal (UCS4-like) format.  */
275 #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
276 #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
277 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
278 #define LOOPFCT                 FROM_LOOP
279 #define BODY \
280   {                                                                           \
281     uint16_t u1 = get16 (inptr);                                              \
282                                                                               \
283     if (swap)                                                                 \
284       {                                                                       \
285         u1 = bswap_16 (u1);                                                   \
286                                                                               \
287         if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff)                 \
288           {                                                                   \
289             /* No surrogate.  */                                              \
290             put32 (outptr, u1);                                               \
291             inptr += 2;                                                       \
292           }                                                                   \
293         else                                                                  \
294           {                                                                   \
295             uint16_t u2;                                                      \
296                                                                               \
297             /* It's a surrogate character.  At least the first word says      \
298                it is.  */                                                     \
299             if (__builtin_expect (inptr + 4 > inend, 0))                      \
300               {                                                               \
301                 /* We don't have enough input for another complete input      \
302                    character.  */                                             \
303                 result = __GCONV_INCOMPLETE_INPUT;                            \
304                 break;                                                        \
305               }                                                               \
306                                                                               \
307             inptr += 2;                                                       \
308             u2 = bswap_16 (get16 (inptr));                                    \
309             if (__builtin_expect (u2 < 0xdc00, 0)                             \
310                 || __builtin_expect (u2 == 0xdfff, 0))                        \
311               {                                                               \
312                 /* This is no valid second word for a surrogate.  */          \
313                 inptr -= 2;                                                   \
314                 STANDARD_FROM_LOOP_ERR_HANDLER (2);                           \
315               }                                                               \
316                                                                               \
317             put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00));            \
318             inptr += 2;                                                       \
319           }                                                                   \
320       }                                                                       \
321     else                                                                      \
322       {                                                                       \
323         if (__builtin_expect (u1 < 0xd800, 1) || u1 > 0xdfff)                 \
324           {                                                                   \
325             /* No surrogate.  */                                              \
326             put32 (outptr, u1);                                               \
327             inptr += 2;                                                       \
328           }                                                                   \
329         else                                                                  \
330           {                                                                   \
331             uint16_t u2;                                                      \
332                                                                               \
333             /* It's a surrogate character.  At least the first word says      \
334                it is.  */                                                     \
335             if (__builtin_expect (inptr + 4 > inend, 0))                      \
336               {                                                               \
337                 /* We don't have enough input for another complete input      \
338                    character.  */                                             \
339                 result = __GCONV_INCOMPLETE_INPUT;                            \
340                 break;                                                        \
341               }                                                               \
342                                                                               \
343             inptr += 2;                                                       \
344             u2 = get16 (inptr);                                               \
345             if (__builtin_expect (u2 < 0xdc00, 0)                             \
346                 || __builtin_expect (u2 >= 0xdfff, 0))                        \
347               {                                                               \
348                 /* This is no valid second word for a surrogate.  */          \
349                 inptr -= 2;                                                   \
350                 STANDARD_FROM_LOOP_ERR_HANDLER (2);                           \
351               }                                                               \
352                                                                               \
353             put32 (outptr, ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00));            \
354             inptr += 2;                                                       \
355           }                                                                   \
356       }                                                                       \
357     outptr += 4;                                                              \
358   }
359 #define LOOP_NEED_FLAGS
360 #define EXTRA_LOOP_DECLS \
361         , enum variant var, int swap
362 #include <iconv/loop.c>
363
364
365 /* Now define the toplevel functions.  */
366 #include <iconv/skeleton.c>