iconv module to implement conversion from and to UTF-16.
[kopensolaris-gnu/glibc.git] / iconvdata / utf-16.c
1 /* Conversion module for UTF-16.
2    Copyright (C) 1999, 2000 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
5
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Library General Public License as
8    published by the Free Software Foundation; either version 2 of the
9    License, or (at your option) any later version.
10
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Library General Public License for more details.
15
16    You should have received a copy of the GNU Library General Public
17    License along with the GNU C Library; see the file COPYING.LIB.  If not,
18    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19    Boston, MA 02111-1307, USA.  */
20
21 #include <byteswap.h>
22 #include <gconv.h>
23 #include <stddef.h>
24 #include <stdint.h>
25 #include <stdlib.h>
26 #include <string.h>
27
/* Value of the Byte Order Mark (BOM) as a 16-bit unit.  */
#define BOM                     0xfeff


/* Parameters for the body of the `gconv' function.  One UTF-16
   character takes 2 to 4 bytes, one internal character takes 4.  */
#define FROM_LOOP               from_utf16_loop
#define TO_LOOP                 to_utf16_loop
#define DEFINE_INIT             0
#define DEFINE_FINI             0
#define MIN_NEEDED_FROM         2
#define MAX_NEEDED_FROM         4
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          (dir == from_utf16)
/* Fetch the direction and variant stored by gconv_init.  When producing
   plain UTF-16 (no explicit byte order) on the very first, non-internal
   invocation, start the output with a BOM written in host byte order.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct utf16_data *) step->__data)->dir;            \
  enum variant var = ((struct utf16_data *) step->__data)->var;              \
  if (var == UTF_16 && !FROM_DIRECTION                                        \
      && data->__invocation_counter == 0 && !data->__internal_use)            \
    {                                                                         \
      /* The BOM needs two bytes of room in the output buffer.  */            \
      if (outend < outbuf + 2)                                                \
        return __GCONV_FULL_OUTPUT;                                           \
                                                                              \
      *(uint16_t *) outbuf = BOM;                                             \
      outbuf += 2;                                                            \
    }
#define EXTRA_LOOP_ARGS         , var, data
55
56
/* Which way a conversion step runs.  `illegal_dir' is the value
   gconv_init starts from before it has recognized a charset name.  */
enum direction
{
  illegal_dir = 0,
  to_utf16 = 1,
  from_utf16 = 2
};

/* Which flavour of UTF-16 is handled: without an explicit byte order
   (UTF_16), little endian (UTF_16LE) or big endian (UTF_16BE).
   `illegal_var' is the not-yet-determined starting value.  */
enum variant
{
  illegal_var = 0,
  UTF_16 = 1,
  UTF_16LE = 2,
  UTF_16BE = 3
};

/* Per-step private data, allocated by gconv_init and stored in the
   step's `__data' member.  */
struct utf16_data
{
  enum direction dir;   /* Direction of this step.  */
  enum variant var;     /* UTF-16 flavour of this step.  */
};
78
79
80 int
81 gconv_init (struct __gconv_step *step)
82 {
83   /* Determine which direction.  */
84   struct utf16_data *new_data;
85   enum direction dir = illegal_dir;
86   enum variant var = illegal_var;
87   int result;
88
89   if (__strcasecmp (step->__from_name, "UTF-16") == 0)
90     {
91       dir = from_utf16;
92       var = UTF_16;
93     }
94   else if (__strcasecmp (step->__to_name, "UTF-16") == 0)
95     {
96       dir = to_utf16;
97       var = UTF_16;
98     }
99   else if (__strcasecmp (step->__from_name, "UTF-16BE") == 0)
100     {
101       dir = from_utf16;
102       var = UTF_16BE;
103     }
104   else if (__strcasecmp (step->__to_name, "UTF-16BE") == 0)
105     {
106       dir = to_utf16;
107       var = UTF_16BE;
108     }
109   else if (__strcasecmp (step->__from_name, "UTF-16LE") == 0)
110     {
111       dir = from_utf16;
112       var = UTF_16LE;
113     }
114   else if (__strcasecmp (step->__to_name, "UTF-16LE") == 0)
115     {
116       dir = to_utf16;
117       var = UTF_16LE;
118     }
119
120   result = __GCONV_NOCONV;
121   if (dir != illegal_dir)
122     {
123       new_data = (struct utf16_data *) malloc (sizeof (struct utf16_data));
124
125       result = __GCONV_NOMEM;
126       if (new_data != NULL)
127         {
128           new_data->dir = dir;
129           new_data->var = var;
130           step->__data = new_data;
131
132           if (var == from_utf16)
133             {
134               step->__min_needed_from = MIN_NEEDED_FROM;
135               step->__max_needed_from = MIN_NEEDED_FROM;
136               step->__min_needed_to = MIN_NEEDED_TO;
137               step->__max_needed_to = MIN_NEEDED_TO;
138             }
139           else
140             {
141               step->__min_needed_from = MIN_NEEDED_TO;
142               step->__max_needed_from = MIN_NEEDED_TO;
143               step->__min_needed_to = MIN_NEEDED_FROM;
144               step->__max_needed_to = MIN_NEEDED_FROM;
145             }
146
147           step->__stateful = 0;
148
149           result = __GCONV_OK;
150         }
151     }
152
153   return result;
154 }
155
156
/* Release the private data that gconv_init attached to the step DATA.
   The pointer is cleared afterwards so that no dangling reference is
   left behind and a repeated call is harmless.  */
void
gconv_end (struct __gconv_step *data)
{
  free (data->__data);
  data->__data = NULL;
}
162
163
164 /* Convert from the internal (UCS4-like) format to UTF-16.  */
165 #define MIN_NEEDED_INPUT        MIN_NEEDED_TO
166 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
167 #define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
168 #define LOOPFCT                 TO_LOOP
169 #define BODY \
170   {                                                                           \
171     uint32_t c = *((uint32_t *) inptr);                                       \
172                                                                               \
173     if ((__BYTE_ORDER == __LITTLE_ENDIAN && var == UTF_16BE)                  \
174         || (__BYTE_ORDER == __BIG_ENDIAN && var == UTF_16LE))                 \
175       {                                                                       \
176         if (c >= 0x10000)                                                     \
177           {                                                                   \
178             if (c >= 0x110000)                                                \
179               {                                                               \
180                 result = __GCONV_ILLEGAL_INPUT;                               \
181                 break;                                                        \
182               }                                                               \
183                                                                               \
184             /* Generate a surrogate character.  */                            \
185             if (NEED_LENGTH_TEST && outptr + 4 > outend)                      \
186               {                                                               \
187                 /* Overflow in the output buffer.  */                         \
188                 result = __GCONV_FULL_OUTPUT;                                 \
189                 break;                                                        \
190               }                                                               \
191                                                                               \
192             *((uint16_t *) outptr) = bswap_16 (0xd7c0 + (c >> 10));           \
193             outptr += 2;                                                      \
194             *((uint16_t *) outptr) = bswap_16 (0xdc00 + (c & 0x3ff));         \
195           }                                                                   \
196         else                                                                  \
197           *((uint16_t *) outptr) = bswap_16 (c);                              \
198       }                                                                       \
199     else                                                                      \
200       {                                                                       \
201         if (c >= 0x10000)                                                     \
202           {                                                                   \
203             if (c >= 0x110000)                                                \
204               {                                                               \
205                 result = __GCONV_ILLEGAL_INPUT;                               \
206                 break;                                                        \
207               }                                                               \
208                                                                               \
209             /* Generate a surrogate character.  */                            \
210             if (NEED_LENGTH_TEST && outptr + 4 > outend)                      \
211               {                                                               \
212                 /* Overflow in the output buffer.  */                         \
213                 result = __GCONV_FULL_OUTPUT;                                 \
214                 break;                                                        \
215               }                                                               \
216                                                                               \
217             *((uint16_t *) outptr) = 0xd7c0 + (c >> 10);                      \
218             outptr += 2;                                                      \
219             *((uint16_t *) outptr) = 0xdc00 + (c & 0x3ff);                    \
220           }                                                                   \
221         else                                                                  \
222           *((uint16_t *) outptr) = c;                                         \
223       }                                                                       \
224     outptr += 2;                                                              \
225     inptr += 4;                                                               \
226   }
227 #define EXTRA_LOOP_DECLS \
228         , enum variant var, struct __gconv_step_data *step_data
229 #include <iconv/loop.c>
230
231
232 /* Convert from UTF-16 to the internal (UCS4-like) format.  */
233 #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
234 #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
235 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
236 #define LOOPFCT                 FROM_LOOP
237 #define BODY \
238   {                                                                           \
239     uint16_t u1 = *(uint16_t *) inptr;                                        \
240                                                                               \
241     if ((__BYTE_ORDER == __LITTLE_ENDIAN && var == UTF_16BE)                  \
242         || (__BYTE_ORDER == __BIG_ENDIAN && var == UTF_16LE))                 \
243       {                                                                       \
244         u1 = bswap_16 (u1);                                                   \
245                                                                               \
246         if (u1 < 0xd800 || u1 > 0xdfff)                                       \
247           {                                                                   \
248             /* No surrogate.  */                                              \
249             *((uint32_t *) outptr) = u1;                                      \
250             inptr += 2;                                                       \
251           }                                                                   \
252         else                                                                  \
253           {                                                                   \
254             uint16_t u2;                                                      \
255                                                                               \
256             /* It's a surrogate character.  At least the first word says      \
257                it is.  */                                                     \
258             if (NEED_LENGTH_TEST && inptr + 4 > inend)                        \
259               {                                                               \
260                 /* We don't have enough input for another complete input      \
261                    character.  */                                             \
262                 result = __GCONV_INCOMPLETE_INPUT;                            \
263                 break;                                                        \
264               }                                                               \
265                                                                               \
266             u2 = bswap_16 (((uint16_t *) inptr)[1]);                          \
267             if (u2 < 0xdc00 || u2 >= 0xdfff)                                  \
268               {                                                               \
269                 /* This is no valid second word for a surrogate.  */          \
270                 result = __GCONV_ILLEGAL_INPUT;                               \
271                 break;                                                        \
272               }                                                               \
273                                                                               \
274             *((uint32_t *) outptr) = ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00);   \
275             inptr += 4;                                                       \
276           }                                                                   \
277       }                                                                       \
278     else                                                                      \
279       {                                                                       \
280         if (u1 == BOM && var == UTF_16 && !step_data->__internal_use          \
281             && step_data->__invocation_counter == 0 && inptr == *inptrp)      \
282           {                                                                   \
283             /* This is the first word in the file and it is the BOM and       \
284                we are converting a file without specified byte order.         \
285                Simply sack the BOM.  */                                       \
286             inptr += 2;                                                       \
287             continue;                                                         \
288           }                                                                   \
289                                                                               \
290         if (u1 < 0xd800 || u1 > 0xdfff)                                       \
291           {                                                                   \
292             /* No surrogate.  */                                              \
293             *((uint32_t *) outptr) = u1;                                      \
294             inptr += 2;                                                       \
295           }                                                                   \
296         else                                                                  \
297           {                                                                   \
298             uint16_t u2;                                                      \
299                                                                               \
300             /* It's a surrogate character.  At least the first word says      \
301                it is.  */                                                     \
302             if (NEED_LENGTH_TEST && inptr + 4 > inend)                        \
303               {                                                               \
304                 /* We don't have enough input for another complete input      \
305                    character.  */                                             \
306                 result = __GCONV_INCOMPLETE_INPUT;                            \
307                 break;                                                        \
308               }                                                               \
309                                                                               \
310             u2 = ((uint16_t *) inptr)[1];                                     \
311             if (u2 < 0xdc00 || u2 >= 0xdfff)                                  \
312               {                                                               \
313                 /* This is no valid second word for a surrogate.  */          \
314                 result = __GCONV_ILLEGAL_INPUT;                               \
315                 break;                                                        \
316               }                                                               \
317                                                                               \
318             *((uint32_t *) outptr) = ((u1 - 0xd7c0) << 10) + (u2 - 0xdc00);   \
319             inptr += 4;                                                       \
320           }                                                                   \
321       }                                                                       \
322     outptr += 4;                                                              \
323   }
324 #define EXTRA_LOOP_DECLS \
325         , enum variant var, struct __gconv_step_data *step_data
326 #include <iconv/loop.c>
327
328
329 /* Now define the toplevel functions.  */
330 #include <iconv/skeleton.c>