/* (PREPARE_LOOP): Return __GCONV_EMPTY_INPUT only if input is really
   [kopensolaris-gnu/glibc.git] / iconvdata / unicode.c  */
/* Conversion module for Unicode
   Copyright (C) 1999, 2000-2002 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <byteswap.h>
#include <dlfcn.h>
#include <gconv.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* This is the Byte Order Mark character (BOM) as it reads when a 16-bit
   unit is fetched in the expected byte order.  */
#define BOM     0xfeff
/* And in the other endian format: the same mark with its two bytes
   exchanged.  Seeing this value first means the input has to be
   byte-swapped.  */
#define BOM_OE  0xfffe


/* Definitions used in the body of the `gconv' function.  */
#define FROM_LOOP               from_unicode_loop
#define TO_LOOP                 to_unicode_loop
/* This file provides gconv_init and gconv_end itself.
   NOTE(review): presumably 0 here tells the skeleton not to generate
   its own versions -- confirm against iconv/skeleton.c.  */
#define DEFINE_INIT             0
#define DEFINE_FINI             0
/* Bytes per character on the UNICODE side (one 16-bit unit) and on the
   internal side (one 32-bit unit).  */
#define MIN_NEEDED_FROM         2
#define MIN_NEEDED_TO           4
#define FROM_DIRECTION          (dir == from_unicode)

/* Per-call setup expanded ahead of the conversion loops.  It relies on
   `step', `data', `inptr', `inptrp', `inend', `outbuf' and `outend'
   being in scope where it is expanded, and it declares `dir' and
   `swap' for the code that follows.

   Converting from UNICODE, on the first invocation:
     - with fewer than two input bytes it returns __GCONV_EMPTY_INPUT
       (no bytes at all) or __GCONV_INCOMPLETE_INPUT (one byte);
     - the recorded byte order is reset, then a leading BOM is skipped
       and a leading BOM_OE is skipped with swapping turned on.

   Converting to UNICODE, on the first invocation and unless
   data->__internal_use is set, a BOM is written to the output, or
   __GCONV_FULL_OUTPUT is returned if two bytes do not fit.

   The swap flag is kept in step->__data and used to be only ever set,
   never cleared, so one opposite-endian input left every later input
   handled through the same step byte-swapped.  Resetting it when a
   conversion starts avoids that.  NOTE(review): the flag still lives
   in the step rather than in per-conversion data, so conversions that
   share a step and overlap in time can disturb each other -- confirm
   how steps are shared by the gconv framework.  */
#define PREPARE_LOOP \
  enum direction dir = ((struct unicode_data *) step->__data)->dir;           \
  int swap;                                                                   \
  if (FROM_DIRECTION)                                                         \
    {                                                                         \
      if (data->__invocation_counter == 0)                                    \
        {                                                                     \
          /* We have to find out which byte order the file is encoded in.  */ \
          if (inptr + 2 > inend)                                              \
            return (inptr == inend                                            \
                    ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);        \
                                                                              \
          /* Forget the byte order recorded for any earlier input.  */        \
          ((struct unicode_data *) step->__data)->swap = 0;                   \
                                                                              \
          if (get16u (inptr) == BOM)                                          \
            /* Simply ignore the BOM character.  */                           \
            *inptrp = inptr += 2;                                             \
          else if (get16u (inptr) == BOM_OE)                                  \
            {                                                                 \
              ((struct unicode_data *) step->__data)->swap = 1;               \
              *inptrp = inptr += 2;                                           \
            }                                                                 \
        }                                                                     \
    }                                                                         \
  else if (!data->__internal_use && data->__invocation_counter == 0)          \
    {                                                                         \
      /* Emit the Byte Order Mark.  */                                        \
      if (__builtin_expect (outbuf + 2 > outend, 0))                          \
        return __GCONV_FULL_OUTPUT;                                           \
                                                                              \
      put16u (outbuf, BOM);                                                   \
      outbuf += 2;                                                            \
    }                                                                         \
  swap = ((struct unicode_data *) step->__data)->swap;
/* Passes the `swap' value computed above on to the loop functions.  */
#define EXTRA_LOOP_ARGS         , swap


/* Direction of the transformation.  */
enum direction
{
  illegal_dir,          /* Not a usable direction.  */
  to_unicode,           /* Internal (UCS4-like) format -> UNICODE.  */
  from_unicode          /* UNICODE -> internal (UCS4-like) format.  */
};

86 struct unicode_data
87 {
88   enum direction dir;
89   int swap;
90 };
91
92
93 extern int gconv_init (struct __gconv_step *step);
94 int
95 gconv_init (struct __gconv_step *step)
96 {
97   /* Determine which direction.  */
98   struct unicode_data *new_data;
99   enum direction dir = illegal_dir;
100   int result;
101
102   if (strcmp (step->__from_name, "UNICODE//") == 0)
103     dir = from_unicode;
104   else
105     dir = to_unicode;
106
107   new_data = (struct unicode_data *) malloc (sizeof (struct unicode_data));
108
109   result = __GCONV_NOMEM;
110   if (new_data != NULL)
111     {
112       new_data->dir = dir;
113       new_data->swap = 0;
114       step->__data = new_data;
115
116       if (dir == from_unicode)
117         {
118           step->__min_needed_from = MIN_NEEDED_FROM;
119           step->__max_needed_from = MIN_NEEDED_FROM;
120           step->__min_needed_to = MIN_NEEDED_TO;
121           step->__max_needed_to = MIN_NEEDED_TO;
122         }
123       else
124         {
125           step->__min_needed_from = MIN_NEEDED_TO;
126           step->__max_needed_from = MIN_NEEDED_TO;
127           step->__min_needed_to = MIN_NEEDED_FROM;
128           step->__max_needed_to = MIN_NEEDED_FROM;
129         }
130
131       step->__stateful = 0;
132
133       result = __GCONV_OK;
134     }
135
136   return result;
137 }
138
139
140 extern void gconv_end (struct __gconv_step *data);
141 void
142 gconv_end (struct __gconv_step *data)
143 {
144   free (data->__data);
145 }
146
147
/* Convert from the internal (UCS4-like) format to UCS2.

   One pass through BODY reads a single 4-byte input value C:
     - C >= 0x10000 does not fit into 16 bits; it is passed to
       UNICODE_TAG_HANDLER and then to STANDARD_TO_LOOP_ERR_HANDLER;
     - C in the surrogate range 0xd800..0xdfff is rejected with
       __GCONV_ILLEGAL_INPUT, or skipped and counted in *irreversible
       when ignore_errors_p () is true;
     - any other C is written as one 2-byte unit with put16.
   The `swap' loop argument is not referenced by this body.  */
#define MIN_NEEDED_INPUT        MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
#define LOOPFCT                 TO_LOOP
#define BODY \
  {                                                                           \
    uint32_t c = get32 (inptr);                                               \
                                                                              \
    if (__builtin_expect (c >= 0x10000, 0))                                   \
      {                                                                       \
        UNICODE_TAG_HANDLER (c, 4);                                           \
        STANDARD_TO_LOOP_ERR_HANDLER (4);                                     \
      }                                                                       \
    else if (__builtin_expect (c >= 0xd800 && c < 0xe000, 0))                 \
      {                                                                       \
        /* Surrogate characters in UCS-4 input are not valid.                 \
           We must catch this, because the UCS-2 output might be              \
           interpreted as UTF-16 by other programs.  If we let                \
           surrogates pass through, attackers could make a security           \
           hole exploit by synthesizing any desired plane 1-16                \
           character.  */                                                     \
        result = __GCONV_ILLEGAL_INPUT;                                       \
        if (! ignore_errors_p ())                                             \
          break;                                                              \
        inptr += 4;                                                           \
        ++*irreversible;                                                      \
        continue;                                                             \
      }                                                                       \
    else                                                                      \
      {                                                                       \
        put16 (outptr, c);                                                    \
        outptr += 2;                                                          \
      }                                                                       \
                                                                              \
    inptr += 4;                                                               \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS \
        , int swap
187 #include <iconv/loop.c>
188
189
/* Convert from UCS2 to the internal (UCS4-like) format.

   One pass through BODY reads a single 2-byte input unit, exchanges
   its two bytes when the `swap' loop argument is nonzero, and then:
     - hands a unit in the surrogate range 0xd800..0xdfff to
       STANDARD_FROM_LOOP_ERR_HANDLER;
     - otherwise writes it as one 4-byte value with put32.  */
#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
#define BODY \
  {                                                                           \
    uint16_t u1 = get16 (inptr);                                              \
                                                                              \
    if (swap)                                                                 \
      u1 = bswap_16 (u1);                                                     \
                                                                              \
    if (__builtin_expect (u1 >= 0xd800 && u1 < 0xe000, 0))                    \
      {                                                                       \
        /* Surrogate characters in UCS-2 input are not valid.  Reject         \
           them.  (Catching this here is not security relevant.)  */          \
        STANDARD_FROM_LOOP_ERR_HANDLER (2);                                   \
      }                                                                       \
                                                                              \
    put32 (outptr, u1);                                                       \
                                                                              \
    inptr += 2;                                                               \
    outptr += 4;                                                              \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS \
        , int swap
216 #include <iconv/loop.c>
217
218
219 /* Now define the toplevel functions.  */
220 #include <iconv/skeleton.c>