c224ee7eace81a851f1ed77877ff3f4a82cf67c3
[kopensolaris-gnu/glibc.git] / iconvdata / iso-2022-cn.c
1 /* Conversion module for ISO-2022-CN.
2    Copyright (C) 1999, 2000 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Ulrich Drepper <drepper@cygnus.com>, 1999.
5
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Library General Public License as
8    published by the Free Software Foundation; either version 2 of the
9    License, or (at your option) any later version.
10
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Library General Public License for more details.
15
16    You should have received a copy of the GNU Library General Public
17    License along with the GNU C Library; see the file COPYING.LIB.  If not,
18    write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19    Boston, MA 02111-1307, USA.  */
20
21 #include <gconv.h>
22 #include <stdint.h>
23 #include <string.h>
24 #include "gb2312.h"
25 #include "cns11643l1.h"
26 #include "cns11643l2.h"
27
28 #include <assert.h>
29
30 /* This makes obvious what everybody knows: 0x1b is the Esc character.  */
31 #define ESC     0x1b
32
33 /* SI and SO are the single-byte shift-in and shift-out codes.  SS2 is
34    the two-byte single shift sequence ESC 0x4e; the two bytes following
35    it are converted using CNS 11643 plane 2.  */
36 #define SI      0x0f
37 #define SO      0x0e
38 #define SS2_0   ESC
39 #define SS2_1   0x4e
40
41 /* Definitions used in the body of the `gconv' function.  */
42 #define CHARSET_NAME            "ISO-2022-CN//"
43 #define DEFINE_INIT             1
44 #define DEFINE_FINI             1
45 #define FROM_LOOP               from_iso2022cn_loop
46 #define TO_LOOP                 to_iso2022cn_loop
47 #define MIN_NEEDED_FROM         1       /* ASCII byte, SO or SI.  */
48 #define MAX_NEEDED_FROM         4       /* Designation, or SS2 + 2 bytes.  */
49 #define MIN_NEEDED_TO           4       /* One UCS4 character.  */
50 #define MAX_NEEDED_TO           4
51 #define PREPARE_LOOP \
52   int save_set;       /* Scratch copy for SAVE_RESET_STATE.  */               \
53   int *setp = &data->__statep->__count;
54 #define EXTRA_LOOP_ARGS         , setp
55
56
57 /* The COUNT element of the state keeps track of the currently selected
58    character set (`_set' values) and of the announcements (`_ann' bits):  */
59 enum
60 {
61   ASCII_set = 0,                /* Initial state; selected by SI.  */
62   GB2312_set = 8,               /* SO set GB 2312.  */
63   CNS11643_1_set = 16,          /* SO set CNS 11643 plane 1.  */
64   CNS11643_2_set = 24,          /* SS2 set CNS 11643 plane 2.  */
65   CURRENT_SEL_MASK = 24,        /* Bits holding one of the `_set' values.  */
66   GB2312_ann = 32,              /* Announced with ESC $ ) A.  */
67   CNS11643_1_ann = 64,          /* Announced with ESC $ ) G.  */
68   CNS11643_2_ann = 128,         /* Announced with ESC $ * H.  */
69   CURRENT_ANN_MASK = 224        /* All three `_ann' bits.  */
70 };
71
72
73 /* Since this is a stateful encoding we have to provide code which resets
74    the state to the initial one while flushing.  When decoding nothing is
75    written; when encoding one `SI' byte is emitted if there is room.  */
76 #define EMIT_SHIFT_TO_INIT \
77   if (data->__statep->__count != ASCII_set)                                   \
78     {                                                                         \
79       if (FROM_DIRECTION)                                                     \
80         /* It's easy, we don't have to emit anything, we just reset the       \
81            state for the input.  */                                           \
82         data->__statep->__count = ASCII_set;                                  \
83       else                                                                    \
84         {                                                                     \
85           unsigned char *outbuf = data->__outbuf;                             \
86                                                                               \
87           /* We are not in the initial state.  To switch back we have         \
88              to emit `SI'.  */                                                \
89           if (__builtin_expect (outbuf == data->__outbufend, 0))              \
90             /* We don't have enough room in the output buffer.  */            \
91             status = __GCONV_FULL_OUTPUT;                                     \
92           else                                                                \
93             {                                                                 \
94               /* Write out the shift sequence.  */                            \
95               *outbuf++ = SI;                                                 \
96               data->__outbuf = outbuf;                                        \
97               data->__statep->__count = ASCII_set;                            \
98             }                                                                 \
99         }                                                                     \
100     }
101
102
103 /* Since we might have to reset the input pointer we must be able to save
104    (SAVE nonzero) and restore (SAVE zero) the state kept in *SETP.  */
105 #define SAVE_RESET_STATE(Save) \
106   if (Save)                                                                   \
107     save_set = *setp;                                                         \
108   else                                                                        \
109     *setp = save_set
110
111
112 /* First define the conversion function from ISO-2022-CN to UCS4.  */
113 #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
114 #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
115 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
116 #define LOOPFCT                 FROM_LOOP
117 #define BODY \
118   {                                                                           \
119     uint32_t ch = *inptr;                                                     \
120                                                                               \
121     /* This is a 7bit character set, disallow all 8bit characters.  */        \
122     if (__builtin_expect (ch, 0) > 0x7f)                                      \
123       {                                                                       \
124         if (! ignore_errors_p ())                                             \
125           {                                                                   \
126             result = __GCONV_ILLEGAL_INPUT;                                   \
127             break;                                                            \
128           }                                                                   \
129                                                                               \
130         ++inptr;                                                              \
131         ++*converted;                                                         \
132         continue;                                                             \
133       }                                                                       \
134                                                                               \
135     /* Recognize escape sequences.  */                                        \
136     if (__builtin_expect (ch, 0) == ESC)                                      \
137       {                                                                       \
138         /* There are two kinds of escape sequences we have to handle:         \
139            - the four-byte sequences announcing the use of GB and CNS         \
140              characters; the SO designation is remembered in ANN              \
141            - the SS2 sequence, ESC N followed by a two-byte character.        \
142            Every byte must lie below INEND before it is looked at.  */        \
143         if (NEED_LENGTH_TEST                                                  \
144             && (__builtin_expect (inptr + 2 > inend, 0)                       \
145                 || (inptr[1] == '$'                                           \
146                     && (__builtin_expect (inptr + 3 > inend, 0)               \
147                         || (inptr[2] == ')'                                   \
148                             && __builtin_expect (inptr + 4 > inend, 0))       \
149                         || (inptr[2] == '*'                                   \
150                             && __builtin_expect (inptr + 4 > inend, 0))))     \
151                 || (inptr[1] == SS2_1                                         \
152                     && __builtin_expect (inptr + 4 > inend, 0))))             \
153           {                                                                   \
154             result = __GCONV_EMPTY_INPUT;                                     \
155             break;                                                            \
156           }                                                                   \
157         if (inptr[1] == '$'                                                   \
158             && ((inptr[2] == ')' && (inptr[3] == 'A' || inptr[3] == 'G'))     \
159                 || (inptr[2] == '*' && inptr[3] == 'H')))                     \
160           {                                                                   \
161             /* OK, we accept those character sets.  */                        \
162             if (inptr[3] == 'A')                                              \
163               ann = GB2312_ann;                                               \
164             else if (inptr[3] == 'G')                                         \
165               ann = CNS11643_1_ann;                                           \
166             inptr += 4;                                                       \
167             continue;                                                         \
168           }                                                                   \
169       }                                                                       \
170     else if (__builtin_expect (ch, 0) == SO)                                  \
171       {                                                                       \
172         /* Switch to use GB2312 or CNS 11643 plane 1, depending on which      \
173            S0 designation came last.  The only problem is what to do with     \
174            faulty input files where no designator came.                       \
175            XXX For now I'll default to use GB2312.  If this is not the        \
176            best behaviour (e.g., we should flag an error) let me know.  */    \
177         ++inptr;                                                              \
178         set = ann == CNS11643_1_ann ? CNS11643_1_set : GB2312_set;            \
179         continue;                                                             \
180       }                                                                       \
181     else if (__builtin_expect (ch, 0) == SI)                                  \
182       {                                                                       \
183         /* Switch to use ASCII.  */                                           \
184         ++inptr;                                                              \
185         set = ASCII_set;                                                      \
186         continue;                                                             \
187       }                                                                       \
188                                                                               \
189     if (__builtin_expect (ch, 0) == ESC && inptr[1] == SS2_1)                 \
190       {                                                                       \
191         /* This is a character from CNS 11643 plane 2.                        \
192            XXX We could test here whether the use of this character           \
193            set was announced.  */                                             \
194         inptr += 2;                                                           \
195         ch = cns11643l2_to_ucs4 (&inptr, 2, 0);                               \
196         if (__builtin_expect (ch, 0) == __UNKNOWN_10646_CHAR)                 \
197           {                                                                   \
198             if (! ignore_errors_p ())                                         \
199               {                                                               \
200                 /* This is an illegal character.  */                          \
201                 inptr -= 2;                                                   \
202                 result = __GCONV_ILLEGAL_INPUT;                               \
203                 break;                                                        \
204               }                                                               \
205                                                                               \
206             ++*converted;                                                     \
207             continue;                                                         \
208           }                                                                   \
209       }                                                                       \
210     else if (set == ASCII_set)                                                \
211       {                                                                       \
212         /* Almost done, just advance the input pointer.  */                   \
213         ++inptr;                                                              \
214       }                                                                       \
215     else                                                                      \
216       {                                                                       \
217         /* That's pretty easy, we have a dedicated functions for this.  */    \
218         if (set == GB2312_set)                                                \
219           ch = gb2312_to_ucs4 (&inptr,                                        \
220                                NEED_LENGTH_TEST ? inend - inptr : 2, 0);      \
221         else                                                                  \
222           {                                                                   \
223             assert (set == CNS11643_1_set);                                   \
224             ch = cns11643l1_to_ucs4 (&inptr,                                  \
225                                      NEED_LENGTH_TEST ? inend - inptr : 2, 0);\
226           }                                                                   \
227                                                                               \
228         if (NEED_LENGTH_TEST && __builtin_expect (ch, 1) == 0)                \
229           {                                                                   \
230             result = __GCONV_EMPTY_INPUT;                                     \
231             break;                                                            \
232           }                                                                   \
233         else if (__builtin_expect (ch, 1) == __UNKNOWN_10646_CHAR)            \
234           {                                                                   \
235             if (! ignore_errors_p ())                                         \
236               {                                                               \
237                 /* This is an illegal character.  */                          \
238                 result = __GCONV_ILLEGAL_INPUT;                               \
239                 break;                                                        \
240               }                                                               \
241                                                                               \
242             ++inptr;                                                          \
243             ++*converted;                                                     \
244             continue;                                                         \
245           }                                                                   \
246       }                                                                       \
247                                                                               \
248     put32 (outptr, ch);                                                       \
249     outptr += 4;                                                              \
250   }
251 #define EXTRA_LOOP_DECLS        , int *setp
252 #define INIT_PARAMS             int set = *setp & CURRENT_SEL_MASK; \
253                                 int ann = *setp & CURRENT_ANN_MASK
254 #define UPDATE_PARAMS           *setp = set | ann
255 #include <iconv/loop.c>
256
257
258 /* Next, define the other direction: from UCS4 to ISO-2022-CN.  */
259 #define MIN_NEEDED_INPUT        MIN_NEEDED_TO
260 #define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
261 #define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
262 #define LOOPFCT                 TO_LOOP
263 #define BODY \
264   {                                                                           \
265     uint32_t ch = get32 (inptr);                                              \
266                                                                               \
267     /* First see whether we can write the character using the currently       \
268        selected character set.  */                                            \
269     if (ch < 0x80)                                                            \
270       {                                                                       \
271         if (set != ASCII_set)                                                 \
272           {                                                                   \
273             *outptr++ = SI;                                                   \
274             set = ASCII_set;                                                  \
275             if (NEED_LENGTH_TEST && __builtin_expect (outptr == outend, 0))   \
276               {                                                               \
277                 result = __GCONV_FULL_OUTPUT;                                 \
278                 break;                                                        \
279               }                                                               \
280           }                                                                   \
281                                                                               \
282         *outptr++ = ch;                                                       \
283                                                                               \
284         /* At the end of the line we have to clear the `ann' flags since      \
285            every line must contain this information again.  */                \
286         if (ch == L'\n')                                                      \
287           ann = 0;                                                            \
288       }                                                                       \
289     else                                                                      \
290       {                                                                       \
291         char buf[2];                                                          \
292         int used;                                                             \
293         size_t written = 0;                                                   \
294                                                                               \
295         if (set == GB2312_set || (ann & CNS11643_1_ann) == 0)                 \
296           {                                                                   \
297             written = ucs4_to_gb2312 (ch, buf, 2);                            \
298             used = GB2312_set;                                                \
299           }                                                                   \
300         else                                                                  \
301           {                                                                   \
302             written = ucs4_to_cns11643l1 (ch, buf, 2);                        \
303             used = CNS11643_1_set;                                            \
304           }                                                                   \
305                                                                               \
306         if (written == __UNKNOWN_10646_CHAR)                                  \
307           {                                                                   \
308             /* Cannot convert it using the currently selected SO set.         \
309                Next try the SS2 set.  */                                      \
310             written = ucs4_to_cns11643l2 (ch, buf, 2);                        \
311             if (written != __UNKNOWN_10646_CHAR)                              \
312               /* Yep, that worked.  */                                        \
313               used = CNS11643_2_set;                                          \
314             else                                                              \
315               {                                                               \
316                 /* Well, try the SO set which was not tried first.  */        \
317                 if (used == GB2312_set)                                       \
318                   written = ucs4_to_cns11643l1 (ch, buf, 2);                  \
319                 else                                                          \
320                   written = ucs4_to_gb2312 (ch, buf, 2);                      \
321                                                                               \
322                 if (__builtin_expect (written, 0) != __UNKNOWN_10646_CHAR)    \
323                   /* Oh well, then switch SO.  */                             \
324                   used = GB2312_set + CNS11643_1_set - used;                  \
325                 else                                                          \
326                   {                                                           \
327                     /* Even this does not work.  Error.  */                   \
328                     if (! ignore_errors_p ())                                 \
329                       {                                                       \
330                         result = __GCONV_ILLEGAL_INPUT;                       \
331                         break;                                                \
332                       }                                                       \
333                                                                               \
334                     inptr += 4;                                               \
335                     ++*converted;                                             \
336                     continue;                                                 \
337                   }                                                           \
338               }                                                               \
339           }                                                                   \
340         assert (written == 2);                                                \
341                                                                               \
342         /* See whether we have to emit an escape sequence.  */                \
343         if (set != used)                                                      \
344           {                                                                   \
345             /* First see whether we announced that we use this character      \
346                set; the `_ann' bit of USED is 16 << (USED >> 3).  */          \
347             if ((ann & (16 << (used >> 3))) == 0)                             \
348               {                                                               \
349                 const char *escseq;                                           \
350                                                                               \
351                 if (NEED_LENGTH_TEST                                          \
352                     && __builtin_expect (outptr + 4 > outend, 0))             \
353                   {                                                           \
354                     result = __GCONV_FULL_OUTPUT;                             \
355                     break;                                                    \
356                   }                                                           \
357                                                                               \
358                 assert ((used >> 3) >= 1 && (used >> 3) <= 3);                \
359                 escseq = "\e$)A\e$)G\e$*H" + ((used >> 3) - 1) * 4;           \
360                 *outptr++ = *escseq++;                                        \
361                 *outptr++ = *escseq++;                                        \
362                 *outptr++ = *escseq++;                                        \
363                 *outptr++ = *escseq++;                                        \
364                                                                               \
365                 if (used == GB2312_set)                                       \
366                   ann = (ann & CNS11643_2_ann) | GB2312_ann;                  \
367                 else if (used == CNS11643_1_set)                              \
368                   ann = (ann & CNS11643_2_ann) | CNS11643_1_ann;              \
369                 else                                                          \
370                   ann |= CNS11643_2_ann;                                      \
371               }                                                               \
372                                                                               \
373             if (used == CNS11643_2_set)                                       \
374               {                                                               \
375                 if (__builtin_expect (outptr + 2 > outend, 0))                \
376                   {                                                           \
377                     result = __GCONV_FULL_OUTPUT;                             \
378                     break;                                                    \
379                   }                                                           \
380                 *outptr++ = SS2_0;                                            \
381                 *outptr++ = SS2_1;                                            \
382               }                                                               \
383             else                                                              \
384               {                                                               \
385                 /* SO is only needed if ASCII is currently selected;          \
386                    otherwise we are switching within the SO charset.  */      \
387                 if (set == ASCII_set)                                         \
388                   {                                                           \
389                     if (__builtin_expect (outptr + 1 > outend, 0))            \
390                       {                                                       \
391                         result = __GCONV_FULL_OUTPUT;                         \
392                         break;                                                \
393                       }                                                       \
394                     *outptr++ = SO;                                           \
395                   }                                                           \
396                 set = used;  /* Remember the new shift state.  */             \
397               }                                                               \
398                                                                               \
399             /* Always test the length here since we have used up all the      \
400                guaranteed output buffer slots.  */                            \
401             if (__builtin_expect (outptr + 2 > outend, 0))                    \
402               {                                                               \
403                 result = __GCONV_FULL_OUTPUT;                                 \
404                 break;                                                        \
405               }                                                               \
406           }                                                                   \
407         else if (NEED_LENGTH_TEST                                             \
408                  && __builtin_expect (outptr + 2 > outend, 0))                \
409           {                                                                   \
410             result = __GCONV_FULL_OUTPUT;                                     \
411             break;                                                            \
412           }                                                                   \
413                                                                               \
414         *outptr++ = buf[0];                                                   \
415         *outptr++ = buf[1];                                                   \
416       }                                                                       \
417                                                                               \
418     /* Now that we wrote the output increment the input pointer.  */          \
419     inptr += 4;                                                               \
420   }
421 #define EXTRA_LOOP_DECLS        , int *setp
422 #define INIT_PARAMS             int set = *setp & CURRENT_SEL_MASK; \
423                                 int ann = *setp & CURRENT_ANN_MASK
424 #define UPDATE_PARAMS           *setp = set | ann
425 #include <iconv/loop.c>
426
427
428 /* Now define the toplevel functions.  */
429 #include <iconv/skeleton.c>