1 /* Mapping tables for JOHAB handling.
2 Copyright (C) 1998 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Jungshik Shin <jshin@pantheon.yale.edu>, 1998.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
16 You should have received a copy of the GNU Library General Public
17 License along with the GNU C Library; see the file COPYING.LIB. If not,
18 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
/* Direction of the transformation.  These objects carry no data; only
   their addresses matter.  gconv_init stores the address of one of them
   in step->data, and gconv compares step->data against
   &from_johab_object to select the conversion direction.  */
static int to_johab_object;
static int from_johab_object;
31 /* The table for Bit pattern to Hangul Jamo
32 5 bits each are used to encode
33 leading consonants(19 + 1 filler), medial vowels(21 + 1 filler)
34 and trailing consonants(27 + 1 filler).
36 KS C 5601-1992 Annex 3 Table 2
37 0 : Filler, -1: invalid, >= 1 : valid
42 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
43 19, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
47 -1, -1, 0, 1, 2, 3, 4, 5,
48 -1, -1, 6, 7, 8, 9, 10, 11,
49 -1, -1, 12, 13, 14, 15, 16, 17,
50 -1, -1, 18, 19, 20, 21, -1, -1
54 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
55 -1, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, -1, -1
/* Hangul Jamo in Johab to Unicode 2.0: Unicode 2.0 defines 51 Hangul
   Compatibility Jamos in the block [0x3131,0x3163]; the consonants
   occupy [0x3131,0x314e].

   It's to be considered later which Jamo block to use, the Compatibility
   block or the Hangul Conjoining Jamo block, [0x1100,0x11ff].  */

/* Initial consonants: entry I - 1 is the Compatibility Jamo for the
   value I (1..19) obtained from the init[] bit-pattern table.  */
const wchar_t init_to_ucs[19] =
{
  0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139, 0x3141, 0x3142,
  0x3143, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314a, 0x314b,
  0x314c, 0x314d, 0x314e
};
/* Final consonants: entry F - 1 is the Compatibility Jamo for the value
   F (1..27) obtained from the final[] bit-pattern table.  Entries that
   are L'\0' have no code point in this table; the caller treats a L'\0'
   result as illegal input.

   The run 0x313a..0x3140 is contiguous.  The entry between 0x313b and
   0x313d must therefore be 0x313c, not 0x314c, which is already an
   initial consonant in init_to_ucs.  */
const wchar_t final_to_ucs[27] =
{
  L'\0', L'\0', 0x3133, L'\0', 0x3135, 0x3136, L'\0', L'\0',
  0x313a, 0x313b, 0x313c, 0x313d, 0x313e, 0x313f,
  0x3140, L'\0', L'\0', 0x3144, L'\0', L'\0', L'\0',
  L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'
};
/* The following three arrays are used to convert
   precomposed Hangul syllables in [0xac00,0xd7a3]
   to Jamo bit patterns for Johab encoding

   cf. : KS C 5601-1992, Annex3 Table 2

   Arrays are used to speed up things although it's possible
   to get the same result arithmetically.  */

/* Initial consonant number (0..18) to its contribution to the two-byte
   Johab code: 0x8000 plus the 5-bit initial field in bits 10..14.  The
   entries advance in steps of 0x400 from 0x8800 to 0xd000; all 19 must
   be present, otherwise the last initial would encode as 0.  */
const int init_to_bit[19] =
{
  0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00,
  0xa000, 0xa400, 0xa800, 0xac00, 0xb000, 0xb400,
  0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, 0xcc00,
  0xd000
};
/* Medial vowel number (0..20) to its contribution to the two-byte Johab
   code: the 5-bit medial field in bits 5..9.  johab_from_ucs4 adds the
   selected entry to the initial and final contributions.  */
const int mid_to_bit[21] =
{
  0x0060, 0x0080, 0x00a0, 0x00c0, 0x00e0,
  0x0140, 0x0160, 0x0180, 0x01a0, 0x01c0, 0x01e0,
  0x0240, 0x0260, 0x0280, 0x02a0, 0x02c0, 0x02e0,
  0x0340, 0x0360, 0x0380, 0x03a0
};
/* Final consonant number (0..27) to its contribution to the two-byte
   Johab code: the 5-bit final field in bits 0..4.  The value 0x12 is
   skipped, matching the invalid slot in the final[] bit-pattern table.  */
const int final_to_bit[28] =
{
  1, 2, 3, 4, 5, 6, 7, 8, 9, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
  0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d
};
/* The conversion table from
   UCS4 Hangul Compatibility Jamo in [0x3131,0x3163]
   to the two-byte Johab code, indexed by ch - 0x3131.

   cf. 1. KS C 5601-1992 Annex 3 Table 2
       2. Unicode 2.0 manual

   Each entry follows from the bit-pattern tables in this file:
     - a consonant usable as an initial is init_to_bit[n] + 0x41,
     - a final-only compound consonant is 0x8440 + its final bits,
     - a vowel is 0x8401 + mid_to_bit[n].
   Hence the compound finals at 0x313e and 0x313f are 0x844e and 0x844f,
   and the initial at 0x314d is 0xcc41.  */
const uint16_t jamo_from_ucs_table[51] =
{
  0x8841, 0x8c41,
  0x8444,
  0x9041,
  0x8446, 0x8447,
  0x9441, 0x9841, 0x9c41,
  0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, 0x8450,
  0xa041, 0xa441, 0xa841,
  0x8454,
  0xac41, 0xb041, 0xb441, 0xb841, 0xbc41,
  0xc041, 0xc441, 0xc841, 0xcc41, 0xd041,
  0x8461, 0x8481, 0x84a1, 0x84c1, 0x84e1,
  0x8541, 0x8561, 0x8581, 0x85a1, 0x85c1, 0x85e1,
  0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
  0x8741, 0x8761, 0x8781, 0x87a1
};
140 static inline wchar_t
141 johab_sym_hanja_to_ucs (int idx, int c1, int c2)
144 return (wchar_t) ksc5601_sym_to_ucs[(c1 - 0xd9) * 188 + c2
145 - (c2 > 0x90 ? 0x43 : 0x31)];
147 return (wchar_t) ksc5601_hanja_to_ucs[(c1 - 0xe0) * 188 + c2
148 - (c2 > 0x90 ? 0x43 : 0x31)];
152 johab_hanja_from_ucs (wchar_t ch)
156 if (ucs4_to_ksc5601_hanja (ch, &idx))
159 /* Hanja begins at the 42nd row. 42=0x2a : 0x2a + 0x20 = 0x4a. */
160 idx1 = idx / 256 - 0x4a;
161 idx2 = idx % 256 + 0x80;
163 return ((idx1 / 2) * 256 + 0xe000 + idx2
164 + (idx1 % 2 ? 0 : (idx2 > 0xee ? 0x43 : 0x31) - 0xa1));
171 johab_sym_from_ucs (wchar_t ch)
174 if (ucs4_to_ksc5601_sym (ch, &idx))
178 idx1 = idx / 256 - 0x21;
179 idx2 = idx % 256 + 0x80;
181 return ((idx1 / 2) * 256 + 0xd900 + idx2
182 + (idx1 % 2 ? 0 : (idx2 > 0xee ? 0x43 : 0x31) - 0xa1));
191 johab_from_ucs4 (wchar_t ch, unsigned char *cp)
197 if (ch >= 0xac00 && ch <= 0xd7a3)
200 idx = init_to_bit[ch / 588]; /* 21*28 = 588 */
201 idx += mid_to_bit[(ch / 28) % 21]; /* (ch % (21 * 28)) / 28 */
202 idx += final_to_bit[ch % 28]; /* (ch % (21 * 28)) % 28 */
204 /* KS C 5601-1992 Annex 3 regards 0xA4DA(Hangul Filler : U3164)
206 else if (ch >= 0x3131 && ch <= 0x3163)
207 idx = jamo_from_ucs_table[ch - 0x3131];
208 else if (ch >= 0x4e00 && ch <= 0x9fa5
209 || ch >= 0xf900 && ch <= 0xfa0b)
210 idx = johab_hanja_from_ucs (ch);
211 /* Half-width Korean Currency Won Sign
212 else if ( ch == 0x20a9 )
216 idx = johab_sym_from_ucs (ch);
218 *cp = (char) (idx / 256);
219 *(cp + 1) = (char) (idx & 0xff);
224 *cp = (char) (0x7f & ch);
225 *(cp + 1) = (char) 0;
232 gconv_init (struct gconv_step *step)
234 /* Determine which direction. */
235 if (strcasestr (step->from_name, "JOHAB") != NULL)
236 step->data = &from_johab_object;
237 else if (strcasestr (step->to_name, "JOHAB") != NULL)
238 step->data = &to_johab_object;
247 gconv_end (struct gconv_step *data)
254 gconv (struct gconv_step *step, struct gconv_step_data *data,
255 const char *inbuf, size_t *inbufsize, size_t * written, int do_flush)
257 struct gconv_step *next_step = step + 1;
258 struct gconv_step_data *next_data = data + 1;
259 gconv_fct fct = next_step->fct;
263 /* If the function is called with no input this means we have to reset
264 to the initial state. The possibly partly converted input is
270 /* Call the steps down the chain if there are any. */
275 struct gconv_step *next_step = step + 1;
276 struct gconv_step_data *next_data = data + 1;
278 result = (*fct) (next_step, next_data, NULL, 0, written, 1);
280 /* Clear output buffer. */
281 data->outbufavail = 0;
292 if (step->data == &from_johab_object)
294 size_t inchars = *inbufsize;
295 size_t outwchars = data->outbufavail;
296 char *outbuf = data->outbuf;
300 && (outwchars + sizeof (wchar_t) <= data->outbufsize))
302 int inchar = (unsigned char) inbuf[cnt];
304 /* half-width Korean Currency WON sign
307 else if (inchar < 0x7f)
308 ch = (wchar_t) inchar;
311 ch = (wchar_t) inchar;
315 2nd byte : 0x41-0x7e, 0x81-0xfe
317 1st byte : 0xd8-0xde, 0xe0-0xf9
318 2nd byte : 0x31-0x7e, 0x91-0xfe
319 0xd831-0xd87e and 0xd891-0xd8fe are user-defined area */
321 else if (inchar > 0xf9 || inchar == 0xdf
322 || (inchar > 0x7e && inchar < 0x84)
323 || (inchar > 0xd3 && inchar < 0xd9))
324 /* These are illegal. */
328 /* Two-byte character. First test whether the next
329 character is also available. */
333 if (cnt + 1 >= inchars)
335 /* The second character is not available. Store
336 the intermediate result. */
337 result = GCONV_INCOMPLETE_INPUT;
341 inchar2 = (unsigned char) inbuf[++cnt];
342 idx = inchar * 256 + inchar2;
346 i = init[(idx & 0x7c00) >> 10];
347 m = mid[(idx & 0x03e0) >> 5];
348 f = final[idx & 0x001f];
349 if (i == -1 || m == -1 || f == -1)
350 /* This is illegal. */
352 else if (i > 0 && m > 0)
353 ch = ((i - 1) * 21 + (m - 1)) * 28 + f + 0xac00;
354 else if (i > 0 && m == 0 & f == 0)
355 ch = init_to_ucs[i - 1];
356 else if (i == 0 && m > 0 & f == 0)
357 ch = 0x314e + m; /* 0x314f + m - 1 */
358 else if (i == 0 && m == 0 & f > 0)
359 ch = final_to_ucs[f - 1]; /* round trip?? */
361 /* This is illegal. */
367 || (inchar2 > 0x7e && inchar2 < 0x91)
369 /* This is illegal. */
371 else if (inchar == 0xda
372 && inchar2 > 0xa0 && inchar2 < 0xd4)
373 /* This is illegal. */
374 /* Modern Hangul Jaso is defined elsewhere
379 ch = johab_sym_hanja_to_ucs (idx, inchar,
381 /* if (idx <= 0xdefe)
382 ch = ksc5601_sym_to_ucs[(inchar - 0xd9) * 192
384 - (inchar2>0x90 ? 0x43 : 0x31)];
387 ch = ksc5601_hanja_to_ucs[(inchar - 0xe0) *192
389 - (inchar2>0x90 ? 0x43 : 0x31)];
398 if (ch == L'\0' && inbuf[cnt] != '\0')
400 /* This is an illegal character. */
401 result = GCONV_ILLEGAL_INPUT;
405 *((wchar_t *) (outbuf + outwchars)) = ch;
407 outwchars += sizeof (wchar_t);
411 data->outbufavail = outwchars;
415 size_t inwchars = *inbufsize;
416 size_t outchars = data->outbufavail;
417 char *outbuf = data->outbuf;
421 while (inwchars >= cnt + sizeof (wchar_t)
422 && outchars < data->outbufsize)
424 wchar_t ch = *((wchar_t *) (inbuf + cnt));
427 if (ch >= (sizeof (from_ucs4_lat1)
428 / sizeof (from_ucs4_lat1[0])))
430 if (ch >= 0x0391 && ch <= 0x0451)
431 cp = from_ucs4_greek[ch - 0x391];
432 else if (ch >= 0x2010 && ch <= 0x9fa0)
433 cp = from_ucs4_cjk[ch - 0x02010];
438 cp = from_ucs4_lat1[ch];
440 johab_from_ucs4 (ch, cp);
442 if (cp[0] == '\0' && ch != 0)
443 /* Illegal character. */
446 outbuf[outchars] = cp[0];
447 /* Now test for a possible second byte and write this
451 if (outchars + 1 >= data->outbufsize)
453 /* The result does not fit into the buffer. */
457 outbuf[++outchars] = cp[1];
462 cnt += sizeof (wchar_t);
465 data->outbufavail = outchars;
467 if (outchars + extra < data->outbufsize)
469 /* If there is still room in the output buffer something
470 is wrong with the input. */
471 if (inwchars >= cnt + sizeof (wchar_t))
473 /* An error occurred. */
474 result = GCONV_ILLEGAL_INPUT;
479 /* There are some unprocessed bytes at the end of the
481 result = GCONV_INCOMPLETE_INPUT;
487 if (result != GCONV_OK)
492 /* This is the last step. */
493 result = (*inbufsize > (step->data == &from_johab_object
494 ? 0 : sizeof (wchar_t) - 1)
495 ? GCONV_FULL_OUTPUT : GCONV_EMPTY_INPUT);
500 result = GCONV_EMPTY_INPUT;
502 if (data->outbufavail > 0)
504 /* Call the functions below in the chain. */
505 size_t newavail = data->outbufavail;
507 result = (*fct) (next_step, next_data, data->outbuf, &newavail,
510 /* Correct the output buffer. */
511 if (newavail != data->outbufavail && newavail > 0)
513 memmove (data->outbuf,
514 &data->outbuf[data->outbufavail - newavail],
516 data->outbufavail = newavail;
520 while (*inbufsize > 0 && result == GCONV_EMPTY_INPUT);
523 if (written != NULL && data->is_last)