1 /* Mapping tables for JOHAB handling.
2 Copyright (C) 1998 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Jungshik Shin <jshin@pantheon.yale.edu>, 1998.
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public License as
8 published by the Free Software Foundation; either version 2 of the
9 License, or (at your option) any later version.
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details.
16 You should have received a copy of the GNU Library General Public
17 License along with the GNU C Library; see the file COPYING.LIB. If not,
18 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 Boston, MA 02111-1307, USA. */
28 /* Direction of the transformation. */
41 /* The table for Bit pattern to Hangul Jamo
42 5 bits each are used to encode
43 leading consonants(19 + 1 filler), medial vowels(21 + 1 filler)
44 and trailing consonants(27 + 1 filler).
46 KS C 5601-1992 Annex 3 Table 2
47 0 : Filler, -1: invalid, >= 1 : valid
52 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
53 19, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
57 -1, -1, 0, 1, 2, 3, 4, 5,
58 -1, -1, 6, 7, 8, 9, 10, 11,
59 -1, -1, 12, 13, 14, 15, 16, 17,
60 -1, -1, 18, 19, 20, 21, -1, -1
64 -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
65 -1, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, -1, -1
69 Hangul Jamo in Johab to Unicode 2.0 : Unicode 2.0
70 defines 51 Hangul Compatibility Jamos in the block [0x3131,0x3163]
72 It's to be considered later which Jamo block to use, Compatibility
73 block [0x3131,0x3163] or Hangul Conjoining Jamo block, [0x1100,0x11ff]
/* Code points in the Hangul Compatibility Jamo range for the 19 leading
   consonants, listed in ascending order from 0x3131 to 0x314e.  The
   decoder elsewhere in this file reads this table as `init_to_ucs[i - 1]'
   when a Johab code carries a leading consonant but filler vowel and
   filler final.  */
76 const wchar_t init_to_ucs[19] =
78 0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139, 0x3141, 0x3142,
79 0x3143, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314a, 0x314b,
80 0x314c, 0x314d, 0x314e
/* Code points in the Hangul Compatibility Jamo range for trailing
   consonants.  The decoder elsewhere in this file reads this table as
   `final_to_ucs[f - 1]' when a Johab code carries only a trailing
   consonant (filler leading consonant, filler vowel).

   Entries left as L'\0' hold no code point; the non-zero entries are
   0x3133, 0x3135, 0x3136, the consecutive run 0x313a .. 0x3140, and
   0x3144.  Entry 10 belongs to that consecutive run and is therefore
   0x313c; the former value 0x314c duplicated a leading-consonant code
   point that init_to_ucs already provides.  */
83 const wchar_t final_to_ucs[27] =
85 L'\0', L'\0', 0x3133, L'\0', 0x3135, 0x3136, L'\0', L'\0',
86 0x313a, 0x313b, 0x313c, 0x313d, 0x313e, 0x313f,
87 0x3140, L'\0', L'\0', 0x3144, L'\0', L'\0', L'\0',
88 L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'
91 /* The following three arrays are used to convert
92 precomposed Hangul syllables in [0xac00,0xd7a3]
93 to Jamo bit patterns for Johab encoding
95 cf. : KS C 5601-1992, Annex3 Table 2
97 Arrays are used to speed up things although it's possible
98 to get the same result arithmetically.
/* Johab bit pattern contributed by the leading consonant of a
   precomposed syllable.  Successive entries differ by 0x0400, so the
   consonant occupies the bits the decoder extracts with
   `(idx & 0x7c00) >> 10'; every entry also has 0x8000 set.
   NOTE(review): the array is declared with 19 elements but only 18
   initialisers are visible in this view; the last one is presumably on
   a line not shown here -- confirm.  */
101 const int init_to_bit[19] =
103 0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00,
104 0xa000, 0xa400, 0xa800, 0xac00, 0xb000, 0xb400,
105 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, 0xcc00,
/* Johab bit pattern contributed by the medial vowel of a precomposed
   syllable.  All entries are multiples of 0x20, i.e. they occupy the
   bits the decoder extracts with `(idx & 0x03e0) >> 5'.  The sequence
   is not contiguous: it covers the 5-bit values 3-7, 10-15, 18-23 and
   26-29, which are the positions holding 1 .. 21 in the decoder's
   vowel table.  */
109 const int mid_to_bit[21] =
111 0x0060, 0x0080, 0x00a0, 0x00c0, 0x00e0,
112 0x0140, 0x0160, 0x0180, 0x01a0, 0x01c0, 0x1e0,
113 0x0240, 0x0260, 0x0280, 0x02a0, 0x02c0, 0x02e0,
114 0x0340, 0x0360, 0x0380, 0x03a0
/* Johab bit pattern contributed by the trailing consonant of a
   precomposed syllable, indexed by `ch % 28'.  The values occupy the
   low five bits (the decoder extracts them with `idx & 0x001f').
   Index 0 yields 1, the position the decoder's trailing-consonant table
   maps to 0 (filler).  The value 0x12 is skipped; that position is
   marked -1 (invalid) in the decoder's table.  */
117 const int final_to_bit[28] =
119 1, 2, 3, 4, 5, 6, 7, 8, 9, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
120 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d
123 /* The conversion table from
124 UCS4 Hangul Compatibility Jamo in [0x3131,0x3163]
127 cf. 1. KS C 5601-1992 Annex 3 Table 2
128 2. Unicode 2.0 manual
/* Johab code for each UCS4 Hangul Compatibility Jamo; johab_from_ucs4
   reads it as `jamo_from_ucs_table[ch - 0x3131]' for ch in
   [0x3131,0x3163].

   Three shapes of value occur:
     - leading consonants: an init_to_bit pattern plus 0x41 (filler
       vowel, filler final), e.g. 0xc841, 0xcc41, 0xd041;
     - final-only consonants: 0x8440 plus the 5-bit final pattern,
       e.g. the run 0x844a .. 0x8450;
     - vowels: 0x8401 plus a mid_to_bit pattern, e.g. 0x8461.

   Two defects are corrected here.  The entries for 0x313e and 0x313f
   read 0x884e and 0x884f, which carry a leading-consonant field and so
   do not decode back to a final-only jamo; they are 0x844e and 0x844f,
   continuing the run.  The entry between 0xc841 and 0xd041 read 0xca41,
   which is not an init_to_bit pattern plus 0x41; it is 0xcc41.  */
131 const uint16_t jamo_from_ucs_table[51] =
137 0x9441, 0x9841, 0x9c41,
138 0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, 0x8450,
139 0xa041, 0xa441, 0xa841,
141 0xac41, 0xb041, 0xb441, 0xb841, 0xbc41,
142 0xc041, 0xc441, 0xc841, 0xcc41, 0xd041,
143 0x8461, 0x8481, 0x84a1, 0x84c1, 0x84e1,
144 0x8541, 0x8561, 0x8581, 0x85a1, 0x85c1, 0x85e1,
145 0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
146 0x8741, 0x8761, 0x8781, 0x87a1
/* Look up the UCS value for a two-byte Johab symbol or Hanja code.
   C1 and C2 are the first and second byte.  Each first byte selects a
   run of 188 table slots; within the run the second byte is rebased by
   0x31 when it is at most 0x90 and by 0x43 when it is above 0x90.
   The symbol table is based at first byte 0xd9, the Hanja table at
   first byte 0xe0.
   NOTE(review): the statement choosing between the two returns is not
   visible in this view; presumably it tests IDX -- confirm.  */
150 static inline wchar_t
151 johab_sym_hanja_to_ucs (int idx, int c1, int c2)
154 return (wchar_t) ksc5601_sym_to_ucs[(c1 - 0xd9) * 188 + c2
155 - (c2 > 0x90 ? 0x43 : 0x31)];
157 return (wchar_t) ksc5601_hanja_to_ucs[(c1 - 0xe0) * 188 + c2
158 - (c2 > 0x90 ? 0x43 : 0x31)];
/* Convert the UCS4 character CH to a Johab Hanja code.  The value IDX
   filled in by ucs4_to_ksc5601_hanja is split into its high and low
   byte; two consecutive rows share one Johab first byte (idx1 / 2),
   with codes based at 0xe000.  For even idx1 the second byte is shifted
   down by 0xa1 and then up by 0x31 or 0x43 depending on whether idx2
   exceeds 0xee; for odd idx1 it is used unchanged.
   NOTE(review): what happens on each outcome of the
   ucs4_to_ksc5601_hanja test is on lines not visible here.  */
162 johab_hanja_from_ucs (wchar_t ch)
166 if (ucs4_to_ksc5601_hanja (ch, &idx))
169 /* Hanja begins at the 42nd row. 42=0x2a : 0x2a + 0x20 = 0x4a. */
170 idx1 = idx / 256 - 0x4a;
171 idx2 = idx % 256 + 0x80;
173 return ((idx1 / 2) * 256 + 0xe000 + idx2
174 + (idx1 % 2 ? 0 : (idx2 > 0xee ? 0x43 : 0x31) - 0xa1));
/* Convert the UCS4 character CH to a Johab symbol code.  The value IDX
   filled in by ucs4_to_ksc5601_sym is split into its high byte (rebased
   by 0x21) and low byte (plus 0x80); two consecutive rows share one
   Johab first byte (idx1 / 2), with codes based at 0xd900.  The second
   byte is adjusted exactly as in johab_hanja_from_ucs.
   NOTE(review): what happens on each outcome of the
   ucs4_to_ksc5601_sym test is on lines not visible here.  */
181 johab_sym_from_ucs (wchar_t ch)
184 if (ucs4_to_ksc5601_sym (ch, &idx))
188 idx1 = idx / 256 - 0x21;
189 idx2 = idx % 256 + 0x80;
191 return ((idx1 / 2) * 256 + 0xd900 + idx2
192 + (idx1 % 2 ? 0 : (idx2 > 0xee ? 0x43 : 0x31) - 0xa1));
/* Convert the UCS4 character CH to Johab and store the result in the
   two bytes at CP.

   Dispatch by range:
     - 0xac00 .. 0xd7a3: the code is the sum of one entry each from
       init_to_bit, mid_to_bit and final_to_bit;
     - 0x3131 .. 0x3163: looked up in jamo_from_ucs_table;
     - 0x4e00 .. 0x9fa5 and 0xf900 .. 0xfa0b: johab_hanja_from_ucs;
     - johab_sym_from_ucs is called on a further branch whose condition
       is not visible here.
   A 16-bit result IDX is stored high byte first.  A separate path
   stores `0x7f & ch' followed by a zero byte.

   NOTE(review): the table indices `ch / 588', `(ch / 28) % 21' and
   `ch % 28' only fall inside the 19/21/28-element tables if CH has been
   rebased by 0xac00 first; that adjustment is presumably on a line not
   visible in this view -- confirm.  */
201 johab_from_ucs4 (wchar_t ch, unsigned char *cp)
207 if (ch >= 0xac00 && ch <= 0xd7a3)
210 idx = init_to_bit[ch / 588]; /* 21*28 = 588 */
211 idx += mid_to_bit[(ch / 28) % 21]; /* (ch % (21 * 28)) / 28 */
212 idx += final_to_bit[ch % 28]; /* (ch % (21 * 28)) % 28 */
214 /* KS C 5601-1992 Annex 3 regards 0xA4DA(Hangul Filler : U3164)
216 else if (ch >= 0x3131 && ch <= 0x3163)
217 idx = jamo_from_ucs_table[ch - 0x3131];
218 else if (ch >= 0x4e00 && ch <= 0x9fa5
219 || ch >= 0xf900 && ch <= 0xfa0b)
220 idx = johab_hanja_from_ucs (ch);
221 /* Half-width Korean Currency Won Sign
222 else if ( ch == 0x20a9 )
226 idx = johab_sym_from_ucs (ch);
228 *cp = (char) (idx / 256);
229 *(cp + 1) = (char) (idx & 0xff);
234 *cp = (char) (0x7f & ch);
235 *(cp + 1) = (char) 0;
/* Initialise the conversion step STEP.  The direction is chosen by a
   case-insensitive search for "JOHAB" in STEP->from_name and then in
   STEP->to_name; a further path sets the result to GCONV_NOCONV.  A
   struct johab_data is allocated with malloc and stored in STEP->data.
   NOTE(review): the assignments made in each branch and the handling of
   a failed allocation are on lines not visible here.  */
242 gconv_init (struct gconv_step *step)
244 /* Determine which direction. */
245 struct johab_data *new_data;
249 if (strcasestr (step->from_name, "JOHAB") != NULL)
251 else if (strcasestr (step->to_name, "JOHAB") != NULL)
256 result = GCONV_NOCONV;
259 = (struct johab_data *) malloc (sizeof (struct johab_data)))
263 step->data = new_data;
272 gconv_end (struct gconv_step *data)
279 gconv (struct gconv_step *step, struct gconv_step_data *data,
280 const char *inbuf, size_t *inbufsize, size_t * written, int do_flush)
282 struct gconv_step *next_step = step + 1;
283 struct gconv_step_data *next_data = data + 1;
284 gconv_fct fct = next_step->fct;
288 /* If the function is called with no input this means we have to reset
289 to the initial state. The possibly partly converted input is
295 /* Call the steps down the chain if there are any. */
300 struct gconv_step *next_step = step + 1;
301 struct gconv_step_data *next_data = data + 1;
303 result = (*fct) (next_step, next_data, NULL, 0, written, 1);
305 /* Clear output buffer. */
306 data->outbufavail = 0;
311 enum direction dir = ((struct johab_data *) step->data)->dir;
319 if (dir == from_johab)
321 size_t inchars = *inbufsize;
322 size_t outwchars = data->outbufavail;
323 char *outbuf = data->outbuf;
327 && (outwchars + sizeof (wchar_t) <= data->outbufsize))
329 int inchar = (unsigned char) inbuf[cnt];
331 /* half-width Korean Currency WON sign
334 else if (inchar < 0x7f)
335 ch = (wchar_t) inchar;
338 ch = (wchar_t) inchar;
342 2nd byte : 0x41-0x7e, 0x81-0xfe
344 1st byte : 0xd8-0xde, 0xe0-0xf9
345 2nd byte : 0x31-0x7e, 0x91-0xfe
346 0xd831-0xd87e and 0xd891-0xd8fe are user-defined area */
348 else if (inchar > 0xf9 || inchar == 0xdf
349 || (inchar > 0x7e && inchar < 0x84)
350 || (inchar > 0xd3 && inchar < 0xd9))
351 /* These are illegal. */
355 /* Two-byte character. First test whether the next
356 character is also available. */
360 if (cnt + 1 >= inchars)
362 /* The second character is not available. Store
363 the intermediate result. */
364 result = GCONV_INCOMPLETE_INPUT;
368 inchar2 = (unsigned char) inbuf[++cnt];
369 idx = inchar * 256 + inchar2;
373 i = init[(idx & 0x7c00) >> 10];
374 m = mid[(idx & 0x03e0) >> 5];
375 f = final[idx & 0x001f];
376 if (i == -1 || m == -1 || f == -1)
377 /* This is illegal. */
379 else if (i > 0 && m > 0)
380 ch = ((i - 1) * 21 + (m - 1)) * 28 + f + 0xac00;
381 else if (i > 0 && m == 0 & f == 0)
382 ch = init_to_ucs[i - 1];
383 else if (i == 0 && m > 0 & f == 0)
384 ch = 0x314e + m; /* 0x314f + m - 1 */
385 else if (i == 0 && m == 0 & f > 0)
386 ch = final_to_ucs[f - 1]; /* round trip?? */
388 /* This is illegal. */
394 || (inchar2 > 0x7e && inchar2 < 0x91)
396 /* This is illegal. */
398 else if (inchar == 0xda
399 && inchar2 > 0xa0 && inchar2 < 0xd4)
400 /* This is illegal. */
401 /* Modern Hangul Jaso is defined elsewhere
406 ch = johab_sym_hanja_to_ucs (idx, inchar,
408 /* if (idx <= 0xdefe)
409 ch = ksc5601_sym_to_ucs[(inchar - 0xd9) * 192
411 - (inchar2>0x90 ? 0x43 : 0x31)];
414 ch = ksc5601_hanja_to_ucs[(inchar - 0xe0) *192
416 - (inchar2>0x90 ? 0x43 : 0x31)];
425 if (ch == L'\0' && inbuf[cnt] != '\0')
427 /* This is an illegal character. */
428 result = GCONV_ILLEGAL_INPUT;
432 *((wchar_t *) (outbuf + outwchars)) = ch;
434 outwchars += sizeof (wchar_t);
438 data->outbufavail = outwchars;
442 size_t inwchars = *inbufsize;
443 size_t outchars = data->outbufavail;
444 char *outbuf = data->outbuf;
448 while (inwchars >= cnt + sizeof (wchar_t)
449 && outchars < data->outbufsize)
451 wchar_t ch = *((wchar_t *) (inbuf + cnt));
454 if (ch >= (sizeof (from_ucs4_lat1)
455 / sizeof (from_ucs4_lat1[0])))
457 if (ch >= 0x0391 && ch <= 0x0451)
458 cp = from_ucs4_greek[ch - 0x391];
459 else if (ch >= 0x2010 && ch <= 0x9fa0)
460 cp = from_ucs4_cjk[ch - 0x02010];
465 cp = from_ucs4_lat1[ch];
467 johab_from_ucs4 (ch, cp);
469 if (cp[0] == '\0' && ch != 0)
470 /* Illegal character. */
473 outbuf[outchars] = cp[0];
474 /* Now test for a possible second byte and write this
478 if (outchars + 1 >= data->outbufsize)
480 /* The result does not fit into the buffer. */
484 outbuf[++outchars] = cp[1];
489 cnt += sizeof (wchar_t);
492 data->outbufavail = outchars;
494 if (outchars + extra < data->outbufsize)
496 /* If there is still room in the output buffer something
497 is wrong with the input. */
498 if (inwchars >= cnt + sizeof (wchar_t))
500 /* An error occurred. */
501 result = GCONV_ILLEGAL_INPUT;
506 /* There are some unprocessed bytes at the end of the
508 result = GCONV_INCOMPLETE_INPUT;
514 if (result != GCONV_OK)
519 /* This is the last step. */
520 result = (*inbufsize > (dir == from_johab
521 ? 0 : sizeof (wchar_t) - 1)
522 ? GCONV_FULL_OUTPUT : GCONV_EMPTY_INPUT);
527 result = GCONV_EMPTY_INPUT;
529 if (data->outbufavail > 0)
531 /* Call the functions below in the chain. */
532 size_t newavail = data->outbufavail;
534 result = (*fct) (next_step, next_data, data->outbuf, &newavail,
537 /* Correct the output buffer. */
538 if (newavail != data->outbufavail && newavail > 0)
540 memmove (data->outbuf,
541 &data->outbuf[data->outbufavail - newavail],
543 data->outbufavail = newavail;
547 while (*inbufsize > 0 && result == GCONV_EMPTY_INPUT);
550 if (written != NULL && data->is_last)