1 /* Copyright (C) 1995-1999, 2000 Free Software Foundation, Inc.
2 This file is part of the GNU C Library.
3 Contributed by Ulrich Drepper <drepper@gnu.org>, 1995.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public License as
7 published by the Free Software Foundation; either version 2 of the
8 License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
15 You should have received a copy of the GNU Library General Public
16 License along with the GNU C Library; see the file COPYING.LIB. If not,
17 write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 Boston, MA 02111-1307, USA. */
37 #include "localeinfo.h"
39 #include "linereader.h"
40 #include "locfile-token.h"
42 #include "localedef.h"
/* Helper macros and types for the character class tables.  BITPOS maps a
   class token to its bit index relative to tok_upper, BIT and BITw wrap
   that index in _ISbit and _ISwbit respectively.  */
47 #ifdef PREDEFINED_CLASSES
48 /* These are the extra bits not in wctype.h since these are not preallocated
50 # define _ISwspecial1 (1 << 29)
51 # define _ISwspecial2 (1 << 30)
52 # define _ISwspecial3 (1 << 31)
56 /* The bit used for representing a special class. */
57 #define BITPOS(class) ((class) - tok_upper)
58 #define BIT(class) (_ISbit (BITPOS (class)))
59 #define BITw(class) (_ISwbit (BITPOS (class)))
/* ELEM expands to a dereferenced find_idx call on the table COLLECTION of
   CTYPE together with its _max and _act counters, so it can be used as an
   lvalue.  IDX is pasted after each member name; it is left empty by the
   uses visible in ctype_finish.  */
61 #define ELEM(ctype, collection, idx, value) \
62 *find_idx (ctype, &ctype->collection idx, &ctype->collection##_max idx, \
63 &ctype->collection##_act idx, value)
66 /* To be compatible with former implementations we for now restrict
67 the number of bits for character classes to 16. When compatibility
68 is not necessary anymore increase the number to 32. */
69 #define char_class_t uint16_t
70 #define char_class32_t uint32_t
73 /* Type to describe a transliteration action. We have a possibly
74 multiple character from-string and a set of multiple character
75 to-strings. All are 32bit values since this is what is used in
76 the gconv functions. */
81 struct translit_to_t *next;
91 struct translit_to_t *to;
93 struct translit_t *next;
/* Entry of the translit_ignore list.  ctype_finish orders the list by the
   from member and ctype_output writes the from, to and step members of
   every entry.  */
96 struct translit_ignore_t
105 struct translit_ignore_t *next;
109 /* Type to describe a transliteration include statement. */
110 struct translit_include_t
112 const char *copy_locale;
113 const char *copy_repertoire;
115 struct translit_include_t *next;
119 /* The real definition of the struct for the LC_CTYPE locale. */
120 struct locale_ctype_t
/* Allocated size and number of used entries of the charnames array; both
   are set up in ctype_startup. */
123 size_t charnames_max;
124 size_t charnames_act;
125 /* An index lookup table, to speedup find_idx. */
126 #define MAX_CHARNAMES_IDX 0x10000
127 uint32_t *charnames_idx;
/* Repertoire map, assigned from repertoire_read in ctype_finish. */
129 struct repertoire_t *repertoire;
131 /* We will allow up to 8 * sizeof (uint32_t) character classes. */
132 #define MAX_NR_CHARCLASS (8 * sizeof (uint32_t))
/* Names of the registered classes, appended by ctype_class_new. */
134 const char *classnames[MAX_NR_CHARCLASS];
135 uint32_t last_class_char;
/* Class bits for the 256 byte values (tested with _ISbit in ctype_finish)
   and class bits per entry of the growable table (tested with _ISwbit),
   followed by the allocated size and used count of the latter. */
136 uint32_t class256_collection[256];
137 uint32_t *class_collection;
138 size_t class_collection_max;
139 size_t class_collection_act;
141 uint32_t class_offset;
143 struct charseq **mbdigits;
/* Output digits in multibyte and wide character form.  ctype_finish
   substitutes a question mark for every entry that is missing. */
150 struct charseq *mboutdigits[10];
151 uint32_t wcoutdigits[10];
152 size_t outdigits_act;
154 /* If the following number ever turns out to be too small simply
155 increase it. But I doubt it will. --drepper@gnu */
156 #define MAX_NR_CHARMAP 16
/* Per-map name, table, allocated size and used count, indexed by map
   number.  map_collection_nr counts the maps registered through
   ctype_map_new. */
157 const char *mapnames[MAX_NR_CHARMAP];
158 uint32_t *map_collection[MAX_NR_CHARMAP];
159 uint32_t map256_collection[2][256];
160 size_t map_collection_max[MAX_NR_CHARMAP];
161 size_t map_collection_act[MAX_NR_CHARMAP];
162 size_t map_collection_nr;
164 int tomap_done[MAX_NR_CHARMAP];
167 /* Transliteration information. */
168 struct translit_include_t *translit_include;
169 struct translit_t *translit;
170 struct translit_ignore_t *translit_ignore;
/* Number of entries in the translit_ignore list, counted in ctype_finish. */
171 uint32_t ntranslit_ignore;
173 uint32_t *default_missing;
174 const char *default_missing_file;
175 size_t default_missing_lineno;
177 /* The arrays for the binary representation. */
178 char_class_t *ctype_b;
179 char_class32_t *ctype32_b;
183 struct iovec *class_3level;
184 struct iovec *map_3level;
185 uint32_t *class_name_ptr;
186 uint32_t *map_name_ptr;
/* Taken from the charmap in ctype_finish, which installs a placeholder
   name when the charmap does not specify one. */
189 const char *codeset_name;
190 uint32_t *translit_from_idx;
191 uint32_t *translit_from_tbl;
192 uint32_t *translit_to_idx;
193 uint32_t *translit_to_tbl;
194 uint32_t translit_idx_size;
195 size_t translit_from_tbl_size;
196 size_t translit_to_tbl_size;
/* Obstack initialised by obstack_init in ctype_startup. */
198 struct obstack mempool;
/* Allocation hooks expected by the obstack macros: chunks come from
   xmalloc and are released with free. */
202 #define obstack_chunk_alloc xmalloc
203 #define obstack_chunk_free free
206 /* Prototypes for local functions. */
207 static void ctype_startup (struct linereader *lr, struct localedef_t *locale,
208 struct charmap_t *charmap,
209 struct localedef_t *copy_locale,
211 static void ctype_class_new (struct linereader *lr,
212 struct locale_ctype_t *ctype, const char *name);
213 static void ctype_map_new (struct linereader *lr,
214 struct locale_ctype_t *ctype,
215 const char *name, struct charmap_t *charmap);
216 static uint32_t *find_idx (struct locale_ctype_t *ctype, uint32_t **table,
217 size_t *max, size_t *act, unsigned int idx);
218 static void set_class_defaults (struct locale_ctype_t *ctype,
219 struct charmap_t *charmap,
220 struct repertoire_t *repertoire);
221 static void allocate_arrays (struct locale_ctype_t *ctype,
222 struct charmap_t *charmap,
223 struct repertoire_t *repertoire);
/* Tables for the ten decimal digits: spelled-out symbolic names, names
   built from the code points U+0030 to U+0039, and the plain digit bytes.
   ctype_finish looks up longnames in the charmap and falls back to the
   bytes in digits when no input digits are defined. */
226 static const char *longnames[] =
228 "zero", "one", "two", "three", "four",
229 "five", "six", "seven", "eight", "nine"
231 static const char *uninames[] =
233 "U00000030", "U00000031", "U00000032", "U00000033", "U00000034",
234 "U00000035", "U00000036", "U00000037", "U00000038", "U00000039"
236 static const unsigned char digits[] = "0123456789";
/* Prepare the LC_CTYPE data of LOCALE.  Nothing is done when the content
   is to be ignored or the category already has data.  Without COPY_LOCALE
   a zeroed locale_ctype_t is allocated and initialised: the first 256
   character names are the identity, the index lookup table is marked
   empty, the standard classes are registered in bit order, the toupper
   and tolower maps are registered and their first 256 entries set to the
   identity, and the obstack is initialised.  With COPY_LOCALE the
   structure of that locale is shared instead of building a new one. */
240 ctype_startup (struct linereader *lr, struct localedef_t *locale,
241 struct charmap_t *charmap, struct localedef_t *copy_locale,
245 struct locale_ctype_t *ctype;
247 if (!ignore_content && locale->categories[LC_CTYPE].ctype == NULL)
249 if (copy_locale == NULL)
251 /* Allocate the needed room. */
252 locale->categories[LC_CTYPE].ctype = ctype =
253 (struct locale_ctype_t *) xcalloc (1,
254 sizeof (struct locale_ctype_t));
256 /* We have seen no names yet. */
257 ctype->charnames_max = charmap->mb_cur_max == 1 ? 256 : 512;
259 (unsigned int *) xmalloc (ctype->charnames_max
260 * sizeof (unsigned int));
261 for (cnt = 0; cnt < 256; ++cnt)
262 ctype->charnames[cnt] = cnt;
263 ctype->charnames_act = 256;
264 ctype->charnames_idx =
265 (uint32_t *) xmalloc (MAX_CHARNAMES_IDX * sizeof (uint32_t));
/* All-ones marks a slot of the index lookup table as unused. */
266 for (cnt = 0; cnt < MAX_CHARNAMES_IDX; ++cnt)
267 ctype->charnames_idx[cnt] = ~((uint32_t) 0);
269 /* Fill character class information. */
270 ctype->last_class_char = ILLEGAL_CHAR_VALUE;
271 /* The order of the following instructions determines the bit
273 ctype_class_new (lr, ctype, "upper");
274 ctype_class_new (lr, ctype, "lower");
275 ctype_class_new (lr, ctype, "alpha");
276 ctype_class_new (lr, ctype, "digit");
277 ctype_class_new (lr, ctype, "xdigit");
278 ctype_class_new (lr, ctype, "space");
279 ctype_class_new (lr, ctype, "print");
280 ctype_class_new (lr, ctype, "graph");
281 ctype_class_new (lr, ctype, "blank");
282 ctype_class_new (lr, ctype, "cntrl");
283 ctype_class_new (lr, ctype, "punct");
284 ctype_class_new (lr, ctype, "alnum");
285 #ifdef PREDEFINED_CLASSES
286 /* The following are extensions from ISO 14652. */
287 ctype_class_new (lr, ctype, "left_to_right");
288 ctype_class_new (lr, ctype, "right_to_left");
289 ctype_class_new (lr, ctype, "num_terminator");
290 ctype_class_new (lr, ctype, "num_separator");
291 ctype_class_new (lr, ctype, "segment_separator");
292 ctype_class_new (lr, ctype, "block_separator");
293 ctype_class_new (lr, ctype, "direction_control");
294 ctype_class_new (lr, ctype, "sym_swap_layout");
295 ctype_class_new (lr, ctype, "char_shape_selector");
296 ctype_class_new (lr, ctype, "num_shape_selector");
297 ctype_class_new (lr, ctype, "non_spacing");
298 ctype_class_new (lr, ctype, "non_spacing_level3");
299 ctype_class_new (lr, ctype, "normal_connect");
300 ctype_class_new (lr, ctype, "r_connect");
301 ctype_class_new (lr, ctype, "no_connect");
302 ctype_class_new (lr, ctype, "no_connect-space");
303 ctype_class_new (lr, ctype, "vowel_connect");
/* NOTE(review): the element size passed to xcalloc below is
   sizeof (unsigned long int) although class_collection holds uint32_t
   values; this looks like an over-allocation where long is wider than
   32 bits -- confirm before changing. */
306 ctype->class_collection_max = charmap->mb_cur_max == 1 ? 256 : 512;
307 ctype->class_collection
308 = (uint32_t *) xcalloc (sizeof (unsigned long int),
309 ctype->class_collection_max);
310 ctype->class_collection_act = 256;
312 /* Fill character map information. */
313 ctype->last_map_idx = MAX_NR_CHARMAP;
314 ctype_map_new (lr, ctype, "toupper", charmap);
315 ctype_map_new (lr, ctype, "tolower", charmap);
316 #ifdef PREDEFINED_CLASSES
317 ctype_map_new (lr, ctype, "tosymmetric", charmap);
320 /* Fill first 256 entries in `toXXX' arrays. */
321 for (cnt = 0; cnt < 256; ++cnt)
323 ctype->map_collection[0][cnt] = cnt;
324 ctype->map_collection[1][cnt] = cnt;
325 #ifdef PREDEFINED_CLASSES
326 ctype->map_collection[2][cnt] = cnt;
328 ctype->map256_collection[0][cnt] = cnt;
329 ctype->map256_collection[1][cnt] = cnt;
332 obstack_init (&ctype->mempool);
335 ctype = locale->categories[LC_CTYPE].ctype =
336 copy_locale->categories[LC_CTYPE].ctype;
/* Complete and validate the LC_CTYPE data of LOCALE against CHARMAP.
   The visible steps, in order:
   - resolve a copy directive transitively through find_locale, or report
     the missing category and create an empty definition via ctype_startup;
   - read the repertoire and take the codeset name from the charmap,
     installing a placeholder name if none is given;
   - call set_class_defaults, then check every entry of class_collection
     and class256_collection against valid_table;
   - check the class membership of the <SP> character in both tables and
     add it to the print class;
   - make sure every character covered by the WIDTH rules and every
     character of the charmap has an entry in the name array (find_idx is
     called only for this side effect);
   - trim the input digits to complete groups of ten, install defaults if
     none remain, and replace missing output digits by a question mark;
   - count the translit_ignore entries and order them by their from member.

   NOTE(review): in the multibyte digit group removal the memmove length
   is computed from wcdigits_act rather than mbdigits_act; the two are
   only asserted equal in the preceding multiple-of-ten branch -- confirm.
   NOTE(review): the branch that installs the default wide character
   digits ends by setting mbdigits_act to 10; wcdigits_act looks like the
   intended counter -- confirm. */
342 ctype_finish (struct localedef_t *locale, struct charmap_t *charmap)
344 /* See POSIX.2, table 2-6 for the meaning of the following table. */
349 const char allow[NCLASS];
351 valid_table[NCLASS] =
353 /* The order is important. See token.h for more information.
354 M = Always, D = Default, - = Permitted, X = Mutually exclusive */
355 { "upper", "--MX-XDDXXX-" },
356 { "lower", "--MX-XDDXXX-" },
357 { "alpha", "---X-XDDXXX-" },
358 { "digit", "XXX--XDDXXX-" },
359 { "xdigit", "-----XDDXXX-" },
360 { "space", "XXXXX------X" },
361 { "print", "---------X--" },
362 { "graph", "---------X--" },
363 { "blank", "XXXXXM-----X" },
364 { "cntrl", "XXXXX-XX--XX" },
365 { "punct", "XXXXX-DD-X-X" },
366 { "alnum", "-----XDDXXX-" }
370 uint32_t space_value;
371 struct charseq *space_seq;
372 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
379 /* Now resolve copying and also handle completely missing definitions. */
382 const char *repertoire_name;
384 /* First see whether we were supposed to copy. If yes, find the
385 actual definition. */
386 if (locale->copy_name[LC_CTYPE] != NULL)
388 /* Find the copying locale. This has to happen transitively since
389 the locale we are copying from might also be copying another one. */
390 struct localedef_t *from = locale;
393 from = find_locale (LC_CTYPE, from->copy_name[LC_CTYPE],
394 from->repertoire_name, charmap);
395 while (from->categories[LC_CTYPE].ctype == NULL
396 && from->copy_name[LC_CTYPE] != NULL);
398 ctype = locale->categories[LC_CTYPE].ctype
399 = from->categories[LC_CTYPE].ctype;
402 /* If there is still no definition issue a warning and create an
407 error (0, 0, _("No definition for %s category found"), "LC_CTYPE");
408 ctype_startup (NULL, locale, charmap, NULL, 0);
409 ctype = locale->categories[LC_CTYPE].ctype;
412 /* Get the repertoire we have to use. */
413 repertoire_name = locale->repertoire_name ?: repertoire_global;
414 if (repertoire_name != NULL)
415 ctype->repertoire = repertoire_read (repertoire_name);
418 /* We need the name of the currently used 8-bit character set to
419 make correct conversion between this 8-bit representation and the
420 ISO 10646 character set used internally for wide characters. */
421 ctype->codeset_name = charmap->code_set_name;
422 if (ctype->codeset_name == NULL)
425 error (0, 0, _("No character set name specified in charmap"));
426 ctype->codeset_name = "//UNKNOWN//";
429 /* Set default value for classes not specified. */
430 set_class_defaults (ctype, charmap, ctype->repertoire);
432 /* Check according to table. */
433 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
435 uint32_t tmp = ctype->class_collection[cnt];
439 for (cls1 = 0; cls1 < NCLASS; ++cls1)
440 if ((tmp & _ISwbit (cls1)) != 0)
441 for (cls2 = 0; cls2 < NCLASS; ++cls2)
442 if (valid_table[cls1].allow[cls2] != '-')
444 int eq = (tmp & _ISwbit (cls2)) != 0;
445 switch (valid_table[cls1].allow[cls2])
450 uint32_t value = ctype->charnames[cnt];
454 character L'\\u%0*x' in class `%s' must be in class `%s'"),
455 value > 0xffff ? 8 : 4, value,
456 valid_table[cls1].name,
457 valid_table[cls2].name);
464 uint32_t value = ctype->charnames[cnt];
468 character L'\\u%0*x' in class `%s' must not be in class `%s'"),
469 value > 0xffff ? 8 : 4, value,
470 valid_table[cls1].name,
471 valid_table[cls2].name);
476 ctype->class_collection[cnt] |= _ISwbit (cls2);
480 error (5, 0, _("internal error in %s, line %u"),
481 __FUNCTION__, __LINE__);
487 for (cnt = 0; cnt < 256; ++cnt)
489 uint32_t tmp = ctype->class256_collection[cnt];
493 for (cls1 = 0; cls1 < NCLASS; ++cls1)
494 if ((tmp & _ISbit (cls1)) != 0)
495 for (cls2 = 0; cls2 < NCLASS; ++cls2)
496 if (valid_table[cls1].allow[cls2] != '-')
498 int eq = (tmp & _ISbit (cls2)) != 0;
499 switch (valid_table[cls1].allow[cls2])
506 snprintf (buf, sizeof buf, "\\%Zo", cnt);
510 character '%s' in class `%s' must be in class `%s'"),
511 buf, valid_table[cls1].name,
512 valid_table[cls2].name);
521 snprintf (buf, sizeof buf, "\\%Zo", cnt);
525 character '%s' in class `%s' must not be in class `%s'"),
526 buf, valid_table[cls1].name,
527 valid_table[cls2].name);
532 ctype->class256_collection[cnt] |= _ISbit (cls2);
536 error (5, 0, _("internal error in %s, line %u"),
537 __FUNCTION__, __LINE__);
543 /* ... and now test <SP> as a special case. */
545 if (((cnt = BITPOS (tok_space),
546 (ELEM (ctype, class_collection, , space_value)
547 & BITw (tok_space)) == 0)
548 || (cnt = BITPOS (tok_blank),
549 (ELEM (ctype, class_collection, , space_value)
550 & BITw (tok_blank)) == 0)))
553 error (0, 0, _("<SP> character not in class `%s'"),
554 valid_table[cnt].name);
556 else if (((cnt = BITPOS (tok_punct),
557 (ELEM (ctype, class_collection, , space_value)
558 & BITw (tok_punct)) != 0)
559 || (cnt = BITPOS (tok_graph),
560 (ELEM (ctype, class_collection, , space_value)
565 error (0, 0, _("<SP> character must not be in class `%s'"),
566 valid_table[cnt].name);
569 ELEM (ctype, class_collection, , space_value) |= BITw (tok_print);
571 space_seq = charmap_find_value (charmap, "SP", 2);
572 if (space_seq == NULL)
573 space_seq = charmap_find_value (charmap, "space", 5);
574 if (space_seq == NULL)
575 space_seq = charmap_find_value (charmap, "U00000020", 9);
576 if (space_seq == NULL || space_seq->nbytes != 1)
579 error (0, 0, _("character <SP> not defined in character map"));
581 else if (((cnt = BITPOS (tok_space),
582 (ctype->class256_collection[space_seq->bytes[0]]
583 & BIT (tok_space)) == 0)
584 || (cnt = BITPOS (tok_blank),
585 (ctype->class256_collection[space_seq->bytes[0]]
586 & BIT (tok_blank)) == 0)))
589 error (0, 0, _("<SP> character not in class `%s'"),
590 valid_table[cnt].name);
592 else if (((cnt = BITPOS (tok_punct),
593 (ctype->class256_collection[space_seq->bytes[0]]
594 & BIT (tok_punct)) != 0)
595 || (cnt = BITPOS (tok_graph),
596 (ctype->class256_collection[space_seq->bytes[0]]
597 & BIT (tok_graph)) != 0)))
600 error (0, 0, _("<SP> character must not be in class `%s'"),
601 valid_table[cnt].name);
604 ctype->class256_collection[space_seq->bytes[0]] |= BIT (tok_print);
606 /* Now that the tests are done make sure the name array contains all
607 characters which are handled in the WIDTH section of the
608 character set definition file. */
609 if (charmap->width_rules != NULL)
610 for (cnt = 0; cnt < charmap->nwidth_rules; ++cnt)
612 unsigned char bytes[charmap->mb_cur_max];
613 int nbytes = charmap->width_rules[cnt].from->nbytes;
615 /* We have the range of character for which the width is
616 specified described using byte sequences of the multibyte
617 charset. We have to convert this to UCS4 now. And we
618 cannot simply convert the beginning and the end of the
619 sequence, we have to iterate over the byte sequence and
620 convert it for every single character. */
621 memcpy (bytes, charmap->width_rules[cnt].from->bytes, nbytes);
623 while (nbytes < charmap->width_rules[cnt].to->nbytes
624 || memcmp (bytes, charmap->width_rules[cnt].to->bytes,
627 /* Find the UCS value for `bytes'. */
630 struct charseq *seq = charmap_find_symbol (charmap, bytes, nbytes);
633 wch = ILLEGAL_CHAR_VALUE;
634 else if (seq->ucs4 != UNINITIALIZED_CHAR_VALUE)
637 wch = repertoire_find_value (ctype->repertoire, seq->name,
640 if (wch != ILLEGAL_CHAR_VALUE)
641 /* We are only interested in the side-effects of the
642 `find_idx' call. It will add appropriate entries in
643 the name array if this is necessary. */
644 (void) find_idx (ctype, NULL, NULL, NULL, wch);
646 /* "Increment" the bytes sequence. */
648 while (inner >= 0 && bytes[inner] == 0xff)
653 /* We have to extend the byte sequence. */
654 if (nbytes >= charmap->width_rules[cnt].to->nbytes)
658 memset (&bytes[1], 0, nbytes);
664 while (++inner < nbytes)
670 /* Now set all the other characters of the character set to the
673 while (iterate_table (&charmap->char_table, &curs, &key, &len, &vdata) == 0)
675 struct charseq *data = (struct charseq *) vdata;
677 if (data->ucs4 == UNINITIALIZED_CHAR_VALUE)
678 data->ucs4 = repertoire_find_value (ctype->repertoire,
681 if (data->ucs4 != ILLEGAL_CHAR_VALUE)
682 (void) find_idx (ctype, NULL, NULL, NULL, data->ucs4);
685 /* There must be a multiple of 10 digits. */
686 if (ctype->mbdigits_act % 10 != 0)
688 assert (ctype->mbdigits_act == ctype->wcdigits_act);
689 ctype->wcdigits_act -= ctype->mbdigits_act % 10;
690 ctype->mbdigits_act -= ctype->mbdigits_act % 10;
691 error (0, 0, _("`digit' category has not entries in groups of ten"));
694 /* Check the input digits. There must be a multiple of ten available.
695 In each group it could be that one or the other character is missing.
696 In this case the whole group must be removed. */
698 while (cnt < ctype->mbdigits_act)
701 for (inner = 0; inner < 10; ++inner)
702 if (ctype->mbdigits[cnt + inner] == NULL)
709 /* Remove the group. */
710 memmove (&ctype->mbdigits[cnt], &ctype->mbdigits[cnt + 10],
711 ((ctype->wcdigits_act - cnt - 10)
712 * sizeof (ctype->mbdigits[0])));
713 ctype->mbdigits_act -= 10;
717 /* If no input digits are given use the default. */
718 if (ctype->mbdigits_act == 0)
720 if (ctype->mbdigits_max == 0)
722 ctype->mbdigits = obstack_alloc (&charmap->mem_pool,
723 10 * sizeof (struct charseq *));
724 ctype->mbdigits_max = 10;
727 for (cnt = 0; cnt < 10; ++cnt)
729 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
731 if (ctype->mbdigits[cnt] == NULL)
733 ctype->mbdigits[cnt] = charmap_find_symbol (charmap,
735 strlen (longnames[cnt]));
736 if (ctype->mbdigits[cnt] == NULL)
738 /* Hum, this ain't good. */
740 no input digits defined and none of the standard names in the charmap"));
742 ctype->mbdigits[cnt] = obstack_alloc (&charmap->mem_pool,
743 sizeof (struct charseq) + 1);
745 /* This is better than nothing. */
746 ctype->mbdigits[cnt]->bytes[0] = digits[cnt];
747 ctype->mbdigits[cnt]->nbytes = 1;
752 ctype->mbdigits_act = 10;
755 /* Check the wide character input digits. There must be a multiple
756 of ten available. In each group it could be that one or the other
757 character is missing. In this case the whole group must be
760 while (cnt < ctype->wcdigits_act)
763 for (inner = 0; inner < 10; ++inner)
764 if (ctype->wcdigits[cnt + inner] == ILLEGAL_CHAR_VALUE)
771 /* Remove the group. */
772 memmove (&ctype->wcdigits[cnt], &ctype->wcdigits[cnt + 10],
773 ((ctype->wcdigits_act - cnt - 10)
774 * sizeof (ctype->wcdigits[0])));
775 ctype->wcdigits_act -= 10;
779 /* If no input digits are given use the default. */
780 if (ctype->wcdigits_act == 0)
782 if (ctype->wcdigits_max == 0)
784 ctype->wcdigits = obstack_alloc (&charmap->mem_pool,
785 10 * sizeof (uint32_t));
786 ctype->wcdigits_max = 10;
789 for (cnt = 0; cnt < 10; ++cnt)
790 ctype->wcdigits[cnt] = L'0' + cnt;
792 ctype->mbdigits_act = 10;
795 /* Check the outdigits. */
797 for (cnt = 0; cnt < 10; ++cnt)
798 if (ctype->mboutdigits[cnt] == NULL)
800 static struct charseq replace[2];
805 not all characters used in `outdigit' are available in the charmap"));
809 replace[0].nbytes = 1;
810 replace[0].bytes[0] = '?';
811 replace[0].bytes[1] = '\0';
812 ctype->mboutdigits[cnt] = &replace[0];
816 for (cnt = 0; cnt < 10; ++cnt)
817 if (ctype->wcoutdigits[cnt] == 0)
822 not all characters used in `outdigit' are available in the repertoire"));
826 ctype->wcoutdigits[cnt] = L'?';
829 /* Sort the entries in the translit_ignore list. */
830 if (ctype->translit_ignore != NULL)
832 struct translit_ignore_t *firstp = ctype->translit_ignore;
833 struct translit_ignore_t *runp;
835 ctype->ntranslit_ignore = 1;
837 for (runp = firstp->next; runp != NULL; runp = runp->next)
839 struct translit_ignore_t *lastp = NULL;
840 struct translit_ignore_t *cmpp;
842 ++ctype->ntranslit_ignore;
844 for (cmpp = firstp; cmpp != NULL; lastp = cmpp, cmpp = cmpp->next)
845 if (runp->from < cmpp->from)
853 ctype->translit_ignore = firstp;
/* Write the LC_CTYPE category of LOCALE below OUTPUT_PATH.
   allocate_arrays builds the binary tables first.  The data is then
   described by an iovec array: iov[0] is the locale_file header, iov[1]
   the index array idx, and the following entries hold the items in
   order.  For every item idx[elem + 1] is idx[elem] plus the number of
   bytes emitted for it; items that need 4-byte alignment first emit
   padding from nulbytes and advance idx[elem] by the same amount.  Items
   past _NL_CTYPE_EXTRA_MAP_1 are the per-class and per-map tables.  The
   finished vector is handed to write_locale_data. */
859 ctype_output (struct localedef_t *locale, struct charmap_t *charmap,
860 const char *output_path)
862 static const char nulbytes[4] = { 0, 0, 0, 0 };
863 struct locale_ctype_t *ctype = locale->categories[LC_CTYPE].ctype;
864 const size_t nelems = (_NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1)
865 + ctype->nr_charclass + ctype->map_collection_nr);
866 struct iovec iov[2 + nelems + 2 * ctype->nr_charclass
867 + ctype->map_collection_nr + 4];
868 struct locale_file data;
869 uint32_t idx[nelems + 1];
870 uint32_t default_missing_len;
871 size_t elem, cnt, offset, total;
874 /* Now prepare the output: Find the sizes of the table we can use. */
875 allocate_arrays (ctype, charmap, ctype->repertoire);
877 data.magic = LIMAGIC (LC_CTYPE);
879 iov[0].iov_base = (void *) &data;
880 iov[0].iov_len = sizeof (data);
882 iov[1].iov_base = (void *) idx;
883 iov[1].iov_len = nelems * sizeof (uint32_t);
885 idx[0] = iov[0].iov_len + iov[1].iov_len;
888 for (elem = 0; elem < nelems; ++elem)
890 if (elem < _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1))
893 #define CTYPE_EMPTY(name) \
895 iov[2 + elem + offset].iov_base = NULL; \
896 iov[2 + elem + offset].iov_len = 0; \
897 idx[elem + 1] = idx[elem]; \
900 CTYPE_EMPTY(_NL_CTYPE_GAP1);
901 CTYPE_EMPTY(_NL_CTYPE_GAP2);
902 CTYPE_EMPTY(_NL_CTYPE_GAP3);
903 CTYPE_EMPTY(_NL_CTYPE_GAP4);
904 CTYPE_EMPTY(_NL_CTYPE_GAP5);
905 CTYPE_EMPTY(_NL_CTYPE_GAP6);
907 #define CTYPE_DATA(name, base, len) \
908 case _NL_ITEM_INDEX (name): \
909 iov[2 + elem + offset].iov_base = (base); \
910 iov[2 + elem + offset].iov_len = (len); \
911 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len; \
914 CTYPE_DATA (_NL_CTYPE_CLASS,
916 (256 + 128) * sizeof (char_class_t));
918 CTYPE_DATA (_NL_CTYPE_TOUPPER,
920 (256 + 128) * sizeof (uint32_t));
921 CTYPE_DATA (_NL_CTYPE_TOLOWER,
923 (256 + 128) * sizeof (uint32_t));
925 CTYPE_DATA (_NL_CTYPE_TOUPPER32,
927 256 * sizeof (uint32_t));
928 CTYPE_DATA (_NL_CTYPE_TOLOWER32,
930 256 * sizeof (uint32_t));
932 CTYPE_DATA (_NL_CTYPE_CLASS32,
934 256 * sizeof (char_class32_t));
936 CTYPE_DATA (_NL_CTYPE_CLASS_OFFSET,
937 &ctype->class_offset, sizeof (uint32_t));
939 CTYPE_DATA (_NL_CTYPE_MAP_OFFSET,
940 &ctype->map_offset, sizeof (uint32_t));
942 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TAB_SIZE,
943 &ctype->translit_idx_size, sizeof (uint32_t));
945 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_IDX,
946 ctype->translit_from_idx,
947 ctype->translit_idx_size * sizeof (uint32_t));
949 CTYPE_DATA (_NL_CTYPE_TRANSLIT_FROM_TBL,
950 ctype->translit_from_tbl,
951 ctype->translit_from_tbl_size);
953 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_IDX,
954 ctype->translit_to_idx,
955 ctype->translit_idx_size * sizeof (uint32_t));
957 CTYPE_DATA (_NL_CTYPE_TRANSLIT_TO_TBL,
958 ctype->translit_to_tbl, ctype->translit_to_tbl_size);
960 case _NL_ITEM_INDEX (_NL_CTYPE_CLASS_NAMES):
961 /* The class name array. */
963 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt, ++offset)
965 iov[2 + elem + offset].iov_base
966 = (void *) ctype->classnames[cnt];
967 iov[2 + elem + offset].iov_len
968 = strlen (ctype->classnames[cnt]) + 1;
969 total += iov[2 + elem + offset].iov_len;
971 iov[2 + elem + offset].iov_base = (void *) nulbytes;
972 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
973 total += 1 + (4 - ((total + 1) % 4));
975 idx[elem + 1] = idx[elem] + total;
978 case _NL_ITEM_INDEX (_NL_CTYPE_MAP_NAMES):
979 /* The map name array. */
981 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt, ++offset)
983 iov[2 + elem + offset].iov_base
984 = (void *) ctype->mapnames[cnt];
985 iov[2 + elem + offset].iov_len
986 = strlen (ctype->mapnames[cnt]) + 1;
987 total += iov[2 + elem + offset].iov_len;
989 iov[2 + elem + offset].iov_base = (void *) nulbytes;
990 iov[2 + elem + offset].iov_len = 1 + (4 - ((total + 1) % 4));
991 total += 1 + (4 - ((total + 1) % 4));
993 idx[elem + 1] = idx[elem] + total;
996 CTYPE_DATA (_NL_CTYPE_WIDTH,
997 ctype->width.iov_base,
998 ctype->width.iov_len);
1000 CTYPE_DATA (_NL_CTYPE_MB_CUR_MAX,
1001 &ctype->mb_cur_max, sizeof (uint32_t));
1003 case _NL_ITEM_INDEX (_NL_CTYPE_CODESET_NAME):
1004 total = strlen (ctype->codeset_name) + 1;
1006 iov[2 + elem + offset].iov_base = (char *) ctype->codeset_name;
1009 iov[2 + elem + offset].iov_base = alloca ((total + 3) & ~3);
1010 memset (mempcpy (iov[2 + elem + offset].iov_base,
1011 ctype->codeset_name, total),
1012 '\0', 4 - (total & 3));
1013 total = (total + 3) & ~3;
1015 iov[2 + elem + offset].iov_len = total;
1016 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1019 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_MB_LEN):
1020 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
1021 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
1022 *(uint32_t *) iov[2 + elem + offset].iov_base =
1023 ctype->mbdigits_act / 10;
1024 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
1027 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS_WC_LEN):
1028 /* Align entries. */
1029 iov[2 + elem + offset].iov_base = (void *) nulbytes;
1030 iov[2 + elem + offset].iov_len = (4 - idx[elem] % 4) % 4;
1031 idx[elem] += iov[2 + elem + offset].iov_len;
1034 iov[2 + elem + offset].iov_base = alloca (sizeof (uint32_t));
1035 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
1036 *(uint32_t *) iov[2 + elem + offset].iov_base =
1037 ctype->wcdigits_act / 10;
1038 idx[elem + 1] = idx[elem] + sizeof (uint32_t);
1041 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_MB):
1042 /* Compute the length of all possible characters. For INDIGITS
1043 there might be more than one. We simply concatenate all of
1044 them with a NUL byte following. The NUL byte wouldn't be
1045 necessary but it makes it easier for the user. */
1048 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB);
1049 cnt < ctype->mbdigits_act; cnt += 10)
1050 total += ctype->mbdigits[cnt]->nbytes + 1;
1051 iov[2 + elem + offset].iov_base = (char *) alloca (total);
1052 iov[2 + elem + offset].iov_len = total;
1054 cp = iov[2 + elem + offset].iov_base;
1055 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_MB);
1056 cnt < ctype->mbdigits_act; cnt += 10)
1058 cp = mempcpy (cp, ctype->mbdigits[cnt]->bytes,
1059 ctype->mbdigits[cnt]->nbytes);
1062 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1065 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_MB):
1066 /* Each OUTDIGIT item holds exactly one multibyte sequence, unlike
1067 INDIGITS where there might be more than one. The sequence is copied
1068 and a NUL byte appended. The NUL byte wouldn't be
1069 necessary but it makes it easier for the user. */
1070 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_MB);
1071 total = ctype->mboutdigits[cnt]->nbytes + 1;
1072 iov[2 + elem + offset].iov_base = (char *) alloca (total);
1073 iov[2 + elem + offset].iov_len = total;
1075 *(char *) mempcpy (iov[2 + elem + offset].iov_base,
1076 ctype->mboutdigits[cnt]->bytes,
1077 ctype->mboutdigits[cnt]->nbytes) = '\0';
1078 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1081 case _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS9_WC):
1082 total = ctype->wcdigits_act / 10;
1084 iov[2 + elem + offset].iov_base =
1085 (uint32_t *) alloca (total * sizeof (uint32_t));
1086 iov[2 + elem + offset].iov_len = total * sizeof (uint32_t);
1088 for (cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_INDIGITS0_WC);
1089 cnt < ctype->wcdigits_act; cnt += 10)
1090 ((uint32_t *) iov[2 + elem + offset].iov_base)[cnt / 10]
1091 = ctype->wcdigits[cnt];
1092 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1095 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC):
1096 /* Align entries. */
1097 iov[2 + elem + offset].iov_base = (void *) nulbytes;
1098 iov[2 + elem + offset].iov_len = (4 - idx[elem] % 4) % 4;
1099 idx[elem] += iov[2 + elem + offset].iov_len;
1103 case _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT1_WC) ... _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT9_WC):
1104 cnt = elem - _NL_ITEM_INDEX (_NL_CTYPE_OUTDIGIT0_WC);
1105 iov[2 + elem + offset].iov_base = &ctype->wcoutdigits[cnt];
1106 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
1107 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1110 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN):
1111 /* Align entries. */
1112 iov[2 + elem + offset].iov_base = (void *) nulbytes;
1113 iov[2 + elem + offset].iov_len = (4 - idx[elem] % 4) % 4;
1114 idx[elem] += iov[2 + elem + offset].iov_len;
1117 default_missing_len = (ctype->default_missing
1118 ? wcslen ((wchar_t *)ctype->default_missing)
1120 iov[2 + elem + offset].iov_base = &default_missing_len;
1121 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
1122 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1125 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_DEFAULT_MISSING):
1126 iov[2 + elem + offset].iov_base =
1127 ctype->default_missing ?: (uint32_t *) L"";
/* NOTE(review): iov_len below is the character count returned by wcslen
   while iov_base points to uint32_t data and every other item uses a byte
   count -- confirm whether a factor of sizeof (uint32_t) is missing. */
1128 iov[2 + elem + offset].iov_len =
1129 wcslen (iov[2 + elem + offset].iov_base);
1130 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1133 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE_LEN):
1134 /* Align entries. */
1135 iov[2 + elem + offset].iov_base = (void *) nulbytes;
1136 iov[2 + elem + offset].iov_len = (4 - idx[elem] % 4) % 4;
1137 idx[elem] += iov[2 + elem + offset].iov_len;
1140 iov[2 + elem + offset].iov_base = &ctype->ntranslit_ignore;
1141 iov[2 + elem + offset].iov_len = sizeof (uint32_t);
1142 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1145 case _NL_ITEM_INDEX(_NL_CTYPE_TRANSLIT_IGNORE):
1147 uint32_t *ranges = (uint32_t *) alloca (ctype->ntranslit_ignore
1148 * 3 * sizeof (uint32_t));
1149 struct translit_ignore_t *runp;
1151 iov[2 + elem + offset].iov_base = ranges;
1152 iov[2 + elem + offset].iov_len = (ctype->ntranslit_ignore
1153 * 3 * sizeof (uint32_t));
1155 for (runp = ctype->translit_ignore; runp != NULL;
1158 *ranges++ = runp->from;
1159 *ranges++ = runp->to;
1160 *ranges++ = runp->step;
1163 /* Remove the following line in case a new entry is added
1164 after _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN. */
1166 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1170 assert (! "unknown CTYPE element");
1174 /* Handle extra maps. */
1175 size_t nr = elem - _NL_ITEM_INDEX (_NL_CTYPE_EXTRA_MAP_1);
1176 if (nr < ctype->nr_charclass)
1178 iov[2 + elem + offset].iov_base = ctype->class_b[nr];
1179 iov[2 + elem + offset].iov_len = 256 / 32 * sizeof (uint32_t);
1180 idx[elem] += iov[2 + elem + offset].iov_len;
1183 iov[2 + elem + offset] = ctype->class_3level[nr];
1187 nr -= ctype->nr_charclass;
1188 assert (nr < ctype->map_collection_nr);
1189 iov[2 + elem + offset] = ctype->map_3level[nr];
1191 idx[elem + 1] = idx[elem] + iov[2 + elem + offset].iov_len;
1195 assert (2 + elem + offset == (nelems + 2 * ctype->nr_charclass
1196 + ctype->map_collection_nr + 4 + 2));
1198 write_locale_data (output_path, "LC_CTYPE", 2 + elem + offset, iov);
1202 /* Local functions. */
/* Register NAME as an additional character class in CTYPE.

   The already known class names are compared against NAME first; a
   duplicate is reported through lr_error using LR for the position.
   Having MAX_NR_CHARCLASS classes already is treated as an
   implementation limit.  Otherwise the pointer NAME itself (no copy is
   made here) is stored in the next free classnames slot and
   nr_charclass is incremented.  */
1204 ctype_class_new (struct linereader *lr, struct locale_ctype_t *ctype,
/* Search the existing class names for NAME.  */
1209 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
1210 if (strcmp (ctype->classnames[cnt], name) == 0)
/* CNT below nr_charclass is taken to mean that NAME was found above.  */
1213 if (cnt < ctype->nr_charclass)
1215 lr_error (lr, _("character class `%s' already defined"), name);
/* Refuse to grow beyond the fixed number of class slots.  */
1219 if (ctype->nr_charclass == MAX_NR_CHARCLASS)
1220 /* Exit code 2 is prescribed in P1003.2b. */
1222 implementation limit: no more than %Zd character classes allowed"),
1225 ctype->classnames[ctype->nr_charclass++] = name;
/* Register NAME as an additional character map in CTYPE.

   The existing map names are searched for NAME; while scanning, the
   largest map_collection_max seen is remembered in MAX_CHARS.  A
   duplicate name is reported through lr_error using LR, and having
   MAX_NR_CHARMAP maps already is treated as an implementation limit.
   Otherwise slot CNT receives the pointer NAME (no copy is made here), a
   zero-filled table obtained from xcalloc, and an initial
   map_collection_act of 256; map_collection_nr is then incremented.

   The capacity of the new table is set either from CHARMAP->mb_cur_max
   (256 entries when it is 1, 512 otherwise) or from MAX_CHARS.
   NOTE(review): the condition selecting between these two assignments
   is not visible here -- presumably MAX_CHARS is used when it is
   nonzero; confirm.  */
1230 ctype_map_new (struct linereader *lr, struct locale_ctype_t *ctype,
1231 const char *name, struct charmap_t *charmap)
1233 size_t max_chars = 0;
/* Look for NAME among the known maps and track the largest capacity.  */
1236 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
1238 if (strcmp (ctype->mapnames[cnt], name) == 0)
1241 if (max_chars < ctype->map_collection_max[cnt])
1242 max_chars = ctype->map_collection_max[cnt];
/* CNT below map_collection_nr is taken to mean that NAME was found.  */
1245 if (cnt < ctype->map_collection_nr)
1247 lr_error (lr, _("character map `%s' already defined"), name);
/* Refuse to grow beyond the fixed number of map slots.  */
1251 if (ctype->map_collection_nr == MAX_NR_CHARMAP)
1252 /* Exit code 2 is prescribed in P1003.2b. */
1254 implementation limit: no more than %d character maps allowed"),
1257 ctype->mapnames[cnt] = name;
1260 ctype->map_collection_max[cnt] = charmap->mb_cur_max == 1 ? 256 : 512;
1262 ctype->map_collection_max[cnt] = max_chars;
1264 ctype->map_collection[cnt] = (uint32_t *)
1265 xcalloc (sizeof (uint32_t), ctype->map_collection_max[cnt]);
1266 ctype->map_collection_act[cnt] = 256;
1268 ++ctype->map_collection_nr;
/* Return the address of the entry belonging to character IDX in *TABLE.

   CTYPE->charnames records which character owns each table position
   from 256 upwards; CTYPE->charnames_idx caches that position for every
   IDX below MAX_CHARNAMES_IDX, with all bits set meaning that no
   position is recorded yet.  An IDX that is not yet recorded is appended
   to charnames (the array is doubled when full).  When *TABLE is
   enlarged, the newly added part is cleared.  A null pointer is returned
   when the caller does not want the table extended and the position
   lies at or beyond *ACT.

   NOTE(review): the first return below looks like the direct-index path
   for small IDX (the search loop starts at position 256) -- confirm the
   guarding condition.  */
1272 /* We have to be prepared that TABLE, MAX, and ACT can be NULL. This
1273 is possible if we only want to extend the name array. */
1275 find_idx (struct locale_ctype_t *ctype, uint32_t **table, size_t *max,
1276 size_t *act, uint32_t idx)
1281 return table == NULL ? NULL : &(*table)[idx];
1283 /* If idx is in the usual range, use the charnames_idx lookup table
1284 instead of the slow search loop. */
1285 if (idx < MAX_CHARNAMES_IDX)
1287 if (ctype->charnames_idx[idx] != ~((uint32_t) 0))
1289 cnt = ctype->charnames_idx[idx];
/* Setting CNT to charnames_act makes the not-found test further down
   succeed.  */
1292 cnt = ctype->charnames_act;
/* Linear search of the name array, skipping the first 256 positions.  */
1296 for (cnt = 256; cnt < ctype->charnames_act; ++cnt)
1297 if (ctype->charnames[cnt] == idx)
1301 /* We have to distinguish two cases: the name is found or not. */
1302 if (cnt == ctype->charnames_act)
1304 /* Extend the name array. */
1305 if (ctype->charnames_act == ctype->charnames_max)
1307 ctype->charnames_max *= 2;
1308 ctype->charnames = (uint32_t *)
1309 xrealloc (ctype->charnames,
1310 sizeof (uint32_t) * ctype->charnames_max);
/* Record IDX at position CNT and remember that position in the cache.  */
1312 ctype->charnames[ctype->charnames_act++] = idx;
1313 if (idx < MAX_CHARNAMES_IDX)
1314 ctype->charnames_idx[idx] = cnt;
1318 /* We have done everything we are asked to do. */
1322 /* The caller does not want to extend the table. */
1323 return (cnt >= *act ? NULL : &(*table)[cnt]);
/* Grow *TABLE until position CNT fits and clear the added entries.  */
1329 size_t old_max = *max;
1332 while (*max <= cnt);
1335 (uint32_t *) xrealloc (*table, *max * sizeof (uint32_t));
1336 memset (&(*table)[old_max], '\0',
1337 (*max - old_max) * sizeof (uint32_t));
1343 return &(*table)[cnt];
/* Determine both representations of the character named by token NOW:
   *SEQP receives the charmap entry (byte sequence) and *WCHP the UCS4
   value.

   tok_bsymbol: the symbolic name is looked up in REPERTOIRE for the wide
   value and in CHARMAP for the byte sequence.
   tok_ucs4: the wide value is NOW->val.ucs4; the byte sequence is
   searched in CHARMAP under the name U%08X and through REPERTOIRE.
   tok_charcode: the byte code is mapped to its CHARMAP entry, whose
   ucs4 member is filled in from REPERTOIRE while it still holds
   UNINITIALIZED_CHAR_VALUE.

   NOTE(review): the return statements are not visible here; ctype_read
   tests the result as a truth value, presumably nonzero for a token
   that is none of the above -- confirm.  */
1348 get_character (struct token *now, struct charmap_t *charmap,
1349 struct repertoire_t *repertoire,
1350 struct charseq **seqp, uint32_t *wchp)
1352 if (now->tok == tok_bsymbol)
1354 /* This will hopefully be the normal case. */
1355 *wchp = repertoire_find_value (repertoire, now->val.str.startmb,
1356 now->val.str.lenmb);
1357 *seqp = charmap_find_value (charmap, now->val.str.startmb,
1358 now->val.str.lenmb);
1360 else if (now->tok == tok_ucs4)
/* First try the charmap entry named after the UCS4 value: the letter U
   followed by eight hexadecimal digits, nine characters in total.  */
1364 snprintf (utmp, sizeof (utmp), "U%08X", now->val.ucs4);
1365 *seqp = charmap_find_value (charmap, utmp, 9);
/* NOTE(review): presumably only reached when the lookup above found
   nothing -- confirm.  */
1368 *seqp = repertoire_find_seq (repertoire, now->val.ucs4);
1372 /* Compute the value in the charmap from the UCS value. */
1373 const char *symbol = repertoire_find_symbol (repertoire,
1379 *seqp = charmap_find_value (charmap, symbol, strlen (symbol));
1383 if (repertoire != NULL)
1385 /* Insert a negative entry. */
1386 static const struct charseq negative
1387 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1388 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1390 *newp = now->val.ucs4;
1392 insert_entry (&repertoire->seq_table, newp,
1393 sizeof (uint32_t), (void *) &negative);
1397 (*seqp)->ucs4 = now->val.ucs4;
1399 else if ((*seqp)->ucs4 != now->val.ucs4)
/* For this token kind the wide value is the token value itself.  */
1402 *wchp = now->val.ucs4;
1404 else if (now->tok == tok_charcode)
1406 /* We must map from the byte code to UCS4. */
1407 *seqp = charmap_find_symbol (charmap, now->val.str.startmb,
1408 now->val.str.lenmb);
1411 *wchp = ILLEGAL_CHAR_VALUE;
/* Fill in the UCS4 value of the charmap entry if that was not done yet.  */
1414 if ((*seqp)->ucs4 == UNINITIALIZED_CHAR_VALUE)
1415 (*seqp)->ucs4 = repertoire_find_value (repertoire, (*seqp)->name,
1416 strlen ((*seqp)->name));
1417 *wchp = (*seqp)->ucs4;
1427 /* Ellipsis like in `<foo123>..<foo12a>' or `<j1234>....<j1245>' and
1428 the .(2). counterparts. */
/* NOW holds the symbolic name ending the range and LAST_STR the name
   that started it.  Both names must have the same length and share a
   prefix that is followed by a number in base BASE.  For every value
   reached from the start in increments of STEP, up to and including the
   end value, the corresponding name is built in TMP and resolved with
   get_character; the start name itself is not processed again here.
   CLASS256_BIT is ORed into the class256_collection entry of characters
   with a one-byte sequence, CLASS_BIT (if nonzero) into the
   class_collection entry of characters with a known UCS4 value.
   HANDLE_DIGITS == 1 also appends the character to the
   mbdigits/wcdigits arrays, HANDLE_DIGITS == 2 to the mboutdigits and
   wcoutdigits arrays, which take at most ten entries.  The generation
   loop is only entered when IGNORE_CONTENT is zero.

   NOTE(review): strtoul reports overflow by returning ULONG_MAX, so the
   comparisons against UINT_MAX below only detect it where the two
   limits coincide; also no reset of errno is visible before the calls.
   NOTE(review): the number passed to sprintf is presumably FROM, which
   is unsigned long, while the base-10 format uses %ld -- confirm.  */
1430 charclass_symbolic_ellipsis (struct linereader *ldfile,
1431 struct locale_ctype_t *ctype,
1432 struct charmap_t *charmap,
1433 struct repertoire_t *repertoire,
1435 const char *last_str,
1436 unsigned long int class256_bit,
1437 unsigned long int class_bit, int base,
1438 int ignore_content, int handle_digits, int step)
1440 const char *nowstr = now->val.str.startmb;
1441 char tmp[now->val.str.lenmb + 1];
1444 unsigned long int from;
1445 unsigned long int to;
1447 /* We have to compute the ellipsis values using the symbolic names. */
1448 assert (last_str != NULL);
1450 if (strlen (last_str) != now->val.str.lenmb)
1454 _("`%s' and `%.*s' are no valid names for symbolic range"),
1455 last_str, (int) now->val.str.lenmb, nowstr);
1459 if (memcmp (last_str, nowstr, now->val.str.lenmb) == 0)
1460 /* Nothing to do, the names are the same. */
/* Skip the prefix both names have in common; CP ends up at the first
   differing character of LAST_STR.  */
1463 for (cp = last_str; *cp == *(nowstr + (cp - last_str)); ++cp)
/* The rest of each name must be a complete number in base BASE.  */
1467 from = strtoul (cp, &endp, base);
1468 if ((from == UINT_MAX && errno == ERANGE) || *endp != '\0')
1471 to = strtoul (nowstr + (cp - last_str), &endp, base);
1472 if ((to == UINT_MAX && errno == ERANGE)
1473 || (endp - nowstr) != now->val.str.lenmb || from >= to)
1476 /* OK, we have a range FROM - TO. Now we can create the symbolic names. */
1477 if (!ignore_content)
/* From here on the token refers to the local buffer TMP, so each
   generated name can be resolved like a token read from the input.  */
1479 now->val.str.startmb = tmp;
1480 while ((from += step) <= to)
1482 struct charseq *seq;
/* Rebuild the name: common prefix plus the number, zero-padded to the
   width of the numeric part of the original name.  */
1485 sprintf (tmp, (base == 10 ? "%.*s%0*ld" : "%.*s%0*lX"),
1486 (int) (cp - last_str), last_str,
1487 (int) (now->val.str.lenmb - (cp - last_str)),
1490 get_character (now, charmap, repertoire, &seq, &wch);
1492 if (seq != NULL && seq->nbytes == 1)
1493 /* Yep, we can store information about this byte sequence. */
1494 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
1496 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1497 /* We have the UCS4 position. */
1498 *find_idx (ctype, &ctype->class_collection,
1499 &ctype->class_collection_max,
1500 &ctype->class_collection_act, wch) |= class_bit;
1502 if (handle_digits == 1)
1504 /* We must store the digit values. */
1505 if (ctype->mbdigits_act == ctype->mbdigits_max)
/* Both digit arrays are doubled together.  */
1507 ctype->mbdigits_max *= 2;
1508 ctype->mbdigits = xrealloc (ctype->mbdigits,
1509 (ctype->mbdigits_max
1510 * sizeof (char *)));
1511 ctype->wcdigits_max *= 2;
1512 ctype->wcdigits = xrealloc (ctype->wcdigits,
1513 (ctype->wcdigits_max
1514 * sizeof (uint32_t)));
1517 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1518 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1520 else if (handle_digits == 2)
1522 /* We must store the digit values. */
1523 if (ctype->outdigits_act >= 10)
1525 lr_error (ldfile, _("\
1526 %s: field `%s' does not contain exactly ten entries"),
1527 "LC_CTYPE", "outdigit");
1531 ctype->mboutdigits[ctype->outdigits_act] = seq;
1532 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1533 ++ctype->outdigits_act;
1540 /* Ellipsis like in `<U1234>..<U2345>' or `<U1234>..(2)..<U2345>'. */
/* LAST_WCH is the UCS4 value that started the range and NOW holds the
   token ending it.  A range whose end is smaller than its start is
   reported through lr_error.  When IGNORE_CONTENT is zero, every value
   reached from the start in increments of STEP, up to and including the
   end value, is processed; the start value itself is not handled again
   here.  For each value a byte sequence is searched in CHARMAP under
   the names U%08X and U%04X and then through REPERTOIRE; a value
   without a sequence is represented by a static entry whose ucs4 member
   is ILLEGAL_CHAR_VALUE.  CLASS_BIT is ORed into the class_collection
   entry of the value, and a one-byte sequence belonging to the value is
   also recorded in class256_collection.  HANDLE_DIGITS == 1 appends the
   value to the mbdigits/wcdigits arrays, HANDLE_DIGITS == 2 to the
   mboutdigits and wcoutdigits arrays, which take at most ten entries.

   NOTE(review): the conditions chaining the individual lookups are not
   visible here; presumably each later lookup is only tried when the
   previous one found nothing -- confirm.  */
1542 charclass_ucs4_ellipsis (struct linereader *ldfile,
1543 struct locale_ctype_t *ctype,
1544 struct charmap_t *charmap,
1545 struct repertoire_t *repertoire,
1546 struct token *now, uint32_t last_wch,
1547 unsigned long int class256_bit,
1548 unsigned long int class_bit, int ignore_content,
1549 int handle_digits, int step)
1551 if (last_wch > now->val.ucs4)
/* The values are printed with four digits when both fit into 16 bits
   and with eight digits otherwise.  */
1553 lr_error (ldfile, _("\
1554 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
1555 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, now->val.ucs4,
1556 (now->val.ucs4 | last_wch) < 65536 ? 4 : 8, last_wch);
1560 if (!ignore_content)
1561 while ((last_wch += step) <= now->val.ucs4)
1563 /* We have to find out whether there is a byte sequence corresponding
1564 to this UCS4 value. */
1565 struct charseq *seq;
1568 snprintf (utmp, sizeof (utmp), "U%08X", last_wch);
1569 seq = charmap_find_value (charmap, utmp, 9);
1572 snprintf (utmp, sizeof (utmp), "U%04X", last_wch);
1573 seq = charmap_find_value (charmap, utmp, 5);
1577 /* Try looking in the repertoire map. */
1578 seq = repertoire_find_seq (repertoire, last_wch);
1580 /* If this is the first time we look for this sequence create a new
1584 static const struct charseq negative
1585 = { .ucs4 = ILLEGAL_CHAR_VALUE };
1587 /* Find the symbolic name for this UCS4 value. */
1588 if (repertoire != NULL)
1590 const char *symbol = repertoire_find_symbol (repertoire,
1592 uint32_t *newp = obstack_alloc (&repertoire->mem_pool,
1597 /* We have a name, now search the multibyte value. */
1598 seq = charmap_find_value (charmap, symbol, strlen (symbol));
1601 /* We have to create a fake entry. */
1602 seq = (struct charseq *) &negative;
1604 seq->ucs4 = last_wch;
/* Remember the result in the sequence table of the repertoire.  */
1606 insert_entry (&repertoire->seq_table, newp, sizeof (uint32_t),
1610 /* We have to create a fake entry. */
1611 seq = (struct charseq *) &negative;
1614 /* We have a name, now search the multibyte value. */
1615 if (seq->ucs4 == last_wch && seq->nbytes == 1)
1616 /* Yep, we can store information about this byte sequence. */
1617 ctype->class256_collection[(size_t) seq->bytes[0]]
1620 /* And of course we have the UCS4 position. */
1622 *find_idx (ctype, &ctype->class_collection,
1623 &ctype->class_collection_max,
1624 &ctype->class_collection_act, last_wch) |= class_bit;
1626 if (handle_digits == 1)
1628 /* We must store the digit values. */
1629 if (ctype->mbdigits_act == ctype->mbdigits_max)
/* Both digit arrays are doubled together.  */
1631 ctype->mbdigits_max *= 2;
1632 ctype->mbdigits = xrealloc (ctype->mbdigits,
1633 (ctype->mbdigits_max
1634 * sizeof (char *)));
1635 ctype->wcdigits_max *= 2;
1636 ctype->wcdigits = xrealloc (ctype->wcdigits,
1637 (ctype->wcdigits_max
1638 * sizeof (uint32_t)));
/* The stored sequence depends on whether SEQ belongs to this value;
   NOTE(review): the two alternatives are not visible here.  */
1641 ctype->mbdigits[ctype->mbdigits_act++] = (seq->ucs4 == last_wch
1643 ctype->wcdigits[ctype->wcdigits_act++] = last_wch;
1645 else if (handle_digits == 2)
1647 /* We must store the digit values. */
1648 if (ctype->outdigits_act >= 10)
1650 lr_error (ldfile, _("\
1651 %s: field `%s' does not contain exactly ten entries"),
1652 "LC_CTYPE", "outdigit");
1656 ctype->mboutdigits[ctype->outdigits_act] = (seq->ucs4 == last_wch
1658 ctype->wcoutdigits[ctype->outdigits_act] = last_wch;
1659 ++ctype->outdigits_act;
1665 /* Ellipsis as in `/xea/x12.../xea/x34'. */
/* LAST_CHARCODE (LAST_CHARCODE_LEN bytes) is the byte sequence that
   started the range and NOW holds the one ending it.  Both must have
   the same length and the start must not compare greater than the end
   in memcmp order; violations are reported through lr_error.  When
   IGNORE_CONTENT is zero, LAST_CHARCODE is incremented in place,
   starting with its last byte, and each resulting sequence is processed
   until it equals the end sequence.  A one-byte sequence is recorded in
   class256_collection.  The sequence is looked up in CHARMAP; if that
   yields a usable UCS4 value and CLASS_BIT is nonzero, CLASS_BIT is
   ORed into the class_collection entry of the value.  For
   HANDLE_DIGITS == 1 or 2 a new charseq is allocated with xmalloc, the
   bytes are copied directly behind the structure and the entry is
   stored in the digit or outdigit arrays respectively.

   NOTE(review): LAST_CHARCODE points to plain char.  Where char is
   signed, a byte of 0x80 or above converts to a very large size_t in
   the class256_collection index below -- confirm whether the caller
   can pass such bytes.
   NOTE(review): only the nbytes member of the newly allocated charseq
   is visibly initialized here.  */
1667 charclass_charcode_ellipsis (struct linereader *ldfile,
1668 struct locale_ctype_t *ctype,
1669 struct charmap_t *charmap,
1670 struct repertoire_t *repertoire,
1671 struct token *now, char *last_charcode,
1672 uint32_t last_charcode_len,
1673 unsigned long int class256_bit,
1674 unsigned long int class_bit, int ignore_content,
1677 /* First check whether the to-value is larger. */
1678 if (now->val.charcode.nbytes != last_charcode_len)
1680 lr_error (ldfile, _("\
1681 start and end character sequence of range must have the same length"));
1685 if (memcmp (last_charcode, now->val.charcode.bytes, last_charcode_len) > 0)
1687 lr_error (ldfile, _("\
1688 to-value character sequence is smaller than from-value sequence"));
1692 if (!ignore_content)
1696 /* Increment the byte sequence value. */
1697 struct charseq *seq;
/* Start at the last byte; a result of zero means the byte wrapped.  */
1701 for (i = last_charcode_len - 1; i >= 0; --i)
1702 if (++last_charcode[i] != 0)
1705 if (last_charcode_len == 1)
1706 /* Of course we have the charcode value. */
1707 ctype->class256_collection[(size_t) last_charcode[0]]
1710 /* Find the symbolic name. */
1711 seq = charmap_find_symbol (charmap, last_charcode,
/* Fill in the UCS4 value of the charmap entry if that was not done yet.  */
1715 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1716 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1717 strlen (seq->name));
1718 wch = seq == NULL ? ILLEGAL_CHAR_VALUE : seq->ucs4;
1720 if (wch != ILLEGAL_CHAR_VALUE && class_bit != 0)
1721 *find_idx (ctype, &ctype->class_collection,
1722 &ctype->class_collection_max,
1723 &ctype->class_collection_act, wch) |= class_bit;
1726 wch = ILLEGAL_CHAR_VALUE;
1728 if (handle_digits == 1)
1730 /* We must store the digit values. */
1731 if (ctype->mbdigits_act == ctype->mbdigits_max)
/* Both digit arrays are doubled together.  */
1733 ctype->mbdigits_max *= 2;
1734 ctype->mbdigits = xrealloc (ctype->mbdigits,
1735 (ctype->mbdigits_max
1736 * sizeof (char *)));
1737 ctype->wcdigits_max *= 2;
1738 ctype->wcdigits = xrealloc (ctype->wcdigits,
1739 (ctype->wcdigits_max
1740 * sizeof (uint32_t)));
/* Build a private entry: the structure followed by a copy of the bytes.  */
1743 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1744 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1745 seq->nbytes = last_charcode_len;
1747 ctype->mbdigits[ctype->mbdigits_act++] = seq;
1748 ctype->wcdigits[ctype->wcdigits_act++] = wch;
1750 else if (handle_digits == 2)
1752 struct charseq *seq;
1753 /* We must store the digit values. */
1754 if (ctype->outdigits_act >= 10)
1756 lr_error (ldfile, _("\
1757 %s: field `%s' does not contain exactly ten entries"),
1758 "LC_CTYPE", "outdigit");
/* Build a private entry: the structure followed by a copy of the bytes.  */
1762 seq = xmalloc (sizeof (struct charseq) + last_charcode_len);
1763 memcpy ((char *) (seq + 1), last_charcode, last_charcode_len);
1764 seq->nbytes = last_charcode_len;
1766 ctype->mboutdigits[ctype->outdigits_act] = seq;
1767 ctype->wcoutdigits[ctype->outdigits_act] = wch;
1768 ++ctype->outdigits_act;
/* Continue until the incremented sequence equals the end of the range.  */
1771 while (memcmp (last_charcode, now->val.charcode.bytes,
1772 last_charcode_len) != 0);
1777 /* Convert the token NOW of a transliteration entry to a wide string. */
/* The following token kinds are handled:
   tok_default_missing: see the review note below.
   tok_bsymbol: a two-element array is allocated with xmalloc and its
   first element is taken from REPERTOIRE.
   tok_ucs4: a two-element array is allocated and its first element is
   NOW->val.ucs4.
   tok_charcode: the byte code is mapped to its CHARMAP entry, the UCS4
   value of which becomes the first element of a two-element array.
   tok_string: the wide string already attached to the token is used.
   For any other token a syntax error is raised, the rest of the line is
   skipped unless the token already ended it, and (uint32_t *) -1l is
   returned.  read_translit_entry additionally treats a null result as
   "no usable string".

   NOTE(review): the compound literal in the tok_default_missing case has
   pointer type, so WSTR becomes a null pointer rather than the address
   of an empty string -- confirm this matches the intent stated in the
   comment next to it.
   NOTE(review): the stores that terminate the two-element arrays and
   the remaining return statements are not visible here.  */
1779 read_widestring (struct linereader *ldfile, struct token *now,
1780 struct charmap_t *charmap, struct repertoire_t *repertoire)
1784 if (now->tok == tok_default_missing)
1785 /* The special name "" will denote this case. */
1786 wstr = ((uint32_t *) { 0 });
1787 else if (now->tok == tok_bsymbol)
1789 /* Get the value from the repertoire. */
1790 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1791 wstr[0] = repertoire_find_value (repertoire, now->val.str.startmb,
1792 now->val.str.lenmb);
1793 if (wstr[0] == ILLEGAL_CHAR_VALUE)
1795 /* We cannot proceed, we don't know the UCS4 value. */
1802 else if (now->tok == tok_ucs4)
1804 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1805 wstr[0] = now->val.ucs4;
1808 else if (now->tok == tok_charcode)
1810 /* Argh, we have to convert to the symbol name first and then to the
1812 struct charseq *seq = charmap_find_symbol (charmap,
1813 now->val.str.startmb,
1814 now->val.str.lenmb);
1816 /* Cannot find the UCS4 value. */
/* Fill in the UCS4 value of the charmap entry if that was not done yet.  */
1819 if (seq->ucs4 == UNINITIALIZED_CHAR_VALUE)
1820 seq->ucs4 = repertoire_find_value (repertoire, seq->name,
1821 strlen (seq->name));
1822 if (seq->ucs4 == ILLEGAL_CHAR_VALUE)
1823 /* We cannot proceed, we don't know the UCS4 value. */
1826 wstr = (uint32_t *) xmalloc (2 * sizeof (uint32_t));
1827 wstr[0] = seq->ucs4;
1830 else if (now->tok == tok_string)
/* The string is used in place, without copying it.  */
1832 wstr = now->val.str.startwc;
1833 if (wstr == NULL || wstr[0] == 0)
/* Unexpected token: skip what is left of the line and report it.  */
1838 if (now->tok != tok_eol && now->tok != tok_eof)
1839 lr_ignore_rest (ldfile, 0);
1840 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
1841 return (uint32_t *) -1l;
/* Read one transliteration rule, starting at token NOW, and add it to
   CTYPE.

   NOW is converted with read_widestring into the from-string.  A new
   translit_t is allocated on the obstack CTYPE->mempool and records the
   from-string together with the file name and line number of LDFILE.
   The replacement strings that follow are read token by token: the
   pieces of one replacement are accumulated on the obstack, and a
   semicolon or the end of the line finishes the current replacement by
   appending a zero element and linking a new translit_to_t through TOP.
   At the end of the line the finished rule is put at the front of the
   CTYPE->translit list.  If read_widestring signals an error for a
   replacement, everything allocated since RESULT is released again with
   obstack_free.

   NOTE(review): only a null from-string is visibly rejected; the
   (uint32_t *) -1l error value of read_widestring is not checked for
   the from-string here -- confirm.  */
1849 read_translit_entry (struct linereader *ldfile, struct locale_ctype_t *ctype,
1850 struct token *now, struct charmap_t *charmap,
1851 struct repertoire_t *repertoire)
1853 uint32_t *from_wstr = read_widestring (ldfile, now, charmap, repertoire);
1854 struct translit_t *result;
1855 struct translit_to_t **top;
1856 struct obstack *ob = &ctype->mempool;
1860 if (from_wstr == NULL)
1861 /* There is no valid from string. */
1864 result = (struct translit_t *) obstack_alloc (ob,
1865 sizeof (struct translit_t));
1866 result->from = from_wstr;
1867 result->fname = ldfile->fname;
1868 result->lineno = ldfile->lineno;
1869 result->next = NULL;
1879 /* Next we have one or more transliterations. They are
1880 separated by semicolons. */
1881 now = lr_token (ldfile, charmap, repertoire);
1883 if (!first && (now->tok == tok_semicolon || now->tok == tok_eol))
1885 /* One string read. */
1886 const uint32_t zero = 0;
/* Terminate the accumulated string (4 is the size of one element) and
   take it off the obstack.  */
1890 obstack_grow (ob, &zero, 4);
1891 to_wstr = obstack_finish (ob);
/* Append a list element for the finished replacement.  */
1893 *top = obstack_alloc (ob, sizeof (struct translit_to_t));
1894 (*top)->str = to_wstr;
1895 (*top)->next = NULL;
1898 if (now->tok == tok_eol)
/* The rule is complete; put it at the front of the list.  */
1900 result->next = ctype->translit;
1901 ctype->translit = result;
1906 top = &(*top)->next;
1911 to_wstr = read_widestring (ldfile, now, charmap, repertoire);
1912 if (to_wstr == (uint32_t *) -1l)
1914 /* An error occurred. */
1915 obstack_free (ob, result);
1919 if (to_wstr == NULL)
1922 /* This value is usable. */
1923 obstack_grow (ob, to_wstr, wcslen ((wchar_t *) to_wstr) * 4);
/* Parse the operands of a `translit_ignore' line read from LDFILE and
   add them to CTYPE.

   Each operand must be a tok_bsymbol or tok_ucs4 token; symbolic names
   are converted to UCS4 values through REPERTOIRE and unknown names are
   reported through lr_error.  Every operand gets a translit_ignore_t
   allocated from CTYPE->mempool and put at the front of the
   CTYPE->translit_ignore list.  An operand may be followed by
   tok_ellipsis2 (step 1) or tok_ellipsis2_2 (step 2) and a second
   character of the same kinds, which forms a range; a range whose end
   is smaller than its start is reported.  Operands are separated by
   semicolons; the end of the line or file ends the list, and anything
   else is a syntax error after which the rest of the line is skipped.

   NOTE(review): the stores into the members of the new element are not
   visible here.  */
1932 read_translit_ignore_entry (struct linereader *ldfile,
1933 struct locale_ctype_t *ctype,
1934 struct charmap_t *charmap,
1935 struct repertoire_t *repertoire)
1937 /* We expect a semicolon-separated list of characters we ignore. We are
1938 only interested in the wide character definitions. These must be
1939 single characters, possibly defining a range when an ellipsis is used. */
1942 struct token *now = lr_token (ldfile, charmap, repertoire);
1943 struct translit_ignore_t *newp;
1946 if (now->tok == tok_eol || now->tok == tok_eof)
1949 _("premature end of `translit_ignore' definition"));
1953 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
1955 lr_error (ldfile, _("syntax error"));
1956 lr_ignore_rest (ldfile, 0);
/* Determine the UCS4 value of the operand.  */
1960 if (now->tok == tok_ucs4)
1961 from = now->val.ucs4;
1963 /* Try to get the value. */
1964 from = repertoire_find_value (repertoire, now->val.str.startmb,
1965 now->val.str.lenmb);
1967 if (from == ILLEGAL_CHAR_VALUE)
1969 lr_error (ldfile, "invalid character name");
/* Allocate the list element and put it at the front of the list.  */
1974 newp = (struct translit_ignore_t *)
1975 obstack_alloc (&ctype->mempool, sizeof (struct translit_ignore_t));
1980 newp->next = ctype->translit_ignore;
1981 ctype->translit_ignore = newp;
1984 /* Now we expect either a semicolon, an ellipsis, or the end of the
1986 now = lr_token (ldfile, charmap, repertoire);
1988 if (now->tok == tok_ellipsis2 || now->tok == tok_ellipsis2_2)
1990 /* XXX Should we bother implementing `....'? `...' certainly
1991 will not be implemented. */
1993 int step = now->tok == tok_ellipsis2_2 ? 2 : 1;
1995 now = lr_token (ldfile, charmap, repertoire);
1997 if (now->tok == tok_eol || now->tok == tok_eof)
2000 _("premature end of `translit_ignore' definition"));
2004 if (now->tok != tok_bsymbol && now->tok != tok_ucs4)
2006 lr_error (ldfile, _("syntax error"));
2007 lr_ignore_rest (ldfile, 0);
2011 if (now->tok == tok_ucs4)
2014 /* Try to get the value. */
2015 to = repertoire_find_value (repertoire, now->val.str.startmb,
2016 now->val.str.lenmb);
2018 if (to == ILLEGAL_CHAR_VALUE)
2019 lr_error (ldfile, "invalid character name");
2022 /* Make sure the `to'-value is larger. */
2029 lr_error (ldfile, _("\
2030 to-value <U%0*X> of range is smaller than from-value <U%0*X>"),
2031 (to | from) < 65536 ? 4 : 8, to,
2032 (to | from) < 65536 ? 4 : 8, from);
2035 /* And the next token. */
2036 now = lr_token (ldfile, charmap, repertoire);
2039 if (now->tok == tok_eol || now->tok == tok_eof)
2043 if (now->tok == tok_semicolon)
2047 /* If we come here something is wrong. */
2048 lr_error (ldfile, _("syntax error"));
2049 lr_ignore_rest (ldfile, 0);
2055 /* The parser for the LC_CTYPE section of the locale definition. */
2057 ctype_read (struct linereader *ldfile, struct localedef_t *result,
2058 struct charmap_t *charmap, const char *repertoire_name,
2061 struct repertoire_t *repertoire = NULL;
2062 struct locale_ctype_t *ctype;
2064 enum token_t nowtok;
2066 struct charseq *last_seq;
2067 uint32_t last_wch = 0;
2068 enum token_t last_token;
2069 enum token_t ellipsis_token;
2071 char last_charcode[16];
2072 size_t last_charcode_len = 0;
2073 const char *last_str = NULL;
2075 struct localedef_t *copy_locale = NULL;
2077 /* Get the repertoire we have to use. */
2078 if (repertoire_name != NULL)
2079 repertoire = repertoire_read (repertoire_name);
2081 /* The rest of the line containing `LC_CTYPE' must be free. */
2082 lr_ignore_rest (ldfile, 1);
2087 now = lr_token (ldfile, charmap, NULL);
2090 while (nowtok == tok_eol);
2092 /* If we see `copy' now we are almost done. */
2093 if (nowtok == tok_copy)
2095 now = lr_token (ldfile, charmap, NULL);
2096 if (now->tok != tok_string)
2098 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2102 now = lr_token (ldfile, charmap, NULL);
2103 while (now->tok != tok_eof && now->tok != tok_end);
2105 if (now->tok != tok_eof
2106 || (now = lr_token (ldfile, charmap, NULL), now->tok == tok_eof))
2107 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2108 else if (now->tok != tok_lc_ctype)
2110 lr_error (ldfile, _("\
2111 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2112 lr_ignore_rest (ldfile, 0);
2115 lr_ignore_rest (ldfile, 1);
2120 if (! ignore_content)
2122 /* Get the locale definition. */
2123 copy_locale = load_locale (LC_CTYPE, now->val.str.startmb,
2124 repertoire_name, charmap, NULL);
2125 if ((copy_locale->avail & CTYPE_LOCALE) == 0)
2127 /* Not yet loaded. So do it now. */
2128 if (locfile_read (copy_locale, charmap) != 0)
2133 lr_ignore_rest (ldfile, 1);
2135 now = lr_token (ldfile, charmap, NULL);
2139 /* Prepare the data structures. */
2140 ctype_startup (ldfile, result, charmap, copy_locale, ignore_content);
2141 ctype = result->categories[LC_CTYPE].ctype;
2143 /* Remember the repertoire we use. */
2144 if (!ignore_content)
2145 ctype->repertoire = repertoire;
2149 unsigned long int class_bit = 0;
2150 unsigned long int class256_bit = 0;
2151 int handle_digits = 0;
2153 /* Of course we don't proceed beyond the end of file. */
2154 if (nowtok == tok_eof)
2157 /* Ignore empty lines. */
2158 if (nowtok == tok_eol)
2160 now = lr_token (ldfile, charmap, NULL);
2168 now = lr_token (ldfile, charmap, NULL);
2169 while (now->tok == tok_ident || now->tok == tok_string)
2171 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2172 now = lr_token (ldfile, charmap, NULL);
2173 if (now->tok != tok_semicolon)
2175 now = lr_token (ldfile, charmap, NULL);
2177 if (now->tok != tok_eol)
2179 %s: syntax error in definition of new character class"), "LC_CTYPE");
2183 now = lr_token (ldfile, charmap, NULL);
2184 while (now->tok == tok_ident || now->tok == tok_string)
2186 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2187 now = lr_token (ldfile, charmap, NULL);
2188 if (now->tok != tok_semicolon)
2190 now = lr_token (ldfile, charmap, NULL);
2192 if (now->tok != tok_eol)
2194 %s: syntax error in definition of new character map"), "LC_CTYPE");
2198 /* Ignore the rest of the line if we don't need the input of
2202 lr_ignore_rest (ldfile, 0);
2206 /* We simply forget the `class' keyword and use the following
2207 operand to determine the bit. */
2208 now = lr_token (ldfile, charmap, NULL);
2209 if (now->tok == tok_ident || now->tok == tok_string)
2211 /* Must be one of the predefined class names. */
2212 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2213 if (strcmp (ctype->classnames[cnt], now->val.str.startmb) == 0)
2215 if (cnt >= ctype->nr_charclass)
2217 #ifdef PREDEFINED_CLASSES
2218 if (now->val.str.lenmb == 8
2219 && memcmp ("special1", now->val.str.startmb, 8) == 0)
2220 class_bit = _ISwspecial1;
2221 else if (now->val.str.lenmb == 8
2222 && memcmp ("special2", now->val.str.startmb, 8) == 0)
2223 class_bit = _ISwspecial2;
2224 else if (now->val.str.lenmb == 8
2225 && memcmp ("special3", now->val.str.startmb, 8) == 0)
2226 class_bit = _ISwspecial3;
2230 /* OK, it's a new class. */
2231 ctype_class_new (ldfile, ctype, now->val.str.startmb);
2233 class_bit = _ISwbit (ctype->nr_charclass - 1);
2238 class_bit = _ISwbit (cnt);
2240 free (now->val.str.startmb);
2243 else if (now->tok == tok_digit)
2244 goto handle_tok_digit;
2245 else if (now->tok < tok_upper || now->tok > tok_blank)
2249 class_bit = BITw (now->tok);
2250 class256_bit = BIT (now->tok);
2253 /* The next character must be a semicolon. */
2254 now = lr_token (ldfile, charmap, NULL);
2255 if (now->tok != tok_semicolon)
2257 goto read_charclass;
2270 /* Ignore the rest of the line if we don't need the input of
2274 lr_ignore_rest (ldfile, 0);
2278 class_bit = BITw (now->tok);
2279 class256_bit = BIT (now->tok);
2282 ctype->class_done |= class_bit;
2283 last_token = tok_none;
2284 ellipsis_token = tok_none;
2286 now = lr_token (ldfile, charmap, NULL);
2287 while (now->tok != tok_eol && now->tok != tok_eof)
2290 struct charseq *seq;
2292 if (ellipsis_token == tok_none)
2294 if (get_character (now, charmap, repertoire, &seq, &wch))
2297 if (!ignore_content && seq != NULL && seq->nbytes == 1)
2298 /* Yep, we can store information about this byte
2300 ctype->class256_collection[seq->bytes[0]] |= class256_bit;
2302 if (!ignore_content && wch != ILLEGAL_CHAR_VALUE
2304 /* We have the UCS4 position. */
2305 *find_idx (ctype, &ctype->class_collection,
2306 &ctype->class_collection_max,
2307 &ctype->class_collection_act, wch) |= class_bit;
2309 last_token = now->tok;
2310 /* Terminate the string. */
2311 if (last_token == tok_bsymbol)
2313 now->val.str.startmb[now->val.str.lenmb] = '\0';
2314 last_str = now->val.str.startmb;
2320 memcpy (last_charcode, now->val.charcode.bytes, 16);
2321 last_charcode_len = now->val.charcode.nbytes;
2323 if (!ignore_content && handle_digits == 1)
2325 /* We must store the digit values. */
2326 if (ctype->mbdigits_act == ctype->mbdigits_max)
2328 ctype->mbdigits_max += 10;
2329 ctype->mbdigits = xrealloc (ctype->mbdigits,
2330 (ctype->mbdigits_max
2331 * sizeof (char *)));
2332 ctype->wcdigits_max += 10;
2333 ctype->wcdigits = xrealloc (ctype->wcdigits,
2334 (ctype->wcdigits_max
2335 * sizeof (uint32_t)));
2338 ctype->mbdigits[ctype->mbdigits_act++] = seq;
2339 ctype->wcdigits[ctype->wcdigits_act++] = wch;
2341 else if (!ignore_content && handle_digits == 2)
2343 /* We must store the digit values. */
2344 if (ctype->outdigits_act >= 10)
2346 lr_error (ldfile, _("\
2347 %s: field `%s' does not contain exactly ten entries"),
2348 "LC_CTYPE", "outdigit");
2349 lr_ignore_rest (ldfile, 0);
2353 ctype->mboutdigits[ctype->outdigits_act] = seq;
2354 ctype->wcoutdigits[ctype->outdigits_act] = wch;
2355 ++ctype->outdigits_act;
2360 /* Now it gets complicated. We have to resolve the
2361 ellipsis problem. First we must distinguish between
2362 the different kind of ellipsis and this must match the
2363 tokens we have seen. */
2364 assert (last_token != tok_none);
2366 if (last_token != now->tok)
2368 lr_error (ldfile, _("\
2369 ellipsis range must be marked by two operands of same type"));
2370 lr_ignore_rest (ldfile, 0);
2374 if (last_token == tok_bsymbol)
2376 if (ellipsis_token == tok_ellipsis3)
2377 lr_error (ldfile, _("with symbolic name range values \
2378 the absolute ellipsis `...' must not be used"));
2380 charclass_symbolic_ellipsis (ldfile, ctype, charmap,
2381 repertoire, now, last_str,
2382 class256_bit, class_bit,
2387 handle_digits, step);
2389 else if (last_token == tok_ucs4)
2391 if (ellipsis_token != tok_ellipsis2)
2392 lr_error (ldfile, _("\
2393 with UCS range values one must use the hexadecimal symbolic ellipsis `..'"));
2395 charclass_ucs4_ellipsis (ldfile, ctype, charmap,
2396 repertoire, now, last_wch,
2397 class256_bit, class_bit,
2398 ignore_content, handle_digits,
2403 assert (last_token == tok_charcode);
2405 if (ellipsis_token != tok_ellipsis3)
2406 lr_error (ldfile, _("\
2407 with character code range values one must use the absolute ellipsis `...'"));
2409 charclass_charcode_ellipsis (ldfile, ctype, charmap,
2413 class256_bit, class_bit,
2418 /* Now we have used the last value. */
2419 last_token = tok_none;
2422 /* Next we expect a semicolon or the end of the line. */
2423 now = lr_token (ldfile, charmap, NULL);
2424 if (now->tok == tok_eol || now->tok == tok_eof)
2427 if (last_token != tok_none
2428 && now->tok >= tok_ellipsis2 && now->tok <= tok_ellipsis4_2)
2430 if (now->tok == tok_ellipsis2_2)
2432 now->tok = tok_ellipsis2;
2435 else if (now->tok == tok_ellipsis4_2)
2437 now->tok = tok_ellipsis4;
2441 ellipsis_token = now->tok;
2443 now = lr_token (ldfile, charmap, NULL);
2447 if (now->tok != tok_semicolon)
2450 /* And get the next character. */
2451 now = lr_token (ldfile, charmap, NULL);
2453 ellipsis_token = tok_none;
2459 /* Ignore the rest of the line if we don't need the input of
2463 lr_ignore_rest (ldfile, 0);
2468 class_bit = _ISwdigit;
2469 class256_bit = _ISdigit;
2471 goto read_charclass;
2474 /* Ignore the rest of the line if we don't need the input of
2478 lr_ignore_rest (ldfile, 0);
2482 if (ctype->outdigits_act != 0)
2483 lr_error (ldfile, _("\
2484 %s: field `%s' declared more than once"),
2485 "LC_CTYPE", "outdigit");
2489 goto read_charclass;
2492 /* Ignore the rest of the line if we don't need the input of
2496 lr_ignore_rest (ldfile, 0);
2504 /* Ignore the rest of the line if we don't need the input of
2508 lr_ignore_rest (ldfile, 0);
2516 /* Ignore the rest of the line if we don't need the input of
2520 lr_ignore_rest (ldfile, 0);
2524 /* We simply forget the `map' keyword and use the following
2525 operand to determine the mapping. */
2526 now = lr_token (ldfile, charmap, NULL);
2527 if (now->tok == tok_ident || now->tok == tok_string)
2531 for (cnt = 2; cnt < ctype->map_collection_nr; ++cnt)
2532 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2535 if (cnt < ctype->map_collection_nr)
2536 free (now->val.str.startmb);
2538 /* OK, it's a new map. */
2539 ctype_map_new (ldfile, ctype, now->val.str.startmb, charmap);
2543 else if (now->tok < tok_toupper || now->tok > tok_tolower)
2546 mapidx = now->tok - tok_toupper;
2548 now = lr_token (ldfile, charmap, NULL);
2549 /* This had better be a semicolon. */
2550 if (now->tok != tok_semicolon)
2554 /* Test whether this mapping was already defined. */
2555 if (ctype->tomap_done[mapidx])
2557 lr_error (ldfile, _("duplicated definition for mapping `%s'"),
2558 ctype->mapnames[mapidx]);
2559 lr_ignore_rest (ldfile, 0);
2562 ctype->tomap_done[mapidx] = 1;
2564 now = lr_token (ldfile, charmap, NULL);
2565 while (now->tok != tok_eol && now->tok != tok_eof)
2567 struct charseq *from_seq;
2569 struct charseq *to_seq;
2572 /* Every pair starts with an opening brace. */
2573 if (now->tok != tok_open_brace)
2576 /* Next comes the from-value. */
2577 now = lr_token (ldfile, charmap, NULL);
2578 if (get_character (now, charmap, repertoire, &from_seq,
2582 /* The next is a comma. */
2583 now = lr_token (ldfile, charmap, NULL);
2584 if (now->tok != tok_comma)
2587 /* And the other value. */
2588 now = lr_token (ldfile, charmap, NULL);
2589 if (get_character (now, charmap, repertoire, &to_seq,
2593 /* And the last thing is the closing brace. */
2594 now = lr_token (ldfile, charmap, NULL);
2595 if (now->tok != tok_close_brace)
2598 if (!ignore_content)
2600 if (mapidx < 2 && from_seq != NULL && to_seq != NULL
2601 && from_seq->nbytes == 1 && to_seq->nbytes == 1)
2602 /* We can use this value. */
2603 ctype->map256_collection[mapidx][from_seq->bytes[0]]
2606 if (from_wch != ILLEGAL_CHAR_VALUE
2607 && to_wch != ILLEGAL_CHAR_VALUE)
2608 /* Both correct values. */
2609 *find_idx (ctype, &ctype->map_collection[mapidx],
2610 &ctype->map_collection_max[mapidx],
2611 &ctype->map_collection_act[mapidx],
2615 /* Now comes a semicolon or the end of the line/file. */
2616 now = lr_token (ldfile, charmap, NULL);
2617 if (now->tok == tok_semicolon)
2618 now = lr_token (ldfile, charmap, NULL);
2622 case tok_translit_start:
2623 /* Ignore the entire translit section with its peculiar syntax
2624 if we don't need the input. */
2629 lr_ignore_rest (ldfile, 0);
2630 now = lr_token (ldfile, charmap, NULL);
2632 while (now->tok != tok_translit_end && now->tok != tok_eof);
2634 if (now->tok == tok_eof)
2635 lr_error (ldfile, _(\
2636 "%s: `translit_start' section does not end with `translit_end'"),
2642 /* The rest of the line had better be empty. */
2643 lr_ignore_rest (ldfile, 1);
2645 /* We count here the number of allocated entries in the `translit'
2649 ldfile->translate_strings = 1;
2650 ldfile->return_widestr = 1;
2652 /* We proceed until we see the `translit_end' token. */
2653 while (now = lr_token (ldfile, charmap, repertoire),
2654 now->tok != tok_translit_end && now->tok != tok_eof)
2656 if (now->tok == tok_eol)
2657 /* Ignore empty lines. */
2660 if (now->tok == tok_include)
2662 /* We have to include locale. */
2663 const char *locale_name;
2664 const char *repertoire_name;
2665 struct translit_include_t *include_stmt, **include_ptr;
2667 now = lr_token (ldfile, charmap, NULL);
2668 /* This should be a string or an identifier. In any
2669 case something to name a locale. */
2670 if (now->tok != tok_string && now->tok != tok_ident)
2673 lr_error (ldfile, _("%s: syntax error"), "LC_CTYPE");
2674 lr_ignore_rest (ldfile, 0);
2677 locale_name = now->val.str.startmb;
2679 /* Next should be a semicolon. */
2680 now = lr_token (ldfile, charmap, NULL);
2681 if (now->tok != tok_semicolon)
2682 goto translit_syntax;
2684 /* Now the repertoire name. */
2685 now = lr_token (ldfile, charmap, NULL);
2686 if ((now->tok != tok_string && now->tok != tok_ident)
2687 || now->val.str.startmb == NULL)
2688 goto translit_syntax;
2689 repertoire_name = now->val.str.startmb;
2691 /* Save the include statement for later processing. */
2692 include_stmt = (struct translit_include_t *)
2693 xmalloc (sizeof (struct translit_include_t));
2694 include_stmt->copy_locale = locale_name;
2695 include_stmt->copy_repertoire = repertoire_name;
2696 include_stmt->next = NULL;
2698 include_ptr = &ctype->translit_include;
2699 while (*include_ptr != NULL)
2700 include_ptr = &(*include_ptr)->next;
2701 *include_ptr = include_stmt;
2703 /* The rest of the line must be empty. */
2704 lr_ignore_rest (ldfile, 1);
2706 /* Make sure the locale is read. */
2707 add_to_readlist (LC_CTYPE, locale_name, repertoire_name,
2711 else if (now->tok == tok_default_missing)
2717 /* We expect a single character or string as the
2719 now = lr_token (ldfile, charmap, NULL);
2720 wstr = read_widestring (ldfile, now, charmap,
2725 if (ctype->default_missing != NULL)
2727 lr_error (ldfile, _("\
2728 %s: duplicate `default_missing' definition"), "LC_CTYPE");
2729 error_at_line (0, 0, ctype->default_missing_file,
2730 ctype->default_missing_lineno,
2732 previous definition was here"));
2736 ctype->default_missing = wstr;
2737 ctype->default_missing_file = ldfile->fname;
2738 ctype->default_missing_lineno = ldfile->lineno;
2740 /* We can have more entries, ignore them. */
2741 lr_ignore_rest (ldfile, 0);
2744 else if (wstr == (uint32_t *) -1l)
2745 /* This was a syntax error. */
2748 /* Maybe there is another replacement we can use. */
2749 now = lr_token (ldfile, charmap, NULL);
2750 if (now->tok == tok_eol || now->tok == tok_eof)
2752 /* Nothing found. We tell the user. */
2753 lr_error (ldfile, _("\
2754 %s: no representable `default_missing' definition found"), "LC_CTYPE");
2757 if (now->tok != tok_semicolon)
2758 goto translit_syntax;
2763 else if (now->tok == tok_translit_ignore)
2765 read_translit_ignore_entry (ldfile, ctype, charmap,
2770 read_translit_entry (ldfile, ctype, now, charmap, repertoire);
2772 ldfile->return_widestr = 0;
2774 if (now->tok == tok_eof)
2775 lr_error (ldfile, _(\
2776 "%s: `translit_start' section does not end with `translit_end'"),
2782 /* Ignore the rest of the line if we don't need the input of
2786 lr_ignore_rest (ldfile, 0);
2790 /* This could mean one of several things. First test whether
2791 it's a character class name. */
2792 for (cnt = 0; cnt < ctype->nr_charclass; ++cnt)
2793 if (strcmp (now->val.str.startmb, ctype->classnames[cnt]) == 0)
2795 if (cnt < ctype->nr_charclass)
2797 class_bit = _ISwbit (cnt);
2798 class256_bit = cnt <= 11 ? _ISbit (cnt) : 0;
2799 free (now->val.str.startmb);
2800 goto read_charclass;
2802 for (cnt = 0; cnt < ctype->map_collection_nr; ++cnt)
2803 if (strcmp (now->val.str.startmb, ctype->mapnames[cnt]) == 0)
2805 if (cnt < ctype->map_collection_nr)
2808 free (now->val.str.startmb);
2811 #ifdef PREDEFINED_CLASSES
2812 if (strcmp (now->val.str.startmb, "special1") == 0)
2814 class_bit = _ISwspecial1;
2815 free (now->val.str.startmb);
2816 goto read_charclass;
2818 if (strcmp (now->val.str.startmb, "special2") == 0)
2820 class_bit = _ISwspecial2;
2821 free (now->val.str.startmb);
2822 goto read_charclass;
2824 if (strcmp (now->val.str.startmb, "special3") == 0)
2826 class_bit = _ISwspecial3;
2827 free (now->val.str.startmb);
2828 goto read_charclass;
2830 if (strcmp (now->val.str.startmb, "tosymmetric") == 0)
2839 /* Next we assume `LC_CTYPE'. */
2840 now = lr_token (ldfile, charmap, NULL);
2841 if (now->tok == tok_eof)
2843 if (now->tok == tok_eol)
2844 lr_error (ldfile, _("%s: incomplete `END' line"),
2846 else if (now->tok != tok_lc_ctype)
2847 lr_error (ldfile, _("\
2848 %1$s: definition does not end with `END %1$s'"), "LC_CTYPE");
2849 lr_ignore_rest (ldfile, now->tok == tok_lc_ctype);
2854 if (now->tok != tok_eof)
2855 SYNTAX_ERROR (_("%s: syntax error"), "LC_CTYPE");
2858 /* Prepare for the next round. */
2859 now = lr_token (ldfile, charmap, NULL);
2863 /* When we come here we reached the end of the file. */
2864 lr_error (ldfile, _("%s: premature end of file"), "LC_CTYPE");
2869 set_class_defaults (struct locale_ctype_t *ctype, struct charmap_t *charmap,
2870 struct repertoire_t *repertoire)
2874 /* This function defines the default values for the classes and conversions
2875 according to POSIX.2 2.5.2.1.
2876 It may seem that the order of these if-blocks is arbitrary but it is NOT.
2877 Don't move them unless you know what you are doing! */
2879 auto void set_default (int bitpos, int from, int to);
2881 void set_default (int bitpos, int from, int to)
2885 int bit = _ISbit (bitpos);
2886 int bitw = _ISwbit (bitpos);
2887 /* Define string. */
2890 for (ch = from; ch <= to; ++ch)
2892 struct charseq *seq;
2895 seq = charmap_find_value (charmap, tmp, 1);
2899 sprintf (buf, "U%08X", ch);
2900 seq = charmap_find_value (charmap, buf, 9);
2906 %s: character `%s' not defined in charmap while needed as default value"),
2909 else if (seq->nbytes != 1)
2911 %s: character `%s' in charmap not representable with one byte"),
2914 ctype->class256_collection[seq->bytes[0]] |= bit;
2916 /* No need to search here, the ASCII value is also the Unicode
2918 ELEM (ctype, class_collection, , ch) |= bitw;
2922 /* Set default values if keyword was not present. */
2923 if ((ctype->class_done & BITw (tok_upper)) == 0)
2924 /* "If this keyword [upper] is not specified, the uppercase letters
2925 `A' through `Z', ..., shall automatically belong to this class,
2926 with implementation defined character values." [P1003.2, 2.5.2.1] */
2927 set_default (BITPOS (tok_upper), 'A', 'Z');
2929 if ((ctype->class_done & BITw (tok_lower)) == 0)
2930 /* "If this keyword [lower] is not specified, the lowercase letters
2931 `a' through `z', ..., shall automatically belong to this class,
2932 with implementation defined character values." [P1003.2, 2.5.2.1] */
2933 set_default (BITPOS (tok_lower), 'a', 'z');
2935 if ((ctype->class_done & BITw (tok_alpha)) == 0)
2937 /* Table 2-6 in P1003.2 says that characters in class `upper' or
2938 class `lower' *must* be in class `alpha'. */
2939 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower);
2940 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower);
2942 for (cnt = 0; cnt < 256; ++cnt)
2943 if ((ctype->class256_collection[cnt] & mask) != 0)
2944 ctype->class256_collection[cnt] |= BIT (tok_alpha);
2946 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2947 if ((ctype->class_collection[cnt] & maskw) != 0)
2948 ctype->class_collection[cnt] |= BITw (tok_alpha);
2951 if ((ctype->class_done & BITw (tok_digit)) == 0)
2952 /* "If this keyword [digit] is not specified, the digits `0' through
2953 `9', ..., shall automatically belong to this class, with
2954 implementation-defined character values." [P1003.2, 2.5.2.1] */
2955 set_default (BITPOS (tok_digit), '0', '9');
2957 /* "Only characters specified for the `alpha' and `digit' keyword
2958 shall be specified. Characters specified for the keyword `alpha'
2959 and `digit' are automatically included in this class." */
2961 unsigned long int mask = BIT (tok_alpha) | BIT (tok_digit);
2962 unsigned long int maskw = BITw (tok_alpha) | BITw (tok_digit);
2964 for (cnt = 0; cnt < 256; ++cnt)
2965 if ((ctype->class256_collection[cnt] & mask) != 0)
2966 ctype->class256_collection[cnt] |= BIT (tok_alnum);
2968 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
2969 if ((ctype->class_collection[cnt] & maskw) != 0)
2970 ctype->class_collection[cnt] |= BITw (tok_alnum);
2973 if ((ctype->class_done & BITw (tok_space)) == 0)
2974 /* "If this keyword [space] is not specified, the characters <space>,
2975 <form-feed>, <newline>, <carriage-return>, <tab>, and
2976 <vertical-tab>, ..., shall automatically belong to this class,
2977 with implementation-defined character values." [P1003.2, 2.5.2.1] */
2979 struct charseq *seq;
2981 seq = charmap_find_value (charmap, "space", 5);
2983 seq = charmap_find_value (charmap, "SP", 2);
2985 seq = charmap_find_value (charmap, "U00000020", 9);
2990 %s: character `%s' not defined while needed as default value"),
2991 "LC_CTYPE", "<space>");
2993 else if (seq->nbytes != 1)
2995 %s: character `%s' in charmap not representable with one byte"),
2996 "LC_CTYPE", "<space>");
2998 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3000 /* No need to search. */
3001 ELEM (ctype, class_collection, , L' ') |= BITw (tok_space);
3003 seq = charmap_find_value (charmap, "form-feed", 9);
3005 seq = charmap_find_value (charmap, "U0000000C", 9);
3010 %s: character `%s' not defined while needed as default value"),
3011 "LC_CTYPE", "<form-feed>");
3013 else if (seq->nbytes != 1)
3015 %s: character `%s' in charmap not representable with one byte"),
3016 "LC_CTYPE", "<form-feed>");
3018 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3020 /* No need to search. */
3021 ELEM (ctype, class_collection, , L'\f') |= BITw (tok_space);
3024 seq = charmap_find_value (charmap, "newline", 7);
3026 seq = charmap_find_value (charmap, "U0000000A", 9);
3031 character `%s' not defined while needed as default value"),
3034 else if (seq->nbytes != 1)
3036 %s: character `%s' in charmap not representable with one byte"),
3037 "LC_CTYPE", "<newline>");
3039 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3041 /* No need to search. */
3042 ELEM (ctype, class_collection, , L'\n') |= BITw (tok_space);
3045 seq = charmap_find_value (charmap, "carriage-return", 15);
3047 seq = charmap_find_value (charmap, "U0000000D", 9);
3052 %s: character `%s' not defined while needed as default value"),
3053 "LC_CTYPE", "<carriage-return>");
3055 else if (seq->nbytes != 1)
3057 %s: character `%s' in charmap not representable with one byte"),
3058 "LC_CTYPE", "<carriage-return>");
3060 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3062 /* No need to search. */
3063 ELEM (ctype, class_collection, , L'\r') |= BITw (tok_space);
3066 seq = charmap_find_value (charmap, "tab", 3);
3068 seq = charmap_find_value (charmap, "U00000009", 9);
3073 %s: character `%s' not defined while needed as default value"),
3074 "LC_CTYPE", "<tab>");
3076 else if (seq->nbytes != 1)
3078 %s: character `%s' in charmap not representable with one byte"),
3079 "LC_CTYPE", "<tab>");
3081 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3083 /* No need to search. */
3084 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_space);
3087 seq = charmap_find_value (charmap, "vertical-tab", 12);
3089 seq = charmap_find_value (charmap, "U0000000B", 9);
3094 %s: character `%s' not defined while needed as default value"),
3095 "LC_CTYPE", "<vertical-tab>");
3097 else if (seq->nbytes != 1)
3099 %s: character `%s' in charmap not representable with one byte"),
3100 "LC_CTYPE", "<vertical-tab>");
3102 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_space);
3104 /* No need to search. */
3105 ELEM (ctype, class_collection, , L'\v') |= BITw (tok_space);
3108 if ((ctype->class_done & BITw (tok_xdigit)) == 0)
3109 /* "If this keyword is not specified, the digits `0' to `9', the
3110 uppercase letters `A' through `F', and the lowercase letters `a'
3111 through `f', ..., shall automatically belong to this class, with
3112 implementation defined character values." [P1003.2, 2.5.2.1] */
3114 set_default (BITPOS (tok_xdigit), '0', '9');
3115 set_default (BITPOS (tok_xdigit), 'A', 'F');
3116 set_default (BITPOS (tok_xdigit), 'a', 'f');
3119 if ((ctype->class_done & BITw (tok_blank)) == 0)
3120 /* "If this keyword [blank] is unspecified, the characters <space> and
3121 <tab> shall belong to this character class." [P1003.2, 2.5.2.1] */
3123 struct charseq *seq;
3125 seq = charmap_find_value (charmap, "space", 5);
3127 seq = charmap_find_value (charmap, "SP", 2);
3129 seq = charmap_find_value (charmap, "U00000020", 9);
3134 %s: character `%s' not defined while needed as default value"),
3135 "LC_CTYPE", "<space>");
3137 else if (seq->nbytes != 1)
3139 %s: character `%s' in charmap not representable with one byte"),
3140 "LC_CTYPE", "<space>");
3142 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3144 /* No need to search. */
3145 ELEM (ctype, class_collection, , L' ') |= BITw (tok_blank);
3148 seq = charmap_find_value (charmap, "tab", 3);
3150 seq = charmap_find_value (charmap, "U00000009", 9);
3155 %s: character `%s' not defined while needed as default value"),
3156 "LC_CTYPE", "<tab>");
3158 else if (seq->nbytes != 1)
3160 %s: character `%s' in charmap not representable with one byte"),
3161 "LC_CTYPE", "<tab>");
3163 ctype->class256_collection[seq->bytes[0]] |= BIT (tok_blank);
3165 /* No need to search. */
3166 ELEM (ctype, class_collection, , L'\t') |= BITw (tok_blank);
3169 if ((ctype->class_done & BITw (tok_graph)) == 0)
3170 /* "If this keyword [graph] is not specified, characters specified for
3171 the keywords `upper', `lower', `alpha', `digit', `xdigit' and `punct',
3172 shall belong to this character class." [P1003.2, 2.5.2.1] */
3174 unsigned long int mask = BIT (tok_upper) | BIT (tok_lower) |
3175 BIT (tok_alpha) | BIT (tok_digit) | BIT (tok_xdigit) | BIT (tok_punct);
3176 unsigned long int maskw = BITw (tok_upper) | BITw (tok_lower) |
3177 BITw (tok_alpha) | BITw (tok_digit) | BITw (tok_xdigit) |
3181 for (cnt = 0; cnt < ctype->class_collection_act; ++cnt)
3182 if ((ctype->class_collection[cnt] & maskw) != 0)
3183 ctype->class_collection[cnt] |= BITw (tok_graph);
3185 for (cnt = 0; cnt < 256; ++cnt)