Mercurial > hg > CbC > CbC_gcc
comparison libcpp/makeucnid.c @ 111:04ced10e8804
gcc 7
author | kono |
---|---|
date | Fri, 27 Oct 2017 22:46:09 +0900 |
parents | a06113de4d67 |
children | 84e7813d76e9 |
Comparison legend: equal, deleted, inserted, replaced
68:561a7518be6b | 111:04ced10e8804 |
---|---|
1 /* Make ucnid.h from various sources. | 1 /* Make ucnid.h from various sources. |
2 Copyright (C) 2005, 2009 Free Software Foundation, Inc. | 2 Copyright (C) 2005-2017 Free Software Foundation, Inc. |
3 | 3 |
4 This program is free software; you can redistribute it and/or modify it | 4 This program is free software; you can redistribute it and/or modify it |
5 under the terms of the GNU General Public License as published by the | 5 under the terms of the GNU General Public License as published by the |
6 Free Software Foundation; either version 3, or (at your option) any | 6 Free Software Foundation; either version 3, or (at your option) any |
7 later version. | 7 later version. |
27 #include <stdlib.h> | 27 #include <stdlib.h> |
28 | 28 |
29 enum { | 29 enum { |
30 C99 = 1, | 30 C99 = 1, |
31 CXX = 2, | 31 CXX = 2, |
32 digit = 4, | 32 N99 = 4, |
33 not_NFC = 8, | 33 C11 = 8, |
34 not_NFKC = 16, | 34 N11 = 16, |
35 maybe_not_NFC = 32 | 35 all_languages = C99 | CXX | C11, |
36 not_NFC = 32, | |
37 not_NFKC = 64, | |
38 maybe_not_NFC = 128 | |
36 }; | 39 }; |
37 | 40 |
38 static unsigned flags[65536]; | 41 #define NUM_CODE_POINTS 0x110000 |
39 static unsigned short decomp[65536][2]; | 42 #define MAX_CODE_POINT 0x10ffff |
40 static unsigned char combining_value[65536]; | 43 |
44 static unsigned flags[NUM_CODE_POINTS]; | |
45 static unsigned int all_decomp[NUM_CODE_POINTS][2]; | |
46 static unsigned int decomp[NUM_CODE_POINTS][2]; | |
47 static unsigned char combining_value[NUM_CODE_POINTS]; | |
41 | 48 |
42 /* Die! */ | 49 /* Die! */ |
43 | 50 |
44 static void | 51 static void |
45 fail (const char *s) | 52 fail (const char *s) |
46 { | 53 { |
47 fprintf (stderr, "%s\n", s); | 54 fprintf (stderr, "%s\n", s); |
48 exit (1); | 55 exit (1); |
49 } | 56 } |
50 | 57 |
51 /* Read ucnid.tab and set the C99 and CXX flags in flags[]. */ | 58 /* Read ucnid.tab and set the flags for language versions in flags[]. */ |
52 | 59 |
53 static void | 60 static void |
54 read_ucnid (const char *fname) | 61 read_ucnid (const char *fname) |
55 { | 62 { |
56 FILE *f = fopen (fname, "r"); | 63 FILE *f = fopen (fname, "r"); |
64 | 71 |
65 if (!fgets (line, sizeof (line), f)) | 72 if (!fgets (line, sizeof (line), f)) |
66 break; | 73 break; |
67 if (strcmp (line, "[C99]\n") == 0) | 74 if (strcmp (line, "[C99]\n") == 0) |
68 fl = C99; | 75 fl = C99; |
76 else if (strcmp (line, "[C99DIG]\n") == 0) | |
77 fl = C99|N99; | |
69 else if (strcmp (line, "[CXX]\n") == 0) | 78 else if (strcmp (line, "[CXX]\n") == 0) |
70 fl = CXX; | 79 fl = CXX; |
80 else if (strcmp (line, "[C11]\n") == 0) | |
81 fl = C11; | |
82 else if (strcmp (line, "[C11NOSTART]\n") == 0) | |
83 fl = C11|N11; | |
71 else if (isxdigit (line[0])) | 84 else if (isxdigit (line[0])) |
72 { | 85 { |
73 char *l = line; | 86 char *l = line; |
74 while (*l) | 87 while (*l) |
75 { | 88 { |
90 if (! isspace (*l)) | 103 if (! isspace (*l)) |
91 fail ("parsing ucnid.tab, junk after range"); | 104 fail ("parsing ucnid.tab, junk after range"); |
92 } | 105 } |
93 while (isspace (*l)) | 106 while (isspace (*l)) |
94 l++; | 107 l++; |
95 if (end > 0xFFFF) | 108 if (end > MAX_CODE_POINT) |
96 fail ("parsing ucnid.tab, end too large"); | 109 fail ("parsing ucnid.tab, end too large"); |
97 while (start <= end) | 110 while (start <= end) |
98 flags[start++] |= fl; | 111 flags[start++] |= fl; |
99 } | 112 } |
100 } | 113 } |
102 if (ferror (f)) | 115 if (ferror (f)) |
103 fail ("reading ucnid.tab"); | 116 fail ("reading ucnid.tab"); |
104 fclose (f); | 117 fclose (f); |
105 } | 118 } |
106 | 119 |
107 /* Read UnicodeData.txt and set the 'digit' flag, and | 120 /* Read UnicodeData.txt and fill in the 'decomp' table to be the |
108 also fill in the 'decomp' table to be the decompositions of | 121 decompositions of characters for which both the character |
109 characters for which both the character decomposed and all the code | 122 decomposed and all the code points in the decomposition are valid |
110 points in the decomposition are either C99 or CXX. */ | 123 for some supported language version, and the 'all_decomp' table to |
124 be the decompositions of all characters without those | |
125 constraints. */ | |
111 | 126 |
112 static void | 127 static void |
113 read_table (char *fname) | 128 read_table (char *fname) |
114 { | 129 { |
115 FILE * f = fopen (fname, "r"); | 130 FILE * f = fopen (fname, "r"); |
119 for (;;) | 134 for (;;) |
120 { | 135 { |
121 char line[256]; | 136 char line[256]; |
122 unsigned long codepoint, this_decomp[4]; | 137 unsigned long codepoint, this_decomp[4]; |
123 char *l; | 138 char *l; |
124 int i; | 139 int i, j; |
125 int decomp_useful; | 140 int decomp_useful; |
126 | 141 |
127 if (!fgets (line, sizeof (line), f)) | 142 if (!fgets (line, sizeof (line), f)) |
128 break; | 143 break; |
129 codepoint = strtoul (line, &l, 16); | 144 codepoint = strtoul (line, &l, 16); |
130 if (l == line || *l != ';') | 145 if (l == line || *l != ';') |
131 fail ("parsing UnicodeData.txt, reading code point"); | 146 fail ("parsing UnicodeData.txt, reading code point"); |
132 if (codepoint > 0xffff || ! (flags[codepoint] & (C99 | CXX))) | 147 if (codepoint > MAX_CODE_POINT) |
133 continue; | 148 fail ("parsing UnicodeData.txt, code point too large"); |
134 | 149 |
135 do { | 150 do { |
136 l++; | 151 l++; |
137 } while (*l != ';'); | 152 } while (*l != ';'); |
138 /* Category value; things starting with 'N' are numbers of some | 153 /* Category value. */ |
139 kind. */ | |
140 if (*++l == 'N') | |
141 flags[codepoint] |= digit; | |
142 | |
143 do { | 154 do { |
144 l++; | 155 l++; |
145 } while (*l != ';'); | 156 } while (*l != ';'); |
146 /* Canonical combining class; in NFC/NFKC, they must be increasing | 157 /* Canonical combining class; in NFC/NFKC, they must be increasing |
147 (or zero). */ | 158 (or zero). */ |
171 while (isspace (*l)) | 182 while (isspace (*l)) |
172 l++; | 183 l++; |
173 } | 184 } |
174 if (i > 2) /* Decomposition too long. */ | 185 if (i > 2) /* Decomposition too long. */ |
175 fail ("parsing UnicodeData.txt, decomposition too long"); | 186 fail ("parsing UnicodeData.txt, decomposition too long"); |
176 if (decomp_useful) | 187 for (j = 0; j < i; j++) |
188 all_decomp[codepoint][j] = this_decomp[j]; | |
189 if ((flags[codepoint] & all_languages) && decomp_useful) | |
177 while (--i >= 0) | 190 while (--i >= 0) |
178 decomp[codepoint][i] = this_decomp[i]; | 191 decomp[codepoint][i] = this_decomp[i]; |
179 } | 192 } |
180 if (ferror (f)) | 193 if (ferror (f)) |
181 fail ("reading UnicodeData.txt"); | 194 fail ("reading UnicodeData.txt"); |
208 continue; | 221 continue; |
209 | 222 |
210 start = strtoul (line, &l, 16); | 223 start = strtoul (line, &l, 16); |
211 if (l == line) | 224 if (l == line) |
212 fail ("parsing DerivedNormalizationProps.txt, reading start"); | 225 fail ("parsing DerivedNormalizationProps.txt, reading start"); |
213 if (start > 0xffff) | 226 if (start > MAX_CODE_POINT) |
214 continue; | 227 fail ("parsing DerivedNormalizationProps.txt, code point too large"); |
215 if (*l == '.' && l[1] == '.') | 228 if (*l == '.' && l[1] == '.') |
216 end = strtoul (l + 2, &l, 16); | 229 end = strtoul (l + 2, &l, 16); |
217 else | 230 else |
218 end = start; | 231 end = start; |
219 | 232 |
237 { | 250 { |
238 unsigned i; | 251 unsigned i; |
239 unsigned last_flag = flags[0]; | 252 unsigned last_flag = flags[0]; |
240 bool really_safe = decomp[0][0] == 0; | 253 bool really_safe = decomp[0][0] == 0; |
241 unsigned char last_combine = combining_value[0]; | 254 unsigned char last_combine = combining_value[0]; |
255 | |
256 printf ("static const struct ucnrange ucnranges[] = {\n"); | |
242 | 257 |
243 for (i = 1; i <= 65536; i++) | 258 for (i = 1; i <= NUM_CODE_POINTS; i++) |
244 if (i == 65536 | 259 if (i == NUM_CODE_POINTS |
245 || (flags[i] != last_flag && ((flags[i] | last_flag) & (C99 | CXX))) | 260 || (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages)) |
246 || really_safe != (decomp[i][0] == 0) | 261 || really_safe != (decomp[i][0] == 0) |
247 || combining_value[i] != last_combine) | 262 || combining_value[i] != last_combine) |
248 { | 263 { |
249 printf ("{ %s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", | 264 printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", |
250 last_flag & C99 ? "C99" : " 0", | 265 last_flag & C99 ? "C99" : " 0", |
251 last_flag & digit ? "DIG" : " 0", | 266 last_flag & N99 ? "N99" : " 0", |
252 last_flag & CXX ? "CXX" : " 0", | 267 last_flag & CXX ? "CXX" : " 0", |
268 last_flag & C11 ? "C11" : " 0", | |
269 last_flag & N11 ? "N11" : " 0", | |
253 really_safe ? "CID" : " 0", | 270 really_safe ? "CID" : " 0", |
254 last_flag & not_NFC ? " 0" : "NFC", | 271 last_flag & not_NFC ? " 0" : "NFC", |
255 last_flag & not_NFKC ? " 0" : "NKC", | 272 last_flag & not_NFKC ? " 0" : "NKC", |
256 last_flag & maybe_not_NFC ? "CTX" : " 0", | 273 last_flag & maybe_not_NFC ? "CTX" : " 0", |
257 combining_value[i - 1], | 274 combining_value[i - 1], |
258 i - 1); | 275 i - 1); |
259 last_flag = flags[i]; | 276 last_flag = flags[i]; |
260 last_combine = combining_value[0]; | 277 last_combine = combining_value[0]; |
261 really_safe = decomp[i][0] == 0; | 278 really_safe = decomp[i][0] == 0; |
262 } | 279 } |
280 | |
281 printf ("};\n"); | |
282 } | |
283 | |
284 /* Return whether a given character is valid in an identifier for some | |
285 supported language, either as itself or as a UCN. */ | |
286 | |
287 static bool | |
288 char_id_valid (unsigned int c) | |
289 { | |
290 return ((flags[c] & all_languages) | |
291 || (c == 0x24) | |
292 || (c >= 0x30 && c <= 0x39) | |
293 || (c >= 0x41 && c <= 0x5a) | |
294 || (c >= 0x61 && c <= 0x7a)); | |
295 } | |
296 | |
297 /* Write out the switch statement over characters for which it is | |
298 context-dependent whether they are in NFC. */ | |
299 | |
300 static void | |
301 write_context_switch (void) | |
302 { | |
303 unsigned i; | |
304 printf ("static bool\n" | |
305 "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n" | |
306 "{\n" | |
307 " switch (c)\n" | |
308 " {\n"); | |
309 for (i = 0; i < NUM_CODE_POINTS; i++) | |
310 { | |
311 bool found_case = false; | |
312 unsigned j; | |
313 if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC)) | |
314 continue; | |
315 if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2)) | |
316 continue; /* Hangul handled algorithmically. */ | |
317 printf (" case %#06x:\n" | |
318 " switch (p)\n" | |
319 "\t{\n", i); | |
320 /* If an NFC starter character decomposes with this character I | |
321 as the second character and an NFC starter character S as the | |
322 first character, that latter character as a previous | |
323 character means this character is not NFC. Furthermore, any | |
324 NFC starter character K made by a series of compositions of S | |
325 with combining characters whose combining class is greater | |
326 than that of I also means this character is not NFC. */ | |
327 for (j = 0; j < NUM_CODE_POINTS; j++) | |
328 { | |
329 unsigned s, k; | |
330 if (all_decomp[j][1] != i) | |
331 continue; | |
332 s = all_decomp[j][0]; | |
333 if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0) | |
334 continue; | |
335 if (char_id_valid (s)) | |
336 { | |
337 found_case = true; | |
338 printf ("\tcase %#06x:\n", s); | |
339 } | |
340 for (k = 0; k < NUM_CODE_POINTS; k++) | |
341 { | |
342 unsigned t = k; | |
343 if (k == s || !char_id_valid (k)) | |
344 continue; | |
345 while (all_decomp[t][1] != 0 | |
346 && combining_value[all_decomp[t][1]] > combining_value[i]) | |
347 { | |
348 if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0) | |
349 break; | |
350 t = all_decomp[t][0]; | |
351 } | |
352 if (t == s) | |
353 { | |
354 found_case = true; | |
355 printf ("\tcase %#06x:\n", k); | |
356 } | |
357 } | |
358 } | |
359 if (found_case) | |
360 printf ("\t return false;\n"); | |
361 else | |
362 printf ("\t/* Non-NFC cases not applicable to C/C++. */\n"); | |
363 printf ("\tdefault:\n" | |
364 "\t return true;\n" | |
365 "\t}\n\n"); | |
366 } | |
367 printf (" default:\n" | |
368 " cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n" | |
369 " return true;\n" | |
370 " }\n" | |
371 "}\n"); | |
263 } | 372 } |
264 | 373 |
265 /* Print out the huge copyright notice. */ | 374 /* Print out the huge copyright notice. */ |
266 | 375 |
267 static void | 376 static void |
268 write_copyright (void) | 377 write_copyright (void) |
269 { | 378 { |
270 static const char copyright[] = "\ | 379 static const char copyright[] = "\ |
271 /* Unicode characters and various properties.\n\ | 380 /* Unicode characters and various properties.\n\ |
272 Copyright (C) 2003, 2005 Free Software Foundation, Inc.\n\ | 381 Copyright (C) 2003-2017 Free Software Foundation, Inc.\n\ |
273 \n\ | 382 \n\ |
274 This program is free software; you can redistribute it and/or modify it\n\ | 383 This program is free software; you can redistribute it and/or modify it\n\ |
275 under the terms of the GNU General Public License as published by the\n\ | 384 under the terms of the GNU General Public License as published by the\n\ |
276 Free Software Foundation; either version 3, or (at your option) any\n\ | 385 Free Software Foundation; either version 3, or (at your option) any\n\ |
277 later version.\n\ | 386 later version.\n\ |
336 read_table (argv[2]); | 445 read_table (argv[2]); |
337 read_derived (argv[3]); | 446 read_derived (argv[3]); |
338 | 447 |
339 write_copyright (); | 448 write_copyright (); |
340 write_table (); | 449 write_table (); |
450 write_context_switch (); | |
341 return 0; | 451 return 0; |
342 } | 452 } |