comparison libcpp/makeucnid.c @ 111:04ced10e8804

gcc 7
author kono
date Fri, 27 Oct 2017 22:46:09 +0900
parents a06113de4d67
children 84e7813d76e9
comparison
equal deleted inserted replaced
68:561a7518be6b 111:04ced10e8804
1 /* Make ucnid.h from various sources. 1 /* Make ucnid.h from various sources.
2 Copyright (C) 2005, 2009 Free Software Foundation, Inc. 2 Copyright (C) 2005-2017 Free Software Foundation, Inc.
3 3
4 This program is free software; you can redistribute it and/or modify it 4 This program is free software; you can redistribute it and/or modify it
5 under the terms of the GNU General Public License as published by the 5 under the terms of the GNU General Public License as published by the
6 Free Software Foundation; either version 3, or (at your option) any 6 Free Software Foundation; either version 3, or (at your option) any
7 later version. 7 later version.
27 #include <stdlib.h> 27 #include <stdlib.h>
28 28
29 enum { 29 enum {
30 C99 = 1, 30 C99 = 1,
31 CXX = 2, 31 CXX = 2,
32 digit = 4, 32 N99 = 4,
33 not_NFC = 8, 33 C11 = 8,
34 not_NFKC = 16, 34 N11 = 16,
35 maybe_not_NFC = 32 35 all_languages = C99 | CXX | C11,
36 not_NFC = 32,
37 not_NFKC = 64,
38 maybe_not_NFC = 128
36 }; 39 };
37 40
38 static unsigned flags[65536]; 41 #define NUM_CODE_POINTS 0x110000
39 static unsigned short decomp[65536][2]; 42 #define MAX_CODE_POINT 0x10ffff
40 static unsigned char combining_value[65536]; 43
44 static unsigned flags[NUM_CODE_POINTS];
45 static unsigned int all_decomp[NUM_CODE_POINTS][2];
46 static unsigned int decomp[NUM_CODE_POINTS][2];
47 static unsigned char combining_value[NUM_CODE_POINTS];
41 48
42 /* Print S and a newline to stderr, then exit with status 1. */ 49 /* Print S and a newline to stderr, then exit with status 1. */
43 50
44 static void 51 static void
45 fail (const char *s) 52 fail (const char *s)
46 { 53 {
47 fprintf (stderr, "%s\n", s); 54 fprintf (stderr, "%s\n", s);
48 exit (1); 55 exit (1);
49 } 56 }
50 57
51 /* Read ucnid.tab and set the C99 and CXX flags in flags[]. */ 58 /* Read ucnid.tab and set the flags for language versions in flags[]. */
52 59
53 static void 60 static void
54 read_ucnid (const char *fname) 61 read_ucnid (const char *fname)
55 { 62 {
56 FILE *f = fopen (fname, "r"); 63 FILE *f = fopen (fname, "r");
64 71
65 if (!fgets (line, sizeof (line), f)) 72 if (!fgets (line, sizeof (line), f))
66 break; 73 break;
67 if (strcmp (line, "[C99]\n") == 0) 74 if (strcmp (line, "[C99]\n") == 0)
68 fl = C99; 75 fl = C99;
76 else if (strcmp (line, "[C99DIG]\n") == 0)
77 fl = C99|N99;
69 else if (strcmp (line, "[CXX]\n") == 0) 78 else if (strcmp (line, "[CXX]\n") == 0)
70 fl = CXX; 79 fl = CXX;
80 else if (strcmp (line, "[C11]\n") == 0)
81 fl = C11;
82 else if (strcmp (line, "[C11NOSTART]\n") == 0)
83 fl = C11|N11;
71 else if (isxdigit (line[0])) 84 else if (isxdigit (line[0]))
72 { 85 {
73 char *l = line; 86 char *l = line;
74 while (*l) 87 while (*l)
75 { 88 {
90 if (! isspace (*l)) 103 if (! isspace (*l))
91 fail ("parsing ucnid.tab, junk after range"); 104 fail ("parsing ucnid.tab, junk after range");
92 } 105 }
93 while (isspace (*l)) 106 while (isspace (*l))
94 l++; 107 l++;
95 if (end > 0xFFFF) 108 if (end > MAX_CODE_POINT)
96 fail ("parsing ucnid.tab, end too large"); 109 fail ("parsing ucnid.tab, end too large");
97 while (start <= end) 110 while (start <= end)
98 flags[start++] |= fl; 111 flags[start++] |= fl;
99 } 112 }
100 } 113 }
102 if (ferror (f)) 115 if (ferror (f))
103 fail ("reading ucnid.tab"); 116 fail ("reading ucnid.tab");
104 fclose (f); 117 fclose (f);
105 } 118 }
106 119
107 /* Read UnicodeData.txt and set the 'digit' flag, and 120 /* Read UnicodeData.txt and fill in the 'decomp' table to be the
108 also fill in the 'decomp' table to be the decompositions of 121 decompositions of characters for which both the character
109 characters for which both the character decomposed and all the code 122 decomposed and all the code points in the decomposition are valid
110 points in the decomposition are either C99 or CXX. */ 123 for some supported language version, and the 'all_decomp' table to
124 be the decompositions of all characters without those
125 constraints. */
111 126
112 static void 127 static void
113 read_table (char *fname) 128 read_table (char *fname)
114 { 129 {
115 FILE * f = fopen (fname, "r"); 130 FILE * f = fopen (fname, "r");
119 for (;;) 134 for (;;)
120 { 135 {
121 char line[256]; 136 char line[256];
122 unsigned long codepoint, this_decomp[4]; 137 unsigned long codepoint, this_decomp[4];
123 char *l; 138 char *l;
124 int i; 139 int i, j;
125 int decomp_useful; 140 int decomp_useful;
126 141
127 if (!fgets (line, sizeof (line), f)) 142 if (!fgets (line, sizeof (line), f))
128 break; 143 break;
129 codepoint = strtoul (line, &l, 16); 144 codepoint = strtoul (line, &l, 16);
130 if (l == line || *l != ';') 145 if (l == line || *l != ';')
131 fail ("parsing UnicodeData.txt, reading code point"); 146 fail ("parsing UnicodeData.txt, reading code point");
132 if (codepoint > 0xffff || ! (flags[codepoint] & (C99 | CXX))) 147 if (codepoint > MAX_CODE_POINT)
133 continue; 148 fail ("parsing UnicodeData.txt, code point too large");
134 149
135 do { 150 do {
136 l++; 151 l++;
137 } while (*l != ';'); 152 } while (*l != ';');
138 /* Category value; things starting with 'N' are numbers of some 153 /* Category value. */
139 kind. */
140 if (*++l == 'N')
141 flags[codepoint] |= digit;
142
143 do { 154 do {
144 l++; 155 l++;
145 } while (*l != ';'); 156 } while (*l != ';');
146 /* Canonical combining class; in NFC/NFKC, they must be increasing 157 /* Canonical combining class; in NFC/NFKC, they must be increasing
147 (or zero). */ 158 (or zero). */
171 while (isspace (*l)) 182 while (isspace (*l))
172 l++; 183 l++;
173 } 184 }
174 if (i > 2) /* Decomposition too long. */ 185 if (i > 2) /* Decomposition too long. */
175 fail ("parsing UnicodeData.txt, decomposition too long"); 186 fail ("parsing UnicodeData.txt, decomposition too long");
176 if (decomp_useful) 187 for (j = 0; j < i; j++)
188 all_decomp[codepoint][j] = this_decomp[j];
189 if ((flags[codepoint] & all_languages) && decomp_useful)
177 while (--i >= 0) 190 while (--i >= 0)
178 decomp[codepoint][i] = this_decomp[i]; 191 decomp[codepoint][i] = this_decomp[i];
179 } 192 }
180 if (ferror (f)) 193 if (ferror (f))
181 fail ("reading UnicodeData.txt"); 194 fail ("reading UnicodeData.txt");
208 continue; 221 continue;
209 222
210 start = strtoul (line, &l, 16); 223 start = strtoul (line, &l, 16);
211 if (l == line) 224 if (l == line)
212 fail ("parsing DerivedNormalizationProps.txt, reading start"); 225 fail ("parsing DerivedNormalizationProps.txt, reading start");
213 if (start > 0xffff) 226 if (start > MAX_CODE_POINT)
214 continue; 227 fail ("parsing DerivedNormalizationProps.txt, code point too large");
215 if (*l == '.' && l[1] == '.') 228 if (*l == '.' && l[1] == '.')
216 end = strtoul (l + 2, &l, 16); 229 end = strtoul (l + 2, &l, 16);
217 else 230 else
218 end = start; 231 end = start;
219 232
237 { 250 { /* Print one row per run of consecutive code points; a run ends when the flags (where a language bit is involved), the has-decomposition state, or the combining class changes. */
238 unsigned i; 251 unsigned i;
239 unsigned last_flag = flags[0]; 252 unsigned last_flag = flags[0];
240 bool really_safe = decomp[0][0] == 0; 253 bool really_safe = decomp[0][0] == 0; /* True when decomp[] holds no decomposition for the current run. */
241 unsigned char last_combine = combining_value[0]; 254 unsigned char last_combine = combining_value[0];
255
256 printf ("static const struct ucnrange ucnranges[] = {\n");
242 257
243 for (i = 1; i <= 65536; i++) 258 for (i = 1; i <= NUM_CODE_POINTS; i++)
244 if (i == 65536 259 if (i == NUM_CODE_POINTS /* One past the last code point: flush the final run; short-circuits the array reads in the conditions below. */
245 || (flags[i] != last_flag && ((flags[i] | last_flag) & (C99 | CXX))) 260 || (flags[i] != last_flag && ((flags[i] | last_flag) & all_languages))
246 || really_safe != (decomp[i][0] == 0) 261 || really_safe != (decomp[i][0] == 0)
247 || combining_value[i] != last_combine) 262 || combining_value[i] != last_combine)
248 { 263 {
249 printf ("{ %s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n", 264 printf ("{ %s|%s|%s|%s|%s|%s|%s|%s|%s, %3d, %#06x },\n",
250 last_flag & C99 ? "C99" : " 0", 265 last_flag & C99 ? "C99" : " 0",
251 last_flag & digit ? "DIG" : " 0", 266 last_flag & N99 ? "N99" : " 0",
252 last_flag & CXX ? "CXX" : " 0", 267 last_flag & CXX ? "CXX" : " 0",
268 last_flag & C11 ? "C11" : " 0",
269 last_flag & N11 ? "N11" : " 0",
253 really_safe ? "CID" : " 0", 270 really_safe ? "CID" : " 0",
254 last_flag & not_NFC ? " 0" : "NFC", 271 last_flag & not_NFC ? " 0" : "NFC", /* NFC and NKC are printed when the corresponding not_* bit is clear. */
255 last_flag & not_NFKC ? " 0" : "NKC", 272 last_flag & not_NFKC ? " 0" : "NKC",
256 last_flag & maybe_not_NFC ? "CTX" : " 0", 273 last_flag & maybe_not_NFC ? "CTX" : " 0",
257 combining_value[i - 1], 274 combining_value[i - 1], /* Combining class of the last code point in the run. */
258 i - 1); 275 i - 1); /* Last code point of the run. */
259 last_flag = flags[i]; 276 last_flag = flags[i]; /* NOTE(review): on the final iteration i equals the array length, so this read (and decomp[i] below) is one element past the end - confirm and guard. */
260 last_combine = combining_value[0]; 277 last_combine = combining_value[0]; /* NOTE(review): index 0 looks unintended - presumably combining_value[i]; as written last_combine never tracks the new run. Confirm. */
261 really_safe = decomp[i][0] == 0; 278 really_safe = decomp[i][0] == 0;
262 } 279 }
280
281 printf ("};\n");
282 }
283
284 /* Return whether code point C is valid in an identifier for some
285 supported language, either as itself or as a UCN: true when flags[C] has any all_languages bit set, or C is ASCII '$', a digit, or a letter. C indexes flags[] unchecked, so it must be below NUM_CODE_POINTS. */
286
287 static bool
288 char_id_valid (unsigned int c)
289 {
290 return ((flags[c] & all_languages)
291 || (c == 0x24) /* '$' */
292 || (c >= 0x30 && c <= 0x39) /* '0'..'9' */
293 || (c >= 0x41 && c <= 0x5a) /* 'A'..'Z' */
294 || (c >= 0x61 && c <= 0x7a)); /* 'a'..'z' */
295 }
296
297 /* Write out the switch statement over characters for which it is
298 context-dependent whether they are in NFC. The text printed to stdout defines check_nfc (pfile, c, p): an outer switch on C with one case per code point that is valid in some language and flagged maybe_not_NFC (Hangul ranges excluded), each holding an inner switch on the previous character P that returns false for the listed predecessors and true otherwise. */
299
300 static void
301 write_context_switch (void)
302 {
303 unsigned i;
304 printf ("static bool\n"
305 "check_nfc (cpp_reader *pfile, cppchar_t c, cppchar_t p)\n"
306 "{\n"
307 " switch (c)\n"
308 " {\n");
309 for (i = 0; i < NUM_CODE_POINTS; i++)
310 {
311 bool found_case = false; /* Set once any inner case label has been printed for I. */
312 unsigned j;
313 if (!(flags[i] & all_languages) || !(flags[i] & maybe_not_NFC)) /* Only code points valid in some language and flagged maybe_not_NFC get an outer case. */
314 continue;
315 if ((i >= 0x1161 && i <= 0x1175) || (i >= 0x11A8 && i <= 0x11C2))
316 continue; /* Hangul handled algorithmically. */
317 printf (" case %#06x:\n"
318 " switch (p)\n"
319 "\t{\n", i);
320 /* If an NFC starter character decomposes with this character I
321 as the second character and an NFC starter character S as the
322 first character, that latter character as a previous
323 character means this character is not NFC. Furthermore, any
324 NFC starter character K made by a series of compositions of S
325 with combining characters whose combining class is greater
326 than that of I also means this character is not NFC. */
327 for (j = 0; j < NUM_CODE_POINTS; j++)
328 {
329 unsigned s, k;
330 if (all_decomp[j][1] != i) /* Consider only J whose decomposition has I as its second element. */
331 continue;
332 s = all_decomp[j][0];
333 if (combining_value[s] != 0 || (flags[s] & not_NFC) != 0) /* S must have combining class 0 and not be flagged not_NFC. */
334 continue;
335 if (char_id_valid (s))
336 {
337 found_case = true;
338 printf ("\tcase %#06x:\n", s);
339 }
340 for (k = 0; k < NUM_CODE_POINTS; k++)
341 {
342 unsigned t = k;
343 if (k == s || !char_id_valid (k))
344 continue;
345 while (all_decomp[t][1] != 0 /* Follow first elements of decompositions from K while the second element's combining class exceeds that of I. */
346 && combining_value[all_decomp[t][1]] > combining_value[i])
347 {
348 if (combining_value[t] != 0 || (flags[t] & not_NFC) != 0)
349 break;
350 t = all_decomp[t][0];
351 }
352 if (t == s) /* K reduces to S, so K as previous character also gets a case. */
353 {
354 found_case = true;
355 printf ("\tcase %#06x:\n", k);
356 }
357 }
358 }
359 if (found_case)
360 printf ("\t return false;\n");
361 else
362 printf ("\t/* Non-NFC cases not applicable to C/C++. */\n");
363 printf ("\tdefault:\n"
364 "\t return true;\n"
365 "\t}\n\n");
366 }
367 printf (" default:\n" /* NOTE(review): the generated function is named check_nfc but the message below says NFKC - confirm the intended wording. */
368 " cpp_error (pfile, CPP_DL_ICE, \"Character %%x might not be NFKC\", c);\n"
369 " return true;\n"
370 " }\n"
371 "}\n");
263 } 372 }
264 373
265 /* Print out the huge copyright notice. */ 374 /* Print out the huge copyright notice. */
266 375
267 static void 376 static void
268 write_copyright (void) 377 write_copyright (void)
269 { 378 {
270 static const char copyright[] = "\ 379 static const char copyright[] = "\
271 /* Unicode characters and various properties.\n\ 380 /* Unicode characters and various properties.\n\
272 Copyright (C) 2003, 2005 Free Software Foundation, Inc.\n\ 381 Copyright (C) 2003-2017 Free Software Foundation, Inc.\n\
273 \n\ 382 \n\
274 This program is free software; you can redistribute it and/or modify it\n\ 383 This program is free software; you can redistribute it and/or modify it\n\
275 under the terms of the GNU General Public License as published by the\n\ 384 under the terms of the GNU General Public License as published by the\n\
276 Free Software Foundation; either version 3, or (at your option) any\n\ 385 Free Software Foundation; either version 3, or (at your option) any\n\
277 later version.\n\ 386 later version.\n\
336 read_table (argv[2]); 445 read_table (argv[2]);
337 read_derived (argv[3]); 446 read_derived (argv[3]);
338 447
339 write_copyright (); 448 write_copyright ();
340 write_table (); 449 write_table ();
450 write_context_switch ();
341 return 0; 451 return 0;
342 } 452 }