0
|
1 /* CPP Library - charsets
|
|
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008, 2009
|
|
3 Free Software Foundation, Inc.
|
|
4
|
|
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
|
|
6
|
|
7 This program is free software; you can redistribute it and/or modify it
|
|
8 under the terms of the GNU General Public License as published by the
|
|
9 Free Software Foundation; either version 3, or (at your option) any
|
|
10 later version.
|
|
11
|
|
12 This program is distributed in the hope that it will be useful,
|
|
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15 GNU General Public License for more details.
|
|
16
|
|
17 You should have received a copy of the GNU General Public License
|
|
18 along with this program; see the file COPYING3. If not see
|
|
19 <http://www.gnu.org/licenses/>. */
|
|
20
|
|
21 #include "config.h"
|
|
22 #include "system.h"
|
|
23 #include "cpplib.h"
|
|
24 #include "internal.h"
|
|
25
|
|
26 /* Character set handling for C-family languages.
|
|
27
|
|
28 Terminological note: In what follows, "charset" or "character set"
|
|
29 will be taken to mean both an abstract set of characters and an
|
|
30 encoding for that set.
|
|
31
|
|
32 The C99 standard discusses two character sets: source and execution.
|
|
33 The source character set is used for internal processing in translation
|
|
34 phases 1 through 4; the execution character set is used thereafter.
|
|
35 Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
|
|
36 character encodings (see 3.7.2, 3.7.3 for the standardese meanings
|
|
37 of these terms). Furthermore, the "basic character set" (listed in
|
|
38 5.2.1p3) is to be encoded in each with values one byte wide, and is
|
|
39 to appear in the initial shift state.
|
|
40
|
|
41 It is not explicitly mentioned, but there is also a "wide execution
|
|
42 character set" used to encode wide character constants and wide
|
|
43 string literals; this is supposed to be the result of applying the
|
|
44 standard library function mbstowcs() to an equivalent narrow string
|
|
45 (6.4.5p5). However, the behavior of hexadecimal and octal
|
|
46 \-escapes is at odds with this; they are supposed to be translated
|
|
47 directly to wchar_t values (6.4.4.4p5,6).
|
|
48
|
|
49 The source character set is not necessarily the character set used
|
|
50 to encode physical source files on disk; translation phase 1 converts
|
|
51 from whatever that encoding is to the source character set.
|
|
52
|
|
53 The presence of universal character names in C99 (6.4.3 et seq.)
|
|
54 forces the source character set to be isomorphic to ISO 10646,
|
|
55 that is, Unicode. There is no such constraint on the execution
|
|
56 character set; note also that the conversion from source to
|
|
57 execution character set does not occur for identifiers (5.1.1.2p1#5).
|
|
58
|
|
59 For convenience of implementation, the source character set's
|
|
60 encoding of the basic character set should be identical to the
|
|
61 execution character set OF THE HOST SYSTEM's encoding of the basic
|
|
62 character set, and it should not be a state-dependent encoding.
|
|
63
|
|
64 cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
|
|
65 depending on whether the host is based on ASCII or EBCDIC (see
|
|
66 respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
|
|
67 Technical Report #16). With limited exceptions, it relies on the
|
|
68 system library's iconv() primitive to do charset conversion
|
|
69 (specified in SUSv2). */
|
|
70
|
|
#if !HAVE_ICONV
/* No system iconv is available.  The calls to iconv(), iconv_open()
   and iconv_close() further down are protected only by ordinary `if'
   statements on a compile-time constant, not by #if, so the names must
   still expand to something that compiles and links.  The stand-ins
   below fail with EINVAL (or do nothing, for iconv_close).  */
# define ICONV_CONST
# define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
# define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
# define iconv_close(x) (void)0
#endif
|
|
80
|
|
/* Choose the source character set from the host's basic character
   set, along with the upper bound used when checking that a value
   could be a basic source character on such a host.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
# define SOURCE_CHARSET "UTF-8"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
# define SOURCE_CHARSET "UTF-EBCDIC"
# define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
# error "Unrecognized basic host character set"
#endif
|
|
90
|
|
/* Hosts whose <errno.h> lacks EILSEQ report ill-formed input as
   EINVAL instead.  */
#if !defined (EILSEQ)
# define EILSEQ EINVAL
#endif
|
|
94
|
|
95 /* This structure is used for a resizable string buffer throughout. */
|
|
96 /* Don't call it strbuf, as that conflicts with unistd.h on systems
|
|
97 such as DYNIX/ptx where unistd.h includes stropts.h. */
|
|
98 struct _cpp_strbuf
|
|
99 {
|
|
100 uchar *text;
|
|
101 size_t asize;
|
|
102 size_t len;
|
|
103 };
|
|
104
|
|
/* Output buffers are enlarged in steps of this many bytes.  The figure
   was picked with a single 80-column line in mind, allowing for
   conversions (ASCII to UTF-32, for example) that quadruple the size
   of the text.  */
#define OUTBUF_BLOCK_SIZE 256
|
|
109
|
|
110 /* Conversions between UTF-8 and UTF-16/32 are implemented by custom
|
|
111 logic. This is because a depressing number of systems lack iconv,
|
|
112 or have iconv libraries that do not do these conversions, so
|
|
113 we need a fallback implementation for them. To ensure the fallback
|
|
114 doesn't break due to neglect, it is used on all systems.
|
|
115
|
|
116 UTF-32 encoding is nice and simple: a four-byte binary number,
|
|
117 constrained to the range 00000000-7FFFFFFF to avoid questions of
|
|
118 signedness. We do have to cope with big- and little-endian
|
|
119 variants.
|
|
120
|
|
121 UTF-16 encoding uses two-byte binary numbers, again in big- and
|
|
122 little-endian variants, for all values in the 00000000-0000FFFF
|
|
123 range. Values in the 00010000-0010FFFF range are encoded as pairs
|
|
124 of two-byte numbers, called "surrogate pairs": given a number S in
|
|
125 this range, it is mapped to a pair (H, L) as follows:
|
|
126
|
|
127 H = (S - 0x10000) / 0x400 + 0xD800
|
|
128 L = (S - 0x10000) % 0x400 + 0xDC00
|
|
129
|
|
130 Two-byte values in the D800...DFFF range are ill-formed except as a
|
|
131 component of a surrogate pair. Even if the encoding within a
|
|
132 two-byte value is little-endian, the H member of the surrogate pair
|
|
133 comes first.
|
|
134
|
|
135 There is no way to encode values in the 00110000-7FFFFFFF range,
|
|
136 which is not currently a problem as there are no assigned code
|
|
137 points in that range; however, the author expects that it will
|
|
138 eventually become necessary to abandon UTF-16 due to this
|
|
139 limitation. Note also that, because of these pairs, UTF-16 does
|
|
140 not meet the requirements of the C standard for a wide character
|
|
141 encoding (see 3.7.3 and 6.4.4.4p11).
|
|
142
|
|
143 UTF-8 encoding looks like this:
|
|
144
|
|
145 value range encoded as
|
|
146 00000000-0000007F 0xxxxxxx
|
|
147 00000080-000007FF 110xxxxx 10xxxxxx
|
|
148 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
|
|
149 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
150 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
151 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
152
|
|
153 Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
|
|
154 which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
|
|
155 never occur. Note also that any value that can be encoded by a
|
|
156 given row of the table can also be encoded by all successive rows,
|
|
157 but this is not done; only the shortest possible encoding for any
|
|
158 given value is valid. For instance, the character 07C0 could be
|
|
159 encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
|
|
160 FC 80 80 80 9F 80. Only the first is valid.
|
|
161
|
|
162 An implementation note: the transformation from UTF-16 to UTF-8, or
|
|
163 vice versa, is easiest done by using UTF-32 as an intermediary. */
|
|
164
|
|
165 /* Internal primitives which go from an UTF-8 byte stream to native-endian
|
|
166 UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
|
|
167 operation in several places below. */
|
|
168 static inline int
|
|
169 one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
|
|
170 cppchar_t *cp)
|
|
171 {
|
|
172 static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
|
|
173 static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
|
|
174
|
|
175 cppchar_t c;
|
|
176 const uchar *inbuf = *inbufp;
|
|
177 size_t nbytes, i;
|
|
178
|
|
179 if (*inbytesleftp < 1)
|
|
180 return EINVAL;
|
|
181
|
|
182 c = *inbuf;
|
|
183 if (c < 0x80)
|
|
184 {
|
|
185 *cp = c;
|
|
186 *inbytesleftp -= 1;
|
|
187 *inbufp += 1;
|
|
188 return 0;
|
|
189 }
|
|
190
|
|
191 /* The number of leading 1-bits in the first byte indicates how many
|
|
192 bytes follow. */
|
|
193 for (nbytes = 2; nbytes < 7; nbytes++)
|
|
194 if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
|
|
195 goto found;
|
|
196 return EILSEQ;
|
|
197 found:
|
|
198
|
|
199 if (*inbytesleftp < nbytes)
|
|
200 return EINVAL;
|
|
201
|
|
202 c = (c & masks[nbytes-1]);
|
|
203 inbuf++;
|
|
204 for (i = 1; i < nbytes; i++)
|
|
205 {
|
|
206 cppchar_t n = *inbuf++;
|
|
207 if ((n & 0xC0) != 0x80)
|
|
208 return EILSEQ;
|
|
209 c = ((c << 6) + (n & 0x3F));
|
|
210 }
|
|
211
|
|
212 /* Make sure the shortest possible encoding was used. */
|
|
213 if (c <= 0x7F && nbytes > 1) return EILSEQ;
|
|
214 if (c <= 0x7FF && nbytes > 2) return EILSEQ;
|
|
215 if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
|
|
216 if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
|
|
217 if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
|
|
218
|
|
219 /* Make sure the character is valid. */
|
|
220 if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
|
|
221
|
|
222 *cp = c;
|
|
223 *inbufp = inbuf;
|
|
224 *inbytesleftp -= nbytes;
|
|
225 return 0;
|
|
226 }
|
|
227
|
|
228 static inline int
|
|
229 one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
|
|
230 {
|
|
231 static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
|
|
232 static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
|
|
233 size_t nbytes;
|
|
234 uchar buf[6], *p = &buf[6];
|
|
235 uchar *outbuf = *outbufp;
|
|
236
|
|
237 nbytes = 1;
|
|
238 if (c < 0x80)
|
|
239 *--p = c;
|
|
240 else
|
|
241 {
|
|
242 do
|
|
243 {
|
|
244 *--p = ((c & 0x3F) | 0x80);
|
|
245 c >>= 6;
|
|
246 nbytes++;
|
|
247 }
|
|
248 while (c >= 0x3F || (c & limits[nbytes-1]));
|
|
249 *--p = (c | masks[nbytes-1]);
|
|
250 }
|
|
251
|
|
252 if (*outbytesleftp < nbytes)
|
|
253 return E2BIG;
|
|
254
|
|
255 while (p < &buf[6])
|
|
256 *outbuf++ = *p++;
|
|
257 *outbytesleftp -= nbytes;
|
|
258 *outbufp = outbuf;
|
|
259 return 0;
|
|
260 }
|
|
261
|
|
262 /* The following four functions transform one character between the two
|
|
263 encodings named in the function name. All have the signature
|
|
264 int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
|
|
265 uchar **outbufp, size_t *outbytesleftp)
|
|
266
|
|
267 BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
|
|
268 interpreted as a boolean indicating whether big-endian or
|
|
269 little-endian encoding is to be used for the member of the pair
|
|
270 that is not UTF-8.
|
|
271
|
|
272 INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
|
|
273 do for iconv.
|
|
274
|
|
275 The return value is either 0 for success, or an errno value for
|
|
276 failure, which may be E2BIG (need more space), EILSEQ (ill-formed
|
|
277 input sequence), or EINVAL (incomplete input sequence). */
|
|
278
|
|
279 static inline int
|
|
280 one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
|
|
281 uchar **outbufp, size_t *outbytesleftp)
|
|
282 {
|
|
283 uchar *outbuf;
|
|
284 cppchar_t s = 0;
|
|
285 int rval;
|
|
286
|
|
287 /* Check for space first, since we know exactly how much we need. */
|
|
288 if (*outbytesleftp < 4)
|
|
289 return E2BIG;
|
|
290
|
|
291 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
|
|
292 if (rval)
|
|
293 return rval;
|
|
294
|
|
295 outbuf = *outbufp;
|
|
296 outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
|
|
297 outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
|
|
298 outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
|
|
299 outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
|
|
300
|
|
301 *outbufp += 4;
|
|
302 *outbytesleftp -= 4;
|
|
303 return 0;
|
|
304 }
|
|
305
|
|
306 static inline int
|
|
307 one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
|
|
308 uchar **outbufp, size_t *outbytesleftp)
|
|
309 {
|
|
310 cppchar_t s;
|
|
311 int rval;
|
|
312 const uchar *inbuf;
|
|
313
|
|
314 if (*inbytesleftp < 4)
|
|
315 return EINVAL;
|
|
316
|
|
317 inbuf = *inbufp;
|
|
318
|
|
319 s = inbuf[bigend ? 0 : 3] << 24;
|
|
320 s += inbuf[bigend ? 1 : 2] << 16;
|
|
321 s += inbuf[bigend ? 2 : 1] << 8;
|
|
322 s += inbuf[bigend ? 3 : 0];
|
|
323
|
|
324 if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
|
|
325 return EILSEQ;
|
|
326
|
|
327 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
|
|
328 if (rval)
|
|
329 return rval;
|
|
330
|
|
331 *inbufp += 4;
|
|
332 *inbytesleftp -= 4;
|
|
333 return 0;
|
|
334 }
|
|
335
|
|
336 static inline int
|
|
337 one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
|
|
338 uchar **outbufp, size_t *outbytesleftp)
|
|
339 {
|
|
340 int rval;
|
|
341 cppchar_t s = 0;
|
|
342 const uchar *save_inbuf = *inbufp;
|
|
343 size_t save_inbytesleft = *inbytesleftp;
|
|
344 uchar *outbuf = *outbufp;
|
|
345
|
|
346 rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
|
|
347 if (rval)
|
|
348 return rval;
|
|
349
|
|
350 if (s > 0x0010FFFF)
|
|
351 {
|
|
352 *inbufp = save_inbuf;
|
|
353 *inbytesleftp = save_inbytesleft;
|
|
354 return EILSEQ;
|
|
355 }
|
|
356
|
|
357 if (s < 0xFFFF)
|
|
358 {
|
|
359 if (*outbytesleftp < 2)
|
|
360 {
|
|
361 *inbufp = save_inbuf;
|
|
362 *inbytesleftp = save_inbytesleft;
|
|
363 return E2BIG;
|
|
364 }
|
|
365 outbuf[bigend ? 1 : 0] = (s & 0x00FF);
|
|
366 outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
|
|
367
|
|
368 *outbufp += 2;
|
|
369 *outbytesleftp -= 2;
|
|
370 return 0;
|
|
371 }
|
|
372 else
|
|
373 {
|
|
374 cppchar_t hi, lo;
|
|
375
|
|
376 if (*outbytesleftp < 4)
|
|
377 {
|
|
378 *inbufp = save_inbuf;
|
|
379 *inbytesleftp = save_inbytesleft;
|
|
380 return E2BIG;
|
|
381 }
|
|
382
|
|
383 hi = (s - 0x10000) / 0x400 + 0xD800;
|
|
384 lo = (s - 0x10000) % 0x400 + 0xDC00;
|
|
385
|
|
386 /* Even if we are little-endian, put the high surrogate first.
|
|
387 ??? Matches practice? */
|
|
388 outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
|
|
389 outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
|
|
390 outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
|
|
391 outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
|
|
392
|
|
393 *outbufp += 4;
|
|
394 *outbytesleftp -= 4;
|
|
395 return 0;
|
|
396 }
|
|
397 }
|
|
398
|
|
399 static inline int
|
|
400 one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
|
|
401 uchar **outbufp, size_t *outbytesleftp)
|
|
402 {
|
|
403 cppchar_t s;
|
|
404 const uchar *inbuf = *inbufp;
|
|
405 int rval;
|
|
406
|
|
407 if (*inbytesleftp < 2)
|
|
408 return EINVAL;
|
|
409 s = inbuf[bigend ? 0 : 1] << 8;
|
|
410 s += inbuf[bigend ? 1 : 0];
|
|
411
|
|
412 /* Low surrogate without immediately preceding high surrogate is invalid. */
|
|
413 if (s >= 0xDC00 && s <= 0xDFFF)
|
|
414 return EILSEQ;
|
|
415 /* High surrogate must have a following low surrogate. */
|
|
416 else if (s >= 0xD800 && s <= 0xDBFF)
|
|
417 {
|
|
418 cppchar_t hi = s, lo;
|
|
419 if (*inbytesleftp < 4)
|
|
420 return EINVAL;
|
|
421
|
|
422 lo = inbuf[bigend ? 2 : 3] << 8;
|
|
423 lo += inbuf[bigend ? 3 : 2];
|
|
424
|
|
425 if (lo < 0xDC00 || lo > 0xDFFF)
|
|
426 return EILSEQ;
|
|
427
|
|
428 s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
|
|
429 }
|
|
430
|
|
431 rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
|
|
432 if (rval)
|
|
433 return rval;
|
|
434
|
|
435 /* Success - update the input pointers (one_cppchar_to_utf8 has done
|
|
436 the output pointers for us). */
|
|
437 if (s <= 0xFFFF)
|
|
438 {
|
|
439 *inbufp += 2;
|
|
440 *inbytesleftp -= 2;
|
|
441 }
|
|
442 else
|
|
443 {
|
|
444 *inbufp += 4;
|
|
445 *inbytesleftp -= 4;
|
|
446 }
|
|
447 return 0;
|
|
448 }
|
|
449
|
|
450 /* Helper routine for the next few functions. The 'const' on
|
|
451 one_conversion means that we promise not to modify what function is
|
|
452 pointed to, which lets the inliner see through it. */
|
|
453
|
|
454 static inline bool
|
|
455 conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
|
|
456 uchar **, size_t *),
|
|
457 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
|
|
458 {
|
|
459 const uchar *inbuf;
|
|
460 uchar *outbuf;
|
|
461 size_t inbytesleft, outbytesleft;
|
|
462 int rval;
|
|
463
|
|
464 inbuf = from;
|
|
465 inbytesleft = flen;
|
|
466 outbuf = to->text + to->len;
|
|
467 outbytesleft = to->asize - to->len;
|
|
468
|
|
469 for (;;)
|
|
470 {
|
|
471 do
|
|
472 rval = one_conversion (cd, &inbuf, &inbytesleft,
|
|
473 &outbuf, &outbytesleft);
|
|
474 while (inbytesleft && !rval);
|
|
475
|
|
476 if (__builtin_expect (inbytesleft == 0, 1))
|
|
477 {
|
|
478 to->len = to->asize - outbytesleft;
|
|
479 return true;
|
|
480 }
|
|
481 if (rval != E2BIG)
|
|
482 {
|
|
483 errno = rval;
|
|
484 return false;
|
|
485 }
|
|
486
|
|
487 outbytesleft += OUTBUF_BLOCK_SIZE;
|
|
488 to->asize += OUTBUF_BLOCK_SIZE;
|
|
489 to->text = XRESIZEVEC (uchar, to->text, to->asize);
|
|
490 outbuf = to->text + to->asize - outbytesleft;
|
|
491 }
|
|
492 }
|
|
493
|
|
494
|
|
495 /* These functions convert entire strings between character sets.
|
|
496 They all have the signature
|
|
497
|
|
498 bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
|
|
499
|
|
500 The input string FROM is converted as specified by the function
|
|
501 name plus the iconv descriptor CD (which may be fake), and the
|
|
502 result appended to TO. On any error, false is returned, otherwise true. */
|
|
503
|
|
504 /* These four use the custom conversion code above. */
|
|
505 static bool
|
|
506 convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
|
|
507 struct _cpp_strbuf *to)
|
|
508 {
|
|
509 return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
|
|
510 }
|
|
511
|
|
512 static bool
|
|
513 convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
|
|
514 struct _cpp_strbuf *to)
|
|
515 {
|
|
516 return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
|
|
517 }
|
|
518
|
|
519 static bool
|
|
520 convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
|
|
521 struct _cpp_strbuf *to)
|
|
522 {
|
|
523 return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
|
|
524 }
|
|
525
|
|
526 static bool
|
|
527 convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
|
|
528 struct _cpp_strbuf *to)
|
|
529 {
|
|
530 return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
|
|
531 }
|
|
532
|
|
533 /* Identity conversion, used when we have no alternative. */
|
|
534 static bool
|
|
535 convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
|
|
536 const uchar *from, size_t flen, struct _cpp_strbuf *to)
|
|
537 {
|
|
538 if (to->len + flen > to->asize)
|
|
539 {
|
|
540 to->asize = to->len + flen;
|
|
541 to->text = XRESIZEVEC (uchar, to->text, to->asize);
|
|
542 }
|
|
543 memcpy (to->text + to->len, from, flen);
|
|
544 to->len += flen;
|
|
545 return true;
|
|
546 }
|
|
547
|
|
/* Conversion through the system iconv primitive.  It has its own loop
   rather than using conversion_loop, because iconv's interface is a
   little different.  */
#if HAVE_ICONV

/* Enlarge TO by one block and recompute OUTBUF from the new TO->text.
   Relies on the locals TO, OUTBUF and OUTBYTESLEFT of
   convert_using_iconv.  */
#define CONVERT_ICONV_GROW_BUFFER \
do { \
  outbytesleft += OUTBUF_BLOCK_SIZE; \
  to->asize += OUTBUF_BLOCK_SIZE; \
  to->text = XRESIZEVEC (uchar, to->text, to->asize); \
  outbuf = (char *)to->text + to->asize - outbytesleft; \
} while (0)

/* Convert the FLEN bytes at FROM using descriptor CD and append the
   result to TO.  Returns false if CD cannot be reset, or if iconv
   fails for any reason other than lack of output space.  */
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
                     struct _cpp_strbuf *to)
{
  ICONV_CONST char *inbuf = (ICONV_CONST char *)from;
  size_t inbytesleft = flen;
  char *outbuf;
  size_t outbytesleft;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  outbuf = (char *)to->text + to->len;
  outbytesleft = to->asize - to->len;

  /* Feed the input to iconv until none is left, growing the output
     buffer each time it fills.  Progress is judged by INBYTESLEFT and
     errno, not by iconv's return value.  */
  for (;;)
    {
      iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
      if (__builtin_expect (inbytesleft == 0, 1))
        break;
      if (errno != E2BIG)
        return false;

      CONVERT_ICONV_GROW_BUFFER;
    }

  /* Close out any shift states, returning to the initial state.  If
     that overflows the buffer, grow it once and retry.  */
  if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
    {
      if (errno != E2BIG)
        return false;

      CONVERT_ICONV_GROW_BUFFER;
      if (iconv (cd, 0, 0, &outbuf, &outbytesleft) == (size_t)-1)
        return false;
    }

  to->len = to->asize - outbytesleft;
  return true;
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
|
|
605
|
|
606 /* Arrange for the above custom conversion logic to be used automatically
|
|
607 when conversion between a suitable pair of character sets is requested. */
|
|
608
|
|
/* Invoke the conversion function stored in CONVERTER, passing it the
   converter's own descriptor followed by FROM, FLEN and TO.  CONVERTER
   is evaluated twice, so it must not have side effects.  Every
   argument is parenthesized so that expressions such as `*p' expand
   correctly.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
|
|
611
|
|
612 struct conversion
|
|
613 {
|
|
614 const char *pair;
|
|
615 convert_f func;
|
|
616 iconv_t fake_cd;
|
|
617 };
|
|
618 static const struct conversion conversion_tab[] = {
|
|
619 { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
|
|
620 { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
|
|
621 { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
|
|
622 { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
|
|
623 { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
|
|
624 { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
|
|
625 { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
|
|
626 { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
|
|
627 };
|
|
628
|
|
629 /* Subroutine of cpp_init_iconv: initialize and return a
|
|
630 cset_converter structure for conversion from FROM to TO. If
|
|
631 iconv_open() fails, issue an error and return an identity
|
|
632 converter. Silently return an identity converter if FROM and TO
|
|
633 are identical. */
|
|
634 static struct cset_converter
|
|
635 init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
|
|
636 {
|
|
637 struct cset_converter ret;
|
|
638 char *pair;
|
|
639 size_t i;
|
|
640
|
|
641 if (!strcasecmp (to, from))
|
|
642 {
|
|
643 ret.func = convert_no_conversion;
|
|
644 ret.cd = (iconv_t) -1;
|
|
645 ret.width = -1;
|
|
646 return ret;
|
|
647 }
|
|
648
|
|
649 pair = (char *) alloca(strlen(to) + strlen(from) + 2);
|
|
650
|
|
651 strcpy(pair, from);
|
|
652 strcat(pair, "/");
|
|
653 strcat(pair, to);
|
|
654 for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
|
|
655 if (!strcasecmp (pair, conversion_tab[i].pair))
|
|
656 {
|
|
657 ret.func = conversion_tab[i].func;
|
|
658 ret.cd = conversion_tab[i].fake_cd;
|
|
659 ret.width = -1;
|
|
660 return ret;
|
|
661 }
|
|
662
|
|
663 /* No custom converter - try iconv. */
|
|
664 if (HAVE_ICONV)
|
|
665 {
|
|
666 ret.func = convert_using_iconv;
|
|
667 ret.cd = iconv_open (to, from);
|
|
668 ret.width = -1;
|
|
669
|
|
670 if (ret.cd == (iconv_t) -1)
|
|
671 {
|
|
672 if (errno == EINVAL)
|
|
673 cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
|
|
674 "conversion from %s to %s not supported by iconv",
|
|
675 from, to);
|
|
676 else
|
|
677 cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
|
|
678
|
|
679 ret.func = convert_no_conversion;
|
|
680 }
|
|
681 }
|
|
682 else
|
|
683 {
|
|
684 cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
|
|
685 "no iconv implementation, cannot convert from %s to %s",
|
|
686 from, to);
|
|
687 ret.func = convert_no_conversion;
|
|
688 ret.cd = (iconv_t) -1;
|
|
689 ret.width = -1;
|
|
690 }
|
|
691 return ret;
|
|
692 }
|
|
693
|
|
694 /* If charset conversion is requested, initialize iconv(3) descriptors
|
|
695 for conversion from the source character set to the execution
|
|
696 character sets. If iconv is not present in the C library, and
|
|
697 conversion is requested, issue an error. */
|
|
698
|
|
699 void
|
|
700 cpp_init_iconv (cpp_reader *pfile)
|
|
701 {
|
|
702 const char *ncset = CPP_OPTION (pfile, narrow_charset);
|
|
703 const char *wcset = CPP_OPTION (pfile, wide_charset);
|
|
704 const char *default_wcset;
|
|
705
|
|
706 bool be = CPP_OPTION (pfile, bytes_big_endian);
|
|
707
|
|
708 if (CPP_OPTION (pfile, wchar_precision) >= 32)
|
|
709 default_wcset = be ? "UTF-32BE" : "UTF-32LE";
|
|
710 else if (CPP_OPTION (pfile, wchar_precision) >= 16)
|
|
711 default_wcset = be ? "UTF-16BE" : "UTF-16LE";
|
|
712 else
|
|
713 /* This effectively means that wide strings are not supported,
|
|
714 so don't do any conversion at all. */
|
|
715 default_wcset = SOURCE_CHARSET;
|
|
716
|
|
717 if (!ncset)
|
|
718 ncset = SOURCE_CHARSET;
|
|
719 if (!wcset)
|
|
720 wcset = default_wcset;
|
|
721
|
|
722 pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
|
|
723 pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
|
|
724 pfile->char16_cset_desc = init_iconv_desc (pfile,
|
|
725 be ? "UTF-16BE" : "UTF-16LE",
|
|
726 SOURCE_CHARSET);
|
|
727 pfile->char16_cset_desc.width = 16;
|
|
728 pfile->char32_cset_desc = init_iconv_desc (pfile,
|
|
729 be ? "UTF-32BE" : "UTF-32LE",
|
|
730 SOURCE_CHARSET);
|
|
731 pfile->char32_cset_desc.width = 32;
|
|
732 pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
|
|
733 pfile->wide_cset_desc.width = CPP_OPTION (pfile, wchar_precision);
|
|
734 }
|
|
735
|
|
736 /* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
|
|
737 void
|
|
738 _cpp_destroy_iconv (cpp_reader *pfile)
|
|
739 {
|
|
740 if (HAVE_ICONV)
|
|
741 {
|
|
742 if (pfile->narrow_cset_desc.func == convert_using_iconv)
|
|
743 iconv_close (pfile->narrow_cset_desc.cd);
|
|
744 if (pfile->wide_cset_desc.func == convert_using_iconv)
|
|
745 iconv_close (pfile->wide_cset_desc.cd);
|
|
746 }
|
|
747 }
|
|
748
|
|
749 /* Utility routine for use by a full compiler. C is a character taken
|
|
750 from the *basic* source character set, encoded in the host's
|
|
751 execution encoding. Convert it to (the target's) execution
|
|
752 encoding, and return that value.
|
|
753
|
|
754 Issues an internal error if C's representation in the narrow
|
|
755 execution character set fails to be a single-byte value (C99
|
|
756 5.2.1p3: "The representation of each member of the source and
|
|
757 execution character sets shall fit in a byte.") May also issue an
|
|
758 internal error if C fails to be a member of the basic source
|
|
759 character set (testing this exactly is too hard, especially when
|
|
760 the host character set is EBCDIC). */
|
|
761 cppchar_t
|
|
762 cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
|
|
763 {
|
|
764 uchar sbuf[1];
|
|
765 struct _cpp_strbuf tbuf;
|
|
766
|
|
767 /* This test is merely an approximation, but it suffices to catch
|
|
768 the most important thing, which is that we don't get handed a
|
|
769 character outside the unibyte range of the host character set. */
|
|
770 if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
|
|
771 {
|
|
772 cpp_error (pfile, CPP_DL_ICE,
|
|
773 "character 0x%lx is not in the basic source character set\n",
|
|
774 (unsigned long)c);
|
|
775 return 0;
|
|
776 }
|
|
777
|
|
778 /* Being a character in the unibyte range of the host character set,
|
|
779 we can safely splat it into a one-byte buffer and trust that that
|
|
780 is a well-formed string. */
|
|
781 sbuf[0] = c;
|
|
782
|
|
783 /* This should never need to reallocate, but just in case... */
|
|
784 tbuf.asize = 1;
|
|
785 tbuf.text = XNEWVEC (uchar, tbuf.asize);
|
|
786 tbuf.len = 0;
|
|
787
|
|
788 if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
|
|
789 {
|
|
790 cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
|
|
791 return 0;
|
|
792 }
|
|
793 if (tbuf.len != 1)
|
|
794 {
|
|
795 cpp_error (pfile, CPP_DL_ICE,
|
|
796 "character 0x%lx is not unibyte in execution character set",
|
|
797 (unsigned long)c);
|
|
798 return 0;
|
|
799 }
|
|
800 c = tbuf.text[0];
|
|
801 free(tbuf.text);
|
|
802 return c;
|
|
803 }
|
|
804
|
|
805
|
|
806
|
|
807 /* Utility routine that computes a mask of the form 0000...111... with
|
|
808 WIDTH 1-bits. */
|
|
809 static inline size_t
|
|
810 width_to_mask (size_t width)
|
|
811 {
|
|
812 width = MIN (width, BITS_PER_CPPCHAR_T);
|
|
813 if (width >= CHAR_BIT * sizeof (size_t))
|
|
814 return ~(size_t) 0;
|
|
815 else
|
|
816 return ((size_t) 1 << width) - 1;
|
|
817 }
|
|
818
|
|
/* Flag bits recorded for each range in the table of Unicode character
   information that follows.  */
enum {
  C99 = 1 << 0,  /* Valid in a C99 identifier?  */
  DIG = 1 << 1,  /* Valid in a C99 identifier, but not as the first
                    character?  */
  CXX = 1 << 2,  /* Valid in a C++ identifier?  */
  CID = 1 << 3,  /* NFC representation is not valid in an identifier?  */
  NFC = 1 << 4,  /* Might be valid NFC form?  */
  NKC = 1 << 5,  /* Might be valid NFKC form?  */
  CTX = 1 << 6   /* Certain preceding characters might make it not
                    valid NFC/NFKC form?  */
};
|
|
836
|
|
837 static const struct {
|
|
838 /* Bitmap of flags above. */
|
|
839 unsigned char flags;
|
|
840 /* Combining class of the character. */
|
|
841 unsigned char combine;
|
|
842 /* Last character in the range described by this entry. */
|
|
843 unsigned short end;
|
|
844 } ucnranges[] = {
|
|
845 #include "ucnid.h"
|
|
846 };
|
|
847
|
|
848 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
|
|
849 the start of an identifier, and 0 if C is not valid in an
|
|
850 identifier. We assume C has already gone through the checks of
|
|
851 _cpp_valid_ucn. Also update NST for C if returning nonzero. The
|
|
852 algorithm is a simple binary search on the table defined in
|
|
853 ucnid.h. */
|
|
854
|
|
855 static int
|
|
856 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
|
|
857 struct normalize_state *nst)
|
|
858 {
|
|
859 int mn, mx, md;
|
|
860
|
|
861 if (c > 0xFFFF)
|
|
862 return 0;
|
|
863
|
|
864 mn = 0;
|
|
865 mx = ARRAY_SIZE (ucnranges) - 1;
|
|
866 while (mx != mn)
|
|
867 {
|
|
868 md = (mn + mx) / 2;
|
|
869 if (c <= ucnranges[md].end)
|
|
870 mx = md;
|
|
871 else
|
|
872 mn = md + 1;
|
|
873 }
|
|
874
|
|
875 /* When -pedantic, we require the character to have been listed by
|
|
876 the standard for the current language. Otherwise, we accept the
|
|
877 union of the acceptable sets for C++98 and C99. */
|
|
878 if (! (ucnranges[mn].flags & (C99 | CXX)))
|
|
879 return 0;
|
|
880
|
|
881 if (CPP_PEDANTIC (pfile)
|
|
882 && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
|
|
883 || (CPP_OPTION (pfile, cplusplus)
|
|
884 && !(ucnranges[mn].flags & CXX))))
|
|
885 return 0;
|
|
886
|
|
887 /* Update NST. */
|
|
888 if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
|
|
889 nst->level = normalized_none;
|
|
890 else if (ucnranges[mn].flags & CTX)
|
|
891 {
|
|
892 bool safe;
|
|
893 cppchar_t p = nst->previous;
|
|
894
|
|
895 /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam. */
|
|
896 if (c == 0x09BE)
|
|
897 safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */
|
|
898 else if (c == 0x0B3E)
|
|
899 safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */
|
|
900 else if (c == 0x0BBE)
|
|
901 safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */
|
|
902 else if (c == 0x0CC2)
|
|
903 safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */
|
|
904 else if (c == 0x0D3E)
|
|
905 safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */
|
|
906 /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
|
|
907 and are combined algorithmically from a sequence of the form
|
|
908 1100-1112 1161-1175 11A8-11C2
|
|
909 (if the third is not present, it is treated as 11A7, which is not
|
|
910 really a valid character).
|
|
911 Unfortunately, C99 allows (only) the NFC form, but C++ allows
|
|
912 only the combining characters. */
|
|
913 else if (c >= 0x1161 && c <= 0x1175)
|
|
914 safe = p < 0x1100 || p > 0x1112;
|
|
915 else if (c >= 0x11A8 && c <= 0x11C2)
|
|
916 safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
|
|
917 else
|
|
918 {
|
|
919 /* Uh-oh, someone updated ucnid.h without updating this code. */
|
|
920 cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
|
|
921 safe = true;
|
|
922 }
|
|
923 if (!safe && c < 0x1161)
|
|
924 nst->level = normalized_none;
|
|
925 else if (!safe)
|
|
926 nst->level = MAX (nst->level, normalized_identifier_C);
|
|
927 }
|
|
928 else if (ucnranges[mn].flags & NKC)
|
|
929 ;
|
|
930 else if (ucnranges[mn].flags & NFC)
|
|
931 nst->level = MAX (nst->level, normalized_C);
|
|
932 else if (ucnranges[mn].flags & CID)
|
|
933 nst->level = MAX (nst->level, normalized_identifier_C);
|
|
934 else
|
|
935 nst->level = normalized_none;
|
|
936 nst->previous = c;
|
|
937 nst->prev_class = ucnranges[mn].combine;
|
|
938
|
|
939 /* In C99, UCN digits may not begin identifiers. */
|
|
940 if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
|
|
941 return 2;
|
|
942
|
|
943 return 1;
|
|
944 }
|
|
945
|
|
946 /* [lex.charset]: The character designated by the universal character
|
|
947 name \UNNNNNNNN is that character whose character short name in
|
|
948 ISO/IEC 10646 is NNNNNNNN; the character designated by the
|
|
949 universal character name \uNNNN is that character whose character
|
|
950 short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
|
|
951 for a universal character name is less than 0x20 or in the range
|
|
952 0x7F-0x9F (inclusive), or if the universal character name
|
|
953 designates a character in the basic source character set, then the
|
|
954 program is ill-formed.
|
|
955
|
|
956 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
|
|
957 buffer end is delimited by a non-hex digit. Returns zero if the
|
|
958 UCN has not been consumed.
|
|
959
|
|
960 Otherwise the nonzero value of the UCN, whether valid or invalid,
|
|
961 is returned. Diagnostics are emitted for invalid values. PSTR
|
|
962 is updated to point one beyond the UCN, or to the syntactically
|
|
963 invalid character.
|
|
964
|
|
965 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
|
|
966 an identifier, or 2 otherwise. */
|
|
967
|
|
968 cppchar_t
|
|
969 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
|
|
970 const uchar *limit, int identifier_pos,
|
|
971 struct normalize_state *nst)
|
|
972 {
|
|
973 cppchar_t result, c;
|
|
974 unsigned int length;
|
|
975 const uchar *str = *pstr;
|
|
976 const uchar *base = str - 2;
|
|
977
|
|
978 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
|
|
979 cpp_error (pfile, CPP_DL_WARNING,
|
|
980 "universal character names are only valid in C++ and C99");
|
|
981 else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
|
|
982 cpp_error (pfile, CPP_DL_WARNING,
|
|
983 "the meaning of '\\%c' is different in traditional C",
|
|
984 (int) str[-1]);
|
|
985
|
|
986 if (str[-1] == 'u')
|
|
987 length = 4;
|
|
988 else if (str[-1] == 'U')
|
|
989 length = 8;
|
|
990 else
|
|
991 {
|
|
992 cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
|
|
993 length = 4;
|
|
994 }
|
|
995
|
|
996 result = 0;
|
|
997 do
|
|
998 {
|
|
999 c = *str;
|
|
1000 if (!ISXDIGIT (c))
|
|
1001 break;
|
|
1002 str++;
|
|
1003 result = (result << 4) + hex_value (c);
|
|
1004 }
|
|
1005 while (--length && str < limit);
|
|
1006
|
|
1007 /* Partial UCNs are not valid in strings, but decompose into
|
|
1008 multiple tokens in identifiers, so we can't give a helpful
|
|
1009 error message in that case. */
|
|
1010 if (length && identifier_pos)
|
|
1011 return 0;
|
|
1012
|
|
1013 *pstr = str;
|
|
1014 if (length)
|
|
1015 {
|
|
1016 cpp_error (pfile, CPP_DL_ERROR,
|
|
1017 "incomplete universal character name %.*s",
|
|
1018 (int) (str - base), base);
|
|
1019 result = 1;
|
|
1020 }
|
|
1021 /* The standard permits $, @ and ` to be specified as UCNs. We use
|
|
1022 hex escapes so that this also works with EBCDIC hosts. */
|
|
1023 else if ((result < 0xa0
|
|
1024 && (result != 0x24 && result != 0x40 && result != 0x60))
|
|
1025 || (result & 0x80000000)
|
|
1026 || (result >= 0xD800 && result <= 0xDFFF))
|
|
1027 {
|
|
1028 cpp_error (pfile, CPP_DL_ERROR,
|
|
1029 "%.*s is not a valid universal character",
|
|
1030 (int) (str - base), base);
|
|
1031 result = 1;
|
|
1032 }
|
|
1033 else if (identifier_pos && result == 0x24
|
|
1034 && CPP_OPTION (pfile, dollars_in_ident))
|
|
1035 {
|
|
1036 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
|
|
1037 {
|
|
1038 CPP_OPTION (pfile, warn_dollars) = 0;
|
|
1039 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
|
|
1040 }
|
|
1041 NORMALIZE_STATE_UPDATE_IDNUM (nst);
|
|
1042 }
|
|
1043 else if (identifier_pos)
|
|
1044 {
|
|
1045 int validity = ucn_valid_in_identifier (pfile, result, nst);
|
|
1046
|
|
1047 if (validity == 0)
|
|
1048 cpp_error (pfile, CPP_DL_ERROR,
|
|
1049 "universal character %.*s is not valid in an identifier",
|
|
1050 (int) (str - base), base);
|
|
1051 else if (validity == 2 && identifier_pos == 1)
|
|
1052 cpp_error (pfile, CPP_DL_ERROR,
|
|
1053 "universal character %.*s is not valid at the start of an identifier",
|
|
1054 (int) (str - base), base);
|
|
1055 }
|
|
1056
|
|
1057 if (result == 0)
|
|
1058 result = 1;
|
|
1059
|
|
1060 return result;
|
|
1061 }
|
|
1062
|
|
1063 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
|
|
1064 it to the execution character set and write the result into TBUF.
|
|
1065 An advanced pointer is returned. Issues all relevant diagnostics. */
|
|
1066 static const uchar *
|
|
1067 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
|
|
1068 struct _cpp_strbuf *tbuf, struct cset_converter cvt)
|
|
1069 {
|
|
1070 cppchar_t ucn;
|
|
1071 uchar buf[6];
|
|
1072 uchar *bufp = buf;
|
|
1073 size_t bytesleft = 6;
|
|
1074 int rval;
|
|
1075 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
|
|
1076
|
|
1077 from++; /* Skip u/U. */
|
|
1078 ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
|
|
1079
|
|
1080 rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
|
|
1081 if (rval)
|
|
1082 {
|
|
1083 errno = rval;
|
|
1084 cpp_errno (pfile, CPP_DL_ERROR,
|
|
1085 "converting UCN to source character set");
|
|
1086 }
|
|
1087 else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
|
|
1088 cpp_errno (pfile, CPP_DL_ERROR,
|
|
1089 "converting UCN to execution character set");
|
|
1090
|
|
1091 return from;
|
|
1092 }
|
|
1093
|
|
1094 /* Subroutine of convert_hex and convert_oct. N is the representation
|
|
1095 in the execution character set of a numeric escape; write it into the
|
|
1096 string buffer TBUF and update the end-of-string pointer therein. WIDE
|
|
1097 is true if it's a wide string that's being assembled in TBUF. This
|
|
1098 function issues no diagnostics and never fails. */
|
|
1099 static void
|
|
1100 emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
|
|
1101 struct _cpp_strbuf *tbuf, struct cset_converter cvt)
|
|
1102 {
|
|
1103 size_t width = cvt.width;
|
|
1104
|
|
1105 if (width != CPP_OPTION (pfile, char_precision))
|
|
1106 {
|
|
1107 /* We have to render this into the target byte order, which may not
|
|
1108 be our byte order. */
|
|
1109 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
|
|
1110 size_t cwidth = CPP_OPTION (pfile, char_precision);
|
|
1111 size_t cmask = width_to_mask (cwidth);
|
|
1112 size_t nbwc = width / cwidth;
|
|
1113 size_t i;
|
|
1114 size_t off = tbuf->len;
|
|
1115 cppchar_t c;
|
|
1116
|
|
1117 if (tbuf->len + nbwc > tbuf->asize)
|
|
1118 {
|
|
1119 tbuf->asize += OUTBUF_BLOCK_SIZE;
|
|
1120 tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
|
|
1121 }
|
|
1122
|
|
1123 for (i = 0; i < nbwc; i++)
|
|
1124 {
|
|
1125 c = n & cmask;
|
|
1126 n >>= cwidth;
|
|
1127 tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
|
|
1128 }
|
|
1129 tbuf->len += nbwc;
|
|
1130 }
|
|
1131 else
|
|
1132 {
|
|
1133 /* Note: this code does not handle the case where the target
|
|
1134 and host have a different number of bits in a byte. */
|
|
1135 if (tbuf->len + 1 > tbuf->asize)
|
|
1136 {
|
|
1137 tbuf->asize += OUTBUF_BLOCK_SIZE;
|
|
1138 tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
|
|
1139 }
|
|
1140 tbuf->text[tbuf->len++] = n;
|
|
1141 }
|
|
1142 }
|
|
1143
|
|
1144 /* Convert a hexadecimal escape, pointed to by FROM, to the execution
|
|
1145 character set and write it into the string buffer TBUF. Returns an
|
|
1146 advanced pointer, and issues diagnostics as necessary.
|
|
1147 No character set translation occurs; this routine always produces the
|
|
1148 execution-set character with numeric value equal to the given hex
|
|
1149 number. You can, e.g. generate surrogate pairs this way. */
|
|
1150 static const uchar *
|
|
1151 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
|
|
1152 struct _cpp_strbuf *tbuf, struct cset_converter cvt)
|
|
1153 {
|
|
1154 cppchar_t c, n = 0, overflow = 0;
|
|
1155 int digits_found = 0;
|
|
1156 size_t width = cvt.width;
|
|
1157 size_t mask = width_to_mask (width);
|
|
1158
|
|
1159 if (CPP_WTRADITIONAL (pfile))
|
|
1160 cpp_error (pfile, CPP_DL_WARNING,
|
|
1161 "the meaning of '\\x' is different in traditional C");
|
|
1162
|
|
1163 from++; /* Skip 'x'. */
|
|
1164 while (from < limit)
|
|
1165 {
|
|
1166 c = *from;
|
|
1167 if (! hex_p (c))
|
|
1168 break;
|
|
1169 from++;
|
|
1170 overflow |= n ^ (n << 4 >> 4);
|
|
1171 n = (n << 4) + hex_value (c);
|
|
1172 digits_found = 1;
|
|
1173 }
|
|
1174
|
|
1175 if (!digits_found)
|
|
1176 {
|
|
1177 cpp_error (pfile, CPP_DL_ERROR,
|
|
1178 "\\x used with no following hex digits");
|
|
1179 return from;
|
|
1180 }
|
|
1181
|
|
1182 if (overflow | (n != (n & mask)))
|
|
1183 {
|
|
1184 cpp_error (pfile, CPP_DL_PEDWARN,
|
|
1185 "hex escape sequence out of range");
|
|
1186 n &= mask;
|
|
1187 }
|
|
1188
|
|
1189 emit_numeric_escape (pfile, n, tbuf, cvt);
|
|
1190
|
|
1191 return from;
|
|
1192 }
|
|
1193
|
|
1194 /* Convert an octal escape, pointed to by FROM, to the execution
|
|
1195 character set and write it into the string buffer TBUF. Returns an
|
|
1196 advanced pointer, and issues diagnostics as necessary.
|
|
1197 No character set translation occurs; this routine always produces the
|
|
1198 execution-set character with numeric value equal to the given octal
|
|
1199 number. */
|
|
1200 static const uchar *
|
|
1201 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
|
|
1202 struct _cpp_strbuf *tbuf, struct cset_converter cvt)
|
|
1203 {
|
|
1204 size_t count = 0;
|
|
1205 cppchar_t c, n = 0;
|
|
1206 size_t width = cvt.width;
|
|
1207 size_t mask = width_to_mask (width);
|
|
1208 bool overflow = false;
|
|
1209
|
|
1210 while (from < limit && count++ < 3)
|
|
1211 {
|
|
1212 c = *from;
|
|
1213 if (c < '0' || c > '7')
|
|
1214 break;
|
|
1215 from++;
|
|
1216 overflow |= n ^ (n << 3 >> 3);
|
|
1217 n = (n << 3) + c - '0';
|
|
1218 }
|
|
1219
|
|
1220 if (n != (n & mask))
|
|
1221 {
|
|
1222 cpp_error (pfile, CPP_DL_PEDWARN,
|
|
1223 "octal escape sequence out of range");
|
|
1224 n &= mask;
|
|
1225 }
|
|
1226
|
|
1227 emit_numeric_escape (pfile, n, tbuf, cvt);
|
|
1228
|
|
1229 return from;
|
|
1230 }
|
|
1231
|
|
1232 /* Convert an escape sequence (pointed to by FROM) to its value on
|
|
1233 the target, and to the execution character set. Do not scan past
|
|
1234 LIMIT. Write the converted value into TBUF. Returns an advanced
|
|
1235 pointer. Handles all relevant diagnostics. */
|
|
1236 static const uchar *
|
|
1237 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
|
|
1238 struct _cpp_strbuf *tbuf, struct cset_converter cvt)
|
|
1239 {
|
|
1240 /* Values of \a \b \e \f \n \r \t \v respectively. */
|
|
1241 #if HOST_CHARSET == HOST_CHARSET_ASCII
|
|
1242 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
|
|
1243 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
|
|
1244 static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
|
|
1245 #else
|
|
1246 #error "unknown host character set"
|
|
1247 #endif
|
|
1248
|
|
1249 uchar c;
|
|
1250
|
|
1251 c = *from;
|
|
1252 switch (c)
|
|
1253 {
|
|
1254 /* UCNs, hex escapes, and octal escapes are processed separately. */
|
|
1255 case 'u': case 'U':
|
|
1256 return convert_ucn (pfile, from, limit, tbuf, cvt);
|
|
1257
|
|
1258 case 'x':
|
|
1259 return convert_hex (pfile, from, limit, tbuf, cvt);
|
|
1260 break;
|
|
1261
|
|
1262 case '0': case '1': case '2': case '3':
|
|
1263 case '4': case '5': case '6': case '7':
|
|
1264 return convert_oct (pfile, from, limit, tbuf, cvt);
|
|
1265
|
|
1266 /* Various letter escapes. Get the appropriate host-charset
|
|
1267 value into C. */
|
|
1268 case '\\': case '\'': case '"': case '?': break;
|
|
1269
|
|
1270 case '(': case '{': case '[': case '%':
|
|
1271 /* '\(', etc, can be used at the beginning of a line in a long
|
|
1272 string split onto multiple lines with \-newline, to prevent
|
|
1273 Emacs or other text editors from getting confused. '\%' can
|
|
1274 be used to prevent SCCS from mangling printf format strings. */
|
|
1275 if (CPP_PEDANTIC (pfile))
|
|
1276 goto unknown;
|
|
1277 break;
|
|
1278
|
|
1279 case 'b': c = charconsts[1]; break;
|
|
1280 case 'f': c = charconsts[3]; break;
|
|
1281 case 'n': c = charconsts[4]; break;
|
|
1282 case 'r': c = charconsts[5]; break;
|
|
1283 case 't': c = charconsts[6]; break;
|
|
1284 case 'v': c = charconsts[7]; break;
|
|
1285
|
|
1286 case 'a':
|
|
1287 if (CPP_WTRADITIONAL (pfile))
|
|
1288 cpp_error (pfile, CPP_DL_WARNING,
|
|
1289 "the meaning of '\\a' is different in traditional C");
|
|
1290 c = charconsts[0];
|
|
1291 break;
|
|
1292
|
|
1293 case 'e': case 'E':
|
|
1294 if (CPP_PEDANTIC (pfile))
|
|
1295 cpp_error (pfile, CPP_DL_PEDWARN,
|
|
1296 "non-ISO-standard escape sequence, '\\%c'", (int) c);
|
|
1297 c = charconsts[2];
|
|
1298 break;
|
|
1299
|
|
1300 default:
|
|
1301 unknown:
|
|
1302 if (ISGRAPH (c))
|
|
1303 cpp_error (pfile, CPP_DL_PEDWARN,
|
|
1304 "unknown escape sequence '\\%c'", (int) c);
|
|
1305 else
|
|
1306 {
|
|
1307 /* diagnostic.c does not support "%03o". When it does, this
|
|
1308 code can use %03o directly in the diagnostic again. */
|
|
1309 char buf[32];
|
|
1310 sprintf(buf, "%03o", (int) c);
|
|
1311 cpp_error (pfile, CPP_DL_PEDWARN,
|
|
1312 "unknown escape sequence: '\\%s'", buf);
|
|
1313 }
|
|
1314 }
|
|
1315
|
|
1316 /* Now convert what we have to the execution character set. */
|
|
1317 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
|
|
1318 cpp_errno (pfile, CPP_DL_ERROR,
|
|
1319 "converting escape sequence to execution character set");
|
|
1320
|
|
1321 return from + 1;
|
|
1322 }
|
|
1323
|
|
1324 /* TYPE is a token type. The return value is the conversion needed to
|
|
1325 convert from source to execution character set for the given type. */
|
|
1326 static struct cset_converter
|
|
1327 converter_for_type (cpp_reader *pfile, enum cpp_ttype type)
|
|
1328 {
|
|
1329 switch (type)
|
|
1330 {
|
|
1331 default:
|
|
1332 return pfile->narrow_cset_desc;
|
|
1333 case CPP_CHAR16:
|
|
1334 case CPP_STRING16:
|
|
1335 return pfile->char16_cset_desc;
|
|
1336 case CPP_CHAR32:
|
|
1337 case CPP_STRING32:
|
|
1338 return pfile->char32_cset_desc;
|
|
1339 case CPP_WCHAR:
|
|
1340 case CPP_WSTRING:
|
|
1341 return pfile->wide_cset_desc;
|
|
1342 }
|
|
1343 }
|
|
1344
|
|
1345 /* FROM is an array of cpp_string structures of length COUNT. These
|
|
1346 are to be converted from the source to the execution character set,
|
|
1347 escape sequences translated, and finally all are to be
|
|
1348 concatenated. WIDE indicates whether or not to produce a wide
|
|
1349 string. The result is written into TO. Returns true for success,
|
|
1350 false for failure. */
|
|
1351 bool
|
|
1352 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
|
|
1353 cpp_string *to, enum cpp_ttype type)
|
|
1354 {
|
|
1355 struct _cpp_strbuf tbuf;
|
|
1356 const uchar *p, *base, *limit;
|
|
1357 size_t i;
|
|
1358 struct cset_converter cvt = converter_for_type (pfile, type);
|
|
1359
|
|
1360 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
|
|
1361 tbuf.text = XNEWVEC (uchar, tbuf.asize);
|
|
1362 tbuf.len = 0;
|
|
1363
|
|
1364 for (i = 0; i < count; i++)
|
|
1365 {
|
|
1366 p = from[i].text;
|
|
1367 if (*p == 'L' || *p == 'u' || *p == 'U') p++;
|
|
1368 p++; /* Skip leading quote. */
|
|
1369 limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
|
|
1370
|
|
1371 for (;;)
|
|
1372 {
|
|
1373 base = p;
|
|
1374 while (p < limit && *p != '\\')
|
|
1375 p++;
|
|
1376 if (p > base)
|
|
1377 {
|
|
1378 /* We have a run of normal characters; these can be fed
|
|
1379 directly to convert_cset. */
|
|
1380 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
|
|
1381 goto fail;
|
|
1382 }
|
|
1383 if (p == limit)
|
|
1384 break;
|
|
1385
|
|
1386 p = convert_escape (pfile, p + 1, limit, &tbuf, cvt);
|
|
1387 }
|
|
1388 }
|
|
1389 /* NUL-terminate the 'to' buffer and translate it to a cpp_string
|
|
1390 structure. */
|
|
1391 emit_numeric_escape (pfile, 0, &tbuf, cvt);
|
|
1392 tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
|
|
1393 to->text = tbuf.text;
|
|
1394 to->len = tbuf.len;
|
|
1395 return true;
|
|
1396
|
|
1397 fail:
|
|
1398 cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
|
|
1399 free (tbuf.text);
|
|
1400 return false;
|
|
1401 }
|
|
1402
|
|
1403 /* Subroutine of do_line and do_linemarker. Convert escape sequences
|
|
1404 in a string, but do not perform character set conversion. */
|
|
1405 bool
|
|
1406 cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
|
|
1407 size_t count, cpp_string *to,
|
|
1408 enum cpp_ttype type ATTRIBUTE_UNUSED)
|
|
1409 {
|
|
1410 struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
|
|
1411 bool retval;
|
|
1412
|
|
1413 pfile->narrow_cset_desc.func = convert_no_conversion;
|
|
1414 pfile->narrow_cset_desc.cd = (iconv_t) -1;
|
|
1415 pfile->narrow_cset_desc.width = CPP_OPTION (pfile, char_precision);
|
|
1416
|
|
1417 retval = cpp_interpret_string (pfile, from, count, to, CPP_STRING);
|
|
1418
|
|
1419 pfile->narrow_cset_desc = save_narrow_cset_desc;
|
|
1420 return retval;
|
|
1421 }
|
|
1422
|
|
1423
|
|
1424 /* Subroutine of cpp_interpret_charconst which performs the conversion
|
|
1425 to a number, for narrow strings. STR is the string structure returned
|
|
1426 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
|
|
1427 cpp_interpret_charconst. */
|
|
1428 static cppchar_t
|
|
1429 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
|
|
1430 unsigned int *pchars_seen, int *unsignedp)
|
|
1431 {
|
|
1432 size_t width = CPP_OPTION (pfile, char_precision);
|
|
1433 size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
|
|
1434 size_t mask = width_to_mask (width);
|
|
1435 size_t i;
|
|
1436 cppchar_t result, c;
|
|
1437 bool unsigned_p;
|
|
1438
|
|
1439 /* The value of a multi-character character constant, or a
|
|
1440 single-character character constant whose representation in the
|
|
1441 execution character set is more than one byte long, is
|
|
1442 implementation defined. This implementation defines it to be the
|
|
1443 number formed by interpreting the byte sequence in memory as a
|
|
1444 big-endian binary number. If overflow occurs, the high bytes are
|
|
1445 lost, and a warning is issued.
|
|
1446
|
|
1447 We don't want to process the NUL terminator handed back by
|
|
1448 cpp_interpret_string. */
|
|
1449 result = 0;
|
|
1450 for (i = 0; i < str.len - 1; i++)
|
|
1451 {
|
|
1452 c = str.text[i] & mask;
|
|
1453 if (width < BITS_PER_CPPCHAR_T)
|
|
1454 result = (result << width) | c;
|
|
1455 else
|
|
1456 result = c;
|
|
1457 }
|
|
1458
|
|
1459 if (i > max_chars)
|
|
1460 {
|
|
1461 i = max_chars;
|
|
1462 cpp_error (pfile, CPP_DL_WARNING,
|
|
1463 "character constant too long for its type");
|
|
1464 }
|
|
1465 else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
|
|
1466 cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
|
|
1467
|
|
1468 /* Multichar constants are of type int and therefore signed. */
|
|
1469 if (i > 1)
|
|
1470 unsigned_p = 0;
|
|
1471 else
|
|
1472 unsigned_p = CPP_OPTION (pfile, unsigned_char);
|
|
1473
|
|
1474 /* Truncate the constant to its natural width, and simultaneously
|
|
1475 sign- or zero-extend to the full width of cppchar_t.
|
|
1476 For single-character constants, the value is WIDTH bits wide.
|
|
1477 For multi-character constants, the value is INT_PRECISION bits wide. */
|
|
1478 if (i > 1)
|
|
1479 width = CPP_OPTION (pfile, int_precision);
|
|
1480 if (width < BITS_PER_CPPCHAR_T)
|
|
1481 {
|
|
1482 mask = ((cppchar_t) 1 << width) - 1;
|
|
1483 if (unsigned_p || !(result & (1 << (width - 1))))
|
|
1484 result &= mask;
|
|
1485 else
|
|
1486 result |= ~mask;
|
|
1487 }
|
|
1488 *pchars_seen = i;
|
|
1489 *unsignedp = unsigned_p;
|
|
1490 return result;
|
|
1491 }
|
|
1492
|
|
1493 /* Subroutine of cpp_interpret_charconst which performs the conversion
|
|
1494 to a number, for wide strings. STR is the string structure returned
|
|
1495 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
|
|
1496 cpp_interpret_charconst. TYPE is the token type. */
|
|
1497 static cppchar_t
|
|
1498 wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
|
|
1499 unsigned int *pchars_seen, int *unsignedp,
|
|
1500 enum cpp_ttype type)
|
|
1501 {
|
|
1502 bool bigend = CPP_OPTION (pfile, bytes_big_endian);
|
|
1503 size_t width = converter_for_type (pfile, type).width;
|
|
1504 size_t cwidth = CPP_OPTION (pfile, char_precision);
|
|
1505 size_t mask = width_to_mask (width);
|
|
1506 size_t cmask = width_to_mask (cwidth);
|
|
1507 size_t nbwc = width / cwidth;
|
|
1508 size_t off, i;
|
|
1509 cppchar_t result = 0, c;
|
|
1510
|
|
1511 /* This is finicky because the string is in the target's byte order,
|
|
1512 which may not be our byte order. Only the last character, ignoring
|
|
1513 the NUL terminator, is relevant. */
|
|
1514 off = str.len - (nbwc * 2);
|
|
1515 result = 0;
|
|
1516 for (i = 0; i < nbwc; i++)
|
|
1517 {
|
|
1518 c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
|
|
1519 result = (result << cwidth) | (c & cmask);
|
|
1520 }
|
|
1521
|
|
1522 /* Wide character constants have type wchar_t, and a single
|
|
1523 character exactly fills a wchar_t, so a multi-character wide
|
|
1524 character constant is guaranteed to overflow. */
|
|
1525 if (str.len > nbwc * 2)
|
|
1526 cpp_error (pfile, CPP_DL_WARNING,
|
|
1527 "character constant too long for its type");
|
|
1528
|
|
1529 /* Truncate the constant to its natural width, and simultaneously
|
|
1530 sign- or zero-extend to the full width of cppchar_t. */
|
|
1531 if (width < BITS_PER_CPPCHAR_T)
|
|
1532 {
|
|
1533 if (type == CPP_CHAR16 || type == CPP_CHAR32
|
|
1534 || CPP_OPTION (pfile, unsigned_wchar)
|
|
1535 || !(result & (1 << (width - 1))))
|
|
1536 result &= mask;
|
|
1537 else
|
|
1538 result |= ~mask;
|
|
1539 }
|
|
1540
|
|
1541 if (type == CPP_CHAR16 || type == CPP_CHAR32
|
|
1542 || CPP_OPTION (pfile, unsigned_wchar))
|
|
1543 *unsignedp = 1;
|
|
1544 else
|
|
1545 *unsignedp = 0;
|
|
1546
|
|
1547 *pchars_seen = 1;
|
|
1548 return result;
|
|
1549 }
|
|
1550
|
|
1551 /* Interpret a (possibly wide) character constant in TOKEN.
|
|
1552 PCHARS_SEEN points to a variable that is filled in with the number
|
|
1553 of characters seen, and UNSIGNEDP to a variable that indicates
|
|
1554 whether the result has signed type. */
|
|
1555 cppchar_t
|
|
1556 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
|
|
1557 unsigned int *pchars_seen, int *unsignedp)
|
|
1558 {
|
|
1559 cpp_string str = { 0, 0 };
|
|
1560 bool wide = (token->type != CPP_CHAR);
|
|
1561 cppchar_t result;
|
|
1562
|
|
1563 /* an empty constant will appear as L'', u'', U'' or '' */
|
|
1564 if (token->val.str.len == (size_t) (2 + wide))
|
|
1565 {
|
|
1566 cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
|
|
1567 return 0;
|
|
1568 }
|
|
1569 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, token->type))
|
|
1570 return 0;
|
|
1571
|
|
1572 if (wide)
|
|
1573 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
|
|
1574 token->type);
|
|
1575 else
|
|
1576 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
|
|
1577
|
|
1578 if (str.text != token->val.str.text)
|
|
1579 free ((void *)str.text);
|
|
1580
|
|
1581 return result;
|
|
1582 }
|
|
1583
|
|
1584 /* Convert an identifier denoted by ID and LEN, which might contain
|
|
1585 UCN escapes, to the source character set, either UTF-8 or
|
|
1586 UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
|
|
1587 cpp_hashnode *
|
|
1588 _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
|
|
1589 {
|
|
1590 /* It turns out that a UCN escape always turns into fewer characters
|
|
1591 than the escape itself, so we can allocate a temporary in advance. */
|
|
1592 uchar * buf = (uchar *) alloca (len + 1);
|
|
1593 uchar * bufp = buf;
|
|
1594 size_t idp;
|
|
1595
|
|
1596 for (idp = 0; idp < len; idp++)
|
|
1597 if (id[idp] != '\\')
|
|
1598 *bufp++ = id[idp];
|
|
1599 else
|
|
1600 {
|
|
1601 unsigned length = id[idp+1] == 'u' ? 4 : 8;
|
|
1602 cppchar_t value = 0;
|
|
1603 size_t bufleft = len - (bufp - buf);
|
|
1604 int rval;
|
|
1605
|
|
1606 idp += 2;
|
|
1607 while (length && idp < len && ISXDIGIT (id[idp]))
|
|
1608 {
|
|
1609 value = (value << 4) + hex_value (id[idp]);
|
|
1610 idp++;
|
|
1611 length--;
|
|
1612 }
|
|
1613 idp--;
|
|
1614
|
|
1615 /* Special case for EBCDIC: if the identifier contains
|
|
1616 a '$' specified using a UCN, translate it to EBCDIC. */
|
|
1617 if (value == 0x24)
|
|
1618 {
|
|
1619 *bufp++ = '$';
|
|
1620 continue;
|
|
1621 }
|
|
1622
|
|
1623 rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
|
|
1624 if (rval)
|
|
1625 {
|
|
1626 errno = rval;
|
|
1627 cpp_errno (pfile, CPP_DL_ERROR,
|
|
1628 "converting UCN to source character set");
|
|
1629 break;
|
|
1630 }
|
|
1631 }
|
|
1632
|
|
1633 return CPP_HASHNODE (ht_lookup (pfile->hash_table,
|
|
1634 buf, bufp - buf, HT_ALLOC));
|
|
1635 }
|
|
1636
|
|
1637 /* Convert an input buffer (containing the complete contents of one
|
|
1638 source file) from INPUT_CHARSET to the source character set. INPUT
|
|
1639 points to the input buffer, SIZE is its allocated size, and LEN is
|
|
1640 the length of the meaningful data within the buffer. The
|
|
1641 translated buffer is returned, *ST_SIZE is set to the length of
|
|
1642 the meaningful data within the translated buffer, and *BUFFER_START
|
|
1643 is set to the start of the returned buffer. *BUFFER_START may
|
|
1644 differ from the return value in the case of a BOM or other ignored
|
|
1645 marker information.
|
|
1646
|
|
1647 INPUT is expected to have been allocated with xmalloc. This
|
|
1648 function will either set *BUFFER_START to INPUT, or free it and set
|
|
1649 *BUFFER_START to a pointer to another xmalloc-allocated block of
|
|
1650 memory. */
|
|
1651 uchar *
|
|
1652 _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
|
|
1653 uchar *input, size_t size, size_t len,
|
|
1654 const unsigned char **buffer_start, off_t *st_size)
|
|
1655 {
|
|
1656 struct cset_converter input_cset;
|
|
1657 struct _cpp_strbuf to;
|
|
1658 unsigned char *buffer;
|
|
1659
|
|
1660 input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
|
|
1661 if (input_cset.func == convert_no_conversion)
|
|
1662 {
|
|
1663 to.text = input;
|
|
1664 to.asize = size;
|
|
1665 to.len = len;
|
|
1666 }
|
|
1667 else
|
|
1668 {
|
|
1669 to.asize = MAX (65536, len);
|
|
1670 to.text = XNEWVEC (uchar, to.asize);
|
|
1671 to.len = 0;
|
|
1672
|
|
1673 if (!APPLY_CONVERSION (input_cset, input, len, &to))
|
|
1674 cpp_error (pfile, CPP_DL_ERROR,
|
|
1675 "failure to convert %s to %s",
|
|
1676 CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
|
|
1677
|
|
1678 free (input);
|
|
1679 }
|
|
1680
|
|
1681 /* Clean up the mess. */
|
|
1682 if (input_cset.func == convert_using_iconv)
|
|
1683 iconv_close (input_cset.cd);
|
|
1684
|
|
1685 /* Resize buffer if we allocated substantially too much, or if we
|
|
1686 haven't enough space for the \n-terminator. */
|
|
1687 if (to.len + 4096 < to.asize || to.len >= to.asize)
|
|
1688 to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
|
|
1689
|
|
1690 /* If the file is using old-school Mac line endings (\r only),
|
|
1691 terminate with another \r, not an \n, so that we do not mistake
|
|
1692 the \r\n sequence for a single DOS line ending and erroneously
|
|
1693 issue the "No newline at end of file" diagnostic. */
|
|
1694 if (to.len && to.text[to.len - 1] == '\r')
|
|
1695 to.text[to.len] = '\r';
|
|
1696 else
|
|
1697 to.text[to.len] = '\n';
|
|
1698
|
|
1699 buffer = to.text;
|
|
1700 *st_size = to.len;
|
|
1701 #if HOST_CHARSET == HOST_CHARSET_ASCII
|
|
1702 /* The HOST_CHARSET test just above ensures that the source charset
|
|
1703 is UTF-8. So, ignore a UTF-8 BOM if we see one. Note that
|
|
1704 glib'c UTF-8 iconv() provider (as of glibc 2.7) does not ignore a
|
|
1705 BOM -- however, even if it did, we would still need this code due
|
|
1706 to the 'convert_no_conversion' case. */
|
|
1707 if (to.len >= 3 && to.text[0] == 0xef && to.text[1] == 0xbb
|
|
1708 && to.text[2] == 0xbf)
|
|
1709 {
|
|
1710 *st_size -= 3;
|
|
1711 buffer += 3;
|
|
1712 }
|
|
1713 #endif
|
|
1714
|
|
1715 *buffer_start = to.text;
|
|
1716 return buffer;
|
|
1717 }
|
|
1718
|
|
1719 /* Decide on the default encoding to assume for input files. */
|
|
1720 const char *
|
|
1721 _cpp_default_encoding (void)
|
|
1722 {
|
|
1723 const char *current_encoding = NULL;
|
|
1724
|
|
1725 /* We disable this because the default codeset is 7-bit ASCII on
|
|
1726 most platforms, and this causes conversion failures on every
|
|
1727 file in GCC that happens to have one of the upper 128 characters
|
|
1728 in it -- most likely, as part of the name of a contributor.
|
|
1729 We should definitely recognize in-band markers of file encoding,
|
|
1730 like:
|
|
1731 - the appropriate Unicode byte-order mark (FE FF) to recognize
|
|
1732 UTF16 and UCS4 (in both big-endian and little-endian flavors)
|
|
1733 and UTF8
|
|
1734 - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
|
|
1735 distinguish ASCII and EBCDIC.
|
|
1736 - now we can parse something like "#pragma GCC encoding <xyz>
|
|
1737 on the first line, or even Emacs/VIM's mode line tags (there's
|
|
1738 a problem here in that VIM uses the last line, and Emacs has
|
|
1739 its more elaborate "local variables" convention).
|
|
1740 - investigate whether Java has another common convention, which
|
|
1741 would be friendly to support.
|
|
1742 (Zack Weinberg and Paolo Bonzini, May 20th 2004) */
|
|
1743 #if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
|
|
1744 setlocale (LC_CTYPE, "");
|
|
1745 current_encoding = nl_langinfo (CODESET);
|
|
1746 #endif
|
|
1747 if (current_encoding == NULL || *current_encoding == '\0')
|
|
1748 current_encoding = SOURCE_CHARSET;
|
|
1749
|
|
1750 return current_encoding;
|
|
1751 }
|