Mercurial > hg > CbC > CbC_gcc
diff libcpp/charset.c @ 111:04ced10e8804
gcc 7
author | kono |
---|---|
date | Fri, 27 Oct 2017 22:46:09 +0900 |
parents | f6334be47118 |
children | 84e7813d76e9 |
line wrap: on
line diff
--- a/libcpp/charset.c Sun Aug 21 07:07:55 2011 +0900 +++ b/libcpp/charset.c Fri Oct 27 22:46:09 2017 +0900 @@ -1,6 +1,5 @@ /* CPP Library - charsets - Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008, 2009, - 2010 Free Software Foundation, Inc. + Copyright (C) 1998-2017 Free Software Foundation, Inc. Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges. @@ -354,7 +353,7 @@ return EILSEQ; } - if (s < 0xFFFF) + if (s <= 0xFFFF) { if (*outbytesleftp < 2) { @@ -538,6 +537,7 @@ if (to->len + flen > to->asize) { to->asize = to->len + flen; + to->asize += to->asize / 4; to->text = XRESIZEVEC (uchar, to->text, to->asize); } memcpy (to->text + to->len, from, flen); @@ -609,13 +609,13 @@ #define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \ CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO) -struct conversion +struct cpp_conversion { const char *pair; convert_f func; iconv_t fake_cd; }; -static const struct conversion conversion_tab[] = { +static const struct cpp_conversion conversion_tab[] = { { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 }, { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 }, { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 }, @@ -812,6 +812,51 @@ +/* cpp_substring_ranges's constructor. */ + +cpp_substring_ranges::cpp_substring_ranges () : + m_ranges (NULL), + m_num_ranges (0), + m_alloc_ranges (8) +{ + m_ranges = XNEWVEC (source_range, m_alloc_ranges); +} + +/* cpp_substring_ranges's destructor. */ + +cpp_substring_ranges::~cpp_substring_ranges () +{ + free (m_ranges); +} + +/* Add RANGE to the vector of source_range information. */ + +void +cpp_substring_ranges::add_range (source_range range) +{ + if (m_num_ranges >= m_alloc_ranges) + { + m_alloc_ranges *= 2; + m_ranges + = (source_range *)xrealloc (m_ranges, + sizeof (source_range) * m_alloc_ranges); + } + m_ranges[m_num_ranges++] = range; +} + +/* Read NUM ranges from LOC_READER, adding them to the vector of source_range + information. */ + +void +cpp_substring_ranges::add_n_ranges (int num, + cpp_string_location_reader &loc_reader) +{ + for (int i = 0; i < num; i++) + add_range (loc_reader.get_next ()); +} + + + /* Utility routine that computes a mask of the form 0000...111... with WIDTH 1-bits. */ static inline size_t @@ -829,29 +874,32 @@ /* Valid in a C99 identifier? */ C99 = 1, /* Valid in a C99 identifier, but not as the first character? */ - DIG = 2, + N99 = 2, /* Valid in a C++ identifier? */ CXX = 4, + /* Valid in a C11/C++11 identifier? */ + C11 = 8, + /* Valid in a C11/C++11 identifier, but not as the first character? */ + N11 = 16, /* NFC representation is not valid in an identifier? */ - CID = 8, + CID = 32, /* Might be valid NFC form? */ - NFC = 16, + NFC = 64, /* Might be valid NFKC form? */ - NKC = 32, + NKC = 128, /* Certain preceding characters might make it not valid NFC/NKFC form? */ - CTX = 64 + CTX = 256 }; -static const struct { +struct ucnrange { /* Bitmap of flags above. */ - unsigned char flags; + unsigned short flags; /* Combining class of the character. */ unsigned char combine; /* Last character in the range described by this entry. */ - unsigned short end; -} ucnranges[] = { + unsigned int end; +}; #include "ucnid.h" -}; /* Returns 1 if C is valid in an identifier, 2 if C is valid except at the start of an identifier, and 0 if C is not valid in an @@ -865,8 +913,9 @@ struct normalize_state *nst) { int mn, mx, md; + unsigned short valid_flags, invalid_start_flags; - if (c > 0xFFFF) + if (c > 0x10FFFF) return 0; mn = 0; @@ -882,15 +931,25 @@ /* When -pedantic, we require the character to have been listed by the standard for the current language. Otherwise, we accept the - union of the acceptable sets for C++98 and C99. */ - if (! (ucnranges[mn].flags & (C99 | CXX))) + union of the acceptable sets for all supported language versions. */ + valid_flags = C99 | CXX | C11; + if (CPP_PEDANTIC (pfile)) + { + if (CPP_OPTION (pfile, c11_identifiers)) + valid_flags = C11; + else if (CPP_OPTION (pfile, c99)) + valid_flags = C99; + else if (CPP_OPTION (pfile, cplusplus)) + valid_flags = CXX; + } + if (! (ucnranges[mn].flags & valid_flags)) return 0; - - if (CPP_PEDANTIC (pfile) - && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99)) - || (CPP_OPTION (pfile, cplusplus) - && !(ucnranges[mn].flags & CXX)))) - return 0; + if (CPP_OPTION (pfile, c11_identifiers)) + invalid_start_flags = N11; + else if (CPP_OPTION (pfile, c99)) + invalid_start_flags = N99; + else + invalid_start_flags = 0; /* Update NST. */ if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class) @@ -900,17 +959,6 @@ bool safe; cppchar_t p = nst->previous; - /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam. */ - if (c == 0x09BE) - safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */ - else if (c == 0x0B3E) - safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */ - else if (c == 0x0BBE) - safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */ - else if (c == 0x0CC2) - safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */ - else if (c == 0x0D3E) - safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */ /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC, and are combined algorithmically from a sequence of the form 1100-1112 1161-1175 11A8-11C2 @@ -918,20 +966,19 @@ really a valid character). Unfortunately, C99 allows (only) the NFC form, but C++ allows only the combining characters. */ - else if (c >= 0x1161 && c <= 0x1175) + if (c >= 0x1161 && c <= 0x1175) safe = p < 0x1100 || p > 0x1112; else if (c >= 0x11A8 && c <= 0x11C2) safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0); else + safe = check_nfc (pfile, c, p); + if (!safe) { - /* Uh-oh, someone updated ucnid.h without updating this code. */ - cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c); - safe = true; + if ((c >= 0x1161 && c <= 0x1175) || (c >= 0x11A8 && c <= 0x11C2)) + nst->level = MAX (nst->level, normalized_identifier_C); + else + nst->level = normalized_none; } - if (!safe && c < 0x1161) - nst->level = normalized_none; - else if (!safe) - nst->level = MAX (nst->level, normalized_identifier_C); } else if (ucnranges[mn].flags & NKC) ; @@ -941,11 +988,13 @@ nst->level = MAX (nst->level, normalized_identifier_C); else nst->level = normalized_none; - nst->previous = c; + if (ucnranges[mn].combine == 0) + nst->previous = c; nst->prev_class = ucnranges[mn].combine; - /* In C99, UCN digits may not begin identifiers. */ - if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG)) + /* In C99, UCN digits may not begin identifiers. In C11 and C++11, + UCN combining characters may not begin identifiers. */ + if (ucnranges[mn].flags & invalid_start_flags) return 2; return 1; @@ -968,21 +1017,25 @@ or 0060 (`), nor one in the range D800 through DFFF inclusive. *PSTR must be preceded by "\u" or "\U"; it is assumed that the - buffer end is delimited by a non-hex digit. Returns zero if the - UCN has not been consumed. + buffer end is delimited by a non-hex digit. Returns false if the + UCN has not been consumed, true otherwise. - Otherwise the nonzero value of the UCN, whether valid or invalid, - is returned. Diagnostics are emitted for invalid values. PSTR - is updated to point one beyond the UCN, or to the syntactically - invalid character. + The value of the UCN, whether valid or invalid, is returned in *CP. + Diagnostics are emitted for invalid values. PSTR is updated to point + one beyond the UCN, or to the syntactically invalid character. IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of - an identifier, or 2 otherwise. */ + an identifier, or 2 otherwise. -cppchar_t + If LOC_READER is non-NULL, then position information is + read from *LOC_READER and CHAR_RANGE->m_finish is updated accordingly. */ + +bool _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, const uchar *limit, int identifier_pos, - struct normalize_state *nst) + struct normalize_state *nst, cppchar_t *cp, + source_range *char_range, + cpp_string_location_reader *loc_reader) { cppchar_t result, c; unsigned int length; @@ -992,6 +1045,10 @@ if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99)) cpp_error (pfile, CPP_DL_WARNING, "universal character names are only valid in C++ and C99"); + else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0 + && !CPP_OPTION (pfile, cplusplus)) + cpp_error (pfile, CPP_DL_WARNING, + "C99's universal character names are incompatible with C90"); else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0) cpp_warning (pfile, CPP_W_TRADITIONAL, "the meaning of '\\%c' is different in traditional C", @@ -1014,6 +1071,11 @@ if (!ISXDIGIT (c)) break; str++; + if (loc_reader) + { + gcc_assert (char_range); + char_range->m_finish = loc_reader->get_next ().m_finish; + } result = (result << 4) + hex_value (c); } while (--length && str < limit); @@ -1022,8 +1084,11 @@ multiple tokens in identifiers, so we can't give a helpful error message in that case. */ if (length && identifier_pos) - return 0; - + { + *cp = 0; + return false; + } + *pstr = str; if (length) { @@ -1055,7 +1120,7 @@ CPP_OPTION (pfile, warn_dollars) = 0; cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number"); } - NORMALIZE_STATE_UPDATE_IDNUM (nst); + NORMALIZE_STATE_UPDATE_IDNUM (nst, result); } else if (identifier_pos) { @@ -1071,18 +1136,23 @@ (int) (str - base), base); } - if (result == 0) - result = 1; - - return result; + *cp = result; + return true; } /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate - it to the execution character set and write the result into TBUF. - An advanced pointer is returned. Issues all relevant diagnostics. */ + it to the execution character set and write the result into TBUF, + if TBUF is non-NULL. + An advanced pointer is returned. Issues all relevant diagnostics. + If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE + contains the location of the character so far: location information + is read from *LOC_READER, and *RANGES is updated accordingly. */ static const uchar * convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit, - struct _cpp_strbuf *tbuf, struct cset_converter cvt) + struct _cpp_strbuf *tbuf, struct cset_converter cvt, + source_range char_range, + cpp_string_location_reader *loc_reader, + cpp_substring_ranges *ranges) { cppchar_t ucn; uchar buf[6]; @@ -1091,8 +1161,17 @@ int rval; struct normalize_state nst = INITIAL_NORMALIZE_STATE; + /* loc_reader and ranges must either be both NULL, or both be non-NULL. */ + gcc_assert ((loc_reader != NULL) == (ranges != NULL)); + from++; /* Skip u/U. */ - ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst); + + if (loc_reader) + /* The u/U is part of the spelling of this character. */ + char_range.m_finish = loc_reader->get_next ().m_finish; + + _cpp_valid_ucn (pfile, &from, limit, 0, &nst, + &ucn, &char_range, loc_reader); rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft); if (rval) @@ -1101,9 +1180,20 @@ cpp_errno (pfile, CPP_DL_ERROR, "converting UCN to source character set"); } - else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf)) - cpp_errno (pfile, CPP_DL_ERROR, - "converting UCN to execution character set"); + else + { + if (tbuf) + if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf)) + cpp_errno (pfile, CPP_DL_ERROR, + "converting UCN to execution character set"); + + if (loc_reader) + { + int num_encoded_bytes = 6 - bytesleft; + for (int i = 0; i < num_encoded_bytes; i++) + ranges->add_range (char_range); + } + } return from; } @@ -1159,31 +1249,48 @@ } /* Convert a hexadecimal escape, pointed to by FROM, to the execution - character set and write it into the string buffer TBUF. Returns an - advanced pointer, and issues diagnostics as necessary. + character set and write it into the string buffer TBUF (if non-NULL). + Returns an advanced pointer, and issues diagnostics as necessary. No character set translation occurs; this routine always produces the execution-set character with numeric value equal to the given hex - number. You can, e.g. generate surrogate pairs this way. */ + number. You can, e.g. generate surrogate pairs this way. + If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE + contains the location of the character so far: location information + is read from *LOC_READER, and *RANGES is updated accordingly. */ static const uchar * convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit, - struct _cpp_strbuf *tbuf, struct cset_converter cvt) + struct _cpp_strbuf *tbuf, struct cset_converter cvt, + source_range char_range, + cpp_string_location_reader *loc_reader, + cpp_substring_ranges *ranges) { cppchar_t c, n = 0, overflow = 0; int digits_found = 0; size_t width = cvt.width; size_t mask = width_to_mask (width); + /* loc_reader and ranges must either be both NULL, or both be non-NULL. */ + gcc_assert ((loc_reader != NULL) == (ranges != NULL)); + if (CPP_WTRADITIONAL (pfile)) cpp_warning (pfile, CPP_W_TRADITIONAL, "the meaning of '\\x' is different in traditional C"); - from++; /* Skip 'x'. */ + /* Skip 'x'. */ + from++; + + /* The 'x' is part of the spelling of this character. */ + if (loc_reader) + char_range.m_finish = loc_reader->get_next ().m_finish; + while (from < limit) { c = *from; if (! hex_p (c)) break; from++; + if (loc_reader) + char_range.m_finish = loc_reader->get_next ().m_finish; overflow |= n ^ (n << 4 >> 4); n = (n << 4) + hex_value (c); digits_found = 1; @@ -1203,7 +1310,10 @@ n &= mask; } - emit_numeric_escape (pfile, n, tbuf, cvt); + if (tbuf) + emit_numeric_escape (pfile, n, tbuf, cvt); + if (ranges) + ranges->add_range (char_range); return from; } @@ -1213,10 +1323,16 @@ advanced pointer, and issues diagnostics as necessary. No character set translation occurs; this routine always produces the execution-set character with numeric value equal to the given octal - number. */ + number. + If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE + contains the location of the character so far: location information + is read from *LOC_READER, and *RANGES is updated accordingly. */ static const uchar * convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit, - struct _cpp_strbuf *tbuf, struct cset_converter cvt) + struct _cpp_strbuf *tbuf, struct cset_converter cvt, + source_range char_range, + cpp_string_location_reader *loc_reader, + cpp_substring_ranges *ranges) { size_t count = 0; cppchar_t c, n = 0; @@ -1224,12 +1340,17 @@ size_t mask = width_to_mask (width); bool overflow = false; + /* loc_reader and ranges must either be both NULL, or both be non-NULL. */ + gcc_assert ((loc_reader != NULL) == (ranges != NULL)); + while (from < limit && count++ < 3) { c = *from; if (c < '0' || c > '7') break; from++; + if (loc_reader) + char_range.m_finish = loc_reader->get_next ().m_finish; overflow |= n ^ (n << 3 >> 3); n = (n << 3) + c - '0'; } @@ -1241,18 +1362,26 @@ n &= mask; } - emit_numeric_escape (pfile, n, tbuf, cvt); + if (tbuf) + emit_numeric_escape (pfile, n, tbuf, cvt); + if (ranges) + ranges->add_range (char_range); return from; } /* Convert an escape sequence (pointed to by FROM) to its value on the target, and to the execution character set. Do not scan past - LIMIT. Write the converted value into TBUF. Returns an advanced - pointer. Handles all relevant diagnostics. */ + LIMIT. Write the converted value into TBUF, if TBUF is non-NULL. + Returns an advanced pointer. Handles all relevant diagnostics. + If LOC_READER is non-NULL, then RANGES must be non-NULL: location + information is read from *LOC_READER, and *RANGES is updated + accordingly. */ static const uchar * convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit, - struct _cpp_strbuf *tbuf, struct cset_converter cvt) + struct _cpp_strbuf *tbuf, struct cset_converter cvt, + cpp_string_location_reader *loc_reader, + cpp_substring_ranges *ranges) { /* Values of \a \b \e \f \n \r \t \v respectively. */ #if HOST_CHARSET == HOST_CHARSET_ASCII @@ -1265,20 +1394,28 @@ uchar c; + /* Record the location of the backslash. */ + source_range char_range; + if (loc_reader) + char_range = loc_reader->get_next (); + c = *from; switch (c) { /* UCNs, hex escapes, and octal escapes are processed separately. */ case 'u': case 'U': - return convert_ucn (pfile, from, limit, tbuf, cvt); + return convert_ucn (pfile, from, limit, tbuf, cvt, + char_range, loc_reader, ranges); case 'x': - return convert_hex (pfile, from, limit, tbuf, cvt); + return convert_hex (pfile, from, limit, tbuf, cvt, + char_range, loc_reader, ranges); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': - return convert_oct (pfile, from, limit, tbuf, cvt); + return convert_oct (pfile, from, limit, tbuf, cvt, + char_range, loc_reader, ranges); /* Various letter escapes. Get the appropriate host-charset value into C. */ @@ -1330,10 +1467,17 @@ } } - /* Now convert what we have to the execution character set. */ - if (!APPLY_CONVERSION (cvt, &c, 1, tbuf)) - cpp_errno (pfile, CPP_DL_ERROR, - "converting escape sequence to execution character set"); + if (tbuf) + /* Now convert what we have to the execution character set. */ + if (!APPLY_CONVERSION (cvt, &c, 1, tbuf)) + cpp_errno (pfile, CPP_DL_ERROR, + "converting escape sequence to execution character set"); + + if (loc_reader) + { + char_range.m_finish = loc_reader->get_next ().m_finish; + ranges->add_range (char_range); + } return from + 1; } @@ -1347,6 +1491,7 @@ { default: return pfile->narrow_cset_desc; + case CPP_UTF8CHAR: case CPP_UTF8STRING: return pfile->utf8_cset_desc; case CPP_CHAR16: @@ -1365,28 +1510,52 @@ are to be converted from the source to the execution character set, escape sequences translated, and finally all are to be concatenated. WIDE indicates whether or not to produce a wide - string. The result is written into TO. Returns true for success, - false for failure. */ -bool -cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count, - cpp_string *to, enum cpp_ttype type) + string. If TO is non-NULL, the result is written into TO. + If LOC_READERS and OUT are non-NULL, then location information + is read from LOC_READERS (which must be an array of length COUNT), + and location information is written to *RANGES. + + Returns true for success, false for failure. */ + +static bool +cpp_interpret_string_1 (cpp_reader *pfile, const cpp_string *from, size_t count, + cpp_string *to, enum cpp_ttype type, + cpp_string_location_reader *loc_readers, + cpp_substring_ranges *out) { struct _cpp_strbuf tbuf; const uchar *p, *base, *limit; size_t i; struct cset_converter cvt = converter_for_type (pfile, type); - tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len); - tbuf.text = XNEWVEC (uchar, tbuf.asize); - tbuf.len = 0; + /* loc_readers and out must either be both NULL, or both be non-NULL. */ + gcc_assert ((loc_readers != NULL) == (out != NULL)); + if (to) + { + tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len); + tbuf.text = XNEWVEC (uchar, tbuf.asize); + tbuf.len = 0; + } + + cpp_string_location_reader *loc_reader = NULL; for (i = 0; i < count; i++) { + if (loc_readers) + loc_reader = &loc_readers[i]; + p = from[i].text; if (*p == 'u') { - if (*++p == '8') - p++; + p++; + if (loc_reader) + loc_reader->get_next (); + if (*p == '8') + { + p++; + if (loc_reader) + loc_reader->get_next (); + } } else if (*p == 'L' || *p == 'U') p++; if (*p == 'R') @@ -1395,23 +1564,64 @@ /* Skip over 'R"'. */ p += 2; + if (loc_reader) + { + loc_reader->get_next (); + loc_reader->get_next (); + } prefix = p; while (*p != '(') - p++; + { + p++; + if (loc_reader) + loc_reader->get_next (); + } p++; + if (loc_reader) + loc_reader->get_next (); limit = from[i].text + from[i].len; if (limit >= p + (p - prefix) + 1) limit -= (p - prefix) + 1; /* Raw strings are all normal characters; these can be fed directly to convert_cset. */ - if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf)) - goto fail; + if (to) + if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf)) + goto fail; + + if (loc_reader) + { + /* If generating source ranges, assume we have a 1:1 + correspondence between bytes in the source encoding and bytes + in the execution encoding (e.g. if we have a UTF-8 to UTF-8 + conversion), so that this run of bytes in the source file + corresponds to a run of bytes in the execution string. + This requirement is guaranteed by an early-reject in + cpp_interpret_string_ranges. */ + gcc_assert (cvt.func == convert_no_conversion); + out->add_n_ranges (limit - p, *loc_reader); + } continue; } - p++; /* Skip leading quote. */ + /* If we don't now have a leading quote, something has gone wrong. + This can occur if cpp_interpret_string_ranges is handling a + stringified macro argument, but should not be possible otherwise. */ + if (*p != '"' && *p != '\'') + { + gcc_assert (out != NULL); + cpp_error (pfile, CPP_DL_ERROR, "missing open quote"); + if (to) + free (tbuf.text); + return false; + } + + /* Skip leading quote. */ + p++; + if (loc_reader) + loc_reader->get_next (); + limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */ for (;;) @@ -1423,29 +1633,137 @@ { /* We have a run of normal characters; these can be fed directly to convert_cset. */ - if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf)) - goto fail; + if (to) + if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf)) + goto fail; + /* Similar to above: assumes we have a 1:1 correspondence + between bytes in the source encoding and bytes in the + execution encoding. */ + if (loc_reader) + { + gcc_assert (cvt.func == convert_no_conversion); + out->add_n_ranges (p - base, *loc_reader); + } } - if (p == limit) + if (p >= limit) break; - p = convert_escape (pfile, p + 1, limit, &tbuf, cvt); + struct _cpp_strbuf *tbuf_ptr = to ? &tbuf : NULL; + p = convert_escape (pfile, p + 1, limit, tbuf_ptr, cvt, + loc_reader, out); } } - /* NUL-terminate the 'to' buffer and translate it to a cpp_string - structure. */ - emit_numeric_escape (pfile, 0, &tbuf, cvt); - tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len); - to->text = tbuf.text; - to->len = tbuf.len; + + if (to) + { + /* NUL-terminate the 'to' buffer and translate it to a cpp_string + structure. */ + emit_numeric_escape (pfile, 0, &tbuf, cvt); + tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len); + to->text = tbuf.text; + to->len = tbuf.len; + } + /* Use the location of the trailing quote as the location of the + NUL-terminator. */ + if (loc_reader) + { + source_range range = loc_reader->get_next (); + out->add_range (range); + } + return true; fail: cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set"); - free (tbuf.text); + if (to) + free (tbuf.text); return false; } +/* FROM is an array of cpp_string structures of length COUNT. These + are to be converted from the source to the execution character set, + escape sequences translated, and finally all are to be + concatenated. WIDE indicates whether or not to produce a wide + string. The result is written into TO. Returns true for success, + false for failure. */ +bool +cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count, + cpp_string *to, enum cpp_ttype type) +{ + return cpp_interpret_string_1 (pfile, from, count, to, type, NULL, NULL); +} + +/* A "do nothing" error-handling callback for use by + cpp_interpret_string_ranges, so that it can temporarily suppress + error-handling. */ + +static bool +noop_error_cb (cpp_reader *, int, int, rich_location *, + const char *, va_list *) +{ + /* no-op. */ + return true; +} + +/* This function mimics the behavior of cpp_interpret_string, but + rather than generating a string in the execution character set, + *OUT is written to with the source code ranges of the characters + in such a string. + FROM and LOC_READERS should both be arrays of length COUNT. + Returns NULL for success, or an error message for failure. */ + +const char * +cpp_interpret_string_ranges (cpp_reader *pfile, const cpp_string *from, + cpp_string_location_reader *loc_readers, + size_t count, + cpp_substring_ranges *out, + enum cpp_ttype type) +{ + /* There are a couple of cases in the range-handling in + cpp_interpret_string_1 that rely on there being a 1:1 correspondence + between bytes in the source encoding and bytes in the execution + encoding, so that each byte in the execution string can correspond + to the location of a byte in the source string. + + This holds for the typical case of a UTF-8 to UTF-8 conversion. + Enforce this requirement by only attempting to track substring + locations if we have source encoding == execution encoding. + + This is a stronger condition than we need, since we could e.g. + have ASCII to EBCDIC (with 1 byte per character before and after), + but it seems to be a reasonable restriction. */ + struct cset_converter cvt = converter_for_type (pfile, type); + if (cvt.func != convert_no_conversion) + return "execution character set != source character set"; + + /* For on-demand strings we have already lexed the strings, so there + should be no errors. However, if we have bogus source location + data (or stringified macro arguments), the attempt to lex the + strings could fail with an error. Temporarily install an + error-handler to catch the error, so that it can lead to this call + failing, rather than being emitted as a user-visible diagnostic. + If an error does occur, we should see it via the return value of + cpp_interpret_string_1. */ + bool (*saved_error_handler) (cpp_reader *, int, int, rich_location *, + const char *, va_list *) + ATTRIBUTE_FPTR_PRINTF(5,0); + + saved_error_handler = pfile->cb.error; + pfile->cb.error = noop_error_cb; + + bool result = cpp_interpret_string_1 (pfile, from, count, NULL, type, + loc_readers, out); + + /* Restore the saved error-handler. */ + pfile->cb.error = saved_error_handler; + + if (!result) + return "cpp_interpret_string_1 failed"; + + /* Success. */ + return NULL; +} + /* Subroutine of do_line and do_linemarker. Convert escape sequences in a string, but do not perform character set conversion. */ bool @@ -1603,17 +1921,25 @@ unsigned int *pchars_seen, int *unsignedp) { cpp_string str = { 0, 0 }; - bool wide = (token->type != CPP_CHAR); + bool wide = (token->type != CPP_CHAR && token->type != CPP_UTF8CHAR); + int u8 = 2 * int(token->type == CPP_UTF8CHAR); cppchar_t result; - /* an empty constant will appear as L'', u'', U'' or '' */ - if (token->val.str.len == (size_t) (2 + wide)) + /* An empty constant will appear as L'', u'', U'', u8'', or '' */ + if (token->val.str.len == (size_t) (2 + wide + u8)) { cpp_error (pfile, CPP_DL_ERROR, "empty character constant"); + *pchars_seen = 0; + *unsignedp = 0; return 0; } - else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, token->type)) - return 0; + else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, + token->type)) + { + *pchars_seen = 0; + *unsignedp = 0; + return 0; + } if (wide) result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp, @@ -1729,9 +2055,15 @@ iconv_close (input_cset.cd); /* Resize buffer if we allocated substantially too much, or if we - haven't enough space for the \n-terminator. */ - if (to.len + 4096 < to.asize || to.len >= to.asize) - to.text = XRESIZEVEC (uchar, to.text, to.len + 1); + haven't enough space for the \n-terminator or following + 15 bytes of padding (used to quiet warnings from valgrind or + Address Sanitizer, when the optimized lexer accesses aligned + 16-byte memory chunks, including the bytes after the malloced, + area, and stops lexing on '\n'). */ + if (to.len + 4096 < to.asize || to.len + 16 > to.asize) + to.text = XRESIZEVEC (uchar, to.text, to.len + 16); + + memset (to.text + to.len, '\0', 16); /* If the file is using old-school Mac line endings (\r only), terminate with another \r, not an \n, so that we do not mistake @@ -1795,3 +2127,39 @@ return current_encoding; } + +/* Implementation of class cpp_string_location_reader. */ + +/* Constructor for cpp_string_location_reader. */ + +cpp_string_location_reader:: +cpp_string_location_reader (source_location src_loc, + line_maps *line_table) +: m_line_table (line_table) +{ + src_loc = get_range_from_loc (line_table, src_loc).m_start; + + /* SRC_LOC might be a macro location. It only makes sense to do + column-by-column calculations on ordinary maps, so get the + corresponding location in an ordinary map. */ + m_loc + = linemap_resolve_location (line_table, src_loc, + LRK_SPELLING_LOCATION, NULL); + + const line_map_ordinary *map + = linemap_check_ordinary (linemap_lookup (line_table, m_loc)); + m_offset_per_column = (1 << map->m_range_bits); +} + +/* Get the range of the next source byte. */ + +source_range +cpp_string_location_reader::get_next () +{ + source_range result; + result.m_start = m_loc; + result.m_finish = m_loc; + if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS) + m_loc += m_offset_per_column; + return result; +}