comparison libcpp/charset.c @ 145:1830386684a0

gcc-9.2.0
author anatofuz
date Thu, 13 Feb 2020 11:34:05 +0900
parents 84e7813d76e9
children
comparison
equal deleted inserted replaced
131:84e7813d76e9 145:1830386684a0
1 /* CPP Library - charsets 1 /* CPP Library - charsets
2 Copyright (C) 1998-2018 Free Software Foundation, Inc. 2 Copyright (C) 1998-2020 Free Software Foundation, Inc.
3 3
4 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges. 4 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
5 5
6 This program is free software; you can redistribute it and/or modify it 6 This program is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the 7 under the terms of the GNU General Public License as published by the
899 /* Last character in the range described by this entry. */ 899 /* Last character in the range described by this entry. */
900 unsigned int end; 900 unsigned int end;
901 }; 901 };
902 #include "ucnid.h" 902 #include "ucnid.h"
903 903
904 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. */
905 #define UCS_LIMIT 0x10FFFF
906
904 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at 907 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
905 the start of an identifier, and 0 if C is not valid in an 908 the start of an identifier, and 0 if C is not valid in an
906 identifier. We assume C has already gone through the checks of 909 identifier. We assume C has already gone through the checks of
907 _cpp_valid_ucn. Also update NST for C if returning nonzero. The 910 _cpp_valid_ucn. Also update NST for C if returning nonzero. The
908 algorithm is a simple binary search on the table defined in 911 algorithm is a simple binary search on the table defined in
913 struct normalize_state *nst) 916 struct normalize_state *nst)
914 { 917 {
915 int mn, mx, md; 918 int mn, mx, md;
916 unsigned short valid_flags, invalid_start_flags; 919 unsigned short valid_flags, invalid_start_flags;
917 920
918 if (c > 0x10FFFF) 921 if (c > UCS_LIMIT)
919 return 0; 922 return 0;
920 923
921 mn = 0; 924 mn = 0;
922 mx = ARRAY_SIZE (ucnranges) - 1; 925 mx = ARRAY_SIZE (ucnranges) - 1;
923 while (mx != mn) 926 while (mx != mn)
1014 1017
1015 C99 6.4.3: A universal character name shall not specify a character 1018 C99 6.4.3: A universal character name shall not specify a character
1016 whose short identifier is less than 00A0 other than 0024 ($), 0040 (@), 1019 whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
1017 or 0060 (`), nor one in the range D800 through DFFF inclusive. 1020 or 0060 (`), nor one in the range D800 through DFFF inclusive.
1018 1021
1022 If the hexadecimal value is larger than the upper bound of the UCS
1023 codespace specified in ISO/IEC 10646, a pedantic warning is issued
1024 in all versions of C and in the C++2a or later versions of C++.
1025
1019 *PSTR must be preceded by "\u" or "\U"; it is assumed that the 1026 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
1020 buffer end is delimited by a non-hex digit. Returns false if the 1027 buffer end is delimited by a non-hex digit. Returns false if the
1021 UCN has not been consumed, true otherwise. 1028 UCN has not been consumed, true otherwise.
1022 1029
1023 The value of the UCN, whether valid or invalid, is returned in *CP. 1030 The value of the UCN, whether valid or invalid, is returned in *CP.
1133 else if (validity == 2 && identifier_pos == 1) 1140 else if (validity == 2 && identifier_pos == 1)
1134 cpp_error (pfile, CPP_DL_ERROR, 1141 cpp_error (pfile, CPP_DL_ERROR,
1135 "universal character %.*s is not valid at the start of an identifier", 1142 "universal character %.*s is not valid at the start of an identifier",
1136 (int) (str - base), base); 1143 (int) (str - base), base);
1137 } 1144 }
1145 else if (result > UCS_LIMIT
1146 && (!CPP_OPTION (pfile, cplusplus)
1147 || CPP_OPTION (pfile, lang) > CLK_CXX17))
1148 cpp_error (pfile, CPP_DL_PEDWARN,
1149 "%.*s is outside the UCS codespace",
1150 (int) (str - base), base);
1138 1151
1139 *cp = result; 1152 *cp = result;
1140 return true; 1153 return true;
1141 } 1154 }
1142 1155
1194 ranges->add_range (char_range); 1207 ranges->add_range (char_range);
1195 } 1208 }
1196 } 1209 }
1197 1210
1198 return from; 1211 return from;
1212 }
1213
1214 /* Performs a similar task to _cpp_valid_ucn, but parses UTF-8-encoded
1215 extended characters rather than UCNs. If the return value is TRUE, then a
1216 character was successfully decoded and stored in *CP; *PSTR has been
1217 updated to point one past the valid UTF-8 sequence. Diagnostics may have
1218 been emitted if the character parsed is not allowed in the current context.
1219 If the return value is FALSE, then *PSTR has not been modified and *CP may
1220 equal 0, to indicate that *PSTR does not form a valid UTF-8 sequence, or it
1221 may, when processing an identifier in C mode, equal a codepoint that was
1222 validly encoded but is not allowed to appear in an identifier. In either
1223 case, no diagnostic is emitted, and the return value of FALSE should cause
1224 a new token to be formed.
1225
1226 Unlike _cpp_valid_ucn, this will never be called when lexing a string; only
1227 a potential identifier, or a CPP_OTHER token. NST is unused in the latter
1228 case.
1229
1230 As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for
1231 the start of an identifier, or 2 otherwise. */
1232
1233 extern bool
1234 _cpp_valid_utf8 (cpp_reader *pfile,
1235 const uchar **pstr,
1236 const uchar *limit,
1237 int identifier_pos,
1238 struct normalize_state *nst,
1239 cppchar_t *cp)
1240 {
1241 const uchar *base = *pstr;
1242 size_t inbytesleft = limit - base;
1243 if (one_utf8_to_cppchar (pstr, &inbytesleft, cp))
1244 {
1245 /* Not a valid UTF-8 sequence. No diagnostic here, as this byte
1246 will instead become a new token. */
1247 *cp = 0;
1248 return false;
1249 }
1250
1251 if (identifier_pos)
1252 {
1253 switch (ucn_valid_in_identifier (pfile, *cp, nst))
1254 {
1255
1256 case 0:
1257 /* In C++, an invalid character in an identifier is an error,
1258 because logically the UTF-8 was converted to a UCN during
1259 translation phase 1 (even though we don't physically do it that
1260 way). In C, the character instead becomes grammatically a
1261 separate token, so rewind *PSTR and return false silently. */
1262
1263 if (CPP_OPTION (pfile, cplusplus))
1264 cpp_error (pfile, CPP_DL_ERROR,
1265 "extended character %.*s is not valid in an identifier",
1266 (int) (*pstr - base), base);
1267 else
1268 {
1269 *pstr = base;
1270 return false;
1271 }
1272
1273 break;
1274
1275 case 2:
1276 if (identifier_pos == 1)
1277 {
1278 /* This is treated the same way in C++ or C99 -- lexed as an
1279 identifier which is then invalid because an identifier is
1280 not allowed to start with this character. */
1281 cpp_error (pfile, CPP_DL_ERROR,
1282 "extended character %.*s is not valid at the start of an identifier",
1283 (int) (*pstr - base), base);
1284 }
1285 break;
1286 }
1287 }
1288
1289 return true;
1199 } 1290 }
1200 1291
1201 /* Subroutine of convert_hex and convert_oct. N is the representation 1292 /* Subroutine of convert_hex and convert_oct. N is the representation
1202 in the execution character set of a numeric escape; write it into the 1293 in the execution character set of a numeric escape; write it into the
1203 string buffer TBUF and update the end-of-string pointer therein. WIDE 1294 string buffer TBUF and update the end-of-string pointer therein. WIDE
1788 1879
1789 1880
1790 /* Subroutine of cpp_interpret_charconst which performs the conversion 1881 /* Subroutine of cpp_interpret_charconst which performs the conversion
1791 to a number, for narrow strings. STR is the string structure returned 1882 to a number, for narrow strings. STR is the string structure returned
1792 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for 1883 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
1793 cpp_interpret_charconst. */ 1884 cpp_interpret_charconst. TYPE is the token type. */
1794 static cppchar_t 1885 static cppchar_t
1795 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str, 1886 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1796 unsigned int *pchars_seen, int *unsignedp) 1887 unsigned int *pchars_seen, int *unsignedp,
1888 enum cpp_ttype type)
1797 { 1889 {
1798 size_t width = CPP_OPTION (pfile, char_precision); 1890 size_t width = CPP_OPTION (pfile, char_precision);
1799 size_t max_chars = CPP_OPTION (pfile, int_precision) / width; 1891 size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1800 size_t mask = width_to_mask (width); 1892 size_t mask = width_to_mask (width);
1801 size_t i; 1893 size_t i;
1820 result = (result << width) | c; 1912 result = (result << width) | c;
1821 else 1913 else
1822 result = c; 1914 result = c;
1823 } 1915 }
1824 1916
1917 if (type == CPP_UTF8CHAR)
1918 max_chars = 1;
1825 if (i > max_chars) 1919 if (i > max_chars)
1826 { 1920 {
1827 i = max_chars; 1921 i = max_chars;
1828 cpp_error (pfile, CPP_DL_WARNING, 1922 cpp_error (pfile, type == CPP_UTF8CHAR ? CPP_DL_ERROR : CPP_DL_WARNING,
1829 "character constant too long for its type"); 1923 "character constant too long for its type");
1830 } 1924 }
1831 else if (i > 1 && CPP_OPTION (pfile, warn_multichar)) 1925 else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1832 cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant"); 1926 cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");
1833 1927
1834 /* Multichar constants are of type int and therefore signed. */ 1928 /* Multichar constants are of type int and therefore signed. */
1835 if (i > 1) 1929 if (i > 1)
1836 unsigned_p = 0; 1930 unsigned_p = 0;
1931 else if (type == CPP_UTF8CHAR && !CPP_OPTION (pfile, cplusplus))
1932 unsigned_p = 1;
1837 else 1933 else
1838 unsigned_p = CPP_OPTION (pfile, unsigned_char); 1934 unsigned_p = CPP_OPTION (pfile, unsigned_char);
1839 1935
1840 /* Truncate the constant to its natural width, and simultaneously 1936 /* Truncate the constant to its natural width, and simultaneously
1841 sign- or zero-extend to the full width of cppchar_t. 1937 sign- or zero-extend to the full width of cppchar_t.
1872 size_t cmask = width_to_mask (cwidth); 1968 size_t cmask = width_to_mask (cwidth);
1873 size_t nbwc = width / cwidth; 1969 size_t nbwc = width / cwidth;
1874 size_t off, i; 1970 size_t off, i;
1875 cppchar_t result = 0, c; 1971 cppchar_t result = 0, c;
1876 1972
1973 if (str.len <= nbwc)
1974 {
1975 /* Error recovery, if no errors have been diagnosed previously,
1976 there should be at least two wide characters. Empty literals
1977 are diagnosed earlier and we can get just the zero terminator
1978 only if there were errors diagnosed during conversion. */
1979 *pchars_seen = 0;
1980 *unsignedp = 0;
1981 return 0;
1982 }
1983
1877 /* This is finicky because the string is in the target's byte order, 1984 /* This is finicky because the string is in the target's byte order,
1878 which may not be our byte order. Only the last character, ignoring 1985 which may not be our byte order. Only the last character, ignoring
1879 the NUL terminator, is relevant. */ 1986 the NUL terminator, is relevant. */
1880 off = str.len - (nbwc * 2); 1987 off = str.len - (nbwc * 2);
1881 result = 0; 1988 result = 0;
1887 1994
1888 /* Wide character constants have type wchar_t, and a single 1995 /* Wide character constants have type wchar_t, and a single
1889 character exactly fills a wchar_t, so a multi-character wide 1996 character exactly fills a wchar_t, so a multi-character wide
1890 character constant is guaranteed to overflow. */ 1997 character constant is guaranteed to overflow. */
1891 if (str.len > nbwc * 2) 1998 if (str.len > nbwc * 2)
1892 cpp_error (pfile, CPP_DL_WARNING, 1999 cpp_error (pfile, (CPP_OPTION (pfile, cplusplus)
2000 && (type == CPP_CHAR16 || type == CPP_CHAR32))
2001 ? CPP_DL_ERROR : CPP_DL_WARNING,
1893 "character constant too long for its type"); 2002 "character constant too long for its type");
1894 2003
1895 /* Truncate the constant to its natural width, and simultaneously 2004 /* Truncate the constant to its natural width, and simultaneously
1896 sign- or zero-extend to the full width of cppchar_t. */ 2005 sign- or zero-extend to the full width of cppchar_t. */
1897 if (width < BITS_PER_CPPCHAR_T) 2006 if (width < BITS_PER_CPPCHAR_T)
1945 2054
1946 if (wide) 2055 if (wide)
1947 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp, 2056 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
1948 token->type); 2057 token->type);
1949 else 2058 else
1950 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp); 2059 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp,
2060 token->type);
1951 2061
1952 if (str.text != token->val.str.text) 2062 if (str.text != token->val.str.text)
1953 free ((void *)str.text); 2063 free ((void *)str.text);
1954 2064
1955 return result; 2065 return result;
1956 } 2066 }
1957 2067
1958 /* Convert an identifier denoted by ID and LEN, which might contain 2068 /* Convert an identifier denoted by ID and LEN, which might contain
1959 UCN escapes, to the source character set, either UTF-8 or 2069 UCN escapes or UTF-8 multibyte chars, to the source character set,
1960 UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */ 2070 either UTF-8 or UTF-EBCDIC. Assumes that the identifier is actually
2071 a valid identifier. */
1961 cpp_hashnode * 2072 cpp_hashnode *
1962 _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) 2073 _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
1963 { 2074 {
1964 /* It turns out that a UCN escape always turns into fewer characters 2075 /* It turns out that a UCN escape always turns into fewer characters
1965 than the escape itself, so we can allocate a temporary in advance. */ 2076 than the escape itself, so we can allocate a temporary in advance. */
2133 /* Implementation of class cpp_string_location_reader. */ 2244 /* Implementation of class cpp_string_location_reader. */
2134 2245
2135 /* Constructor for cpp_string_location_reader. */ 2246 /* Constructor for cpp_string_location_reader. */
2136 2247
2137 cpp_string_location_reader:: 2248 cpp_string_location_reader::
2138 cpp_string_location_reader (source_location src_loc, 2249 cpp_string_location_reader (location_t src_loc,
2139 line_maps *line_table) 2250 line_maps *line_table)
2140 : m_line_table (line_table)
2141 { 2251 {
2142 src_loc = get_range_from_loc (line_table, src_loc).m_start; 2252 src_loc = get_range_from_loc (line_table, src_loc).m_start;
2143 2253
2144 /* SRC_LOC might be a macro location. It only makes sense to do 2254 /* SRC_LOC might be a macro location. It only makes sense to do
2145 column-by-column calculations on ordinary maps, so get the 2255 column-by-column calculations on ordinary maps, so get the
2163 result.m_finish = m_loc; 2273 result.m_finish = m_loc;
2164 if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS) 2274 if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS)
2165 m_loc += m_offset_per_column; 2275 m_loc += m_offset_per_column;
2166 return result; 2276 return result;
2167 } 2277 }
2278
2279 /* Helper for cpp_byte_column_to_display_column and its inverse. Given a
2280 pointer to a UTF-8-encoded character, compute its display width. *INBUFP
2281 points on entry to the start of the UTF-8 encoding of the character, and
2282 is updated to point just after the last byte of the encoding. *INBYTESLEFTP
2283 contains on entry the remaining size of the buffer into which *INBUFP
2284 points, and this is also updated accordingly. If *INBUFP does not
2285 point to a valid UTF-8-encoded sequence, then it will be treated as a single
2286 byte with display width 1. */
2287
2288 static inline int
2289 compute_next_display_width (const uchar **inbufp, size_t *inbytesleftp)
2290 {
2291 cppchar_t c;
2292 if (one_utf8_to_cppchar (inbufp, inbytesleftp, &c) != 0)
2293 {
2294 /* Input is not valid UTF-8. This could be fine, e.g. in a
2295 string literal, so don't complain. Just step over one byte and
2296 treat it as if it has a width of one. */
2297 ++*inbufp;
2298 --*inbytesleftp;
2299 return 1;
2300 }
2301
2302 /* one_utf8_to_cppchar() has updated inbufp and inbytesleftp for us. */
2303 return cpp_wcwidth (c);
2304 }
2305
2306 /* For the string of length DATA_LENGTH bytes that begins at DATA, compute
2307 how many display columns are occupied by the first COLUMN bytes. COLUMN
2308 may exceed DATA_LENGTH, in which case the phantom bytes at the end are
2309 treated as if they have display width 1. */
2310
2311 int
2312 cpp_byte_column_to_display_column (const char *data, int data_length,
2313 int column)
2314 {
2315 int display_col = 0;
2316 const uchar *udata = (const uchar *) data;
2317 const int offset = MAX (0, column - data_length); /* Phantom bytes past the end of DATA. */
2318 size_t inbytesleft = column - offset; /* Bytes of DATA itself to measure. */
2319 while (inbytesleft)
2320 display_col += compute_next_display_width (&udata, &inbytesleft);
2321 return display_col + offset; /* Each phantom byte counts as one column. */
2322 }
2323
2324 /* For the string of length DATA_LENGTH bytes that begins at DATA, compute
2325 the least number of bytes that will result in at least DISPLAY_COL display
2326 columns. The return value may exceed DATA_LENGTH if the entire string does
2327 not occupy enough display columns. */
2328
2329 int
2330 cpp_display_column_to_byte_column (const char *data, int data_length,
2331 int display_col)
2332 {
2333 int column = 0; /* Display columns accumulated so far. */
2334 const uchar *udata = (const uchar *) data;
2335 size_t inbytesleft = data_length;
2336 while (column < display_col && inbytesleft)
2337 column += compute_next_display_width (&udata, &inbytesleft);
2338 return data_length - inbytesleft + MAX (0, display_col - column); /* Bytes consumed, plus one byte per display column still missing. */
2339 }
2340
2341 /* Our own version of wcwidth(). We don't use the actual wcwidth() in glibc,
2342 because that will inspect the user's locale, and in particular in an ASCII
2343 locale, it will not return anything useful for extended characters. But GCC
2344 in other respects (see e.g. _cpp_default_encoding()) behaves as if
2345 everything is UTF-8. We also make some tweaks that are useful for the way
2346 GCC needs to use this data, e.g. tabs and other control characters should be
2347 treated as having width 1. The lookup tables are generated from
2348 contrib/unicode/gen_wcwidth.py and were made by simply calling glibc
2349 wcwidth() on all codepoints, then applying the small tweaks. These tables
2350 are not highly optimized, but for the present purpose of outputting
2351 diagnostics, they are sufficient. */
2352
2353 #include "generated_cpp_wcwidth.h"
2354 int cpp_wcwidth (cppchar_t c)
2355 {
2356 if (__builtin_expect (c <= wcwidth_range_ends[0], true)) /* Expected common case: C lies in the first range. */
2357 return wcwidth_widths[0];
2358
2359 /* Binary search the rest of the tables for the first range whose end is not below C. */
2360 int begin = 1;
2361 static const int end
2362 = sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends);
2363 int len = end - begin;
2364 do
2365 {
2366 int half = len/2;
2367 int middle = begin + half;
2368 if (c > wcwidth_range_ends[middle])
2369 {
2370 begin = middle + 1;
2371 len -= half + 1;
2372 }
2373 else
2374 len = half;
2375 } while (len);
2376
2377 if (__builtin_expect (begin != end, true)) /* BEGIN == END means C lies beyond the last range. */
2378 return wcwidth_widths[begin];
2379 return 1;
2380 }