Mercurial > hg > CbC > CbC_gcc
comparison libcpp/charset.c @ 145:1830386684a0
gcc-9.2.0
author | anatofuz |
---|---|
date | Thu, 13 Feb 2020 11:34:05 +0900 |
parents | 84e7813d76e9 |
children |
comparison
equal
deleted
inserted
replaced
131:84e7813d76e9 | 145:1830386684a0 |
---|---|
1 /* CPP Library - charsets | 1 /* CPP Library - charsets |
2 Copyright (C) 1998-2018 Free Software Foundation, Inc. | 2 Copyright (C) 1998-2020 Free Software Foundation, Inc. |
3 | 3 |
4 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges. | 4 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges. |
5 | 5 |
6 This program is free software; you can redistribute it and/or modify it | 6 This program is free software; you can redistribute it and/or modify it |
7 under the terms of the GNU General Public License as published by the | 7 under the terms of the GNU General Public License as published by the |
899 /* Last character in the range described by this entry. */ | 899 /* Last character in the range described by this entry. */ |
900 unsigned int end; | 900 unsigned int end; |
901 }; | 901 }; |
902 #include "ucnid.h" | 902 #include "ucnid.h" |
903 | 903 |
904 /* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive. */ | |
905 #define UCS_LIMIT 0x10FFFF | |
906 | |
904 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at | 907 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at |
905 the start of an identifier, and 0 if C is not valid in an | 908 the start of an identifier, and 0 if C is not valid in an |
906 identifier. We assume C has already gone through the checks of | 909 identifier. We assume C has already gone through the checks of |
907 _cpp_valid_ucn. Also update NST for C if returning nonzero. The | 910 _cpp_valid_ucn. Also update NST for C if returning nonzero. The |
908 algorithm is a simple binary search on the table defined in | 911 algorithm is a simple binary search on the table defined in |
913 struct normalize_state *nst) | 916 struct normalize_state *nst) |
914 { | 917 { |
915 int mn, mx, md; | 918 int mn, mx, md; |
916 unsigned short valid_flags, invalid_start_flags; | 919 unsigned short valid_flags, invalid_start_flags; |
917 | 920 |
918 if (c > 0x10FFFF) | 921 if (c > UCS_LIMIT) |
919 return 0; | 922 return 0; |
920 | 923 |
921 mn = 0; | 924 mn = 0; |
922 mx = ARRAY_SIZE (ucnranges) - 1; | 925 mx = ARRAY_SIZE (ucnranges) - 1; |
923 while (mx != mn) | 926 while (mx != mn) |
1014 | 1017 |
1015 C99 6.4.3: A universal character name shall not specify a character | 1018 C99 6.4.3: A universal character name shall not specify a character |
1016 whose short identifier is less than 00A0 other than 0024 ($), 0040 (@), | 1019 whose short identifier is less than 00A0 other than 0024 ($), 0040 (@), |
1017 or 0060 (`), nor one in the range D800 through DFFF inclusive. | 1020 or 0060 (`), nor one in the range D800 through DFFF inclusive. |
1018 | 1021 |
1022 If the hexadecimal value is larger than the upper bound of the UCS | |
1023 codespace specified in ISO/IEC 10646, a pedantic warning is issued | |
1024 in all versions of C and in the C++2a or later versions of C++. | |
1025 | |
1019 *PSTR must be preceded by "\u" or "\U"; it is assumed that the | 1026 *PSTR must be preceded by "\u" or "\U"; it is assumed that the |
1020 buffer end is delimited by a non-hex digit. Returns false if the | 1027 buffer end is delimited by a non-hex digit. Returns false if the |
1021 UCN has not been consumed, true otherwise. | 1028 UCN has not been consumed, true otherwise. |
1022 | 1029 |
1023 The value of the UCN, whether valid or invalid, is returned in *CP. | 1030 The value of the UCN, whether valid or invalid, is returned in *CP. |
1133 else if (validity == 2 && identifier_pos == 1) | 1140 else if (validity == 2 && identifier_pos == 1) |
1134 cpp_error (pfile, CPP_DL_ERROR, | 1141 cpp_error (pfile, CPP_DL_ERROR, |
1135 "universal character %.*s is not valid at the start of an identifier", | 1142 "universal character %.*s is not valid at the start of an identifier", |
1136 (int) (str - base), base); | 1143 (int) (str - base), base); |
1137 } | 1144 } |
1145 else if (result > UCS_LIMIT | |
1146 && (!CPP_OPTION (pfile, cplusplus) | |
1147 || CPP_OPTION (pfile, lang) > CLK_CXX17)) | |
1148 cpp_error (pfile, CPP_DL_PEDWARN, | |
1149 "%.*s is outside the UCS codespace", | |
1150 (int) (str - base), base); | |
1138 | 1151 |
1139 *cp = result; | 1152 *cp = result; |
1140 return true; | 1153 return true; |
1141 } | 1154 } |
1142 | 1155 |
1194 ranges->add_range (char_range); | 1207 ranges->add_range (char_range); |
1195 } | 1208 } |
1196 } | 1209 } |
1197 | 1210 |
1198 return from; | 1211 return from; |
1212 } | |
1213 | |
1214 /* Performs a task similar to _cpp_valid_ucn, but parses UTF-8-encoded | |
1215 extended characters rather than UCNs. If the return value is TRUE, then a | |
1216 character was successfully decoded and stored in *CP; *PSTR has been | |
1217 updated to point one past the valid UTF-8 sequence. Diagnostics may have | |
1218 been emitted if the character parsed is not allowed in the current context. | |
1219 If the return value is FALSE, then *PSTR has not been modified and *CP may | |
1220 equal 0, to indicate that *PSTR does not form a valid UTF-8 sequence, or it | |
1221 may, when processing an identifier in C mode, equal a codepoint that was | |
1222 validly encoded but is not allowed to appear in an identifier. In either | |
1223 case, no diagnostic is emitted, and the return value of FALSE should cause | |
1224 a new token to be formed. | |
1225 | |
1226 Unlike _cpp_valid_ucn, this will never be called when lexing a string; only | |
1227 a potential identifier, or a CPP_OTHER token. NST is unused in the latter | |
1228 case. | |
1229 | |
1230 As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for | |
1231 the start of an identifier, or 2 otherwise. */ | |
1232 | |
1233 extern bool | |
1234 _cpp_valid_utf8 (cpp_reader *pfile, | |
1235 const uchar **pstr, | |
1236 const uchar *limit, | |
1237 int identifier_pos, | |
1238 struct normalize_state *nst, | |
1239 cppchar_t *cp) | |
1240 { | |
1241 const uchar *base = *pstr; | |
1242 size_t inbytesleft = limit - base; | |
1243 if (one_utf8_to_cppchar (pstr, &inbytesleft, cp)) | |
1244 { | |
1245 /* No diagnostic here as this byte will rather become a | |
1246 new token. */ | |
1247 *cp = 0; | |
1248 return false; | |
1249 } | |
1250 | |
1251 if (identifier_pos) | |
1252 { | |
1253 switch (ucn_valid_in_identifier (pfile, *cp, nst)) | |
1254 { | |
1255 | |
1256 case 0: | |
1257 /* In C++, this is an error for invalid character in an identifier | |
1258 because logically, the UTF-8 was converted to a UCN during | |
1259 translation phase 1 (even though we don't physically do it that | |
1260 way). In C, this byte rather becomes grammatically a separate | |
1261 token. */ | |
1262 | |
1263 if (CPP_OPTION (pfile, cplusplus)) | |
1264 cpp_error (pfile, CPP_DL_ERROR, | |
1265 "extended character %.*s is not valid in an identifier", | |
1266 (int) (*pstr - base), base); | |
1267 else | |
1268 { | |
1269 *pstr = base; | |
1270 return false; | |
1271 } | |
1272 | |
1273 break; | |
1274 | |
1275 case 2: | |
1276 if (identifier_pos == 1) | |
1277 { | |
1278 /* This is treated the same way in C++ or C99 -- lexed as an | |
1279 identifier which is then invalid because an identifier is | |
1280 not allowed to start with this character. */ | |
1281 cpp_error (pfile, CPP_DL_ERROR, | |
1282 "extended character %.*s is not valid at the start of an identifier", | |
1283 (int) (*pstr - base), base); | |
1284 } | |
1285 break; | |
1286 } | |
1287 } | |
1288 | |
1289 return true; | |
1199 } | 1290 } |
1200 | 1291 |
1201 /* Subroutine of convert_hex and convert_oct. N is the representation | 1292 /* Subroutine of convert_hex and convert_oct. N is the representation |
1202 in the execution character set of a numeric escape; write it into the | 1293 in the execution character set of a numeric escape; write it into the |
1203 string buffer TBUF and update the end-of-string pointer therein. WIDE | 1294 string buffer TBUF and update the end-of-string pointer therein. WIDE |
1788 | 1879 |
1789 | 1880 |
1790 /* Subroutine of cpp_interpret_charconst which performs the conversion | 1881 /* Subroutine of cpp_interpret_charconst which performs the conversion |
1791 to a number, for narrow strings. STR is the string structure returned | 1882 to a number, for narrow strings. STR is the string structure returned |
1792 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for | 1883 by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for |
1793 cpp_interpret_charconst. */ | 1884 cpp_interpret_charconst. TYPE is the token type. */ |
1794 static cppchar_t | 1885 static cppchar_t |
1795 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str, | 1886 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str, |
1796 unsigned int *pchars_seen, int *unsignedp) | 1887 unsigned int *pchars_seen, int *unsignedp, |
1888 enum cpp_ttype type) | |
1797 { | 1889 { |
1798 size_t width = CPP_OPTION (pfile, char_precision); | 1890 size_t width = CPP_OPTION (pfile, char_precision); |
1799 size_t max_chars = CPP_OPTION (pfile, int_precision) / width; | 1891 size_t max_chars = CPP_OPTION (pfile, int_precision) / width; |
1800 size_t mask = width_to_mask (width); | 1892 size_t mask = width_to_mask (width); |
1801 size_t i; | 1893 size_t i; |
1820 result = (result << width) | c; | 1912 result = (result << width) | c; |
1821 else | 1913 else |
1822 result = c; | 1914 result = c; |
1823 } | 1915 } |
1824 | 1916 |
1917 if (type == CPP_UTF8CHAR) | |
1918 max_chars = 1; | |
1825 if (i > max_chars) | 1919 if (i > max_chars) |
1826 { | 1920 { |
1827 i = max_chars; | 1921 i = max_chars; |
1828 cpp_error (pfile, CPP_DL_WARNING, | 1922 cpp_error (pfile, type == CPP_UTF8CHAR ? CPP_DL_ERROR : CPP_DL_WARNING, |
1829 "character constant too long for its type"); | 1923 "character constant too long for its type"); |
1830 } | 1924 } |
1831 else if (i > 1 && CPP_OPTION (pfile, warn_multichar)) | 1925 else if (i > 1 && CPP_OPTION (pfile, warn_multichar)) |
1832 cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant"); | 1926 cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant"); |
1833 | 1927 |
1834 /* Multichar constants are of type int and therefore signed. */ | 1928 /* Multichar constants are of type int and therefore signed. */ |
1835 if (i > 1) | 1929 if (i > 1) |
1836 unsigned_p = 0; | 1930 unsigned_p = 0; |
1931 else if (type == CPP_UTF8CHAR && !CPP_OPTION (pfile, cplusplus)) | |
1932 unsigned_p = 1; | |
1837 else | 1933 else |
1838 unsigned_p = CPP_OPTION (pfile, unsigned_char); | 1934 unsigned_p = CPP_OPTION (pfile, unsigned_char); |
1839 | 1935 |
1840 /* Truncate the constant to its natural width, and simultaneously | 1936 /* Truncate the constant to its natural width, and simultaneously |
1841 sign- or zero-extend to the full width of cppchar_t. | 1937 sign- or zero-extend to the full width of cppchar_t. |
1872 size_t cmask = width_to_mask (cwidth); | 1968 size_t cmask = width_to_mask (cwidth); |
1873 size_t nbwc = width / cwidth; | 1969 size_t nbwc = width / cwidth; |
1874 size_t off, i; | 1970 size_t off, i; |
1875 cppchar_t result = 0, c; | 1971 cppchar_t result = 0, c; |
1876 | 1972 |
1973 if (str.len <= nbwc) | |
1974 { | |
1975 /* Error recovery: if no errors have been diagnosed previously, | |
1976 there should be at least two wide characters. Empty literals | |
1977 are diagnosed earlier and we can get just the zero terminator | |
1978 only if there were errors diagnosed during conversion. */ | |
1979 *pchars_seen = 0; | |
1980 *unsignedp = 0; | |
1981 return 0; | |
1982 } | |
1983 | |
1877 /* This is finicky because the string is in the target's byte order, | 1984 /* This is finicky because the string is in the target's byte order, |
1878 which may not be our byte order. Only the last character, ignoring | 1985 which may not be our byte order. Only the last character, ignoring |
1879 the NUL terminator, is relevant. */ | 1986 the NUL terminator, is relevant. */ |
1880 off = str.len - (nbwc * 2); | 1987 off = str.len - (nbwc * 2); |
1881 result = 0; | 1988 result = 0; |
1887 | 1994 |
1888 /* Wide character constants have type wchar_t, and a single | 1995 /* Wide character constants have type wchar_t, and a single |
1889 character exactly fills a wchar_t, so a multi-character wide | 1996 character exactly fills a wchar_t, so a multi-character wide |
1890 character constant is guaranteed to overflow. */ | 1997 character constant is guaranteed to overflow. */ |
1891 if (str.len > nbwc * 2) | 1998 if (str.len > nbwc * 2) |
1892 cpp_error (pfile, CPP_DL_WARNING, | 1999 cpp_error (pfile, (CPP_OPTION (pfile, cplusplus) |
2000 && (type == CPP_CHAR16 || type == CPP_CHAR32)) | |
2001 ? CPP_DL_ERROR : CPP_DL_WARNING, | |
1893 "character constant too long for its type"); | 2002 "character constant too long for its type"); |
1894 | 2003 |
1895 /* Truncate the constant to its natural width, and simultaneously | 2004 /* Truncate the constant to its natural width, and simultaneously |
1896 sign- or zero-extend to the full width of cppchar_t. */ | 2005 sign- or zero-extend to the full width of cppchar_t. */ |
1897 if (width < BITS_PER_CPPCHAR_T) | 2006 if (width < BITS_PER_CPPCHAR_T) |
1945 | 2054 |
1946 if (wide) | 2055 if (wide) |
1947 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp, | 2056 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp, |
1948 token->type); | 2057 token->type); |
1949 else | 2058 else |
1950 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp); | 2059 result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp, |
2060 token->type); | |
1951 | 2061 |
1952 if (str.text != token->val.str.text) | 2062 if (str.text != token->val.str.text) |
1953 free ((void *)str.text); | 2063 free ((void *)str.text); |
1954 | 2064 |
1955 return result; | 2065 return result; |
1956 } | 2066 } |
1957 | 2067 |
1958 /* Convert an identifier denoted by ID and LEN, which might contain | 2068 /* Convert an identifier denoted by ID and LEN, which might contain |
1959 UCN escapes, to the source character set, either UTF-8 or | 2069 UCN escapes or UTF-8 multibyte chars, to the source character set, |
1960 UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */ | 2070 either UTF-8 or UTF-EBCDIC. Assumes that the identifier is actually |
2071 a valid identifier. */ | |
1961 cpp_hashnode * | 2072 cpp_hashnode * |
1962 _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) | 2073 _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len) |
1963 { | 2074 { |
1964 /* It turns out that a UCN escape always turns into fewer characters | 2075 /* It turns out that a UCN escape always turns into fewer characters |
1965 than the escape itself, so we can allocate a temporary in advance. */ | 2076 than the escape itself, so we can allocate a temporary in advance. */ |
2133 /* Implementation of class cpp_string_location_reader. */ | 2244 /* Implementation of class cpp_string_location_reader. */ |
2134 | 2245 |
2135 /* Constructor for cpp_string_location_reader. */ | 2246 /* Constructor for cpp_string_location_reader. */ |
2136 | 2247 |
2137 cpp_string_location_reader:: | 2248 cpp_string_location_reader:: |
2138 cpp_string_location_reader (source_location src_loc, | 2249 cpp_string_location_reader (location_t src_loc, |
2139 line_maps *line_table) | 2250 line_maps *line_table) |
2140 : m_line_table (line_table) | |
2141 { | 2251 { |
2142 src_loc = get_range_from_loc (line_table, src_loc).m_start; | 2252 src_loc = get_range_from_loc (line_table, src_loc).m_start; |
2143 | 2253 |
2144 /* SRC_LOC might be a macro location. It only makes sense to do | 2254 /* SRC_LOC might be a macro location. It only makes sense to do |
2145 column-by-column calculations on ordinary maps, so get the | 2255 column-by-column calculations on ordinary maps, so get the |
2163 result.m_finish = m_loc; | 2273 result.m_finish = m_loc; |
2164 if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS) | 2274 if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS) |
2165 m_loc += m_offset_per_column; | 2275 m_loc += m_offset_per_column; |
2166 return result; | 2276 return result; |
2167 } | 2277 } |
2278 | |
2279 /* Helper for cpp_byte_column_to_display_column and its inverse. Given a | |
2280 pointer to a UTF-8-encoded character, compute its display width. *INBUFP | |
2281 points on entry to the start of the UTF-8 encoding of the character, and | |
2282 is updated to point just after the last byte of the encoding. *INBYTESLEFTP | |
2283 contains on entry the remaining size of the buffer into which *INBUFP | |
2284 points, and this is also updated accordingly. If *INBUFP does not | |
2285 point to a valid UTF-8-encoded sequence, then it will be treated as a single | |
2286 byte with display width 1. */ | |
2287 | |
2288 static inline int | |
2289 compute_next_display_width (const uchar **inbufp, size_t *inbytesleftp) | |
2290 { | |
2291 cppchar_t c; | |
2292 if (one_utf8_to_cppchar (inbufp, inbytesleftp, &c) != 0) | |
2293 { | |
2294 /* Input is not valid UTF-8. This could be fine, e.g. in a | |
2295 string literal, so don't complain. Just treat it as if it has a width | |
2296 of one. */ | |
2297 ++*inbufp; | |
2298 --*inbytesleftp; | |
2299 return 1; | |
2300 } | |
2301 | |
2302 /* one_utf8_to_cppchar() has updated inbufp and inbytesleftp for us. */ | |
2303 return cpp_wcwidth (c); | |
2304 } | |
2305 | |
2306 /* For the string of length DATA_LENGTH bytes that begins at DATA, compute | |
2307 how many display columns are occupied by the first COLUMN bytes. COLUMN | |
2308 may exceed DATA_LENGTH, in which case the phantom bytes at the end are | |
2309 treated as if they have display width 1. */ | |
2310 | |
2311 int | |
2312 cpp_byte_column_to_display_column (const char *data, int data_length, | |
2313 int column) | |
2314 { | |
2315 int display_col = 0; | |
2316 const uchar *udata = (const uchar *) data; | |
2317 const int offset = MAX (0, column - data_length); | |
2318 size_t inbytesleft = column - offset; | |
2319 while (inbytesleft) | |
2320 display_col += compute_next_display_width (&udata, &inbytesleft); | |
2321 return display_col + offset; | |
2322 } | |
2323 | |
2324 /* For the string of length DATA_LENGTH bytes that begins at DATA, compute | |
2325 the least number of bytes that will result in at least DISPLAY_COL display | |
2326 columns. The return value may exceed DATA_LENGTH if the entire string does | |
2327 not occupy enough display columns. */ | |
2328 | |
2329 int | |
2330 cpp_display_column_to_byte_column (const char *data, int data_length, | |
2331 int display_col) | |
2332 { | |
2333 int column = 0; | |
2334 const uchar *udata = (const uchar *) data; | |
2335 size_t inbytesleft = data_length; | |
2336 while (column < display_col && inbytesleft) | |
2337 column += compute_next_display_width (&udata, &inbytesleft); | |
2338 return data_length - inbytesleft + MAX (0, display_col - column); | |
2339 } | |
2340 | |
2341 /* Our own version of wcwidth(). We don't use the actual wcwidth() in glibc, | |
2342 because that will inspect the user's locale, and in particular in an ASCII | |
2343 locale, it will not return anything useful for extended characters. But GCC | |
2344 in other respects (see e.g. _cpp_default_encoding()) behaves as if | |
2345 everything is UTF-8. We also make some tweaks that are useful for the way | |
2346 GCC needs to use this data, e.g. tabs and other control characters should be | |
2347 treated as having width 1. The lookup tables are generated from | |
2348 contrib/unicode/gen_wcwidth.py and were made by simply calling glibc | |
2349 wcwidth() on all codepoints, then applying the small tweaks. These tables | |
2350 are not highly optimized, but for the present purpose of outputting | |
2351 diagnostics, they are sufficient. */ | |
2352 | |
2353 #include "generated_cpp_wcwidth.h" | |
2354 int cpp_wcwidth (cppchar_t c) | |
2355 { | |
2356 if (__builtin_expect (c <= wcwidth_range_ends[0], true)) | |
2357 return wcwidth_widths[0]; | |
2358 | |
2359 /* Binary search the tables. */ | |
2360 int begin = 1; | |
2361 static const int end | |
2362 = sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends); | |
2363 int len = end - begin; | |
2364 do | |
2365 { | |
2366 int half = len/2; | |
2367 int middle = begin + half; | |
2368 if (c > wcwidth_range_ends[middle]) | |
2369 { | |
2370 begin = middle + 1; | |
2371 len -= half + 1; | |
2372 } | |
2373 else | |
2374 len = half; | |
2375 } while (len); | |
2376 | |
2377 if (__builtin_expect (begin != end, true)) | |
2378 return wcwidth_widths[begin]; | |
2379 return 1; | |
2380 } |