comparison libcpp/charset.c @ 111:04ced10e8804

gcc 7
author kono
date Fri, 27 Oct 2017 22:46:09 +0900
parents f6334be47118
children 84e7813d76e9
comparison
equal deleted inserted replaced
68:561a7518be6b 111:04ced10e8804
1 /* CPP Library - charsets 1 /* CPP Library - charsets
2 Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2006, 2008, 2009, 2 Copyright (C) 1998-2017 Free Software Foundation, Inc.
3 2010 Free Software Foundation, Inc.
4 3
5 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges. 4 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
6 5
7 This program is free software; you can redistribute it and/or modify it 6 This program is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by the 7 under the terms of the GNU General Public License as published by the
352 *inbufp = save_inbuf; 351 *inbufp = save_inbuf;
353 *inbytesleftp = save_inbytesleft; 352 *inbytesleftp = save_inbytesleft;
354 return EILSEQ; 353 return EILSEQ;
355 } 354 }
356 355
357 if (s < 0xFFFF) 356 if (s <= 0xFFFF)
358 { 357 {
359 if (*outbytesleftp < 2) 358 if (*outbytesleftp < 2)
360 { 359 {
361 *inbufp = save_inbuf; 360 *inbufp = save_inbuf;
362 *inbytesleftp = save_inbytesleft; 361 *inbytesleftp = save_inbytesleft;
536 const uchar *from, size_t flen, struct _cpp_strbuf *to) 535 const uchar *from, size_t flen, struct _cpp_strbuf *to)
537 { 536 {
538 if (to->len + flen > to->asize) 537 if (to->len + flen > to->asize)
539 { 538 {
540 to->asize = to->len + flen; 539 to->asize = to->len + flen;
540 to->asize += to->asize / 4;
541 to->text = XRESIZEVEC (uchar, to->text, to->asize); 541 to->text = XRESIZEVEC (uchar, to->text, to->asize);
542 } 542 }
543 memcpy (to->text + to->len, from, flen); 543 memcpy (to->text + to->len, from, flen);
544 to->len += flen; 544 to->len += flen;
545 return true; 545 return true;
607 when conversion between a suitable pair of character sets is requested. */ 607 when conversion between a suitable pair of character sets is requested. */
608 608
609 #define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \ 609 #define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
610 CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO) 610 CONVERTER.func (CONVERTER.cd, FROM, FLEN, TO)
611 611
612 struct conversion 612 struct cpp_conversion
613 { 613 {
614 const char *pair; 614 const char *pair;
615 convert_f func; 615 convert_f func;
616 iconv_t fake_cd; 616 iconv_t fake_cd;
617 }; 617 };
618 static const struct conversion conversion_tab[] = { 618 static const struct cpp_conversion conversion_tab[] = {
619 { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 }, 619 { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
620 { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 }, 620 { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
621 { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 }, 621 { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
622 { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 }, 622 { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
623 { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 }, 623 { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
810 return c; 810 return c;
811 } 811 }
812 812
813 813
814 814
815 /* cpp_substring_ranges's constructor. */
816
817 cpp_substring_ranges::cpp_substring_ranges () :
818 m_ranges (NULL),
819 m_num_ranges (0),
820 m_alloc_ranges (8)
821 {
822 m_ranges = XNEWVEC (source_range, m_alloc_ranges);
823 }
824
825 /* cpp_substring_ranges's destructor. */
826
827 cpp_substring_ranges::~cpp_substring_ranges ()
828 {
829 free (m_ranges);
830 }
831
832 /* Add RANGE to the vector of source_range information. */
833
834 void
835 cpp_substring_ranges::add_range (source_range range)
836 {
837 if (m_num_ranges >= m_alloc_ranges)
838 {
839 m_alloc_ranges *= 2;
840 m_ranges
841 = (source_range *)xrealloc (m_ranges,
842 sizeof (source_range) * m_alloc_ranges);
843 }
844 m_ranges[m_num_ranges++] = range;
845 }
846
847 /* Read NUM ranges from LOC_READER, adding them to the vector of source_range
848 information. */
849
850 void
851 cpp_substring_ranges::add_n_ranges (int num,
852 cpp_string_location_reader &loc_reader)
853 {
854 for (int i = 0; i < num; i++)
855 add_range (loc_reader.get_next ());
856 }
857
858
859
815 /* Utility routine that computes a mask of the form 0000...111... with 860 /* Utility routine that computes a mask of the form 0000...111... with
816 WIDTH 1-bits. */ 861 WIDTH 1-bits. */
817 static inline size_t 862 static inline size_t
818 width_to_mask (size_t width) 863 width_to_mask (size_t width)
819 { 864 {
827 /* A large table of unicode character information. */ 872 /* A large table of unicode character information. */
828 enum { 873 enum {
829 /* Valid in a C99 identifier? */ 874 /* Valid in a C99 identifier? */
830 C99 = 1, 875 C99 = 1,
831 /* Valid in a C99 identifier, but not as the first character? */ 876 /* Valid in a C99 identifier, but not as the first character? */
832 DIG = 2, 877 N99 = 2,
833 /* Valid in a C++ identifier? */ 878 /* Valid in a C++ identifier? */
834 CXX = 4, 879 CXX = 4,
880 /* Valid in a C11/C++11 identifier? */
881 C11 = 8,
882 /* Valid in a C11/C++11 identifier, but not as the first character? */
883 N11 = 16,
835 /* NFC representation is not valid in an identifier? */ 884 /* NFC representation is not valid in an identifier? */
836 CID = 8, 885 CID = 32,
837 /* Might be valid NFC form? */ 886 /* Might be valid NFC form? */
838 NFC = 16, 887 NFC = 64,
839 /* Might be valid NFKC form? */ 888 /* Might be valid NFKC form? */
840 NKC = 32, 889 NKC = 128,
841 /* Certain preceding characters might make it not valid NFC/NFKC form? */ 890 /* Certain preceding characters might make it not valid NFC/NFKC form? */
842 CTX = 64 891 CTX = 256
843 }; 892 };
844 893
845 static const struct { 894 struct ucnrange {
846 /* Bitmap of flags above. */ 895 /* Bitmap of flags above. */
847 unsigned char flags; 896 unsigned short flags;
848 /* Combining class of the character. */ 897 /* Combining class of the character. */
849 unsigned char combine; 898 unsigned char combine;
850 /* Last character in the range described by this entry. */ 899 /* Last character in the range described by this entry. */
851 unsigned short end; 900 unsigned int end;
852 } ucnranges[] = { 901 };
853 #include "ucnid.h" 902 #include "ucnid.h"
854 };
855 903
856 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at 904 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
857 the start of an identifier, and 0 if C is not valid in an 905 the start of an identifier, and 0 if C is not valid in an
858 identifier. We assume C has already gone through the checks of 906 identifier. We assume C has already gone through the checks of
859 _cpp_valid_ucn. Also update NST for C if returning nonzero. The 907 _cpp_valid_ucn. Also update NST for C if returning nonzero. The
863 static int 911 static int
864 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c, 912 ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
865 struct normalize_state *nst) 913 struct normalize_state *nst)
866 { 914 {
867 int mn, mx, md; 915 int mn, mx, md;
868 916 unsigned short valid_flags, invalid_start_flags;
869 if (c > 0xFFFF) 917
918 if (c > 0x10FFFF)
870 return 0; 919 return 0;
871 920
872 mn = 0; 921 mn = 0;
873 mx = ARRAY_SIZE (ucnranges) - 1; 922 mx = ARRAY_SIZE (ucnranges) - 1;
874 while (mx != mn) 923 while (mx != mn)
880 mn = md + 1; 929 mn = md + 1;
881 } 930 }
882 931
883 /* When -pedantic, we require the character to have been listed by 932 /* When -pedantic, we require the character to have been listed by
884 the standard for the current language. Otherwise, we accept the 933 the standard for the current language. Otherwise, we accept the
885 union of the acceptable sets for C++98 and C99. */ 934 union of the acceptable sets for all supported language versions. */
886 if (! (ucnranges[mn].flags & (C99 | CXX))) 935 valid_flags = C99 | CXX | C11;
936 if (CPP_PEDANTIC (pfile))
937 {
938 if (CPP_OPTION (pfile, c11_identifiers))
939 valid_flags = C11;
940 else if (CPP_OPTION (pfile, c99))
941 valid_flags = C99;
942 else if (CPP_OPTION (pfile, cplusplus))
943 valid_flags = CXX;
944 }
945 if (! (ucnranges[mn].flags & valid_flags))
887 return 0; 946 return 0;
888 947 if (CPP_OPTION (pfile, c11_identifiers))
889 if (CPP_PEDANTIC (pfile) 948 invalid_start_flags = N11;
890 && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99)) 949 else if (CPP_OPTION (pfile, c99))
891 || (CPP_OPTION (pfile, cplusplus) 950 invalid_start_flags = N99;
892 && !(ucnranges[mn].flags & CXX)))) 951 else
893 return 0; 952 invalid_start_flags = 0;
894 953
895 /* Update NST. */ 954 /* Update NST. */
896 if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class) 955 if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
897 nst->level = normalized_none; 956 nst->level = normalized_none;
898 else if (ucnranges[mn].flags & CTX) 957 else if (ucnranges[mn].flags & CTX)
899 { 958 {
900 bool safe; 959 bool safe;
901 cppchar_t p = nst->previous; 960 cppchar_t p = nst->previous;
902 961
903 /* Easy cases from Bengali, Oriya, Tamil, Kannada, and Malayalam. */
904 if (c == 0x09BE)
905 safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */
906 else if (c == 0x0B3E)
907 safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */
908 else if (c == 0x0BBE)
909 safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */
910 else if (c == 0x0CC2)
911 safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */
912 else if (c == 0x0D3E)
913 safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */
914 /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC, 962 /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
915 and are combined algorithmically from a sequence of the form 963 and are combined algorithmically from a sequence of the form
916 1100-1112 1161-1175 11A8-11C2 964 1100-1112 1161-1175 11A8-11C2
917 (if the third is not present, it is treated as 11A7, which is not 965 (if the third is not present, it is treated as 11A7, which is not
918 really a valid character). 966 really a valid character).
919 Unfortunately, C99 allows (only) the NFC form, but C++ allows 967 Unfortunately, C99 allows (only) the NFC form, but C++ allows
920 only the combining characters. */ 968 only the combining characters. */
921 else if (c >= 0x1161 && c <= 0x1175) 969 if (c >= 0x1161 && c <= 0x1175)
922 safe = p < 0x1100 || p > 0x1112; 970 safe = p < 0x1100 || p > 0x1112;
923 else if (c >= 0x11A8 && c <= 0x11C2) 971 else if (c >= 0x11A8 && c <= 0x11C2)
924 safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0); 972 safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
925 else 973 else
974 safe = check_nfc (pfile, c, p);
975 if (!safe)
926 { 976 {
927 /* Uh-oh, someone updated ucnid.h without updating this code. */ 977 if ((c >= 0x1161 && c <= 0x1175) || (c >= 0x11A8 && c <= 0x11C2))
928 cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c); 978 nst->level = MAX (nst->level, normalized_identifier_C);
929 safe = true; 979 else
980 nst->level = normalized_none;
930 } 981 }
931 if (!safe && c < 0x1161)
932 nst->level = normalized_none;
933 else if (!safe)
934 nst->level = MAX (nst->level, normalized_identifier_C);
935 } 982 }
936 else if (ucnranges[mn].flags & NKC) 983 else if (ucnranges[mn].flags & NKC)
937 ; 984 ;
938 else if (ucnranges[mn].flags & NFC) 985 else if (ucnranges[mn].flags & NFC)
939 nst->level = MAX (nst->level, normalized_C); 986 nst->level = MAX (nst->level, normalized_C);
940 else if (ucnranges[mn].flags & CID) 987 else if (ucnranges[mn].flags & CID)
941 nst->level = MAX (nst->level, normalized_identifier_C); 988 nst->level = MAX (nst->level, normalized_identifier_C);
942 else 989 else
943 nst->level = normalized_none; 990 nst->level = normalized_none;
944 nst->previous = c; 991 if (ucnranges[mn].combine == 0)
992 nst->previous = c;
945 nst->prev_class = ucnranges[mn].combine; 993 nst->prev_class = ucnranges[mn].combine;
946 994
947 /* In C99, UCN digits may not begin identifiers. */ 995 /* In C99, UCN digits may not begin identifiers. In C11 and C++11,
948 if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG)) 996 UCN combining characters may not begin identifiers. */
997 if (ucnranges[mn].flags & invalid_start_flags)
949 return 2; 998 return 2;
950 999
951 return 1; 1000 return 1;
952 } 1001 }
953 1002
966 C99 6.4.3: A universal character name shall not specify a character 1015 C99 6.4.3: A universal character name shall not specify a character
967 whose short identifier is less than 00A0 other than 0024 ($), 0040 (@), 1016 whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
968 or 0060 (`), nor one in the range D800 through DFFF inclusive. 1017 or 0060 (`), nor one in the range D800 through DFFF inclusive.
969 1018
970 *PSTR must be preceded by "\u" or "\U"; it is assumed that the 1019 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
971 buffer end is delimited by a non-hex digit. Returns zero if the 1020 buffer end is delimited by a non-hex digit. Returns false if the
972 UCN has not been consumed. 1021 UCN has not been consumed, true otherwise.
973 1022
974 Otherwise the nonzero value of the UCN, whether valid or invalid, 1023 The value of the UCN, whether valid or invalid, is returned in *CP.
975 is returned. Diagnostics are emitted for invalid values. PSTR 1024 Diagnostics are emitted for invalid values. PSTR is updated to point
976 is updated to point one beyond the UCN, or to the syntactically 1025 one beyond the UCN, or to the syntactically invalid character.
977 invalid character.
978 1026
979 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of 1027 IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
980 an identifier, or 2 otherwise. */ 1028 an identifier, or 2 otherwise.
981 1029
982 cppchar_t 1030 If LOC_READER is non-NULL, then position information is
1031 read from *LOC_READER and CHAR_RANGE->m_finish is updated accordingly. */
1032
1033 bool
983 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr, 1034 _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
984 const uchar *limit, int identifier_pos, 1035 const uchar *limit, int identifier_pos,
985 struct normalize_state *nst) 1036 struct normalize_state *nst, cppchar_t *cp,
1037 source_range *char_range,
1038 cpp_string_location_reader *loc_reader)
986 { 1039 {
987 cppchar_t result, c; 1040 cppchar_t result, c;
988 unsigned int length; 1041 unsigned int length;
989 const uchar *str = *pstr; 1042 const uchar *str = *pstr;
990 const uchar *base = str - 2; 1043 const uchar *base = str - 2;
991 1044
992 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99)) 1045 if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
993 cpp_error (pfile, CPP_DL_WARNING, 1046 cpp_error (pfile, CPP_DL_WARNING,
994 "universal character names are only valid in C++ and C99"); 1047 "universal character names are only valid in C++ and C99");
1048 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0
1049 && !CPP_OPTION (pfile, cplusplus))
1050 cpp_error (pfile, CPP_DL_WARNING,
1051 "C99's universal character names are incompatible with C90");
995 else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0) 1052 else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
996 cpp_warning (pfile, CPP_W_TRADITIONAL, 1053 cpp_warning (pfile, CPP_W_TRADITIONAL,
997 "the meaning of '\\%c' is different in traditional C", 1054 "the meaning of '\\%c' is different in traditional C",
998 (int) str[-1]); 1055 (int) str[-1]);
999 1056
1012 { 1069 {
1013 c = *str; 1070 c = *str;
1014 if (!ISXDIGIT (c)) 1071 if (!ISXDIGIT (c))
1015 break; 1072 break;
1016 str++; 1073 str++;
1074 if (loc_reader)
1075 {
1076 gcc_assert (char_range);
1077 char_range->m_finish = loc_reader->get_next ().m_finish;
1078 }
1017 result = (result << 4) + hex_value (c); 1079 result = (result << 4) + hex_value (c);
1018 } 1080 }
1019 while (--length && str < limit); 1081 while (--length && str < limit);
1020 1082
1021 /* Partial UCNs are not valid in strings, but decompose into 1083 /* Partial UCNs are not valid in strings, but decompose into
1022 multiple tokens in identifiers, so we can't give a helpful 1084 multiple tokens in identifiers, so we can't give a helpful
1023 error message in that case. */ 1085 error message in that case. */
1024 if (length && identifier_pos) 1086 if (length && identifier_pos)
1025 return 0; 1087 {
1026 1088 *cp = 0;
1089 return false;
1090 }
1091
1027 *pstr = str; 1092 *pstr = str;
1028 if (length) 1093 if (length)
1029 { 1094 {
1030 cpp_error (pfile, CPP_DL_ERROR, 1095 cpp_error (pfile, CPP_DL_ERROR,
1031 "incomplete universal character name %.*s", 1096 "incomplete universal character name %.*s",
1053 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping) 1118 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1054 { 1119 {
1055 CPP_OPTION (pfile, warn_dollars) = 0; 1120 CPP_OPTION (pfile, warn_dollars) = 0;
1056 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number"); 1121 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1057 } 1122 }
1058 NORMALIZE_STATE_UPDATE_IDNUM (nst); 1123 NORMALIZE_STATE_UPDATE_IDNUM (nst, result);
1059 } 1124 }
1060 else if (identifier_pos) 1125 else if (identifier_pos)
1061 { 1126 {
1062 int validity = ucn_valid_in_identifier (pfile, result, nst); 1127 int validity = ucn_valid_in_identifier (pfile, result, nst);
1063 1128
1069 cpp_error (pfile, CPP_DL_ERROR, 1134 cpp_error (pfile, CPP_DL_ERROR,
1070 "universal character %.*s is not valid at the start of an identifier", 1135 "universal character %.*s is not valid at the start of an identifier",
1071 (int) (str - base), base); 1136 (int) (str - base), base);
1072 } 1137 }
1073 1138
1074 if (result == 0) 1139 *cp = result;
1075 result = 1; 1140 return true;
1076
1077 return result;
1078 } 1141 }
1079 1142
1080 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate 1143 /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
1081 it to the execution character set and write the result into TBUF. 1144 it to the execution character set and write the result into TBUF,
1082 An advanced pointer is returned. Issues all relevant diagnostics. */ 1145 if TBUF is non-NULL.
1146 An advanced pointer is returned. Issues all relevant diagnostics.
1147 If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
1148 contains the location of the character so far: location information
1149 is read from *LOC_READER, and *RANGES is updated accordingly. */
1083 static const uchar * 1150 static const uchar *
1084 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit, 1151 convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
1085 struct _cpp_strbuf *tbuf, struct cset_converter cvt) 1152 struct _cpp_strbuf *tbuf, struct cset_converter cvt,
1153 source_range char_range,
1154 cpp_string_location_reader *loc_reader,
1155 cpp_substring_ranges *ranges)
1086 { 1156 {
1087 cppchar_t ucn; 1157 cppchar_t ucn;
1088 uchar buf[6]; 1158 uchar buf[6];
1089 uchar *bufp = buf; 1159 uchar *bufp = buf;
1090 size_t bytesleft = 6; 1160 size_t bytesleft = 6;
1091 int rval; 1161 int rval;
1092 struct normalize_state nst = INITIAL_NORMALIZE_STATE; 1162 struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1093 1163
1164 /* loc_reader and ranges must either be both NULL, or both be non-NULL. */
1165 gcc_assert ((loc_reader != NULL) == (ranges != NULL));
1166
1094 from++; /* Skip u/U. */ 1167 from++; /* Skip u/U. */
1095 ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst); 1168
1169 if (loc_reader)
1170 /* The u/U is part of the spelling of this character. */
1171 char_range.m_finish = loc_reader->get_next ().m_finish;
1172
1173 _cpp_valid_ucn (pfile, &from, limit, 0, &nst,
1174 &ucn, &char_range, loc_reader);
1096 1175
1097 rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft); 1176 rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
1098 if (rval) 1177 if (rval)
1099 { 1178 {
1100 errno = rval; 1179 errno = rval;
1101 cpp_errno (pfile, CPP_DL_ERROR, 1180 cpp_errno (pfile, CPP_DL_ERROR,
1102 "converting UCN to source character set"); 1181 "converting UCN to source character set");
1103 } 1182 }
1104 else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf)) 1183 else
1105 cpp_errno (pfile, CPP_DL_ERROR, 1184 {
1106 "converting UCN to execution character set"); 1185 if (tbuf)
1186 if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
1187 cpp_errno (pfile, CPP_DL_ERROR,
1188 "converting UCN to execution character set");
1189
1190 if (loc_reader)
1191 {
1192 int num_encoded_bytes = 6 - bytesleft;
1193 for (int i = 0; i < num_encoded_bytes; i++)
1194 ranges->add_range (char_range);
1195 }
1196 }
1107 1197
1108 return from; 1198 return from;
1109 } 1199 }
1110 1200
1111 /* Subroutine of convert_hex and convert_oct. N is the representation 1201 /* Subroutine of convert_hex and convert_oct. N is the representation
1157 tbuf->text[tbuf->len++] = n; 1247 tbuf->text[tbuf->len++] = n;
1158 } 1248 }
1159 } 1249 }
1160 1250
1161 /* Convert a hexadecimal escape, pointed to by FROM, to the execution 1251 /* Convert a hexadecimal escape, pointed to by FROM, to the execution
1162 character set and write it into the string buffer TBUF. Returns an 1252 character set and write it into the string buffer TBUF (if non-NULL).
1163 advanced pointer, and issues diagnostics as necessary. 1253 Returns an advanced pointer, and issues diagnostics as necessary.
1164 No character set translation occurs; this routine always produces the 1254 No character set translation occurs; this routine always produces the
1165 execution-set character with numeric value equal to the given hex 1255 execution-set character with numeric value equal to the given hex
1166 number. You can, e.g. generate surrogate pairs this way. */ 1256 number. You can, e.g. generate surrogate pairs this way.
1257 If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
1258 contains the location of the character so far: location information
1259 is read from *LOC_READER, and *RANGES is updated accordingly. */
1167 static const uchar * 1260 static const uchar *
1168 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit, 1261 convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1169 struct _cpp_strbuf *tbuf, struct cset_converter cvt) 1262 struct _cpp_strbuf *tbuf, struct cset_converter cvt,
1263 source_range char_range,
1264 cpp_string_location_reader *loc_reader,
1265 cpp_substring_ranges *ranges)
1170 { 1266 {
1171 cppchar_t c, n = 0, overflow = 0; 1267 cppchar_t c, n = 0, overflow = 0;
1172 int digits_found = 0; 1268 int digits_found = 0;
1173 size_t width = cvt.width; 1269 size_t width = cvt.width;
1174 size_t mask = width_to_mask (width); 1270 size_t mask = width_to_mask (width);
1175 1271
1272 /* loc_reader and ranges must either be both NULL, or both be non-NULL. */
1273 gcc_assert ((loc_reader != NULL) == (ranges != NULL));
1274
1176 if (CPP_WTRADITIONAL (pfile)) 1275 if (CPP_WTRADITIONAL (pfile))
1177 cpp_warning (pfile, CPP_W_TRADITIONAL, 1276 cpp_warning (pfile, CPP_W_TRADITIONAL,
1178 "the meaning of '\\x' is different in traditional C"); 1277 "the meaning of '\\x' is different in traditional C");
1179 1278
1180 from++; /* Skip 'x'. */ 1279 /* Skip 'x'. */
1280 from++;
1281
1282 /* The 'x' is part of the spelling of this character. */
1283 if (loc_reader)
1284 char_range.m_finish = loc_reader->get_next ().m_finish;
1285
1181 while (from < limit) 1286 while (from < limit)
1182 { 1287 {
1183 c = *from; 1288 c = *from;
1184 if (! hex_p (c)) 1289 if (! hex_p (c))
1185 break; 1290 break;
1186 from++; 1291 from++;
1292 if (loc_reader)
1293 char_range.m_finish = loc_reader->get_next ().m_finish;
1187 overflow |= n ^ (n << 4 >> 4); 1294 overflow |= n ^ (n << 4 >> 4);
1188 n = (n << 4) + hex_value (c); 1295 n = (n << 4) + hex_value (c);
1189 digits_found = 1; 1296 digits_found = 1;
1190 } 1297 }
1191 1298
1201 cpp_error (pfile, CPP_DL_PEDWARN, 1308 cpp_error (pfile, CPP_DL_PEDWARN,
1202 "hex escape sequence out of range"); 1309 "hex escape sequence out of range");
1203 n &= mask; 1310 n &= mask;
1204 } 1311 }
1205 1312
1206 emit_numeric_escape (pfile, n, tbuf, cvt); 1313 if (tbuf)
1314 emit_numeric_escape (pfile, n, tbuf, cvt);
1315 if (ranges)
1316 ranges->add_range (char_range);
1207 1317
1208 return from; 1318 return from;
1209 } 1319 }
1210 1320
1211 /* Convert an octal escape, pointed to by FROM, to the execution 1321 /* Convert an octal escape, pointed to by FROM, to the execution
1212 character set and write it into the string buffer TBUF. Returns an 1322 character set and write it into the string buffer TBUF (if non-NULL).
1213 advanced pointer, and issues diagnostics as necessary. 1323 Returns an advanced pointer, and issues diagnostics as necessary.
1214 No character set translation occurs; this routine always produces the 1324 No character set translation occurs; this routine always produces the
1215 execution-set character with numeric value equal to the given octal 1325 execution-set character with numeric value equal to the given octal
1216 number. */ 1326 number.
1327 If LOC_READER is non-NULL, then RANGES must be non-NULL and CHAR_RANGE
1328 contains the location of the character so far: location information
1329 is read from *LOC_READER, and *RANGES is updated accordingly. */
1217 static const uchar * 1330 static const uchar *
1218 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit, 1331 convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1219 struct _cpp_strbuf *tbuf, struct cset_converter cvt) 1332 struct _cpp_strbuf *tbuf, struct cset_converter cvt,
1333 source_range char_range,
1334 cpp_string_location_reader *loc_reader,
1335 cpp_substring_ranges *ranges)
1220 { 1336 {
1221 size_t count = 0; 1337 size_t count = 0;
1222 cppchar_t c, n = 0; 1338 cppchar_t c, n = 0;
1223 size_t width = cvt.width; 1339 size_t width = cvt.width;
1224 size_t mask = width_to_mask (width); 1340 size_t mask = width_to_mask (width);
1225 bool overflow = false; 1341 bool overflow = false;
1226 1342
1343 /* loc_reader and ranges must either be both NULL, or both be non-NULL. */
1344 gcc_assert ((loc_reader != NULL) == (ranges != NULL));
1345
1227 while (from < limit && count++ < 3) 1346 while (from < limit && count++ < 3)
1228 { 1347 {
1229 c = *from; 1348 c = *from;
1230 if (c < '0' || c > '7') 1349 if (c < '0' || c > '7')
1231 break; 1350 break;
1232 from++; 1351 from++;
1352 if (loc_reader)
1353 char_range.m_finish = loc_reader->get_next ().m_finish;
1233 overflow |= n ^ (n << 3 >> 3); 1354 overflow |= n ^ (n << 3 >> 3);
1234 n = (n << 3) + c - '0'; 1355 n = (n << 3) + c - '0';
1235 } 1356 }
1236 1357
1237 if (n != (n & mask)) 1358 if (n != (n & mask))
1239 cpp_error (pfile, CPP_DL_PEDWARN, 1360 cpp_error (pfile, CPP_DL_PEDWARN,
1240 "octal escape sequence out of range"); 1361 "octal escape sequence out of range");
1241 n &= mask; 1362 n &= mask;
1242 } 1363 }
1243 1364
1244 emit_numeric_escape (pfile, n, tbuf, cvt); 1365 if (tbuf)
1366 emit_numeric_escape (pfile, n, tbuf, cvt);
1367 if (ranges)
1368 ranges->add_range (char_range);
1245 1369
1246 return from; 1370 return from;
1247 } 1371 }
1248 1372
1249 /* Convert an escape sequence (pointed to by FROM) to its value on 1373 /* Convert an escape sequence (pointed to by FROM) to its value on
1250 the target, and to the execution character set. Do not scan past 1374 the target, and to the execution character set. Do not scan past
1251 LIMIT. Write the converted value into TBUF. Returns an advanced 1375 LIMIT. Write the converted value into TBUF, if TBUF is non-NULL.
1252 pointer. Handles all relevant diagnostics. */ 1376 Returns an advanced pointer. Handles all relevant diagnostics.
1377 If LOC_READER is non-NULL, then RANGES must be non-NULL: location
1378 information is read from *LOC_READER, and *RANGES is updated
1379 accordingly. */
1253 static const uchar * 1380 static const uchar *
1254 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit, 1381 convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1255 struct _cpp_strbuf *tbuf, struct cset_converter cvt) 1382 struct _cpp_strbuf *tbuf, struct cset_converter cvt,
1383 cpp_string_location_reader *loc_reader,
1384 cpp_substring_ranges *ranges)
1256 { 1385 {
1257 /* Values of \a \b \e \f \n \r \t \v respectively. */ 1386 /* Values of \a \b \e \f \n \r \t \v respectively. */
1258 #if HOST_CHARSET == HOST_CHARSET_ASCII 1387 #if HOST_CHARSET == HOST_CHARSET_ASCII
1259 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 }; 1388 static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
1260 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC 1389 #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1263 #error "unknown host character set" 1392 #error "unknown host character set"
1264 #endif 1393 #endif
1265 1394
1266 uchar c; 1395 uchar c;
1267 1396
1397 /* Record the location of the backslash. */
1398 source_range char_range;
1399 if (loc_reader)
1400 char_range = loc_reader->get_next ();
1401
1268 c = *from; 1402 c = *from;
1269 switch (c) 1403 switch (c)
1270 { 1404 {
1271 /* UCNs, hex escapes, and octal escapes are processed separately. */ 1405 /* UCNs, hex escapes, and octal escapes are processed separately. */
1272 case 'u': case 'U': 1406 case 'u': case 'U':
1273 return convert_ucn (pfile, from, limit, tbuf, cvt); 1407 return convert_ucn (pfile, from, limit, tbuf, cvt,
1408 char_range, loc_reader, ranges);
1274 1409
1275 case 'x': 1410 case 'x':
1276 return convert_hex (pfile, from, limit, tbuf, cvt); 1411 return convert_hex (pfile, from, limit, tbuf, cvt,
1412 char_range, loc_reader, ranges);
1277 break; 1413 break;
1278 1414
1279 case '0': case '1': case '2': case '3': 1415 case '0': case '1': case '2': case '3':
1280 case '4': case '5': case '6': case '7': 1416 case '4': case '5': case '6': case '7':
1281 return convert_oct (pfile, from, limit, tbuf, cvt); 1417 return convert_oct (pfile, from, limit, tbuf, cvt,
1418 char_range, loc_reader, ranges);
1282 1419
1283 /* Various letter escapes. Get the appropriate host-charset 1420 /* Various letter escapes. Get the appropriate host-charset
1284 value into C. */ 1421 value into C. */
1285 case '\\': case '\'': case '"': case '?': break; 1422 case '\\': case '\'': case '"': case '?': break;
1286 1423
1328 cpp_error (pfile, CPP_DL_PEDWARN, 1465 cpp_error (pfile, CPP_DL_PEDWARN,
1329 "unknown escape sequence: '\\%s'", buf); 1466 "unknown escape sequence: '\\%s'", buf);
1330 } 1467 }
1331 } 1468 }
1332 1469
1333 /* Now convert what we have to the execution character set. */ 1470 if (tbuf)
1334 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf)) 1471 /* Now convert what we have to the execution character set. */
1335 cpp_errno (pfile, CPP_DL_ERROR, 1472 if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1336 "converting escape sequence to execution character set"); 1473 cpp_errno (pfile, CPP_DL_ERROR,
1474 "converting escape sequence to execution character set");
1475
1476 if (loc_reader)
1477 {
1478 char_range.m_finish = loc_reader->get_next ().m_finish;
1479 ranges->add_range (char_range);
1480 }
1337 1481
1338 return from + 1; 1482 return from + 1;
1339 } 1483 }
1340 1484
1341 /* TYPE is a token type. The return value is the conversion needed to 1485 /* TYPE is a token type. The return value is the conversion needed to
1345 { 1489 {
1346 switch (type) 1490 switch (type)
1347 { 1491 {
1348 default: 1492 default:
1349 return pfile->narrow_cset_desc; 1493 return pfile->narrow_cset_desc;
1494 case CPP_UTF8CHAR:
1350 case CPP_UTF8STRING: 1495 case CPP_UTF8STRING:
1351 return pfile->utf8_cset_desc; 1496 return pfile->utf8_cset_desc;
1352 case CPP_CHAR16: 1497 case CPP_CHAR16:
1353 case CPP_STRING16: 1498 case CPP_STRING16:
1354 return pfile->char16_cset_desc; 1499 return pfile->char16_cset_desc;
1363 1508
1364 /* FROM is an array of cpp_string structures of length COUNT. These 1509 /* FROM is an array of cpp_string structures of length COUNT. These
1365 are to be converted from the source to the execution character set, 1510 are to be converted from the source to the execution character set,
1366 escape sequences translated, and finally all are to be 1511 escape sequences translated, and finally all are to be
1367 concatenated. WIDE indicates whether or not to produce a wide 1512 concatenated. WIDE indicates whether or not to produce a wide
1368 string. The result is written into TO. Returns true for success, 1513 string. If TO is non-NULL, the result is written into TO.
1369 false for failure. */ 1514 If LOC_READERS and OUT are non-NULL, then location information
1370 bool 1515 is read from LOC_READERS (which must be an array of length COUNT),
1371 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count, 1516 and location information is written to *OUT.
1372 cpp_string *to, enum cpp_ttype type) 1517
1518 Returns true for success, false for failure. */
1519
1520 static bool
1521 cpp_interpret_string_1 (cpp_reader *pfile, const cpp_string *from, size_t count,
1522 cpp_string *to, enum cpp_ttype type,
1523 cpp_string_location_reader *loc_readers,
1524 cpp_substring_ranges *out)
1373 { 1525 {
1374 struct _cpp_strbuf tbuf; 1526 struct _cpp_strbuf tbuf;
1375 const uchar *p, *base, *limit; 1527 const uchar *p, *base, *limit;
1376 size_t i; 1528 size_t i;
1377 struct cset_converter cvt = converter_for_type (pfile, type); 1529 struct cset_converter cvt = converter_for_type (pfile, type);
1378 1530
1379 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len); 1531 /* loc_readers and out must either be both NULL, or both be non-NULL. */
1380 tbuf.text = XNEWVEC (uchar, tbuf.asize); 1532 gcc_assert ((loc_readers != NULL) == (out != NULL));
1381 tbuf.len = 0; 1533
1382 1534 if (to)
1535 {
1536 tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1537 tbuf.text = XNEWVEC (uchar, tbuf.asize);
1538 tbuf.len = 0;
1539 }
1540
1541 cpp_string_location_reader *loc_reader = NULL;
1383 for (i = 0; i < count; i++) 1542 for (i = 0; i < count; i++)
1384 { 1543 {
1544 if (loc_readers)
1545 loc_reader = &loc_readers[i];
1546
1385 p = from[i].text; 1547 p = from[i].text;
1386 if (*p == 'u') 1548 if (*p == 'u')
1387 { 1549 {
1388 if (*++p == '8') 1550 p++;
1389 p++; 1551 if (loc_reader)
1552 loc_reader->get_next ();
1553 if (*p == '8')
1554 {
1555 p++;
1556 if (loc_reader)
1557 loc_reader->get_next ();
1558 }
1390 } 1559 }
1391 else if (*p == 'L' || *p == 'U') p++; 1560 else if (*p == 'L' || *p == 'U') p++;
1392 if (*p == 'R') 1561 if (*p == 'R')
1393 { 1562 {
1394 const uchar *prefix; 1563 const uchar *prefix;
1395 1564
1396 /* Skip over 'R"'. */ 1565 /* Skip over 'R"'. */
1397 p += 2; 1566 p += 2;
1567 if (loc_reader)
1568 {
1569 loc_reader->get_next ();
1570 loc_reader->get_next ();
1571 }
1398 prefix = p; 1572 prefix = p;
1399 while (*p != '(') 1573 while (*p != '(')
1400 p++; 1574 {
1575 p++;
1576 if (loc_reader)
1577 loc_reader->get_next ();
1578 }
1401 p++; 1579 p++;
1580 if (loc_reader)
1581 loc_reader->get_next ();
1402 limit = from[i].text + from[i].len; 1582 limit = from[i].text + from[i].len;
1403 if (limit >= p + (p - prefix) + 1) 1583 if (limit >= p + (p - prefix) + 1)
1404 limit -= (p - prefix) + 1; 1584 limit -= (p - prefix) + 1;
1405 1585
1406 /* Raw strings are all normal characters; these can be fed 1586 /* Raw strings are all normal characters; these can be fed
1407 directly to convert_cset. */ 1587 directly to convert_cset. */
1408 if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf)) 1588 if (to)
1409 goto fail; 1589 if (!APPLY_CONVERSION (cvt, p, limit - p, &tbuf))
1590 goto fail;
1591
1592 if (loc_reader)
1593 {
1594 /* If generating source ranges, assume we have a 1:1
1595 correspondence between bytes in the source encoding and bytes
1596 in the execution encoding (e.g. if we have a UTF-8 to UTF-8
1597 conversion), so that this run of bytes in the source file
1598 corresponds to a run of bytes in the execution string.
1599 This requirement is guaranteed by an early-reject in
1600 cpp_interpret_string_ranges. */
1601 gcc_assert (cvt.func == convert_no_conversion);
1602 out->add_n_ranges (limit - p, *loc_reader);
1603 }
1410 1604
1411 continue; 1605 continue;
1412 } 1606 }
1413 1607
1414 p++; /* Skip leading quote. */ 1608 /* If we don't now have a leading quote, something has gone wrong.
1609 This can occur if cpp_interpret_string_ranges is handling a
1610 stringified macro argument, but should not be possible otherwise. */
1611 if (*p != '"' && *p != '\'')
1612 {
1613 gcc_assert (out != NULL);
1614 cpp_error (pfile, CPP_DL_ERROR, "missing open quote");
1615 if (to)
1616 free (tbuf.text);
1617 return false;
1618 }
1619
1620 /* Skip leading quote. */
1621 p++;
1622 if (loc_reader)
1623 loc_reader->get_next ();
1624
1415 limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */ 1625 limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
1416 1626
1417 for (;;) 1627 for (;;)
1418 { 1628 {
1419 base = p; 1629 base = p;
1421 p++; 1631 p++;
1422 if (p > base) 1632 if (p > base)
1423 { 1633 {
1424 /* We have a run of normal characters; these can be fed 1634 /* We have a run of normal characters; these can be fed
1425 directly to convert_cset. */ 1635 directly to convert_cset. */
1426 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf)) 1636 if (to)
1427 goto fail; 1637 if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1638 goto fail;
1639 /* Similar to above: assumes we have a 1:1 correspondence
1640 between bytes in the source encoding and bytes in the
1641 execution encoding. */
1642 if (loc_reader)
1643 {
1644 gcc_assert (cvt.func == convert_no_conversion);
1645 out->add_n_ranges (p - base, *loc_reader);
1646 }
1428 } 1647 }
1429 if (p == limit) 1648 if (p >= limit)
1430 break; 1649 break;
1431 1650
1432 p = convert_escape (pfile, p + 1, limit, &tbuf, cvt); 1651 struct _cpp_strbuf *tbuf_ptr = to ? &tbuf : NULL;
1652 p = convert_escape (pfile, p + 1, limit, tbuf_ptr, cvt,
1653 loc_reader, out);
1433 } 1654 }
1434 } 1655 }
1435 /* NUL-terminate the 'to' buffer and translate it to a cpp_string 1656
1436 structure. */ 1657 if (to)
1437 emit_numeric_escape (pfile, 0, &tbuf, cvt); 1658 {
1438 tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len); 1659 /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1439 to->text = tbuf.text; 1660 structure. */
1440 to->len = tbuf.len; 1661 emit_numeric_escape (pfile, 0, &tbuf, cvt);
1662 tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
1663 to->text = tbuf.text;
1664 to->len = tbuf.len;
1665 }
1666 /* Use the location of the trailing quote as the location of the
1667 NUL-terminator. */
1668 if (loc_reader)
1669 {
1670 source_range range = loc_reader->get_next ();
1671 out->add_range (range);
1672 }
1673
1441 return true; 1674 return true;
1442 1675
1443 fail: 1676 fail:
1444 cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set"); 1677 cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1445 free (tbuf.text); 1678 if (to)
1679 free (tbuf.text);
1446 return false; 1680 return false;
1681 }
1682
1683 /* FROM is an array of cpp_string structures of length COUNT. These
1684 are to be converted from the source to the execution character set,
1685 escape sequences translated, and finally all are to be
1686 concatenated. TYPE is the token type, which selects the conversion
1687 to apply. The result is written into TO. Returns true for success,
1688 false for failure. */
1689 bool
1690 cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1691 cpp_string *to, enum cpp_ttype type)
1692 {
1693 return cpp_interpret_string_1 (pfile, from, count, to, type, NULL, NULL);
1694 }
1695
1696 /* A "do nothing" error-handling callback for use by
1697 cpp_interpret_string_ranges, so that it can temporarily suppress
1698 error-handling. */
1699
1700 static bool
1701 noop_error_cb (cpp_reader *, int, int, rich_location *,
1702 const char *, va_list *)
1703 {
1704 /* no-op. */
1705 return true;
1706 }
1707
1708 /* This function mimics the behavior of cpp_interpret_string, but
1709 rather than generating a string in the execution character set,
1710 *OUT is written to with the source code ranges of the characters
1711 in such a string.
1712 FROM and LOC_READERS should both be arrays of length COUNT.
1713 Returns NULL for success, or an error message for failure. */
1714
1715 const char *
1716 cpp_interpret_string_ranges (cpp_reader *pfile, const cpp_string *from,
1717 cpp_string_location_reader *loc_readers,
1718 size_t count,
1719 cpp_substring_ranges *out,
1720 enum cpp_ttype type)
1721 {
1722 /* There are a couple of cases in the range-handling in
1723 cpp_interpret_string_1 that rely on there being a 1:1 correspondence
1724 between bytes in the source encoding and bytes in the execution
1725 encoding, so that each byte in the execution string can correspond
1726 to the location of a byte in the source string.
1727
1728 This holds for the typical case of a UTF-8 to UTF-8 conversion.
1729 Enforce this requirement by only attempting to track substring
1730 locations if we have source encoding == execution encoding.
1731
1732 This is a stronger condition than we need, since we could e.g.
1733 have ASCII to EBCDIC (with 1 byte per character before and after),
1734 but it seems to be a reasonable restriction. */
1735 struct cset_converter cvt = converter_for_type (pfile, type);
1736 if (cvt.func != convert_no_conversion)
1737 return "execution character set != source character set";
1738
1739 /* For on-demand strings we have already lexed the strings, so there
1740 should be no errors. However, if we have bogus source location
1741 data (or stringified macro arguments), the attempt to lex the
1742 strings could fail with an error. Temporarily install an
1743 error-handler to catch the error, so that it can lead to this call
1744 failing, rather than being emitted as a user-visible diagnostic.
1745 If an error does occur, we should see it via the return value of
1746 cpp_interpret_string_1. */
1747 bool (*saved_error_handler) (cpp_reader *, int, int, rich_location *,
1748 const char *, va_list *)
1749 ATTRIBUTE_FPTR_PRINTF(5,0);
1750
1751 saved_error_handler = pfile->cb.error;
1752 pfile->cb.error = noop_error_cb;
1753
1754 bool result = cpp_interpret_string_1 (pfile, from, count, NULL, type,
1755 loc_readers, out);
1756
1757 /* Restore the saved error-handler. */
1758 pfile->cb.error = saved_error_handler;
1759
1760 if (!result)
1761 return "cpp_interpret_string_1 failed";
1762
1763 /* Success. */
1764 return NULL;
1447 } 1765 }
1448 1766
1449 /* Subroutine of do_line and do_linemarker. Convert escape sequences 1767 /* Subroutine of do_line and do_linemarker. Convert escape sequences
1450 in a string, but do not perform character set conversion. */ 1768 in a string, but do not perform character set conversion. */
1451 bool 1769 bool
1601 cppchar_t 1919 cppchar_t
1602 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token, 1920 cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1603 unsigned int *pchars_seen, int *unsignedp) 1921 unsigned int *pchars_seen, int *unsignedp)
1604 { 1922 {
1605 cpp_string str = { 0, 0 }; 1923 cpp_string str = { 0, 0 };
1606 bool wide = (token->type != CPP_CHAR); 1924 bool wide = (token->type != CPP_CHAR && token->type != CPP_UTF8CHAR);
1925 int u8 = 2 * int(token->type == CPP_UTF8CHAR);
1607 cppchar_t result; 1926 cppchar_t result;
1608 1927
1609 /* an empty constant will appear as L'', u'', U'' or '' */ 1928 /* An empty constant will appear as L'', u'', U'', u8'', or '' */
1610 if (token->val.str.len == (size_t) (2 + wide)) 1929 if (token->val.str.len == (size_t) (2 + wide + u8))
1611 { 1930 {
1612 cpp_error (pfile, CPP_DL_ERROR, "empty character constant"); 1931 cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1932 *pchars_seen = 0;
1933 *unsignedp = 0;
1613 return 0; 1934 return 0;
1614 } 1935 }
1615 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, token->type)) 1936 else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str,
1616 return 0; 1937 token->type))
1938 {
1939 *pchars_seen = 0;
1940 *unsignedp = 0;
1941 return 0;
1942 }
1617 1943
1618 if (wide) 1944 if (wide)
1619 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp, 1945 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
1620 token->type); 1946 token->type);
1621 else 1947 else
1727 /* Clean up the mess. */ 2053 /* Clean up the mess. */
1728 if (input_cset.func == convert_using_iconv) 2054 if (input_cset.func == convert_using_iconv)
1729 iconv_close (input_cset.cd); 2055 iconv_close (input_cset.cd);
1730 2056
1731 /* Resize buffer if we allocated substantially too much, or if we 2057 /* Resize buffer if we allocated substantially too much, or if we
1732 haven't enough space for the \n-terminator. */ 2058 haven't enough space for the \n-terminator or following
1733 if (to.len + 4096 < to.asize || to.len >= to.asize) 2059 15 bytes of padding (used to quiet warnings from valgrind or
1734 to.text = XRESIZEVEC (uchar, to.text, to.len + 1); 2060 Address Sanitizer, when the optimized lexer accesses aligned
2061 16-byte memory chunks, including the bytes after the malloced
2062 area, and stops lexing on '\n'). */
2063 if (to.len + 4096 < to.asize || to.len + 16 > to.asize)
2064 to.text = XRESIZEVEC (uchar, to.text, to.len + 16);
2065
2066 memset (to.text + to.len, '\0', 16);
1735 2067
1736 /* If the file is using old-school Mac line endings (\r only), 2068 /* If the file is using old-school Mac line endings (\r only),
1737 terminate with another \r, not an \n, so that we do not mistake 2069 terminate with another \r, not an \n, so that we do not mistake
1738 the \r\n sequence for a single DOS line ending and erroneously 2070 the \r\n sequence for a single DOS line ending and erroneously
1739 issue the "No newline at end of file" diagnostic. */ 2071 issue the "No newline at end of file" diagnostic. */
1793 if (current_encoding == NULL || *current_encoding == '\0') 2125 if (current_encoding == NULL || *current_encoding == '\0')
1794 current_encoding = SOURCE_CHARSET; 2126 current_encoding = SOURCE_CHARSET;
1795 2127
1796 return current_encoding; 2128 return current_encoding;
1797 } 2129 }
2130
2131 /* Implementation of class cpp_string_location_reader. */
2132
2133 /* Constructor for cpp_string_location_reader. */
2134
2135 cpp_string_location_reader::
2136 cpp_string_location_reader (source_location src_loc,
2137 line_maps *line_table)
2138 : m_line_table (line_table)
2139 {
2140 src_loc = get_range_from_loc (line_table, src_loc).m_start;
2141
2142 /* SRC_LOC might be a macro location. It only makes sense to do
2143 column-by-column calculations on ordinary maps, so get the
2144 corresponding location in an ordinary map. */
2145 m_loc
2146 = linemap_resolve_location (line_table, src_loc,
2147 LRK_SPELLING_LOCATION, NULL);
2148
2149 const line_map_ordinary *map
2150 = linemap_check_ordinary (linemap_lookup (line_table, m_loc));
2151 m_offset_per_column = (1 << map->m_range_bits);
2152 }
2153
2154 /* Get the range of the next source byte. */
2155
2156 source_range
2157 cpp_string_location_reader::get_next ()
2158 {
2159 source_range result;
2160 result.m_start = m_loc;
2161 result.m_finish = m_loc;
2162 if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS)
2163 m_loc += m_offset_per_column;
2164 return result;
2165 }