CbC/CbC_gcc: libcpp/charset.c comparison

comparison libcpp/charset.c @ 145:1830386684a0

gcc-9.2.0

author	anatofuz
date	Thu, 13 Feb 2020 11:34:05 +0900
parents	84e7813d76e9
children

comparison

equal deleted inserted replaced

-:84e7813d76e9
+:1830386684a0
 /* CPP Library - charsets
-Copyright (C) 1998-2018 Free Software Foundation, Inc.
+Copyright (C) 1998-2020 Free Software Foundation, Inc.
 Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
 This program is free software; you can redistribute it and/or modify it
 under the terms of the GNU General Public License as published by the
 /* Last character in the range described by this entry.  */
 unsigned int end;
 };
 #include "ucnid.h"
+/* ISO 10646 defines the UCS codespace as the range 0-0x10FFFF inclusive.  */
+#define UCS_LIMIT 0x10FFFF
 /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
 the start of an identifier, and 0 if C is not valid in an
 identifier.  We assume C has already gone through the checks of
 _cpp_valid_ucn.  Also update NST for C if returning nonzero.  The
 algorithm is a simple binary search on the table defined in
 			 struct normalize_state *nst)
 {
 int mn, mx, md;
 unsigned short valid_flags, invalid_start_flags;
-if (c > 0x10FFFF)
+if (c > UCS_LIMIT)
 return 0;
 mn = 0;
 mx = ARRAY_SIZE (ucnranges) - 1;
 while (mx != mn)
 C99 6.4.3: A universal character name shall not specify a character
 whose short identifier is less than 00A0 other than 0024 ($), 0040 (@),
 or 0060 (`), nor one in the range D800 through DFFF inclusive.
+If the hexadecimal value is larger than the upper bound of the UCS
+codespace specified in ISO/IEC 10646, a pedantic warning is issued
+in all versions of C and in the C++2a or later versions of C++.
 *PSTR must be preceded by "\u" or "\U"; it is assumed that the
 buffer end is delimited by a non-hex digit.  Returns false if the
 UCN has not been consumed, true otherwise.
 The value of the UCN, whether valid or invalid, is returned in *CP.
 else if (validity == 2 && identifier_pos == 1)
 	cpp_error (pfile, CPP_DL_ERROR,
 "universal character %.*s is not valid at the start of an identifier",
 		   (int) (str - base), base);
 }
+else if (result > UCS_LIMIT
+	   && (!CPP_OPTION (pfile, cplusplus)
+	       || CPP_OPTION (pfile, lang) > CLK_CXX17))
+cpp_error (pfile, CPP_DL_PEDWARN,
+	       "%.*s is outside the UCS codespace",
+	       (int) (str - base), base);
 *cp = result;
 return true;
 }
 	    ranges->add_range (char_range);
 	}
 }
 return from;
+}
+/*  Performs a similar task as _cpp_valid_ucn, but parses UTF-8-encoded
+extended characters rather than UCNs.  If the return value is TRUE, then a
+character was successfully decoded and stored in *CP; *PSTR has been
+updated to point one past the valid UTF-8 sequence.  Diagnostics may have
+been emitted if the character parsed is not allowed in the current context.
+If the return value is FALSE, then *PSTR has not been modified and *CP may
+equal 0, to indicate that *PSTR does not form a valid UTF-8 sequence, or it
+may, when processing an identifier in C mode, equal a codepoint that was
+validly encoded but is not allowed to appear in an identifier.  In either
+case, no diagnostic is emitted, and the return value of FALSE should cause
+a new token to be formed.
+Unlike _cpp_valid_ucn, this will never be called when lexing a string; only
+a potential identifier, or a CPP_OTHER token.  NST is unused in the latter
+case.
+As in _cpp_valid_ucn, IDENTIFIER_POS is 0 when not in an identifier, 1 for
+the start of an identifier, or 2 otherwise.  */
+extern bool
+_cpp_valid_utf8 (cpp_reader *pfile,
+		 const uchar **pstr,
+		 const uchar *limit,
+		 int identifier_pos,
+		 struct normalize_state *nst,
+		 cppchar_t *cp)
+{
+const uchar *base = *pstr;
+size_t inbytesleft = limit - base;
+if (one_utf8_to_cppchar (pstr, &inbytesleft, cp))
+{
+/* No diagnostic here as this byte will rather become a
+	 new token.  */
+*cp = 0;
+return false;
+}
+if (identifier_pos)
+{
+switch (ucn_valid_in_identifier (pfile, *cp, nst))
+	{
+	case 0:
+	  /* In C++, this is an error for invalid character in an identifier
+	     because logically, the UTF-8 was converted to a UCN during
+	     translation phase 1 (even though we don't physically do it that
+	     way).  In C, this byte rather becomes grammatically a separate
+	     token.  */
+	  if (CPP_OPTION (pfile, cplusplus))
+	    cpp_error (pfile, CPP_DL_ERROR,
+		       "extended character %.*s is not valid in an identifier",
+		       (int) (*pstr - base), base);
+	  else
+	    {
+	      *pstr = base;
+	      return false;
+	    }
+	  break;
+	case 2:
+	  if (identifier_pos == 1)
+	    {
+	      /* This is treated the same way in C++ or C99 -- lexed as an
+		 identifier which is then invalid because an identifier is
+		 not allowed to start with this character.  */
+	      cpp_error (pfile, CPP_DL_ERROR,
+	  "extended character %.*s is not valid at the start of an identifier",
+			 (int) (*pstr - base), base);
+	    }
+	  break;
+	}
+}
+return true;
 }
 /* Subroutine of convert_hex and convert_oct.  N is the representation
 in the execution character set of a numeric escape; write it into the
 string buffer TBUF and update the end-of-string pointer therein.  WIDE
 /* Subroutine of cpp_interpret_charconst which performs the conversion
 to a number, for narrow strings.  STR is the string structure returned
 by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
-cpp_interpret_charconst.  */
+cpp_interpret_charconst.  TYPE is the token type.  */
 static cppchar_t
 narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
-			 unsigned int *pchars_seen, int *unsignedp)
+			 unsigned int *pchars_seen, int *unsignedp,
+			 enum cpp_ttype type)
 {
 size_t width = CPP_OPTION (pfile, char_precision);
 size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
 size_t mask = width_to_mask (width);
 size_t i;
 	result = (result << width) | c;
 else
 	result = c;
 }
+if (type == CPP_UTF8CHAR)
+max_chars = 1;
 if (i > max_chars)
 {
 i = max_chars;
-cpp_error (pfile, CPP_DL_WARNING,
+cpp_error (pfile, type == CPP_UTF8CHAR ? CPP_DL_ERROR : CPP_DL_WARNING,
 		 "character constant too long for its type");
 }
 else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
 cpp_warning (pfile, CPP_W_MULTICHAR, "multi-character character constant");
 /* Multichar constants are of type int and therefore signed.  */
 if (i > 1)
 unsigned_p = 0;
+else if (type == CPP_UTF8CHAR && !CPP_OPTION (pfile, cplusplus))
+unsigned_p = 1;
 else
 unsigned_p = CPP_OPTION (pfile, unsigned_char);
 /* Truncate the constant to its natural width, and simultaneously
 sign- or zero-extend to the full width of cppchar_t.
 size_t cmask = width_to_mask (cwidth);
 size_t nbwc = width / cwidth;
 size_t off, i;
 cppchar_t result = 0, c;
+if (str.len <= nbwc)
+{
+/* Error recovery, if no errors have been diagnosed previously,
+	 there should be at least two wide characters.  Empty literals
+	 are diagnosed earlier and we can get just the zero terminator
+	 only if there were errors diagnosed during conversion.  */
+*pchars_seen = 0;
+*unsignedp = 0;
+return 0;
+}
 /* This is finicky because the string is in the target's byte order,
 which may not be our byte order.  Only the last character, ignoring
 the NUL terminator, is relevant.  */
 off = str.len - (nbwc * 2);
 result = 0;
 /* Wide character constants have type wchar_t, and a single
 character exactly fills a wchar_t, so a multi-character wide
 character constant is guaranteed to overflow.  */
 if (str.len > nbwc * 2)
-cpp_error (pfile, CPP_DL_WARNING,
+cpp_error (pfile, (CPP_OPTION (pfile, cplusplus)
+		       && (type == CPP_CHAR16 || type == CPP_CHAR32))
+		      ? CPP_DL_ERROR : CPP_DL_WARNING,
 	       "character constant too long for its type");
 /* Truncate the constant to its natural width, and simultaneously
 sign- or zero-extend to the full width of cppchar_t.  */
 if (width < BITS_PER_CPPCHAR_T)
 if (wide)
 result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp,
 				    token->type);
 else
-result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
+result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp,
+				      token->type);
 if (str.text != token->val.str.text)
 free ((void *)str.text);
 return result;
 }
 /* Convert an identifier denoted by ID and LEN, which might contain
-UCN escapes, to the source character set, either UTF-8 or
+UCN escapes or UTF-8 multibyte chars, to the source character set,
-UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.  */
+either UTF-8 or UTF-EBCDIC.  Assumes that the identifier is actually
+a valid identifier.  */
 cpp_hashnode *
 _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
 {
 /* It turns out that a UCN escape always turns into fewer characters
 than the escape itself, so we can allocate a temporary in advance.  */
 /* Implementation of class cpp_string_location_reader.  */
 /* Constructor for cpp_string_location_reader.  */
 cpp_string_location_reader::
-cpp_string_location_reader (source_location src_loc,
+cpp_string_location_reader (location_t src_loc,
 			    line_maps *line_table)
-: m_line_table (line_table)
 {
 src_loc = get_range_from_loc (line_table, src_loc).m_start;
 /* SRC_LOC might be a macro location.  It only makes sense to do
 column-by-column calculations on ordinary maps, so get the
 result.m_finish = m_loc;
 if (m_loc <= LINE_MAP_MAX_LOCATION_WITH_COLS)
 m_loc += m_offset_per_column;
 return result;
 }
+/* Helper for cpp_byte_column_to_display_column and its inverse.  Given a
+pointer to a UTF-8-encoded character, compute its display width.  *INBUFP
+points on entry to the start of the UTF-8 encoding of the character, and
+is updated to point just after the last byte of the encoding.  *INBYTESLEFTP
+contains on entry the remaining size of the buffer into which *INBUFP
+points, and this is also updated accordingly.  If *INBUFP does not
+point to a valid UTF-8-encoded sequence, then it will be treated as a single
+byte with display width 1.  */
+static inline int
+compute_next_display_width (const uchar **inbufp, size_t *inbytesleftp)
+{
+cppchar_t c;
+if (one_utf8_to_cppchar (inbufp, inbytesleftp, &c) != 0)
+{
+/* Input is not convertible to UTF-8.  This could be fine, e.g. in a
+	 string literal, so don't complain.  Just treat it as if it has a width
+	 of one.  */
+++*inbufp;
+--*inbytesleftp;
+return 1;
+}
+/*  one_utf8_to_cppchar() has updated inbufp and inbytesleftp for us.  */
+return cpp_wcwidth (c);
+}
+/*  For the string of length DATA_LENGTH bytes that begins at DATA, compute
+how many display columns are occupied by the first COLUMN bytes.  COLUMN
+may exceed DATA_LENGTH, in which case the phantom bytes at the end are
+treated as if they have display width 1.  */
+int
+cpp_byte_column_to_display_column (const char *data, int data_length,
+				   int column)
+{
+int display_col = 0;
+const uchar *udata = (const uchar *) data;
+const int offset = MAX (0, column - data_length);
+size_t inbytesleft = column - offset;
+while (inbytesleft)
+display_col += compute_next_display_width (&udata, &inbytesleft);
+return display_col + offset;
+}
+/*  For the string of length DATA_LENGTH bytes that begins at DATA, compute
+the least number of bytes that will result in at least DISPLAY_COL display
+columns.  The return value may exceed DATA_LENGTH if the entire string does
+not occupy enough display columns.  */
+int
+cpp_display_column_to_byte_column (const char *data, int data_length,
+				   int display_col)
+{
+int column = 0;
+const uchar *udata = (const uchar *) data;
+size_t inbytesleft = data_length;
+while (column < display_col && inbytesleft)
+column += compute_next_display_width (&udata, &inbytesleft);
+return data_length - inbytesleft + MAX (0, display_col - column);
+}
+/* Our own version of wcwidth().  We don't use the actual wcwidth() in glibc,
+because that will inspect the user's locale, and in particular in an ASCII
+locale, it will not return anything useful for extended characters.  But GCC
+in other respects (see e.g. _cpp_default_encoding()) behaves as if
+everything is UTF-8.  We also make some tweaks that are useful for the way
+GCC needs to use this data, e.g. tabs and other control characters should be
+treated as having width 1.  The lookup tables are generated from
+contrib/unicode/gen_wcwidth.py and were made by simply calling glibc
+wcwidth() on all codepoints, then applying the small tweaks.  These tables
+are not highly optimized, but for the present purpose of outputting
+diagnostics, they are sufficient.  */
+#include "generated_cpp_wcwidth.h"
+int cpp_wcwidth (cppchar_t c)
+{
+if (__builtin_expect (c <= wcwidth_range_ends[0], true))
+return wcwidth_widths[0];
+/* Binary search the tables.  */
+int begin = 1;
+static const int end
+= sizeof wcwidth_range_ends / sizeof (*wcwidth_range_ends);
+int len = end - begin;
+do
+{
+int half = len/2;
+int middle = begin + half;
+if (c > wcwidth_range_ends[middle])
+	{
+	  begin = middle + 1;
+	  len -= half + 1;
+	}
+else
+	len = half;
+} while (len);
+if (__builtin_expect (begin != end, true))
+return wcwidth_widths[begin];
+return 1;
+}

Mercurial > hg > CbC > CbC_gcc

comparison libcpp/charset.c @ 145:1830386684a0