comparison gcc/d/dmd/lexer.c @ 145:1830386684a0

gcc-9.2.0
author anatofuz
date Thu, 13 Feb 2020 11:34:05 +0900
parents
children
comparison
equal deleted inserted replaced
131:84e7813d76e9 145:1830386684a0
1
2 /* Compiler implementation of the D programming language
3 * Copyright (C) 1999-2019 by The D Language Foundation, All Rights Reserved
4 * written by Walter Bright
5 * http://www.digitalmars.com
6 * Distributed under the Boost Software License, Version 1.0.
7 * http://www.boost.org/LICENSE_1_0.txt
8 * https://github.com/D-Programming-Language/dmd/blob/master/src/lexer.c
9 */
10
11 /* Lexical Analyzer */
12
13 #include "root/dsystem.h" // for time() and ctime()
14 #include "root/rmem.h"
15
16 #include "mars.h"
17 #include "lexer.h"
18 #include "utf.h"
19 #include "identifier.h"
20 #include "id.h"
21
22 extern int HtmlNamedEntity(const utf8_t *p, size_t length);
23
24 #define LS 0x2028 // UTF line separator
25 #define PS 0x2029 // UTF paragraph separator
26
27 /********************************************
28 * Do our own char maps
29 */
30
31 static unsigned char cmtable[256];
32
33 const int CMoctal = 0x1;
34 const int CMhex = 0x2;
35 const int CMidchar = 0x4;
36
37 inline bool isoctal (utf8_t c) { return (cmtable[c] & CMoctal) != 0; }
38 inline bool ishex (utf8_t c) { return (cmtable[c] & CMhex) != 0; }
39 inline bool isidchar(utf8_t c) { return (cmtable[c] & CMidchar) != 0; }
40
41 struct CMTableInitializer
42 {
43 CMTableInitializer();
44 };
45
46 static CMTableInitializer cmtableinitializer;
47
48 CMTableInitializer::CMTableInitializer()
49 {
50 for (unsigned c = 0; c < 256; c++)
51 {
52 if ('0' <= c && c <= '7')
53 cmtable[c] |= CMoctal;
54 if (isxdigit(c))
55 cmtable[c] |= CMhex;
56 if (isalnum(c) || c == '_')
57 cmtable[c] |= CMidchar;
58 }
59 }
60
61 /*************************** Lexer ********************************************/
62
63 OutBuffer Lexer::stringbuffer;
64
65 Lexer::Lexer(const char *filename,
66 const utf8_t *base, size_t begoffset, size_t endoffset,
67 bool doDocComment, bool commentToken)
68 {
69 scanloc = Loc(filename, 1, 1);
70 //printf("Lexer::Lexer(%p,%d)\n",base,length);
71 //printf("lexer.filename = %s\n", filename);
72 this->token = Token();
73 this->token.ptr = NULL;
74 this->token.value = TOKreserved;
75 this->token.blockComment = NULL;
76 this->token.lineComment = NULL;
77 this->base = base;
78 this->end = base + endoffset;
79 p = base + begoffset;
80 line = p;
81 this->doDocComment = doDocComment;
82 this->anyToken = 0;
83 this->commentToken = commentToken;
84 this->errors = false;
85 //initKeywords();
86
87 /* If first line starts with '#!', ignore the line
88 */
89
90 if (p[0] == '#' && p[1] =='!')
91 {
92 p += 2;
93 while (1)
94 {
95 utf8_t c = *p++;
96 switch (c)
97 {
98 case 0:
99 case 0x1A:
100 p--;
101 /* fall through */
102
103 case '\n':
104 break;
105
106 default:
107 continue;
108 }
109 break;
110 }
111 endOfLine();
112 }
113 }
114
115
116 void Lexer::endOfLine()
117 {
118 scanloc.linnum++;
119 line = p;
120 }
121
122
123 void Lexer::error(const char *format, ...)
124 {
125 va_list ap;
126 va_start(ap, format);
127 ::verror(token.loc, format, ap);
128 va_end(ap);
129 errors = true;
130 }
131
132 void Lexer::error(Loc loc, const char *format, ...)
133 {
134 va_list ap;
135 va_start(ap, format);
136 ::verror(loc, format, ap);
137 va_end(ap);
138 errors = true;
139 }
140
141 void Lexer::deprecation(const char *format, ...)
142 {
143 va_list ap;
144 va_start(ap, format);
145 ::vdeprecation(token.loc, format, ap);
146 va_end(ap);
147 if (global.params.useDeprecated == DIAGNOSTICerror)
148 errors = true;
149 }
150
151 TOK Lexer::nextToken()
152 {
153 if (token.next)
154 {
155 Token *t = token.next;
156 memcpy(&token,t,sizeof(Token));
157 t->free();
158 }
159 else
160 {
161 scan(&token);
162 }
163 //token.print();
164 return token.value;
165 }
166
167 Token *Lexer::peek(Token *ct)
168 {
169 Token *t;
170 if (ct->next)
171 t = ct->next;
172 else
173 {
174 t = Token::alloc();
175 scan(t);
176 ct->next = t;
177 }
178 return t;
179 }
180
181 /***********************
182 * Look ahead at next token's value.
183 */
184
185 TOK Lexer::peekNext()
186 {
187 return peek(&token)->value;
188 }
189
190 /***********************
191 * Look 2 tokens ahead at value.
192 */
193
194 TOK Lexer::peekNext2()
195 {
196 Token *t = peek(&token);
197 return peek(t)->value;
198 }
199
200 /*********************************
201 * tk is on the opening (.
202 * Look ahead and return token that is past the closing ).
203 */
204
205 Token *Lexer::peekPastParen(Token *tk)
206 {
207 //printf("peekPastParen()\n");
208 int parens = 1;
209 int curlynest = 0;
210 while (1)
211 {
212 tk = peek(tk);
213 //tk->print();
214 switch (tk->value)
215 {
216 case TOKlparen:
217 parens++;
218 continue;
219
220 case TOKrparen:
221 --parens;
222 if (parens)
223 continue;
224 tk = peek(tk);
225 break;
226
227 case TOKlcurly:
228 curlynest++;
229 continue;
230
231 case TOKrcurly:
232 if (--curlynest >= 0)
233 continue;
234 break;
235
236 case TOKsemicolon:
237 if (curlynest)
238 continue;
239 break;
240
241 case TOKeof:
242 break;
243
244 default:
245 continue;
246 }
247 return tk;
248 }
249 }
250
251 /****************************
252 * Turn next token in buffer into a token.
253 */
254
// Scan one token starting at the current position `p` and store it in *t.
//
// White space and line ends are skipped by looping back to the top of the
// while(1).  Comments are skipped the same way unless commentToken is set,
// in which case they are returned as TOKcomment.  Every path that returns
// has set t->value; t->ptr and t->loc describe where the token started.
//
// lastLine is the line number on entry; it is compared with the line a
// comment started on when that comment is handed to getDocComment().
255 void Lexer::scan(Token *t)
256 {
257 unsigned lastLine = scanloc.linnum;
258 Loc startLoc;
259
260 t->blockComment = NULL;
261 t->lineComment = NULL;
262 while (1)
263 {
// Re-recorded on every iteration so that the token start excludes any
// white space or comments skipped so far.
264 t->ptr = p;
265 //printf("p = %p, *p = '%c'\n",p,*p);
266 t->loc = loc();
267 switch (*p)
268 {
// 0 and 0x1A both mark end of input; p is deliberately NOT advanced, so
// further calls keep returning TOKeof.
269 case 0:
270 case 0x1A:
271 t->value = TOKeof; // end of file
272 return;
273
274 case ' ':
275 case '\t':
276 case '\v':
277 case '\f':
278 p++;
279 continue; // skip white space
280
// A CR followed by LF is counted once, by the '\n' case on the next
// iteration; only a lone CR is counted here.
281 case '\r':
282 p++;
283 if (*p != '\n') // if CR stands by itself
284 endOfLine();
285 continue; // skip white space
286
287 case '\n':
288 p++;
289 endOfLine();
290 continue; // skip white space
291
292 case '0': case '1': case '2': case '3': case '4':
293 case '5': case '6': case '7': case '8': case '9':
294 t->value = number(t);
295 return;
296
297 case '\'':
298 t->value = charConstant(t);
299 return;
300
// r"..." is a wysiwyg string; any other use of 'r' starts an identifier.
// After p++ the pointer is on the opening quote, which is passed as the
// terminator character.
301 case 'r':
302 if (p[1] != '"')
303 goto case_ident;
304 p++;
305 /* fall through */
306 case '`':
307 t->value = wysiwygStringConstant(t, *p);
308 return;
309
// x"..." is a hex string; any other use of 'x' starts an identifier.
310 case 'x':
311 if (p[1] != '"')
312 goto case_ident;
313 p++;
314 t->value = hexStringConstant(t);
315 return;
316
// q"..." is a delimited string and q{...} a token string; in both cases
// p is moved onto the character after the 'q' before the helper is called.
317 case 'q':
318 if (p[1] == '"')
319 {
320 p++;
321 t->value = delimitedStringConstant(t);
322 return;
323 }
324 else if (p[1] == '{')
325 {
326 p++;
327 t->value = tokenStringConstant(t);
328 return;
329 }
330 else
331 goto case_ident;
332
333 case '"':
334 t->value = escapeStringConstant(t);
335 return;
336
// 'q', 'r' and 'x' are omitted from the list below because they have
// their own cases above, which jump to case_ident when appropriate.
337 case 'a': case 'b': case 'c': case 'd': case 'e':
338 case 'f': case 'g': case 'h': case 'i': case 'j':
339 case 'k': case 'l': case 'm': case 'n': case 'o':
340 case 'p': /*case 'q': case 'r':*/ case 's': case 't':
341 case 'u': case 'v': case 'w': /*case 'x':*/ case 'y':
342 case 'z':
343 case 'A': case 'B': case 'C': case 'D': case 'E':
344 case 'F': case 'G': case 'H': case 'I': case 'J':
345 case 'K': case 'L': case 'M': case 'N': case 'O':
346 case 'P': case 'Q': case 'R': case 'S': case 'T':
347 case 'U': case 'V': case 'W': case 'X': case 'Y':
348 case 'Z':
349 case '_':
350 case_ident:
351 { utf8_t c;
352
// Consume the rest of the identifier.  Bytes with the high bit set are
// decoded with decodeUTF() and accepted when isUniAlpha() says so;
// otherwise an error is reported and p is restored to the offending byte,
// which ends the identifier.
353 while (1)
354 {
355 c = *++p;
356 if (isidchar(c))
357 continue;
358 else if (c & 0x80)
359 { const utf8_t *s = p;
360 unsigned u = decodeUTF();
361 if (isUniAlpha(u))
362 continue;
363 error("char 0x%04x not allowed in identifier", u);
364 p = s;
365 }
366 break;
367 }
368
// Intern the identifier text; the token value comes from the identifier
// itself (id->getValue()).
369 Identifier *id = Identifier::idPool((const char *)t->ptr, p - t->ptr);
370 t->ident = id;
371 t->value = (TOK) id->getValue();
372 anyToken = 1;
// Identifiers beginning with '_' may be one of the special tokens
// compared against below (Id::DATE, Id::TIME, Id::VENDOR, Id::TIMESTAMP,
// Id::VERSIONX, Id::EOFX).
373 if (*t->ptr == '_') // if special identifier token
374 {
375 static bool initdone = false;
376 static char date[11+1];
377 static char time[8+1];
378 static char timestamp[24+1];
379
// The three strings are cut out of the ctime() text once, on first use,
// and then reused for every later token.
380 if (!initdone) // lazy evaluation
381 {
382 initdone = true;
383 time_t ct;
384 ::time(&ct);
385 char *p = ctime(&ct);
386 assert(p);
387 sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
388 sprintf(&time[0], "%.8s", p + 11);
389 sprintf(&timestamp[0], "%.24s", p);
390 }
391
392 if (id == Id::DATE)
393 {
394 t->ustring = (utf8_t *)date;
395 goto Lstr;
396 }
397 else if (id == Id::TIME)
398 {
399 t->ustring = (utf8_t *)time;
400 goto Lstr;
401 }
402 else if (id == Id::VENDOR)
403 {
404 t->ustring = (utf8_t *)const_cast<char *>(global.vendor);
405 goto Lstr;
406 }
407 else if (id == Id::TIMESTAMP)
408 {
409 t->ustring = (utf8_t *)timestamp;
// Lstr: common tail that turns the token into a string literal with no
// postfix, whose length is the strlen of the chosen text.
410 Lstr:
411 t->value = TOKstring;
412 t->postfix = 0;
413 t->len = (unsigned)strlen((char *)t->ustring);
414 }
415 else if (id == Id::VERSIONX)
416 { unsigned major = 0;
417 unsigned minor = 0;
418 bool point = false;
419
// Parse "<major>.<minor>" out of global.version, skipping its first
// character; the token becomes the integer major * 1000 + minor.
420 for (const char *p = global.version + 1; 1; p++)
421 {
422 c = *p;
423 if (isdigit((utf8_t)c))
424 minor = minor * 10 + c - '0';
425 else if (c == '.')
426 {
427 if (point)
428 break; // ignore everything after second '.'
429 point = true;
430 major = minor;
431 minor = 0;
432 }
433 else
434 break;
435 }
436 t->value = TOKint64v;
437 t->uns64value = major * 1000 + minor;
438 }
439 else if (id == Id::EOFX)
440 {
441 t->value = TOKeof;
442 // Advance scanner to end of file
443 while (!(*p == 0 || *p == 0x1A))
444 p++;
445 }
446 }
447 //printf("t->value = %d\n",t->value);
448 return;
449 }
450
// '/' starts one of: "/=", a block comment, a line comment, a nesting
// "/+ +/" comment, or the plain division operator.
451 case '/':
452 p++;
453 switch (*p)
454 {
455 case '=':
456 p++;
457 t->value = TOKdivass;
458 return;
459
460 case '*':
461 p++;
462 startLoc = loc();
// Outer loop: repeat until a '/' is found that is preceded by '*'.
// Inner loop: advance to the next '/', tracking line ends on the way.
463 while (1)
464 {
465 while (1)
466 { utf8_t c = *p;
467 switch (c)
468 {
469 case '/':
470 break;
471
472 case '\n':
473 endOfLine();
474 p++;
475 continue;
476
477 case '\r':
478 p++;
479 if (*p != '\n')
480 endOfLine();
481 continue;
482
483 case 0:
484 case 0x1A:
485 error("unterminated /* */ comment");
486 p = end;
487 t->loc = loc();
488 t->value = TOKeof;
489 return;
490
491 default:
492 if (c & 0x80)
493 { unsigned u = decodeUTF();
494 if (u == PS || u == LS)
495 endOfLine();
496 }
497 p++;
498 continue;
499 }
500 break;
501 }
502 p++;
// p is now just past a '/'.  It closes the comment only if the byte
// before it is '*' and that '*' is not the one that opened the comment
// (so "/*/" does not terminate).
503 if (p[-2] == '*' && p - 3 != t->ptr)
504 break;
505 }
506 if (commentToken)
507 {
508 t->loc = startLoc;
509 t->value = TOKcomment;
510 return;
511 }
512 else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr)
513 { // if /** but not /**/
514 getDocComment(t, lastLine == startLoc.linnum);
515 }
516 continue;
517
518 case '/': // do // style comments
519 startLoc = loc();
// Advance to the end of the line; LF, CR, CR LF and the Unicode LS/PS
// separators all end the comment.
520 while (1)
521 { utf8_t c = *++p;
522 switch (c)
523 {
524 case '\n':
525 break;
526
527 case '\r':
528 if (p[1] == '\n')
529 p++;
530 break;
531
// Comment runs into end of input: either return it as a comment token,
// or (optionally after collecting it as a doc comment) return TOKeof.
532 case 0:
533 case 0x1A:
534 if (commentToken)
535 {
536 p = end;
537 t->loc = startLoc;
538 t->value = TOKcomment;
539 return;
540 }
541 if (doDocComment && t->ptr[2] == '/')
542 getDocComment(t, lastLine == startLoc.linnum);
543 p = end;
544 t->loc = loc();
545 t->value = TOKeof;
546 return;
547
548 default:
549 if (c & 0x80)
550 { unsigned u = decodeUTF();
551 if (u == PS || u == LS)
552 break;
553 }
554 continue;
555 }
556 break;
557 }
558
559 if (commentToken)
560 {
561 p++;
562 endOfLine();
563 t->loc = startLoc;
564 t->value = TOKcomment;
565 return;
566 }
// A third '/' ("///") marks a doc comment.
567 if (doDocComment && t->ptr[2] == '/')
568 getDocComment(t, lastLine == startLoc.linnum);
569
570 p++;
571 endOfLine();
572 continue;
573
574 case '+':
575 { int nest;
576
577 startLoc = loc();
578 p++;
// nest counts currently open "/+"; the comment ends when it drops to 0.
579 nest = 1;
580 while (1)
581 { utf8_t c = *p;
582 switch (c)
583 {
584 case '/':
585 p++;
586 if (*p == '+')
587 {
588 p++;
589 nest++;
590 }
591 continue;
592
593 case '+':
594 p++;
595 if (*p == '/')
596 {
597 p++;
598 if (--nest == 0)
599 break;
600 }
601 continue;
602
603 case '\r':
604 p++;
605 if (*p != '\n')
606 endOfLine();
607 continue;
608
609 case '\n':
610 endOfLine();
611 p++;
612 continue;
613
614 case 0:
615 case 0x1A:
616 error("unterminated /+ +/ comment");
617 p = end;
618 t->loc = loc();
619 t->value = TOKeof;
620 return;
621
622 default:
623 if (c & 0x80)
624 { unsigned u = decodeUTF();
625 if (u == PS || u == LS)
626 endOfLine();
627 }
628 p++;
629 continue;
630 }
631 break;
632 }
633 if (commentToken)
634 {
635 t->loc = startLoc;
636 t->value = TOKcomment;
637 return;
638 }
639 if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr)
640 { // if /++ but not /++/
641 getDocComment(t, lastLine == startLoc.linnum);
642 }
643 continue;
644 }
645 default:
646 break;
647 }
648 t->value = TOKdiv;
649 return;
650
// '.' is either the start of a floating point literal (".5"), "..",
// "..." or a plain dot.
651 case '.':
652 p++;
653 if (isdigit(*p))
654 { /* Note that we don't allow ._1 and ._ as being
655 * valid floating point numbers.
656 */
657 p--;
658 t->value = inreal(t);
659 }
660 else if (p[0] == '.')
661 {
662 if (p[1] == '.')
663 { p += 2;
664 t->value = TOKdotdotdot;
665 }
666 else
667 { p++;
668 t->value = TOKslice;
669 }
670 }
671 else
672 t->value = TOKdot;
673 return;
674
// The operator cases below all follow the same shape: consume the first
// character, then extend the token for as long as a longer operator
// matches.
675 case '&':
676 p++;
677 if (*p == '=')
678 { p++;
679 t->value = TOKandass;
680 }
681 else if (*p == '&')
682 { p++;
683 t->value = TOKandand;
684 }
685 else
686 t->value = TOKand;
687 return;
688
689 case '|':
690 p++;
691 if (*p == '=')
692 { p++;
693 t->value = TOKorass;
694 }
695 else if (*p == '|')
696 { p++;
697 t->value = TOKoror;
698 }
699 else
700 t->value = TOKor;
701 return;
702
703 case '-':
704 p++;
705 if (*p == '=')
706 { p++;
707 t->value = TOKminass;
708 }
709 else if (*p == '-')
710 { p++;
711 t->value = TOKminusminus;
712 }
713 else
714 t->value = TOKmin;
715 return;
716
717 case '+':
718 p++;
719 if (*p == '=')
720 { p++;
721 t->value = TOKaddass;
722 }
723 else if (*p == '+')
724 { p++;
725 t->value = TOKplusplus;
726 }
727 else
728 t->value = TOKadd;
729 return;
730
731 case '<':
732 p++;
733 if (*p == '=')
734 { p++;
735 t->value = TOKle; // <=
736 }
737 else if (*p == '<')
738 { p++;
739 if (*p == '=')
740 { p++;
741 t->value = TOKshlass; // <<=
742 }
743 else
744 t->value = TOKshl; // <<
745 }
746 else if (*p == '>')
747 { p++;
748 if (*p == '=')
749 { p++;
750 t->value = TOKleg; // <>=
751 }
752 else
753 t->value = TOKlg; // <>
754 }
755 else
756 t->value = TOKlt; // <
757 return;
758
759 case '>':
760 p++;
761 if (*p == '=')
762 { p++;
763 t->value = TOKge; // >=
764 }
765 else if (*p == '>')
766 { p++;
767 if (*p == '=')
768 { p++;
769 t->value = TOKshrass; // >>=
770 }
771 else if (*p == '>')
772 { p++;
773 if (*p == '=')
774 { p++;
775 t->value = TOKushrass; // >>>=
776 }
777 else
778 t->value = TOKushr; // >>>
779 }
780 else
781 t->value = TOKshr; // >>
782 }
783 else
784 t->value = TOKgt; // >
785 return;
786
787 case '!':
788 p++;
789 if (*p == '=')
790 { p++;
791 t->value = TOKnotequal; // !=
792 }
793 else if (*p == '<')
794 { p++;
795 if (*p == '>')
796 { p++;
797 if (*p == '=')
798 { p++;
799 t->value = TOKunord; // !<>=
800 }
801 else
802 t->value = TOKue; // !<>
803 }
804 else if (*p == '=')
805 { p++;
806 t->value = TOKug; // !<=
807 }
808 else
809 t->value = TOKuge; // !<
810 }
811 else if (*p == '>')
812 { p++;
813 if (*p == '=')
814 { p++;
815 t->value = TOKul; // !>=
816 }
817 else
818 t->value = TOKule; // !>
819 }
820 else
821 t->value = TOKnot; // !
822 return;
823
824 case '=':
825 p++;
826 if (*p == '=')
827 { p++;
828 t->value = TOKequal; // ==
829 }
830 else if (*p == '>')
831 { p++;
832 t->value = TOKgoesto; // =>
833 }
834 else
835 t->value = TOKassign; // =
836 return;
837
838 case '~':
839 p++;
840 if (*p == '=')
841 { p++;
842 t->value = TOKcatass; // ~=
843 }
844 else
845 t->value = TOKtilde; // ~
846 return;
847
848 case '^':
849 p++;
850 if (*p == '^')
851 { p++;
852 if (*p == '=')
853 { p++;
854 t->value = TOKpowass; // ^^=
855 }
856 else
857 t->value = TOKpow; // ^^
858 }
859 else if (*p == '=')
860 { p++;
861 t->value = TOKxorass; // ^=
862 }
863 else
864 t->value = TOKxor; // ^
865 return;
866
// Single-character tokens.
867 case '(': p++; t->value = TOKlparen; return;
868 case ')': p++; t->value = TOKrparen; return;
869 case '[': p++; t->value = TOKlbracket; return;
870 case ']': p++; t->value = TOKrbracket; return;
871 case '{': p++; t->value = TOKlcurly; return;
872 case '}': p++; t->value = TOKrcurly; return;
873 case '?': p++; t->value = TOKquestion; return;
874 case ',': p++; t->value = TOKcomma; return;
875 case ';': p++; t->value = TOKsemicolon; return;
876 case ':': p++; t->value = TOKcolon; return;
877 case '$': p++; t->value = TOKdollar; return;
878 case '@': p++; t->value = TOKat; return;
879
880 case '*':
881 p++;
882 if (*p == '=')
883 { p++;
884 t->value = TOKmulass;
885 }
886 else
887 t->value = TOKmul;
888 return;
889 case '%':
890 p++;
891 if (*p == '=')
892 { p++;
893 t->value = TOKmodass;
894 }
895 else
896 t->value = TOKmod;
897 return;
898
// '#': the token after the '#' is scanned into a temporary.  "#line" is
// handed to poundLine() and then scanning continues; any other
// identifier produces a warning and "#if" an error, and in every case
// other than "#line" the '#' itself is returned as TOKpound.
899 case '#':
900 {
901 p++;
902 Token n;
903 scan(&n);
904 if (n.value == TOKidentifier)
905 {
906 if (n.ident == Id::line)
907 {
908 poundLine();
909 continue;
910 }
911 else
912 {
913 const Loc locx = loc();
914 warning(locx, "C preprocessor directive `#%s` is not supported", n.ident->toChars());
915 }
916 }
917 else if (n.value == TOKif)
918 {
919 error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
920 }
921 t->value = TOKpound;
922 return;
923 }
924
// Anything else: a non-ASCII character may start a Unicode identifier or
// be an LS/PS line separator; every other character is reported as an
// invalid token and skipped.
925 default:
926 { unsigned c = *p;
927
928 if (c & 0x80)
929 { c = decodeUTF();
930
931 // Check for start of unicode identifier
932 if (isUniAlpha(c))
933 goto case_ident;
934
935 if (c == PS || c == LS)
936 {
937 endOfLine();
938 p++;
939 continue;
940 }
941 }
942 if (c < 0x80 && isprint(c))
943 error("character '%c' is not a valid token", c);
944 else
945 error("character 0x%02x is not a valid token", c);
946 p++;
947 continue;
948 }
949 }
950 }
951 }
952
953 /*******************************************
954 * Parse escape sequence.
955 */
956
957 unsigned Lexer::escapeSequence()
958 { unsigned c = *p;
959
960 int n;
961 int ndigits;
962
963 switch (c)
964 {
965 case '\'':
966 case '"':
967 case '?':
968 case '\\':
969 Lconsume:
970 p++;
971 break;
972
973 case 'a': c = 7; goto Lconsume;
974 case 'b': c = 8; goto Lconsume;
975 case 'f': c = 12; goto Lconsume;
976 case 'n': c = 10; goto Lconsume;
977 case 'r': c = 13; goto Lconsume;
978 case 't': c = 9; goto Lconsume;
979 case 'v': c = 11; goto Lconsume;
980
981 case 'u':
982 ndigits = 4;
983 goto Lhex;
984 case 'U':
985 ndigits = 8;
986 goto Lhex;
987 case 'x':
988 ndigits = 2;
989 Lhex:
990 p++;
991 c = *p;
992 if (ishex((utf8_t)c))
993 { unsigned v;
994
995 n = 0;
996 v = 0;
997 while (1)
998 {
999 if (isdigit((utf8_t)c))
1000 c -= '0';
1001 else if (islower(c))
1002 c -= 'a' - 10;
1003 else
1004 c -= 'A' - 10;
1005 v = v * 16 + c;
1006 c = *++p;
1007 if (++n == ndigits)
1008 break;
1009 if (!ishex((utf8_t)c))
1010 { error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
1011 break;
1012 }
1013 }
1014 if (ndigits != 2 && !utf_isValidDchar(v))
1015 { error("invalid UTF character \\U%08x", v);
1016 v = '?'; // recover with valid UTF character
1017 }
1018 c = v;
1019 }
1020 else
1021 error("undefined escape hex sequence \\%c",c);
1022 break;
1023
1024 case '&': // named character entity
1025 for (const utf8_t *idstart = ++p; 1; p++)
1026 {
1027 switch (*p)
1028 {
1029 case ';':
1030 c = HtmlNamedEntity(idstart, p - idstart);
1031 if (c == ~0U)
1032 { error("unnamed character entity &%.*s;", (int)(p - idstart), idstart);
1033 c = ' ';
1034 }
1035 p++;
1036 break;
1037
1038 default:
1039 if (isalpha(*p) ||
1040 (p != idstart && isdigit(*p)))
1041 continue;
1042 error("unterminated named entity &%.*s;", (int)(p - idstart + 1), idstart);
1043 break;
1044 }
1045 break;
1046 }
1047 break;
1048
1049 case 0:
1050 case 0x1A: // end of file
1051 c = '\\';
1052 break;
1053
1054 default:
1055 if (isoctal((utf8_t)c))
1056 { unsigned v;
1057
1058 n = 0;
1059 v = 0;
1060 do
1061 {
1062 v = v * 8 + (c - '0');
1063 c = *++p;
1064 } while (++n < 3 && isoctal((utf8_t)c));
1065 c = v;
1066 if (c > 0xFF)
1067 error("escape octal sequence \\%03o is larger than \\377", c);
1068 }
1069 else
1070 error("undefined escape sequence \\%c",c);
1071 break;
1072 }
1073 return c;
1074 }
1075
1076 /**************************************
1077 */
1078
1079 TOK Lexer::wysiwygStringConstant(Token *t, int tc)
1080 {
1081 int c;
1082 Loc start = loc();
1083
1084 p++;
1085 stringbuffer.reset();
1086 while (1)
1087 {
1088 c = *p++;
1089 switch (c)
1090 {
1091 case '\n':
1092 endOfLine();
1093 break;
1094
1095 case '\r':
1096 if (*p == '\n')
1097 continue; // ignore
1098 c = '\n'; // treat EndOfLine as \n character
1099 endOfLine();
1100 break;
1101
1102 case 0:
1103 case 0x1A:
1104 error("unterminated string constant starting at %s", start.toChars());
1105 t->ustring = (utf8_t *)const_cast<char *>("");
1106 t->len = 0;
1107 t->postfix = 0;
1108 return TOKstring;
1109
1110 case '"':
1111 case '`':
1112 if (c == tc)
1113 {
1114 t->len = (unsigned)stringbuffer.offset;
1115 stringbuffer.writeByte(0);
1116 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
1117 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1118 stringPostfix(t);
1119 return TOKstring;
1120 }
1121 break;
1122
1123 default:
1124 if (c & 0x80)
1125 { p--;
1126 unsigned u = decodeUTF();
1127 p++;
1128 if (u == PS || u == LS)
1129 endOfLine();
1130 stringbuffer.writeUTF8(u);
1131 continue;
1132 }
1133 break;
1134 }
1135 stringbuffer.writeByte(c);
1136 }
1137 }
1138
1139 /**************************************
1140 * Lex hex strings:
1141 * x"0A ae 34FE BD"
1142 */
1143
1144 TOK Lexer::hexStringConstant(Token *t)
1145 {
1146 unsigned c;
1147 Loc start = loc();
1148 unsigned n = 0;
1149 unsigned v = ~0; // dead assignment, needed to suppress warning
1150
1151 p++;
1152 stringbuffer.reset();
1153 while (1)
1154 {
1155 c = *p++;
1156 switch (c)
1157 {
1158 case ' ':
1159 case '\t':
1160 case '\v':
1161 case '\f':
1162 continue; // skip white space
1163
1164 case '\r':
1165 if (*p == '\n')
1166 continue; // ignore
1167 // Treat isolated '\r' as if it were a '\n'
1168 /* fall through */
1169 case '\n':
1170 endOfLine();
1171 continue;
1172
1173 case 0:
1174 case 0x1A:
1175 error("unterminated string constant starting at %s", start.toChars());
1176 t->ustring = (utf8_t *)const_cast<char *>("");
1177 t->len = 0;
1178 t->postfix = 0;
1179 return TOKxstring;
1180
1181 case '"':
1182 if (n & 1)
1183 { error("odd number (%d) of hex characters in hex string", n);
1184 stringbuffer.writeByte(v);
1185 }
1186 t->len = (unsigned)stringbuffer.offset;
1187 stringbuffer.writeByte(0);
1188 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
1189 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1190 stringPostfix(t);
1191 return TOKxstring;
1192
1193 default:
1194 if (c >= '0' && c <= '9')
1195 c -= '0';
1196 else if (c >= 'a' && c <= 'f')
1197 c -= 'a' - 10;
1198 else if (c >= 'A' && c <= 'F')
1199 c -= 'A' - 10;
1200 else if (c & 0x80)
1201 { p--;
1202 unsigned u = decodeUTF();
1203 p++;
1204 if (u == PS || u == LS)
1205 endOfLine();
1206 else
1207 error("non-hex character \\u%04x in hex string", u);
1208 }
1209 else
1210 error("non-hex character '%c' in hex string", c);
1211 if (n & 1)
1212 { v = (v << 4) | c;
1213 stringbuffer.writeByte(v);
1214 }
1215 else
1216 v = c;
1217 n++;
1218 break;
1219 }
1220 }
1221 }
1222
1223
1224 /**************************************
1225 * Lex delimited strings:
1226 * q"(foo(xxx))" // "foo(xxx)"
1227 * q"[foo(]" // "foo("
1228 * q"/foo]/" // "foo]"
1229 * q"HERE
1230 * foo
1231 * HERE" // "foo\n"
1232 * Input:
1233 * p is on the "
1234 */
1235
1236 TOK Lexer::delimitedStringConstant(Token *t)
1237 {
1238 unsigned c;
1239 Loc start = loc();
1240 unsigned delimleft = 0;
1241 unsigned delimright = 0;
1242 unsigned nest = 1;
1243 unsigned nestcount = ~0; // dead assignment, needed to suppress warning
1244 Identifier *hereid = NULL;
1245 unsigned blankrol = 0;
1246 unsigned startline = 0;
1247
1248 p++;
1249 stringbuffer.reset();
1250 while (1)
1251 {
1252 c = *p++;
1253 //printf("c = '%c'\n", c);
1254 switch (c)
1255 {
1256 case '\n':
1257 Lnextline:
1258 endOfLine();
1259 startline = 1;
1260 if (blankrol)
1261 { blankrol = 0;
1262 continue;
1263 }
1264 if (hereid)
1265 {
1266 stringbuffer.writeUTF8(c);
1267 continue;
1268 }
1269 break;
1270
1271 case '\r':
1272 if (*p == '\n')
1273 continue; // ignore
1274 c = '\n'; // treat EndOfLine as \n character
1275 goto Lnextline;
1276
1277 case 0:
1278 case 0x1A:
1279 error("unterminated delimited string constant starting at %s", start.toChars());
1280 t->ustring = (utf8_t *)const_cast<char *>("");
1281 t->len = 0;
1282 t->postfix = 0;
1283 return TOKstring;
1284
1285 default:
1286 if (c & 0x80)
1287 { p--;
1288 c = decodeUTF();
1289 p++;
1290 if (c == PS || c == LS)
1291 goto Lnextline;
1292 }
1293 break;
1294 }
1295 if (delimleft == 0)
1296 { delimleft = c;
1297 nest = 1;
1298 nestcount = 1;
1299 if (c == '(')
1300 delimright = ')';
1301 else if (c == '{')
1302 delimright = '}';
1303 else if (c == '[')
1304 delimright = ']';
1305 else if (c == '<')
1306 delimright = '>';
1307 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
1308 { // Start of identifier; must be a heredoc
1309 Token tok;
1310 p--;
1311 scan(&tok); // read in heredoc identifier
1312 if (tok.value != TOKidentifier)
1313 { error("identifier expected for heredoc, not %s", tok.toChars());
1314 delimright = c;
1315 }
1316 else
1317 { hereid = tok.ident;
1318 //printf("hereid = '%s'\n", hereid->toChars());
1319 blankrol = 1;
1320 }
1321 nest = 0;
1322 }
1323 else
1324 { delimright = c;
1325 nest = 0;
1326 if (isspace(c))
1327 error("delimiter cannot be whitespace");
1328 }
1329 }
1330 else
1331 {
1332 if (blankrol)
1333 { error("heredoc rest of line should be blank");
1334 blankrol = 0;
1335 continue;
1336 }
1337 if (nest == 1)
1338 {
1339 if (c == delimleft)
1340 nestcount++;
1341 else if (c == delimright)
1342 { nestcount--;
1343 if (nestcount == 0)
1344 goto Ldone;
1345 }
1346 }
1347 else if (c == delimright)
1348 goto Ldone;
1349 if (startline && isalpha(c) && hereid)
1350 { Token tok;
1351 const utf8_t *psave = p;
1352 p--;
1353 scan(&tok); // read in possible heredoc identifier
1354 //printf("endid = '%s'\n", tok.ident->toChars());
1355 if (tok.value == TOKidentifier && tok.ident->equals(hereid))
1356 { /* should check that rest of line is blank
1357 */
1358 goto Ldone;
1359 }
1360 p = psave;
1361 }
1362 stringbuffer.writeUTF8(c);
1363 startline = 0;
1364 }
1365 }
1366
1367 Ldone:
1368 if (*p == '"')
1369 p++;
1370 else if (hereid)
1371 error("delimited string must end in %s\"", hereid->toChars());
1372 else
1373 error("delimited string must end in %c\"", delimright);
1374 t->len = (unsigned)stringbuffer.offset;
1375 stringbuffer.writeByte(0);
1376 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
1377 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1378 stringPostfix(t);
1379 return TOKstring;
1380 }
1381
1382 /**************************************
1383 * Lex delimited strings:
1384 * q{ foo(xxx) } // " foo(xxx) "
1385 * q{foo(} // "foo("
1386 * q{{foo}"}"} // "{foo}"}""
1387 * Input:
1388 * p is on the q
1389 */
1390
1391 TOK Lexer::tokenStringConstant(Token *t)
1392 {
1393 unsigned nest = 1;
1394 Loc start = loc();
1395 const utf8_t *pstart = ++p;
1396
1397 while (1)
1398 { Token tok;
1399
1400 scan(&tok);
1401 switch (tok.value)
1402 {
1403 case TOKlcurly:
1404 nest++;
1405 continue;
1406
1407 case TOKrcurly:
1408 if (--nest == 0)
1409 {
1410 t->len = (unsigned)(p - 1 - pstart);
1411 t->ustring = (utf8_t *)mem.xmalloc(t->len + 1);
1412 memcpy(t->ustring, pstart, t->len);
1413 t->ustring[t->len] = 0;
1414 stringPostfix(t);
1415 return TOKstring;
1416 }
1417 continue;
1418
1419 case TOKeof:
1420 error("unterminated token string constant starting at %s", start.toChars());
1421 t->ustring = (utf8_t *)const_cast<char *>("");
1422 t->len = 0;
1423 t->postfix = 0;
1424 return TOKstring;
1425
1426 default:
1427 continue;
1428 }
1429 }
1430 }
1431
1432
1433
1434 /**************************************
1435 */
1436
1437 TOK Lexer::escapeStringConstant(Token *t)
1438 {
1439 unsigned c;
1440 Loc start = loc();
1441
1442 p++;
1443 stringbuffer.reset();
1444 while (1)
1445 {
1446 c = *p++;
1447 switch (c)
1448 {
1449 case '\\':
1450 switch (*p)
1451 {
1452 case 'u':
1453 case 'U':
1454 case '&':
1455 c = escapeSequence();
1456 stringbuffer.writeUTF8(c);
1457 continue;
1458
1459 default:
1460 c = escapeSequence();
1461 break;
1462 }
1463 break;
1464 case '\n':
1465 endOfLine();
1466 break;
1467
1468 case '\r':
1469 if (*p == '\n')
1470 continue; // ignore
1471 c = '\n'; // treat EndOfLine as \n character
1472 endOfLine();
1473 break;
1474
1475 case '"':
1476 t->len = (unsigned)stringbuffer.offset;
1477 stringbuffer.writeByte(0);
1478 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
1479 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
1480 stringPostfix(t);
1481 return TOKstring;
1482
1483 case 0:
1484 case 0x1A:
1485 p--;
1486 error("unterminated string constant starting at %s", start.toChars());
1487 t->ustring = (utf8_t *)const_cast<char *>("");
1488 t->len = 0;
1489 t->postfix = 0;
1490 return TOKstring;
1491
1492 default:
1493 if (c & 0x80)
1494 {
1495 p--;
1496 c = decodeUTF();
1497 if (c == LS || c == PS)
1498 { c = '\n';
1499 endOfLine();
1500 }
1501 p++;
1502 stringbuffer.writeUTF8(c);
1503 continue;
1504 }
1505 break;
1506 }
1507 stringbuffer.writeByte(c);
1508 }
1509 }
1510
1511 /**************************************
1512 */
1513
1514 TOK Lexer::charConstant(Token *t)
1515 {
1516 unsigned c;
1517 TOK tk = TOKcharv;
1518
1519 //printf("Lexer::charConstant\n");
1520 p++;
1521 c = *p++;
1522 switch (c)
1523 {
1524 case '\\':
1525 switch (*p)
1526 {
1527 case 'u':
1528 t->uns64value = escapeSequence();
1529 tk = TOKwcharv;
1530 break;
1531
1532 case 'U':
1533 case '&':
1534 t->uns64value = escapeSequence();
1535 tk = TOKdcharv;
1536 break;
1537
1538 default:
1539 t->uns64value = escapeSequence();
1540 break;
1541 }
1542 break;
1543 case '\n':
1544 L1:
1545 endOfLine();
1546 /* fall through */
1547 case '\r':
1548 case 0:
1549 case 0x1A:
1550 case '\'':
1551 error("unterminated character constant");
1552 t->uns64value = '?';
1553 return tk;
1554
1555 default:
1556 if (c & 0x80)
1557 {
1558 p--;
1559 c = decodeUTF();
1560 p++;
1561 if (c == LS || c == PS)
1562 goto L1;
1563 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
1564 tk = TOKwcharv;
1565 else
1566 tk = TOKdcharv;
1567 }
1568 t->uns64value = c;
1569 break;
1570 }
1571
1572 if (*p != '\'')
1573 {
1574 error("unterminated character constant");
1575 t->uns64value = '?';
1576 return tk;
1577 }
1578 p++;
1579 return tk;
1580 }
1581
1582 /***************************************
1583 * Get postfix of string literal.
1584 */
1585
1586 void Lexer::stringPostfix(Token *t)
1587 {
1588 switch (*p)
1589 {
1590 case 'c':
1591 case 'w':
1592 case 'd':
1593 t->postfix = *p;
1594 p++;
1595 break;
1596
1597 default:
1598 t->postfix = 0;
1599 break;
1600 }
1601 }
1602
1603 /**************************************
1604 * Read in a number.
1605 * If it's an integer, store it in tok.TKutok.Vlong.
1606 * integers can be decimal, octal or hex
1607 * Handle the suffixes U, UL, LU, L, etc.
1608 * If it's double, store it in tok.TKutok.Vdouble.
1609 * Returns:
1610 * TKnum
1611 * TKdouble,...
1612 */
1613
1614 TOK Lexer::number(Token *t)
1615 {
1616 int base = 10;
1617 const utf8_t *start = p;
1618 unsigned c;
1619 uinteger_t n = 0; // unsigned >=64 bit integer type
1620 int d;
1621 bool err = false;
1622 bool overflow = false;
1623
1624 c = *p;
1625 if (c == '0')
1626 {
1627 ++p;
1628 c = *p;
1629 switch (c)
1630 {
1631 case '0': case '1': case '2': case '3':
1632 case '4': case '5': case '6': case '7':
1633 n = c - '0';
1634 ++p;
1635 base = 8;
1636 break;
1637
1638 case 'x':
1639 case 'X':
1640 ++p;
1641 base = 16;
1642 break;
1643
1644 case 'b':
1645 case 'B':
1646 ++p;
1647 base = 2;
1648 break;
1649
1650 case '.':
1651 if (p[1] == '.')
1652 goto Ldone; // if ".."
1653 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
1654 goto Ldone; // if ".identifier" or ".unicode"
1655 goto Lreal; // '.' is part of current token
1656
1657 case 'i':
1658 case 'f':
1659 case 'F':
1660 goto Lreal;
1661
1662 case '_':
1663 ++p;
1664 base = 8;
1665 break;
1666
1667 case 'L':
1668 if (p[1] == 'i')
1669 goto Lreal;
1670 break;
1671
1672 default:
1673 break;
1674 }
1675 }
1676
1677 while (1)
1678 {
1679 c = *p;
1680 switch (c)
1681 {
1682 case '0': case '1':
1683 ++p;
1684 d = c - '0';
1685 break;
1686
1687 case '2': case '3':
1688 case '4': case '5': case '6': case '7':
1689 if (base == 2 && !err)
1690 {
1691 error("binary digit expected");
1692 err = true;
1693 }
1694 ++p;
1695 d = c - '0';
1696 break;
1697
1698 case '8': case '9':
1699 ++p;
1700 if (base < 10 && !err)
1701 {
1702 error("radix %d digit expected, not '%c'", base, c);
1703 err = true;
1704 }
1705 d = c - '0';
1706 break;
1707
1708 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1709 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1710 ++p;
1711 if (base != 16)
1712 {
1713 if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
1714 goto Lreal;
1715 if (!err)
1716 {
1717 error("radix %d digit expected, not '%c'", base, c);
1718 err = true;
1719 }
1720 }
1721 if (c >= 'a')
1722 d = c + 10 - 'a';
1723 else
1724 d = c + 10 - 'A';
1725 break;
1726
1727 case 'L':
1728 if (p[1] == 'i')
1729 goto Lreal;
1730 goto Ldone;
1731
1732 case '.':
1733 if (p[1] == '.')
1734 goto Ldone; // if ".."
1735 if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
1736 goto Ldone; // if ".identifier" or ".unicode"
1737 goto Lreal; // otherwise as part of a floating point literal
1738
1739 case 'p':
1740 case 'P':
1741 case 'i':
1742 Lreal:
1743 p = start;
1744 return inreal(t);
1745
1746 case '_':
1747 ++p;
1748 continue;
1749
1750 default:
1751 goto Ldone;
1752 }
1753
1754 uinteger_t n2 = n * base;
1755 if ((n2 / base != n || n2 + d < n))
1756 {
1757 overflow = true;
1758 }
1759 n = n2 + d;
1760
1761 // if n needs more than 64 bits
1762 if (sizeof(n) > 8 &&
1763 n > 0xFFFFFFFFFFFFFFFFULL)
1764 {
1765 overflow = true;
1766 }
1767 }
1768
1769 Ldone:
1770
1771 if (overflow && !err)
1772 {
1773 error("integer overflow");
1774 err = true;
1775 }
1776
1777 enum FLAGS
1778 {
1779 FLAGS_none = 0,
1780 FLAGS_decimal = 1, // decimal
1781 FLAGS_unsigned = 2, // u or U suffix
1782 FLAGS_long = 4, // L suffix
1783 };
1784
1785 unsigned flags = (base == 10) ? FLAGS_decimal : FLAGS_none;
1786
1787 // Parse trailing 'u', 'U', 'l' or 'L' in any combination
1788 const utf8_t *psuffix = p;
1789 while (1)
1790 {
1791 utf8_t f;
1792 switch (*p)
1793 {
1794 case 'U':
1795 case 'u':
1796 f = FLAGS_unsigned;
1797 goto L1;
1798
1799 case 'l':
1800 f = FLAGS_long;
1801 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
1802 goto L1;
1803
1804 case 'L':
1805 f = FLAGS_long;
1806 L1:
1807 p++;
1808 if ((flags & f) && !err)
1809 {
1810 error("unrecognized token");
1811 err = true;
1812 }
1813 flags = (FLAGS) (flags | f);
1814 continue;
1815 default:
1816 break;
1817 }
1818 break;
1819 }
1820
1821 if (base == 8 && n >= 8)
1822 error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead",
1823 n, p - psuffix, psuffix, n, p - psuffix, psuffix);
1824
1825 TOK result;
1826 switch (flags)
1827 {
1828 case FLAGS_none:
1829 /* Octal or Hexadecimal constant.
1830 * First that fits: int, uint, long, ulong
1831 */
1832 if (n & 0x8000000000000000LL)
1833 result = TOKuns64v;
1834 else if (n & 0xFFFFFFFF00000000LL)
1835 result = TOKint64v;
1836 else if (n & 0x80000000)
1837 result = TOKuns32v;
1838 else
1839 result = TOKint32v;
1840 break;
1841
1842 case FLAGS_decimal:
1843 /* First that fits: int, long, long long
1844 */
1845 if (n & 0x8000000000000000LL)
1846 {
1847 if (!err)
1848 {
1849 error("signed integer overflow");
1850 err = true;
1851 }
1852 result = TOKuns64v;
1853 }
1854 else if (n & 0xFFFFFFFF80000000LL)
1855 result = TOKint64v;
1856 else
1857 result = TOKint32v;
1858 break;
1859
1860 case FLAGS_unsigned:
1861 case FLAGS_decimal | FLAGS_unsigned:
1862 /* First that fits: uint, ulong
1863 */
1864 if (n & 0xFFFFFFFF00000000LL)
1865 result = TOKuns64v;
1866 else
1867 result = TOKuns32v;
1868 break;
1869
1870 case FLAGS_decimal | FLAGS_long:
1871 if (n & 0x8000000000000000LL)
1872 {
1873 if (!err)
1874 {
1875 error("signed integer overflow");
1876 err = true;
1877 }
1878 result = TOKuns64v;
1879 }
1880 else
1881 result = TOKint64v;
1882 break;
1883
1884 case FLAGS_long:
1885 if (n & 0x8000000000000000LL)
1886 result = TOKuns64v;
1887 else
1888 result = TOKint64v;
1889 break;
1890
1891 case FLAGS_unsigned | FLAGS_long:
1892 case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
1893 result = TOKuns64v;
1894 break;
1895
1896 default:
1897 assert(0);
1898 }
1899 t->uns64value = n;
1900 return result;
1901 }
1902
1903 /**************************************
1904 * Read in characters, converting them to real.
1905 * Bugs:
1906 * Exponent overflow not detected.
1907 * Too much requested precision is not detected.
1908 */
1909
1910 TOK Lexer::inreal(Token *t)
1911 {
1912 //printf("Lexer::inreal()\n");
1913 bool isWellformedString = true;
1914 stringbuffer.reset();
1915 const utf8_t *pstart = p;
1916 char hex = 0;
1917 unsigned c = *p++;
1918
1919 // Leading '0x'
1920 if (c == '0')
1921 {
1922 c = *p++;
1923 if (c == 'x' || c == 'X')
1924 {
1925 hex = true;
1926 c = *p++;
1927 }
1928 }
1929
1930 // Digits to left of '.'
1931 while (1)
1932 {
1933 if (c == '.')
1934 {
1935 c = *p++;
1936 break;
1937 }
1938 if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
1939 {
1940 c = *p++;
1941 continue;
1942 }
1943 break;
1944 }
1945
1946 // Digits to right of '.'
1947 while (1)
1948 {
1949 if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
1950 {
1951 c = *p++;
1952 continue;
1953 }
1954 break;
1955 }
1956
1957 if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
1958 {
1959 c = *p++;
1960 if (c == '-' || c == '+')
1961 {
1962 c = *p++;
1963 }
1964 bool anyexp = false;
1965 while (1)
1966 {
1967 if (isdigit(c))
1968 {
1969 anyexp = true;
1970 c = *p++;
1971 continue;
1972 }
1973 if (c == '_')
1974 {
1975 c = *p++;
1976 continue;
1977 }
1978 if (!anyexp)
1979 {
1980 error("missing exponent");
1981 isWellformedString = false;
1982 }
1983 break;
1984 }
1985 }
1986 else if (hex)
1987 {
1988 error("exponent required for hex float");
1989 isWellformedString = false;
1990 }
1991 --p;
1992 while (pstart < p)
1993 {
1994 if (*pstart != '_')
1995 stringbuffer.writeByte(*pstart);
1996 ++pstart;
1997 }
1998
1999 stringbuffer.writeByte(0);
2000 const char *sbufptr = (char *)stringbuffer.data;
2001 TOK result;
2002 bool isOutOfRange = false;
2003 t->floatvalue = (isWellformedString ? CTFloat::parse(sbufptr, &isOutOfRange) : CTFloat::zero);
2004 errno = 0;
2005 switch (*p)
2006 {
2007 case 'F':
2008 case 'f':
2009 if (isWellformedString && !isOutOfRange)
2010 isOutOfRange = Port::isFloat32LiteralOutOfRange(sbufptr);
2011 result = TOKfloat32v;
2012 p++;
2013 break;
2014
2015 default:
2016 if (isWellformedString && !isOutOfRange)
2017 isOutOfRange = Port::isFloat64LiteralOutOfRange(sbufptr);
2018 result = TOKfloat64v;
2019 break;
2020
2021 case 'l':
2022 error("use 'L' suffix instead of 'l'");
2023 /* fall through */
2024 case 'L':
2025 result = TOKfloat80v;
2026 p++;
2027 break;
2028 }
2029 if (*p == 'i' || *p == 'I')
2030 {
2031 if (*p == 'I')
2032 error("use 'i' suffix instead of 'I'");
2033 p++;
2034 switch (result)
2035 {
2036 case TOKfloat32v:
2037 result = TOKimaginary32v;
2038 break;
2039 case TOKfloat64v:
2040 result = TOKimaginary64v;
2041 break;
2042 case TOKfloat80v:
2043 result = TOKimaginary80v;
2044 break;
2045 default: break;
2046 }
2047 }
2048 const bool isLong = (result == TOKfloat80v || result == TOKimaginary80v);
2049 if (isOutOfRange && !isLong)
2050 {
2051 const char *suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : "";
2052 error(scanloc, "number '%s%s' is not representable", (char *)stringbuffer.data, suffix);
2053 }
2054 return result;
2055 }
2056
2057 /*********************************************
2058 * parse:
2059 * #line linnum [filespec]
2060 * also allow __LINE__ for linnum, and __FILE__ for filespec
2061 */
2062
2063 void Lexer::poundLine()
2064 {
2065 Token tok;
2066 int linnum = this->scanloc.linnum;
2067 char *filespec = NULL;
2068 Loc loc = this->loc();
2069
2070 scan(&tok);
2071 if (tok.value == TOKint32v || tok.value == TOKint64v)
2072 {
2073 int lin = (int)(tok.uns64value - 1);
2074 if ((unsigned)lin != tok.uns64value - 1)
2075 error("line number %lld out of range", (unsigned long long)tok.uns64value);
2076 else
2077 linnum = lin;
2078 }
2079 else if (tok.value == TOKline)
2080 {
2081 }
2082 else
2083 goto Lerr;
2084
2085 while (1)
2086 {
2087 switch (*p)
2088 {
2089 case 0:
2090 case 0x1A:
2091 case '\n':
2092 Lnewline:
2093 this->scanloc.linnum = linnum;
2094 if (filespec)
2095 this->scanloc.filename = filespec;
2096 return;
2097
2098 case '\r':
2099 p++;
2100 if (*p != '\n')
2101 { p--;
2102 goto Lnewline;
2103 }
2104 continue;
2105
2106 case ' ':
2107 case '\t':
2108 case '\v':
2109 case '\f':
2110 p++;
2111 continue; // skip white space
2112
2113 case '_':
2114 if (memcmp(p, "__FILE__", 8) == 0)
2115 {
2116 p += 8;
2117 filespec = mem.xstrdup(scanloc.filename);
2118 continue;
2119 }
2120 goto Lerr;
2121
2122 case '"':
2123 if (filespec)
2124 goto Lerr;
2125 stringbuffer.reset();
2126 p++;
2127 while (1)
2128 { unsigned c;
2129
2130 c = *p;
2131 switch (c)
2132 {
2133 case '\n':
2134 case '\r':
2135 case 0:
2136 case 0x1A:
2137 goto Lerr;
2138
2139 case '"':
2140 stringbuffer.writeByte(0);
2141 filespec = mem.xstrdup((char *)stringbuffer.data);
2142 p++;
2143 break;
2144
2145 default:
2146 if (c & 0x80)
2147 { unsigned u = decodeUTF();
2148 if (u == PS || u == LS)
2149 goto Lerr;
2150 }
2151 stringbuffer.writeByte(c);
2152 p++;
2153 continue;
2154 }
2155 break;
2156 }
2157 continue;
2158
2159 default:
2160 if (*p & 0x80)
2161 { unsigned u = decodeUTF();
2162 if (u == PS || u == LS)
2163 goto Lnewline;
2164 }
2165 goto Lerr;
2166 }
2167 }
2168
2169 Lerr:
2170 error(loc, "#line integer [\"filespec\"]\\n expected");
2171 }
2172
2173
2174 /********************************************
2175 * Decode UTF character.
2176 * Issue error messages for invalid sequences.
2177 * Return decoded character, advance p to last character in UTF sequence.
2178 */
2179
2180 unsigned Lexer::decodeUTF()
2181 {
2182 dchar_t u;
2183 utf8_t c;
2184 const utf8_t *s = p;
2185 size_t len;
2186 size_t idx;
2187 const char *msg;
2188
2189 c = *s;
2190 assert(c & 0x80);
2191
2192 // Check length of remaining string up to 6 UTF-8 characters
2193 for (len = 1; len < 6 && s[len]; len++)
2194 ;
2195
2196 idx = 0;
2197 msg = utf_decodeChar(s, len, &idx, &u);
2198 p += idx - 1;
2199 if (msg)
2200 {
2201 error("%s", msg);
2202 }
2203 return u;
2204 }
2205
2206
2207 /***************************************************
2208 * Parse doc comment embedded between t->ptr and p.
2209 * Remove trailing blanks and tabs from lines.
2210 * Replace all newlines with \n.
2211 * Remove leading comment character from each line.
2212 * Decide if it's a lineComment or a blockComment.
2213 * Append to previous one for this token.
2214 */
2215
2216 void Lexer::getDocComment(Token *t, unsigned lineComment)
2217 {
2218 /* ct tells us which kind of comment it is: '/', '*', or '+'
2219 */
2220 utf8_t ct = t->ptr[2];
2221
2222 /* Start of comment text skips over / * *, / + +, or / / /
2223 */
2224 const utf8_t *q = t->ptr + 3; // start of comment text
2225
2226 const utf8_t *qend = p;
2227 if (ct == '*' || ct == '+')
2228 qend -= 2;
2229
2230 /* Scan over initial row of ****'s or ++++'s or ////'s
2231 */
2232 for (; q < qend; q++)
2233 {
2234 if (*q != ct)
2235 break;
2236 }
2237
2238 /* Remove leading spaces until start of the comment
2239 */
2240 int linestart = 0;
2241 if (ct == '/')
2242 {
2243 while (q < qend && (*q == ' ' || *q == '\t'))
2244 ++q;
2245 }
2246 else if (q < qend)
2247 {
2248 if (*q == '\r')
2249 {
2250 ++q;
2251 if (q < qend && *q == '\n')
2252 ++q;
2253 linestart = 1;
2254 }
2255 else if (*q == '\n')
2256 {
2257 ++q;
2258 linestart = 1;
2259 }
2260 }
2261
2262 /* Remove trailing row of ****'s or ++++'s
2263 */
2264 if (ct != '/')
2265 {
2266 for (; q < qend; qend--)
2267 {
2268 if (qend[-1] != ct)
2269 break;
2270 }
2271 }
2272
2273 /* Comment is now [q .. qend].
2274 * Canonicalize it into buf[].
2275 */
2276 OutBuffer buf;
2277
2278 for (; q < qend; q++)
2279 {
2280 utf8_t c = *q;
2281
2282 switch (c)
2283 {
2284 case '*':
2285 case '+':
2286 if (linestart && c == ct)
2287 { linestart = 0;
2288 /* Trim preceding whitespace up to preceding \n
2289 */
2290 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2291 buf.offset--;
2292 continue;
2293 }
2294 break;
2295
2296 case ' ':
2297 case '\t':
2298 break;
2299
2300 case '\r':
2301 if (q[1] == '\n')
2302 continue; // skip the \r
2303 goto Lnewline;
2304
2305 default:
2306 if (c == 226)
2307 {
2308 // If LS or PS
2309 if (q[1] == 128 &&
2310 (q[2] == 168 || q[2] == 169))
2311 {
2312 q += 2;
2313 goto Lnewline;
2314 }
2315 }
2316 linestart = 0;
2317 break;
2318
2319 Lnewline:
2320 c = '\n'; // replace all newlines with \n
2321 /* fall through */
2322 case '\n':
2323 linestart = 1;
2324
2325 /* Trim trailing whitespace
2326 */
2327 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2328 buf.offset--;
2329
2330 break;
2331 }
2332 buf.writeByte(c);
2333 }
2334
2335 /* Trim trailing whitespace (if the last line does not have newline)
2336 */
2337 if (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2338 {
2339 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
2340 buf.offset--;
2341 }
2342
2343 // Always end with a newline
2344 if (!buf.offset || buf.data[buf.offset - 1] != '\n')
2345 buf.writeByte('\n');
2346
2347 buf.writeByte(0);
2348
2349 // It's a line comment if the start of the doc comment comes
2350 // after other non-whitespace on the same line.
2351 const utf8_t** dc = (lineComment && anyToken)
2352 ? &t->lineComment
2353 : &t->blockComment;
2354
2355 // Combine with previous doc comment, if any
2356 if (*dc)
2357 *dc = combineComments(*dc, (utf8_t *)buf.data);
2358 else
2359 *dc = (utf8_t *)buf.extractData();
2360 }
2361
2362 /********************************************
2363 * Combine two document comments into one,
2364 * separated by a newline.
2365 */
2366
2367 const utf8_t *Lexer::combineComments(const utf8_t *c1, const utf8_t *c2)
2368 {
2369 //printf("Lexer::combineComments('%s', '%s')\n", c1, c2);
2370
2371 const utf8_t *c = c2;
2372
2373 if (c1)
2374 {
2375 c = c1;
2376 if (c2)
2377 {
2378 size_t len1 = strlen((const char *)c1);
2379 size_t len2 = strlen((const char *)c2);
2380
2381 int insertNewLine = 0;
2382 if (len1 && c1[len1 - 1] != '\n')
2383 {
2384 ++len1;
2385 insertNewLine = 1;
2386 }
2387
2388 utf8_t *p = (utf8_t *)mem.xmalloc(len1 + 1 + len2 + 1);
2389 memcpy(p, c1, len1 - insertNewLine);
2390 if (insertNewLine)
2391 p[len1 - 1] = '\n';
2392
2393 p[len1] = '\n';
2394
2395 memcpy(p + len1 + 1, c2, len2);
2396 p[len1 + 1 + len2] = 0;
2397 c = p;
2398 }
2399 }
2400 return c;
2401 }