Mercurial > hg > CbC > CbC_gcc
comparison gcc/d/dmd/lexer.c @ 145:1830386684a0
gcc-9.2.0
author | anatofuz |
---|---|
date | Thu, 13 Feb 2020 11:34:05 +0900 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
131:84e7813d76e9 | 145:1830386684a0 |
---|---|
1 | |
2 /* Compiler implementation of the D programming language | |
3 * Copyright (C) 1999-2019 by The D Language Foundation, All Rights Reserved | |
4 * written by Walter Bright | |
5 * http://www.digitalmars.com | |
6 * Distributed under the Boost Software License, Version 1.0. | |
7 * http://www.boost.org/LICENSE_1_0.txt | |
8 * https://github.com/D-Programming-Language/dmd/blob/master/src/lexer.c | |
9 */ | |
10 | |
11 /* Lexical Analyzer */ | |
12 | |
13 #include "root/dsystem.h" // for time() and ctime() | |
14 #include "root/rmem.h" | |
15 | |
16 #include "mars.h" | |
17 #include "lexer.h" | |
18 #include "utf.h" | |
19 #include "identifier.h" | |
20 #include "id.h" | |
21 | |
22 extern int HtmlNamedEntity(const utf8_t *p, size_t length); | |
23 | |
24 #define LS 0x2028 // UTF line separator | |
25 #define PS 0x2029 // UTF paragraph separator | |
26 | |
27 /******************************************** | |
28 * Do our own char maps | |
29 */ | |
30 | |
31 static unsigned char cmtable[256]; | |
32 | |
33 const int CMoctal = 0x1; | |
34 const int CMhex = 0x2; | |
35 const int CMidchar = 0x4; | |
36 | |
37 inline bool isoctal (utf8_t c) { return (cmtable[c] & CMoctal) != 0; } | |
38 inline bool ishex (utf8_t c) { return (cmtable[c] & CMhex) != 0; } | |
39 inline bool isidchar(utf8_t c) { return (cmtable[c] & CMidchar) != 0; } | |
40 | |
41 struct CMTableInitializer | |
42 { | |
43 CMTableInitializer(); | |
44 }; | |
45 | |
46 static CMTableInitializer cmtableinitializer; | |
47 | |
48 CMTableInitializer::CMTableInitializer() | |
49 { | |
50 for (unsigned c = 0; c < 256; c++) | |
51 { | |
52 if ('0' <= c && c <= '7') | |
53 cmtable[c] |= CMoctal; | |
54 if (isxdigit(c)) | |
55 cmtable[c] |= CMhex; | |
56 if (isalnum(c) || c == '_') | |
57 cmtable[c] |= CMidchar; | |
58 } | |
59 } | |
60 | |
61 /*************************** Lexer ********************************************/ | |
62 | |
63 OutBuffer Lexer::stringbuffer; | |
64 | |
65 Lexer::Lexer(const char *filename, | |
66 const utf8_t *base, size_t begoffset, size_t endoffset, | |
67 bool doDocComment, bool commentToken) | |
68 { | |
69 scanloc = Loc(filename, 1, 1); | |
70 //printf("Lexer::Lexer(%p,%d)\n",base,length); | |
71 //printf("lexer.filename = %s\n", filename); | |
72 this->token = Token(); | |
73 this->token.ptr = NULL; | |
74 this->token.value = TOKreserved; | |
75 this->token.blockComment = NULL; | |
76 this->token.lineComment = NULL; | |
77 this->base = base; | |
78 this->end = base + endoffset; | |
79 p = base + begoffset; | |
80 line = p; | |
81 this->doDocComment = doDocComment; | |
82 this->anyToken = 0; | |
83 this->commentToken = commentToken; | |
84 this->errors = false; | |
85 //initKeywords(); | |
86 | |
87 /* If first line starts with '#!', ignore the line | |
88 */ | |
89 | |
90 if (p[0] == '#' && p[1] =='!') | |
91 { | |
92 p += 2; | |
93 while (1) | |
94 { | |
95 utf8_t c = *p++; | |
96 switch (c) | |
97 { | |
98 case 0: | |
99 case 0x1A: | |
100 p--; | |
101 /* fall through */ | |
102 | |
103 case '\n': | |
104 break; | |
105 | |
106 default: | |
107 continue; | |
108 } | |
109 break; | |
110 } | |
111 endOfLine(); | |
112 } | |
113 } | |
114 | |
115 | |
116 void Lexer::endOfLine() | |
117 { | |
118 scanloc.linnum++; | |
119 line = p; | |
120 } | |
121 | |
122 | |
123 void Lexer::error(const char *format, ...) | |
124 { | |
125 va_list ap; | |
126 va_start(ap, format); | |
127 ::verror(token.loc, format, ap); | |
128 va_end(ap); | |
129 errors = true; | |
130 } | |
131 | |
132 void Lexer::error(Loc loc, const char *format, ...) | |
133 { | |
134 va_list ap; | |
135 va_start(ap, format); | |
136 ::verror(loc, format, ap); | |
137 va_end(ap); | |
138 errors = true; | |
139 } | |
140 | |
141 void Lexer::deprecation(const char *format, ...) | |
142 { | |
143 va_list ap; | |
144 va_start(ap, format); | |
145 ::vdeprecation(token.loc, format, ap); | |
146 va_end(ap); | |
147 if (global.params.useDeprecated == DIAGNOSTICerror) | |
148 errors = true; | |
149 } | |
150 | |
151 TOK Lexer::nextToken() | |
152 { | |
153 if (token.next) | |
154 { | |
155 Token *t = token.next; | |
156 memcpy(&token,t,sizeof(Token)); | |
157 t->free(); | |
158 } | |
159 else | |
160 { | |
161 scan(&token); | |
162 } | |
163 //token.print(); | |
164 return token.value; | |
165 } | |
166 | |
167 Token *Lexer::peek(Token *ct) | |
168 { | |
169 Token *t; | |
170 if (ct->next) | |
171 t = ct->next; | |
172 else | |
173 { | |
174 t = Token::alloc(); | |
175 scan(t); | |
176 ct->next = t; | |
177 } | |
178 return t; | |
179 } | |
180 | |
181 /*********************** | |
182 * Look ahead at next token's value. | |
183 */ | |
184 | |
185 TOK Lexer::peekNext() | |
186 { | |
187 return peek(&token)->value; | |
188 } | |
189 | |
190 /*********************** | |
191 * Look 2 tokens ahead at value. | |
192 */ | |
193 | |
194 TOK Lexer::peekNext2() | |
195 { | |
196 Token *t = peek(&token); | |
197 return peek(t)->value; | |
198 } | |
199 | |
200 /********************************* | |
201 * tk is on the opening (. | |
202 * Look ahead and return token that is past the closing ). | |
203 */ | |
204 | |
205 Token *Lexer::peekPastParen(Token *tk) | |
206 { | |
207 //printf("peekPastParen()\n"); | |
208 int parens = 1; | |
209 int curlynest = 0; | |
210 while (1) | |
211 { | |
212 tk = peek(tk); | |
213 //tk->print(); | |
214 switch (tk->value) | |
215 { | |
216 case TOKlparen: | |
217 parens++; | |
218 continue; | |
219 | |
220 case TOKrparen: | |
221 --parens; | |
222 if (parens) | |
223 continue; | |
224 tk = peek(tk); | |
225 break; | |
226 | |
227 case TOKlcurly: | |
228 curlynest++; | |
229 continue; | |
230 | |
231 case TOKrcurly: | |
232 if (--curlynest >= 0) | |
233 continue; | |
234 break; | |
235 | |
236 case TOKsemicolon: | |
237 if (curlynest) | |
238 continue; | |
239 break; | |
240 | |
241 case TOKeof: | |
242 break; | |
243 | |
244 default: | |
245 continue; | |
246 } | |
247 return tk; | |
248 } | |
249 } | |
250 | |
251 /**************************** | |
252 * Turn next token in buffer into a token. | |
253 */ | |
254 | |
255 void Lexer::scan(Token *t) | |
256 { | |
257 unsigned lastLine = scanloc.linnum; | |
258 Loc startLoc; | |
259 | |
260 t->blockComment = NULL; | |
261 t->lineComment = NULL; | |
262 while (1) | |
263 { | |
264 t->ptr = p; | |
265 //printf("p = %p, *p = '%c'\n",p,*p); | |
266 t->loc = loc(); | |
267 switch (*p) | |
268 { | |
269 case 0: | |
270 case 0x1A: | |
271 t->value = TOKeof; // end of file | |
272 return; | |
273 | |
274 case ' ': | |
275 case '\t': | |
276 case '\v': | |
277 case '\f': | |
278 p++; | |
279 continue; // skip white space | |
280 | |
281 case '\r': | |
282 p++; | |
283 if (*p != '\n') // if CR stands by itself | |
284 endOfLine(); | |
285 continue; // skip white space | |
286 | |
287 case '\n': | |
288 p++; | |
289 endOfLine(); | |
290 continue; // skip white space | |
291 | |
292 case '0': case '1': case '2': case '3': case '4': | |
293 case '5': case '6': case '7': case '8': case '9': | |
294 t->value = number(t); | |
295 return; | |
296 | |
297 case '\'': | |
298 t->value = charConstant(t); | |
299 return; | |
300 | |
301 case 'r': | |
302 if (p[1] != '"') | |
303 goto case_ident; | |
304 p++; | |
305 /* fall through */ | |
306 case '`': | |
307 t->value = wysiwygStringConstant(t, *p); | |
308 return; | |
309 | |
310 case 'x': | |
311 if (p[1] != '"') | |
312 goto case_ident; | |
313 p++; | |
314 t->value = hexStringConstant(t); | |
315 return; | |
316 | |
317 case 'q': | |
318 if (p[1] == '"') | |
319 { | |
320 p++; | |
321 t->value = delimitedStringConstant(t); | |
322 return; | |
323 } | |
324 else if (p[1] == '{') | |
325 { | |
326 p++; | |
327 t->value = tokenStringConstant(t); | |
328 return; | |
329 } | |
330 else | |
331 goto case_ident; | |
332 | |
333 case '"': | |
334 t->value = escapeStringConstant(t); | |
335 return; | |
336 | |
337 case 'a': case 'b': case 'c': case 'd': case 'e': | |
338 case 'f': case 'g': case 'h': case 'i': case 'j': | |
339 case 'k': case 'l': case 'm': case 'n': case 'o': | |
340 case 'p': /*case 'q': case 'r':*/ case 's': case 't': | |
341 case 'u': case 'v': case 'w': /*case 'x':*/ case 'y': | |
342 case 'z': | |
343 case 'A': case 'B': case 'C': case 'D': case 'E': | |
344 case 'F': case 'G': case 'H': case 'I': case 'J': | |
345 case 'K': case 'L': case 'M': case 'N': case 'O': | |
346 case 'P': case 'Q': case 'R': case 'S': case 'T': | |
347 case 'U': case 'V': case 'W': case 'X': case 'Y': | |
348 case 'Z': | |
349 case '_': | |
350 case_ident: | |
351 { utf8_t c; | |
352 | |
353 while (1) | |
354 { | |
355 c = *++p; | |
356 if (isidchar(c)) | |
357 continue; | |
358 else if (c & 0x80) | |
359 { const utf8_t *s = p; | |
360 unsigned u = decodeUTF(); | |
361 if (isUniAlpha(u)) | |
362 continue; | |
363 error("char 0x%04x not allowed in identifier", u); | |
364 p = s; | |
365 } | |
366 break; | |
367 } | |
368 | |
369 Identifier *id = Identifier::idPool((const char *)t->ptr, p - t->ptr); | |
370 t->ident = id; | |
371 t->value = (TOK) id->getValue(); | |
372 anyToken = 1; | |
373 if (*t->ptr == '_') // if special identifier token | |
374 { | |
375 static bool initdone = false; | |
376 static char date[11+1]; | |
377 static char time[8+1]; | |
378 static char timestamp[24+1]; | |
379 | |
380 if (!initdone) // lazy evaluation | |
381 { | |
382 initdone = true; | |
383 time_t ct; | |
384 ::time(&ct); | |
385 char *p = ctime(&ct); | |
386 assert(p); | |
387 sprintf(&date[0], "%.6s %.4s", p + 4, p + 20); | |
388 sprintf(&time[0], "%.8s", p + 11); | |
389 sprintf(×tamp[0], "%.24s", p); | |
390 } | |
391 | |
392 if (id == Id::DATE) | |
393 { | |
394 t->ustring = (utf8_t *)date; | |
395 goto Lstr; | |
396 } | |
397 else if (id == Id::TIME) | |
398 { | |
399 t->ustring = (utf8_t *)time; | |
400 goto Lstr; | |
401 } | |
402 else if (id == Id::VENDOR) | |
403 { | |
404 t->ustring = (utf8_t *)const_cast<char *>(global.vendor); | |
405 goto Lstr; | |
406 } | |
407 else if (id == Id::TIMESTAMP) | |
408 { | |
409 t->ustring = (utf8_t *)timestamp; | |
410 Lstr: | |
411 t->value = TOKstring; | |
412 t->postfix = 0; | |
413 t->len = (unsigned)strlen((char *)t->ustring); | |
414 } | |
415 else if (id == Id::VERSIONX) | |
416 { unsigned major = 0; | |
417 unsigned minor = 0; | |
418 bool point = false; | |
419 | |
420 for (const char *p = global.version + 1; 1; p++) | |
421 { | |
422 c = *p; | |
423 if (isdigit((utf8_t)c)) | |
424 minor = minor * 10 + c - '0'; | |
425 else if (c == '.') | |
426 { | |
427 if (point) | |
428 break; // ignore everything after second '.' | |
429 point = true; | |
430 major = minor; | |
431 minor = 0; | |
432 } | |
433 else | |
434 break; | |
435 } | |
436 t->value = TOKint64v; | |
437 t->uns64value = major * 1000 + minor; | |
438 } | |
439 else if (id == Id::EOFX) | |
440 { | |
441 t->value = TOKeof; | |
442 // Advance scanner to end of file | |
443 while (!(*p == 0 || *p == 0x1A)) | |
444 p++; | |
445 } | |
446 } | |
447 //printf("t->value = %d\n",t->value); | |
448 return; | |
449 } | |
450 | |
451 case '/': | |
452 p++; | |
453 switch (*p) | |
454 { | |
455 case '=': | |
456 p++; | |
457 t->value = TOKdivass; | |
458 return; | |
459 | |
460 case '*': | |
461 p++; | |
462 startLoc = loc(); | |
463 while (1) | |
464 { | |
465 while (1) | |
466 { utf8_t c = *p; | |
467 switch (c) | |
468 { | |
469 case '/': | |
470 break; | |
471 | |
472 case '\n': | |
473 endOfLine(); | |
474 p++; | |
475 continue; | |
476 | |
477 case '\r': | |
478 p++; | |
479 if (*p != '\n') | |
480 endOfLine(); | |
481 continue; | |
482 | |
483 case 0: | |
484 case 0x1A: | |
485 error("unterminated /* */ comment"); | |
486 p = end; | |
487 t->loc = loc(); | |
488 t->value = TOKeof; | |
489 return; | |
490 | |
491 default: | |
492 if (c & 0x80) | |
493 { unsigned u = decodeUTF(); | |
494 if (u == PS || u == LS) | |
495 endOfLine(); | |
496 } | |
497 p++; | |
498 continue; | |
499 } | |
500 break; | |
501 } | |
502 p++; | |
503 if (p[-2] == '*' && p - 3 != t->ptr) | |
504 break; | |
505 } | |
506 if (commentToken) | |
507 { | |
508 t->loc = startLoc; | |
509 t->value = TOKcomment; | |
510 return; | |
511 } | |
512 else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr) | |
513 { // if /** but not /**/ | |
514 getDocComment(t, lastLine == startLoc.linnum); | |
515 } | |
516 continue; | |
517 | |
518 case '/': // do // style comments | |
519 startLoc = loc(); | |
520 while (1) | |
521 { utf8_t c = *++p; | |
522 switch (c) | |
523 { | |
524 case '\n': | |
525 break; | |
526 | |
527 case '\r': | |
528 if (p[1] == '\n') | |
529 p++; | |
530 break; | |
531 | |
532 case 0: | |
533 case 0x1A: | |
534 if (commentToken) | |
535 { | |
536 p = end; | |
537 t->loc = startLoc; | |
538 t->value = TOKcomment; | |
539 return; | |
540 } | |
541 if (doDocComment && t->ptr[2] == '/') | |
542 getDocComment(t, lastLine == startLoc.linnum); | |
543 p = end; | |
544 t->loc = loc(); | |
545 t->value = TOKeof; | |
546 return; | |
547 | |
548 default: | |
549 if (c & 0x80) | |
550 { unsigned u = decodeUTF(); | |
551 if (u == PS || u == LS) | |
552 break; | |
553 } | |
554 continue; | |
555 } | |
556 break; | |
557 } | |
558 | |
559 if (commentToken) | |
560 { | |
561 p++; | |
562 endOfLine(); | |
563 t->loc = startLoc; | |
564 t->value = TOKcomment; | |
565 return; | |
566 } | |
567 if (doDocComment && t->ptr[2] == '/') | |
568 getDocComment(t, lastLine == startLoc.linnum); | |
569 | |
570 p++; | |
571 endOfLine(); | |
572 continue; | |
573 | |
574 case '+': | |
575 { int nest; | |
576 | |
577 startLoc = loc(); | |
578 p++; | |
579 nest = 1; | |
580 while (1) | |
581 { utf8_t c = *p; | |
582 switch (c) | |
583 { | |
584 case '/': | |
585 p++; | |
586 if (*p == '+') | |
587 { | |
588 p++; | |
589 nest++; | |
590 } | |
591 continue; | |
592 | |
593 case '+': | |
594 p++; | |
595 if (*p == '/') | |
596 { | |
597 p++; | |
598 if (--nest == 0) | |
599 break; | |
600 } | |
601 continue; | |
602 | |
603 case '\r': | |
604 p++; | |
605 if (*p != '\n') | |
606 endOfLine(); | |
607 continue; | |
608 | |
609 case '\n': | |
610 endOfLine(); | |
611 p++; | |
612 continue; | |
613 | |
614 case 0: | |
615 case 0x1A: | |
616 error("unterminated /+ +/ comment"); | |
617 p = end; | |
618 t->loc = loc(); | |
619 t->value = TOKeof; | |
620 return; | |
621 | |
622 default: | |
623 if (c & 0x80) | |
624 { unsigned u = decodeUTF(); | |
625 if (u == PS || u == LS) | |
626 endOfLine(); | |
627 } | |
628 p++; | |
629 continue; | |
630 } | |
631 break; | |
632 } | |
633 if (commentToken) | |
634 { | |
635 t->loc = startLoc; | |
636 t->value = TOKcomment; | |
637 return; | |
638 } | |
639 if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr) | |
640 { // if /++ but not /++/ | |
641 getDocComment(t, lastLine == startLoc.linnum); | |
642 } | |
643 continue; | |
644 } | |
645 default: | |
646 break; | |
647 } | |
648 t->value = TOKdiv; | |
649 return; | |
650 | |
651 case '.': | |
652 p++; | |
653 if (isdigit(*p)) | |
654 { /* Note that we don't allow ._1 and ._ as being | |
655 * valid floating point numbers. | |
656 */ | |
657 p--; | |
658 t->value = inreal(t); | |
659 } | |
660 else if (p[0] == '.') | |
661 { | |
662 if (p[1] == '.') | |
663 { p += 2; | |
664 t->value = TOKdotdotdot; | |
665 } | |
666 else | |
667 { p++; | |
668 t->value = TOKslice; | |
669 } | |
670 } | |
671 else | |
672 t->value = TOKdot; | |
673 return; | |
674 | |
675 case '&': | |
676 p++; | |
677 if (*p == '=') | |
678 { p++; | |
679 t->value = TOKandass; | |
680 } | |
681 else if (*p == '&') | |
682 { p++; | |
683 t->value = TOKandand; | |
684 } | |
685 else | |
686 t->value = TOKand; | |
687 return; | |
688 | |
689 case '|': | |
690 p++; | |
691 if (*p == '=') | |
692 { p++; | |
693 t->value = TOKorass; | |
694 } | |
695 else if (*p == '|') | |
696 { p++; | |
697 t->value = TOKoror; | |
698 } | |
699 else | |
700 t->value = TOKor; | |
701 return; | |
702 | |
703 case '-': | |
704 p++; | |
705 if (*p == '=') | |
706 { p++; | |
707 t->value = TOKminass; | |
708 } | |
709 else if (*p == '-') | |
710 { p++; | |
711 t->value = TOKminusminus; | |
712 } | |
713 else | |
714 t->value = TOKmin; | |
715 return; | |
716 | |
717 case '+': | |
718 p++; | |
719 if (*p == '=') | |
720 { p++; | |
721 t->value = TOKaddass; | |
722 } | |
723 else if (*p == '+') | |
724 { p++; | |
725 t->value = TOKplusplus; | |
726 } | |
727 else | |
728 t->value = TOKadd; | |
729 return; | |
730 | |
731 case '<': | |
732 p++; | |
733 if (*p == '=') | |
734 { p++; | |
735 t->value = TOKle; // <= | |
736 } | |
737 else if (*p == '<') | |
738 { p++; | |
739 if (*p == '=') | |
740 { p++; | |
741 t->value = TOKshlass; // <<= | |
742 } | |
743 else | |
744 t->value = TOKshl; // << | |
745 } | |
746 else if (*p == '>') | |
747 { p++; | |
748 if (*p == '=') | |
749 { p++; | |
750 t->value = TOKleg; // <>= | |
751 } | |
752 else | |
753 t->value = TOKlg; // <> | |
754 } | |
755 else | |
756 t->value = TOKlt; // < | |
757 return; | |
758 | |
759 case '>': | |
760 p++; | |
761 if (*p == '=') | |
762 { p++; | |
763 t->value = TOKge; // >= | |
764 } | |
765 else if (*p == '>') | |
766 { p++; | |
767 if (*p == '=') | |
768 { p++; | |
769 t->value = TOKshrass; // >>= | |
770 } | |
771 else if (*p == '>') | |
772 { p++; | |
773 if (*p == '=') | |
774 { p++; | |
775 t->value = TOKushrass; // >>>= | |
776 } | |
777 else | |
778 t->value = TOKushr; // >>> | |
779 } | |
780 else | |
781 t->value = TOKshr; // >> | |
782 } | |
783 else | |
784 t->value = TOKgt; // > | |
785 return; | |
786 | |
787 case '!': | |
788 p++; | |
789 if (*p == '=') | |
790 { p++; | |
791 t->value = TOKnotequal; // != | |
792 } | |
793 else if (*p == '<') | |
794 { p++; | |
795 if (*p == '>') | |
796 { p++; | |
797 if (*p == '=') | |
798 { p++; | |
799 t->value = TOKunord; // !<>= | |
800 } | |
801 else | |
802 t->value = TOKue; // !<> | |
803 } | |
804 else if (*p == '=') | |
805 { p++; | |
806 t->value = TOKug; // !<= | |
807 } | |
808 else | |
809 t->value = TOKuge; // !< | |
810 } | |
811 else if (*p == '>') | |
812 { p++; | |
813 if (*p == '=') | |
814 { p++; | |
815 t->value = TOKul; // !>= | |
816 } | |
817 else | |
818 t->value = TOKule; // !> | |
819 } | |
820 else | |
821 t->value = TOKnot; // ! | |
822 return; | |
823 | |
824 case '=': | |
825 p++; | |
826 if (*p == '=') | |
827 { p++; | |
828 t->value = TOKequal; // == | |
829 } | |
830 else if (*p == '>') | |
831 { p++; | |
832 t->value = TOKgoesto; // => | |
833 } | |
834 else | |
835 t->value = TOKassign; // = | |
836 return; | |
837 | |
838 case '~': | |
839 p++; | |
840 if (*p == '=') | |
841 { p++; | |
842 t->value = TOKcatass; // ~= | |
843 } | |
844 else | |
845 t->value = TOKtilde; // ~ | |
846 return; | |
847 | |
848 case '^': | |
849 p++; | |
850 if (*p == '^') | |
851 { p++; | |
852 if (*p == '=') | |
853 { p++; | |
854 t->value = TOKpowass; // ^^= | |
855 } | |
856 else | |
857 t->value = TOKpow; // ^^ | |
858 } | |
859 else if (*p == '=') | |
860 { p++; | |
861 t->value = TOKxorass; // ^= | |
862 } | |
863 else | |
864 t->value = TOKxor; // ^ | |
865 return; | |
866 | |
867 case '(': p++; t->value = TOKlparen; return; | |
868 case ')': p++; t->value = TOKrparen; return; | |
869 case '[': p++; t->value = TOKlbracket; return; | |
870 case ']': p++; t->value = TOKrbracket; return; | |
871 case '{': p++; t->value = TOKlcurly; return; | |
872 case '}': p++; t->value = TOKrcurly; return; | |
873 case '?': p++; t->value = TOKquestion; return; | |
874 case ',': p++; t->value = TOKcomma; return; | |
875 case ';': p++; t->value = TOKsemicolon; return; | |
876 case ':': p++; t->value = TOKcolon; return; | |
877 case '$': p++; t->value = TOKdollar; return; | |
878 case '@': p++; t->value = TOKat; return; | |
879 | |
880 case '*': | |
881 p++; | |
882 if (*p == '=') | |
883 { p++; | |
884 t->value = TOKmulass; | |
885 } | |
886 else | |
887 t->value = TOKmul; | |
888 return; | |
889 case '%': | |
890 p++; | |
891 if (*p == '=') | |
892 { p++; | |
893 t->value = TOKmodass; | |
894 } | |
895 else | |
896 t->value = TOKmod; | |
897 return; | |
898 | |
899 case '#': | |
900 { | |
901 p++; | |
902 Token n; | |
903 scan(&n); | |
904 if (n.value == TOKidentifier) | |
905 { | |
906 if (n.ident == Id::line) | |
907 { | |
908 poundLine(); | |
909 continue; | |
910 } | |
911 else | |
912 { | |
913 const Loc locx = loc(); | |
914 warning(locx, "C preprocessor directive `#%s` is not supported", n.ident->toChars()); | |
915 } | |
916 } | |
917 else if (n.value == TOKif) | |
918 { | |
919 error("C preprocessor directive `#if` is not supported, use `version` or `static if`"); | |
920 } | |
921 t->value = TOKpound; | |
922 return; | |
923 } | |
924 | |
925 default: | |
926 { unsigned c = *p; | |
927 | |
928 if (c & 0x80) | |
929 { c = decodeUTF(); | |
930 | |
931 // Check for start of unicode identifier | |
932 if (isUniAlpha(c)) | |
933 goto case_ident; | |
934 | |
935 if (c == PS || c == LS) | |
936 { | |
937 endOfLine(); | |
938 p++; | |
939 continue; | |
940 } | |
941 } | |
942 if (c < 0x80 && isprint(c)) | |
943 error("character '%c' is not a valid token", c); | |
944 else | |
945 error("character 0x%02x is not a valid token", c); | |
946 p++; | |
947 continue; | |
948 } | |
949 } | |
950 } | |
951 } | |
952 | |
953 /******************************************* | |
954 * Parse escape sequence. | |
955 */ | |
956 | |
957 unsigned Lexer::escapeSequence() | |
958 { unsigned c = *p; | |
959 | |
960 int n; | |
961 int ndigits; | |
962 | |
963 switch (c) | |
964 { | |
965 case '\'': | |
966 case '"': | |
967 case '?': | |
968 case '\\': | |
969 Lconsume: | |
970 p++; | |
971 break; | |
972 | |
973 case 'a': c = 7; goto Lconsume; | |
974 case 'b': c = 8; goto Lconsume; | |
975 case 'f': c = 12; goto Lconsume; | |
976 case 'n': c = 10; goto Lconsume; | |
977 case 'r': c = 13; goto Lconsume; | |
978 case 't': c = 9; goto Lconsume; | |
979 case 'v': c = 11; goto Lconsume; | |
980 | |
981 case 'u': | |
982 ndigits = 4; | |
983 goto Lhex; | |
984 case 'U': | |
985 ndigits = 8; | |
986 goto Lhex; | |
987 case 'x': | |
988 ndigits = 2; | |
989 Lhex: | |
990 p++; | |
991 c = *p; | |
992 if (ishex((utf8_t)c)) | |
993 { unsigned v; | |
994 | |
995 n = 0; | |
996 v = 0; | |
997 while (1) | |
998 { | |
999 if (isdigit((utf8_t)c)) | |
1000 c -= '0'; | |
1001 else if (islower(c)) | |
1002 c -= 'a' - 10; | |
1003 else | |
1004 c -= 'A' - 10; | |
1005 v = v * 16 + c; | |
1006 c = *++p; | |
1007 if (++n == ndigits) | |
1008 break; | |
1009 if (!ishex((utf8_t)c)) | |
1010 { error("escape hex sequence has %d hex digits instead of %d", n, ndigits); | |
1011 break; | |
1012 } | |
1013 } | |
1014 if (ndigits != 2 && !utf_isValidDchar(v)) | |
1015 { error("invalid UTF character \\U%08x", v); | |
1016 v = '?'; // recover with valid UTF character | |
1017 } | |
1018 c = v; | |
1019 } | |
1020 else | |
1021 error("undefined escape hex sequence \\%c",c); | |
1022 break; | |
1023 | |
1024 case '&': // named character entity | |
1025 for (const utf8_t *idstart = ++p; 1; p++) | |
1026 { | |
1027 switch (*p) | |
1028 { | |
1029 case ';': | |
1030 c = HtmlNamedEntity(idstart, p - idstart); | |
1031 if (c == ~0U) | |
1032 { error("unnamed character entity &%.*s;", (int)(p - idstart), idstart); | |
1033 c = ' '; | |
1034 } | |
1035 p++; | |
1036 break; | |
1037 | |
1038 default: | |
1039 if (isalpha(*p) || | |
1040 (p != idstart && isdigit(*p))) | |
1041 continue; | |
1042 error("unterminated named entity &%.*s;", (int)(p - idstart + 1), idstart); | |
1043 break; | |
1044 } | |
1045 break; | |
1046 } | |
1047 break; | |
1048 | |
1049 case 0: | |
1050 case 0x1A: // end of file | |
1051 c = '\\'; | |
1052 break; | |
1053 | |
1054 default: | |
1055 if (isoctal((utf8_t)c)) | |
1056 { unsigned v; | |
1057 | |
1058 n = 0; | |
1059 v = 0; | |
1060 do | |
1061 { | |
1062 v = v * 8 + (c - '0'); | |
1063 c = *++p; | |
1064 } while (++n < 3 && isoctal((utf8_t)c)); | |
1065 c = v; | |
1066 if (c > 0xFF) | |
1067 error("escape octal sequence \\%03o is larger than \\377", c); | |
1068 } | |
1069 else | |
1070 error("undefined escape sequence \\%c",c); | |
1071 break; | |
1072 } | |
1073 return c; | |
1074 } | |
1075 | |
1076 /************************************** | |
1077 */ | |
1078 | |
1079 TOK Lexer::wysiwygStringConstant(Token *t, int tc) | |
1080 { | |
1081 int c; | |
1082 Loc start = loc(); | |
1083 | |
1084 p++; | |
1085 stringbuffer.reset(); | |
1086 while (1) | |
1087 { | |
1088 c = *p++; | |
1089 switch (c) | |
1090 { | |
1091 case '\n': | |
1092 endOfLine(); | |
1093 break; | |
1094 | |
1095 case '\r': | |
1096 if (*p == '\n') | |
1097 continue; // ignore | |
1098 c = '\n'; // treat EndOfLine as \n character | |
1099 endOfLine(); | |
1100 break; | |
1101 | |
1102 case 0: | |
1103 case 0x1A: | |
1104 error("unterminated string constant starting at %s", start.toChars()); | |
1105 t->ustring = (utf8_t *)const_cast<char *>(""); | |
1106 t->len = 0; | |
1107 t->postfix = 0; | |
1108 return TOKstring; | |
1109 | |
1110 case '"': | |
1111 case '`': | |
1112 if (c == tc) | |
1113 { | |
1114 t->len = (unsigned)stringbuffer.offset; | |
1115 stringbuffer.writeByte(0); | |
1116 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); | |
1117 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
1118 stringPostfix(t); | |
1119 return TOKstring; | |
1120 } | |
1121 break; | |
1122 | |
1123 default: | |
1124 if (c & 0x80) | |
1125 { p--; | |
1126 unsigned u = decodeUTF(); | |
1127 p++; | |
1128 if (u == PS || u == LS) | |
1129 endOfLine(); | |
1130 stringbuffer.writeUTF8(u); | |
1131 continue; | |
1132 } | |
1133 break; | |
1134 } | |
1135 stringbuffer.writeByte(c); | |
1136 } | |
1137 } | |
1138 | |
1139 /************************************** | |
1140 * Lex hex strings: | |
1141 * x"0A ae 34FE BD" | |
1142 */ | |
1143 | |
1144 TOK Lexer::hexStringConstant(Token *t) | |
1145 { | |
1146 unsigned c; | |
1147 Loc start = loc(); | |
1148 unsigned n = 0; | |
1149 unsigned v = ~0; // dead assignment, needed to suppress warning | |
1150 | |
1151 p++; | |
1152 stringbuffer.reset(); | |
1153 while (1) | |
1154 { | |
1155 c = *p++; | |
1156 switch (c) | |
1157 { | |
1158 case ' ': | |
1159 case '\t': | |
1160 case '\v': | |
1161 case '\f': | |
1162 continue; // skip white space | |
1163 | |
1164 case '\r': | |
1165 if (*p == '\n') | |
1166 continue; // ignore | |
1167 // Treat isolated '\r' as if it were a '\n' | |
1168 /* fall through */ | |
1169 case '\n': | |
1170 endOfLine(); | |
1171 continue; | |
1172 | |
1173 case 0: | |
1174 case 0x1A: | |
1175 error("unterminated string constant starting at %s", start.toChars()); | |
1176 t->ustring = (utf8_t *)const_cast<char *>(""); | |
1177 t->len = 0; | |
1178 t->postfix = 0; | |
1179 return TOKxstring; | |
1180 | |
1181 case '"': | |
1182 if (n & 1) | |
1183 { error("odd number (%d) of hex characters in hex string", n); | |
1184 stringbuffer.writeByte(v); | |
1185 } | |
1186 t->len = (unsigned)stringbuffer.offset; | |
1187 stringbuffer.writeByte(0); | |
1188 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); | |
1189 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
1190 stringPostfix(t); | |
1191 return TOKxstring; | |
1192 | |
1193 default: | |
1194 if (c >= '0' && c <= '9') | |
1195 c -= '0'; | |
1196 else if (c >= 'a' && c <= 'f') | |
1197 c -= 'a' - 10; | |
1198 else if (c >= 'A' && c <= 'F') | |
1199 c -= 'A' - 10; | |
1200 else if (c & 0x80) | |
1201 { p--; | |
1202 unsigned u = decodeUTF(); | |
1203 p++; | |
1204 if (u == PS || u == LS) | |
1205 endOfLine(); | |
1206 else | |
1207 error("non-hex character \\u%04x in hex string", u); | |
1208 } | |
1209 else | |
1210 error("non-hex character '%c' in hex string", c); | |
1211 if (n & 1) | |
1212 { v = (v << 4) | c; | |
1213 stringbuffer.writeByte(v); | |
1214 } | |
1215 else | |
1216 v = c; | |
1217 n++; | |
1218 break; | |
1219 } | |
1220 } | |
1221 } | |
1222 | |
1223 | |
1224 /************************************** | |
1225 * Lex delimited strings: | |
1226 * q"(foo(xxx))" // "foo(xxx)" | |
1227 * q"[foo(]" // "foo(" | |
1228 * q"/foo]/" // "foo]" | |
1229 * q"HERE | |
1230 * foo | |
1231 * HERE" // "foo\n" | |
1232 * Input: | |
1233 * p is on the " | |
1234 */ | |
1235 | |
1236 TOK Lexer::delimitedStringConstant(Token *t) | |
1237 { | |
1238 unsigned c; | |
1239 Loc start = loc(); | |
1240 unsigned delimleft = 0; | |
1241 unsigned delimright = 0; | |
1242 unsigned nest = 1; | |
1243 unsigned nestcount = ~0; // dead assignment, needed to suppress warning | |
1244 Identifier *hereid = NULL; | |
1245 unsigned blankrol = 0; | |
1246 unsigned startline = 0; | |
1247 | |
1248 p++; | |
1249 stringbuffer.reset(); | |
1250 while (1) | |
1251 { | |
1252 c = *p++; | |
1253 //printf("c = '%c'\n", c); | |
1254 switch (c) | |
1255 { | |
1256 case '\n': | |
1257 Lnextline: | |
1258 endOfLine(); | |
1259 startline = 1; | |
1260 if (blankrol) | |
1261 { blankrol = 0; | |
1262 continue; | |
1263 } | |
1264 if (hereid) | |
1265 { | |
1266 stringbuffer.writeUTF8(c); | |
1267 continue; | |
1268 } | |
1269 break; | |
1270 | |
1271 case '\r': | |
1272 if (*p == '\n') | |
1273 continue; // ignore | |
1274 c = '\n'; // treat EndOfLine as \n character | |
1275 goto Lnextline; | |
1276 | |
1277 case 0: | |
1278 case 0x1A: | |
1279 error("unterminated delimited string constant starting at %s", start.toChars()); | |
1280 t->ustring = (utf8_t *)const_cast<char *>(""); | |
1281 t->len = 0; | |
1282 t->postfix = 0; | |
1283 return TOKstring; | |
1284 | |
1285 default: | |
1286 if (c & 0x80) | |
1287 { p--; | |
1288 c = decodeUTF(); | |
1289 p++; | |
1290 if (c == PS || c == LS) | |
1291 goto Lnextline; | |
1292 } | |
1293 break; | |
1294 } | |
1295 if (delimleft == 0) | |
1296 { delimleft = c; | |
1297 nest = 1; | |
1298 nestcount = 1; | |
1299 if (c == '(') | |
1300 delimright = ')'; | |
1301 else if (c == '{') | |
1302 delimright = '}'; | |
1303 else if (c == '[') | |
1304 delimright = ']'; | |
1305 else if (c == '<') | |
1306 delimright = '>'; | |
1307 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) | |
1308 { // Start of identifier; must be a heredoc | |
1309 Token tok; | |
1310 p--; | |
1311 scan(&tok); // read in heredoc identifier | |
1312 if (tok.value != TOKidentifier) | |
1313 { error("identifier expected for heredoc, not %s", tok.toChars()); | |
1314 delimright = c; | |
1315 } | |
1316 else | |
1317 { hereid = tok.ident; | |
1318 //printf("hereid = '%s'\n", hereid->toChars()); | |
1319 blankrol = 1; | |
1320 } | |
1321 nest = 0; | |
1322 } | |
1323 else | |
1324 { delimright = c; | |
1325 nest = 0; | |
1326 if (isspace(c)) | |
1327 error("delimiter cannot be whitespace"); | |
1328 } | |
1329 } | |
1330 else | |
1331 { | |
1332 if (blankrol) | |
1333 { error("heredoc rest of line should be blank"); | |
1334 blankrol = 0; | |
1335 continue; | |
1336 } | |
1337 if (nest == 1) | |
1338 { | |
1339 if (c == delimleft) | |
1340 nestcount++; | |
1341 else if (c == delimright) | |
1342 { nestcount--; | |
1343 if (nestcount == 0) | |
1344 goto Ldone; | |
1345 } | |
1346 } | |
1347 else if (c == delimright) | |
1348 goto Ldone; | |
1349 if (startline && isalpha(c) && hereid) | |
1350 { Token tok; | |
1351 const utf8_t *psave = p; | |
1352 p--; | |
1353 scan(&tok); // read in possible heredoc identifier | |
1354 //printf("endid = '%s'\n", tok.ident->toChars()); | |
1355 if (tok.value == TOKidentifier && tok.ident->equals(hereid)) | |
1356 { /* should check that rest of line is blank | |
1357 */ | |
1358 goto Ldone; | |
1359 } | |
1360 p = psave; | |
1361 } | |
1362 stringbuffer.writeUTF8(c); | |
1363 startline = 0; | |
1364 } | |
1365 } | |
1366 | |
1367 Ldone: | |
1368 if (*p == '"') | |
1369 p++; | |
1370 else if (hereid) | |
1371 error("delimited string must end in %s\"", hereid->toChars()); | |
1372 else | |
1373 error("delimited string must end in %c\"", delimright); | |
1374 t->len = (unsigned)stringbuffer.offset; | |
1375 stringbuffer.writeByte(0); | |
1376 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); | |
1377 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
1378 stringPostfix(t); | |
1379 return TOKstring; | |
1380 } | |
1381 | |
1382 /************************************** | |
1383 * Lex delimited strings: | |
1384 * q{ foo(xxx) } // " foo(xxx) " | |
1385 * q{foo(} // "foo(" | |
1386 * q{{foo}"}"} // "{foo}"}"" | |
1387 * Input: | |
1388 * p is on the q | |
1389 */ | |
1390 | |
1391 TOK Lexer::tokenStringConstant(Token *t) | |
1392 { | |
1393 unsigned nest = 1; | |
1394 Loc start = loc(); | |
1395 const utf8_t *pstart = ++p; | |
1396 | |
1397 while (1) | |
1398 { Token tok; | |
1399 | |
1400 scan(&tok); | |
1401 switch (tok.value) | |
1402 { | |
1403 case TOKlcurly: | |
1404 nest++; | |
1405 continue; | |
1406 | |
1407 case TOKrcurly: | |
1408 if (--nest == 0) | |
1409 { | |
1410 t->len = (unsigned)(p - 1 - pstart); | |
1411 t->ustring = (utf8_t *)mem.xmalloc(t->len + 1); | |
1412 memcpy(t->ustring, pstart, t->len); | |
1413 t->ustring[t->len] = 0; | |
1414 stringPostfix(t); | |
1415 return TOKstring; | |
1416 } | |
1417 continue; | |
1418 | |
1419 case TOKeof: | |
1420 error("unterminated token string constant starting at %s", start.toChars()); | |
1421 t->ustring = (utf8_t *)const_cast<char *>(""); | |
1422 t->len = 0; | |
1423 t->postfix = 0; | |
1424 return TOKstring; | |
1425 | |
1426 default: | |
1427 continue; | |
1428 } | |
1429 } | |
1430 } | |
1431 | |
1432 | |
1433 | |
1434 /************************************** | |
1435 */ | |
1436 | |
1437 TOK Lexer::escapeStringConstant(Token *t) | |
1438 { | |
1439 unsigned c; | |
1440 Loc start = loc(); | |
1441 | |
1442 p++; | |
1443 stringbuffer.reset(); | |
1444 while (1) | |
1445 { | |
1446 c = *p++; | |
1447 switch (c) | |
1448 { | |
1449 case '\\': | |
1450 switch (*p) | |
1451 { | |
1452 case 'u': | |
1453 case 'U': | |
1454 case '&': | |
1455 c = escapeSequence(); | |
1456 stringbuffer.writeUTF8(c); | |
1457 continue; | |
1458 | |
1459 default: | |
1460 c = escapeSequence(); | |
1461 break; | |
1462 } | |
1463 break; | |
1464 case '\n': | |
1465 endOfLine(); | |
1466 break; | |
1467 | |
1468 case '\r': | |
1469 if (*p == '\n') | |
1470 continue; // ignore | |
1471 c = '\n'; // treat EndOfLine as \n character | |
1472 endOfLine(); | |
1473 break; | |
1474 | |
1475 case '"': | |
1476 t->len = (unsigned)stringbuffer.offset; | |
1477 stringbuffer.writeByte(0); | |
1478 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset); | |
1479 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset); | |
1480 stringPostfix(t); | |
1481 return TOKstring; | |
1482 | |
1483 case 0: | |
1484 case 0x1A: | |
1485 p--; | |
1486 error("unterminated string constant starting at %s", start.toChars()); | |
1487 t->ustring = (utf8_t *)const_cast<char *>(""); | |
1488 t->len = 0; | |
1489 t->postfix = 0; | |
1490 return TOKstring; | |
1491 | |
1492 default: | |
1493 if (c & 0x80) | |
1494 { | |
1495 p--; | |
1496 c = decodeUTF(); | |
1497 if (c == LS || c == PS) | |
1498 { c = '\n'; | |
1499 endOfLine(); | |
1500 } | |
1501 p++; | |
1502 stringbuffer.writeUTF8(c); | |
1503 continue; | |
1504 } | |
1505 break; | |
1506 } | |
1507 stringbuffer.writeByte(c); | |
1508 } | |
1509 } | |
1510 | |
1511 /************************************** | |
1512 */ | |
1513 | |
1514 TOK Lexer::charConstant(Token *t) | |
1515 { | |
1516 unsigned c; | |
1517 TOK tk = TOKcharv; | |
1518 | |
1519 //printf("Lexer::charConstant\n"); | |
1520 p++; | |
1521 c = *p++; | |
1522 switch (c) | |
1523 { | |
1524 case '\\': | |
1525 switch (*p) | |
1526 { | |
1527 case 'u': | |
1528 t->uns64value = escapeSequence(); | |
1529 tk = TOKwcharv; | |
1530 break; | |
1531 | |
1532 case 'U': | |
1533 case '&': | |
1534 t->uns64value = escapeSequence(); | |
1535 tk = TOKdcharv; | |
1536 break; | |
1537 | |
1538 default: | |
1539 t->uns64value = escapeSequence(); | |
1540 break; | |
1541 } | |
1542 break; | |
1543 case '\n': | |
1544 L1: | |
1545 endOfLine(); | |
1546 /* fall through */ | |
1547 case '\r': | |
1548 case 0: | |
1549 case 0x1A: | |
1550 case '\'': | |
1551 error("unterminated character constant"); | |
1552 t->uns64value = '?'; | |
1553 return tk; | |
1554 | |
1555 default: | |
1556 if (c & 0x80) | |
1557 { | |
1558 p--; | |
1559 c = decodeUTF(); | |
1560 p++; | |
1561 if (c == LS || c == PS) | |
1562 goto L1; | |
1563 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE)) | |
1564 tk = TOKwcharv; | |
1565 else | |
1566 tk = TOKdcharv; | |
1567 } | |
1568 t->uns64value = c; | |
1569 break; | |
1570 } | |
1571 | |
1572 if (*p != '\'') | |
1573 { | |
1574 error("unterminated character constant"); | |
1575 t->uns64value = '?'; | |
1576 return tk; | |
1577 } | |
1578 p++; | |
1579 return tk; | |
1580 } | |
1581 | |
1582 /*************************************** | |
1583 * Get postfix of string literal. | |
1584 */ | |
1585 | |
1586 void Lexer::stringPostfix(Token *t) | |
1587 { | |
1588 switch (*p) | |
1589 { | |
1590 case 'c': | |
1591 case 'w': | |
1592 case 'd': | |
1593 t->postfix = *p; | |
1594 p++; | |
1595 break; | |
1596 | |
1597 default: | |
1598 t->postfix = 0; | |
1599 break; | |
1600 } | |
1601 } | |
1602 | |
1603 /************************************** | |
1604 * Read in a number. | |
1605 * If it's an integer, store it in tok.TKutok.Vlong. | |
1606 * integers can be decimal, octal or hex | |
1607 * Handle the suffixes U, UL, LU, L, etc. | |
1608 * If it's double, store it in tok.TKutok.Vdouble. | |
1609 * Returns: | |
1610 * TKnum | |
1611 * TKdouble,... | |
1612 */ | |
1613 | |
1614 TOK Lexer::number(Token *t) | |
1615 { | |
1616 int base = 10; | |
1617 const utf8_t *start = p; | |
1618 unsigned c; | |
1619 uinteger_t n = 0; // unsigned >=64 bit integer type | |
1620 int d; | |
1621 bool err = false; | |
1622 bool overflow = false; | |
1623 | |
1624 c = *p; | |
1625 if (c == '0') | |
1626 { | |
1627 ++p; | |
1628 c = *p; | |
1629 switch (c) | |
1630 { | |
1631 case '0': case '1': case '2': case '3': | |
1632 case '4': case '5': case '6': case '7': | |
1633 n = c - '0'; | |
1634 ++p; | |
1635 base = 8; | |
1636 break; | |
1637 | |
1638 case 'x': | |
1639 case 'X': | |
1640 ++p; | |
1641 base = 16; | |
1642 break; | |
1643 | |
1644 case 'b': | |
1645 case 'B': | |
1646 ++p; | |
1647 base = 2; | |
1648 break; | |
1649 | |
1650 case '.': | |
1651 if (p[1] == '.') | |
1652 goto Ldone; // if ".." | |
1653 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80) | |
1654 goto Ldone; // if ".identifier" or ".unicode" | |
1655 goto Lreal; // '.' is part of current token | |
1656 | |
1657 case 'i': | |
1658 case 'f': | |
1659 case 'F': | |
1660 goto Lreal; | |
1661 | |
1662 case '_': | |
1663 ++p; | |
1664 base = 8; | |
1665 break; | |
1666 | |
1667 case 'L': | |
1668 if (p[1] == 'i') | |
1669 goto Lreal; | |
1670 break; | |
1671 | |
1672 default: | |
1673 break; | |
1674 } | |
1675 } | |
1676 | |
1677 while (1) | |
1678 { | |
1679 c = *p; | |
1680 switch (c) | |
1681 { | |
1682 case '0': case '1': | |
1683 ++p; | |
1684 d = c - '0'; | |
1685 break; | |
1686 | |
1687 case '2': case '3': | |
1688 case '4': case '5': case '6': case '7': | |
1689 if (base == 2 && !err) | |
1690 { | |
1691 error("binary digit expected"); | |
1692 err = true; | |
1693 } | |
1694 ++p; | |
1695 d = c - '0'; | |
1696 break; | |
1697 | |
1698 case '8': case '9': | |
1699 ++p; | |
1700 if (base < 10 && !err) | |
1701 { | |
1702 error("radix %d digit expected, not '%c'", base, c); | |
1703 err = true; | |
1704 } | |
1705 d = c - '0'; | |
1706 break; | |
1707 | |
1708 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': | |
1709 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': | |
1710 ++p; | |
1711 if (base != 16) | |
1712 { | |
1713 if (c == 'e' || c == 'E' || c == 'f' || c == 'F') | |
1714 goto Lreal; | |
1715 if (!err) | |
1716 { | |
1717 error("radix %d digit expected, not '%c'", base, c); | |
1718 err = true; | |
1719 } | |
1720 } | |
1721 if (c >= 'a') | |
1722 d = c + 10 - 'a'; | |
1723 else | |
1724 d = c + 10 - 'A'; | |
1725 break; | |
1726 | |
1727 case 'L': | |
1728 if (p[1] == 'i') | |
1729 goto Lreal; | |
1730 goto Ldone; | |
1731 | |
1732 case '.': | |
1733 if (p[1] == '.') | |
1734 goto Ldone; // if ".." | |
1735 if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)) | |
1736 goto Ldone; // if ".identifier" or ".unicode" | |
1737 goto Lreal; // otherwise as part of a floating point literal | |
1738 | |
1739 case 'p': | |
1740 case 'P': | |
1741 case 'i': | |
1742 Lreal: | |
1743 p = start; | |
1744 return inreal(t); | |
1745 | |
1746 case '_': | |
1747 ++p; | |
1748 continue; | |
1749 | |
1750 default: | |
1751 goto Ldone; | |
1752 } | |
1753 | |
1754 uinteger_t n2 = n * base; | |
1755 if ((n2 / base != n || n2 + d < n)) | |
1756 { | |
1757 overflow = true; | |
1758 } | |
1759 n = n2 + d; | |
1760 | |
1761 // if n needs more than 64 bits | |
1762 if (sizeof(n) > 8 && | |
1763 n > 0xFFFFFFFFFFFFFFFFULL) | |
1764 { | |
1765 overflow = true; | |
1766 } | |
1767 } | |
1768 | |
1769 Ldone: | |
1770 | |
1771 if (overflow && !err) | |
1772 { | |
1773 error("integer overflow"); | |
1774 err = true; | |
1775 } | |
1776 | |
1777 enum FLAGS | |
1778 { | |
1779 FLAGS_none = 0, | |
1780 FLAGS_decimal = 1, // decimal | |
1781 FLAGS_unsigned = 2, // u or U suffix | |
1782 FLAGS_long = 4, // L suffix | |
1783 }; | |
1784 | |
1785 unsigned flags = (base == 10) ? FLAGS_decimal : FLAGS_none; | |
1786 | |
1787 // Parse trailing 'u', 'U', 'l' or 'L' in any combination | |
1788 const utf8_t *psuffix = p; | |
1789 while (1) | |
1790 { | |
1791 utf8_t f; | |
1792 switch (*p) | |
1793 { | |
1794 case 'U': | |
1795 case 'u': | |
1796 f = FLAGS_unsigned; | |
1797 goto L1; | |
1798 | |
1799 case 'l': | |
1800 f = FLAGS_long; | |
1801 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead"); | |
1802 goto L1; | |
1803 | |
1804 case 'L': | |
1805 f = FLAGS_long; | |
1806 L1: | |
1807 p++; | |
1808 if ((flags & f) && !err) | |
1809 { | |
1810 error("unrecognized token"); | |
1811 err = true; | |
1812 } | |
1813 flags = (FLAGS) (flags | f); | |
1814 continue; | |
1815 default: | |
1816 break; | |
1817 } | |
1818 break; | |
1819 } | |
1820 | |
1821 if (base == 8 && n >= 8) | |
1822 error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead", | |
1823 n, p - psuffix, psuffix, n, p - psuffix, psuffix); | |
1824 | |
1825 TOK result; | |
1826 switch (flags) | |
1827 { | |
1828 case FLAGS_none: | |
1829 /* Octal or Hexadecimal constant. | |
1830 * First that fits: int, uint, long, ulong | |
1831 */ | |
1832 if (n & 0x8000000000000000LL) | |
1833 result = TOKuns64v; | |
1834 else if (n & 0xFFFFFFFF00000000LL) | |
1835 result = TOKint64v; | |
1836 else if (n & 0x80000000) | |
1837 result = TOKuns32v; | |
1838 else | |
1839 result = TOKint32v; | |
1840 break; | |
1841 | |
1842 case FLAGS_decimal: | |
1843 /* First that fits: int, long, long long | |
1844 */ | |
1845 if (n & 0x8000000000000000LL) | |
1846 { | |
1847 if (!err) | |
1848 { | |
1849 error("signed integer overflow"); | |
1850 err = true; | |
1851 } | |
1852 result = TOKuns64v; | |
1853 } | |
1854 else if (n & 0xFFFFFFFF80000000LL) | |
1855 result = TOKint64v; | |
1856 else | |
1857 result = TOKint32v; | |
1858 break; | |
1859 | |
1860 case FLAGS_unsigned: | |
1861 case FLAGS_decimal | FLAGS_unsigned: | |
1862 /* First that fits: uint, ulong | |
1863 */ | |
1864 if (n & 0xFFFFFFFF00000000LL) | |
1865 result = TOKuns64v; | |
1866 else | |
1867 result = TOKuns32v; | |
1868 break; | |
1869 | |
1870 case FLAGS_decimal | FLAGS_long: | |
1871 if (n & 0x8000000000000000LL) | |
1872 { | |
1873 if (!err) | |
1874 { | |
1875 error("signed integer overflow"); | |
1876 err = true; | |
1877 } | |
1878 result = TOKuns64v; | |
1879 } | |
1880 else | |
1881 result = TOKint64v; | |
1882 break; | |
1883 | |
1884 case FLAGS_long: | |
1885 if (n & 0x8000000000000000LL) | |
1886 result = TOKuns64v; | |
1887 else | |
1888 result = TOKint64v; | |
1889 break; | |
1890 | |
1891 case FLAGS_unsigned | FLAGS_long: | |
1892 case FLAGS_decimal | FLAGS_unsigned | FLAGS_long: | |
1893 result = TOKuns64v; | |
1894 break; | |
1895 | |
1896 default: | |
1897 assert(0); | |
1898 } | |
1899 t->uns64value = n; | |
1900 return result; | |
1901 } | |
1902 | |
1903 /************************************** | |
1904 * Read in characters, converting them to real. | |
1905 * Bugs: | |
1906 * Exponent overflow not detected. | |
1907 * Too much requested precision is not detected. | |
1908 */ | |
1909 | |
1910 TOK Lexer::inreal(Token *t) | |
1911 { | |
1912 //printf("Lexer::inreal()\n"); | |
1913 bool isWellformedString = true; | |
1914 stringbuffer.reset(); | |
1915 const utf8_t *pstart = p; | |
1916 char hex = 0; | |
1917 unsigned c = *p++; | |
1918 | |
1919 // Leading '0x' | |
1920 if (c == '0') | |
1921 { | |
1922 c = *p++; | |
1923 if (c == 'x' || c == 'X') | |
1924 { | |
1925 hex = true; | |
1926 c = *p++; | |
1927 } | |
1928 } | |
1929 | |
1930 // Digits to left of '.' | |
1931 while (1) | |
1932 { | |
1933 if (c == '.') | |
1934 { | |
1935 c = *p++; | |
1936 break; | |
1937 } | |
1938 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') | |
1939 { | |
1940 c = *p++; | |
1941 continue; | |
1942 } | |
1943 break; | |
1944 } | |
1945 | |
1946 // Digits to right of '.' | |
1947 while (1) | |
1948 { | |
1949 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') | |
1950 { | |
1951 c = *p++; | |
1952 continue; | |
1953 } | |
1954 break; | |
1955 } | |
1956 | |
1957 if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P'))) | |
1958 { | |
1959 c = *p++; | |
1960 if (c == '-' || c == '+') | |
1961 { | |
1962 c = *p++; | |
1963 } | |
1964 bool anyexp = false; | |
1965 while (1) | |
1966 { | |
1967 if (isdigit(c)) | |
1968 { | |
1969 anyexp = true; | |
1970 c = *p++; | |
1971 continue; | |
1972 } | |
1973 if (c == '_') | |
1974 { | |
1975 c = *p++; | |
1976 continue; | |
1977 } | |
1978 if (!anyexp) | |
1979 { | |
1980 error("missing exponent"); | |
1981 isWellformedString = false; | |
1982 } | |
1983 break; | |
1984 } | |
1985 } | |
1986 else if (hex) | |
1987 { | |
1988 error("exponent required for hex float"); | |
1989 isWellformedString = false; | |
1990 } | |
1991 --p; | |
1992 while (pstart < p) | |
1993 { | |
1994 if (*pstart != '_') | |
1995 stringbuffer.writeByte(*pstart); | |
1996 ++pstart; | |
1997 } | |
1998 | |
1999 stringbuffer.writeByte(0); | |
2000 const char *sbufptr = (char *)stringbuffer.data; | |
2001 TOK result; | |
2002 bool isOutOfRange = false; | |
2003 t->floatvalue = (isWellformedString ? CTFloat::parse(sbufptr, &isOutOfRange) : CTFloat::zero); | |
2004 errno = 0; | |
2005 switch (*p) | |
2006 { | |
2007 case 'F': | |
2008 case 'f': | |
2009 if (isWellformedString && !isOutOfRange) | |
2010 isOutOfRange = Port::isFloat32LiteralOutOfRange(sbufptr); | |
2011 result = TOKfloat32v; | |
2012 p++; | |
2013 break; | |
2014 | |
2015 default: | |
2016 if (isWellformedString && !isOutOfRange) | |
2017 isOutOfRange = Port::isFloat64LiteralOutOfRange(sbufptr); | |
2018 result = TOKfloat64v; | |
2019 break; | |
2020 | |
2021 case 'l': | |
2022 error("use 'L' suffix instead of 'l'"); | |
2023 /* fall through */ | |
2024 case 'L': | |
2025 result = TOKfloat80v; | |
2026 p++; | |
2027 break; | |
2028 } | |
2029 if (*p == 'i' || *p == 'I') | |
2030 { | |
2031 if (*p == 'I') | |
2032 error("use 'i' suffix instead of 'I'"); | |
2033 p++; | |
2034 switch (result) | |
2035 { | |
2036 case TOKfloat32v: | |
2037 result = TOKimaginary32v; | |
2038 break; | |
2039 case TOKfloat64v: | |
2040 result = TOKimaginary64v; | |
2041 break; | |
2042 case TOKfloat80v: | |
2043 result = TOKimaginary80v; | |
2044 break; | |
2045 default: break; | |
2046 } | |
2047 } | |
2048 const bool isLong = (result == TOKfloat80v || result == TOKimaginary80v); | |
2049 if (isOutOfRange && !isLong) | |
2050 { | |
2051 const char *suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : ""; | |
2052 error(scanloc, "number '%s%s' is not representable", (char *)stringbuffer.data, suffix); | |
2053 } | |
2054 return result; | |
2055 } | |
2056 | |
2057 /********************************************* | |
2058 * parse: | |
2059 * #line linnum [filespec] | |
2060 * also allow __LINE__ for linnum, and __FILE__ for filespec | |
2061 */ | |
2062 | |
2063 void Lexer::poundLine() | |
2064 { | |
2065 Token tok; | |
2066 int linnum = this->scanloc.linnum; | |
2067 char *filespec = NULL; | |
2068 Loc loc = this->loc(); | |
2069 | |
2070 scan(&tok); | |
2071 if (tok.value == TOKint32v || tok.value == TOKint64v) | |
2072 { | |
2073 int lin = (int)(tok.uns64value - 1); | |
2074 if ((unsigned)lin != tok.uns64value - 1) | |
2075 error("line number %lld out of range", (unsigned long long)tok.uns64value); | |
2076 else | |
2077 linnum = lin; | |
2078 } | |
2079 else if (tok.value == TOKline) | |
2080 { | |
2081 } | |
2082 else | |
2083 goto Lerr; | |
2084 | |
2085 while (1) | |
2086 { | |
2087 switch (*p) | |
2088 { | |
2089 case 0: | |
2090 case 0x1A: | |
2091 case '\n': | |
2092 Lnewline: | |
2093 this->scanloc.linnum = linnum; | |
2094 if (filespec) | |
2095 this->scanloc.filename = filespec; | |
2096 return; | |
2097 | |
2098 case '\r': | |
2099 p++; | |
2100 if (*p != '\n') | |
2101 { p--; | |
2102 goto Lnewline; | |
2103 } | |
2104 continue; | |
2105 | |
2106 case ' ': | |
2107 case '\t': | |
2108 case '\v': | |
2109 case '\f': | |
2110 p++; | |
2111 continue; // skip white space | |
2112 | |
2113 case '_': | |
2114 if (memcmp(p, "__FILE__", 8) == 0) | |
2115 { | |
2116 p += 8; | |
2117 filespec = mem.xstrdup(scanloc.filename); | |
2118 continue; | |
2119 } | |
2120 goto Lerr; | |
2121 | |
2122 case '"': | |
2123 if (filespec) | |
2124 goto Lerr; | |
2125 stringbuffer.reset(); | |
2126 p++; | |
2127 while (1) | |
2128 { unsigned c; | |
2129 | |
2130 c = *p; | |
2131 switch (c) | |
2132 { | |
2133 case '\n': | |
2134 case '\r': | |
2135 case 0: | |
2136 case 0x1A: | |
2137 goto Lerr; | |
2138 | |
2139 case '"': | |
2140 stringbuffer.writeByte(0); | |
2141 filespec = mem.xstrdup((char *)stringbuffer.data); | |
2142 p++; | |
2143 break; | |
2144 | |
2145 default: | |
2146 if (c & 0x80) | |
2147 { unsigned u = decodeUTF(); | |
2148 if (u == PS || u == LS) | |
2149 goto Lerr; | |
2150 } | |
2151 stringbuffer.writeByte(c); | |
2152 p++; | |
2153 continue; | |
2154 } | |
2155 break; | |
2156 } | |
2157 continue; | |
2158 | |
2159 default: | |
2160 if (*p & 0x80) | |
2161 { unsigned u = decodeUTF(); | |
2162 if (u == PS || u == LS) | |
2163 goto Lnewline; | |
2164 } | |
2165 goto Lerr; | |
2166 } | |
2167 } | |
2168 | |
2169 Lerr: | |
2170 error(loc, "#line integer [\"filespec\"]\\n expected"); | |
2171 } | |
2172 | |
2173 | |
2174 /******************************************** | |
2175 * Decode UTF character. | |
2176 * Issue error messages for invalid sequences. | |
2177 * Return decoded character, advance p to last character in UTF sequence. | |
2178 */ | |
2179 | |
2180 unsigned Lexer::decodeUTF() | |
2181 { | |
2182 dchar_t u; | |
2183 utf8_t c; | |
2184 const utf8_t *s = p; | |
2185 size_t len; | |
2186 size_t idx; | |
2187 const char *msg; | |
2188 | |
2189 c = *s; | |
2190 assert(c & 0x80); | |
2191 | |
2192 // Check length of remaining string up to 6 UTF-8 characters | |
2193 for (len = 1; len < 6 && s[len]; len++) | |
2194 ; | |
2195 | |
2196 idx = 0; | |
2197 msg = utf_decodeChar(s, len, &idx, &u); | |
2198 p += idx - 1; | |
2199 if (msg) | |
2200 { | |
2201 error("%s", msg); | |
2202 } | |
2203 return u; | |
2204 } | |
2205 | |
2206 | |
2207 /*************************************************** | |
2208 * Parse doc comment embedded between t->ptr and p. | |
2209 * Remove trailing blanks and tabs from lines. | |
2210 * Replace all newlines with \n. | |
2211 * Remove leading comment character from each line. | |
2212 * Decide if it's a lineComment or a blockComment. | |
2213 * Append to previous one for this token. | |
2214 */ | |
2215 | |
2216 void Lexer::getDocComment(Token *t, unsigned lineComment) | |
2217 { | |
2218 /* ct tells us which kind of comment it is: '/', '*', or '+' | |
2219 */ | |
2220 utf8_t ct = t->ptr[2]; | |
2221 | |
2222 /* Start of comment text skips over / * *, / + +, or / / / | |
2223 */ | |
2224 const utf8_t *q = t->ptr + 3; // start of comment text | |
2225 | |
2226 const utf8_t *qend = p; | |
2227 if (ct == '*' || ct == '+') | |
2228 qend -= 2; | |
2229 | |
2230 /* Scan over initial row of ****'s or ++++'s or ////'s | |
2231 */ | |
2232 for (; q < qend; q++) | |
2233 { | |
2234 if (*q != ct) | |
2235 break; | |
2236 } | |
2237 | |
2238 /* Remove leading spaces until start of the comment | |
2239 */ | |
2240 int linestart = 0; | |
2241 if (ct == '/') | |
2242 { | |
2243 while (q < qend && (*q == ' ' || *q == '\t')) | |
2244 ++q; | |
2245 } | |
2246 else if (q < qend) | |
2247 { | |
2248 if (*q == '\r') | |
2249 { | |
2250 ++q; | |
2251 if (q < qend && *q == '\n') | |
2252 ++q; | |
2253 linestart = 1; | |
2254 } | |
2255 else if (*q == '\n') | |
2256 { | |
2257 ++q; | |
2258 linestart = 1; | |
2259 } | |
2260 } | |
2261 | |
2262 /* Remove trailing row of ****'s or ++++'s | |
2263 */ | |
2264 if (ct != '/') | |
2265 { | |
2266 for (; q < qend; qend--) | |
2267 { | |
2268 if (qend[-1] != ct) | |
2269 break; | |
2270 } | |
2271 } | |
2272 | |
2273 /* Comment is now [q .. qend]. | |
2274 * Canonicalize it into buf[]. | |
2275 */ | |
2276 OutBuffer buf; | |
2277 | |
2278 for (; q < qend; q++) | |
2279 { | |
2280 utf8_t c = *q; | |
2281 | |
2282 switch (c) | |
2283 { | |
2284 case '*': | |
2285 case '+': | |
2286 if (linestart && c == ct) | |
2287 { linestart = 0; | |
2288 /* Trim preceding whitespace up to preceding \n | |
2289 */ | |
2290 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) | |
2291 buf.offset--; | |
2292 continue; | |
2293 } | |
2294 break; | |
2295 | |
2296 case ' ': | |
2297 case '\t': | |
2298 break; | |
2299 | |
2300 case '\r': | |
2301 if (q[1] == '\n') | |
2302 continue; // skip the \r | |
2303 goto Lnewline; | |
2304 | |
2305 default: | |
2306 if (c == 226) | |
2307 { | |
2308 // If LS or PS | |
2309 if (q[1] == 128 && | |
2310 (q[2] == 168 || q[2] == 169)) | |
2311 { | |
2312 q += 2; | |
2313 goto Lnewline; | |
2314 } | |
2315 } | |
2316 linestart = 0; | |
2317 break; | |
2318 | |
2319 Lnewline: | |
2320 c = '\n'; // replace all newlines with \n | |
2321 /* fall through */ | |
2322 case '\n': | |
2323 linestart = 1; | |
2324 | |
2325 /* Trim trailing whitespace | |
2326 */ | |
2327 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) | |
2328 buf.offset--; | |
2329 | |
2330 break; | |
2331 } | |
2332 buf.writeByte(c); | |
2333 } | |
2334 | |
2335 /* Trim trailing whitespace (if the last line does not have newline) | |
2336 */ | |
2337 if (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) | |
2338 { | |
2339 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t')) | |
2340 buf.offset--; | |
2341 } | |
2342 | |
2343 // Always end with a newline | |
2344 if (!buf.offset || buf.data[buf.offset - 1] != '\n') | |
2345 buf.writeByte('\n'); | |
2346 | |
2347 buf.writeByte(0); | |
2348 | |
2349 // It's a line comment if the start of the doc comment comes | |
2350 // after other non-whitespace on the same line. | |
2351 const utf8_t** dc = (lineComment && anyToken) | |
2352 ? &t->lineComment | |
2353 : &t->blockComment; | |
2354 | |
2355 // Combine with previous doc comment, if any | |
2356 if (*dc) | |
2357 *dc = combineComments(*dc, (utf8_t *)buf.data); | |
2358 else | |
2359 *dc = (utf8_t *)buf.extractData(); | |
2360 } | |
2361 | |
2362 /******************************************** | |
2363 * Combine two document comments into one, | |
2364 * separated by a newline. | |
2365 */ | |
2366 | |
2367 const utf8_t *Lexer::combineComments(const utf8_t *c1, const utf8_t *c2) | |
2368 { | |
2369 //printf("Lexer::combineComments('%s', '%s')\n", c1, c2); | |
2370 | |
2371 const utf8_t *c = c2; | |
2372 | |
2373 if (c1) | |
2374 { | |
2375 c = c1; | |
2376 if (c2) | |
2377 { | |
2378 size_t len1 = strlen((const char *)c1); | |
2379 size_t len2 = strlen((const char *)c2); | |
2380 | |
2381 int insertNewLine = 0; | |
2382 if (len1 && c1[len1 - 1] != '\n') | |
2383 { | |
2384 ++len1; | |
2385 insertNewLine = 1; | |
2386 } | |
2387 | |
2388 utf8_t *p = (utf8_t *)mem.xmalloc(len1 + 1 + len2 + 1); | |
2389 memcpy(p, c1, len1 - insertNewLine); | |
2390 if (insertNewLine) | |
2391 p[len1 - 1] = '\n'; | |
2392 | |
2393 p[len1] = '\n'; | |
2394 | |
2395 memcpy(p + len1 + 1, c2, len2); | |
2396 p[len1 + 1 + len2] = 0; | |
2397 c = p; | |
2398 } | |
2399 } | |
2400 return c; | |
2401 } |