145
|
1
|
|
2 /* Compiler implementation of the D programming language
|
|
3 * Copyright (C) 1999-2019 by The D Language Foundation, All Rights Reserved
|
|
4 * written by Walter Bright
|
|
5 * http://www.digitalmars.com
|
|
6 * Distributed under the Boost Software License, Version 1.0.
|
|
7 * http://www.boost.org/LICENSE_1_0.txt
|
|
8 * https://github.com/D-Programming-Language/dmd/blob/master/src/lexer.c
|
|
9 */
|
|
10
|
|
11 /* Lexical Analyzer */
|
|
12
|
|
13 #include "root/dsystem.h" // for time() and ctime()
|
|
14 #include "root/rmem.h"
|
|
15
|
|
16 #include "mars.h"
|
|
17 #include "lexer.h"
|
|
18 #include "utf.h"
|
|
19 #include "identifier.h"
|
|
20 #include "id.h"
|
|
21
|
|
22 extern int HtmlNamedEntity(const utf8_t *p, size_t length);
|
|
23
|
|
24 #define LS 0x2028 // UTF line separator
|
|
25 #define PS 0x2029 // UTF paragraph separator
|
|
26
|
|
27 /********************************************
|
|
28 * Do our own char maps
|
|
29 */
|
|
30
|
|
31 static unsigned char cmtable[256];
|
|
32
|
|
33 const int CMoctal = 0x1;
|
|
34 const int CMhex = 0x2;
|
|
35 const int CMidchar = 0x4;
|
|
36
|
|
37 inline bool isoctal (utf8_t c) { return (cmtable[c] & CMoctal) != 0; }
|
|
38 inline bool ishex (utf8_t c) { return (cmtable[c] & CMhex) != 0; }
|
|
39 inline bool isidchar(utf8_t c) { return (cmtable[c] & CMidchar) != 0; }
|
|
40
|
|
41 struct CMTableInitializer
|
|
42 {
|
|
43 CMTableInitializer();
|
|
44 };
|
|
45
|
|
46 static CMTableInitializer cmtableinitializer;
|
|
47
|
|
48 CMTableInitializer::CMTableInitializer()
|
|
49 {
|
|
50 for (unsigned c = 0; c < 256; c++)
|
|
51 {
|
|
52 if ('0' <= c && c <= '7')
|
|
53 cmtable[c] |= CMoctal;
|
|
54 if (isxdigit(c))
|
|
55 cmtable[c] |= CMhex;
|
|
56 if (isalnum(c) || c == '_')
|
|
57 cmtable[c] |= CMidchar;
|
|
58 }
|
|
59 }
|
|
60
|
|
61 /*************************** Lexer ********************************************/
|
|
62
|
|
63 OutBuffer Lexer::stringbuffer;
|
|
64
|
|
65 Lexer::Lexer(const char *filename,
|
|
66 const utf8_t *base, size_t begoffset, size_t endoffset,
|
|
67 bool doDocComment, bool commentToken)
|
|
68 {
|
|
69 scanloc = Loc(filename, 1, 1);
|
|
70 //printf("Lexer::Lexer(%p,%d)\n",base,length);
|
|
71 //printf("lexer.filename = %s\n", filename);
|
|
72 this->token = Token();
|
|
73 this->token.ptr = NULL;
|
|
74 this->token.value = TOKreserved;
|
|
75 this->token.blockComment = NULL;
|
|
76 this->token.lineComment = NULL;
|
|
77 this->base = base;
|
|
78 this->end = base + endoffset;
|
|
79 p = base + begoffset;
|
|
80 line = p;
|
|
81 this->doDocComment = doDocComment;
|
|
82 this->anyToken = 0;
|
|
83 this->commentToken = commentToken;
|
|
84 this->errors = false;
|
|
85 //initKeywords();
|
|
86
|
|
87 /* If first line starts with '#!', ignore the line
|
|
88 */
|
|
89
|
|
90 if (p[0] == '#' && p[1] =='!')
|
|
91 {
|
|
92 p += 2;
|
|
93 while (1)
|
|
94 {
|
|
95 utf8_t c = *p++;
|
|
96 switch (c)
|
|
97 {
|
|
98 case 0:
|
|
99 case 0x1A:
|
|
100 p--;
|
|
101 /* fall through */
|
|
102
|
|
103 case '\n':
|
|
104 break;
|
|
105
|
|
106 default:
|
|
107 continue;
|
|
108 }
|
|
109 break;
|
|
110 }
|
|
111 endOfLine();
|
|
112 }
|
|
113 }
|
|
114
|
|
115
|
|
116 void Lexer::endOfLine()
|
|
117 {
|
|
118 scanloc.linnum++;
|
|
119 line = p;
|
|
120 }
|
|
121
|
|
122
|
|
123 void Lexer::error(const char *format, ...)
|
|
124 {
|
|
125 va_list ap;
|
|
126 va_start(ap, format);
|
|
127 ::verror(token.loc, format, ap);
|
|
128 va_end(ap);
|
|
129 errors = true;
|
|
130 }
|
|
131
|
|
132 void Lexer::error(Loc loc, const char *format, ...)
|
|
133 {
|
|
134 va_list ap;
|
|
135 va_start(ap, format);
|
|
136 ::verror(loc, format, ap);
|
|
137 va_end(ap);
|
|
138 errors = true;
|
|
139 }
|
|
140
|
|
141 void Lexer::deprecation(const char *format, ...)
|
|
142 {
|
|
143 va_list ap;
|
|
144 va_start(ap, format);
|
|
145 ::vdeprecation(token.loc, format, ap);
|
|
146 va_end(ap);
|
|
147 if (global.params.useDeprecated == DIAGNOSTICerror)
|
|
148 errors = true;
|
|
149 }
|
|
150
|
|
151 TOK Lexer::nextToken()
|
|
152 {
|
|
153 if (token.next)
|
|
154 {
|
|
155 Token *t = token.next;
|
|
156 memcpy(&token,t,sizeof(Token));
|
|
157 t->free();
|
|
158 }
|
|
159 else
|
|
160 {
|
|
161 scan(&token);
|
|
162 }
|
|
163 //token.print();
|
|
164 return token.value;
|
|
165 }
|
|
166
|
|
167 Token *Lexer::peek(Token *ct)
|
|
168 {
|
|
169 Token *t;
|
|
170 if (ct->next)
|
|
171 t = ct->next;
|
|
172 else
|
|
173 {
|
|
174 t = Token::alloc();
|
|
175 scan(t);
|
|
176 ct->next = t;
|
|
177 }
|
|
178 return t;
|
|
179 }
|
|
180
|
|
181 /***********************
|
|
182 * Look ahead at next token's value.
|
|
183 */
|
|
184
|
|
185 TOK Lexer::peekNext()
|
|
186 {
|
|
187 return peek(&token)->value;
|
|
188 }
|
|
189
|
|
190 /***********************
|
|
191 * Look 2 tokens ahead at value.
|
|
192 */
|
|
193
|
|
194 TOK Lexer::peekNext2()
|
|
195 {
|
|
196 Token *t = peek(&token);
|
|
197 return peek(t)->value;
|
|
198 }
|
|
199
|
|
200 /*********************************
|
|
201 * tk is on the opening (.
|
|
202 * Look ahead and return token that is past the closing ).
|
|
203 */
|
|
204
|
|
205 Token *Lexer::peekPastParen(Token *tk)
|
|
206 {
|
|
207 //printf("peekPastParen()\n");
|
|
208 int parens = 1;
|
|
209 int curlynest = 0;
|
|
210 while (1)
|
|
211 {
|
|
212 tk = peek(tk);
|
|
213 //tk->print();
|
|
214 switch (tk->value)
|
|
215 {
|
|
216 case TOKlparen:
|
|
217 parens++;
|
|
218 continue;
|
|
219
|
|
220 case TOKrparen:
|
|
221 --parens;
|
|
222 if (parens)
|
|
223 continue;
|
|
224 tk = peek(tk);
|
|
225 break;
|
|
226
|
|
227 case TOKlcurly:
|
|
228 curlynest++;
|
|
229 continue;
|
|
230
|
|
231 case TOKrcurly:
|
|
232 if (--curlynest >= 0)
|
|
233 continue;
|
|
234 break;
|
|
235
|
|
236 case TOKsemicolon:
|
|
237 if (curlynest)
|
|
238 continue;
|
|
239 break;
|
|
240
|
|
241 case TOKeof:
|
|
242 break;
|
|
243
|
|
244 default:
|
|
245 continue;
|
|
246 }
|
|
247 return tk;
|
|
248 }
|
|
249 }
|
|
250
|
|
251 /****************************
|
|
252 * Turn next token in buffer into a token.
|
|
253 */
|
|
254
|
|
255 void Lexer::scan(Token *t)
|
|
256 {
|
|
257 unsigned lastLine = scanloc.linnum;
|
|
258 Loc startLoc;
|
|
259
|
|
260 t->blockComment = NULL;
|
|
261 t->lineComment = NULL;
|
|
262 while (1)
|
|
263 {
|
|
264 t->ptr = p;
|
|
265 //printf("p = %p, *p = '%c'\n",p,*p);
|
|
266 t->loc = loc();
|
|
267 switch (*p)
|
|
268 {
|
|
269 case 0:
|
|
270 case 0x1A:
|
|
271 t->value = TOKeof; // end of file
|
|
272 return;
|
|
273
|
|
274 case ' ':
|
|
275 case '\t':
|
|
276 case '\v':
|
|
277 case '\f':
|
|
278 p++;
|
|
279 continue; // skip white space
|
|
280
|
|
281 case '\r':
|
|
282 p++;
|
|
283 if (*p != '\n') // if CR stands by itself
|
|
284 endOfLine();
|
|
285 continue; // skip white space
|
|
286
|
|
287 case '\n':
|
|
288 p++;
|
|
289 endOfLine();
|
|
290 continue; // skip white space
|
|
291
|
|
292 case '0': case '1': case '2': case '3': case '4':
|
|
293 case '5': case '6': case '7': case '8': case '9':
|
|
294 t->value = number(t);
|
|
295 return;
|
|
296
|
|
297 case '\'':
|
|
298 t->value = charConstant(t);
|
|
299 return;
|
|
300
|
|
301 case 'r':
|
|
302 if (p[1] != '"')
|
|
303 goto case_ident;
|
|
304 p++;
|
|
305 /* fall through */
|
|
306 case '`':
|
|
307 t->value = wysiwygStringConstant(t, *p);
|
|
308 return;
|
|
309
|
|
310 case 'x':
|
|
311 if (p[1] != '"')
|
|
312 goto case_ident;
|
|
313 p++;
|
|
314 t->value = hexStringConstant(t);
|
|
315 return;
|
|
316
|
|
317 case 'q':
|
|
318 if (p[1] == '"')
|
|
319 {
|
|
320 p++;
|
|
321 t->value = delimitedStringConstant(t);
|
|
322 return;
|
|
323 }
|
|
324 else if (p[1] == '{')
|
|
325 {
|
|
326 p++;
|
|
327 t->value = tokenStringConstant(t);
|
|
328 return;
|
|
329 }
|
|
330 else
|
|
331 goto case_ident;
|
|
332
|
|
333 case '"':
|
|
334 t->value = escapeStringConstant(t);
|
|
335 return;
|
|
336
|
|
337 case 'a': case 'b': case 'c': case 'd': case 'e':
|
|
338 case 'f': case 'g': case 'h': case 'i': case 'j':
|
|
339 case 'k': case 'l': case 'm': case 'n': case 'o':
|
|
340 case 'p': /*case 'q': case 'r':*/ case 's': case 't':
|
|
341 case 'u': case 'v': case 'w': /*case 'x':*/ case 'y':
|
|
342 case 'z':
|
|
343 case 'A': case 'B': case 'C': case 'D': case 'E':
|
|
344 case 'F': case 'G': case 'H': case 'I': case 'J':
|
|
345 case 'K': case 'L': case 'M': case 'N': case 'O':
|
|
346 case 'P': case 'Q': case 'R': case 'S': case 'T':
|
|
347 case 'U': case 'V': case 'W': case 'X': case 'Y':
|
|
348 case 'Z':
|
|
349 case '_':
|
|
350 case_ident:
|
|
351 { utf8_t c;
|
|
352
|
|
353 while (1)
|
|
354 {
|
|
355 c = *++p;
|
|
356 if (isidchar(c))
|
|
357 continue;
|
|
358 else if (c & 0x80)
|
|
359 { const utf8_t *s = p;
|
|
360 unsigned u = decodeUTF();
|
|
361 if (isUniAlpha(u))
|
|
362 continue;
|
|
363 error("char 0x%04x not allowed in identifier", u);
|
|
364 p = s;
|
|
365 }
|
|
366 break;
|
|
367 }
|
|
368
|
|
369 Identifier *id = Identifier::idPool((const char *)t->ptr, p - t->ptr);
|
|
370 t->ident = id;
|
|
371 t->value = (TOK) id->getValue();
|
|
372 anyToken = 1;
|
|
373 if (*t->ptr == '_') // if special identifier token
|
|
374 {
|
|
375 static bool initdone = false;
|
|
376 static char date[11+1];
|
|
377 static char time[8+1];
|
|
378 static char timestamp[24+1];
|
|
379
|
|
380 if (!initdone) // lazy evaluation
|
|
381 {
|
|
382 initdone = true;
|
|
383 time_t ct;
|
|
384 ::time(&ct);
|
|
385 char *p = ctime(&ct);
|
|
386 assert(p);
|
|
387 sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
|
|
388 sprintf(&time[0], "%.8s", p + 11);
|
|
389 sprintf(×tamp[0], "%.24s", p);
|
|
390 }
|
|
391
|
|
392 if (id == Id::DATE)
|
|
393 {
|
|
394 t->ustring = (utf8_t *)date;
|
|
395 goto Lstr;
|
|
396 }
|
|
397 else if (id == Id::TIME)
|
|
398 {
|
|
399 t->ustring = (utf8_t *)time;
|
|
400 goto Lstr;
|
|
401 }
|
|
402 else if (id == Id::VENDOR)
|
|
403 {
|
|
404 t->ustring = (utf8_t *)const_cast<char *>(global.vendor);
|
|
405 goto Lstr;
|
|
406 }
|
|
407 else if (id == Id::TIMESTAMP)
|
|
408 {
|
|
409 t->ustring = (utf8_t *)timestamp;
|
|
410 Lstr:
|
|
411 t->value = TOKstring;
|
|
412 t->postfix = 0;
|
|
413 t->len = (unsigned)strlen((char *)t->ustring);
|
|
414 }
|
|
415 else if (id == Id::VERSIONX)
|
|
416 { unsigned major = 0;
|
|
417 unsigned minor = 0;
|
|
418 bool point = false;
|
|
419
|
|
420 for (const char *p = global.version + 1; 1; p++)
|
|
421 {
|
|
422 c = *p;
|
|
423 if (isdigit((utf8_t)c))
|
|
424 minor = minor * 10 + c - '0';
|
|
425 else if (c == '.')
|
|
426 {
|
|
427 if (point)
|
|
428 break; // ignore everything after second '.'
|
|
429 point = true;
|
|
430 major = minor;
|
|
431 minor = 0;
|
|
432 }
|
|
433 else
|
|
434 break;
|
|
435 }
|
|
436 t->value = TOKint64v;
|
|
437 t->uns64value = major * 1000 + minor;
|
|
438 }
|
|
439 else if (id == Id::EOFX)
|
|
440 {
|
|
441 t->value = TOKeof;
|
|
442 // Advance scanner to end of file
|
|
443 while (!(*p == 0 || *p == 0x1A))
|
|
444 p++;
|
|
445 }
|
|
446 }
|
|
447 //printf("t->value = %d\n",t->value);
|
|
448 return;
|
|
449 }
|
|
450
|
|
451 case '/':
|
|
452 p++;
|
|
453 switch (*p)
|
|
454 {
|
|
455 case '=':
|
|
456 p++;
|
|
457 t->value = TOKdivass;
|
|
458 return;
|
|
459
|
|
460 case '*':
|
|
461 p++;
|
|
462 startLoc = loc();
|
|
463 while (1)
|
|
464 {
|
|
465 while (1)
|
|
466 { utf8_t c = *p;
|
|
467 switch (c)
|
|
468 {
|
|
469 case '/':
|
|
470 break;
|
|
471
|
|
472 case '\n':
|
|
473 endOfLine();
|
|
474 p++;
|
|
475 continue;
|
|
476
|
|
477 case '\r':
|
|
478 p++;
|
|
479 if (*p != '\n')
|
|
480 endOfLine();
|
|
481 continue;
|
|
482
|
|
483 case 0:
|
|
484 case 0x1A:
|
|
485 error("unterminated /* */ comment");
|
|
486 p = end;
|
|
487 t->loc = loc();
|
|
488 t->value = TOKeof;
|
|
489 return;
|
|
490
|
|
491 default:
|
|
492 if (c & 0x80)
|
|
493 { unsigned u = decodeUTF();
|
|
494 if (u == PS || u == LS)
|
|
495 endOfLine();
|
|
496 }
|
|
497 p++;
|
|
498 continue;
|
|
499 }
|
|
500 break;
|
|
501 }
|
|
502 p++;
|
|
503 if (p[-2] == '*' && p - 3 != t->ptr)
|
|
504 break;
|
|
505 }
|
|
506 if (commentToken)
|
|
507 {
|
|
508 t->loc = startLoc;
|
|
509 t->value = TOKcomment;
|
|
510 return;
|
|
511 }
|
|
512 else if (doDocComment && t->ptr[2] == '*' && p - 4 != t->ptr)
|
|
513 { // if /** but not /**/
|
|
514 getDocComment(t, lastLine == startLoc.linnum);
|
|
515 }
|
|
516 continue;
|
|
517
|
|
518 case '/': // do // style comments
|
|
519 startLoc = loc();
|
|
520 while (1)
|
|
521 { utf8_t c = *++p;
|
|
522 switch (c)
|
|
523 {
|
|
524 case '\n':
|
|
525 break;
|
|
526
|
|
527 case '\r':
|
|
528 if (p[1] == '\n')
|
|
529 p++;
|
|
530 break;
|
|
531
|
|
532 case 0:
|
|
533 case 0x1A:
|
|
534 if (commentToken)
|
|
535 {
|
|
536 p = end;
|
|
537 t->loc = startLoc;
|
|
538 t->value = TOKcomment;
|
|
539 return;
|
|
540 }
|
|
541 if (doDocComment && t->ptr[2] == '/')
|
|
542 getDocComment(t, lastLine == startLoc.linnum);
|
|
543 p = end;
|
|
544 t->loc = loc();
|
|
545 t->value = TOKeof;
|
|
546 return;
|
|
547
|
|
548 default:
|
|
549 if (c & 0x80)
|
|
550 { unsigned u = decodeUTF();
|
|
551 if (u == PS || u == LS)
|
|
552 break;
|
|
553 }
|
|
554 continue;
|
|
555 }
|
|
556 break;
|
|
557 }
|
|
558
|
|
559 if (commentToken)
|
|
560 {
|
|
561 p++;
|
|
562 endOfLine();
|
|
563 t->loc = startLoc;
|
|
564 t->value = TOKcomment;
|
|
565 return;
|
|
566 }
|
|
567 if (doDocComment && t->ptr[2] == '/')
|
|
568 getDocComment(t, lastLine == startLoc.linnum);
|
|
569
|
|
570 p++;
|
|
571 endOfLine();
|
|
572 continue;
|
|
573
|
|
574 case '+':
|
|
575 { int nest;
|
|
576
|
|
577 startLoc = loc();
|
|
578 p++;
|
|
579 nest = 1;
|
|
580 while (1)
|
|
581 { utf8_t c = *p;
|
|
582 switch (c)
|
|
583 {
|
|
584 case '/':
|
|
585 p++;
|
|
586 if (*p == '+')
|
|
587 {
|
|
588 p++;
|
|
589 nest++;
|
|
590 }
|
|
591 continue;
|
|
592
|
|
593 case '+':
|
|
594 p++;
|
|
595 if (*p == '/')
|
|
596 {
|
|
597 p++;
|
|
598 if (--nest == 0)
|
|
599 break;
|
|
600 }
|
|
601 continue;
|
|
602
|
|
603 case '\r':
|
|
604 p++;
|
|
605 if (*p != '\n')
|
|
606 endOfLine();
|
|
607 continue;
|
|
608
|
|
609 case '\n':
|
|
610 endOfLine();
|
|
611 p++;
|
|
612 continue;
|
|
613
|
|
614 case 0:
|
|
615 case 0x1A:
|
|
616 error("unterminated /+ +/ comment");
|
|
617 p = end;
|
|
618 t->loc = loc();
|
|
619 t->value = TOKeof;
|
|
620 return;
|
|
621
|
|
622 default:
|
|
623 if (c & 0x80)
|
|
624 { unsigned u = decodeUTF();
|
|
625 if (u == PS || u == LS)
|
|
626 endOfLine();
|
|
627 }
|
|
628 p++;
|
|
629 continue;
|
|
630 }
|
|
631 break;
|
|
632 }
|
|
633 if (commentToken)
|
|
634 {
|
|
635 t->loc = startLoc;
|
|
636 t->value = TOKcomment;
|
|
637 return;
|
|
638 }
|
|
639 if (doDocComment && t->ptr[2] == '+' && p - 4 != t->ptr)
|
|
640 { // if /++ but not /++/
|
|
641 getDocComment(t, lastLine == startLoc.linnum);
|
|
642 }
|
|
643 continue;
|
|
644 }
|
|
645 default:
|
|
646 break;
|
|
647 }
|
|
648 t->value = TOKdiv;
|
|
649 return;
|
|
650
|
|
651 case '.':
|
|
652 p++;
|
|
653 if (isdigit(*p))
|
|
654 { /* Note that we don't allow ._1 and ._ as being
|
|
655 * valid floating point numbers.
|
|
656 */
|
|
657 p--;
|
|
658 t->value = inreal(t);
|
|
659 }
|
|
660 else if (p[0] == '.')
|
|
661 {
|
|
662 if (p[1] == '.')
|
|
663 { p += 2;
|
|
664 t->value = TOKdotdotdot;
|
|
665 }
|
|
666 else
|
|
667 { p++;
|
|
668 t->value = TOKslice;
|
|
669 }
|
|
670 }
|
|
671 else
|
|
672 t->value = TOKdot;
|
|
673 return;
|
|
674
|
|
675 case '&':
|
|
676 p++;
|
|
677 if (*p == '=')
|
|
678 { p++;
|
|
679 t->value = TOKandass;
|
|
680 }
|
|
681 else if (*p == '&')
|
|
682 { p++;
|
|
683 t->value = TOKandand;
|
|
684 }
|
|
685 else
|
|
686 t->value = TOKand;
|
|
687 return;
|
|
688
|
|
689 case '|':
|
|
690 p++;
|
|
691 if (*p == '=')
|
|
692 { p++;
|
|
693 t->value = TOKorass;
|
|
694 }
|
|
695 else if (*p == '|')
|
|
696 { p++;
|
|
697 t->value = TOKoror;
|
|
698 }
|
|
699 else
|
|
700 t->value = TOKor;
|
|
701 return;
|
|
702
|
|
703 case '-':
|
|
704 p++;
|
|
705 if (*p == '=')
|
|
706 { p++;
|
|
707 t->value = TOKminass;
|
|
708 }
|
|
709 else if (*p == '-')
|
|
710 { p++;
|
|
711 t->value = TOKminusminus;
|
|
712 }
|
|
713 else
|
|
714 t->value = TOKmin;
|
|
715 return;
|
|
716
|
|
717 case '+':
|
|
718 p++;
|
|
719 if (*p == '=')
|
|
720 { p++;
|
|
721 t->value = TOKaddass;
|
|
722 }
|
|
723 else if (*p == '+')
|
|
724 { p++;
|
|
725 t->value = TOKplusplus;
|
|
726 }
|
|
727 else
|
|
728 t->value = TOKadd;
|
|
729 return;
|
|
730
|
|
731 case '<':
|
|
732 p++;
|
|
733 if (*p == '=')
|
|
734 { p++;
|
|
735 t->value = TOKle; // <=
|
|
736 }
|
|
737 else if (*p == '<')
|
|
738 { p++;
|
|
739 if (*p == '=')
|
|
740 { p++;
|
|
741 t->value = TOKshlass; // <<=
|
|
742 }
|
|
743 else
|
|
744 t->value = TOKshl; // <<
|
|
745 }
|
|
746 else if (*p == '>')
|
|
747 { p++;
|
|
748 if (*p == '=')
|
|
749 { p++;
|
|
750 t->value = TOKleg; // <>=
|
|
751 }
|
|
752 else
|
|
753 t->value = TOKlg; // <>
|
|
754 }
|
|
755 else
|
|
756 t->value = TOKlt; // <
|
|
757 return;
|
|
758
|
|
759 case '>':
|
|
760 p++;
|
|
761 if (*p == '=')
|
|
762 { p++;
|
|
763 t->value = TOKge; // >=
|
|
764 }
|
|
765 else if (*p == '>')
|
|
766 { p++;
|
|
767 if (*p == '=')
|
|
768 { p++;
|
|
769 t->value = TOKshrass; // >>=
|
|
770 }
|
|
771 else if (*p == '>')
|
|
772 { p++;
|
|
773 if (*p == '=')
|
|
774 { p++;
|
|
775 t->value = TOKushrass; // >>>=
|
|
776 }
|
|
777 else
|
|
778 t->value = TOKushr; // >>>
|
|
779 }
|
|
780 else
|
|
781 t->value = TOKshr; // >>
|
|
782 }
|
|
783 else
|
|
784 t->value = TOKgt; // >
|
|
785 return;
|
|
786
|
|
787 case '!':
|
|
788 p++;
|
|
789 if (*p == '=')
|
|
790 { p++;
|
|
791 t->value = TOKnotequal; // !=
|
|
792 }
|
|
793 else if (*p == '<')
|
|
794 { p++;
|
|
795 if (*p == '>')
|
|
796 { p++;
|
|
797 if (*p == '=')
|
|
798 { p++;
|
|
799 t->value = TOKunord; // !<>=
|
|
800 }
|
|
801 else
|
|
802 t->value = TOKue; // !<>
|
|
803 }
|
|
804 else if (*p == '=')
|
|
805 { p++;
|
|
806 t->value = TOKug; // !<=
|
|
807 }
|
|
808 else
|
|
809 t->value = TOKuge; // !<
|
|
810 }
|
|
811 else if (*p == '>')
|
|
812 { p++;
|
|
813 if (*p == '=')
|
|
814 { p++;
|
|
815 t->value = TOKul; // !>=
|
|
816 }
|
|
817 else
|
|
818 t->value = TOKule; // !>
|
|
819 }
|
|
820 else
|
|
821 t->value = TOKnot; // !
|
|
822 return;
|
|
823
|
|
824 case '=':
|
|
825 p++;
|
|
826 if (*p == '=')
|
|
827 { p++;
|
|
828 t->value = TOKequal; // ==
|
|
829 }
|
|
830 else if (*p == '>')
|
|
831 { p++;
|
|
832 t->value = TOKgoesto; // =>
|
|
833 }
|
|
834 else
|
|
835 t->value = TOKassign; // =
|
|
836 return;
|
|
837
|
|
838 case '~':
|
|
839 p++;
|
|
840 if (*p == '=')
|
|
841 { p++;
|
|
842 t->value = TOKcatass; // ~=
|
|
843 }
|
|
844 else
|
|
845 t->value = TOKtilde; // ~
|
|
846 return;
|
|
847
|
|
848 case '^':
|
|
849 p++;
|
|
850 if (*p == '^')
|
|
851 { p++;
|
|
852 if (*p == '=')
|
|
853 { p++;
|
|
854 t->value = TOKpowass; // ^^=
|
|
855 }
|
|
856 else
|
|
857 t->value = TOKpow; // ^^
|
|
858 }
|
|
859 else if (*p == '=')
|
|
860 { p++;
|
|
861 t->value = TOKxorass; // ^=
|
|
862 }
|
|
863 else
|
|
864 t->value = TOKxor; // ^
|
|
865 return;
|
|
866
|
|
867 case '(': p++; t->value = TOKlparen; return;
|
|
868 case ')': p++; t->value = TOKrparen; return;
|
|
869 case '[': p++; t->value = TOKlbracket; return;
|
|
870 case ']': p++; t->value = TOKrbracket; return;
|
|
871 case '{': p++; t->value = TOKlcurly; return;
|
|
872 case '}': p++; t->value = TOKrcurly; return;
|
|
873 case '?': p++; t->value = TOKquestion; return;
|
|
874 case ',': p++; t->value = TOKcomma; return;
|
|
875 case ';': p++; t->value = TOKsemicolon; return;
|
|
876 case ':': p++; t->value = TOKcolon; return;
|
|
877 case '$': p++; t->value = TOKdollar; return;
|
|
878 case '@': p++; t->value = TOKat; return;
|
|
879
|
|
880 case '*':
|
|
881 p++;
|
|
882 if (*p == '=')
|
|
883 { p++;
|
|
884 t->value = TOKmulass;
|
|
885 }
|
|
886 else
|
|
887 t->value = TOKmul;
|
|
888 return;
|
|
889 case '%':
|
|
890 p++;
|
|
891 if (*p == '=')
|
|
892 { p++;
|
|
893 t->value = TOKmodass;
|
|
894 }
|
|
895 else
|
|
896 t->value = TOKmod;
|
|
897 return;
|
|
898
|
|
899 case '#':
|
|
900 {
|
|
901 p++;
|
|
902 Token n;
|
|
903 scan(&n);
|
|
904 if (n.value == TOKidentifier)
|
|
905 {
|
|
906 if (n.ident == Id::line)
|
|
907 {
|
|
908 poundLine();
|
|
909 continue;
|
|
910 }
|
|
911 else
|
|
912 {
|
|
913 const Loc locx = loc();
|
|
914 warning(locx, "C preprocessor directive `#%s` is not supported", n.ident->toChars());
|
|
915 }
|
|
916 }
|
|
917 else if (n.value == TOKif)
|
|
918 {
|
|
919 error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
|
|
920 }
|
|
921 t->value = TOKpound;
|
|
922 return;
|
|
923 }
|
|
924
|
|
925 default:
|
|
926 { unsigned c = *p;
|
|
927
|
|
928 if (c & 0x80)
|
|
929 { c = decodeUTF();
|
|
930
|
|
931 // Check for start of unicode identifier
|
|
932 if (isUniAlpha(c))
|
|
933 goto case_ident;
|
|
934
|
|
935 if (c == PS || c == LS)
|
|
936 {
|
|
937 endOfLine();
|
|
938 p++;
|
|
939 continue;
|
|
940 }
|
|
941 }
|
|
942 if (c < 0x80 && isprint(c))
|
|
943 error("character '%c' is not a valid token", c);
|
|
944 else
|
|
945 error("character 0x%02x is not a valid token", c);
|
|
946 p++;
|
|
947 continue;
|
|
948 }
|
|
949 }
|
|
950 }
|
|
951 }
|
|
952
|
|
953 /*******************************************
|
|
954 * Parse escape sequence.
|
|
955 */
|
|
956
|
|
957 unsigned Lexer::escapeSequence()
|
|
958 { unsigned c = *p;
|
|
959
|
|
960 int n;
|
|
961 int ndigits;
|
|
962
|
|
963 switch (c)
|
|
964 {
|
|
965 case '\'':
|
|
966 case '"':
|
|
967 case '?':
|
|
968 case '\\':
|
|
969 Lconsume:
|
|
970 p++;
|
|
971 break;
|
|
972
|
|
973 case 'a': c = 7; goto Lconsume;
|
|
974 case 'b': c = 8; goto Lconsume;
|
|
975 case 'f': c = 12; goto Lconsume;
|
|
976 case 'n': c = 10; goto Lconsume;
|
|
977 case 'r': c = 13; goto Lconsume;
|
|
978 case 't': c = 9; goto Lconsume;
|
|
979 case 'v': c = 11; goto Lconsume;
|
|
980
|
|
981 case 'u':
|
|
982 ndigits = 4;
|
|
983 goto Lhex;
|
|
984 case 'U':
|
|
985 ndigits = 8;
|
|
986 goto Lhex;
|
|
987 case 'x':
|
|
988 ndigits = 2;
|
|
989 Lhex:
|
|
990 p++;
|
|
991 c = *p;
|
|
992 if (ishex((utf8_t)c))
|
|
993 { unsigned v;
|
|
994
|
|
995 n = 0;
|
|
996 v = 0;
|
|
997 while (1)
|
|
998 {
|
|
999 if (isdigit((utf8_t)c))
|
|
1000 c -= '0';
|
|
1001 else if (islower(c))
|
|
1002 c -= 'a' - 10;
|
|
1003 else
|
|
1004 c -= 'A' - 10;
|
|
1005 v = v * 16 + c;
|
|
1006 c = *++p;
|
|
1007 if (++n == ndigits)
|
|
1008 break;
|
|
1009 if (!ishex((utf8_t)c))
|
|
1010 { error("escape hex sequence has %d hex digits instead of %d", n, ndigits);
|
|
1011 break;
|
|
1012 }
|
|
1013 }
|
|
1014 if (ndigits != 2 && !utf_isValidDchar(v))
|
|
1015 { error("invalid UTF character \\U%08x", v);
|
|
1016 v = '?'; // recover with valid UTF character
|
|
1017 }
|
|
1018 c = v;
|
|
1019 }
|
|
1020 else
|
|
1021 error("undefined escape hex sequence \\%c",c);
|
|
1022 break;
|
|
1023
|
|
1024 case '&': // named character entity
|
|
1025 for (const utf8_t *idstart = ++p; 1; p++)
|
|
1026 {
|
|
1027 switch (*p)
|
|
1028 {
|
|
1029 case ';':
|
|
1030 c = HtmlNamedEntity(idstart, p - idstart);
|
|
1031 if (c == ~0U)
|
|
1032 { error("unnamed character entity &%.*s;", (int)(p - idstart), idstart);
|
|
1033 c = ' ';
|
|
1034 }
|
|
1035 p++;
|
|
1036 break;
|
|
1037
|
|
1038 default:
|
|
1039 if (isalpha(*p) ||
|
|
1040 (p != idstart && isdigit(*p)))
|
|
1041 continue;
|
|
1042 error("unterminated named entity &%.*s;", (int)(p - idstart + 1), idstart);
|
|
1043 break;
|
|
1044 }
|
|
1045 break;
|
|
1046 }
|
|
1047 break;
|
|
1048
|
|
1049 case 0:
|
|
1050 case 0x1A: // end of file
|
|
1051 c = '\\';
|
|
1052 break;
|
|
1053
|
|
1054 default:
|
|
1055 if (isoctal((utf8_t)c))
|
|
1056 { unsigned v;
|
|
1057
|
|
1058 n = 0;
|
|
1059 v = 0;
|
|
1060 do
|
|
1061 {
|
|
1062 v = v * 8 + (c - '0');
|
|
1063 c = *++p;
|
|
1064 } while (++n < 3 && isoctal((utf8_t)c));
|
|
1065 c = v;
|
|
1066 if (c > 0xFF)
|
|
1067 error("escape octal sequence \\%03o is larger than \\377", c);
|
|
1068 }
|
|
1069 else
|
|
1070 error("undefined escape sequence \\%c",c);
|
|
1071 break;
|
|
1072 }
|
|
1073 return c;
|
|
1074 }
|
|
1075
|
|
1076 /**************************************
|
|
1077 */
|
|
1078
|
|
1079 TOK Lexer::wysiwygStringConstant(Token *t, int tc)
|
|
1080 {
|
|
1081 int c;
|
|
1082 Loc start = loc();
|
|
1083
|
|
1084 p++;
|
|
1085 stringbuffer.reset();
|
|
1086 while (1)
|
|
1087 {
|
|
1088 c = *p++;
|
|
1089 switch (c)
|
|
1090 {
|
|
1091 case '\n':
|
|
1092 endOfLine();
|
|
1093 break;
|
|
1094
|
|
1095 case '\r':
|
|
1096 if (*p == '\n')
|
|
1097 continue; // ignore
|
|
1098 c = '\n'; // treat EndOfLine as \n character
|
|
1099 endOfLine();
|
|
1100 break;
|
|
1101
|
|
1102 case 0:
|
|
1103 case 0x1A:
|
|
1104 error("unterminated string constant starting at %s", start.toChars());
|
|
1105 t->ustring = (utf8_t *)const_cast<char *>("");
|
|
1106 t->len = 0;
|
|
1107 t->postfix = 0;
|
|
1108 return TOKstring;
|
|
1109
|
|
1110 case '"':
|
|
1111 case '`':
|
|
1112 if (c == tc)
|
|
1113 {
|
|
1114 t->len = (unsigned)stringbuffer.offset;
|
|
1115 stringbuffer.writeByte(0);
|
|
1116 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
|
|
1117 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
|
|
1118 stringPostfix(t);
|
|
1119 return TOKstring;
|
|
1120 }
|
|
1121 break;
|
|
1122
|
|
1123 default:
|
|
1124 if (c & 0x80)
|
|
1125 { p--;
|
|
1126 unsigned u = decodeUTF();
|
|
1127 p++;
|
|
1128 if (u == PS || u == LS)
|
|
1129 endOfLine();
|
|
1130 stringbuffer.writeUTF8(u);
|
|
1131 continue;
|
|
1132 }
|
|
1133 break;
|
|
1134 }
|
|
1135 stringbuffer.writeByte(c);
|
|
1136 }
|
|
1137 }
|
|
1138
|
|
1139 /**************************************
|
|
1140 * Lex hex strings:
|
|
1141 * x"0A ae 34FE BD"
|
|
1142 */
|
|
1143
|
|
1144 TOK Lexer::hexStringConstant(Token *t)
|
|
1145 {
|
|
1146 unsigned c;
|
|
1147 Loc start = loc();
|
|
1148 unsigned n = 0;
|
|
1149 unsigned v = ~0; // dead assignment, needed to suppress warning
|
|
1150
|
|
1151 p++;
|
|
1152 stringbuffer.reset();
|
|
1153 while (1)
|
|
1154 {
|
|
1155 c = *p++;
|
|
1156 switch (c)
|
|
1157 {
|
|
1158 case ' ':
|
|
1159 case '\t':
|
|
1160 case '\v':
|
|
1161 case '\f':
|
|
1162 continue; // skip white space
|
|
1163
|
|
1164 case '\r':
|
|
1165 if (*p == '\n')
|
|
1166 continue; // ignore
|
|
1167 // Treat isolated '\r' as if it were a '\n'
|
|
1168 /* fall through */
|
|
1169 case '\n':
|
|
1170 endOfLine();
|
|
1171 continue;
|
|
1172
|
|
1173 case 0:
|
|
1174 case 0x1A:
|
|
1175 error("unterminated string constant starting at %s", start.toChars());
|
|
1176 t->ustring = (utf8_t *)const_cast<char *>("");
|
|
1177 t->len = 0;
|
|
1178 t->postfix = 0;
|
|
1179 return TOKxstring;
|
|
1180
|
|
1181 case '"':
|
|
1182 if (n & 1)
|
|
1183 { error("odd number (%d) of hex characters in hex string", n);
|
|
1184 stringbuffer.writeByte(v);
|
|
1185 }
|
|
1186 t->len = (unsigned)stringbuffer.offset;
|
|
1187 stringbuffer.writeByte(0);
|
|
1188 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
|
|
1189 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
|
|
1190 stringPostfix(t);
|
|
1191 return TOKxstring;
|
|
1192
|
|
1193 default:
|
|
1194 if (c >= '0' && c <= '9')
|
|
1195 c -= '0';
|
|
1196 else if (c >= 'a' && c <= 'f')
|
|
1197 c -= 'a' - 10;
|
|
1198 else if (c >= 'A' && c <= 'F')
|
|
1199 c -= 'A' - 10;
|
|
1200 else if (c & 0x80)
|
|
1201 { p--;
|
|
1202 unsigned u = decodeUTF();
|
|
1203 p++;
|
|
1204 if (u == PS || u == LS)
|
|
1205 endOfLine();
|
|
1206 else
|
|
1207 error("non-hex character \\u%04x in hex string", u);
|
|
1208 }
|
|
1209 else
|
|
1210 error("non-hex character '%c' in hex string", c);
|
|
1211 if (n & 1)
|
|
1212 { v = (v << 4) | c;
|
|
1213 stringbuffer.writeByte(v);
|
|
1214 }
|
|
1215 else
|
|
1216 v = c;
|
|
1217 n++;
|
|
1218 break;
|
|
1219 }
|
|
1220 }
|
|
1221 }
|
|
1222
|
|
1223
|
|
1224 /**************************************
|
|
1225 * Lex delimited strings:
|
|
1226 * q"(foo(xxx))" // "foo(xxx)"
|
|
1227 * q"[foo(]" // "foo("
|
|
1228 * q"/foo]/" // "foo]"
|
|
1229 * q"HERE
|
|
1230 * foo
|
|
1231 * HERE" // "foo\n"
|
|
1232 * Input:
|
|
1233 * p is on the "
|
|
1234 */
|
|
1235
|
|
1236 TOK Lexer::delimitedStringConstant(Token *t)
|
|
1237 {
|
|
1238 unsigned c;
|
|
1239 Loc start = loc();
|
|
1240 unsigned delimleft = 0;
|
|
1241 unsigned delimright = 0;
|
|
1242 unsigned nest = 1;
|
|
1243 unsigned nestcount = ~0; // dead assignment, needed to suppress warning
|
|
1244 Identifier *hereid = NULL;
|
|
1245 unsigned blankrol = 0;
|
|
1246 unsigned startline = 0;
|
|
1247
|
|
1248 p++;
|
|
1249 stringbuffer.reset();
|
|
1250 while (1)
|
|
1251 {
|
|
1252 c = *p++;
|
|
1253 //printf("c = '%c'\n", c);
|
|
1254 switch (c)
|
|
1255 {
|
|
1256 case '\n':
|
|
1257 Lnextline:
|
|
1258 endOfLine();
|
|
1259 startline = 1;
|
|
1260 if (blankrol)
|
|
1261 { blankrol = 0;
|
|
1262 continue;
|
|
1263 }
|
|
1264 if (hereid)
|
|
1265 {
|
|
1266 stringbuffer.writeUTF8(c);
|
|
1267 continue;
|
|
1268 }
|
|
1269 break;
|
|
1270
|
|
1271 case '\r':
|
|
1272 if (*p == '\n')
|
|
1273 continue; // ignore
|
|
1274 c = '\n'; // treat EndOfLine as \n character
|
|
1275 goto Lnextline;
|
|
1276
|
|
1277 case 0:
|
|
1278 case 0x1A:
|
|
1279 error("unterminated delimited string constant starting at %s", start.toChars());
|
|
1280 t->ustring = (utf8_t *)const_cast<char *>("");
|
|
1281 t->len = 0;
|
|
1282 t->postfix = 0;
|
|
1283 return TOKstring;
|
|
1284
|
|
1285 default:
|
|
1286 if (c & 0x80)
|
|
1287 { p--;
|
|
1288 c = decodeUTF();
|
|
1289 p++;
|
|
1290 if (c == PS || c == LS)
|
|
1291 goto Lnextline;
|
|
1292 }
|
|
1293 break;
|
|
1294 }
|
|
1295 if (delimleft == 0)
|
|
1296 { delimleft = c;
|
|
1297 nest = 1;
|
|
1298 nestcount = 1;
|
|
1299 if (c == '(')
|
|
1300 delimright = ')';
|
|
1301 else if (c == '{')
|
|
1302 delimright = '}';
|
|
1303 else if (c == '[')
|
|
1304 delimright = ']';
|
|
1305 else if (c == '<')
|
|
1306 delimright = '>';
|
|
1307 else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
|
|
1308 { // Start of identifier; must be a heredoc
|
|
1309 Token tok;
|
|
1310 p--;
|
|
1311 scan(&tok); // read in heredoc identifier
|
|
1312 if (tok.value != TOKidentifier)
|
|
1313 { error("identifier expected for heredoc, not %s", tok.toChars());
|
|
1314 delimright = c;
|
|
1315 }
|
|
1316 else
|
|
1317 { hereid = tok.ident;
|
|
1318 //printf("hereid = '%s'\n", hereid->toChars());
|
|
1319 blankrol = 1;
|
|
1320 }
|
|
1321 nest = 0;
|
|
1322 }
|
|
1323 else
|
|
1324 { delimright = c;
|
|
1325 nest = 0;
|
|
1326 if (isspace(c))
|
|
1327 error("delimiter cannot be whitespace");
|
|
1328 }
|
|
1329 }
|
|
1330 else
|
|
1331 {
|
|
1332 if (blankrol)
|
|
1333 { error("heredoc rest of line should be blank");
|
|
1334 blankrol = 0;
|
|
1335 continue;
|
|
1336 }
|
|
1337 if (nest == 1)
|
|
1338 {
|
|
1339 if (c == delimleft)
|
|
1340 nestcount++;
|
|
1341 else if (c == delimright)
|
|
1342 { nestcount--;
|
|
1343 if (nestcount == 0)
|
|
1344 goto Ldone;
|
|
1345 }
|
|
1346 }
|
|
1347 else if (c == delimright)
|
|
1348 goto Ldone;
|
|
1349 if (startline && isalpha(c) && hereid)
|
|
1350 { Token tok;
|
|
1351 const utf8_t *psave = p;
|
|
1352 p--;
|
|
1353 scan(&tok); // read in possible heredoc identifier
|
|
1354 //printf("endid = '%s'\n", tok.ident->toChars());
|
|
1355 if (tok.value == TOKidentifier && tok.ident->equals(hereid))
|
|
1356 { /* should check that rest of line is blank
|
|
1357 */
|
|
1358 goto Ldone;
|
|
1359 }
|
|
1360 p = psave;
|
|
1361 }
|
|
1362 stringbuffer.writeUTF8(c);
|
|
1363 startline = 0;
|
|
1364 }
|
|
1365 }
|
|
1366
|
|
1367 Ldone:
|
|
1368 if (*p == '"')
|
|
1369 p++;
|
|
1370 else if (hereid)
|
|
1371 error("delimited string must end in %s\"", hereid->toChars());
|
|
1372 else
|
|
1373 error("delimited string must end in %c\"", delimright);
|
|
1374 t->len = (unsigned)stringbuffer.offset;
|
|
1375 stringbuffer.writeByte(0);
|
|
1376 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
|
|
1377 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
|
|
1378 stringPostfix(t);
|
|
1379 return TOKstring;
|
|
1380 }
|
|
1381
|
|
1382 /**************************************
|
|
1383 * Lex delimited strings:
|
|
1384 * q{ foo(xxx) } // " foo(xxx) "
|
|
1385 * q{foo(} // "foo("
|
|
1386 * q{{foo}"}"} // "{foo}"}""
|
|
1387 * Input:
|
|
1388 * p is on the q
|
|
1389 */
|
|
1390
|
|
1391 TOK Lexer::tokenStringConstant(Token *t)
|
|
1392 {
|
|
1393 unsigned nest = 1;
|
|
1394 Loc start = loc();
|
|
1395 const utf8_t *pstart = ++p;
|
|
1396
|
|
1397 while (1)
|
|
1398 { Token tok;
|
|
1399
|
|
1400 scan(&tok);
|
|
1401 switch (tok.value)
|
|
1402 {
|
|
1403 case TOKlcurly:
|
|
1404 nest++;
|
|
1405 continue;
|
|
1406
|
|
1407 case TOKrcurly:
|
|
1408 if (--nest == 0)
|
|
1409 {
|
|
1410 t->len = (unsigned)(p - 1 - pstart);
|
|
1411 t->ustring = (utf8_t *)mem.xmalloc(t->len + 1);
|
|
1412 memcpy(t->ustring, pstart, t->len);
|
|
1413 t->ustring[t->len] = 0;
|
|
1414 stringPostfix(t);
|
|
1415 return TOKstring;
|
|
1416 }
|
|
1417 continue;
|
|
1418
|
|
1419 case TOKeof:
|
|
1420 error("unterminated token string constant starting at %s", start.toChars());
|
|
1421 t->ustring = (utf8_t *)const_cast<char *>("");
|
|
1422 t->len = 0;
|
|
1423 t->postfix = 0;
|
|
1424 return TOKstring;
|
|
1425
|
|
1426 default:
|
|
1427 continue;
|
|
1428 }
|
|
1429 }
|
|
1430 }
|
|
1431
|
|
1432
|
|
1433
|
|
1434 /**************************************
|
|
1435 */
|
|
1436
|
|
1437 TOK Lexer::escapeStringConstant(Token *t)
|
|
1438 {
|
|
1439 unsigned c;
|
|
1440 Loc start = loc();
|
|
1441
|
|
1442 p++;
|
|
1443 stringbuffer.reset();
|
|
1444 while (1)
|
|
1445 {
|
|
1446 c = *p++;
|
|
1447 switch (c)
|
|
1448 {
|
|
1449 case '\\':
|
|
1450 switch (*p)
|
|
1451 {
|
|
1452 case 'u':
|
|
1453 case 'U':
|
|
1454 case '&':
|
|
1455 c = escapeSequence();
|
|
1456 stringbuffer.writeUTF8(c);
|
|
1457 continue;
|
|
1458
|
|
1459 default:
|
|
1460 c = escapeSequence();
|
|
1461 break;
|
|
1462 }
|
|
1463 break;
|
|
1464 case '\n':
|
|
1465 endOfLine();
|
|
1466 break;
|
|
1467
|
|
1468 case '\r':
|
|
1469 if (*p == '\n')
|
|
1470 continue; // ignore
|
|
1471 c = '\n'; // treat EndOfLine as \n character
|
|
1472 endOfLine();
|
|
1473 break;
|
|
1474
|
|
1475 case '"':
|
|
1476 t->len = (unsigned)stringbuffer.offset;
|
|
1477 stringbuffer.writeByte(0);
|
|
1478 t->ustring = (utf8_t *)mem.xmalloc(stringbuffer.offset);
|
|
1479 memcpy(t->ustring, stringbuffer.data, stringbuffer.offset);
|
|
1480 stringPostfix(t);
|
|
1481 return TOKstring;
|
|
1482
|
|
1483 case 0:
|
|
1484 case 0x1A:
|
|
1485 p--;
|
|
1486 error("unterminated string constant starting at %s", start.toChars());
|
|
1487 t->ustring = (utf8_t *)const_cast<char *>("");
|
|
1488 t->len = 0;
|
|
1489 t->postfix = 0;
|
|
1490 return TOKstring;
|
|
1491
|
|
1492 default:
|
|
1493 if (c & 0x80)
|
|
1494 {
|
|
1495 p--;
|
|
1496 c = decodeUTF();
|
|
1497 if (c == LS || c == PS)
|
|
1498 { c = '\n';
|
|
1499 endOfLine();
|
|
1500 }
|
|
1501 p++;
|
|
1502 stringbuffer.writeUTF8(c);
|
|
1503 continue;
|
|
1504 }
|
|
1505 break;
|
|
1506 }
|
|
1507 stringbuffer.writeByte(c);
|
|
1508 }
|
|
1509 }
|
|
1510
|
|
1511 /**************************************
|
|
1512 */
|
|
1513
|
|
1514 TOK Lexer::charConstant(Token *t)
|
|
1515 {
|
|
1516 unsigned c;
|
|
1517 TOK tk = TOKcharv;
|
|
1518
|
|
1519 //printf("Lexer::charConstant\n");
|
|
1520 p++;
|
|
1521 c = *p++;
|
|
1522 switch (c)
|
|
1523 {
|
|
1524 case '\\':
|
|
1525 switch (*p)
|
|
1526 {
|
|
1527 case 'u':
|
|
1528 t->uns64value = escapeSequence();
|
|
1529 tk = TOKwcharv;
|
|
1530 break;
|
|
1531
|
|
1532 case 'U':
|
|
1533 case '&':
|
|
1534 t->uns64value = escapeSequence();
|
|
1535 tk = TOKdcharv;
|
|
1536 break;
|
|
1537
|
|
1538 default:
|
|
1539 t->uns64value = escapeSequence();
|
|
1540 break;
|
|
1541 }
|
|
1542 break;
|
|
1543 case '\n':
|
|
1544 L1:
|
|
1545 endOfLine();
|
|
1546 /* fall through */
|
|
1547 case '\r':
|
|
1548 case 0:
|
|
1549 case 0x1A:
|
|
1550 case '\'':
|
|
1551 error("unterminated character constant");
|
|
1552 t->uns64value = '?';
|
|
1553 return tk;
|
|
1554
|
|
1555 default:
|
|
1556 if (c & 0x80)
|
|
1557 {
|
|
1558 p--;
|
|
1559 c = decodeUTF();
|
|
1560 p++;
|
|
1561 if (c == LS || c == PS)
|
|
1562 goto L1;
|
|
1563 if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
|
|
1564 tk = TOKwcharv;
|
|
1565 else
|
|
1566 tk = TOKdcharv;
|
|
1567 }
|
|
1568 t->uns64value = c;
|
|
1569 break;
|
|
1570 }
|
|
1571
|
|
1572 if (*p != '\'')
|
|
1573 {
|
|
1574 error("unterminated character constant");
|
|
1575 t->uns64value = '?';
|
|
1576 return tk;
|
|
1577 }
|
|
1578 p++;
|
|
1579 return tk;
|
|
1580 }
|
|
1581
|
|
1582 /***************************************
|
|
1583 * Get postfix of string literal.
|
|
1584 */
|
|
1585
|
|
1586 void Lexer::stringPostfix(Token *t)
|
|
1587 {
|
|
1588 switch (*p)
|
|
1589 {
|
|
1590 case 'c':
|
|
1591 case 'w':
|
|
1592 case 'd':
|
|
1593 t->postfix = *p;
|
|
1594 p++;
|
|
1595 break;
|
|
1596
|
|
1597 default:
|
|
1598 t->postfix = 0;
|
|
1599 break;
|
|
1600 }
|
|
1601 }
|
|
1602
|
|
1603 /**************************************
|
|
1604 * Read in a number.
|
|
1605 * If it's an integer, store it in tok.TKutok.Vlong.
|
|
1606 * integers can be decimal, octal or hex
|
|
1607 * Handle the suffixes U, UL, LU, L, etc.
|
|
1608 * If it's double, store it in tok.TKutok.Vdouble.
|
|
1609 * Returns:
|
|
1610 * TKnum
|
|
1611 * TKdouble,...
|
|
1612 */
|
|
1613
|
|
1614 TOK Lexer::number(Token *t)
|
|
1615 {
|
|
1616 int base = 10;
|
|
1617 const utf8_t *start = p;
|
|
1618 unsigned c;
|
|
1619 uinteger_t n = 0; // unsigned >=64 bit integer type
|
|
1620 int d;
|
|
1621 bool err = false;
|
|
1622 bool overflow = false;
|
|
1623
|
|
1624 c = *p;
|
|
1625 if (c == '0')
|
|
1626 {
|
|
1627 ++p;
|
|
1628 c = *p;
|
|
1629 switch (c)
|
|
1630 {
|
|
1631 case '0': case '1': case '2': case '3':
|
|
1632 case '4': case '5': case '6': case '7':
|
|
1633 n = c - '0';
|
|
1634 ++p;
|
|
1635 base = 8;
|
|
1636 break;
|
|
1637
|
|
1638 case 'x':
|
|
1639 case 'X':
|
|
1640 ++p;
|
|
1641 base = 16;
|
|
1642 break;
|
|
1643
|
|
1644 case 'b':
|
|
1645 case 'B':
|
|
1646 ++p;
|
|
1647 base = 2;
|
|
1648 break;
|
|
1649
|
|
1650 case '.':
|
|
1651 if (p[1] == '.')
|
|
1652 goto Ldone; // if ".."
|
|
1653 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
|
|
1654 goto Ldone; // if ".identifier" or ".unicode"
|
|
1655 goto Lreal; // '.' is part of current token
|
|
1656
|
|
1657 case 'i':
|
|
1658 case 'f':
|
|
1659 case 'F':
|
|
1660 goto Lreal;
|
|
1661
|
|
1662 case '_':
|
|
1663 ++p;
|
|
1664 base = 8;
|
|
1665 break;
|
|
1666
|
|
1667 case 'L':
|
|
1668 if (p[1] == 'i')
|
|
1669 goto Lreal;
|
|
1670 break;
|
|
1671
|
|
1672 default:
|
|
1673 break;
|
|
1674 }
|
|
1675 }
|
|
1676
|
|
1677 while (1)
|
|
1678 {
|
|
1679 c = *p;
|
|
1680 switch (c)
|
|
1681 {
|
|
1682 case '0': case '1':
|
|
1683 ++p;
|
|
1684 d = c - '0';
|
|
1685 break;
|
|
1686
|
|
1687 case '2': case '3':
|
|
1688 case '4': case '5': case '6': case '7':
|
|
1689 if (base == 2 && !err)
|
|
1690 {
|
|
1691 error("binary digit expected");
|
|
1692 err = true;
|
|
1693 }
|
|
1694 ++p;
|
|
1695 d = c - '0';
|
|
1696 break;
|
|
1697
|
|
1698 case '8': case '9':
|
|
1699 ++p;
|
|
1700 if (base < 10 && !err)
|
|
1701 {
|
|
1702 error("radix %d digit expected, not '%c'", base, c);
|
|
1703 err = true;
|
|
1704 }
|
|
1705 d = c - '0';
|
|
1706 break;
|
|
1707
|
|
1708 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
|
|
1709 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
|
|
1710 ++p;
|
|
1711 if (base != 16)
|
|
1712 {
|
|
1713 if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
|
|
1714 goto Lreal;
|
|
1715 if (!err)
|
|
1716 {
|
|
1717 error("radix %d digit expected, not '%c'", base, c);
|
|
1718 err = true;
|
|
1719 }
|
|
1720 }
|
|
1721 if (c >= 'a')
|
|
1722 d = c + 10 - 'a';
|
|
1723 else
|
|
1724 d = c + 10 - 'A';
|
|
1725 break;
|
|
1726
|
|
1727 case 'L':
|
|
1728 if (p[1] == 'i')
|
|
1729 goto Lreal;
|
|
1730 goto Ldone;
|
|
1731
|
|
1732 case '.':
|
|
1733 if (p[1] == '.')
|
|
1734 goto Ldone; // if ".."
|
|
1735 if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
|
|
1736 goto Ldone; // if ".identifier" or ".unicode"
|
|
1737 goto Lreal; // otherwise as part of a floating point literal
|
|
1738
|
|
1739 case 'p':
|
|
1740 case 'P':
|
|
1741 case 'i':
|
|
1742 Lreal:
|
|
1743 p = start;
|
|
1744 return inreal(t);
|
|
1745
|
|
1746 case '_':
|
|
1747 ++p;
|
|
1748 continue;
|
|
1749
|
|
1750 default:
|
|
1751 goto Ldone;
|
|
1752 }
|
|
1753
|
|
1754 uinteger_t n2 = n * base;
|
|
1755 if ((n2 / base != n || n2 + d < n))
|
|
1756 {
|
|
1757 overflow = true;
|
|
1758 }
|
|
1759 n = n2 + d;
|
|
1760
|
|
1761 // if n needs more than 64 bits
|
|
1762 if (sizeof(n) > 8 &&
|
|
1763 n > 0xFFFFFFFFFFFFFFFFULL)
|
|
1764 {
|
|
1765 overflow = true;
|
|
1766 }
|
|
1767 }
|
|
1768
|
|
1769 Ldone:
|
|
1770
|
|
1771 if (overflow && !err)
|
|
1772 {
|
|
1773 error("integer overflow");
|
|
1774 err = true;
|
|
1775 }
|
|
1776
|
|
1777 enum FLAGS
|
|
1778 {
|
|
1779 FLAGS_none = 0,
|
|
1780 FLAGS_decimal = 1, // decimal
|
|
1781 FLAGS_unsigned = 2, // u or U suffix
|
|
1782 FLAGS_long = 4, // L suffix
|
|
1783 };
|
|
1784
|
|
1785 unsigned flags = (base == 10) ? FLAGS_decimal : FLAGS_none;
|
|
1786
|
|
1787 // Parse trailing 'u', 'U', 'l' or 'L' in any combination
|
|
1788 const utf8_t *psuffix = p;
|
|
1789 while (1)
|
|
1790 {
|
|
1791 utf8_t f;
|
|
1792 switch (*p)
|
|
1793 {
|
|
1794 case 'U':
|
|
1795 case 'u':
|
|
1796 f = FLAGS_unsigned;
|
|
1797 goto L1;
|
|
1798
|
|
1799 case 'l':
|
|
1800 f = FLAGS_long;
|
|
1801 error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
|
|
1802 goto L1;
|
|
1803
|
|
1804 case 'L':
|
|
1805 f = FLAGS_long;
|
|
1806 L1:
|
|
1807 p++;
|
|
1808 if ((flags & f) && !err)
|
|
1809 {
|
|
1810 error("unrecognized token");
|
|
1811 err = true;
|
|
1812 }
|
|
1813 flags = (FLAGS) (flags | f);
|
|
1814 continue;
|
|
1815 default:
|
|
1816 break;
|
|
1817 }
|
|
1818 break;
|
|
1819 }
|
|
1820
|
|
1821 if (base == 8 && n >= 8)
|
|
1822 error("octal literals 0%llo%.*s are no longer supported, use std.conv.octal!%llo%.*s instead",
|
|
1823 n, p - psuffix, psuffix, n, p - psuffix, psuffix);
|
|
1824
|
|
1825 TOK result;
|
|
1826 switch (flags)
|
|
1827 {
|
|
1828 case FLAGS_none:
|
|
1829 /* Octal or Hexadecimal constant.
|
|
1830 * First that fits: int, uint, long, ulong
|
|
1831 */
|
|
1832 if (n & 0x8000000000000000LL)
|
|
1833 result = TOKuns64v;
|
|
1834 else if (n & 0xFFFFFFFF00000000LL)
|
|
1835 result = TOKint64v;
|
|
1836 else if (n & 0x80000000)
|
|
1837 result = TOKuns32v;
|
|
1838 else
|
|
1839 result = TOKint32v;
|
|
1840 break;
|
|
1841
|
|
1842 case FLAGS_decimal:
|
|
1843 /* First that fits: int, long, long long
|
|
1844 */
|
|
1845 if (n & 0x8000000000000000LL)
|
|
1846 {
|
|
1847 if (!err)
|
|
1848 {
|
|
1849 error("signed integer overflow");
|
|
1850 err = true;
|
|
1851 }
|
|
1852 result = TOKuns64v;
|
|
1853 }
|
|
1854 else if (n & 0xFFFFFFFF80000000LL)
|
|
1855 result = TOKint64v;
|
|
1856 else
|
|
1857 result = TOKint32v;
|
|
1858 break;
|
|
1859
|
|
1860 case FLAGS_unsigned:
|
|
1861 case FLAGS_decimal | FLAGS_unsigned:
|
|
1862 /* First that fits: uint, ulong
|
|
1863 */
|
|
1864 if (n & 0xFFFFFFFF00000000LL)
|
|
1865 result = TOKuns64v;
|
|
1866 else
|
|
1867 result = TOKuns32v;
|
|
1868 break;
|
|
1869
|
|
1870 case FLAGS_decimal | FLAGS_long:
|
|
1871 if (n & 0x8000000000000000LL)
|
|
1872 {
|
|
1873 if (!err)
|
|
1874 {
|
|
1875 error("signed integer overflow");
|
|
1876 err = true;
|
|
1877 }
|
|
1878 result = TOKuns64v;
|
|
1879 }
|
|
1880 else
|
|
1881 result = TOKint64v;
|
|
1882 break;
|
|
1883
|
|
1884 case FLAGS_long:
|
|
1885 if (n & 0x8000000000000000LL)
|
|
1886 result = TOKuns64v;
|
|
1887 else
|
|
1888 result = TOKint64v;
|
|
1889 break;
|
|
1890
|
|
1891 case FLAGS_unsigned | FLAGS_long:
|
|
1892 case FLAGS_decimal | FLAGS_unsigned | FLAGS_long:
|
|
1893 result = TOKuns64v;
|
|
1894 break;
|
|
1895
|
|
1896 default:
|
|
1897 assert(0);
|
|
1898 }
|
|
1899 t->uns64value = n;
|
|
1900 return result;
|
|
1901 }
|
|
1902
|
|
1903 /**************************************
|
|
1904 * Read in characters, converting them to real.
|
|
1905 * Bugs:
|
|
1906 * Exponent overflow not detected.
|
|
1907 * Too much requested precision is not detected.
|
|
1908 */
|
|
1909
|
|
1910 TOK Lexer::inreal(Token *t)
|
|
1911 {
|
|
1912 //printf("Lexer::inreal()\n");
|
|
1913 bool isWellformedString = true;
|
|
1914 stringbuffer.reset();
|
|
1915 const utf8_t *pstart = p;
|
|
1916 char hex = 0;
|
|
1917 unsigned c = *p++;
|
|
1918
|
|
1919 // Leading '0x'
|
|
1920 if (c == '0')
|
|
1921 {
|
|
1922 c = *p++;
|
|
1923 if (c == 'x' || c == 'X')
|
|
1924 {
|
|
1925 hex = true;
|
|
1926 c = *p++;
|
|
1927 }
|
|
1928 }
|
|
1929
|
|
1930 // Digits to left of '.'
|
|
1931 while (1)
|
|
1932 {
|
|
1933 if (c == '.')
|
|
1934 {
|
|
1935 c = *p++;
|
|
1936 break;
|
|
1937 }
|
|
1938 if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
|
|
1939 {
|
|
1940 c = *p++;
|
|
1941 continue;
|
|
1942 }
|
|
1943 break;
|
|
1944 }
|
|
1945
|
|
1946 // Digits to right of '.'
|
|
1947 while (1)
|
|
1948 {
|
|
1949 if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
|
|
1950 {
|
|
1951 c = *p++;
|
|
1952 continue;
|
|
1953 }
|
|
1954 break;
|
|
1955 }
|
|
1956
|
|
1957 if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
|
|
1958 {
|
|
1959 c = *p++;
|
|
1960 if (c == '-' || c == '+')
|
|
1961 {
|
|
1962 c = *p++;
|
|
1963 }
|
|
1964 bool anyexp = false;
|
|
1965 while (1)
|
|
1966 {
|
|
1967 if (isdigit(c))
|
|
1968 {
|
|
1969 anyexp = true;
|
|
1970 c = *p++;
|
|
1971 continue;
|
|
1972 }
|
|
1973 if (c == '_')
|
|
1974 {
|
|
1975 c = *p++;
|
|
1976 continue;
|
|
1977 }
|
|
1978 if (!anyexp)
|
|
1979 {
|
|
1980 error("missing exponent");
|
|
1981 isWellformedString = false;
|
|
1982 }
|
|
1983 break;
|
|
1984 }
|
|
1985 }
|
|
1986 else if (hex)
|
|
1987 {
|
|
1988 error("exponent required for hex float");
|
|
1989 isWellformedString = false;
|
|
1990 }
|
|
1991 --p;
|
|
1992 while (pstart < p)
|
|
1993 {
|
|
1994 if (*pstart != '_')
|
|
1995 stringbuffer.writeByte(*pstart);
|
|
1996 ++pstart;
|
|
1997 }
|
|
1998
|
|
1999 stringbuffer.writeByte(0);
|
|
2000 const char *sbufptr = (char *)stringbuffer.data;
|
|
2001 TOK result;
|
|
2002 bool isOutOfRange = false;
|
|
2003 t->floatvalue = (isWellformedString ? CTFloat::parse(sbufptr, &isOutOfRange) : CTFloat::zero);
|
|
2004 errno = 0;
|
|
2005 switch (*p)
|
|
2006 {
|
|
2007 case 'F':
|
|
2008 case 'f':
|
|
2009 if (isWellformedString && !isOutOfRange)
|
|
2010 isOutOfRange = Port::isFloat32LiteralOutOfRange(sbufptr);
|
|
2011 result = TOKfloat32v;
|
|
2012 p++;
|
|
2013 break;
|
|
2014
|
|
2015 default:
|
|
2016 if (isWellformedString && !isOutOfRange)
|
|
2017 isOutOfRange = Port::isFloat64LiteralOutOfRange(sbufptr);
|
|
2018 result = TOKfloat64v;
|
|
2019 break;
|
|
2020
|
|
2021 case 'l':
|
|
2022 error("use 'L' suffix instead of 'l'");
|
|
2023 /* fall through */
|
|
2024 case 'L':
|
|
2025 result = TOKfloat80v;
|
|
2026 p++;
|
|
2027 break;
|
|
2028 }
|
|
2029 if (*p == 'i' || *p == 'I')
|
|
2030 {
|
|
2031 if (*p == 'I')
|
|
2032 error("use 'i' suffix instead of 'I'");
|
|
2033 p++;
|
|
2034 switch (result)
|
|
2035 {
|
|
2036 case TOKfloat32v:
|
|
2037 result = TOKimaginary32v;
|
|
2038 break;
|
|
2039 case TOKfloat64v:
|
|
2040 result = TOKimaginary64v;
|
|
2041 break;
|
|
2042 case TOKfloat80v:
|
|
2043 result = TOKimaginary80v;
|
|
2044 break;
|
|
2045 default: break;
|
|
2046 }
|
|
2047 }
|
|
2048 const bool isLong = (result == TOKfloat80v || result == TOKimaginary80v);
|
|
2049 if (isOutOfRange && !isLong)
|
|
2050 {
|
|
2051 const char *suffix = (result == TOKfloat32v || result == TOKimaginary32v) ? "f" : "";
|
|
2052 error(scanloc, "number '%s%s' is not representable", (char *)stringbuffer.data, suffix);
|
|
2053 }
|
|
2054 return result;
|
|
2055 }
|
|
2056
|
|
2057 /*********************************************
|
|
2058 * parse:
|
|
2059 * #line linnum [filespec]
|
|
2060 * also allow __LINE__ for linnum, and __FILE__ for filespec
|
|
2061 */
|
|
2062
|
|
2063 void Lexer::poundLine()
|
|
2064 {
|
|
2065 Token tok;
|
|
2066 int linnum = this->scanloc.linnum;
|
|
2067 char *filespec = NULL;
|
|
2068 Loc loc = this->loc();
|
|
2069
|
|
2070 scan(&tok);
|
|
2071 if (tok.value == TOKint32v || tok.value == TOKint64v)
|
|
2072 {
|
|
2073 int lin = (int)(tok.uns64value - 1);
|
|
2074 if ((unsigned)lin != tok.uns64value - 1)
|
|
2075 error("line number %lld out of range", (unsigned long long)tok.uns64value);
|
|
2076 else
|
|
2077 linnum = lin;
|
|
2078 }
|
|
2079 else if (tok.value == TOKline)
|
|
2080 {
|
|
2081 }
|
|
2082 else
|
|
2083 goto Lerr;
|
|
2084
|
|
2085 while (1)
|
|
2086 {
|
|
2087 switch (*p)
|
|
2088 {
|
|
2089 case 0:
|
|
2090 case 0x1A:
|
|
2091 case '\n':
|
|
2092 Lnewline:
|
|
2093 this->scanloc.linnum = linnum;
|
|
2094 if (filespec)
|
|
2095 this->scanloc.filename = filespec;
|
|
2096 return;
|
|
2097
|
|
2098 case '\r':
|
|
2099 p++;
|
|
2100 if (*p != '\n')
|
|
2101 { p--;
|
|
2102 goto Lnewline;
|
|
2103 }
|
|
2104 continue;
|
|
2105
|
|
2106 case ' ':
|
|
2107 case '\t':
|
|
2108 case '\v':
|
|
2109 case '\f':
|
|
2110 p++;
|
|
2111 continue; // skip white space
|
|
2112
|
|
2113 case '_':
|
|
2114 if (memcmp(p, "__FILE__", 8) == 0)
|
|
2115 {
|
|
2116 p += 8;
|
|
2117 filespec = mem.xstrdup(scanloc.filename);
|
|
2118 continue;
|
|
2119 }
|
|
2120 goto Lerr;
|
|
2121
|
|
2122 case '"':
|
|
2123 if (filespec)
|
|
2124 goto Lerr;
|
|
2125 stringbuffer.reset();
|
|
2126 p++;
|
|
2127 while (1)
|
|
2128 { unsigned c;
|
|
2129
|
|
2130 c = *p;
|
|
2131 switch (c)
|
|
2132 {
|
|
2133 case '\n':
|
|
2134 case '\r':
|
|
2135 case 0:
|
|
2136 case 0x1A:
|
|
2137 goto Lerr;
|
|
2138
|
|
2139 case '"':
|
|
2140 stringbuffer.writeByte(0);
|
|
2141 filespec = mem.xstrdup((char *)stringbuffer.data);
|
|
2142 p++;
|
|
2143 break;
|
|
2144
|
|
2145 default:
|
|
2146 if (c & 0x80)
|
|
2147 { unsigned u = decodeUTF();
|
|
2148 if (u == PS || u == LS)
|
|
2149 goto Lerr;
|
|
2150 }
|
|
2151 stringbuffer.writeByte(c);
|
|
2152 p++;
|
|
2153 continue;
|
|
2154 }
|
|
2155 break;
|
|
2156 }
|
|
2157 continue;
|
|
2158
|
|
2159 default:
|
|
2160 if (*p & 0x80)
|
|
2161 { unsigned u = decodeUTF();
|
|
2162 if (u == PS || u == LS)
|
|
2163 goto Lnewline;
|
|
2164 }
|
|
2165 goto Lerr;
|
|
2166 }
|
|
2167 }
|
|
2168
|
|
2169 Lerr:
|
|
2170 error(loc, "#line integer [\"filespec\"]\\n expected");
|
|
2171 }
|
|
2172
|
|
2173
|
|
2174 /********************************************
|
|
2175 * Decode UTF character.
|
|
2176 * Issue error messages for invalid sequences.
|
|
2177 * Return decoded character, advance p to last character in UTF sequence.
|
|
2178 */
|
|
2179
|
|
2180 unsigned Lexer::decodeUTF()
|
|
2181 {
|
|
2182 dchar_t u;
|
|
2183 utf8_t c;
|
|
2184 const utf8_t *s = p;
|
|
2185 size_t len;
|
|
2186 size_t idx;
|
|
2187 const char *msg;
|
|
2188
|
|
2189 c = *s;
|
|
2190 assert(c & 0x80);
|
|
2191
|
|
2192 // Check length of remaining string up to 6 UTF-8 characters
|
|
2193 for (len = 1; len < 6 && s[len]; len++)
|
|
2194 ;
|
|
2195
|
|
2196 idx = 0;
|
|
2197 msg = utf_decodeChar(s, len, &idx, &u);
|
|
2198 p += idx - 1;
|
|
2199 if (msg)
|
|
2200 {
|
|
2201 error("%s", msg);
|
|
2202 }
|
|
2203 return u;
|
|
2204 }
|
|
2205
|
|
2206
|
|
2207 /***************************************************
|
|
2208 * Parse doc comment embedded between t->ptr and p.
|
|
2209 * Remove trailing blanks and tabs from lines.
|
|
2210 * Replace all newlines with \n.
|
|
2211 * Remove leading comment character from each line.
|
|
2212 * Decide if it's a lineComment or a blockComment.
|
|
2213 * Append to previous one for this token.
|
|
2214 */
|
|
2215
|
|
2216 void Lexer::getDocComment(Token *t, unsigned lineComment)
|
|
2217 {
|
|
2218 /* ct tells us which kind of comment it is: '/', '*', or '+'
|
|
2219 */
|
|
2220 utf8_t ct = t->ptr[2];
|
|
2221
|
|
2222 /* Start of comment text skips over / * *, / + +, or / / /
|
|
2223 */
|
|
2224 const utf8_t *q = t->ptr + 3; // start of comment text
|
|
2225
|
|
2226 const utf8_t *qend = p;
|
|
2227 if (ct == '*' || ct == '+')
|
|
2228 qend -= 2;
|
|
2229
|
|
2230 /* Scan over initial row of ****'s or ++++'s or ////'s
|
|
2231 */
|
|
2232 for (; q < qend; q++)
|
|
2233 {
|
|
2234 if (*q != ct)
|
|
2235 break;
|
|
2236 }
|
|
2237
|
|
2238 /* Remove leading spaces until start of the comment
|
|
2239 */
|
|
2240 int linestart = 0;
|
|
2241 if (ct == '/')
|
|
2242 {
|
|
2243 while (q < qend && (*q == ' ' || *q == '\t'))
|
|
2244 ++q;
|
|
2245 }
|
|
2246 else if (q < qend)
|
|
2247 {
|
|
2248 if (*q == '\r')
|
|
2249 {
|
|
2250 ++q;
|
|
2251 if (q < qend && *q == '\n')
|
|
2252 ++q;
|
|
2253 linestart = 1;
|
|
2254 }
|
|
2255 else if (*q == '\n')
|
|
2256 {
|
|
2257 ++q;
|
|
2258 linestart = 1;
|
|
2259 }
|
|
2260 }
|
|
2261
|
|
2262 /* Remove trailing row of ****'s or ++++'s
|
|
2263 */
|
|
2264 if (ct != '/')
|
|
2265 {
|
|
2266 for (; q < qend; qend--)
|
|
2267 {
|
|
2268 if (qend[-1] != ct)
|
|
2269 break;
|
|
2270 }
|
|
2271 }
|
|
2272
|
|
2273 /* Comment is now [q .. qend].
|
|
2274 * Canonicalize it into buf[].
|
|
2275 */
|
|
2276 OutBuffer buf;
|
|
2277
|
|
2278 for (; q < qend; q++)
|
|
2279 {
|
|
2280 utf8_t c = *q;
|
|
2281
|
|
2282 switch (c)
|
|
2283 {
|
|
2284 case '*':
|
|
2285 case '+':
|
|
2286 if (linestart && c == ct)
|
|
2287 { linestart = 0;
|
|
2288 /* Trim preceding whitespace up to preceding \n
|
|
2289 */
|
|
2290 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
|
|
2291 buf.offset--;
|
|
2292 continue;
|
|
2293 }
|
|
2294 break;
|
|
2295
|
|
2296 case ' ':
|
|
2297 case '\t':
|
|
2298 break;
|
|
2299
|
|
2300 case '\r':
|
|
2301 if (q[1] == '\n')
|
|
2302 continue; // skip the \r
|
|
2303 goto Lnewline;
|
|
2304
|
|
2305 default:
|
|
2306 if (c == 226)
|
|
2307 {
|
|
2308 // If LS or PS
|
|
2309 if (q[1] == 128 &&
|
|
2310 (q[2] == 168 || q[2] == 169))
|
|
2311 {
|
|
2312 q += 2;
|
|
2313 goto Lnewline;
|
|
2314 }
|
|
2315 }
|
|
2316 linestart = 0;
|
|
2317 break;
|
|
2318
|
|
2319 Lnewline:
|
|
2320 c = '\n'; // replace all newlines with \n
|
|
2321 /* fall through */
|
|
2322 case '\n':
|
|
2323 linestart = 1;
|
|
2324
|
|
2325 /* Trim trailing whitespace
|
|
2326 */
|
|
2327 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
|
|
2328 buf.offset--;
|
|
2329
|
|
2330 break;
|
|
2331 }
|
|
2332 buf.writeByte(c);
|
|
2333 }
|
|
2334
|
|
2335 /* Trim trailing whitespace (if the last line does not have newline)
|
|
2336 */
|
|
2337 if (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
|
|
2338 {
|
|
2339 while (buf.offset && (buf.data[buf.offset - 1] == ' ' || buf.data[buf.offset - 1] == '\t'))
|
|
2340 buf.offset--;
|
|
2341 }
|
|
2342
|
|
2343 // Always end with a newline
|
|
2344 if (!buf.offset || buf.data[buf.offset - 1] != '\n')
|
|
2345 buf.writeByte('\n');
|
|
2346
|
|
2347 buf.writeByte(0);
|
|
2348
|
|
2349 // It's a line comment if the start of the doc comment comes
|
|
2350 // after other non-whitespace on the same line.
|
|
2351 const utf8_t** dc = (lineComment && anyToken)
|
|
2352 ? &t->lineComment
|
|
2353 : &t->blockComment;
|
|
2354
|
|
2355 // Combine with previous doc comment, if any
|
|
2356 if (*dc)
|
|
2357 *dc = combineComments(*dc, (utf8_t *)buf.data);
|
|
2358 else
|
|
2359 *dc = (utf8_t *)buf.extractData();
|
|
2360 }
|
|
2361
|
|
2362 /********************************************
|
|
2363 * Combine two document comments into one,
|
|
2364 * separated by a newline.
|
|
2365 */
|
|
2366
|
|
2367 const utf8_t *Lexer::combineComments(const utf8_t *c1, const utf8_t *c2)
|
|
2368 {
|
|
2369 //printf("Lexer::combineComments('%s', '%s')\n", c1, c2);
|
|
2370
|
|
2371 const utf8_t *c = c2;
|
|
2372
|
|
2373 if (c1)
|
|
2374 {
|
|
2375 c = c1;
|
|
2376 if (c2)
|
|
2377 {
|
|
2378 size_t len1 = strlen((const char *)c1);
|
|
2379 size_t len2 = strlen((const char *)c2);
|
|
2380
|
|
2381 int insertNewLine = 0;
|
|
2382 if (len1 && c1[len1 - 1] != '\n')
|
|
2383 {
|
|
2384 ++len1;
|
|
2385 insertNewLine = 1;
|
|
2386 }
|
|
2387
|
|
2388 utf8_t *p = (utf8_t *)mem.xmalloc(len1 + 1 + len2 + 1);
|
|
2389 memcpy(p, c1, len1 - insertNewLine);
|
|
2390 if (insertNewLine)
|
|
2391 p[len1 - 1] = '\n';
|
|
2392
|
|
2393 p[len1] = '\n';
|
|
2394
|
|
2395 memcpy(p + len1 + 1, c2, len2);
|
|
2396 p[len1 + 1 + len2] = 0;
|
|
2397 c = p;
|
|
2398 }
|
|
2399 }
|
|
2400 return c;
|
|
2401 }
|