Mercurial > hg > CbC > CbC_gcc
comparison gcc/ada/libgnat/s-regpat.adb @ 111:04ced10e8804
gcc 7
author | kono |
---|---|
date | Fri, 27 Oct 2017 22:46:09 +0900 |
parents | |
children | 84e7813d76e9 |
comparison
equal
deleted
inserted
replaced
68:561a7518be6b | 111:04ced10e8804 |
---|---|
1 ------------------------------------------------------------------------------ | |
2 -- -- | |
3 -- GNAT LIBRARY COMPONENTS -- | |
4 -- -- | |
5 -- G N A T . R E G P A T -- | |
6 -- -- | |
7 -- B o d y -- | |
8 -- -- | |
9 -- Copyright (C) 1986 by University of Toronto. -- | |
10 -- Copyright (C) 1999-2017, AdaCore -- | |
11 -- -- | |
12 -- GNAT is free software; you can redistribute it and/or modify it under -- | |
13 -- terms of the GNU General Public License as published by the Free Soft- -- | |
14 -- ware Foundation; either version 3, or (at your option) any later ver- -- | |
15 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- -- | |
16 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -- | |
17 -- or FITNESS FOR A PARTICULAR PURPOSE. -- | |
18 -- -- | |
19 -- As a special exception under Section 7 of GPL version 3, you are granted -- | |
20 -- additional permissions described in the GCC Runtime Library Exception, -- | |
21 -- version 3.1, as published by the Free Software Foundation. -- | |
22 -- -- | |
23 -- You should have received a copy of the GNU General Public License and -- | |
24 -- a copy of the GCC Runtime Library Exception along with this program; -- | |
25 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see -- | |
26 -- <http://www.gnu.org/licenses/>. -- | |
27 -- -- | |
28 -- GNAT was originally developed by the GNAT team at New York University. -- | |
29 -- Extensive contributions were provided by Ada Core Technologies Inc. -- | |
30 -- -- | |
31 ------------------------------------------------------------------------------ | |
32 | |
33 -- This is an altered Ada 95 version of the original V8 style regular | |
34 -- expression library written in C by Henry Spencer. Apart from the | |
35 -- translation to Ada, the interface has been considerably changed to | |
36 -- use the Ada String type instead of C-style nul-terminated strings. | |
37 | |
38 -- Beware that some of this code is subtly aware of the way operator | |
39 -- precedence is structured in regular expressions. Serious changes in | |
40 -- regular-expression syntax might require a total rethink. | |
41 | |
42 with System.IO; use System.IO; | |
43 with Ada.Characters.Handling; use Ada.Characters.Handling; | |
44 with Ada.Unchecked_Conversion; | |
45 | |
46 package body System.Regpat is | |
47 | |
48 Debug : constant Boolean := False; | |
49 -- Set to True to activate debug traces. This is normally set to constant | |
50 -- False to simply delete all the trace code. It is to be edited to True | |
51 -- for internal debugging of the package. | |
52 | |
53 ---------------------------- | |
54 -- Implementation details -- | |
55 ---------------------------- | |
56 | |
57 -- This is essentially a linear encoding of a nondeterministic | |
58 -- finite-state machine, also known as syntax charts or | |
59 -- "railroad normal form" in parsing technology. | |
60 | |
61 -- Each node is an opcode plus a "next" pointer, possibly plus an | |
62 -- operand. "Next" pointers of all nodes except BRANCH implement | |
63 -- concatenation; a "next" pointer with a BRANCH on both ends of it | |
64 -- is connecting two alternatives. | |
65 | |
66 -- The operand of some types of node is a literal string; for others, | |
67 -- it is a node leading into a sub-FSM. In particular, the operand of | |
68 -- a BRANCH node is the first node of the branch. | |
69 -- (NB this is *not* a tree structure: the tail of the branch connects | |
70 -- to the thing following the set of BRANCHes). | |
71 | |
72 -- You can see the exact byte-compiled version by using the Dump | |
73 -- subprogram. However, here are a few examples: | |
74 | |
75 -- (a|b): 1 : BRANCH (next at 9) | |
76 -- 4 : EXACT (next at 17) operand=a | |
77 -- 9 : BRANCH (next at 17) | |
78 -- 12 : EXACT (next at 17) operand=b | |
79 -- 17 : EOP (next at 0) | |
80 -- | |
81 -- (ab)*: 1 : CURLYX (next at 25) { 0, 32767} | |
82 -- 8 : OPEN 1 (next at 12) | |
83 -- 12 : EXACT (next at 18) operand=ab | |
84 -- 18 : CLOSE 1 (next at 22) | |
85 -- 22 : WHILEM (next at 0) | |
86 -- 25 : NOTHING (next at 28) | |
87 -- 28 : EOP (next at 0) | |
88 | |
89 -- The opcodes are: | |
90 | |
91 type Opcode is | |
92 | |
93 -- Name Operand? Meaning | |
94 | |
95 (EOP, -- no End of program | |
96 MINMOD, -- no Next operator is not greedy | |
97 | |
98 -- Classes of characters | |
99 | |
100 ANY, -- no Match any one character except newline | |
101 SANY, -- no Match any character, including new line | |
102 ANYOF, -- class Match any character in this class | |
103 EXACT, -- str Match this string exactly | |
104 EXACTF, -- str Match this string (case-folding is one) | |
105 NOTHING, -- no Match empty string | |
106 SPACE, -- no Match any whitespace character | |
107 NSPACE, -- no Match any non-whitespace character | |
108 DIGIT, -- no Match any numeric character | |
109 NDIGIT, -- no Match any non-numeric character | |
110 ALNUM, -- no Match any alphanumeric character | |
111 NALNUM, -- no Match any non-alphanumeric character | |
112 | |
113 -- Branches | |
114 | |
115 BRANCH, -- node Match this alternative, or the next | |
116 | |
117 -- Simple loops (when the following node is one character in length) | |
118 | |
119 STAR, -- node Match this simple thing 0 or more times | |
120 PLUS, -- node Match this simple thing 1 or more times | |
121 CURLY, -- 2num node Match this simple thing between n and m times. | |
122 | |
123 -- Complex loops | |
124 | |
125 CURLYX, -- 2num node Match this complex thing {n,m} times | |
126 -- The nums are coded on two characters each | |
127 | |
128 WHILEM, -- no Do curly processing and see if rest matches | |
129 | |
130 -- Matches after or before a word | |
131 | |
132 BOL, -- no Match "" at beginning of line | |
133 MBOL, -- no Same, assuming multiline (match after \n) | |
134 SBOL, -- no Same, assuming single line (don't match at \n) | |
135 EOL, -- no Match "" at end of line | |
136 MEOL, -- no Same, assuming multiline (match before \n) | |
137 SEOL, -- no Same, assuming single line (don't match at \n) | |
138 | |
139 BOUND, -- no Match "" at any word boundary | |
140 NBOUND, -- no Match "" at any word non-boundary | |
141 | |
142 -- Parenthesis groups handling | |
143 | |
144 REFF, -- num Match some already matched string, folded | |
145 OPEN, -- num Mark this point in input as start of #n | |
146 CLOSE); -- num Analogous to OPEN | |
147 | |
148 for Opcode'Size use 8; | |
149 | |
150 -- Opcode notes: | |
151 | |
152 -- BRANCH | |
153 -- The set of branches constituting a single choice are hooked | |
154 -- together with their "next" pointers, since precedence prevents | |
155 -- anything being concatenated to any individual branch. The | |
156 -- "next" pointer of the last BRANCH in a choice points to the | |
157 -- thing following the whole choice. This is also where the | |
158 -- final "next" pointer of each individual branch points; each | |
159 -- branch starts with the operand node of a BRANCH node. | |
160 | |
161 -- STAR,PLUS | |
162 -- '?', and complex '*' and '+', are implemented with CURLYX. | |
163 -- branches. Simple cases (one character per match) are implemented with | |
164 -- STAR and PLUS for speed and to minimize recursive plunges. | |
165 | |
166 -- OPEN,CLOSE | |
167 -- ...are numbered at compile time. | |
168 | |
169 -- EXACT, EXACTF | |
170 -- There are in fact two arguments, the first one is the length (minus | |
171 -- one of the string argument), coded on one character, the second | |
172 -- argument is the string itself, coded on length + 1 characters. | |
173 | |
174 -- A node is one char of opcode followed by two chars of "next" pointer. | |
175 -- "Next" pointers are stored as two 8-bit pieces, high order first. The | |
176 -- value is a positive offset from the opcode of the node containing it. | |
177 -- An operand, if any, simply follows the node. (Note that much of the | |
178 -- code generation knows about this implicit relationship.) | |
179 | |
180 -- Using two bytes for the "next" pointer is vast overkill for most | |
181 -- things, but allows patterns to get big without disasters. | |
182 | |
183 Next_Pointer_Bytes : constant := 3; | |
184 -- Points after the "next pointer" data. An instruction is therefore: | |
185 -- 1 byte: instruction opcode | |
186 -- 2 bytes: pointer to next instruction | |
187 -- * bytes: optional data for the instruction | |
188 | |
189 ----------------------- | |
190 -- Character classes -- | |
191 ----------------------- | |
192 -- This is the implementation for character classes ([...]) in the | |
193 -- syntax for regular expressions. Each character (0..256) has an | |
194 -- entry into the table. This makes for a very fast matching | |
195 -- algorithm. | |
196 | |
197 type Class_Byte is mod 256; | |
198 type Character_Class is array (Class_Byte range 0 .. 31) of Class_Byte; | |
199 | |
200 type Bit_Conversion_Array is array (Class_Byte range 0 .. 7) of Class_Byte; | |
201 Bit_Conversion : constant Bit_Conversion_Array := | |
202 (1, 2, 4, 8, 16, 32, 64, 128); | |
203 | |
204 type Std_Class is (ANYOF_NONE, | |
205 ANYOF_ALNUM, -- Alphanumeric class [a-zA-Z0-9] | |
206 ANYOF_NALNUM, | |
207 ANYOF_SPACE, -- Space class [ \t\n\r\f] | |
208 ANYOF_NSPACE, | |
209 ANYOF_DIGIT, -- Digit class [0-9] | |
210 ANYOF_NDIGIT, | |
211 ANYOF_ALNUMC, -- Alphanumeric class [a-zA-Z0-9] | |
212 ANYOF_NALNUMC, | |
213 ANYOF_ALPHA, -- Alpha class [a-zA-Z] | |
214 ANYOF_NALPHA, | |
215 ANYOF_ASCII, -- Ascii class (7 bits) 0..127 | |
216 ANYOF_NASCII, | |
217 ANYOF_CNTRL, -- Control class | |
218 ANYOF_NCNTRL, | |
219 ANYOF_GRAPH, -- Graphic class | |
220 ANYOF_NGRAPH, | |
221 ANYOF_LOWER, -- Lower case class [a-z] | |
222 ANYOF_NLOWER, | |
223 ANYOF_PRINT, -- printable class | |
224 ANYOF_NPRINT, | |
225 ANYOF_PUNCT, -- | |
226 ANYOF_NPUNCT, | |
227 ANYOF_UPPER, -- Upper case class [A-Z] | |
228 ANYOF_NUPPER, | |
229 ANYOF_XDIGIT, -- Hexadecimal digit | |
230 ANYOF_NXDIGIT | |
231 ); | |
232 | |
233 procedure Set_In_Class | |
234 (Bitmap : in out Character_Class; | |
235 C : Character); | |
236 -- Set the entry to True for C in the class Bitmap | |
237 | |
238 function Get_From_Class | |
239 (Bitmap : Character_Class; | |
240 C : Character) return Boolean; | |
241 -- Return True if the entry is set for C in the class Bitmap | |
242 | |
243 procedure Reset_Class (Bitmap : out Character_Class); | |
244 -- Clear all the entries in the class Bitmap | |
245 | |
246 pragma Inline (Set_In_Class); | |
247 pragma Inline (Get_From_Class); | |
248 pragma Inline (Reset_Class); | |
249 | |
250 ----------------------- | |
251 -- Local Subprograms -- | |
252 ----------------------- | |
253 | |
254 function "=" (Left : Character; Right : Opcode) return Boolean; | |
255 | |
256 function Is_Alnum (C : Character) return Boolean; | |
257 -- Return True if C is an alphanum character or an underscore ('_') | |
258 | |
259 function Is_White_Space (C : Character) return Boolean; | |
260 -- Return True if C is a whitespace character | |
261 | |
262 function Is_Printable (C : Character) return Boolean; | |
263 -- Return True if C is a printable character | |
264 | |
265 function Operand (P : Pointer) return Pointer; | |
266 -- Return a pointer to the first operand of the node at P | |
267 | |
268 function String_Length | |
269 (Program : Program_Data; | |
270 P : Pointer) return Program_Size; | |
271 -- Return the length of the string argument of the node at P | |
272 | |
273 function String_Operand (P : Pointer) return Pointer; | |
274 -- Return a pointer to the string argument of the node at P | |
275 | |
276 procedure Bitmap_Operand | |
277 (Program : Program_Data; | |
278 P : Pointer; | |
279 Op : out Character_Class); | |
280 -- Return a pointer to the string argument of the node at P | |
281 | |
282 function Get_Next | |
283 (Program : Program_Data; | |
284 IP : Pointer) return Pointer; | |
285 -- Dig the next instruction pointer out of a node | |
286 | |
287 procedure Optimize (Self : in out Pattern_Matcher); | |
288 -- Optimize a Pattern_Matcher by noting certain special cases | |
289 | |
290 function Read_Natural | |
291 (Program : Program_Data; | |
292 IP : Pointer) return Natural; | |
293 -- Return the 2-byte natural coded at position IP | |
294 | |
295 -- All of the subprograms above are tiny and should be inlined | |
296 | |
297 pragma Inline ("="); | |
298 pragma Inline (Is_Alnum); | |
299 pragma Inline (Is_White_Space); | |
300 pragma Inline (Get_Next); | |
301 pragma Inline (Operand); | |
302 pragma Inline (Read_Natural); | |
303 pragma Inline (String_Length); | |
304 pragma Inline (String_Operand); | |
305 | |
306 type Expression_Flags is record | |
307 Has_Width, -- Known never to match null string | |
308 Simple, -- Simple enough to be STAR/PLUS operand | |
309 SP_Start : Boolean; -- Starts with * or + | |
310 end record; | |
311 | |
312 Worst_Expression : constant Expression_Flags := (others => False); | |
313 -- Worst case | |
314 | |
315 procedure Dump_Until | |
316 (Program : Program_Data; | |
317 Index : in out Pointer; | |
318 Till : Pointer; | |
319 Indent : Natural; | |
320 Do_Print : Boolean := True); | |
321 -- Dump the program until the node Till (not included) is met. Every line | |
322 -- is indented with Index spaces at the beginning Dumps till the end if | |
323 -- Till is 0. | |
324 | |
325 procedure Dump_Operation | |
326 (Program : Program_Data; | |
327 Index : Pointer; | |
328 Indent : Natural); | |
329 -- Same as above, but only dumps a single operation, and compute its | |
330 -- indentation from the program. | |
331 | |
332 --------- | |
333 -- "=" -- | |
334 --------- | |
335 | |
336 function "=" (Left : Character; Right : Opcode) return Boolean is | |
337 begin | |
338 return Character'Pos (Left) = Opcode'Pos (Right); | |
339 end "="; | |
340 | |
341 -------------------- | |
342 -- Bitmap_Operand -- | |
343 -------------------- | |
344 | |
345 procedure Bitmap_Operand | |
346 (Program : Program_Data; | |
347 P : Pointer; | |
348 Op : out Character_Class) | |
349 is | |
350 function Convert is new Ada.Unchecked_Conversion | |
351 (Program_Data, Character_Class); | |
352 | |
353 begin | |
354 Op (0 .. 31) := Convert (Program (P + Next_Pointer_Bytes .. P + 34)); | |
355 end Bitmap_Operand; | |
356 | |
357 ------------- | |
358 -- Compile -- | |
359 ------------- | |
360 | |
361 procedure Compile | |
362 (Matcher : out Pattern_Matcher; | |
363 Expression : String; | |
364 Final_Code_Size : out Program_Size; | |
365 Flags : Regexp_Flags := No_Flags) | |
366 is | |
367 -- We can't allocate space until we know how big the compiled form | |
368 -- will be, but we can't compile it (and thus know how big it is) | |
369 -- until we've got a place to put the code. So we cheat: we compile | |
370 -- it twice, once with code generation turned off and size counting | |
371 -- turned on, and once "for real". | |
372 | |
373 -- This also means that we don't allocate space until we are sure | |
374 -- that the thing really will compile successfully, and we never | |
375 -- have to move the code and thus invalidate pointers into it. | |
376 | |
377 -- Beware that the optimization-preparation code in here knows | |
378 -- about some of the structure of the compiled regexp. | |
379 | |
380 PM : Pattern_Matcher renames Matcher; | |
381 Program : Program_Data renames PM.Program; | |
382 | |
383 Emit_Ptr : Pointer := Program_First; | |
384 | |
385 Parse_Pos : Natural := Expression'First; -- Input-scan pointer | |
386 Parse_End : constant Natural := Expression'Last; | |
387 | |
388 ---------------------------- | |
389 -- Subprograms for Create -- | |
390 ---------------------------- | |
391 | |
392 procedure Emit (B : Character); | |
393 -- Output the Character B to the Program. If code-generation is | |
394 -- disabled, simply increments the program counter. | |
395 | |
396 function Emit_Node (Op : Opcode) return Pointer; | |
397 -- If code-generation is enabled, Emit_Node outputs the | |
398 -- opcode Op and reserves space for a pointer to the next node. | |
399 -- Return value is the location of new opcode, i.e. old Emit_Ptr. | |
400 | |
401 procedure Emit_Natural (IP : Pointer; N : Natural); | |
402 -- Split N on two characters at position IP | |
403 | |
404 procedure Emit_Class (Bitmap : Character_Class); | |
405 -- Emits a character class | |
406 | |
407 procedure Case_Emit (C : Character); | |
408 -- Emit C, after converting is to lower-case if the regular | |
409 -- expression is case insensitive. | |
410 | |
411 procedure Parse | |
412 (Parenthesized : Boolean; | |
413 Capturing : Boolean; | |
414 Flags : out Expression_Flags; | |
415 IP : out Pointer); | |
416 -- Parse regular expression, i.e. main body or parenthesized thing. | |
417 -- Caller must absorb opening parenthesis. Capturing should be set to | |
418 -- True when we have an open parenthesis from which we want the user | |
419 -- to extra text. | |
420 | |
421 procedure Parse_Branch | |
422 (Flags : out Expression_Flags; | |
423 First : Boolean; | |
424 IP : out Pointer); | |
425 -- Implements the concatenation operator and handles '|'. | |
426 -- First should be true if this is the first item of the alternative. | |
427 | |
428 procedure Parse_Piece | |
429 (Expr_Flags : out Expression_Flags; | |
430 IP : out Pointer); | |
431 -- Parse something followed by possible [*+?] | |
432 | |
433 procedure Parse_Atom | |
434 (Expr_Flags : out Expression_Flags; | |
435 IP : out Pointer); | |
436 -- Parse_Atom is the lowest level parse procedure. | |
437 -- | |
438 -- Optimization: Gobbles an entire sequence of ordinary characters so | |
439 -- that it can turn them into a single node, which is smaller to store | |
440 -- and faster to run. Backslashed characters are exceptions, each | |
441 -- becoming a separate node; the code is simpler that way and it's | |
442 -- not worth fixing. | |
443 | |
444 procedure Insert_Operator | |
445 (Op : Opcode; | |
446 Operand : Pointer; | |
447 Greedy : Boolean := True); | |
448 -- Insert_Operator inserts an operator in front of an already-emitted | |
449 -- operand and relocates the operand. This applies to PLUS and STAR. | |
450 -- If Minmod is True, then the operator is non-greedy. | |
451 | |
452 function Insert_Operator_Before | |
453 (Op : Opcode; | |
454 Operand : Pointer; | |
455 Greedy : Boolean; | |
456 Opsize : Pointer) return Pointer; | |
457 -- Insert an operator before Operand (and move the latter forward in the | |
458 -- program). Opsize is the size needed to represent the operator. This | |
459 -- returns the position at which the operator was inserted, and moves | |
460 -- Emit_Ptr after the new position of the operand. | |
461 | |
462 procedure Insert_Curly_Operator | |
463 (Op : Opcode; | |
464 Min : Natural; | |
465 Max : Natural; | |
466 Operand : Pointer; | |
467 Greedy : Boolean := True); | |
468 -- Insert an operator for CURLY ({Min}, {Min,} or {Min,Max}). | |
469 -- If Minmod is True, then the operator is non-greedy. | |
470 | |
471 procedure Link_Tail (P, Val : Pointer); | |
472 -- Link_Tail sets the next-pointer at the end of a node chain | |
473 | |
474 procedure Link_Operand_Tail (P, Val : Pointer); | |
475 -- Link_Tail on operand of first argument; noop if operand-less | |
476 | |
477 procedure Fail (M : String); | |
478 pragma No_Return (Fail); | |
479 -- Fail with a diagnostic message, if possible | |
480 | |
481 function Is_Curly_Operator (IP : Natural) return Boolean; | |
482 -- Return True if IP is looking at a '{' that is the beginning | |
483 -- of a curly operator, i.e. it matches {\d+,?\d*} | |
484 | |
485 function Is_Mult (IP : Natural) return Boolean; | |
486 -- Return True if C is a regexp multiplier: '+', '*' or '?' | |
487 | |
488 procedure Get_Curly_Arguments | |
489 (IP : Natural; | |
490 Min : out Natural; | |
491 Max : out Natural; | |
492 Greedy : out Boolean); | |
493 -- Parse the argument list for a curly operator. | |
494 -- It is assumed that IP is indeed pointing at a valid operator. | |
495 -- So what is IP and how come IP is not referenced in the body ??? | |
496 | |
497 procedure Parse_Character_Class (IP : out Pointer); | |
498 -- Parse a character class. | |
499 -- The calling subprogram should consume the opening '[' before. | |
500 | |
501 procedure Parse_Literal | |
502 (Expr_Flags : out Expression_Flags; | |
503 IP : out Pointer); | |
504 -- Parse_Literal encodes a string of characters to be matched exactly | |
505 | |
506 function Parse_Posix_Character_Class return Std_Class; | |
507 -- Parse a posix character class, like [:alpha:] or [:^alpha:]. | |
508 -- The caller is supposed to absorb the opening [. | |
509 | |
510 pragma Inline (Is_Mult); | |
511 pragma Inline (Emit_Natural); | |
512 pragma Inline (Parse_Character_Class); -- since used only once | |
513 | |
514 --------------- | |
515 -- Case_Emit -- | |
516 --------------- | |
517 | |
518 procedure Case_Emit (C : Character) is | |
519 begin | |
520 if (Flags and Case_Insensitive) /= 0 then | |
521 Emit (To_Lower (C)); | |
522 | |
523 else | |
524 -- Dump current character | |
525 | |
526 Emit (C); | |
527 end if; | |
528 end Case_Emit; | |
529 | |
530 ---------- | |
531 -- Emit -- | |
532 ---------- | |
533 | |
534 procedure Emit (B : Character) is | |
535 begin | |
536 if Emit_Ptr <= PM.Size then | |
537 Program (Emit_Ptr) := B; | |
538 end if; | |
539 | |
540 Emit_Ptr := Emit_Ptr + 1; | |
541 end Emit; | |
542 | |
543 ---------------- | |
544 -- Emit_Class -- | |
545 ---------------- | |
546 | |
547 procedure Emit_Class (Bitmap : Character_Class) is | |
548 subtype Program31 is Program_Data (0 .. 31); | |
549 | |
550 function Convert is new Ada.Unchecked_Conversion | |
551 (Character_Class, Program31); | |
552 | |
553 begin | |
554 -- What is the mysterious constant 31 here??? Can't it be expressed | |
555 -- symbolically (size of integer - 1 or some such???). In any case | |
556 -- it should be declared as a constant (and referenced presumably | |
557 -- as this constant + 1 below. | |
558 | |
559 if Emit_Ptr + 31 <= PM.Size then | |
560 Program (Emit_Ptr .. Emit_Ptr + 31) := Convert (Bitmap); | |
561 end if; | |
562 | |
563 Emit_Ptr := Emit_Ptr + 32; | |
564 end Emit_Class; | |
565 | |
566 ------------------ | |
567 -- Emit_Natural -- | |
568 ------------------ | |
569 | |
570 procedure Emit_Natural (IP : Pointer; N : Natural) is | |
571 begin | |
572 if IP + 1 <= PM.Size then | |
573 Program (IP + 1) := Character'Val (N / 256); | |
574 Program (IP) := Character'Val (N mod 256); | |
575 end if; | |
576 end Emit_Natural; | |
577 | |
578 --------------- | |
579 -- Emit_Node -- | |
580 --------------- | |
581 | |
582 function Emit_Node (Op : Opcode) return Pointer is | |
583 Result : constant Pointer := Emit_Ptr; | |
584 | |
585 begin | |
586 if Emit_Ptr + 2 <= PM.Size then | |
587 Program (Emit_Ptr) := Character'Val (Opcode'Pos (Op)); | |
588 Program (Emit_Ptr + 1) := ASCII.NUL; | |
589 Program (Emit_Ptr + 2) := ASCII.NUL; | |
590 end if; | |
591 | |
592 Emit_Ptr := Emit_Ptr + Next_Pointer_Bytes; | |
593 return Result; | |
594 end Emit_Node; | |
595 | |
596 ---------- | |
597 -- Fail -- | |
598 ---------- | |
599 | |
600 procedure Fail (M : String) is | |
601 begin | |
602 raise Expression_Error with M; | |
603 end Fail; | |
604 | |
605 ------------------------- | |
606 -- Get_Curly_Arguments -- | |
607 ------------------------- | |
608 | |
609 procedure Get_Curly_Arguments | |
610 (IP : Natural; | |
611 Min : out Natural; | |
612 Max : out Natural; | |
613 Greedy : out Boolean) | |
614 is | |
615 pragma Unreferenced (IP); | |
616 | |
617 Save_Pos : Natural := Parse_Pos + 1; | |
618 | |
619 begin | |
620 Min := 0; | |
621 Max := Max_Curly_Repeat; | |
622 | |
623 while Expression (Parse_Pos) /= '}' | |
624 and then Expression (Parse_Pos) /= ',' | |
625 loop | |
626 Parse_Pos := Parse_Pos + 1; | |
627 end loop; | |
628 | |
629 Min := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); | |
630 | |
631 if Expression (Parse_Pos) = ',' then | |
632 Save_Pos := Parse_Pos + 1; | |
633 while Expression (Parse_Pos) /= '}' loop | |
634 Parse_Pos := Parse_Pos + 1; | |
635 end loop; | |
636 | |
637 if Save_Pos /= Parse_Pos then | |
638 Max := Natural'Value (Expression (Save_Pos .. Parse_Pos - 1)); | |
639 end if; | |
640 | |
641 else | |
642 Max := Min; | |
643 end if; | |
644 | |
645 if Parse_Pos < Expression'Last | |
646 and then Expression (Parse_Pos + 1) = '?' | |
647 then | |
648 Greedy := False; | |
649 Parse_Pos := Parse_Pos + 1; | |
650 | |
651 else | |
652 Greedy := True; | |
653 end if; | |
654 end Get_Curly_Arguments; | |
655 | |
656 --------------------------- | |
657 -- Insert_Curly_Operator -- | |
658 --------------------------- | |
659 | |
660 procedure Insert_Curly_Operator | |
661 (Op : Opcode; | |
662 Min : Natural; | |
663 Max : Natural; | |
664 Operand : Pointer; | |
665 Greedy : Boolean := True) | |
666 is | |
667 Old : Pointer; | |
668 begin | |
669 Old := Insert_Operator_Before (Op, Operand, Greedy, Opsize => 7); | |
670 Emit_Natural (Old + Next_Pointer_Bytes, Min); | |
671 Emit_Natural (Old + Next_Pointer_Bytes + 2, Max); | |
672 end Insert_Curly_Operator; | |
673 | |
674 ---------------------------- | |
675 -- Insert_Operator_Before -- | |
676 ---------------------------- | |
677 | |
678 function Insert_Operator_Before | |
679 (Op : Opcode; | |
680 Operand : Pointer; | |
681 Greedy : Boolean; | |
682 Opsize : Pointer) return Pointer | |
683 is | |
684 Dest : constant Pointer := Emit_Ptr; | |
685 Old : Pointer; | |
686 Size : Pointer := Opsize; | |
687 | |
688 begin | |
689 -- If not greedy, we have to emit another opcode first | |
690 | |
691 if not Greedy then | |
692 Size := Size + Next_Pointer_Bytes; | |
693 end if; | |
694 | |
695 -- Move the operand in the byte-compilation, so that we can insert | |
696 -- the operator before it. | |
697 | |
698 if Emit_Ptr + Size <= PM.Size then | |
699 Program (Operand + Size .. Emit_Ptr + Size) := | |
700 Program (Operand .. Emit_Ptr); | |
701 end if; | |
702 | |
703 -- Insert the operator at the position previously occupied by the | |
704 -- operand. | |
705 | |
706 Emit_Ptr := Operand; | |
707 | |
708 if not Greedy then | |
709 Old := Emit_Node (MINMOD); | |
710 Link_Tail (Old, Old + Next_Pointer_Bytes); | |
711 end if; | |
712 | |
713 Old := Emit_Node (Op); | |
714 Emit_Ptr := Dest + Size; | |
715 return Old; | |
716 end Insert_Operator_Before; | |
717 | |
718 --------------------- | |
719 -- Insert_Operator -- | |
720 --------------------- | |
721 | |
722 procedure Insert_Operator | |
723 (Op : Opcode; | |
724 Operand : Pointer; | |
725 Greedy : Boolean := True) | |
726 is | |
727 Discard : Pointer; | |
728 pragma Warnings (Off, Discard); | |
729 begin | |
730 Discard := Insert_Operator_Before | |
731 (Op, Operand, Greedy, Opsize => Next_Pointer_Bytes); | |
732 end Insert_Operator; | |
733 | |
734 ----------------------- | |
735 -- Is_Curly_Operator -- | |
736 ----------------------- | |
737 | |
738 function Is_Curly_Operator (IP : Natural) return Boolean is | |
739 Scan : Natural := IP; | |
740 | |
741 begin | |
742 if Expression (Scan) /= '{' | |
743 or else Scan + 2 > Expression'Last | |
744 or else not Is_Digit (Expression (Scan + 1)) | |
745 then | |
746 return False; | |
747 end if; | |
748 | |
749 Scan := Scan + 1; | |
750 | |
751 -- The first digit | |
752 | |
753 loop | |
754 Scan := Scan + 1; | |
755 | |
756 if Scan > Expression'Last then | |
757 return False; | |
758 end if; | |
759 | |
760 exit when not Is_Digit (Expression (Scan)); | |
761 end loop; | |
762 | |
763 if Expression (Scan) = ',' then | |
764 loop | |
765 Scan := Scan + 1; | |
766 | |
767 if Scan > Expression'Last then | |
768 return False; | |
769 end if; | |
770 | |
771 exit when not Is_Digit (Expression (Scan)); | |
772 end loop; | |
773 end if; | |
774 | |
775 return Expression (Scan) = '}'; | |
776 end Is_Curly_Operator; | |
777 | |
778 ------------- | |
779 -- Is_Mult -- | |
780 ------------- | |
781 | |
782 function Is_Mult (IP : Natural) return Boolean is | |
783 C : constant Character := Expression (IP); | |
784 | |
785 begin | |
786 return C = '*' | |
787 or else C = '+' | |
788 or else C = '?' | |
789 or else (C = '{' and then Is_Curly_Operator (IP)); | |
790 end Is_Mult; | |
791 | |
792 ----------------------- | |
793 -- Link_Operand_Tail -- | |
794 ----------------------- | |
795 | |
796 procedure Link_Operand_Tail (P, Val : Pointer) is | |
797 begin | |
798 if P <= PM.Size and then Program (P) = BRANCH then | |
799 Link_Tail (Operand (P), Val); | |
800 end if; | |
801 end Link_Operand_Tail; | |
802 | |
803 --------------- | |
804 -- Link_Tail -- | |
805 --------------- | |
806 | |
807 procedure Link_Tail (P, Val : Pointer) is | |
808 Scan : Pointer; | |
809 Temp : Pointer; | |
810 Offset : Pointer; | |
811 | |
812 begin | |
813 -- Find last node (the size of the pattern matcher might be too | |
814 -- small, so don't try to read past its end). | |
815 | |
816 Scan := P; | |
817 while Scan + Next_Pointer_Bytes <= PM.Size loop | |
818 Temp := Get_Next (Program, Scan); | |
819 exit when Temp = Scan; | |
820 Scan := Temp; | |
821 end loop; | |
822 | |
823 Offset := Val - Scan; | |
824 | |
825 Emit_Natural (Scan + 1, Natural (Offset)); | |
826 end Link_Tail; | |
827 | |
828 ----------- | |
829 -- Parse -- | |
830 ----------- | |
831 | |
832 -- Combining parenthesis handling with the base level of regular | |
833 -- expression is a trifle forced, but the need to tie the tails of the | |
834 -- the branches to what follows makes it hard to avoid. | |
835 | |
836 procedure Parse | |
837 (Parenthesized : Boolean; | |
838 Capturing : Boolean; | |
839 Flags : out Expression_Flags; | |
840 IP : out Pointer) | |
841 is | |
842 E : String renames Expression; | |
843 Br, Br2 : Pointer; | |
844 Ender : Pointer; | |
845 Par_No : Natural; | |
846 New_Flags : Expression_Flags; | |
847 Have_Branch : Boolean := False; | |
848 | |
849 begin | |
850 Flags := (Has_Width => True, others => False); -- Tentatively | |
851 | |
852 -- Make an OPEN node, if parenthesized | |
853 | |
854 if Parenthesized and then Capturing then | |
855 if Matcher.Paren_Count > Max_Paren_Count then | |
856 Fail ("too many ()"); | |
857 end if; | |
858 | |
859 Par_No := Matcher.Paren_Count + 1; | |
860 Matcher.Paren_Count := Matcher.Paren_Count + 1; | |
861 IP := Emit_Node (OPEN); | |
862 Emit (Character'Val (Par_No)); | |
863 else | |
864 IP := 0; | |
865 Par_No := 0; | |
866 end if; | |
867 | |
868 -- Pick up the branches, linking them together | |
869 | |
870 Parse_Branch (New_Flags, True, Br); | |
871 | |
872 if Br = 0 then | |
873 IP := 0; | |
874 return; | |
875 end if; | |
876 | |
877 if Parse_Pos <= Parse_End | |
878 and then E (Parse_Pos) = '|' | |
879 then | |
880 Insert_Operator (BRANCH, Br); | |
881 Have_Branch := True; | |
882 end if; | |
883 | |
884 if IP /= 0 then | |
885 Link_Tail (IP, Br); -- OPEN -> first | |
886 else | |
887 IP := Br; | |
888 end if; | |
889 | |
890 if not New_Flags.Has_Width then | |
891 Flags.Has_Width := False; | |
892 end if; | |
893 | |
894 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; | |
895 | |
896 while Parse_Pos <= Parse_End | |
897 and then (E (Parse_Pos) = '|') | |
898 loop | |
899 Parse_Pos := Parse_Pos + 1; | |
900 Parse_Branch (New_Flags, False, Br); | |
901 | |
902 if Br = 0 then | |
903 IP := 0; | |
904 return; | |
905 end if; | |
906 | |
907 Link_Tail (IP, Br); -- BRANCH -> BRANCH | |
908 | |
909 if not New_Flags.Has_Width then | |
910 Flags.Has_Width := False; | |
911 end if; | |
912 | |
913 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; | |
914 end loop; | |
915 | |
916 -- Make a closing node, and hook it on the end | |
917 | |
918 if Parenthesized then | |
919 if Capturing then | |
920 Ender := Emit_Node (CLOSE); | |
921 Emit (Character'Val (Par_No)); | |
922 Link_Tail (IP, Ender); | |
923 | |
924 else | |
925 -- Need to keep looking after the closing parenthesis | |
926 Ender := Emit_Ptr; | |
927 end if; | |
928 | |
929 else | |
930 Ender := Emit_Node (EOP); | |
931 Link_Tail (IP, Ender); | |
932 end if; | |
933 | |
934 if Have_Branch and then Emit_Ptr <= PM.Size + 1 then | |
935 | |
936 -- Hook the tails of the branches to the closing node | |
937 | |
938 Br := IP; | |
939 loop | |
940 Link_Operand_Tail (Br, Ender); | |
941 Br2 := Get_Next (Program, Br); | |
942 exit when Br2 = Br; | |
943 Br := Br2; | |
944 end loop; | |
945 end if; | |
946 | |
947 -- Check for proper termination | |
948 | |
949 if Parenthesized then | |
950 if Parse_Pos > Parse_End or else E (Parse_Pos) /= ')' then | |
951 Fail ("unmatched ()"); | |
952 end if; | |
953 | |
954 Parse_Pos := Parse_Pos + 1; | |
955 | |
956 elsif Parse_Pos <= Parse_End then | |
957 if E (Parse_Pos) = ')' then | |
958 Fail ("unmatched ')'"); | |
959 else | |
960 Fail ("junk on end"); -- "Can't happen" | |
961 end if; | |
962 end if; | |
963 end Parse; | |
964 | |
965 ---------------- | |
966 -- Parse_Atom -- | |
967 ---------------- | |
968 | |
969 procedure Parse_Atom | |
970 (Expr_Flags : out Expression_Flags; | |
971 IP : out Pointer) | |
972 is | |
973 C : Character; | |
974 | |
975 begin | |
976 -- Tentatively set worst expression case | |
977 | |
978 Expr_Flags := Worst_Expression; | |
979 | |
980 C := Expression (Parse_Pos); | |
981 Parse_Pos := Parse_Pos + 1; | |
982 | |
983 case (C) is | |
984 when '^' => | |
985 IP := | |
986 Emit_Node | |
987 (if (Flags and Multiple_Lines) /= 0 then MBOL | |
988 elsif (Flags and Single_Line) /= 0 then SBOL | |
989 else BOL); | |
990 | |
991 when '$' => | |
992 IP := | |
993 Emit_Node | |
994 (if (Flags and Multiple_Lines) /= 0 then MEOL | |
995 elsif (Flags and Single_Line) /= 0 then SEOL | |
996 else EOL); | |
997 | |
998 when '.' => | |
999 IP := | |
1000 Emit_Node | |
1001 (if (Flags and Single_Line) /= 0 then SANY else ANY); | |
1002 | |
1003 Expr_Flags.Has_Width := True; | |
1004 Expr_Flags.Simple := True; | |
1005 | |
1006 when '[' => | |
1007 Parse_Character_Class (IP); | |
1008 Expr_Flags.Has_Width := True; | |
1009 Expr_Flags.Simple := True; | |
1010 | |
1011 when '(' => | |
1012 declare | |
1013 New_Flags : Expression_Flags; | |
1014 | |
1015 begin | |
1016 if Parse_Pos <= Parse_End - 1 | |
1017 and then Expression (Parse_Pos) = '?' | |
1018 and then Expression (Parse_Pos + 1) = ':' | |
1019 then | |
1020 Parse_Pos := Parse_Pos + 2; | |
1021 | |
1022 -- Non-capturing parenthesis | |
1023 | |
1024 Parse (True, False, New_Flags, IP); | |
1025 | |
1026 else | |
1027 -- Capturing parenthesis | |
1028 | |
1029 Parse (True, True, New_Flags, IP); | |
1030 Expr_Flags.Has_Width := | |
1031 Expr_Flags.Has_Width or else New_Flags.Has_Width; | |
1032 Expr_Flags.SP_Start := | |
1033 Expr_Flags.SP_Start or else New_Flags.SP_Start; | |
1034 if IP = 0 then | |
1035 return; | |
1036 end if; | |
1037 end if; | |
1038 end; | |
1039 | |
1040 when '|' | ASCII.LF | ')' => | |
1041 Fail ("internal urp"); -- Supposed to be caught earlier | |
1042 | |
1043 when '?' | '+' | '*' => | |
1044 Fail (C & " follows nothing"); | |
1045 | |
1046 when '{' => | |
1047 if Is_Curly_Operator (Parse_Pos - 1) then | |
1048 Fail (C & " follows nothing"); | |
1049 else | |
1050 Parse_Literal (Expr_Flags, IP); | |
1051 end if; | |
1052 | |
1053 when '\' => | |
1054 if Parse_Pos > Parse_End then | |
1055 Fail ("trailing \"); | |
1056 end if; | |
1057 | |
1058 Parse_Pos := Parse_Pos + 1; | |
1059 | |
1060 case Expression (Parse_Pos - 1) is | |
1061 when 'b' => | |
1062 IP := Emit_Node (BOUND); | |
1063 | |
1064 when 'B' => | |
1065 IP := Emit_Node (NBOUND); | |
1066 | |
1067 when 's' => | |
1068 IP := Emit_Node (SPACE); | |
1069 Expr_Flags.Simple := True; | |
1070 Expr_Flags.Has_Width := True; | |
1071 | |
1072 when 'S' => | |
1073 IP := Emit_Node (NSPACE); | |
1074 Expr_Flags.Simple := True; | |
1075 Expr_Flags.Has_Width := True; | |
1076 | |
1077 when 'd' => | |
1078 IP := Emit_Node (DIGIT); | |
1079 Expr_Flags.Simple := True; | |
1080 Expr_Flags.Has_Width := True; | |
1081 | |
1082 when 'D' => | |
1083 IP := Emit_Node (NDIGIT); | |
1084 Expr_Flags.Simple := True; | |
1085 Expr_Flags.Has_Width := True; | |
1086 | |
1087 when 'w' => | |
1088 IP := Emit_Node (ALNUM); | |
1089 Expr_Flags.Simple := True; | |
1090 Expr_Flags.Has_Width := True; | |
1091 | |
1092 when 'W' => | |
1093 IP := Emit_Node (NALNUM); | |
1094 Expr_Flags.Simple := True; | |
1095 Expr_Flags.Has_Width := True; | |
1096 | |
1097 when 'A' => | |
1098 IP := Emit_Node (SBOL); | |
1099 | |
1100 when 'G' => | |
1101 IP := Emit_Node (SEOL); | |
1102 | |
1103 when '0' .. '9' => | |
1104 IP := Emit_Node (REFF); | |
1105 | |
1106 declare | |
1107 Save : constant Natural := Parse_Pos - 1; | |
1108 | |
1109 begin | |
1110 while Parse_Pos <= Expression'Last | |
1111 and then Is_Digit (Expression (Parse_Pos)) | |
1112 loop | |
1113 Parse_Pos := Parse_Pos + 1; | |
1114 end loop; | |
1115 | |
1116 Emit (Character'Val (Natural'Value | |
1117 (Expression (Save .. Parse_Pos - 1)))); | |
1118 end; | |
1119 | |
1120 when others => | |
1121 Parse_Pos := Parse_Pos - 1; | |
1122 Parse_Literal (Expr_Flags, IP); | |
1123 end case; | |
1124 | |
1125 when others => | |
1126 Parse_Literal (Expr_Flags, IP); | |
1127 end case; | |
1128 end Parse_Atom; | |
1129 | |
1130 ------------------ | |
1131 -- Parse_Branch -- | |
1132 ------------------ | |
1133 | |
1134 procedure Parse_Branch | |
1135 (Flags : out Expression_Flags; | |
1136 First : Boolean; | |
1137 IP : out Pointer) | |
1138 is | |
1139 E : String renames Expression; | |
1140 Chain : Pointer; | |
1141 Last : Pointer; | |
1142 New_Flags : Expression_Flags; | |
1143 | |
1144 Discard : Pointer; | |
1145 pragma Warnings (Off, Discard); | |
1146 | |
1147 begin | |
1148 Flags := Worst_Expression; -- Tentatively | |
1149 IP := (if First then Emit_Ptr else Emit_Node (BRANCH)); | |
1150 | |
1151 Chain := 0; | |
1152 while Parse_Pos <= Parse_End | |
1153 and then E (Parse_Pos) /= ')' | |
1154 and then E (Parse_Pos) /= ASCII.LF | |
1155 and then E (Parse_Pos) /= '|' | |
1156 loop | |
1157 Parse_Piece (New_Flags, Last); | |
1158 | |
1159 if Last = 0 then | |
1160 IP := 0; | |
1161 return; | |
1162 end if; | |
1163 | |
1164 Flags.Has_Width := Flags.Has_Width or else New_Flags.Has_Width; | |
1165 | |
1166 if Chain = 0 then -- First piece | |
1167 Flags.SP_Start := Flags.SP_Start or else New_Flags.SP_Start; | |
1168 else | |
1169 Link_Tail (Chain, Last); | |
1170 end if; | |
1171 | |
1172 Chain := Last; | |
1173 end loop; | |
1174 | |
1175 -- Case where loop ran zero CURLY | |
1176 | |
1177 if Chain = 0 then | |
1178 Discard := Emit_Node (NOTHING); | |
1179 end if; | |
1180 end Parse_Branch; | |
1181 | |
1182 --------------------------- | |
1183 -- Parse_Character_Class -- | |
1184 --------------------------- | |
1185 | |
1186 procedure Parse_Character_Class (IP : out Pointer) is | |
1187 Bitmap : Character_Class; | |
1188 Invert : Boolean := False; | |
1189 In_Range : Boolean := False; | |
1190 Named_Class : Std_Class := ANYOF_NONE; | |
1191 Value : Character; | |
1192 Last_Value : Character := ASCII.NUL; | |
1193 | |
1194 begin | |
1195 Reset_Class (Bitmap); | |
1196 | |
1197 -- Do we have an invert character class ? | |
1198 | |
1199 if Parse_Pos <= Parse_End | |
1200 and then Expression (Parse_Pos) = '^' | |
1201 then | |
1202 Invert := True; | |
1203 Parse_Pos := Parse_Pos + 1; | |
1204 end if; | |
1205 | |
1206 -- First character can be ] or - without closing the class | |
1207 | |
1208 if Parse_Pos <= Parse_End | |
1209 and then (Expression (Parse_Pos) = ']' | |
1210 or else Expression (Parse_Pos) = '-') | |
1211 then | |
1212 Set_In_Class (Bitmap, Expression (Parse_Pos)); | |
1213 Parse_Pos := Parse_Pos + 1; | |
1214 end if; | |
1215 | |
1216 -- While we don't have the end of the class | |
1217 | |
1218 while Parse_Pos <= Parse_End | |
1219 and then Expression (Parse_Pos) /= ']' | |
1220 loop | |
1221 Named_Class := ANYOF_NONE; | |
1222 Value := Expression (Parse_Pos); | |
1223 Parse_Pos := Parse_Pos + 1; | |
1224 | |
1225 -- Do we have a Posix character class | |
1226 if Value = '[' then | |
1227 Named_Class := Parse_Posix_Character_Class; | |
1228 | |
1229 elsif Value = '\' then | |
1230 if Parse_Pos = Parse_End then | |
1231 Fail ("Trailing \"); | |
1232 end if; | |
1233 Value := Expression (Parse_Pos); | |
1234 Parse_Pos := Parse_Pos + 1; | |
1235 | |
1236 case Value is | |
1237 when 'w' => Named_Class := ANYOF_ALNUM; | |
1238 when 'W' => Named_Class := ANYOF_NALNUM; | |
1239 when 's' => Named_Class := ANYOF_SPACE; | |
1240 when 'S' => Named_Class := ANYOF_NSPACE; | |
1241 when 'd' => Named_Class := ANYOF_DIGIT; | |
1242 when 'D' => Named_Class := ANYOF_NDIGIT; | |
1243 when 'n' => Value := ASCII.LF; | |
1244 when 'r' => Value := ASCII.CR; | |
1245 when 't' => Value := ASCII.HT; | |
1246 when 'f' => Value := ASCII.FF; | |
1247 when 'e' => Value := ASCII.ESC; | |
1248 when 'a' => Value := ASCII.BEL; | |
1249 | |
1250 -- when 'x' => ??? hexadecimal value | |
1251 -- when 'c' => ??? control character | |
1252 -- when '0'..'9' => ??? octal character | |
1253 | |
1254 when others => null; | |
1255 end case; | |
1256 end if; | |
1257 | |
1258 -- Do we have a character class? | |
1259 | |
1260 if Named_Class /= ANYOF_NONE then | |
1261 | |
1262 -- A range like 'a-\d' or 'a-[:digit:] is not a range | |
1263 | |
1264 if In_Range then | |
1265 Set_In_Class (Bitmap, Last_Value); | |
1266 Set_In_Class (Bitmap, '-'); | |
1267 In_Range := False; | |
1268 end if; | |
1269 | |
1270 -- Expand the range | |
1271 | |
1272 case Named_Class is | |
1273 when ANYOF_NONE => null; | |
1274 | |
1275 when ANYOF_ALNUM | ANYOF_ALNUMC => | |
1276 for Value in Class_Byte'Range loop | |
1277 if Is_Alnum (Character'Val (Value)) then | |
1278 Set_In_Class (Bitmap, Character'Val (Value)); | |
1279 end if; | |
1280 end loop; | |
1281 | |
1282 when ANYOF_NALNUM | ANYOF_NALNUMC => | |
1283 for Value in Class_Byte'Range loop | |
1284 if not Is_Alnum (Character'Val (Value)) then | |
1285 Set_In_Class (Bitmap, Character'Val (Value)); | |
1286 end if; | |
1287 end loop; | |
1288 | |
1289 when ANYOF_SPACE => | |
1290 for Value in Class_Byte'Range loop | |
1291 if Is_White_Space (Character'Val (Value)) then | |
1292 Set_In_Class (Bitmap, Character'Val (Value)); | |
1293 end if; | |
1294 end loop; | |
1295 | |
1296 when ANYOF_NSPACE => | |
1297 for Value in Class_Byte'Range loop | |
1298 if not Is_White_Space (Character'Val (Value)) then | |
1299 Set_In_Class (Bitmap, Character'Val (Value)); | |
1300 end if; | |
1301 end loop; | |
1302 | |
1303 when ANYOF_DIGIT => | |
1304 for Value in Class_Byte'Range loop | |
1305 if Is_Digit (Character'Val (Value)) then | |
1306 Set_In_Class (Bitmap, Character'Val (Value)); | |
1307 end if; | |
1308 end loop; | |
1309 | |
1310 when ANYOF_NDIGIT => | |
1311 for Value in Class_Byte'Range loop | |
1312 if not Is_Digit (Character'Val (Value)) then | |
1313 Set_In_Class (Bitmap, Character'Val (Value)); | |
1314 end if; | |
1315 end loop; | |
1316 | |
1317 when ANYOF_ALPHA => | |
1318 for Value in Class_Byte'Range loop | |
1319 if Is_Letter (Character'Val (Value)) then | |
1320 Set_In_Class (Bitmap, Character'Val (Value)); | |
1321 end if; | |
1322 end loop; | |
1323 | |
1324 when ANYOF_NALPHA => | |
1325 for Value in Class_Byte'Range loop | |
1326 if not Is_Letter (Character'Val (Value)) then | |
1327 Set_In_Class (Bitmap, Character'Val (Value)); | |
1328 end if; | |
1329 end loop; | |
1330 | |
1331 when ANYOF_ASCII => | |
1332 for Value in 0 .. 127 loop | |
1333 Set_In_Class (Bitmap, Character'Val (Value)); | |
1334 end loop; | |
1335 | |
1336 when ANYOF_NASCII => | |
1337 for Value in 128 .. 255 loop | |
1338 Set_In_Class (Bitmap, Character'Val (Value)); | |
1339 end loop; | |
1340 | |
1341 when ANYOF_CNTRL => | |
1342 for Value in Class_Byte'Range loop | |
1343 if Is_Control (Character'Val (Value)) then | |
1344 Set_In_Class (Bitmap, Character'Val (Value)); | |
1345 end if; | |
1346 end loop; | |
1347 | |
1348 when ANYOF_NCNTRL => | |
1349 for Value in Class_Byte'Range loop | |
1350 if not Is_Control (Character'Val (Value)) then | |
1351 Set_In_Class (Bitmap, Character'Val (Value)); | |
1352 end if; | |
1353 end loop; | |
1354 | |
1355 when ANYOF_GRAPH => | |
1356 for Value in Class_Byte'Range loop | |
1357 if Is_Graphic (Character'Val (Value)) then | |
1358 Set_In_Class (Bitmap, Character'Val (Value)); | |
1359 end if; | |
1360 end loop; | |
1361 | |
1362 when ANYOF_NGRAPH => | |
1363 for Value in Class_Byte'Range loop | |
1364 if not Is_Graphic (Character'Val (Value)) then | |
1365 Set_In_Class (Bitmap, Character'Val (Value)); | |
1366 end if; | |
1367 end loop; | |
1368 | |
1369 when ANYOF_LOWER => | |
1370 for Value in Class_Byte'Range loop | |
1371 if Is_Lower (Character'Val (Value)) then | |
1372 Set_In_Class (Bitmap, Character'Val (Value)); | |
1373 end if; | |
1374 end loop; | |
1375 | |
1376 when ANYOF_NLOWER => | |
1377 for Value in Class_Byte'Range loop | |
1378 if not Is_Lower (Character'Val (Value)) then | |
1379 Set_In_Class (Bitmap, Character'Val (Value)); | |
1380 end if; | |
1381 end loop; | |
1382 | |
1383 when ANYOF_PRINT => | |
1384 for Value in Class_Byte'Range loop | |
1385 if Is_Printable (Character'Val (Value)) then | |
1386 Set_In_Class (Bitmap, Character'Val (Value)); | |
1387 end if; | |
1388 end loop; | |
1389 | |
1390 when ANYOF_NPRINT => | |
1391 for Value in Class_Byte'Range loop | |
1392 if not Is_Printable (Character'Val (Value)) then | |
1393 Set_In_Class (Bitmap, Character'Val (Value)); | |
1394 end if; | |
1395 end loop; | |
1396 | |
1397 when ANYOF_PUNCT => | |
1398 for Value in Class_Byte'Range loop | |
1399 if Is_Printable (Character'Val (Value)) | |
1400 and then not Is_White_Space (Character'Val (Value)) | |
1401 and then not Is_Alnum (Character'Val (Value)) | |
1402 then | |
1403 Set_In_Class (Bitmap, Character'Val (Value)); | |
1404 end if; | |
1405 end loop; | |
1406 | |
1407 when ANYOF_NPUNCT => | |
1408 for Value in Class_Byte'Range loop | |
1409 if not Is_Printable (Character'Val (Value)) | |
1410 or else Is_White_Space (Character'Val (Value)) | |
1411 or else Is_Alnum (Character'Val (Value)) | |
1412 then | |
1413 Set_In_Class (Bitmap, Character'Val (Value)); | |
1414 end if; | |
1415 end loop; | |
1416 | |
1417 when ANYOF_UPPER => | |
1418 for Value in Class_Byte'Range loop | |
1419 if Is_Upper (Character'Val (Value)) then | |
1420 Set_In_Class (Bitmap, Character'Val (Value)); | |
1421 end if; | |
1422 end loop; | |
1423 | |
1424 when ANYOF_NUPPER => | |
1425 for Value in Class_Byte'Range loop | |
1426 if not Is_Upper (Character'Val (Value)) then | |
1427 Set_In_Class (Bitmap, Character'Val (Value)); | |
1428 end if; | |
1429 end loop; | |
1430 | |
1431 when ANYOF_XDIGIT => | |
1432 for Value in Class_Byte'Range loop | |
1433 if Is_Hexadecimal_Digit (Character'Val (Value)) then | |
1434 Set_In_Class (Bitmap, Character'Val (Value)); | |
1435 end if; | |
1436 end loop; | |
1437 | |
1438 when ANYOF_NXDIGIT => | |
1439 for Value in Class_Byte'Range loop | |
1440 if not Is_Hexadecimal_Digit | |
1441 (Character'Val (Value)) | |
1442 then | |
1443 Set_In_Class (Bitmap, Character'Val (Value)); | |
1444 end if; | |
1445 end loop; | |
1446 | |
1447 end case; | |
1448 | |
1449 -- Not a character range | |
1450 | |
1451 elsif not In_Range then | |
1452 Last_Value := Value; | |
1453 | |
1454 if Parse_Pos > Expression'Last then | |
1455 Fail ("Empty character class []"); | |
1456 end if; | |
1457 | |
1458 if Expression (Parse_Pos) = '-' | |
1459 and then Parse_Pos < Parse_End | |
1460 and then Expression (Parse_Pos + 1) /= ']' | |
1461 then | |
1462 Parse_Pos := Parse_Pos + 1; | |
1463 | |
1464 -- Do we have a range like '\d-a' and '[:space:]-a' | |
1465 -- which is not a real range | |
1466 | |
1467 if Named_Class /= ANYOF_NONE then | |
1468 Set_In_Class (Bitmap, '-'); | |
1469 else | |
1470 In_Range := True; | |
1471 end if; | |
1472 | |
1473 else | |
1474 Set_In_Class (Bitmap, Value); | |
1475 | |
1476 end if; | |
1477 | |
1478 -- Else in a character range | |
1479 | |
1480 else | |
1481 if Last_Value > Value then | |
1482 Fail ("Invalid Range [" & Last_Value'Img | |
1483 & "-" & Value'Img & "]"); | |
1484 end if; | |
1485 | |
1486 while Last_Value <= Value loop | |
1487 Set_In_Class (Bitmap, Last_Value); | |
1488 Last_Value := Character'Succ (Last_Value); | |
1489 end loop; | |
1490 | |
1491 In_Range := False; | |
1492 | |
1493 end if; | |
1494 | |
1495 end loop; | |
1496 | |
1497 -- Optimize case-insensitive ranges (put the upper case or lower | |
1498 -- case character into the bitmap) | |
1499 | |
1500 if (Flags and Case_Insensitive) /= 0 then | |
1501 for C in Character'Range loop | |
1502 if Get_From_Class (Bitmap, C) then | |
1503 Set_In_Class (Bitmap, To_Lower (C)); | |
1504 Set_In_Class (Bitmap, To_Upper (C)); | |
1505 end if; | |
1506 end loop; | |
1507 end if; | |
1508 | |
1509 -- Optimize inverted classes | |
1510 | |
1511 if Invert then | |
1512 for J in Bitmap'Range loop | |
1513 Bitmap (J) := not Bitmap (J); | |
1514 end loop; | |
1515 end if; | |
1516 | |
1517 Parse_Pos := Parse_Pos + 1; | |
1518 | |
1519 -- Emit the class | |
1520 | |
1521 IP := Emit_Node (ANYOF); | |
1522 Emit_Class (Bitmap); | |
1523 end Parse_Character_Class; | |
1524 | |
1525 ------------------- | |
1526 -- Parse_Literal -- | |
1527 ------------------- | |
1528 | |
1529 -- This is a bit tricky due to quoted chars and due to | |
1530 -- the multiplier characters '*', '+', and '?' that | |
1531 -- take the SINGLE char previous as their operand. | |
1532 | |
1533 -- On entry, the character at Parse_Pos - 1 is going to go | |
1534 -- into the string, no matter what it is. It could be | |
1535 -- following a \ if Parse_Atom was entered from the '\' case. | |
1536 | |
1537 -- Basic idea is to pick up a good char in C and examine | |
1538 -- the next char. If Is_Mult (C) then twiddle, if it's a \ | |
1539 -- then frozzle and if it's another magic char then push C and | |
1540 -- terminate the string. If none of the above, push C on the | |
1541 -- string and go around again. | |
1542 | |
1543 -- Start_Pos is used to remember where "the current character" | |
1544 -- starts in the string, if due to an Is_Mult we need to back | |
1545 -- up and put the current char in a separate 1-character string. | |
1546 -- When Start_Pos is 0, C is the only char in the string; | |
1547 -- this is used in Is_Mult handling, and in setting the SIMPLE | |
1548 -- flag at the end. | |
1549 | |
1550 procedure Parse_Literal | |
1551 (Expr_Flags : out Expression_Flags; | |
1552 IP : out Pointer) | |
1553 is | |
1554 Start_Pos : Natural := 0; | |
1555 C : Character; | |
1556 Length_Ptr : Pointer; | |
1557 | |
1558 Has_Special_Operator : Boolean := False; | |
1559 | |
1560 begin | |
1561 Parse_Pos := Parse_Pos - 1; -- Look at current character | |
1562 | |
1563 IP := | |
1564 Emit_Node | |
1565 (if (Flags and Case_Insensitive) /= 0 then EXACTF else EXACT); | |
1566 | |
1567 Length_Ptr := Emit_Ptr; | |
1568 Emit_Ptr := String_Operand (IP); | |
1569 | |
1570 Parse_Loop : | |
1571 loop | |
1572 C := Expression (Parse_Pos); -- Get current character | |
1573 | |
1574 case C is | |
1575 when '.' | '[' | '(' | ')' | '|' | ASCII.LF | '$' | '^' => | |
1576 | |
1577 if Start_Pos = 0 then | |
1578 Start_Pos := Parse_Pos; | |
1579 Emit (C); -- First character is always emitted | |
1580 else | |
1581 exit Parse_Loop; -- Else we are done | |
1582 end if; | |
1583 | |
1584 when '?' | '+' | '*' | '{' => | |
1585 | |
1586 if Start_Pos = 0 then | |
1587 Start_Pos := Parse_Pos; | |
1588 Emit (C); -- First character is always emitted | |
1589 | |
1590 -- Are we looking at an operator, or is this | |
1591 -- simply a normal character ? | |
1592 | |
1593 elsif not Is_Mult (Parse_Pos) then | |
1594 Start_Pos := Parse_Pos; | |
1595 Case_Emit (C); | |
1596 | |
1597 else | |
1598 -- We've got something like "abc?d". Mark this as a | |
1599 -- special case. What we want to emit is a first | |
1600 -- constant string for "ab", then one for "c" that will | |
1601 -- ultimately be transformed with a CURLY operator, A | |
1602 -- special case has to be handled for "a?", since there | |
1603 -- is no initial string to emit. | |
1604 | |
1605 Has_Special_Operator := True; | |
1606 exit Parse_Loop; | |
1607 end if; | |
1608 | |
1609 when '\' => | |
1610 Start_Pos := Parse_Pos; | |
1611 | |
1612 if Parse_Pos = Parse_End then | |
1613 Fail ("Trailing \"); | |
1614 | |
1615 else | |
1616 case Expression (Parse_Pos + 1) is | |
1617 when 'b' | 'B' | 's' | 'S' | 'd' | 'D' | |
1618 | 'w' | 'W' | '0' .. '9' | 'G' | 'A' | |
1619 => exit Parse_Loop; | |
1620 when 'n' => Emit (ASCII.LF); | |
1621 when 't' => Emit (ASCII.HT); | |
1622 when 'r' => Emit (ASCII.CR); | |
1623 when 'f' => Emit (ASCII.FF); | |
1624 when 'e' => Emit (ASCII.ESC); | |
1625 when 'a' => Emit (ASCII.BEL); | |
1626 when others => Emit (Expression (Parse_Pos + 1)); | |
1627 end case; | |
1628 | |
1629 Parse_Pos := Parse_Pos + 1; | |
1630 end if; | |
1631 | |
1632 when others => | |
1633 Start_Pos := Parse_Pos; | |
1634 Case_Emit (C); | |
1635 end case; | |
1636 | |
1637 Parse_Pos := Parse_Pos + 1; | |
1638 exit Parse_Loop when Parse_Pos > Parse_End | |
1639 or else Emit_Ptr - Length_Ptr = 254; | |
1640 end loop Parse_Loop; | |
1641 | |
1642 -- Is the string followed by a '*+?{' operator ? If yes, and if there | |
1643 -- is an initial string to emit, do it now. | |
1644 | |
1645 if Has_Special_Operator | |
1646 and then Emit_Ptr >= Length_Ptr + Next_Pointer_Bytes | |
1647 then | |
1648 Emit_Ptr := Emit_Ptr - 1; | |
1649 Parse_Pos := Start_Pos; | |
1650 end if; | |
1651 | |
1652 if Length_Ptr <= PM.Size then | |
1653 Program (Length_Ptr) := Character'Val (Emit_Ptr - Length_Ptr - 2); | |
1654 end if; | |
1655 | |
1656 Expr_Flags.Has_Width := True; | |
1657 | |
1658 -- Slight optimization when there is a single character | |
1659 | |
1660 if Emit_Ptr = Length_Ptr + 2 then | |
1661 Expr_Flags.Simple := True; | |
1662 end if; | |
1663 end Parse_Literal; | |
1664 | |
1665 ----------------- | |
1666 -- Parse_Piece -- | |
1667 ----------------- | |
1668 | |
1669 -- Note that the branching code sequences used for '?' and the | |
1670 -- general cases of '*' and + are somewhat optimized: they use | |
1671 -- the same NOTHING node as both the endmarker for their branch | |
1672 -- list and the body of the last branch. It might seem that | |
1673 -- this node could be dispensed with entirely, but the endmarker | |
1674 -- role is not redundant. | |
1675 | |
1676 procedure Parse_Piece | |
1677 (Expr_Flags : out Expression_Flags; | |
1678 IP : out Pointer) | |
1679 is | |
1680 Op : Character; | |
1681 New_Flags : Expression_Flags; | |
1682 Greedy : Boolean := True; | |
1683 | |
1684 begin | |
1685 Parse_Atom (New_Flags, IP); | |
1686 | |
1687 if IP = 0 then | |
1688 return; | |
1689 end if; | |
1690 | |
1691 if Parse_Pos > Parse_End | |
1692 or else not Is_Mult (Parse_Pos) | |
1693 then | |
1694 Expr_Flags := New_Flags; | |
1695 return; | |
1696 end if; | |
1697 | |
1698 Op := Expression (Parse_Pos); | |
1699 | |
1700 Expr_Flags := | |
1701 (if Op /= '+' | |
1702 then (SP_Start => True, others => False) | |
1703 else (Has_Width => True, others => False)); | |
1704 | |
1705 -- Detect non greedy operators in the easy cases | |
1706 | |
1707 if Op /= '{' | |
1708 and then Parse_Pos + 1 <= Parse_End | |
1709 and then Expression (Parse_Pos + 1) = '?' | |
1710 then | |
1711 Greedy := False; | |
1712 Parse_Pos := Parse_Pos + 1; | |
1713 end if; | |
1714 | |
1715 -- Generate the byte code | |
1716 | |
1717 case Op is | |
1718 when '*' => | |
1719 | |
1720 if New_Flags.Simple then | |
1721 Insert_Operator (STAR, IP, Greedy); | |
1722 else | |
1723 Link_Tail (IP, Emit_Node (WHILEM)); | |
1724 Insert_Curly_Operator | |
1725 (CURLYX, 0, Max_Curly_Repeat, IP, Greedy); | |
1726 Link_Tail (IP, Emit_Node (NOTHING)); | |
1727 end if; | |
1728 | |
1729 when '+' => | |
1730 | |
1731 if New_Flags.Simple then | |
1732 Insert_Operator (PLUS, IP, Greedy); | |
1733 else | |
1734 Link_Tail (IP, Emit_Node (WHILEM)); | |
1735 Insert_Curly_Operator | |
1736 (CURLYX, 1, Max_Curly_Repeat, IP, Greedy); | |
1737 Link_Tail (IP, Emit_Node (NOTHING)); | |
1738 end if; | |
1739 | |
1740 when '?' => | |
1741 if New_Flags.Simple then | |
1742 Insert_Curly_Operator (CURLY, 0, 1, IP, Greedy); | |
1743 else | |
1744 Link_Tail (IP, Emit_Node (WHILEM)); | |
1745 Insert_Curly_Operator (CURLYX, 0, 1, IP, Greedy); | |
1746 Link_Tail (IP, Emit_Node (NOTHING)); | |
1747 end if; | |
1748 | |
1749 when '{' => | |
1750 declare | |
1751 Min, Max : Natural; | |
1752 | |
1753 begin | |
1754 Get_Curly_Arguments (Parse_Pos, Min, Max, Greedy); | |
1755 | |
1756 if New_Flags.Simple then | |
1757 Insert_Curly_Operator (CURLY, Min, Max, IP, Greedy); | |
1758 else | |
1759 Link_Tail (IP, Emit_Node (WHILEM)); | |
1760 Insert_Curly_Operator (CURLYX, Min, Max, IP, Greedy); | |
1761 Link_Tail (IP, Emit_Node (NOTHING)); | |
1762 end if; | |
1763 end; | |
1764 | |
1765 when others => | |
1766 null; | |
1767 end case; | |
1768 | |
1769 Parse_Pos := Parse_Pos + 1; | |
1770 | |
1771 if Parse_Pos <= Parse_End | |
1772 and then Is_Mult (Parse_Pos) | |
1773 then | |
1774 Fail ("nested *+{"); | |
1775 end if; | |
1776 end Parse_Piece; | |
1777 | |
1778 --------------------------------- | |
1779 -- Parse_Posix_Character_Class -- | |
1780 --------------------------------- | |
1781 | |
1782 function Parse_Posix_Character_Class return Std_Class is | |
1783 Invert : Boolean := False; | |
1784 Class : Std_Class := ANYOF_NONE; | |
1785 E : String renames Expression; | |
1786 | |
1787 -- Class names. Note that code assumes that the length of all | |
1788 -- classes starting with the same letter have the same length. | |
1789 | |
1790 Alnum : constant String := "alnum:]"; | |
1791 Alpha : constant String := "alpha:]"; | |
1792 Ascii_C : constant String := "ascii:]"; | |
1793 Cntrl : constant String := "cntrl:]"; | |
1794 Digit : constant String := "digit:]"; | |
1795 Graph : constant String := "graph:]"; | |
1796 Lower : constant String := "lower:]"; | |
1797 Print : constant String := "print:]"; | |
1798 Punct : constant String := "punct:]"; | |
1799 Space : constant String := "space:]"; | |
1800 Upper : constant String := "upper:]"; | |
1801 Word : constant String := "word:]"; | |
1802 Xdigit : constant String := "xdigit:]"; | |
1803 | |
1804 begin | |
1805 -- Case of character class specified | |
1806 | |
1807 if Parse_Pos <= Parse_End | |
1808 and then Expression (Parse_Pos) = ':' | |
1809 then | |
1810 Parse_Pos := Parse_Pos + 1; | |
1811 | |
1812 -- Do we have something like: [[:^alpha:]] | |
1813 | |
1814 if Parse_Pos <= Parse_End | |
1815 and then Expression (Parse_Pos) = '^' | |
1816 then | |
1817 Invert := True; | |
1818 Parse_Pos := Parse_Pos + 1; | |
1819 end if; | |
1820 | |
1821 -- Check for class names based on first letter | |
1822 | |
1823 case Expression (Parse_Pos) is | |
1824 when 'a' => | |
1825 | |
1826 -- All 'a' classes have the same length (Alnum'Length) | |
1827 | |
1828 if Parse_Pos + Alnum'Length - 1 <= Parse_End then | |
1829 if | |
1830 E (Parse_Pos .. Parse_Pos + Alnum'Length - 1) = Alnum | |
1831 then | |
1832 Class := | |
1833 (if Invert then ANYOF_NALNUMC else ANYOF_ALNUMC); | |
1834 Parse_Pos := Parse_Pos + Alnum'Length; | |
1835 | |
1836 elsif | |
1837 E (Parse_Pos .. Parse_Pos + Alpha'Length - 1) = Alpha | |
1838 then | |
1839 Class := | |
1840 (if Invert then ANYOF_NALPHA else ANYOF_ALPHA); | |
1841 Parse_Pos := Parse_Pos + Alpha'Length; | |
1842 | |
1843 elsif E (Parse_Pos .. Parse_Pos + Ascii_C'Length - 1) = | |
1844 Ascii_C | |
1845 then | |
1846 Class := | |
1847 (if Invert then ANYOF_NASCII else ANYOF_ASCII); | |
1848 Parse_Pos := Parse_Pos + Ascii_C'Length; | |
1849 else | |
1850 Fail ("Invalid character class: " & E); | |
1851 end if; | |
1852 | |
1853 else | |
1854 Fail ("Invalid character class: " & E); | |
1855 end if; | |
1856 | |
1857 when 'c' => | |
1858 if Parse_Pos + Cntrl'Length - 1 <= Parse_End | |
1859 and then | |
1860 E (Parse_Pos .. Parse_Pos + Cntrl'Length - 1) = Cntrl | |
1861 then | |
1862 Class := (if Invert then ANYOF_NCNTRL else ANYOF_CNTRL); | |
1863 Parse_Pos := Parse_Pos + Cntrl'Length; | |
1864 else | |
1865 Fail ("Invalid character class: " & E); | |
1866 end if; | |
1867 | |
1868 when 'd' => | |
1869 if Parse_Pos + Digit'Length - 1 <= Parse_End | |
1870 and then | |
1871 E (Parse_Pos .. Parse_Pos + Digit'Length - 1) = Digit | |
1872 then | |
1873 Class := (if Invert then ANYOF_NDIGIT else ANYOF_DIGIT); | |
1874 Parse_Pos := Parse_Pos + Digit'Length; | |
1875 end if; | |
1876 | |
1877 when 'g' => | |
1878 if Parse_Pos + Graph'Length - 1 <= Parse_End | |
1879 and then | |
1880 E (Parse_Pos .. Parse_Pos + Graph'Length - 1) = Graph | |
1881 then | |
1882 Class := (if Invert then ANYOF_NGRAPH else ANYOF_GRAPH); | |
1883 Parse_Pos := Parse_Pos + Graph'Length; | |
1884 else | |
1885 Fail ("Invalid character class: " & E); | |
1886 end if; | |
1887 | |
1888 when 'l' => | |
1889 if Parse_Pos + Lower'Length - 1 <= Parse_End | |
1890 and then | |
1891 E (Parse_Pos .. Parse_Pos + Lower'Length - 1) = Lower | |
1892 then | |
1893 Class := (if Invert then ANYOF_NLOWER else ANYOF_LOWER); | |
1894 Parse_Pos := Parse_Pos + Lower'Length; | |
1895 else | |
1896 Fail ("Invalid character class: " & E); | |
1897 end if; | |
1898 | |
1899 when 'p' => | |
1900 | |
1901 -- All 'p' classes have the same length | |
1902 | |
1903 if Parse_Pos + Print'Length - 1 <= Parse_End then | |
1904 if | |
1905 E (Parse_Pos .. Parse_Pos + Print'Length - 1) = Print | |
1906 then | |
1907 Class := | |
1908 (if Invert then ANYOF_NPRINT else ANYOF_PRINT); | |
1909 Parse_Pos := Parse_Pos + Print'Length; | |
1910 | |
1911 elsif | |
1912 E (Parse_Pos .. Parse_Pos + Punct'Length - 1) = Punct | |
1913 then | |
1914 Class := | |
1915 (if Invert then ANYOF_NPUNCT else ANYOF_PUNCT); | |
1916 Parse_Pos := Parse_Pos + Punct'Length; | |
1917 | |
1918 else | |
1919 Fail ("Invalid character class: " & E); | |
1920 end if; | |
1921 | |
1922 else | |
1923 Fail ("Invalid character class: " & E); | |
1924 end if; | |
1925 | |
1926 when 's' => | |
1927 if Parse_Pos + Space'Length - 1 <= Parse_End | |
1928 and then | |
1929 E (Parse_Pos .. Parse_Pos + Space'Length - 1) = Space | |
1930 then | |
1931 Class := (if Invert then ANYOF_NSPACE else ANYOF_SPACE); | |
1932 Parse_Pos := Parse_Pos + Space'Length; | |
1933 else | |
1934 Fail ("Invalid character class: " & E); | |
1935 end if; | |
1936 | |
1937 when 'u' => | |
1938 if Parse_Pos + Upper'Length - 1 <= Parse_End | |
1939 and then | |
1940 E (Parse_Pos .. Parse_Pos + Upper'Length - 1) = Upper | |
1941 then | |
1942 Class := (if Invert then ANYOF_NUPPER else ANYOF_UPPER); | |
1943 Parse_Pos := Parse_Pos + Upper'Length; | |
1944 else | |
1945 Fail ("Invalid character class: " & E); | |
1946 end if; | |
1947 | |
1948 when 'w' => | |
1949 if Parse_Pos + Word'Length - 1 <= Parse_End | |
1950 and then | |
1951 E (Parse_Pos .. Parse_Pos + Word'Length - 1) = Word | |
1952 then | |
1953 Class := (if Invert then ANYOF_NALNUM else ANYOF_ALNUM); | |
1954 Parse_Pos := Parse_Pos + Word'Length; | |
1955 else | |
1956 Fail ("Invalid character class: " & E); | |
1957 end if; | |
1958 | |
1959 when 'x' => | |
1960 if Parse_Pos + Xdigit'Length - 1 <= Parse_End | |
1961 and then | |
1962 E (Parse_Pos .. Parse_Pos + Xdigit'Length - 1) = Xdigit | |
1963 then | |
1964 Class := (if Invert then ANYOF_NXDIGIT else ANYOF_XDIGIT); | |
1965 Parse_Pos := Parse_Pos + Xdigit'Length; | |
1966 | |
1967 else | |
1968 Fail ("Invalid character class: " & E); | |
1969 end if; | |
1970 | |
1971 when others => | |
1972 Fail ("Invalid character class: " & E); | |
1973 end case; | |
1974 | |
1975 -- Character class not specified | |
1976 | |
1977 else | |
1978 return ANYOF_NONE; | |
1979 end if; | |
1980 | |
1981 return Class; | |
1982 end Parse_Posix_Character_Class; | |
1983 | |
1984 -- Local Declarations | |
1985 | |
1986 Result : Pointer; | |
1987 | |
1988 Expr_Flags : Expression_Flags; | |
1989 pragma Unreferenced (Expr_Flags); | |
1990 | |
1991 -- Start of processing for Compile | |
1992 | |
1993 begin | |
1994 Parse (False, False, Expr_Flags, Result); | |
1995 | |
1996 if Result = 0 then | |
1997 Fail ("Couldn't compile expression"); | |
1998 end if; | |
1999 | |
2000 Final_Code_Size := Emit_Ptr - 1; | |
2001 | |
2002 -- Do we want to actually compile the expression, or simply get the | |
2003 -- code size ??? | |
2004 | |
2005 if Emit_Ptr <= PM.Size then | |
2006 Optimize (PM); | |
2007 end if; | |
2008 | |
2009 PM.Flags := Flags; | |
2010 end Compile; | |
2011 | |
2012 function Compile | |
2013 (Expression : String; | |
2014 Flags : Regexp_Flags := No_Flags) return Pattern_Matcher | |
2015 is | |
2016 -- Assume the compiled regexp will fit in 1000 chars. If it does not we | |
2017 -- will have to compile a second time once the correct size is known. If | |
2018 -- it fits, we save a significant amount of time by avoiding the second | |
2019 -- compilation. | |
2020 | |
2021 Dummy : Pattern_Matcher (1000); | |
2022 Size : Program_Size; | |
2023 | |
2024 begin | |
2025 Compile (Dummy, Expression, Size, Flags); | |
2026 | |
2027 if Size <= Dummy.Size then | |
2028 return Pattern_Matcher' | |
2029 (Size => Size, | |
2030 First => Dummy.First, | |
2031 Anchored => Dummy.Anchored, | |
2032 Must_Have => Dummy.Must_Have, | |
2033 Must_Have_Length => Dummy.Must_Have_Length, | |
2034 Paren_Count => Dummy.Paren_Count, | |
2035 Flags => Dummy.Flags, | |
2036 Program => | |
2037 Dummy.Program | |
2038 (Dummy.Program'First .. Dummy.Program'First + Size - 1)); | |
2039 else | |
2040 -- We have to recompile now that we know the size | |
2041 -- ??? Can we use Ada 2005's return construct ? | |
2042 | |
2043 declare | |
2044 Result : Pattern_Matcher (Size); | |
2045 begin | |
2046 Compile (Result, Expression, Size, Flags); | |
2047 return Result; | |
2048 end; | |
2049 end if; | |
2050 end Compile; | |
2051 | |
2052 procedure Compile | |
2053 (Matcher : out Pattern_Matcher; | |
2054 Expression : String; | |
2055 Flags : Regexp_Flags := No_Flags) | |
2056 is | |
2057 Size : Program_Size; | |
2058 | |
2059 begin | |
2060 Compile (Matcher, Expression, Size, Flags); | |
2061 | |
2062 if Size > Matcher.Size then | |
2063 raise Expression_Error with "Pattern_Matcher is too small"; | |
2064 end if; | |
2065 end Compile; | |
2066 | |
2067 -------------------- | |
2068 -- Dump_Operation -- | |
2069 -------------------- | |
2070 | |
2071 procedure Dump_Operation | |
2072 (Program : Program_Data; | |
2073 Index : Pointer; | |
2074 Indent : Natural) | |
2075 is | |
2076 Current : Pointer := Index; | |
2077 begin | |
2078 Dump_Until (Program, Current, Current + 1, Indent); | |
2079 end Dump_Operation; | |
2080 | |
2081 ---------------- | |
2082 -- Dump_Until -- | |
2083 ---------------- | |
2084 | |
2085 procedure Dump_Until | |
2086 (Program : Program_Data; | |
2087 Index : in out Pointer; | |
2088 Till : Pointer; | |
2089 Indent : Natural; | |
2090 Do_Print : Boolean := True) | |
2091 is | |
2092 function Image (S : String) return String; | |
2093 -- Remove leading space | |
2094 | |
2095 ----------- | |
2096 -- Image -- | |
2097 ----------- | |
2098 | |
2099 function Image (S : String) return String is | |
2100 begin | |
2101 if S (S'First) = ' ' then | |
2102 return S (S'First + 1 .. S'Last); | |
2103 else | |
2104 return S; | |
2105 end if; | |
2106 end Image; | |
2107 | |
2108 -- Local variables | |
2109 | |
2110 Op : Opcode; | |
2111 Next : Pointer; | |
2112 Length : Pointer; | |
2113 Local_Indent : Natural := Indent; | |
2114 | |
2115 -- Start of processing for Dump_Until | |
2116 | |
2117 begin | |
2118 while Index < Till loop | |
2119 Op := Opcode'Val (Character'Pos ((Program (Index)))); | |
2120 Next := Get_Next (Program, Index); | |
2121 | |
2122 if Do_Print then | |
2123 declare | |
2124 Point : constant String := Pointer'Image (Index); | |
2125 begin | |
2126 Put ((1 .. 4 - Point'Length => ' ') | |
2127 & Point & ":" | |
2128 & (1 .. Local_Indent * 2 => ' ') & Opcode'Image (Op)); | |
2129 end; | |
2130 | |
2131 -- Print the parenthesis number | |
2132 | |
2133 if Op = OPEN or else Op = CLOSE or else Op = REFF then | |
2134 Put (Image (Natural'Image | |
2135 (Character'Pos | |
2136 (Program (Index + Next_Pointer_Bytes))))); | |
2137 end if; | |
2138 | |
2139 if Next = Index then | |
2140 Put (" (-)"); | |
2141 else | |
2142 Put (" (" & Image (Pointer'Image (Next)) & ")"); | |
2143 end if; | |
2144 end if; | |
2145 | |
2146 case Op is | |
2147 when ANYOF => | |
2148 declare | |
2149 Bitmap : Character_Class; | |
2150 Last : Character := ASCII.NUL; | |
2151 Current : Natural := 0; | |
2152 Current_Char : Character; | |
2153 | |
2154 begin | |
2155 Bitmap_Operand (Program, Index, Bitmap); | |
2156 | |
2157 if Do_Print then | |
2158 Put ("["); | |
2159 | |
2160 while Current <= 255 loop | |
2161 Current_Char := Character'Val (Current); | |
2162 | |
2163 -- First item in a range | |
2164 | |
2165 if Get_From_Class (Bitmap, Current_Char) then | |
2166 Last := Current_Char; | |
2167 | |
2168 -- Search for the last item in the range | |
2169 | |
2170 loop | |
2171 Current := Current + 1; | |
2172 exit when Current > 255; | |
2173 Current_Char := Character'Val (Current); | |
2174 exit when | |
2175 not Get_From_Class (Bitmap, Current_Char); | |
2176 end loop; | |
2177 | |
2178 if not Is_Graphic (Last) then | |
2179 Put (Last'Img); | |
2180 else | |
2181 Put (Last); | |
2182 end if; | |
2183 | |
2184 if Character'Succ (Last) /= Current_Char then | |
2185 Put ("\-" & Character'Pred (Current_Char)); | |
2186 end if; | |
2187 | |
2188 else | |
2189 Current := Current + 1; | |
2190 end if; | |
2191 end loop; | |
2192 | |
2193 Put_Line ("]"); | |
2194 end if; | |
2195 | |
2196 Index := Index + Next_Pointer_Bytes + Bitmap'Length; | |
2197 end; | |
2198 | |
2199 when EXACT | EXACTF => | |
2200 Length := String_Length (Program, Index); | |
2201 if Do_Print then | |
2202 Put (" (" & Image (Program_Size'Image (Length + 1)) | |
2203 & " chars) <" | |
2204 & String (Program (String_Operand (Index) | |
2205 .. String_Operand (Index) | |
2206 + Length))); | |
2207 Put_Line (">"); | |
2208 end if; | |
2209 | |
2210 Index := String_Operand (Index) + Length + 1; | |
2211 | |
2212 -- Node operand | |
2213 | |
2214 when BRANCH | STAR | PLUS => | |
2215 if Do_Print then | |
2216 New_Line; | |
2217 end if; | |
2218 | |
2219 Index := Index + Next_Pointer_Bytes; | |
2220 Dump_Until (Program, Index, Pointer'Min (Next, Till), | |
2221 Local_Indent + 1, Do_Print); | |
2222 | |
2223 when CURLY | CURLYX => | |
2224 if Do_Print then | |
2225 Put_Line | |
2226 (" {" | |
2227 & Image (Natural'Image | |
2228 (Read_Natural (Program, Index + Next_Pointer_Bytes))) | |
2229 & "," | |
2230 & Image (Natural'Image (Read_Natural (Program, Index + 5))) | |
2231 & "}"); | |
2232 end if; | |
2233 | |
2234 Index := Index + 7; | |
2235 Dump_Until (Program, Index, Pointer'Min (Next, Till), | |
2236 Local_Indent + 1, Do_Print); | |
2237 | |
2238 when OPEN => | |
2239 if Do_Print then | |
2240 New_Line; | |
2241 end if; | |
2242 | |
2243 Index := Index + 4; | |
2244 Local_Indent := Local_Indent + 1; | |
2245 | |
2246 when CLOSE | REFF => | |
2247 if Do_Print then | |
2248 New_Line; | |
2249 end if; | |
2250 | |
2251 Index := Index + 4; | |
2252 | |
2253 if Op = CLOSE then | |
2254 Local_Indent := Local_Indent - 1; | |
2255 end if; | |
2256 | |
2257 when others => | |
2258 Index := Index + Next_Pointer_Bytes; | |
2259 | |
2260 if Do_Print then | |
2261 New_Line; | |
2262 end if; | |
2263 | |
2264 exit when Op = EOP; | |
2265 end case; | |
2266 end loop; | |
2267 end Dump_Until; | |
2268 | |
2269 ---------- | |
2270 -- Dump -- | |
2271 ---------- | |
2272 | |
2273 procedure Dump (Self : Pattern_Matcher) is | |
2274 Program : Program_Data renames Self.Program; | |
2275 Index : Pointer := Program'First; | |
2276 | |
2277 -- Start of processing for Dump | |
2278 | |
2279 begin | |
2280 Put_Line ("Must start with (Self.First) = " | |
2281 & Character'Image (Self.First)); | |
2282 | |
2283 if (Self.Flags and Case_Insensitive) /= 0 then | |
2284 Put_Line (" Case_Insensitive mode"); | |
2285 end if; | |
2286 | |
2287 if (Self.Flags and Single_Line) /= 0 then | |
2288 Put_Line (" Single_Line mode"); | |
2289 end if; | |
2290 | |
2291 if (Self.Flags and Multiple_Lines) /= 0 then | |
2292 Put_Line (" Multiple_Lines mode"); | |
2293 end if; | |
2294 | |
2295 Dump_Until (Program, Index, Self.Program'Last + 1, 0); | |
2296 end Dump; | |
2297 | |
2298 -------------------- | |
2299 -- Get_From_Class -- | |
2300 -------------------- | |
2301 | |
2302 function Get_From_Class | |
2303 (Bitmap : Character_Class; | |
2304 C : Character) return Boolean | |
2305 is | |
2306 Value : constant Class_Byte := Character'Pos (C); | |
2307 begin | |
2308 return | |
2309 (Bitmap (Value / 8) and Bit_Conversion (Value mod 8)) /= 0; | |
2310 end Get_From_Class; | |
2311 | |
2312 -------------- | |
2313 -- Get_Next -- | |
2314 -------------- | |
2315 | |
2316 function Get_Next (Program : Program_Data; IP : Pointer) return Pointer is | |
2317 begin | |
2318 return IP + Pointer (Read_Natural (Program, IP + 1)); | |
2319 end Get_Next; | |
2320 | |
2321 -------------- | |
2322 -- Is_Alnum -- | |
2323 -------------- | |
2324 | |
2325 function Is_Alnum (C : Character) return Boolean is | |
2326 begin | |
2327 return Is_Alphanumeric (C) or else C = '_'; | |
2328 end Is_Alnum; | |
2329 | |
2330 ------------------ | |
2331 -- Is_Printable -- | |
2332 ------------------ | |
2333 | |
2334 function Is_Printable (C : Character) return Boolean is | |
2335 begin | |
2336 -- Printable if space or graphic character or other whitespace | |
2337 -- Other white space includes (HT/LF/VT/FF/CR = codes 9-13) | |
2338 | |
2339 return C in Character'Val (32) .. Character'Val (126) | |
2340 or else C in ASCII.HT .. ASCII.CR; | |
2341 end Is_Printable; | |
2342 | |
2343 -------------------- | |
2344 -- Is_White_Space -- | |
2345 -------------------- | |
2346 | |
2347 function Is_White_Space (C : Character) return Boolean is | |
2348 begin | |
2349 -- Note: HT = 9, LF = 10, VT = 11, FF = 12, CR = 13 | |
2350 | |
2351 return C = ' ' or else C in ASCII.HT .. ASCII.CR; | |
2352 end Is_White_Space; | |
2353 | |
2354 ----------- | |
2355 -- Match -- | |
2356 ----------- | |
2357 | |
2358 procedure Match | |
2359 (Self : Pattern_Matcher; | |
2360 Data : String; | |
2361 Matches : out Match_Array; | |
2362 Data_First : Integer := -1; | |
2363 Data_Last : Positive := Positive'Last) | |
2364 is | |
2365 Program : Program_Data renames Self.Program; -- Shorter notation | |
2366 | |
2367 First_In_Data : constant Integer := Integer'Max (Data_First, Data'First); | |
2368 Last_In_Data : constant Integer := Integer'Min (Data_Last, Data'Last); | |
2369 | |
2370 -- Global work variables | |
2371 | |
2372 Input_Pos : Natural; -- String-input pointer | |
2373 BOL_Pos : Natural; -- Beginning of input, for ^ check | |
2374 Matched : Boolean := False; -- Until proven True | |
2375 | |
2376 Matches_Full : Match_Array (0 .. Natural'Max (Self.Paren_Count, | |
2377 Matches'Last)); | |
2378 -- Stores the value of all the parenthesis pairs. | |
2379 -- We do not use directly Matches, so that we can also use back | |
2380 -- references (REFF) even if Matches is too small. | |
2381 | |
2382 type Natural_Array is array (Match_Count range <>) of Natural; | |
2383 Matches_Tmp : Natural_Array (Matches_Full'Range); | |
2384 -- Save the opening position of parenthesis | |
2385 | |
2386 Last_Paren : Natural := 0; | |
2387 -- Last parenthesis seen | |
2388 | |
2389 Greedy : Boolean := True; | |
2390 -- True if the next operator should be greedy | |
2391 | |
2392 type Current_Curly_Record; | |
2393 type Current_Curly_Access is access all Current_Curly_Record; | |
2394 type Current_Curly_Record is record | |
2395 Paren_Floor : Natural; -- How far back to strip parenthesis data | |
2396 Cur : Integer; -- How many instances of scan we've matched | |
2397 Min : Natural; -- Minimal number of scans to match | |
2398 Max : Natural; -- Maximal number of scans to match | |
2399 Greedy : Boolean; -- Whether to work our way up or down | |
2400 Scan : Pointer; -- The thing to match | |
2401 Next : Pointer; -- What has to match after it | |
2402 Lastloc : Natural; -- Where we started matching this scan | |
2403 Old_Cc : Current_Curly_Access; -- Before we started this one | |
2404 end record; | |
2405 -- Data used to handle the curly operator and the plus and star | |
2406 -- operators for complex expressions. | |
2407 | |
2408 Current_Curly : Current_Curly_Access := null; | |
2409 -- The curly currently being processed | |
2410 | |
2411 ----------------------- | |
2412 -- Local Subprograms -- | |
2413 ----------------------- | |
2414 | |
2415 function Index (Start : Positive; C : Character) return Natural; | |
2416 -- Find character C in Data starting at Start and return position | |
2417 | |
2418 function Repeat | |
2419 (IP : Pointer; | |
2420 Max : Natural := Natural'Last) return Natural; | |
2421 -- Repeatedly match something simple, report how many | |
2422 -- It only matches on things of length 1. | |
2423 -- Starting from Input_Pos, it matches at most Max CURLY. | |
2424 | |
2425 function Try (Pos : Positive) return Boolean; | |
2426 -- Try to match at specific point | |
2427 | |
2428 function Match (IP : Pointer) return Boolean; | |
2429 -- This is the main matching routine. Conceptually the strategy | |
2430 -- is simple: check to see whether the current node matches, | |
2431 -- call self recursively to see whether the rest matches, | |
2432 -- and then act accordingly. | |
2433 -- | |
2434 -- In practice Match makes some effort to avoid recursion, in | |
2435 -- particular by going through "ordinary" nodes (that don't | |
2436 -- need to know whether the rest of the match failed) by | |
2437 -- using a loop instead of recursion. | |
2438 -- Why is the above comment part of the spec rather than body ??? | |
2439 | |
2440 function Match_Whilem return Boolean; | |
2441 -- Return True if a WHILEM matches the Current_Curly | |
2442 | |
2443 function Recurse_Match (IP : Pointer; From : Natural) return Boolean; | |
2444 pragma Inline (Recurse_Match); | |
2445 -- Calls Match recursively. It saves and restores the parenthesis | |
2446 -- status and location in the input stream correctly, so that | |
2447 -- backtracking is possible | |
2448 | |
2449 function Match_Simple_Operator | |
2450 (Op : Opcode; | |
2451 Scan : Pointer; | |
2452 Next : Pointer; | |
2453 Greedy : Boolean) return Boolean; | |
2454 -- Return True it the simple operator (possibly non-greedy) matches | |
2455 | |
2456 Dump_Indent : Integer := -1; | |
2457 procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True); | |
2458 procedure Dump_Error (Msg : String); | |
2459 -- Debug: print the current context | |
2460 | |
2461 pragma Inline (Index); | |
2462 pragma Inline (Repeat); | |
2463 | |
2464 -- These are two complex functions, but used only once | |
2465 | |
2466 pragma Inline (Match_Whilem); | |
2467 pragma Inline (Match_Simple_Operator); | |
2468 | |
2469 ----------- | |
2470 -- Index -- | |
2471 ----------- | |
2472 | |
2473 function Index (Start : Positive; C : Character) return Natural is | |
2474 begin | |
2475 for J in Start .. Last_In_Data loop | |
2476 if Data (J) = C then | |
2477 return J; | |
2478 end if; | |
2479 end loop; | |
2480 | |
2481 return 0; | |
2482 end Index; | |
2483 | |
2484 ------------------- | |
2485 -- Recurse_Match -- | |
2486 ------------------- | |
2487 | |
2488 function Recurse_Match (IP : Pointer; From : Natural) return Boolean is | |
2489 L : constant Natural := Last_Paren; | |
2490 Tmp_F : constant Match_Array := | |
2491 Matches_Full (From + 1 .. Matches_Full'Last); | |
2492 Start : constant Natural_Array := | |
2493 Matches_Tmp (From + 1 .. Matches_Tmp'Last); | |
2494 Input : constant Natural := Input_Pos; | |
2495 | |
2496 Dump_Indent_Save : constant Integer := Dump_Indent; | |
2497 | |
2498 begin | |
2499 if Match (IP) then | |
2500 return True; | |
2501 end if; | |
2502 | |
2503 Last_Paren := L; | |
2504 Matches_Full (Tmp_F'Range) := Tmp_F; | |
2505 Matches_Tmp (Start'Range) := Start; | |
2506 Input_Pos := Input; | |
2507 Dump_Indent := Dump_Indent_Save; | |
2508 return False; | |
2509 end Recurse_Match; | |
2510 | |
2511 ------------------ | |
2512 -- Dump_Current -- | |
2513 ------------------ | |
2514 | |
2515 procedure Dump_Current (Scan : Pointer; Prefix : Boolean := True) is | |
2516 Length : constant := 10; | |
2517 Pos : constant String := Integer'Image (Input_Pos); | |
2518 | |
2519 begin | |
2520 if Prefix then | |
2521 Put ((1 .. 5 - Pos'Length => ' ')); | |
2522 Put (Pos & " <" | |
2523 & Data (Input_Pos | |
2524 .. Integer'Min (Last_In_Data, Input_Pos + Length - 1))); | |
2525 Put ((1 .. Length - 1 - Last_In_Data + Input_Pos => ' ')); | |
2526 Put ("> |"); | |
2527 | |
2528 else | |
2529 Put (" "); | |
2530 end if; | |
2531 | |
2532 Dump_Operation (Program, Scan, Indent => Dump_Indent); | |
2533 end Dump_Current; | |
2534 | |
2535 ---------------- | |
2536 -- Dump_Error -- | |
2537 ---------------- | |
2538 | |
2539 procedure Dump_Error (Msg : String) is | |
2540 begin | |
2541 Put (" | "); | |
2542 Put ((1 .. Dump_Indent * 2 => ' ')); | |
2543 Put_Line (Msg); | |
2544 end Dump_Error; | |
2545 | |
2546 ----------- | |
2547 -- Match -- | |
2548 ----------- | |
2549 | |
2550 function Match (IP : Pointer) return Boolean is | |
2551 Scan : Pointer := IP; | |
2552 Next : Pointer; | |
2553 Op : Opcode; | |
2554 Result : Boolean; | |
2555 | |
2556 begin | |
2557 Dump_Indent := Dump_Indent + 1; | |
2558 | |
2559 State_Machine : | |
2560 loop | |
2561 pragma Assert (Scan /= 0); | |
2562 | |
2563 -- Determine current opcode and count its usage in debug mode | |
2564 | |
2565 Op := Opcode'Val (Character'Pos (Program (Scan))); | |
2566 | |
2567 -- Calculate offset of next instruction. Second character is most | |
2568 -- significant in Program_Data. | |
2569 | |
2570 Next := Get_Next (Program, Scan); | |
2571 | |
2572 if Debug then | |
2573 Dump_Current (Scan); | |
2574 end if; | |
2575 | |
2576 case Op is | |
2577 when EOP => | |
2578 Dump_Indent := Dump_Indent - 1; | |
2579 return True; -- Success | |
2580 | |
2581 when BRANCH => | |
2582 if Program (Next) /= BRANCH then | |
2583 Next := Operand (Scan); -- No choice, avoid recursion | |
2584 | |
2585 else | |
2586 loop | |
2587 if Recurse_Match (Operand (Scan), 0) then | |
2588 Dump_Indent := Dump_Indent - 1; | |
2589 return True; | |
2590 end if; | |
2591 | |
2592 Scan := Get_Next (Program, Scan); | |
2593 exit when Scan = 0 or else Program (Scan) /= BRANCH; | |
2594 end loop; | |
2595 | |
2596 exit State_Machine; | |
2597 end if; | |
2598 | |
2599 when NOTHING => | |
2600 null; | |
2601 | |
2602 when BOL => | |
2603 exit State_Machine when Input_Pos /= BOL_Pos | |
2604 and then ((Self.Flags and Multiple_Lines) = 0 | |
2605 or else Data (Input_Pos - 1) /= ASCII.LF); | |
2606 | |
2607 when MBOL => | |
2608 exit State_Machine when Input_Pos /= BOL_Pos | |
2609 and then Data (Input_Pos - 1) /= ASCII.LF; | |
2610 | |
2611 when SBOL => | |
2612 exit State_Machine when Input_Pos /= BOL_Pos; | |
2613 | |
2614 when EOL => | |
2615 | |
2616 -- A combination of MEOL and SEOL | |
2617 | |
2618 if (Self.Flags and Multiple_Lines) = 0 then | |
2619 | |
2620 -- Single line mode | |
2621 | |
2622 exit State_Machine when Input_Pos <= Data'Last; | |
2623 | |
2624 elsif Input_Pos <= Last_In_Data then | |
2625 exit State_Machine when Data (Input_Pos) /= ASCII.LF; | |
2626 else | |
2627 exit State_Machine when Last_In_Data /= Data'Last; | |
2628 end if; | |
2629 | |
2630 when MEOL => | |
2631 if Input_Pos <= Last_In_Data then | |
2632 exit State_Machine when Data (Input_Pos) /= ASCII.LF; | |
2633 else | |
2634 exit State_Machine when Last_In_Data /= Data'Last; | |
2635 end if; | |
2636 | |
2637 when SEOL => | |
2638 | |
2639 -- If there is a character before Data'Last (even if | |
2640 -- Last_In_Data stops before then), we can't have the | |
2641 -- end of the line. | |
2642 | |
2643 exit State_Machine when Input_Pos <= Data'Last; | |
2644 | |
2645 when BOUND | NBOUND => | |
2646 | |
2647 -- Was last char in word ? | |
2648 | |
2649 declare | |
2650 N : Boolean := False; | |
2651 Ln : Boolean := False; | |
2652 | |
2653 begin | |
2654 if Input_Pos /= First_In_Data then | |
2655 N := Is_Alnum (Data (Input_Pos - 1)); | |
2656 end if; | |
2657 | |
2658 Ln := | |
2659 (if Input_Pos > Last_In_Data | |
2660 then False | |
2661 else Is_Alnum (Data (Input_Pos))); | |
2662 | |
2663 if Op = BOUND then | |
2664 if N = Ln then | |
2665 exit State_Machine; | |
2666 end if; | |
2667 else | |
2668 if N /= Ln then | |
2669 exit State_Machine; | |
2670 end if; | |
2671 end if; | |
2672 end; | |
2673 | |
2674 when SPACE => | |
2675 exit State_Machine when Input_Pos > Last_In_Data | |
2676 or else not Is_White_Space (Data (Input_Pos)); | |
2677 Input_Pos := Input_Pos + 1; | |
2678 | |
2679 when NSPACE => | |
2680 exit State_Machine when Input_Pos > Last_In_Data | |
2681 or else Is_White_Space (Data (Input_Pos)); | |
2682 Input_Pos := Input_Pos + 1; | |
2683 | |
2684 when DIGIT => | |
2685 exit State_Machine when Input_Pos > Last_In_Data | |
2686 or else not Is_Digit (Data (Input_Pos)); | |
2687 Input_Pos := Input_Pos + 1; | |
2688 | |
2689 when NDIGIT => | |
2690 exit State_Machine when Input_Pos > Last_In_Data | |
2691 or else Is_Digit (Data (Input_Pos)); | |
2692 Input_Pos := Input_Pos + 1; | |
2693 | |
2694 when ALNUM => | |
2695 exit State_Machine when Input_Pos > Last_In_Data | |
2696 or else not Is_Alnum (Data (Input_Pos)); | |
2697 Input_Pos := Input_Pos + 1; | |
2698 | |
2699 when NALNUM => | |
2700 exit State_Machine when Input_Pos > Last_In_Data | |
2701 or else Is_Alnum (Data (Input_Pos)); | |
2702 Input_Pos := Input_Pos + 1; | |
2703 | |
2704 when ANY => | |
2705 exit State_Machine when Input_Pos > Last_In_Data | |
2706 or else Data (Input_Pos) = ASCII.LF; | |
2707 Input_Pos := Input_Pos + 1; | |
2708 | |
2709 when SANY => | |
2710 exit State_Machine when Input_Pos > Last_In_Data; | |
2711 Input_Pos := Input_Pos + 1; | |
2712 | |
2713 when EXACT => | |
2714 declare | |
2715 Opnd : Pointer := String_Operand (Scan); | |
2716 Current : Positive := Input_Pos; | |
2717 Last : constant Pointer := | |
2718 Opnd + String_Length (Program, Scan); | |
2719 | |
2720 begin | |
2721 while Opnd <= Last loop | |
2722 exit State_Machine when Current > Last_In_Data | |
2723 or else Program (Opnd) /= Data (Current); | |
2724 Current := Current + 1; | |
2725 Opnd := Opnd + 1; | |
2726 end loop; | |
2727 | |
2728 Input_Pos := Current; | |
2729 end; | |
2730 | |
2731 when EXACTF => | |
2732 declare | |
2733 Opnd : Pointer := String_Operand (Scan); | |
2734 Current : Positive := Input_Pos; | |
2735 | |
2736 Last : constant Pointer := | |
2737 Opnd + String_Length (Program, Scan); | |
2738 | |
2739 begin | |
2740 while Opnd <= Last loop | |
2741 exit State_Machine when Current > Last_In_Data | |
2742 or else Program (Opnd) /= To_Lower (Data (Current)); | |
2743 Current := Current + 1; | |
2744 Opnd := Opnd + 1; | |
2745 end loop; | |
2746 | |
2747 Input_Pos := Current; | |
2748 end; | |
2749 | |
2750 when ANYOF => | |
2751 declare | |
2752 Bitmap : Character_Class; | |
2753 begin | |
2754 Bitmap_Operand (Program, Scan, Bitmap); | |
2755 exit State_Machine when Input_Pos > Last_In_Data | |
2756 or else not Get_From_Class (Bitmap, Data (Input_Pos)); | |
2757 Input_Pos := Input_Pos + 1; | |
2758 end; | |
2759 | |
2760 when OPEN => | |
2761 declare | |
2762 No : constant Natural := | |
2763 Character'Pos (Program (Operand (Scan))); | |
2764 begin | |
2765 Matches_Tmp (No) := Input_Pos; | |
2766 end; | |
2767 | |
2768 when CLOSE => | |
2769 declare | |
2770 No : constant Natural := | |
2771 Character'Pos (Program (Operand (Scan))); | |
2772 | |
2773 begin | |
2774 Matches_Full (No) := (Matches_Tmp (No), Input_Pos - 1); | |
2775 | |
2776 if Last_Paren < No then | |
2777 Last_Paren := No; | |
2778 end if; | |
2779 end; | |
2780 | |
2781 when REFF => | |
2782 declare | |
2783 No : constant Natural := | |
2784 Character'Pos (Program (Operand (Scan))); | |
2785 | |
2786 Data_Pos : Natural; | |
2787 | |
2788 begin | |
2789 -- If we haven't seen that parenthesis yet | |
2790 | |
2791 if Last_Paren < No then | |
2792 Dump_Indent := Dump_Indent - 1; | |
2793 | |
2794 if Debug then | |
2795 Dump_Error ("REFF: No match, backtracking"); | |
2796 end if; | |
2797 | |
2798 return False; | |
2799 end if; | |
2800 | |
2801 Data_Pos := Matches_Full (No).First; | |
2802 | |
2803 while Data_Pos <= Matches_Full (No).Last loop | |
2804 if Input_Pos > Last_In_Data | |
2805 or else Data (Input_Pos) /= Data (Data_Pos) | |
2806 then | |
2807 Dump_Indent := Dump_Indent - 1; | |
2808 | |
2809 if Debug then | |
2810 Dump_Error ("REFF: No match, backtracking"); | |
2811 end if; | |
2812 | |
2813 return False; | |
2814 end if; | |
2815 | |
2816 Input_Pos := Input_Pos + 1; | |
2817 Data_Pos := Data_Pos + 1; | |
2818 end loop; | |
2819 end; | |
2820 | |
2821 when MINMOD => | |
2822 Greedy := False; | |
2823 | |
2824 when STAR | PLUS | CURLY => | |
2825 declare | |
2826 Greed : constant Boolean := Greedy; | |
2827 begin | |
2828 Greedy := True; | |
2829 Result := Match_Simple_Operator (Op, Scan, Next, Greed); | |
2830 Dump_Indent := Dump_Indent - 1; | |
2831 return Result; | |
2832 end; | |
2833 | |
2834 when CURLYX => | |
2835 | |
2836 -- Looking at something like: | |
2837 | |
2838 -- 1: CURLYX {n,m} (->4) | |
2839 -- 2: code for complex thing (->3) | |
2840 -- 3: WHILEM (->0) | |
2841 -- 4: NOTHING | |
2842 | |
2843 declare | |
2844 Min : constant Natural := | |
2845 Read_Natural (Program, Scan + Next_Pointer_Bytes); | |
2846 Max : constant Natural := | |
2847 Read_Natural | |
2848 (Program, Scan + Next_Pointer_Bytes + 2); | |
2849 Cc : aliased Current_Curly_Record; | |
2850 | |
2851 Has_Match : Boolean; | |
2852 | |
2853 begin | |
2854 Cc := (Paren_Floor => Last_Paren, | |
2855 Cur => -1, | |
2856 Min => Min, | |
2857 Max => Max, | |
2858 Greedy => Greedy, | |
2859 Scan => Scan + 7, | |
2860 Next => Next, | |
2861 Lastloc => 0, | |
2862 Old_Cc => Current_Curly); | |
2863 Greedy := True; | |
2864 Current_Curly := Cc'Unchecked_Access; | |
2865 | |
2866 Has_Match := Match (Next - Next_Pointer_Bytes); | |
2867 | |
2868 -- Start on the WHILEM | |
2869 | |
2870 Current_Curly := Cc.Old_Cc; | |
2871 Dump_Indent := Dump_Indent - 1; | |
2872 | |
2873 if not Has_Match then | |
2874 if Debug then | |
2875 Dump_Error ("CURLYX failed..."); | |
2876 end if; | |
2877 end if; | |
2878 | |
2879 return Has_Match; | |
2880 end; | |
2881 | |
2882 when WHILEM => | |
2883 Result := Match_Whilem; | |
2884 Dump_Indent := Dump_Indent - 1; | |
2885 | |
2886 if Debug and then not Result then | |
2887 Dump_Error ("WHILEM: no match, backtracking"); | |
2888 end if; | |
2889 | |
2890 return Result; | |
2891 end case; | |
2892 | |
2893 Scan := Next; | |
2894 end loop State_Machine; | |
2895 | |
2896 if Debug then | |
2897 Dump_Error ("failed..."); | |
2898 Dump_Indent := Dump_Indent - 1; | |
2899 end if; | |
2900 | |
2901 -- If we get here, there is no match. For successful matches when EOP | |
2902 -- is the terminating point. | |
2903 | |
2904 return False; | |
2905 end Match; | |
2906 | |
2907 --------------------------- | |
2908 -- Match_Simple_Operator -- | |
2909 --------------------------- | |
2910 | |
2911 function Match_Simple_Operator | |
2912 (Op : Opcode; | |
2913 Scan : Pointer; | |
2914 Next : Pointer; | |
2915 Greedy : Boolean) return Boolean | |
2916 is | |
2917 Next_Char : Character := ASCII.NUL; | |
2918 Next_Char_Known : Boolean := False; | |
2919 No : Integer; -- Can be negative | |
2920 Min : Natural; | |
2921 Max : Natural := Natural'Last; | |
2922 Operand_Code : Pointer; | |
2923 Old : Natural; | |
2924 Last_Pos : Natural; | |
2925 Save : constant Natural := Input_Pos; | |
2926 | |
2927 begin | |
2928 -- Lookahead to avoid useless match attempts when we know what | |
2929 -- character comes next. | |
2930 | |
2931 if Program (Next) = EXACT then | |
2932 Next_Char := Program (String_Operand (Next)); | |
2933 Next_Char_Known := True; | |
2934 end if; | |
2935 | |
2936 -- Find the minimal and maximal values for the operator | |
2937 | |
2938 case Op is | |
2939 when STAR => | |
2940 Min := 0; | |
2941 Operand_Code := Operand (Scan); | |
2942 | |
2943 when PLUS => | |
2944 Min := 1; | |
2945 Operand_Code := Operand (Scan); | |
2946 | |
2947 when others => | |
2948 Min := Read_Natural (Program, Scan + Next_Pointer_Bytes); | |
2949 Max := Read_Natural (Program, Scan + Next_Pointer_Bytes + 2); | |
2950 Operand_Code := Scan + 7; | |
2951 end case; | |
2952 | |
2953 if Debug then | |
2954 Dump_Current (Operand_Code, Prefix => False); | |
2955 end if; | |
2956 | |
2957 -- Non greedy operators | |
2958 | |
2959 if not Greedy then | |
2960 | |
2961 -- Test we can repeat at least Min times | |
2962 | |
2963 if Min /= 0 then | |
2964 No := Repeat (Operand_Code, Min); | |
2965 | |
2966 if No < Min then | |
2967 if Debug then | |
2968 Dump_Error ("failed... matched" & No'Img & " times"); | |
2969 end if; | |
2970 | |
2971 return False; | |
2972 end if; | |
2973 end if; | |
2974 | |
2975 Old := Input_Pos; | |
2976 | |
2977 -- Find the place where 'next' could work | |
2978 | |
2979 if Next_Char_Known then | |
2980 | |
2981 -- Last position to check | |
2982 | |
2983 if Max = Natural'Last then | |
2984 Last_Pos := Last_In_Data; | |
2985 else | |
2986 Last_Pos := Input_Pos + Max; | |
2987 | |
2988 if Last_Pos > Last_In_Data then | |
2989 Last_Pos := Last_In_Data; | |
2990 end if; | |
2991 end if; | |
2992 | |
2993 -- Look for the first possible opportunity | |
2994 | |
2995 if Debug then | |
2996 Dump_Error ("Next_Char must be " & Next_Char); | |
2997 end if; | |
2998 | |
2999 loop | |
3000 -- Find the next possible position | |
3001 | |
3002 while Input_Pos <= Last_Pos | |
3003 and then Data (Input_Pos) /= Next_Char | |
3004 loop | |
3005 Input_Pos := Input_Pos + 1; | |
3006 end loop; | |
3007 | |
3008 if Input_Pos > Last_Pos then | |
3009 return False; | |
3010 end if; | |
3011 | |
3012 -- Check that we still match if we stop at the position we | |
3013 -- just found. | |
3014 | |
3015 declare | |
3016 Num : constant Natural := Input_Pos - Old; | |
3017 | |
3018 begin | |
3019 Input_Pos := Old; | |
3020 | |
3021 if Debug then | |
3022 Dump_Error ("Would we still match at that position?"); | |
3023 end if; | |
3024 | |
3025 if Repeat (Operand_Code, Num) < Num then | |
3026 return False; | |
3027 end if; | |
3028 end; | |
3029 | |
3030 -- Input_Pos now points to the new position | |
3031 | |
3032 if Match (Get_Next (Program, Scan)) then | |
3033 return True; | |
3034 end if; | |
3035 | |
3036 Old := Input_Pos; | |
3037 Input_Pos := Input_Pos + 1; | |
3038 end loop; | |
3039 | |
3040 -- We do not know what the next character is | |
3041 | |
3042 else | |
3043 while Max >= Min loop | |
3044 if Debug then | |
3045 Dump_Error ("Non-greedy repeat, N=" & Min'Img); | |
3046 Dump_Error ("Do we still match Next if we stop here?"); | |
3047 end if; | |
3048 | |
3049 -- If the next character matches | |
3050 | |
3051 if Recurse_Match (Next, 1) then | |
3052 return True; | |
3053 end if; | |
3054 | |
3055 Input_Pos := Save + Min; | |
3056 | |
3057 -- Could not or did not match -- move forward | |
3058 | |
3059 if Repeat (Operand_Code, 1) /= 0 then | |
3060 Min := Min + 1; | |
3061 else | |
3062 if Debug then | |
3063 Dump_Error ("Non-greedy repeat failed..."); | |
3064 end if; | |
3065 | |
3066 return False; | |
3067 end if; | |
3068 end loop; | |
3069 end if; | |
3070 | |
3071 return False; | |
3072 | |
3073 -- Greedy operators | |
3074 | |
3075 else | |
3076 No := Repeat (Operand_Code, Max); | |
3077 | |
3078 if Debug and then No < Min then | |
3079 Dump_Error ("failed... matched" & No'Img & " times"); | |
3080 end if; | |
3081 | |
3082 -- ??? Perl has some special code here in case the next | |
3083 -- instruction is of type EOL, since $ and \Z can match before | |
3084 -- *and* after newline at the end. | |
3085 | |
3086 -- ??? Perl has some special code here in case (paren) is True | |
3087 | |
3088 -- Else, if we don't have any parenthesis | |
3089 | |
3090 while No >= Min loop | |
3091 if not Next_Char_Known | |
3092 or else (Input_Pos <= Last_In_Data | |
3093 and then Data (Input_Pos) = Next_Char) | |
3094 then | |
3095 if Match (Next) then | |
3096 return True; | |
3097 end if; | |
3098 end if; | |
3099 | |
3100 -- Could not or did not work, we back up | |
3101 | |
3102 No := No - 1; | |
3103 Input_Pos := Save + No; | |
3104 end loop; | |
3105 | |
3106 return False; | |
3107 end if; | |
3108 end Match_Simple_Operator; | |
3109 | |
3110 ------------------ | |
3111 -- Match_Whilem -- | |
3112 ------------------ | |
3113 | |
3114 -- This is really hard to understand, because after we match what we | |
3115 -- are trying to match, we must make sure the rest of the REx is going | |
3116 -- to match for sure, and to do that we have to go back UP the parse | |
3117 -- tree by recursing ever deeper. And if it fails, we have to reset | |
3118 -- our parent's current state that we can try again after backing off. | |
3119 | |
3120 function Match_Whilem return Boolean is | |
3121 Cc : constant Current_Curly_Access := Current_Curly; | |
3122 | |
3123 N : constant Natural := Cc.Cur + 1; | |
3124 Ln : Natural := 0; | |
3125 | |
3126 Lastloc : constant Natural := Cc.Lastloc; | |
3127 -- Detection of 0-len | |
3128 | |
3129 begin | |
3130 -- If degenerate scan matches "", assume scan done | |
3131 | |
3132 if Input_Pos = Cc.Lastloc | |
3133 and then N >= Cc.Min | |
3134 then | |
3135 -- Temporarily restore the old context, and check that we | |
3136 -- match was comes after CURLYX. | |
3137 | |
3138 Current_Curly := Cc.Old_Cc; | |
3139 | |
3140 if Current_Curly /= null then | |
3141 Ln := Current_Curly.Cur; | |
3142 end if; | |
3143 | |
3144 if Match (Cc.Next) then | |
3145 return True; | |
3146 end if; | |
3147 | |
3148 if Current_Curly /= null then | |
3149 Current_Curly.Cur := Ln; | |
3150 end if; | |
3151 | |
3152 Current_Curly := Cc; | |
3153 return False; | |
3154 end if; | |
3155 | |
3156 -- First, just match a string of min scans | |
3157 | |
3158 if N < Cc.Min then | |
3159 Cc.Cur := N; | |
3160 Cc.Lastloc := Input_Pos; | |
3161 | |
3162 if Debug then | |
3163 Dump_Error | |
3164 ("Tests that we match at least" & Cc.Min'Img & " N=" & N'Img); | |
3165 end if; | |
3166 | |
3167 if Match (Cc.Scan) then | |
3168 return True; | |
3169 end if; | |
3170 | |
3171 Cc.Cur := N - 1; | |
3172 Cc.Lastloc := Lastloc; | |
3173 | |
3174 if Debug then | |
3175 Dump_Error ("failed..."); | |
3176 end if; | |
3177 | |
3178 return False; | |
3179 end if; | |
3180 | |
3181 -- Prefer next over scan for minimal matching | |
3182 | |
3183 if not Cc.Greedy then | |
3184 Current_Curly := Cc.Old_Cc; | |
3185 | |
3186 if Current_Curly /= null then | |
3187 Ln := Current_Curly.Cur; | |
3188 end if; | |
3189 | |
3190 if Recurse_Match (Cc.Next, Cc.Paren_Floor) then | |
3191 return True; | |
3192 end if; | |
3193 | |
3194 if Current_Curly /= null then | |
3195 Current_Curly.Cur := Ln; | |
3196 end if; | |
3197 | |
3198 Current_Curly := Cc; | |
3199 | |
3200 -- Maximum greed exceeded ? | |
3201 | |
3202 if N >= Cc.Max then | |
3203 if Debug then | |
3204 Dump_Error ("failed..."); | |
3205 end if; | |
3206 return False; | |
3207 end if; | |
3208 | |
3209 -- Try scanning more and see if it helps | |
3210 Cc.Cur := N; | |
3211 Cc.Lastloc := Input_Pos; | |
3212 | |
3213 if Debug then | |
3214 Dump_Error ("Next failed, what about Current?"); | |
3215 end if; | |
3216 | |
3217 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then | |
3218 return True; | |
3219 end if; | |
3220 | |
3221 Cc.Cur := N - 1; | |
3222 Cc.Lastloc := Lastloc; | |
3223 return False; | |
3224 end if; | |
3225 | |
3226 -- Prefer scan over next for maximal matching | |
3227 | |
3228 if N < Cc.Max then -- more greed allowed ? | |
3229 Cc.Cur := N; | |
3230 Cc.Lastloc := Input_Pos; | |
3231 | |
3232 if Debug then | |
3233 Dump_Error ("Recurse at current position"); | |
3234 end if; | |
3235 | |
3236 if Recurse_Match (Cc.Scan, Cc.Paren_Floor) then | |
3237 return True; | |
3238 end if; | |
3239 end if; | |
3240 | |
3241 -- Failed deeper matches of scan, so see if this one works | |
3242 | |
3243 Current_Curly := Cc.Old_Cc; | |
3244 | |
3245 if Current_Curly /= null then | |
3246 Ln := Current_Curly.Cur; | |
3247 end if; | |
3248 | |
3249 if Debug then | |
3250 Dump_Error ("Failed matching for later positions"); | |
3251 end if; | |
3252 | |
3253 if Match (Cc.Next) then | |
3254 return True; | |
3255 end if; | |
3256 | |
3257 if Current_Curly /= null then | |
3258 Current_Curly.Cur := Ln; | |
3259 end if; | |
3260 | |
3261 Current_Curly := Cc; | |
3262 Cc.Cur := N - 1; | |
3263 Cc.Lastloc := Lastloc; | |
3264 | |
3265 if Debug then | |
3266 Dump_Error ("failed..."); | |
3267 end if; | |
3268 | |
3269 return False; | |
3270 end Match_Whilem; | |
3271 | |
3272 ------------ | |
3273 -- Repeat -- | |
3274 ------------ | |
3275 | |
3276 function Repeat | |
3277 (IP : Pointer; | |
3278 Max : Natural := Natural'Last) return Natural | |
3279 is | |
3280 Scan : Natural := Input_Pos; | |
3281 Last : Natural; | |
3282 Op : constant Opcode := Opcode'Val (Character'Pos (Program (IP))); | |
3283 Count : Natural; | |
3284 C : Character; | |
3285 Is_First : Boolean := True; | |
3286 Bitmap : Character_Class; | |
3287 | |
3288 begin | |
3289 if Max = Natural'Last or else Scan + Max - 1 > Last_In_Data then | |
3290 Last := Last_In_Data; | |
3291 else | |
3292 Last := Scan + Max - 1; | |
3293 end if; | |
3294 | |
3295 case Op is | |
3296 when ANY => | |
3297 while Scan <= Last | |
3298 and then Data (Scan) /= ASCII.LF | |
3299 loop | |
3300 Scan := Scan + 1; | |
3301 end loop; | |
3302 | |
3303 when SANY => | |
3304 Scan := Last + 1; | |
3305 | |
3306 when EXACT => | |
3307 | |
3308 -- The string has only one character if Repeat was called | |
3309 | |
3310 C := Program (String_Operand (IP)); | |
3311 while Scan <= Last | |
3312 and then C = Data (Scan) | |
3313 loop | |
3314 Scan := Scan + 1; | |
3315 end loop; | |
3316 | |
3317 when EXACTF => | |
3318 | |
3319 -- The string has only one character if Repeat was called | |
3320 | |
3321 C := Program (String_Operand (IP)); | |
3322 while Scan <= Last | |
3323 and then To_Lower (C) = Data (Scan) | |
3324 loop | |
3325 Scan := Scan + 1; | |
3326 end loop; | |
3327 | |
3328 when ANYOF => | |
3329 if Is_First then | |
3330 Bitmap_Operand (Program, IP, Bitmap); | |
3331 Is_First := False; | |
3332 end if; | |
3333 | |
3334 while Scan <= Last | |
3335 and then Get_From_Class (Bitmap, Data (Scan)) | |
3336 loop | |
3337 Scan := Scan + 1; | |
3338 end loop; | |
3339 | |
3340 when ALNUM => | |
3341 while Scan <= Last | |
3342 and then Is_Alnum (Data (Scan)) | |
3343 loop | |
3344 Scan := Scan + 1; | |
3345 end loop; | |
3346 | |
3347 when NALNUM => | |
3348 while Scan <= Last | |
3349 and then not Is_Alnum (Data (Scan)) | |
3350 loop | |
3351 Scan := Scan + 1; | |
3352 end loop; | |
3353 | |
3354 when SPACE => | |
3355 while Scan <= Last | |
3356 and then Is_White_Space (Data (Scan)) | |
3357 loop | |
3358 Scan := Scan + 1; | |
3359 end loop; | |
3360 | |
3361 when NSPACE => | |
3362 while Scan <= Last | |
3363 and then not Is_White_Space (Data (Scan)) | |
3364 loop | |
3365 Scan := Scan + 1; | |
3366 end loop; | |
3367 | |
3368 when DIGIT => | |
3369 while Scan <= Last | |
3370 and then Is_Digit (Data (Scan)) | |
3371 loop | |
3372 Scan := Scan + 1; | |
3373 end loop; | |
3374 | |
3375 when NDIGIT => | |
3376 while Scan <= Last | |
3377 and then not Is_Digit (Data (Scan)) | |
3378 loop | |
3379 Scan := Scan + 1; | |
3380 end loop; | |
3381 | |
3382 when others => | |
3383 raise Program_Error; | |
3384 end case; | |
3385 | |
3386 Count := Scan - Input_Pos; | |
3387 Input_Pos := Scan; | |
3388 return Count; | |
3389 end Repeat; | |
3390 | |
3391 --------- | |
3392 -- Try -- | |
3393 --------- | |
3394 | |
3395 function Try (Pos : Positive) return Boolean is | |
3396 begin | |
3397 Input_Pos := Pos; | |
3398 Last_Paren := 0; | |
3399 Matches_Full := (others => No_Match); | |
3400 | |
3401 if Match (Program_First) then | |
3402 Matches_Full (0) := (Pos, Input_Pos - 1); | |
3403 return True; | |
3404 end if; | |
3405 | |
3406 return False; | |
3407 end Try; | |
3408 | |
3409 -- Start of processing for Match | |
3410 | |
3411 begin | |
3412 -- Do we have the regexp Never_Match? | |
3413 | |
3414 if Self.Size = 0 then | |
3415 Matches := (others => No_Match); | |
3416 return; | |
3417 end if; | |
3418 | |
3419 -- If there is a "must appear" string, look for it | |
3420 | |
3421 if Self.Must_Have_Length > 0 then | |
3422 declare | |
3423 First : constant Character := Program (Self.Must_Have); | |
3424 Must_First : constant Pointer := Self.Must_Have; | |
3425 Must_Last : constant Pointer := | |
3426 Must_First + Pointer (Self.Must_Have_Length - 1); | |
3427 Next_Try : Natural := Index (First_In_Data, First); | |
3428 | |
3429 begin | |
3430 while Next_Try /= 0 | |
3431 and then Data (Next_Try .. Next_Try + Self.Must_Have_Length - 1) | |
3432 = String (Program (Must_First .. Must_Last)) | |
3433 loop | |
3434 Next_Try := Index (Next_Try + 1, First); | |
3435 end loop; | |
3436 | |
3437 if Next_Try = 0 then | |
3438 Matches := (others => No_Match); | |
3439 return; -- Not present | |
3440 end if; | |
3441 end; | |
3442 end if; | |
3443 | |
3444 -- Mark beginning of line for ^ | |
3445 | |
3446 BOL_Pos := Data'First; | |
3447 | |
3448 -- Simplest case first: an anchored match need be tried only once | |
3449 | |
3450 if Self.Anchored and then (Self.Flags and Multiple_Lines) = 0 then | |
3451 Matched := Try (First_In_Data); | |
3452 | |
3453 elsif Self.Anchored then | |
3454 declare | |
3455 Next_Try : Natural := First_In_Data; | |
3456 begin | |
3457 -- Test the first position in the buffer | |
3458 Matched := Try (Next_Try); | |
3459 | |
3460 -- Else only test after newlines | |
3461 | |
3462 if not Matched then | |
3463 while Next_Try <= Last_In_Data loop | |
3464 while Next_Try <= Last_In_Data | |
3465 and then Data (Next_Try) /= ASCII.LF | |
3466 loop | |
3467 Next_Try := Next_Try + 1; | |
3468 end loop; | |
3469 | |
3470 Next_Try := Next_Try + 1; | |
3471 | |
3472 if Next_Try <= Last_In_Data then | |
3473 Matched := Try (Next_Try); | |
3474 exit when Matched; | |
3475 end if; | |
3476 end loop; | |
3477 end if; | |
3478 end; | |
3479 | |
3480 elsif Self.First /= ASCII.NUL then | |
3481 -- We know what char it must start with | |
3482 | |
3483 declare | |
3484 Next_Try : Natural := Index (First_In_Data, Self.First); | |
3485 | |
3486 begin | |
3487 while Next_Try /= 0 loop | |
3488 Matched := Try (Next_Try); | |
3489 exit when Matched; | |
3490 Next_Try := Index (Next_Try + 1, Self.First); | |
3491 end loop; | |
3492 end; | |
3493 | |
3494 else | |
3495 -- Messy cases: try all locations (including for the empty string) | |
3496 | |
3497 Matched := Try (First_In_Data); | |
3498 | |
3499 if not Matched then | |
3500 for S in First_In_Data + 1 .. Last_In_Data loop | |
3501 Matched := Try (S); | |
3502 exit when Matched; | |
3503 end loop; | |
3504 end if; | |
3505 end if; | |
3506 | |
3507 -- Matched has its value | |
3508 | |
3509 for J in Last_Paren + 1 .. Matches'Last loop | |
3510 Matches_Full (J) := No_Match; | |
3511 end loop; | |
3512 | |
3513 Matches := Matches_Full (Matches'Range); | |
3514 end Match; | |
3515 | |
3516 ----------- | |
3517 -- Match -- | |
3518 ----------- | |
3519 | |
3520 function Match | |
3521 (Self : Pattern_Matcher; | |
3522 Data : String; | |
3523 Data_First : Integer := -1; | |
3524 Data_Last : Positive := Positive'Last) return Natural | |
3525 is | |
3526 Matches : Match_Array (0 .. 0); | |
3527 | |
3528 begin | |
3529 Match (Self, Data, Matches, Data_First, Data_Last); | |
3530 if Matches (0) = No_Match then | |
3531 return Data'First - 1; | |
3532 else | |
3533 return Matches (0).First; | |
3534 end if; | |
3535 end Match; | |
3536 | |
3537 function Match | |
3538 (Self : Pattern_Matcher; | |
3539 Data : String; | |
3540 Data_First : Integer := -1; | |
3541 Data_Last : Positive := Positive'Last) return Boolean | |
3542 is | |
3543 Matches : Match_Array (0 .. 0); | |
3544 | |
3545 begin | |
3546 Match (Self, Data, Matches, Data_First, Data_Last); | |
3547 return Matches (0).First >= Data'First; | |
3548 end Match; | |
3549 | |
3550 procedure Match | |
3551 (Expression : String; | |
3552 Data : String; | |
3553 Matches : out Match_Array; | |
3554 Size : Program_Size := Auto_Size; | |
3555 Data_First : Integer := -1; | |
3556 Data_Last : Positive := Positive'Last) | |
3557 is | |
3558 PM : Pattern_Matcher (Size); | |
3559 Finalize_Size : Program_Size; | |
3560 pragma Unreferenced (Finalize_Size); | |
3561 begin | |
3562 if Size = 0 then | |
3563 Match (Compile (Expression), Data, Matches, Data_First, Data_Last); | |
3564 else | |
3565 Compile (PM, Expression, Finalize_Size); | |
3566 Match (PM, Data, Matches, Data_First, Data_Last); | |
3567 end if; | |
3568 end Match; | |
3569 | |
3570 ----------- | |
3571 -- Match -- | |
3572 ----------- | |
3573 | |
3574 function Match | |
3575 (Expression : String; | |
3576 Data : String; | |
3577 Size : Program_Size := Auto_Size; | |
3578 Data_First : Integer := -1; | |
3579 Data_Last : Positive := Positive'Last) return Natural | |
3580 is | |
3581 PM : Pattern_Matcher (Size); | |
3582 Final_Size : Program_Size; | |
3583 pragma Unreferenced (Final_Size); | |
3584 begin | |
3585 if Size = 0 then | |
3586 return Match (Compile (Expression), Data, Data_First, Data_Last); | |
3587 else | |
3588 Compile (PM, Expression, Final_Size); | |
3589 return Match (PM, Data, Data_First, Data_Last); | |
3590 end if; | |
3591 end Match; | |
3592 | |
3593 ----------- | |
3594 -- Match -- | |
3595 ----------- | |
3596 | |
3597 function Match | |
3598 (Expression : String; | |
3599 Data : String; | |
3600 Size : Program_Size := Auto_Size; | |
3601 Data_First : Integer := -1; | |
3602 Data_Last : Positive := Positive'Last) return Boolean | |
3603 is | |
3604 Matches : Match_Array (0 .. 0); | |
3605 PM : Pattern_Matcher (Size); | |
3606 Final_Size : Program_Size; | |
3607 pragma Unreferenced (Final_Size); | |
3608 begin | |
3609 if Size = 0 then | |
3610 Match (Compile (Expression), Data, Matches, Data_First, Data_Last); | |
3611 else | |
3612 Compile (PM, Expression, Final_Size); | |
3613 Match (PM, Data, Matches, Data_First, Data_Last); | |
3614 end if; | |
3615 | |
3616 return Matches (0).First >= Data'First; | |
3617 end Match; | |
3618 | |
3619 ------------- | |
3620 -- Operand -- | |
3621 ------------- | |
3622 | |
3623 function Operand (P : Pointer) return Pointer is | |
3624 begin | |
3625 return P + Next_Pointer_Bytes; | |
3626 end Operand; | |
3627 | |
3628 -------------- | |
3629 -- Optimize -- | |
3630 -------------- | |
3631 | |
3632 procedure Optimize (Self : in out Pattern_Matcher) is | |
3633 Scan : Pointer; | |
3634 Program : Program_Data renames Self.Program; | |
3635 | |
3636 begin | |
3637 -- Start with safe defaults (no optimization): | |
3638 -- * No known first character of match | |
3639 -- * Does not necessarily start at beginning of line | |
3640 -- * No string known that has to appear in data | |
3641 | |
3642 Self.First := ASCII.NUL; | |
3643 Self.Anchored := False; | |
3644 Self.Must_Have := Program'Last + 1; | |
3645 Self.Must_Have_Length := 0; | |
3646 | |
3647 Scan := Program_First; -- First instruction (can be anything) | |
3648 | |
3649 if Program (Scan) = EXACT then | |
3650 Self.First := Program (String_Operand (Scan)); | |
3651 | |
3652 elsif Program (Scan) = BOL | |
3653 or else Program (Scan) = SBOL | |
3654 or else Program (Scan) = MBOL | |
3655 then | |
3656 Self.Anchored := True; | |
3657 end if; | |
3658 end Optimize; | |
3659 | |
3660 ----------------- | |
3661 -- Paren_Count -- | |
3662 ----------------- | |
3663 | |
3664 function Paren_Count (Regexp : Pattern_Matcher) return Match_Count is | |
3665 begin | |
3666 return Regexp.Paren_Count; | |
3667 end Paren_Count; | |
3668 | |
3669 ----------- | |
3670 -- Quote -- | |
3671 ----------- | |
3672 | |
3673 function Quote (Str : String) return String is | |
3674 S : String (1 .. Str'Length * 2); | |
3675 Last : Natural := 0; | |
3676 | |
3677 begin | |
3678 for J in Str'Range loop | |
3679 case Str (J) is | |
3680 when '^' | '$' | '|' | '*' | '+' | '?' | '{' | | |
3681 '}' | '[' | ']' | '(' | ')' | '\' | '.' => | |
3682 | |
3683 S (Last + 1) := '\'; | |
3684 S (Last + 2) := Str (J); | |
3685 Last := Last + 2; | |
3686 | |
3687 when others => | |
3688 S (Last + 1) := Str (J); | |
3689 Last := Last + 1; | |
3690 end case; | |
3691 end loop; | |
3692 | |
3693 return S (1 .. Last); | |
3694 end Quote; | |
3695 | |
3696 ------------------ | |
3697 -- Read_Natural -- | |
3698 ------------------ | |
3699 | |
3700 function Read_Natural | |
3701 (Program : Program_Data; | |
3702 IP : Pointer) return Natural | |
3703 is | |
3704 begin | |
3705 return Character'Pos (Program (IP)) + | |
3706 256 * Character'Pos (Program (IP + 1)); | |
3707 end Read_Natural; | |
3708 | |
3709 ----------------- | |
3710 -- Reset_Class -- | |
3711 ----------------- | |
3712 | |
3713 procedure Reset_Class (Bitmap : out Character_Class) is | |
3714 begin | |
3715 Bitmap := (others => 0); | |
3716 end Reset_Class; | |
3717 | |
3718 ------------------ | |
3719 -- Set_In_Class -- | |
3720 ------------------ | |
3721 | |
3722 procedure Set_In_Class | |
3723 (Bitmap : in out Character_Class; | |
3724 C : Character) | |
3725 is | |
3726 Value : constant Class_Byte := Character'Pos (C); | |
3727 begin | |
3728 Bitmap (Value / 8) := Bitmap (Value / 8) | |
3729 or Bit_Conversion (Value mod 8); | |
3730 end Set_In_Class; | |
3731 | |
3732 ------------------- | |
3733 -- String_Length -- | |
3734 ------------------- | |
3735 | |
3736 function String_Length | |
3737 (Program : Program_Data; | |
3738 P : Pointer) return Program_Size | |
3739 is | |
3740 begin | |
3741 pragma Assert (Program (P) = EXACT or else Program (P) = EXACTF); | |
3742 return Character'Pos (Program (P + Next_Pointer_Bytes)); | |
3743 end String_Length; | |
3744 | |
3745 -------------------- | |
3746 -- String_Operand -- | |
3747 -------------------- | |
3748 | |
3749 function String_Operand (P : Pointer) return Pointer is | |
3750 begin | |
3751 return P + 4; | |
3752 end String_Operand; | |
3753 | |
3754 end System.Regpat; |