111
|
1 ------------------------------------------------------------------------------
|
|
2 -- --
|
|
3 -- GNAT RUN-TIME COMPONENTS --
|
|
4 -- --
|
|
5 -- G N A T . D E C O D E _ S T R I N G --
|
|
6 -- --
|
|
7 -- S p e c --
|
|
8 -- --
|
131
|
9 -- Copyright (C) 2007-2018, AdaCore --
|
111
|
10 -- --
|
|
11 -- GNAT is free software; you can redistribute it and/or modify it under --
|
|
12 -- terms of the GNU General Public License as published by the Free Soft- --
|
|
13 -- ware Foundation; either version 3, or (at your option) any later ver- --
|
|
14 -- sion. GNAT is distributed in the hope that it will be useful, but WITH- --
|
|
15 -- OUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY --
|
|
16 -- or FITNESS FOR A PARTICULAR PURPOSE. --
|
|
17 -- --
|
|
18 -- As a special exception under Section 7 of GPL version 3, you are granted --
|
|
19 -- additional permissions described in the GCC Runtime Library Exception, --
|
|
20 -- version 3.1, as published by the Free Software Foundation. --
|
|
21 -- --
|
|
22 -- You should have received a copy of the GNU General Public License and --
|
|
23 -- a copy of the GCC Runtime Library Exception along with this program; --
|
|
24 -- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
|
|
25 -- <http://www.gnu.org/licenses/>. --
|
|
26 -- --
|
|
27 -- GNAT was originally developed by the GNAT team at New York University. --
|
|
28 -- Extensive contributions were provided by Ada Core Technologies Inc. --
|
|
29 -- --
|
|
30 ------------------------------------------------------------------------------
|
|
31
|
|
32 -- This package provides a utility routine for converting from an encoded
|
|
33 -- string to a corresponding Wide_String or Wide_Wide_String value.
|
|
34
|
|
35 with Interfaces; use Interfaces;
|
|
36
|
|
37 with System.WCh_Cnv; use System.WCh_Cnv;
|
|
38 with System.WCh_Con; use System.WCh_Con;
|
|
39
|
|
40 package body GNAT.Decode_String is
|
|
41
|
|
42 -----------------------
|
|
43 -- Local Subprograms --
|
|
44 -----------------------
|
|
45
|
|
46 procedure Bad;
|
|
47 pragma No_Return (Bad);
|
|
48 -- Raise error for bad encoding
|
|
49
|
|
50 procedure Past_End;
|
|
51 pragma No_Return (Past_End);
|
|
52 -- Raise error for off end of string
|
|
53
|
|
54 ---------
|
|
55 -- Bad --
|
|
56 ---------
|
|
57
|
|
58 procedure Bad is
|
|
59 begin
|
|
60 raise Constraint_Error with
|
|
61 "bad encoding or character out of range";
|
|
62 end Bad;
|
|
63
|
|
64 ---------------------------
|
|
65 -- Decode_Wide_Character --
|
|
66 ---------------------------
|
|
67
|
|
68 procedure Decode_Wide_Character
|
|
69 (Input : String;
|
|
70 Ptr : in out Natural;
|
|
71 Result : out Wide_Character)
|
|
72 is
|
|
73 Char : Wide_Wide_Character;
|
|
74 begin
|
|
75 Decode_Wide_Wide_Character (Input, Ptr, Char);
|
|
76
|
|
77 if Wide_Wide_Character'Pos (Char) > 16#FFFF# then
|
|
78 Bad;
|
|
79 else
|
|
80 Result := Wide_Character'Val (Wide_Wide_Character'Pos (Char));
|
|
81 end if;
|
|
82 end Decode_Wide_Character;
|
|
83
|
|
84 ------------------------
|
|
85 -- Decode_Wide_String --
|
|
86 ------------------------
|
|
87
|
|
88 function Decode_Wide_String (S : String) return Wide_String is
|
|
89 Result : Wide_String (1 .. S'Length);
|
|
90 Length : Natural;
|
|
91 begin
|
|
92 Decode_Wide_String (S, Result, Length);
|
|
93 return Result (1 .. Length);
|
|
94 end Decode_Wide_String;
|
|
95
|
|
96 procedure Decode_Wide_String
|
|
97 (S : String;
|
|
98 Result : out Wide_String;
|
|
99 Length : out Natural)
|
|
100 is
|
|
101 Ptr : Natural;
|
|
102
|
|
103 begin
|
|
104 Ptr := S'First;
|
|
105 Length := 0;
|
|
106 while Ptr <= S'Last loop
|
|
107 if Length >= Result'Last then
|
|
108 Past_End;
|
|
109 end if;
|
|
110
|
|
111 Length := Length + 1;
|
|
112 Decode_Wide_Character (S, Ptr, Result (Length));
|
|
113 end loop;
|
|
114 end Decode_Wide_String;
|
|
115
|
|
116 --------------------------------
|
|
117 -- Decode_Wide_Wide_Character --
|
|
118 --------------------------------
|
|
119
|
|
120 procedure Decode_Wide_Wide_Character
|
|
121 (Input : String;
|
|
122 Ptr : in out Natural;
|
|
123 Result : out Wide_Wide_Character)
|
|
124 is
|
|
125 C : Character;
|
|
126
|
|
127 function In_Char return Character;
|
|
128 pragma Inline (In_Char);
|
|
129 -- Function to get one input character
|
|
130
|
|
131 -------------
|
|
132 -- In_Char --
|
|
133 -------------
|
|
134
|
|
135 function In_Char return Character is
|
|
136 begin
|
|
137 if Ptr <= Input'Last then
|
|
138 Ptr := Ptr + 1;
|
|
139 return Input (Ptr - 1);
|
|
140 else
|
|
141 Past_End;
|
|
142 end if;
|
|
143 end In_Char;
|
|
144
|
|
145 -- Start of processing for Decode_Wide_Wide_Character
|
|
146
|
|
147 begin
|
|
148 C := In_Char;
|
|
149
|
|
150 -- Special fast processing for UTF-8 case
|
|
151
|
|
152 if Encoding_Method = WCEM_UTF8 then
|
|
153 UTF8 : declare
|
|
154 U : Unsigned_32;
|
|
155 W : Unsigned_32;
|
|
156
|
|
157 procedure Get_UTF_Byte;
|
|
158 pragma Inline (Get_UTF_Byte);
|
|
159 -- Used to interpret 2#10xxxxxx# continuation byte in UTF-8 mode.
|
|
160 -- Reads a byte, and raises CE if the first two bits are not 10.
|
|
161 -- Otherwise shifts W 6 bits left and or's in the 6 xxxxxx bits.
|
|
162
|
|
163 ------------------
|
|
164 -- Get_UTF_Byte --
|
|
165 ------------------
|
|
166
|
|
167 procedure Get_UTF_Byte is
|
|
168 begin
|
|
169 U := Unsigned_32 (Character'Pos (In_Char));
|
|
170
|
|
171 if (U and 2#11000000#) /= 2#10_000000# then
|
|
172 Bad;
|
|
173 end if;
|
|
174
|
|
175 W := Shift_Left (W, 6) or (U and 2#00111111#);
|
|
176 end Get_UTF_Byte;
|
|
177
|
|
178 -- Start of processing for UTF8 case
|
|
179
|
|
180 begin
|
|
181 -- Note: for details of UTF8 encoding see RFC 3629
|
|
182
|
|
183 U := Unsigned_32 (Character'Pos (C));
|
|
184
|
|
185 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
|
|
186
|
|
187 if (U and 2#10000000#) = 2#00000000# then
|
|
188 Result := Wide_Wide_Character'Val (Character'Pos (C));
|
|
189
|
|
190 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
|
|
191
|
|
192 elsif (U and 2#11100000#) = 2#110_00000# then
|
|
193 W := U and 2#00011111#;
|
|
194 Get_UTF_Byte;
|
|
195
|
|
196 if W not in 16#00_0080# .. 16#00_07FF# then
|
|
197 Bad;
|
|
198 end if;
|
|
199
|
|
200 Result := Wide_Wide_Character'Val (W);
|
|
201
|
|
202 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
|
|
203
|
|
204 elsif (U and 2#11110000#) = 2#1110_0000# then
|
|
205 W := U and 2#00001111#;
|
|
206 Get_UTF_Byte;
|
|
207 Get_UTF_Byte;
|
|
208
|
|
209 if W not in 16#00_0800# .. 16#00_FFFF# then
|
|
210 Bad;
|
|
211 end if;
|
|
212
|
|
213 Result := Wide_Wide_Character'Val (W);
|
|
214
|
|
215 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
216
|
|
217 elsif (U and 2#11111000#) = 2#11110_000# then
|
|
218 W := U and 2#00000111#;
|
|
219
|
|
220 for K in 1 .. 3 loop
|
|
221 Get_UTF_Byte;
|
|
222 end loop;
|
|
223
|
|
224 if W not in 16#01_0000# .. 16#10_FFFF# then
|
|
225 Bad;
|
|
226 end if;
|
|
227
|
|
228 Result := Wide_Wide_Character'Val (W);
|
|
229
|
|
230 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx 10xxxxxx
|
|
231 -- 10xxxxxx 10xxxxxx
|
|
232
|
|
233 elsif (U and 2#11111100#) = 2#111110_00# then
|
|
234 W := U and 2#00000011#;
|
|
235
|
|
236 for K in 1 .. 4 loop
|
|
237 Get_UTF_Byte;
|
|
238 end loop;
|
|
239
|
|
240 if W not in 16#0020_0000# .. 16#03FF_FFFF# then
|
|
241 Bad;
|
|
242 end if;
|
|
243
|
|
244 Result := Wide_Wide_Character'Val (W);
|
|
245
|
|
246 -- All other cases are invalid, note that this includes:
|
|
247
|
|
248 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx 10xxxxxx
|
|
249 -- 10xxxxxx 10xxxxxx 10xxxxxx
|
|
250
|
|
251 -- since Wide_Wide_Character does not include code values
|
|
252 -- greater than 16#03FF_FFFF#.
|
|
253
|
|
254 else
|
|
255 Bad;
|
|
256 end if;
|
|
257 end UTF8;
|
|
258
|
|
259 -- All encoding functions other than UTF-8
|
|
260
|
|
261 else
|
|
262 Non_UTF8 : declare
|
|
263 function Char_Sequence_To_UTF is
|
|
264 new Char_Sequence_To_UTF_32 (In_Char);
|
|
265
|
|
266 begin
|
|
267 -- For brackets, must test for specific case of [ not followed by
|
|
268 -- quotation, where we must not call Char_Sequence_To_UTF, but
|
|
269 -- instead just return the bracket unchanged.
|
|
270
|
|
271 if Encoding_Method = WCEM_Brackets
|
|
272 and then C = '['
|
|
273 and then (Ptr > Input'Last or else Input (Ptr) /= '"')
|
|
274 then
|
|
275 Result := '[';
|
|
276
|
|
277 -- All other cases including [" with Brackets
|
|
278
|
|
279 else
|
|
280 Result :=
|
|
281 Wide_Wide_Character'Val
|
|
282 (Char_Sequence_To_UTF (C, Encoding_Method));
|
|
283 end if;
|
|
284 end Non_UTF8;
|
|
285 end if;
|
|
286 end Decode_Wide_Wide_Character;
|
|
287
|
|
288 -----------------------------
|
|
289 -- Decode_Wide_Wide_String --
|
|
290 -----------------------------
|
|
291
|
|
292 function Decode_Wide_Wide_String (S : String) return Wide_Wide_String is
|
|
293 Result : Wide_Wide_String (1 .. S'Length);
|
|
294 Length : Natural;
|
|
295 begin
|
|
296 Decode_Wide_Wide_String (S, Result, Length);
|
|
297 return Result (1 .. Length);
|
|
298 end Decode_Wide_Wide_String;
|
|
299
|
|
300 procedure Decode_Wide_Wide_String
|
|
301 (S : String;
|
|
302 Result : out Wide_Wide_String;
|
|
303 Length : out Natural)
|
|
304 is
|
|
305 Ptr : Natural;
|
|
306
|
|
307 begin
|
|
308 Ptr := S'First;
|
|
309 Length := 0;
|
|
310 while Ptr <= S'Last loop
|
|
311 if Length >= Result'Last then
|
|
312 Past_End;
|
|
313 end if;
|
|
314
|
|
315 Length := Length + 1;
|
|
316 Decode_Wide_Wide_Character (S, Ptr, Result (Length));
|
|
317 end loop;
|
|
318 end Decode_Wide_Wide_String;
|
|
319
|
|
320 -------------------------
|
|
321 -- Next_Wide_Character --
|
|
322 -------------------------
|
|
323
|
|
324 procedure Next_Wide_Character (Input : String; Ptr : in out Natural) is
|
|
325 Discard : Wide_Character;
|
|
326 begin
|
|
327 Decode_Wide_Character (Input, Ptr, Discard);
|
|
328 end Next_Wide_Character;
|
|
329
|
|
330 ------------------------------
|
|
331 -- Next_Wide_Wide_Character --
|
|
332 ------------------------------
|
|
333
|
|
334 procedure Next_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
|
|
335 Discard : Wide_Wide_Character;
|
|
336 begin
|
|
337 Decode_Wide_Wide_Character (Input, Ptr, Discard);
|
|
338 end Next_Wide_Wide_Character;
|
|
339
|
|
340 --------------
|
|
341 -- Past_End --
|
|
342 --------------
|
|
343
|
|
344 procedure Past_End is
|
|
345 begin
|
|
346 raise Constraint_Error with "past end of string";
|
|
347 end Past_End;
|
|
348
|
|
349 -------------------------
|
|
350 -- Prev_Wide_Character --
|
|
351 -------------------------
|
|
352
|
|
353 procedure Prev_Wide_Character (Input : String; Ptr : in out Natural) is
|
|
354 begin
|
|
355 if Ptr > Input'Last + 1 then
|
|
356 Past_End;
|
|
357 end if;
|
|
358
|
|
359 -- Special efficient encoding for UTF-8 case
|
|
360
|
|
361 if Encoding_Method = WCEM_UTF8 then
|
|
362 UTF8 : declare
|
|
363 U : Unsigned_32;
|
|
364
|
|
365 procedure Getc;
|
|
366 pragma Inline (Getc);
|
|
367 -- Gets the character at Input (Ptr - 1) and returns code in U as
|
|
368 -- Unsigned_32 value. On return Ptr is decremented by one.
|
|
369
|
|
370 procedure Skip_UTF_Byte;
|
|
371 pragma Inline (Skip_UTF_Byte);
|
|
372 -- Checks that U is 2#10xxxxxx# and then calls Get
|
|
373
|
|
374 ----------
|
|
375 -- Getc --
|
|
376 ----------
|
|
377
|
|
378 procedure Getc is
|
|
379 begin
|
|
380 if Ptr <= Input'First then
|
|
381 Past_End;
|
|
382 else
|
|
383 Ptr := Ptr - 1;
|
|
384 U := Unsigned_32 (Character'Pos (Input (Ptr)));
|
|
385 end if;
|
|
386 end Getc;
|
|
387
|
|
388 -------------------
|
|
389 -- Skip_UTF_Byte --
|
|
390 -------------------
|
|
391
|
|
392 procedure Skip_UTF_Byte is
|
|
393 begin
|
|
394 if (U and 2#11000000#) = 2#10_000000# then
|
|
395 Getc;
|
|
396 else
|
|
397 Bad;
|
|
398 end if;
|
|
399 end Skip_UTF_Byte;
|
|
400
|
|
401 -- Start of processing for UTF-8 case
|
|
402
|
|
403 begin
|
|
404 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
|
|
405
|
|
406 Getc;
|
|
407
|
|
408 if (U and 2#10000000#) = 2#00000000# then
|
|
409 return;
|
|
410
|
|
411 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
|
|
412
|
|
413 else
|
|
414 Skip_UTF_Byte;
|
|
415
|
|
416 if (U and 2#11100000#) = 2#110_00000# then
|
|
417 return;
|
|
418
|
|
419 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
|
|
420
|
|
421 else
|
|
422 Skip_UTF_Byte;
|
|
423
|
|
424 if (U and 2#11110000#) = 2#1110_0000# then
|
|
425 return;
|
|
426
|
|
427 -- Any other code is invalid, note that this includes:
|
|
428
|
|
429 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
|
|
430 -- 10xxxxxx
|
|
431
|
|
432 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
|
|
433 -- 10xxxxxx 10xxxxxx
|
|
434 -- 10xxxxxx
|
|
435
|
|
436 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
|
|
437 -- 10xxxxxx 10xxxxxx
|
|
438 -- 10xxxxxx 10xxxxxx
|
|
439
|
|
440 -- since Wide_Character does not allow codes > 16#FFFF#
|
|
441
|
|
442 else
|
|
443 Bad;
|
|
444 end if;
|
|
445 end if;
|
|
446 end if;
|
|
447 end UTF8;
|
|
448
|
|
449 -- Special efficient encoding for brackets case
|
|
450
|
|
451 elsif Encoding_Method = WCEM_Brackets then
|
|
452 Brackets : declare
|
|
453 P : Natural;
|
|
454 S : Natural;
|
|
455
|
|
456 begin
|
|
457 -- See if we have "] at end positions
|
|
458
|
|
459 if Ptr > Input'First + 1
|
|
460 and then Input (Ptr - 1) = ']'
|
|
461 and then Input (Ptr - 2) = '"'
|
|
462 then
|
|
463 P := Ptr - 2;
|
|
464
|
|
465 -- Loop back looking for [" at start
|
|
466
|
|
467 while P >= Ptr - 10 loop
|
|
468 if P <= Input'First + 1 then
|
|
469 Bad;
|
|
470
|
|
471 elsif Input (P - 1) = '"'
|
|
472 and then Input (P - 2) = '['
|
|
473 then
|
|
474 -- Found ["..."], scan forward to check it
|
|
475
|
|
476 S := P - 2;
|
|
477 P := S;
|
|
478 Next_Wide_Character (Input, P);
|
|
479
|
|
480 -- OK if at original pointer, else error
|
|
481
|
|
482 if P = Ptr then
|
|
483 Ptr := S;
|
|
484 return;
|
|
485 else
|
|
486 Bad;
|
|
487 end if;
|
|
488 end if;
|
|
489
|
|
490 P := P - 1;
|
|
491 end loop;
|
|
492
|
|
493 -- Falling through loop means more than 8 chars between the
|
|
494 -- enclosing brackets (or simply a missing left bracket)
|
|
495
|
|
496 Bad;
|
|
497
|
|
498 -- Here if no bracket sequence present
|
|
499
|
|
500 else
|
|
501 if Ptr = Input'First then
|
|
502 Past_End;
|
|
503 else
|
|
504 Ptr := Ptr - 1;
|
|
505 end if;
|
|
506 end if;
|
|
507 end Brackets;
|
|
508
|
|
509 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
|
|
510 -- go to the start of the string and skip forwards till Ptr matches.
|
|
511
|
|
512 else
|
|
513 Non_UTF_Brackets : declare
|
|
514 Discard : Wide_Character;
|
|
515 PtrS : Natural;
|
|
516 PtrP : Natural;
|
|
517
|
|
518 begin
|
|
519 PtrS := Input'First;
|
|
520
|
|
521 if Ptr <= PtrS then
|
|
522 Past_End;
|
|
523 end if;
|
|
524
|
|
525 loop
|
|
526 PtrP := PtrS;
|
|
527 Decode_Wide_Character (Input, PtrS, Discard);
|
|
528
|
|
529 if PtrS = Ptr then
|
|
530 Ptr := PtrP;
|
|
531 return;
|
|
532
|
|
533 elsif PtrS > Ptr then
|
|
534 Bad;
|
|
535 end if;
|
|
536 end loop;
|
|
537
|
|
538 exception
|
|
539 when Constraint_Error =>
|
|
540 Bad;
|
|
541 end Non_UTF_Brackets;
|
|
542 end if;
|
|
543 end Prev_Wide_Character;
|
|
544
|
|
545 ------------------------------
|
|
546 -- Prev_Wide_Wide_Character --
|
|
547 ------------------------------
|
|
548
|
|
549 procedure Prev_Wide_Wide_Character (Input : String; Ptr : in out Natural) is
|
|
550 begin
|
|
551 if Ptr > Input'Last + 1 then
|
|
552 Past_End;
|
|
553 end if;
|
|
554
|
|
555 -- Special efficient encoding for UTF-8 case
|
|
556
|
|
557 if Encoding_Method = WCEM_UTF8 then
|
|
558 UTF8 : declare
|
|
559 U : Unsigned_32;
|
|
560
|
|
561 procedure Getc;
|
|
562 pragma Inline (Getc);
|
|
563 -- Gets the character at Input (Ptr - 1) and returns code in U as
|
|
564 -- Unsigned_32 value. On return Ptr is decremented by one.
|
|
565
|
|
566 procedure Skip_UTF_Byte;
|
|
567 pragma Inline (Skip_UTF_Byte);
|
|
568 -- Checks that U is 2#10xxxxxx# and then calls Get
|
|
569
|
|
570 ----------
|
|
571 -- Getc --
|
|
572 ----------
|
|
573
|
|
574 procedure Getc is
|
|
575 begin
|
|
576 if Ptr <= Input'First then
|
|
577 Past_End;
|
|
578 else
|
|
579 Ptr := Ptr - 1;
|
|
580 U := Unsigned_32 (Character'Pos (Input (Ptr)));
|
|
581 end if;
|
|
582 end Getc;
|
|
583
|
|
584 -------------------
|
|
585 -- Skip_UTF_Byte --
|
|
586 -------------------
|
|
587
|
|
588 procedure Skip_UTF_Byte is
|
|
589 begin
|
|
590 if (U and 2#11000000#) = 2#10_000000# then
|
|
591 Getc;
|
|
592 else
|
|
593 Bad;
|
|
594 end if;
|
|
595 end Skip_UTF_Byte;
|
|
596
|
|
597 -- Start of processing for UTF-8 case
|
|
598
|
|
599 begin
|
|
600 -- 16#00_0000#-16#00_007F#: 0xxxxxxx
|
|
601
|
|
602 Getc;
|
|
603
|
|
604 if (U and 2#10000000#) = 2#00000000# then
|
|
605 return;
|
|
606
|
|
607 -- 16#00_0080#-16#00_07FF#: 110xxxxx 10xxxxxx
|
|
608
|
|
609 else
|
|
610 Skip_UTF_Byte;
|
|
611
|
|
612 if (U and 2#11100000#) = 2#110_00000# then
|
|
613 return;
|
|
614
|
|
615 -- 16#00_0800#-16#00_ffff#: 1110xxxx 10xxxxxx 10xxxxxx
|
|
616
|
|
617 else
|
|
618 Skip_UTF_Byte;
|
|
619
|
|
620 if (U and 2#11110000#) = 2#1110_0000# then
|
|
621 return;
|
|
622
|
|
623 -- 16#01_0000#-16#10_FFFF#: 11110xxx 10xxxxxx 10xxxxxx
|
|
624 -- 10xxxxxx
|
|
625
|
|
626 else
|
|
627 Skip_UTF_Byte;
|
|
628
|
|
629 if (U and 2#11111000#) = 2#11110_000# then
|
|
630 return;
|
|
631
|
|
632 -- 16#0020_0000#-16#03FF_FFFF#: 111110xx 10xxxxxx
|
|
633 -- 10xxxxxx 10xxxxxx
|
|
634 -- 10xxxxxx
|
|
635
|
|
636 else
|
|
637 Skip_UTF_Byte;
|
|
638
|
|
639 if (U and 2#11111100#) = 2#111110_00# then
|
|
640 return;
|
|
641
|
|
642 -- Any other code is invalid, note that this includes:
|
|
643
|
|
644 -- 16#0400_0000#-16#7FFF_FFFF#: 1111110x 10xxxxxx
|
|
645 -- 10xxxxxx 10xxxxxx
|
|
646 -- 10xxxxxx 10xxxxxx
|
|
647
|
|
648 -- since Wide_Wide_Character does not allow codes
|
|
649 -- greater than 16#03FF_FFFF#
|
|
650
|
|
651 else
|
|
652 Bad;
|
|
653 end if;
|
|
654 end if;
|
|
655 end if;
|
|
656 end if;
|
|
657 end if;
|
|
658 end UTF8;
|
|
659
|
|
660 -- Special efficient encoding for brackets case
|
|
661
|
|
662 elsif Encoding_Method = WCEM_Brackets then
|
|
663 Brackets : declare
|
|
664 P : Natural;
|
|
665 S : Natural;
|
|
666
|
|
667 begin
|
|
668 -- See if we have "] at end positions
|
|
669
|
|
670 if Ptr > Input'First + 1
|
|
671 and then Input (Ptr - 1) = ']'
|
|
672 and then Input (Ptr - 2) = '"'
|
|
673 then
|
|
674 P := Ptr - 2;
|
|
675
|
|
676 -- Loop back looking for [" at start
|
|
677
|
|
678 while P >= Ptr - 10 loop
|
|
679 if P <= Input'First + 1 then
|
|
680 Bad;
|
|
681
|
|
682 elsif Input (P - 1) = '"'
|
|
683 and then Input (P - 2) = '['
|
|
684 then
|
|
685 -- Found ["..."], scan forward to check it
|
|
686
|
|
687 S := P - 2;
|
|
688 P := S;
|
|
689 Next_Wide_Wide_Character (Input, P);
|
|
690
|
|
691 -- OK if at original pointer, else error
|
|
692
|
|
693 if P = Ptr then
|
|
694 Ptr := S;
|
|
695 return;
|
|
696 else
|
|
697 Bad;
|
|
698 end if;
|
|
699 end if;
|
|
700
|
|
701 P := P - 1;
|
|
702 end loop;
|
|
703
|
|
704 -- Falling through loop means more than 8 chars between the
|
|
705 -- enclosing brackets (or simply a missing left bracket)
|
|
706
|
|
707 Bad;
|
|
708
|
|
709 -- Here if no bracket sequence present
|
|
710
|
|
711 else
|
|
712 if Ptr = Input'First then
|
|
713 Past_End;
|
|
714 else
|
|
715 Ptr := Ptr - 1;
|
|
716 end if;
|
|
717 end if;
|
|
718 end Brackets;
|
|
719
|
|
720 -- Non-UTF-8/Brackets. These are the inefficient cases where we have to
|
|
721 -- go to the start of the string and skip forwards till Ptr matches.
|
|
722
|
|
723 else
|
|
724 Non_UTF8_Brackets : declare
|
|
725 Discard : Wide_Wide_Character;
|
|
726 PtrS : Natural;
|
|
727 PtrP : Natural;
|
|
728
|
|
729 begin
|
|
730 PtrS := Input'First;
|
|
731
|
|
732 if Ptr <= PtrS then
|
|
733 Past_End;
|
|
734 end if;
|
|
735
|
|
736 loop
|
|
737 PtrP := PtrS;
|
|
738 Decode_Wide_Wide_Character (Input, PtrS, Discard);
|
|
739
|
|
740 if PtrS = Ptr then
|
|
741 Ptr := PtrP;
|
|
742 return;
|
|
743
|
|
744 elsif PtrS > Ptr then
|
|
745 Bad;
|
|
746 end if;
|
|
747 end loop;
|
|
748
|
|
749 exception
|
|
750 when Constraint_Error =>
|
|
751 Bad;
|
|
752 end Non_UTF8_Brackets;
|
|
753 end if;
|
|
754 end Prev_Wide_Wide_Character;
|
|
755
|
|
756 --------------------------
|
|
757 -- Validate_Wide_String --
|
|
758 --------------------------
|
|
759
|
|
760 function Validate_Wide_String (S : String) return Boolean is
|
|
761 Ptr : Natural;
|
|
762
|
|
763 begin
|
|
764 Ptr := S'First;
|
|
765 while Ptr <= S'Last loop
|
|
766 Next_Wide_Character (S, Ptr);
|
|
767 end loop;
|
|
768
|
|
769 return True;
|
|
770
|
|
771 exception
|
|
772 when Constraint_Error =>
|
|
773 return False;
|
|
774 end Validate_Wide_String;
|
|
775
|
|
776 -------------------------------
|
|
777 -- Validate_Wide_Wide_String --
|
|
778 -------------------------------
|
|
779
|
|
780 function Validate_Wide_Wide_String (S : String) return Boolean is
|
|
781 Ptr : Natural;
|
|
782
|
|
783 begin
|
|
784 Ptr := S'First;
|
|
785 while Ptr <= S'Last loop
|
|
786 Next_Wide_Wide_Character (S, Ptr);
|
|
787 end loop;
|
|
788
|
|
789 return True;
|
|
790
|
|
791 exception
|
|
792 when Constraint_Error =>
|
|
793 return False;
|
|
794 end Validate_Wide_Wide_String;
|
|
795
|
|
796 end GNAT.Decode_String;
|