145
|
1 #!/usr/bin/python3
|
|
2 # -*- coding: utf-8 -*-
|
|
3 # Copyright (C) 2014-2019 Free Software Foundation, Inc.
|
|
4 # This file is part of the GNU C Library.
|
|
5 #
|
|
6 # The GNU C Library is free software; you can redistribute it and/or
|
|
7 # modify it under the terms of the GNU Lesser General Public
|
|
8 # License as published by the Free Software Foundation; either
|
|
9 # version 2.1 of the License, or (at your option) any later version.
|
|
10 #
|
|
11 # The GNU C Library is distributed in the hope that it will be useful,
|
|
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
14 # Lesser General Public License for more details.
|
|
15 #
|
|
16 # You should have received a copy of the GNU Lesser General Public
|
|
17 # License along with the GNU C Library; if not, see
|
|
18 # <https://www.gnu.org/licenses/>.
|
|
19
|
|
20 '''glibc/localedata/charmaps/UTF-8 file generator script
|
|
21
|
|
22 This script generates a glibc/localedata/charmaps/UTF-8 file
|
|
23 from Unicode data.
|
|
24
|
|
25 Usage: python3 utf8_gen.py -u UnicodeData.txt -e EastAsianWidth.txt -p PropList.txt --unicode_version VERSION
|
|
26
|
|
27 It writes the result to a file named UTF-8 in the current directory.
|
|
28 '''
|
|
29
|
|
30 import argparse
|
|
31 import sys
|
|
32 import re
|
|
33 import unicode_utils
|
|
34
|
|
# Auxiliary tables for Hangul syllable names, see the Unicode 3.0 book,
# sections 3.11 and 4.4.  The short names are the ones listed in
# Jamo.txt of the Unicode Character Database.
#
# process_range() splits the offset of a syllable from U+AC00 into
# three indices (initial, medial, final) and concatenates one entry
# from each of the tables below to build the syllable name.

# Initial consonants (19 entries).  The empty string is an initial
# which contributes nothing to the syllable name.
JAMO_INITIAL_SHORT_NAME = (
    'G', 'GG', 'N', 'D', 'DD', 'R', 'M', 'B', 'BB', 'S', 'SS', '', 'J', 'JJ',
    'C', 'K', 'T', 'P', 'H'
)

# Medial vowels (21 entries).
JAMO_MEDIAL_SHORT_NAME = (
    'A', 'AE', 'YA', 'YAE', 'EO', 'E', 'YEO', 'YE', 'O', 'WA', 'WAE', 'OE',
    'YO', 'U', 'WEO', 'WE', 'WI', 'YU', 'EU', 'YI', 'I'
)

# Final consonants (28 entries).  Index 0 is the empty string and is
# used for syllables without a final consonant.  Index 5 is 'NJ'
# (U+11AC HANGUL JONGSEONG NIEUN-CIEUC), e.g. U+AC05 is
# “HANGUL SYLLABLE GANJ”.
JAMO_FINAL_SHORT_NAME = (
    '', 'G', 'GG', 'GS', 'N', 'NJ', 'NH', 'D', 'L', 'LG', 'LM', 'LB', 'LS',
    'LT', 'LP', 'LH', 'M', 'B', 'BS', 'S', 'SS', 'NG', 'J', 'C', 'K', 'T',
    'P', 'H'
)
|
|
53
|
|
def process_range(start, end, outfile, name):
    '''Write a range of code points into the CHARMAP section of the
    output file.

    start and end are the first and the last code point of the range
    as hexadecimal strings (the first field of the “First” and “Last”
    lines of UnicodeData.txt).  name is the name written for the
    range, for example “<CJK Ideograph Extension A>”.

    Hangul syllables are expanded into one line per code point with a
    computed name.  Every other range is split into sub-ranges of at
    most 64 code points each.  The last code point of the range is
    always included, also when it is the only member of the final
    sub-range.
    '''
    # Parse the hexadecimal limits once instead of on every iteration.
    first = int(start, 16)
    last = int(end, 16)
    if 'Hangul Syllable' in name:
        # from glibc/localedata/ChangeLog:
        #
        #  2000-09-24  Bruno Haible  <haible@clisp.cons.org>
        #  * charmaps/UTF-8: Expand <Hangul Syllable> and <Private Use> ranges,
        #  so they become printable and carry a width. Comment out surrogate
        #  ranges. Add a WIDTH table
        #
        # So we expand the Hangul Syllables here:
        for code_point in range(first, last + 1):
            # The offset from U+AC00 encodes (initial, medial, final)
            # in mixed radix: 28 finals per medial, 21 medials per
            # initial.
            index2, index3 = divmod(code_point - 0xAC00, 28)
            index1, index2 = divmod(index2, 21)
            hangul_syllable_name = ('HANGUL SYLLABLE '
                                    + JAMO_INITIAL_SHORT_NAME[index1]
                                    + JAMO_MEDIAL_SHORT_NAME[index2]
                                    + JAMO_FINAL_SHORT_NAME[index3])
            outfile.write('{:<11s} {:<12s} {:s}\n'.format(
                unicode_utils.ucs_symbol(code_point),
                convert_to_hex(code_point),
                hangul_syllable_name))
        return
    # UnicodeData.txt contains code point ranges like this:
    #
    # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    #
    # The glibc UTF-8 file splits ranges like these into shorter
    # ranges of 64 code points each:
    #
    # <U3400>..<U343F>     /xe3/x90/x80         <CJK Ideograph Extension A>
    # …
    # <U4D80>..<U4DB5>     /xe4/xb6/x80         <CJK Ideograph Extension A>
    #
    # The upper limit of range() is last + 1 so that the last code
    # point is not dropped when it starts a sub-range of its own.
    for chunk_start in range(first, last + 1, 64):
        # The final sub-range may be shorter than 64 code points.
        chunk_end = min(chunk_start + 63, last)
        outfile.write('{:s}..{:s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(chunk_start),
            unicode_utils.ucs_symbol(chunk_end),
            convert_to_hex(chunk_start),
            name))
|
|
103
|
|
def process_charmap(flines, outfile):
    '''Write the body of the CHARMAP section to outfile.

    flines is a list which contains *all* lines of UnicodeData.txt.
    For each of them one or more lines are written to outfile as used
    between

    CHARMAP
    …
    END CHARMAP

    in the UTF-8 file in glibc/localedata/charmaps/UTF-8.

    Samples for input lines:

    0010;<control>;Cc;0;BN;;;;;N;DATA LINK ESCAPE;;;;
    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
    D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
    DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
    100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
    10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;

    Samples for output lines (Unicode-Value UTF-8-HEX Unicode-Char-Name):

    <U0010>     /x10         DATA LINK ESCAPE
    <U3400>..<U343F> /xe3/x90/x80 <CJK Ideograph Extension A>
    %<UD800>     /xed/xa0/x80 <Non Private Use High Surrogate, First>
    %<UDB7F>     /xed/xad/xbf <Non Private Use High Surrogate, Last>
    <U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>

    '''
    range_first_fields = []
    for data_line in flines:
        fields = data_line.split(";")
        # Some characters have “<control>” as their name. We try to
        # use the “Unicode 1.0 Name” (10th field in
        # UnicodeData.txt) for them.
        #
        # The characters U+0080, U+0081, U+0084 and U+0099 have
        # “<control>” as their name but do not even have a
        # “Unicode 1.0 Name”. We could write code to take their
        # alternate names from NameAliases.txt.
        if fields[1] == "<control>" and fields[10]:
            fields[1] = fields[10]
        char_name = fields[1]
        is_surrogate = 'Surrogate,' in char_name
        # Handling code point ranges like:
        #
        # 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
        # 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
        #
        # Surrogate ranges are not expanded, their “First” and “Last”
        # lines are handled like single characters below.
        if not is_surrogate:
            if char_name.endswith(', First>'):
                # Remember the start, the range is written when the
                # matching “Last” line is seen.
                range_first_fields = fields
                continue
            if char_name.endswith(', Last>'):
                # Strip the trailing “, Last>” (7 characters) and
                # close the angle bracket again.
                range_name = char_name[:-7] + '>'
                process_range(range_first_fields[0], fields[0],
                              outfile, range_name)
                range_first_fields = []
                continue
        range_first_fields = []
        if is_surrogate:
            # Comment out the surrogates in the UTF-8 file.
            # One could of course skip them completely but
            # the original UTF-8 file in glibc had them as
            # comments, so we keep these comment lines.
            outfile.write('%')
        code_point = int(fields[0], 16)
        outfile.write('{:<11s} {:<12s} {:s}\n'.format(
            unicode_utils.ucs_symbol(code_point),
            convert_to_hex(code_point),
            char_name))
|
|
169
|
|
def convert_to_hex(code_point):
    '''Convert a code point to a hexadecimal UTF-8 representation
    like /x**/x**/x**.

    code_point is an integer.  The result contains one “/x” followed
    by two lowercase hexadecimal digits for each byte of the UTF-8
    encoding of the code point.

    All surrogate code points (U+D800..U+DFFF) are supported; they
    are converted as if they were ordinary three byte characters,
    for example 0xD800 gives “/xed/xa0/x80”.
    '''
    # In Python3, .encode('UTF-8') refuses surrogates with the default
    # error handler.  The “surrogatepass” error handler encodes them
    # like any other code point, so no table of special cases is
    # needed.
    utf8_bytes = chr(code_point).encode('UTF-8', 'surrogatepass')
    return ''.join('/x{:02x}'.format(byte) for byte in utf8_bytes)
|
|
189
|
|
def write_header_charmap(outfile):
    '''Write the lines which precede the CHARMAP entries to the
    output file, up to and including the “CHARMAP” keyword line.'''
    header_lines = (
        "<code_set_name> UTF-8\n",
        "<comment_char> %\n",
        "<escape_char> /\n",
        "<mb_cur_min> 1\n",
        # This line also ends the block of settings with an empty line.
        "<mb_cur_max> 6\n\n",
        "% CHARMAP generated using utf8_gen.py\n",
        "% alias ISO-10646/UTF-8\n",
        "CHARMAP\n",
    )
    for header_line in header_lines:
        outfile.write(header_line)
|
|
200
|
|
def write_header_width(outfile, unicode_version):
    '''Write the comment lines which explain the WIDTH section to the
    output file, followed by the “WIDTH” keyword line.

    unicode_version is the version string which is mentioned in the
    first comment line.
    '''
    version_line = ('% Character width according to Unicode '
                    + '{:s}.\n'.format(unicode_version))
    # There is no separate explanation for “ZERO WIDTH …” characters
    # because these are already covered by the Cf line.
    header_lines = (
        version_line,
        '% - Default width is 1.\n',
        '% - Double-width characters have width 2; generated from\n',
        '% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n',
        '% - Non-spacing characters have width 0; '
        'generated from PropList.txt or\n',
        '% "grep \'^[^;]*;[^;]*;[^;]*;[^;]*;NSM;\' '
        'UnicodeData.txt"\n',
        '% - Format control characters have width 0; '
        'generated from\n',
        "% \"grep '^[^;]*;[^;]*;Cf;' UnicodeData.txt\"\n",
        "WIDTH\n",
    )
    for header_line in header_lines:
        outfile.write(header_line)
|
|
219
|
|
def _parse_code_point_range(field):
    '''Return (first, last) as integers for a field which contains
    either one hexadecimal code point or a range “first..last”.'''
    if '..' not in field:
        return int(field, 16), int(field, 16)
    limits = field.split("..")
    return int(limits[0], 16), int(limits[1], 16)


def process_width(outfile, ulines, elines, plines):
    '''Write the body of the WIDTH section to outfile.

    ulines are lines from UnicodeData.txt, elines are lines from
    EastAsianWidth.txt containing characters with width “W” or “F”,
    plines are lines from PropList.txt which contain characters
    with the property “Prepended_Concatenation_Mark”.

    Only code points whose width differs from the default width 1
    are written.  Consecutive code points with the same width are
    merged into one “<first>...<last>” line.
    '''
    width_dict = {}
    # Everything listed in elines is double-width.
    for line in elines:
        first, last = _parse_code_point_range(line.split(";")[0])
        for key in range(first, last + 1):
            width_dict[key] = 2

    # Non-spacing marks, enclosing marks and format control characters
    # have width 0.  This overrides a width 2 assigned above.
    for line in ulines:
        fields = line.split(";")
        if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
            width_dict[int(fields[0], 16)] = 0

    for line in plines:
        # Characters with the property “Prepended_Concatenation_Mark”
        # should have the width 1:
        first, last = _parse_code_point_range(line.split(";")[0])
        for key in range(first, last + 1):
            # Default width is 1, so the entry is just removed.  pop()
            # is used because such a character need not have been
            # recorded with another width before.
            width_dict.pop(key, None)

    # handle special cases for compatibility
    # https://www.cs.tut.fi/~jkorpela/shy.html
    width_dict.pop(0x00AD, None) # default width is 1
    for key in range(0x1160, 0x1200):
        width_dict[key] = 0
    for key in range(0x3248, 0x3250):
        # These are “A” which means we can decide whether to treat them
        # as “W” or “N” based on context:
        # http://www.unicode.org/mail-arch/unicode-ml/y2017-m08/0023.html
        # For us, “W” seems better.
        width_dict[key] = 2
    for key in range(0x4DC0, 0x4E00):
        width_dict[key] = 2

    # Merge consecutive code points with the same width into runs.
    # Each run is a list [first, last, width].
    runs = []
    for key in sorted(width_dict):
        width = width_dict[key]
        if runs and key == runs[-1][1] + 1 and width == runs[-1][2]:
            runs[-1][1] = key
        else:
            runs.append([key, key, width])

    for first, last, width in runs:
        if first == last:
            outfile.write('{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(first),
                width))
        else:
            outfile.write('{:s}...{:s}\t{:d}\n'.format(
                unicode_utils.ucs_symbol(first),
                unicode_utils.ucs_symbol(last),
                width))
|
|
295
|
|
if __name__ == "__main__":
    # Every option takes exactly one value.  An option given without
    # a value is rejected by argparse instead of silently becoming
    # None and failing later in open() or in str.format().
    PARSER = argparse.ArgumentParser(
        description='''
        Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
        ''')
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        type=str,
        default='UnicodeData.txt',
        help=('The UnicodeData.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-e', '--east_asian_with_file',
        type=str,
        default='EastAsianWidth.txt',
        help=('The EastAsianWidth.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '-p', '--prop_list_file',
        type=str,
        default='PropList.txt',
        help=('The PropList.txt file to read, '
              + 'default: %(default)s'))
    PARSER.add_argument(
        '--unicode_version',
        required=True,
        type=str,
        help='The Unicode version of the input files used.')
    ARGS = PARSER.parse_args()

    # All files are opened with an explicit encoding so that the
    # result does not depend on the locale the script is run in.
    with open(ARGS.unicode_data_file, mode='r',
              encoding='utf-8') as UNIDATA_FILE:
        UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
    with open(ARGS.east_asian_with_file, mode='r',
              encoding='utf-8') as EAST_ASIAN_WIDTH_FILE:
        EAST_ASIAN_WIDTH_LINES = []
        for LINE in EAST_ASIAN_WIDTH_FILE:
            # If characters from EastAsianWidth.txt which are from
            # reserved ranges (i.e. not yet assigned code points)
            # are added to the WIDTH section of the UTF-8 file, then
            # “make check” produces “Unknown Character” errors for
            # these code points because such unassigned code points
            # are not in the CHARMAP section of the UTF-8 file.
            #
            # Therefore, we skip all reserved code points when reading
            # the EastAsianWidth.txt file.
            if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
                continue
            # Keep only the lines for wide (“W”) and fullwidth (“F”)
            # characters.
            if re.match(r'^[^;]*;[WF]', LINE):
                EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
    with open(ARGS.prop_list_file, mode='r',
              encoding='utf-8') as PROP_LIST_FILE:
        PROP_LIST_LINES = []
        for LINE in PROP_LIST_FILE:
            if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
                PROP_LIST_LINES.append(LINE.strip())
    # The result is always written to a file named “UTF-8” in the
    # current directory.
    with open('UTF-8', mode='w', encoding='utf-8') as OUTFILE:
        # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
        write_header_charmap(OUTFILE)
        process_charmap(UNICODE_DATA_LINES, OUTFILE)
        OUTFILE.write("END CHARMAP\n\n")
        # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
        write_header_width(OUTFILE, ARGS.unicode_version)
        process_width(OUTFILE,
                      UNICODE_DATA_LINES,
                      EAST_ASIAN_WIDTH_LINES,
                      PROP_LIST_LINES)
        OUTFILE.write("END WIDTH\n")
|