Skip to content

Commit c89e9d9

Browse files
committed
add tablegen script and table
1 parent ea86ece commit c89e9d9

File tree

2 files changed

+546
-0
lines changed

2 files changed

+546
-0
lines changed

scripts/unicode.py

Lines changed: 320 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,320 @@
1+
#!/usr/bin/env python
2+
#
3+
# Copyright 2011-2013 The Rust Project Developers. See the COPYRIGHT
4+
# file at the top-level directory of this distribution and at
5+
# http://rust-lang.org/COPYRIGHT.
6+
#
7+
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
8+
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
9+
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
10+
# option. This file may not be copied, modified, or distributed
11+
# except according to those terms.
12+
13+
# This script uses the following Unicode tables:
14+
# - EastAsianWidth.txt
15+
# - ReadMe.txt
16+
# - UnicodeData.txt
17+
#
18+
# Since this should not require frequent updates, we just store this
19+
# out-of-line and check the unicode.rs file into git.
20+
21+
import fileinput, re, os, sys, operator
22+
23+
preamble = '''// Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
24+
// file at the top-level directory of this distribution and at
25+
// http://rust-lang.org/COPYRIGHT.
26+
//
27+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
28+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
29+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
30+
// option. This file may not be copied, modified, or distributed
31+
// except according to those terms.
32+
33+
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
34+
35+
#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
36+
'''
37+
38+
# Mapping taken from Table 12 from:
# http://www.unicode.org/reports/tr44/#General_Category_Values
#
# Maps each concrete general category to the grouped categories that
# contain it, so codepoints can also be filed under e.g. "L" or "C".
expanded_categories = {
    'Lu': ['LC', 'L'], 'Ll': ['LC', 'L'], 'Lt': ['LC', 'L'],
    'Lm': ['L'], 'Lo': ['L'],
    'Mn': ['M'], 'Mc': ['M'], 'Me': ['M'],
    # BUG FIX: 'No' previously mapped to ['No'] (itself) instead of the
    # grouped category 'N' that TR44 Table 12 specifies for Nd/Nl/No.
    'Nd': ['N'], 'Nl': ['N'], 'No': ['N'],
    'Pc': ['P'], 'Pd': ['P'], 'Ps': ['P'], 'Pe': ['P'],
    'Pi': ['P'], 'Pf': ['P'], 'Po': ['P'],
    'Sm': ['S'], 'Sc': ['S'], 'Sk': ['S'], 'So': ['S'],
    'Zs': ['Z'], 'Zl': ['Z'], 'Zp': ['Z'],
    'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
}
51+
52+
# these are the surrogate codepoints, which are not valid rust characters
# (inclusive bounds of the UTF-16 surrogate range U+D800..U+DFFF)
surrogate_codepoints = (0xd800, 0xdfff)
54+
55+
def fetch(f):
    """Download Unicode data file *f* into the current directory.

    The download is skipped when the file already exists locally.  If the
    file still cannot be found afterwards (curl missing, network failure),
    an error is written to stderr and the process exits with status 1.
    """
    if not os.path.exists(os.path.basename(f)):
        # best-effort download; success is re-checked below rather than
        # via the os.system() return code
        os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
                  % f)

    if not os.path.exists(os.path.basename(f)):
        # fixed: the original message lacked a trailing newline
        sys.stderr.write("cannot load %s\n" % f)
        exit(1)
64+
def is_surrogate(n):
    """Return True when codepoint *n* lies in the UTF-16 surrogate range."""
    lo, hi = surrogate_codepoints
    return lo <= n <= hi
66+
67+
def load_unicode_data(f):
    """Parse UnicodeData.txt and return {category: [(lo, hi), ...]}.

    Every codepoint is filed under its general category, under the
    synthetic "Assigned" category, and under the grouped categories from
    `expanded_categories`.  Surrogate codepoints are skipped (not valid
    Rust chars).  "<..., First>"/"<..., Last>" range entries are expanded
    to cover every codepoint in between.  The per-category codepoint
    lists are collapsed into ranges via group_cats() before returning.
    """
    fetch(f)
    gencats = {}

    udict = {}
    range_start = -1
    for line in fileinput.input(f):
        data = line.split(';')
        if len(data) != 15:
            continue
        cp = int(data[0], 16)
        if is_surrogate(cp):
            continue
        if range_start >= 0:
            # previous line opened a range; fill every codepoint up to the
            # closing "Last" line (which is added itself below)
            for i in range(range_start, cp):  # was xrange: Python 3 compat
                udict[i] = data
            range_start = -1
        if data[1].endswith(", First>"):
            range_start = cp
            continue
        udict[cp] = data

    for code in udict:
        [code_org, name, gencat, combine, bidi,
         decomp, deci, digit, num, mirror,
         old, iso, upcase, lowcase, titlecase] = udict[code]

        # place letter in categories as appropriate
        for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
            if cat not in gencats:
                gencats[cat] = []
            gencats[cat].append(code)

    gencats = group_cats(gencats)

    return gencats
103+
104+
def group_cats(cats):
    """Collapse every category's codepoint list into sorted ranges.

    Returns a new dict mapping each category name to the output of
    group_cat() on its codepoint list.
    """
    return {name: group_cat(codepoints) for name, codepoints in cats.items()}
109+
110+
def group_cat(cat):
    """Collapse a collection of codepoints into sorted inclusive ranges.

    Duplicates are ignored.  Returns a list of (start, end) tuples that
    covers exactly the input codepoints.  Returns [] for empty input
    (the original raised IndexError on pop from an empty list).
    """
    if not cat:
        return []
    cat_out = []
    letters = sorted(set(cat))
    cur_start = letters.pop(0)
    cur_end = cur_start
    for letter in letters:
        # sorted(set(...)) guarantees strictly increasing codepoints
        assert letter > cur_end, \
            "cur_end: %s, letter: %s" % (hex(cur_end), hex(letter))
        if letter == cur_end + 1:
            cur_end = letter
        else:
            cat_out.append((cur_start, cur_end))
            cur_start = cur_end = letter
    cat_out.append((cur_start, cur_end))
    return cat_out
125+
126+
def format_table_content(f, content, indent):
    """Write *content* (a comma-separated string) to file-like *f*.

    The comma-separated chunks are re-joined with ", " and wrapped so each
    emitted line stays under ~98 characters, every line prefixed with
    *indent* spaces.  No trailing newline is written after the last line.
    """
    pad = " " * indent
    line = pad
    first = True
    for chunk in content.split(","):
        if len(line) + len(chunk) >= 98:
            # current line is full: flush it and start a fresh one
            f.write(line + ",\n")
            line = pad + chunk
        elif first:
            line += chunk
            first = False
        else:
            line += ", " + chunk
            first = False
    f.write(line)
140+
141+
# load all widths of want_widths, except those in except_cats
def load_east_asian_width(want_widths, except_cats):
    """Parse EastAsianWidth.txt into {width_class: [(lo, hi), ...]}.

    Only entries whose East_Asian_Width class is in *want_widths* are
    kept, and entries whose general category (from the trailing comment)
    is in *except_cats* are skipped.
    """
    f = "EastAsianWidth.txt"
    fetch(f)
    widths = {}
    # single codepoint: "0041;Na  # Lu ..."   range: "0041..005A;Na # Lu ..."
    # (raw strings: the originals relied on "\w"/"\." surviving in plain
    # string literals, which raises invalid-escape warnings on Python 3)
    re1 = re.compile(r"^([0-9A-F]+);(\w+) +# (\w+)")
    re2 = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+);(\w+) +# (\w+)")

    for line in fileinput.input(f):
        width = None
        d_lo = 0
        d_hi = 0
        cat = None
        m = re1.match(line)
        if m:
            d_lo = m.group(1)
            d_hi = m.group(1)
            width = m.group(2)
            cat = m.group(3)
        else:
            m = re2.match(line)
            if m:
                d_lo = m.group(1)
                d_hi = m.group(2)
                width = m.group(3)
                cat = m.group(4)
            else:
                continue
        if cat in except_cats or width not in want_widths:
            continue
        d_lo = int(d_lo, 16)
        d_hi = int(d_hi, 16)
        if width not in widths:
            widths[width] = []
        widths[width].append((d_lo, d_hi))
    return widths
177+
178+
def escape_char(c):
    """Render codepoint *c* as a Rust char literal, e.g. "'\\u{1f4a9}'"."""
    return "'\\u{" + format(c, "x") + "}'"
180+
181+
def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
               pfun=lambda x: "(%s,%s)" % (escape_char(x[0]), escape_char(x[1])), is_const=True):
    """Emit a Rust slice table named *name* to file-like *f*.

    Each element of *t_data* is rendered with *pfun*; *is_const* selects
    `const` vs `let` and *is_pub* prepends `pub`.  The joined element
    text is wrapped via format_table_content().
    """
    decl = "const" if is_const else "let"
    if is_pub:
        decl = "pub " + decl
    f.write(" %s %s: %s = &[\n" % (decl, name, t_type))
    # render every entry, comma-separated, then let the formatter wrap it
    rendered = ",".join(pfun(dat) for dat in t_data)
    format_table_content(f, rendered, 8)
    f.write("\n ];\n\n")
198+
199+
def emit_charwidth_module(f, width_table):
    """Write the generated Rust `charwidth` module to file-like *f*.

    The module contains a binary-search helper, a public
    `width(c, is_cjk)` function, and `charwidth_table` built from
    *width_table* entries of the form (lo, hi, width, width_cjk).
    The f.write() string literals below are emitted verbatim into the
    generated Rust source — do not reformat them.
    """
    f.write("pub mod charwidth {\n")
    f.write(" use core::option::Option;\n")
    f.write(" use core::option::Option::{Some, None};\n")
    f.write(" use core::slice::SliceExt;\n")
    f.write(" use core::result::Result::{Ok, Err};\n")
    # binary search over (lo, hi, width, width_cjk) ranges; codepoints not
    # found in the table default to width 1
    f.write("""
fn bsearch_range_value_table(c: char, is_cjk: bool, r: &'static [(char, char, u8, u8)]) -> u8 {
use core::cmp::Ordering::{Equal, Less, Greater};
match r.binary_search_by(|&(lo, hi, _, _)| {
if lo <= c && c <= hi { Equal }
else if hi < c { Less }
else { Greater }
}) {
Ok(idx) => {
let (_, _, r_ncjk, r_cjk) = r[idx];
if is_cjk { r_cjk } else { r_ncjk }
}
Err(_) => 1
}
}
""")

    # public entry point: special-cases NUL, control chars and ASCII before
    # falling back to the table lookup
    f.write("""
pub fn width(c: char, is_cjk: bool) -> Option<usize> {
match c as usize {
_c @ 0 => Some(0), // null is zero width
cu if cu < 0x20 => None, // control sequences have no width
cu if cu < 0x7F => Some(1), // ASCII
cu if cu < 0xA0 => None, // more control sequences
_ => Some(bsearch_range_value_table(c, is_cjk, charwidth_table) as usize)
}
}

""")

    f.write(" // character width table. Based on Markus Kuhn's free wcwidth() implementation,\n")
    f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
    # the table itself is private to the module (is_pub=False)
    emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
               pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
    f.write("}\n\n")
240+
241+
def remove_from_wtable(wtable, val):
    """Return the width table with codepoint *val* excised.

    The range containing *val* is dropped, shrunk, or split in two as
    needed; every other entry is kept.  The input list is consumed
    (entries are popped off it), matching the original implementation.
    """
    result = []
    while wtable:
        head_lo, head_hi = wtable[0][0], wtable[0][1]
        if head_hi < val:
            # entirely below val: keep unchanged
            result.append(wtable.pop(0))
        elif head_lo > val:
            # past val: nothing left to edit
            break
        else:
            lo, hi, w, w_cjk = wtable.pop(0)
            if lo == hi == val:
                # single-codepoint range: drop it outright
                pass
            elif lo == val:
                result.append((lo + 1, hi, w, w_cjk))
            elif hi == val:
                result.append((lo, hi - 1, w, w_cjk))
            else:
                # val is strictly inside: split the range around it
                result.append((lo, val - 1, w, w_cjk))
                result.append((val + 1, hi, w, w_cjk))
    result.extend(wtable)
    return result
262+
263+
264+
265+
def optimize_width_table(wtable):
    """Collapse adjacent width-table entries with identical widths.

    Two consecutive entries merge when their ranges are contiguous AND
    both the non-CJK and CJK width columns match.  The input list is
    consumed.  Returns [] for empty input (original raised IndexError).

    BUG FIX: the original compared w_this[2:3] (non-CJK width only), so
    contiguous ranges with differing CJK widths were merged and the
    first range's CJK width silently discarded; compare [2:] instead.
    """
    if not wtable:
        return []
    wtable_out = []
    w_this = wtable.pop(0)
    while wtable:
        if w_this[1] == wtable[0][0] - 1 and w_this[2:] == wtable[0][2:]:
            # contiguous and same (width, width_cjk): extend current range
            w_tmp = wtable.pop(0)
            w_this = (w_this[0], w_tmp[1], w_tmp[2], w_tmp[3])
        else:
            wtable_out.append(w_this)
            w_this = wtable.pop(0)
    wtable_out.append(w_this)
    return wtable_out
277+
278+
if __name__ == "__main__":
    # Regenerate tables.rs from scratch from the Unicode data files.
    r = "tables.rs"
    if os.path.exists(r):
        os.remove(r)
    with open(r, "w") as rf:
        # write the file's preamble
        rf.write(preamble)

        # download and parse all the data
        fetch("ReadMe.txt")
        with open("ReadMe.txt") as readme:
            # ReadMe.txt states which Unicode version the data files describe
            pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
            unicode_version = re.search(pattern, readme.read()).groups()
        rf.write("""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode_charwidth is based on.
pub const UNICODE_VERSION: (u64, u64, u64) = (%s, %s, %s);

""" % unicode_version)
        gencats = load_unicode_data("UnicodeData.txt")

        ### character width module
        # enclosing/nonspacing marks and format chars are zero width.
        # (the py2-only tuple-unpacking lambdas -- a SyntaxError on
        # Python 3 -- are replaced with generator expressions)
        width_table = []
        for zwcat in ["Me", "Mn", "Cf"]:
            width_table.extend((lo, hi, 0, 0) for (lo, hi) in gencats[zwcat])
        width_table.append((4448, 4607, 0, 0))

        # get widths, except those that are explicitly marked zero-width above
        ea_widths = load_east_asian_width(["W", "F", "A"], ["Me", "Mn", "Cf"])
        # these are doublewidth
        for dwcat in ["W", "F"]:
            width_table.extend((lo, hi, 2, 2) for (lo, hi) in ea_widths[dwcat])
        # "Ambiguous" chars: single width normally, double width in CJK context
        width_table.extend((lo, hi, 1, 2) for (lo, hi) in ea_widths["A"])

        width_table.sort(key=lambda w: w[0])

        # soft hyphen is not zero width in preformatted text; it's used to indicate
        # a hyphen inserted to facilitate a linebreak.
        width_table = remove_from_wtable(width_table, 173)

        # optimize the width table by collapsing adjacent entities when possible
        width_table = optimize_width_table(width_table)
        emit_charwidth_module(rf, width_table)

0 commit comments

Comments
 (0)