Differences From
Artifact [3976f4bc78]:
21 21 local file = io.stdin
22 22 local path
23 23 if arg[1] then
24 24 path = arg[1]
25 25 file = io.open(path, 'rb')
26 26 end
27 27
28 -local bitmask_raw = function(n,ofs)
29 - ofs = ofs or 0
30 - local function rec(i)
31 - if i > n then return end
32 - return 1<<(i+ofs), rec(i+1)
33 - end
34 - return 1<<ofs, rec(1)
35 -end
36 -
37 -local bitmask = function(tbl,ofs)
38 - local codes = {bitmask_raw(#tbl,ofs)}
39 - local m = {}
40 - local maxbit
41 - for i, s in ipairs(tbl) do
42 - m[s] = codes[i]
43 - m[codes[i]] = s
44 - maxbit = i
45 - end
46 - m[true] = {ofs or 0,maxbit}
47 - return m
48 -end
49 -
50 -local basictype = enum {
51 - 'numeral';
52 - 'alpha';
53 - 'symbol';
54 - 'punct';
55 - 'space';
56 - 'ctl';
57 - 'glyph'; -- hanji
58 -}
59 -local props = bitmask({
60 - 'hex',
61 - 'upper', 'lower', 'diac',
62 - 'wordbreak', 'wordsep',
63 - 'disallow',
64 - 'brack', 'right', 'left',
65 - 'noprint', 'superimpose'
66 -}, 3)
67 -
28 +local ss = require'sirsem'
29 +local basictype = ss.str.charclass
30 +local props = ss.str.charprop
68 31 local overrides = {
69 32 [0x200B] = basictype.space | props.wordsep; -- database entry is wrong
70 33 }
71 34
72 35 local mask = ~0 -- mask out irrelevant properties to compactify database
73 36
74 37 local function parsecat(tbl)
................................................................................
78 41 elseif tbl.class == 'Nd' then c = b.numeral
79 42 elseif tbl.class == 'No' then c = b.numeral | p.diac
80 43 elseif tbl.class == 'Cc' then
81 44 if tbl.kind == 'S'
82 45 or tbl.kind == 'WS'
83 46 or tbl.kind == 'B' then c = b.space | p.wordsep
84 47 else c = b.ctl | p.wordbreak | p.disallow end
85 - elseif tbl.class == 'Lu' then c = b.alpha | p.upper
86 - elseif tbl.class == 'Ll' then c = b.alpha | p.lower
48 + elseif tbl.class == 'Lu' then c = b.letter | p.upper
49 + elseif tbl.class == 'Ll' then c = b.letter | p.lower
87 50 elseif tbl.class == 'Lo'
88 - or tbl.class == 'Lt' then c = b.alpha
51 + or tbl.class == 'Lt' then c = b.letter
89 52 elseif tbl.class == 'Po' then c = b.punct | p.wordbreak
90 53 elseif tbl.class == 'Sm' then c = b.symbol | p.wordsep
91 54 elseif tbl.class == 'Ps' then c = b.punct | p.brack | p.left
92 55 elseif tbl.class == 'Pe' then c = b.punct | p.brack | p.right
93 56 elseif tbl.class == 'Pc'
94 57 or tbl.class == 'Pd'
95 58 or tbl.class == 'Sk'
................................................................................
104 67
105 68 local ranuirAlpha = {0xe39d, 0xe39f, 0xe3ad, 0xe3af, 0xe3b5, 0xe3b7, 0xe3b9, 0xe3bb, 0xe3bd, 0xe3be, 0xe3bf, 0xe3c5, 0xe3c7, 0xe3c9, 0xe3cb, 0xe3cc, 0xe3cd, 0xe3ce, 0xe3cf}
106 69 local ranuirSpecial = {
107 70 [0xe390] = basictype.space | props.wordsep;
108 71 }
109 72
110 73 local ranuir = {}
111 -for _,v in pairs(ranuirAlpha) do ranuir[v] = basictype.alpha end
74 +for _,v in pairs(ranuirAlpha) do ranuir[v] = basictype.letter end
112 75 for k,v in pairs(ranuirSpecial) do ranuir[k] = v end
113 76 local ranuirKeys = {}
114 77 for k in pairs(ranuir) do table.insert(ranuirKeys, k) end
115 78 table.sort(ranuirKeys)
116 79
117 80 local recs = {}
118 81 local ranuirok = false
119 82 for ln in file:lines() do
120 83 local v = {}
121 84 for s in ln:gmatch('[^;]*') do
122 85 table.insert(v, s)
123 86 end
124 87 v[1] = tonumber(v[1],0x10)
125 - if v[1] > 0x7f then -- discard ASCII, we already have that
88 +-- if v[1] > 0x7f then -- discard ASCII, we already have that
126 89 local code = {
127 90 codepoint = v[1];
128 91 name = v[2];
129 92 class = v[3];
130 93 kind = v[5];
131 94 }
132 95 code.cat = parsecat(code)
................................................................................
140 103 end
141 104 ranuirok = true
142 105 end
143 106
144 107 if code.cat ~= 0 then
145 108 table.insert(recs,code)
146 109 end
147 - end
110 +-- end
148 111 end
149 112
150 113
151 114 local ranges = {}
152 115 local last = recs[1]
153 116 local start = last
154 117 local altern = false
................................................................................
179 142 flush()
180 143 start = r
181 144 end
182 145 last = r
183 146 end
184 147 flush()
185 148
186 --- expand bitmask
187 - -- for k,v in pairs(ranges) do
188 - -- local basic = v[3] & ((1<<3) - 1) -- first three bits
189 - -- if basic ~= 0 then
190 - -- v[4] = basictype[basic]
191 - -- end
192 - -- local bitrange = props[true]
193 - -- for j=bitrange[1], bitrange[2] do
194 - -- if (v[3] & (1<<j)) ~= 0 then
195 - -- table.insert(v, props[1<<j])
196 - -- end
197 - -- end
198 - -- end
199 -
200 149 -- the data has been collected and formatted in the manner we
201 150 -- need; now we just need to emit it as a lua table
202 151
203 152 local tab = {}
204 153 local top = 1
205 154 for k,v in pairs(ranges) do
206 155 tab[top] = string.format('{0x%x,0x%x,%u}',table.unpack(v))
207 156 top = top + 1
208 157 end
209 -io.stdout:write(string.format(tpl, table.concat(tab,',')))
158 +io.stdout:write(string.format(tpl, table.concat(tab,',\n')))