21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
..
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
...
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
...
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
...
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
|
-- Input source: the file named by the first command-line argument, or
-- standard input when no argument is given.
local file = io.stdin
local path
if arg[1] then
	path = arg[1]
	-- io.open returns nil plus a message on failure; assert turns that into
	-- an immediate, descriptive error instead of a nil-index error when the
	-- handle is first used.
	file = assert(io.open(path, 'rb'))
end
-- Return n+1 consecutive single-bit values as multiple results:
-- 1<<ofs, 1<<(ofs+1), ..., 1<<(ofs+n). `ofs` defaults to 0.
local bitmask_raw = function(n,ofs)
	local base = ofs or 0
	-- the value for the base bit is always produced, even when n is 0
	local bits = { 1 << base }
	for step = 1, n do
		bits[step + 1] = 1 << (step + base)
	end
	return table.unpack(bits)
end
-- Build a two-way lookup between flag names and single-bit values.
--   tbl: sequence of flag names; the i-th name gets bit index (ofs + i - 1)
--   ofs: index of the lowest bit to assign (defaults to 0)
-- Returns a table m where m[name] is the bit value, m[bit value] is the
-- name, and m[true] is {lowest bit index, highest bit index}, so a caller
-- can visit every assigned bit with `for j = m[true][1], m[true][2]`.
local bitmask = function(tbl,ofs)
	local base = ofs or 0
	local m = {}
	-- start one below the base so an empty list yields an empty bit range
	local maxbit = base - 1
	for i, s in ipairs(tbl) do
		local bit = base + i - 1
		local code = 1 << bit
		m[s] = code
		m[code] = s
		-- record the bit index itself, not the element count, so the upper
		-- bound stays correct when ofs is non-zero
		maxbit = bit
	end
	m[true] = {base, maxbit}
	return m
end
-- Basic character classes. `enum` is defined outside this excerpt;
-- presumably it maps each name to a small integer code -- TODO confirm.
-- The commented-out expansion code near the end of the file treats the
-- low three bits of a category value as the basic type.
local basictype = enum {
'numeral';
'alpha';
'symbol';
'punct';
'space';
'ctl';
'glyph'; -- hanji
}
-- Property flags that are OR-ed together with a basic type. The offset of
-- 3 puts the first flag ('hex') at bit 3. Each name's position in the list
-- determines its bit, so entries must not be reordered.
local props = bitmask({
'hex',
'upper', 'lower', 'diac',
'wordbreak', 'wordsep',
'disallow',
'brack', 'right', 'left',
'noprint', 'superimpose'
}, 3)
-- Hand-written categories keyed by codepoint (U+200B is ZERO WIDTH SPACE).
-- NOTE(review): presumably used in place of the category parsed from the
-- database; the code that consults this table is not visible here.
local overrides = {
[0x200B] = basictype.space | props.wordsep; -- database entry is wrong
}
-- All bits set, i.e. nothing is masked out by default.
local mask = ~0 -- mask out irrelevant properties to compactify database
local function parsecat(tbl)
................................................................................
elseif tbl.class == 'Nd' then c = b.numeral
elseif tbl.class == 'No' then c = b.numeral | p.diac
elseif tbl.class == 'Cc' then
if tbl.kind == 'S'
or tbl.kind == 'WS'
or tbl.kind == 'B' then c = b.space | p.wordsep
else c = b.ctl | p.wordbreak | p.disallow end
elseif tbl.class == 'Lu' then c = b.alpha | p.upper
elseif tbl.class == 'Ll' then c = b.alpha | p.lower
elseif tbl.class == 'Lo'
or tbl.class == 'Lt' then c = b.alpha
elseif tbl.class == 'Po' then c = b.punct | p.wordbreak
elseif tbl.class == 'Sm' then c = b.symbol | p.wordsep
elseif tbl.class == 'Ps' then c = b.punct | p.brack | p.left
elseif tbl.class == 'Pe' then c = b.punct | p.brack | p.right
elseif tbl.class == 'Pc'
or tbl.class == 'Pd'
or tbl.class == 'Sk'
................................................................................
-- Codepoints (all in the Unicode private-use area) that are classed as
-- alphabetic.
local ranuirAlpha = {
	0xe39d, 0xe39f, 0xe3ad, 0xe3af, 0xe3b5, 0xe3b7, 0xe3b9, 0xe3bb,
	0xe3bd, 0xe3be, 0xe3bf, 0xe3c5, 0xe3c7, 0xe3c9, 0xe3cb, 0xe3cc,
	0xe3cd, 0xe3ce, 0xe3cf,
}
-- Codepoints that need a category other than plain alphabetic.
local ranuirSpecial = {
	[0xe390] = basictype.space | props.wordsep,
}
-- Merged codepoint -> category map; special entries are applied last so
-- they win over the alphabetic default.
local ranuir = {}
do
	local alphaCat = basictype.alpha
	for idx = 1, #ranuirAlpha do
		ranuir[ranuirAlpha[idx]] = alphaCat
	end
	for codepoint, cat in pairs(ranuirSpecial) do
		ranuir[codepoint] = cat
	end
end
-- Every codepoint of the merged map, in ascending order.
local ranuirKeys = {}
for codepoint in pairs(ranuir) do
	ranuirKeys[#ranuirKeys + 1] = codepoint
end
table.sort(ranuirKeys)
local recs = {}
local ranuirok = false
for ln in file:lines() do
local v = {}
for s in ln:gmatch('[^;]*') do
table.insert(v, s)
end
v[1] = tonumber(v[1],0x10)
if v[1] > 0x7f then -- discard ASCII, we already have that
local code = {
codepoint = v[1];
name = v[2];
class = v[3];
kind = v[5];
}
code.cat = parsecat(code)
................................................................................
end
ranuirok = true
end
if code.cat ~= 0 then
table.insert(recs,code)
end
end
end
local ranges = {}
local last = recs[1]
local start = last
local altern = false
................................................................................
flush()
start = r
end
last = r
end
flush()
-- expand bitmask
-- for k,v in pairs(ranges) do
-- local basic = v[3] & ((1<<3) - 1) -- first three bits
-- if basic ~= 0 then
-- v[4] = basictype[basic]
-- end
-- local bitrange = props[true]
-- for j=bitrange[1], bitrange[2] do
-- if (v[3] & (1<<j)) ~= 0 then
-- table.insert(v, props[1<<j])
-- end
-- end
-- end
-- the data has been collected and formatted in the manner we
-- need; now we just need to emit it as a lua table
-- Render each range as a `{0x<hex>,0x<hex>,<unsigned>}` literal, join the
-- literals with commas and substitute the result into the output template
-- `tpl` (defined outside this excerpt), writing it to standard output.
local tab = {}
for _, range in pairs(ranges) do
	tab[#tab + 1] = ('{0x%x,0x%x,%u}'):format(table.unpack(range))
end
local body = table.concat(tab, ',')
io.stdout:write(string.format(tpl, body))
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
|
|
|
|
|
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
|
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
..
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
..
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
...
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
...
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
|
-- Input source: the file named by the first command-line argument, or
-- standard input when no argument is given.
local file = io.stdin
local path
if arg[1] then
	path = arg[1]
	-- io.open returns nil plus a message on failure; assert turns that into
	-- an immediate, descriptive error instead of a nil-index error when the
	-- handle is first used.
	file = assert(io.open(path, 'rb'))
end
-- Character class and property tables come from the `sirsem` module.
-- NOTE(review): this code only relies on them being indexable by name and
-- yielding integers that can be OR-ed together; their exact layout is
-- defined in that module and is not visible here.
local ss = require'sirsem'
local basictype = ss.str.charclass
local props = ss.str.charprop
-- Hand-written categories keyed by codepoint (U+200B is ZERO WIDTH SPACE).
-- NOTE(review): presumably used in place of the category parsed from the
-- database; the code that consults this table is not visible here.
local overrides = {
[0x200B] = basictype.space | props.wordsep; -- database entry is wrong
}
-- All bits set, i.e. nothing is masked out by default.
local mask = ~0 -- mask out irrelevant properties to compactify database
local function parsecat(tbl)
................................................................................
elseif tbl.class == 'Nd' then c = b.numeral
elseif tbl.class == 'No' then c = b.numeral | p.diac
elseif tbl.class == 'Cc' then
if tbl.kind == 'S'
or tbl.kind == 'WS'
or tbl.kind == 'B' then c = b.space | p.wordsep
else c = b.ctl | p.wordbreak | p.disallow end
elseif tbl.class == 'Lu' then c = b.letter | p.upper
elseif tbl.class == 'Ll' then c = b.letter | p.lower
elseif tbl.class == 'Lo'
or tbl.class == 'Lt' then c = b.letter
elseif tbl.class == 'Po' then c = b.punct | p.wordbreak
elseif tbl.class == 'Sm' then c = b.symbol | p.wordsep
elseif tbl.class == 'Ps' then c = b.punct | p.brack | p.left
elseif tbl.class == 'Pe' then c = b.punct | p.brack | p.right
elseif tbl.class == 'Pc'
or tbl.class == 'Pd'
or tbl.class == 'Sk'
................................................................................
-- Codepoints (all in the Unicode private-use area) that are classed as
-- letters.
local ranuirAlpha = {
	0xe39d, 0xe39f, 0xe3ad, 0xe3af, 0xe3b5, 0xe3b7, 0xe3b9, 0xe3bb,
	0xe3bd, 0xe3be, 0xe3bf, 0xe3c5, 0xe3c7, 0xe3c9, 0xe3cb, 0xe3cc,
	0xe3cd, 0xe3ce, 0xe3cf,
}
-- Codepoints that need a category other than plain letter.
local ranuirSpecial = {
	[0xe390] = basictype.space | props.wordsep,
}
-- Merged codepoint -> category map; special entries are applied last so
-- they win over the letter default.
local ranuir = {}
do
	local letterCat = basictype.letter
	for idx = 1, #ranuirAlpha do
		ranuir[ranuirAlpha[idx]] = letterCat
	end
	for codepoint, cat in pairs(ranuirSpecial) do
		ranuir[codepoint] = cat
	end
end
-- Every codepoint of the merged map, in ascending order.
local ranuirKeys = {}
for codepoint in pairs(ranuir) do
	ranuirKeys[#ranuirKeys + 1] = codepoint
end
table.sort(ranuirKeys)
local recs = {}
local ranuirok = false
for ln in file:lines() do
local v = {}
for s in ln:gmatch('[^;]*') do
table.insert(v, s)
end
v[1] = tonumber(v[1],0x10)
-- if v[1] > 0x7f then -- discard ASCII, we already have that
local code = {
codepoint = v[1];
name = v[2];
class = v[3];
kind = v[5];
}
code.cat = parsecat(code)
................................................................................
end
ranuirok = true
end
if code.cat ~= 0 then
table.insert(recs,code)
end
-- end
end
local ranges = {}
local last = recs[1]
local start = last
local altern = false
................................................................................
flush()
start = r
end
last = r
end
flush()
-- the data has been collected and formatted in the manner we
-- need; now we just need to emit it as a lua table
-- Render each range as a `{0x<hex>,0x<hex>,<unsigned>}` literal, join the
-- literals one per line and substitute the result into the output template
-- `tpl` (defined outside this excerpt), writing it to standard output.
local tab = {}
for _, range in pairs(ranges) do
	tab[#tab + 1] = ('{0x%x,0x%x,%u}'):format(table.unpack(range))
end
local body = table.concat(tab, ',\n')
io.stdout:write(string.format(tpl, body))
|