cortav  Diff

Differences From Artifact [3976f4bc78]:

To Artifact [cf6aee3c65]:


21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
..
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
...
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
...
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
...
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
-- Input source: read the database from stdin unless a path is given as the
-- first command-line argument.
local file = io.stdin
local path
if arg[1] then
	path = arg[1]
	-- io.open returns nil plus a message on failure; assert turns that into
	-- an immediate, descriptive error instead of a later nil-index failure
	-- when the handle is first used.
	file = assert(io.open(path, 'rb'))
end

-- Produce consecutive single-bit masks as multiple return values.
--   n:   index of the last extra bit to generate (relative to ofs)
--   ofs: bit position of the first mask; defaults to 0
-- Returns 1<<ofs followed by 1<<(ofs+1) ... 1<<(ofs+n), i.e. n+1 values
-- in total (just 1<<ofs when n is less than 1).
local bitmask_raw = function(n,ofs)
	local shift = ofs or 0
	local masks = { 1 << shift }
	for bit = 1, n do
		masks[bit + 1] = 1 << (bit + shift)
	end
	return table.unpack(masks)
end

-- Build a two-way lookup between flag names and single-bit masks.
--   tbl: sequence of flag names; the i-th name is assigned bit (ofs + i - 1)
--   ofs: bit position of the first flag; defaults to 0
-- Returns a table m where
--   m[name]  is the mask for that name,
--   m[mask]  is the name for that mask,
--   m[true]  is {lowest, highest}: the inclusive range of bit positions in
--            use, suitable for `for j = lowest, highest do ... 1<<j ... end`.
--            `highest` is nil when tbl is empty.
local bitmask = function(tbl,ofs)
	ofs = ofs or 0
	local m = {}
	local maxbit
	for i, s in ipairs(tbl) do
		local bit = ofs + i - 1
		local code = 1 << bit
		m[s] = code
		m[code] = s
		-- record the actual bit position, not the entry count, so the
		-- range stays correct when ofs is non-zero
		maxbit = bit
	end
	m[true] = {ofs, maxbit}
	return m
end

-- Basic character classes. `enum` is defined outside this excerpt;
-- presumably it maps each name to a small integer in listing order
-- (TODO confirm). The values are OR-ed together with `props` flags below,
-- so the order of entries here determines the emitted category numbers.
local basictype = enum {
	'numeral';
	'alpha';
	'symbol';
	'punct';
	'space';
	'ctl';
	'glyph'; -- hanji
}
-- Per-character property flags, one bit each. The offset of 3 starts the
-- flags at bit 3, leaving bits 0-2 free for the basic class value.
-- Reordering the names changes every flag's bit position.
local props = bitmask({
	'hex',
	'upper', 'lower', 'diac',
	'wordbreak', 'wordsep',
	'disallow',
	'brack', 'right', 'left',
	'noprint', 'superimpose'
}, 3)

-- Hand-written category values keyed by codepoint, for entries where the
-- source database is known to be wrong. Where these are consulted is not
-- visible in this excerpt.
local overrides = {
	[0x200B] = basictype.space | props.wordsep; -- database entry is wrong
}

-- All bits set, so as written no property would be masked out.
local mask = ~0 -- mask out irrelevant properties to compactify database

local function parsecat(tbl)
................................................................................
	elseif tbl.class == 'Nd' then c = b.numeral
	elseif tbl.class == 'No' then c = b.numeral | p.diac
	elseif tbl.class == 'Cc' then
		if tbl.kind == 'S'
		or tbl.kind == 'WS'
		or tbl.kind == 'B' then c  = b.space | p.wordsep
      else c = b.ctl | p.wordbreak | p.disallow end
	elseif tbl.class == 'Lu' then c = b.alpha | p.upper
	elseif tbl.class == 'Ll' then c = b.alpha | p.lower
	elseif tbl.class == 'Lo'
	    or tbl.class == 'Lt' then c = b.alpha
	elseif tbl.class == 'Po' then c = b.punct | p.wordbreak
	elseif tbl.class == 'Sm' then c = b.symbol | p.wordsep
	elseif tbl.class == 'Ps' then c = b.punct | p.brack | p.left
	elseif tbl.class == 'Pe' then c = b.punct | p.brack | p.right
	elseif tbl.class == 'Pc'
	    or tbl.class == 'Pd'
	    or tbl.class == 'Sk'
................................................................................

-- Codepoints in the U+E3xx range that are categorised by hand: the listed
-- ones are plain alphabetic characters, and `ranuirSpecial` carries any
-- codepoint that needs a different category value.
local ranuirAlpha = {
	0xe39d, 0xe39f, 0xe3ad, 0xe3af, 0xe3b5, 0xe3b7, 0xe3b9, 0xe3bb,
	0xe3bd, 0xe3be, 0xe3bf, 0xe3c5, 0xe3c7, 0xe3c9, 0xe3cb, 0xe3cc,
	0xe3cd, 0xe3ce, 0xe3cf,
}
local ranuirSpecial = {
	[0xe390] = basictype.space | props.wordsep,
}

-- Merge both lists into one codepoint -> category map; special entries are
-- applied last so they win over the alphabetic default.
local ranuir = {}
for i = 1, #ranuirAlpha do
	ranuir[ranuirAlpha[i]] = basictype.alpha
end
for cp, cat in pairs(ranuirSpecial) do
	ranuir[cp] = cat
end

-- Ascending list of every codepoint in the map.
local ranuirKeys = {}
for cp in pairs(ranuir) do
	ranuirKeys[#ranuirKeys + 1] = cp
end
table.sort(ranuirKeys)

local recs = {}
local ranuirok = false
for ln in file:lines() do
	local v = {}
	for s in ln:gmatch('[^;]*') do
		table.insert(v, s)
	end
	v[1] = tonumber(v[1],0x10)
	if v[1] > 0x7f then -- discard ASCII, we already have that
		local code = {
			codepoint = v[1];
			name = v[2];
			class = v[3];
			kind = v[5];
		}
		code.cat = parsecat(code)
................................................................................
			end
			ranuirok = true
		end

		if code.cat ~= 0 then
			table.insert(recs,code)
		end
	end
end


local ranges = {}
local last = recs[1]
local start = last
local altern = false
................................................................................
		flush()
		start = r
	end
	last = r
end
flush()

-- expand bitmask
	-- for k,v in pairs(ranges) do
	-- 	local basic = v[3] & ((1<<3) - 1) -- first three bits
	-- 	if basic ~= 0 then
	-- 		v[4] = basictype[basic]
	-- 	end
	-- 	local bitrange = props[true]
	-- 	for j=bitrange[1], bitrange[2] do
	-- 		if (v[3] & (1<<j)) ~= 0 then
	-- 			table.insert(v, props[1<<j])
	-- 		end
	-- 	end
	-- end

-- the data has been collected and formatted in the manner we
-- need; now we just need to emit it as a lua table

-- Render each range as a `{0x<first>,0x<second>,<third>}` literal, join the
-- literals with commas and substitute the result into the `tpl` template
-- (defined outside this excerpt) on stdout.
-- NOTE(review): pairs() does not guarantee traversal order; if the consumer
-- of the generated table relies on ascending ranges, confirm that `ranges`
-- is a plain sequence and consider ipairs().
local tab = {}
local top = 1
for _, range in pairs(ranges) do
	local first, second, third = table.unpack(range)
	local entry = string.format('{0x%x,0x%x,%u}', first, second, third)
	tab[top] = entry
	top = top + 1
end
local body = table.concat(tab, ',')
io.stdout:write(string.format(tpl, body))







|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<
|
<
<
<
<
<
<
<
<







 







|
|

|







 







|













|







 







|







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<









|
21
22
23
24
25
26
27
28





















29








30








31
32
33
34
35
36
37
..
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
..
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
...
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
...
142
143
144
145
146
147
148














149
150
151
152
153
154
155
156
157
158
-- Input source: read the database from stdin unless a path is given as the
-- first command-line argument.
local file = io.stdin
local path
if arg[1] then
	path = arg[1]
	-- io.open returns nil plus a message on failure; assert turns that into
	-- an immediate, descriptive error instead of a later nil-index failure
	-- when the handle is first used.
	file = assert(io.open(path, 'rb'))
end

local ss = require'sirsem'





















-- Character class values, taken from the `sirsem` module (`ss`). This
-- excerpt reads the fields `space` and `letter` from it.
local basictype = ss.str.charclass








-- Character property flags, likewise taken from `sirsem`; they are OR-ed
-- onto a class value to form a category number.
local props = ss.str.charprop








-- Hand-written category values keyed by codepoint, for entries where the
-- source database is known to be wrong. Where these are consulted is not
-- visible in this excerpt.
local overrides = {
	[0x200B] = basictype.space | props.wordsep; -- database entry is wrong
}

-- All bits set, so as written no property would be masked out.
local mask = ~0 -- mask out irrelevant properties to compactify database

local function parsecat(tbl)
................................................................................
	elseif tbl.class == 'Nd' then c = b.numeral
	elseif tbl.class == 'No' then c = b.numeral | p.diac
	elseif tbl.class == 'Cc' then
		if tbl.kind == 'S'
		or tbl.kind == 'WS'
		or tbl.kind == 'B' then c  = b.space | p.wordsep
      else c = b.ctl | p.wordbreak | p.disallow end
	elseif tbl.class == 'Lu' then c = b.letter | p.upper
	elseif tbl.class == 'Ll' then c = b.letter | p.lower
	elseif tbl.class == 'Lo'
	    or tbl.class == 'Lt' then c = b.letter
	elseif tbl.class == 'Po' then c = b.punct | p.wordbreak
	elseif tbl.class == 'Sm' then c = b.symbol | p.wordsep
	elseif tbl.class == 'Ps' then c = b.punct | p.brack | p.left
	elseif tbl.class == 'Pe' then c = b.punct | p.brack | p.right
	elseif tbl.class == 'Pc'
	    or tbl.class == 'Pd'
	    or tbl.class == 'Sk'
................................................................................

-- Codepoints in the U+E3xx range that are categorised by hand: the listed
-- ones are plain letters, and `ranuirSpecial` carries any codepoint that
-- needs a different category value.
local ranuirAlpha = {
	0xe39d, 0xe39f, 0xe3ad, 0xe3af, 0xe3b5, 0xe3b7, 0xe3b9, 0xe3bb,
	0xe3bd, 0xe3be, 0xe3bf, 0xe3c5, 0xe3c7, 0xe3c9, 0xe3cb, 0xe3cc,
	0xe3cd, 0xe3ce, 0xe3cf,
}
local ranuirSpecial = {
	[0xe390] = basictype.space | props.wordsep,
}

-- Merge both lists into one codepoint -> category map; special entries are
-- applied last so they win over the letter default.
local ranuir = {}
for i = 1, #ranuirAlpha do
	ranuir[ranuirAlpha[i]] = basictype.letter
end
for cp, cat in pairs(ranuirSpecial) do
	ranuir[cp] = cat
end

-- Ascending list of every codepoint in the map.
local ranuirKeys = {}
for cp in pairs(ranuir) do
	ranuirKeys[#ranuirKeys + 1] = cp
end
table.sort(ranuirKeys)

local recs = {}
local ranuirok = false
for ln in file:lines() do
	local v = {}
	for s in ln:gmatch('[^;]*') do
		table.insert(v, s)
	end
	v[1] = tonumber(v[1],0x10)
-- 	if v[1] > 0x7f then -- discard ASCII, we already have that
		local code = {
			codepoint = v[1];
			name = v[2];
			class = v[3];
			kind = v[5];
		}
		code.cat = parsecat(code)
................................................................................
			end
			ranuirok = true
		end

		if code.cat ~= 0 then
			table.insert(recs,code)
		end
-- 	end
end


local ranges = {}
local last = recs[1]
local start = last
local altern = false
................................................................................
		flush()
		start = r
	end
	last = r
end
flush()















-- the data has been collected and formatted in the manner we
-- need; now we just need to emit it as a lua table

-- Render each range as a `{0x<first>,0x<second>,<third>}` literal, join the
-- literals with a comma and newline, and substitute the result into the
-- `tpl` template (defined outside this excerpt) on stdout.
-- NOTE(review): pairs() does not guarantee traversal order; if the consumer
-- of the generated table relies on ascending ranges, confirm that `ranges`
-- is a plain sequence and consider ipairs().
local tab = {}
local top = 1
for _, range in pairs(ranges) do
	local first, second, third = table.unpack(range)
	local entry = string.format('{0x%x,0x%x,%u}', first, second, third)
	tab[top] = entry
	top = top + 1
end
local body = table.concat(tab, ',\n')
io.stdout:write(string.format(tpl, body))