-- [ʞ] tools/ucs.lua
-- ~ lexi hale <lexi@hale.su>
-- ? table generator for unicode character classes
-- 🄯 AGPLv3
-- template for the emitted lua source. the single %s is filled in at
-- the end of this script with the comma-separated range entries
local tpl = [[
local ss = require 'sirsem'
ss.str.enc.utf8.ranges = {%s}
]]
-- builds a two-way lookup table out of `syms`: every value is mapped
-- to the key it was stored under, and every key is mapped back to its
-- value. for a plain list of names this gives name → ordinal and
-- ordinal → name in a single table
local enum = function(syms)
	local lut = {}
	for ord, name in pairs(syms) do
		-- forward entry first, then the reverse one
		lut[name] = ord
		lut[ord] = name
	end
	return lut
end
-- input selection: read the database from the file named by the first
-- command-line argument, or from stdin when no argument is given
local file = io.stdin
local path
if arg and arg[1] then
	path = arg[1]
	-- io.open returns nil plus a message on failure; assert turns that
	-- into an immediate, readable error instead of a nil index later
	file = assert(io.open(path, 'rb'))
end
-- returns the single-bit masks 1<<ofs, 1<<(ofs+1), … 1<<(ofs+n) as
-- multiple values (ofs defaults to 0). note that this is n+1 values,
-- one more than the count passed in; `bitmask` only reads the first n
local bitmask_raw = function(n,ofs)
	ofs = ofs or 0
	local bits = {1 << ofs}
	local count = 1
	for i = 1, n do
		count = count + 1
		bits[count] = 1 << (i+ofs)
	end
	return table.unpack(bits, 1, count)
end
-- assigns one bit to each name in the list `tbl`, starting at bit
-- `ofs` (default 0). the result maps name → mask and mask → name.
-- the key `true` holds {lowest bit position, highest bit position}
-- of the bits handed out, so callers can loop over them with
-- `for j = lo, hi do … 1<<j … end`
local bitmask = function(tbl,ofs)
	ofs = ofs or 0
	local codes = {bitmask_raw(#tbl,ofs)}
	local m = {}
	local count = 0
	for i, s in ipairs(tbl) do
		m[s] = codes[i]
		m[codes[i]] = s
		count = i
	end
	-- the upper bound is a bit position, not a count of names: the
	-- last name sits at bit ofs+count-1. an empty list yields an
	-- empty range (hi < lo)
	m[true] = {ofs, ofs + count - 1}
	return m
end
-- basic character types. `enum` numbers them 1 through 7 in the order
-- listed (and maps the numbers back to the names), so a basic type
-- fits in the low three bits of a category value
local basictype = enum {
'numeral';
'alpha';
'symbol';
'punct';
'space';
'ctl';
'glyph'; -- hanji
}
-- property flags, one bit each, that get OR'd onto a basic type to
-- form a category value. the offset of 3 places them at bit 3 and up
-- (hex = 1<<3 … superimpose = 1<<14), clear of the three bits used by
-- the basictype ordinal
local props = bitmask({
'hex',
'upper', 'lower', 'diac',
'wordbreak', 'wordsep',
'disallow',
'brack', 'right', 'left',
'noprint', 'superimpose'
}, 3)
-- categories forced for individual codepoints. parsecat consults this
-- table first and ignores the database fields when an entry exists
-- (U+200B is ZERO WIDTH SPACE)
local overrides = {
[0x200B] = basictype.space | props.wordsep; -- database entry is wrong
}
-- parsecat ANDs every category with this value. all bits are set, so
-- at present nothing is actually masked out
local mask = ~0 -- mask out irrelevant properties to compactify database
-- categories that follow from the class field alone. 'Cc' and 'Zs'
-- are absent because they also depend on the kind field
local classcat = {
	Nd = basictype.numeral;
	No = basictype.numeral | props.diac;
	Lu = basictype.alpha | props.upper;
	Ll = basictype.alpha | props.lower;
	Lo = basictype.alpha;
	Lt = basictype.alpha;
	Po = basictype.punct | props.wordbreak;
	Sm = basictype.symbol | props.wordsep;
	Ps = basictype.punct | props.brack | props.left;
	Pe = basictype.punct | props.brack | props.right;
	Pc = basictype.symbol;
	Pd = basictype.symbol;
	Sk = basictype.symbol;
	Sc = basictype.symbol;
	So = basictype.glyph;
	Mn = basictype.symbol | props.diac | props.superimpose;
}
-- computes the category value (basic type OR'd with property flags,
-- then ANDed with `mask`) for one database record.
--   tbl.codepoint  number, checked against `overrides` first
--   tbl.class      class string such as 'Lu' or 'Zs'
--   tbl.kind       secondary class string such as 'WS' or 'B'
-- (class and kind look like the general-category and bidi-class
-- columns of UnicodeData.txt — TODO confirm against the input used)
-- returns 0 for any class that is not recognised
local function parsecat(tbl)
	local cat = overrides[tbl.codepoint]
	if cat then
		return cat & mask
	end
	local class, kind = tbl.class, tbl.kind
	if class == 'Cc' then
		-- control codes acting as separators or whitespace count as
		-- spaces; every other control code is flagged as disallowed
		if kind == 'S' or kind == 'WS' or kind == 'B' then
			cat = basictype.space | props.wordsep
		else
			cat = basictype.ctl | props.wordbreak | props.disallow
		end
	elseif class == 'Zs' then
		cat = basictype.space
		if kind == 'WS' then
			cat = cat | props.wordsep
		end
	else
		cat = classcat[class] or 0
	end
	return cat & mask
end
-- extra codepoints that are spliced into the record list next to the
-- database entries (presumably glyphs of the author's "ranuir" script
-- living in the private use area — TODO confirm)
local ranuirAlpha = {
	0xe39d, 0xe39f, 0xe3ad, 0xe3af, 0xe3b5, 0xe3b7, 0xe3b9, 0xe3bb,
	0xe3bd, 0xe3be, 0xe3bf, 0xe3c5, 0xe3c7, 0xe3c9, 0xe3cb, 0xe3cc,
	0xe3cd, 0xe3ce, 0xe3cf
}
-- codepoints in the same block that are not plain letters
local ranuirSpecial = {
	[0xe390] = basictype.space | props.wordsep;
}
-- codepoint → category; the special entries are written last so they
-- would win over an alpha entry for the same codepoint
local ranuir = {}
for idx = 1, #ranuirAlpha do
	ranuir[ranuirAlpha[idx]] = basictype.alpha
end
for cp, cat in pairs(ranuirSpecial) do
	ranuir[cp] = cat
end
-- all of the above codepoints in ascending order
local ranuirKeys = {}
for cp in pairs(ranuir) do
	ranuirKeys[#ranuirKeys + 1] = cp
end
table.sort(ranuirKeys)
-- read the database, one semicolon-separated record per line, and
-- collect a record for every non-ascii codepoint that parsecat gives
-- a nonzero category. fields used: 1 = codepoint (hex), 2 = name,
-- 3 = class, 5 = kind
local recs = {}
local ranuirok = false
-- appends the ranuir records to `recs`; only the first call has any
-- effect
local function addranuir()
	if ranuirok then return end
	for _,ri in ipairs(ranuirKeys) do
		table.insert(recs, {
			codepoint = ri;
			cat = ranuir[ri];
		})
	end
	ranuirok = true
end
for ln in file:lines() do
	local v = {}
	-- with a separator appended, every field (empty ones included) is
	-- "the text up to the next semicolon". unlike a bare '[^;]*' this
	-- does not depend on how gmatch treats empty matches
	for s in (ln .. ';'):gmatch('([^;]*);') do
		table.insert(v, s)
	end
	local cp = tonumber(v[1],0x10)
	-- a line that does not begin with a hex codepoint (blank line,
	-- comment) is skipped rather than aborting on a nil comparison
	if cp and cp > 0x7f then -- discard ASCII, we already have that
		local code = {
			codepoint = cp;
			name = v[2];
			class = v[3];
			kind = v[5];
		}
		code.cat = parsecat(code)
		-- the first codepoint past 0xe390 marks the place where the
		-- ranuir records belong in the sequence
		if code.codepoint > 0xe390 then
			addranuir()
		end
		if code.cat ~= 0 then
			table.insert(recs,code)
		end
	end
end
-- the input ended without passing 0xe390: everything collected so far
-- sorts at or below it, so the ranuir records go at the end
addranuir()
if file ~= io.stdin then
	file:close()
end
-- collapse the per-codepoint records into {first, last, category}
-- ranges. consecutive records are merged while their category stays
-- the same.
-- NOTE(review): only categories are compared, never codepoints, so
-- records that were dropped for having category 0 do not break a run
-- and a range can span codepoints that had no record of their own
local ranges = {}
local last = recs[1]   -- most recent record of the current run
local start = last     -- first record of the current run
local altern = false   -- true while the run is an AaAa alternation
-- closes the current run and appends it to `ranges`. does nothing
-- when there were no records at all
local flush = function()
	if start == nil then return end
	local new = {start.codepoint, last.codepoint, last.cat}
	if altern then
		-- an alternating run is stored with both case flags set
		new[3] = new[3] | props.upper | props.lower
	end
	table.insert(ranges, new)
	altern = false
end
local ambi = props.upper | props.lower
for _, r in ipairs(recs) do
	if r.cat ~= last.cat then
		-- we can massively compactify this set with one weird trick:
		-- most non-ascii cased character sets are not in AAAAaaaa,
		-- but rather AaAaAa order. so we can look for this simple
		-- pattern and compress it, shaving c. 1/3rd off our dataset
		--
		-- an alternation may begin only when the run so far is one
		-- single uppercase record. it continues as long as each new
		-- record matches the previous one once the case bits are
		-- ignored.
		-- NOTE(review): that comparison also accepts a caseless record
		-- of the same basic type — confirm this is intended
		local mayalternate = altern
			or (start == last and (last.cat & props.upper) ~= 0)
		if mayalternate and (r.cat & ~ambi) == (last.cat & ~ambi) then
			altern = true
		else
			flush()
			start = r
		end
	elseif altern then
		-- same category twice in a row ends the alternation
		flush()
		start = r
	end
	last = r
end
flush()
-- expand bitmask
-- for k,v in pairs(ranges) do
-- local basic = v[3] & ((1<<3) - 1) -- first three bits
-- if basic ~= 0 then
-- v[4] = basictype[basic]
-- end
-- local bitrange = props[true]
-- for j=bitrange[1], bitrange[2] do
-- if (v[3] & (1<<j)) ~= 0 then
-- table.insert(v, props[1<<j])
-- end
-- end
-- end
-- the data has been collected and formatted in the manner we
-- need; now we just need to emit it as a lua table
local tab = {}
-- ipairs instead of pairs: pairs makes no promise about traversal
-- order, and the entries should be written in the order in which the
-- ranges were collected
for i, v in ipairs(ranges) do
	-- each entry is {first codepoint, last codepoint, category}
	tab[i] = string.format('{0x%x,0x%x,%u}', v[1], v[2], v[3])
end
io.stdout:write(string.format(tpl, table.concat(tab,',')))