cortav  Artifact [3976f4bc78]

Artifact 3976f4bc78944ddaf7dd86953cb7c2c13e11f6deaaaa77d4f20b8ecff65f70de:


-- [ʞ] tools/ucs.lua
--  ~ lexi hale <lexi@hale.su>
--  ? table generator for unicode character classes
--  🄯 AGPLv3


-- output template: a lua chunk that stores the generated table in
-- ss.str.enc.utf8.ranges. the %s is filled in by string.format at the
-- end of this script with the comma-separated range entries
local tpl = [[
local ss = require 'sirsem'
ss.str.enc.utf8.ranges = {%s}
]]

-- build a two-way lookup from a list of symbols: the result maps
-- every value of `syms` to its key and every key back to its value,
-- so enum{'a','b'} gives {a = 1, b = 2, [1] = 'a', [2] = 'b'}
local enum = function(syms)
	local lookup = {}
	for ordinal, symbol in pairs(syms) do
		lookup[symbol] = ordinal -- name -> number
		lookup[ordinal] = symbol -- number -> name
	end
	return lookup
end

-- input source: the file named by the first command line argument
-- if there is one, standard input otherwise
local file = io.stdin
local path
if arg[1] then
	path = arg[1]
	-- io.open returns nil plus a message on failure; assert turns that
	-- into an immediate, descriptive error instead of a nil-index
	-- failure once the file is first read
	file = assert(io.open(path, 'rb'))
end

-- return consecutive single-bit masks as multiple values, starting
-- at 1<<ofs and ending at 1<<(ofs+n). note that this yields n+1
-- values rather than n. ofs defaults to 0
local bitmask_raw = function(n,ofs)
	ofs = ofs or 0
	local bits = { 1<<ofs }
	for i = 1, n do
		bits[#bits + 1] = 1<<(i+ofs)
	end
	return table.unpack(bits)
end

-- assign one bit to each name in the list `tbl`, the first name
-- getting bit `ofs` (default 0) and each later name the next higher
-- bit. the result maps name -> mask and mask -> name. the extra key
-- `true` holds {lowest bit index, highest bit index} of the bits in
-- use; the second entry is nil when `tbl` is empty
local bitmask = function(tbl,ofs)
	ofs = ofs or 0
	local codes = {bitmask_raw(#tbl,ofs)}
	local m = {}
	local maxbit
	for i, s in ipairs(tbl) do
		m[s] = codes[i]
		m[codes[i]] = s
		-- entry i occupies bit ofs+i-1. recording plain `i` here would
		-- understate the top bit whenever ofs is nonzero
		maxbit = ofs + i - 1
	end
	m[true] = {ofs,maxbit}
	return m
end

-- base character classes. enum numbers them in list order, so
-- numeral = 1 through glyph = 7 and the values fit in three bits
local basictype = enum {
	'numeral', 'alpha', 'symbol', 'punct', 'space', 'ctl',
	'glyph', -- hanji
}

-- combinable property flags, one bit apiece. the offset of 3 makes
-- the first flag 1<<3, i.e. the bit just above the basictype values
local props = bitmask({
	'hex';
	'upper'; 'lower'; 'diac';
	'wordbreak'; 'wordsep';
	'disallow';
	'brack'; 'right'; 'left';
	'noprint'; 'superimpose';
}, 3)

-- codepoints whose category is fixed by hand; parsecat consults this
-- before looking at the record's class
local overrides = {
	[0x200B] = props.wordsep | basictype.space, -- database entry is wrong
}

-- mask out irrelevant properties to compactify database.
-- ~0 has every bit set, so as written nothing is removed
local mask = ~0

-- classes whose category value does not depend on anything else
-- in the record
local fixedcat = {
	Nd = basictype.numeral;
	No = basictype.numeral | props.diac;
	Lu = basictype.alpha | props.upper;
	Ll = basictype.alpha | props.lower;
	Lo = basictype.alpha;
	Lt = basictype.alpha;
	Po = basictype.punct | props.wordbreak;
	Sm = basictype.symbol | props.wordsep;
	Ps = basictype.punct | props.brack | props.left;
	Pe = basictype.punct | props.brack | props.right;
	Pc = basictype.symbol;
	Pd = basictype.symbol;
	Sk = basictype.symbol;
	Sc = basictype.symbol;
	So = basictype.glyph;
	Mn = basictype.symbol | props.diac | props.superimpose;
}

-- translate one record into a category value (a basictype number
-- or'd with props flags, then and'ed with `mask`).
--  tbl.codepoint  looked up in `overrides`, which takes precedence
--  tbl.class      two-letter class code of the record
--  tbl.kind       secondary code; only examined for Cc and Zs
-- returns 0 for any class that is not handled
local function parsecat(tbl)
	local c = overrides[tbl.codepoint]
	if not c then
		local class, kind = tbl.class, tbl.kind
		if class == 'Cc' then
			-- control codes are whitespace or real controls, depending
			-- on the secondary code
			if kind == 'S' or kind == 'WS' or kind == 'B' then
				c = basictype.space | props.wordsep
			else
				c = basictype.ctl | props.wordbreak | props.disallow
			end
		elseif class == 'Zs' then
			c = basictype.space
			if kind == 'WS' then
				c = c | props.wordsep
			end
		else
			c = fixedcat[class] or 0
		end
	end
	return c & mask
end

-- hand-maintained categories for the ranuir codepoints; the reader
-- loop further down splices these into the record list.
-- ranuirAlpha lists the codepoints classed as plain alpha,
-- ranuirSpecial the ones that need an explicit category value
local ranuirAlpha = {
	0xe39d, 0xe39f, 0xe3ad, 0xe3af, 0xe3b5, 0xe3b7, 0xe3b9, 0xe3bb,
	0xe3bd, 0xe3be, 0xe3bf, 0xe3c5, 0xe3c7, 0xe3c9, 0xe3cb, 0xe3cc,
	0xe3cd, 0xe3ce, 0xe3cf,
}
local ranuirSpecial = {
	[0xe390] = props.wordsep | basictype.space;
}

-- codepoint -> category for both groups; the specials are applied
-- second so they win over an alpha entry for the same codepoint
local ranuir = {}
for i = 1, #ranuirAlpha do
	ranuir[ranuirAlpha[i]] = basictype.alpha
end
for cp, cat in pairs(ranuirSpecial) do
	ranuir[cp] = cat
end

-- every ranuir codepoint, in ascending order
local ranuirKeys = {}
for cp in pairs(ranuir) do
	ranuirKeys[#ranuirKeys + 1] = cp
end
table.sort(ranuirKeys)

-- read the database line by line and collect one record per
-- codepoint that receives a nonzero category. each line is a list
-- of semicolon-separated fields (presumably the UCD's
-- UnicodeData.txt -- confirm): field 1 is the codepoint in hex,
-- field 2 the name, field 3 the class and field 5 the secondary
-- code that parsecat reads as `kind`
local recs = {}
local ranuirok = false -- have the ranuir records been spliced in yet?

-- append the hand-maintained ranuir records in ascending order.
-- ipairs is required here: ranuirKeys is sorted and pairs makes no
-- promise about traversal order
local function addranuir()
	for _, ri in ipairs(ranuirKeys) do
		recs[#recs + 1] = {
			codepoint = ri;
			cat = ranuir[ri];
		}
	end
	ranuirok = true
end

for ln in file:lines() do
	local v = {}
	-- split on an explicit terminator. a bare '[^;]*' pattern also
	-- produces an empty match after each field, and lua releases
	-- differ on whether gmatch reports it, which can shift the field
	-- indices; terminating every field with ';' avoids the ambiguity
	for s in (ln .. ';'):gmatch('([^;]*);') do
		v[#v + 1] = s
	end
	-- tonumber yields nil for blank or malformed lines; skip those
	-- rather than failing on the comparison below
	local cp = tonumber(v[1], 0x10)
	if cp and cp > 0x7f then -- discard ASCII, we already have that
		local code = {
			codepoint = cp;
			name = v[2];
			class = v[3];
			kind = v[5];
		}
		code.cat = parsecat(code)

		-- the list must stay ordered by codepoint, so the ranuir
		-- records go in just before the first record past 0xe390
		if (not ranuirok) and code.codepoint > 0xe390 then
			addranuir()
		end

		if code.cat ~= 0 then
			recs[#recs + 1] = code
		end
	end
end

-- input that ends before 0xe390 never triggers the splice above;
-- the ranuir records still belong in the table, after everything else
if not ranuirok then addranuir() end

-- release the handle if it was opened by this script
if file ~= io.stdin then file:close() end


-- collapse the record list into {first codepoint, last codepoint,
-- category} ranges. `start` is the first record of the range being
-- built and `last` the most recent record folded into it.
-- NOTE(review): records are merged on category alone and codepoint
-- gaps are not examined, so codepoints that were skipped or never
-- listed between two like records fall inside the range -- confirm
-- that this is intended
local ranges = {}
local last = recs[1]
local start = last
local altern = false -- true while the open range is a mixed-case run
local flush = function()
	-- with no records at all there is no open range to emit
	if start == nil then return end
	local new = {start.codepoint, last.codepoint, last.cat}
	if altern then
		-- a mixed-case run is tagged with both case flags
		new[3] = new[3] | props.upper | props.lower
	end
	table.insert(ranges, new)
	altern = false
end
for _, r in ipairs(recs) do
	if r.cat ~= last.cat then
	-- we can massively compactify this set with one weird trick:
	-- most non-ascii cased character sets are not in AAAAaaaa,
	-- but rather AaAaAa order. so we can look for this simple
	-- pattern and compress it, shaving c. 1/3rd off our dataset
		local ambi = props.upper | props.lower
		-- a mixed-case run may begin only on a range that so far holds
		-- a single uppercase record, and continues for as long as the
		-- categories agree once the case flags are ignored
		if (altern or (start == last and (last.cat & props.upper) ~= 0)) and
			((r.cat &~ ambi) == (last.cat &~ ambi)) then
			altern = true
			last = r
		else
			flush()
			start = r
		end
	elseif altern then
		-- two like-cased records in a row end the alternation
		flush()
		start = r
	end
	last = r
end
flush()

-- expand bitmask
	-- for k,v in pairs(ranges) do
	-- 	local basic = v[3] & ((1<<3) - 1) -- first three bits
	-- 	if basic ~= 0 then
	-- 		v[4] = basictype[basic]
	-- 	end
	-- 	local bitrange = props[true]
	-- 	for j=bitrange[1], bitrange[2] do
	-- 		if (v[3] & (1<<j)) ~= 0 then
	-- 			table.insert(v, props[1<<j])
	-- 		end
	-- 	end
	-- end

-- the data has been collected and formatted in the manner we
-- need; now we just need to emit it as a lua table

-- render each range as a lua table constructor and splice the
-- comma-joined list into the template. ipairs is used because the
-- entries must come out in the order they were built, and pairs
-- does not promise any particular traversal order
local tab = {}
for i, v in ipairs(ranges) do
	tab[i] = string.format('{0x%x,0x%x,%u}', table.unpack(v))
end
io.stdout:write(string.format(tpl, table.concat(tab,',')))