cortav  ucs.lua at [c50482b020]

File tool/ucs.lua artifact cf6aee3c65 part of check-in c50482b020


-- [ʞ] tool/ucs.lua
--  ~ lexi hale <lexi@hale.su>
--  ? table generator for unicode character classes
--  🄯 AGPLv3


-- source text of the generated module. the single %s placeholder is
-- filled with the comma-separated range entries when the table is
-- written to stdout at the end of this script
local tpl = [[
local ss = require 'sirsem'
ss.str.enc.utf8.ranges = {%s}
]]

-- build a two-way lookup table from `syms`: every key maps to its
-- value and every value maps back to its key
--  syms: table to mirror (typically a list of symbol names)
--  returns a new table; `syms` itself is left untouched
local enum = function(syms)
	local lookup = {}
	for index, sym in pairs(syms) do
		lookup[index] = sym
		lookup[sym] = index
	end
	return lookup
end

-- input source: the file named by the first command-line argument when
-- one is given, otherwise standard input
local file = io.stdin
local path
if arg[1] then
	path = arg[1]
	-- io.open returns nil plus a message on failure; assert turns that
	-- into an immediate, readable error instead of a nil-index crash
	-- when the handle is first used
	file = assert(io.open(path, 'rb'))
end

-- character class and character property tables come from the sirsem
-- library; their entries are combined with bitwise OR throughout
local ss = require'sirsem'
local basictype = ss.str.charclass
local props = ss.str.charprop
-- codepoints whose category is forced to the value given here instead
-- of being derived from the database record
local overrides = {
	[0x200B] = basictype.space | props.wordsep; -- database entry is wrong
}

-- every computed category is ANDed with this value; ~0 keeps all bits
local mask = ~0 -- mask out irrelevant properties to compactify database

-- work out the category bitfield for one database record.
--  tbl: record table; tbl.codepoint, tbl.class and tbl.kind are read
--  returns the combined basictype/props bits ANDed with `mask`, or 0
--  when the record's class is not one of those handled below
local function parsecat(tbl)
	-- a hand-written override wins over anything the record says
	local forced = overrides[tbl.codepoint]
	if forced then return forced & mask end

	local class, kind = tbl.class, tbl.kind
	local kinds, flags = basictype, props
	local cat = 0

	if class == 'Lu' then
		cat = kinds.letter | flags.upper
	elseif class == 'Ll' then
		cat = kinds.letter | flags.lower
	elseif class == 'Lo' or class == 'Lt' then
		cat = kinds.letter
	elseif class == 'Nd' then
		cat = kinds.numeral
	elseif class == 'No' then
		cat = kinds.numeral | flags.diac
	elseif class == 'Cc' then
		-- control characters of kind S, WS or B are treated as spacing;
		-- every other control character is flagged as disallowed
		if kind == 'S' or kind == 'WS' or kind == 'B' then
			cat = kinds.space | flags.wordsep
		else
			cat = kinds.ctl | flags.wordbreak | flags.disallow
		end
	elseif class == 'Zs' then
		cat = kinds.space
		if kind == 'WS' then cat = cat | flags.wordsep end
	elseif class == 'Po' then
		cat = kinds.punct | flags.wordbreak
	elseif class == 'Ps' then
		cat = kinds.punct | flags.brack | flags.left
	elseif class == 'Pe' then
		cat = kinds.punct | flags.brack | flags.right
	elseif class == 'Sm' then
		cat = kinds.symbol | flags.wordsep
	elseif class == 'Pc' or class == 'Pd'
	    or class == 'Sk' or class == 'Sc' then
		cat = kinds.symbol
	elseif class == 'So' then
		cat = kinds.glyph
	elseif class == 'Mn' then
		cat = kinds.symbol | flags.diac | flags.superimpose
	end

	return cat & mask
end

-- ranuir codepoints that receive the plain letter category
local ranuirAlpha = {
	0xe39d, 0xe39f, 0xe3ad, 0xe3af, 0xe3b5, 0xe3b7, 0xe3b9, 0xe3bb,
	0xe3bd, 0xe3be, 0xe3bf, 0xe3c5, 0xe3c7, 0xe3c9, 0xe3cb, 0xe3cc,
	0xe3cd, 0xe3ce, 0xe3cf,
}
-- ranuir codepoints with an explicitly spelled-out category
local ranuirSpecial = {
	[0xe390] = basictype.space | props.wordsep;
}

-- merged codepoint -> category map; the special entries are applied
-- last so they take precedence over the letter default
local ranuir = {}
for i = 1, #ranuirAlpha do
	ranuir[ranuirAlpha[i]] = basictype.letter
end
for codepoint, cat in pairs(ranuirSpecial) do
	ranuir[codepoint] = cat
end

-- the mapped codepoints in ascending order
local ranuirKeys = {}
for codepoint in pairs(ranuir) do
	ranuirKeys[#ranuirKeys + 1] = codepoint
end
table.sort(ranuirKeys)

-- read the character database, one semicolon-delimited record per
-- line, and collect every codepoint that parsecat gives a nonzero
-- category, together with the hand-maintained ranuir records
local recs = {}
local ranuirok = false

-- append the ranuir records to the record list. ipairs is used because
-- the sorted key list must be walked in ascending order
local function addranuir()
	for _,ri in ipairs(ranuirKeys) do
		table.insert(recs, {
			codepoint = ri;
			cat = ranuir[ri];
		})
	end
	ranuirok = true
end

for ln in file:lines() do
	-- terminate the last field and capture up to each ';' so the split
	-- does not depend on how gmatch treats empty matches (a bare
	-- '[^;]*' yields extra empty fields on some Lua versions)
	local v = {}
	for s in (ln .. ';'):gmatch('([^;]*);') do
		table.insert(v, s)
	end
	v[1] = tonumber(v[1],0x10)
	-- blank lines and anything else without a hexadecimal codepoint in
	-- the first field are skipped; they would otherwise crash the
	-- numeric comparison below.
	-- (a disabled filter used to discard ASCII here: v[1] > 0x7f)
	if v[1] then
		local code = {
			codepoint = v[1];
			name = v[2];
			class = v[3];
			kind = v[5];
		}
		code.cat = parsecat(code)

		-- splice the ranuir records in just before the first record
		-- that lies above 0xe390
		if (not ranuirok) and code.codepoint > 0xe390 then
			addranuir()
		end

		-- records without any category bits are left out entirely
		if code.cat ~= 0 then
			table.insert(recs,code)
		end
	end
end

-- input that never reaches past 0xe390 would otherwise lose the
-- ranuir records altogether
if not ranuirok then addranuir() end


-- collapse the per-codepoint records into {first, last, category}
-- triples. consecutive records are merged for as long as their
-- categories agree.
-- NOTE(review): only categories are compared, never codepoint
-- adjacency, so codepoints that are absent from recs end up inside
-- the surrounding range — confirm that this is intended
local ranges = {}
local last = recs[1]
local start = last
local altern = false
-- both case bits; a range built from alternating case carries both
local ambi = props.upper | props.lower

-- emit the range currently being built (start .. last) and leave
-- alternating mode. does nothing when no record was ever collected,
-- so empty input yields an empty range list instead of a nil-index
-- error
local flush = function()
	if start == nil then return end
	local new = {start.codepoint, last.codepoint, last.cat}
	if altern then
		new[3] = new[3] | ambi
	end
	table.insert(ranges, new)
	altern = false
end

for _, r in ipairs(recs) do
	if r.cat ~= last.cat then
	-- we can massively compactify this set with one weird trick:
	-- most non-ascii cased character sets are not in AAAAaaaa,
	-- but rather AaAaAa order. so we can look for this simple
	-- pattern and compress it, shaving c. 1/3rd off our dataset
		if (altern or (start == last and (last.cat & props.upper) ~= 0)) and
			((r.cat &~ ambi) == (last.cat &~ ambi)) then
			-- same category apart from the case bits: keep extending
			-- the alternating range
			altern = true
		else
			flush()
			start = r
		end
	elseif altern then
		-- two records with identical category in a row break the
		-- alternation, so the alternating range ends at the previous
		-- record
		flush()
		start = r
	end
	last = r
end
flush()

-- the data has been collected and formatted in the manner we
-- need; now we just need to emit it as a lua table

-- render each {first, last, category} triple as a lua table literal.
-- ipairs walks the range list in index order, which pairs does not
-- promise, so the emitted entries keep the order they were built in
local tab = {}
for i, v in ipairs(ranges) do
	tab[i] = string.format('{0x%x,0x%x,%u}',table.unpack(v))
end
-- drop the joined entries into the module template and print it
io.stdout:write(string.format(tpl, table.concat(tab,',\n')))