@@ -9,8 +9,53 @@
 -- ELS External Linguistics Subdirectorate
 --
+-- WSO Worlds Security Overdirectorate
 -- EID External Influence Directorate ]
+--# parvan
+--
+--## orthographies
+-- parvan supports encoding words using multiple
+-- orthographies. every database has a "primary"
+-- orthography, which must be Unicode- or ASCII-
+-- compatible, and which is used as the basis of the
+-- uniform object paths. other orthographies can be
+-- managed with the [$script] command, which can set
+-- how they are displayed to the user. every word
+-- can have zero or more representations mapped to a
+-- particular orthography.
+--
+--## file format
+-- parvan defines two separate file formats, both
+-- representations of a dictionary. one, the "working
+-- format", is binary; the other, the "exchange format",
+-- consists of UTF-8 codepoint sequences and can be
+-- (to some degree) written and read by human beings,
+-- though its primary purpose is as a line-based format
+-- that allows parvan dictionaries to be managed with
+-- conventional source control solutions like fossil
+-- or git.
+--
+--## magic numbers
+-- all parvan files share the same 4-byte header. it
+-- consists of the sequence
+-- [$ 0x50 0x56 $VERS $SUBTYPE ]
+-- where [$$VERS] is a byte that is altered whenever a
+-- breaking change is made to the format. [$$SUBTYPE]
+-- indicates whether the file is binary- or text-based.
+-- the byte 0x20 indicates an exchange file, while
+-- 0x02 indicates a binary database.
+--
+--## extensions
+-- parvan recommends the use of the extension [$*.pv]
+-- for its binary databases, and [$*.pvx] for the
+-- exchange format.
+--
+--## styled text
+-- text in parvan documents should be written using
+-- cortav syntax. at some future time this will
+-- hopefully be used to generate styled output where
+-- possible.
+
 local function implies(a,b) return a==b or not(a) end
 local function map(lst,fn)
 	local new = {}
@@ -53,8 +98,20 @@
 	i = i + sc
 		iter(...)
 	end
 	iter(...)
+end
+local function mergeD(dest, tbl, ...)
+	if tbl == nil then return dest end
+	for k,v in pairs(tbl) do dest[k] = v end
+	return mergeD(dest, ...)
+end
+local function merge(...)
+	local t = {}
+	return mergeD(t, ...)
+end
+local function copy(t,...)
+	if t ~= nil then return merge(t), copy(...) end
 end
 local function fastDelete(table,idx)
 	-- delete without preserving table order
 	local l = #table
@@ -66,8 +123,22 @@
 	local new = {}
 	tcatD(new, ...)
 	return new
 end
+local function iota(n,m)
+	if not m then return iota(1,n) end
+	if n == m then return n end
+	return n, iota(n+1,m)
+end
+local function keys(m)
+	local i,ks = 1,{}
+	for k in next, m do
+		ks[i] = k
+		i = i + 1
+	end
+	return ks
+end
+
 local ansi = {
 	levels = {
 		plain = 0;
 		ansi = 1;
@@ -115,9 +186,9 @@
 	for k,v in pairs(ansi.seqs) do
 		local lvl, on, off = table.unpack(v)
 		if lvl <= cl then
 			f[k] = function(s)
-				return esc..on .. s .. esc..off
+				return (esc..on) .. s .. (esc..off)
 			end
 		else f[k] = id end
 	end
 	local function ftoi(r,g,b)
@@ -125,13 +196,19 @@
 			math.ceil(g*0xff),
 			math.ceil(b*0xff)
 	end
 	local reset = "\27[39m"
-	function f.color(str, n, br)
-		return string.format("\27[%s%cm",
-			(bg and 4 or 3) +
-			(br and 6 or 0), 0x30+n)
-			.. str .. reset
+	function f.color(str, n, bright)
+		if n<=7 then
+			return string.format("\27[%s%cm",
+				(bg and 4 or 3) +
+				(bright and 6 or 0), 0x30+n)
+				.. str .. reset
+		else
+			return string.format("\27[%c8;5;%sm",
+				(bg and 0x34 or 0x33), n)
+				.. str .. 
reset + end end function f.resetLine() return '\27[1K\13' end @@ -298,8 +375,41 @@ return vals end; } end + +fmt.any = function(struct) + local map, keylist = {},{} + for i,v in ipairs(struct) do + if type(v) ~= 'string' then + map[v[1]] = v[2] + v = v[1] + else + map[v] = true + end + table.insert(keylist, v) + end + local tdisc = fmt.enum(table.unpack(keylist)) + return { + encode = function(a) + if type(a) == 'string' and map[a] == true then + return marshal(tdisc, a) + else + local tname, obj = table.unpack(a) + assert(map[tname] ~= true, '`any` enumeration '..tostring(tname)..' has no associated struct') + return marshal(tdisc, tname) .. + marshal(map[tname], obj) + end + end; + decode = function(s) + local tname = parse(tdisc, s) + if map[tname] ~= true then + local obj = parse(map[tname], s) + return {tname,obj} + else return tname end + end; + } +end fmt.map = function(from,to,ity) local ent = fmt.list({ {'key', from}, @@ -318,8 +428,9 @@ local m = {} for _,p in pairs(lst) do m[p.key] = p.val end return m end; + null = function() return {} end; } end fmt.enum = function(...) @@ -341,8 +452,9 @@ } end fmt.uid = fmt.u32 +fmt.blob = fmt.string fmt.relatable = function(ty) return tcat(ty,{ {'rels',fmt.list(fmt.uid,fmt.u16)}; @@ -368,10 +480,26 @@ fmt.phrase = fmt.relatable { {'str',fmt.label}; {'means',fmt.list(fmt.meaning,fmt.u8)}; } + +fmt.ortho = fmt.map(fmt.uid, fmt.blob, fmt.u8) +-- UID <0> is always the UTF-8 representation of the primary ortho + +fmt.writing = { + {'enc',fmt.ortho}; -- if empty, print the morphs in sequence + {'info',fmt.label}; + {'morphs',fmt.list(fmt.uid,fmt.u16)}; +} fmt.def = fmt.relatable { + {'writings', fmt.list(fmt.writing,fmt.u8)}; + -- for japanese-like languages where words that are + -- pronounced/written the same under the indexing + -- orthography have alternate writings that are + -- definition-specific. e.g. words よう and さま + -- would both have a definition written as 様 + -- ordinary languages will have only 1 writing {'part', fmt.u8}; {'branch', fmt.list(fmt.label,fmt.u8)}; {'means', fmt.list(fmt.meaning,fmt.u8)}; {'forms', fmt.map(fmt.u16,fmt.label,fmt.u16)}; @@ -379,8 +507,30 @@ } fmt.word = fmt.relatable { {'defs', fmt.list(fmt.def,fmt.u8)}; + -- store secondary encodings of this word + {'enc', fmt.ortho}; +} + +fmt.orthography = { + {'uid', fmt.uid}; + {'name', fmt.tag}; + {'repr', fmt.any{ + 'utf8'; -- display as utf-8 compatible text + 'opaque'; -- do not display at all; used only by other tools + 'bytes'; -- display as raw hexadecimal bytes + {'int',fmt.u8}; -- display as a series of integers (n=byte len) + {'glyphs',{ -- map to a palette of custom glyphs. + -- treated as 'opaque' in text-only environments + {'glyphs', fmt.list { + {'image',fmt.blob}; + {'name',fmt.tag}; + }}; + {'encoding', fmt.u8}; -- number of bytes per codepoint + {'format',fmt.enum('svg','bmp','png')}; + }}; + }}; } fmt.dictHeader = { {'lang', fmt.tag}; @@ -393,8 +543,9 @@ {'parts', fmt.list(fmt.tag,fmt.u8)}; -- which parts of speech does this form apply to? 
-- leave empty if not relevant },fmt.u16)}; + {'orthographies', fmt.list(fmt.orthography,fmt.u8)} } fmt.relSet = { {'uid', fmt.uid}; @@ -404,42 +555,65 @@ {'kind', fmt.enum('syn','ant','met')}; -- membership is stored in individual objects, using a field -- attached by the 'relatable' template } + +fmt.pair = function(k,v) return { + {'key',k or fmt.tag}; + {'val', v or fmt.blob}; +} end + +fmt.morph = { + {'name',fmt.tag}; + {'enc', fmt.ortho}; + {'meta', fmt.list(fmt.pair(nil,fmt.string),fmt.u16)}; + {'rads', fmt.list(fmt.uid,fmt.u16)}; +} fmt.dict = { {'header', fmt.dictHeader}; {'words', fmt.map(fmt.string,fmt.word)}; {'relsets', fmt.list(fmt.relSet)}; + {'morphs', fmt.map(fmt.uid,fmt.morph)}; } -function marshal(ty, val) +function marshal(ty, val, pvers) + pvers = pvers or 0 if ty.encode then return ty.encode(val) end local ac = {} for idx,fld in ipairs(ty) do - local name, fty = table.unpack(fld) - table.insert(ac, marshal(fty, - assert(val[name], - string.format('marshalling error: missing field %s', name) - ) - )) + local name, fty, vers = table.unpack(fld) + vers = vers or 0 + if pvers >= vers then + table.insert(ac, marshal(fty, + assert(val[name], + string.format('marshalling error: missing field %s', name) + ), + pvers)) + end end return table.concat(ac) end -function parse(ty, stream) +function parse(ty, stream, pvers) + pvers = pvers or 0 if ty.decode then return ty.decode(stream) end local obj = {} for idx,fld in ipairs(ty) do - local name, fty = table.unpack(fld) - obj[name] = parse(fty, stream) + local name, fty, vers, dflt = table.unpack(fld) + vers = vers or 0 + if pvers >= vers then + obj[name] = parse(fty, stream, pvers) + else + obj[name] = dflt + end end return obj end @@ -508,10 +682,12 @@ local function readDict(file) local s = stream(file) local magic = s:next 'c4' - if magic ~= 'PV0\2' then - id10t 'not a parvan file' + if magic == 'PV0 ' then + id10t 'text-based dictionaries must be translated to binary using the `import` command before they can be used' + elseif magic ~= 'PV0\2' then + id10t 'not a parvan0 file' end local d = parse(fmt.dict, s) -- handle atoms for lit,w in pairs(d.words) do @@ -527,9 +703,9 @@ return d end -local function strwords(str) -- here be dragons +local function strwords(str,maxwords) -- here be dragons local wds = {} local w = {} local state, d, quo, dquo = 0,0 local function flush(n,final) @@ -542,8 +718,11 @@ state = n quo = nil dquo = nil d = 0 + if #wds == maxwords then + state = 100 + end end local function isws(c) return c == 0x20 or c == 0x09 or c == 0x0a end @@ -597,19 +776,58 @@ else table.insert(w,cp) end state = state - 10 + elseif state == 100 then -- word limit reached + -- triggered from flush + table.insert(wds, string.sub(str, p)) + return wds end end flush(nil,true) return wds end + +local function strsan(str) + local d,m = 0,0 + local r = {} + local unclosed = {} + local i = 1 + for bytepos,cp in utf8.codes(str) do + -- insert backslashes for characters that would + -- disrupt strwords() parsing + if cp == 0x0a then + table.insert(r, 0x5c) + table.insert(r, 0x6e) i=i+2 + else + if cp == 0x5b then + d = d + 1 + table.insert(unclosed,i) + elseif cp == 0x5d then + if d >= 1 then + d = d - 1 + unclosed[rawlen(unclosed)] = nil + else + table.insert(r, 0x5c) i=i+1 + end + end + table.insert(r, cp) i=i+1 + end + end + for j=#unclosed,1,-1 do + table.insert(r,unclosed[j],0x5c) + end + return '[' .. utf8.char(table.unpack(r)) .. 
']'
+end
 local predicates
 local function parsefilter(str)
 	local f = strwords(str)
 	if #f == 1 then return function(e) return predicates.lit.fn(e,f[1]) end end
-	if not predicates[f[1]] then
+	if not next(f) then
+		-- null predicate matches all
+		return function() return true end
+	elseif not predicates[f[1]] then
 		id10t('no such predicate %s',f[1])
 	else
 		local p = predicates[f[1]].fn
 		return function(e)
@@ -636,16 +854,30 @@
 		pred = parsefilter(pred)
 		if pred(e) then return false end
 		return p_none(e,...)
 	end;
-	local function p_some(e,count,pred,...)
-		if count == 0 then return true end
-		if pred == nil then return false end
-		pred = parsefilter(pred)
-		if pred(e) then
-			count = count-1
+	local function p_some(e,cmp,count,...)
+		local cfn = {
+			eq = function(a,b) return a == b end;
+			ne = function(a,b) return a ~= b end;
+			lt = function(a,b) return a < b end;
+			gt = function(a,b) return a > b end;
+		}
+		if not cfn[cmp] then
+			id10t('[some %s]: invalid comparator', cmp)
 		end
-		return p_some(e,count,...)
+		count = tonumber(count)
+		local function rec(n,pred,...)
+			if pred == nil then
+				return cfn[cmp](n,count)
+			end
+			pred = parsefilter(pred)
+			if pred(e) then
+				n=n+1
+			end
+			return rec(n,...)
+		end
+		return rec(0,...)
 	end;
 	local function prepScan(...)
 		local map = {}
@@ -656,24 +888,95 @@
 predicates = {
 	all = {
 		fn = p_all;
 		syntax = '<pred>…';
-		help = 'every sub-<pred> matches'
+		help = 'every sub-<pred> matches';
 	};
 	any = {
 		fn = p_any;
 		syntax = '<pred>…';
-		help = 'any sub-<pred> matches'
+		help = 'any sub-<pred> matches';
 	};
 	none = {
 		fn = p_none;
 		syntax = '<pred>…';
-		help = 'no sub-<pred> matches'
+		help = 'no sub-<pred> matches (also useful to force evaluation for side effects without creating matches)';
 	};
 	some = {
 		fn = p_some;
-		syntax = '<count> <pred>…';
-		help = '<count> or more sub-<pred>s match'
+		syntax = '(eq|ne|lt|gt) <count> <pred>…';
+		help = '<count> [or more/less] sub-<pred>s match';
+	};
+	seq = {
+		syntax = "<wrap> '[' <arg>… ']' <pred>…";
+		help = 'invoke each sub-<pred> with the same argument list <arg>…, combining the results with <wrap>';
+		fn = function(e,wrap,args,...)
+			local lst = {}
+			local function eval(pred,...)
+				if not pred then return end
+				table.insert(lst, pred .. ' ' .. args)
+				eval(...)
+			end
+			eval(...)
+			local filter = wrap .. ' ' ..table.concat(map(lst, strsan), ' ')
+			return parsefilter(filter)(e)
+		end;
+	};
+	mark = {
+		syntax = '<mark> [<pred>]';
+		help = 'apply <mark> to the words that match <pred>, or all the words that are tested if no <pred> is supplied. use <mark> to visually indicate the reason that a given term matched the query';
+		fn = function(e, val, pred)
+			if pred == nil or parsefilter(pred)(e) then
+				e.mark = e.mark or {}
+				for k,v in pairs(e.mark) do
+					if v==val then return true end
+				end
+				table.insert(e.mark, val)
+				return true
+			end
+		end;
+	};
+	clear = {
+		syntax = '<mark> [<pred>]';
+		help = 'like [mark] but clears marks instead of setting them';
+		fn = function(e, val, pred)
+			if pred == nil or parsefilter(pred)(e) then
+				e.mark = e.mark or {}
+				for k,v in pairs(e.mark) do
+					if v==val then
+						table.remove(e.mark,k)
+						return true
+					end
+				end
+				return true
+			end
+		end;
+	};
+	marked = {
+		syntax = '(by <mark> [pred]|in <pred>)';
+		help = 'tests for an existing <mark> on the result';
+		fn = function(e, mode, val, pred)
+			if mode == 'in' then
+				pred = val val = nil
+				if pred == nil then
+					id10t '[marked in <pred>] requires a predicate'
+				end
+			elseif mode == 'by' then
+				if val == nil then
+					id10t '[marked by <mark>] requires a mark'
+				end
+			else id10t('invalid form [marked %s]', mode) end
+
+			if pred == nil or parsefilter(pred)(e) then
+				if e.mark == nil or not next(e.mark)
+					then return false end
+				if val then
+					for k,v in pairs(e.mark) do
+						if v==val then return true end
+					end
+				else return true end
+			end
+		end;
+	};
 	def = {
 		help = 'word has at least one definition that contains all <str>s';
 		syntax = '<str>…';
@@ -693,26 +996,124 @@
 			return false
 		end;
 	};
 	lit = {
-		help = 'word is, begins with, or ends with <val>';
-		syntax = '<val> [(pfx|sfx)]';
-		fn = function(e,val,op)
+		help = 'word is, begins with, matches, or ends with <val> in