Differences From
Artifact [af37224306]:
6 6 -- < Commission for Defense Communication >
7 7 -- +WCO Worlds Culture Overdirectorate
8 8 -- SSD Social Sciences Directorate
9 9 -- ELS External Linguistics Subdirectorate
10 10 -- +WSO Worlds Security Overdirectorate
11 11 -- EID External Influence Directorate ]
12 12
13 +--# parvan
14 +--
15 +--## orthographies
16 +-- parvan supports encoding words using multiple
17 +-- orthographies. every database has a "primary"
18 +-- orthography, which must be Unicode- or ASCII-
19 +-- compatible, and which is used as the basis of the
20 +-- uniform object paths. other orthographies can be
21 +-- managed with the [$script] command, which can set
22 +-- how they are displayed to the user. every word
23 +-- can have zero or more representations mapped to a
24 +-- particular orthography.
25 +--
26 +--## file format
27 +-- parvan defines two separate file formats, both
28 +-- representations of a dictionary. one, the "working
29 +-- format", is binary; the other, the "exchange format",
30 +-- consists of UTF-8 codepoint sequences and can be
31 +-- (to some degree) written and read by human beings,
32 +-- though its primary purpose is as a line-based format
33 +-- that allows parvan dictionaries to be managed with
34 +-- conventional source control solutions like fossil
35 +-- or git.
36 +--
37 +--## magic numbers
38 +-- all parvan files share the same 4-byte header. it
39 +-- is comprised of the sequence
40 +-- [$ 0x50 0x56 $VERS $SUBTYPE ]
41 +-- where [$$VERS] is a byte that is altered whenever a
42 +-- breaking change is made to the format. [$$SUBTYPE]
43 +-- indicates whether the file is binary- or text-based.
44 +-- the byte 0x20 indicates an exchange file, while
45 +-- 0x02 indicates a binary database.
46 +--
47 +--## extensions
48 +-- parvan recommends the use of the extension [$*.pv]
49 +-- for its binary databases, and [$*.pvx] for the
50 +-- exchange format.
51 +--
52 +--## styled text
53 +-- text in parvan documents should be written using
54 +-- cortav syntax. at some future time this will
55 +-- hopefully be used to generate styled output where
56 +-- possible.
57 +
-- Loose implication test: true when both arguments are equal, otherwise
-- true only when the antecedent `a` is falsy (nil or false).
local function implies(a,b)
	if a == b then return true end
	return not a
end
14 59
15 60 local function map(lst,fn)
16 61 local new = {}
17 62 for k,v in pairs(lst) do
18 63 local nv, nk = fn(v,k)
19 64 new[nk or k] = nv
................................................................................
50 95 if src == nil then return end
51 96 local sc = #src
52 97 for j=1,sc do dest[i+j] = src[j] end
53 98 i = i + sc
54 99 iter(...)
55 100 end
56 101 iter(...)
102 +end
-- Destructive merge: copies every key/value pair of each following table
-- into `dest`, left to right, so later tables win on key collisions.
-- Processing stops at the first nil argument. Returns `dest`.
local function mergeD(dest, tbl, ...)
	local rest = table.pack(...)
	local idx = 0
	while tbl ~= nil do
		for key, value in pairs(tbl) do dest[key] = value end
		idx = idx + 1
		tbl = rest[idx]
	end
	return dest
end
-- Non-destructive merge: folds all argument tables into one fresh table
-- (later arguments override earlier ones) and returns it.
local function merge(...)
	return mergeD({}, ...)
end
-- Shallow-copies each argument table and returns the copies in order:
--   local a2, b2 = copy(a, b)
-- The recursion needs an explicit base case: without one, the call made
-- with an exhausted argument list recurses again with no arguments, and
-- every use ends in a stack overflow. A nil in the middle of the list
-- yields an empty table in that position (merge(nil) returns {}).
local function copy(t,...)
	if t == nil and select('#', ...) == 0 then return end
	return merge(t), copy(...)
end
-- Removes element `idx` from a sequence in constant time by moving the
-- last element into its slot; element order is NOT preserved.
-- Returns the same (mutated) table.
local function fastDelete(list,idx)
	local last = #list
	if idx ~= last then
		list[idx] = list[last]
	end
	list[last] = nil
	return list
end
-- Non-destructive counterpart of tcatD: hands a fresh table plus all
-- arguments to tcatD and returns that table.
local function tcat(...)
	local joined = {}
	tcatD(joined, ...)
	return joined
end
-- Returns the integers n..m as multiple values; with a single argument
-- returns 1..n.
-- An empty range (n > m, e.g. iota(0)) returns no values. Without that
-- guard the equality test below is never reached and the recursion runs
-- until the stack overflows.
local function iota(n,m)
	if not m then return iota(1,n) end
	if n > m then return end
	if n == m then return n end
	return n, iota(n+1,m)
end
-- Collects every key of table `m` into a new array. Iterates with raw
-- `next`, so the order of the result is unspecified.
local function keys(m)
	local ks = {}
	for k in next, m do
		ks[#ks + 1] = k
	end
	return ks
end
140 +
70 141 local ansi = {
71 142 levels = {
72 143 plain = 0;
73 144 ansi = 1;
74 145 color = 2;
75 146 color8b = 3;
76 147 color24b = 4;
................................................................................
112 183 local id = function(...) return ... end
113 184 local esc = '\27'
114 185 local f = {}
115 186 for k,v in pairs(ansi.seqs) do
116 187 local lvl, on, off = table.unpack(v)
117 188 if lvl <= cl then
118 189 f[k] = function(s)
119 - return esc..on .. s .. esc..off
190 + return (esc..on) .. s .. (esc..off)
120 191 end
121 192 else f[k] = id end
122 193 end
-- Scales three colour channels by 0xff, rounding each one up.
-- presumably the inputs are fractions in 0..1; verify against callers
local function ftoi(r,g,b)
	local function scale(channel)
		return math.ceil(channel*0xff)
	end
	return scale(r), scale(g), scale(b)
end
128 199 local reset = "\27[39m"
129 - function f.color(str, n, br)
130 - return string.format("\27[%s%cm",
131 - (bg and 4 or 3) +
132 - (br and 6 or 0), 0x30+n)
133 - .. str .. reset
-- Wraps `str` in an SGR colour sequence.
--   n      : palette index. 0..7 use the classic 3x/4x (bright: 9x/10x)
--            codes; anything higher uses the indexed form 38;5;n / 48;5;n
--   bright : select the bright variant of a basic (0..7) colour
--   bg     : optional; colour the background instead of the foreground
-- Fixes: the body read the undefined names `br`/`bg`, so `bright` had no
-- effect; and indices 8..15 went through the single-digit branch, which
-- emits a byte that is not a valid colour digit.
function f.color(str, n, bright, bg)
	-- 39 restores the default foreground, 49 the default background
	local undo = bg and "\27[49m" or reset
	if n < 8 then
		return string.format("\27[%s%cm",
			(bg and 4 or 3) +
			(bright and 6 or 0), 0x30+n)
			.. str .. undo
	else
		return string.format("\27[%c8;5;%sm",
			(bg and 0x34 or 0x33), n)
			.. str .. undo
	end
end
135 212 function f.resetLine()
136 213 return '\27[1K\13'
137 214 end
138 215 if cl == ansi.levels.color24b then
139 216 function f.rgb(str, r,g,b, bg)
140 217 return string.format("\27[%c8;2;%u;%u;%um", bg and 0x34 or 0x33,
................................................................................
295 372 for i=1,n do
296 373 table.insert(vals, parse(t, s))
297 374 end
298 375 return vals
299 376 end;
300 377 }
301 378 end
379 +
-- Builds a tagged-union type. `struct` lists the variants: a bare string
-- is a tag with no payload, a {name, type} pair is a tag followed by a
-- value of that type. On the wire the tag is written through fmt.enum
-- over the variant names, followed by the payload if there is one.
-- Values are either the tag string itself or a {tag, payload} pair.
fmt.any = function(struct)
	-- variants[name] is `true` for payload-less tags, else the payload type
	local variants, names = {}, {}
	for _,entry in ipairs(struct) do
		local name
		if type(entry) == 'string' then
			name = entry
			variants[name] = true
		else
			name = entry[1]
			variants[name] = entry[2]
		end
		table.insert(names, name)
	end
	local tdisc = fmt.enum(table.unpack(names))

	local function encode(a)
		if type(a) == 'string' and variants[a] == true then
			return marshal(tdisc, a)
		end
		local tname, obj = table.unpack(a)
		assert(variants[tname] ~= true, '`any` enumeration '..tostring(tname)..' has no associated struct')
		return marshal(tdisc, tname) ..
		       marshal(variants[tname], obj)
	end

	local function decode(s)
		local tname = parse(tdisc, s)
		if variants[tname] == true then return tname end
		local obj = parse(variants[tname], s)
		return {tname,obj}
	end

	return {
		encode = encode;
		decode = decode;
	}
end
302 412
303 413 fmt.map = function(from,to,ity)
304 414 local ent = fmt.list({
305 415 {'key', from},
306 416 {'val', to}
307 417 }, ity)
308 418 return {
................................................................................
315 425 end;
316 426 decode = function(s)
317 427 local lst = ent.decode(s)
318 428 local m = {}
319 429 for _,p in pairs(lst) do m[p.key] = p.val end
320 430 return m
321 431 end;
432 + null = function() return {} end;
322 433 }
323 434 end
324 435
325 436 fmt.enum = function(...)
326 437 local vals,rmap = {...},{}
327 438 for k,v in pairs(vals) do rmap[v] = k-1 end
328 439 local ty = fmt.u8
................................................................................
338 449 if (n+1) > #vals then error(string.format('enum "%s" does not have %u members', table.concat(vals,'","'),n),3) end
339 450 return vals[n+1]
340 451 end;
341 452 }
342 453 end
343 454
344 455 fmt.uid = fmt.u32
456 +fmt.blob = fmt.string
345 457
-- Schema template: returns the field list `ty` extended with a trailing
-- 'rels' field, a list of fmt.uid values built with fmt.u16 as the
-- second argument to fmt.list.
fmt.relatable = function(ty)
	local relsField = {'rels',fmt.list(fmt.uid,fmt.u16)}
	return tcat(ty, {relsField})
end
351 463
................................................................................
365 477 {'notes', fmt.list(fmt.note,fmt.u8)};
366 478 }
367 479
368 480 fmt.phrase = fmt.relatable {
369 481 {'str',fmt.label};
370 482 {'means',fmt.list(fmt.meaning,fmt.u8)};
371 483 }
484 +
485 +fmt.ortho = fmt.map(fmt.uid, fmt.blob, fmt.u8)
486 +-- UID <0> is always the UTF-8 representation of the primary ortho
487 +
488 +fmt.writing = {
489 + {'enc',fmt.ortho}; -- if empty, print the morphs in sequence
490 + {'info',fmt.label};
491 + {'morphs',fmt.list(fmt.uid,fmt.u16)};
492 +}
372 493
373 494 fmt.def = fmt.relatable {
495 + {'writings', fmt.list(fmt.writing,fmt.u8)};
496 + -- for japanese-like languages where words that are
497 + -- pronounced/written the same under the indexing
498 + -- orthography have alternate writings that are
499 + -- definition-specific. e.g. words よう and さま
500 + -- would both have a definition written as 様
501 + -- ordinary languages will have only 1 writing
374 502 {'part', fmt.u8};
375 503 {'branch', fmt.list(fmt.label,fmt.u8)};
376 504 {'means', fmt.list(fmt.meaning,fmt.u8)};
377 505 {'forms', fmt.map(fmt.u16,fmt.label,fmt.u16)};
378 506 {'phrases', fmt.list(fmt.phrase,fmt.u16)};
379 507 }
380 508
381 509 fmt.word = fmt.relatable {
382 510 {'defs', fmt.list(fmt.def,fmt.u8)};
511 + -- store secondary encodings of this word
512 + {'enc', fmt.ortho};
513 +}
514 +
515 +fmt.orthography = {
516 + {'uid', fmt.uid};
517 + {'name', fmt.tag};
518 + {'repr', fmt.any{
519 + 'utf8'; -- display as utf-8 compatible text
520 + 'opaque'; -- do not display at all; used only by other tools
521 + 'bytes'; -- display as raw hexadecimal bytes
522 + {'int',fmt.u8}; -- display as a series of integers (n=byte len)
523 + {'glyphs',{ -- map to a palette of custom glyphs.
524 + -- treated as 'opaque' in text-only environments
525 + {'glyphs', fmt.list {
526 + {'image',fmt.blob};
527 + {'name',fmt.tag};
528 + }};
529 + {'encoding', fmt.u8}; -- number of bytes per codepoint
530 + {'format',fmt.enum('svg','bmp','png')};
531 + }};
532 + }};
383 533 }
384 534
385 535 fmt.dictHeader = {
386 536 {'lang', fmt.tag};
387 537 {'meta', fmt.string};
388 538 {'partsOfSpeech', fmt.list(fmt.tag,fmt.u16)};
389 539 {'inflectionForms', fmt.list({
................................................................................
390 540 {'name', fmt.tag};
391 541 {'abbrev', fmt.tag};
392 542 {'desc', fmt.string};
393 543 {'parts', fmt.list(fmt.tag,fmt.u8)};
394 544 -- which parts of speech does this form apply to?
395 545 -- leave empty if not relevant
396 546 },fmt.u16)};
547 + {'orthographies', fmt.list(fmt.orthography,fmt.u8)}
397 548 }
398 549
399 550 fmt.relSet = {
400 551 {'uid', fmt.uid};
401 552 -- IDs are persistent random values so they can be used
402 553 -- as reliable identifiers even when merging exports in
403 554 -- a parvan-unaware VCS
404 555 {'kind', fmt.enum('syn','ant','met')};
405 556 -- membership is stored in individual objects, using a field
406 557 -- attached by the 'relatable' template
407 558 }
559 +
-- Schema template for a key/value record. The key type defaults to
-- fmt.tag and the value type to fmt.blob when omitted.
fmt.pair = function(k,v)
	local keyType = k or fmt.tag
	local valType = v or fmt.blob
	return {
		{'key', keyType};
		{'val', valType};
	}
end
564 +
565 +fmt.morph = {
566 + {'name',fmt.tag};
567 + {'enc', fmt.ortho};
568 + {'meta', fmt.list(fmt.pair(nil,fmt.string),fmt.u16)};
569 + {'rads', fmt.list(fmt.uid,fmt.u16)};
570 +}
408 571
409 572 fmt.dict = {
410 573 {'header', fmt.dictHeader};
411 574 {'words', fmt.map(fmt.string,fmt.word)};
412 575 {'relsets', fmt.list(fmt.relSet)};
576 + {'morphs', fmt.map(fmt.uid,fmt.morph)};
413 577 }
414 578
-- Serializes `val` according to type descriptor `ty` and returns the
-- encoded string.
--   ty    : either a codec table with an `encode` function, or an array
--           of field descriptors {name, type [, minVersion]}
--   val   : the value; for struct types, a table indexed by field name
--   pvers : format version being written (default 0). Fields whose
--           minVersion exceeds it are skipped.
-- Raises 'marshalling error: missing field <name>' when a field that
-- should be written is nil or false.
-- The message is only formatted on failure; it used to be built for
-- every field on every call as an eager argument to assert.
function marshal(ty, val, pvers)
	pvers = pvers or 0
	if ty.encode then
		return ty.encode(val)
	end

	local ac = {}
	for _,fld in ipairs(ty) do
		local name, fty, vers = table.unpack(fld)
		if pvers >= (vers or 0) then
			local fval = val[name]
			if not fval then
				-- level 0 keeps the message free of position info,
				-- exactly as assert() reported it
				error(string.format('marshalling error: missing field %s', name), 0)
			end
			ac[#ac + 1] = marshal(fty, fval, pvers)
		end
	end

	return table.concat(ac)
end
432 600
-- Inverse of marshal: reads a value of type `ty` from `stream`.
--   ty     : a codec table with a `decode` function, or an array of
--            field descriptors {name, type [, minVersion [, default]]}
--   stream : passed through untouched to the leaf decoders
--   pvers  : format version being read (default 0). Fields introduced
--            in a later version are not read and get their default.
-- Returns the decoded value (a table keyed by field name for structs).
function parse(ty, stream, pvers)
	pvers = pvers or 0
	if ty.decode then return ty.decode(stream) end

	local obj = {}
	for _,fld in ipairs(ty) do
		local name, fty, vers, dflt = table.unpack(fld)
		local value = dflt
		if pvers >= (vers or 0) then
			value = parse(fty, stream, pvers)
		end
		obj[name] = value
	end
	return obj
end
445 619
446 620 local function
447 621 atomizer()
448 622 local map = {}
................................................................................
505 679 return 'PV0\2'..marshal(fmt.dict, d)
506 680 end
507 681
508 682 local function
509 683 readDict(file)
510 684 local s = stream(file)
511 685 local magic = s:next 'c4'
512 - if magic ~= 'PV0\2' then
513 - id10t 'not a parvan file'
686 + if magic == 'PV0 ' then
687 + id10t 'text-based dictionaries must be translated to binary using the `import` command before they can be used'
688 + elseif magic ~= 'PV0\2' then
689 + id10t 'not a parvan0 file'
514 690 end
515 691 local d = parse(fmt.dict, s)
516 692 -- handle atoms
517 693 for lit,w in pairs(d.words) do
518 694 for j,def in ipairs(w.defs) do
519 695 def.part = d.header.partsOfSpeech[def.part]
520 696 end
................................................................................
524 700 -- enable faster lookup that would otherwise require
525 701 -- expensive scans
526 702 rebuildRelationCache(d)
527 703 return d
528 704 end
529 705
530 706
531 -local function strwords(str) -- here be dragons
707 +local function strwords(str,maxwords) -- here be dragons
532 708 local wds = {}
533 709 local w = {}
534 710 local state, d, quo, dquo = 0,0
535 711 local function flush(n,final)
536 712 if next(w) or state ~= 0 and state < 10 then
537 713 table.insert(wds, utf8.char(table.unpack(w)))
538 714 w = {}
................................................................................
539 715 elseif final and state > 10 then
540 716 table.insert(wds, '\\')
541 717 end
542 718 state = n
543 719 quo = nil
544 720 dquo = nil
545 721 d = 0
722 + if #wds == maxwords then
723 + state = 100
724 + end
546 725 end
547 726 local function isws(c)
548 727 return c == 0x20 or c == 0x09 or c == 0x0a
549 728 end
550 729 for p,cp in utf8.codes(str) do
551 730 if state == 0 then -- begin
552 731 if not(isws(cp)) then
................................................................................
594 773 -- 12 = quote escape, 11 = raw escape
595 774 if cp == 0x63 then --n
596 775 table.insert(w,0x0a)
597 776 else
598 777 table.insert(w,cp)
599 778 end
600 779 state = state - 10
780 + elseif state == 100 then -- word limit reached
781 + -- triggered from flush
782 + table.insert(wds, string.sub(str, p))
783 + return wds
601 784 end
602 785 end
603 786 flush(nil,true)
604 787 return wds
605 788 end
789 +
-- Quotes `str` as a single bracketed word for strwords(): the result is
-- wrapped in '[' ... ']', newlines become the two characters '\' 'n',
-- and any bracket without a partner gets a backslash in front of it so
-- the nesting stays balanced.
local function strsan(str)
	local out = {}   -- codepoints of the sanitized text
	local open = {}  -- stack: positions in `out` of still-unmatched '['
	for _,cp in utf8.codes(str) do
		if cp == 0x0a then
			out[#out + 1] = 0x5c
			out[#out + 1] = 0x6e
		elseif cp == 0x5b then
			open[#open + 1] = #out + 1
			out[#out + 1] = cp
		elseif cp == 0x5d then
			if #open > 0 then
				open[#open] = nil -- closes the innermost '['
			else
				out[#out + 1] = 0x5c -- stray ']': escape it
			end
			out[#out + 1] = cp
		else
			out[#out + 1] = cp
		end
	end
	-- escape unmatched '[' from the back so earlier positions stay valid
	for j = #open, 1, -1 do
		table.insert(out, open[j], 0x5c)
	end
	return '[' .. utf8.char(table.unpack(out)) .. ']'
end
606 821
607 822 local predicates
608 823 local function parsefilter(str)
609 824 local f = strwords(str)
610 825 if #f == 1 then return function(e) return predicates.lit.fn(e,f[1]) end end
611 - if not predicates[f[1]] then
826 + if not next(f) then
827 + -- null predicate matches all
828 + return function() return true end
829 + elseif not predicates[f[1]] then
612 830 id10t('no such predicate %s',f[1])
613 831 else
614 832 local p = predicates[f[1]].fn
615 833 return function(e)
616 834 return p(e, table.unpack(f,2))
617 835 end
618 836 end
................................................................................
633 851 end;
-- True when none of the given sub-predicates matches entry `e`.
-- Predicates are parsed and tried left to right; evaluation stops at the
-- first match or at the first nil argument.
local function p_none(e,pred,...)
	local rest = table.pack(...)
	local idx = 0
	while pred ~= nil do
		if parsefilter(pred)(e) then return false end
		idx = idx + 1
		pred = rest[idx]
	end
	return true
end;
-- Counts how many sub-predicates match entry `e` and compares that
-- number against <count> using <cmp> (eq, ne, lt or gt).
-- Every predicate up to the first nil is evaluated.
-- A non-numeric <count> is now rejected up front: tonumber() returned nil
-- for it, which crashed `lt`/`gt` with a compare-with-nil error and made
-- `eq`/`ne` silently compare against nil.
-- NOTE(review): assumes id10t raises and does not return — confirm.
local function p_some(e,cmp,count,...)
	local cfn = {
		eq = function(a,b) return a == b end;
		ne = function(a,b) return a ~= b end;
		lt = function(a,b) return a <  b end;
		gt = function(a,b) return a >  b end;
	}
	local compare = cfn[cmp]
	if not compare then
		id10t('[some %s]: invalid comparator', cmp)
	end
	local target = tonumber(count)
	if not target then
		id10t('[some %s %s]: count must be a number', cmp, tostring(count))
	end

	local hits = 0
	for i = 1, select('#', ...) do
		local pred = select(i, ...)
		if pred == nil then break end
		if parsefilter(pred)(e) then
			hits = hits + 1
		end
	end
	return compare(hits, target)
end;
649 881
-- Turns an argument list into a lookup set. Returns the set (each
-- non-nil argument mapped to true) and the raw argument count, with
-- duplicates and nils included in the count.
local function prepScan(...)
	local tgt = select('#',...)
	local map = {}
	for i = 1, tgt do
		local v = select(i, ...)
		if v ~= nil then map[v] = true end
	end
	return map,tgt
end
656 888 predicates = {
657 889 all = {
658 890 fn = p_all;
659 891 syntax = '<pred>…';
660 - help = 'every sub-<pred> matches'
892 + help = 'every sub-<pred> matches';
661 893 };
662 894 any = {
663 895 fn = p_any;
664 896 syntax = '<pred>…';
665 - help = 'any sub-<pred> matches'
897 + help = 'any sub-<pred> matches';
666 898 };
667 899 none = {
668 900 fn = p_none;
669 901 syntax = '<pred>…';
670 - help = 'no sub-<pred> matches'
902 + help = 'no sub-<pred> matches (also useful to force evaluation for side effects without creates matches)';
671 903 };
672 904 some = {
673 905 fn = p_some;
674 - syntax = '<count> <pred>…';
675 - help = '<count> or more sub-<pred>s match'
906 + syntax = '(eq|ne|lt|gt) <count> <pred>…';
907 + help = '<count> [or more/less] sub-<pred>s match';
908 + };
-- [seq <wrap> [<arg>…] <pred>…]: appends the shared argument string to
-- every <pred>, quotes each result with strsan, and evaluates
-- "<wrap> <quoted>…" as a single filter against the entry.
seq = {
	syntax = "<wrap> '[' <arg>… ']' <pred>…";
	help = 'reuse the same stack of arguments';
	fn = function(e,wrap,args,...)
		local lst = {}
		for i = 1, select('#', ...) do
			local pred = select(i, ...)
			if not pred then break end -- stop at the first missing predicate
			lst[#lst + 1] = pred .. ' ' .. args
		end
		local quoted = table.concat(map(lst, strsan), ' ')
		local filter = wrap .. ' ' .. quoted
		return parsefilter(filter)(e)
	end;
};
-- [mark <mark> [<pred>]]: records <mark> on the entry (at most once) when
-- <pred> matches or is absent. Returns true in that case and nothing
-- otherwise.
mark = {
	syntax = '<mark> [<pred>]';
	help = 'apply <mark> to the words that match <pred>, or all the words that are tested if no <pred> is supplied. use to visually indicate the reason that a given term matched the query';
	fn = function(e, val, pred)
		if pred ~= nil and not parsefilter(pred)(e) then return end
		e.mark = e.mark or {}
		for _,existing in pairs(e.mark) do
			-- already marked: nothing to add
			if existing == val then return true end
		end
		table.insert(e.mark, val)
		return true
	end;
};
-- [clear <mark> [<pred>]]: removes the first occurrence of <mark> from
-- the entry when <pred> matches or is absent. Returns true in that case
-- (whether or not the mark was present) and nothing otherwise.
clear = {
	syntax = '<mark> [<pred>]';
	help = 'like [mark] but clears marks instead of setting them';
	fn = function(e, val, pred)
		if pred ~= nil and not parsefilter(pred)(e) then return end
		e.mark = e.mark or {}
		for k,v in pairs(e.mark) do
			if v == val then
				table.remove(e.mark,k)
				break -- leave the traversal as soon as the list changes
			end
		end
		return true
	end;
};
-- [marked by <mark> [<pred>]] tests for a specific mark;
-- [marked in <pred>] tests for any mark at all on entries matching <pred>.
-- Returns true on success, false when the entry carries no marks, and
-- nothing when <pred> fails or the requested mark is absent.
marked = {
	syntax = '(by <mark> [pred]|in <pred>)';
	help = 'tests for an existing <mark> on the result';
	fn = function(e, mode, val, pred)
		if mode == 'in' then
			-- the single argument of this form is the predicate
			pred, val = val, nil
			if pred == nil then
				id10t '[marked in <pred>] requires a predicate'
			end
		elseif mode == 'by' then
			if val == nil then
				id10t '[marked by <mark>] requires a mark'
			end
		else
			id10t('invalid form [marked %s]', mode)
		end

		if pred ~= nil and not parsefilter(pred)(e) then return end
		if e.mark == nil or not next(e.mark) then return false end
		if not val then return true end
		for _,v in pairs(e.mark) do
			if v == val then return true end
		end
	end;
};
677 980 def = {
678 981 help = 'word has at least one definition that contains all <keyword>s';
679 982 syntax = '<keyword>…';
680 983 fn = function(e,...)
681 984 local kw = {...}
682 985 for i,d in ipairs(e.word.defs) do
................................................................................
690 993 ::notfound::
691 994 end
692 995 end
693 996 return false
694 997 end;
695 998 };
-- [lit <search> [pfx|sfx|match] [any|(in|also) <script>…]]
-- Compares the entry's literal form against <search>: exact match by
-- default, prefix/suffix comparison, or a Lua pattern search for `match`.
-- Fix: the script list was filled with the keyword itself (opts[oc])
-- once per script argument instead of with the script names (opts[i]).
-- NOTE(review): `scripts` is collected and validated but not consulted
-- by the comparison below; only e.lit is ever tested.
lit = {
	help = 'word is, begins with, matches, or ends with <search> in <script> or the primary orthography ("also" enables searching the primary as well as the listed scripts)';
	syntax = '<search> [(pfx|sfx|match)] [any|(in|also) <script>…]';
	fn = function(e,val,...)
		local opts,oc = {...},1
		local scripts, op = {0} -- 0 = the primary orthography
		if opts[oc] == 'pfx' or opts[oc] == 'sfx' or opts[oc] == 'match' then
			op = opts[oc]
			oc = oc + 1
		end
		if opts[oc] then
			if opts[oc] == 'any' then
				scripts = nil
			else
				if opts[oc] == 'in' then
					scripts = {} -- listed scripts only, drop the primary
				elseif opts[oc] ~= 'also' then
					id10t('[lit … %s]: invalid spec', opts[oc])
				end
				if #opts < oc+1 then
					id10t('[lit … %s]: missing argument', opts[oc])
				end
				for i=oc+1,#opts do
					table.insert(scripts, opts[i])
				end
			end
		end
		if not op then
			return e.lit == val
		elseif op == 'pfx' then
			return val == string.sub(e.lit,1,#val)
		elseif op == 'sfx' then
			return val == string.sub(e.lit,(#e.lit) - #val + 1)
		elseif op == 'match' then
			return string.find(e.lit, val) ~= nil
		else
			id10t('[lit %s %s] is not a valid filter, “%s” should be “pfx”, “sfx”, or “match”',val,op,op)
		end
	end;
};
1039 + morph = {
1040 + help = 'find words with specific morphs';
1041 + syntax = "(any|all|only) <script> [seq] ((lit|rec) <repr>|rad '[' <repr>… ']')…";
1042 + };
-- [form …]: tests a word's inflected forms.
--   no argument       : any definition has at least one form
--   <k> alone         : some form of some definition equals <k>
--   <k> (of|has) spec : builds a test from the spec and applies it to
--                       forms[<k>] (`of`) or to every form (`has`)
-- The spec is a run of operators (set, unset, is/pfx/sfx/sub <arg>). They
-- are combined with AND by default, or with OR after a leading `any`.
-- Fixes:
--   * `nc = 2` assigned a stray global instead of advancing `mc`, so the
--     word `any` was itself parsed as an operator and rejected
--   * the operator's argument never reached the comparison: eval() is
--     called with the form only, so `b` was always nil. Each step now
--     closes over its own argument
--   * removed a leftover debugging print
-- NOTE(review): the syntax string puts `of`/`has` before the form name,
-- while the code reads the form name first — confirm the intended order.
form = {
	help = 'match against word\'s inflected forms';
	syntax = '(<inflect> | (of <form>|has) [any] ([un]set | is <inflect> | (pfx|sfx|sub) <affix>)…)';
	fn = function(e, k, mode, ...)
		if k == nil then -- eq [form has set]
			for _,d in pairs(e.word.defs) do
				if next(d.forms) then return true end
			end
		elseif mode == 'of' or mode == 'has' then
			local match,mc = {...},1
			if not next(match) then
				id10t('[form %s]: missing spec',mode)
			end

			local any = match[1]=='any'
			-- neutral element of the fold: true for AND, false for OR
			local eval = function()return true end;
			if any then
				mc = 2 -- skip the `any` keyword
				eval = function()return false end;
			end

			local ok = false
			local fns = {
				set = function(a) return a~=nil end;
				unset = function(a) return a==nil end;
				is = function(a,b)return a and a==b end;
				pfx = function(a,b)return a and string.sub(a,1,#b) == b end;
				sfx = function(a,b)return a and string.sub(a,0-#b) == b end;
				sub = function(a,b)return a and string.find(a,b) ~= nil end;
			}
			repeat
				local n, op, arg = 1, table.unpack(match,mc)
				if not op then id10t "missing argument for [form]" end
				if not fns[op] then
					id10t('[form … %s] is not a valid filter', op)
				end
				if op ~= "set" and op ~= 'unset' then
					n = 2 -- this operator consumes an argument
					if not arg then
						id10t('[form … %s]: missing argument', op)
					end
				end
				local test, prev = fns[op], eval
				if any then
					eval = function(a) return test(a,arg) or prev(a) end
				else
					eval = function(a) return test(a,arg) and prev(a) end
				end
				ok = true
				mc = mc + n
			until mc > #match
			if not ok then
				id10t '[form]: incomplete spec'
			end

			for _,d in pairs(e.word.defs) do
				if mode=='has' then
					for cat,infd in pairs(d.forms) do
						if eval(infd) then return true end
					end
				else
					if eval(d.forms[k]) then return true end
				end
			end
		elseif mode ~= nil then
			id10t('[form %s]: invalid mode', mode)
		else
			for _,d in pairs(e.word.defs) do
				for _,v in pairs(d.forms) do
					if v == k then return true end
				end
			end
		end
		return false
	end;
};
717 1118 part = {
718 1119 help = 'word has definitions for every <part> of speech';
719 1120 syntax = '<part>…';
720 1121 fn = function(e,...)
721 1122 local map, tgt = prepScan(...)
................................................................................
723 1124 for i,d in ipairs(e.word.defs) do
724 1125 if map[d.part] then matches = matches + 1 end
725 1126 end
726 1127 return matches == tgt
727 1128 end
728 1129 };
-- [root <word>…]: true when a single definition lists every <word> in
-- its `branch` array. Returns nothing when no definition qualifies.
root = {
	help = 'match an entry that derives from every <word>';
	syntax = '<word>…';
	fn = function(e,...)
		local wanted, tgt = prepScan(...)
		for _,def in ipairs(e.word.defs) do
			local hits = 0
			for _,origin in ipairs(def.branch) do
				if wanted[origin] then hits = hits + 1 end
			end
			if hits == tgt then return true end
		end
	end
};
1144 + phrase = {
1145 + syntax = '<pred>…';
1146 + help = 'match only words with phrases';
1147 + };
1148 + ex = {
1149 + syntax= '[by <source>] [(any|all) <term>…]';
1150 + help = 'entry has an example by <source> with any/all of <term>s';
1151 + fn = function() end;
1152 + };
743 1153 note = {
744 - help = 'word has a matching note';
1154 + help = 'entry has a matching note';
745 1155 syntax = '([kind <kind> [<term>]] | term <term> | (min|max|count) <n>)';
746 1156 fn = function(e, op, k, t)
747 1157 if op == 'kind' or op == 'term' then
748 1158 if op == 'term' and t then
749 1159 id10t('too many arguments for [note term <term>]')
750 1160 end
751 1161 for _,d in ipairs(e.word.defs) do
................................................................................
784 1194 if not fd then error(userError("cannot open file " .. file),2) end
785 1195 return fd
786 1196 else
787 1197 return file
788 1198 end
789 1199 end
790 1200
791 -local function copy(tab)
792 - local new = {}
793 - for k,v in pairs(tab) do new[k] = v end
794 - return new
795 -end
796 -
797 1201 local function pathParse(p)
798 1202 -- this is cursed, rewrite without regex pls TODO
799 1203 if p == '.' then return {} end
800 1204 local function comp(pfx)
801 1205 return pfx .. '([0-9]+)'
802 1206 end
803 1207 local function mtry(...)
................................................................................
824 1228 end
825 1229 if w then break end
826 1230 end
827 1231 end
828 1232 if not w then w=p:match('^(.-)%.?$') end
829 1233 return {w = w, dn = tonumber(dn), mn = tonumber(mn), pn=tonumber(pn); nn = tonumber(nn), xn = tonumber(xn)}
830 1234 end
831 -local function pathString(p,styler)
1235 +local function pathString(p,styler,display)
832 1236 local function s(s, st, ...)
833 1237 if styler then
834 1238 return styler[st](tostring(s),...)
835 1239 else return s end
836 1240 end
837 1241
838 1242 local function comp(c,n,...)
................................................................................
842 1246 local t = {}
843 1247 if p.w then t[1] = s(p.w,'ul') else return '.' end
844 1248 if p.dn then t[2] = string.format(".%s", s(p.dn,'br')) end
845 1249 if p.pn then t[#t+1] = comp('p',p.pn,4,true) end
846 1250 if p.mn then t[#t+1] = comp('m',p.mn,5,true) end
847 1251 if p.xn then t[#t+1] = comp('x',p.xn,6,true)
848 1252 elseif p.nn then t[#t+1] = comp('n',p.nn,4) end
849 - if t[2] == nil then
1253 + if t[2] == nil and not display then
850 1254 return p.w .. '.' --make sure paths are always valid
851 1255 end
852 1256 return s(table.concat(t),'em')
853 1257 end
854 1258 local function pathMatch(a,b)
855 1259 return a.w == b.w
856 1260 and a.dn == b.dn
................................................................................
874 1278 if not a.dn then return res end
875 1279
876 1280 res.def = lookup('definition', res.word.defs, a.dn)
877 1281 if (not a.pn) and (not a.mn) then return res end
878 1282
879 1283 local m if a.pn then
880 1284 res.phrase = lookup('phrase', res.def.phrases, a.pn)
881 - res.meaning = lookup('meaning', res.phrase.means, a.mn)
1285 + if a.mn then
1286 + res.meaning = lookup('meaning', res.phrase.means, a.mn)
1287 + else return res end
882 1288 else
883 1289 res.meaning = lookup('meaning', res.def.means, a.mn)
884 1290 end
885 1291
886 1292 if a.xn then
887 1293 res.ex = lookup('example',res.meaning.examples,a.xn)
888 1294 elseif a.nn then
................................................................................
919 1325 elseif super.nn then
920 1326 if sub.xn then return false end
921 1327 if sub.nn ~= super.nn then return false end
922 1328 end
923 1329
924 1330 return true
925 1331 end
1332 +
-- Renders inline bracket markup in `text` using the styler table `sty`.
-- A bracketed span starts with a one-character mode:
--   [\ …] literal text     [* …] sty.br      [! …] sty.em
--   [_ …] sty.ul           [$ …] sty.color(…,6,true)
--   [> …] an object path, rendered through pathParse/pathString
-- Spans nest. A backslash directly before '[' suppresses formatting of
-- that span (the backslash itself is dropped).
-- Fix: a span with an unrecognised mode returned only the span, which
-- dropped the character captured in front of it ("a[x]" became "[x]").
-- The whole match is now put back unchanged.
function ansi.formatMarkup(text, sty)
	return (string.gsub(text, '(.?)(%b[])', function(esc,seg)
		if esc == '\\' then return seg end
		local mode, body = seg:match('^%[(.)%s*(.-)%]$')
		local r
		if mode == '\\' then
			r = body
		elseif mode == '*' then
			r = sty.br(ansi.formatMarkup(body,sty))
		elseif mode == '!' then
			r = sty.em(ansi.formatMarkup(body,sty))
		elseif mode == '_' then
			r = sty.ul(ansi.formatMarkup(body,sty))
		elseif mode == '$' then
			r = sty.color(ansi.formatMarkup(body,sty),6,true)
		elseif mode == '>' then
			r = pathString(pathParse(body),sty,true)
		else
			-- not markup: keep the preceding character and the span
			return esc..seg
		end
		return esc..r
	end))
end
926 1354
927 1355 local cmds = {
928 1356 create = {
929 1357 help = "initialize a new dictionary file";
930 1358 syntax = "<lang>";
931 1359 raw = true;
932 1360 exec = function(ctx, lang)
................................................................................
936 1364 local fd = safeopen(ctx.file,"wb")
937 1365 local new = {
938 1366 header = {
939 1367 lang = lang;
940 1368 meta = "";
941 1369 partsOfSpeech = {};
942 1370 inflectionForms = {};
1371 + orthographies = {};
943 1372 };
944 1373 words = {};
945 1374 relsets = {};
1375 + morphs = {};
946 1376 }
947 1377 local o = writeDict(new);
948 1378 fd:write(o)
949 1379 fd:close()
950 1380 end;
951 1381 };
-- `coin <word>`: registers a new word with no definitions, relations or
-- secondary encodings. Refuses to overwrite an existing word.
coin = {
	help = "add a new word";
	syntax = "<word>";
	write = true;
	exec = function(ctx,word)
		local words = ctx.dict.words
		if words[word] then
			id10t "word already coined"
		end
		words[word] = {defs={},rels={},enc={}}
	end;
};
963 1393 def = {
964 1394 help = "define a word";
965 1395 syntax = "<word> <part-of-speech> [<meaning> [<root>…]]";
966 1396 write = true;
967 1397 exec = function(ctx,word,part,means,...)
968 1398 local etym = {...}
969 1399 if (not word) or not part then
970 1400 id10t 'bad definition'
971 1401 end
972 1402 if not ctx.dict.words[word] then
973 - ctx.dict.words[word] = {defs={},rels={}}
1403 + ctx.dict.words[word] = {defs={},rels={},enc={}}
974 1404 end
975 1405 local n = #(ctx.dict.words[word].defs)+1
976 1406 ctx.dict.words[word].defs[n] = {
977 1407 part = part;
1408 + writings = {};
978 1409 branch = etym;
979 1410 means = {means and {
980 1411 lit=means;
981 1412 examples={};
982 1413 notes={};
983 1414 rels={};
984 1415 } or nil};
................................................................................
990 1421 end;
991 1422 };
-- `mean <word> <def#> <meaning>`: appends a meaning (with empty notes,
-- examples and relations) to an existing definition.
-- Arguments are validated first. A non-numeric <def#> becomes nil, so
-- pathResolve stops at the word level and the t.def access below fails
-- with a bare nil-index error. A missing <meaning> would store a record
-- without its `lit` field.
-- NOTE(review): assumes id10t raises and does not return — confirm.
mean = {
	help = "add a meaning to a definition";
	syntax = "<word> <def#> <meaning>";
	write = true;
	exec = function(ctx,word,dn,m)
		local defno = tonumber(dn)
		if not (word and defno and m) then
			id10t 'bad meaning: expected <word> <def#> <meaning>'
		end
		local t = pathResolve(ctx,{w=word,dn=defno})
		table.insert(t.def.means, {lit=m,notes={},examples={},rels={}})
	end;
};
1001 1432 rel = {
1002 1433 help = "manage groups of related words";
1003 1434 syntax = {
1004 1435 "(show|purge) <path> [<kind>]";
1005 1436 "(link|drop) <word> <group#> <path>…";
................................................................................
1130 1561 end
1131 1562 end
1132 1563 end;
1133 1564 };
1134 1565 mod = {
1135 1566 help = "move, merge, split, or delete words or definitions";
1136 1567 syntax = {
1137 - "<path> (drop | [move|merge|clobber] <path> | out [<part> [<root>…]])";
1138 - "path ::= <word>[(@<def#>[/<meaning#>[:<note#>]]|.)]";
1568 + "<path> (drop | [(to|move)|merge|clobber] <path>)";
1569 + "path ::= <word>[.[<def#>[/p<phrase#>][/m<meaning#>[(/n<note#>|/x<example#>)]]]]";
1139 1570 };
1140 1571 write = true;
1572 + };
1573 + morph = {
1574 + help = "manage and attach morphs (morphemes/composable glyphs)";
1575 + syntax = {
1576 + "(<ls>|<define>|<mod>)";
1577 + "define ::= def (id <name>|as) [<form>]… [from <morph>…]";
1578 + "ls ::= ls (<morph>|meta <key> <value>|has <key>)…";
1579 + "mod ::= <morph> (drop|[un]link <path>|meta <key> [<value>]|inc [<morph>])";
1580 + "morph ::= (id <name>|enc <form>)";
1581 + "form ::= [<script>]=<repr>";
1582 + };
1141 1583 };
1142 1584 note = {
1143 1585 help = "add a note to a definition or a paragraph to a note";
1144 1586 syntax = {"(<m-path> (add|for) <kind> | <m-path>:<note#>) <para>…";
1145 - "m-path ::= <word>@<def#>/<meaning#>"};
1587 + "m-path ::= <word>.<def#>[/p<phrase#>]/m<meaning#>"}; -- closing '>' restored on <phrase#>
1146 1588 write = true;
1147 1589 exec = function(ctx,path,...)
1148 1590 local paras, mng
1149 1591 local dest = pathParse(path)
1150 1592 local t = pathResolve(ctx,path)
1151 1593 if dest.nn then
1152 1594 paras = {...}
................................................................................
1226 1668 end
1227 1669 end
1228 1670 end
1229 1671
1230 1672 function cmds.ls.exec(ctx,...)
1231 1673 local filter = nil
1232 1674 local out = {}
1233 - for i,f in ipairs{...} do
1675 + local args = {...}
1676 + for i=#args,1,-1 do local f <const> = args[i]
1234 1677 local fn = parsefilter(f)
1235 1678 local of = filter or function() return false end
1236 1679 filter = function(e)
1237 1680 return fn(e) or of(e)
1238 1681 end
1239 1682 end
1240 1683 for lit,w in pairs(ctx.dict.words) do
1241 - local e = {lit=lit, word=w}
1684 + local e = {lit=lit, word=w, dict=ctx.dict}
1242 1685 if filter == nil or filter(e) then
1243 1686 table.insert(out, e)
1244 1687 end
1245 1688 end
1246 1689 table.sort(out, function(a,b) return a.lit < b.lit end)
1247 1690 local fo = ctx.sty[io.stdout]
1248 1691
................................................................................
1285 1728 return {
1286 1729 syn = flatten(synonymSets);
1287 1730 ant = flatten(antonymSets);
1288 1731 met = flatten(metonymSets);
1289 1732 }
1290 1733 end
1291 1734
1292 - local function formatRels(rls, padlen)
1735 + local function formatRels(lines, rls, padlen)
1293 1736 -- optimize for the common case
1294 1737 if next(rls.syn) == nil and
1295 1738 next(rls.ant) == nil and
1296 1739 next(rls.met) == nil then return {} end
1297 1740 local pad = string.rep(' ',padlen)
1298 1741 local function format(label, set)
1299 1742 local each = map(set, function(e)
................................................................................
1301 1744 local str = fo.ul(e.w)
1302 1745 if ed then str = string.format('%s(%s)',str,ed.part) end
1303 1746 if e.mn then str = string.format('%s§%u',str,e.mn) end
1304 1747 return str
1305 1748 end)
1306 1749 return fo.em(string.format("%s%s %s",pad,label,table.concat(each,', ')))
1307 1750 end
1308 - local lines = {}
1309 1751 local function add(l,c,lst)
1310 1752 table.insert(lines, format(fo.color(l,c,true),lst))
1311 1753 end
1312 1754 if next(rls.syn) then add('synonyms:',2,rls.syn) end
1313 1755 if next(rls.ant) then add('antonyms:',1,rls.ant) end
1314 1756 if next(rls.met) then add('metonyms:',4,rls.met) end
1315 1757 return lines
1316 1758 end
1317 1759
1318 - local function meanings(w,d,md,n)
1319 - local start = md and 2 or 1
1320 - local part = string.format('(%s)', d.part)
1321 - local pad = md and string.rep(' ', #part) or ''
1322 - local function note(n,insert)
1760 + local function formatMeaning(m, obj, path, indent) -- m = dest tbl
1761 + local pad = string.rep(' ', indent)
1762 + local function note(j,n,markup)
1323 1763 if not next(n.paras) then return end
1324 1764 local pad = string.rep(' ',#(n.kind) + 9)
1325 - insert(' ' .. fo.hl(' ' .. n.kind .. ' ') .. ' ' .. n.paras[1])
1765 +
1766 + local nid = ''
1767 + if ctx.flags.ident then
1768 + nid='‹'..pathString(merge(path,{nn=j}), fo)..'›'
1769 + end
1770 + table.insert(m, nid..' ' .. fo.hl(' ' .. n.kind .. ' ') .. ' ' .. markup(n.paras[1]))
1326 1771 for i=2,#n.paras do
1327 - insert(pad..n.paras[2])
1328 - end
1329 - end
1330 - local m = { (function()
1331 - if d.means[1] then
1332 - if md then
1333 - local id = ''
1334 - if ctx.flags.ident then
1335 - id=' ['..pathString({w=w.lit,dn=n,mn=1}, fo)..']'
1336 - end
1337 - return string.format(" %s %s 1. %s", id, fo.em(part), d.means[1].lit)
1338 - end
1339 - else return
1340 - fo.em(string.format(' %s [empty definition #%u]', part,n))
1341 - end
1342 - end)() }
1343 - tcatD(m, formatRels(gatherRelSets{w=w.lit,dn=n,mn=1}, 6))
1344 - for i=start,#d.means do local v = d.means[i]
1345 - local id = ''
1346 - if ctx.flags.ident then id='['..pathString({w=w.lit,dn=n,mn=n}, fo)..']' end
1347 - table.insert(m, string.format(' %s%s %u. %s', pad, id, i, v.lit))
1348 - tcatD(m, formatRels(gatherRelSets{w=w.lit,dn=n,mn=i}, 6))
1349 - for j,n in ipairs(v.notes) do
1350 - note(n, function(v) table.insert(m, v) end)
1351 - end
1352 - end
1353 - return table.concat(m,'\n')
1354 - end
1355 - local function autobreak(str)
1356 - if str ~= '' then return str..'\n' else return str end
1357 - end
1772 + table.insert(m, pad..markup(n.paras[i])) -- emit the i-th paragraph, not paragraph 2 every time
1773 + end
1774 + end
1775 + local id = ''
1776 + if ctx.flags.ident then id='‹'..pathString(path, fo)..'›' end
1777 + table.insert(m, string.format('%s%s %u. %s', pad, id, path.mn, ansi.formatMarkup(obj.lit,fo)))
1778 + formatRels(m,gatherRelSets(path), 6)
1779 + for j,n in ipairs(obj.notes) do
1780 + note(j,n, function(v) return ansi.formatMarkup(v,fo) end)
1781 + end
1782 + end
1783 +
1784 + local function defnMeanings(m, w,def,path,indent)
1785 + local part = ''
1786 + for i=1,#def.means do local v = def.means[i]
1787 + formatMeaning(m, v, merge(path,{mn=i}),indent)
1788 + end
1789 + end
1790 + local function parthead(def)
1791 + local str = string.format('(%s)', def.part)
1792 + return fo.color(fo.em(str), 199), #str
1793 + end
1794 + local markcolor, markcolors, markmap = 0, {
1795 + 117, 75, 203, 48, 200, 190, 26, 48, 226, 198
1796 + }, {}
1358 1797 for i, w in ipairs(out) do
1359 - local d = fo.ul(fo.br(w.lit))
1360 - local wordrels = autobreak(table.concat(
1361 - formatRels(gatherRelSets{w=w.lit}, 2),
1362 - '\n'
1363 - ))
1364 - if #w.word.defs == 1 then
1365 - d=d .. ' '
1366 - .. fo.rgb(fo.em('('..(w.word.defs[1].part)..')'),.8,.5,1) .. '\n'
1367 - .. meanings(w,w.word.defs[1],false,1) .. '\n'
1368 - .. autobreak(table.concat(formatRels(gatherRelSets{w=w.lit,dn=1}, 4), '\n'))
1369 - .. wordrels .. '\n'
1370 - else
1371 - for j, def in ipairs(w.word.defs) do
1372 - local syn if wsc and wsc[j] then syn = wsc[j] end
1373 - d=d .. '\n'
1374 - .. meanings(w,syn,def,true,j) .. '\n'
1375 - .. autobreak(table.concat(
1376 - formatRels(gatherRelSets{w=w.lit,dn=j}, 4),
1377 - '\n'
1378 - ))
1379 - end
1380 - d=d .. wordrels .. '\n'
1381 - end
1382 - io.stdout:write(d)
1798 + local lines = { fo.ul(fo.br(w.lit)) }
1799 + local pad = 4
1800 + local ndefs = #w.word.defs
1801 + if ndefs == 1 then
1802 + local header = parthead(w.word.defs[1])
1803 + lines[1] = lines[1] .. ' ' .. header
1804 + end
1805 + local mark
1806 + local markline
1807 + if w.mark then
1808 + local marks = {}
1809 + for _,m in pairs(w.mark) do
1810 + if not markmap[m] then
1811 + markmap[m] = markcolors[markcolor+1]
1812 + markcolor = (markcolor+1)%#markcolors
1813 + end
1814 + local c = markmap[m]
1815 + table.insert(marks, fo.hl(fo.color(string.format(' %s ',m),c)))
1816 + end
1817 + mark = table.concat(marks, ' ')
1818 + markline = #lines
1819 + end
1820 +
1821 + for j, d in ipairs(w.word.defs) do -- render each definition of the word into `lines`
1822 + local top = #lines -- :/ (remember where this definition's output starts)
1823 + local header, hdln = parthead(d)
1824 + defnMeanings(lines, w, d, {w=w.lit, dn=j}, ndefs==1 and 0 or hdln+1)
1825 + if ctx.flags.rels then
1826 + formatRels(lines, gatherRelSets{w=w.lit,dn=j}, 0)
1827 + end
1828 + if ndefs > 1 then -- splice the part-of-speech header over the first line's padding
1829 + lines[top+1] = ' ' .. header .. string.sub(lines[top+1] or '',hdln+2) -- a definition with no output still gets its header instead of raising
1830 + end
1831 + end
1832 + if ctx.flags.rels then
1833 + formatRels(lines,gatherRelSets{w=w.lit}, 2)
1834 + end
1835 + if markline then
1836 + lines[markline] = mark .. ' ' .. lines[markline]
1837 + end
1838 + io.stdout:write(table.concat(lines,'\n')..'\n')
1383 1839 end
1384 1840 end
1385 1841
1386 1842 function cmds.import.exec(ctx,file)
1387 1843 local ifd = io.stdin
1388 1844 if file then
1389 1845 ifd = safeopen(file,'r')
................................................................................
1391 1847
1392 1848 local new = {
1393 1849 header = {
1394 1850 lang = lang;
1395 1851 meta = "";
1396 1852 partsOfSpeech = {};
1397 1853 inflectionForms = {};
1854 + orthographies = {};
1398 1855 };
1399 1856 words = {};
1400 1857 relsets = {};
1858 + morphs = {};
1401 1859 }
1402 1860
1403 1861 local state = 0
1404 1862 local relsets = {}
1405 1863 local path = {}
1406 1864 local inflmap, lastinfl = {}
1865 + local orthoIDs, lastOrtho = {}
1866 + local morphIDs, lastMorph = {}
1867 + local lastWriting
1407 1868 for l in ifd:lines() do
1408 1869 local words = strwords(l)
1409 1870 local c = words[1]
1410 1871 local function syn(mn,mx)
1411 1872 local nw = #words - 1
1412 1873 if nw < mn or (mx ~= nil and nw > mx) then
1413 1874 if mx ~= nil then
1414 1875 id10t('command %s needs between %u~%u words',c,mn,mx)
1415 1876 else
1416 1877 id10t('command %s needs at least %u words',c,mn)
1417 1878 end
1418 1879 end
1880 + end
1881 + local function getuid(tbl, uid)
1882 + -- a token that parses as hexadecimal is taken verbatim as the uid
1883 + local numeric = tonumber(uid,16)
1884 + if numeric ~= nil then return numeric end
1885 + -- any other token is a symbolic name: assign it a random uid the
1886 + -- first time it is seen and hand back the same uid afterwards
1887 + if not tbl[uid] then tbl[uid] = math.random(0,0xffffFFFF) end
1888 + return tbl[uid]
1419 1889 end
1420 1890 if c ~= '*' and c~='meta' then -- comments
1421 1891 if state == 0 then
1422 - if c ~= 'pv0' then
1892 + if c ~= 'PV0' then
1423 1893 id10t "not a parvan export"
1424 1894 end
1425 1895 new.header.lang = words[2]
1426 1896 new.header.meta = words[3]
1427 1897 state = 1
1428 1898 else
1429 1899 local T = pathResolve({dict=new}, path)
................................................................................
1430 1900 local W,D,P,M,N,X =
1431 1901 T.word,
1432 1902 T.def,
1433 1903 T.phrase,
1434 1904 T.meaning,
1435 1905 T.note,
1436 1906 T.ex
1437 - if c == 'w' then syn(1) state = 2
1907 + if c == 'w' then syn(1) state = 10
1438 1908 path = {w=words[2]}
1439 - new.words[words[2]] = {defs={},rels={}}
1909 + new.words[words[2]] = {defs={},rels={},enc={}}
1910 + lastMorph = nil
1440 1911 elseif c == 'f' then syn(1)
1441 1912 local nf = {
1442 1913 name = words[2];
1443 1914 abbrev = words[3] or "";
1444 1915 desc = words[4] or "";
1445 1916 parts = {};
1446 1917 }
................................................................................
1450 1921 elseif c == 'fp' then syn(1)
1451 1922 if not lastinfl then
1452 1923 id10t 'fp can only be used after f' end
1453 1924 table.insert(lastinfl.parts,words[2])
1454 1925 elseif c == 's' then syn(2)
1455 1926 relsets[words[3]] = relsets[words[3]] or {}
1456 1927 relsets[words[3]].kind = words[2]
1457 - relsets[words[3]].uid = tonumber(words[3])
1928 + relsets[words[3]].uid = tonumber(words[3],16)
1458 1929 relsets[words[3]].members = relsets[words[3]].members or {}
1459 - elseif state >= 2 and c == 'r' then syn(1)
1930 + elseif c == 'mo' then syn(1)
1931 + local uid,name = table.unpack(words,2)
1932 + uid = getuid(morphIDs, uid)
1933 + new.morphs[uid] = {
1934 + name = name or '';
1935 + enc = {};
1936 + meta = {};
1937 + rads = {};
1938 + }
1939 + lastMorph = new.morphs[uid]
1940 + elseif lastMorph and state < 10 and c == 'M' then syn(2)
1941 + local key, val = table.unpack(words,2)
1942 + table.insert(lastMorph.meta, {key=key,val=val})
1943 + elseif lastMorph and state < 10 and c == 'r' then syn(1)
1944 + local r = getuid(morphIDs, words[2])
1945 + table.insert(lastMorph.rads, r)
1946 + elseif c == 'e' then syn(2)
1947 + local scr, blob = table.unpack(words,2)
1948 + scr = getuid(orthoIDs, scr)
1949 + if state <= 10 and lastMorph then
1950 + lastMorph.enc[scr] = blob
1951 + elseif state == 10 then
1952 + W.enc[scr] = blob
1953 + elseif state >= 11 and lastWriting then
1954 + lastWriting.enc[scr] = blob
1955 + else
1956 + id10t('encoding “%s” declared at bad position', blob)
1957 + end
1958 + elseif c == 'o' then syn(3)
1959 + local uid, name, repr = table.unpack(words,2)
1960 + repr = strwords(repr)
1961 + uid = getuid(orthoIDs, uid)
1962 + if #repr > 1 then
1963 + local kind, p1,p2 = table.unpack(repr)
1964 + repr = {kind,{}}
1965 + if kind == 'glyphs' then
1966 + repr[2].format = p1
1967 + repr[2].glyphs = {}
1968 + repr[2].encoding = p2
1969 + elseif kind == 'int' then
1970 + repr[2] = tonumber(p1,16)
1971 + end
1972 + else repr=repr[1] end
1973 + table.insert(new.header.orthographies, {
1974 + uid = uid;
1975 + name = name;
1976 + repr = repr;
1977 + })
1978 + lastOrtho = new.header.orthographies[#(new.header.orthographies)]
1979 + elseif c == 'og' then syn(2)
1980 + if not lastOrtho then
1981 + id10t '`og` must follow an orthography declaration'
1982 + elseif lastOrtho.repr[1] ~= 'glyphs' then
1983 + id10t('orthography declares %s representation', lastOrtho.repr[1])
1984 + end
1985 + local name, data = table.unpack(words,2)
1986 + table.insert(lastOrtho.repr[2].glyphs, {
1987 + -- TODO decode base?? data for binary encodings
1988 + name = name, image = data
1989 + })
1990 + elseif state >= 10 and (c == 'r' or c == 'rh') then syn(1) -- parenthesized: `and` binds tighter than `or`, so 'rh' used to bypass the state guard
1460 1991 local rt
1461 - if state == 2 then
1992 + if state == 10 then
1462 1993 rt = W.rels
1463 - elseif state == 3 then
1994 + elseif state == 11 then
1464 1995 rt = D.rels
1465 - elseif state == 4 then
1996 + elseif state == 12 then
1466 1997 rt = D.rels
1467 1998 elseif state == 14 then
1468 1999 rt = P.rels
1469 2000 end
1470 2001 relsets[words[2]] = relsets[words[2]] or {
1471 - uid = tonumber(words[2]) or math.random(0,0xffffFFFF);
2002 + uid = tonumber(words[2],16) or math.random(0,0xffffFFFF);
1472 2003 members={};
1473 2004 }
1474 - table.insert(relsets[words[2]].members, path)
1475 - elseif state >= 2 and c == 'd' then syn(1) state = 3
2005 + local mems = relsets[words[2]].members
2006 + if c == 'rh' and next(mems) then
2007 + mems[#mems+1] = mems[1]
2008 + mems[1] = path
2009 + else
2010 + table.insert(mems,path)
2011 + end
2012 + elseif state >= 10 and c == 'd' then syn(1) state = 11
1476 2013 table.insert(W.defs, {
1477 2014 part = words[2];
2015 + writings = {};
1478 2016 branch = {};
1479 2017 means = {};
1480 2018 forms = {};
1481 2019 phrases = {};
1482 2020 rels = {};
1483 2021 })
1484 2022 path = {w = path.w, dn = #(W.defs)}
1485 - elseif state >= 3 and c == 'dr' then syn(1)
2023 + elseif state >= 11 and c == 'dr' then syn(1)
1486 2024 table.insert(D.branch, words[2])
1487 - elseif state >= 3 and c == 'df' then syn(2)
2025 + elseif state >= 11 and c == 'df' then syn(2)
1488 2026 if not inflmap[words[2]] then
1489 2027 id10t('no inflection form %s defined', words[2])
1490 2028 end
1491 2029 D.forms[inflmap[words[2]]] = words[3]
1492 - elseif state >= 3 and c == 'p' then syn(1) state = 14
2030 + elseif state >= 11 and c == 'p' then syn(1) state = 12
1493 2031 table.insert(D.phrases, {
1494 2032 str = words[2];
1495 2033 means = {};
1496 2034 rels = {};
1497 2035 })
1498 2036 path = {w = path.w, dn = path.dn, pn = #(D.phrases)}
1499 - elseif state >= 3 and c == 'm' then syn(1) state = 4
1500 - table.insert(D.means, {
2037 + elseif state >= 11 and c == 'm' then syn(1) state = 13
2038 + table.insert((P or D).means, {
1501 2039 lit = words[2];
1502 2040 notes = {};
1503 2041 examples = {};
1504 2042 rels = {};
1505 2043 });
1506 - path = {w = path.w, dn = path.dn, pn=path.pn, mn = #(D.means)}
1507 - elseif state >= 4 and c == 'n' then syn(1) state = 5
2044 + path = {w = path.w, dn = path.dn, pn=path.pn, mn = #((P or D).means)}
2045 + elseif state >= 11 and c == 'W' then
2046 + table.insert(D.writings, {
2047 + info = words[2] or '';
2048 + enc = {};
2049 + morphs = {};
2050 + })
2051 + lastWriting = D.writings[#(D.writings)]
2052 + elseif state >= 11 and lastWriting and c == 'Wmo' then syn(1)
2053 + local morph = getuid(morphIDs, words[2])
2054 + table.insert(lastWriting.morphs, morph)
2055 + elseif state >= 13 and c == 'x' then syn(1)
2056 + table.insert(M.examples, {
2057 + quote = words[2];
2058 + src = words[3] or '';
2059 + })
2060 + elseif state >= 13 and c == 'n' then syn(1) state = 14
1508 2061 table.insert(M.notes, {kind=words[2], paras={}})
1509 2062 path = {w = path.w, dn = path.dn, pn = path.pn, mn = path.mn, nn = #(M.notes)};
1510 - elseif state >= 5 and c == 'np' then syn(1)
2063 + elseif state >= 14 and c == 'np' then syn(1)
1511 2064 table.insert(N.paras, words[2])
1512 2065 end
1513 2066 -- we ignore invalid ctls, for sake of forward-compat
1514 2067 end
1515 2068 end
1516 2069 end
1517 2070
................................................................................
1532 2085 ofd:write(o)
1533 2086 ofd:close()
1534 2087 end
1535 2088
1536 2089 function cmds.export.exec(ctx,file)
1537 2090 local ofd = io.stdout
1538 2091 if file then ofd = safeopen(file, 'w+') end
1539 - local function san(str)
1540 - local d = 0
1541 - local r = {}
1542 - for i,cp in utf8.codes(str) do
1543 - -- insert backslashes for characters that would
1544 - -- disrupt strwords() parsing
1545 - if cp == 0x0a then
1546 - table.insert(r, 0x5c)
1547 - table.insert(r, 0x6e)
1548 - else
1549 - if cp == 0x5b then
1550 - d = d + 1
1551 - elseif cp == 0x5d then
1552 - if d >= 1 then
1553 - d = d - 1
1554 - else
1555 - table.insert(r, 0x5c)
1556 - end
1557 - end
1558 - table.insert(r, cp)
1559 - end
1560 - end
1561 - return '[' .. utf8.char(table.unpack(r)) .. ']'
1562 - end
2092 + local san = strsan
1563 2093 local function o(lvl,...)
1564 2094 local pfx = ''
1565 2095 if ctx.flags.human and lvl > 0 then
1566 2096 pfx = string.rep('\t', lvl)
1567 2097 end
1568 2098 ofd:write(pfx..string.format(...)..'\n')
1569 2099 end
1570 2100 local d = ctx.dict
1571 - o(0,'pv0 %s %s', san(d.header.lang), san(d.header.meta))
2101 + o(0,'PV0 %s %s', san(d.header.lang), san(d.header.meta))
1572 2102 local function checksyn(obj,lvl)
1573 2103 for k,v in pairs(obj.rels) do
1574 - o(lvl,'r %u',s.uid)
2104 + if d._relCache[v].mems[1].obj == obj
2105 + then o(lvl,'rh %x',v)
2106 + else o(lvl,'r %x',v)
2107 + end
1575 2108 end
1576 2109 end
1577 2110 for i,f in pairs(d.header.inflectionForms) do
1578 2111 o(0,'f %s %s %s', san(f.name), san(f.abbrev), san(f.desc))
1579 2112 for j,p in pairs(f.parts) do
1580 2113 o(1,'fp %s', san(p))
1581 2114 end
1582 2115 end
1583 - local function scanMeans(tbl,path,lvl)
1584 - for j,m in ipairs(def.means) do
2116 + for i,s in pairs(d.header.orthographies) do -- one `o` record per orthography, followed by its `og` glyph records
2117 + local repr
2118 + local structured = type(s.repr) == 'table' -- {kind, params} as opposed to a bare kind name
2119 + if structured then repr = s.repr[1]
2120 + else repr = s.repr end
2121 + if structured and repr == 'int' then -- the importer parses this parameter with tonumber(p1,16), so write integers in hex
2122 + repr = repr .. ' ' .. (math.type(s.repr[2]) == 'integer' and string.format('%x', s.repr[2]) or tostring(s.repr[2]))
2123 + elseif structured and repr == 'glyphs' then
2124 + repr = repr .. ' ' .. s.repr[2].format .. ' ' .. tostring(s.repr[2].encoding)
2125 + end
2126 + o(0, 'o %x %s %s', s.uid, san(s.name), san(repr))
2127 + if structured and s.repr[1] == 'glyphs' then
2128 + for _,g in ipairs(s.repr[2].glyphs) do
2129 + o(1, 'og %s %s', san(g.name), san(g.image))
2130 + end
2131 + end
2132 + end
2133 + local function scanMeans(tbl,lvl) -- write every meaning in list `tbl` at indent level `lvl`, with its rels, examples and notes
2134 + for j,m in ipairs(tbl) do
1585 2135 o(lvl,'m %s', san(m.lit))
1586 - local lp = copy(path)
1587 - lp.mn = j
1588 - checksyn(m,lp,lvl+1)
2136 + checksyn(m,lvl+1)
2137 + for k,x in ipairs(m.examples) do
2138 + o(lvl+1,'x %s %s', san(x.quote), san(x.src)) -- one sanitized word per %s; the importer reads quote and src as separate words
2139 + end
1589 2140 for k,n in ipairs(m.notes) do
1590 2141 o(lvl+1,'n %s', san(n.kind))
1591 2142 for a,p in ipairs(n.paras) do
1592 2143 o(lvl+2,'np %s', san(p))
1593 2144 end
1594 2145 end
1595 2146 end
2147 + end
2148 + local function scanMeta(n, meta)
2149 + for i,m in ipairs(meta) do
2150 + o(n, 'M %s %s', san(m.key), san(m.val)) end
2151 + end
2152 + local function scanEnc(n, tbl)
2153 + for uid,enc in pairs(tbl)
2154 + do o(n, 'e %x %s',uid,san(enc)) end
2155 + end
2156 + for uid, m in pairs(d.morphs) do
2157 + o(0, 'mo %x %s', uid, san(m.name))
2158 + scanMeta(1, m.meta)
2159 + scanEnc(1, m.enc)
1596 2160 end
1597 2161 for lit, w in pairs(d.words) do
1598 2162 o(0,'w %s',san(lit))
1599 - checksyn(w,{w=lit},1)
2163 + checksyn(w,1)
2164 + scanEnc(1, w.enc)
1600 2165 for i,def in ipairs(w.defs) do
1601 2166 o(1,'d %s',san(def.part))
1602 - checksyn(def,{w=lit,dn=i},2)
2167 + for _, writ in ipairs(def.writings) do
2168 + if writ.info == '' then o(2,'W') else
2169 + o(2,'W %s',san(writ.info)) end
2170 + for mid,uid in pairs(writ.morphs) do
2171 + o(3, 'Wmo %x', uid) end
2172 + for uid,enc in pairs(writ.enc)
2173 + do o(3, 'e %x %s',uid,san(enc)) end
2174 + end
2175 + checksyn(def,2)
1603 2176 for j,r in ipairs(def.branch) do
1604 2177 o(2,'dr %s',san(r))
1605 2178 end
2179 + scanMeans(def.means, 2)
1606 2180 for j,p in ipairs(def.phrases) do
1607 2181 o(2,'p %s',san(p.str))
1608 - scanMeans(p.means, {w=lit,dn=i,pn=j}, 3)
2182 + scanMeans(p.means, 3)
1609 2183 end
1610 - scanMeans(def.means, {w=lit,dn=i}, 2)
1611 2184 end
1612 2185 end
1613 - for _,s in ipairs(d.relsets) do o(0,'s %s %u', s.kind, s.uid) end
2186 + for _,s in ipairs(d.relsets) do o(0,'s %s %x', s.kind, s.uid) end
1614 2187 end
1615 2188
1616 2189 local function filterD(lst, fn)
1617 2190 -- cheap algorithm to destructively filter a list
1618 2191 -- DOES NOT preserve order!!
1619 2192 local top = #lst
1620 2193 for i=top,1,-1 do local m = lst[i]
................................................................................
1624 2197 top = top - 1
1625 2198 end
1626 2199 end
1627 2200 return lst
1628 2201 end
1629 2202
1630 2203 function cmds.mod.exec(ctx, orig, oper, dest, ...)
2204 + local ops = { -- per-object-kind operation table; NOTE(review): nothing in this function reads it yet
2205 + word = {
2206 + mask = { -- sets of operation names grouped by kind; presumably keyed by the destination kind — confirm
2207 + word = {move=true,merge=true,clobber=true};
2208 + };
2209 + move = function(from,to) end; -- empty stub, as are all handlers in this table
2210 + merge = function(from,to) end;
2211 + clobber = function(from,to) end;
2212 + };
2213 + def = {
2214 + mask = {
2215 + word = {move=true};
2216 + def = {merge=true,clobber=true};
2217 + };
2218 + move = function(from,to) end;
2219 + merge = function(from,to) end;
2220 + clobber = function(from,to) end;
2221 + };
2222 + phrase = { -- no merge handler is defined for phrases
2223 + mask = {
2224 + def = {move=true};
2225 + phrase = {clobber=true};
2226 + };
2227 + move = function(from,to) end;
2228 + clobber = function(from,to) end;
2229 + };
2230 + meaning = {
2231 + mask = {
2232 + def = {move=true};
2233 + phrase = {move=true};
2234 + meaning = {merge=true,clobber=true};
2235 + };
2236 + move = function(from,to) end;
2237 + merge = function(from,to) end;
2238 + clobber = function(from,to) end;
2239 + };
2240 + example = {
2241 + mask = {
2242 + meaning={move=true};
2243 + example={merge=true,clobber=true};
2244 + };
2245 + move = function(from,to) end;
2246 + merge = function(from,to) end;
2247 + clobber = function(from,to) end;
2248 + };
2249 + note = {
2250 + mask = {
2251 + meaning={move=true};
2252 + note={merge=true,clobber=true};
2253 + };
2254 + move = function(from,to) end;
2255 + merge = function(from,to) end;
2256 + clobber = function(from,to) end;
2257 + };
2258 + }
1631 2259 rebuildRelationCache(ctx.dict) -- the only call made here; orig, oper and dest are not consulted
1632 2260 end
1633 2261
1634 2262 local function fileLegible(file)
1635 2263 -- check if we can access the file
1636 2264 local fd = io.open(file,"rb")
1637 2265 local ret = false
................................................................................
1759 2387 showHelp(ctx, cmd, c)
1760 2388 end
1761 2389 end
1762 2390 end
1763 2391
1764 2392 local globalFlags <const> = {
1765 2393 human = {'h','human','enable human-readable exports'};
1766 - ident = {'i','ident','show identifier paths for all items'}
2394 + ident = {'i','ident','show identifier paths for all items'};
2395 + rels = {'r','rels', 'show relationships between words'};
1767 2396 }
1768 2397
1769 2398 local function
1770 2399 usage(me,ctx)
1771 2400 local ln = 0
1772 2401 local ct = {}
1773 2402 local fe = ctx.sty[io.stderr]
................................................................................
1793 2422 showHelp(ctx,k,v)
1794 2423 end
1795 2424 return 64
1796 2425 end
1797 2426
1798 2427 local function
1799 2428 dispatch(argv, ctx)
2429 + local loglevel = 2
1800 2430 local ferr = ctx.sty[io.stderr]
1801 2431 local args = {}
1802 2432 local flags = {}
1803 2433 local i = 1 while i <= #argv do
1804 2434 local a = argv[i]
1805 2435 if a == '--' then i=i+1 break
1806 2436 elseif a:sub(1,2) == '--' then
................................................................................
1815 2445 if v[1] == c then flags[k] = true break end
1816 2446 end
1817 2447 end
1818 2448 else table.insert(args, a) end
1819 2449 i = i + 1 end
1820 2450 for j=i,#argv do table.insert(args,argv[j]) end
1821 2451
2452 +
2453 + do local ll = os.getenv('parvan_log') -- environment sets the base verbosity; flags override it
2454 + if ll then loglevel = tonumber(ll) or loglevel end -- keep the default when the variable is not a number
2455 + if flags.quiet then loglevel=0 -- look flags up by name: flags[quiet] indexed with a nil global
2456 + elseif flags.debug then loglevel=4 end -- and flags[debug] indexed with the debug library table
2457 + end
2458 +
1822 2459 local file, cmd = table.unpack(args)
1823 2460 if cmd and cmds[cmd] then
1824 2461 local c,fd,dict = cmds[cmd]
1825 2462 if (not c.raw) and not c.nofile then
1826 2463 fd = safeopen(file, "rb")
1827 2464 dict = readDict(fd:read 'a')
1828 2465 fd:close()
1829 2466 -- lua io has no truncate method, so we must
1830 2467 -- rely on the clobbering behavior of the open()
1831 2468 -- call instead :(
1832 2469 end
1833 2470
2471 + local function log(lvl,...)
2472 + local threshold = ({ -- numeric rank of the named level
2473 + fatal = 1,
2474 + warn = 2,
2475 + info = 3,
2476 + debug = 4
2477 + })[lvl]
2478 + if threshold > loglevel then return end
2479 + ctx.log(lvl,...)
2480 + -- messages ranked above the configured loglevel are dropped by the guard above
2481 + end
1834 2482 cmds[cmd].exec({
1835 2483 sty = ctx.sty;
1836 2484 try = ctx.try;
1837 - log = ctx.log;
2485 + log = log;
1838 2486
1839 2487 flags = flags;
1840 2488 file = file;
1841 2489 fd = fd;
1842 2490 dict = dict;
1843 2491 }, table.unpack(args,3))
1844 2492