475
476
477
478
479
480
481
482
483
|
else b:push(str.ptr + i,1) end
end
if mode ~= 0 then return m.t.null(), 0, false end
return b:finalize(), max, spacebroke
end
return m
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
|
else b:push(str.ptr + i,1) end
end
if mode ~= 0 then return m.t.null(), 0, false end
return b:finalize(), max, spacebroke
end
-- Examines the front of `start` and decides whether the leading character
-- is a vowel that should be dropped.
--
-- returns {repl, rest}:
--   repl  the bytes to emit in place of what was consumed. this is a null
--         pstr when a vowel (or kana) was recognised and dropped, or a
--         one-byte slice of the input when the leading byte is kept as is
--   rest  `start` advanced past the bytes that were consumed (1, 2 or 3)
--
-- the first byte is read unconditionally, so callers must pass a string
-- with at least one byte remaining.
local terra disemvowel_codepoint(start: pstr): {pstr, pstr}
	-- TODO rewrite this in a more generative way -- it should be possible
	-- to have a long string listing vowels and generate all the necessary
	-- code based on that
	var dslen: intptr = 0 -- number of input bytes consumed by this step
	var repl = pstr.null() -- stays null whenever the input is dropped

	-- plain ascii vowels, matched through m.cdowncase (presumably a
	-- lowercasing helper, so upper and lower case both match -- confirm)
	var adc = m.cdowncase(start(0))
	if adc == @'a' or adc == @'e'
	or adc == @'i' or adc == @'o'
	or adc == @'u' then
		dslen = 1 goto done
	end

	-- accented vowels and ligatures, compared against the first two bytes.
	-- a truthy result from tb:cmp is treated as a match.
	-- NOTE(review): if the last two literals are plain ascii "ij"/"IJ" they
	-- can never match, because a leading i/I is already consumed by the
	-- ascii test above; presumably the single-codepoint dutch ligatures
	-- (U+0133 / U+0132) were intended -- confirm the file's encoding
	if start.ct >= 2 then
		var tb = pstr { start.ptr, 2 }
		if tb:cmp('ä') or tb:cmp('ë') or tb:cmp('ï')
		or tb:cmp('Ä') or tb:cmp('Ë') or tb:cmp('Ï')
		or tb:cmp('ö') or tb:cmp('ü') -- "do you not have any papers
		or tb:cmp('Ö') or tb:cmp('Ü') -- for these vowels, sir?"
		or tb:cmp('á') or tb:cmp('é') or tb:cmp('í')
		or tb:cmp('Á') or tb:cmp('É') or tb:cmp('Í')
		or tb:cmp('ó') or tb:cmp('ú')
		or tb:cmp('Ó') or tb:cmp('Ú')
		or tb:cmp('à') or tb:cmp('è') or tb:cmp('ì')
		or tb:cmp('À') or tb:cmp('È') or tb:cmp('Ì')
		or tb:cmp('ò') or tb:cmp('ù') -- not so fast,
		or tb:cmp('Ò') or tb:cmp('Ù') -- "il capo"
		or tb:cmp('ā') or tb:cmp('ē') or tb:cmp('ī')
		or tb:cmp('Ā') or tb:cmp('Ē') or tb:cmp('Ī')
		or tb:cmp('ō') or tb:cmp('ū') -- take that latin
		or tb:cmp('Ō') or tb:cmp('Ū') -- and also hawaiian
		or tb:cmp('æ') or tb:cmp('Æ') -- sorry elon
		or tb:cmp('œ') or tb:cmp('Œ') -- sacre bleu
		or tb:cmp('ij') or tb:cmp('IJ') -- ok wtf dutch
		then dslen = 2 goto done end
	end

	-- kana, matched on raw utf-8 bytes. every range test sits inside the
	-- parentheses guarded by the 0xE3 lead byte: `and` binds tighter than
	-- `or`, so leaving the katakana tests outside that guard would delete
	-- unrelated sequences such as the euro sign (E2 82 AC)
	--   hiragana U+3041..U+3096 = E3 81 81 .. E3 82 96
	--   katakana U+30A1..U+30F6 = E3 82 A1 .. E3 83 B6
	if start.ct >= 3 then
		var s = [&uint8](start.ptr) -- for safe unicode comparisons, bc char is dumb
		if s[0] == 0xe3 and ( -- eliminate kana
			(s[1] == 0x81 and s[2] >= 0x81) or
			(s[1] == 0x82 and s[2] <= 0x96) or
			(s[1] == 0x82 and s[2] >= 0xa1) or
			(s[1] == 0x83 and s[2] <= 0xb6)
		) then dslen = 3 goto done end
	end

	-- TODO handle more nonroman scripts
	-- maybe remove consonant pointing in arabic??
	-- i guess remove vowels from devanagari
	-- no idea what to do about chinese

	-- no disemvoweling applied, return the current byte as is
	repl = pstr { ptr = start.ptr, ct = 1 }
	dslen = 1

	::done::
	start:advance(dslen)
	-- NOTE(review): this trace runs on every call, i.e. once per consumed
	-- character; it looks like leftover debugging output -- confirm whether
	-- it should stay
	lib.io.fmt('applied %llu bytes of disemvowelling; adding "%.*s"; continuing with "%.*s"\n', dslen, repl.ct, repl.ptr, start.ct, start.ptr)
	return repl, start
end
-- Produces a copy of `str` with its vowels stripped, accumulating the
-- result through an m.acc that is set up on `pool`.
--   pool  memory pool handed to the accumulator
--   str   input text; a count of zero means the length has not been
--         measured yet, in which case it is obtained from m.sz
--         (presumably a NUL-terminated length scan -- confirm)
-- returns whatever acc:finalize() yields for the accumulated bytes
terra m.disemvowel(pool: &lib.mem.pool, str: m.t): m.t
	-- resolve the length before setting up the accumulator, so that its
	-- size argument reflects the real input length rather than the
	-- zero placeholder
	if str.ct == 0 then str.ct = m.sz(str.ptr) end
	var acc: m.acc acc:pool(pool, str.ct)

	-- consume the input one step at a time; each step reports what to
	-- keep (if anything) and the remainder still to be processed
	var cur = str
	while cur.ct > 0 do
		var add, cont = disemvowel_codepoint(cur)
		-- a null `add` means the step dropped its bytes
		if add:ref() then acc:ppush(add) end
		cur = cont
	end
	return acc:finalize()
end
-- hand the table `m`, which carries the functions defined above, back to whoever loads this file
return m
|