Differences From
Artifact [2798df18ea]:
475 475
476 476 else b:push(str.ptr + i,1) end
477 477 end
478 478 if mode ~= 0 then return m.t.null(), 0, false end
479 479
480 480 return b:finalize(), max, spacebroke
481 481 end
482 +
483 +local terra disemvowel_codepoint(start: pstr): {pstr, pstr}
484 + -- Inspect the front of `start` (start(0) is read unconditionally, so callers
485 + -- must pass ct > 0). Returns {repl, rest}: repl stays null when a vowel was
486 + -- dropped, else it is the one byte passed through; rest is the unread input.
487 + var dslen: intptr = 0 -- number of input bytes this call consumes
488 + var repl = pstr.null() -- bytes to emit; left null on every vowel path
489 + -- TODO rewrite this generatively: list the vowels in one long string and generate the checks from it
490 + var adc = m.cdowncase(start(0)) -- first byte run through m.cdowncase, then compared to lowercase a/e/i/o/u
491 + if adc == @'a' or adc == @'e'
492 + or adc == @'i' or adc == @'o'
493 + or adc == @'u' then
494 + dslen = 1 goto done
495 + end
496 + -- two-byte sequences: accented Latin vowels and vowel ligatures, each listed in both cases
497 + if start.ct >= 2 then
498 + var tb = pstr { start.ptr, 2 } -- view of just the first two bytes
499 + if tb:cmp('ä') or tb:cmp('ë') or tb:cmp('ï') -- diaeresis / umlaut
500 + or tb:cmp('Ä') or tb:cmp('Ë') or tb:cmp('Ï')
501 + or tb:cmp('ö') or tb:cmp('ü') -- you have no papers for
502 + or tb:cmp('Ö') or tb:cmp('Ü') -- these vowels, do you, sir
503 +
504 + or tb:cmp('á') or tb:cmp('é') or tb:cmp('í') -- acute
505 + or tb:cmp('Á') or tb:cmp('É') or tb:cmp('Í')
506 + or tb:cmp('ó') or tb:cmp('ú')
507 + or tb:cmp('Ó') or tb:cmp('Ú')
508 +
509 + or tb:cmp('à') or tb:cmp('è') or tb:cmp('ì') -- grave
510 + or tb:cmp('À') or tb:cmp('È') or tb:cmp('Ì')
511 + or tb:cmp('ò') or tb:cmp('ù') -- not so fast,
512 + or tb:cmp('Ò') or tb:cmp('Ù') -- "il capo"
513 +
514 + or tb:cmp('ā') or tb:cmp('ē') or tb:cmp('ī') -- macron
515 + or tb:cmp('Ā') or tb:cmp('Ē') or tb:cmp('Ī')
516 + or tb:cmp('ō') or tb:cmp('ū') -- take that latin
517 + or tb:cmp('Ō') or tb:cmp('Ū') -- and also hawaiian
518 +
519 + or tb:cmp('æ') or tb:cmp('Æ') -- sorry elon
520 + or tb:cmp('œ') or tb:cmp('Œ') -- sacre bleu
521 + or tb:cmp('ij') or tb:cmp('IJ') -- ok wtf dutch
522 +
523 + then dslen = 2 goto done end
524 + end
525 + -- three-byte sequences: hiragana and katakana
526 + if start.ct >= 3 then
527 + var s = [&uint8](start.ptr) -- compare as unsigned bytes so values >= 0x80 are not seen as negative chars
528 + if s[0] == 0xe3 and ( -- eliminate kana; `and` binds tighter than `or`, so the lead byte must wrap BOTH ranges
529 + ((s[1] == 0x81 and s[2] >= 0x81) or -- hiragana U+3041..U+307F
530 + (s[1] == 0x82 and s[2] <= 0x96)) or -- hiragana U+3080..U+3096
531 + ((s[1] == 0x82 and s[2] >= 0xa1) or -- katakana U+30A1..U+30BF
532 + (s[1] == 0x83 and s[2] <= 0xb6))) -- katakana U+30C0..U+30F6
533 + then dslen = 3 goto done end
534 + end
535 + -- TODO handle more nonroman scripts
536 + -- maybe remove consonant pointing in arabic??
537 + -- i guess remove vowels from devanagari
538 + -- no idea what to do about chinese
539 +
540 + -- no disemvoweling applied, return the current byte as is
541 + repl = pstr { ptr = start.ptr, ct = 1 }
542 + dslen = 1
543 + -- every path meets at `done`; NOTE(review): the fmt call below prints a trace on every call -- confirm it is meant to stay
544 + ::done::
545 + start:advance(dslen)
546 + lib.io.fmt('applied %llu bytes of disemvowelling; adding "%.*s"; continuing with "%.*s"\n', dslen, repl.ct, repl.ptr, start.ct, start.ptr)
547 + return repl, start
548 +end
549 +
550 +terra m.disemvowel(pool: &lib.mem.pool, str: m.t): m.t
551 + -- Returns `str` minus every unit disemvowel_codepoint drops, built in an accumulator set up on `pool`.
552 + if str.ct == 0 then str.ct = m.sz(str.ptr) end -- ct == 0: take the length from m.sz first, so acc:pool below sees it
553 + var acc: m.acc acc:pool(pool, str.ct)
554 + var cur = str while cur.ct > 0 do
555 + var add, cont = disemvowel_codepoint(cur) -- add: bytes to keep (null when dropped); cont: unread tail
556 + if add:ref() then acc:ppush(add) end cur = cont -- presumably ref() is false for the null pstr; then step to the tail
557 + end
558 + return acc:finalize()
559 +end
482 560
483 561 return m -- the table `m` (which carries m.disemvowel and the rest) is this file's result