parsav  Diff

Differences From Artifact [2798df18ea]:

To Artifact [ee5af81e76]:


475
476
477
478
479
480
481
482














































































483

		else b:push(str.ptr + i,1) end
	end
	if mode ~= 0 then return m.t.null(), 0, false end

	return b:finalize(), max, spacebroke
end















































































return m








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>

475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561

		else b:push(str.ptr + i,1) end
	end
	if mode ~= 0 then return m.t.null(), 0, false end

	return b:finalize(), max, spacebroke
end

local terra disemvowel_codepoint(start: pstr): {pstr, pstr}
 -- Inspects the front of `start` and consumes one unit of input from it.
 -- If the leading bytes match one of the vowel patterns below (an ASCII
 -- vowel, a listed two-byte accented vowel or ligature, or a three-byte
 -- kana sequence), those bytes are dropped. Otherwise exactly one byte is
 -- passed through unchanged.
 --
 -- Returns two values:
 --   1. the text to append to the output: a null pstr when bytes were
 --      dropped, or a one-byte view of the current byte when kept
 --   2. `start` advanced past the bytes that were consumed
 --
 -- The caller must pass a non-empty `start`; the first byte is read
 -- unconditionally.
 --
 -- TODO rewrite this in a more generative way -- it should be possible
 -- to have a long string listing vowels and generate all the necessary
 -- code based on that
	var dslen: intptr = 0
	var repl = pstr.null()

	-- single-byte ASCII vowels, compared case-insensitively
	var adc = m.cdowncase(start(0))
	if adc == @'a' or adc == @'e'
	or adc == @'i' or adc == @'o'
	or adc == @'u' then
		dslen = 1 goto done
	end

	-- two-byte sequences: compare a two-byte view of the input against
	-- each listed vowel literal
	if start.ct >= 2 then
		var tb = pstr { start.ptr, 2 }
		if tb:cmp('ä') or tb:cmp('ë') or tb:cmp('ï')
		or tb:cmp('Ä') or tb:cmp('Ë') or tb:cmp('Ï')
		or tb:cmp('ö') or tb:cmp('ü') -- "you don't have papers for these
		or tb:cmp('Ö') or tb:cmp('Ü') -- vowels on you, do you, sir?"

		or tb:cmp('á') or tb:cmp('é') or tb:cmp('í')
		or tb:cmp('Á') or tb:cmp('É') or tb:cmp('Í')
		or tb:cmp('ó') or tb:cmp('ú')
		or tb:cmp('Ó') or tb:cmp('Ú')

		or tb:cmp('à') or tb:cmp('è') or tb:cmp('ì')
		or tb:cmp('À') or tb:cmp('È') or tb:cmp('Ì')
		or tb:cmp('ò') or tb:cmp('ù') -- not so fast,
		or tb:cmp('Ò') or tb:cmp('Ù') -- "il capo"

		or tb:cmp('ā') or tb:cmp('ē') or tb:cmp('ī')
		or tb:cmp('Ā') or tb:cmp('Ē') or tb:cmp('Ī')
		or tb:cmp('ō') or tb:cmp('ū') -- take that latin
		or tb:cmp('Ō') or tb:cmp('Ū') -- and also hawaiian

		or tb:cmp('æ') or tb:cmp('Æ') -- sorry elon
		or tb:cmp('œ') or tb:cmp('Œ') -- sacre bleu
		or tb:cmp('ij') or tb:cmp('IJ') -- ok wtf dutch

		then dslen = 2 goto done end
	end

	-- three-byte sequences: kana
	if start.ct >= 3 then
		var s = [&uint8](start.ptr) -- for safe unicode comparisons, bc char is dumb
		-- `and` binds tighter than `or`, so both byte ranges must sit
		-- inside one parenthesized group; otherwise the second range
		-- would be tested without the 0xe3 lead-byte check and could
		-- match unrelated byte sequences.
		--   e3 81 81 .. e3 82 96  =  U+3041 .. U+3096 (hiragana)
		--   e3 82 a1 .. e3 83 b6  =  U+30A1 .. U+30F6 (katakana)
		if s[0] == 0xe3 and ( -- eliminate kana
			((s[1] == 0x81 and s[2] >= 0x81) or
			 (s[1] == 0x82 and s[2] <= 0x96)) or
			((s[1] == 0x82 and s[2] >= 0xa1) or
			 (s[1] == 0x83 and s[2] <= 0xb6)))
		then dslen = 3 goto done end
	end
	-- TODO handle more nonroman scripts
	-- maybe remove consonant pointing in arabic??
	-- i guess remove vowels from devanagari
	-- no idea what to do about chinese

	-- no disemvoweling applied, return the current byte as is
	-- (unmatched multi-byte sequences are thus passed through one byte
	-- per call)
	repl = pstr { ptr = start.ptr, ct = 1 }
	dslen = 1

	::done::
	start:advance(dslen)
	-- NOTE(review): this trace runs on every call and looks like leftover
	-- debug output -- consider removing or gating it. The `*` precision of
	-- `%.*s` must be an int, so the counts are narrowed explicitly.
	lib.io.fmt('applied %llu bytes of disemvowelling; adding "%.*s"; continuing with "%.*s"\n', dslen, [int](repl.ct), repl.ptr, [int](start.ct), start.ptr)
	return repl, start
end

terra m.disemvowel(pool: &lib.mem.pool, str: m.t): m.t
	-- Builds a copy of `str` with vowels stripped, by repeatedly handing
	-- the unprocessed remainder to disemvowel_codepoint and appending
	-- whatever text it says to keep.
	--   pool: memory pool the accumulator is set up against
	--   str:  input text; a `ct` of 0 means the length is not yet known
	--         and is measured with m.sz (presumably a NUL-terminated
	--         length -- confirm against m.sz)
	-- Returns the finalized accumulator contents.

	-- resolve the real length first, so that the accumulator is sized
	-- from the actual input length rather than from a placeholder 0
	if str.ct == 0 then str.ct = m.sz(str.ptr) end
	var acc: m.acc acc:pool(pool, str.ct)

	var cur = str while cur.ct > 0 do
		-- `add` is the text to keep (not a valid reference when the bytes
		-- were dropped); `cont` is the remaining input
		var add, cont = disemvowel_codepoint(cur)
		if add:ref() then acc:ppush(add) end
		cur = cont
	end
	return acc:finalize()
end

return m