parsav  Diff

Differences From Artifact [2798df18ea]:

To Artifact [ee5af81e76]:


   475    475   
   476    476   		else b:push(str.ptr + i,1) end
   477    477   	end
   478    478   	if mode ~= 0 then return m.t.null(), 0, false end
   479    479   
   480    480   	return b:finalize(), max, spacebroke
   481    481   end
          482  +
          483  +local terra disemvowel_codepoint(start: pstr): {pstr, pstr}
          484  + -- inspects the head of `start` (which must be non-empty) and returns the
          485  + -- text to emit in its place -- null if it is to be dropped -- along with
          486  + -- `start` after it has been advanced past the bytes consumed.
          487  + -- TODO rewrite this in a more generative way -- it should be possible to
          488  + -- have a long string listing vowels and generate the code based on that
          489  +	var dslen: intptr = 0
          490  +	var repl = pstr.null()
          491  +
          492  +	var adc = m.cdowncase(start(0))
          493  +	if adc == @'a' or adc == @'e' or adc == @'i'
          494  +	or adc == @'o' or adc == @'u' then
          495  +		dslen = 1 goto done
          496  +	end
          497  +
          498  +	if start.ct >= 2 then
          499  +		var tb = pstr { start.ptr, 2 }
          500  +		if tb:cmp('ä') or tb:cmp('ë') or tb:cmp('ï')
          501  +		or tb:cmp('Ä') or tb:cmp('Ë') or tb:cmp('Ï')
          502  +		or tb:cmp('ö') or tb:cmp('ü') -- haben Sie für diese Vokale
          503  +		or tb:cmp('Ö') or tb:cmp('Ü') -- kein Ausweis dabei, mein Herr
          504  +
          505  +		or tb:cmp('á') or tb:cmp('é') or tb:cmp('í')
          506  +		or tb:cmp('Á') or tb:cmp('É') or tb:cmp('Í')
          507  +		or tb:cmp('ó') or tb:cmp('ú')
          508  +		or tb:cmp('Ó') or tb:cmp('Ú')
          509  +
          510  +		or tb:cmp('à') or tb:cmp('è') or tb:cmp('ì')
          511  +		or tb:cmp('À') or tb:cmp('È') or tb:cmp('Ì')
          512  +		or tb:cmp('ò') or tb:cmp('ù') -- not so fast,
          513  +		or tb:cmp('Ò') or tb:cmp('Ù') -- "il capo"
          514  +
          515  +		or tb:cmp('ā') or tb:cmp('ē') or tb:cmp('ī')
          516  +		or tb:cmp('Ā') or tb:cmp('Ē') or tb:cmp('Ī')
          517  +		or tb:cmp('ō') or tb:cmp('ū') -- take that latin
          518  +		or tb:cmp('Ō') or tb:cmp('Ū') -- and also hawaiian
          519  +
          520  +		or tb:cmp('æ') or tb:cmp('Æ') -- sorry elon
          521  +		or tb:cmp('œ') or tb:cmp('Œ') -- sacre bleu
          522  +		or tb:cmp('ij') or tb:cmp('IJ') -- ok wtf dutch
          523  +		then dslen = 2 goto done end
          524  +	end
          525  +
          526  +	if start.ct >= 3 then
          527  +		var s = [&uint8](start.ptr) -- for safe unicode comparisons, bc char is dumb
          528  +		-- eliminate kana; parenthesized so the lead-byte test guards both ranges
          529  +		if s[0] == 0xe3 and (
          530  +			(s[1] == 0x81 and s[2] >= 0x81) or -- hiragana, from U+3041
          531  +			(s[1] == 0x82 and s[2] <= 0x96) or -- ...through U+3096
          532  +			(s[1] == 0x82 and s[2] >= 0xa1) or -- katakana, from U+30A1
          533  +			(s[1] == 0x83 and s[2] <= 0xb6))   -- ...through U+30F6
          534  +		then dslen = 3 goto done end
          535  +	end
          536  +	-- TODO handle more nonroman scripts
          537  +	-- maybe remove consonant pointing in arabic??
          538  +	-- i guess remove vowels from devanagari
          539  +	-- no idea what to do about chinese
          540  +
          541  +	-- no disemvoweling applied, return the current byte as is
          542  +	repl = pstr { ptr = start.ptr, ct = 1 }
          543  +	dslen = 1
          544  +
          545  +	::done::
          546  +	start:advance(dslen)
          547  +	return repl, start
          548  +end
          549  +
          550  +terra m.disemvowel(pool: &lib.mem.pool, str: m.t): m.t
          551  + -- returns `str` minus whatever disemvowel_codepoint drops, built up via `pool`
          552  +	if str.ct == 0 then str.ct = m.sz(str.ptr) end -- settle the length before acc is sized from it
          553  +	var acc: m.acc acc:pool(pool, str.ct)
          554  +	var cur = str while cur.ct > 0 do
          555  +		var add, cont = disemvowel_codepoint(cur)
          556  +		if add:ref() then acc:ppush(add) end -- a null `add` means the character was dropped
          557  +		cur = cont
          558  +	end
          559  +	return acc:finalize()
          560  +end
   482    560   
   483    561   return m