-- parsav: str.t at [c774e2c5a9]
--
-- File str.t, artifact bff3416286, part of check-in c774e2c5a9


-- vim: ft=terra
-- string.t: string classes
-- shorthands for the project utility library and for the two counted
-- int8 string types (pointer and reference) used throughout this module
local util = lib.util
local pstr = lib.mem.ptr(int8)
local pref = lib.mem.ref(int8)

-- module table: the string types plus thin bindings to C string routines
local m = {
	t = pstr, ref = pref;
	sz = terralib.externfunction('strlen', rawstring -> intptr);
	cmp = terralib.externfunction('strcmp', {rawstring, rawstring} -> int);
	ncmp = terralib.externfunction('strncmp', {rawstring, rawstring, intptr} -> int);
	-- note: cpy/ncpy bind stpcpy/stpncpy, so they return the END of the copy
	cpy = terralib.externfunction('stpcpy',{rawstring, rawstring} -> rawstring);
	ncpy = terralib.externfunction('stpncpy',{rawstring, rawstring, intptr} -> rawstring);
	cat = terralib.externfunction('strcat',{rawstring, rawstring} -> rawstring);
	ncat = terralib.externfunction('strncat',{rawstring, rawstring, intptr} -> rawstring);
	dup = terralib.externfunction('strdup',rawstring -> rawstring);
	ndup = terralib.externfunction('strndup',{rawstring, intptr} -> rawstring);
	-- variadic formatters: fmt allocates its output, bfmt writes into a buffer
	fmt = terralib.externfunction('asprintf',
		terralib.types.funcpointer({&rawstring,rawstring},{int},true));
	bfmt = terralib.externfunction('sprintf',
		terralib.types.funcpointer({rawstring,rawstring},{int},true));
	-- strspn returns the length of the matching prefix (a size_t), not a
	-- pointer, so it is bound with an integer return type
	span = terralib.externfunction('strspn',{rawstring, rawstring} -> intptr);
}

-- fast-forward: returns a pointer to the first byte of `str` that is not a
-- space, tab or newline, examining at most `maxlen` bytes. a `maxlen` of 0
-- means "measure the string with strlen first".
terra m.ffw(str: &int8, maxlen: intptr)
	if maxlen == 0 then maxlen = m.sz(str) end
	var cur = str
	var left = maxlen
	while left > 0 do
		var ch = @cur
		-- NUL is not whitespace, so it ends the scan like any other byte
		if not (ch == @' ' or ch == @'\t' or ch == @'\n') then break end
		cur = cur + 1
		left = left - 1
	end
	return cur
end

do local strptr = (lib.mem.ptr(int8))
	local strref = (lib.mem.ref(int8))
	local byteptr = (lib.mem.ptr(uint8))
	-- installs the shared string methods and casts on a counted-string
	-- type; invoked below for both the pointer and the reference type
	local function install_funcs(ty)
		-- user-defined casts:
		--   constant Lua string -> ty  (length known at compile time)
		--   rawstring           -> ty  (length measured with strlen)
		--   ty                  -> rawstring
		ty.metamethods.__cast = function(from,to,e)
			local v = e:asvalue()
			if type(v) == 'string' then
				return `ty {ptr = v, ct = [#v]}
			elseif from == &int8 then
				return `ty {ptr = e, ct = m.sz(e)}
			elseif to == &int8 then
				-- must be a quote: indexing the Lua quote object itself
				-- yields nil rather than a field access expression
				return `e.ptr
			end
			-- an unsupported conversion is reported with error() so the
			-- typechecker can produce a proper diagnostic
			error('invalid cast')
		end
		-- duplicates the string into pool `p`; returns a null string when
		-- self is null. a count of 0 is replaced by the strlen of the data
		-- (this updates self.ct as a side effect).
		terra ty:pdup(p: &lib.mem.pool): strptr
			if not @self then return strptr.null() end
			if self.ct == 0 then self.ct = m.sz(self.ptr) end
			var newstr = p:alloc(int8, self.ct)
			lib.mem.cpy(newstr.ptr, self.ptr, self.ct)
			return newstr
		end
		-- equality test. two nil pointers are equal, one nil pointer is
		-- unequal. otherwise bytes are compared up to the larger of the
		-- two counts, stopping early (equal) when both hold NUL at the
		-- same index.
		-- NOTE(review): when the counts differ and the shorter buffer has
		-- no NUL terminator this reads past its end -- confirm callers
		-- always pass terminated data in that case
		terra ty:cmp(other: ty)
			if self.ptr == nil and other.ptr == nil then return true end
			if self.ptr == nil or other.ptr == nil then return false end

			var sz = lib.math.biggest(self.ct, other.ct)
			for i = 0, sz do
				if self.ptr[i] == 0 and other.ptr[i] == 0 then return true end
				if self.ptr[i] ~= other.ptr[i] then return false end
			end
			return true
		end
		-- true when the first other.ct bytes of self match `other`; a NUL
		-- in `other` ends the comparison as a match
		terra ty:startswith(other: ty)
			for i=0, other.ct do
				if other(i) == 0 then return true end
				if other(i) ~= self(i) then return false end
			end
			return true
		end
		-- returns a view of self with leading spaces/tabs/newlines skipped
		terra ty:ffw()
			var newp = m.ffw(self.ptr,self.ct)
			var newct = self.ct
			-- a count of 0 makes m.ffw measure the string itself; keep it
			-- at 0 instead of subtracting from zero and wrapping around
			if newct ~= 0 then newct = newct - (newp - self.ptr) end
			return ty { ptr = newp, ct = newct }
		end
		-- reinterprets the same memory as a counted uint8 pointer
		terra ty:blob()
			return byteptr {
				ptr = [&uint8](self.ptr);
				ct = self.ct;
			}
		end
	end
	install_funcs(strptr)
	install_funcs(strref)

	--strptr.methods.cmpl = macro(function(self,other)
	--	return `self:cmp(strptr { ptr = [other:asvalue()], ct = [#(other:asvalue())] })
	--end)
	--strref.methods.cmpl = macro(function(self,other)
	--	return `self:cmp(strref { ptr = [other:asvalue()], ct = [#(other:asvalue())] })
	--end)

	-- byte-buffer equality with the same rules as the string cmp above,
	-- including the nil-pointer handling
	terra byteptr:cmp(other: byteptr)
		if self.ptr == nil and other.ptr == nil then return true end
		if self.ptr == nil or other.ptr == nil then return false end

		var sz = lib.math.biggest(self.ct, other.ct)
		for i = 0, sz do
			if self.ptr[i] == 0 and other.ptr[i] == 0 then return true end
			if self.ptr[i] ~= other.ptr[i] then return false end
		end
		return true
	end
end

-- strips NUL and carriage-return bytes from `s` in place, compacting the
-- remaining bytes towards the front, and returns the shortened string.
-- a NUL is written after the last kept byte, so when nothing is stripped
-- the buffer must have room for one byte beyond s.ct (as before).
terra m.normalize(s: pstr)
	if s.ptr == nil then return s end
	var stop: rawstring = s.ptr + s.ct
	var c: rawstring = s.ptr -- write cursor
	var n: rawstring = s.ptr -- read cursor
	while n < stop do
		-- every read stays inside [s.ptr, stop); stripped bytes are simply
		-- not copied, including a run of them at the very end
		if @n ~= 0 and @n ~= @'\r' then
			@c = @n
			c = c + 1
		end
		n = n + 1
	end
	@c = 0
	return pstr { ptr = s.ptr, ct = c - s.ptr }
end

-- string accumulator: a growable byte buffer that strings are appended to
struct m.acc {
	-- backing buffer; nil when nothing is allocated
	buf: rawstring
	-- number of bytes appended so far
	sz: intptr
	-- growth step used when the buffer has to be enlarged
	run: intptr
	-- number of bytes currently allocated for buf
	space: intptr
	-- pool the buffer was allocated from, or nil when it lives on the heap
	pool: &lib.mem.pool
}

-- maps an ASCII uppercase letter to lowercase; any other byte is returned
-- unchanged
terra m.cdowncase(c: int8)
	if c < @'A' or c > @'Z' then return c end
	return c + (@'a' - @'A')
end

-- maps an ASCII lowercase letter to uppercase; any other byte is returned
-- unchanged
terra m.cupcase(c: int8)
	if c < @'a' or c > @'z' then return c end
	return c - (@'a' - @'A')
end

-- returns the larger of two sizes
local terra biggest(a: intptr, b: intptr)
	if b >= a then return b end
	return a
end

-- sets up a heap-backed accumulator with an initial buffer of `run` bytes.
-- a zero `run` or a failed allocation leaves the accumulator without a
-- buffer (buf nil, run and space 0) and emits a warning. returns self.
terra m.acc:init(run: intptr)
	--lib.dbg('initializing string accumulator')
	self.pool = nil
	self.buf = nil
	self.sz = 0
	if run == 0 then
		lib.warn('attempted to allocate zero-length string accumulator')
	else
		self.buf = [rawstring](lib.mem.heapa_raw(run))
		if self.buf == nil then
			lib.warn('string buffer allocation failed, very little memory availble')
		end
	end
	-- without a buffer there is no usable capacity or growth step
	if self.buf == nil then
		self.run = 0
	else
		self.run = run
	end
	self.space = self.run
	return self
end;

-- sets up a pool-backed accumulator: the initial buffer of `run` bytes is
-- taken from `pool`, which is remembered for later growth. returns self.
terra m.acc:pool(pool: &lib.mem.pool, run: intptr)
	self.pool = pool
	self.buf = [&int8](pool:alloc_bytes(run))
	self.sz = 0
	self.run = run
	self.space = run
	return self
end

-- releases the heap buffer of the accumulator. pooled accumulators are not
-- freed here (their memory belongs to the pool); a debug message is logged
-- instead.
terra m.acc:free()
	--lib.dbg('freeing string accumulator')
	if self.pool ~= nil then
		lib.dbg('attempted to free pooled string accumulator; use frame-reset instead')
		return
	end
	if self.buf ~= nil and self.space > 0 then
		lib.mem.heapf(self.buf)
		-- forget the released buffer so that a second free is a no-op and a
		-- later push re-initializes instead of writing into freed memory
		self.buf = nil
		self.space = 0
		self.sz = 0
	end
end;

-- shrinks a heap buffer down to exactly the bytes in use. pooled
-- accumulators are left untouched. returns self.
terra m.acc:crush()
	--lib.dbg('crushing string accumulator', &self.buf[0])
	if self.pool == nil then
		self.buf = [rawstring](lib.mem.heapr_raw(self.buf, self.sz))
		self.space = self.sz
	end -- for pooled buffers: no point unless at end of buffer
	return self
end;

-- crushes the accumulator and hands its buffer over to the caller as a
-- counted string; afterwards the accumulator no longer references it.
terra m.acc:finalize()
	--lib.dbg('finalizing string accumulator')
	self:crush()
	var out: lib.mem.ptr(int8)
	out.ct = self.sz
	out.ptr = self.buf
	-- detach: the returned string now refers to the buffer
	self.sz = 0
	self.buf = nil
	return out
end;

-- raises the growth step to `sz` (requests not larger than the current
-- step are ignored) and, if the free space is smaller than the new step,
-- reallocates the buffer so that `sz` bytes are free after the used part.
terra m.acc:cue(sz: intptr)
	if sz > self.run then
		var oldspace = self.space
		self.run = sz
		if self.space - self.sz < sz then
			var newspace = self.sz + sz
			self.space = newspace
			if self.pool == nil then
				self.buf = [rawstring](lib.mem.heapr_raw(self.buf, newspace))
			else
				-- the pool needs the old size as well as the new one
				self.buf = [&int8](self.pool:realloc_bytes(self.buf, oldspace, newspace))
			end
		end
	end
end

-- semantic convenience function: discards the accumulated content by
-- setting the used size back to 0; the buffer itself is kept
terra m.acc:reset() -- semantic convenience function
	self.sz = 0
end

-- appends `len` bytes of `str` and NUL-terminates the buffer. a nil `str`
-- is ignored; a `len` of 0 means "measure with strlen". an accumulator
-- without a buffer is initialized with its current growth step first.
-- returns self.
terra m.acc:push(str: rawstring, len: intptr)
	if str == nil then return self end
	if self.buf == nil then
		self:init(self.run)
		if self.buf == nil then
			lib.warn('attempted to push string onto unallocated accumulator')
			return self
		end
	end
	if len == 0 then len = m.sz(str) end
	-- grow unless there is room for the data plus the terminator
	var avail = self.space - self.sz
	if avail <= len then
		self:cue(self.space + biggest(self.run,len + 1))
	end
	var dest = self.buf + self.sz
	lib.mem.cpy(dest, str, len)
	self.sz = self.sz + len
	self.buf[self.sz] = 0
	return self
end;

-- appends `i` rendered by lib.math.decstr_friendly.
-- NOTE(review): the scratch buffer is 21 bytes; confirm decstr_friendly
-- never produces more than that
terra m.acc:dpush(i: intptr)
	var digits: int8[21]
	var first = lib.math.decstr_friendly(i, &digits[20])
	-- number of bytes between the returned start and the last buffer slot
	var count: intptr = &digits[20] - first
	return self:push(first, count)
end

-- appends `i` rendered by lib.math.decstr into a 21-byte scratch buffer
terra m.acc:ipush(i: intptr)
	var digits: int8[21]
	var first = lib.math.decstr(i, &digits[20])
	-- number of bytes between the returned start and the last buffer slot
	var count: intptr = &digits[20] - first
	return self:push(first, count)
end

-- appends the lib.math.shorthand encoding of `i`
terra m.acc:shpush(i: uint64)
	var scratch: int8[lib.math.shorthand.maxlen]
	var written = lib.math.shorthand.gen(i, &scratch[0])
	return self:push(&scratch[0], written)
end

-- macro: turns a compile-time string constant into a counted string
-- reference; an argument without a constant value yields a null reference
m.lit = macro(function(str)
	local v = str:asvalue()
	if v == nil then
		return `[lib.mem.ref(int8)] {ptr = nil, ct = 0}
	end
	return `[lib.mem.ref(int8)] {ptr = v, ct = [#v]}
end)

-- macro: turns a compile-time string constant into a counted string
-- pointer; an argument without a constant value yields a null pointer
m.plit = macro(function(str)
	local v = str:asvalue()
	if v == nil then
		return `[lib.mem.ptr(int8)] {ptr = nil, ct = 0}
	end
	return `[lib.mem.ptr(int8)] {ptr = v, ct = [#v]}
end)

-- push adaptors, all returning the accumulator:
--   lpush: compile-time string constant (length known at compile time)
--   ppush: counted string pointer
--   rpush: counted string reference
--   merge: counted string pointer that is freed after being appended
m.acc.methods.lpush = macro(function(self,str)
	local lit = str:asvalue()
	return `self:push(lit, [#lit])
end)
m.acc.methods.ppush = terra(self: &m.acc, str: lib.mem.ptr(int8))
	return self:push(str.ptr, str.ct)
end;
m.acc.methods.rpush = terra(self: &m.acc, str: lib.mem.ref(int8))
	return self:push(str.ptr, str.ct)
end;
m.acc.methods.merge = terra(self: &m.acc, str: lib.mem.ptr(int8))
	self:push(str.ptr, str.ct)
	str:free()
	return self
end;
-- shared implementation of the compose macros. `call` receives the
-- estimated total length and returns the quote that initializes the
-- accumulator; every further argument becomes one chained :push() call.
-- accepted arguments: Lua strings, string constants, (pointer, length)
-- tuples, counted int8 strings, and anything else (pushed with length 0,
-- i.e. measured at run time).
local composefn = function(call, ...)
	local minlen, pstrs = 0, {}
	-- records one piece together with its contribution to the size estimate
	local function add(s, l, guess)
		pstrs[#pstrs+1] = {str = s, len = l}
		minlen = minlen + guess
	end
	for _,v in ipairs{...} do
		local kind = type(v)
		if kind == 'string' then
			add(v, #v, #v + 1)
		elseif kind == 'table' then
			-- 16 is a wild guess for pieces whose length is unknown here
			if v.tree and v.tree.type.convertible == 'tuple' then
				add(`v._0, `v._1, 16)
			elseif v.asvalue and type(v:asvalue()) == 'string' then
				local str = v:asvalue()
				add(str, #str, #str + 1)
			elseif v.tree and v.tree.type.ptr_basetype == int8 then
				add(`v.ptr, `v.ct, 16)
			else
				add(v, 0, 16)
			end
		else error('invalid type in compose expression') end
	end
	local chain = call(minlen) --`self:init(minlen)
	for _,p in ipairs(pstrs) do
		chain = `[chain]:push([p.str],[p.len])
	end
	return chain
end
-- macro: initializes a heap-backed accumulator sized from the arguments
-- and pushes each argument onto it
m.acc.methods.compose = macro(function(self, ...)
	local function start(minlen)
		return `self:init(minlen)
	end
	return composefn(start, ...)
end)
-- macro: like compose, but the accumulator is allocated from `pool`
m.acc.methods.pcompose = macro(function(self, pool, ...)
	local function start(minlen)
		return `self:pool(pool,minlen)
	end
	return composefn(start, ...)
end)

-- `acc << x` appends x: a rawstring is measured at run time (length 0),
-- a counted string pointer is pushed with its own length
m.acc.metamethods.__lshift = terralib.overloadedfunction('(<<)', {
	terra(self: &m.acc, str: rawstring)
		return self:push(str, 0)
	end;
	terra(self: &m.acc, str: lib.mem.ptr(int8))
		return self:ppush(str)
	end;
})

-- memoized type constructor: a `ty` object followed by a zero-length byte
-- array marking where extra storage begins
m.box = terralib.memoize(function(ty)
	local b = struct {
		obj: ty
		storage: int8[0]
	}
	b.name = ('bytebox<%s>'):format(ty.name)
	-- allocates a box on the heap with `sz` bytes of trailing storage
	b.methods.mk = terra(sz: intptr)
		var total = sizeof(b) + sz
		return [&b](lib.mem.heapa_raw(total))
	end
	-- releases the box
	b.methods.free = terra(self: &b) lib.mem.heapf(self) end -- enhhhhh
	return b
end)

-- builds a quote that heap-allocates one box holding a `ty` object plus
-- trailing storage, copies every string in `vals` into that storage and
-- points the field of the same name at the copy. the quote evaluates to a
-- lib.mem.ptr(ty) with ct = 1 referring to the boxed object.
--
-- `vals` maps field names to one of:
--   * a Lua string or string constant      (copied with its terminator)
--   * a counted int8/uint8 string          (copied by count)
--   * a {pointer, size} pair               (copied by size)
--   * any other expression, taken to be a NUL-terminated string
-- a value that is nil at run time nulls the field instead of copying.
m.encapsulate = function(ty, vals)
	local memreq_const = sizeof(ty)
	local ptr = symbol(&int8)
	local box = symbol(&m.box(ty))
	local memreq_exp = `0
	local copiers = {}
	for k,v in pairs(vals) do
		-- type of the destination field (shadows the object type on purpose)
		local ty = (`box.obj.[k]).tree.type
		local kp
		local isnull, nullify
		if ty.ptr_basetype then
			kp = quote [box].obj.[k] = [ty] { ptr = [&ty.ptr_basetype]([ptr]) } ; end
			nullify = quote [box].obj.[k] = [ty] { ptr = nil, ct = 0 } end
		else
			kp = quote [box].obj.[k] = [ty]([ptr]) ; end
			nullify = quote [box].obj.[k] = nil end
		end

		local cpy
		if type(v) ~= 'table' or #v ~= 2 then
			-- stpcpy returns the address of the terminator it wrote; step
			-- past it so the next copy does not overwrite that NUL
			cpy = quote [kp] ; [ptr] = m.cpy(ptr, v) + 1 end
			isnull = `v == nil
		end
		if type(v) == 'string' then
			memreq_const = memreq_const + #v + 1
			isnull = `false
		elseif type(v) == 'table' and v.tree and (v.tree.type.ptr_basetype == int8 or v.tree.type.ptr_basetype == uint8) then
			-- reserve the bytes that the copy below writes
			memreq_exp = `[v].ct + [memreq_exp]
			cpy = quote [kp]; [ptr] = [&int8](lib.mem.cpy([ptr], [v].ptr, [v].ct)) end
			if ty.ptr_basetype then
				cpy = quote [cpy]; [box].obj.[k].ct = [v].ct end
			end
			isnull = `[v].ptr == nil
		elseif type(v) == 'table' and v.asvalue and type(v:asvalue()) == 'string' then
			local str = tostring(v:asvalue())
			memreq_const = memreq_const + #str + 1
			isnull = `false
		elseif type(v) == 'table' and #v == 2 then
			local str,sz = v[1],v[2]
			if type(sz) == 'number' then
				memreq_const = memreq_const + sz
			elseif type(sz:asvalue()) == 'number' then
				memreq_const = memreq_const + sz:asvalue()
			else memreq_exp = `[sz] + [memreq_exp] end

			cpy = quote [kp] ;
				--lib.io.fmt('encapsulating string %p → %p [%s] sz %llu\n', str, [ptr], str, sz)
				[ptr] = [&int8](lib.mem.cpy([ptr], str, sz))
				--lib.io.fmt(' :: encapsulated string %p [%s]\n', box.obj.[k],box.obj.[k])
			end
			if ty.ptr_basetype then
				cpy = quote [cpy]; [box].obj.[k].ct = sz end
			end
			isnull = `[str] == nil
		else
			-- the size is computed before the null check below runs, so
			-- strlen must not be applied to a nil string here
			memreq_exp = quote
				var need: intptr = 0
				if v ~= nil then need = m.sz(v) + 1 end -- make room for NUL
			in need + [memreq_exp] end
			isnull = `v == nil
			if ty.ptr_basetype then
				cpy = quote [cpy]; [box].obj.[k].ct = m.sz(v) end
			end
		end

		copiers[#copiers + 1] = quote
			if [isnull] then [nullify]
			            else [cpy] end
		end
	end

	return quote
		var sz: intptr = memreq_const + [memreq_exp]
		var [box] = [&m.box(ty)](lib.mem.heapa_raw(sz))
		var [ptr] = [box].storage
		[copiers]
	in [lib.mem.ptr(ty)] { ct = 1, ptr = &([box].obj) } end
end

-- scans at most min(maxlen, str.ct) bytes of `str` and returns the index of
-- the first byte that also occurs in `reject`. returns 0 when a NUL byte is
-- reached first and `maxlen` when the scan ends without a hit.
-- NOTE(review): 0 is also the result for a hit at index 0, and `maxlen` may
-- exceed str.ct -- confirm callers expect both
terra m.cspan(str: lib.mem.ptr(int8), reject: lib.mem.ref(int8), maxlen: intptr)
	var limit = lib.math.smallest(maxlen,str.ct)
	for i=0, limit do
		var ch = str.ptr[i]
		if ch == 0 then return 0 end
		for j=0, reject.ct do
			if reject.ptr[j] == ch then return i end
		end
	end
	return maxlen
end

-- returns a pointer to the first byte of `str` that is not a space, tab or
-- newline. there is no length limit: the scan relies on reaching a byte
-- (such as the terminating NUL) that is not whitespace.
terra m.ffw_unsafe(str: &int8)
	var p = str
	while @p == @' ' or @p == @'\t' or @p == @'\n' do
		p = p + 1
	end
	return p
end

-- searches `haystack` for the first occurrence of `needle`. returns the
-- rest of the haystack starting at the match, or a null string when there
-- is no match.
terra m.find(haystack: pstr, needle: pstr): pstr
	-- nothing can match in an empty haystack or with a needle longer than it
	if haystack.ct == 0 or needle.ct > haystack.ct then return pstr.null() end
	-- only start positions that leave room for the whole needle are tried,
	-- so haystack(i + j) never indexes beyond haystack.ct
	for i=0, haystack.ct - needle.ct + 1 do
		for j=0, needle.ct do
			if haystack(i + j) ~= needle(j) then goto nomatch end
		end
		do return pstr {
			ptr = haystack.ptr + i;
			ct = haystack.ct - i;
		} end
	::nomatch::end
	return pstr.null()
end

-- splits `str` at every occurrence of `delim` and returns a crushed vector
-- of slices pointing into the original string. `expect` is the initial
-- capacity of the vector. an empty delimiter yields the whole string as
-- the only slice.
terra m.splitmap(str: pstr, delim: pstr, expect: uint16)
	var vec: lib.mem.vec(pstr) vec:init(expect)
	var start = pstr{str.ptr, str.ct}
	-- an empty delimiter matches without consuming anything, which would
	-- push empty slices forever; skip the loop in that case
	while delim.ct > 0 do
		var n = m.find(start, delim)
		if not n then break end
		-- the slice is everything before the match
		vec:push(pstr {ptr = start.ptr, ct = start.ct - n.ct})
		-- continue after the delimiter
		n.ptr = n.ptr + delim.ct
		n.ct = n.ct - delim.ct
		start = n
	end
	vec:push(start)
	return vec:crush()
end

-- reads one token from the start of `str`. the token ends at an unquoted
-- `delim`, at a NUL byte, at the end of the string or, when `brkspace` is
-- set, at unquoted whitespace. a backslash takes the next byte literally;
-- single and double quotes group text and are not part of the token.
-- returns the token (heap-allocated), the index at which scanning stopped,
-- and whether whitespace ended the token. an unterminated quote yields a
-- null string, 0 and false.
terra m.toknext(str: m.t, delim: int8, brkspace: bool): {pstr,intptr,bool}
	var b: m.acc b:init(48)
	var mode: int8 = 0 -- 0: unquoted, 1: inside "...", 2: inside '...'
	var esc = false
	var spacebroke = false
	var max: intptr = 0 -- same width as the index it records
	for i=0, str.ct do
		max = i
		if str(i) == 0         then break
		elseif esc == true     then b:push(str.ptr + i,1) esc = false
		elseif str(i) == @'\\' then esc = true

		elseif mode == 0 and str(i) == delim then break
		elseif mode ~= 2 and str(i) == @'"'  then
			if mode == 1
				then mode = 0
				else mode = 1
			end
		elseif mode ~= 1 and str(i) == @"'" then
			if mode == 2
				then mode = 0
				else mode = 2
			end

		elseif brkspace and mode == 0 and (
			str(i) == @' ' or str(i) == @'\t' or
			str(i) == @'\r' or str(i) == @'\n') then
			spacebroke = true
			break

		else b:push(str.ptr + i,1) end
	end
	if mode ~= 0 then
		-- the partial token is discarded; release its buffer instead of
		-- leaking it
		b:free()
		return m.t.null(), 0, false
	end

	return b:finalize(), max, spacebroke
end

-- examines the start of `start` and decides whether it is a vowel to drop.
-- returns (replacement, rest): the replacement is a null string when a
-- vowel was removed and the current byte otherwise; the rest is `start`
-- advanced past the consumed bytes (1, 2 or 3).
local terra disemvowel_codepoint(start: pstr): {pstr, pstr}
 -- TODO rewrite this in a more generative way -- it should be possible
 -- to have a long string listing vowels and generate all the necessary
 -- code based on that
	var dslen: intptr = 0
	var repl = pstr.null()

	-- plain ASCII vowels, either case
	var adc = m.cdowncase(start(0))
	if adc == @'a' or adc == @'e'
	or adc == @'i' or adc == @'o'
	or adc == @'u' then
		dslen = 1 goto done
	end

	-- accented vowels and ligatures that take two bytes in UTF-8
	if start.ct >= 2 then
		var tb = pstr { start.ptr, 2 }
		if tb:cmp('ä') or tb:cmp('ë') or tb:cmp('ï')
		or tb:cmp('Ä') or tb:cmp('Ë') or tb:cmp('Ï')
		or tb:cmp('ö') or tb:cmp('ü') -- do you have papers for these vowels
		or tb:cmp('Ö') or tb:cmp('Ü') -- no ID on you, sir

		or tb:cmp('á') or tb:cmp('é') or tb:cmp('í')
		or tb:cmp('Á') or tb:cmp('É') or tb:cmp('Í')
		or tb:cmp('ó') or tb:cmp('ú') 
		or tb:cmp('Ó') or tb:cmp('Ú')

		or tb:cmp('à') or tb:cmp('è') or tb:cmp('ì')
		or tb:cmp('À') or tb:cmp('È') or tb:cmp('Ì')
		or tb:cmp('ò') or tb:cmp('ù') -- not so fast,
		or tb:cmp('Ò') or tb:cmp('Ù') -- "boss"

		or tb:cmp('ā') or tb:cmp('ē') or tb:cmp('ī')
		or tb:cmp('Ā') or tb:cmp('Ē') or tb:cmp('Ī')
		or tb:cmp('ō') or tb:cmp('ū') -- take that latin
		or tb:cmp('Ō') or tb:cmp('Ū') -- and also hawaiian

		or tb:cmp('æ') or tb:cmp('Æ') -- sorry elon
		or tb:cmp('œ') or tb:cmp('Œ') -- good heavens
		or tb:cmp('ij') or tb:cmp('IJ') -- ok wtf dutch

		then dslen = 2 goto done end
	end

	if start.ct >= 3 then
		var s = [&uint8](start.ptr) -- for safe unicode comparisons, bc char is dumb
		-- eliminate kana. both byte ranges need the 0xe3 lead byte, so the
		-- whole alternative is parenthesized; `and` binds tighter than `or`
		if s[0] == 0xe3 and (
			((s[1] == 0x81 and s[2] >= 0x81) or
			 (s[1] == 0x82 and s[2] <= 0x96)) or
			((s[1] == 0x82 and s[2] >= 0xa1) or
			 (s[1] == 0x83 and s[2] <= 0xb6)))
		then dslen = 3 goto done end
	end
	-- TODO handle more nonroman scripts
	-- maybe remove consonant pointing in arabic??
	-- i guess remove vowels from devanagari
	-- no idea what to do about chinese

	-- no disemvoweling applied, return the current byte as is
	repl = pstr { ptr = start.ptr, ct = 1 }
	dslen = 1

	::done::
	start:advance(dslen)
	-- debug trace, disabled: it printed once per codepoint and passed 64-bit
	-- counts where %.*s expects an int precision
	--lib.io.fmt('applied %llu bytes of disemvowelling; adding "%.*s"; continuing with "%.*s"\n', dslen, repl.ct, repl.ptr, start.ct, start.ptr)
	return repl, start
end

-- returns a copy of `str`, allocated from `pool`, with the vowels
-- recognized by disemvowel_codepoint removed. a count of 0 means the
-- string is measured with strlen.
terra m.disemvowel(pool: &lib.mem.pool, str: m.t): m.t
	-- resolve the length first so the accumulator is sized from the real
	-- length rather than from a count of 0
	if str.ct == 0 then str.ct = m.sz(str.ptr) end
	var acc: m.acc acc:pool(pool, str.ct)
	var cur = str
	while cur.ct > 0 do
		var add, cont = disemvowel_codepoint(cur)
		-- a null replacement means the codepoint was dropped
		if add:ref() then acc:ppush(add) end
		cur = cont
	end
	return acc:finalize()
end

-- appends `str` with double quotes, backslashes, newlines and tabs
-- backslash-escaped and other control bytes written as \u00XX. when
-- `wrap` is set the result is enclosed in double quotes. returns self.
terra m.acc:qesc(str: m.t, wrap: bool)
 -- escape double-quotes
	if wrap then self:lpush '"' end
	for i=0, str.ct do
		if     str(i) == @'"'  then self:lpush '\\"'
		elseif str(i) == @'\\' then self:lpush '\\\\'
		elseif str(i) == @'\n' then self:lpush '\\n'
		elseif str(i) == @'\t' then self:lpush '\\t'
		-- bytes are signed: without the lower bound every byte >= 0x80
		-- (all UTF-8 multibyte sequences) would count as a control byte
		elseif str(i) >= 0 and str(i) < 0x20 then -- for json
			var hex = lib.math.hexbyte(str(i))
			self:lpush('\\u00'):push(&hex[0], 2)
		else   self:push(str.ptr + i,1) end
	end
	if wrap then self:lpush '"' end
	return self
end

-- convenience function: escapes `str` into a fresh accumulator allocated
-- from `pool` (initially sized at one and a half times the input plus 2
-- bytes) and returns the finished string
terra m.qesc(pool: &lib.mem.pool, str: m.t, wrap: bool): m.t
	var out: m.acc
	out:pool(pool, 2 + str.ct + str.ct/2)
	out:qesc(str, wrap)
	return out:finalize()
end

-- appends `str` in escaped form without surrounding quotes; returns self
terra m.acc:qpush(str: m.t)
 -- convenience adaptor
	return self:qesc(str, false)
end

return m