-- vim: ft=terra
-- string.t: string classes
local util = lib.util
local pstr = lib.mem.ptr(int8)
local pref = lib.mem.ref(int8)
-- Module table: the counted string types plus thin bindings to the C string
-- library. Sizes and lengths are declared as intptr throughout.
local m = {
	-- lib.mem.ptr(int8) and lib.mem.ref(int8) respectively
	t = pstr, ref = pref;
	sz = terralib.externfunction('strlen', rawstring -> intptr);
	cmp = terralib.externfunction('strcmp', {rawstring, rawstring} -> int);
	ncmp = terralib.externfunction('strncmp', {rawstring, rawstring, intptr} -> int);
	-- note: cpy/ncpy bind stpcpy/stpncpy, which return a pointer to the END
	-- of the copied string (its terminating NUL), not to the destination
	cpy = terralib.externfunction('stpcpy',{rawstring, rawstring} -> rawstring);
	ncpy = terralib.externfunction('stpncpy',{rawstring, rawstring, intptr} -> rawstring);
	cat = terralib.externfunction('strcat',{rawstring, rawstring} -> rawstring);
	ncat = terralib.externfunction('strncat',{rawstring, rawstring, intptr} -> rawstring);
	dup = terralib.externfunction('strdup',rawstring -> rawstring);
	ndup = terralib.externfunction('strndup',{rawstring, intptr} -> rawstring);
	-- variadic printf-family bindings (the trailing `true` marks varargs)
	fmt = terralib.externfunction('asprintf',
		terralib.types.funcpointer({&rawstring,rawstring},{int},true));
	bfmt = terralib.externfunction('sprintf',
		terralib.types.funcpointer({rawstring,rawstring},{int},true));
	-- strspn returns the LENGTH of the matching prefix (size_t), not a
	-- pointer, so it is declared like strlen above
	span = terralib.externfunction('strspn',{rawstring, rawstring} -> intptr);
}
-- Fast-forward over leading whitespace (space, tab, newline).
-- Examines at most `maxlen` bytes and stops at a NUL; a `maxlen` of 0 means
-- "measure the string with strlen first". Returns a pointer to the first
-- byte that was not skipped.
terra m.ffw(str: &int8, maxlen: intptr)
	if maxlen == 0 then maxlen = m.sz(str) end
	var cur = str
	var left = maxlen
	while left > 0 do
		var ch = @cur
		if ch == 0 then break end
		if not (ch == @' ' or ch == @'\t' or ch == @'\n') then break end
		cur = cur + 1
		left = left - 1
	end
	return cur
end
do local strptr = (lib.mem.ptr(int8))
local strref = (lib.mem.ref(int8))
local byteptr = (lib.mem.ptr(uint8))
-- Installs the shared string behaviour (cast rules plus the pdup, cmp,
-- startswith, ffw and blob methods) on the counted-string type `ty`.
-- It is applied below to both strptr and strref.
local function install_funcs(ty)
	-- User-defined conversions:
	--   Lua string constant -> ty   (length known at compile time)
	--   &int8               -> ty   (length measured with strlen at runtime)
	--   ty                  -> &int8 (yields the raw pointer)
	-- Per the Terra API, __cast must return a quote on success and raise a
	-- Lua error when no conversion applies.
	ty.metamethods.__cast = function(from,to,e)
		local v = e:asvalue()
		if type(v) == 'string' then
			return `ty {ptr = v, ct = [#v]}
		elseif from == &int8 then
			return `ty {ptr = e, ct = m.sz(e)}
		elseif to == &int8 then
			-- must be a quote; a bare `e.ptr` would be a Lua field lookup
			-- on the quote object and evaluate to nil
			return `e.ptr
		end
		error('invalid cast from ' .. tostring(from) .. ' to ' .. tostring(to))
	end
	-- Duplicates the string into pool `p`. Returns a null strptr when self
	-- is null. A ct of 0 is first replaced by the strlen of the data.
	terra ty:pdup(p: &lib.mem.pool): strptr
		if not @self then return strptr.null() end
		if self.ct == 0 then self.ct = m.sz(self.ptr) end
		var newstr = p:alloc(int8, self.ct)
		lib.mem.cpy(newstr.ptr, self.ptr, self.ct)
		return newstr
	end
	-- Equality test. Two nil pointers are equal; one nil pointer is not.
	-- Comparison stops at the first position where both strings read NUL.
	-- A string with a nonzero ct is never read at or beyond ct (it is
	-- treated as NUL there); a ct of 0 means the length is unknown and the
	-- data is read until its terminator.
	terra ty:cmp(other: ty)
		if self.ptr == nil and other.ptr == nil then return true end
		if self.ptr == nil or other.ptr == nil then return false end
		var sz = lib.math.biggest(self.ct, other.ct)
		for i = 0, sz do
			var a: int8 = 0
			var b: int8 = 0
			if self.ct == 0 or i < self.ct then a = self.ptr[i] end
			if other.ct == 0 or i < other.ct then b = other.ptr[i] end
			if a == 0 and b == 0 then return true end
			if a ~= b then return false end
		end
		return true
	end
	-- True when the first other.ct bytes of `other` (or fewer, if a NUL is
	-- reached in `other`) match the start of self.
	terra ty:startswith(other: ty)
		for i=0, other.ct do
			if other(i) == 0 then return true end
			if other(i) ~= self(i) then return false end
		end
		return true
	end
	-- Returns a view of self with leading whitespace skipped (see m.ffw).
	terra ty:ffw()
		var newp = m.ffw(self.ptr,self.ct)
		var newct = self.ct - (newp - self.ptr)
		return ty { ptr = newp, ct = newct }
	end
	-- Reinterprets the same memory as an unsigned byte pointer.
	terra ty:blob()
		return byteptr {
			ptr = [&uint8](self.ptr);
			ct = self.ct;
		}
	end
end
install_funcs(strptr)
install_funcs(strref)
--strptr.methods.cmpl = macro(function(self,other)
-- return `self:cmp(strptr { ptr = [other:asvalue()], ct = [#(other:asvalue())] })
--end)
--strref.methods.cmpl = macro(function(self,other)
-- return `self:cmp(strref { ptr = [other:asvalue()], ct = [#(other:asvalue())] })
--end)
-- Equality test for byte blobs, mirroring the string cmp installed above:
-- two nil pointers are equal, one nil pointer is not, and comparison stops
-- at the first position where both sides read 0. A side with a nonzero ct
-- is never read at or beyond ct (it is treated as 0 there); a ct of 0 is
-- treated as "length unknown", as elsewhere in this file.
terra byteptr:cmp(other: byteptr)
	if self.ptr == nil and other.ptr == nil then return true end
	if self.ptr == nil or other.ptr == nil then return false end
	var sz = lib.math.biggest(self.ct, other.ct)
	for i = 0, sz do
		var a: uint8 = 0
		var b: uint8 = 0
		if self.ct == 0 or i < self.ct then a = self.ptr[i] end
		if other.ct == 0 or i < other.ct then b = other.ptr[i] end
		if a == 0 and b == 0 then return true end
		if a ~= b then return false end
	end
	return true
end
end
-- Strips every NUL and carriage-return byte from `s` in place, compacting
-- the remaining bytes toward the front, and returns a pstr over the same
-- buffer with the shortened length.
-- A terminating NUL is written directly after the kept bytes; when nothing
-- was stripped that write lands at s.ptr[s.ct], so the buffer must have
-- room for a terminator there.
terra m.normalize(s: pstr)
	var stop: rawstring = s.ptr + s.ct
	var c: rawstring = s.ptr -- write cursor
	var n: rawstring = s.ptr -- read cursor; never dereferenced at or past stop
	while n < stop do
		if @n ~= 0 and @n ~= @'\r' then
			@c = @n
			c = c + 1
		end
		n = n + 1
	end
	@c = 0
	return pstr { ptr = s.ptr, ct = c - s.ptr }
end
-- String accumulator: a growable byte buffer that strings are appended to.
-- Backed either by the heap (init) or by a memory pool (pool).
struct m.acc {
-- backing buffer; nil when unallocated or after finalize()
buf: rawstring
-- number of bytes currently stored in buf
sz: intptr
-- allocation step: the initial size request, raised by cue()
run: intptr
-- number of bytes currently allocated for buf
space: intptr
-- pool the buffer was allocated from, or nil for a heap-backed buffer
pool: &lib.mem.pool
}
-- Lowercases a single ASCII letter; every other byte is returned unchanged.
terra m.cdowncase(c: int8)
	var isupper = c >= @'A' and c <= @'Z'
	if not isupper then return c end
	return c + (@'a' - @'A')
end
-- Uppercases a single ASCII letter; every other byte is returned unchanged.
terra m.cupcase(c: int8)
	var islower = c >= @'a' and c <= @'z'
	if not islower then return c end
	return c - (@'a' - @'A')
end
-- Returns the larger of two sizes.
local terra biggest(a: intptr, b: intptr)
	if b >= a then return b end
	return a
end
-- Sets up a heap-backed accumulator with an initial buffer of `run` bytes.
-- A zero `run` or a failed allocation logs a warning and leaves the
-- accumulator unallocated (buf nil, run and space 0). Returns self.
terra m.acc:init(run: intptr)
	self.pool = nil
	self.buf = nil
	if run ~= 0 then
		self.buf = [rawstring](lib.mem.heapa_raw(run))
		if self.buf == nil then
			lib.warn('string buffer allocation failed, very little memory availble')
		end
	else
		lib.warn('attempted to allocate zero-length string accumulator')
	end
	self.run = lib.trn(self.buf == nil, 0, run)
	self.space = self.run
	self.sz = 0
	return self
end;
-- Sets up a pool-backed accumulator whose initial buffer of `run` bytes is
-- taken from `pool`. Returns self.
terra m.acc:pool(pool: &lib.mem.pool, run: intptr)
	self.pool = pool
	self.run = run
	self.space = run
	self.sz = 0
	self.buf = [&int8](pool:alloc_bytes(run))
	return self
end
-- Releases a heap-backed buffer. Pool-backed accumulators are not freed
-- here; the attempt is only logged.
terra m.acc:free()
	if self.pool == nil then
		if self.space > 0 and self.buf ~= nil then
			lib.mem.heapf(self.buf)
		end
	else
		lib.dbg('attempted to free pooled string accumulator; use frame-reset instead')
	end
end;
-- Shrinks a heap-backed buffer so that exactly sz bytes are allocated.
-- Pool-backed accumulators are returned untouched. Returns self.
terra m.acc:crush()
	if self.pool == nil then
		self.buf = [rawstring](lib.mem.heapr_raw(self.buf, self.sz))
		self.space = self.sz
	end
	return self
end;
-- Hands the accumulated bytes over to the caller: crushes the buffer,
-- returns it as a lib.mem.ptr(int8) of length sz, and detaches it from the
-- accumulator (buf becomes nil, sz becomes 0).
terra m.acc:finalize()
	self:crush()
	var out = [lib.mem.ptr(int8)] { ptr = self.buf, ct = self.sz }
	self.buf = nil
	self.sz = 0
	return out
end;
-- Raises the allocation step to `sz` (requests not larger than the current
-- step are ignored) and, if fewer than `sz` bytes are free, reallocates the
-- buffer to hold the current contents plus `sz` bytes.
terra m.acc:cue(sz: intptr)
	if sz > self.run then
		var oldspace = self.space
		self.run = sz
		if self.space - self.sz < self.run then
			self.space = self.sz + self.run
			if self.pool == nil then
				self.buf = [rawstring](lib.mem.heapr_raw(self.buf, self.space))
			else
				self.buf = [&int8](self.pool:realloc_bytes(self.buf, oldspace, self.space))
			end
		end
	end
end
-- Discards the accumulated contents by rewinding the length to zero.
-- The buffer itself and its allocated space are left as they are.
terra m.acc:reset() -- semantic convenience function
self.sz = 0
end
-- Appends `len` bytes of `str` to the accumulator and NUL-terminates the
-- result. A `len` of 0 means "measure str with strlen". A nil `str` is
-- ignored. An unallocated accumulator is first initialised with its current
-- run size; if that fails a warning is logged and nothing is appended.
-- Returns self.
terra m.acc:push(str: rawstring, len: intptr)
	if str == nil then return self end
	if self.buf == nil then
		self:init(self.run)
		if self.buf == nil then
			lib.warn('attempted to push string onto unallocated accumulator')
			return self
		end
	end
	if len == 0 then len = m.sz(str) end
	var avail = self.space - self.sz
	if len >= avail then
		-- grow by at least the current step, and by enough for len plus NUL
		self:cue(self.space + biggest(self.run,len + 1))
	end
	lib.mem.cpy(self.buf + self.sz, str, len)
	self.sz = self.sz + len
	self.buf[self.sz] = 0
	return self
end;
-- Appends the text produced by lib.math.decstr_friendly for `i`.
-- The digits are rendered into a 21-byte scratch buffer starting from its
-- last slot; the returned pointer marks where the text begins.
terra m.acc:dpush(i: intptr)
	var digits: int8[21]
	var first = lib.math.decstr_friendly(i, &digits[20])
	var used: intptr = [digits.type.N] - (first - &digits[0])
	return self:push(first, used - 1)
end
-- Appends the text produced by lib.math.decstr for `i`.
-- The digits are rendered into a 21-byte scratch buffer starting from its
-- last slot; the returned pointer marks where the text begins.
terra m.acc:ipush(i: intptr)
	var digits: int8[21]
	var first = lib.math.decstr(i, &digits[20])
	var used: intptr = [digits.type.N] - (first - &digits[0])
	return self:push(first, used - 1)
end
-- Appends the text that lib.math.shorthand.gen writes for `i`, using a
-- scratch buffer of lib.math.shorthand.maxlen bytes.
terra m.acc:shpush(i: uint64)
	var scratch: int8[lib.math.shorthand.maxlen]
	var written = lib.math.shorthand.gen(i, &scratch[0])
	return self:push(&scratch[0], written)
end
-- Macro: turns a compile-time string constant into a lib.mem.ref(int8)
-- whose length is computed at compile time. An argument with no constant
-- value yields a ref with a nil pointer and zero length.
m.lit = macro(function(str)
	local v = str:asvalue()
	if v == nil then
		return `[lib.mem.ref(int8)] {ptr = nil, ct = 0}
	end
	return `[lib.mem.ref(int8)] {ptr = [v], ct = [#v]}
end)
-- Macro: turns a compile-time string constant into a lib.mem.ptr(int8)
-- whose length is computed at compile time. An argument with no constant
-- value yields a ptr with a nil pointer and zero length.
m.plit = macro(function(str)
	local v = str:asvalue()
	if v == nil then
		return `[lib.mem.ptr(int8)] {ptr = nil, ct = 0}
	end
	return `[lib.mem.ptr(int8)] {ptr = [v], ct = [#v]}
end)
-- Macro: pushes a compile-time string constant, passing its length as a
-- constant so no strlen is needed at runtime.
m.acc.methods.lpush = macro(function(self,str)
	local lit = str:asvalue()
	return `self:push([lit], [#lit])
end)
-- Pushes the contents of a lib.mem.ptr(int8); returns the accumulator.
m.acc.methods.ppush = terra(self: &m.acc, str: lib.mem.ptr(int8))
	self:push(str.ptr, str.ct)
	return self
end;
-- Pushes the contents of a lib.mem.ref(int8); returns the accumulator.
m.acc.methods.rpush = terra(self: &m.acc, str: lib.mem.ref(int8))
	self:push(str.ptr, str.ct)
	return self
end;
-- Pushes the contents of a lib.mem.ptr(int8) and then frees that string;
-- returns the accumulator.
m.acc.methods.merge = terra(self: &m.acc, str: lib.mem.ptr(int8))
	self:push(str.ptr, str.ct)
	str:free()
	return self
end;
-- Shared implementation of the compose/pcompose macros.
-- `call` is a function that, given a size estimate, returns the quote that
-- initialises the accumulator. The remaining arguments are the pieces to
-- append; each may be a Lua string, a quote holding a string constant, a
-- tuple-typed quote (read as pointer, length), a quote of a pointer type
-- over int8 (read as .ptr, .ct), or any other quote (pushed with length 0,
-- i.e. measured at runtime). Returns one chained quote: the initialiser
-- followed by one :push per piece.
local composefn = function(call, ...)
	local minlen = 0
	local parts = {}
	local function add(str, len)
		parts[#parts + 1] = {str = str, len = len}
	end
	for _, v in ipairs{...} do
		local kind = type(v)
		if kind == 'table' then
			local guess = 16 -- size estimate when the length is not known here
			if v.tree and v.tree.type.convertible == 'tuple' then
				add(`v._0, `v._1)
			elseif v.asvalue and type(v:asvalue()) == 'string' then
				local s = v:asvalue()
				add(s, #s)
				guess = #s + 1
			elseif v.tree and v.tree.type.ptr_basetype == int8 then
				add(`v.ptr, `v.ct)
			else
				add(v, 0)
			end
			minlen = minlen + guess
		elseif kind == 'string' then
			add(v, #v)
			minlen = minlen + #v + 1
		else
			error('invalid type in compose expression')
		end
	end
	local chain = call(minlen)
	for _, part in ipairs(parts) do
		chain = `[chain]:push([part.str],[part.len])
	end
	return chain
end
-- Macro: initialises a heap-backed accumulator sized from the estimated
-- total length and pushes every argument onto it (see composefn).
m.acc.methods.compose = macro(function(self, ...)
	local function start(minlen)
		return `self:init(minlen)
	end
	return composefn(start, ...)
end)
-- Macro: initialises a pool-backed accumulator in `pool`, sized from the
-- estimated total length, and pushes every remaining argument onto it
-- (see composefn).
m.acc.methods.pcompose = macro(function(self, pool, ...)
	local function start(minlen)
		return `self:pool(pool,minlen)
	end
	return composefn(start, ...)
end)
-- `acc << x` appends x to the accumulator. Two overloads:
--   rawstring          -> push with length 0 (measured with strlen)
--   lib.mem.ptr(int8)  -> ppush (uses the stored length)
m.acc.metamethods.__lshift = terralib.overloadedfunction('(<<)', {
terra(self: &m.acc, str: rawstring) return self: push(str,0) end;
terra(self: &m.acc, str: lib.mem.ptr(int8)) return self:ppush(str ) end;
})
-- Memoized type constructor: for a type `ty`, builds a struct holding one
-- `ty` value followed by a zero-length int8 array that marks where extra
-- bytes allocated behind the object begin.
--   box.mk(sz) heap-allocates the struct plus `sz` trailing bytes
--   box:free() releases that allocation
m.box = terralib.memoize(function(ty)
	local box = struct {
		obj: ty
		storage: int8[0]
	}
	box.name = string.format('bytebox<%s>', ty.name)
	box.methods.mk = terra(sz: intptr)
		return [&box](lib.mem.heapa_raw(sizeof(box) + sz))
	end
	box.methods.free = terra(self: &box)
		lib.mem.heapf(self) -- enhhhhh
	end
	return box
end)
-- Builds a quote that allocates a single heap block containing a `ty`
-- object followed by copies of the strings in `vals`, and points the
-- object's fields at those copies.
--
-- `vals` maps field names of `ty` to the data to store. Each value may be:
--   * a Lua string or a quote holding a string constant
--   * a quote of a pointer type over int8/uint8 (copied via .ptr/.ct)
--   * a two-element table {str, sz} (sz a Lua number or a quote)
--   * any other quote, treated as a NUL-terminated rawstring
-- A value that is nil at runtime nulls the corresponding field instead.
--
-- The quote evaluates to a lib.mem.ptr(ty) with ct = 1 pointing at the
-- object inside the box.
m.encapsulate = function(ty, vals)
	local memreq_const = sizeof(ty) -- bytes known at compile time
	local ptr = symbol(&int8) -- write cursor into the box's storage
	local box = symbol(&m.box(ty))
	local memreq_exp = `0 -- bytes only known at runtime
	local copiers = {}
	for k,v in pairs(vals) do
		local ty = (`box.obj.[k]).tree.type -- type of the destination field
		local kp -- quote pointing the field at the cursor
		local isnull, nullify
		if ty.ptr_basetype then
			kp = quote [box].obj.[k] = [ty] { ptr = [&ty.ptr_basetype]([ptr]) } ; end
			nullify = quote [box].obj.[k] = [ty] { ptr = nil, ct = 0 } end
		else
			kp = quote [box].obj.[k] = [ty]([ptr]) ; end
			nullify = quote [box].obj.[k] = nil end
		end
		local cpy
		if type(v) ~= 'table' or #v ~= 2 then
			-- m.cpy is stpcpy, which returns a pointer to the terminating
			-- NUL it wrote; step past it so the next value does not
			-- overwrite this string's terminator (the +1 byte per string
			-- is already included in the size computed below)
			cpy = quote [kp] ; [ptr] = m.cpy(ptr, v) + 1 end
			isnull = `v == nil
		end
		if type(v) == 'string' then
			memreq_const = memreq_const + #v + 1
			isnull = `false
		elseif type(v) == 'table' and v.tree and (v.tree.type.ptr_basetype == int8 or v.tree.type.ptr_basetype == uint8) then
			cpy = quote [kp]; [ptr] = [&int8](lib.mem.cpy([ptr], [v].ptr, [v].ct)) end
			if ty.ptr_basetype then
				cpy = quote [cpy]; [box].obj.[k].ct = [v].ct end
			end
			isnull = `[v].ptr == nil
		elseif type(v) == 'table' and v.asvalue and type(v:asvalue()) == 'string' then
			local str = tostring(v:asvalue())
			memreq_const = memreq_const + #str + 1
			isnull = `false
		elseif type(v) == 'table' and #v == 2 then
			local str,sz = v[1],v[2]
			if type(sz) == 'number' then
				memreq_const = memreq_const + sz
			elseif type(sz:asvalue()) == 'number' then
				memreq_const = memreq_const + sz:asvalue()
			else memreq_exp = `[sz] + [memreq_exp] end
			cpy = quote [kp] ;
				--lib.io.fmt('encapsulating string %p → %p [%s] sz %llu\n', str, [ptr], str, sz)
				[ptr] = [&int8](lib.mem.cpy([ptr], str, sz))
				--lib.io.fmt(' :: encapsulated string %p [%s]\n', box.obj.[k],box.obj.[k])
			end
			if ty.ptr_basetype then
				cpy = quote [cpy]; [box].obj.[k].ct = sz end
			end
			isnull = `[str] == nil
		else
			memreq_exp = `(m.sz(v) + 1) + [memreq_exp] -- make room for NUL
			isnull = `v == nil
			if ty.ptr_basetype then
				cpy = quote [cpy]; [box].obj.[k].ct = m.sz(v) end
			end
		end
		copiers[#copiers + 1] = quote
			if [isnull] then [nullify]
			else [cpy] end
		end
	end
	return quote
		var sz: intptr = memreq_const + [memreq_exp]
		var [box] = [&m.box(ty)](lib.mem.heapa_raw(sz))
		var [ptr] = [box].storage
		[copiers]
	in [lib.mem.ptr(ty)] { ct = 1, ptr = &([box].obj) } end
end
-- Scans at most min(maxlen, str.ct) bytes of `str` for the first byte that
-- also occurs in `reject` and returns its index. Returns `maxlen` when the
-- scan finishes without a hit, and 0 when a NUL byte is reached first.
terra m.cspan(str: lib.mem.ptr(int8), reject: lib.mem.ref(int8), maxlen: intptr)
	for pos=0, lib.math.smallest(maxlen,str.ct) do
		var ch = str.ptr[pos]
		if ch == 0 then return 0 end
		for r=0, reject.ct do
			if reject.ptr[r] == ch then return pos end
		end
	end
	return maxlen
end
-- Skips leading spaces, tabs and newlines in a NUL-terminated string and
-- returns a pointer to the first byte that was not skipped. There is no
-- length limit: the scan relies on the terminator to stop.
terra m.ffw_unsafe(str: &int8)
	var cur = str
	while true do
		var ch = @cur
		if ch == 0 then break end
		if not (ch == @' ' or ch == @'\t' or ch == @'\n') then break end
		cur = cur + 1
	end
	return cur
end
-- Finds the first occurrence of `needle` in `haystack`.
-- Returns a pstr covering the haystack from the start of the match to its
-- end, or a null pstr when there is no match (including when the haystack
-- is empty or shorter than the needle).
terra m.find(haystack: pstr, needle: pstr): pstr
	if haystack.ct == 0 or needle.ct > haystack.ct then return pstr.null() end
	-- only start positions where the whole needle still fits are tried, so
	-- the haystack is never read past its end and a match can never
	-- straddle it
	for i=0, haystack.ct - needle.ct + 1 do
		for j=0, needle.ct do
			if haystack(i + j) ~= needle(j) then goto nomatch end
		end
		do return pstr {
			ptr = haystack.ptr + i;
			ct = haystack.ct - i;
		} end
	::nomatch::end
	return pstr.null()
end
-- Splits `str` at every occurrence of `delim` and returns the pieces as a
-- crushed vector of pstr views into the original string (nothing is
-- copied). `expect` is the initial capacity passed to the vector.
terra m.splitmap(str: pstr, delim: pstr, expect: uint16)
	var parts: lib.mem.vec(pstr)
	parts:init(expect)
	var rest = pstr{str.ptr, str.ct}
	while true do
		var hit = m.find(rest, delim)
		if not hit then break end
		-- everything before the delimiter is one piece
		parts:push(pstr {ptr = rest.ptr, ct = rest.ct - hit.ct})
		-- continue scanning just after the delimiter
		rest = pstr {ptr = hit.ptr + delim.ct, ct = hit.ct - delim.ct}
	end
	parts:push(rest)
	return parts:crush()
end
-- Reads one token from the front of `str`.
-- Scanning stops at an unquoted `delim`, at a NUL, at the end of the
-- string, or (when `brkspace` is set) at unquoted whitespace. A backslash
-- makes the next byte literal; double and single quotes group text and are
-- themselves dropped from the token.
-- Returns the token (heap-allocated), the index at which scanning stopped,
-- and whether it stopped on whitespace. An unterminated quote yields a
-- null string, 0 and false.
terra m.toknext(str: m.t, delim: int8, brkspace: bool): {pstr,intptr,bool}
	var b: m.acc b:init(48)
	var mode: int8 = 0 -- 0: unquoted, 1: inside "...", 2: inside '...'
	var esc = false
	var spacebroke = false
	var max = 0
	for i=0, str.ct do
		max = i
		if str(i) == 0 then break
		elseif esc == true then b:push(str.ptr + i,1) esc = false
		elseif str(i) == @'\\' then esc = true
		elseif mode == 0 and str(i) == delim then break
		elseif mode ~= 2 and str(i) == @'"' then
			if mode == 1
				then mode = 0
				else mode = 1
			end
		elseif mode ~= 1 and str(i) == @"'" then
			if mode == 2
				then mode = 0
				else mode = 2
			end
		elseif brkspace and mode == 0 and (
			str(i) == @' ' or str(i) == @'\t' or
			str(i) == @'\r' or str(i) == @'\n') then
			spacebroke = true
			break
		else b:push(str.ptr + i,1) end
	end
	if mode ~= 0 then
		-- the partial token is discarded, so release its heap buffer
		b:free()
		return m.t.null(), 0, false
	end
	return b:finalize(), max, spacebroke
end
-- Examines the front of `start` and decides whether it begins with a vowel
-- that should be dropped.
-- Returns two values: the text to keep (a null pstr when the leading
-- character was dropped, otherwise the single leading byte) and `start`
-- advanced past whatever was consumed.
-- Recognised: ASCII a/e/i/o/u in either case, the two-byte sequences
-- listed below, and three-byte UTF-8 kana with lead byte 0xE3.
local terra disemvowel_codepoint(start: pstr): {pstr, pstr}
	-- TODO rewrite this in a more generative way -- it should be possible
	-- to have a long string listing vowels and generate all the necessary
	-- code based on that
	var dslen: intptr = 0
	var repl = pstr.null()
	var adc = m.cdowncase(start(0))
	if adc == @'a' or adc == @'e'
	or adc == @'i' or adc == @'o'
	or adc == @'u' then
		dslen = 1 goto done
	end
	if start.ct >= 2 then
		var tb = pstr { start.ptr, 2 }
		if tb:cmp('ä') or tb:cmp('ë') or tb:cmp('ï')
		or tb:cmp('Ä') or tb:cmp('Ë') or tb:cmp('Ï')
		or tb:cmp('ö') or tb:cmp('ü') -- haben Sie für diese Vokale
		or tb:cmp('Ö') or tb:cmp('Ü') -- kein Ausweis dabei, mein Herr
		or tb:cmp('á') or tb:cmp('é') or tb:cmp('í')
		or tb:cmp('Á') or tb:cmp('É') or tb:cmp('Í')
		or tb:cmp('ó') or tb:cmp('ú')
		or tb:cmp('Ó') or tb:cmp('Ú')
		or tb:cmp('à') or tb:cmp('è') or tb:cmp('ì')
		or tb:cmp('À') or tb:cmp('È') or tb:cmp('Ì')
		or tb:cmp('ò') or tb:cmp('ù') -- not so fast,
		or tb:cmp('Ò') or tb:cmp('Ù') -- "il capo"
		or tb:cmp('ā') or tb:cmp('ē') or tb:cmp('ī')
		or tb:cmp('Ā') or tb:cmp('Ē') or tb:cmp('Ī')
		or tb:cmp('ō') or tb:cmp('ū') -- take that latin
		or tb:cmp('Ō') or tb:cmp('Ū') -- and also hawaiian
		or tb:cmp('æ') or tb:cmp('Æ') -- sorry elon
		or tb:cmp('œ') or tb:cmp('Œ') -- sacre bleu
		or tb:cmp('ij') or tb:cmp('IJ') -- ok wtf dutch
		then dslen = 2 goto done end
	end
	if start.ct >= 3 then
		var s = [&uint8](start.ptr) -- for safe unicode comparisons, bc char is dumb
		-- eliminate kana. The lead-byte test must guard BOTH ranges; `and`
		-- binds tighter than `or`, so the two range tests are grouped.
		--   E3 81 81 .. E3 82 96  hiragana
		--   E3 82 A1 .. E3 83 B6  katakana
		if s[0] == 0xe3 and (
			((s[1] == 0x81 and s[2] >= 0x81) or
			 (s[1] == 0x82 and s[2] <= 0x96)) or
			((s[1] == 0x82 and s[2] >= 0xa1) or
			 (s[1] == 0x83 and s[2] <= 0xb6)))
		then dslen = 3 goto done end
	end
	-- TODO handle more nonroman scripts
	-- maybe remove consonant pointing in arabic??
	-- i guess remove vowels from devanagari
	-- no idea what to do about chinese
	-- no disemvoweling applied, return the current byte as is
	repl = pstr { ptr = start.ptr, ct = 1 }
	dslen = 1
	::done::
	start:advance(dslen)
	-- NOTE(review): this looks like leftover debug output emitted for every
	-- character processed; confirm whether it should remain
	lib.io.fmt('applied %llu bytes of disemvowelling; adding "%.*s"; continuing with "%.*s"\n', dslen, repl.ct, repl.ptr, start.ct, start.ptr)
	return repl, start
end
-- Returns a copy of `str`, built in `pool`, with the vowels recognised by
-- disemvowel_codepoint removed. A `str.ct` of 0 means "measure with
-- strlen".
terra m.disemvowel(pool: &lib.mem.pool, str: m.t): m.t
	-- resolve the length first so the accumulator is sized from the real
	-- string length rather than from a placeholder 0
	if str.ct == 0 then str.ct = m.sz(str.ptr) end
	var acc: m.acc acc:pool(pool, str.ct)
	var cur = str
	while cur.ct > 0 do
		var add, cont = disemvowel_codepoint(cur)
		if add:ref() then acc:ppush(add) end
		cur = cont
	end
	return acc:finalize()
end
-- Returns `str` wrapped in double quotes, built in `pool`, with embedded
-- double quotes and backslashes backslash-escaped and control bytes
-- (0x00-0x1F) written as \u00XX escapes, as JSON requires.
terra m.qesc(pool: &lib.mem.pool, str: m.t): m.t
	var a: m.acc a:pool(pool, str.ct + str.ct/2)
	a:lpush '"'
	for i=0, str.ct do
		if str(i) == @'"' then a:lpush '\\"'
		elseif str(i) == @'\\' then a:lpush '\\\\'
		-- the bytes are signed: without the lower bound every byte >= 0x80
		-- (i.e. all multi-byte UTF-8) is negative and would be escaped too
		elseif str(i) >= 0 and str(i) < 0x20 then -- for json
			var hex = lib.math.hexbyte(str(i))
			a:lpush('\\u00'):push(&hex[0], 2)
		else a:push(str.ptr + i,1) end
	end
	a:lpush '"'
	return a:finalize()
end
return m