cortav  Diff

Differences From Artifact [581e1b0127]:

To Artifact [dc1f0ae1fb]:


   212    212   	ascii = { -- encoding table for ASCII: string helpers plus per-range classification data
   213    213   		len = string.len; char = string.char; codepoint = string.byte;
   214    214   		iswhitespace = function(c) -- true only for space, tab and newline
   215    215   			return (c == ' ') or (c == '\t') or (c == '\n')
   216    216         end;
   217    217   		ranges = { -- each row: {first code, last code (inclusive), cc.* class ORed with any cp.* flags}
   218    218   			{0x00,0x1a, cc.ctl};
   219         -			{0x1b,0x1b, cc.ctl, cp.disallow};
          219  +			{0x1b,0x1b, cc.ctl | cp.disallow};
   220    220   			{0x1c,0x1f, cc.ctl};
   221    221   			{0x20,0x20, cc.space};
   222    222   			{0x21,0x22, cc.punct};
   223    223   			{0x23,0x26, cc.symbol};
   224    224   			{0x27,0x29, cc.punct};
   225    225   			{0x2a,0x2b, cc.symbol};
   226    226   			{0x2c,0x2f, cc.punct};
   227         -			{0x30,0x39, cc.numeral, cp.hexnumeral};
          227  +			{0x30,0x39, cc.numeral | cp.hexnumeral};
   228    228   			{0x3a,0x3b, cc.punct};
   229         -			{0x3c,0x3e, cc.symbol, cp.mathop};
          229  +			{0x3c,0x3e, cc.symbol | cp.mathop};
   230    230   			{0x3f,0x3f, cc.punct};
   231    231   			{0x40,0x40, cc.symbol};
   232         -			{0x41,0x46, cc.letter, cp.ucase, cp.hexnumeral};
   233         -			{0x47,0x5a, cc.letter, cp.ucase};
   234         -			{0x5b,0x5d, cc.symbol, cp.mathop};
   235         -			{0x5e,0x5e, cc.symbol, mathop};
          232  +			{0x41,0x46, cc.letter | cp.upper | cp.hexnumeral};
          233  +			{0x47,0x5a, cc.letter | cp.upper};
          234  +			{0x5b,0x5d, cc.symbol | cp.mathop};
          235  +			{0x5e,0x5e, cc.symbol | cp.mathop};
   236    236   			{0x5f,0x60, cc.symbol};
   237         -			{0x61,0x66, cc.letter, cp.lcase, cp.hexnumeral};
   238         -			{0x67,0x7a, cc.letter, cp.lcase};
          237  +			{0x61,0x66, cc.letter | cp.lower | cp.hexnumeral};
          238  +			{0x67,0x7a, cc.letter | cp.lower};
   239    239   			{0x7b,0x7e, cc.symbol};
   240    240   			{0x7f,0x7f, cc.ctl | cp.disallow}; -- flag ORed into slot 3 like the 0x1b row; a 4th slot is never read
   241    241   		}
   242    242   	};
   243    243   	raw = {len = string.len; char = string.char; codepoint = string.byte;
   244    244   		encodeUCS = function(str) return str end;
   245    245   		iswhitespace = function(c)
................................................................................
   250    250   
   251    251   -- unicode ranges are optionally generated from consortium data
   252    252   -- files and injected through a generated source file. if this
   253    253   -- part of the build process is disabled (e.g. due to lack of
   254    254   -- internet access, or to keep the size of the executable as
   255    255   -- small as possible), we still at least can make the ascii
   256    256   -- ranges available to UTF8 (UTF8 being a superset of ascii)
   257         -ss.str.enc.utf8.ranges = ss.delegate(ss.str.enc.ascii.ranges)
          257  +ss.str.enc.utf8.ranges = ss.str.enc.ascii.ranges -- same table object as ascii.ranges, not a copy
   258    258   
   259    259   function ss.str.enc.ascii.encodeUCS(str)
   260    260   	local newstr = ''
   261    261   	for c,p in ss.str.each(ss.str.enc.utf8, str, true) do
   262    262   		if c > 0x7F then
   263    263   			newstr = newstr .. '?'
   264    264   		else
................................................................................
   266    266   		end
   267    267   	end
   268    268   end
   269    269   
   270    270   for _, v in pairs{'utf8','ascii','raw'} do -- give each listed encoding its own parse_escape
   271    271   	ss.str.enc[v].parse_escape = ss.str.enc_generics.pfxescape('\\',ss.str.enc[v]) -- built from a backslash and that encoding's table
   272    272   end
          273  +
          274  +function ss.bitmask_expand(ty, v) -- decode packed integer v using bitmask type table ty
          275  +	local bitrange = ty[true] -- {lowest flag bit index, highest flag bit index}
          276  +	local fb -- bits of v below the flag range; stays nil when the range starts at bit 0
          277  +	if bitrange[1] ~= 0 then
          278  +		fb = v & ((1<<bitrange[1]) - 1) -- first N bits
          279  +	end
          280  +	local tbl = {} -- result set: tbl[key] = true for every flag bit set in v
          281  +	for j=bitrange[1], bitrange[2] do
          282  +		if (v & (1<<j)) ~= 0 then -- test v, not fb: fb has every flag bit masked off and may be nil
          283  +			tbl[ty[1<<j]] = true -- ty maps a single-bit value to its key
          284  +		end
          285  +	end
          286  +	return tbl, fb -- second result is nil when the flag range starts at bit 0
          287  +end
   273    288   
   274    289   function ss.str.classify(enc, ch) -- returns a table of keys set to true for ch under encoding enc
   275    290   	if not enc.ranges then return {} end -- encoding has no range table: empty result
   276    291   	if type(ch)=='string' then ch = enc.codepoint(ch) end -- strings are converted with enc.codepoint
   277         -	-- TODO
          292  +
          293  +	for _, r in pairs(enc.ranges) do -- r = {low, high, packed value}
          294  +		if ch >= r[1] and ch <= r[2] then -- both bounds inclusive
          295  +			local p,b = ss.bitmask_expand(ss.str.charprop, r[3]) -- p: expanded table; b: second result, may be nil
          296  +			if b then p[ss.str.charclass[b]] = true end -- NOTE(review): assumes charclass[b] is never nil -- confirm
          297  +			return p -- stop at the first range containing ch
          298  +		end
          299  +	end
          300  +
          301  +	return {} -- ch lies outside every range
   278    302   end
   279    303   
   280    304   
   281    305   function ss.str.each(enc, str, ascode)
   282    306   	if enc.each then return enc.each(enc,str,ascode) end
   283    307   	local pm = {
   284    308   		__index = {