212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
...
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
...
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
|
ascii = {
	-- ascii is one byte per character, so the string library's own
	-- primitives serve directly as the encoding's accessors
	len = string.len; char = string.char; codepoint = string.byte;
	-- true only for space, tab and newline; no other character
	-- (carriage return included) is treated as whitespace here
	iswhitespace = function(c)
		return (c == ' ') or (c == '\t') or (c == '\n')
	end;
	-- each row is {first codepoint, last codepoint, class, properties...}.
	-- the rows are contiguous and together cover 0x00 through 0x7f.
	-- NOTE(review): cc and cp are defined outside this excerpt;
	-- presumably character classes and character properties -- confirm
	ranges = {
		{0x00,0x1a, cc.ctl};
		{0x1b,0x1b, cc.ctl, cp.disallow};
		{0x1c,0x1f, cc.ctl};
		{0x20,0x20, cc.space};
		{0x21,0x22, cc.punct};
		{0x23,0x26, cc.symbol};
		{0x27,0x29, cc.punct};
		{0x2a,0x2b, cc.symbol};
		{0x2c,0x2f, cc.punct};
		{0x30,0x39, cc.numeral, cp.hexnumeral};
		{0x3a,0x3b, cc.punct};
		{0x3c,0x3e, cc.symbol, cp.mathop};
		{0x3f,0x3f, cc.punct};
		{0x40,0x40, cc.symbol};
		{0x41,0x46, cc.letter, cp.ucase, cp.hexnumeral};
		{0x47,0x5a, cc.letter, cp.ucase};
		{0x5b,0x5d, cc.symbol, cp.mathop};
		-- this row used a bare `mathop`, which is an undefined global
		-- (nil), so the property was silently dropped for '^'
		{0x5e,0x5e, cc.symbol, cp.mathop};
		{0x5f,0x60, cc.symbol};
		{0x61,0x66, cc.letter, cp.lcase, cp.hexnumeral};
		{0x67,0x7a, cc.letter, cp.lcase};
		{0x7b,0x7e, cc.symbol};
		{0x7f,0x7f, cc.ctl, cp.disallow};
	}
};
raw = {len = string.len; char = string.char; codepoint = string.byte;
encodeUCS = function(str) return str end;
iswhitespace = function(c)
................................................................................
-- unicode ranges are optionally generated from consortium data
-- files and injected through a generated source file. if that
-- part of the build process is disabled (e.g. due to lack of
-- internet access, or to keep the size of the executable as
-- small as possible), we can still at least make the ascii
-- ranges available to UTF8 (UTF8 being a superset of ascii).
-- NOTE(review): ss.delegate is defined outside this excerpt;
-- presumably it wraps the ascii table instead of copying it,
-- so generated ranges can be layered on top -- confirm
ss.str.enc.utf8.ranges = ss.delegate(ss.str.enc.ascii.ranges)
function ss.str.enc.ascii.encodeUCS(str)
local newstr = ''
for c,p in ss.str.each(ss.str.enc.utf8, str, true) do
if c > 0x7F then
newstr = newstr .. '?'
else
................................................................................
end
end
end
-- install a parse_escape function on each built-in encoding; it is
-- built by pfxescape from a backslash and the encoding table itself
for _, name in pairs{'utf8','ascii','raw'} do
	local enc = ss.str.enc[name]
	enc.parse_escape = ss.str.enc_generics.pfxescape('\\', enc)
end
-- classifies a single character under the encoding `enc`.
--  enc: encoding table; enc.ranges is optional, and enc.codepoint
--       is called to convert `ch` when it is given as a string
--  ch:  a character string or an already-decoded codepoint
-- always returns a table. the range lookup itself is not written
-- yet, so for now the table is empty whether or not `enc` carries
-- ranges; previously the function fell off the end and returned nil
-- when ranges were present, which crashed callers indexing the result
function ss.str.classify(enc, ch)
	if not enc.ranges then return {} end
	if type(ch)=='string' then ch = enc.codepoint(ch) end
	-- TODO: look `ch` up in enc.ranges and report its class/properties
	return {}
end
function ss.str.each(enc, str, ascode)
if enc.each then return enc.each(enc,str,ascode) end
local pm = {
__index = {
|
|
|
|
|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
<
>
>
>
>
>
>
>
>
>
>
|
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
...
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
...
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
|
ascii = {
	-- ascii is one byte per character, so the string library's own
	-- primitives serve directly as the encoding's accessors
	len = string.len; char = string.char; codepoint = string.byte;
	-- true only for space, tab and newline; no other character
	-- (carriage return included) is treated as whitespace here
	iswhitespace = function(c)
		return (c == ' ') or (c == '\t') or (c == '\n')
	end;
	-- each row is {first codepoint, last codepoint, flags}, where flags
	-- is one cc value OR'ed with any number of cp values. the rows are
	-- contiguous and together cover 0x00 through 0x7f.
	-- ss.str.classify reads only the third element of a row, so every
	-- flag has to be OR'ed into it; a fourth element would be ignored
	ranges = {
		{0x00,0x1a, cc.ctl};
		{0x1b,0x1b, cc.ctl | cp.disallow};
		{0x1c,0x1f, cc.ctl};
		{0x20,0x20, cc.space};
		{0x21,0x22, cc.punct};
		{0x23,0x26, cc.symbol};
		{0x27,0x29, cc.punct};
		{0x2a,0x2b, cc.symbol};
		{0x2c,0x2f, cc.punct};
		{0x30,0x39, cc.numeral | cp.hexnumeral};
		{0x3a,0x3b, cc.punct};
		{0x3c,0x3e, cc.symbol | cp.mathop};
		{0x3f,0x3f, cc.punct};
		{0x40,0x40, cc.symbol};
		{0x41,0x46, cc.letter | cp.upper | cp.hexnumeral};
		{0x47,0x5a, cc.letter | cp.upper};
		{0x5b,0x5d, cc.symbol | cp.mathop};
		{0x5e,0x5e, cc.symbol | cp.mathop};
		{0x5f,0x60, cc.symbol};
		{0x61,0x66, cc.letter | cp.lower | cp.hexnumeral};
		{0x67,0x7a, cc.letter | cp.lower};
		{0x7b,0x7e, cc.symbol};
		-- this row still used the comma-separated form, which left
		-- cp.disallow in a fourth slot where it was never read
		{0x7f,0x7f, cc.ctl | cp.disallow};
	}
};
raw = {len = string.len; char = string.char; codepoint = string.byte;
encodeUCS = function(str) return str end;
iswhitespace = function(c)
................................................................................
-- unicode ranges are optionally generated from consortium data
-- files and injected through a generated source file. if that
-- part of the build process is disabled (e.g. due to lack of
-- internet access, or to keep the size of the executable as
-- small as possible), we can still at least make the ascii
-- ranges available to UTF8 (UTF8 being a superset of ascii).
-- note that this is an alias, not a copy: both encodings refer to
-- the same table, so a change made through one shows up in the other
ss.str.enc.utf8.ranges = ss.str.enc.ascii.ranges
function ss.str.enc.ascii.encodeUCS(str)
local newstr = ''
for c,p in ss.str.each(ss.str.enc.utf8, str, true) do
if c > 0x7F then
newstr = newstr .. '?'
else
................................................................................
end
end
end
-- install a parse_escape function on each built-in encoding; it is
-- built by pfxescape from a backslash and the encoding table itself
for _, name in pairs{'utf8','ascii','raw'} do
	local enc = ss.str.enc[name]
	enc.parse_escape = ss.str.enc_generics.pfxescape('\\', enc)
end
-- expands the packed integer `v` into a set of flag keys.
--  ty: descriptor table. ty[true] is {lowest flag bit, highest flag
--      bit} (bit indices, inclusive) and ty[1<<j] is the key to
--      report when flag bit j is set
--  v:  the integer to decode
-- returns two values: a set ({[key] = true, ...}) holding the key of
-- every flag bit present in v, and the value stored in the bits below
-- the flag range (nil when the range starts at bit 0, since there are
-- no such bits). flag bits that have no key in `ty` are skipped.
function ss.bitmask_expand(ty, v)
	local bitrange = ty[true]
	local lo, hi = bitrange[1], bitrange[2]
	local fb
	if lo ~= 0 then
		fb = v & ((1<<lo) - 1) -- first N bits
	end
	local tbl = {}
	for j = lo, hi do
		local bit = 1<<j
		-- test v itself rather than fb: fb keeps only the bits *below*
		-- the flag range, so it can never have bit j set, and it is nil
		-- when lo is 0, which made the old comparison raise an error
		if (v & bit) ~= 0 then
			local key = ty[bit]
			-- a nil key cannot index a table; ignore unnamed bits
			if key ~= nil then tbl[key] = true end
		end
	end
	return tbl, fb
end
-- reports the class and properties of a single character.
--  enc: encoding table; enc.ranges (optional) is a list of rows
--       {first codepoint, last codepoint, flags}, and enc.codepoint
--       is called to convert `ch` when it is given as a string
--  ch:  a character string or an already-decoded codepoint
-- returns a set ({[key] = true, ...}); it is empty when the encoding
-- has no ranges or when no row contains the codepoint
function ss.str.classify(enc, ch)
	local ranges = enc.ranges
	if not ranges then
		return {}
	end
	if type(ch) == 'string' then
		ch = enc.codepoint(ch)
	end
	for _, row in pairs(ranges) do
		local first, last = row[1], row[2]
		if first <= ch and ch <= last then
			-- the flags in row[3] expand into a set of property keys;
			-- the second result is whatever sits below the flag bits
			local props, class = ss.bitmask_expand(ss.str.charprop, row[3])
			if class then
				props[ss.str.charclass[class]] = true
			end
			return props
		end
	end
	return {}
end
function ss.str.each(enc, str, ascode)
if enc.each then return enc.each(enc,str,ascode) end
local pm = {
__index = {
|