Differences From
Artifact [581e1b0127]:
ascii = {
	len = string.len; char = string.char; codepoint = string.byte;
	-- true only for space, tab and newline
	iswhitespace = function(c)
		return (c == ' ') or (c == '\t') or (c == '\n')
	end;
	-- each entry is {first codepoint, last codepoint, mask}, where mask
	-- is one cc.* class value OR'd with zero or more cp.* property
	-- flags. properties must be folded into the third field:
	-- ss.str.classify only reads r[3], so a property passed as a
	-- separate fourth element is silently ignored.
	ranges = {
		{0x00,0x1a, cc.ctl};
		{0x1b,0x1b, cc.ctl | cp.disallow};
		{0x1c,0x1f, cc.ctl};
		{0x20,0x20, cc.space};
		{0x21,0x22, cc.punct};
		{0x23,0x26, cc.symbol};
		{0x27,0x29, cc.punct};
		{0x2a,0x2b, cc.symbol};
		{0x2c,0x2f, cc.punct};
		{0x30,0x39, cc.numeral | cp.hexnumeral};
		{0x3a,0x3b, cc.punct};
		{0x3c,0x3e, cc.symbol | cp.mathop};
		{0x3f,0x3f, cc.punct};
		{0x40,0x40, cc.symbol};
		{0x41,0x46, cc.letter | cp.upper | cp.hexnumeral};
		{0x47,0x5a, cc.letter | cp.upper};
		{0x5b,0x5d, cc.symbol | cp.mathop};
		{0x5e,0x5e, cc.symbol | cp.mathop};
		{0x5f,0x60, cc.symbol};
		{0x61,0x66, cc.letter | cp.lower | cp.hexnumeral};
		{0x67,0x7a, cc.letter | cp.lower};
		{0x7b,0x7e, cc.symbol};
		{0x7f,0x7f, cc.ctl | cp.disallow};
	}
};
243 243 raw = {len = string.len; char = string.char; codepoint = string.byte;
244 244 encodeUCS = function(str) return str end;
245 245 iswhitespace = function(c)
................................................................................
250 250
251 251 -- unicode ranges are optionally generated from consortium data
252 252 -- files and injected through a generated source file. if this
253 253 -- part of the build process is disabled (e.g. due to lack of
254 254 -- internet access, or to keep the size of the executable as
255 255 -- small as possible), we can still at least make the ascii
256 256 -- ranges available to UTF8 (UTF8 being a superset of ascii)
257 -ss.str.enc.utf8.ranges = ss.delegate(ss.str.enc.ascii.ranges)
257 +ss.str.enc.utf8.ranges = ss.str.enc.ascii.ranges
258 258
259 259 function ss.str.enc.ascii.encodeUCS(str)
260 260 local newstr = ''
261 261 for c,p in ss.str.each(ss.str.enc.utf8, str, true) do
262 262 if c > 0x7F then
263 263 newstr = newstr .. '?'
264 264 else
................................................................................
266 266 end
267 267 end
268 268 end
269 269
-- attach a parse_escape function to each of the utf8, ascii and raw
-- encodings; every one is built by pfxescape from '\\' and the
-- encoding table it is being attached to
for _, name in pairs{'utf8','ascii','raw'} do
	local enc = ss.str.enc[name]
	enc.parse_escape = ss.str.enc_generics.pfxescape('\\', enc)
end
273 +
-- split a packed integer into the set of flag names it carries and
-- the value stored in its low-order bits.
--
--   ty  descriptor table. ty[true] is a pair {first, last} giving the
--       inclusive range of bit indices that act as flags; ty[1<<j]
--       is the key recorded in the result when bit j is set.
--   v   the packed integer to decode.
--
-- returns two values:
--   1. a table mapping each set, named flag to true
--   2. the bits of v below `first`, or nil when first == 0 (i.e. when
--      there are no low-order bits to extract)
function ss.bitmask_expand(ty, v)
	local bitrange = ty[true]
	local first, last = bitrange[1], bitrange[2]

	local fb
	if first ~= 0 then
		fb = v & ((1 << first) - 1) -- first N bits
	end

	local tbl = {}
	for j = first, last do
		local bit = 1 << j
		-- the flags live in v itself; fb has had every bit at or above
		-- `first` masked away (and is nil when first == 0), so it can
		-- never be the operand here
		if (v & bit) ~= 0 then
			local name = ty[bit]
			-- a set bit with no name in ty is skipped rather than
			-- raising "index is nil"
			if name ~= nil then tbl[name] = true end
		end
	end
	return tbl, fb
end
273 288
-- look up the class and properties of a single character.
--
--   enc  encoding table; must provide `codepoint` when ch is passed
--        as a string. enc.ranges, if present, is a list of entries
--        {first, last, mask} covering inclusive codepoint ranges.
--   ch   a codepoint number, or a string that enc.codepoint converts
--        to one.
--
-- returns a table whose keys are the names that apply to ch (the
-- property names produced by ss.bitmask_expand against
-- ss.str.charprop, plus the ss.str.charclass name of the remaining
-- low bits), each mapped to true. the table is empty when the
-- encoding has no ranges, when ch yields no codepoint (e.g. an empty
-- string), or when no range contains ch.
function ss.str.classify(enc, ch)
	if not enc.ranges then return {} end
	if type(ch)=='string' then ch = enc.codepoint(ch) end
	-- codepoint() of an empty string yields nothing; comparing nil
	-- against the range bounds below would raise
	if ch == nil then return {} end

	for _, r in pairs(enc.ranges) do
		if ch >= r[1] and ch <= r[2] then
			local props, class = ss.bitmask_expand(ss.str.charprop, r[3])
			if class then
				local classname = ss.str.charclass[class]
				-- ignore a class value with no registered name
				-- instead of indexing the result with nil
				if classname ~= nil then props[classname] = true end
			end
			return props
		end
	end

	return {}
end
279 303
280 304
281 305 function ss.str.each(enc, str, ascode)
282 306 if enc.each then return enc.each(enc,str,ascode) end
283 307 local pm = {
284 308 __index = {