Index: cortav.ct ================================================================== --- cortav.ct +++ cortav.ct @@ -8,10 +8,33 @@ %toc ## cortav vs. markdown the most important difference between cortav and markdown is that cortav is strictly line-oriented. this choice was made to ensure that cortav was relatively easy to parse. so while a simple [`.ct] file may look a bit like a [`.md] file, in reality it's a lot closer to gemtext than any flavor of markdown. +however, the differences go much deeper. the most distinctive feature of cortav is that its syntax is strongly recursive. with markdown, you can apply at most one styling to any given block or span or text. with cortav, you can nest as many styles as you like, and you can style text in places markdown wouldn't ordinarily let you: within headings, inside link text, even in code listings if you absolutely insist (this needs to be turned on by a special directive before the listing in question, however). + +this manual describes cortav exhaustively, but if you just want a quick reference on how markdown translates to cortav, look no further. + +* [*headings]: cortav uses almost the same syntax for headings that markdown does, except it only allows the "ATX style" headings, with one or more hash characters at the start of the line. the only differences from markdown are: +** you can use the unicode section character [`§] instead of [`#] if you're feeling snobby +** you must put a space between the control sequence (the sequence of hashes or section symbols, in this case) and the title text. [`# title] creates a section with the heading text "title", but [`#title] creates a new section with no heading at all; instead, it gives the anonymous section the ID [`title]. and of course, you can combine the two: [`#ttl title] creates a section with the heading text "title" and the ID [`ttl]. what are IDs for? 
we'll get to that in a little bit +* [*paragraphs] are mostly the same as in markdown, except that a paragraph break occurs after every newline character, not every blank line. paragraphs can be indented by however many spaces you like; such indentation will be ignored. (tabs have a special meaning, however). in cortav, you can also explicitly mark a line of text as a paragraph by preceding it with a period character ([`.]), which is useful if you want to start a paragraph with text that would otherwise be interpreted specially. +* [*italic text] -- or rather, [!emphasized] text -- is written as [`\[!my spiffy italic text\]]. in cortav, these spans can be nested within other spans (or titles, or table cells, or…), and the starting and ending points are unambiguous. +* [*bold text] -- or rather, [*strong] text -- is written as [`\[*my commanding bold text\]]. +* [*bold-italic text] -- or rather, [![*emphasized strong text]] -- has no specific notation. rather, you create it by nesting one span within the other, for instance: [`\[*\[!my ostentatious bold-italic text\]\]]. +* [*links] are quite different from their markdown equivalents. cortav does not have inline links, as it is intended to be easily readable in both formatted and plain-text format, and long URLs rather disrupt the flow of reading. rather, a link tag is written with the notation [`\[>nifty-link my nifty link\]], where the word [`nifty-link] immediately following the arrow is an [!identifier] indicating the destination of the link. (instead of a greater-than sign, you can also use the unicode arrow symbol [`→].) if the identifier is the same as one you've assigned to a section, cortav produces a link within the document to that section. otherwise, it will look for a [!reference] to tell it the URI for the link. a reference is a key-value pair created by adding a line like [`nifty-link: https://zombo.com] [!indented by exactly one tab]. 
you can place this reference anywhere you like so long as it's in the same section; if you want to name a reference in another section, you have to prefix it with that section's ID, e.g. [`\[>spiffy-section.nifty-link my nifty link declared in a spiffy section\]]. +* [*lists] use a different syntax from markdown. you can start a line with a [`*] to create an unordered list, or [`:] to create an ordered list; indentation doesn't matter. if you want to nest list items, instead of putting two spaces before the child item, you just add another star or colon. and of course, you can nest lists of different kinds within one another. +* [*horizontal rules] use roughly the same syntax: three or more hyphens on a line of their own ([`\---]). underlines also work ([`___], [`-_-], [`__-__-__] etc). +* some markdown implementations support [*tables]. cortav does too, using a very simple notation. +* [*underlines] are supported by some markdown implementations. in cortav, you can apply them with the notation [`\[_my underlined text\]] -- please just use them sparingly when you render to HTML! +* [*strikethrough] is supported by some extended versions of markdown. cortav uses the notation [`\[~my deleted text\]], with the intended semantics of text that is being removed by some revision of a document. (you can also denote text that is being [!added] by using a plus sign instead of a tilde) +* [*images] are a bit more complicated. see the section on [>rsrc resources] for an explanation. +* [*smart quotes] and [*em dashes] are inserted automatically, just as in markdown, provided you have the [>tsmog transmogrify] extension available. (it is part of the reference implementation and defined by the spec, but not required.) in fact, you can insert longer dashes than em dashes just by increasing the number of hyphens. the reference implementation's transmogrifier also translates ascii arrows like [`\-->] into their unicode equivalents ([`→]). 
+* [*literals] (also known as [*code text]) can be inserted with the [`\[`int main(void);] syntax. note however that literals are not protected from the transmogrifier, and are parsed like any other span, which may cause problems if the source code you're quoting makes use of such forbidden runes. in this case, you'll want to wrap the code span in a raw span. the syntax for this is [`\[`[\\int main(void);\]]], but since this is a bit unwieldy it can also be abbreviated as [`\[`\\int main(void);\]]. + +of course, this is only a small taste of what cortav can do, not even touching on key features like macros, footnotes, or equation formatting. read the sections on [>onblocks blocks] and [>onspans spans] for all the gory details. + ## encoding a cortav document is made up of a sequence of codepoints. UTF-8 must be supported, but other encodings (such as UTF-32 or C6B) may be supported as well. lines will be derived by splitting the codepoints at the linefeed character or equivalent. note that unearthly encodings like C6B or EBCDIC will need to select their own control sequences. ## file type a cortav source file is identified using a file extension, file type, and/or magic byte sequence. @@ -41,11 +64,11 @@ * for C6B+PS files (parastream), the file should begin with the paragraph [`], which equates to the byte sequence [` 0x3E 2E 14 0C 01 04 00 00 00 03 07 3E 2D], including the parastream header). consequently, this sequence should be ignored by a cortav parser at the start of a file (except as an indication of file format). for FreeDesktop-based systems, the [`build/velartrill-cortav.xml] file included in the repository supplies mappings for the extensions and magic byte sequences. a script is also included which can be registered with xdg-open so that double-clicking on a cortav file will render it out and open it in your default web browser. 
[`$ make install] will generate the necessary FreeDesktop XML files and register them, as well as install the script and the [`cortav] executable itself. for more information see [>refimpl-build building the reference implementation]. -## structure +##onblocks structure cortav is based on an HTML-like block model, where a document consists of sections, which are made up of blocks, which may contain a sequence of spans. flows of text are automatically conjoined into spans, and blocks are separated by one or more newlines. this means that, unlike in markdown, a single logical paragraph [*cannot] span multiple ASCII lines. the primary purpose of this was to ensure ease of parsing, but also, both markdown and cortav are supposed to be readable from within a plain text editor. this is the 21st century. every reasonable text editor supports soft word wrap, and if yours doesn't, that's entirely your own damn fault. the first character(s) of every line (the "control sequence") indicates the role of that line. if no control sequence is recognized, the line is treated as a paragraph. the currently supported control sequences are listed below. some control sequences have alternate forms, in order to support modern, readable unicode characters as well as plain ascii text. * [*paragraphs] ([`.] [` ¶] [`❡]): a paragraph is a simple block of text. the period control sequence is only necessary if the paragraph text starts with text that would be interpreted as a control sequence otherwise @@ -54,12 +77,12 @@ ** [`#] is a simple section break. ** [`#anchor] opens a new section with the ID [`anchor]. ** [`# header] opens a new section with the title "header". ** [`#anchor header] opens a new section with both the ID [`anchor] and the title "header". ** [`#>conversation] opens a blockquote section named [`conversation] without a header. -* [*nonprinting sections] ([`^]): sometimes, you'll want to create a namespace without actually adding a visible new section to the document. 
you can achieve this by creating a [!nonprinting section] and defining resources within it. nonprinting sections can also be used to store comments, notes, or other information that is useful to have in the source file without it becoming a part of the output ** [`#&id mime] opens a new inline object [`id] of type [`mime]. useful for embedding SVGs. the ID and mime type must be specified. +* [*nonprinting sections] ([`^]): sometimes, you'll want to create a namespace without actually adding a visible new section to the document. you can achieve this by creating a [!nonprinting section] and defining resources within it. nonprinting sections can also be used to store comments, notes, or other information that is useful to have in the source file without it becoming a part of the output * [*resource] ([`@]): defines a [!resource]. a resource is an file or object that exists outside of the document but which will be included in the document somehow. common examples of resources include images, videos, iframes, or headers/footers. see [>rsrc resources] for more information. * [*lists] ([`*] [`:]): these are like paragraph nodes, but list nodes that occur next to each other will be arranged so as to show they compose a sequence. depth is determined by the number of stars/colons. like headers, a list entry may have an ID that can be used to refer back to it; it is indicated in the same way. if colons are used, this indicates that the order of the items is signifiant. :-lists and *-lists may be intermixed; however, note than only the last character in the sequence actually controls the depth type. * [*directives] ([`%]): a directive issues a hint to the renderer in the form of an arbitrary string. directives are normally ignored if they are not supported, but you may cause a warning to be emitted where the directive is not supported with [`%!] or mark a directive critical with [`%!!] so that rendering will entirely fail if it cannot be parsed. 
* [*comments] ([`%%]): a comment is a line of text that is simply ignored by the renderer. * [*asides] ([`!]): indicates text that diverges from the narrative, and can be skipped without interrupting it. think of it like block-level parentheses. asides which follow one another are merged as paragraphs of the same aside, usually represented as a sort of box. if the first line of an aside contains a colon, the stretch of styled-text from the beginning to the aside to the colon will be treated as a "type heading," e.g. "Warning:" @@ -79,21 +102,21 @@ * [*embed] ([`&]): embeds a referenced object. can be used to show images or repeat previously defined objects like lists or tables, optionally with a caption. ** [`&$[$macro] [$arg1]|[$arg2]|[$argn]…] invokes a block-level macro with the supplied arguments *** [`&$mymacro arg 1|arg 2|arg 3] ** [`&[$image]] embeds an image or other block-level object. [!image] can be a reference with a url or file path, or it can be an embed section (e.g. for SVG files) ***[`&myimg All that remained of the unfortunate blood magic pageant contestants and audience (police photo)] -** [`&-[$section]] embeds a closed disclosure element. in interactive outputs, this will display as a block [!section] which can be clicked on to view the full contents of the referenced section; in static outputs, it will display as an enclosed box with [$section] as the title text +** [`&-[$section] [$styled-text]] embeds a closed disclosure element. in interactive outputs, this will display as a block [!section] which can be clicked on to view the full contents of the referenced section; if [$styled-text] is present, it overrides the title of the section you are embedding. 
in static outputs, the disclosure object will display as an enclosed box with [$styled-text] as the title text *** [`&-ex-a Prosecution Exhibit A (GRAPHIC CONTENT)] -** [`&+[$section]] is like the above, but the disclosure element is open by default +** [`&+[$section] [$styled-text]] is like the above, but the disclosure element is open by default * [*horizontal rule] ([`\---]): inserts a horizontal rule or other context break; does not end the section. must be followed by newline. underlines can also be used in place of dashes. * [*page break] ([`\^^]): for formats that support pagination, like HTML (when printed), indicates that the rest of the current page should be blank. for formats that do not, extra margins will be inserted. does not create a new section -* [*page rule] ([`\^^-]): inserts a page break for formats that support them, and a horizontal rule for formats that do not. does not create a new section +* [*page rule] ([`\^-^]): inserts a page break for formats that support them, and a horizontal rule for formats that do not. does not create a new section * [*table cells] ([`+ |]): see [>ex.tab table examples]. * [*equations] ([`=]) block-level equations can be inserted with the [`=] * [*empty lines] (that is, lines consisting of nothing but whitespace) constitute a [!break], which terminates multiline objects that do not have a dedicated termination sequence, for example lists and asides. -## styled text +##onspans styled text most blocks contain a sequence of spans. these spans are produced by interpreting a stream of [*styled-text] following the control sequence. styled-text is a sequence of codepoints potentially interspersed with escapes. an escape is formed by an open square bracket [`\[] followed by a [*span control sequence], and arguments for that sequence like more styled-text. escapes can be nested. * strong {obj *|styled-text}: causes its text to stand out from the narrative, generally rendered as bold or a brighter color. 
* emphatic {obj !|styled-text}: indicates that its text should be spoken with emphasis, generally rendered as italics * literal {obj `|styled-text}: indicates that its text is a reference to a literal sequence of characters or other discrete token. generally rendered in monospace @@ -102,12 +125,12 @@ * strikeout {obj ~|styled-text}: indicates that its text should be struck through or otherwise indicated for deletion * insertion {obj +|styled-text}: indicates that its text should be indicated as a new addition to the text body. ** consider using a macro definition [`\edit: [~[#1]][+[#2]]] to save typing if you are doing editing work * link \[>[!ref] [!styled-text]\]: produces a hyperlink or cross-reference denoted by [$ref], which may be either a URL specified with a reference or the name of an object like an image or section elsewhere in the document. the unicode characters [`→] and [`🔗] can also be used instead of [`>] to denote a link. * footnote {span ^|ref|[$styled-text]}: annotates the text with a defined footnote. in interactive output media [`\[^citations.qtheo Quantum Theosophy: A Neophyte's Catechism]] will insert a link with the next [`Quantum Theosophy: A Neophyte's Catechism] that, when clicked, causes a footnote to pop up on the screen. for static output media, the text will simply have a superscript integer after it denoting where the footnote is to be found. -* superscript {obj '|[$styled-text]}: -* subscript {obj ,|[$styled-text]}: +* superscript {obj '|[$styled-text]} +* subscript {obj ,|[$styled-text]} * raw \[\\[`raw-text]\]: causes all characters within to be interpreted literally, without expansion. the only special characters are square brackets, which must have a matching closing bracket * raw literal \[$\\[!raw-text]\]: shorthand for [\[$[\…]]] * macro [`\{[!name] [!arguments]\}]: invokes a [>ex.mac macro], specified with a reference * argument {obj #|var}: in macros only, inserts the [$var]-th argument. 
otherwise, inserts a context variable provided by the renderer. * raw argument {obj ##|var}: like above, but does not evaluate [$var]. @@ -121,10 +144,21 @@ * inline comment {obj %%|...}: ignored. useful for editorial annotations not intended to be part of the rendered product. span: [` \[[*[#1]][$[#2]] [#3]\]] obj: [` \[[*[#1]][$[#2]]\]] +##tabs tables +tables are encoded using a very simple notation. any line that begins with a plus [`+] or bar [`|] denotes a table row. each plus or bar separates one column from the other: a plus opens a new header cell, a bar opens a new normal cell. + +the alignment of a cell can be specified by placing colons at one edge or both edges of the given cell. a colon on the left ([`|: my table cell |]) indicates a left-aligned cell, a colon on the right a right-aligned cell ([`| my table cell :|]), and a colon on both sides a centered cell ([`|: my table cell :|]). if you want to use a special character without it being eaten by the table parser, just put a backslash in front of it, e.g. [`| this cell \| contains a pipe \+ a plus sign [!and] ends with a colon \:|]. and of course, table cells are just normal spans -- they can contain any other kind of span formatting you like, such as links, emphasis, or footnotes. + +no other features (like colspans or rowspans) are currently part of the spec but they will be added eventually (if i can figure out a decent way to implement them without creating a huge mess). + +you can finish each row with a bar or plus character, but it's not necessary. only do it if you think it makes the source easier to read. + +* [>ex.tab an example of table notation] + ##ident identifiers any identifier (including a reference) that is defined within a named section must be referred to from outside that section as [`[!sec].[!obj]], where [$sec] is the ID of the containing section and [$obj] is the ID of the object one wishes to reference. 
##rsrc resources a [!resource] represents content that is not encoded directly into the source file, but which is embedded by some means in the output. resources can either be [!embedded], in which case they are compiled into the final document itself, or they can be [!linked], in which case the final document only contains a URI or similar tag referencing the resource. not all render backends support both linking and embedding embedding, nor do all backends support all object types (for instance, [`groff] does not support video embedding.) @@ -323,11 +357,11 @@ * {d quote} transcludes another file, without expanding the text except for paragraphs * {d embed}, where possible, embeds another file as an object within the current one. in HTML this could be accomplished with e.g. an iframe. * {d expand} causes the next object (usually a code block) to be fully expanded when it would otherwise not be * {d font} controls the font stack, for outputs that support changing fonts. see [>fonts fonts] for more information. * {d lang} changes the current language, which is used by extensions to e.g. control typographical conventions, and may be encoded into the output by certain renderers (e.g. HTML). note that quotes and blockquotes can be set to a separate language with a simpler syntax. the language should be notated using IETF language tags -** {d lang is x-ranuir-CR8} sets the current language to Ranuir as spoken in the Central Worlds, written in Corran and encoded using UTF-8. this might be used at the top of a document to set its primary language. +** {d lang is x-ranuir-Cent-CR8} sets the current language to Ranuir as spoken in the Central Worlds, written in Corran and encoded using UTF-8. this might be used at the top of a document to set its primary language. ** {d lang push gsw-u-sd-chzh} temporarily switches to Zürich German, e.g. 
to quote a German passage in an otherwise Ranuir document ** {d lang sec en-US} switches to American English for the duration of a section. does not affect the language stack. ** {d lang pop} drops the current language off the language stack, returning to whatever was pushed or set before it. this would be used, for instance, at the end of a passage * {d pragma} supplies semantic data about author intent, the kind of information the document contains and hints about how it should be displayed to the user. think of them like offhand remarks to the renderer -- there's no guarantee that it'll pay any attention, but if it does, your document will look better. pragmas have no scope; they affect the entire document. the pragma function exists primarily as a means to allow parameters that would normally need to be specified on e.g. the command line to be encoded in the document instead in a way that multiple implementations can understand. a few standard pragmas are defined. ** {d pragma layout} gives a hint on how the document should be layed out. the first hint that is understood will be applied; all others will be discarded. standard hints include: @@ -455,14 +489,14 @@ * [` \[%lua exp [!script]\]]: evaluates [$script] and emits the string it returns (if any) in expanded span context. * [`%lua raw [!script]]: evaluates [$script] and emits the string array it returns (if any) in raw block context. * [`%lua exp [!script]]: evaluates [$script] and emits the string array it returns (if any) in expanded block context. the interpreter should provide a [`cortav] table with the objects: -* ctx: contains context variables +* [`ctx]: contains context variables used files should return a table with the following members -* macros: an array of functions that return strings or arrays of strings when invoked. these will be injected into the global macro namespace. +* [`macros]: an array of functions that return strings or arrays of strings when invoked. 
these will be injected into the global macro namespace. ###ts ts the [*ts] extension allows documents to be marked up for basic classification constraints and automatically redacted. if you are seriously relying on ts for confidentiality, make damn sure you start the file with [$%[*requires] ts], so that rendering will fail with an error if the extension isn't supported. ts enables the directives: @@ -573,14 +607,75 @@ + build | [`build] | the directory where generated objects will be placed; useful for out-of-tree builds + bin-prefix | [`[$$prefix]/bin] | directory to install the executables to" + default-format-flags | [`-m html:width 35em] | a list of flags that will be passed by the viewer script to [`cortav] when generating a html fille the following targets are supplied to automate the build: -* [`install] builds everything, installs the executable and the viewer script to [$$bin_prefix], and registers the viewer script with XDG +* [`install] builds everything, installs the bytecode-executable and the viewer script to [$$bin_prefix], and registers the viewer script with XDG +* [`install-bin] is like [`install] but installs the binary version instead of the bytecode one * [`excise] deletes everything installed and deregisters the file handlers (note that the same variables must be passed to [`exicse] as were passed to [`install]!) * [`clean] deletes build artifacts from the [$$build] directory like it was never there * [`wipe] is equivalent to [`$ make excise && make clean] + +if you don't want to install [`cortav], you can just run [`$ make] without arguments to build the executable. + +there are two different ways of building the driver. one is to generate a bytecode file that can be executed directly as a script. this is the most straightforward method, and requires only [`lua] and [`luac]. 
however, it has several substantial downsides: because it's only a bytecode file, it requires the [`lua] interpreter to run -- and in some environments, the security characteristics of the [`lua] interpreter may make this undesirable. it also must hardcode the path to the lua interpreter (though admittedly this is easy enough to fix if you copy it to another machine of the same architecture). [`lua] is not an entirely predictable environment, as it is controlled by environment variables and may hypothetically do things like load default libraries or alter paths in ways that disrupt the workings of [`cortav]. finally, because the bytecode file is not a binary executable, it cannot directly be given enhanced capabilities on unix-like systems through filesystem metadata -- SUID and caps will be ignored by the kernel. while this is of no importance in ordinary operation, there are niche cases where this could be troublesome. + +a potentially superior alternative is to build [`cortav] as a directly executable binary. when you tell [`make] to build the binary version, it first compiles the driver to raw bytecode, then invokes [`tool/makeshim.lua] to create a C source file embedding that bytecode, which is then piped into a C compiler. the huge downside, of course, is that building the cortav driver in this way requires a C compiler. however, the binary that it produces is easier to distribute to other computers -- you can even statically link in lua so it can run on systems where lua isn't installed. + +to build the binary version, run [`$ make build/cortav.bin]. if you want the build to link lua statically, you'll additionally need to supply lua's library prefix in the variable [`lua-lib-prefix]. 
some example incantations: +* [`$ make build/cortav.bin lua-lib-prefix=/usr/lib] on most Linux distros +* [`$ make build/cortav.bin lua-lib-prefix=/usr/local/lib] on FreeBSD +* [`$ make build/cortav.bin lua-lib-prefix=$(nix path-info nixpkgs.lua5_3)/lib] on NixOS, or on OSX if you're using the Nix package manager + +alternately, you can build lua yourself and link the static library in place without installing it systemwide, which is useful if you want to build a specialized version of lua to link with (or if the sysop doesn't want your grubby luser hands all over his precious filesystem). note that if you're building a self-contained version of [`cortav] to distribute, you may want to slim down the binary by building lua without its parser, as the self-contained version of the driver only needs the bytecode VM part of lua to run. + +#### build variables +there are numerous variables you can use to control the build process. + ++ lua | path to the lua interpreter [`cortav] should be built and run with ++ luac | path to the lua compiler ++ sh | path to a bourne-compatible shell ++ extens | list of paths to extensions to enable, defaults to [`ext/*.lua]. use [`extens\+=[$path]] to add additional extensions from out of tree ++ rendrs | list of paths to renderers to enable, defaults to [`render/*.lua] ++ build | path to the build directory, defaults to [`build]. change this for out-of-tree builds ++ executable | name of the executable to be generated, defaults to [`cortav] ++ default-format-flags | specifies command line options that the viewer script should pass to [`cortav] ++ prefix | where files should be installed, defaults to [`$HOME/.local] ++ bin-prefix | where executables should be installed, defaults to [`$(prefix)/bin] ++ debug | if set, builds executables with debugging symbols; if absent, executables are stripped ++ encoding-data | if set, embeds character class data for supported multibyte encodings into the program. 
on by default; [`$ make encoding-data=] to unset ++ encoding-data-ucs | path to the UnicodeData.txt file for UCS-based encodings like UTF-8. by default it is automatically downloaded with [`curl] ++ encoding-data-ucs-url | where to download UnicodeData.txt from, if encoding-data-ucs is not changed. defaults to the unicode consortium website + +#### deterministic builds +some operating systems, like NixOS, require packages that can be built in reproducible ways. this implies that all data, all [!state] that goes into producing a package needs to be accounted for before the build proper begins. the [`cortav] build process needs to be slightly altered to support such a build process. + +while the cortav specification itself does not concern itself with matters like whether a particular character is a numeral or a letter, optimal typesetting in some cases requires such information. this is the case for the equation span- and block-types, which need to be able to distinguish between literals, variables, and mathematical symbols in [^alas-math the equations they format]. while the ASCII charset is small enough that exhaustive character class information can be manually hardcoded into a cortav implementation, the various encodings of Unicode most certainly are not. + + alas-math: sadly, i was not at any point consulted by any of the generations of mathematicians stretching back into antiquity who devised their notations without any regard for machine-readability. [!for shame!] + +for this reason, the reference implementation of cortav embeds the file [`UnicodeData.txt], a database maintained by the Unicode Consortium. this is a rather large file that updates for each new Unicode version, so it is downloaded as part of the build process. to build on NixOS, you'll need to either disable the features that rely on this database (not recommended), or download the database yourself and tell the build script where to find it. 
this is the approach the official nix expression will take when i can be bothered to write it. see the examples below for how to conduct a deterministic build + +~~~ deterministic build with unicode database [sh] ~~~ +/src $ mkdir cortav && cd cortav +/src/cortav $ fossil clone https://c.hale.su/cortav .fossil && fossil open .fossil +/src/cortav $ curl https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt > /tmp/unicode.txt +/src/cortav $ make build/cortav encoding-data-ucs=/tmp/unicode.txt +~~~ +~~~ [sh] deterministic build [!without] unicode database ~~~ +/src $ mkdir cortav && cd cortav +/src/cortav $ fossil clone https://c.hale.su/cortav .fossil && fossil open .fossil +/src/cortav $ make build/cortav encoding-data= +~~~ + +! while most of the data used is taken directly from UnicodeData.txt, the database generated by [`tools/ucs.lua] splices in some extra character information before generating a database. this is partly because certain characters may not be classified in a useful way and need to be manually overwritten. however, the reference implementation also seeks to provide accurate data for certain character sets that are not part of unicode proper and can be expressed in UTF only through its private use areas. +! currently, only the [>corran Corran] script is supported in this fashion, but i intend to add [>tengwar Tengwar] as well. if there is a con-script or any other informally encoded script you would like supported by the reference implementation, please open an issue. + +[*do note] that no cortav implementation needs to concern itself with character class data. this functionality is provided in the reference implementation strictly as an (optional) extension to the spec to improve usability, not as a normative requirement. + + corran: http://ʞ.cc/fic/spirals/society + tengwar: https://en.wikipedia.org/wiki/Tengwar ###refimpl-switches switches [`cortav.lua] offers various switches to control its behavior. 
+ long + short + function + | [`--out [!file]] :|:[`-o]:| sets the output file (default stdout) | @@ -640,11 +735,11 @@ * [`%[*html] link [$rel] [$mime] [$href]]: inserts a [`] tag in the header, for example, to link in an alternate stylesheet, or help feed readers find your atom or rss feed. ** [`%[*html] link alternate\\ stylesheet text/css /res/style2.css] ** [`%[*html] link alternate application/atom+xml /feed.atom] * [`%[*html] style [$id]]: adds the stylesheet referenced by [$id] into the document stylesheet. the stylesheet is specified using a [>rsrc resource]. -#### stylsheets +#### stylesheets the [`html] backend offers some additional directives for external CSS files that are embedded into the document, in order to simplify integration with the accent mechanism. these are: * [`@[*fg]]: resolves to a color expression denoting the selected foreground color. equivalent to [`[*tone](1)] * [`@[*bg]]: resolves to a color expression denoting the selected background color. equivalent to [`[*tone](0)] * [`@[*tone]\[/[$alpha]\]([$fac] \[[$shift] \[[$saturate]\]\] )]: resolves to a color expression. [$fac] is a floating-point value scaling from the background color to the foreground color. [$shift] is a value in degrees controlling how far the hue will shift relative to the accent. [$saturate] is a floating-point value controlling how satured the color is. @@ -659,11 +754,11 @@ * string [`groff:annotate] controls how footnotes will be handled. ** [`footnote] places footnotes at the end of the page they are referenced on. if the same footnote is used on multiple pages, it will be duplicated on each. ** [`secnote] places footnotes at the end of each section. footnotes used in multiple sections will be duplicated for each ** [`endnote] places all footnotes at the end of the rendered document. -* string [`groff:dev] names an output device (such as [`dvi] or[`pdf]). 
if this mode is present, [`groff] will be automatically invoked +* string [`groff:dev] names an output device (such as [`dvi] or [`pdf]). if this mode is present, [`groff] will be automatically invoked * string [`groff:title-page] takes an identifier that names a section. this section will be treated as the title page for the document. ### directives * [`%[*pragma] title-page [$id]] sets the title page to section [$id]. this causes it to be specially formatted, with a large, centered title and subtitle. @@ -734,11 +829,11 @@ (nodes "footnode1-caption-text")) (text (id . "text4") "has thus far had little to say on the matter, provoking rampant speculation among the faithful.") (footnote-def (id . "footnote1-def") (nodes "footnote1-text") - (text (id . "footnote1-text") "Currently recognized as legitimate successor to Peter of Terra by 2,756 sects, rejected by 678 of mostly Neo-Lutheran origin, and decried as an antipope by 73, most notably Pope Peter II of Centaurus Secundus, leader of the ongoing relativistic crusade against star systems owned by Microsoft.") + (text (id . 
"footnote1-text") "Currently recognized as legitimate successor to Peter of Terra by 2,756 sects, rejected by 678 of mostly Neo-Lutheran origin, and decried as an antipope by 73, most notably Pope Peter II of Centaurum Secundus, leader of the ongoing relativistic crusade against star systems owned by Microsoft.") ;;; snip ;;; (document (nodes "section1" "section2"))) ~~~ Index: ext/toc.lua ================================================================== --- ext/toc.lua +++ ext/toc.lua @@ -1,31 +1,93 @@ local ct = require 'cortav' local ss = require 'sirsem' local css_toc = [[ - + @media screen and (max-width: calc(@[width]:[100vw] * 2)) { + ol.toc { + float: right; + background: @bg; + padding: 0 2em; + margin-right: -4em; + } + } ]] +local css_toc_fixed_lod = [[ + @media (min-width: calc(@[width]:[100vw] * 2)) { + ol.toc { + background: linear-gradient(to right, transparent 25%, @tone(0.1 50)), + @tone/0.4(-0.1 50); + } + ol.toc > li > ol li { + background: linear-gradient(to right, transparent, rgba(0,0,0,0.4)); + } + } +]] local css_toc_fixed = [[ - @media (min-width: calc(@[width]:[100vw] + 20em)) { + @media screen and (min-width: calc(@[width]:[100vw] * 2)) { ol.toc { position: fixed; - padding-top: 1em; padding-bottom: 1em; - padding-right: 1em; - margin-top: 0; margin-bottom: 0; right: 0; top: 0; bottom: 0; max-width: calc(50vw - ((@[width]:[0]) / 2) - 3.5em); overflow-y: auto; + background: @tone/0.4(-0.1 50); + padding: 1em 1em; + padding-right: 0; + border-left: 1px solid @tone(-2 50); + margin: 0; } - @media (max-width: calc(@[width]:[100vw] + 30em)) { + @media (max-width: calc(@[width]:[100vw] * 2.5)) { ol.toc { max-width: calc(100vw - ((@[width]:[0])) - 9.5em); } body { margin-left: 5em; } } + ol.toc li { + padding: 0; + margin-left: 1em; + } + ol.toc a[href] { + display: block; + padding: 0.15em 0; + color: @tone(0.8 50); + background: linear-gradient(to right, transparent, @tone(0.3 50)); + background-position-x: 10em; + background-repeat: 
no-repeat; + transition: 0.25s; + } + ol.toc a[href]:not(:hover) { + text-decoration-color: transparent; + } + @supports not (text-decoration-color: transparent) { + ol.toc a[href]:not(:hover) { + text-decoration: none; + } + } + ol.toc a[href]:hover { + color: @tone(1.3 50); + background-position-x: 0%; + } + ol.toc ol { + font-size: 95%; + width: 100%; + padding-left: 0; + } + ol.toc > li { + list-style: upper-roman; + } + ol.toc > li > a { + font-weight: bold; + } + ol.toc > ol > li { + list-style: decimal; + } + ol.toc > li > ol > li > ol > li { + list-style: enclosed; + } } ]] ct.ext.install { id = 'toc'; @@ -39,10 +101,11 @@ end; render_html_init = function(job, render) render.stylesets.toc = css_toc render.stylesets.tocFixed = css_toc_fixed + render.stylesets.tocFixedLOD = css_toc_fixed_lod end; render_html_ir_assemble = function(job, render, ir) -- the custom position state is part of the document job, -- but rendering is a separate job, so we need to get the @@ -106,13 +169,16 @@ -- "renderer.state" contains the stateglob of the renderer -- itself, not to be confused with the "state" parameter -- which contains this extension's share of the job state -- we use it to activate the stylesets we injected earlier - renderer.state.stylesets_active.toc = true - if renderer.state.opts['width'] then - renderer.state.stylesets_active.tocFixed = true + renderer.state.style_add'toc' + if renderer.state.opts.width then + renderer.state.style_add'tocFixed' + end + if not renderer.state.opts['dark-on-light'] then + renderer.state.style_add'tocFixedLOD' end -- assemble a tree of links from the document section -- structure. 
this is tricky, because we need a tree, but -- all we have is a flat list with depth values attached to Index: makefile ================================================================== --- makefile +++ makefile @@ -46,11 +46,10 @@ executable = cortav default-format-flags = -m html:width 40em prefix = $(HOME)/.local bin-prefix = $(prefix)/bin -share-prefix = $(prefix)/share/$(executable) lua-standalone = $(if $(lua-lib-prefix),$(lua-lib-prefix)/liblua.a,-llua) lua-bindeps = -lm -ldl dbg-flags-luac = $(if $(debug),,-s) @@ -68,16 +67,18 @@ # support so that cortav can do fancy things like format math # equations by character class (e.g. italicizing variables) # this is not necessary for parsing the format, and can be # disabled by blanking the encoding-data list when building # ($ make encoding-data=) -encoding-data = ucstbls -encoding-files = $(patsubst %,$(build)/%.lc,$(encoding-data)) -encoding-data-ucs = https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt +encoding-data = 1 +encoding-data-ucs = $(build)/unicode.txt +encoding-data-ucs-url = https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt + +encoding-files = $(if $(encoding-data),$(build)/ucstbls.lc,) # "standalone" bytecode file that can be run directly as a script -$(build)/$(executable): $(build)/$(executable).ct +$(build)/$(executable): $(build)/$(executable).lc echo '#!$(lua)' > $@ cat $< >>$@ chmod +x $@ # raw bytecode without shebang header, must be run as `lua cortav.lc` @@ -85,13 +86,13 @@ @echo ' » building with extensions $(extens-names)' @echo ' » building with renderers $(rendrs-names)' $(luac) $(dbg-flags-luac) -o $@ $^ # true standalone binary, wraps bytecode file and (optionally) lua -$(build)/$(executable).bin: $(build)/$(executable).lc - $(lua) tools/makeshim.lua $< |\ - $(CC) -o$@ -xc - -xnone $(lua-standalone) $(lua-bindeps) +$(build)/$(executable).bin: $(build)/$(executable).lc tool/makeshim.lua + $(lua) tool/makeshim.lua $< |\ + $(CC) -s -o$@ -xc - -xnone 
$(lua-standalone) $(lua-bindeps) $(build)/cortav.html: cortav.ct $(build)/$(executable) | $(build)/ $(build)/$(executable) $< -o $@ -m render:format html -y html:fossil-uv .PHONY: syncdoc @@ -120,13 +121,13 @@ %/: mkdir -p $@ $(build)/unicode.txt: | $(build)/ - curl $(encoding-data-ucs) > $@ -$(build)/ucstbls.lc: $(build)/unicode.txt tools/ucs.lua | $(build)/ - $(lua) tools/ucs.lua $< | $(luac) -o $@ - + curl $(encoding-data-ucs-url) > $@ +$(build)/ucstbls.lc: $(encoding-data-ucs) tool/ucs.lua | $(build)/ + $(lua) tool/ucs.lua $< | $(luac) -o $@ - .PHONY: install install: $(build)/cortav $(build)/cortav-view.sh $(build)/velartrill-cortav-view.desktop | $(bin-prefix)/ install $(build)/$(executable) $(bin-prefix) install $(build)/cortav-view.sh $(bin-prefix) Index: render/html.lua ================================================================== --- render/html.lua +++ render/html.lua @@ -22,34 +22,73 @@ python = { color = 0xffd277 }; ruby = { color = 0xcdd6ff }; } local stylesets = { + list = [[ + @counter-style enclosed { + system: extends decimal; + prefix: "("; + suffix: ") "; + } + ul, ol { + padding: 0 1em; + } + li { + padding: 0.1em 0; + } + ]]; + list_ordered = [[]]; + list_unordered = [[]]; footnote = [[ div.footnote { - font-family: 90%; - display: none; + font-family: 90%; grid-template-columns: 1em 1fr min-content; grid-template-rows: 1fr min-content; position: fixed; padding: 1em; - background: @tone(0.05); - border: black; + background: @tone(0.03); margin:auto; } - div.footnote:target { display:grid; } @media screen { div.footnote { + display: grid; left: 10em; right: 10em; max-width: calc(@width + 2em); max-height: 30vw; bottom: 1em; + border: 1px solid black; + transform: translateY(200%); + transition: 0.4s; + z-index: 100; + } + div.footnote:target { + transform: translateY(0%); + } + #cover { + position: fixed; + top: 0; + left: 0; + height: 100vh; width: 100vw; + background: linear-gradient(to top, + @tone/0.8(-0.07), + @tone/0.4(-0.07)); + 
opacity: 0%; + transition: 1s; + pointer-events: none; + backdrop-filter: blur(0px); + } + div.footnote:target ~ #cover { + opacity: 100%; + pointer-events: all; + backdrop-filter: blur(5px); } } @media print { div.footnote { + display: grid; position: relative; } div.footnote:first-of-type { border-top: 1px solid black; } @@ -57,17 +96,18 @@ div.footnote > a[href="#0"]{ grid-row: 2/3; grid-column: 3/4; display: block; - padding: 0.2em 0.7em; text-align: center; + padding: 0 0.3em; text-decoration: none; background: @tone(0.2); color: @tone(1); border: 1px solid black; margin-top: 0.6em; + font-size: 150%; -webkit-user-select: none; -ms-user-select: none; user-select: none; -webkit-user-drag: none; user-drag: none; @@ -92,11 +132,14 @@ } div.footnote > div.text { grid-row: 1/2; grid-column: 2/4; padding-left: 1em; - overflow-y: scroll; + overflow-y: auto; + } + div.footnote > div.text > p:first-child { + margin-top: 0; } ]]; header = [[ body { padding: 0 2.5em !important } h1,h2,h3,h4,h5,h6 { border-bottom: 1px solid @tone(0.7); } @@ -207,25 +250,27 @@ } ]]; code = [[ code { display: inline-block; - background: @tone(0.9); - color: @bg; + background: @tone(-1); + color: @tone(0.7); font-family: monospace; font-size: 90%; - padding: 3px 5px; + padding: 2px 5px; + user-select: all; } ]]; var = [[ var { font-style: italic; font-family: monospace; color: @tone(0.7); + font-size: 90%; } code var { - color: @tone(0.25); + color: @tone(0.4); } ]]; math = [[ span.equation { display: inline-block; @@ -240,18 +285,24 @@ ]]; editors_markup = [[]]; block_code_listing = [[ figure.listing { font-family: monospace; - background: @tone(0.05); - color: @fg; + background: @tone(0.05 20); + color: @tone(1 20); padding: 0; margin: 0.3em 0; counter-reset: line-number; position: relative; - border: 1px solid @fg; + border: 1px solid @tone(1 20); } + :not(figure.listing) + figure.listing { + margin-top: 1em; + } + figure.listing + :not(figure.listing) { + margin-top: 1em; + } 
figure.listing>div { white-space: pre-wrap; tab-size: 3; -moz-tab-size: 3; counter-increment: line-number; @@ -261,11 +312,11 @@ figure.listing>:is(div,hr)::before { width: 1.0em; padding: 0.2em 0.4em; text-align: right; display: inline-block; - background-color: @tone(0.2); + background-color: @tone(0.2 20); border-right: 1px solid @fg; content: counter(line-number); margin-right: 0.3em; } figure.listing>hr::before { @@ -272,18 +323,18 @@ color: transparent; padding-top: 0; padding-bottom: 0; } figure.listing>div::before { - color: @fg; + color: @tone(1 20); } figure.listing>div:last-child::before { padding-bottom: 0.5em; } figure.listing>figcaption:first-child { border: none; - border-bottom: 1px solid @fg; + border-bottom: 1px solid @tone(1 20); } figure.listing>figcaption::after { display: block; float: right; font-weight: normal; @@ -294,27 +345,55 @@ figure.listing>figcaption { font-family: sans-serif; font-size: 120%; padding: 0.2em 0.4em; border: none; - color: @tone(2); + color: @tone(2 20); } figure.listing > hr { border: none; margin: 0; height: 0.7em; counter-increment: line-number; } ]]; + root = [[ + body { + font-size: 16pt; + page-break-before: always; + } + h1 { + page-break-before: always; + } + h1,h2,h3,h4,h5,h6 { + page-break-after: avoid; + } + ]]; + } + + local stylesNeeded = { + flags = {}; + order = {}; } + local function addStyle(sty) + -- convenience function, also just in case i end up having + -- to change the goddamn implementation again + if not stylesNeeded.flags[sty] then + stylesNeeded.flags[sty] = true + table.insert(stylesNeeded.order, sty) + return true + end + return false + end - local stylesNeeded = {} + addStyle 'root' local render_state_handle = { doc = doc; opts = opts; style_rules = styles; -- use stylesneeded if at all possible + style_add = addStyle; stylesets = stylesets; stylesets_active = stylesNeeded; obj_htmlid = getSafeID; -- remaining fields added later } @@ -402,49 +481,54 @@ doc.docjob:hook('meddle_span', s) 
return s end local cssRulesFor = {} + function getCSSImageForResource(r) + return '' -- TODO + end + local function getSpanRenderers(procs) local tag, elt, catenate = procs.tag, procs.elt, procs.catenate local span_renderers = {} local plainrdr = getBaseRenderers(tagproc.toTXT, span_renderers) local htmlSpan = getBaseRenderers(procs, span_renderers).htmlSpan function span_renderers.format(sp,...) local tags = { strong = 'strong', emph = 'em', strike = 'del', insert = 'ins', literal = 'code', variable = 'var'} if sp.style == 'literal' and not opts['fossil-uv'] then - stylesNeeded.code = true + addStyle 'code' elseif sp.style == 'strike' or sp.style == 'insert' then - stylesNeeded.editors_markup = true + addStyle 'editors_markup' elseif sp.style == 'variable' then - stylesNeeded.var = true + addStyle 'var' end return tag(tags[sp.style],nil,htmlSpan(sp.spans,...)) end function span_renderers.deref(t,b,s) local r = b.origin:ref(t.ref) local name = t.ref if name:find'%.' then name = name:match '^[^.]*%.(.+)$' end if type(r) == 'string' then - stylesNeeded.abbr = true + addStyle 'abbr' return tag('abbr',{title=r},next(t.spans) and htmlSpan(t.spans,b,s) or name) end if r.kind == 'resource' then local rid = getSafeID(r, 'res-') if r.class == 'image' then if not cssRulesFor[r] then local css = prepcss(string.format([[ section p > .%s { + background: %s; } - ]], rid)) + ]], rid, getCSSImageForResource(r))) stylesets[r] = css cssRulesFor[r] = css - stylesNeeded[r] = true + addStyle(r) end - return tag('div',{class=rid},catenate{'blaah'}) + return tag('div',{class=rid},catenate{''}) elseif r.class == 'video' then local vid = {} return tag('video',nil,vid) elseif r.class == 'font' then b.origin:fail('fonts cannot be instantiated, use %font directive instead') @@ -506,11 +590,11 @@ local mctx = b.origin:clone() mctx.invocation = m return htmlSpan(ct.parse_span(r, mctx),b,s) end function span_renderers.math(m,b,s) - stylesNeeded.math = true + addStyle 'math' local spans = {} 
local function fmt(sp, target) for i,v in ipairs(sp) do if type(v) == 'string' then local x = ct.tool.mathfmt(b.origin, v) @@ -541,11 +625,11 @@ elseif d.failthru then return htmlSpan(d.spans, b, s) end end function span_renderers.footnote(f,b,s) - stylesNeeded.footnote = true + addStyle 'footnote' local source, sid, ssec = b.origin:ref(f.ref) local cnc = getSafeID(ssec) .. ' ' .. sid local fn if footnotes[cnc] then fn = footnotes[cnc] @@ -569,11 +653,11 @@ local block_renderers = { anchor = function(b,s) return tag('a',{id = getSafeID(b)},null()) end; paragraph = function(b,s) - stylesNeeded.paragraph = true; + addStyle 'paragraph' return tag('p', nil, sr.htmlSpan(b.spans, b, s), b) end; directive = function(b,s) -- deal with renderer directives local _, cmd, args = b.words(2) @@ -584,11 +668,11 @@ end end; label = function(b,s) if ct.sec.is(b.captions) then if not (opts['fossil-uv'] or opts.snippet) then - stylesNeeded.header = true + addStyle 'header' end local h = math.min(6,math.max(1,b.captions.depth)) return tag(f('h%u',h), nil, sr.htmlSpan(b.spans, b, s), b) else -- handle other uses of labels here @@ -608,11 +692,11 @@ table.insert(tb, tag('tr',nil,catenate(row))) end return tag('table',nil,catenate(tb)) end; listing = function(b,s) - stylesNeeded.block_code_listing = true + addStyle 'block_code_listing' local nodes = ss.map(function(l) if #l > 0 then return tag('div',nil,sr.htmlSpan(l, b, s)) else return elt('hr') @@ -624,11 +708,11 @@ if b.lang then langsused[b.lang] = true end return tag('figure', {class='listing', lang=b.lang, id=b.id and getSafeID(b)}, catenate(nodes)) end; aside = function(b,s) local bn = {} - stylesNeeded.aside = true + addStyle 'aside' if #b.lines == 1 then bn[1] = sr.htmlSpan(b.lines[1], b, s) else for _,v in pairs(b.lines) do table.insert(bn, tag('p', {}, sr.htmlSpan(v, b, s))) @@ -690,11 +774,11 @@ }, block, sec) end end if rd then if opts['heading-anchors'] and block == sec.heading_node then - stylesNeeded.headingAnchors = true 
+ addStyle 'headingAnchors' table.insert(rd.nodes, ' ') table.insert(rd.nodes, { tag = 'a'; attrs = {href = '#' .. irs.attrs.id, class='anchor'}; nodes = {type(opts['heading-anchors'])=='string' and opts['heading-anchors'] or '§'}; @@ -737,14 +821,17 @@ end renderBlocks(ftir,body) local note = tag('div',{class='footnote',id=fn.id}, { tag('div',{class='number'}, tostring(fn.num)), tag('div',{class='text'}, body.nodes), - tag('a',{href='#0'},'close') + tag('a',{href='#0'},'⤫') }) table.insert(ir, note) end + if next(footnotes) then + table.insert(ir, tagproc.toIR.tag('div',{id='cover'},'')) + end -- restructure passes runhook('ir_restructure_pre', ir) ---- list insertion pass @@ -751,14 +838,15 @@ local lists = {} for _, sec in pairs(ir) do if sec.tag == 'section' then local i = 1 while i <= #sec.nodes do local v = sec.nodes[i] if v.tag == 'li' then + addStyle 'list' local ltag if v.src.ordered - then ltag = 'ol' - else ltag = 'ul' + then ltag = 'ol' addStyle 'list_ordered' + else ltag = 'ul' addStyle 'list_unordered' end local last = i>1 and sec.nodes[i-1] if last and last.embed == 'list' and not ( last.ref[#last.ref].src.depth == v.src.depth and last.ref[#last.ref].src.ordered ~= v.src.ordered @@ -902,15 +990,15 @@ end if opts.accent then table.insert(styles, string.format(':root {--accent:%s}', opts.accent)) end if opts.accent or (not opts['dark-on-light']) and (not opts['fossil-uv']) then - stylesNeeded.accent = true + addStyle 'accent' end - for k in pairs(stylesNeeded) do + for _,k in pairs(stylesNeeded.order) do if not stylesets[k] then ct.exns.unimpl('styleset %s not implemented (!)', k):throw() end table.insert(styles, prepcss(stylesets[k])) end local head = {} ADDED tool/makeshim.lua Index: tool/makeshim.lua ================================================================== --- tool/makeshim.lua +++ tool/makeshim.lua @@ -0,0 +1,86 @@ +-- [ʞ] tools/makeshim.lua +-- ~ lexi hale +-- 🄯 AGPLv3 +-- ? 
this program creates a C source file embedding +-- cortav, for the purposes of standalone deployment +-- without a lua interpreter, or for the purposes of +-- giving cortav extra privileges + +local includes = [[ +#include +#include +#include +#include +extern int luaL_openlibs(lua_State* l); +]] + + +local main = [[ +int main(int argc, char** argv) { + lua_State* l = luaL_newstate(); + luaL_openlibs(l); + + // pass arguments thru to lua + lua_newtable(l); + for(size_t i = 0; i < argc; ++i) { + lua_pushstring(l,argv[i]); + lua_rawseti(l, -2, i); + } + lua_setglobal(l, "arg"); + + // load and run our payload + int e = luaL_loadbufferx(l, ct_bytecode, sizeof(ct_bytecode), "cortav", "b"); + if (e != LUA_OK) { + printf("some kind of error idk fam\n"); + return -1; + } + + lua_call(l, 0, 0); + + // normal termination is by the os.exit() call + return -1; +} +]] + +local function setfile(i, dflt, mode) + if arg[i] and arg[i] ~= '' then + local fn = io.open(arg[i], mode) + if fn then + return fn + end + io.stderr:write('(' .. arg[0]..' fatal) cannot open file '..arg[i]) + end + return dflt +end + +local src = setfile(1, io.stdin, "rb") +local dest = setfile(2, io.stdout, "w") + +local cstr = {} +local strtpl = 'static char ct_bytecode [%u] = {%s};' +local lines = {includes} + +local bytes = {} + +local bn = 1 +local len = 0 +while true do + local byte = src:read(1) + if not byte then break end + local str = tostring(byte:byte(1))..',' + -- make sure our source file is parseable by + -- a compliant C compiler + len = len + string.len(str) + if len >= 4096 then + len = 0 + bytes[bn]='\n' + bn = bn + 1 + end + bytes[bn] = str + bn = bn + 1 +end + +table.insert(lines, strtpl:format(#bytes, table.concat(bytes))) +table.insert(lines, main) + +dest:write(table.concat(lines, '\n')) ADDED tool/ucs.lua Index: tool/ucs.lua ================================================================== --- tool/ucs.lua +++ tool/ucs.lua @@ -0,0 +1,158 @@ +-- [ʞ] tools/ucs.lua +-- ~ lexi hale +-- ? 
table generator for unicode character classes +-- 🄯 AGPLv3 + + +local tpl = [[ +local ss = require 'sirsem' +ss.str.enc.utf8.ranges = {%s} +]] + +local enum = function(syms) + local e = {} + for i,v in pairs(syms) do + e[v] = i + e[i] = v + end + return e +end + +local file = io.stdin +local path +if arg[1] then + path = arg[1] + file = io.open(path, 'rb') +end + +local ss = require'sirsem' +local basictype = ss.str.charclass +local props = ss.str.charprop +local overrides = { + [0x200B] = basictype.space | props.wordsep; -- database entry is wrong +} + +local mask = ~0 -- mask out irrelevant properties to compactify database + +local function parsecat(tbl) + local c,p,b = 0,props,basictype + if overrides[tbl.codepoint] then + c = overrides[tbl.codepoint] + elseif tbl.class == 'Nd' then c = b.numeral + elseif tbl.class == 'No' then c = b.numeral | p.diac + elseif tbl.class == 'Cc' then + if tbl.kind == 'S' + or tbl.kind == 'WS' + or tbl.kind == 'B' then c = b.space | p.wordsep + else c = b.ctl | p.wordbreak | p.disallow end + elseif tbl.class == 'Lu' then c = b.letter | p.upper + elseif tbl.class == 'Ll' then c = b.letter | p.lower + elseif tbl.class == 'Lo' + or tbl.class == 'Lt' then c = b.letter + elseif tbl.class == 'Po' then c = b.punct | p.wordbreak + elseif tbl.class == 'Sm' then c = b.symbol | p.wordsep + elseif tbl.class == 'Ps' then c = b.punct | p.brack | p.left + elseif tbl.class == 'Pe' then c = b.punct | p.brack | p.right + elseif tbl.class == 'Pc' + or tbl.class == 'Pd' + or tbl.class == 'Sk' + or tbl.class == 'Sc' then c = b.symbol + elseif tbl.class == 'Zs' then c = b.space + if tbl.kind == 'WS' then c=c|p.wordsep end + elseif tbl.class == 'So' then c = b.glyph + elseif tbl.class == 'Mn' then c = b.symbol | p.diac | p.superimpose + end + return c & mask +end + +local ranuirAlpha = {0xe39d, 0xe39f, 0xe3ad, 0xe3af, 0xe3b5, 0xe3b7, 0xe3b9, 0xe3bb, 0xe3bd, 0xe3be, 0xe3bf, 0xe3c5, 0xe3c7, 0xe3c9, 0xe3cb, 0xe3cc, 0xe3cd, 0xe3ce, 0xe3cf} +local 
ranuirSpecial = { + [0xe390] = basictype.space | props.wordsep; +} + +local ranuir = {} +for _,v in pairs(ranuirAlpha) do ranuir[v] = basictype.letter end +for k,v in pairs(ranuirSpecial) do ranuir[k] = v end +local ranuirKeys = {} +for k in pairs(ranuir) do table.insert(ranuirKeys, k) end +table.sort(ranuirKeys) + +local recs = {} +local ranuirok = false +for ln in file:lines() do + local v = {} + for s in ln:gmatch('[^;]*') do + table.insert(v, s) + end + v[1] = tonumber(v[1],0x10) +-- if v[1] > 0x7f then -- discard ASCII, we already have that + local code = { + codepoint = v[1]; + name = v[2]; + class = v[3]; + kind = v[5]; + } + code.cat = parsecat(code) + + if (not ranuirok) and code.codepoint > 0xe390 then + for _,ri in pairs(ranuirKeys) do + table.insert(recs, { + codepoint = ri; + cat = ranuir[ri]; + }) + end + ranuirok = true + end + + if code.cat ~= 0 then + table.insert(recs,code) + end +-- end +end + + +local ranges = {} +local last = recs[1] +local start = last +local altern = false +local flush = function(i) + local new = {start.codepoint, last.codepoint, last.cat} + if altern then + new[3] = new[3] | props.upper | props.lower + end + table.insert(ranges, new) + altern = false +end +for i, r in ipairs(recs) do + if r.cat ~= last.cat then + -- we can massively compactify this set with one weird trick: + -- most non-ascii cased character sets are not in AAAAaaaa, + -- but rather AaAaAa order. so we can look for this simple + -- pattern and compress it, shaving c. 
1/3rd off our dataset + local ambi = props.upper | props.lower + if (altern or (start == last and (last.cat & props.upper) ~= 0)) and + ((r.cat &~ ambi) == (last.cat &~ ambi)) then + altern = true + last = r + else + flush() + start = r + end + elseif altern then + flush() + start = r + end + last = r +end +flush() + +-- the data has been collected and formatted in the manner we +-- need; now we just need to emit it as a lua table + +local tab = {} +local top = 1 +for k,v in pairs(ranges) do + tab[top] = string.format('{0x%x,0x%x,%u}',table.unpack(v)) + top = top + 1 +end +io.stdout:write(string.format(tpl, table.concat(tab,',\n'))) DELETED tools/makeshim.lua Index: tools/makeshim.lua ================================================================== --- tools/makeshim.lua +++ tools/makeshim.lua @@ -1,86 +0,0 @@ --- [ʞ] tools/makeshim.lua --- ~ lexi hale --- 🄯 AGPLv3 --- ? this program creates a C source file embedding --- cortav, for the purposes of standalone deployment --- without a lua interpreter, or for the purposes of --- giving cortav extra privileges - -local includes = [[ -#include -#include -#include -#include -extern int luaL_openlibs(lua_State* l); -]] - - -local main = [[ -int main(int argc, char** argv) { - lua_State* l = luaL_newstate(); - luaL_openlibs(l); - - // pass arguments thru to lua - lua_newtable(l); - for(size_t i = argc; i < argc; ++i) { - lua_pushstring(l,argv[i]); - lua_rawseti(l, -2, i); - } - lua_setglobal(l, "arg"); - - // load and run our payload - int e = luaL_loadbufferx(l, ct_bytecode, sizeof(ct_bytecode), "cortav", "b"); - if (e != LUA_OK) { - printf("some kind of error idk fam\n"); - return -1; - } - - lua_call(l, 0, 0); - - // normal termination is by the os.exit() call - return -1; -} -]] - -local function setfile(i, dflt, mode) - if arg[i] and arg[i] ~= '' then - local fn = io.open(arg[i], mode) - if fn then - return fn - end - io.stderr:write('(' .. arg[0]..' 
fatal) cannot open file '..arg[i]) - end - return dflt -end - -local src = setfile(1, io.stdin, "rb") -local dest = setfile(2, io.stdout, "w") - -local cstr = {} -local strtpl = 'static char ct_bytecode [%u] = {%s};' -local lines = {includes} - -local bytes = {} - -local bn = 1 -local len = 0 -while true do - local byte = src:read(1) - if not byte then break end - local str = tostring(byte:byte(1))..',' - -- make sure our source file is parseable by - -- a compliant C compiler - len = len + string.len(str) - if len >= 4096 then - len = 0 - bytes[bn]='\n' - bn = bn + 1 - end - bytes[bn] = str - bn = bn + 1 -end - -table.insert(lines, strtpl:format(#bytes, table.concat(bytes))) -table.insert(lines, main) - -dest:write(table.concat(lines, '\n')) DELETED tools/ucs.lua Index: tools/ucs.lua ================================================================== --- tools/ucs.lua +++ tools/ucs.lua @@ -1,158 +0,0 @@ --- [ʞ] tools/ucs.lua --- ~ lexi hale --- ? table generator for unicode character classes --- 🄯 AGPLv3 - - -local tpl = [[ -local ss = require 'sirsem' -ss.str.enc.utf8.ranges = {%s} -]] - -local enum = function(syms) - local e = {} - for i,v in pairs(syms) do - e[v] = i - e[i] = v - end - return e -end - -local file = io.stdin -local path -if arg[1] then - path = arg[1] - file = io.open(path, 'rb') -end - -local ss = require'sirsem' -local basictype = ss.str.charclass -local props = ss.str.charprop -local overrides = { - [0x200B] = basictype.space | props.wordsep; -- database entry is wrong -} - -local mask = ~0 -- mask out irrelevant properties to compactify database - -local function parsecat(tbl) - local c,p,b = 0,props,basictype - if overrides[tbl.codepoint] then - c = overrides[tbl.codepoint] - elseif tbl.class == 'Nd' then c = b.numeral - elseif tbl.class == 'No' then c = b.numeral | p.diac - elseif tbl.class == 'Cc' then - if tbl.kind == 'S' - or tbl.kind == 'WS' - or tbl.kind == 'B' then c = b.space | p.wordsep - else c = b.ctl | p.wordbreak | p.disallow 
end - elseif tbl.class == 'Lu' then c = b.letter | p.upper - elseif tbl.class == 'Ll' then c = b.letter | p.lower - elseif tbl.class == 'Lo' - or tbl.class == 'Lt' then c = b.letter - elseif tbl.class == 'Po' then c = b.punct | p.wordbreak - elseif tbl.class == 'Sm' then c = b.symbol | p.wordsep - elseif tbl.class == 'Ps' then c = b.punct | p.brack | p.left - elseif tbl.class == 'Pe' then c = b.punct | p.brack | p.right - elseif tbl.class == 'Pc' - or tbl.class == 'Pd' - or tbl.class == 'Sk' - or tbl.class == 'Sc' then c = b.symbol - elseif tbl.class == 'Zs' then c = b.space - if tbl.kind == 'WS' then c=c|p.wordsep end - elseif tbl.class == 'So' then c = b.glyph - elseif tbl.class == 'Mn' then c = b.symbol | p.diac | p.superimpose - end - return c & mask -end - -local ranuirAlpha = {0xe39d, 0xe39f, 0xe3ad, 0xe3af, 0xe3b5, 0xe3b7, 0xe3b9, 0xe3bb, 0xe3bd, 0xe3be, 0xe3bf, 0xe3c5, 0xe3c7, 0xe3c9, 0xe3cb, 0xe3cc, 0xe3cd, 0xe3ce, 0xe3cf} -local ranuirSpecial = { - [0xe390] = basictype.space | props.wordsep; -} - -local ranuir = {} -for _,v in pairs(ranuirAlpha) do ranuir[v] = basictype.letter end -for k,v in pairs(ranuirSpecial) do ranuir[k] = v end -local ranuirKeys = {} -for k in pairs(ranuir) do table.insert(ranuirKeys, k) end -table.sort(ranuirKeys) - -local recs = {} -local ranuirok = false -for ln in file:lines() do - local v = {} - for s in ln:gmatch('[^;]*') do - table.insert(v, s) - end - v[1] = tonumber(v[1],0x10) --- if v[1] > 0x7f then -- discard ASCII, we already have that - local code = { - codepoint = v[1]; - name = v[2]; - class = v[3]; - kind = v[5]; - } - code.cat = parsecat(code) - - if (not ranuirok) and code.codepoint > 0xe390 then - for _,ri in pairs(ranuirKeys) do - table.insert(recs, { - codepoint = ri; - cat = ranuir[ri]; - }) - end - ranuirok = true - end - - if code.cat ~= 0 then - table.insert(recs,code) - end --- end -end - - -local ranges = {} -local last = recs[1] -local start = last -local altern = false -local flush = function(i) - local 
new = {start.codepoint, last.codepoint, last.cat} - if altern then - new[3] = new[3] | props.upper | props.lower - end - table.insert(ranges, new) - altern = false -end -for i, r in ipairs(recs) do - if r.cat ~= last.cat then - -- we can massively compactify this set with one weird trick: - -- most non-ascii cased character sets are not in AAAAaaaa, - -- but rather AaAaAa order. so we can look for this simple - -- pattern and compress it, shaving c. 1/3rd off our dataset - local ambi = props.upper | props.lower - if (altern or (start == last and (last.cat & props.upper) ~= 0)) and - ((r.cat &~ ambi) == (last.cat &~ ambi)) then - altern = true - last = r - else - flush() - start = r - end - elseif altern then - flush() - start = r - end - last = r -end -flush() - --- the data has been collected and formatted in the manner we --- need; now we just need to emit it as a lua table - -local tab = {} -local top = 1 -for k,v in pairs(ranges) do - tab[top] = string.format('{0x%x,0x%x,%u}',table.unpack(v)) - top = top + 1 -end -io.stdout:write(string.format(tpl, table.concat(tab,',\n')))