Index: cli.lua ================================================================== --- cli.lua +++ cli.lua @@ -32,11 +32,16 @@ if not mode['render:format'] then error 'what output format should i translate the input to?' end if mode['render:format'] == 'none' then return 0 end if not ct.render[mode['render:format']] then - ct.exns.unimpl('output format “%s” unsupported', mode['render:format']):throw() + if (not ct.render.html) and not _G.native then + -- we may be running uncompiled; otherwise something is seriously broken + require('render.' .. mode['render:format']) + else + ct.exns.unimpl('output format “%s” unsupported', mode['render:format']):throw() + end end local render_opts = ss.kmap(function(k,v) return k:sub(2+#mode['render:format']) end, ss.kfilter(mode, function(m) Index: cortav.ct ================================================================== --- cortav.ct +++ cortav.ct @@ -1,8 +1,13 @@ +%% this is the reference specification that i used to initially cobble together my +%% spec for the language i was going to implement, and which i then expanded +%% as i added features to the reference implementation. it's a mess and it +%% urgently needs to be rewritten into a more accessible and navigable +%% document for new users. TODO + # cortav specification -[*cortav] is a markup language designed to be a simpler, but more capable alternative to markdown. its name derives from the [>dict Ranuir words] [!cor] "writing" and [!tav] "document", translating to something like "(plain) text document". - +[*cortav] is a markup language designed to be a simpler, [!well-specified], and more capable alternative to markdown. its name derives from the [>dict Ranuir words] [!cor] "writing" and [!tav] "document", translating to something like "(plain) text document". dict: http://ʞ.cc/fic/spirals/glossary the cortav [!format] can be called [!cortavgil], or [!gil cortavi], to differentiate it from the reference implementation [!cortavsir] or [!sir cortavi]. 
%toc @@ -72,11 +77,11 @@ * level 3: [*publishing]. implements all currently standardized core behavior, including zero or more extensions. * level 4: [*reference]. implements all currently standardized behavior, including [!all] standardized extensions. ! note that which translators are implemented is not specified by level, as this is, naturally, implementation-dependent. (it would make rather little sense for the blurb parser of a cortav-enabled blog engine to support generating PDFs, after all.) level encodes only which features of the cortav [!language] are supported. -##onblocks structure +##onblocks structure (block elements) cortav is based on an HTML-like block model, where a document consists of sections, which are made up of blocks, which may contain a sequence of spans. flows of text are automatically conjoined into spans, and blocks are separated by one or more newlines. this means that, unlike in markdown, a single logical paragraph [*cannot] span multiple ASCII lines. the primary purpose of this was to ensure ease of parsing, but also, both markdown and cortav are supposed to be readable from within a plain text editor. this is the 21st century. every reasonable text editor supports soft word wrap, and if yours doesn't, that's entirely your own damn fault. hard-wrapping lines is incredibly user-hostile, especially to users on mobile devices with small screens. cortav does not allow it. the first character(s) of every line (the "control sequence") indicates the role of that line. if no control sequence is recognized, the line is treated as a paragraph. the currently supported control sequences are listed below. some control sequences have alternate forms, in order to support modern, readable unicode characters as well as plain ascii text. * [*paragraphs] ([`.] [` ¶] [`❡]): a paragraph is a simple block of text. 
the period control sequence is only necessary if the paragraph text starts with text that would be interpreted as a control sequence otherwise @@ -84,12 +89,12 @@ * [*section starts] [`#] [`§]: starts a new section. all sections have an associated depth, determined by the number of sequence repetitions (e.g. "###" indicates depth three). sections may have headers and IDs; both are optional. IDs, if present, are a sequence of raw-text immediately following the hash marks. if the line has one or more space character followed by styled-text, a header will be attached. the character immediately following the hashes can specify a particular type of section. e.g.: ** [`#] is a simple section break. ** [`#anchor] opens a new section with the ID [`anchor]. ** [`# header] opens a new section with the title "header". ** [`#anchor header] opens a new section with both the ID [`anchor] and the title "header". -* [*nonprinting sections] ([`^]): sometimes, you'll want to create a namespace without actually adding a visible new section to the document. you can achieve this by creating a [!nonprinting section] and defining resources within it. nonprinting sections can also be used to store comments, notes, to-dos, or other meta-information that is useful to have in the source file without it becoming a part of the output. nonprinting sections can be used for a sort of "literate markup," where resource and reference definitions can intermingle with human-readable narrative about those definitions. -* [*resource] ([`@]): defines a [!resource]. a resource is a file or object that is to be embedded in the document somehow. common examples of resources include images, videos, iframes, or headers/footers. resources can be defined inline, or reference external objects. see [>rsrc resources] for more information. +* [*nonprinting sections] ([`^]): sometimes, you'll want to create a namespace without actually adding a visible new section to the document. 
you can achieve this by creating a [!nonprinting section] and defining resources within it. nonprinting sections can also be used to store comments, notes, to-dos, or other meta-information that is useful to have in the source file without it becoming a part of the output. nonprinting sections can be used for a sort of "literate markup," where resource and reference definitions can intermingle with human-readable narrative about those definitions. note that unlike comments, nonprinting sections are still parsed and can still affect other sections by means of definitions and pragmata. +* [*resource] ([`@]): defines a [!resource]. a resource is a file or object that is to be embedded in the document somehow. common examples of resources include images, videos, iframes, or headers/footers. resources can be defined inline, or reference external objects that are read in either at compile-time or view-time. see [>rsrc resources] for more information. * [*lists] ([`*] [`:]): these are like paragraph nodes, but list nodes that occur next to each other will be arranged so as to show they compose a sequence. depth is determined by the number of stars/colons. like headers, a list entry may have an ID that can be used to refer back to it; it is indicated in the same way. if colons are used, this indicates that the order of the items is signifiant. [`:]-lists and [`*]-lists may be intermixed; however, note than only the last character in the sequence actually controls the type. a blank line terminates the current list. * [*directives] ([`%]): a directive issues a hint to the renderer in the form of an arbitrary string. directives are normally ignored if they are not supported, but you may cause a warning to be emitted where the directive is not supported with [`%!] or mark a directive critical with [`%!!] so that rendering will entirely fail if it cannot be obeyed. * [*comments] ([`%%]): a comment is a line of text that is simply ignored by the renderer. 
* [*asides] ([`!]): indicates text that diverges from the narrative, and can be skipped without interrupting it. think of it like block-level parentheses. asides which follow one another are merged as paragraphs of the same aside, usually represented as a sort of box. if the first line of an aside contains a colon, the stretch of styled-text from the beginning to the aside to the colon will be treated as a "type heading," e.g. "Warning:" * [*code] ([`~~~]): a line beginning with ~~~ begins or terminates a block of code. code blocks are by default not parsed, but parsing can be activated by preceding the code block with an [`%[*expand]] directive. the opening line should look like one of the below @@ -105,12 +110,12 @@ ** a [*reference] is a general mechanism for out-of-line metadata, and references are used in many different ways -- e.g. to specify link destinations, footnote contents, abbreviations, or macro bodies. to ensure that a definition is interpreted as a reference, rather than as metadata for an object, precede it with a blank line. def-tab-enc: in encodings without tab characters, a definition is opened by a line beginning with two blanks, and continued by a line beginning with four blanks. def-ex: [*open a new reference]: [`[!\\t][$key]: [$value]] [*continue a reference]: [`[!\\t\\t][$value]] * [*quotation] ([`<]): a line of the form [`<[$name]> [$quote]] denotes an utterance by [$name]. -* [*blockquote] ([`>]): alternate blockquote syntax. can be nested by repeating the [`>] character. -* [*subtitle/caption] (["--]): attaches a subtitle to the previous header, or caption to the previous object +* [*blockquote] ([`>[$id] [$body]]): "inline" blockquote syntax. can be nested by repeating the [`>] character. the [$id] is optional, but the [`>] character must be immediately followed by whitespace if the block is not to have an ID. +* [*subtitle/caption] (["--]): attaches a subtitle to the previous header, or caption to the previous object. 
after a blockquote, attaches an attribution line * [*embed] ([`&]): embeds a referenced object. can be used to show images or repeat previously defined objects like lists or tables, optionally with a caption. an embed line can be followed immediately by a sequence of [*definitions] in the same way that resource definitions can, to override resource properties on a per-instance basis. note that only presentation-related properties like [$desc] can be meaningful overridden, as embed does not trigger a re-render of the parse tree; if you want to override e.g. context variables, use a multiline macro invocation instead. ** [`&[$image]] embeds an image or other block-level object. [!image] can be a reference with a url or file path, or it can be an embed section (e.g. for SVG files) ***[`&myimg All that remained of the unfortunate blood magic pageant contestants and audience (police photo)] ** [`&-[$ident] [$styled-text]] embeds a closed disclosure element containing the text of the named object (a nonprinting section or cortav resource should usually be used to store the content; it can also name an image or video, of course). in interactive outputs, this will display as a block which can be clicked on to view the full contents of the referenced object [$ident]; if [$styled-text] is present, it overrides the title of the section you are embedding (if any). in static outputs, the disclosure object will display as an enclosed box with [$styled-text] as the title text *** [`&-ex-a Prosecution Exhibit A (GRAPHIC CONTENT)] @@ -124,11 +129,11 @@ * [*cross-references] ([`=>] [`⇒]): inserts a block-level link. has two forms for the sake of gemtext compatibility. [$styled-text] is a descriptive text of the destination. especially useful for menus and gemtext output. 
** the cortav syntax is [`=>[$ident] [$styled-text]], where [$ident] is an identifier; links to the same destination as [`\[>[$ident] [$styled-text]\]] would ** the compatibility syntax is [`=> [$uri] [$styled-text]] (note the space before [$uri]!). instead of taking an identifier for an object in the document, it directly accepts a URI. note that this is not formally equivalent to gemtext's link syntax, which also allows paths in place of URIs; [`cortav] does not. the gemtext line ["=> /somewhere] would need to be expressed as ["=> file:/somewhere], and ["=> /somewhere?key=val] as ["http:/somewhere?key=val] (or ["gemini:/somewhere?key=val], if the result is to be served over a gemini server). * [*empty lines] (that is, lines consisting of nothing but whitespace) constitute a [!break], which terminates multiline objects that do not have a dedicated termination sequence, for example lists and asides. -##onspans styled text +##onspans styled text (span elements) most blocks contain a sequence of spans. these spans are produced by interpreting a stream of [*styled-text] following the control sequence. styled-text is a sequence of codepoints potentially interspersed with escapes. an escape is formed by an open square bracket [`\[] followed by a [*span control sequence], and arguments for that sequence like more styled-text. escapes can be nested. * strong {obj *|styled-text}: causes its text to stand out from the narrative, generally rendered as bold or a brighter color. * emphatic {obj !|styled-text}: indicates that its text should be spoken with emphasis, generally rendered as italics * custom style {span .|id|[$styled-text]}: applies a specially defined font style. for example, if you have defined [`caution] to mean "demibold italic underline", cortav will try to apply the proper weight and styling within the constraints of the current font to the span [$styled-text]. see the [>fonts-sty fonts section] for more information about this mechanism. 
@@ -147,14 +152,15 @@ * macro [` \{[$name] [$arguments]}]: invokes a [>ex.mac macro] inline, specified with a reference. if the result of macro expansion contains newlines, they will be treated as line breaks, rather than paragraph breaks as they would be in a multiline context. * argument {obj #|var}: in macros only, inserts the [$var]-th argument. otherwise, inserts a context variable provided by the renderer. * raw argument {obj ##|var}: like above, but does not evaluate [$var]. * term {obj &|name}, {span &|name|[$expansion]}: quotes a defined term with a link to its definition, optionally with a custom expansion of the term (for instance, to expand the first use of an acronym) * inline image {obj &@|name}: shows a small image or other object inline. the unicode character [`🖼] can also be used instead of [`&@]. -* unicode codepoint {obj U+|hex-integer}: inserts an arbitrary UCS codepoint in the output, specified by [$hex-integer]. lowercase [`u] is also legal. +* unicode codepoint {obj U|hex-integer}: inserts an arbitrary UCS codepoint in the output, specified by [$hex-integer]. lowercase [`u] is also legal, as are [`U+] and [`u+]. * math mode {obj =|equation}: activates additional transformations on the span to format it as a mathematical equation; e.g. [`*] becomes [`×] and [`/] --> [`÷]. * extension {span %|ext|…}: invokes extension named in [$ext]. [$ext] will usually be an extension name followed by a symbol (often a period) and then an extension-specific directive, although for some simple extensions it may just be the plain extension name. further syntax and semantics depend on the extension. this syntax can also be used to apply formatting specific to certain renderers, such as assigning a CSS class in the [`html] renderer (["[%html.myclass my [!styled] text]]). 
-* critical extension {span %!|ext|…}: like [!extension], but will trigger an error if the requested extension is not available +* important extension {span %!|ext|…}: like [!extension], but will issue a warning if the requested extension is not available +* critical extension {span %!!|ext|…}: like [!important extension], but will trigger an error and abort compilation if the requested extension is not available * extension text {span %:|ext|[$styled-text]}: like [!extension], but when the requested extension is not present, [$styled-text] wlil be emitted as-is. this is a better way to apply CSS classes, as the text will still be visible when rendered to formats other than HTML. * inline comment {obj %%|...}: ignored. useful for editorial annotations not intended to be part of the rendered product. span: [` \[[*[#1]][$[#2]] [#3]\]] obj: [` \[[*[#1]][$[#2]]\]] @@ -583,25 +589,23 @@ ** {d pragma accent-spread} is a factor that controls the "spread" of hues used in the document. if 0, only the accent color will be used; if larger, other hues will be used in addition to the primary accent color. ** {d pragma dark-on-light on\|off} controls whether the color scheme used should be light-on-dark or dark-on-light ** {d pragma page-width} indicates how wide the pages should be ** {d pragma title-page} specifies a section to use as a title page, for renderer backends that support pagination -! note on pragmata: particularly when working with collections of documents, you should not keep formatting metadata in the documents themselves! the best thing to do is to have a makefile for compiling the documents using whatever tools you want to support, and encoding the rendering options in this file (for the reference implementation this currently means as command line arguments, but eventually it will support intent files as well) so they can all be changed in one place; pragmas should instead be used for per-document [*overrides] of default settings. +! 
note on pragmata: particularly when working with collections of documents, you should not keep shared formatting metadata duplicated across the documents themselves! the best thing to do is to have a makefile for compiling the documents using whatever tools you want to support, and encoding the rendering options in this file (for the reference implementation this currently means as command line arguments, but eventually it will support intent files as well) so they can all be changed in one place; pragmas should instead be used for per-document [*overrides] of default settings. ! a workaround for the lack of intent files in the reference implementation is to have a single pseudo-stylesheet that contains only {d pragma} statements, and then import this file from each individual source file using the {d include} directive. this is suboptimal and recommended only when you need to ensure compatibility between different implementations. ! when creating HTML files, an even better alternative may be to turn off style generation entirely and link in an external, hand-written CSS stylesheet. this is generally the way you should compile sources for existing websites if you aren't going to write your own extension. ##ex examples ~~~ blockquotes #bq [cortav] ~~~ the following excerpts of text were recovered from a partially erased hard drive found in the Hawthorne manor in the weeks after the Incident. context is unknown. -#> -—spoke to the man under the bridge again, the one who likes to bite the heads off the fish, and he suggested i take a brief sabbatical and journey to the Wandering Oak (where all paths meet) in search of inspiration and the forsaken sword of Pirate Queen Granuaile. a capital idea! 
i shall depart upon the morrow, having honored the Lord Odin and poisoned my accursed minstrels as is tradition— -—can't smell my soul anymore, but that's beside the point entirely— -—that second moon (always have wondered why nobody else seems to notice the damn fool thing except on Michaelmas day). alas, my luck did not endure, and i was soon to find myself knee-deep in— -—just have to see about that, won't we!— -# +> —spoke to the man under the bridge again, the one who likes to bite the heads off the fish, and he suggested i take a brief sabbatical and journey to the Wandering Oak (where all paths meet) in search of inspiration and the forsaken sword of Pirate Queen Granuaile. a capital idea! i shall depart upon the morrow, having honored the Lord Odin and poisoned my accursed minstrels as is tradition— +> —can't smell my soul anymore, but that's beside the point entirely— +> —that second moon (always have wondered why nobody else seems to notice the damn fool thing except on Michaelmas day). alas, my luck did not endure, and i was soon to find myself knee-deep in— +> —just have to see about that, won't we!— the nearest surviving relative of Lord Hawthorne is believed to be a wandering beggar with a small pet meerkat who sells cursed wooden trinkets to unwary children. she will not be contacted, as the officers of the Yard fear her. ~~~ ~~~links & notes #lnr [cortav] ~~~ @@ -658,11 +662,11 @@ .danger: (unknown) $agent ZUCCHINI PARABLE .civil: Zephram "Rolodex" Goldberg .danger: Category Scarlet -$agent RHADAMANTH EXQUISITE +$agent RHADAMANTH EXCISE .roe: Eliminate with extreme prejudice; CBRN deployment authorized .danger: [*Unquantifiable] ~~~ ~~~ tables #tab [cortav] ~~~ @@ -764,11 +768,11 @@ used files should return a table with the following members * [`macros]: an array of functions that return strings or arrays of strings when invoked. these will be injected into the global macro namespace. 
###ts ts -the [*ts] extension allows documents to be marked up for basic classification constraints and automatically redacted. if you are seriously relying on [`ts] for confidentiality, make damn sure you start the file with [$%[*requires] ts], so that rendering will fail with an error if the extension isn't supported. +the [*ts] extension allows documents to be marked up for basic classification constraints and automatically redacted. if you are seriously relying on [`ts] for confidentiality, make damn sure you start the file with [$%!![*needs] ts], so that rendering will fail with an error if the extension isn't supported. [`ts] currently has no support for misinformation. [`ts] enables the directives: * [`%[*ts] class [$scope level] ([$styled-text])]: indicates a classification level for either the whole document (scope [$doc]) or the next section (scope [$sec]). if the ts level is below [$level], the section will be redacted or rendering will fail with an error, as appropriate. if styled-text is included, this will be treated as the name of the classification level. Index: cortav.lua ================================================================== --- cortav.lua +++ cortav.lua @@ -1,9 +1,19 @@ -- [ʞ] cortav.lua -- ~ lexi hale -- © AGPLv3 -- ? reference implementation of the cortav document language +-- +-- ! TODO refactor encoding logic. it's a complete +-- mess and i seem to have repeatedly gotten +-- confused about how it's supposed to work. +-- the whole shitshow needs to be replaced +-- with a clean, simple paradigm: documents +-- are translated to UTF8 on the way in, and +-- translated back out on the way out. trying +-- to cope with multiple simultaneous +-- encodings in memory is a disaster zone. 
local ss = require 'sirsem' -- aliases for commonly used sirsem funcs local startswith = ss.str.begins local dump = ss.dump @@ -734,10 +744,17 @@ spans = {str}; origin = o; }}; origin = o; } + end + local function unicodepoint(s,c) + local cp = tonumber(s, 16) + return { + kind = 'codepoint'; + code = cp; + } end ct.spanctls = { {seq = '!', parse = formatter 'emph'}; {seq = '*', parse = formatter 'strong'}; {seq = '~', parse = formatter 'strike'}; @@ -796,10 +813,16 @@ {seq = '>', parse = insert_link}; {seq = '→', parse = insert_link}; {seq = '🔗', parse = insert_link}; {seq = '##', parse = insert_var_ref(true)}; {seq = '#', parse = insert_var_ref(false)}; + + {seq = 'U+', parse = unicodepoint}; + {seq = 'u+', parse = unicodepoint}; + {seq = 'U', parse = unicodepoint}; + {seq = 'u', parse = unicodepoint}; + {seq = '%%', parse = function (s,c) local com = s:match '^%%%%%s*(.*)$' return { kind = 'comment'; comment = com; Index: desk/cortav.xml ================================================================== --- desk/cortav.xml +++ desk/cortav.xml @@ -51,11 +51,12 @@ - + + @@ -180,11 +181,12 @@ - + + @@ -237,11 +239,11 @@ - + @@ -253,10 +255,11 @@ + Index: makefile ================================================================== --- makefile +++ makefile @@ -34,10 +34,13 @@ # script to the repository, i'll happily take merge requests :) lua != which lua luac != which luac sh != which sh + +#sterilize the operating theatre +lua += -E extens = $(wildcard ext/*.lua) extens-names ?= $(basename $(notdir $(extens))) rendrs = $(wildcard render/*.lua) rendrs-names ?= $(basename $(notdir $(rendrs))) @@ -59,17 +62,10 @@ endif dbg-flags-luac = $(if $(debug),,-s) dbg-flags-cc = $(if $(debug),-g,-s) -# sterilize the operating theatre -export LUA_PATH=./?.lua;./?.lc -export LUA_PATH_5_3=./?.lc;./?.lua -export LUA_PATH_5_4=./?.lc;./?.lua -export LUA_INIT= -export LUA_INIT_5_3= -export LUA_INIT_5_4= # by default, we fetch and parse information about encodings we # support so that 
cortav can do fancy things like format math # equations by character class (e.g. italicizing variables) # this is not necessary for parsing the format, and can be @@ -109,15 +105,16 @@ syncdoc: $(build)/cortav.html fossil uv add $< --as cortav.html fossil uv sync --all # clean is written in overly cautious fashion to minimize damage, -# just in case it ever gets invoked in a bad way +# just in case it ever gets invoked in a bad way (e.g. build=/) .PHONY: clean clean: rm -f $(build)/*.{html,lc,sh,txt,desktop} \ - $(build)/$(executable){,.bin} + $(build)/$(executable){,.bin} \ + $(build)/bind rmdir $(build) $(build)/%.sh: desk/%.sh | $(build)/ echo >$@ "#!$(sh)" echo >>$@ 'cortav_exec="$(bin-prefix)/$(executable)"' Index: render/groff.lua ================================================================== --- render/groff.lua +++ render/groff.lua @@ -345,11 +345,15 @@ elseif s.style == 'insert' then rs.macAdd 'insert' rcc.prop.color = 'new' end rs.renderSpans(rcc, s.spans, b, sec) - end; + end + + function spanRenderers.codepoint(rc, s, b, sec) + utf8.char(s.code) + end function spanRenderers.link(rc, l, b, sec) rs.renderSpans(rc, l.spans, b, sec) rs.linkctr = rs.linkctr + 1 rs.macAdd 'footnote' Index: render/html.lua ================================================================== --- render/html.lua +++ render/html.lua @@ -696,10 +696,17 @@ elseif sp.style == 'variable' then addStyle 'var' end return tag(tags[sp.style],nil,htmlSpan(sp.spans,...)) end + + function span_renderers.codepoint(t,b,s) + -- is this a UTF8 output? + return utf8.char(t.code) + -- else + -- return string.format("&#%u;", code) + end function span_renderers.deref(t,b,s) local r = b.origin:ref(t.ref) local name = t.ref if name:find'%.' 
then name = name:match '^[^.]*%.(.+)$' end Index: sirsem.lua ================================================================== --- sirsem.lua +++ sirsem.lua @@ -19,10 +19,17 @@ end ss = namespace 'sirsem' ss.namespace = namespace end +-- the C shim provides extra functionality that cannot +-- be implemented in pure Lua. this functionality is +-- accessed through the _G.native namespace. native +-- functions should not be called directly; rather, +-- they should be called from sirsem.lua wrappers that +-- can provide alternative implementations or error +-- messages when cortav is built in plain lua mode local native = _G.native function ss.map(fn, lst) local new = {} for k,v in pairs(lst) do