util  ord.c at [a38de374f1]

File ord.c artifact aa1f140ed9 part of check-in a38de374f1


/* [ʞ] ord.c - integer converter
 *  ~ lexi hale <lexi@hale.su>
 *  © AGPLv3
 *  * ord has no dependencies except for libc.
 *  ? ord converts integers to ascii characters
 *    and back. written because the only fucking
 *    way to do this in shell is FUCKING PRINTF.
 *  $ cc ord.c -o ord [-D_IO=(LIBC|POSIX)]
 *  	- the flag D_IO will instruct ord.c whether
 *  	  to use POSIX io primitives (write and read)
 *  	  instead of libc primitives (printf). if
 *  	  you're on a UNIX system, POSIX primitives
 *  	  will be used by default, but you can block
 *  	  them with LIBC or force them with POSIX.
 *  	  if you are on a POSIX-compliant system,
 *  	  you *should* use POSIX IO, for improved
 *  	  performance and safety. */

/* IO backend selection.
 *
 * the build flag -D_IO=LIBC or -D_IO=POSIX makes _IO expand to one of
 * the two tokens below, so they need distinct nonzero values for the
 * comparison to mean anything: an undefined identifier in #if is 0,
 * which would make "_IO == POSIX" true no matter what was passed. they
 * are undefined again right after the decision so the short names do
 * not leak into the rest of the file. */
#define LIBC  1
#define POSIX 2
#if (defined(__unix__) && _IO != LIBC) || (_IO == POSIX)
#	define _POSIX_IO
#endif
#undef LIBC
#undef POSIX

/* say(x)       - write the string literal x to stderr
 * print(sz, x) - write sz bytes of x to stdout; the libc backend
 *                ignores sz and prints x up to its terminator
 * forposix(x)  - expands to x only under the POSIX backend
 * forlibc(x)   - expands to x only under the libc backend */
#ifdef _POSIX_IO
#	include <unistd.h>
	/* sizeof counts the terminating NUL, which must not be written */
#	define say(x) (write(2, (x), (sizeof (x)) - 1))
#	define print(sz,x) (write(1, (x), (sz)))
#	define forposix(x) x
#	define forlibc(x)
#else
#	include <stdio.h>
	/* never hand the message to fprintf as the format string itself */
#	define say(x) (fprintf(stderr, "%s", (x)))
#	define print(sz,x) (printf("%s",(x)))
#	define forposix(x)
#	define forlibc(x) x
#endif
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <limits.h>
/* element count of a true array (not of a pointer) */
#define sz(x) ( sizeof (x) / sizeof *(x) )

enum /* constants */ {
	null = 0,

	/* how many characters each ascii range holds,
	 * computed from its first and last code point */
	numspace        = 0x39 - 0x30 + 1, /* '0'..'9' : 10 */
	alphaspace      = 0x5a - 0x41 + 1, /* 'A'..'Z' : 26 */
	smallalphaspace = 0x7a - 0x61 + 1, /* 'a'..'z' : 26 */

	/* largest base whose letters are case-insensitive (36),
	 * and largest base representable at all (62) */
	imaxbase = numspace + alphaspace,
	maxbase  = numspace + alphaspace + smallalphaspace
};

/* widest unsigned integer the converter works with */
typedef unsigned long long int word;
/* home-grown boolean, so <stdbool.h> is not needed */
typedef _Bool bool;
enum { false, true };

/* one row of a keyword table: the keyword and the value it maps to */
typedef struct pair {
	uint8_t val;
	const char* str;
} pair;

/* x-macro listing every reportable error as e(name, description);
 * define e() to taste, expand error_list, then undefine e() again */
#define error_list \
	e(domain, "bad argument passed for domain") \
	e(find, "could not find key in table") \
	e(syntax, "invalid syntax") \
	e(base, "that base is out of range") \
	e(overflow, "a memory overflow has occurred") \
	e(ebcdic, "nice try, mr ibm-san")

/* status code returned by the conversion routines: ok, a generic
 * fail, then one bad_<name> per error_list entry in list order */
typedef enum bad {
	ok,
	fail,
#	define e(name, desc) bad_##name,
	error_list
#	undef e
} bad;

/* linear search of a keyword table.
 *  stacksz  - number of rows in haystack
 *  haystack - the table to search
 *  needle   - keyword to look for (exact strcmp match)
 *  val      - receives the value of the first matching row
 * returns ok on a match, bad_find (leaving *val alone) otherwise. */
bad tblget(size_t stacksz, const pair* haystack, const char* needle, uint8_t* val) {
	for (size_t left = stacksz; left > 0; --left, ++haystack) {
		if (strcmp(haystack->str, needle) != 0) continue;

		*val = haystack->val;
		return ok;
	}
	return bad_find;
}

/* every symbol a command-line keyword can resolve to */
enum argument {
	/* structural keywords */
	arg_to,
	arg_set,
	arg_base,

	/* ascii pseudo-base */
	arg_asc,

	/* base shorthands */
	arg_bin,
	arg_trn,
	arg_oct,
	arg_dec,
	arg_duo,
	arg_hex,
	arg_b32,
	arg_b64,

	/* output switches */
	switch_prefix,
	param_prefix,
	switch_lowercase,

	arg_ebcdic,
};

/* numeric base selected by each base-shorthand keyword, indexed by
 * enum argument. 0 is the ascii pseudo-base (atoi and itoa hand base 0
 * to asctoi and itoasc). the array ends at arg_b32, so arg_b64 has no
 * entry here. */
word bases[] = {
	[arg_asc] =  0,
	[arg_bin] =  2,
	[arg_trn] =  3,
	[arg_oct] =  8,
	[arg_dec] = 10,
	[arg_duo] = 12,
	[arg_hex] = 16,
	[arg_b32] = 32,
};

/* automatic output prefixes, indexed by numeric base. each entry is a
 * length-prefixed string: the first byte is the prefix length and the
 * prefix text follows it. bases without an entry are null pointers.
 * NOTE(review): base 12 is given "0d" here, but atoi reads a leading
 * "0d" as base 10 - confirm which of the two is intended. */
const char* prefixes [] = { null,
	[ 0] = "\1" "@",
	[ 2] = "\2" "0b",
	[ 3] = "\2" "0t",
	[ 8] = "\1" "0",
	[12] = "\2" "0d",
	[16] = "\2" "0x",
};

/* keyword table for the command line: maps every spelling of a
 * keyword or switch to its enum argument symbol. searched with
 * tblget, so the first row whose string matches wins. */
const pair argtbl[] = {
	{arg_to, "to"},
	{arg_base, "base"},

	{arg_set, "--"}, {arg_set, "raw"},

	{arg_asc, "asc"}, {arg_asc, "ascii"},

	{arg_bin, "bin"}, {arg_bin, "binary"},
	{arg_trn, "trn"}, {arg_trn, "tern"}, {arg_trn, "ternary"}, {arg_trn, "trinary"},
	{arg_oct, "oct"}, {arg_oct, "octal"},
	{arg_dec, "dec"}, {arg_dec, "decimal"},
	{arg_duo, "duo"}, {arg_duo, "duodecimal"},
	{arg_hex, "hex"}, {arg_hex, "hexadecimal"},

	/* these must select base 32, not hexadecimal */
	{arg_b32, "b32"}, {arg_b32, "base32"}, /* not padded! */

	{switch_prefix, "-p"}, {switch_prefix, "--prefix"},
	{switch_lowercase, "-l"}, {switch_lowercase, "--lowercase"},
	{param_prefix, "-m"}, {param_prefix, "--manual-prefix"},

	{arg_ebcdic, "ebcdic"},
};

/* read a string as a base-128 number, one digit per character.
 *  s   - null-terminated string; every byte must be 7-bit ascii
 *  ret - receives the value; written only on success
 * returns ok, or bad_domain if a byte is outside 0..127. the
 * accumulator wraps silently if the string is too long for a word. */
bad asctoi(const char* s, word* ret) {
	enum { base = 128 };
	word val = 0;

	for (;*s!=null;++s) {
		uint8_t v = *s;
		/* valid digits are 0 .. base-1, so base itself is out of range */
		if (v >= base) return bad_domain;

		val *= base;
		val += v;
	}

	*ret = val;
	return ok;
}

/* parse a numeral string into a word.
 *  base - base to read the digits in; 0 means "ascii string"
 *  s    - must be a null-terminated ASCII numeral string
 *  ret  - receives the value; written only on success
 * a leading '@' forces ascii interpretation, and a leading 0x / 0d /
 * 0b / 0t / 0 replaces the requested base with 16 / 10 / 2 / 3 / 8.
 * letters are case-insensitive up to base 36; beyond that, lowercase
 * letters are separate digits that follow the uppercase ones.
 * returns ok, bad_base if base exceeds maxbase, or bad_domain if a
 * character is not a digit of the effective base. */
bad atoi(word base, const char* s, word* ret) {
	if (base > maxbase) return bad_base;

	/* ascii strings are handled elsewhere; skip the '@' marker if present */
	if (s[0] == '@') return asctoi(s + 1, ret);
	if (base == 0)   return asctoi(s, ret);

	/* a literal prefix overrides whatever base was requested */
	if (s[0] == '0') {
		switch (s[1]) {
			case 'x': base = 16; s += 2; break;
			case 'd': base = 10; s += 2; break;
			case 'b': base =  2; s += 2; break;
			case 't': base =  3; s += 2; break;
			default:  base =  8; s += 1; break;
		}
	}

	const bool insens = (base <= imaxbase);
	word total = 0;

	for (const char* p = s; *p != null; ++p) {
		const uint8_t c = *p;
		word digit;

		if (c >= 0x30 && c <= 0x39) {
			digit = c - 0x30;
		} else if (c >= 0x41 && c <= 0x5a) {
			digit = numspace + (c - 0x41);
		} else if (c >= 0x61 && c <= 0x7a) {
			/* lowercase either aliases uppercase or gets its own range */
			digit = (insens ? numspace : numspace + alphaspace) + (c - 0x61);
		} else {
			return bad_domain;
		}

		if (digit >= base) return bad_domain;
		total = total * base + digit;
	}

	*ret = total;
	return ok;
}

/* digit alphabet used by itoa: the character at index n is the
 * digit whose value is n. needed for efficiency's sake, but really
 * sucky - given C's abject lack of metaprogramming, this table has
 * to be kept in sync with the itoa algorithm by hand. */
const char baseref[] = /* numerals[10] */ "0123456789"
	/* bigalpha[26] */ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	/* smallalpha[26] */ "abcdefghijklmnopqrstuvwxyz";
/* C11 requires the message argument */
_Static_assert (sizeof baseref - 1 == maxbase,
	"baseref must hold exactly maxbase digits");

/* render val as a base-128 string, one character per 7-bit digit,
 * filling the buffer backwards from buf_end.
 *  val       - value to render
 *  buf_start - lowest address that may be written
 *  buf_end   - address that receives the terminator
 *  newbuf    - if not null, receives the start of the string
 * returns ok, or bad_overflow (leaving *newbuf alone) when the
 * digits do not fit. a val of 0 yields the empty string. */
bad itoasc(word val, const char* buf_start, char* buf_end, char** newbuf) {
	char* cursor = buf_end;

	*cursor = 0;
	--cursor;

	for (; val != 0; val >>= 7) {
		if (cursor < buf_start) return bad_overflow;
		*cursor = (char)(val & 0x7f);
		--cursor;
	}

	if (newbuf != null) *newbuf = cursor + 1;
	return ok;
}

/* when set, itoa emits lowercase letters for case-insensitive bases */
bool lowercase = false;

/* render val as a numeral in the given base, filling the buffer
 * backwards from buf_end.
 *  base      - 0 for an ascii string (see itoasc), otherwise 1..maxbase
 *  val       - value to render
 *  buf_start - lowest address that may be written
 *  buf_end   - address that receives the terminator
 *  newbuf    - if not null, receives the start of the numeral
 * returns ok, bad_base if base exceeds maxbase, or bad_overflow
 * (leaving *newbuf alone) when the digits do not fit.
 * NOTE(review): with base 1 the value never shrinks, so any nonzero
 * val runs into bad_overflow - confirm how unary is meant to work. */
bad itoa(word base, word val, const char* buf_start,
		char* buf_end, char** newbuf) {

	if (base > maxbase) return bad_base;
	if (base == 0) return itoasc(val, buf_start, buf_end, newbuf);

	/* atoi treats every base up to and including imaxbase as
	 * case-insensitive, so the same bases may be folded here */
	const bool fold = lowercase && base <= imaxbase;

	/* ptr always points at the most recently written character, so
	 * it never has to step below buf_start */
	char* ptr = buf_end;
	*ptr = 0;

	/* do-while: zero must still produce its single digit "0" */
	do {
		if (ptr == buf_start) return bad_overflow;
		char out = baseref[val % base];
		val /= base;
		if (fold && out >= 'A' && out <= 'Z')
			out += ('a' - 'A');
		*--ptr = out;
	} while (val > 0);

	if (newbuf != null) *newbuf = ptr;
	return ok;
}

bad run(const int argc, const char** argv) {
#	ifndef _POSIX_IO
		/* fuck your buffering, it only ever makes
		 * things worse */
		setvbuf(stdout,null,_IONBF);
#	endif
	word rv;
	
	enum { set_in, set_out, _set_sz } curset = set_in;
	word base[_set_sz] = { 10, 0 };

	const char* in_vals[argc]; *in_vals = null; /* null-terminated! */
	const char** invalp = in_vals;
	const char* pfxstr;
	forposix(size_t pfxstrlen);

	
	bool raw = false;
	bool prefix = false;

	for (const char** arg = argv + 1; *arg != null; ++arg) {
		uint8_t tblval;
		if (*arg[0] == '%') { ++ *arg; goto number; } else
		if (!raw && (tblget(sz(argtbl),argtbl, *arg, &tblval) == ok)) {
			enum argument symbol = (enum argument) tblval;
			switch (symbol) {
				case arg_to: {
					if (curset == set_out) return bad_syntax;
					else curset = set_out;
				} break;

				/* treat all further arguments as numbers */
				case arg_set: { raw = true; } break;
				case arg_ebcdic: { return bad_ebcdic; } break;

				/* specify base with numeral */
				case arg_base: {
					if (arg[1] == null) return bad_syntax;
					word basekind;
					bad e = atoi(10, arg[1], &basekind);
					if (e == ok) {
						if (basekind > maxbase) return bad_base;
						base[curset] = basekind;
					} else return e;
					++arg;
				} break;

				/* specify an output prefix */
				case param_prefix: {
					if (arg[1] == null) return bad_syntax;
					prefix = true; pfxstr = arg[1];
					forposix(pfxstrlen = strlen(pfxstr));
					++arg;
				} break;

				/* specify an automatic output prefix */
				case switch_prefix: { prefix = true; pfxstr = null; } break;
				case switch_lowercase: { lowercase = true; } break;

				default: {
					/* assume base shorthand */
					base[curset] = bases[symbol];
				}
			}
		} else /* bad_find */ number: {
			/* we assume it's a number - error checking will
			 * happen once we know how to interpret it */
			*invalp++=*arg; *invalp=null;
		}
	}

	/* if an ascii string was passed, change to hexadecimal output */
	if (base[set_in] == 0 && curset != set_out) base[set_out] = 16;

	size_t max_numeral_len = 0;
	/* 0 = ascii rep (0 .. 127); one char = 7 bits */
	if (base[set_out] ==  0) max_numeral_len = (sizeof(word) * CHAR_BIT) / 7; else
	if (base[set_out] ==  1) max_numeral_len = 1024; /* pls don't */ else
	/* note for unary: actual max is ((word) -1) but we cannot actually allocate
	 * that much fucking memory, so we limit to 1KiB and crash if it needs more */
	if (base[set_out] <=  2) max_numeral_len = (sizeof(word) * CHAR_BIT); else
	if (base[set_out] <=  8) max_numeral_len = (sizeof(word) * CHAR_BIT) / 3; else
	if (base[set_out] <= 16) max_numeral_len = (sizeof(word) * CHAR_BIT) / 4; else
	/* (base[set_out] <= 32) */ max_numeral_len = (sizeof(word) * CHAR_BIT) / 5;

	/* this is i think the only sane-ish way to do it that
	 * doesn't involve *shudder* logarithms
		TODO: find a better way to do this??? */

	size_t bufmax = (invalp - in_vals) * max_numeral_len;
	char buf [bufmax];
	char* ptr = (buf + bufmax) - 1;
	forposix(char* lastptr = ptr);

	for (const char** s = in_vals; *s != null; ++s) {
		word val;
		bad e = atoi(base[set_in], *s, &val);
		if (e == ok) {
			bad e = itoa(base[set_out], val, buf, ptr, &ptr);

			if (prefix) {
				if (pfxstr != null) { print(pfxstrlen, pfxstr); }
				else if (base[set_out] < sz(prefixes)) {
					print((size_t)prefixes[base[set_out]][0],
							prefixes[base[set_out]] + 1);
				}
			}
			print(lastptr-ptr, ptr);
			print(1, "\n");
			forposix(lastptr = ptr);
		} else {
			return e;
		}
	}
}

/* print the help text: the two invocation forms (each preceded by the
 * program name), the base specs, the integer literal syntaxes and the
 * options. continuation lines of a section are indented to line up
 * under the first one.
 *  name - program name to show in the usage lines, normally argv[0] */
void usage(const char* name) {
#	ifdef _POSIX_IO
		/* write() needs explicit lengths, so lines carry theirs along;
		 * sizeof counts the terminating NUL, which must not be written */
		typedef struct pstr { size_t len; const char* str; } pstr;
#		define p(x) {sizeof (x "\n") - 1, (x "\n")}
		size_t namelen = strlen(name);
#	else
		typedef const char* pstr;
#		define p(x) (x "\n")
#	endif
	/* ansi-coloured building blocks for the grammar notation */
#	define OR "\x1b[34m|\x1b[93m" 
#	define plus "\x1b[94m+\x1b[m"
#	define par(s) "<\x1b[4m" s "\x1b[24m>"
#	define lit(l) "\x1b[3m" l "\x1b[23m"
#	define box(s) "\x1b[94m[\x1b[93m" s "\x1b[94m]\x1b[m"
		const pstr forms[] = {
			p(box(box("options") " " par("in:spec")) " " par("value:int") plus " "
					box(lit("to") " " box(par("out:spec")))),
			p(box(box("options") " " par("in:spec")) " " box(lit("to") " " box(par("out:spec")))
					" " lit("--") " " par("value:int") plus),
		}, specs[] = {
			p(box(lit("bin") OR lit("tern") OR lit("oct")
					OR lit("dec") OR lit("hex") OR
					lit("base") " " box("0-9") plus OR "asc")),
		}, ints[] = {
			p("default base: \x1b[94m.+\x1b[m"),
			p("binary literal: "lit("0b") box("01") plus),
			p("ternary literal: "lit("0t") box("012") plus),
			p("hex literal: "lit("0x") box("0-9A-Fa-f") plus),
			p("ascii literal: "lit("@") "\x1b[94m.+\x1b[m"),
			p("interpret any string (e.g. a keyword) as integer: " lit("%") box("0-9A-Za-z") plus),
		}, opts[] = {
			p("-p --prefix       : print known prefix codes on output strings"),
			p("-m --manual-prefix: specify a manual prefix to print before each number"),
			p("-l --lowercase    : prefer lowercase for case-insensitive bases"),
		};
#	undef p
#	undef OR
#	undef plus

	/* section headings are bold; SGR 22 is what switches bold back
	 * off (21 selects double underline) */
#	define hl_on  "\x1b[1m" 
#	define hl_off "\x1b[22m"
	/* both ansilen and sizeof <heading> count one terminating NUL, so
	 * their difference is the visible width of the heading */
	enum { ansilen = sizeof (hl_on hl_off) };
#	define hl(x) (hl_on x hl_off)
		const char form_head []= hl("usage: ");
		const char spec_head []= hl("- spec: ");
		const char int_head  []= hl("- int: ");
		const char opt_head  []= hl("- options: ");
		const char space     []=    "           "; /* sigh */
#	undef hl
#	undef hl_on
#	undef hl_off

#	ifdef _POSIX_IO
#		define _say(sz, s) write(2, (s), (sz));
#		define vsay _say
		/* headings are arrays: drop the NUL that sizeof counts */
#		define display(hd) _say(sizeof (hd) - 1, (hd));
#		define pline(l)    _say((l).len, (l).str);
#	else
		/* the precision argument of %.*s has to be an int */
#		define _say(sz, s) printf("%.*s", (int)(sz), (s));
#		define display(hd) printf("%s",(hd));
#		define vsay(sz, s) display(s)
#		define pline(l)    display(l);
#	endif

	/* space(n): n blanks; glow(x): run x in bright magenta;
	 * section(x, prefix): heading x_head, then every line of xs[],
	 * each preceded by prefix and, after the first, indented by the
	 * visible width of the heading */
#	define space(x) _say(x, space);
#	define glow(x) say("\x1b[95m"); { x }; say("\x1b[m");
#	define section(x,prefix) display(x##_head); \
		for(size_t i = 0; i < sz(x##s); ++ i) { \
		if (i>0) space(sizeof x##_head - ansilen); \
			{ prefix; }; pline(x##s[i]); }

	section(form,glow(vsay(namelen, name)); space(1));
	section(spec,);
	section(int,);
	section(opt,);
}

int main(int argc, const char** argv) {
	if (argc == 0) return -1;
	if (argc == 1) usage(argv[0]);
	bad e = run(argc, argv);
	switch (e) {
		case ok: return 0;
#		define e(kind, desc) case bad_##kind:\
				 say("\x1b[31;1merror:\x1b[m "); say(desc "\n"); break;
			error_list
#		undef e
	}
}