# SPDX-License-Identifier: BSD-2-Clause

# Split input into tokens, appended to the array ta. More specific patterns
# are tried first; every pattern is anchored at the start of the input.
function tokenize_into(input, ta,    chars_left, chars_done, tal) {
    chars_done = 0
    chars_left = length(input)
    tal = length(ta)
    while(chars_left > 0) {
        if(match(input, /^[ ]+/)) {
            # whitespace: consumed below, but no token is emitted
        } else if(match(input, /^~/)) { # tilde
            ta[++tal] = substr(input, 1, RLENGTH)
        } else if(match(input, /^,@/)) { # unquote-splicing
            ta[++tal] = substr(input, 1, RLENGTH)
        } else if(match(input, /^[\[\]{}()'`,!^@]/)) { # special single char
            ta[++tal] = substr(input, 1, RLENGTH)
        } else if(match(input, /^"(\\.|[^\\"])*"?/)) { # double-quoted string
            ta[++tal] = substr(input, 1, RLENGTH)
        } else if(match(input, /^;.*/)) { # comment
            ta[++tal] = substr(input, 1, RLENGTH)
        } else if(match(input, /^[^ \[\]{}('"`,;)@]+/)) { # non-special chars
            ta[++tal] = substr(input, 1, RLENGTH)
        } else {
            logg_err("tokz", "unrecognized input at char " \
                chars_done ": " input)
            exit 1
        }
        if(RSTART != 0) {
            # all patterns are anchored to ^ so RSTART is always 1
            input = substr(input, 1+RLENGTH)
            chars_left -= RLENGTH
            chars_done += RLENGTH
        } else {
            logg_err("tokz", "at char " chars_done ", token not matched: " input)
            exit 1
        }
        # logg_dbg("tokz", " -> " tal " tokens; " chars_left " chars_left; input: " input)
    }
}

function read_str(s,    i, ta, tal) {
    delete ta[1] # make sure ta is an array for gawk -c
    tokenize_into(s, ta)
    tal = length(ta)
    i[1] = 1 # make i an array so we can pass it by reference
    return read_form(i, ta, tal, 0)
}

# Read one form starting at token i[1], consuming a leading quote-like
# token if present.
function read_form(i, ta, tal,    quote) {
    if(ta[i[1]] == "'") {
        quote = "quote"
        i[1]++
    } else if(ta[i[1]] == "`") {
        quote = "quasiquote"
        i[1]++
    } else if(ta[i[1]] == ",") {
        quote = "unquote"
        i[1]++
    } else if(ta[i[1]] == ",@") {
        quote = "unquote-splicing"
        i[1]++
    } else {
        quote = 0
    }
    if(match(ta[i[1]], /^\(/)) {
        # logg_dbg("read_form", "( at token " i[1])
        i[1] += 1
        return read_list(i, ta, tal, quote)
    } else {
        return read_atom(i, ta, tal, quote)
    }
}

function read_list(i, ta, tal, quote,    prevtail, head) {
    head = _nil()
    # logg_dbg("read_list", "at beginning, i is " i[1] "; token is " ta[i[1]])
    for(; (i[1] <= tal) && (ta[i[1]] !~ /^[).]/); i[1]++) {
        # logg_dbg("rd_l", "in loop, i is " i[1] "; token is " ta[i[1]])
        head = _cons(read_form(i, ta, tal), head)
    }
    # logg_dbg("read_list", "after loop, i[1] is " i[1] "; token is " ta[i[1]] "; head is " head)
    prevtail = head
    head = _nreverse(head)
    if(ta[i[1]] == ".") {
        # dotted pair: splice the form after the dot into the last cdr;
        # prevtail is the final cell of the reversed list
        i[1] += 1
        _set_cdr(prevtail, read_form(i, ta, tal))
        i[1] += 1
    } else if(ta[i[1]] ~ /^\)/) { # properly terminated
        # logg_dbg("read_list", "after _nreverse, head is " head)
    } else {
        logg_err("read_list", "unbalanced parentheses at token: " ta[i[1]-1])
        return _nil()
    }
    if(quote) {
        # logg_dbg("read_list", "wrapping in " quote)
        head = _cons(_symbol(quote), _cons(head, _nil()))
    }
    return head
}
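
# Worked example: "(1 2 . 3)" tokenizes to "(", "1", "2", ".", "3", ")".
# The loop in read_list conses 1 then 2 onto head (building the list in
# reverse), _nreverse flips the spine in place, and the "." branch splices
# 3 into the last cell's cdr, yielding the improper list (1 2 . 3).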

# Parse the single token at i[1] into a number, boolean, nil, string, or
# symbol. Does not advance i[1]; the caller does.
function read_atom(i, ta, tal, quote,    this, ans, self_quoting) {
    # examples, separated by spaces: 3 3.14159 3e10 +5 -3.5e-26
    #
    # this is more restrictive than awk's idea of a double literal
    # (e.g. no 0x stuff)
    this = ta[i[1]]
    # logg_dbg("read_atom", "token is " this)
    self_quoting = 1
    if(this ~ /^(\+|-)?([0-9]+\.)?[0-9]+([eE][+-]?[0-9]+)?$/) {
        ans = _number(this)
    } else if(tolower(this) == "true") {
        ans = _true()
    } else if(tolower(this) == "false") {
        ans = _false()
    } else if(tolower(this) == "nil") {
        ans = _nil()
    } else if(this == ".") {
        ans = "."
    } else if(this ~ "^\"") { # strip quotes
        ans = _string(substr(this, 2, length(this)-2))
    } else {
        self_quoting = 0
        ans = _symbol(this)
    }
    if(quote) {
        if(self_quoting) {
            logg_err("read_atom", "attempt to " quote " self-quoting atom " _repr(ans))
        } else if(quote == "quasiquote") {
            logg_err("read_atom", "attempt to " quote " an atom")
        } else {
            ans = _cons(_symbol(quote), _cons(ans, _nil()))
        }
    }
    return ans
}

function _smoke_test_reader_tokenizer() {
    tokenize_into(" ,~@(foo)", ta)
    for(ti=1; ti<=length(ta); ti++) {
        print "tokenarray[" ti "] = " ta[ti]
    }
}

function _smoke_test_reader() {
    x = read_str("(foo \"bar\" baz 3.14159 (sublist 1 2 3))")
    logg_inf("_smoke_test_reader", "final result: " x ", being " _repr(x))
}
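
# An extra smoke test (a sketch in the style of the tests above, assuming
# the same logg_inf/_repr helpers): exercises the quote wrapper and the
# dotted-pair branch of read_list together.
function _smoke_test_reader_dotted() {
    x = read_str("'(a b . c)")
    logg_inf("_smoke_test_reader_dotted", "final result: " x ", being " _repr(x))
}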