# SPDX-License-Identifier: BSD-2-Clause

# Split input into tokens, appended to the array ta. More specific patterns
# are tried first; every pattern is anchored at the start of the input.
function tokenize_into(input, ta,    chars_left, chars_done, tal) {
    chars_done = 0
    chars_left = length(input)
    tal = length(ta)
    while(chars_left > 0) {
        if(match(input, /^[ ]+/)) {
            # whitespace: consumed below, but no token is emitted
        } else if(match(input, /^~/)) { # tilde
            ta[++tal] = substr(input, 1, RLENGTH)
        } else if(match(input, /^,@/)) { # unquote-splicing
            ta[++tal] = substr(input, 1, RLENGTH)
        } else if(match(input, /^[\[\]{}()'`,!^@]/)) { # special single char
            ta[++tal] = substr(input, 1, RLENGTH)
        } else if(match(input, /^"(\\.|[^\\"])*"?/)) { # double-quoted string
            ta[++tal] = substr(input, 1, RLENGTH)
        } else if(match(input, /^;.*/)) { # comment
            ta[++tal] = substr(input, 1, RLENGTH)
        } else if(match(input, /^[^ \[\]{}('"`,;)@]+/)) { # non-special chars
            ta[++tal] = substr(input, 1, RLENGTH)
        } else {
            logg_err("tokz", "unrecognized input at char " \
                chars_done ": " input)
            exit 1
        }
        if(RSTART != 0) {
            # all patterns are anchored to ^ so RSTART is always 1
            input = substr(input, 1+RLENGTH)
            chars_left -= RLENGTH
            chars_done += RLENGTH
        } else {
            logg_err("tokz", "at char " chars_done ", token not matched: " input)
            exit 1
        }
        # logg_dbg("tokz", " -> " tal " tokens; " chars_left " chars_left; input: " input)
    }
}

function read_str(s,    i, ta, tal) {
    delete ta[1] # make sure ta is an array for gawk -c
    tokenize_into(s, ta)
    tal = length(ta)
    i[1] = 1 # make i an array so we can pass it by reference
    return read_form(i, ta, tal, 0)
}

# Read one form starting at token i[1], consuming a leading quote-like
# token if present.
function read_form(i, ta, tal,    quote) {
    if(ta[i[1]] == "'") {
        quote = "quote"
        i[1]++
    } else if(ta[i[1]] == "`") {
        quote = "quasiquote"
        i[1]++
    } else if(ta[i[1]] == ",") {
        quote = "unquote"
        i[1]++
    } else if(ta[i[1]] == ",@") {
        quote = "unquote-splicing"
        i[1]++
    } else {
        quote = 0
    }
    if(match(ta[i[1]], /^\(/)) {
        # logg_dbg("read_form", "( at token " i[1])
        i[1] += 1
        return read_list(i, ta, tal, quote)
    } else {
        return read_atom(i, ta, tal, quote)
    }
}

function read_list(i, ta, tal, quote,    prevtail, head) {
    head = _nil()
    # logg_dbg("read_list", "at beginning, i is " i[1] "; token is " ta[i[1]])
    for(; (i[1] <= tal) && (ta[i[1]] !~ /^[).]/); i[1]++) {
        # logg_dbg("rd_l", "in loop, i is " i[1] "; token is " ta[i[1]])
        head = _cons(read_form(i, ta, tal), head)
    }
    # logg_dbg("read_list", "after loop, i[1] is " i[1] "; token is " ta[i[1]] "; head is " head)
    prevtail = head
    head = _nreverse(head)
    if(ta[i[1]] == ".") {
        # dotted pair: splice the form after the dot into the last cdr;
        # prevtail is the final cell of the reversed list
        i[1] += 1
        _set_cdr(prevtail, read_form(i, ta, tal))
        i[1] += 1
    } else if(ta[i[1]] ~ /^\)/) { # properly terminated
        # logg_dbg("read_list", "after _nreverse, head is " head)
    } else {
        logg_err("read_list", "unbalanced parentheses at token: " ta[i[1]-1])
        return _nil()
    }
    if(quote) {
        # logg_dbg("read_list", "wrapping in " quote)
        head = _cons(_symbol(quote), _cons(head, _nil()))
    }
    return head
}
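
# Worked example: "(1 2 . 3)" tokenizes to "(", "1", "2", ".", "3", ")".
# The loop in read_list conses 1 then 2 onto head (building the list in
# reverse), _nreverse flips the spine in place, and the "." branch splices
# 3 into the last cell's cdr, yielding the improper list (1 2 . 3).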

# Parse the single token at i[1] into a number, boolean, nil, string, or
# symbol. Does not advance i[1]; the caller does.
function read_atom(i, ta, tal, quote,    this, ans, self_quoting) {
    # examples, separated by spaces: 3 3.14159 3e10 +5 -3.5e-26
    #
    # this is more restrictive than awk's idea of a double literal
    # (e.g. no 0x stuff)
    this = ta[i[1]]
    # logg_dbg("read_atom", "token is " this)
    self_quoting = 1
    if(this ~ /^(\+|-)?([0-9]+\.)?[0-9]+([eE][+-]?[0-9]+)?$/) {
        ans = _number(this)
    } else if(tolower(this) == "true") {
        ans = _true()
    } else if(tolower(this) == "false") {
        ans = _false()
    } else if(tolower(this) == "nil") {
        ans = _nil()
    } else if(this == ".") {
        ans = "."
    } else if(this ~ "^\"") { # strip quotes
        ans = _string(substr(this, 2, length(this)-2))
    } else {
        self_quoting = 0
        ans = _symbol(this)
    }
    if(quote) {
        if(self_quoting) {
            logg_err("read_atom", "attempt to " quote " self-quoting atom " _repr(ans))
        } else if(quote == "quasiquote") {
            logg_err("read_atom", "attempt to " quote " an atom")
        } else {
            ans = _cons(_symbol(quote), _cons(ans, _nil()))
        }
    }
    return ans
}

function _smoke_test_reader_tokenizer() {
    tokenize_into(" ,~@(foo)", ta)
    for(ti=1; ti<=length(ta); ti++) {
        print "tokenarray[" ti "] = " ta[ti]
    }
}

function _smoke_test_reader() {
    x = read_str("(foo \"bar\" baz 3.14159 (sublist 1 2 3))")
    logg_inf("_smoke_test_reader", "final result: " x ", being " _repr(x))
}
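
# An extra smoke test (a sketch in the style of the tests above, assuming
# the same logg_inf/_repr helpers): exercises the quote wrapper and the
# dotted-pair branch of read_list together.
function _smoke_test_reader_dotted() {
    x = read_str("'(a b . c)")
    logg_inf("_smoke_test_reader_dotted", "final result: " x ", being " _repr(x))
}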