# A Lisp implemented in AWK
# SPDX-License-Identifier: BSD-2-Clause

BEGIN {
    # Translation table for backslash escapes inside string literals:
    # maps the character following the backslash to the character it
    # denotes. Only single-letter escapes are supported here (no
    # octal \012-style escapes).
    _string_unescapes["n"]  = "\n"      # newline
    _string_unescapes["t"]  = "\t"      # horizontal tab
    _string_unescapes["r"]  = "\r"      # carriage return
    _string_unescapes["a"]  = "\a"      # alert (bell)
    _string_unescapes["b"]  = "\b"      # backspace
    _string_unescapes["f"]  = "\f"      # form feed
    _string_unescapes["v"]  = "\v"      # vertical tab
    _string_unescapes["\\"] = "\\"      # literal backslash
    _string_unescapes["\""] = "\""      # literal double quote
}

# Throw away all pending parse state: drop every queued token and its
# recorded source location, and clear the "inside a multiline string"
# flag. Used on errors and after a successful toplevel read.
function _forget_parse_state(tokens, where, inside_string) {
    inside_string[1] = 0
    delete tokens
    delete where
}

# Append token t to the token stream, recording the source position
# (FILENAME:FNR) it came from in the parallel `where` array.
function _push_token(t, tokens, where,    slot) {
    slot = length(tokens) + 1
    where[slot] = FILENAME ":" FNR
    tokens[slot] = t
}

# tokenize: split one line of `input` into lexical tokens, appending
# them to `tokens` with matching source positions in `where`. string
# literals may span input lines, so inside_string[1] carries the
# "currently inside an unterminated string" flag between calls; while
# it is set, incoming text is accumulated onto the token at the top
# of `tokens` instead of starting new tokens.
function tokenize(input, tokens, where, inside_string,     chars_done, chars_left, top) {
    chars_done = 0
    chars_left = length(input)
    while(chars_left > 0) {
#        logg_dbg("tokz", "input is [" input "]; inside string? " inside_string[1])
        top = length(tokens)
        # warning: don't match() regexes on anything but input here.
        # we use RSTART and RLENGTH below, which match() sets. all
        # patterns must start with ^, so RSTART will always be 1.
        if(inside_string[1]) {
            # continuing a string literal begun on an earlier line:
            # append to tokens[top] rather than pushing a new token.
            if(match(input, /^"/)) {    # immediate string end
                tokens[top] = tokens[top] substr(input, 1, RLENGTH)
                inside_string[1] = 0
            } else if(match(input, /^(\\[^"]|[^\\"])*[^\\]"/)) { # stuff"
#                logg_dbg("tokz", "leaving string")
                tokens[top] = tokens[top] substr(input, 1, RLENGTH)
                inside_string[1] = 0
            } else {                    # no end quote in input. stay inside
                # newline inside string escaped?
                if(match(input, /^(\\[^\n]|[^\\])*\\$/)) {
                    # keep the backslash off
                    tokens[top] = tokens[top] substr(input, 1, RLENGTH-1)
                } else {
                    tokens[top] = tokens[top] input "\n"
                    # everything's OK, but we have not actually
                    # matched any regexps. set RSTART and RLENGTH to
                    # say we matched the whole line.
                    RSTART = 1
                    RLENGTH = length(input)+1
                }
            }
        } else {
            if(match(input, /^[ 	]+/))
                ;                                       # skip whitespace
            else if(match(input, /^~/))
                _push_token(substr(input, 1, RLENGTH), tokens, where)
            else if(match(input, /^,@/))               # unquote-splicing
                _push_token(substr(input, 1, RLENGTH), tokens, where)
            else if(match(input, /^[][()'`,!^@]/)) # special single char
                _push_token(substr(input, 1, RLENGTH), tokens, where)
            else if(match(input, /^"(\\[^\n]|[^\\"])*"/)) { # double-quoted string
#                logg_dbg("tokz", "complete string")
                _push_token(substr(input, 1, RLENGTH), tokens, where)
            } else if(match(input, /^"(\\([^"]|$)|[^\\"])*/)) {  # dq string no end
#                logg_dbg("tokz", "incomplete string")
                # a trailing backslash escapes the newline: keep it
                # off so the continuation line joins directly;
                # otherwise the newline is part of the string.
                if(substr(input, RLENGTH, 1) == "\\")
                    _push_token(substr(input, 1, RLENGTH-1), tokens, where)
                else
                    _push_token(substr(input, 1, RLENGTH) "\n", tokens, where)
                inside_string[1] = 1
#                logg_dbg("tokz", "inside_string[1]")
            } else if(match(input, /^;[^\n]*$/)) {             # comment
                # don't push any tokens at all: it's difficult to pick
                # out comment tokens when reading, but easy here
                #
                # _push_token(substr(input, 1, RLENGTH), tokens, where)
            } else if(match(input, /^[^][ 	('"`,;)@]+/)) # non-special chars
                _push_token(substr(input, 1, RLENGTH), tokens, where)
            else {
                # nothing matched. interactive input (no FILENAME, or
                # "-"): report and reset the parse state so the user
                # can retry; file input: report location and die.
                if(!FILENAME || FILENAME == "-") {
                    logg_err("tokz", "at char " chars_done ": " \
                             "unrecognized input: " input)
                    chars_left = 0
                    _forget_parse_state(tokens, where, inside_string)
                } else {
                    logg_err("tokz",
                             " " FILENAME ":" FNR ": "             \
                             "at char " chars_done ": "            \
                             "unrecognized input: " input)
                    exit(1)
                }
            }
        }
#        logg_dbg_array("tokz", "tokens", tokens)

        # consume whatever the successful match() covered.
        if(RSTART != 0) {
            # all patterns are anchored to ^ so RSTART is always 1
            input = substr(input, 1+RLENGTH)
            chars_left -= RLENGTH
            chars_done += RLENGTH
        } else {
            # no pattern matched and nothing set RSTART; same
            # interactive-vs-file error handling as above.
            if(!FILENAME || FILENAME == "-") {
                logg_err("tokz", "at char " chars_done ": "\
                         "token not matched: " input)
                chars_left = 0
                _forget_parse_state(tokens, where, inside_string)
            } else {
                logg_err("tokz", " " FILENAME ":" FNR ": " \
                         "at char " chars_done ": " \
                         "token not matched: " input)
                exit(1)
            }
        }
    }
}
 

# before, with the mal-step-2-like reader, we knew the ending
# nesting_level would be 0. there would always be one form to
# evaluate. the question was whether it was an atom or a list; if a
# list, recursion would take us to the end of the form (and no
# farther: multiple forms on the same line would be ignored). if the
# nesting level at the end were not zero, it would be an error, on
# that line.
#
# now, with multiline forms, the form and the line are not constrained
# to the same extent.
#
# - there may not be a complete form to read yet. we want to avoid
#   starting to read anything until we know we have enough tokens to
#   finish reading it.
#
# - there could be multiple forms on one
#   line, and so we may end up reading more than one.
#
# - there could be a fractional form at the end. we want to consume
#   the full form(s) and leave the half forms.
#
# and that means that when we read a list, we need its starting _and
# ending_ indices.

# read_forms_into: read as many complete forms as possible from
# tokens[a..b], appending the resulting values to `rvs`.  when called
# with only four arguments (so a is unset/0), it acts as the toplevel
# reader over the whole token array and clears the parse state once
# everything has been consumed; read_list() re-enters it with an
# explicit a..b range for list interiors.  incomplete trailing forms
# (an open list, an unterminated string) are deliberately left
# unread so later input can complete them.
function read_forms_into(rvs, tokens, where, inside_string,     a, b,  i, t, quote, topp, list_found, nesting_level) {
    # this is just to tell awk it is an array. it should never have
    # anything at [0] because awk is 1-based.
    delete rvs[0]
    if(!a) {
        a=1
        b=length(tokens)
        # don't try to read an incomplete string
        if(inside_string[1]) b--
#        logg_dbg_array("read_forms_into", "tokens", tokens)
        topp=1   # we are a toplevel read_forms_into
    }
#    logg_dbg("read_forms_into", "a " a " b " b " ingoing length(rvs) " length(rvs))
    while(a <= b) {    # we may break out, too, below
        t = tokens[a]
#        logg_dbg("read_forms_into", "reading " a " = " t)
#        logg_dbg("read_forms_into", "token " t " quote? " quote)
        # a quoting token applies to the next form read; remember
        # which kind and move on.
        if(t == "'") {
            quote = "quote";            a++; continue
        } else if(t == "`") {
            quote = "quasiquote";       a++; continue
        } else if(t == ",") {
            quote = "unquote";          a++; continue
        } else if(t == ",@") {
            quote = "unquote-splicing"; a++; continue
        }

        if(t == "(") {
            # scan forward for the matching ")" so we know the list's
            # full extent before reading any of it.
            list_found = 0
            nesting_level = 1
            for(i=a+1; i<=b; i++) {
                if(tokens[i] == "(") nesting_level++
                else if(tokens[i] == ")") nesting_level--
                if(nesting_level==0) {
                    # the list begun at a has ended at i
                    list_found = 1
                    break
                }
            }
            if(list_found) {
#                logg_dbg("read_forms_into", "list from " a " to " i)
                rvs[length(rvs)+1] = read_list(tokens, a+1, i-1, quote, where, inside_string)
                quote = 0
#                logg_dbg("read_forms_into", "done from " a " to " i)
                a = i+1
            } else {
#                logg_dbg("read_forms_into", "incomplete list, not reading")
                # the list must not be completely input yet. we can't
                # read it.
                return
            }
        } else if(t == ")") {
            # we already dealt with positive nesting levels; this is
            # negative
            logg_err("read_forms_into", "too many )'s at " where[length(where)])
            _forget_parse_state(tokens, where, inside_string)
            return
        } else {
#            logg_dbg("read_forms_into", "atom at " a " token is " t " quote? " quote)
            rvs[length(rvs)+1] = read_atom(a, tokens, quote)
            quote = 0
            a++
        }
    }
#    logg_dbg("read_forms_into", "a " a " b " b " lrvs " length(rvs))
    if(topp) {
        # we have read all the tokens. but! maybe we are inside_string[1]
        if(!inside_string[1]) {
            # ok. we can forget everything.
            _forget_parse_state(tokens, where, inside_string)
        }
    }
}

# support older api, where we put one string in, and get one eval'd
# value out. there can be multiple expressions so we'll return the
# value of the last one. really this is a bit like _evprog but with an
# awk array. anyway this is the api lib-eval.awk uses.
function eval_read_str(s,     l, i, rv) {
    # tokenize and read all forms in s, evaluate each in order, and
    # return the value of the last one (per the header comment above).
    tokenize(s, _TOKENS, _WHERE, _INSIDE_STRING)
    read_forms_into(_TO_EVAL, _TOKENS, _WHERE, _INSIDE_STRING)
    l = length(_TO_EVAL)
    for(i=1; i<=l; i++)
        rv = _eval(_TO_EVAL[i])
    delete _TO_EVAL
    # bug fix: rv was computed but never returned, so callers always
    # got "" (awk functions return "" by default).
    return rv
}

# read_list: build a list value out of tokens[a..b], the interior of
# a parenthesized form (the caller has already located the matching
# parens).  handles a dotted tail `... x . y)` by consing the last
# two forms; if `quote` is set, wraps the result as (<quote> <list>).
function read_list(tokens, a, b, quote, where, inside_string,     i, forms, prevtail, head) {
    head = _nil()
    delete forms
    ## orientation: tokens[a-1] == "(" and tokens[b+1] == ")"
    if(((b-a+1) >= 3) && tokens[b-1] == ".") {
        # the end of the list is dotted
        # NOTE(review): this only detects a dotted tail whose cdr is a
        # single atom token; (a . (b)) falls through to the proper-list
        # branch below -- confirm that is intended.
#        logg_dbg("read_list", "dotted. reading forms " b-2 " and " b ".")
        read_forms_into(forms, tokens, where, inside_string, a, b-2)
        read_forms_into(forms, tokens, where, inside_string, b, b)
        # the last two forms read are the final car and the dotted
        # cdr; cons them and pop both off `forms`.
        head = _cons(forms[length(forms)-1], forms[length(forms)])
        delete forms[length(forms)]
        delete forms[length(forms)]
    } else {
#        logg_dbg("read_list", "proper. reading forms " a " through " b ".")
        read_forms_into(forms, tokens, where, inside_string, a, b)
    }
    # cons the remaining forms onto head, right to left
    for(i=length(forms); i>=1; i--) {
        head = _cons(forms[i], head)
    }
    if(quote){
#        logg_dbg("read_list", "wrapping in " quote)
        head = _cons(_symbol(quote),
                     _cons(head, _nil()))
    }
    return head
}

# read_atom: convert the single token tokens[a] into a value: a
# number, true/false/nil, the dotted-pair marker ".", a string (with
# backslash escapes decoded), or a symbol.  if `quote` is set (a
# quoting symbol name from the reader), non-self-quoting results are
# wrapped as (<quote> <atom>); quoting a self-quoting atom or
# quasiquoting any atom is reported as an error.
function read_atom(a, tokens, quote,    this, ans, self_quoting, ch, i, n, out) {
    # numeric examples, separated by spaces: 3 3.14159 3e10 +5 -3.5e-26
    #
    # this is more restrictive than awk's idea of a double literal
    # (e.g. no 0x stuff)
    this = tokens[a]
    self_quoting = 1
    if(this ~ /^(\+|-)?([0-9]+\.)?[0-9]+([eE][+-]?[0-9]+)?$/) {
        ans = _number(this)
    } else if(tolower(this) == "true") {
        ans = _true()
    } else if(tolower(this) == "false") {
        ans = _false()
    } else if(tolower(this) == "nil") {
        ans = _nil()
    } else if(this == ".") {
        ans = "."
    } else if(substr(this, 1, 1) == "\"") {
        # string: strip the surrounding quotes, then decode
        # backslash escapes in a single left-to-right scan over the
        # characters.  note we are only supporting single-letter
        # escapes, not \012 style octal escapes.
        #
        # bug fix: the previous implementation ran repeated
        # match(/\\[abfnrtv"]/) passes and then a separate
        # gsub(/\\\\/) pass, so the input \\n (escaped backslash,
        # then n) was mangled -- the regex matched the *second*
        # backslash together with the n and turned it into a newline.
        # a single scan consumes each escape exactly once.
        this = substr(this, 2, length(this)-2)
        out = ""
        n = length(this)
        for(i=1; i<=n; i++) {
            ch = substr(this, i, 1)
            if(ch == "\\" && i < n && (substr(this, i+1, 1) in _string_unescapes)) {
                # known escape: emit its translation, skip both chars
                out = out _string_unescapes[substr(this, i+1, 1)]
                i++
            } else {
                # ordinary char, or an unknown escape kept verbatim
                out = out ch
            }
        }
        # careful. out is an unescaped string; don't log it -- that
        # could trigger command injection.
        ans = _string(out)
    } else {
        self_quoting = 0
        ans = _symbol(this)
    }
    if(quote) {
        if(self_quoting) {
            logg_err("read_atom",
                     "attempt to " quote " self-quoting atom " _repr(ans))
        } else if(quote == "quasiquote") {
            logg_err("read_atom", "attempt to " quote " an atom")
        } else {
            ans = _cons(_symbol(quote), _cons(ans, _nil()))
        }
    }
    return ans
}

# _incomplete_parse_at_end: called at end of input to report anything
# still pending in the parse state -- an unterminated string, and/or
# lists whose "(" was never matched by a ")".  `unclosed` is used as
# a stack of the token indices of the still-open "("s.
# (dropped the unused `nesting_level` local.)
function _incomplete_parse_at_end(tokens, where, inside_string,     i, l, unclosed) {
    delete unclosed
    l = length(tokens)
    if(l) {
        if(inside_string[1]) {
            logg_err("_incomplete_parse_at_end",
                     "still inside string begun at " where[l])
        }
        for(i=1; i<=l; i++) {
            if(tokens[i] == "(") {
                unclosed[length(unclosed)+1] = i
            } else if(tokens[i] == ")") {
                # guard the pop: a stray ")" with no open "(" would
                # otherwise try to delete the nonexistent unclosed[0]
                if(length(unclosed) > 0)
                    delete unclosed[length(unclosed)]
            }
        }
        # report innermost-first, like the original
        for(i=length(unclosed); i>=1; i--) {
            logg_err("_incomplete_parse_at_end",
                     "still inside list (" tokens[unclosed[i]+1] \
                     "... begun at " where[unclosed[i]])
        }
    }
}