# SPDX-License-Identifier: BSD-2-Clause

BEGIN {
    _string_unescapes["a"] = "\a"
    _string_unescapes["b"] = "\b"
    _string_unescapes["f"] = "\f"
    _string_unescapes["n"] = "\n"
    _string_unescapes["r"] = "\r"
    _string_unescapes["t"] = "\t"
    _string_unescapes["v"] = "\v"
    _string_unescapes["\\"] = "\\"
    _string_unescapes["\""] = "\""
}

function _forget_parse_state(tokens, where, inside_string) {
    delete tokens
    delete where
    inside_string[1] = 0
}

function _push_token(t, tokens, where,    n) {
    # logg_dbg("_push_token", t)
    n = length(tokens)+1
    tokens[n] = t
    where[n] = FILENAME ":" FNR
}

function tokenize(input, tokens, where, inside_string,    chars_done, chars_left, top) {
    chars_done = 0
    chars_left = length(input)
    while(chars_left > 0) {
        # logg_dbg("tokz", "input is [" input "]; inside string? " inside_string[1])
        top = length(tokens)
        # warning: don't match() regexes on anything but input here.
        # we use RSTART and RLENGTH below, which match() sets. all
        # patterns must start with ^, so RSTART will always be 1.
        if(inside_string[1]) {
            if(match(input, /^"/)) {
                # immediate string end
                tokens[top] = tokens[top] substr(input, 1, RLENGTH)
                inside_string[1] = 0
            } else if(match(input, /^(\\[^"]|[^\\"])*[^\\]"/)) { # stuff"
                # logg_dbg("tokz", "leaving string")
                tokens[top] = tokens[top] substr(input, 1, RLENGTH)
                inside_string[1] = 0
            } else {
                # no end quote in input. stay inside
                # newline inside string escaped?
                if(match(input, /^(\\[^\n]|[^\\])*\\$/)) {
                    # keep the backslash off
                    tokens[top] = tokens[top] substr(input, 1, RLENGTH-1)
                } else {
                    tokens[top] = tokens[top] input "\n"
                    # everything's OK, but we have not actually
                    # matched any regexps. set RSTART and RLENGTH to
                    # say we matched the whole line.
                    RSTART = 1
                    RLENGTH = length(input)+1
                }
            }
        } else {
            if(match(input, /^[ ]+/)) ;
            else if(match(input, /^~/))
                _push_token(substr(input, 1, RLENGTH), tokens, where)
            else if(match(input, /^,@/)) # unquote-splicing
                _push_token(substr(input, 1, RLENGTH), tokens, where)
            else if(match(input, /^[][()'`,!^@]/)) # special single char
                _push_token(substr(input, 1, RLENGTH), tokens, where)
            else if(match(input, /^"(\\[^\n]|[^\\"])*"/)) { # double-quoted string
                # logg_dbg("tokz", "complete string")
                _push_token(substr(input, 1, RLENGTH), tokens, where)
            } else if(match(input, /^"(\\([^"]|$)|[^\\"])*/)) { # dq string no end
                # logg_dbg("tokz", "incomplete string")
                if(substr(input, RLENGTH, 1) == "\\")
                    _push_token(substr(input, 1, RLENGTH-1), tokens, where)
                else
                    _push_token(substr(input, 1, RLENGTH) "\n", tokens, where)
                inside_string[1] = 1
                # logg_dbg("tokz", "inside_string[1]")
            } else if(match(input, /^;[^\n]*$/)) { # comment
                # don't push any tokens at all: it's difficult to pick
                # out comment tokens when reading, but easy here
                #
                # _push_token(substr(input, 1, RLENGTH), tokens, where)
            } else if(match(input, /^[^][ ('"`,;)@]+/)) # non-special chars
                _push_token(substr(input, 1, RLENGTH), tokens, where)
            else {
                if(!FILENAME || FILENAME == "-") {
                    logg_err("tokz", "at char " chars_done ": " \
                             "unrecognized input: " input)
                    chars_left = 0
                    _forget_parse_state(tokens, where, inside_string)
                } else {
                    logg_err("tokz", " " FILENAME ":" FNR ": " \
                             "at char " chars_done ": " \
                             "unrecognized input: " input)
                    exit(1)
                }
            }
        }
        # logg_dbg_array("tokz", "tokens", tokens)
        if(RSTART != 0) {
            # all patterns are anchored to ^ so RSTART is always 1
            input = substr(input, 1+RLENGTH)
            chars_left -= RLENGTH
            chars_done += RLENGTH
        } else {
            if(!FILENAME || FILENAME == "-") {
                logg_err("tokz", "at char " chars_done ": " \
                         "token not matched: " input)
                chars_left = 0
                _forget_parse_state(tokens, where, inside_string)
            } else {
                logg_err("tokz", " " FILENAME ":" FNR ": " \
                         "at char " chars_done ": " \
                         "token not matched: " input)
                exit(1)
            }
        }
    }
}
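
# a worked example of what tokenize() produces, plus a rough sketch of
# how a caller might drive it one input line at a time. the main rule
# below is only an illustration of the intended calling convention; it
# is not part of this file, and the _TOKENS/_WHERE/_INSIDE_STRING
# names are simply the ones eval_read_str() below happens to use:
#
#     { tokenize($0, _TOKENS, _WHERE, _INSIDE_STRING) }
#
# given the input line
#
#     (+ 1 "a b") ; add
#
# the tokens pushed are, in order,
#
#     (   +   1   "a b"   )
#
# the string token keeps its surrounding double quotes (read_atom()
# strips and unescapes them later), and the trailing comment is not
# pushed at all.
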
# before, with the mal-step-2-like reader, we knew the ending
# nesting_level would be 0. there would always be one form to
# evaluate. the question was whether it was an atom or a list; if a
# list, recursion would take us to the end of the form (and no
# farther: multiple forms on the same line would be ignored). if the
# nesting level at the end were not zero, it would be an error, on
# that line.
#
# now, with multiline forms, the form and the line are not constrained
# to the same extent.
#
# - there may not be a complete form to read yet. we want to avoid
#   starting to read anything until we know we have enough tokens to
#   finish reading it.
#
# - there could be multiple forms on one line, and so we may end up
#   reading more than one.
#
# - there could be a fractional form at the end. we want to consume
#   the full form(s) and leave the half forms.
#
# and that means that when we read a list, we need its starting _and
# ending_ indices.
function read_forms_into(rvs, tokens, where, inside_string, a, b,    i, t, quote, topp, list_found, nesting_level) {
    # this is just to tell awk it is an array. it should never have
    # anything at [0] because awk is 1-based.
    delete rvs[0]
    if(!a) {
        a=1
        b=length(tokens)
        # don't try to read an incomplete string
        if(inside_string[1]) b--
        # logg_dbg_array("read_forms_into", "tokens", tokens)
        topp=1 # we are a toplevel read_forms_into
    }
    # logg_dbg("read_forms_into", "a " a " b " b " ingoing length(rvs) " length(rvs))
    while(a <= b) { # we may break out, too, below
        t = tokens[a]
        # logg_dbg("read_forms_into", "reading " a " = " t)
        # logg_dbg("read_forms_into", "token " t " quote? " quote)
        if(t == "'") { quote = "quote"; a++; continue }
        else if(t == "`") { quote = "quasiquote"; a++; continue }
        else if(t == ",") { quote = "unquote"; a++; continue }
        else if(t == ",@") { quote = "unquote-splicing"; a++; continue }
        if(t == "(") {
            list_found = 0
            nesting_level = 1
            for(i=a+1; i<=b; i++) {
                if(tokens[i] == "(") nesting_level++
                else if(tokens[i] == ")") nesting_level--
                if(nesting_level==0) {
                    # the list begun at a has ended at i
                    list_found = 1
                    break
                }
            }
            if(list_found) {
                # logg_dbg("read_forms_into", "list from " a " to " i)
                rvs[length(rvs)+1] = read_list(tokens, a+1, i-1, quote, where, inside_string)
                quote = 0
                # logg_dbg("read_forms_into", "done from " a " to " i)
                a = i+1
            } else {
                # logg_dbg("read_forms_into", "incomplete list, not reading")
                # the list must not be completely input yet. we can't
                # read it.
                return
            }
        } else if(t == ")") {
            # we already dealt with positive nesting levels; this is
            # negative
            logg_err("read_forms_into", "too many )'s at " where[length(where)])
            _forget_parse_state(tokens, where, inside_string)
            return
        } else {
            # logg_dbg("read_forms_into", "atom at " a " token is " t " quote? " quote)
            rvs[length(rvs)+1] = read_atom(a, tokens, quote)
            quote = 0
            a++
        }
    }
    # logg_dbg("read_forms_into", "a " a " b " b " lrvs " length(rvs))
    if(topp) {
        # we have read all the tokens. but! maybe we are inside_string[1]
        if(!inside_string[1]) {
            # ok. we can forget everything.
            _forget_parse_state(tokens, where, inside_string)
        }
    }
}
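
# a small illustration of the cases described above (symbol names are
# arbitrary; the actual rvs entries are whatever _cons/_symbol/_number
# build):
#
# - after tokenizing the line "(def x", the tokens are (, def, x with
#   no matching ), so the toplevel read_forms_into() hits the
#   incomplete-list case and returns without reading; the token arrays
#   are kept for the next line.
#
# - after the next line "1) (print x)" is tokenized, the same call
#   sees nine tokens, reads two complete forms, (def x 1) into rvs[1]
#   and (print x) into rvs[2], and then, being toplevel and not inside
#   a string, forgets the parse state.
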
# support older api, where we put one string in, and get one eval'd
# value out. there can be multiple expressions so we'll return the
# value of the last one. really this is a bit like _evprog but with an
# awk array. anyway this is the api lib-eval.awk uses.
function eval_read_str(s,    l, i, rv) {
    tokenize(s, _TOKENS, _WHERE, _INSIDE_STRING)
    read_forms_into(_TO_EVAL, _TOKENS, _WHERE, _INSIDE_STRING)
    l = length(_TO_EVAL)
    for(i=1; i<=l; i++)
        rv = _eval(_TO_EVAL[i])
    delete _TO_EVAL
    return rv
}

function read_list(tokens, a, b, quote, where, inside_string,    i, forms, prevtail, head) {
    head = _nil()
    delete forms
    ## orientation: tokens[a-1] == "(" and tokens[b+1] == ")"
    if(((b-a+1) >= 3) && tokens[b-1] == ".") {
        # the end of the list is dotted
        # logg_dbg("read_list", "dotted. reading forms " b-2 " and " b ".")
        read_forms_into(forms, tokens, where, inside_string, a, b-2)
        read_forms_into(forms, tokens, where, inside_string, b, b)
        head = _cons(forms[length(forms)-1], forms[length(forms)])
        delete forms[length(forms)]
        delete forms[length(forms)]
    } else {
        # logg_dbg("read_list", "proper. reading forms " a " through " b ".")
        read_forms_into(forms, tokens, where, inside_string, a, b)
    }
    for(i=length(forms); i>=1; i--) {
        head = _cons(forms[i], head)
    }
    if(quote) {
        # logg_dbg("read_list", "wrapping in " quote)
        head = _cons(_symbol(quote), _cons(head, _nil()))
    }
    return head
}

function read_atom(a, tokens, quote,    this, ans, self_quoting, ch) {
    # examples, separated by spaces: 3 3.14159 3e10 +5 -3.5e-26
    #
    # this is more restrictive than awk's idea of a double literal
    # (e.g. no 0x stuff)
    this = tokens[a]
    # logg_dbg("read_atom", "token is " this " quote? " quote)
    self_quoting = 1
    if(this ~ /^(\+|-)?([0-9]+\.)?[0-9]+([eE][+-]?[0-9]+)?$/) {
        ans = _number(this)
    } else if(tolower(this) == "true") {
        ans = _true()
    } else if(tolower(this) == "false") {
        ans = _false()
    } else if(tolower(this) == "nil") {
        ans = _nil()
    } else if(this == ".") {
        ans = "."
    } else {
        if(substr(this, 1, 1) == "\"") {
            # logg_dbg("read_atom", "string")
            # strip quotes
            ans = substr(this, 2, length(this)-2)
            # effectuate backslash-escapes. note we are only
            # supporting single-letter escapes, not \012 style octal
            # escapes.
            #
            # logg_dbg("read_atom", "string before backslash escapes: " ans)
            while(match(ans, /\\[abfnrtv"]/)) {
                ch = _string_unescapes[substr(ans, RSTART+1, 1)]
                ans = substr(ans, 1, RSTART-1) ch substr(ans, RSTART+RLENGTH)
            }
            # only unescape backslashes \\ once.
            gsub(/\\\\/, "\\", ans)
            # careful. we are logging an unescaped string; this could trigger
            # command injection. if the earlier one didn't.
            # logg_dbg("read_atom", "string after backslash escapes: " ans)
            # ok.
            ans = _string(ans)
        } else {
            self_quoting = 0
            ans = _symbol(this)
        }
    }
    if(quote) {
        if(self_quoting) {
            logg_err("read_atom", "attempt to " quote " self-quoting atom " _repr(ans))
        } else if(quote == "quasiquote") {
            logg_err("read_atom", "attempt to " quote " an atom")
        } else {
            ans = _cons(_symbol(quote), _cons(ans, _nil()))
        }
    }
    return ans
}

function _incomplete_parse_at_end(tokens, where, inside_string,    i, l, nesting_level, unclosed) {
    delete unclosed
    l = length(tokens)
    if(l) {
        if(inside_string[1]) {
            logg_err("_incomplete_parse_at_end", "still inside string begun at " where[l])
        }
        for(i=1; i<=l; i++) {
            if(tokens[i] == "(") {
                unclosed[length(unclosed)+1] = i
            } else if(tokens[i] == ")") {
                delete unclosed[length(unclosed)]
            }
        }
        if(length(unclosed) > 0) {
            for(i=length(unclosed); i>=1; i--) {
                logg_err("_incomplete_parse_at_end",
                         "still inside list (" tokens[unclosed[i]+1] \
                         "... begun at " where[unclosed[i]])
            }
        }
    }
}
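
# a worked example for the dotted-list branch of read_list() above,
# writing the number cells as plain 1, 2, 3 for brevity. for the input
# (1 2 . 3), the tokens between the parentheses are 1, 2, ., 3, so
# tokens[b-1] == "." and:
#
# - read_forms_into(forms, ..., a, b-2) reads 1 and 2 into forms[1..2]
# - read_forms_into(forms, ..., b, b)   reads 3 into forms[3]
# - head starts as _cons(2, 3), the last two forms entries are
#   deleted, and the remaining loop conses 1 on top, giving
#   _cons(1, _cons(2, 3)), i.e. the improper list (1 2 . 3).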