2CXQ53RHKGIT5KR7VHOVRVHCD5MK4V2J3AJDQ3CNSJLHJXXA4GXQC
YU2N2HEPNDEB65LGBWESEHNZNCZ63H3MGT4R7VLX4KMZHFYC4SIAC
L7YV2TJYOBRNUT7FPSY352O3BPZD7CNO3XG7PY433L54XYBY5AZQC
UW27LKXM2BJ77FQLTY4WPKDSSWI2RFNFRJ7CB4U3TS7KYVIV72LQC
KMV35KHLCXIV5YQUDRSTS6PIMLNIL2KTJZS77XXMPA6IX37NHZ2QC
A56GZW6HLIZQ6NX47XFPEPPGLZAA725KNYST7VK2MMK3DCCIXAXAC
PXI442CY2KQHHAIJ3UNCWKTAI4IFYNGYEBRQMDR6T53YZTY2VMMQC
NBEO3TPNOUG7MRFYRSDDFB5TQKTEDL6GHHFQVOB5MXVPIBKFNZKAC
O6PFGAUDYCMK6SC6V5RB5ELXZ7W54OB7XPYCMECCA4BSBUVLFAPAC
GW4AAYNF7I66D72G5PMFTQRK7B4KZVYKAHKRPC2IY7IX37JKEHJQC
JDZASPALXSFZOL3MXCKBPX74CUD3W743ZJ6W2422FIJ7NJOD67ZAC
A2JAXDJWT2FAKADYOY6QOQ7LQRMTTCDIOYT7STSESVHLZQEQJBMAC
5XO7IKBGCVXGVWMDJDE5MELS4FWRITKAU6NNV36NQ4TOZRR7UQ7QC
MPN7OJSZD5CS5N7WWS3ZSOYE7ZRCABIBHZDMHVS6IT25EO2INK7AC
function rep(s, form, result) {
form = read_str(s)
result = _eval(form)
_print(result)
if(N > (N_at_last_gc + GC_EVERY)) {
# for debugging garbage collector:
# logg_dbg("rep", "auto-gc at N " N)
# _dump_dot("pre-gc-" N ".dot")
# _gc_dot(_cons(_GLOBALS,
# _cons(_MACROS,
# _cons(_COMPARED_SYMBOLS,
# _nil()))),
# "auto-gc-" N "-marks.dot",
# "auto-gc-" N "-sweeps.dot")
# _dump_dot("post-gc-" N ".dot")
if(PROMPT == 0) PROMPT = "user> "
_prompt()
if(PROMPT == 0) PROMPT = "* "
_forget_parse_state()
# it appears that during BEGIN, if we are operating on a file
# specified on the command line, FILENAME is not set yet. So we
# have to find out whether any filename was specified, instead.
if(length(ARGV) < 2) _prompt()
{ rep($0); _prompt() }
{
    # Main per-line rule. Parser state (tokens, string flag) is kept in
    # globals across lines so a single form may span several lines of
    # input, instead of each line having to be one complete form.
    tokenize($0)
    read_forms_into(_TO_EVAL)
    if(FILENAME && FILENAME != "-") _just_eval_all()
    else _print_eval_all()
    # Only collect garbage once every complete expression seen so far
    # has been read AND eval'd; otherwise we could GC data that was
    # read from input but not yet evaluated.
    if(length(_TO_EVAL) == 0 && length(_TOKENS) == 0) {
        _maybe_gc()
        _prompt()
    }
}
END {
    # at end of all input, report any dangling parser state (an
    # unterminated string, unclosed lists) accumulated in the globals.
    _incomplete_parse_at_end()
}
function tokenize_into(input, ta, chars_left, tal, mp) {
function _forget_parse_state() {
    # Discard all accumulated tokenizer state: the "currently inside a
    # string" flag, the pending tokens, and their recorded source
    # locations.
    _INSIDE_STRING = 0
    delete _WHERE
    delete _TOKENS
}
function _push_token(t, n) {
    # Append token t to the global token list _TOKENS, and record in
    # the parallel _WHERE array the source position (FILENAME:FNR) the
    # token came from, for use in later error messages.
    n = length(_TOKENS) + 1
    _WHERE[n] = FILENAME ":" FNR
    _TOKENS[n] = t
}
function tokenize(input, chars_done, chars_left, top) {
if(match(input, /^[ ]+/)) {
} else if(match(input, /^~/)) {
ta[++tal] = substr(input, 1, RLENGTH)
} else if(match(input, /^,@/)) { # unquote-splicing
ta[++tal] = substr(input, 1, RLENGTH)
} else if(match(input, /^[\[\]{}()'`,!^@]/)) { # special single char
ta[++tal] = substr(input, 1, RLENGTH)
} else if(match(input, /^"(\\.|[^\\"])*"?/)) { # double-quoted string
ta[++tal] = substr(input, 1, RLENGTH)
} else if(match(input, /^;.*/)) { # comment
ta[++tal] = substr(input, 1, RLENGTH)
} else if(match(input, /^[^ \[\]{}('"`,;)@]+/)) { # non-special chars
ta[++tal] = substr(input, 1, RLENGTH)
# logg_dbg("tokz", "input is [" input "]; inside string? " _INSIDE_STRING)
top = length(_TOKENS)
# warning: don't match() regexes on anything but input here.
# we use RSTART and RLENGTH below, which match() sets. all
# patterns must start with ^, so RSTART will always be 1.
if(_INSIDE_STRING) {
if(match(input, /^"/)) { # immediate string end
_TOKENS[top] = _TOKENS[top] substr(input, 1, RLENGTH)
_INSIDE_STRING = 0
} else if(match(input, /^(\\[^"]|[^\\"])*[^\\]"/)) { # stuff"
# logg_dbg("tokz", "leaving string")
_TOKENS[top] = _TOKENS[top] substr(input, 1, RLENGTH)
_INSIDE_STRING = 0
} else { # no end quote in input. stay inside
# newline inside string escaped?
if(match(input, /^(\\.|[^\\])*\\$/)) {
# keep the backslash off
_TOKENS[top] = _TOKENS[top] substr(input, 1, RLENGTH-1)
} else {
_TOKENS[top] = _TOKENS[top] input "\n"
# everything's OK, but we have not actually
# matched any regexps. set RSTART and RLENGTH to
# say we matched the whole line.
RSTART = 1
RLENGTH = length(input)+1
}
}
logg_err("tokz", "unrecognized input at char " \
chars_done ": " input)
exit 1
if(match(input, /^[ ]+/))
;
else if(match(input, /^~/))
_push_token(substr(input, 1, RLENGTH))
else if(match(input, /^,@/)) # unquote-splicing
_push_token(substr(input, 1, RLENGTH))
else if(match(input, /^[\[\]{}()'`,!^@]/)) # special single char
_push_token(substr(input, 1, RLENGTH))
else if(match(input, /^"(\\.|[^\\"])*"/)) { # double-quoted string
# logg_dbg("tokz", "complete string")
_push_token(substr(input, 1, RLENGTH))
} else if(match(input, /^"(\\([^"]|$)|[^\\"])*/)) { # dq string no end
# logg_dbg("tokz", "incomplete string")
if(substr(input, RLENGTH, 1) == "\\")
_push_token(substr(input, 1, RLENGTH-1))
else
_push_token(substr(input, 1, RLENGTH) "\n")
_INSIDE_STRING = 1
# logg_dbg("tokz", "_INSIDE_STRING")
} else if(match(input, /^;.*/)) # comment
_push_token(substr(input, 1, RLENGTH))
else if(match(input, /^[^ \[\]{}('"`,;)@]+/)) # non-special chars
_push_token(substr(input, 1, RLENGTH))
else {
if(!FILENAME || FILENAME == "-") {
logg_err("tokz", "at char " chars_done ": " \
"unrecognized input: " input)
chars_left = 0
_forget_parse_state()
} else {
logg_err("tokz",
" " FILENAME ":" FNR ": " \
"at char " chars_done ": " \
"unrecognized input: " input)
exit(1)
}
}
logg_err("tokz", "at char " chars_done ", token not matched: " input)
exit 1
if(!FILENAME || FILENAME == "-") {
logg_err("tokz", "at char " chars_done ": "\
"token not matched: " input)
chars_left = 0
_forget_parse_state()
} else {
logg_err("tokz", " " FILENAME ":" FNR ": " \
"at char " chars_done ": " \
"token not matched: " input)
exit(1)
}
function read_str(s, i, ta, tal) {
    # Tokenize the single string s and return the first form read from
    # it. i, ta and tal are scratch locals: ta receives the tokens and
    # i is a one-element array used as a by-reference cursor into ta.
    i[1] = 1
    delete ta[1]   # force ta to be an array up front (for gawk -c)
    tokenize_into(s, ta)
    tal = length(ta)
    return read_form(i, ta, tal, 0)
}
function read_form(i,ta,tal, quote) {
if(ta[i[1]] == "'") {
quote = "quote"
i[1]++
} else if(ta[i[1]] == "`") {
quote = "quasiquote"
i[1]++
} else if(ta[i[1]] == ",") {
quote = "unquote"
i[1]++
} else if(ta[i[1]] == ",@") {
quote = "unquote-splicing"
i[1]++
} else {
quote = 0
# before, with the mal-step-2-like reader, we knew the ending
# nesting_level would be 0. there would always be one form to
# evaluate. the question was whether it was an atom or a list; if a
# list, recursion would take us to the end of the form (and no
# farther: multiple forms on the same line would be ignored). if the
# nesting level at the end were not zero, it would be an error, on
# that line.
#
# now, with multiline forms, the form and the line are not constrained
# to the same extent.
#
# - there may not be a complete form to read yet. we want to avoid
# starting to read anything until we know we have enough tokens to
# finish reading it.
#
# - there could be multiple forms on one
# line, and so we may end up reading more than one.
#
# - there could be a fractional form at the end. we want to consume
# the full form(s) and leave the half forms.
#
# and that means that when we read a list, we need its starting _and
# ending_ indices.
function read_forms_into(rvs, a, b, i, t, quote, topp, list_found, nesting_level) {
delete rvs[1]
if(!a) {
a=1
b=length(_TOKENS)
# don't try to read an incomplete string
if(_INSIDE_STRING) b--
topp=1 # we are the toplevel read_forms_into
if(match(ta[i[1]], /^\(/)) {
# logg_dbg("read_form", "( at token " i[1])
i[1] += 1
return read_list(i,ta,tal,quote)
} else {
return read_atom(i,ta,tal,quote)
logg_dbg("read_forms_into", "a " a " b " b " ingoing length(rvs) " length(rvs))
while(a <= b) { # we may break out, too, below
t = _TOKENS[a]
# logg_dbg("read_forms_into", "reading " a " = " t)
logg_dbg("read_forms_into", "token " t " quote? " quote)
if(t == "'") {
quote = "quote"; a++; continue
} else if(t == "`") {
quote = "quasiquote"; a++; continue
} else if(t == ",") {
quote = "unquote"; a++; continue
} else if(t == ",@") {
quote = "unquote-splicing"; a++; continue
}
if(t == "(") {
list_found = 0
nesting_level = 1
for(i=a+1; i<=b; i++) {
if(_TOKENS[i] == "(") nesting_level++
else if(_TOKENS[i] == ")") nesting_level--
if(nesting_level==0) {
# the list begun at a has ended at i
list_found = 1
break
}
}
if(list_found) {
# logg_dbg("read_forms_into", "list from " a " to " i)
rvs[length(rvs)+1] = read_list(a+1, i-1, quote)
quote = 0
# logg_dbg("read_forms_into", "done from " a " to " i)
a = i+1
} else {
# logg_dbg("read_forms_into", "incomplete list, not reading")
# the list must not be completely input yet. we can't
# read it.
return
}
} else if(t == ")") {
# we already dealt with positive nesting levels; this is
# negative
logg_err("read_forms_into", "too many )'s at " _WHERE[length(_WHERE)])
_forget_parse_state()
return
} else {
logg_dbg("read_forms_into", "atom at " a " token is " t " quote? " quote)
rvs[length(rvs)+1] = read_atom(a, quote)
quote = 0
a++
}
}
# logg_dbg("read_forms_into", "a " a " b " b " lrvs " length(rvs))
if(topp) {
# we have read all the _TOKENS. but! maybe we are _INSIDE_STRING
if(!_INSIDE_STRING) {
# ok. we can forget everything.
_forget_parse_state()
}
function read_list(i, ta, tal, quote, prevtail, head) {
# support older api, where we put one string in, and get one eval'd
# value out. there can be multiple expressions so we'll return the
# value of the last one. really this is a bit like _evprog but with an
# awk array. anyway this is the api lib-eval.awk uses.
# support older api, where we put one string in, and get one eval'd
# value out. there can be multiple expressions so we'll return the
# value of the last one. really this is a bit like _evprog but with an
# awk array. anyway this is the api lib-eval.awk uses.
function eval_read_str(s, l, i, rv) {
    tokenize(s)
    read_forms_into(_TO_EVAL)
    l = length(_TO_EVAL)
    for(i=1; i<=l; i++)
        rv = _eval(_TO_EVAL[i])
    delete _TO_EVAL
    # the stated contract is to return the last eval'd value; without
    # this return the caller always received the empty string.
    return rv
}
function read_list(a, b, quote, i, forms, prevtail, head) {
for(; (i[1]<=tal) && (ta[i[1]] !~ /^[).]/); i[1]++) {
# logg_dbg("rd_l", "in loop, i is " i[1] "; token is " ta[i[1]])
head = _cons(read_form(i, ta, tal), head)
}
# logg_dbg("read_list", "after loop, i[1] is " i[1] "; token is " ta[i[1]] "; head is " head)
prevtail = head
head = _nreverse(head)
if(ta[i[1]] == ".") {
i[1] += 1
_set_cdr(prevtail, read_form(i, ta, tal))
i[1] += 1
} else if(ta[i[1]] ~ /^\)/) { # properly terminated
# logg_dbg("read_list", "after _nreverse, head is " head)
## orientation: _TOKENS[a-1] == "(" and _TOKENS[b+1] == ")"
if(((b-a+1) >= 3) && _TOKENS[b-1] == ".") {
# the end of the list is dotted
# logg_dbg("read_list", "dotted. reading forms " b-2 " and " b ".")
read_forms_into(forms, a, b-2)
read_forms_into(forms, b, b)
head = _cons(forms[length(forms)-1], forms[length(forms)])
delete forms[length(forms)]
delete forms[length(forms)]
delete forms[length(forms)]
function _smoke_test_reader() {
x = read_str("(foo \"bar\" baz 3.14159 (sublist 1 2 3))", ta)
logg_inf("_smoke_test_reader", "final result: " x ", being " _repr(x))
function _incomplete_parse_at_end( i, l, nesting_level, unclosed) {
    # Called at END: report any parser state left dangling when input
    # ran out -- an unterminated string and/or unclosed lists -- along
    # with the recorded source location where each one began.
    delete unclosed
    l = length(_TOKENS)
    if(!l) return
    if(_INSIDE_STRING)
        logg_err("_incomplete_parse_at_end",
                 "still inside string begun at " _WHERE[l])
    # Walk the tokens keeping a stack (in unclosed) of the indices of
    # "(" tokens not yet matched by a ")".
    for(i=1; i<=l; i++) {
        if(_TOKENS[i] == "(")
            unclosed[length(unclosed)+1] = i
        else if(_TOKENS[i] == ")")
            delete unclosed[length(unclosed)]
    }
    # Report the still-open lists innermost-first. Each message shows
    # the token right after the "(" as a hint to which list it is.
    for(i=length(unclosed); i>=1; i--)
        logg_err("_incomplete_parse_at_end",
                 "still inside list (" _TOKENS[unclosed[i]+1] \
                 "... begun at " _WHERE[unclosed[i]])
}
function _maybe_gc() {
if(N > (N_at_last_gc + GC_EVERY)) {
# for debugging garbage collector:
# logg_dbg("rep", "auto-gc at N " N)
# _dump_dot("pre-gc-" N ".dot")
# _gc_dot(_cons(_GLOBALS,
# _cons(_MACROS,
# _cons(_COMPARED_SYMBOLS,
# _nil()))),
# "auto-gc-" N "-marks.dot",
# "auto-gc-" N "-sweeps.dot")
# _dump_dot("post-gc-" N ".dot")
_gc(_cons(_GLOBALS,
_cons(_MACROS,
_cons(_COMPARED_SYMBOLS,
_nil()))))
N_at_last_gc = N
}