Oak

syntax.oak

← Standard library See on GitHub ↗

// libsyntax implements a tokenizer, parser, and code formatter for Oak

{
	default: default
	fromHex: fromHex
	range: range
	slice: slice
	append: append
	contains?: contains?
	map: map
	each: each
	last: last
	take: take
	first: first
	filter: filter
	reduce: reduce
} := import('std')
{
	digit?: digit?
	word?: word?
	space?: space?
	cut: cut
	contains?: strContains?
	join: join
	replace: replace
	startsWith?: startsWith?
	trimStart: trimStart
	trimEnd: trimEnd
	trim: trim
} := import('str')
{
	min: min
	max: max
} := import('math')
{
	format: format
	printf: printf
} := import('fmt')

fn shebang?(text) text |> startsWith?('#!')

fn renderPos(pos) '[' + string(pos.1) + ':' + string(pos.2) + ']'

fn renderToken(token) if token.val {
	? -> format('{{ 0 }} {{ 1 }}', string(token.type), renderPos(token.pos))
	_ -> format('{{ 0 }}({{ 1 }}) {{ 2 }}', string(token.type), token.val, renderPos(token.pos))
}

// Tokenizer is a full-fidelity, lossless tokenizer for Oak. It produces a
// stream of valid Oak token types, plus any shebang, newlines, and comments it
// finds. To produce an AST, those non-standard non-AST tokens should be
// filtered out of the list first.
//
// Methods:
//
// fn tokenize()    returns a list of tokens
fn Tokenizer(source) {
	index := 0
	line := 1
	col := 1

	fn TokenAt(type, pos, val) {
		type: type
		val: val
		pos: pos
	}

	fn Token(type, val) TokenAt(type, [index, line, col], val)

	fn eof? index = len(source)
	fn peek source.(index)
	fn peekAhead(n) if index + n >= len(source) {
		true -> ' '
		_ -> source.(index + n)
	}
	fn next {
		char := source.(index)
		if index < len(source) -> index <- index + 1
		if char {
			'\n' -> {
				line <- line + 1
				col <- 1
			}
			_ -> col <- col + 1
		}
		char
	}
	fn back {
		if index > 0 -> index <- index - 1
		if source.(index) {
			// TODO: reset col correctly on backtrack
			'\n' -> line <- line - 1
			_ -> col <- col - 1
		}
	}

	fn readUntilChar(c) {
		fn sub(acc) if !eof?() & peek() != c {
			true -> sub(acc << next())
			_ -> acc
		}
		sub('')
	}
	fn readValidIdentifier {
		fn sub(acc) if eof?() {
			true -> acc
			_ -> {
				c := next()
				if word?(c) | c = '_' | c = '?' | c = '!' {
					true -> sub(acc << c)
					_ -> {
						back()
						acc
					}
				}
			}
		}
		sub('')
	}
	fn readValidNumeral {
		sawDot? := false
		fn sub(acc) if eof?() {
			true -> acc
			_ -> {
				c := next()
				if {
					digit?(c) -> sub(acc << c)
					c = '.' & !sawDot? -> {
						sawDot? <- true
						sub(acc << c)
					}
					_ -> {
						back()
						acc
					}
				}
			}
		}
		sub('')
	}
	fn nextToken {
		pos := [index, line, col]
		if c := next() {
			',' -> TokenAt(:comma, pos)
			'.' -> if peek() = '.' & peekAhead(1) = '.' {
				true -> {
					next()
					next()
					TokenAt(:ellipsis, pos)
				}
				_ -> TokenAt(:dot, pos)
			}
			'(' -> TokenAt(:leftParen, pos)
			')' -> TokenAt(:rightParen, pos)
			'[' -> TokenAt(:leftBracket, pos)
			']' -> TokenAt(:rightBracket, pos)
			'{' -> TokenAt(:leftBrace, pos)
			'}' -> TokenAt(:rightBrace, pos)
			':' -> if peek() {
				'=' -> {
					next()
					TokenAt(:assign, pos)
				}
				_ -> TokenAt(:colon, pos)
			}
			'<' -> if peek() {
				'<' -> {
					next()
					TokenAt(:pushArrow, pos)
				}
				'-' -> {
					next()
					TokenAt(:nonlocalAssign, pos)
				}
				'=' -> {
					next()
					TokenAt(:leq, pos)
				}
				_ -> TokenAt(:less, pos)
			}
			'?' -> TokenAt(:qmark, pos)
			'!' -> if peek() {
				'=' -> {
					next()
					TokenAt(:neq, pos)
				}
				_ -> TokenAt(:exclam, pos)
			}
			'+' -> TokenAt(:plus, pos)
			'-' -> if peek() {
				'>' -> {
					next()
					TokenAt(:branchArrow, pos)
				}
				_ -> TokenAt(:minus, pos)
			}
			'*' -> TokenAt(:times, pos)
			'/' -> if peek() {
				'/' -> {
					// line comment
					next()
					commentString := readUntilChar('\n') |> trimEnd()
					if commentString |> trim() = '' -> commentString <- ''
					TokenAt(:comment, pos, commentString)
				}
				_ -> TokenAt(:divide, pos)
			}
			'%' -> TokenAt(:modulus, pos)
			'^' -> TokenAt(:xor, pos)
			'&' -> TokenAt(:and, pos)
			'|' -> if peek() {
				'>' -> {
					next()
					TokenAt(:pipeArrow, pos)
				}
				_ -> TokenAt(:or, pos)
			}
			'>' -> if peek() {
				'=' -> {
					next()
					TokenAt(:geq, pos)
				}
				_ -> TokenAt(:greater, pos)
			}
			'=' -> TokenAt(:eq, pos)
			'\'' -> {
				fn sub(payload) if charInString := next() {
					?, '\'' -> payload
					'\\' -> if c := next() {
						? -> payload
						_ -> sub(payload << '\\' << c)
					}
					_ -> sub(payload << charInString)
				}
				TokenAt(:stringLiteral, pos, sub(''))
			}
			_ -> if {
				digit?(c) -> TokenAt(:numberLiteral, pos, c << readValidNumeral())
				_ -> if payload := c << readValidIdentifier() {
					'_' -> TokenAt(:underscore, pos)
					'if' -> TokenAt(:ifKeyword, pos)
					'fn' -> TokenAt(:fnKeyword, pos)
					'with' -> TokenAt(:withKeyword, pos)
					'true' -> TokenAt(:trueLiteral, pos)
					'false' -> TokenAt(:falseLiteral, pos)
					_ -> TokenAt(:identifier, pos, payload)
				}
			}
		}
	}
	fn tokenize {
		tokens := []

		// the tokenizer itself completely ignores shebang lines, as it is not
		// a language concern -- it is the responsibility of the caller to
		// process or ignore shebang lines as necessary
		if peek() = '#' & peekAhead(1) = '!' -> {
			readUntilChar('\n')
			if !eof?() -> next()
		}

		// snip whitespace before
		fn eatSpace if space?(sp := peek()) -> {
			if sp = '\n' -> tokens << Token(:newline)
			next()
			eatSpace()
		}
		eatSpace()

		lastTok := Token(:comma)
		fn sub {
			nextTok := nextToken()

			if !(
				[:leftParen, :leftBracket, :leftBrace, :comma] |> contains?(lastTok.type)
			) & [:rightParen, :rightBracket, :rightBrace] |> contains?(nextTok.type) -> tokens << TokenAt(:comma, nextTok.pos)

			tokens << nextTok
			if nextTok.type = :comment -> nextTok := lastTok

			// snip whitespace after
			fn eatSpaceAutoInsertComma if space?(peek()) -> {
				if peek() {
					'\n' -> {
						if nextTok.type {
							:comma, :leftParen, :leftBracket, :leftBrace
							:plus, :minus, :times, :divide, :modulus, :xor
							:and, :or, :exclam, :greater, :less, :eq, :geq
							:leq, :assign, :nonlocalAssign, :dot, :colon
							:fnKeyword, :ifKeyword, :withKeyword
							:pipeArrow, :branchArrow, :pushArrow -> ?
							_ -> {
								nextTok <- Token(:comma)
								tokens << nextTok
							}
						}
						tokens << Token(:newline)
					}
				}
				next()
				eatSpaceAutoInsertComma()
			}
			eatSpaceAutoInsertComma()

			if nextTok.type {
				:comment -> ?
				_ -> lastTok <- nextTok
			}

			if !eof?() -> sub()
		}

		// do not start tokenizing into empty file
		if !eof?() -> sub()

		if lastTok.type {
			:comma -> ?
			_ -> tokens << TokenAt(:comma, [
				len(source)
				line
				col
			])
		}

		tokens
	}

	{
		// undocumented API for libsyntax.print
		readUntilChar: readUntilChar
		tokenize: tokenize
	}
}

// tokenize takes Oak source text and returns a list of tokens
fn tokenize(text) Tokenizer(text).tokenize()

// Parser takes a raw token stream, potentially including newlines and
// comments, and generates a list of clean Oak AST nodes.
//
// Methods:
//
// fn parse()   returns a list of AST nodes
fn Parser(tokens) {
	index := 0
	minBinaryPrec := [0]

	// for parsing purposes, we must ignore non-semantic tokens
	tokens := tokens |> filter(fn(tok) if tok.type {
		:newline, :comment -> false
		_ -> true
	})

	fn error(msg, pos) {
		type: :error
		error: msg
		pos: pos
	}

	fn lastMinPrec minBinaryPrec.(len(minBinaryPrec) - 1)
	fn pushMinPrec(prec) minBinaryPrec << prec
	fn popMinPrec minBinaryPrec <- slice(minBinaryPrec, 0, len(minBinaryPrec) - 1)
	fn eof? index = len(tokens)
	fn peek tokens.(index)
	fn peekAhead(n) if index + n > len(tokens) {
		true -> { type: :comma }
		_ -> tokens.(index + n)
	}
	fn next {
		tok := tokens.(index)
		if index < len(tokens) -> index <- index + 1
		tok
	}
	fn back if index > 0 -> index <- index - 1
	fn lastTokenPos if lastTok := last(tokens) {
		? -> ?
		_ -> lastTok.pos
	}
	fn expect(type) if eof?() {
		true -> error(format('Unexpected end of input, expected {{0}}', type), lastTokenPos())
		_ -> {
			nextTok := next()
			if nextTok.type {
				type -> nextTok
				_ -> error(format('Unexpected token {{0}}, expected {{1}}', renderToken(nextTok), type), nextTok.pos)
			}
		}
	}
	fn readUntilTokenType(type) {
		tokens := []
		fn sub if !eof() & peek().type != type {
			true -> {
				tokens << next()
				sub()
			}
			_ -> tokens
		}
		sub()
	}

	fn notError(x, withNotErr) if x {
		{ type: :error, error: _, pos: _ } -> x
		_ -> withNotErr(x)
	}

	fn parseAssignment(left) if peek().type {
		:assign, :nonlocalAssign -> {
			nxt := next()
			node := {
				type: :assignment
				tok: nxt
				local?: nxt.type = :assign
				left: left
			}

			with notError(right := parseNode()) fn {
				node.right := right
				node
			}
		}
		_ -> left
	}

	// parseUnit is responsible for parsing the smallest complete syntactic
	// "units" of Oak's syntax, like literals including function literals,
	// grouped expressions in blocks, and if/with expressions.
	fn parseUnit if eof?() {
		true -> error('Unexpected end of input', lastTokenPos())
		_ -> {
			tok := next()
			if tok.type {
				:qmark -> { type: :null, tok: tok }
				:stringLiteral -> {
					type: :string
					tok: tok
					val: {
						verbatim := tok.val
						fn sub(parsed, i) if c := verbatim.(i) {
							? -> parsed
							'\\' -> if escapedChar := verbatim.(i + 1) {
								't' -> sub(parsed << '\t', i + 2)
								'n' -> sub(parsed << '\n', i + 2)
								'r' -> sub(parsed << '\r', i + 2)
								'f' -> sub(parsed << '\f', i + 2)
								'x' -> if c1 := verbatim.(i + 2) {
									? -> sub(parsed << escapedChar, i + 2)
									_ -> if c2 := verbatim.(i + 3) {
										? -> sub(parsed << escapedChar << c1, i + 3)
										_ -> if code := fromHex(c1 + c2) {
											? -> sub(parsed << escapedChar << c1 << c2, i + 4)
											_ -> sub(parsed << char(code), i + 4)
										}
									}
								}
								_ -> sub(parsed << escapedChar, i + 2)
							}
							_ -> sub(parsed << c, i + 1)
						}
						sub('', 0)
					}
				}
				:numberLiteral -> if tok.val |> strContains?('.') {
					true -> if parsed := float(tok.val) {
						? -> error(format('Could not parse floating point number {{0}}', tok.val), tok.pos)
						_ -> {
							type: :float
							tok: tok
							val: parsed
						}
					}
					_ -> if parsed := int(tok.val) {
						? -> error(format('Could not parse integer number {{0}}', tok.val), tok.pos)
						_ -> {
							type: :int
							tok: tok
							val: parsed
						}
					}
				}
				:trueLiteral -> {
					type: :bool
					tok: tok
					val: true
				}
				:falseLiteral -> {
					type: :bool
					tok: tok
					val: false
				}
				:colon -> if peek().type {
					:identifier -> {
						type: :atom
						tok: tok
						val: next().val
					}
					:ifKeyword -> {
						next()
						{ type: :atom, tok: tok, val: 'if' }
					}
					:fnKeyword -> {
						next()
						{ type: :atom, tok: tok, val: 'fn' }
					}
					:withKeyword -> {
						next()
						{ type: :atom, tok: tok, val: 'with' }
					}
					:trueLiteral -> {
						next()
						{ type: :atom, tok: tok, val: 'true' }
					}
					:falseLiteral -> {
						next()
						{ type: :atom, tok: tok, val: 'false' }
					}
					_ -> error(format('Expected identifier after ":", got {{0}}', renderToken(peek())), peek().pos)
				}
				:leftBracket -> {
					pushMinPrec(0)

					itemNodes := []
					fn sub if eof?() {
						true -> error('Unexpected end of input inside list', lastTokenPos())
						_ -> if peek().type {
							:rightBracket -> ?
							_ -> with notError(node := parseNode()) fn {
								with notError(err := expect(:comma)) fn {
									itemNodes << node
									sub()
								}
							}
						}
					}
					with notError(sub()) fn {
						with notError(err := expect(:rightBracket)) fn {
							popMinPrec()

							{
								type: :list
								tok: tok
								elems: itemNodes
							}
						}
					}
				}
				:leftBrace -> {
					pushMinPrec(0)

					// empty {} is always considered an object -- an empty block is illegal
					if peek().type {
						:rightBrace -> {
							next() // eat the rightBrace
							popMinPrec()

							{
								type: :object
								tok: tok
								entries: []
							}
						}
						_ -> with notError(firstExpr := parseNode()) fn if eof?() {
							true -> error('Unexpected end of input inside block or object', lastTokenPos())
							_ -> if peek().type {
								:colon -> {
									// it's an object
									next() // eat the colon
									with notError(valExpr := parseNode()) fn {
										with notError(expect(:comma)) fn {
											entries := [{ key: firstExpr, val: valExpr }]

											fn sub if !eof?() -> if peek().type {
												:rightBrace -> ?
												_ -> with notError(key := parseNode()) fn {
													with notError(expect(:colon)) fn {
														with notError(val := parseNode()) fn {
															with notError(expect(:comma)) fn {
																entries << { key: key, val: val }
																sub()
															}
														}
													}
												}
											}
											with notError(sub()) fn {
												with notError(expect(:rightBrace)) fn {
													popMinPrec()

													{
														type: :object
														tok: tok
														entries: entries
													}
												}
											}
										}
									}
								}
								_ -> with notError(expect(:comma)) fn {
									// it's a block
									exprs := [firstExpr]

									fn sub if eof?() {
										true -> error('Unexpected end of input inside block or object', lastTokenPos())
										_ -> if peek().type {
											:rightBrace -> ?
											_ -> with notError(expr := parseNode()) fn {
												with notError(expect(:comma)) fn {
													exprs << expr
													sub()
												}
											}
										}
									}
									with notError(sub()) fn {
										with notError(expect(:rightBrace)) fn {
											popMinPrec()

											{
												type: :block
												tok: tok
												exprs: exprs
											}
										}
									}
								}
							}
						}
					}
				}
				:fnKeyword -> {
					pushMinPrec(0)

					name := if peek().type {
						// optional named fn
						:identifier -> next().val
						_ -> ''
					}

					args := []
					restArg := ''

					fn parseBody with notError(body := parseNode()) fn {
						// Exception to the "{} is empty object" rule is that `fn
						// {}` parses as a function with an empty block as a body.
						if body {
							{ type: :object, tok: _, entries: [] } -> body <- {
								type: :block
								tok: body.tok
								exprs: []
							}
						}
						popMinPrec()

						{
							type: :function
							name: name
							tok: tok
							args: args
							restArg: restArg
							body: body
						}
					}

					if peek().type {
						:leftParen -> {
							// optional argument list
							next() // eat the leftParen

							fn sub if !eof?() -> if peek().type {
								:rightParen -> ?
								_ -> {
									arg := expect(:identifier)
									if arg.type {
										:error -> {
											back() // try again

											with notError(expect(:underscore)) fn {
												args << '_'
												with notError(expect(:comma)) fn {
													sub()
												}
											}
										}
										_ -> if peek().type {
											:ellipsis -> {
												restArg <- arg.val
												next() // eat the ellipsis
												with notError(expect(:comma)) fn {
													sub()
												}
											}
											_ -> {
												args << arg.val
												with notError(expect(:comma)) fn {
													sub()
												}
											}
										}
									}
								}
							}
							with notError(sub()) fn {
								with notError(expect(:rightParen)) fn {
									parseBody()
								}
							}
						}
						_ -> parseBody()
					}
				}
				:underscore -> {
					type: :empty
					tok: tok
				}
				:identifier -> {
					type: :identifier
					tok: tok
					val: tok.val
				}
				:minus, :exclam -> with notError(right := parseSubNode()) fn {
					type: :unary
					tok: tok
					op: tok.type
					right: right
				}
				:ifKeyword -> {
					// We want to support multi-target branches, but don't want to
					// incur the performance overhead in the interpreter/evaluator
					// of keeping every single target as a Go slice, when the vast
					// majority of targets will be single-value, which requires
					// just a pointer to an astNode.
					//
					// So instead of doing that, we penalize the multi-value case
					// by essentially considering it syntax sugar and splitting
					// such branches into multiple AST branches, each with one
					// target value.
					pushMinPrec(0)

					// if no explicit condition is provided (i.e. if the keyword is
					// followed by a { ... }), we assume the condition is "true" to
					// allow for the useful `if { case, case ... }` pattern.
					condNode := if peek().type {
						:leftBrace -> {
							type: :bool
							val: true
							tok: tok
						}
						_ -> parseNode()
					}

					if eof?() {
						true -> error('Unexpected end of input in if expression', lastTokenPos())
						_ -> if peek().type {
							:branchArrow -> {
								arrowTok := next()
								with notError(body := parseNode()) fn {
									{
										type: :ifExpr
										tok: tok
										cond: condNode
										branches: [{
											type: :ifBranch
											target: {
												type: :bool
												val: true
												tok: arrowTok
											}
											body: body
										}]
									}
								}
							}
							_ -> with notError(condNode) fn {
								with notError(expect(:leftBrace)) fn {
									fn subBranch(branches) if eof?() {
										false -> if peek().type {
											:rightBrace -> branches
											_ -> {
												fn subTarget(targets) if eof?() {
													true -> targets
													_ -> with notError(target := parseNode()) fn if peek().type {
														:branchArrow -> targets << target
														_ -> with notError(expect(:comma)) fn {
															subTarget(targets << target)
														}
													}
												}
												with notError(targets := subTarget([])) fn {
													with notError(expect(:branchArrow)) fn {
														with notError(body := parseNode()) fn {
															with notError(expect(:comma)) fn {
																subBranch(branches |> append(targets |> with map() fn(target) {
																	type: :ifBranch
																	target: target
																	body: body
																}))
															}
														}
													}
												}
											}
										}
										_ -> branches
									}
									with notError(branches := subBranch([])) fn {
										with notError(expect(:rightBrace)) fn {
											popMinPrec()

											{
												type: :ifExpr
												tok: tok
												cond: condNode
												branches: branches
											}
										}
									}
								}
							}
						}
					}
				}
				:withKeyword -> {
					pushMinPrec(0)
					with notError(base := parseNode()) fn if base.type {
						:fnCall -> with notError(lastArg := parseNode()) fn {
							popMinPrec()
							base.args << lastArg
							base
						}
						_ -> error(format('with keyword should be followed by a fn call, found {{0}}', base), tok.pos)
					}
				}
				:leftParen -> {
					pushMinPrec(0)

					fn subExpr(exprs) if eof?() {
						true -> error('Unexpected end of input inside block', lastTokenPos())
						_ -> if peek().type {
							:rightParen -> exprs
							_ -> with notError(expr := parseNode()) fn {
								with notError(expect(:comma)) fn {
									subExpr(exprs << expr)
								}
							}
						}
					}
					with notError(exprs := subExpr([])) fn {
						with notError(expect(:rightParen)) fn {
							popMinPrec()
							{
								type: :block
								tok: tok
								exprs: exprs
							}
						}
					}
				}
				_ -> error(format('Unexpected token {{0}} at start of unit', renderToken(tok)), tok.pos)
			}
		}
	}

	fn infixOpPrecedence(op) if op {
		:plus, :minus -> 40
		:times, :divide -> 50
		:modulus -> 80
		:eq, :greater, :less, :geq, :leq, :neq -> 30
		:and -> 20
		:xor -> 15
		:or -> 10
		// assignment-like semantics
		:pushArrow -> 1
		_ -> -1
	}

	// parseSubNode is responsible for parsing independent "terms" in the Oak
	// syntax, like terms in unary and binary expressions and in pipelines. It
	// is in between parseUnit and parseNode.
	fn parseSubNode {
		pushMinPrec(0)
		with notError(node := parseUnit()) fn {
			fn sub if !eof?() -> if peek().type {
				:dot -> {
					nxt := next() // eat the dot
					with notError(right := parseUnit()) fn {
						node <- {
							type: :propertyAccess
							tok: nxt
							left: node
							right: right
						}
						sub()
					}
				}
				:leftParen -> {
					nxt := next() // eat the leftParen

					args := []
					restArg := ?
					fn subArg if !eof?() -> if peek().type {
						:rightParen -> with notError(expect(:rightParen)) fn {}
						_ -> with notError(arg := parseNode()) fn if eof?() {
							true -> error('Unexpected end of input inside argument list', lastTokenPos())
							_ -> if peek().type {
								:ellipsis -> {
									next() // eat the ellipsis
									with notError(expect(:comma)) fn {
										restArg <- arg
										subArg()
									}
								}
								:comma -> {
									next() // eat the comma
									args << arg
									subArg()
								}
								_ -> error(format('Expected comma after arg in argument list, got {{0}}', peek().type), peek().pos)
							}
						}
					}
					with notError(subArg()) fn {
						node <- {
							type: :fnCall
							function: node
							args: args
							restArg: restArg
							tok: nxt
						}
						sub()
					}
				}
			}
			with notError(sub()) fn {
				popMinPrec()
				node
			}
		}
	}

	// parseNode returns the next top-level astNode from the parser
	fn parseNode with notError(node := parseSubNode()) fn {
		fn sub if !eof?() -> if peek().type {
			:comma -> ?
			// whatever follows an assignment expr cannot bind to the
			// assignment expression itself by syntax rule, so we simply
			// return
			:assign, :nonlocalAssign -> node <- parseAssignment(node)
			// this case implements a mini Pratt parser threaded through
			// the larger Oak syntax parser, using the parser struct itself
			// to keep track of the power / precedence stack since other
			// forms may be parsed in between, as in 1 + f(g(x := y)) + 2
			:plus, :minus, :times, :divide, :modulus, :xor, :and, :or
			:pushArrow, :greater, :less, :eq, :geq, :leq, :neq -> {
				minPrec := lastMinPrec()
				fn subBinary if eof?() {
					true -> error('Incomplete binary expression', lastTokenPos())
					_ -> {
						peeked := peek()
						op := peeked.type
						prec := infixOpPrecedence(op)
						if prec > minPrec -> {
							next() // eat the operator

							if eof?() {
								true -> error(format('Incomplete binary expression with {{0}}', { type: op }), peek().pos)
								_ -> {
									pushMinPrec(prec)
									with notError(right := parseNode()) fn {
										popMinPrec()

										node <- {
											type: :binary
											tok: peeked
											op: op
											left: node
											right: right
										}
										subBinary()
									}
								}
							}
						}
					}
				}
				with notError(subBinary()) fn {
					// whatever follows a binary expr cannot bind to the
					// binary expression by syntax rule, so we simply
					// return
					node
				}
			}
			:pipeArrow -> {
				pipe := next() // eat the pipe
				with notError(pipeRight := parseSubNode()) fn if pipeRight.type {
					:fnCall -> {
						pipeRight.args := append([node], pipeRight.args)
						node <- pipeRight
						sub()
					}
					_ -> error(format('Expected function call after |>, got {{0}}', pipeRight), pipe.pos)
				}
			}
		}
		// the trailing comma is handled as necessary in callers of parseNode
		with notError(sub()) fn {
			node
		}
	}

	{
		parse: fn {
			// parse
			nodes := []
			fn sub if !eof?() -> with notError(node := parseNode()) fn {
				with notError(expect(:comma)) fn {
					nodes << node
					sub()
				}
			}
			with notError(sub()) fn {
				nodes
			}
		}
	}
}

// parse takes Oak source text and returns a list of AST nodes
fn parse(text) {
	tokens := tokenize(text)
	Parser(tokens).parse()
}

// Printer takes a list of Oak tokens and pretty-prints the source code into a
// string. As a rule, all newlines are preserved, including those in comments.
// This differs from the approach of other pretty-printers that prefer to
// decide on newlines themselves, and is an intentional design decision.
//
// Printer is a *token stream-based formatter*. This means unlike many
// production pretty-printers, syntax.Printer formats code directly from a flat
// token stream rather than a syntax tree. This has pros and cons.
//
// Pros:
// 1. Resilience. This approach can generally format incomplete code, or code
//    that may contain syntax errors the parser may not parse. The printer
//    rarely breaks on bad input.
// 2. Simplicity. Because the input is a flat list of tokens, the algorithm is
//    dramatically simpler to write and understand. Each token determines how
//    it's printed, and each line determines how it's indented.
// 3. Performance. The simpler design of iterating through a flat list yields
//    better performance than a full AST based approach.
//
// Cons:
// 1. Fuzzy correctness, especially on edge cases. There are some edge cases
//    where Printer will yield unexpected output because of context the
//    algorithm cannot disambiguate fully without a proper AST. Some of these
//    are documented in test/syntax.test.oak.
// 2. Lack of ability to make simplifying AST transformations. Some printers
//    can make basic AST transformations like removing redundant parentheses
//    during pretty-printing, but using a token based approach precludes us
//    from doing so.
//
// Methods:
//
// fn print()   returns a pretty-printed string
fn Printer(tokens) {
	// create a string with N tabs in it
	fn tabs(n) if {
		n > 0 -> tabs(n - 1) << '\t'
		_ -> ''
	}

	// a single token -> its printed value
	fn render(token) if token.type {
		:comment -> '//' + token.val
		:comma -> ','
		:dot -> '.'
		:leftParen -> '('
		:rightParen -> ')'
		:leftBracket -> '['
		:rightBracket -> ']'
		:leftBrace -> '{'
		:rightBrace -> '}'
		:assign -> ':='
		:nonlocalAssign -> '<-'
		:pipeArrow -> '|>'
		:branchArrow -> '->'
		:pushArrow -> '<<'
		:colon -> ':'
		:ellipsis -> '...'
		:qmark -> '?'
		:exclam -> '!'
		:plus -> '+'
		:minus -> '-'
		:times -> '*'
		:divide -> '/'
		:modulus -> '%'
		:xor -> '^'
		:and -> '&'
		:or -> '|'
		:greater -> '>'
		:less -> '<'
		:eq -> '='
		:geq -> '>='
		:leq -> '<='
		:neq -> '!='
		:ifKeyword -> 'if'
		:fnKeyword -> 'fn'
		:withKeyword -> 'with'
		:underscore -> '_'
		:identifier -> token.val
		:trueLiteral -> 'true'
		:falseLiteral -> 'false'
		:stringLiteral -> '\'' << token.val << '\''
		:numberLiteral -> token.val
		_ -> {
			printf('Unknown token {{0}}', token)
			string(token)
		}
	}

	// does a token require a space to follow it in well-formatted code?
	fn connectingToken?(tokenType) if tokenType {
		:assign
		:nonlocalAssign
		:pipeArrow
		:branchArrow
		:pushArrow
		:colon
		:plus, :minus, :times, :divide, :modulus
		:xor, :and, :or
		:greater, :less, :eq, :geq, :leq, :neq -> true
		_ -> false
	}

	// shorthand for the last item of a list
	fn last(list) list.(len(list) - 1)

	fn print {
		// we keep track of lines of code and their corresponding
		// indent levels separately and merge them at the end.
		//
		// this turns out to be simpler than trying to adjust
		// indentations while also adding on lines of code.
		lines := ['']
		indents := []

		// algorithm state
		curr := 0
		// to properly indent a line, we can't simply track a cumulative
		// indentation level; we need to know what the *lowest level of
		// indentation achieved in that line* is to account for some cases (see
		// "overlapping braces" in test/syntax). We do this by keeping a list
		// of indentation levels after processing every token in a line.
		//
		// We still need to keep track of curr itself to keep track of indent
		// level across lines, and between tokens.
		currs := [0]
		// indicates whether the following line should have a hanging indent
		hanging? := false

		// shorthand for adding tokens and spaces to algorithm state
		fn add(s, tabs) {
			if last(lines) |> trim('\t') {
				'' -> last(lines) << trimStart(s)
				_ -> last(lines) << s
			}
			curr <- curr + tabs
			currs << curr
		}

		// purelyDescendingPrefix takes a list and returns a slice of the list
		// starting at index 0 (a "prefix slice") that contains purely
		// descending elements.
		//
		// For example, a purelyDescendingPrefix of [5, 2, 1, 1, 10, 3, 8] is
		// [5, 2, 1].
		fn purelyDescendingPrefix(list) if len(list) {
			0 -> list
			_ -> {
				fn sub(i) if {
					i = len(list) -> list |> take(i)
					list.(i - 1) > list.(i) -> sub(i + 1)
					_ -> list |> take(i)
				}
				sub(1)
			}
		}

		// indentLine encapsulates the full indentation logic (except for
		// indentation collapsing, described below) in Printer's formatting
		// algorithm.
		fn indentLine(lastType) {
			// using min(currs...) instead of curr ensures that if a line
			// closes some indentation levels and re-opens other indentation
			// levels, the line is indented correctly. It does this by using a
			// "minimum achieved indentation" during a line's token stream
			// rather than the final indent level, so that we account for the
			// same line opening and closing blocks.
			//
			// In code like this:
			// 1 | {
			// 2 |     something
			// 3 | } + [{
			// 4 |     something else
			// 5 | }]
			//
			// In line 3, a pure count of open blocks would have the line
			// indented 1 level, but intuitively this isn't what we want. We
			// want line 3 to be indented 0 levels, because the open block was
			// closed and a different block was opened.
			indent := min(currs...)

			// account for hanging indents
			if {
				// this check ensures correctness of cases where a hanging
				// indent is preferred even though a given line closes an open
				// block, because there is content before the block is closed.
				//
				// In code like this:
				// 1 | [1, 2, 3
				// 2 |     4, 5, 6]
				//
				// In line 2, a pure count of open blocks would have the line
				// indented 0 levels, but intuitively this isn't what we want,
				// because there is content preceding the closing token ']'.
				//
				// We account for this by counting the length of the *purely
				// descending prefix* of indents after every token in the
				// current line. In other words, we check if the line closes
				// existing indents immediately with closing )]}s, or if there
				// is non-indentation related content at the start of the line.
				//
				// If there is such content (len(prefix) <= 1) and the line
				// eventually closes some indents (prefix.0 > indent), we hang
				// this line's indent.
				{
					prefix := purelyDescendingPrefix(currs)
					len(prefix) <= 1 & default(prefix.0, 0) > indent
				}
				// the hanging? flag is set by the previous line, from an
				// incomplete infix operator or other similar constructs
				hanging? -> indent <- indent + 1
			}

			// set the hanging indent flag for this current line, so we can
			// indent accordingly at start of next line
			hanging? <- if {
				connectingToken?(lastType)
				lastType = :dot -> true
				_ -> false
			}

			indent
		}

		// in this loop, we ask whether a space should come before each token.
		// as a result: a token is only responsible for adding a space before
		// it, not after it
		tokens |> with each() fn(token, i) {
			nextType := default(tokens.(i + 1), { type: :newline }).type

			// indentation and token spacing are based on lastType, so it must
			// ignore any comments and newlines in between comments that
			// precede the current token. This algorithm traces back until it
			// can reach a non-comment token to determine lastType(s).
			[lastLastType, lastType] := if i {
				0 -> [:newline, :newline]
				1 -> [:newline, tokens.(i - 1).type]
				_ -> {
					fn sub(prev) if prev {
						// -2 may appear in recursive cases
						-2, -1, 0 -> prev
						_ -> if [tokens.(prev).type, tokens.(prev - 1).type] {
							// multiline comments
							[:newline, :comment]
							[:comment, :newline] -> sub(prev - 2)
							// in-line comments, coming after tokens in the same line
							[:comment, _] -> sub(prev - 1)
							_ -> prev
						}
					}
					prev := sub(i - 1) |> max(0)

					[
						if prev {
							0 -> :newline
							_ -> tokens.(prev - 1).type
						}
						tokens.(prev).type
					]
				}
			}

			if [lastType, token.type, nextType] {
				// this match clause is responsible for computing the correct
				// level of indentation for the line that comes before this
				// current newline.
				//
				// as a result, we compute each line's indentation level only
				// after we fully tokenize and render the line itself.
				[_, :newline, _] -> {
					indents << indentLine(lastType)
					currs <- [curr]
					lines << ''
				}
				[_, :dot, _] -> add('.', 0)

				// opening delimiters
				[_, :leftParen, _] -> if {
					connectingToken?(lastType)
					lastType = :comma
					lastType = :ifKeyword
					lastType = :withKeyword -> add(' ' << render(token), 1)
					_ -> add(render(token), 1)
				}
				[_, :leftBracket, _]
				[_, :leftBrace, _] -> if lastType {
					:leftParen
					:leftBracket -> add(render(token), 1)
					_ -> add(' ' << render(token), 1)
				}

				// if following a newline, these are exempt from the following rule
				[:newline, :rightParen, _]
				[:newline, :rightBracket, _]
				[:newline, :rightBrace, _] -> add(render(token), -1)

				// ensure { thing } and not {thing}
				[_, :rightParen, _]
				[_, :rightBracket, _] -> add(render(token), -1)
				// empty object/block special case
				[:leftBrace, :rightBrace, _] -> add(render(token), -1)
				[_, :rightBrace, _] -> add(' ' << render(token), -1)

				// no-ops, where we rely on automatic comma insertion
				[_, :comma, :newline]
				[_, :comma, :rightParen]
				[_, :comma, :rightBracket]
				[_, :comma, :rightBrace] -> ?

				// special cases for colon used in atoms
				[_, :colon, _] -> if {
					lastType = :comma
					lastType = :leftBrace
					connectingToken?(lastType) -> add(' :', 0)
					_ -> add(':', 0)
				}

				// no-space tokens
				[_, :comma, _]
				[_, :ellipsis, _] -> add(render(token), 0)

				// parens and lists don't include inner space
				[:leftParen, _, _]
				[:leftBracket, _, _]

				// no-space operators
				[:dot, _, _]
				[:exclam, _, _] -> add(render(token), 0)

				// unary exprs (which may also be used as binary infix)
				[:minus, _, _] -> if lastLastType {
					:leftParen, :leftBracket, :leftBrace
					:ifKeyword
					:exclam
					:minus
					:newline
					:comma
					:colon -> add(render(token), 0)
					_ -> if connectingToken?(lastLastType) {
						true -> add(render(token), 0)
						_ -> add(' ' << render(token), 0)
					}
				}

				// atom special case
				[:colon, :identifier, _]
				[:colon, :ifKeyword, _]
				[:colon, :fnKeyword, _]
				[:colon, :withKeyword, _]
				[:colon, :trueLiteral, _]
				[:colon, :falseLiteral, _] -> if lastLastType {
					// if token before colon cannot be end of an object key,
					// don't insert a space. This isn't a perfect rule but
					// seems to be a good-enough algorithm.
					:leftParen, :leftBracket, :leftBrace
					:rightParen, :rightBracket, :rightBrace
					:ifKeyword
					:exclam
					:minus
					:newline
					:comma
					:colon -> add(render(token), 0)
					_ -> if connectingToken?(lastLastType) {
						true -> add(render(token), 0)
						_ -> add(' ' << render(token), 0)
					}
				}

				[:leftBrace, _, _]
				_ -> add(' ' << render(token), 0)
			}
		}

		// indent last line
		indents << indentLine(last(tokens))

		// indentation collapsing:
		//
		// sometimes, multiple delimiter openers in a single line or a callback
		// being passed into a function will add multiple levels of indentation
		// in the above algorithm, but we only really want to indent one level
		// at a time visually, even if semantically there are multiple levels
		// of nesting present. We try to detect and "collapse" these
		// indentations into single levels of tab here.
		//
		// we do this by scanning lines and finding groups of lines that are
		// indented by more than 1 level at a time, and de-indenting them until
		// they're only indented one level.
		indents |> with each() fn(n, i) if i {
			0 -> ?
			_ -> if n < indents.(i - 1) -> {
				// backtrack to the line immediately following the first line
				// where indent > n
				fn sub(j) if {
					indents.(j) > n -> sub(j - 1)
					_ -> j + 1
				}
				target := sub(i - 1)

				// if the range from target to current line is indented by more
				// than 1, de-dent them accordingly
				if indents.(target) - n > 1 -> {
					diff := indents.(target) - n
					range(target, i) |> with each() fn(j) {
						indents.(j) := indents.(j) - diff + 1
					}
				}
			}
		}

		indentedLines := lines |> with map() fn(line, i) if line {
			// we don't indent empty lines
			'' -> ''
			_ -> tabs(indents.(i)) << line
		}
		indentedLines |> join('\n')
	}

	{
		print: print
	}
}

// print takes an Oak source file and returns a pretty-printed string.
// Unlike Printer.print, print also correctly processes shebang lines.
fn print(text) {
	tokens := tokenize(text)
	printer := Printer(tokens)
	if shebang?(text) {
		true -> text |> cut('\n') |> first() << '\n' << printer.print()
		_ -> printer.print()
	}
}