lrparser

An LR parser implementation in Nim.

LR parsers (https://en.wikipedia.org/wiki/LR_parser) are a way to turn formal grammars (sets of rules) into a program that converts text into a structured tree.

They can be used to parse expressions such as x+x*x+x into (x+(x*x))+x.

More complex grammars make it possible to parse source code, e.g. to implement programming languages.

To parse a string, you first need to split it into tokens, as the parser only accepts seqs of Tokens (a Token is a wrapper around a string that carries position information for error reporting).

To do this, you can write your own tokenizer (for example, input.split(" ").mapIt(Token(content: it)) is a very simple one), or you can use the tokenize proc, which provides basic tokenizing abilities and outputs Tokens instead of strings.
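For instance, a minimal hand-written tokenizer might look like this (a sketch: myTokenize is a hypothetical name, it only fills the content field, leaving id and the position fields at their defaults, and it assumes the library is imported as lrparser):

import lrparser, strutils, sequtils

proc myTokenize(input: string): seq[Token] =
    # Split on spaces and drop empty pieces; ids/positions keep their defaults.
    input.split(' ').filterIt(it.len > 0).mapIt(Token(content: it))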

Example:

import lrparser, tables, strutils

let input_string = """
        4 * 41 + 3 * (7 + 2)
    """
# The ruleset
let grammar = @[
    ("E", @["E", "+", "T"]),
    ("E", @["T"]),
    ("T", @["T", "*", "F"]),
    ("T", @["F"]),
    ("F", @["(", "E", ")"]),
    ("F", @["LIT"]),
]

# We need to convert the strings to ids so that the parser understands them
let tokenIdTable = makeTokenIdTable(grammar) # returns a Table[string, int]
let int_grammar = convertGrammar(grammar, tokenIdTable)

proc labeler(s: string): int =
    # Function that will detect the types of the tokens
    if s[0] == '*': return tokenIdTable["*"]
    if s[0] == '+': return tokenIdTable["+"]
    if s[0] == '(': return tokenIdTable["("]
    if s[0] == ')': return tokenIdTable[")"]
    return tokenIdTable["LIT"]

# Build the parsing table from the grammar.
# This might take time, so consider doing it at compile time for maximum performance.
let full_grammar = constructGrammar(int_grammar)

# We convert the string into tokens (i.e. a seq of strings with some metadata)
let tokens = tokenize(input_string, @["*", "+", "(", ")"], @[" ", "\t", "\n"], labeler)

# We parse the string
let ast = parse(tokens, full_grammar)

# We evaluate the tree
proc eval(t: TokenTreeRef): int =
    # Leaf tokens carry their text in content; inner nodes have content == ""
    if t.content != "":
        return parseInt(t.content)

    if t.children.len == 3:
        # id is the type of the token.
        if t.id == tokenIdTable["E"]:
            return eval(t.children[0]) + eval(t.children[2])
        if t.id == tokenIdTable["T"]:
            return eval(t.children[0]) * eval(t.children[2])
        if t.id == tokenIdTable["F"]:
            return eval(t.children[1])

    # Unary rules (E -> T, T -> F, F -> LIT): evaluate the single child
    return eval(t.children[0])

let r = eval(ast)
echo input_string.strip, " = ", r
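Assuming the table builds without conflicts, running this example should print: 4 * 41 + 3 * (7 + 2) = 191 (since 4 * 41 = 164 and 3 * (7 + 2) = 27).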

Types

TokenTreeRef = ref Token
Token = object
  children*: seq[TokenTreeRef]
  content*: string
  id*: int
  charcount*: int
  line*: int
  col*: int
Represents a piece of the input string and may be a node of an AST. When in an AST, the child tokens are stored in children. line and col give the position of the token in the original string/file, and charcount is its byte offset into the string. id is the type of the token, as assigned by the TokenLabeler.
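For instance, the position fields can drive custom error messages (report is a hypothetical helper, not part of the library):

proc report(t: Token; msg: string) =
    # line/col locate the token in the source; charcount is the byte offset.
    echo "line ", t.line, ", col ", t.col, " (byte ", t.charcount, "): ", msg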
TokenLabeler = proc (s: string): int
Callback that assigns an id (the token's type) to a token's string content.
Grammar = object
  table: seq[Table[int, (StateTypes, int)]]
  grammar: seq[(int, seq[int])]
Internal representation of the provided set of rules.

Procs

proc `$`(ttr: TokenTreeRef): string {.raises: [ValueError], tags: [].}
Converts a token tree into a string.
proc tokenize(s: string; tokenBreakersThatAreTokens: seq[string];
              tokenBreakers: seq[string]; tl: TokenLabeler): seq[Token] {.raises: [], tags: [].}
Splits the string s along both kinds of breakers and returns the pieces as a seq of tokens: breakers listed in tokenBreakersThatAreTokens become tokens of their own, while those in tokenBreakers are discarded. Each token is labeled (given its id) by tl.
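A quick illustration of the two breaker lists (dummyLabeler is a hypothetical stand-in that gives every token the same id):

proc dummyLabeler(s: string): int = 0
let toks = tokenize("4*41 + 3", @["*", "+"], @[" "], dummyLabeler)
# token contents: "4", "*", "41", "+", "3" -- "*" and "+" are kept as
# tokens, the spaces are dropped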
proc constructGrammar(grammar: seq[(int, seq[int])]): Grammar {.raises: [KeyError], tags: [].}
Builds a parsing table from the given grammar. Be careful: construction time grows with the size of the grammar, so consider running it at compile time (as noted in the example above).
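A sketch of compile-time construction, assuming constructGrammar and its helpers can run in Nim's compile-time VM (not verified here; fullGrammar is a hypothetical name):

const fullGrammar = block:
    let g = @[
        ("E", @["E", "+", "T"]),
        ("E", @["T"]),
    ]
    # Built once at compile time and embedded in the binary.
    constructGrammar(convertGrammar(g, makeTokenIdTable(g)))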
proc parseAll(input_string: seq[Token]; g: Grammar;
              debugTable: Table[int, string]): seq[TokenTreeRef] {.raises: [ValueError, KeyError], tags: [].}
Variant of parse with finer-grained error management: debugTable maps token ids back to readable names for error reporting (see the sketch after parse below).
proc parse(input_string: seq[Token]; g: Grammar): TokenTreeRef {.raises: [ValueError, KeyError], tags: [].}
Parses a sequence of tokens. Returns a tree of tokens that follows the set of rules described in the grammar g. In case of a syntax error, prints the location of the error and returns a partially parsed tree. Use parseAll if you need better error management.
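A sketch of building the debugTable for parseAll by inverting the tokenIdTable from the example above (this assumes the table is only used to print readable names in error messages):

var debugTable = initTable[int, string]()
for name, id in tokenIdTable.pairs:
    debugTable[id] = name
let trees = parseAll(tokens, full_grammar, debugTable)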
proc convertGrammar[T: enum](g: seq[(T, seq[T])]): seq[(int, seq[int])]
Converts a grammar written over an enum type directly into the int representation the parser uses (see the sketch below).
proc makeTokenIdTable[T](g: seq[(T, seq[T])]): Table[T, int]
Builds a table mapping every symbol appearing in g to a unique int id.
proc convertGrammar[T](g: seq[(T, seq[T])]; tab: Table[T, int]): seq[(int, seq[int])]
Converts a grammar to the int representation using the ids in tab.
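A sketch of the enum overload, assuming each enum value stands for one grammar symbol (Sym and the sym* names are hypothetical):

type Sym = enum
    symE, symT, symF, symPlus, symTimes, symLParen, symRParen, symLit

let enumGrammar = @[
    (symE, @[symE, symPlus, symT]),
    (symE, @[symT]),
    (symT, @[symT, symTimes, symF]),
    (symT, @[symF]),
    (symF, @[symLParen, symE, symRParen]),
    (symF, @[symLit]),
]
let intGrammar = convertGrammar(enumGrammar)  # no id table needed for enums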