Antlr4 PLSQL Get Token Types

41 views Asked by At

I'm using the following Python code to print the tokens in a PLSQL source file.

from antlr4 import *
from antlr4.tree.Tree import TerminalNodeImpl
from antlr4.tree.Trees import Trees
from PlSqlLexer import PlSqlLexer
from PlSqlParser import PlSqlParser

import sys
import json


def main():
    """Parse the PL/SQL file named on the command line and print its parse tree.

    The file path is taken from ``sys.argv[1]``. The source is lexed and
    parsed starting at the ``sql_script`` rule, and the resulting tree is
    printed with ``traverse``.

    Raises:
        SystemExit: If no file argument was supplied.
    """
    if len(sys.argv) < 2:
        # Without this guard a missing argument surfaces as a bare IndexError.
        sys.exit("usage: python <script> <plsql-source-file>")

    with open(sys.argv[1], 'r') as file:
        filesrc = file.read()

    lexer = PlSqlLexer(InputStream(filesrc))
    parser = PlSqlParser(CommonTokenStream(lexer))
    tree = parser.sql_script()
    traverse(tree, parser.ruleNames)

def traverse(tree, rule_names, indent=0):
    """Print the parse tree rooted at *tree*, one node per line.

    Terminal nodes are printed as ``TOKEN='<text>'`` and rule nodes as their
    rule name. Every line is indented two spaces per level of depth.

    Args:
        tree: A parse-tree node (rule context or terminal node).
        rule_names: Sequence indexed by rule index (e.g. ``parser.ruleNames``).
        indent: Depth of *tree* in the overall tree; sets the indentation.
    """
    # Skip any node whose complete text is the end-of-file marker.
    if tree.getText() == "<EOF>":
        return

    prefix = "  " * indent
    if isinstance(tree, TerminalNodeImpl):
        print("{0}TOKEN='{1}'".format(prefix, tree.getText()))
        return

    print("{0}{1}".format(prefix, rule_names[tree.getRuleIndex()]))
    # A rule context that matched no input has ``children`` set to None
    # rather than an empty list, so iterating it directly raises TypeError.
    for child in tree.children or ():
        traverse(child, rule_names, indent + 1)

if __name__ == '__main__':
    main()

When run with a PL/SQL source file, it produces output like this:

  TOKEN='CREATE'
  TOKEN='OR'
  TOKEN='REPLACE'
  TOKEN='PACKAGE'
  TOKEN='BODY'
  TOKEN='pa_temp'
  TOKEN='AS'
  TOKEN='PROCEDURE'
  TOKEN='pr_new_item'
  TOKEN='('
  TOKEN='p_item'
  TOKEN='IN'
  TOKEN='items'
.
.
.

But I would also like to print what the token type is (procedure start, variable, table, etc.).

I have tried print(json.dumps(tree)) and print(json.dumps(parser)) to see if there is anything useful, but this just fails with errors like:

TypeError: Object of type PlSqlLexer is not JSON serializable

1

There is 1 answer

2
Bart Kiers On BEST ANSWER

I have a small utility method that converts the parse tree ANTLR produces into a dict, which can then be converted into JSON. Let's say your grammar looks like this:

// Small arithmetic-expression grammar used to demonstrate the tree-to-dict conversion.
grammar Expr;

// Entry rule: a single expression followed by end of input.
parse
 : expr EOF
 ;

// Every alternative carries a #label; the label is the node name that shows up
// in the converted output (unaryExpr, mulExpr, addExpr, nestedExpr, numExpr).
expr
 : MIN expr              #unaryExpr
 | expr (MUL | DIV) expr #mulExpr
 | expr (ADD | MIN) expr #addExpr
 | OPAR expr  CPAR       #nestedExpr
 | NUM                   #numExpr
 ;

// Lexer rules: a numeric literal with optional fractional part, the operators,
// and parentheses.
NUM : [0-9]+ ('.' [0-9]+)?;
MUL : '*';
DIV : '/';
ADD : '+';
MIN : '-';
OPAR : '(';
CPAR : ')';
// Whitespace is skipped, so it never reaches the parser.
SPACES : [ \t\r\n]+ -> skip;

then the input 2 * (555 - -50) / 42 will be parsed as:

enter image description here

You can use the following Python code to convert the parse tree to a dict:

import antlr4
import json
from ExprLexer import ExprLexer
from ExprParser import ExprParser


def to_dict(root, rule_names):
    """Convert a parse tree into a nested, JSON-serializable dict.

    Terminal nodes become ``{'type': ..., 'name': ..., 'text': ...}``. Rule
    nodes become ``{<node name>: [<child dict>, ...]}``, where the node name
    is the context class name without its ``Context`` suffix and with a
    lower-cased first letter.

    Args:
        root: Root node of the parse tree.
        rule_names: Sequence indexed by token type that yields the token's
            symbolic name. Despite the parameter name, it is only used to
            look up token names (e.g. ``lexer.symbolicNames``).

    Returns:
        The dict describing *root* and all of its descendants.
    """
    dictionary = {}
    __traverse(root, dictionary, rule_names)
    return dictionary


def __traverse(tree, dictionary, symbolic_lexer_names):
    """Fill *dictionary* in place with the description of *tree*."""
    symbol = getattr(tree, 'symbol', None)
    # A terminal node is childless AND carries a token. Checking the child
    # count alone would misclassify a rule context that matched no input and
    # then fail on the missing ``symbol`` attribute.
    if tree.getChildCount() == 0 and symbol is not None:
        dictionary['type'] = symbol.type
        # Token type -1 is end-of-file and has no entry in the name table.
        dictionary['name'] = 'EOF' if symbol.type == -1 else symbolic_lexer_names[symbol.type]
        dictionary['text'] = symbol.text
        return

    name = type(tree).__name__
    # Strip only the trailing 'Context' so a name that contains the word
    # elsewhere is left intact.
    if name.endswith('Context'):
        name = name[:-len('Context')]
    name = name[:1].lower() + name[1:]

    children = []
    dictionary[name] = children
    for i in range(tree.getChildCount()):
        nested = {}
        children.append(nested)
        __traverse(tree.getChild(i), nested, symbolic_lexer_names)


def main(source):
    """Parse *source* with the Expr grammar and print its parse tree as JSON.

    The tree produced by the ``parse`` rule is converted with ``to_dict``
    (token names come from the lexer's ``symbolicNames``) and printed with a
    two-space indent.
    """
    expr_lexer = ExprLexer(antlr4.InputStream(source))
    token_stream = antlr4.CommonTokenStream(expr_lexer)
    parse_tree = ExprParser(token_stream).parse()
    tree_as_dict = to_dict(parse_tree, expr_lexer.symbolicNames)
    print(json.dumps(tree_as_dict, indent=2))


if __name__ == '__main__':
    main('2 * (555 - -50) / 42')

which will print:

{
  "parse": [
    {
      "mulExpr": [
        {
          "mulExpr": [
            {
              "numExpr": [
                {
                  "type": 1,
                  "name": "NUM",
                  "text": "2"
                }
              ]
            },
            {
              "type": 2,
              "name": "MUL",
              "text": "*"
            },
            {
              "nestedExpr": [
                {
                  "type": 6,
                  "name": "OPAR",
                  "text": "("
                },
                {
                  "addExpr": [
                    {
                      "numExpr": [
                        {
                          "type": 1,
                          "name": "NUM",
                          "text": "555"
                        }
                      ]
                    },
                    {
                      "type": 5,
                      "name": "MIN",
                      "text": "-"
                    },
                    {
                      "unaryExpr": [
                        {
                          "type": 5,
                          "name": "MIN",
                          "text": "-"
                        },
                        {
                          "numExpr": [
                            {
                              "type": 1,
                              "name": "NUM",
                              "text": "50"
                            }
                          ]
                        }
                      ]
                    }
                  ]
                },
                {
                  "type": 7,
                  "name": "CPAR",
                  "text": ")"
                }
              ]
            }
          ]
        },
        {
          "type": 3,
          "name": "DIV",
          "text": "/"
        },
        {
          "numExpr": [
            {
              "type": 1,
              "name": "NUM",
              "text": "42"
            }
          ]
        }
      ]
    },
    {
      "type": -1,
      "name": "EOF",
      "text": "<EOF>"
    }
  ]
}