File: antlr_grammar.py

package info (click to toggle)
pyparsing 3.1.3-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 2,928 kB
sloc: python: 25,633; ansic: 422; makefile: 22
file content (388 lines) | stat: -rw-r--r-- 11,143 bytes
parent folder | download | duplicates (3)
"""
antlr_grammar.py

Created on 4 sept. 2010

@author: luca

Submitted by Luca DallOlio, September, 2010
(Minor updates by Paul McGuire, June, 2012)
(Code idiom updates by Paul McGuire, April, 2019)
"""
from pyparsing import (
    Word,
    ZeroOrMore,
    printables,
    Suppress,
    OneOrMore,
    Group,
    LineEnd,
    Optional,
    White,
    originalTextFor,
    hexnums,
    nums,
    Combine,
    Literal,
    Keyword,
    cStyleComment,
    Regex,
    Forward,
    MatchFirst,
    And,
    oneOf,
    alphas,
    alphanums,
    delimitedList,
    Char,
)

# http://www.antlr.org/grammar/ANTLR/ANTLRv3.g

(
    QUOTE,
    APOS,
    EQ,
    LBRACK,
    RBRACK,
    LBRACE,
    RBRACE,
    LPAR,
    RPAR,
    ROOT,
    BANG,
    AT,
    TIL,
    SEMI,
    COLON,
    VERT,
) = map(Suppress, "\"'=[]{}()^!@~;:|")
BSLASH = Literal("\\")
keywords = (
    SRC_,
    SCOPE_,
    OPTIONS_,
    TOKENS_,
    FRAGMENT,
    ID,
    LEXER,
    PARSER,
    GRAMMAR,
    TREE,
    CATCH,
    FINALLY,
    THROWS,
    PROTECTED,
    PUBLIC,
    PRIVATE,
) = map(
    Keyword,
    """src scope options tokens fragment id lexer parser grammar tree catch finally throws protected
       public private """.split(),
)
KEYWORD = MatchFirst(keywords)

# Tokens
EOL = Suppress(LineEnd())  # $
SGL_PRINTABLE = Char(printables)
singleTextString = originalTextFor(
    ZeroOrMore(~EOL + (White(" \t") | Word(printables)))
).leaveWhitespace()
XDIGIT = hexnums
INT = Word(nums)
ESC = BSLASH + (
    oneOf(list(r"nrtbf\">" + "'")) | ("u" + Word(hexnums, exact=4)) | SGL_PRINTABLE
)
LITERAL_CHAR = ESC | ~(APOS | BSLASH) + SGL_PRINTABLE
CHAR_LITERAL = APOS + LITERAL_CHAR + APOS
STRING_LITERAL = APOS + Combine(OneOrMore(LITERAL_CHAR)) + APOS
DOUBLE_QUOTE_STRING_LITERAL = '"' + ZeroOrMore(LITERAL_CHAR) + '"'
DOUBLE_ANGLE_STRING_LITERAL = "<<" + ZeroOrMore(SGL_PRINTABLE) + ">>"
TOKEN_REF = Word(alphas.upper(), alphanums + "_")
RULE_REF = Word(alphas.lower(), alphanums + "_")
ACTION_ESC = (
    BSLASH.suppress() + APOS
    | BSLASH.suppress()
    | BSLASH.suppress() + (~(APOS | QUOTE) + SGL_PRINTABLE)
)
ACTION_CHAR_LITERAL = APOS + (ACTION_ESC | ~(BSLASH | APOS) + SGL_PRINTABLE) + APOS
ACTION_STRING_LITERAL = (
    QUOTE + ZeroOrMore(ACTION_ESC | ~(BSLASH | QUOTE) + SGL_PRINTABLE) + QUOTE
)

SRC = SRC_.suppress() + ACTION_STRING_LITERAL("file") + INT("line")
id = TOKEN_REF | RULE_REF
SL_COMMENT = (
    Suppress("//") + Suppress("$ANTLR") + SRC
    | ZeroOrMore(~EOL + Word(printables)) + EOL
)
ML_COMMENT = cStyleComment
WS = OneOrMore(
    Suppress(" ") | Suppress("\t") | (Optional(Suppress("\r")) + Literal("\n"))
)
WS_LOOP = ZeroOrMore(SL_COMMENT | ML_COMMENT)
NESTED_ARG_ACTION = Forward()
NESTED_ARG_ACTION << (
    LBRACK
    + ZeroOrMore(NESTED_ARG_ACTION | ACTION_STRING_LITERAL | ACTION_CHAR_LITERAL)
    + RBRACK
)
ARG_ACTION = NESTED_ARG_ACTION
NESTED_ACTION = Forward()
NESTED_ACTION << (
    LBRACE
    + ZeroOrMore(
        NESTED_ACTION
        | SL_COMMENT
        | ML_COMMENT
        | ACTION_STRING_LITERAL
        | ACTION_CHAR_LITERAL
    )
    + RBRACE
)
ACTION = NESTED_ACTION + Optional("?")
SCOPE = SCOPE_.suppress()
OPTIONS = OPTIONS_.suppress() + LBRACE  # + WS_LOOP + Suppress('{')
TOKENS = TOKENS_.suppress() + LBRACE  # + WS_LOOP + Suppress('{')
TREE_BEGIN = ROOT + LPAR
RANGE = Suppress("..")
REWRITE = Suppress("->")

# General Parser Definitions

# Grammar heading
optionValue = id | STRING_LITERAL | CHAR_LITERAL | INT | Literal("*").setName("s")

option = Group(id("id") + EQ + optionValue("value"))("option")
optionsSpec = OPTIONS + Group(OneOrMore(option + SEMI))("options") + RBRACE
tokenSpec = (
    Group(TOKEN_REF("token_ref") + (EQ + (STRING_LITERAL | CHAR_LITERAL)("lit")))(
        "token"
    )
    + SEMI
)
tokensSpec = TOKENS + Group(OneOrMore(tokenSpec))("tokens") + RBRACE
attrScope = SCOPE_.suppress() + id + ACTION
grammarType = LEXER + PARSER + TREE
actionScopeName = id | LEXER("l") | PARSER("p")
action = AT + Optional(actionScopeName + Suppress("::")) + id + ACTION

grammarHeading = (
    Optional(ML_COMMENT("ML_COMMENT"))
    + Optional(grammarType)
    + GRAMMAR
    + id("grammarName")
    + SEMI
    + Optional(optionsSpec)
    + Optional(tokensSpec)
    + ZeroOrMore(attrScope)
    + ZeroOrMore(action)
)

modifier = PROTECTED | PUBLIC | PRIVATE | FRAGMENT
ruleAction = AT + id + ACTION
throwsSpec = THROWS.suppress() + delimitedList(id)
ruleScopeSpec = (
    (SCOPE_.suppress() + ACTION)
    | (SCOPE_.suppress() + delimitedList(id) + SEMI)
    | (SCOPE_.suppress() + ACTION + SCOPE_.suppress() + delimitedList(id) + SEMI)
)
unary_op = oneOf("^ !")
notTerminal = CHAR_LITERAL | TOKEN_REF | STRING_LITERAL
terminal = (
    CHAR_LITERAL | TOKEN_REF + Optional(ARG_ACTION) | STRING_LITERAL | "."
) + Optional(unary_op)
block = Forward()
notSet = TIL + (notTerminal | block)
rangeNotPython = CHAR_LITERAL("c1") + RANGE + CHAR_LITERAL("c2")
atom = Group(
    (rangeNotPython + Optional(unary_op)("op"))
    | terminal
    | (notSet + Optional(unary_op)("op"))
    | (RULE_REF + Optional(ARG_ACTION("arg")) + Optional(unary_op)("op"))
)
element = Forward()
treeSpec = ROOT + LPAR + element * (2,) + RPAR
ebnfSuffix = oneOf("? * +")
ebnf = block + Optional(ebnfSuffix("op") | "=>")
elementNoOptionSpec = (
    (id("result_name") + oneOf("= +=")("labelOp") + atom("atom") + Optional(ebnfSuffix))
    | (id("result_name") + oneOf("= +=")("labelOp") + block + Optional(ebnfSuffix))
    | atom("atom") + Optional(ebnfSuffix)
    | ebnf
    | ACTION
    | (treeSpec + Optional(ebnfSuffix))
)  # |   SEMPRED ( '=>' -> GATED_SEMPRED | -> SEMPRED )
element <<= Group(elementNoOptionSpec)("element")
# Do not ask me why group is needed twice... seems like the xml that you see is not always the real structure?
alternative = Group(Group(OneOrMore(element))("elements"))
rewrite = Optional(Literal("TODO REWRITE RULES TODO"))
block <<= (
    LPAR
    + Optional(Optional(optionsSpec("opts")) + COLON)
    + Group(
        alternative("a1")
        + rewrite
        + Group(ZeroOrMore(VERT + alternative("a2") + rewrite))("alternatives")
    )("block")
    + RPAR
)
altList = (
    alternative("a1")
    + rewrite
    + Group(ZeroOrMore(VERT + alternative("a2") + rewrite))("alternatives")
)
exceptionHandler = CATCH.suppress() + ARG_ACTION + ACTION
finallyClause = FINALLY.suppress() + ACTION
exceptionGroup = (OneOrMore(exceptionHandler) + Optional(finallyClause)) | finallyClause

ruleHeading = (
    Optional(ML_COMMENT)("ruleComment")
    + Optional(modifier)("modifier")
    + id("ruleName")
    + Optional("!")
    + Optional(ARG_ACTION("arg"))
    + Optional(Suppress("returns") + ARG_ACTION("rt"))
    + Optional(throwsSpec)
    + Optional(optionsSpec)
    + Optional(ruleScopeSpec)
    + ZeroOrMore(ruleAction)
)
rule = Group(ruleHeading + COLON + altList + SEMI + Optional(exceptionGroup))("rule")

grammarDef = grammarHeading + Group(OneOrMore(rule))("rules")


def grammar():
    return grammarDef


def __antlrAlternativesConverter(pyparsingRules, antlrBlock):
    rule = None
    if (
        hasattr(antlrBlock, "alternatives")
        and antlrBlock.alternatives != ""
        and len(antlrBlock.alternatives) > 0
    ):
        alternatives = []
        alternatives.append(__antlrAlternativeConverter(pyparsingRules, antlrBlock.a1))
        for alternative in antlrBlock.alternatives:
            alternatives.append(
                __antlrAlternativeConverter(pyparsingRules, alternative)
            )
        rule = MatchFirst(alternatives)("anonymous_or")
    elif hasattr(antlrBlock, "a1") and antlrBlock.a1 != "":
        rule = __antlrAlternativeConverter(pyparsingRules, antlrBlock.a1)
    else:
        raise Exception("Not yet implemented")
    assert rule != None
    return rule


def __antlrAlternativeConverter(pyparsingRules, antlrAlternative):
    elementList = []
    for element in antlrAlternative.elements:
        rule = None
        if hasattr(element.atom, "c1") and element.atom.c1 != "":
            regex = r"[" + str(element.atom.c1[0]) + "-" + str(element.atom.c2[0] + "]")
            rule = Regex(regex)("anonymous_regex")
        elif hasattr(element, "block") and element.block != "":
            rule = __antlrAlternativesConverter(pyparsingRules, element.block)
        else:
            ruleRef = element.atom[0]
            assert ruleRef in pyparsingRules
            rule = pyparsingRules[ruleRef](ruleRef)
        if hasattr(element, "op") and element.op != "":
            if element.op == "+":
                rule = Group(OneOrMore(rule))("anonymous_one_or_more")
            elif element.op == "*":
                rule = Group(ZeroOrMore(rule))("anonymous_zero_or_more")
            elif element.op == "?":
                rule = Optional(rule)
            else:
                raise Exception("rule operator not yet implemented : " + element.op)
        rule = rule
        elementList.append(rule)
    if len(elementList) > 1:
        rule = Group(And(elementList))("anonymous_and")
    else:
        rule = elementList[0]
    assert rule is not None
    return rule


def __antlrRuleConverter(pyparsingRules, antlrRule):
    rule = None
    rule = __antlrAlternativesConverter(pyparsingRules, antlrRule)
    assert rule != None
    rule(antlrRule.ruleName)
    return rule


def antlrConverter(antlrGrammarTree):
    pyparsingRules = {}

    antlrTokens = {}
    for antlrToken in antlrGrammarTree.tokens:
        antlrTokens[antlrToken.token_ref] = antlrToken.lit
    for antlrTokenName, antlrToken in list(antlrTokens.items()):
        pyparsingRules[antlrTokenName] = Literal(antlrToken)

    antlrRules = {}
    for antlrRule in antlrGrammarTree.rules:
        antlrRules[antlrRule.ruleName] = antlrRule
        pyparsingRules[antlrRule.ruleName] = Forward()  # antlr is a top down grammar
    for antlrRuleName, antlrRule in list(antlrRules.items()):
        pyparsingRule = __antlrRuleConverter(pyparsingRules, antlrRule)
        assert pyparsingRule != None
        pyparsingRules[antlrRuleName] <<= pyparsingRule

    return pyparsingRules


if __name__ == "__main__":

    text = """\
grammar SimpleCalc;

options {
    language = Python;
}

tokens {
    PLUS     = '+' ;
    MINUS    = '-' ;
    MULT    = '*' ;
    DIV    = '/' ;
}

/*------------------------------------------------------------------
 * PARSER RULES
 *------------------------------------------------------------------*/

expr    : term ( ( PLUS | MINUS )  term )* ;

term    : factor ( ( MULT | DIV ) factor )* ;

factor    : NUMBER ;


/*------------------------------------------------------------------
 * LEXER RULES
 *------------------------------------------------------------------*/

NUMBER    : (DIGIT)+ ;

/* WHITESPACE : ( '\t' | ' ' | '\r' | '\n'| '\u000C' )+     { $channel = HIDDEN; } ; */

fragment DIGIT    : '0'..'9' ;

"""

    grammar().validate()
    antlrGrammarTree = grammar().parseString(text)
    print(antlrGrammarTree.dump())
    pyparsingRules = antlrConverter(antlrGrammarTree)
    pyparsingRule = pyparsingRules["expr"]
    pyparsingTree = pyparsingRule.parseString("2 - 5 * 42 + 7 / 25")
    print(pyparsingTree.dump())