Skip to content

Commit ffba370

Browse files
Merge pull request #42 from Strumenta/feature/pylasu-antlr-parser
Introduce a PylasuANTLRParser class modeled after Kolasu.
2 parents 133dadc + f4b83d0 commit ffba370

File tree

12 files changed

+242
-42
lines changed

12 files changed

+242
-42
lines changed

.github/workflows/pythonapp.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323
run: |
2424
python -m pip install --upgrade pip
2525
pip install -r Requirements.txt
26-
curl -O https://www.antlr.org/download/antlr-4.11.1-complete.jar
26+
pip install antlr4-cli
2727
- name: Lint with flake8
2828
run: |
2929
pip install flake8

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
All notable changes to this project from version 0.4.0 upwards are documented in this file.
33
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
44

5+
## [0.9.0] – Not yet released
6+
7+
### Added
8+
- `PylasuANTLRParser` class modeled after the Kolasu equivalent
9+
510
## [0.8.1] – 2025-02-21
611

712
### Added

Requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
antlr4-python3-runtime==4.11.1
1+
antlr4-python3-runtime==4.13.2
22
pyecore==0.12.2; extra == 'ecore'

pylasu/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
VERSION = "0.8.1"
1+
VERSION = "0.9.0"

pylasu/model/model.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -133,14 +133,14 @@ def process_annotated_property(cl: type, name: str, decl_type):
133133
fields = dataclasses.fields(cl)
134134
except TypeError:
135135
fields = tuple()
136-
for field in fields:
137-
if field.name == name and PYLASU_FEATURE in field.metadata:
138-
feature = field.metadata[PYLASU_FEATURE]
136+
for fld in fields:
137+
if fld.name == name and PYLASU_FEATURE in fld.metadata:
138+
feature = fld.metadata[PYLASU_FEATURE]
139139
feature.name = name
140140
if isinstance(decl_type, type):
141141
feature.type = decl_type
142-
elif type(field.type) is str:
143-
feature.type = try_to_resolve_string_type(field.type, name, cl)
142+
elif type(fld.type) is str:
143+
feature.type = try_to_resolve_string_type(fld.type, name, cl)
144144
return feature
145145
return compute_feature_from_annotation(cl, name, decl_type)
146146

pylasu/parsing/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .results import FirstStageParsingResult, ParsingResultWithFirstStage

pylasu/parsing/antlr.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
import time
2+
from abc import abstractmethod
3+
from typing import Optional, List, Union
4+
5+
from antlr4 import CommonTokenStream, InputStream, Lexer, Parser, ParserATNSimulator, ParserRuleContext, \
6+
PredictionContextCache, Recognizer, Token, TokenStream
7+
from antlr4.error.ErrorListener import ErrorListener
8+
from pylasu.model import walk, Source, Position, Node, Point
9+
from pylasu.model.processing import assign_parents
10+
from pylasu.parsing.parse_tree import token_end_point
11+
from pylasu.parsing.results import ParsingResultWithFirstStage, FirstStageParsingResult
12+
from pylasu.validation import Issue, IssueType
13+
14+
15+
class PylasuANTLRParser:
    """A complete description of a multi-stage ANTLR-based parser, from source code to AST.

    You should extend this class to implement the parts that are specific to your language.

    Note: instances of this class are thread-safe and they're meant to be reused. Do not create a new
    PylasuANTLRParser instance every time you need to parse some source code, or performance may suffer."""

    def __init__(self):
        # Shared across all parses performed by this instance, so the ANTLR prediction
        # (ATN simulation) results can be reused between runs.
        self.prediction_context_cache = PredictionContextCache()

    def parse(self, input_stream: Union[InputStream, str], consider_range: bool = True,
              measure_lexing_time: bool = False, source: Optional[Source] = None):
        """Parse source code, returning a result that includes an AST and a collection of parse issues
        (errors, warnings).

        The parsing is done in accordance to the StarLasu methodology, i.e., a first-stage parser builds
        a parse tree which is then mapped onto a higher-level tree called the AST.

        :param input_stream: the source code, as a string or an ANTLR ``InputStream``.
        :param consider_range: if True (the default), parsed AST nodes record their position in the input text.
        :param measure_lexing_time: if True, the result will include a measurement of the time spent in
            lexing, i.e., breaking the input stream into tokens.
        :param source: the source the code comes from, if any; propagated to the parsing results."""
        start = time.time_ns()
        if isinstance(input_stream, str):
            input_stream = InputStream(input_stream)
        # Forward `source` so the first-stage result records where the code came from.
        first_stage = self.parse_first_stage(input_stream, measure_lexing_time, source)
        issues = first_stage.issues
        ast = self.parse_tree_to_ast(first_stage.root, consider_range, issues, source)
        self.assign_parents(ast)
        ast = self.post_process_ast(ast, issues) if ast else ast
        if ast and not consider_range:
            # Remove parseTreeNodes because they cause the range to be computed
            for node in walk(ast):
                node.origin = None
        now = time.time_ns()
        return ParsingResultWithFirstStage(
            issues,
            ast,
            input_stream.getText(0, input_stream.index + 1),
            (now - start) // 1_000_000,  # elapsed wall-clock time in milliseconds
            first_stage,
            source,
        )

    def parse_first_stage(
        self, input_stream: InputStream, measure_lexing_time: bool = False,
        source: Optional[Source] = None
    ) -> FirstStageParsingResult:
        """Executes only the first stage of the parser, i.e., the production of a parse tree. Usually, you'll
        want to use the [parse] method, that returns an AST which is simpler to use and query.

        :param input_stream: the source code as an ANTLR ``InputStream``.
        :param measure_lexing_time: if True, tokenize eagerly first so lexing time can be reported separately.
        :param source: the source the code comes from, if any; recorded in the result."""
        issues = []
        lexing_time: Optional[int] = None
        total_time: int = time.time_ns()
        parser = self.create_parser(input_stream, issues)
        if measure_lexing_time:
            token_stream = parser.getInputStream()
            if isinstance(token_stream, CommonTokenStream):
                lexing_time = time.time_ns()
                # Fill the token stream up-front (then rewind) so the measurement
                # covers lexing only, not interleaved lexing during parsing.
                token_stream.fill()
                token_stream.seek(0)
                lexing_time = (time.time_ns() - lexing_time) // 1_000_000
        root = self.invoke_root_rule(parser)
        if root:
            self.verify_parse_tree(parser, issues, root)
        total_time = (time.time_ns() - total_time) // 1_000_000
        return FirstStageParsingResult(issues, root, None, total_time, lexing_time, source)

    def create_parser(self, input_stream: InputStream, issues: List[Issue]) -> Parser:
        """Creates the first-stage parser, wiring lexer, token stream and error listeners."""
        lexer = self.create_antlr_lexer(input_stream)
        self.attach_listeners(lexer, issues)
        token_stream = self.create_token_stream(lexer)
        parser = self.create_antlr_parser(token_stream)
        # Assign interpreter to avoid caching DFA states indefinitely across executions
        parser._interp = \
            ParserATNSimulator(parser, parser.atn, parser._interp.decisionToDFA, self.prediction_context_cache)
        self.attach_listeners(parser, issues)
        return parser

    def invoke_root_rule(self, parser: Parser):
        """Invokes the parser's root rule, i.e., the method which is responsible for parsing the entire input.
        Usually this is the topmost rule, the one with index 0 (as also assumed by other libraries such as
        antlr4-c3), so this method invokes that rule. If your grammar/parser is structured differently, or if
        you're using this to parse only a portion of the input or a subset of the language, you have to
        override this method to invoke the correct entry point."""
        return getattr(parser, parser.ruleNames[0])()

    def verify_parse_tree(self, parser: Parser, issues: List[Issue], root: ParserRuleContext):
        """Checks the parse tree for correctness.
        If you're concerned about performance, you may want to override this to do nothing."""
        token_stream = parser.getTokenStream()
        last_token: Token = token_stream.get(token_stream.index)
        if last_token.type != Token.EOF:
            issues.append(
                Issue(
                    IssueType.SYNTACTIC,
                    "The whole input was not consumed",
                    position=Position(token_end_point(last_token), token_end_point(last_token))
                )
            )
        # TODO Kolasu also traverses the parse tree searching for exceptions

    def assign_parents(self, ast):
        """Sets the parent link on every node of the AST, if there is one."""
        if ast:
            assign_parents(ast)

    def post_process_ast(self, ast, issues):
        """Hook for subclasses: transform/enrich the AST after parenting. Default is a no-op."""
        return ast

    def create_token_stream(self, lexer: Lexer) -> TokenStream:
        """Creates the token stream fed to the parser. Override to filter/augment tokens."""
        return CommonTokenStream(lexer)

    @abstractmethod
    def create_antlr_lexer(self, input_stream: InputStream) -> Lexer:
        """Creates the lexer."""
        pass

    @abstractmethod
    def create_antlr_parser(self, token_stream: TokenStream) -> Parser:
        """Creates the first-stage parser."""
        pass

    @abstractmethod
    def parse_tree_to_ast(self, root, consider_range: bool, issues: List[Issue], source: Source) -> Optional[Node]:
        """Maps the first-stage parse tree onto the AST, appending any mapping issues to `issues`."""
        pass

    def attach_listeners(self, recognizer: Recognizer, issues: List[Issue]):
        """Replaces the default ANTLR console listeners with one that collects issues."""
        recognizer.removeErrorListeners()
        recognizer.addErrorListener(ParserErrorListener(issues))
141+
142+
143+
class ParserErrorListener(ErrorListener):
    """ANTLR error listener that records every syntax error as a Pylasu `Issue`."""

    def __init__(self, issues: List[Issue]):
        # Shared list into which reported errors are accumulated.
        self.issues = issues

    def syntaxError(self, recognizer, offending_symbol, line, column, msg, e):
        begin = Point(line, column)
        # When ANTLR hands us the offending token, stretch the position to cover it entirely;
        # otherwise report a zero-width position at the error location.
        end = token_end_point(offending_symbol) if isinstance(offending_symbol, Token) else begin
        message = (msg or "unspecified").capitalize()
        self.issues.append(Issue(IssueType.SYNTACTIC, message, position=Position(begin, end)))

pylasu/parsing/results.py

Lines changed: 18 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,25 @@
1-
from dataclasses import dataclass
2-
from typing import List
1+
from dataclasses import dataclass, field
2+
from typing import List, Optional
33

4-
from antlr4 import ParserRuleContext, Token
5-
6-
from pylasu.model import Source
7-
from pylasu.validation.validation import WithIssues, IssueType, Issue
8-
9-
10-
@dataclass
11-
class FirstStageResult(WithIssues):
12-
parse_tree: ParserRuleContext
4+
from pylasu.model import Source, Node
5+
from pylasu.validation.validation import Issue
136

147

158
@dataclass
class FirstStageParsingResult:
    """The result of the first parsing stage: the parse tree, plus issues and timing information."""
    issues: List[Issue]
    root: Optional[Node]
    code: Optional[str] = None
    # Total first-stage time in milliseconds, when measured.
    time: Optional[int] = None
    # Time spent lexing in milliseconds, when measured separately.
    lexing_time: Optional[int] = None
    source: Optional[Source] = None
1816

1917

2018
@dataclass
class ParsingResultWithFirstStage:
    """The overall result of parsing: the AST and issues, plus the first-stage (parse tree) result."""
    issues: List[Issue] = field(default_factory=list)
    root: Optional[Node] = None
    code: Optional[str] = None
    # Total parse time in milliseconds.
    time: Optional[int] = None
    first_stage: Optional["FirstStageParsingResult"] = None
    source: Optional[Source] = None

tests/SimpleLangParser.g4

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ parser grammar SimpleLangParser;
33
options { tokenVocab = SimpleLangLexer; }
44

55
compilationUnit:
6-
statement+;
6+
statement+ EOF;
77

88
statement:
99
DISPLAY expression #displayStmt

tests/generate-test-parsers.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o simple_lang SimpleLangLexer.g4 SimpleLangParser.g4
2-
java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o antlr_entity AntlrEntityLexer.g4 AntlrEntityParser.g4
3-
java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o antlr_script AntlrScriptLexer.g4 AntlrScriptParser.g4
1+
antlr4 -Dlanguage=Python3 -visitor -o simple_lang SimpleLangLexer.g4 SimpleLangParser.g4
2+
antlr4 -Dlanguage=Python3 -visitor -o antlr_entity AntlrEntityLexer.g4 AntlrEntityParser.g4
3+
antlr4 -Dlanguage=Python3 -visitor -o antlr_script AntlrScriptLexer.g4 AntlrScriptParser.g4

0 commit comments

Comments
 (0)