Skip to content

Commit ffba370

Browse files
Merge pull request #42 from Strumenta/feature/pylasu-antlr-parser
Introduce a PylasuANTLRParser class modeled after Kolasu.
2 parents 133dadc + f4b83d0 commit ffba370

File tree

12 files changed

+242
-42
lines changed

12 files changed

+242
-42
lines changed

.github/workflows/pythonapp.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ jobs:
2323
run: |
2424
python -m pip install --upgrade pip
2525
pip install -r Requirements.txt
26-
curl -O https://www.antlr.org/download/antlr-4.11.1-complete.jar
26+
pip install antlr4-cli
2727
- name: Lint with flake8
2828
run: |
2929
pip install flake8

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
All notable changes to this project from version 0.4.0 upwards are documented in this file.
33
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
44

5+
## [0.9.0] – Not yet released
6+
7+
### Added
8+
- `PylasuANTLRParser` class modeled after the Kolasu equivalent
9+
510
## [0.8.1] – 2025-02-21
611

712
### Added

Requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
1-
antlr4-python3-runtime==4.11.1
1+
antlr4-python3-runtime==4.13.2
22
pyecore==0.12.2; extra == 'ecore'

pylasu/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
VERSION = "0.8.1"
1+
VERSION = "0.9.0"

pylasu/model/model.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -133,14 +133,14 @@ def process_annotated_property(cl: type, name: str, decl_type):
133133
fields = dataclasses.fields(cl)
134134
except TypeError:
135135
fields = tuple()
136-
for field in fields:
137-
if field.name == name and PYLASU_FEATURE in field.metadata:
138-
feature = field.metadata[PYLASU_FEATURE]
136+
for fld in fields:
137+
if fld.name == name and PYLASU_FEATURE in fld.metadata:
138+
feature = fld.metadata[PYLASU_FEATURE]
139139
feature.name = name
140140
if isinstance(decl_type, type):
141141
feature.type = decl_type
142-
elif type(field.type) is str:
143-
feature.type = try_to_resolve_string_type(field.type, name, cl)
142+
elif type(fld.type) is str:
143+
feature.type = try_to_resolve_string_type(fld.type, name, cl)
144144
return feature
145145
return compute_feature_from_annotation(cl, name, decl_type)
146146

pylasu/parsing/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .results import FirstStageParsingResult, ParsingResultWithFirstStage

pylasu/parsing/antlr.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
import time
2+
from abc import abstractmethod
3+
from typing import Optional, List, Union
4+
5+
from antlr4 import CommonTokenStream, InputStream, Lexer, Parser, ParserATNSimulator, ParserRuleContext, \
6+
PredictionContextCache, Recognizer, Token, TokenStream
7+
from antlr4.error.ErrorListener import ErrorListener
8+
from pylasu.model import walk, Source, Position, Node, Point
9+
from pylasu.model.processing import assign_parents
10+
from pylasu.parsing.parse_tree import token_end_point
11+
from pylasu.parsing.results import ParsingResultWithFirstStage, FirstStageParsingResult
12+
from pylasu.validation import Issue, IssueType
13+
14+
15+
class PylasuANTLRParser:
    """A complete description of a multi-stage ANTLR-based parser, from source code to AST.

    You should extend this class to implement the parts that are specific to your language.

    Note: instances of this class are thread-safe and they're meant to be reused. Do not create a new
    PylasuANTLRParser instance every time you need to parse some source code, or performance may suffer."""

    def __init__(self):
        # Shared across all parses performed by this instance, so the ANTLR prediction
        # (ATN simulation) results can be reused between runs.
        self.prediction_context_cache = PredictionContextCache()

    def parse(self, input_stream: Union[InputStream, str], consider_range: bool = True,
              measure_lexing_time: bool = False, source: Optional[Source] = None):
        """Parse source code, returning a result that includes an AST and a collection of parse issues
        (errors, warnings).

        The parsing is done in accordance to the StarLasu methodology, i.e., a first-stage parser builds
        a parse tree which is then mapped onto a higher-level tree called the AST.

        :param input_stream: the source code, as a string or an ANTLR ``InputStream``.
        :param consider_range: if True (the default), parsed AST nodes record their position in the input text.
        :param measure_lexing_time: if True, the result will include a measurement of the time spent in
            lexing, i.e., breaking the input stream into tokens.
        :param source: the source the code comes from, if any; propagated to the parsing results."""
        start = time.time_ns()
        if isinstance(input_stream, str):
            input_stream = InputStream(input_stream)
        # Forward `source` so the first-stage result records where the code came from.
        first_stage = self.parse_first_stage(input_stream, measure_lexing_time, source)
        issues = first_stage.issues
        ast = self.parse_tree_to_ast(first_stage.root, consider_range, issues, source)
        self.assign_parents(ast)
        ast = self.post_process_ast(ast, issues) if ast else ast
        if ast and not consider_range:
            # Remove parseTreeNodes because they cause the range to be computed
            for node in walk(ast):
                node.origin = None
        now = time.time_ns()
        return ParsingResultWithFirstStage(
            issues,
            ast,
            input_stream.getText(0, input_stream.index + 1),
            (now - start) // 1_000_000,  # elapsed wall-clock time in milliseconds
            first_stage,
            source,
        )

    def parse_first_stage(
        self, input_stream: InputStream, measure_lexing_time: bool = False,
        source: Optional[Source] = None
    ) -> FirstStageParsingResult:
        """Executes only the first stage of the parser, i.e., the production of a parse tree. Usually, you'll
        want to use the [parse] method, that returns an AST which is simpler to use and query.

        :param input_stream: the source code as an ANTLR ``InputStream``.
        :param measure_lexing_time: if True, tokenize eagerly first so lexing time can be reported separately.
        :param source: the source the code comes from, if any; recorded in the result."""
        issues = []
        lexing_time: Optional[int] = None
        total_time: int = time.time_ns()
        parser = self.create_parser(input_stream, issues)
        if measure_lexing_time:
            token_stream = parser.getInputStream()
            if isinstance(token_stream, CommonTokenStream):
                lexing_time = time.time_ns()
                # Fill the token stream up-front (then rewind) so the measurement
                # covers lexing only, not interleaved lexing during parsing.
                token_stream.fill()
                token_stream.seek(0)
                lexing_time = (time.time_ns() - lexing_time) // 1_000_000
        root = self.invoke_root_rule(parser)
        if root:
            self.verify_parse_tree(parser, issues, root)
        total_time = (time.time_ns() - total_time) // 1_000_000
        return FirstStageParsingResult(issues, root, None, total_time, lexing_time, source)

    def create_parser(self, input_stream: InputStream, issues: List[Issue]) -> Parser:
        """Creates the first-stage parser, wiring lexer, token stream and error listeners."""
        lexer = self.create_antlr_lexer(input_stream)
        self.attach_listeners(lexer, issues)
        token_stream = self.create_token_stream(lexer)
        parser = self.create_antlr_parser(token_stream)
        # Assign interpreter to avoid caching DFA states indefinitely across executions
        parser._interp = \
            ParserATNSimulator(parser, parser.atn, parser._interp.decisionToDFA, self.prediction_context_cache)
        self.attach_listeners(parser, issues)
        return parser

    def invoke_root_rule(self, parser: Parser):
        """Invokes the parser's root rule, i.e., the method which is responsible for parsing the entire input.
        Usually this is the topmost rule, the one with index 0 (as also assumed by other libraries such as
        antlr4-c3), so this method invokes that rule. If your grammar/parser is structured differently, or if
        you're using this to parse only a portion of the input or a subset of the language, you have to
        override this method to invoke the correct entry point."""
        return getattr(parser, parser.ruleNames[0])()

    def verify_parse_tree(self, parser: Parser, issues: List[Issue], root: ParserRuleContext):
        """Checks the parse tree for correctness.
        If you're concerned about performance, you may want to override this to do nothing."""
        token_stream = parser.getTokenStream()
        last_token: Token = token_stream.get(token_stream.index)
        if last_token.type != Token.EOF:
            issues.append(
                Issue(
                    IssueType.SYNTACTIC,
                    "The whole input was not consumed",
                    position=Position(token_end_point(last_token), token_end_point(last_token))
                )
            )
        # TODO Kolasu also traverses the parse tree searching for exceptions

    def assign_parents(self, ast):
        """Sets the parent link on every node of the AST, if there is one."""
        if ast:
            assign_parents(ast)

    def post_process_ast(self, ast, issues):
        """Hook for subclasses: transform/enrich the AST after parenting. Default is a no-op."""
        return ast

    def create_token_stream(self, lexer: Lexer) -> TokenStream:
        """Creates the token stream fed to the parser. Override to filter/augment tokens."""
        return CommonTokenStream(lexer)

    @abstractmethod
    def create_antlr_lexer(self, input_stream: InputStream) -> Lexer:
        """Creates the lexer."""
        pass

    @abstractmethod
    def create_antlr_parser(self, token_stream: TokenStream) -> Parser:
        """Creates the first-stage parser."""
        pass

    @abstractmethod
    def parse_tree_to_ast(self, root, consider_range: bool, issues: List[Issue], source: Source) -> Optional[Node]:
        """Maps the first-stage parse tree onto the AST, appending any mapping issues to `issues`."""
        pass

    def attach_listeners(self, recognizer: Recognizer, issues: List[Issue]):
        """Replaces the default ANTLR console listeners with one that collects issues."""
        recognizer.removeErrorListeners()
        recognizer.addErrorListener(ParserErrorListener(issues))
141+
142+
143+
class ParserErrorListener(ErrorListener):
    """ANTLR error listener that records every syntax error as a Pylasu `Issue`."""

    def __init__(self, issues: List[Issue]):
        # Shared list into which reported errors are accumulated.
        self.issues = issues

    def syntaxError(self, recognizer, offending_symbol, line, column, msg, e):
        begin = Point(line, column)
        # When ANTLR hands us the offending token, stretch the position to cover it entirely;
        # otherwise report a zero-width position at the error location.
        end = token_end_point(offending_symbol) if isinstance(offending_symbol, Token) else begin
        message = (msg or "unspecified").capitalize()
        self.issues.append(Issue(IssueType.SYNTACTIC, message, position=Position(begin, end)))

pylasu/parsing/results.py

Lines changed: 18 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,37 +1,25 @@
1-
from dataclasses import dataclass
2-
from typing import List
1+
from dataclasses import dataclass, field
2+
from typing import List, Optional
33

4-
from antlr4 import ParserRuleContext, Token
5-
6-
from pylasu.model import Source
7-
from pylasu.validation.validation import WithIssues, IssueType, Issue
8-
9-
10-
@dataclass
11-
class FirstStageResult(WithIssues):
12-
parse_tree: ParserRuleContext
4+
from pylasu.model import Source, Node
5+
from pylasu.validation.validation import Issue
136

147

158
@dataclass
class FirstStageParsingResult:
    """The result of the first parsing stage: the parse tree, plus issues and timing information."""
    issues: List[Issue]
    root: Optional[Node]
    code: Optional[str] = None
    # Total first-stage time in milliseconds, when measured.
    time: Optional[int] = None
    # Time spent lexing in milliseconds, when measured separately.
    lexing_time: Optional[int] = None
    source: Optional[Source] = None
1816

1917

2018
@dataclass
class ParsingResultWithFirstStage:
    """The overall result of parsing: the AST and issues, plus the first-stage (parse tree) result."""
    issues: List[Issue] = field(default_factory=list)
    root: Optional[Node] = None
    code: Optional[str] = None
    # Total parse time in milliseconds.
    time: Optional[int] = None
    first_stage: Optional["FirstStageParsingResult"] = None
    source: Optional[Source] = None

tests/SimpleLangParser.g4

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ parser grammar SimpleLangParser;
33
options { tokenVocab = SimpleLangLexer; }
44

55
compilationUnit:
6-
statement+;
6+
statement+ EOF;
77

88
statement:
99
DISPLAY expression #displayStmt

tests/generate-test-parsers.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o simple_lang SimpleLangLexer.g4 SimpleLangParser.g4
2-
java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o antlr_entity AntlrEntityLexer.g4 AntlrEntityParser.g4
3-
java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o antlr_script AntlrScriptLexer.g4 AntlrScriptParser.g4
1+
antlr4 -Dlanguage=Python3 -visitor -o simple_lang SimpleLangLexer.g4 SimpleLangParser.g4
2+
antlr4 -Dlanguage=Python3 -visitor -o antlr_entity AntlrEntityLexer.g4 AntlrEntityParser.g4
3+
antlr4 -Dlanguage=Python3 -visitor -o antlr_script AntlrScriptLexer.g4 AntlrScriptParser.g4

0 commit comments

Comments
 (0)