Merged
2 changes: 1 addition & 1 deletion .github/workflows/pythonapp.yml
@@ -23,7 +23,7 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r Requirements.txt
-curl -O https://www.antlr.org/download/antlr-4.11.1-complete.jar
+pip install antlr4-cli
- name: Lint with flake8
run: |
pip install flake8
5 changes: 5 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,11 @@
All notable changes to this project from version 0.4.0 upwards are documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

+## [0.9.0] – Not yet released
+
+### Added
+- `PylasuANTLRParser` class modeled after the Kolasu equivalent
+
## [0.8.1] – 2025-02-21

### Added
2 changes: 1 addition & 1 deletion Requirements.txt
@@ -1,2 +1,2 @@
-antlr4-python3-runtime==4.11.1
+antlr4-python3-runtime==4.13.2
pyecore==0.12.2; extra == 'ecore'
2 changes: 1 addition & 1 deletion pylasu/__init__.py
@@ -1 +1 @@
VERSION = "0.8.1"
VERSION = "0.9.0"
10 changes: 5 additions & 5 deletions pylasu/model/model.py
@@ -133,14 +133,14 @@ def process_annotated_property(cl: type, name: str, decl_type):
fields = dataclasses.fields(cl)
except TypeError:
fields = tuple()
-for field in fields:
-if field.name == name and PYLASU_FEATURE in field.metadata:
-feature = field.metadata[PYLASU_FEATURE]
+for fld in fields:
Comment (Member): Why rename `field` to `fld`?

Reply (Member Author): It wasn't passing the linter because it shadowed an import.

+if fld.name == name and PYLASU_FEATURE in fld.metadata:
+feature = fld.metadata[PYLASU_FEATURE]
feature.name = name
if isinstance(decl_type, type):
feature.type = decl_type
-elif type(field.type) is str:
-feature.type = try_to_resolve_string_type(field.type, name, cl)
+elif type(fld.type) is str:
+feature.type = try_to_resolve_string_type(fld.type, name, cl)
return feature
return compute_feature_from_annotation(cl, name, decl_type)

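For context on the lint failure mentioned above: pyflakes reports F402 ("import shadowed by loop variable") when a loop variable reuses a name imported at module level. A minimal, hypothetical reproduction (the actual shadowed import in model.py may differ):

    from dataclasses import field, fields

    def list_feature_names(cl: type):
        # flake8 reports F402 here: the loop variable shadows the imported `field`
        for field in fields(cl):
            print(field.name)
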
1 change: 1 addition & 0 deletions pylasu/parsing/__init__.py
@@ -0,0 +1 @@
from .results import FirstStageParsingResult, ParsingResultWithFirstStage
Comment (Member): Nice touch!

153 changes: 153 additions & 0 deletions pylasu/parsing/antlr.py
@@ -0,0 +1,153 @@
import time
from abc import abstractmethod
from typing import Optional, List, Union

from antlr4 import CommonTokenStream, InputStream, Lexer, Parser, ParserATNSimulator, ParserRuleContext, \
PredictionContextCache, Recognizer, Token, TokenStream
from antlr4.error.ErrorListener import ErrorListener
from pylasu.model import walk, Source, Position, Node, Point
from pylasu.model.processing import assign_parents
from pylasu.parsing.parse_tree import token_end_point
from pylasu.parsing.results import ParsingResultWithFirstStage, FirstStageParsingResult
from pylasu.validation import Issue, IssueType


class PylasuANTLRParser:
""" A complete description of a multi-stage ANTLR-based parser, from source code to AST.
You should extend this class to implement the parts that are specific to your language.
Note: instances of this class are thread-safe and they're meant to be reused. Do not create a new PylasuANTLRParser
instance every time you need to parse some source code, or performance may suffer."""

Comment (Member): Does this come verbatim from Kolasu, or is this class really thread-safe?

Reply (Member Author): It comes from Kolasu, but I don't see why it shouldn't apply here too.

def __init__(self):
self.prediction_context_cache = PredictionContextCache()

def parse(self, input_stream: Union[InputStream, str], consider_range: bool = True,
measure_lexing_time: bool = False, source: Optional[Source] = None):
"""Parses source code, returning a result that includes an AST and a collection of parse issues
(errors, warnings).
The parsing is done in accordance with the StarLasu methodology, i.e., a first-stage parser builds a parse tree
which is then mapped onto a higher-level tree called the AST.
@param input_stream the source code.
@param consider_range if true (the default), parsed AST nodes record their position in the input text.
@param measure_lexing_time if true, the result will include a measurement of the time spent in lexing, i.e.,
breaking the input stream into tokens.
@param source the Source the code originates from, if any."""
start = time.time_ns()
if type(input_stream) is str:
input_stream = InputStream(input_stream)
first_stage = self.parse_first_stage(input_stream, measure_lexing_time)
issues = first_stage.issues
ast = self.parse_tree_to_ast(first_stage.root, consider_range, issues, source)
self.assign_parents(ast)
ast = self.post_process_ast(ast, issues) if ast else ast
if ast and not consider_range:
# Remove parseTreeNodes because they cause the range to be computed
for node in walk(ast):
node.origin = None
now = time.time_ns()
return ParsingResultWithFirstStage(
issues,
ast,
input_stream.getText(0, input_stream.index + 1),
(now - start) // 1_000_000,
first_stage,
source,
)

def parse_first_stage(
self, input_stream: InputStream, measure_lexing_time: bool = False, source: Source = None
) -> FirstStageParsingResult:
"""Executes only the first stage of the parser, i.e., the production of a parse tree. Usually, you'll want to
use the [parse] method, that returns an AST which is simpler to use and query."""
issues = []
lexing_time: Optional[int] = None
total_time: int = time.time_ns()
parser = self.create_parser(input_stream, issues)
if measure_lexing_time:
token_stream = parser.getInputStream()
if isinstance(token_stream, CommonTokenStream):
lexing_time = time.time_ns()
token_stream.fill()
token_stream.seek(0)
lexing_time = (time.time_ns() - lexing_time) // 1_000_000
root = self.invoke_root_rule(parser)
if root:
self.verify_parse_tree(parser, issues, root)
total_time = (time.time_ns() - total_time) // 1_000_000
return FirstStageParsingResult(issues, root, None, total_time, lexing_time, source)

def create_parser(self, input_stream: InputStream, issues: List[Issue]) -> Parser:
"""Creates the first-stage parser."""
lexer = self.create_antlr_lexer(input_stream)
self.attach_listeners(lexer, issues)
token_stream = self.create_token_stream(lexer)
parser = self.create_antlr_parser(token_stream)
# Assign interpreter to avoid caching DFA states indefinitely across executions
parser._interp = \
ParserATNSimulator(parser, parser.atn, parser._interp.decisionToDFA, self.prediction_context_cache)
self.attach_listeners(parser, issues)
return parser

def invoke_root_rule(self, parser: Parser):
"""Invokes the parser's root rule, i.e., the method which is responsible for parsing the entire input.
Usually this is the topmost rule, the one with index 0 (as also assumed by other libraries such as antlr4-c3),
so this method invokes that rule. If your grammar/parser is structured differently, or if you're using this to
parse only a portion of the input or a subset of the language, you have to override this method to invoke the
correct entry point."""
return getattr(parser, parser.ruleNames[0])()

def verify_parse_tree(self, parser: Parser, issues: List[Issue], root: ParserRuleContext):
"""Checks the parse tree for correctness.
If you're concerned about performance, you may want to override this to do nothing."""
last_token: Token = parser.getTokenStream().get(parser.getTokenStream().index)
if last_token.type != Token.EOF:
issues.append(
Issue(
IssueType.SYNTACTIC,
"The whole input was not consumed",
position=Position(token_end_point(last_token), token_end_point(last_token))
)
)
# TODO Kolasu also traverses the parse tree searching for exceptions
Comment (Member): Is this something we should do here? Should we add a test for this? Or perhaps add an issue to come back to this later?

Reply (Member Author): I'll add an issue.
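A sketch of what that traversal could look like, for reference only (not part of this PR; it assumes the `exception` attribute the ANTLR Python runtime sets on rule contexts that failed to parse):

    def collect_recognition_exceptions(ctx: ParserRuleContext, issues: List[Issue]):
        # ANTLR records the RecognitionException that aborted a rule on the context itself
        if ctx.exception is not None:
            issues.append(Issue(IssueType.SYNTACTIC, "Recognition exception in " + type(ctx).__name__))
        for child in getattr(ctx, "children", None) or []:
            if isinstance(child, ParserRuleContext):
                collect_recognition_exceptions(child, issues)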

def assign_parents(self, ast):
if ast:
assign_parents(ast)

def post_process_ast(self, ast, issues):
return ast

def create_token_stream(self, lexer: Lexer) -> TokenStream:
return CommonTokenStream(lexer)

@abstractmethod
def create_antlr_lexer(self, input_stream: InputStream) -> Lexer:
"""Creates the lexer."""
pass

@abstractmethod
def create_antlr_parser(self, token_stream: TokenStream) -> Parser:
"""Creates the first-stage parser."""
pass

@abstractmethod
def parse_tree_to_ast(self, root, consider_range: bool, issues: List[Issue], source: Source) -> Optional[Node]:
pass

def attach_listeners(self, recognizer: Recognizer, issues: List[Issue]):
recognizer.removeErrorListeners()
recognizer.addErrorListener(ParserErrorListener(issues))


class ParserErrorListener(ErrorListener):
def __init__(self, issues: List[Issue]):
self.issues = issues

def syntaxError(self, recognizer, offending_symbol, line, column, msg, e):
start_point = Point(line, column)
end_point = start_point
if isinstance(offending_symbol, Token):
end_point = token_end_point(offending_symbol)
msg = (msg or "unspecified").capitalize()
self.issues.append(Issue(IssueType.SYNTACTIC, msg, position=Position(start_point, end_point)))
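To make the intended usage concrete, a sketch with hypothetical ANTLR-generated classes (MyLexer, MyGeneratedParser); the SimpleLang test at the bottom of this diff follows the same pattern with real classes:

    class MyLanguageParser(PylasuANTLRParser):
        def create_antlr_lexer(self, input_stream):
            return MyLexer(input_stream)  # hypothetical generated lexer

        def create_antlr_parser(self, token_stream):
            return MyGeneratedParser(token_stream)  # hypothetical generated parser

        def parse_tree_to_ast(self, root, consider_range, issues, source):
            return None  # map the first-stage parse tree onto your AST here

    parser = MyLanguageParser()  # reuse the same instance across parses
    result = parser.parse("some source code")
    print(result.issues)            # issues from both stages, with positions
    print(result.first_stage.time)  # first-stage parsing time, in milliseconds
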
48 changes: 18 additions & 30 deletions pylasu/parsing/results.py
@@ -1,37 +1,25 @@
-from dataclasses import dataclass
-from typing import List
+from dataclasses import dataclass, field
+from typing import List, Optional

-from antlr4 import ParserRuleContext, Token
-
-from pylasu.model import Source
-from pylasu.validation.validation import WithIssues, IssueType, Issue
+from pylasu.model import Source, Node
+from pylasu.validation.validation import Issue


@dataclass
-class FirstStageResult(WithIssues):
-parse_tree: ParserRuleContext
+class FirstStageParsingResult:
+issues: List[Issue]
+root: Optional[Node]
+code: Optional[str] = None
+time: int = None
+lexing_time: int = None
+source: Source = None


@dataclass
-class LexingResult(WithIssues):
-tokens: List[Token]

Comment (Member): Don't we need also the LexingResult, in case just lexing is invoked?

Reply (Member Author): We'll need it, but it has a different structure in Kolasu, and at the moment it's not used (because we don't have the lex methods). I'll open an issue about that too.

-
-
-@dataclass
-class IssuesErrorListener:
-"""This Error Listener should be used with ANTLR lexers and parsers to capture issues"""
-type: IssueType
-source: Source
-issues: WithIssues
-
-def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
-self.issues.append(Issue(type=self.type, message=msg))
-
-def reportAmbiguity(self, recognizer, dfa, startIndex, stopIndex, exact, ambigAlts, configs):
-pass
-
-def reportAttemptingFullContext(self, recognizer, dfa, startIndex, stopIndex, conflictingAlts, configs):
-pass
-
-def reportContextSensitivity(self, recognizer, dfa, startIndex, stopIndex, prediction, configs):
-pass
+class ParsingResultWithFirstStage:
+issues: List[Issue] = field(default_factory=list)
+root: Node = None
+code: str = None
+time: int = None
+first_stage: FirstStageParsingResult = None
+source: Source = None
2 changes: 1 addition & 1 deletion tests/SimpleLangParser.g4
@@ -3,7 +3,7 @@ parser grammar SimpleLangParser;
options { tokenVocab = SimpleLangLexer; }

compilationUnit:
-statement+;
+statement+ EOF;

statement:
DISPLAY expression #displayStmt
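Appending EOF makes the generated parser consume the whole input: with plain `statement+`, parsing stops at the first token that cannot start a statement and the rest of the input is silently ignored, which is the situation `verify_parse_tree` above reports as "The whole input was not consumed". A sketch of the effect, assuming the SimpleLang test parser defined below:

    parser = SimpleLangPylasuParser()
    result = parser.parse("display c = 10")  # "= 10" cannot start a statement
    assert result.issues  # with EOF in the grammar, the leftover input yields a syntax issue
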
6 changes: 3 additions & 3 deletions tests/generate-test-parsers.sh
@@ -1,3 +1,3 @@
-java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o simple_lang SimpleLangLexer.g4 SimpleLangParser.g4
-java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o antlr_entity AntlrEntityLexer.g4 AntlrEntityParser.g4
-java -cp ../antlr-4.11.1-complete.jar org.antlr.v4.Tool -Dlanguage=Python3 -visitor -o antlr_script AntlrScriptLexer.g4 AntlrScriptParser.g4
+antlr4 -Dlanguage=Python3 -visitor -o simple_lang SimpleLangLexer.g4 SimpleLangParser.g4
+antlr4 -Dlanguage=Python3 -visitor -o antlr_entity AntlrEntityLexer.g4 AntlrEntityParser.g4
+antlr4 -Dlanguage=Python3 -visitor -o antlr_script AntlrScriptLexer.g4 AntlrScriptParser.g4
Empty file added tests/parsing/__init__.py
53 changes: 53 additions & 0 deletions tests/parsing/test_pylasu_antlr_parser.py
@@ -0,0 +1,53 @@
import unittest
from typing import List, Optional

from antlr4 import TokenStream, InputStream

from pylasu.model import Source, Node, Position, Point
from pylasu.parsing.antlr import PylasuANTLRParser
from pylasu.validation import Issue
from tests.simple_lang.SimpleLangLexer import SimpleLangLexer
from tests.simple_lang.SimpleLangParser import SimpleLangParser


class SimpleLangPylasuParser(PylasuANTLRParser):

def create_antlr_lexer(self, input_stream: InputStream):
return SimpleLangLexer(input_stream)

def create_antlr_parser(self, token_stream: TokenStream):
return SimpleLangParser(token_stream)

def parse_tree_to_ast(self, root, consider_range: bool, issues: List[Issue], source: Source) -> Optional[Node]:
return None


class KolasuParserTest(unittest.TestCase):
def test_lexing(self):
parser = SimpleLangPylasuParser()
result = parser.parse("""set a = 10
set b = ""
display c
""")
self.assertIsNotNone(result)
# TODO we don't have Pylasu Tokens yet

def test_issues_are_capitalized(self):
parser = SimpleLangPylasuParser()
result = parser.parse("""set set a = 10
display c
""")
self.assertTrue(result.issues)
self.assertTrue([i for i in result.issues if i.message.startswith("Extraneous input 'set'")])
self.assertTrue([i for i in result.issues if i.message.startswith("Mismatched input 'c'")])

def test_issues_have_not_flat_position(self):
parser = SimpleLangPylasuParser()
result = parser.parse("""set set a = 10
display c
""")
self.assertTrue(result.issues)
extraneous_input = [i for i in result.issues if i.message.startswith("Extraneous input 'set'")][0]
self.assertEqual(Position(Point(1, 4), Point(1, 7)), extraneous_input.position)
mismatched_input = [i for i in result.issues if i.message.startswith("Mismatched input 'c'")][0]
self.assertEqual(Position(Point(2, 8), Point(2, 9)), mismatched_input.position)
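For reference on the expected positions: ANTLR reports 1-based lines and 0-based columns, and the Points built in ParserErrorListener preserve them. In "set set a = 10", the second set starts at character column 4 and spans three characters:

    text = "set set a = 10"
    assert text.index("set", 1) == 4   # second "set" starts at 0-based column 4
    assert 4 + len("set") == 7         # and ends at column 7
    # hence Position(Point(1, 4), Point(1, 7)) for the "Extraneous input 'set'" issue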