Skip to content

Commit e09a422

Browse files
committed
add antlr test case, corpus, results
1 parent e224ba7 commit e09a422

File tree

7 files changed

+2416
-96
lines changed

7 files changed

+2416
-96
lines changed

corpus/antlr4/samples/ANTLRv4Lexer.g4

Lines changed: 331 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,331 @@
1+
/*
2+
* [The "BSD license"]
3+
* Copyright (c) 2014 Terence Parr
4+
* Copyright (c) 2014 Sam Harwell
5+
* All rights reserved.
6+
*
7+
* Redistribution and use in source and binary forms, with or without
8+
* modification, are permitted provided that the following conditions
9+
* are met:
10+
*
11+
* 1. Redistributions of source code must retain the above copyright
12+
* notice, this list of conditions and the following disclaimer.
13+
* 2. Redistributions in binary form must reproduce the above copyright
14+
* notice, this list of conditions and the following disclaimer in the
15+
* documentation and/or other materials provided with the distribution.
16+
* 3. The name of the author may not be used to endorse or promote products
17+
* derived from this software without specific prior written permission.
18+
*
19+
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20+
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21+
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22+
* IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23+
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24+
* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25+
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26+
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27+
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28+
* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29+
*/
30+
31+
/** A grammar for ANTLR v4 tokens */
32+
lexer grammar ANTLRv4Lexer;
33+
34+
tokens {
35+
TOKEN_REF,
36+
RULE_REF,
37+
LEXER_CHAR_SET
38+
}
39+
40+
@members {
41+
/** Track whether we are inside of a rule and whether it is a lexer or parser rule.
42+
* _currentRuleType==Token.INVALID_TYPE means that we are outside of a rule.
43+
* At the first sign of a rule name reference and _currentRuleType==invalid,
44+
* we can assume that we are starting a parser rule. Similarly, seeing
45+
* a token reference when not already in rule means starting a token
46+
* rule. The terminating ';' of a rule, flips this back to invalid type.
47+
*
48+
* This is not perfect logic but works. For example, "grammar T;" means
49+
* that we start and stop a lexical rule for the "T;". Dangerous but works.
50+
*
51+
* The whole point of this state information is to distinguish
52+
* between [..arg actions..] and [charsets]. Char sets can only occur in
53+
* lexical rules, and arg actions cannot occur there.
54+
*/
55+
private int _currentRuleType = Token.INVALID_TYPE;
56+
57+
public int getCurrentRuleType() {
58+
return _currentRuleType;
59+
}
60+
61+
public void setCurrentRuleType(int ruleType) {
62+
this._currentRuleType = ruleType;
63+
}
64+
65+
/** Invoked by the BEGIN_ARG_ACTION rule's action after it matches '['.
 *  Pushes the LexerCharSet mode when inLexerRule() is true, otherwise the
 *  ArgAction mode; both branches then call more().
 */
protected void handleBeginArgAction() {
66+
if (inLexerRule()) {
67+
pushMode(LexerCharSet);
68+
more();
69+
}
70+
else {
71+
pushMode(ArgAction);
72+
more();
73+
}
74+
}
75+
76+
/** Updates _currentRuleType from the token about to be emitted, then
 *  delegates to super.emit(). A TOKEN_REF or RULE_REF seen while
 *  _currentRuleType is Token.INVALID_TYPE records that type; a SEMI
 *  resets it to Token.INVALID_TYPE. Any other token type leaves it unchanged.
 */
@Override
77+
public Token emit() {
78+
if (_type == TOKEN_REF || _type==RULE_REF ) {
79+
if (_currentRuleType == Token.INVALID_TYPE) { // if outside of rule def
80+
_currentRuleType = _type; // set to inside lexer or parser rule
81+
}
82+
}
83+
else if (_type == SEMI) { // exit rule def
84+
_currentRuleType = Token.INVALID_TYPE;
85+
}
86+
87+
return super.emit();
88+
}
89+
90+
private boolean inLexerRule() {
91+
return _currentRuleType == TOKEN_REF;
92+
}
93+
private boolean inParserRule() { // not used, but added for clarity
94+
return _currentRuleType == RULE_REF;
95+
}
96+
}
97+
98+
DOC_COMMENT
99+
: '/**' .*? ('*/' | EOF)
100+
;
101+
102+
BLOCK_COMMENT
103+
: '/*' .*? ('*/' | EOF) -> channel(HIDDEN)
104+
;
105+
106+
LINE_COMMENT
107+
: '//' ~[\r\n]* -> channel(HIDDEN)
108+
;
109+
110+
BEGIN_ARG_ACTION
111+
: '[' {handleBeginArgAction();}
112+
;
113+
114+
// OPTIONS and TOKENS must also consume the opening brace that captures
115+
// their option block, as this is the easiest way to parse it separate
116+
// to an ACTION block, despite it using the same {} delimiters.
117+
//
118+
OPTIONS : 'options' [ \t\f\n\r]* '{' ;
119+
TOKENS : 'tokens' [ \t\f\n\r]* '{' ;
120+
CHANNELS : 'channels' [ \t\f\n\r]* '{' ;
121+
122+
IMPORT : 'import' ;
123+
FRAGMENT : 'fragment' ;
124+
LEXER : 'lexer' ;
125+
PARSER : 'parser' ;
126+
GRAMMAR : 'grammar' ;
127+
PROTECTED : 'protected' ;
128+
PUBLIC : 'public' ;
129+
PRIVATE : 'private' ;
130+
RETURNS : 'returns' ;
131+
LOCALS : 'locals' ;
132+
THROWS : 'throws' ;
133+
CATCH : 'catch' ;
134+
FINALLY : 'finally' ;
135+
MODE : 'mode' ;
136+
137+
COLON : ':' ;
138+
COLONCOLON : '::' ;
139+
COMMA : ',' ;
140+
SEMI : ';' ;
141+
LPAREN : '(' ;
142+
RPAREN : ')' ;
143+
RARROW : '->' ;
144+
LT : '<' ;
145+
GT : '>' ;
146+
ASSIGN : '=' ;
147+
QUESTION : '?' ;
148+
STAR : '*' ;
149+
PLUS : '+' ;
150+
PLUS_ASSIGN : '+=' ;
151+
OR : '|' ;
152+
DOLLAR : '$' ;
153+
DOT : '.' ;
154+
RANGE : '..' ;
155+
AT : '@' ;
156+
POUND : '#' ;
157+
NOT : '~' ;
158+
RBRACE : '}' ;
159+
160+
/** Allow unicode rule/token names */
161+
//ID : NameStartChar NameChar*;
162+
// ##################### to allow testing ANTLR grammars in intellij preview
163+
RULE_REF : [a-z][a-zA-Z_0-9]* ;
164+
TOKEN_REF : [A-Z][a-zA-Z_0-9]* ;
165+
166+
167+
fragment
168+
NameChar
169+
: NameStartChar
170+
| '0'..'9'
171+
| '_'
172+
| '\u00B7'
173+
| '\u0300'..'\u036F'
174+
| '\u203F'..'\u2040'
175+
;
176+
177+
fragment
178+
NameStartChar
179+
: 'A'..'Z'
180+
| 'a'..'z'
181+
| '\u00C0'..'\u00D6'
182+
| '\u00D8'..'\u00F6'
183+
| '\u00F8'..'\u02FF'
184+
| '\u0370'..'\u037D'
185+
| '\u037F'..'\u1FFF'
186+
| '\u200C'..'\u200D'
187+
| '\u2070'..'\u218F'
188+
| '\u2C00'..'\u2FEF'
189+
| '\u3001'..'\uD7FF'
190+
| '\uF900'..'\uFDCF'
191+
| '\uFDF0'..'\uFFFD'
192+
; // ignores | ['\u10000-'\uEFFFF] ;
193+
194+
INT : [0-9]+
195+
;
196+
197+
// ANTLR makes no distinction between a single character literal and a
198+
// multi-character string. All literals are single quote delimited and
199+
// may contain unicode escape sequences of the form \uxxxx, where x
200+
// is a valid hexadecimal number (as per Java basically).
201+
STRING_LITERAL
202+
: '\'' (ESC_SEQ | ~['\r\n\\])* '\''
203+
;
204+
205+
UNTERMINATED_STRING_LITERAL
206+
: '\'' (ESC_SEQ | ~['\r\n\\])*
207+
;
208+
209+
// Any kind of escaped character that we can embed within ANTLR
210+
// literal strings.
211+
fragment
212+
ESC_SEQ
213+
: '\\'
214+
( // The standard escaped character set such as tab, newline, etc.
215+
[btnfr"'\\]
216+
| // A Java style Unicode escape sequence
217+
UNICODE_ESC
218+
| // Invalid escape
219+
.
220+
| // Invalid escape at end of file
221+
EOF
222+
)
223+
;
224+
225+
fragment
226+
UNICODE_ESC
227+
: 'u' (HEX_DIGIT (HEX_DIGIT (HEX_DIGIT HEX_DIGIT?)?)?)?
228+
;
229+
230+
fragment
231+
HEX_DIGIT : [0-9a-fA-F] ;
232+
233+
WS : [ \t\r\n\f]+ -> channel(HIDDEN) ;
234+
235+
// Many language targets use {} as block delimiters and so we
236+
// must recursively match {} delimited blocks to balance the
237+
// braces. Additionally, we must make some assumptions about
238+
// literal string representation in the target language. We assume
239+
// that they are delimited by ' or " and so consume these
240+
// in their own alts so as not to inadvertently match {}.
241+
242+
ACTION
243+
: '{'
244+
( ACTION
245+
| ACTION_ESCAPE
246+
| ACTION_STRING_LITERAL
247+
| ACTION_CHAR_LITERAL
248+
| '/*' .*? '*/' // ('*/' | EOF)
249+
| '//' ~[\r\n]*
250+
| .
251+
)*?
252+
('}'|EOF)
253+
;
254+
255+
fragment
256+
ACTION_ESCAPE
257+
: '\\' .
258+
;
259+
260+
fragment
261+
ACTION_STRING_LITERAL
262+
: '"' (ACTION_ESCAPE | ~["\\])* '"'
263+
;
264+
265+
fragment
266+
ACTION_CHAR_LITERAL
267+
: '\'' (ACTION_ESCAPE | ~['\\])* '\''
268+
;
269+
270+
// -----------------
271+
// Illegal Character
272+
//
273+
// This is an illegal character trap which is always the last rule in the
274+
// lexer specification. It matches a single character of any value and being
275+
// the last rule in the file will match when no other rule knows what to do
276+
// about the character. It is reported as an error but is not passed on to the
277+
// parser. This means that the parser can deal with the grammar file anyway
278+
// but we will not try to analyse or code generate from a file with lexical
279+
// errors.
280+
//
281+
ERRCHAR
282+
: . -> channel(HIDDEN)
283+
;
284+
285+
mode ArgAction; // E.g., [int x, List<String> a[]]
286+
287+
NESTED_ARG_ACTION
288+
: '[' -> more, pushMode(ArgAction)
289+
;
290+
291+
ARG_ACTION_ESCAPE
292+
: '\\' . -> more
293+
;
294+
295+
ARG_ACTION_STRING_LITERAL
296+
: ('"' ('\\' . | ~["\\])* '"')-> more
297+
;
298+
299+
ARG_ACTION_CHAR_LITERAL // single-quoted literal; consumed whole so a ']' or '[' inside '...' does not end or nest the arg action
300+
: ('\'' ('\\' . | ~['\\])* '\'') -> more
301+
;
302+
303+
ARG_ACTION
304+
: ']' -> popMode
305+
;
306+
307+
UNTERMINATED_ARG_ACTION // added this to return non-EOF token type here. EOF did something weird
308+
: EOF -> popMode
309+
;
310+
311+
ARG_ACTION_CHAR // must be last
312+
: . -> more
313+
;
314+
315+
316+
mode LexerCharSet;
317+
318+
LEXER_CHAR_SET_BODY
319+
: ( ~[\]\\]
320+
| '\\' .
321+
)
322+
-> more
323+
;
324+
325+
LEXER_CHAR_SET
326+
: ']' -> popMode
327+
;
328+
329+
UNTERMINATED_CHAR_SET
330+
: EOF -> popMode
331+
;

0 commit comments

Comments
 (0)