|
| 1 | +/* |
| 2 | + * [The "BSD license"] |
| 3 | + * Copyright (c) 2014 Terence Parr |
| 4 | + * Copyright (c) 2014 Sam Harwell |
| 5 | + * All rights reserved. |
| 6 | + * |
| 7 | + * Redistribution and use in source and binary forms, with or without |
| 8 | + * modification, are permitted provided that the following conditions |
| 9 | + * are met: |
| 10 | + * |
| 11 | + * 1. Redistributions of source code must retain the above copyright |
| 12 | + * notice, this list of conditions and the following disclaimer. |
| 13 | + * 2. Redistributions in binary form must reproduce the above copyright |
| 14 | + * notice, this list of conditions and the following disclaimer in the |
| 15 | + * documentation and/or other materials provided with the distribution. |
| 16 | + * 3. The name of the author may not be used to endorse or promote products |
| 17 | + * derived from this software without specific prior written permission. |
| 18 | + * |
| 19 | + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR |
| 20 | + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| 21 | + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. |
| 22 | + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 23 | + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT |
| 24 | + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 25 | + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 26 | + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 | + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF |
| 28 | + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 | + */ |
| 30 | + |
| 31 | +/** A grammar for ANTLR v4 tokens */ |
| 32 | +lexer grammar ANTLRv4Lexer; |
| 33 | + |
| 34 | +tokens { |
| 35 | + TOKEN_REF, |
| 36 | + RULE_REF, |
| 37 | + LEXER_CHAR_SET |
| 38 | +} |
| 39 | + |
| 40 | +@members { |
| 41 | + /** Track whether we are inside of a rule and whether it is a lexer or parser rule. |
| 42 | + * _currentRuleType==Token.INVALID_TYPE means that we are outside of a rule. |
| 43 | + * At the first sign of a rule name reference and _currentRuleType==invalid, |
| 44 | + * we can assume that we are starting a parser rule. Similarly, seeing |
| 45 | + * a token reference when not already in rule means starting a token |
| 46 | + * rule. The terminating ';' of a rule, flips this back to invalid type. |
| 47 | + * |
| 48 | + * This is not perfect logic but works. For example, "grammar T;" means |
| 49 | + * that we start and stop a lexical rule for the "T;". Dangerous but works. |
| 50 | + * |
| 51 | + * The whole point of this state information is to distinguish |
| 52 | + * between [..arg actions..] and [charsets]. Char sets can only occur in |
| 53 | + * lexer rules, where arg actions cannot occur. |
| 54 | + */ |
| 55 | + private int _currentRuleType = Token.INVALID_TYPE; |
| 56 | +
|
| 57 | + public int getCurrentRuleType() { // Token.INVALID_TYPE while outside of any rule |
| 58 | + return this._currentRuleType; |
| 59 | + } |
| 60 | + |
| 61 | + public void setCurrentRuleType(int ruleType) { // overwrites the tracked rule kind as given |
| 62 | + _currentRuleType = ruleType; |
| 63 | + } |
| 64 | + |
| 65 | + protected void handleBeginArgAction() { |
| 66 | + // '[' opens a char set inside a lexer rule and an arg action anywhere else. |
| 67 | + if (!inLexerRule()) { |
| 68 | + pushMode(ArgAction); |
| 69 | + } |
| 70 | + else { |
| 71 | + pushMode(LexerCharSet); |
| 72 | + } |
| 73 | + more(); // the '[' is not a token by itself; the pushed mode finishes the token |
| 74 | + } |
| 75 | + |
| 76 | + @Override |
| 77 | + public Token emit() { |
| 78 | + final boolean isRuleName = _type == RULE_REF || _type == TOKEN_REF; |
| 79 | + if (_type == SEMI) { // ';' closes the current rule definition |
| 80 | + _currentRuleType = Token.INVALID_TYPE; |
| 81 | + } |
| 82 | + else if (isRuleName && _currentRuleType == Token.INVALID_TYPE) { |
| 83 | + // the first name seen outside of a rule decides between lexer and parser rule |
| 84 | + _currentRuleType = _type; |
| 85 | + } |
| 86 | + |
| 87 | + return super.emit(); |
| 88 | + } |
| 89 | + |
| 90 | + private boolean inLexerRule() { // true once a rule definition was opened by a TOKEN_REF |
| 91 | + return TOKEN_REF == _currentRuleType; |
| 92 | + } |
| 93 | + private boolean inParserRule() { // not called in this grammar; kept as the mirror of inLexerRule() |
| 94 | + return RULE_REF == _currentRuleType; |
| 95 | + } |
| 96 | +} |
| 97 | + |
| 98 | +DOC_COMMENT // stays on the default channel, unlike the two comment rules below |
| 99 | + : '/**' .*? ('*/' | EOF) // an unterminated comment runs to the end of the file |
| 100 | + ; |
| 101 | + |
| 102 | +BLOCK_COMMENT |
| 103 | + : '/*' .*? ('*/' | EOF) -> channel(HIDDEN) |
| 104 | + ; |
| 105 | + |
| 106 | +LINE_COMMENT |
| 107 | + : '//' ~[\r\n]* -> channel(HIDDEN) // stops in front of the line terminator |
| 108 | + ; |
| 109 | + |
| 110 | +BEGIN_ARG_ACTION // '[' starts a lexer char set or an arg action; the action below pushes the matching mode and calls more() |
| 111 | + : '[' {handleBeginArgAction();} |
| 112 | + ; |
| 113 | + |
| 114 | +// OPTIONS, TOKENS and CHANNELS must also consume the opening brace that starts |
| 115 | +// their block, as this is the easiest way to lex it separately |
| 116 | +// from an ACTION block, despite it using the same {} delimiters. |
| 117 | +// |
| 118 | +OPTIONS : 'options' [ \t\f\n\r]* '{' ; |
| 119 | +TOKENS : 'tokens' [ \t\f\n\r]* '{' ; |
| 120 | +CHANNELS : 'channels' [ \t\f\n\r]* '{' ; |
| 121 | + |
| 122 | +IMPORT : 'import' ; // keywords are listed ahead of RULE_REF so that they win when both match the same text |
| 123 | +FRAGMENT : 'fragment' ; |
| 124 | +LEXER : 'lexer' ; |
| 125 | +PARSER : 'parser' ; |
| 126 | +GRAMMAR : 'grammar' ; |
| 127 | +PROTECTED : 'protected' ; |
| 128 | +PUBLIC : 'public' ; |
| 129 | +PRIVATE : 'private' ; |
| 130 | +RETURNS : 'returns' ; |
| 131 | +LOCALS : 'locals' ; |
| 132 | +THROWS : 'throws' ; |
| 133 | +CATCH : 'catch' ; |
| 134 | +FINALLY : 'finally' ; |
| 135 | +MODE : 'mode' ; |
| 136 | + |
| 137 | +COLON : ':' ; |
| 138 | +COLONCOLON : '::' ; // the longer match wins over two COLON tokens |
| 139 | +COMMA : ',' ; |
| 140 | +SEMI : ';' ; // emit() treats this token as the end of a rule definition |
| 141 | +LPAREN : '(' ; |
| 142 | +RPAREN : ')' ; |
| 143 | +RARROW : '->' ; |
| 144 | +LT : '<' ; |
| 145 | +GT : '>' ; |
| 146 | +ASSIGN : '=' ; |
| 147 | +QUESTION : '?' ; |
| 148 | +STAR : '*' ; |
| 149 | +PLUS : '+' ; |
| 150 | +PLUS_ASSIGN : '+=' ; // the longer match wins over PLUS followed by ASSIGN |
| 151 | +OR : '|' ; |
| 152 | +DOLLAR : '$' ; |
| 153 | +DOT : '.' ; |
| 154 | +RANGE : '..' ; // the longer match wins over two DOT tokens |
| 155 | +AT : '@' ; |
| 156 | +POUND : '#' ; |
| 157 | +NOT : '~' ; |
| 158 | +RBRACE : '}' ; // no matching LBRACE rule in this grammar: '{' is consumed by OPTIONS/TOKENS/CHANNELS or ACTION |
| 159 | + |
| 160 | +/** Unicode rule/token names would be allowed by the ID rule that is commented out below. */ |
| 161 | +//ID : NameStartChar NameChar*; |
| 162 | +// ##################### to allow testing ANTLR grammars in intellij preview |
| 163 | +RULE_REF : [a-z][a-zA-Z_0-9]* ; // ASCII only; a lowercase first letter marks a rule reference |
| 164 | +TOKEN_REF : [A-Z][a-zA-Z_0-9]* ; // ASCII only; an uppercase first letter marks a token reference |
| 165 | + |
| 166 | + |
| 167 | +fragment |
| 168 | +NameChar // characters allowed after the first one; in this file only the commented-out ID rule refers to it |
| 169 | + : NameStartChar |
| 170 | + | '0'..'9' |
| 171 | + | '_' |
| 172 | + | '\u00B7' |
| 173 | + | '\u0300'..'\u036F' |
| 174 | + | '\u203F'..'\u2040' |
| 175 | + ; |
| 176 | + |
| 177 | +fragment |
| 178 | +NameStartChar // characters allowed in first position; used by NameChar and the commented-out ID rule |
| 179 | + : 'A'..'Z' |
| 180 | + | 'a'..'z' |
| 181 | + | '\u00C0'..'\u00D6' |
| 182 | + | '\u00D8'..'\u00F6' |
| 183 | + | '\u00F8'..'\u02FF' |
| 184 | + | '\u0370'..'\u037D' |
| 185 | + | '\u037F'..'\u1FFF' |
| 186 | + | '\u200C'..'\u200D' |
| 187 | + | '\u2070'..'\u218F' |
| 188 | + | '\u2C00'..'\u2FEF' |
| 189 | + | '\u3001'..'\uD7FF' |
| 190 | + | '\uF900'..'\uFDCF' |
| 191 | + | '\uFDF0'..'\uFFFD' |
| 192 | + ; // ignores | ['\u10000-'\uEFFFF] ; |
| 193 | + |
| 194 | +INT : [0-9]+ |
| 195 | + ; |
| 196 | + |
| 197 | +// ANTLR makes no distinction between a single character literal and a |
| 198 | +// multi-character string. All literals are single quote delimited and |
| 199 | +// may contain unicode escape sequences of the form \uxxxx, where each x |
| 200 | +// is a valid hexadecimal digit (as per Java basically). |
| 201 | +STRING_LITERAL |
| 202 | + : '\'' (ESC_SEQ | ~['\r\n\\])* '\'' |
| 203 | + ; |
| 204 | + |
| 205 | +UNTERMINATED_STRING_LITERAL // STRING_LITERAL minus the closing quote: a literal cut off by a line break or EOF still becomes a token |
| 206 | + : '\'' (ESC_SEQ | ~['\r\n\\])* |
| 207 | + ; |
| 208 | +
|
| 209 | +// Any kind of escaped character that we can embed within ANTLR |
| 210 | +// literal strings. |
| 211 | +fragment |
| 212 | +ESC_SEQ |
| 213 | + : '\\' |
| 214 | + ( // The standard escaped character set such as tab, newline, etc. |
| 215 | + [btnfr"'\\] |
| 216 | + | // A Java style Unicode escape sequence |
| 217 | + UNICODE_ESC |
| 218 | + | // Invalid escape: any other character is still consumed here |
| 219 | + . |
| 220 | + | // Invalid escape at end of file |
| 221 | + EOF |
| 222 | + ) |
| 223 | + ; |
| 224 | + |
| 225 | +fragment |
| 226 | +UNICODE_ESC |
| 227 | + : 'u' (HEX_DIGIT (HEX_DIGIT (HEX_DIGIT HEX_DIGIT?)?)?)? // accepts 0 to 4 hex digits, so a truncated \u escape is matched too |
| 228 | + ; |
| 229 | + |
| 230 | +fragment |
| 231 | +HEX_DIGIT : [0-9a-fA-F] ; |
| 232 | + |
| 233 | +WS : [ \t\r\n\f]+ -> channel(HIDDEN) ; // whitespace is kept, but on the hidden channel |
| 234 | + |
| 235 | +// Many language targets use {} as block delimiters and so we |
| 236 | +// must recursively match {} delimited blocks to balance the |
| 237 | +// braces. Additionally, we must make some assumptions about |
| 238 | +// literal string representation in the target language. We assume |
| 239 | +// that they are delimited by ' or " and so consume these |
| 240 | +// in their own alts so as not to inadvertently match {}. |
| 241 | + |
| 242 | +ACTION |
| 243 | + : '{' |
| 244 | + ( ACTION |
| 245 | + | ACTION_ESCAPE |
| 246 | + | ACTION_STRING_LITERAL |
| 247 | + | ACTION_CHAR_LITERAL |
| 248 | + | '/*' .*? '*/' // no EOF alternative here, unlike BLOCK_COMMENT: ('*/' | EOF) |
| 249 | + | '//' ~[\r\n]* |
| 250 | + | . |
| 251 | + )*? |
| 252 | + ('}'|EOF) |
| 253 | + ; |
| 254 | + |
| 255 | +fragment |
| 256 | +ACTION_ESCAPE |
| 257 | + : '\\' . |
| 258 | + ; |
| 259 | + |
| 260 | +fragment |
| 261 | +ACTION_STRING_LITERAL |
| 262 | + : '"' (ACTION_ESCAPE | ~["\\])* '"' |
| 263 | + ; |
| 264 | +
|
| 265 | +fragment |
| 266 | +ACTION_CHAR_LITERAL |
| 267 | + : '\'' (ACTION_ESCAPE | ~['\\])* '\'' |
| 268 | + ; |
| 269 | + |
| 270 | +// ----------------- |
| 271 | +// Illegal Character |
| 272 | +// |
| 273 | +// This is an illegal character trap which is always the last rule in the |
| 274 | +// default mode of the lexer specification. It matches a single character of any value and being |
| 275 | +// the last rule of that mode will match when no other rule knows what to do |
| 276 | +// about the character. It is reported as an error but is not passed on to the |
| 277 | +// parser. This means that the parser can deal with the grammar file anyway |
| 278 | +// but we will not try to analyse or code generate from a file with lexical |
| 279 | +// errors. |
| 280 | +// |
| 281 | +ERRCHAR |
| 282 | + : . -> channel(HIDDEN) |
| 283 | + ; |
| 284 | + |
| 285 | +mode ArgAction; // E.g., [int x, List<String> a[]] |
| 286 | + |
| 287 | + NESTED_ARG_ACTION // an inner '[' re-enters this mode so that brackets are counted |
| 288 | + : '[' -> more, pushMode(ArgAction) |
| 289 | + ; |
| 290 | + |
| 291 | + ARG_ACTION_ESCAPE // a backslash and the character after it are taken as a pair |
| 292 | + : '\\' . -> more |
| 293 | + ; |
| 294 | + |
| 295 | + ARG_ACTION_STRING_LITERAL // a double-quoted string is taken whole; meant to keep a ']' inside it from closing the action |
| 296 | + : ('"' ('\\' . | ~["\\])* '"')-> more |
| 297 | + ; |
| 298 | +
|
| 299 | + ARG_ACTION_CHAR_LITERAL // a single-quoted char literal is taken whole, so a quoted ']' cannot close the action |
| 300 | + : ('\'' ('\\' . | ~['\\]) '\'') -> more // one plain character or one backslash escape between the quotes |
| 301 | + ; |
| 302 | + |
| 303 | + ARG_ACTION // closes one bracket level; a token is emitted only for the outermost ']' |
| 304 | + : ']' { popMode(); if (_mode == ArgAction) { more(); } } // still nested after the pop: keep accumulating |
| 305 | + ; |
| 306 | + |
| 307 | + UNTERMINATED_ARG_ACTION // added this to return a non-EOF token type here. Plain EOF did something weird |
| 308 | + : EOF -> popMode |
| 309 | + ; |
| 310 | + |
| 311 | + ARG_ACTION_CHAR // must be last so that the one-character '[' and ']' rules above take precedence |
| 312 | + : . -> more |
| 313 | + ; |
| 314 | + |
| 315 | + |
| 316 | +mode LexerCharSet; // pushed by handleBeginArgAction() when '[' is seen inside a lexer rule |
| 317 | + |
| 318 | + LEXER_CHAR_SET_BODY |
| 319 | + : ( ~[\]\\] |
| 320 | + | '\\' . // an escaped character, including an escaped ']' |
| 321 | + ) |
| 322 | + -> more |
| 323 | + ; |
| 324 | + |
| 325 | + LEXER_CHAR_SET // the closing ']' ends the set and leaves the mode |
| 326 | + : ']' -> popMode |
| 327 | + ; |
| 328 | + |
| 329 | + UNTERMINATED_CHAR_SET // end of file inside a set: leave the mode under a distinct token type |
| 330 | + : EOF -> popMode |
| 331 | + ; |
0 commit comments