Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,19 @@ welcome here!
Including a parser imports *all* of the nonterminals defined in the parser, as
well as any JS, macros, and config options defined there.

### Custom lexers

You can pass a `lexer` instance to Parser, which must have the following interface:

* `reset(chunk, Info)`: set the internal buffer to `chunk`, and restore line/col/state info taken from `save()`.
* `next() -> Token` -> return e.g. `{type, value, line, col, …}`. Only the `value` attribute is required.
* `save() -> Info` -> return an object describing the current line/col etc. This allows us to preserve this information between `feed()` calls, and also to support `Parser#rewind()`. The exact structure is lexer-specific; nearley doesn't care what's in it.
* `formatError(token)` -> return a string with an error message describing the line/col of the offending token. You might like to include a preview of the line in question.
* `has(tokenType)` -> return true if the lexer can emit tokens with that name. Used to resolve `%`-specifiers in compiled nearley grammars.

If Parser isn't given a lexer option, it will look for a `.lexer` attribute on its Grammar. The `@lexer` directive allows exporting a lexer object from your `.ne` grammar file. (See `json.ne` for an example.)


### Custom tokens

Nearley assumes by default that your fundamental unit of parsing, called a
Expand Down
73 changes: 33 additions & 40 deletions examples/json.ne
Original file line number Diff line number Diff line change
@@ -1,7 +1,36 @@
# http://www.json.org/
# http://www.asciitable.com/
@{%

const moo = require('moo')

let lexer = moo.compile({
space: {match: /\s+/, lineBreaks: true},
number: /-?(?:[0-9]|[1-9][0-9]+)(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?\b/,
string: /"(?:\\["bfnrt\/\\]|\\u[a-fA-F0-9]{4}|[^"\\])*"/,
'{': '{',
'}': '}',
'[': '[',
']': ']',
',': ',',
':': ':',
true: 'true',
false: 'false',
null: 'null',
})

// TODO add has() to moo
lexer.has = function(name) {
return lexer.groups.find(function(group) {
return group.tokenType === name;
});
}

%}

json -> [\s]:* (object | array) [\s]:* {% function(d) { return d[1][0]; } %}
@lexer lexer

json -> _ (object | array) _ {% function(d) { return d[1][0]; } %}

object -> "{" _ "}" {% function(d) { return {}; } %}
| "{" _ pair (_ "," _ pair):* _ "}" {% extractObject %}
Expand All @@ -18,35 +47,15 @@ value ->
| "false" {% function(d) { return false; } %}
| "null" {% function(d) { return null; } %}

number -> "-":? ("0" | intPart) fracPart:? expPart:? {% extractNumber %}
number -> %number {% function(d) { return parseFloat(d[0].value) } %}

string -> "\"" validChar:* "\"" {% function(d) { return d[1].join("") } %}
string -> %string {% function(d) { return JSON.parse(d[0].value) } %}

pair -> key _ ":" _ value {% function(d) { return [d[0], d[4]]; } %}

key -> string {% id %}

intPart -> [1-9] [0-9]:* {% function(d) { return d[0] + d[1].join(""); } %}

fracPart -> "." [0-9]:* {% function(d) { return d[0] + d[1].join(""); } %}

expPart -> [eE] [+-]:? [0-9]:* {% function(d) { return d[0] + (d[1] || '') + d[2].join(""); } %}

validChar ->
[^"\\] {% function(d) { return d[0]; } %}
| "\\\"" {% function(d) { return "\""; } %}
| "\\\\" {% function(d) { return "\\"; } %}
| "\\/" {% function(d) { return "/"; } %}
| "\\n" {% function(d) { return "\n"; } %}
| "\\b" {% function(d) { return "\b"; } %}
| "\\f" {% function(d) { return "\f"; } %}
| "\\r" {% function(d) { return "\r"; } %}
| "\\t" {% function(d) { return "\t"; } %}
| "\\u" hex hex hex hex {% unicodehex %}

hex -> [0-9a-fA-F] {% function(d) { return d[0]; } %}

_ -> null | [\s]:+ {% function(d) { return null; } %}
_ -> null | %space {% function(d) { return null; } %}

@{%

Expand Down Expand Up @@ -76,20 +85,4 @@ function extractArray(d) {
return output;
}

function unicodehex(d) {
let codePoint = parseInt(d[1]+d[2]+d[3]+d[4], 16);

// Handle '\\'
if (codePoint == 92) {
return "\\";
}

return String.fromCodePoint(codePoint);
}

function extractNumber(d) {
let value = (d[0] || '') + d[1] + (d[2] || '') + (d[3] || '');
return parseFloat(value);
}

%}
7 changes: 6 additions & 1 deletion lib/compile.js
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,17 @@ function Compile(structure, opts) {
if (!token.literal.length) {
return null;
}
if (token.literal.length === 1) {
if (token.literal.length === 1 || result.config.lexer) {
return token;
}
return buildStringToken(ruleName, token, env);
}
if (token.token) {
if (result.config.lexer) {
var name = token.token;
var expr = result.config.lexer + ".has(" + JSON.stringify(name) + ") ? {type: " + JSON.stringify(name) + "} : " + name;
return {token: "(" + expr + ")"};
}
return token;
}

Expand Down
3 changes: 3 additions & 0 deletions lib/generate.js
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ generate.js = generate._default = generate.javascript = function (parser, export
output += "function id(x) {return x[0]; }\n";
output += parser.body.join('\n');
output += "var grammar = {\n";
output += " Lexer: " + parser.config.lexer + ",\n";
output += " ParserRules: " +
serializeRules(parser.rules, generate.javascript.builtinPostprocessors)
+ "\n";
Expand Down Expand Up @@ -141,6 +142,7 @@ generate.cs = generate.coffee = generate.coffeescript = function (parser, export
output += " id = (d)->d[0]\n";
output += tabulateString(dedentFunc(parser.body.join('\n')), ' ') + '\n';
output += " grammar = {\n";
output += " Lexer: " + parser.config.lexer + ",\n";
output += " ParserRules: " +
tabulateString(
serializeRules(parser.rules, generate.coffeescript.builtinPostprocessors),
Expand Down Expand Up @@ -173,6 +175,7 @@ generate.ts = generate.typescript = function (parser, exportName) {
output += "interface NearleyRule {name:string; symbols:NearleySymbol[]; postprocess?:(d:any[],loc?:number,reject?:{})=>any};\n";
output += "type NearleySymbol = string | {literal:any} | {test:(token:any) => boolean};\n";
output += "export var grammar : NearleyGrammar = {\n";
output += " Lexer: " + parser.config.lexer + ",\n";
output += " ParserRules: " + serializeRules(parser.rules, generate.typescript.builtinPostprocessors) + "\n";
output += " , ParserStart: " + JSON.stringify(parser.start) + "\n";
output += "}\n";
Expand Down
1 change: 1 addition & 0 deletions lib/nearley-language-bootstrapped.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
(function () {
function id(x) {return x[0]; }
var grammar = {
Lexer: undefined,
ParserRules: [
{"name": "dqstring$ebnf$1", "symbols": []},
{"name": "dqstring$ebnf$1", "symbols": ["dqstring$ebnf$1", "dstrchar"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}},
Expand Down
124 changes: 109 additions & 15 deletions lib/nearley.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ Rule.highestId = 0;

Rule.prototype.toString = function(withCursorAt) {
function stringifySymbolSequence (e) {
return (e.literal) ? JSON.stringify(e.literal)
: e.toString();
return e.literal ? JSON.stringify(e.literal) :
e.type ? '%' + e.type : e.toString();
}
var symbolSequence = (typeof withCursorAt === "undefined")
? this.symbols.map(stringifySymbolSequence).join(' ')
Expand Down Expand Up @@ -101,7 +101,7 @@ Column.prototype.process = function(nextColumn) {

// special-case nullables
if (state.reference === this.index) {
// make sure future predictors of this rule get completed.
// make sure future predictors of this rule get completed.
var exp = state.rule.name;
(this.completed[exp] = this.completed[exp] || []).push(state);
}
Expand Down Expand Up @@ -168,12 +168,63 @@ function Grammar(rules, start) {

// So we can allow passing (rules, start) directly to Parser for backwards compatibility
Grammar.fromCompiled = function(rules, start) {
var lexer = rules.Lexer;
if (rules.ParserStart) {
start = rules.ParserStart;
rules = rules.ParserRules;
}
var rules = rules.map(function (r) { return (new Rule(r.name, r.symbols, r.postprocess)); });
return new Grammar(rules, start);
var g = new Grammar(rules, start);
g.lexer = lexer; // nb. storing lexer on Grammar is iffy, but unavoidable
return g;
}


function StreamLexer() {
this.reset("");
}

StreamLexer.prototype.reset = function(data, state) {
this.buffer = data;
this.index = 0;
this.line = state ? state.line : 1;
this.lastLineBreak = state ? -state.col : 0;
}

StreamLexer.prototype.next = function() {
if (this.index < this.buffer.length) {
var ch = this.buffer[this.index++];
if (ch === '\n') {
this.line += 1;
this.lastLineBreak = this.index;
}
return {value: ch};
}
}

StreamLexer.prototype.save = function() {
return {
line: this.line,
col: this.index - this.lastLineBreak,
}
}

StreamLexer.prototype.formatError = function(token, message) {
// nb. this gets called after consuming the offending token,
// so the culprit is index-1
var buffer = this.buffer;
if (typeof buffer === 'string') {
var nextLineBreak = buffer.indexOf('\n', this.index);
if (nextLineBreak === -1) nextLineBreak = buffer.length;
var line = buffer.substring(this.lastLineBreak, nextLineBreak)
var col = this.index - this.lastLineBreak;
message += " at line " + this.line + " col " + col + ":\n\n";
message += " " + line + "\n"
message += " " + Array(col).join(" ") + "^"
return message;
} else {
return message + " at index " + (this.index - 1);
}
}


Expand All @@ -189,11 +240,16 @@ function Parser(rules, start, options) {
// Read options
this.options = {
keepHistory: false,
lexer: grammar.lexer || new StreamLexer,
};
for (var key in (options || {})) {
this.options[key] = options[key];
}

// Setup lexer
this.lexer = this.options.lexer;
this.lexerState = undefined;

// Setup a table
var column = new Column(grammar, 0);
var table = this.table = [column];
Expand All @@ -203,14 +259,17 @@ function Parser(rules, start, options) {
column.predict(grammar.start);
// TODO what if start rule is nullable?
column.process();
this.current = 0;
this.current = 0; // token index
}

// create a reserved token for indicating a parse fail
Parser.fail = {};

Parser.prototype.feed = function(chunk) {
for (var chunkPos = 0; chunkPos < chunk.length; chunkPos++) {
var lexer = this.lexer;
lexer.reset(chunk, this.lexerState);

while (token = lexer.next()) {
// We add new states to table[current+1]
var column = this.table[this.current];

Expand All @@ -224,18 +283,18 @@ Parser.prototype.feed = function(chunk) {
this.table.push(nextColumn);

// Advance all tokens that expect the symbol
// So for each state in the previous row,

var token = chunk[chunkPos];
var literal = token.value;
var value = lexer.constructor === StreamLexer ? token.value : token;
var scannable = column.scannable;
for (var w = scannable.length; w--; ) {
var state = scannable[w];
var expect = state.rule.symbols[state.dot];
// Try to consume the token
// either regex or literal
if (expect.test ? expect.test(token) : expect.literal === token) {
if (expect.test ? expect.test(value) :
expect.type ? expect.type === token.type
: expect.literal === literal) {
// Add it
var value = token;
var next = state.nextState({data: value, token: token, isToken: true});
nextColumn.states.push(next);
}
Expand All @@ -254,16 +313,24 @@ Parser.prototype.feed = function(chunk) {
// If needed, throw an error:
if (nextColumn.states.length === 0) {
// No states at all! This is not good.
var err = new Error(
"nearley: No possible parsings (@" + (this.current)
+ ": '" + token + "')."
);
var message = this.lexer.formatError(token, "invalid syntax") + "\n";
message += "Unexpected " + (token.type ? token.type + " token: " : "");
message += JSON.stringify(token.value !== undefined ? token.value : token) + "\n";
var err = new Error(message);
err.offset = this.current;
throw err;
}

// maybe save lexer state
if (this.options.keepHistory) {
column.lexerState = lexer.save()
}

this.current++;
}
if (column) {
this.lexerState = lexer.save()
}

// Incrementally keep track of results
this.results = this.finish();
Expand All @@ -272,6 +339,33 @@ Parser.prototype.feed = function(chunk) {
return this;
};

Parser.prototype.save = function() {
var column = this.table[this.current];
column.lexerState = this.lexerState;
return column;
};

Parser.prototype.restore = function(column) {
var index = column.index;
this.current = index;
this.table[index] = column;
this.table.splice(index + 1);
this.lexerState = column.lexerState;

// Incrementally keep track of results
this.results = this.finish();
};

// nb. deprecated: use save/restore instead!
Parser.prototype.rewind = function(index) {
if (!this.options.keepHistory) {
throw new Error('set option `keepHistory` to enable rewinding')
}
// nb. recall column (table) indices fall between token indices.
// col 0 -- token 0 -- col 1
this.restore(this.table[index]);
};

Parser.prototype.finish = function() {
// Return the possible parsings
var considerations = [];
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
"chai": "^3.4.1",
"coffee-script": "^1.10.0",
"microtime": "^2.1.2",
"mocha": "^2.3.4"
"mocha": "^2.3.4",
"moo": "^0.3.1"
}
}
Loading