diff --git a/README.md b/README.md index ca5ede5e..92914378 100644 --- a/README.md +++ b/README.md @@ -283,6 +283,19 @@ welcome here! Including a parser imports *all* of the nonterminals defined in the parser, as well as any JS, macros, and config options defined there. +### Custom lexers + +You can pass a `lexer` instance to Parser, which must have the following interface: + +* `reset(chunk, Info)`: set the internal buffer to `chunk`, and restore line/col/state info taken from `save()`. +* `next() -> Token`: return the next token, e.g. `{type, value, line, col, …}`, or a falsy value (e.g. `undefined`) once the buffer is exhausted. Only the `value` attribute is required. +* `save() -> Info`: return an object describing the current line/col etc. This allows us to preserve this information between `feed()` calls, and also to support `Parser#rewind()`. The exact structure is lexer-specific; nearley doesn't care what's in it. +* `formatError(token, message)`: return a string containing `message` together with the line/col of the offending token. You might like to include a preview of the line in question. +* `has(tokenType)`: return true if the lexer can emit tokens with that name. Used to resolve `%`-specifiers in compiled nearley grammars. + +If Parser isn't given a lexer option, it will look for a `.lexer` attribute on its Grammar. The `@lexer` directive allows exporting a lexer object from your `.ne` grammar file. (See `json.ne` for an example.) 
+ + ### Custom tokens Nearley assumes by default that your fundamental unit of parsing, called a diff --git a/examples/json.ne b/examples/json.ne index 43d43f3c..498f78a2 100644 --- a/examples/json.ne +++ b/examples/json.ne @@ -1,7 +1,36 @@ # http://www.json.org/ # http://www.asciitable.com/ +@{% + +const moo = require('moo') + +let lexer = moo.compile({ + space: {match: /\s+/, lineBreaks: true}, + number: /-?(?:[0-9]|[1-9][0-9]+)(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?\b/, + string: /"(?:\\["bfnrt\/\\]|\\u[a-fA-F0-9]{4}|[^"\\])*"/, + '{': '{', + '}': '}', + '[': '[', + ']': ']', + ',': ',', + ':': ':', + true: 'true', + false: 'false', + null: 'null', +}) + +// TODO add has() to moo +lexer.has = function(name) { + return lexer.groups.find(function(group) { + return group.tokenType === name; + }); +} + +%} -json -> [\s]:* (object | array) [\s]:* {% function(d) { return d[1][0]; } %} +@lexer lexer + +json -> _ (object | array) _ {% function(d) { return d[1][0]; } %} object -> "{" _ "}" {% function(d) { return {}; } %} | "{" _ pair (_ "," _ pair):* _ "}" {% extractObject %} @@ -18,35 +47,15 @@ value -> | "false" {% function(d) { return false; } %} | "null" {% function(d) { return null; } %} -number -> "-":? ("0" | intPart) fracPart:? expPart:? {% extractNumber %} +number -> %number {% function(d) { return parseFloat(d[0].value) } %} -string -> "\"" validChar:* "\"" {% function(d) { return d[1].join("") } %} +string -> %string {% function(d) { return JSON.parse(d[0].value) } %} pair -> key _ ":" _ value {% function(d) { return [d[0], d[4]]; } %} key -> string {% id %} -intPart -> [1-9] [0-9]:* {% function(d) { return d[0] + d[1].join(""); } %} - -fracPart -> "." [0-9]:* {% function(d) { return d[0] + d[1].join(""); } %} - -expPart -> [eE] [+-]:? 
[0-9]:* {% function(d) { return d[0] + (d[1] || '') + d[2].join(""); } %} - -validChar -> - [^"\\] {% function(d) { return d[0]; } %} - | "\\\"" {% function(d) { return "\""; } %} - | "\\\\" {% function(d) { return "\\"; } %} - | "\\/" {% function(d) { return "/"; } %} - | "\\n" {% function(d) { return "\n"; } %} - | "\\b" {% function(d) { return "\b"; } %} - | "\\f" {% function(d) { return "\f"; } %} - | "\\r" {% function(d) { return "\r"; } %} - | "\\t" {% function(d) { return "\t"; } %} - | "\\u" hex hex hex hex {% unicodehex %} - -hex -> [0-9a-fA-F] {% function(d) { return d[0]; } %} - -_ -> null | [\s]:+ {% function(d) { return null; } %} +_ -> null | %space {% function(d) { return null; } %} @{% @@ -76,20 +85,4 @@ function extractArray(d) { return output; } -function unicodehex(d) { - let codePoint = parseInt(d[1]+d[2]+d[3]+d[4], 16); - - // Handle '\\' - if (codePoint == 92) { - return "\\"; - } - - return String.fromCodePoint(codePoint); -} - -function extractNumber(d) { - let value = (d[0] || '') + d[1] + (d[2] || '') + (d[3] || ''); - return parseFloat(value); -} - %} diff --git a/lib/compile.js b/lib/compile.js index e66a11b5..0435ec28 100644 --- a/lib/compile.js +++ b/lib/compile.js @@ -118,12 +118,17 @@ function Compile(structure, opts) { if (!token.literal.length) { return null; } - if (token.literal.length === 1) { + if (token.literal.length === 1 || result.config.lexer) { return token; } return buildStringToken(ruleName, token, env); } if (token.token) { + if (result.config.lexer) { + var name = token.token; + var expr = result.config.lexer + ".has(" + JSON.stringify(name) + ") ? 
{type: " + JSON.stringify(name) + "} : " + name; + return {token: "(" + expr + ")"}; + } return token; } diff --git a/lib/generate.js b/lib/generate.js index 6da780bb..c3a0fe7b 100644 --- a/lib/generate.js +++ b/lib/generate.js @@ -112,6 +112,7 @@ generate.js = generate._default = generate.javascript = function (parser, export output += "function id(x) {return x[0]; }\n"; output += parser.body.join('\n'); output += "var grammar = {\n"; + output += " Lexer: " + parser.config.lexer + ",\n"; output += " ParserRules: " + serializeRules(parser.rules, generate.javascript.builtinPostprocessors) + "\n"; @@ -141,6 +142,7 @@ generate.cs = generate.coffee = generate.coffeescript = function (parser, export output += " id = (d)->d[0]\n"; output += tabulateString(dedentFunc(parser.body.join('\n')), ' ') + '\n'; output += " grammar = {\n"; + output += " Lexer: " + parser.config.lexer + ",\n"; output += " ParserRules: " + tabulateString( serializeRules(parser.rules, generate.coffeescript.builtinPostprocessors), @@ -173,6 +175,7 @@ generate.ts = generate.typescript = function (parser, exportName) { output += "interface NearleyRule {name:string; symbols:NearleySymbol[]; postprocess?:(d:any[],loc?:number,reject?:{})=>any};\n"; output += "type NearleySymbol = string | {literal:any} | {test:(token:any) => boolean};\n"; output += "export var grammar : NearleyGrammar = {\n"; + output += " Lexer: " + parser.config.lexer + ",\n"; output += " ParserRules: " + serializeRules(parser.rules, generate.typescript.builtinPostprocessors) + "\n"; output += " , ParserStart: " + JSON.stringify(parser.start) + "\n"; output += "}\n"; diff --git a/lib/nearley-language-bootstrapped.js b/lib/nearley-language-bootstrapped.js index 0560c18e..0af76903 100644 --- a/lib/nearley-language-bootstrapped.js +++ b/lib/nearley-language-bootstrapped.js @@ -3,6 +3,7 @@ (function () { function id(x) {return x[0]; } var grammar = { + Lexer: undefined, ParserRules: [ {"name": "dqstring$ebnf$1", "symbols": []}, {"name": 
"dqstring$ebnf$1", "symbols": ["dqstring$ebnf$1", "dstrchar"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}}, diff --git a/lib/nearley.js b/lib/nearley.js index e8134d1c..cd039936 100644 --- a/lib/nearley.js +++ b/lib/nearley.js @@ -17,8 +17,8 @@ Rule.highestId = 0; Rule.prototype.toString = function(withCursorAt) { function stringifySymbolSequence (e) { - return (e.literal) ? JSON.stringify(e.literal) - : e.toString(); + return e.literal ? JSON.stringify(e.literal) : + e.type ? '%' + e.type : e.toString(); } var symbolSequence = (typeof withCursorAt === "undefined") ? this.symbols.map(stringifySymbolSequence).join(' ') @@ -101,7 +101,7 @@ Column.prototype.process = function(nextColumn) { // special-case nullables if (state.reference === this.index) { - // make sure future predictors of this rule get completed. + // make sure future predictors of this rule get completed. var exp = state.rule.name; (this.completed[exp] = this.completed[exp] || []).push(state); } @@ -168,12 +168,63 @@ function Grammar(rules, start) { // So we can allow passing (rules, start) directly to Parser for backwards compatibility Grammar.fromCompiled = function(rules, start) { + var lexer = rules.Lexer; if (rules.ParserStart) { start = rules.ParserStart; rules = rules.ParserRules; } var rules = rules.map(function (r) { return (new Rule(r.name, r.symbols, r.postprocess)); }); - return new Grammar(rules, start); + var g = new Grammar(rules, start); + g.lexer = lexer; // nb. storing lexer on Grammar is iffy, but unavoidable + return g; +} + + +function StreamLexer() { + this.reset(""); +} + +StreamLexer.prototype.reset = function(data, state) { + this.buffer = data; + this.index = 0; + this.line = state ? state.line : 1; + this.lastLineBreak = state ? 
-state.col : 0; +} + +StreamLexer.prototype.next = function() { + if (this.index < this.buffer.length) { + var ch = this.buffer[this.index++]; + if (ch === '\n') { + this.line += 1; + this.lastLineBreak = this.index; + } + return {value: ch}; + } +} + +StreamLexer.prototype.save = function() { + return { + line: this.line, + col: this.index - this.lastLineBreak, + } +} + +StreamLexer.prototype.formatError = function(token, message) { + // nb. this gets called after consuming the offending token, + // so the culprit is index-1 + var buffer = this.buffer; + if (typeof buffer === 'string') { + var nextLineBreak = buffer.indexOf('\n', this.index); + if (nextLineBreak === -1) nextLineBreak = buffer.length; + var line = buffer.substring(this.lastLineBreak, nextLineBreak) + var col = this.index - this.lastLineBreak; + message += " at line " + this.line + " col " + col + ":\n\n"; + message += " " + line + "\n" + message += " " + Array(col).join(" ") + "^" + return message; + } else { + return message + " at index " + (this.index - 1); + } } @@ -189,11 +240,16 @@ function Parser(rules, start, options) { // Read options this.options = { keepHistory: false, + lexer: grammar.lexer || new StreamLexer, }; for (var key in (options || {})) { this.options[key] = options[key]; } + // Setup lexer + this.lexer = this.options.lexer; + this.lexerState = undefined; + // Setup a table var column = new Column(grammar, 0); var table = this.table = [column]; @@ -203,14 +259,17 @@ function Parser(rules, start, options) { column.predict(grammar.start); // TODO what if start rule is nullable? 
column.process(); - this.current = 0; + this.current = 0; // token index } // create a reserved token for indicating a parse fail Parser.fail = {}; Parser.prototype.feed = function(chunk) { - for (var chunkPos = 0; chunkPos < chunk.length; chunkPos++) { + var lexer = this.lexer; + lexer.reset(chunk, this.lexerState); + + while (token = lexer.next()) { // We add new states to table[current+1] var column = this.table[this.current]; @@ -224,18 +283,18 @@ Parser.prototype.feed = function(chunk) { this.table.push(nextColumn); // Advance all tokens that expect the symbol - // So for each state in the previous row, - - var token = chunk[chunkPos]; + var literal = token.value; + var value = lexer.constructor === StreamLexer ? token.value : token; var scannable = column.scannable; for (var w = scannable.length; w--; ) { var state = scannable[w]; var expect = state.rule.symbols[state.dot]; // Try to consume the token // either regex or literal - if (expect.test ? expect.test(token) : expect.literal === token) { + if (expect.test ? expect.test(value) : + expect.type ? expect.type === token.type + : expect.literal === literal) { // Add it - var value = token; var next = state.nextState({data: value, token: token, isToken: true}); nextColumn.states.push(next); } @@ -254,16 +313,24 @@ Parser.prototype.feed = function(chunk) { // If needed, throw an error: if (nextColumn.states.length === 0) { // No states at all! This is not good. - var err = new Error( - "nearley: No possible parsings (@" + (this.current) - + ": '" + token + "')." - ); + var message = this.lexer.formatError(token, "invalid syntax") + "\n"; + message += "Unexpected " + (token.type ? token.type + " token: " : ""); + message += JSON.stringify(token.value !== undefined ? 
token.value : token) + "\n"; + var err = new Error(message); err.offset = this.current; throw err; } + // maybe save lexer state + if (this.options.keepHistory) { + column.lexerState = lexer.save() + } + this.current++; } + if (column) { + this.lexerState = lexer.save() + } // Incrementally keep track of results this.results = this.finish(); @@ -272,6 +339,33 @@ Parser.prototype.feed = function(chunk) { return this; }; +Parser.prototype.save = function() { + var column = this.table[this.current]; + column.lexerState = this.lexerState; + return column; +}; + +Parser.prototype.restore = function(column) { + var index = column.index; + this.current = index; + this.table[index] = column; + this.table.splice(index + 1); + this.lexerState = column.lexerState; + + // Incrementally keep track of results + this.results = this.finish(); +}; + +// nb. deprecated: use save/restore instead! +Parser.prototype.rewind = function(index) { + if (!this.options.keepHistory) { + throw new Error('set option `keepHistory` to enable rewinding') + } + // nb. recall column (table) indicies fall between token indicies. 
+ // col 0 -- token 0 -- col 1 + this.restore(this.table[index]); +}; + Parser.prototype.finish = function() { // Return the possible parsings var considerations = []; diff --git a/package.json b/package.json index 6a7a323f..14564015 100644 --- a/package.json +++ b/package.json @@ -40,6 +40,7 @@ "chai": "^3.4.1", "coffee-script": "^1.10.0", "microtime": "^2.1.2", - "mocha": "^2.3.4" + "mocha": "^2.3.4", + "moo": "^0.3.1" } } diff --git a/test/launch.js b/test/launch.js index b43ebb09..511c1af7 100644 --- a/test/launch.js +++ b/test/launch.js @@ -148,10 +148,40 @@ describe("nearleyc", function() { describe('Parser', function() { + let testGrammar = compile(` + y -> x:+ + x -> [a-z0-9] | "\\n" + `) + + it('shows line number in errors', function() { + (() => parse(testGrammar, 'abc\n12!')).should.throw( + 'invalid syntax at line 2 col 3:\n' + + '\n' + + ' 12!\n' + + ' ^' + ) + }) + + it('shows token index in errors', function() { + (() => parse(testGrammar, ['1', '2', '!'])).should.throw( + 'invalid syntax at index 2' + ) + }) + var tosh = compile(read("examples/tosh.ne")); - // TODO: save/restore - /* + it('can save state', function() { + let first = "say 'hello'"; + let second = " for 2 secs"; + let p = new nearley.Parser(tosh, { keepHistory: true }); + p.feed(first); + p.current.should.equal(11) + p.table.length.should.equal(12) + var col = p.save(); + col.index.should.equal(11) + col.lexerState.col.should.equal(first.length) + }); + it('can rewind', function() { let first = "say 'hello'"; let second = " for 2 secs"; @@ -163,6 +193,7 @@ describe('Parser', function() { p.feed(second); p.rewind(first.length); + p.current.should.equal(11) p.table.length.should.equal(12) @@ -173,6 +204,36 @@ describe('Parser', function() { let p = new nearley.Parser(tosh, {}); p.rewind.should.throw(); }) - */ + + it('restores line numbers', function() { + let p = new nearley.Parser(testGrammar); + p.feed('abc\n') + p.save().lexerState.line.should.equal(2) + p.feed('123\n') + var col = 
p.save(); + col.lexerState.line.should.equal(3) + p.feed('q') + p.restore(col); + p.lexer.line.should.equal(3) + p.feed('z') + }); + + it('restores column number', function() { + let p = new nearley.Parser(testGrammar); + p.feed('foo\nbar') + var col = p.save(); + col.lexerState.line.should.equal(2) + col.lexerState.col.should.equal(3) + p.feed('123'); + p.lexerState.col.should.equal(6) + + p.restore(col); + p.lexerState.line.should.equal(2) + p.lexerState.col.should.equal(3) + p.feed('456') + p.lexerState.col.should.equal(6) + }); + + // TODO: moo save/restore });