Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,19 @@ welcome here!
Including a parser imports *all* of the nonterminals defined in the parser, as
well as any JS, macros, and config options defined there.

### Custom lexers

You can pass a `lexer` instance to Parser, which must have the following interface:

* `reset(chunk, Info)`: set the internal buffer to `chunk`, and restore line/col/state info taken from `save()`.
* `next() -> Token` -> return e.g. `{type, value, line, col, …}`. Only the `value` attribute is required.
* `save() -> Info` -> return an object describing the current line/col etc. This allows us to preserve this information between `feed()` calls, and also to support `Parser#rewind()`. The exact structure is lexer-specific; nearley doesn't care what's in it.
* `formatError(token)` -> return a string with an error message describing the line/col of the offending token. You might like to include a preview of the line in question.
* `has(tokenType)` -> return true if the lexer can emit tokens with that name. Used to resolve `%`-specifiers in compiled nearley grammars.

If Parser isn't given a lexer option, it will look for a `.lexer` attribute on its Grammar. The `@lexer` directive allows exporting a lexer object from your `.ne` grammar file. (See `json.ne` for an example.)


### Custom tokens

Nearley assumes by default that your fundamental unit of parsing, called a
Expand Down
73 changes: 33 additions & 40 deletions examples/json.ne
Original file line number Diff line number Diff line change
@@ -1,7 +1,36 @@
# http://www.json.org/
# http://www.asciitable.com/
@{%

const moo = require('moo')

let lexer = moo.compile({
space: {match: /\s+/, lineBreaks: true},
number: /-?(?:[0-9]|[1-9][0-9]+)(?:\.[0-9]+)?(?:[eE][-+]?[0-9]+)?\b/,
string: /"(?:\\["bfnrt\/\\]|\\u[a-fA-F0-9]{4}|[^"\\])*"/,
'{': '{',
'}': '}',
'[': '[',
']': ']',
',': ',',
':': ':',
true: 'true',
false: 'false',
null: 'null',
})

// TODO add has() to moo
lexer.has = function(name) {
return lexer.groups.find(function(group) {
return group.tokenType === name;
});
}

%}

json -> [\s]:* (object | array) [\s]:* {% function(d) { return d[1][0]; } %}
@lexer lexer

json -> _ (object | array) _ {% function(d) { return d[1][0]; } %}

object -> "{" _ "}" {% function(d) { return {}; } %}
| "{" _ pair (_ "," _ pair):* _ "}" {% extractObject %}
Expand All @@ -18,35 +47,15 @@ value ->
| "false" {% function(d) { return false; } %}
| "null" {% function(d) { return null; } %}

number -> "-":? ("0" | intPart) fracPart:? expPart:? {% extractNumber %}
number -> %number {% function(d) { return parseFloat(d[0].value) } %}

string -> "\"" validChar:* "\"" {% function(d) { return d[1].join("") } %}
string -> %string {% function(d) { return JSON.parse(d[0].value) } %}

pair -> key _ ":" _ value {% function(d) { return [d[0], d[4]]; } %}

key -> string {% id %}

intPart -> [1-9] [0-9]:* {% function(d) { return d[0] + d[1].join(""); } %}

fracPart -> "." [0-9]:* {% function(d) { return d[0] + d[1].join(""); } %}

expPart -> [eE] [+-]:? [0-9]:* {% function(d) { return d[0] + (d[1] || '') + d[2].join(""); } %}

validChar ->
[^"\\] {% function(d) { return d[0]; } %}
| "\\\"" {% function(d) { return "\""; } %}
| "\\\\" {% function(d) { return "\\"; } %}
| "\\/" {% function(d) { return "/"; } %}
| "\\n" {% function(d) { return "\n"; } %}
| "\\b" {% function(d) { return "\b"; } %}
| "\\f" {% function(d) { return "\f"; } %}
| "\\r" {% function(d) { return "\r"; } %}
| "\\t" {% function(d) { return "\t"; } %}
| "\\u" hex hex hex hex {% unicodehex %}

hex -> [0-9a-fA-F] {% function(d) { return d[0]; } %}

_ -> null | [\s]:+ {% function(d) { return null; } %}
_ -> null | %space {% function(d) { return null; } %}

@{%

Expand Down Expand Up @@ -76,20 +85,4 @@ function extractArray(d) {
return output;
}

function unicodehex(d) {
let codePoint = parseInt(d[1]+d[2]+d[3]+d[4], 16);

// Handle '\\'
if (codePoint == 92) {
return "\\";
}

return String.fromCodePoint(codePoint);
}

function extractNumber(d) {
let value = (d[0] || '') + d[1] + (d[2] || '') + (d[3] || '');
return parseFloat(value);
}

%}
7 changes: 6 additions & 1 deletion lib/compile.js
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,17 @@ function Compile(structure, opts) {
if (!token.literal.length) {
return null;
}
if (token.literal.length === 1) {
if (token.literal.length === 1 || result.config.lexer) {
return token;
}
return buildStringToken(ruleName, token, env);
}
if (token.token) {
if (result.config.lexer) {
var name = token.token;
var expr = result.config.lexer + ".has(" + JSON.stringify(name) + ") ? {type: " + JSON.stringify(name) + "} : " + name;
return {token: "(" + expr + ")"};
}
return token;
}

Expand Down
3 changes: 3 additions & 0 deletions lib/generate.js
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ generate.js = generate._default = generate.javascript = function (parser, export
output += "function id(x) {return x[0]; }\n";
output += parser.body.join('\n');
output += "var grammar = {\n";
output += " Lexer: " + parser.config.lexer + ",\n";
output += " ParserRules: " +
serializeRules(parser.rules, generate.javascript.builtinPostprocessors)
+ "\n";
Expand Down Expand Up @@ -141,6 +142,7 @@ generate.cs = generate.coffee = generate.coffeescript = function (parser, export
output += " id = (d)->d[0]\n";
output += tabulateString(dedentFunc(parser.body.join('\n')), ' ') + '\n';
output += " grammar = {\n";
output += " Lexer: " + parser.config.lexer + ",\n";
output += " ParserRules: " +
tabulateString(
serializeRules(parser.rules, generate.coffeescript.builtinPostprocessors),
Expand Down Expand Up @@ -173,6 +175,7 @@ generate.ts = generate.typescript = function (parser, exportName) {
output += "interface NearleyRule {name:string; symbols:NearleySymbol[]; postprocess?:(d:any[],loc?:number,reject?:{})=>any};\n";
output += "type NearleySymbol = string | {literal:any} | {test:(token:any) => boolean};\n";
output += "export var grammar : NearleyGrammar = {\n";
output += " Lexer: " + parser.config.lexer + ",\n";
output += " ParserRules: " + serializeRules(parser.rules, generate.typescript.builtinPostprocessors) + "\n";
output += " , ParserStart: " + JSON.stringify(parser.start) + "\n";
output += "}\n";
Expand Down
1 change: 1 addition & 0 deletions lib/nearley-language-bootstrapped.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
(function () {
function id(x) {return x[0]; }
var grammar = {
Lexer: undefined,
ParserRules: [
{"name": "dqstring$ebnf$1", "symbols": []},
{"name": "dqstring$ebnf$1", "symbols": ["dqstring$ebnf$1", "dstrchar"], "postprocess": function arrpush(d) {return d[0].concat([d[1]]);}},
Expand Down
124 changes: 109 additions & 15 deletions lib/nearley.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ Rule.highestId = 0;

Rule.prototype.toString = function(withCursorAt) {
function stringifySymbolSequence (e) {
return (e.literal) ? JSON.stringify(e.literal)
: e.toString();
return e.literal ? JSON.stringify(e.literal) :
e.type ? '%' + e.type : e.toString();
}
var symbolSequence = (typeof withCursorAt === "undefined")
? this.symbols.map(stringifySymbolSequence).join(' ')
Expand Down Expand Up @@ -101,7 +101,7 @@ Column.prototype.process = function(nextColumn) {

// special-case nullables
if (state.reference === this.index) {
// make sure future predictors of this rule get completed.
// make sure future predictors of this rule get completed.
var exp = state.rule.name;
(this.completed[exp] = this.completed[exp] || []).push(state);
}
Expand Down Expand Up @@ -168,12 +168,63 @@ function Grammar(rules, start) {

// So we can allow passing (rules, start) directly to Parser for backwards compatibility
Grammar.fromCompiled = function(rules, start) {
var lexer = rules.Lexer;
if (rules.ParserStart) {
start = rules.ParserStart;
rules = rules.ParserRules;
}
var rules = rules.map(function (r) { return (new Rule(r.name, r.symbols, r.postprocess)); });
return new Grammar(rules, start);
var g = new Grammar(rules, start);
g.lexer = lexer; // nb. storing lexer on Grammar is iffy, but unavoidable
return g;
}


function StreamLexer() {
this.reset("");
}

StreamLexer.prototype.reset = function(data, state) {
this.buffer = data;
this.index = 0;
this.line = state ? state.line : 1;
this.lastLineBreak = state ? -state.col : 0;
}

StreamLexer.prototype.next = function() {
if (this.index < this.buffer.length) {
var ch = this.buffer[this.index++];
if (ch === '\n') {
this.line += 1;
this.lastLineBreak = this.index;
}
return {value: ch};
}
}

StreamLexer.prototype.save = function() {
return {
line: this.line,
col: this.index - this.lastLineBreak,
}
}

StreamLexer.prototype.formatError = function(token, message) {
// nb. this gets called after consuming the offending token,
// so the culprit is index-1
var buffer = this.buffer;
if (typeof buffer === 'string') {
var nextLineBreak = buffer.indexOf('\n', this.index);
if (nextLineBreak === -1) nextLineBreak = buffer.length;
var line = buffer.substring(this.lastLineBreak, nextLineBreak)
var col = this.index - this.lastLineBreak;
message += " at line " + this.line + " col " + col + ":\n\n";
message += " " + line + "\n"
message += " " + Array(col).join(" ") + "^"
return message;
} else {
return message + " at index " + (this.index - 1);
}
}


Expand All @@ -189,11 +240,16 @@ function Parser(rules, start, options) {
// Read options
this.options = {
keepHistory: false,
lexer: grammar.lexer || new StreamLexer,
};
for (var key in (options || {})) {
this.options[key] = options[key];
}

// Setup lexer
this.lexer = this.options.lexer;
this.lexerState = undefined;

// Setup a table
var column = new Column(grammar, 0);
var table = this.table = [column];
Expand All @@ -203,14 +259,17 @@ function Parser(rules, start, options) {
column.predict(grammar.start);
// TODO what if start rule is nullable?
column.process();
this.current = 0;
this.current = 0; // token index
}

// create a reserved token for indicating a parse fail
Parser.fail = {};

Parser.prototype.feed = function(chunk) {
for (var chunkPos = 0; chunkPos < chunk.length; chunkPos++) {
var lexer = this.lexer;
lexer.reset(chunk, this.lexerState);

while (token = lexer.next()) {
// We add new states to table[current+1]
var column = this.table[this.current];

Expand All @@ -224,18 +283,18 @@ Parser.prototype.feed = function(chunk) {
this.table.push(nextColumn);

// Advance all tokens that expect the symbol
// So for each state in the previous row,

var token = chunk[chunkPos];
var literal = token.value;
var value = lexer.constructor === StreamLexer ? token.value : token;
var scannable = column.scannable;
for (var w = scannable.length; w--; ) {
var state = scannable[w];
var expect = state.rule.symbols[state.dot];
// Try to consume the token
// either regex or literal
if (expect.test ? expect.test(token) : expect.literal === token) {
if (expect.test ? expect.test(value) :
expect.type ? expect.type === token.type
: expect.literal === literal) {
// Add it
var value = token;
var next = state.nextState({data: value, token: token, isToken: true});
nextColumn.states.push(next);
}
Expand All @@ -254,16 +313,24 @@ Parser.prototype.feed = function(chunk) {
// If needed, throw an error:
if (nextColumn.states.length === 0) {
// No states at all! This is not good.
var err = new Error(
"nearley: No possible parsings (@" + (this.current)
+ ": '" + token + "')."
);
var message = this.lexer.formatError(token, "invalid syntax") + "\n";
message += "Unexpected " + (token.type ? token.type + " token: " : "");
message += JSON.stringify(token.value !== undefined ? token.value : token) + "\n";
var err = new Error(message);
err.offset = this.current;
throw err;
}

// maybe save lexer state
if (this.options.keepHistory) {
column.lexerState = lexer.save()
}

this.current++;
}
if (column) {
this.lexerState = lexer.save()
}

// Incrementally keep track of results
this.results = this.finish();
Expand All @@ -272,6 +339,33 @@ Parser.prototype.feed = function(chunk) {
return this;
};

Parser.prototype.save = function() {
var column = this.table[this.current];
column.lexerState = this.lexerState;
return column;
};

Parser.prototype.restore = function(column) {
var index = column.index;
this.current = index;
this.table[index] = column;
this.table.splice(index + 1);
this.lexerState = column.lexerState;

// Incrementally keep track of results
this.results = this.finish();
};

// nb. deprecated: use save/restore instead!
Parser.prototype.rewind = function(index) {
if (!this.options.keepHistory) {
throw new Error('set option `keepHistory` to enable rewinding')
}
// nb. recall column (table) indices fall between token indices.
// col 0 -- token 0 -- col 1
this.restore(this.table[index]);
};

Parser.prototype.finish = function() {
// Return the possible parsings
var considerations = [];
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
"chai": "^3.4.1",
"coffee-script": "^1.10.0",
"microtime": "^2.1.2",
"mocha": "^2.3.4"
"mocha": "^2.3.4",
"moo": "^0.3.1"
}
}
Loading