Merge pull request #238 from sql-formatter-org/optimize-tokenizer

nene · web-flow · commit 1530fcccd758 · 2022-06-15T21:38:08.000+03:00
Attempt to optimize tokenizer
diff --git a/src/core/Tokenizer.ts b/src/core/Tokenizer.ts
@@ -3,8 +3,20 @@ import { equalizeWhitespace, escapeRegExp, id } from 'src/utils';
 import * as regexFactory from './regexFactory';
 import { type Token, TokenType } from './token';
 
-export const WHITESPACE_REGEX = /^(\s+)/u;
-const NULL_REGEX = /(?!)/; // zero-width negative lookahead, matches nothing
+// A note about regular expressions
+//
+// We're using a sticky flag "y" in all tokenizing regexes.
+// This works a bit like ^, anchoring the regex to the start,
+// but when ^ anchores the regex to the start of string (or line),
+// the sticky flag anchors it to search start position, which we
+// can change by setting RegExp.lastIndex.
+//
+// This allows us to avoid slicing off tokens from the start of input string
+// (which we used in the past) and just move the match start position forward,
+// which is much more performant on long strings.
+
+const WHITESPACE_REGEX = /(\s+)/uy;
+const NULL_REGEX = /(?!)/uy; // zero-width negative lookahead, matches nothing
 
 const toCanonicalKeyword = (text: string) => equalizeWhitespace(text.toUpperCase());
 
@@ -68,6 +80,10 @@ export default class Tokenizer {
   private REGEX_MAP: Record<TokenType, RegExp>;
   private quotedIdentRegex: RegExp;
   private paramPatterns: ParamPattern[];
+  // The input SQL string to process
+  private input = '';
+  // Current position in string
+  private index = 0;
 
   private preprocess = (tokens: Token[]) => tokens;
 
@@ -117,12 +133,12 @@ export default class Tokenizer {
       ]),
       [TokenType.BLOCK_START]: regexFactory.createParenRegex(cfg.blockStart ?? ['(']),
       [TokenType.BLOCK_END]: regexFactory.createParenRegex(cfg.blockEnd ?? [')']),
-      [TokenType.RESERVED_CASE_START]: /^(CASE)\b/iu,
-      [TokenType.RESERVED_CASE_END]: /^(END)\b/iu,
+      [TokenType.RESERVED_CASE_START]: /(CASE)\b/iuy,
+      [TokenType.RESERVED_CASE_END]: /(END)\b/iuy,
       [TokenType.LINE_COMMENT]: regexFactory.createLineCommentRegex(cfg.lineCommentTypes ?? ['--']),
-      [TokenType.BLOCK_COMMENT]: /^(\/\*[^]*?(?:\*\/|$))/u,
+      [TokenType.BLOCK_COMMENT]: /(\/\*[^]*?(?:\*\/|$))/uy,
       [TokenType.NUMBER]:
-        /^(0x[0-9a-fA-F]+|0b[01]+|(-\s*)?[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+(\.[0-9]+)?)?)/u,
+        /(0x[0-9a-fA-F]+|0b[01]+|(-\s*)?[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+(\.[0-9]+)?)?)/uy,
       [TokenType.PARAMETER]: NULL_REGEX, // matches nothing
       [TokenType.EOF]: NULL_REGEX, // matches nothing
     };
@@ -152,7 +168,7 @@ export default class Tokenizer {
       },
       {
         // ? placeholders
-        regex: cfg.positionalParams ? /^(\?)/ : undefined,
+        regex: cfg.positionalParams ? /(\?)/uy : undefined,
         parseKey: v => v.slice(1),
       },
     ]);
@@ -172,62 +188,61 @@ export default class Tokenizer {
    * @returns {Token[]} output token stream
    */
   public tokenize(input: string): Token[] {
+    this.input = input;
+    this.index = 0;
     const tokens: Token[] = [];
     let token: Token | undefined;
 
-    // Keep processing the string until it is empty
-    while (input.length) {
+    // Keep processing the string until end is reached
+    while (this.index < this.input.length) {
       // grab any preceding whitespace
-      const whitespaceBefore = this.getWhitespace(input);
-      input = input.substring(whitespaceBefore.length);
+      const whitespaceBefore = this.getWhitespace();
 
-      if (input.length) {
+      if (this.index < this.input.length) {
         // Get the next token and the token type
-        token = this.getNextToken(input, token);
+        token = this.getNextToken(token);
         if (!token) {
-          throw new Error(`Parse error: Unexpected "${input.slice(0, 100)}"`);
+          throw new Error(`Parse error: Unexpected "${input.slice(this.index, 100)}"`);
         }
-        // Advance the string
-        input = input.substring(token.text.length);
 
         tokens.push({ ...token, whitespaceBefore });
       }
     }
     return this.preprocess(tokens);
   }
 
-  /** Matches preceding whitespace if present */
-  private getWhitespace(input: string): string {
-    const matches = input.match(WHITESPACE_REGEX);
-    return matches ? matches[1] : '';
+  private getWhitespace(): string {
+    WHITESPACE_REGEX.lastIndex = this.index;
+    const matches = this.input.match(WHITESPACE_REGEX);
+    if (matches) {
+      // Advance current position by matched whitespace length
+      this.index += matches[1].length;
+      return matches[1];
+    } else {
+      return '';
+    }
   }
 
-  /** Attempts to match next token from input string, tests RegExp patterns in decreasing priority */
-  private getNextToken(input: string, previousToken?: Token): Token | undefined {
+  private getNextToken(previousToken?: Token): Token | undefined {
     return (
-      this.matchToken(TokenType.LINE_COMMENT, input) ||
-      this.matchToken(TokenType.BLOCK_COMMENT, input) ||
-      this.matchToken(TokenType.STRING, input) ||
-      this.matchQuotedIdentToken(input) ||
-      this.matchToken(TokenType.VARIABLE, input) ||
-      this.matchToken(TokenType.BLOCK_START, input) ||
-      this.matchToken(TokenType.BLOCK_END, input) ||
-      this.matchPlaceholderToken(input) ||
-      this.matchToken(TokenType.NUMBER, input) ||
-      this.matchReservedWordToken(input, previousToken) ||
-      this.matchToken(TokenType.IDENT, input) ||
-      this.matchToken(TokenType.OPERATOR, input)
+      this.matchToken(TokenType.LINE_COMMENT) ||
+      this.matchToken(TokenType.BLOCK_COMMENT) ||
+      this.matchToken(TokenType.STRING) ||
+      this.matchQuotedIdentToken() ||
+      this.matchToken(TokenType.VARIABLE) ||
+      this.matchToken(TokenType.BLOCK_START) ||
+      this.matchToken(TokenType.BLOCK_END) ||
+      this.matchPlaceholderToken() ||
+      this.matchToken(TokenType.NUMBER) ||
+      this.matchReservedWordToken(previousToken) ||
+      this.matchToken(TokenType.IDENT) ||
+      this.matchToken(TokenType.OPERATOR)
     );
   }
 
-  /**
-   * Attempts to match a placeholder token pattern
-   * @return {Token | undefined} - The placeholder token if found, otherwise undefined
-   */
-  private matchPlaceholderToken(input: string): Token | undefined {
+  private matchPlaceholderToken(): Token | undefined {
     for (const { regex, parseKey } of this.paramPatterns) {
       const token = this.match({
-        input,
         regex,
         type: TokenType.PARAMETER,
         transform: id,
@@ -243,20 +258,15 @@ export default class Tokenizer {
     return key.replace(new RegExp(escapeRegExp('\\' + quoteChar), 'gu'), quoteChar);
   }
 
-  private matchQuotedIdentToken(input: string): Token | undefined {
+  private matchQuotedIdentToken(): Token | undefined {
     return this.match({
-      input,
       regex: this.quotedIdentRegex,
       type: TokenType.IDENT,
       transform: id,
     });
   }
 
-  /**
-   * Attempts to match a Reserved word token pattern, avoiding edge cases of Reserved words within string tokens
-   * @return {Token | undefined} - The Reserved word token if found, otherwise undefined
-   */
-  private matchReservedWordToken(input: string, previousToken?: Token): Token | undefined {
+  private matchReservedWordToken(previousToken?: Token): Token | undefined {
     // A reserved word cannot be preceded by a '.'
     // this makes it so in "mytable.from", "from" is not considered a reserved word
     if (previousToken?.value === '.') {
@@ -265,57 +275,50 @@ export default class Tokenizer {
 
     // prioritised list of Reserved token types
     return (
-      this.matchReservedToken(TokenType.RESERVED_CASE_START, input) ||
-      this.matchReservedToken(TokenType.RESERVED_CASE_END, input) ||
-      this.matchReservedToken(TokenType.RESERVED_COMMAND, input) ||
-      this.matchReservedToken(TokenType.RESERVED_BINARY_COMMAND, input) ||
-      this.matchReservedToken(TokenType.RESERVED_DEPENDENT_CLAUSE, input) ||
-      this.matchReservedToken(TokenType.RESERVED_LOGICAL_OPERATOR, input) ||
-      this.matchReservedToken(TokenType.RESERVED_KEYWORD, input) ||
-      this.matchReservedToken(TokenType.RESERVED_JOIN_CONDITION, input)
+      this.matchReservedToken(TokenType.RESERVED_CASE_START) ||
+      this.matchReservedToken(TokenType.RESERVED_CASE_END) ||
+      this.matchReservedToken(TokenType.RESERVED_COMMAND) ||
+      this.matchReservedToken(TokenType.RESERVED_BINARY_COMMAND) ||
+      this.matchReservedToken(TokenType.RESERVED_DEPENDENT_CLAUSE) ||
+      this.matchReservedToken(TokenType.RESERVED_LOGICAL_OPERATOR) ||
+      this.matchReservedToken(TokenType.RESERVED_KEYWORD) ||
+      this.matchReservedToken(TokenType.RESERVED_JOIN_CONDITION)
     );
   }
 
   // Helper for matching RESERVED_* tokens which need to be transformed to canonical form
-  private matchReservedToken(tokenType: TokenType, input: string): Token | undefined {
+  private matchReservedToken(tokenType: TokenType): Token | undefined {
     return this.match({
-      input,
       type: tokenType,
       regex: this.REGEX_MAP[tokenType],
       transform: toCanonicalKeyword,
     });
   }
 
   // Shorthand for `match` that looks up regex from REGEX_MAP
-  private matchToken(tokenType: TokenType, input: string): Token | undefined {
+  private matchToken(tokenType: TokenType): Token | undefined {
     return this.match({
-      input,
       type: tokenType,
       regex: this.REGEX_MAP[tokenType],
       transform: id,
     });
   }
 
-  /**
-   * Attempts to match RegExp from head of input, returning undefined if not found
-   * @param {string} _.input - The string to match
-   * @param {TokenType} _.type - The type of token to match against
-   * @param {RegExp} _.regex - The regex to match
-   * @return {Token | undefined} - The matched token if found, otherwise undefined
-   */
+  // Attempts to match RegExp at current position in input
   private match({
-    input,
     type,
     regex,
     transform,
   }: {
-    input: string;
     type: TokenType;
     regex: RegExp;
     transform: (s: string) => string;
   }): Token | undefined {
-    const matches = input.match(regex);
+    regex.lastIndex = this.index;
+    const matches = this.input.match(regex);
     if (matches) {
+      // Advance current position by matched token length
+      this.index += matches[1].length;
       return {
         type,
         text: matches[1],
diff --git a/src/core/formatCommaPositions.ts b/src/core/formatCommaPositions.ts
@@ -1,7 +1,7 @@
 import type { CommaPosition } from 'src/types';
 import { maxLength } from 'src/utils';
 
-import { WHITESPACE_REGEX } from './Tokenizer';
+const WHITESPACE_REGEX = /^(\s+)/u;
 
 /**
  * Handles comma placement - either before, after or tabulated
diff --git a/src/core/regexFactory.ts b/src/core/regexFactory.ts
@@ -63,8 +63,8 @@ export const createOperatorRegex = (monadOperators: string, polyadOperators: str
  */
 export const createLineCommentRegex = (lineCommentTypes: string[]): RegExp =>
   new RegExp(
-    `^((?:${lineCommentTypes.map(c => escapeRegExp(c)).join('|')}).*?)(?:\r\n|\r|\n|$)`,
-    'u'
+    `((?:${lineCommentTypes.map(c => escapeRegExp(c)).join('|')}).*?)(?:\r\n|\r|\n|$)`,
+    'uy'
   );
 
 /**
@@ -75,7 +75,7 @@ export const createReservedWordRegex = (
   identChars: IdentChars = {}
 ): RegExp => {
   if (reservedKeywords.length === 0) {
-    return /^\b$/u;
+    return /\b$/uy;
   }
 
   const avoidIdentChars = rejectIdentCharsPattern(identChars);
@@ -84,7 +84,7 @@ export const createReservedWordRegex = (
     .join('|')
     .replace(/ /gu, '\\s+');
 
-  return new RegExp(`^(${reservedKeywordsPattern})${avoidIdentChars}\\b`, 'iu');
+  return new RegExp(`(${reservedKeywordsPattern})${avoidIdentChars}\\b`, 'iuy');
 };
 
 // Negative lookahead to avoid matching a keyword that's actually part of identifier,
@@ -192,4 +192,4 @@ export const createParameterRegex = (types: string[], pattern: string): RegExp |
   return patternToRegex(`(?:${typesRegex})(?:${pattern})`);
 };
 
-const patternToRegex = (pattern: string): RegExp => new RegExp('^(' + pattern + ')', 'u');
+const patternToRegex = (pattern: string): RegExp => new RegExp('(' + pattern + ')', 'uy');