Skip to content

Commit 1530fcc

Browse files
authored
Merge pull request #238 from sql-formatter-org/optimize-tokenizer
Attempt to optimize tokenizer
2 parents f44bf62 + 9f18506 commit 1530fcc

File tree

3 files changed

+78
-75
lines changed

3 files changed

+78
-75
lines changed

src/core/Tokenizer.ts

Lines changed: 72 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,20 @@ import { equalizeWhitespace, escapeRegExp, id } from 'src/utils';
33
import * as regexFactory from './regexFactory';
44
import { type Token, TokenType } from './token';
55

6-
export const WHITESPACE_REGEX = /^(\s+)/u;
7-
const NULL_REGEX = /(?!)/; // zero-width negative lookahead, matches nothing
6+
// A note about regular expressions
7+
//
8+
// We're using a sticky flag "y" in all tokenizing regexes.
9+
// This works a bit like ^, anchoring the regex to the start,
10+
// but whereas ^ anchors the regex to the start of string (or line),
11+
// the sticky flag anchors it to search start position, which we
12+
// can change by setting RegExp.lastIndex.
13+
//
14+
// This allows us to avoid slicing off tokens from the start of input string
15+
// (which we used in the past) and just move the match start position forward,
16+
// which is much more performant on long strings.
17+
18+
const WHITESPACE_REGEX = /(\s+)/uy;
19+
const NULL_REGEX = /(?!)/uy; // zero-width negative lookahead, matches nothing
820

921
const toCanonicalKeyword = (text: string) => equalizeWhitespace(text.toUpperCase());
1022

@@ -68,6 +80,10 @@ export default class Tokenizer {
6880
private REGEX_MAP: Record<TokenType, RegExp>;
6981
private quotedIdentRegex: RegExp;
7082
private paramPatterns: ParamPattern[];
83+
// The input SQL string to process
84+
private input = '';
85+
// Current position in string
86+
private index = 0;
7187

7288
private preprocess = (tokens: Token[]) => tokens;
7389

@@ -117,12 +133,12 @@ export default class Tokenizer {
117133
]),
118134
[TokenType.BLOCK_START]: regexFactory.createParenRegex(cfg.blockStart ?? ['(']),
119135
[TokenType.BLOCK_END]: regexFactory.createParenRegex(cfg.blockEnd ?? [')']),
120-
[TokenType.RESERVED_CASE_START]: /^(CASE)\b/iu,
121-
[TokenType.RESERVED_CASE_END]: /^(END)\b/iu,
136+
[TokenType.RESERVED_CASE_START]: /(CASE)\b/iuy,
137+
[TokenType.RESERVED_CASE_END]: /(END)\b/iuy,
122138
[TokenType.LINE_COMMENT]: regexFactory.createLineCommentRegex(cfg.lineCommentTypes ?? ['--']),
123-
[TokenType.BLOCK_COMMENT]: /^(\/\*[^]*?(?:\*\/|$))/u,
139+
[TokenType.BLOCK_COMMENT]: /(\/\*[^]*?(?:\*\/|$))/uy,
124140
[TokenType.NUMBER]:
125-
/^(0x[0-9a-fA-F]+|0b[01]+|(-\s*)?[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+(\.[0-9]+)?)?)/u,
141+
/(0x[0-9a-fA-F]+|0b[01]+|(-\s*)?[0-9]+(\.[0-9]*)?([eE][-+]?[0-9]+(\.[0-9]+)?)?)/uy,
126142
[TokenType.PARAMETER]: NULL_REGEX, // matches nothing
127143
[TokenType.EOF]: NULL_REGEX, // matches nothing
128144
};
@@ -152,7 +168,7 @@ export default class Tokenizer {
152168
},
153169
{
154170
// ? placeholders
155-
regex: cfg.positionalParams ? /^(\?)/ : undefined,
171+
regex: cfg.positionalParams ? /(\?)/uy : undefined,
156172
parseKey: v => v.slice(1),
157173
},
158174
]);
@@ -172,62 +188,61 @@ export default class Tokenizer {
172188
* @returns {Token[]} output token stream
173189
*/
174190
public tokenize(input: string): Token[] {
  // Reset the scanning state: the matching helpers read `this.input`
  // and move `this.index` forward instead of slicing the string.
  this.input = input;
  this.index = 0;
  const tokens: Token[] = [];
  let token: Token | undefined;

  // Keep processing the string until end is reached
  while (this.index < this.input.length) {
    // grab any preceding whitespace (this also advances this.index past it)
    const whitespaceBefore = this.getWhitespace();

    // Trailing whitespace may have consumed the rest of the input
    if (this.index < this.input.length) {
      // Get the next token and the token type; the previous token is
      // passed along so reserved-word matching can inspect it
      token = this.getNextToken(token);
      if (!token) {
        // String.prototype.slice takes an END INDEX, not a length, so the
        // 100-character preview has to be expressed relative to this.index.
        throw new Error(
          `Parse error: Unexpected "${input.slice(this.index, this.index + 100)}"`
        );
      }

      tokens.push({ ...token, whitespaceBefore });
    }
  }
  return this.preprocess(tokens);
}
198213

199-
/** Matches preceding whitespace if present */
200-
private getWhitespace(input: string): string {
201-
const matches = input.match(WHITESPACE_REGEX);
202-
return matches ? matches[1] : '';
214+
private getWhitespace(): string {
215+
WHITESPACE_REGEX.lastIndex = this.index;
216+
const matches = this.input.match(WHITESPACE_REGEX);
217+
if (matches) {
218+
// Advance current position by matched whitespace length
219+
this.index += matches[1].length;
220+
return matches[1];
221+
} else {
222+
return '';
223+
}
203224
}
204225

205-
/** Attempts to match next token from input string, tests RegExp patterns in decreasing priority */
206-
private getNextToken(input: string, previousToken?: Token): Token | undefined {
226+
private getNextToken(previousToken?: Token): Token | undefined {
207227
return (
208-
this.matchToken(TokenType.LINE_COMMENT, input) ||
209-
this.matchToken(TokenType.BLOCK_COMMENT, input) ||
210-
this.matchToken(TokenType.STRING, input) ||
211-
this.matchQuotedIdentToken(input) ||
212-
this.matchToken(TokenType.VARIABLE, input) ||
213-
this.matchToken(TokenType.BLOCK_START, input) ||
214-
this.matchToken(TokenType.BLOCK_END, input) ||
215-
this.matchPlaceholderToken(input) ||
216-
this.matchToken(TokenType.NUMBER, input) ||
217-
this.matchReservedWordToken(input, previousToken) ||
218-
this.matchToken(TokenType.IDENT, input) ||
219-
this.matchToken(TokenType.OPERATOR, input)
228+
this.matchToken(TokenType.LINE_COMMENT) ||
229+
this.matchToken(TokenType.BLOCK_COMMENT) ||
230+
this.matchToken(TokenType.STRING) ||
231+
this.matchQuotedIdentToken() ||
232+
this.matchToken(TokenType.VARIABLE) ||
233+
this.matchToken(TokenType.BLOCK_START) ||
234+
this.matchToken(TokenType.BLOCK_END) ||
235+
this.matchPlaceholderToken() ||
236+
this.matchToken(TokenType.NUMBER) ||
237+
this.matchReservedWordToken(previousToken) ||
238+
this.matchToken(TokenType.IDENT) ||
239+
this.matchToken(TokenType.OPERATOR)
220240
);
221241
}
222242

223-
/**
224-
* Attempts to match a placeholder token pattern
225-
* @return {Token | undefined} - The placeholder token if found, otherwise undefined
226-
*/
227-
private matchPlaceholderToken(input: string): Token | undefined {
243+
private matchPlaceholderToken(): Token | undefined {
228244
for (const { regex, parseKey } of this.paramPatterns) {
229245
const token = this.match({
230-
input,
231246
regex,
232247
type: TokenType.PARAMETER,
233248
transform: id,
@@ -243,20 +258,15 @@ export default class Tokenizer {
243258
return key.replace(new RegExp(escapeRegExp('\\' + quoteChar), 'gu'), quoteChar);
244259
}
245260

246-
private matchQuotedIdentToken(input: string): Token | undefined {
261+
private matchQuotedIdentToken(): Token | undefined {
247262
return this.match({
248-
input,
249263
regex: this.quotedIdentRegex,
250264
type: TokenType.IDENT,
251265
transform: id,
252266
});
253267
}
254268

255-
/**
256-
* Attempts to match a Reserved word token pattern, avoiding edge cases of Reserved words within string tokens
257-
* @return {Token | undefined} - The Reserved word token if found, otherwise undefined
258-
*/
259-
private matchReservedWordToken(input: string, previousToken?: Token): Token | undefined {
269+
private matchReservedWordToken(previousToken?: Token): Token | undefined {
260270
// A reserved word cannot be preceded by a '.'
261271
// this makes it so in "mytable.from", "from" is not considered a reserved word
262272
if (previousToken?.value === '.') {
@@ -265,57 +275,50 @@ export default class Tokenizer {
265275

266276
// prioritised list of Reserved token types
267277
return (
268-
this.matchReservedToken(TokenType.RESERVED_CASE_START, input) ||
269-
this.matchReservedToken(TokenType.RESERVED_CASE_END, input) ||
270-
this.matchReservedToken(TokenType.RESERVED_COMMAND, input) ||
271-
this.matchReservedToken(TokenType.RESERVED_BINARY_COMMAND, input) ||
272-
this.matchReservedToken(TokenType.RESERVED_DEPENDENT_CLAUSE, input) ||
273-
this.matchReservedToken(TokenType.RESERVED_LOGICAL_OPERATOR, input) ||
274-
this.matchReservedToken(TokenType.RESERVED_KEYWORD, input) ||
275-
this.matchReservedToken(TokenType.RESERVED_JOIN_CONDITION, input)
278+
this.matchReservedToken(TokenType.RESERVED_CASE_START) ||
279+
this.matchReservedToken(TokenType.RESERVED_CASE_END) ||
280+
this.matchReservedToken(TokenType.RESERVED_COMMAND) ||
281+
this.matchReservedToken(TokenType.RESERVED_BINARY_COMMAND) ||
282+
this.matchReservedToken(TokenType.RESERVED_DEPENDENT_CLAUSE) ||
283+
this.matchReservedToken(TokenType.RESERVED_LOGICAL_OPERATOR) ||
284+
this.matchReservedToken(TokenType.RESERVED_KEYWORD) ||
285+
this.matchReservedToken(TokenType.RESERVED_JOIN_CONDITION)
276286
);
277287
}
278288

279289
// Helper for matching RESERVED_* tokens which need to be transformed to canonical form
280-
private matchReservedToken(tokenType: TokenType, input: string): Token | undefined {
290+
private matchReservedToken(tokenType: TokenType): Token | undefined {
281291
return this.match({
282-
input,
283292
type: tokenType,
284293
regex: this.REGEX_MAP[tokenType],
285294
transform: toCanonicalKeyword,
286295
});
287296
}
288297

289298
// Shorthand for `match` that looks up regex from REGEX_MAP
290-
private matchToken(tokenType: TokenType, input: string): Token | undefined {
299+
private matchToken(tokenType: TokenType): Token | undefined {
291300
return this.match({
292-
input,
293301
type: tokenType,
294302
regex: this.REGEX_MAP[tokenType],
295303
transform: id,
296304
});
297305
}
298306

299-
/**
300-
* Attempts to match RegExp from head of input, returning undefined if not found
301-
* @param {string} _.input - The string to match
302-
* @param {TokenType} _.type - The type of token to match against
303-
* @param {RegExp} _.regex - The regex to match
304-
* @return {Token | undefined} - The matched token if found, otherwise undefined
305-
*/
307+
// Attempts to match RegExp at current position in input
306308
private match({
307-
input,
308309
type,
309310
regex,
310311
transform,
311312
}: {
312-
input: string;
313313
type: TokenType;
314314
regex: RegExp;
315315
transform: (s: string) => string;
316316
}): Token | undefined {
317-
const matches = input.match(regex);
317+
regex.lastIndex = this.index;
318+
const matches = this.input.match(regex);
318319
if (matches) {
320+
// Advance current position by matched token length
321+
this.index += matches[1].length;
319322
return {
320323
type,
321324
text: matches[1],

src/core/formatCommaPositions.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import type { CommaPosition } from 'src/types';
22
import { maxLength } from 'src/utils';
33

4-
import { WHITESPACE_REGEX } from './Tokenizer';
4+
const WHITESPACE_REGEX = /^(\s+)/u;
55

66
/**
77
* Handles comma placement - either before, after or tabulated

src/core/regexFactory.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ export const createOperatorRegex = (monadOperators: string, polyadOperators: str
6363
*/
6464
export const createLineCommentRegex = (lineCommentTypes: string[]): RegExp =>
6565
new RegExp(
66-
`^((?:${lineCommentTypes.map(c => escapeRegExp(c)).join('|')}).*?)(?:\r\n|\r|\n|$)`,
67-
'u'
66+
`((?:${lineCommentTypes.map(c => escapeRegExp(c)).join('|')}).*?)(?:\r\n|\r|\n|$)`,
67+
'uy'
6868
);
6969

7070
/**
@@ -75,7 +75,7 @@ export const createReservedWordRegex = (
7575
identChars: IdentChars = {}
7676
): RegExp => {
7777
if (reservedKeywords.length === 0) {
78-
return /^\b$/u;
78+
return /\b$/uy;
7979
}
8080

8181
const avoidIdentChars = rejectIdentCharsPattern(identChars);
@@ -84,7 +84,7 @@ export const createReservedWordRegex = (
8484
.join('|')
8585
.replace(/ /gu, '\\s+');
8686

87-
return new RegExp(`^(${reservedKeywordsPattern})${avoidIdentChars}\\b`, 'iu');
87+
return new RegExp(`(${reservedKeywordsPattern})${avoidIdentChars}\\b`, 'iuy');
8888
};
8989

9090
// Negative lookahead to avoid matching a keyword that's actually part of identifier,
@@ -192,4 +192,4 @@ export const createParameterRegex = (types: string[], pattern: string): RegExp |
192192
return patternToRegex(`(?:${typesRegex})(?:${pattern})`);
193193
};
194194

195-
const patternToRegex = (pattern: string): RegExp => new RegExp('^(' + pattern + ')', 'u');
195+
const patternToRegex = (pattern: string): RegExp => new RegExp('(' + pattern + ')', 'uy');

0 commit comments

Comments
 (0)