@@ -3,8 +3,20 @@ import { equalizeWhitespace, escapeRegExp, id } from 'src/utils';
3
3
import * as regexFactory from './regexFactory' ;
4
4
import { type Token , TokenType } from './token' ;
5
5
6
- export const WHITESPACE_REGEX = / ^ ( \s + ) / u;
7
- const NULL_REGEX = / (? ! ) / ; // zero-width negative lookahead, matches nothing
6
+ // A note about regular expressions
7
+ //
8
+ // We're using a sticky flag "y" in all tokenizing regexes.
9
+ // This works a bit like ^, anchoring the regex to the start,
10
+ // but whereas ^ anchors the regex to the start of the string (or line),
11
+ // the sticky flag anchors it to search start position, which we
12
+ // can change by setting RegExp.lastIndex.
13
+ //
14
+ // This allows us to avoid slicing off tokens from the start of input string
15
+ // (which we used in the past) and just move the match start position forward,
16
+ // which is much more performant on long strings.
17
+
18
// Sticky ("y") regexes only match at RegExp.lastIndex, so callers must set
// lastIndex to the desired position immediately before every use.

// Captures a run of one or more whitespace characters.
const WHITESPACE_REGEX = /(\s+)/uy;
// Empty negative lookahead: can never succeed, so this matches nothing at any position.
const NULL_REGEX = /(?!)/uy;
8
20
9
21
// Canonical form of a keyword: upper-cased text passed through equalizeWhitespace.
const toCanonicalKeyword = (text: string): string => {
  const upperCased = text.toUpperCase();
  return equalizeWhitespace(upperCased);
};
10
22
@@ -68,6 +80,10 @@ export default class Tokenizer {
68
80
// Regex for each token type; looked up by matchToken() and matchReservedToken()
private REGEX_MAP: Record<TokenType, RegExp>;
// Regex used by matchQuotedIdentToken()
private quotedIdentRegex: RegExp;
// Placeholder patterns tried in order by matchPlaceholderToken()
private paramPatterns: ParamPattern[];
// The SQL string currently being tokenized (set at the start of tokenize())
private input = '';
// Offset into `input` where the next match is attempted
private index = 0;

// Hook applied to the finished token list before tokenize() returns; identity by default
private preprocess = (tokens: Token[]): Token[] => tokens;
73
89
@@ -117,12 +133,12 @@ export default class Tokenizer {
117
133
] ) ,
118
134
[ TokenType . BLOCK_START ] : regexFactory . createParenRegex ( cfg . blockStart ?? [ '(' ] ) ,
119
135
[ TokenType . BLOCK_END ] : regexFactory . createParenRegex ( cfg . blockEnd ?? [ ')' ] ) ,
120
- [ TokenType . RESERVED_CASE_START ] : / ^ ( C A S E ) \b / iu ,
121
- [ TokenType . RESERVED_CASE_END ] : / ^ ( E N D ) \b / iu ,
136
+ [ TokenType . RESERVED_CASE_START ] : / ( C A S E ) \b / iuy ,
137
+ [ TokenType . RESERVED_CASE_END ] : / ( E N D ) \b / iuy ,
122
138
[ TokenType . LINE_COMMENT ] : regexFactory . createLineCommentRegex ( cfg . lineCommentTypes ?? [ '--' ] ) ,
123
- [ TokenType . BLOCK_COMMENT ] : / ^ ( \/ \* [ ^ ] * ?(?: \* \/ | $ ) ) / u ,
139
+ [ TokenType . BLOCK_COMMENT ] : / ( \/ \* [ ^ ] * ?(?: \* \/ | $ ) ) / uy ,
124
140
[ TokenType . NUMBER ] :
125
- / ^ ( 0 x [ 0 - 9 a - f A - F ] + | 0 b [ 0 1 ] + | ( - \s * ) ? [ 0 - 9 ] + ( \. [ 0 - 9 ] * ) ? ( [ e E ] [ - + ] ? [ 0 - 9 ] + ( \. [ 0 - 9 ] + ) ? ) ? ) / u ,
141
+ / ( 0 x [ 0 - 9 a - f A - F ] + | 0 b [ 0 1 ] + | ( - \s * ) ? [ 0 - 9 ] + ( \. [ 0 - 9 ] * ) ? ( [ e E ] [ - + ] ? [ 0 - 9 ] + ( \. [ 0 - 9 ] + ) ? ) ? ) / uy ,
126
142
[ TokenType . PARAMETER ] : NULL_REGEX , // matches nothing
127
143
[ TokenType . EOF ] : NULL_REGEX , // matches nothing
128
144
} ;
@@ -152,7 +168,7 @@ export default class Tokenizer {
152
168
} ,
153
169
{
154
170
// ? placeholders
155
- regex : cfg . positionalParams ? / ^ ( \? ) / : undefined ,
171
+ regex : cfg . positionalParams ? / ( \? ) / uy : undefined ,
156
172
parseKey : v => v . slice ( 1 ) ,
157
173
} ,
158
174
] ) ;
@@ -172,62 +188,61 @@ export default class Tokenizer {
172
188
* @returns {Token[] } output token stream
173
189
*/
174
190
public tokenize(input: string): Token[] {
  // Reset scanning state so the same Tokenizer instance can be reused.
  this.input = input;
  this.index = 0;

  const tokens: Token[] = [];
  let token: Token | undefined;

  // Keep processing the string until end is reached
  while (this.index < this.input.length) {
    // Grab any preceding whitespace (advances this.index past it)
    const whitespaceBefore = this.getWhitespace();

    // Only whitespace may have remained, in which case there is no token to read
    if (this.index < this.input.length) {
      // Get the next token; the previous one is passed along for context
      token = this.getNextToken(token);
      if (!token) {
        // slice() takes an absolute end index, so offset it by the current position
        // to show up to 100 characters of the unparseable input.
        throw new Error(`Parse error: Unexpected "${input.slice(this.index, this.index + 100)}"`);
      }

      tokens.push({ ...token, whitespaceBefore });
    }
  }
  return this.preprocess(tokens);
}
198
213
199
// Consumes whitespace at the current position and returns it ('' when there is none)
private getWhitespace(): string {
  WHITESPACE_REGEX.lastIndex = this.index;
  const found = WHITESPACE_REGEX.exec(this.input);
  if (!found) {
    return '';
  }

  const whitespace = found[1];
  // Advance current position by matched whitespace length
  this.index += whitespace.length;
  return whitespace;
}
204
225
205
// Tries each token matcher at the current position, in decreasing priority,
// and returns the first token produced (undefined when nothing matches).
private getNextToken(previousToken?: Token): Token | undefined {
  // Matchers yield either a Token object or undefined, so `??` selects
  // the first successful one.
  return (
    // comments
    this.matchToken(TokenType.LINE_COMMENT) ??
    this.matchToken(TokenType.BLOCK_COMMENT) ??
    // strings, quoted identifiers and variables
    this.matchToken(TokenType.STRING) ??
    this.matchQuotedIdentToken() ??
    this.matchToken(TokenType.VARIABLE) ??
    // block delimiters
    this.matchToken(TokenType.BLOCK_START) ??
    this.matchToken(TokenType.BLOCK_END) ??
    // placeholders and numbers
    this.matchPlaceholderToken() ??
    this.matchToken(TokenType.NUMBER) ??
    // reserved words come before plain identifiers
    this.matchReservedWordToken(previousToken) ??
    this.matchToken(TokenType.IDENT) ??
    this.matchToken(TokenType.OPERATOR)
  );
}
222
242
223
- /**
224
- * Attempts to match a placeholder token pattern
225
- * @return {Token | undefined } - The placeholder token if found, otherwise undefined
226
- */
227
- private matchPlaceholderToken ( input : string ) : Token | undefined {
243
+ private matchPlaceholderToken ( ) : Token | undefined {
228
244
for ( const { regex, parseKey } of this . paramPatterns ) {
229
245
const token = this . match ( {
230
- input,
231
246
regex,
232
247
type : TokenType . PARAMETER ,
233
248
transform : id ,
@@ -243,20 +258,15 @@ export default class Tokenizer {
243
258
return key . replace ( new RegExp ( escapeRegExp ( '\\' + quoteChar ) , 'gu' ) , quoteChar ) ;
244
259
}
245
260
246
// Matches a quoted identifier at the current position, producing an IDENT token
private matchQuotedIdentToken(): Token | undefined {
  const regex = this.quotedIdentRegex;
  return this.match({ type: TokenType.IDENT, regex, transform: id });
}
254
268
255
- /**
256
- * Attempts to match a Reserved word token pattern, avoiding edge cases of Reserved words within string tokens
257
- * @return {Token | undefined } - The Reserved word token if found, otherwise undefined
258
- */
259
- private matchReservedWordToken ( input : string , previousToken ?: Token ) : Token | undefined {
269
+ private matchReservedWordToken ( previousToken ?: Token ) : Token | undefined {
260
270
// A reserved word cannot be preceded by a '.'
261
271
// this makes it so in "mytable.from", "from" is not considered a reserved word
262
272
if ( previousToken ?. value === '.' ) {
@@ -265,57 +275,50 @@ export default class Tokenizer {
265
275
266
276
// prioritised list of Reserved token types
267
277
return (
268
- this . matchReservedToken ( TokenType . RESERVED_CASE_START , input ) ||
269
- this . matchReservedToken ( TokenType . RESERVED_CASE_END , input ) ||
270
- this . matchReservedToken ( TokenType . RESERVED_COMMAND , input ) ||
271
- this . matchReservedToken ( TokenType . RESERVED_BINARY_COMMAND , input ) ||
272
- this . matchReservedToken ( TokenType . RESERVED_DEPENDENT_CLAUSE , input ) ||
273
- this . matchReservedToken ( TokenType . RESERVED_LOGICAL_OPERATOR , input ) ||
274
- this . matchReservedToken ( TokenType . RESERVED_KEYWORD , input ) ||
275
- this . matchReservedToken ( TokenType . RESERVED_JOIN_CONDITION , input )
278
+ this . matchReservedToken ( TokenType . RESERVED_CASE_START ) ||
279
+ this . matchReservedToken ( TokenType . RESERVED_CASE_END ) ||
280
+ this . matchReservedToken ( TokenType . RESERVED_COMMAND ) ||
281
+ this . matchReservedToken ( TokenType . RESERVED_BINARY_COMMAND ) ||
282
+ this . matchReservedToken ( TokenType . RESERVED_DEPENDENT_CLAUSE ) ||
283
+ this . matchReservedToken ( TokenType . RESERVED_LOGICAL_OPERATOR ) ||
284
+ this . matchReservedToken ( TokenType . RESERVED_KEYWORD ) ||
285
+ this . matchReservedToken ( TokenType . RESERVED_JOIN_CONDITION )
276
286
) ;
277
287
}
278
288
279
289
// Matches a RESERVED_* token of the given type, supplying toCanonicalKeyword
// as the transform so the keyword ends up in canonical form
private matchReservedToken(tokenType: TokenType): Token | undefined {
  const regex = this.REGEX_MAP[tokenType];
  return this.match({ type: tokenType, regex, transform: toCanonicalKeyword });
}
288
297
289
298
// Convenience wrapper around `match`: takes the regex for the given token type
// from REGEX_MAP and uses `id` as the transform
private matchToken(tokenType: TokenType): Token | undefined {
  const regex = this.REGEX_MAP[tokenType];
  return this.match({ type: tokenType, regex, transform: id });
}
298
306
299
- /**
300
- * Attempts to match RegExp from head of input, returning undefined if not found
301
- * @param {string } _.input - The string to match
302
- * @param {TokenType } _.type - The type of token to match against
303
- * @param {RegExp } _.regex - The regex to match
304
- * @return {Token | undefined } - The matched token if found, otherwise undefined
305
- */
307
+ // Attempts to match RegExp at current position in input
306
308
private match ( {
307
- input,
308
309
type,
309
310
regex,
310
311
transform,
311
312
} : {
312
- input : string ;
313
313
type : TokenType ;
314
314
regex : RegExp ;
315
315
transform : ( s : string ) => string ;
316
316
} ) : Token | undefined {
317
- const matches = input . match ( regex ) ;
317
+ regex . lastIndex = this . index ;
318
+ const matches = this . input . match ( regex ) ;
318
319
if ( matches ) {
320
+ // Advance current position by matched token length
321
+ this . index += matches [ 1 ] . length ;
319
322
return {
320
323
type,
321
324
text : matches [ 1 ] ,
0 commit comments