Skip to content

Commit c354127

Browse files
committed
[Wasm] Add like_prefix() and like_suffix() optimizations.
The general `like()` computes a SQL LIKE using a dynamic programming approach in O(n*m) runtime, where `n` is the string length and `m` is the pattern length. However, for static patterns known at query compile time, we can introduce specializations. We now support prefix and suffix expressions, i.e. patterns of the form `.*%` and `%.*`, for which the actual search at query runtime can be performed in O(n).
1 parent bc4d48c commit c354127

File tree

2 files changed

+70
-0
lines changed

2 files changed

+70
-0
lines changed

src/backend/WasmUtil.cpp

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,14 @@ void ExprCompiler::operator()(const ast::BinaryExpr &e)
366366
set(like_contains(str, pattern));
367367
break;
368368
}
369+
if (std::regex_match(*pattern, std::regex("[^_%\\\\]+%"))) { // prefix expression
370+
set(like_prefix(str, pattern));
371+
break;
372+
}
373+
if (std::regex_match(*pattern, std::regex("%[^_%\\\\]+"))) { // suffix expression
374+
set(like_suffix(str, pattern));
375+
break;
376+
}
369377
}
370378
/* no specialization applicable, fallback to general dynamic programming approach */
371379
(*this)(*e.rhs);
@@ -3624,6 +3632,60 @@ _Boolx1 m::wasm::like_contains(NChar _str, const ThreadSafePooledString &_patter
36243632
}
36253633
}
36263634

3635+
/* Evaluates a SQL LIKE whose pattern is a literal prefix followed by a single `%`.  The check is rewritten into the
 * interval test `prefix <= str < successor`, where `successor` is the prefix with its last character incremented. */
_Boolx1 m::wasm::like_prefix(NChar str, const ThreadSafePooledString &pattern)
{
    M_insist(std::regex_match(*pattern, std::regex("[^_%\\\\]+%")), "invalid prefix pattern");

    const char *text = *pattern;
    const int32_t prefix_len = strlen(text) - 1; // the trailing `%` does not belong to the prefix
    const int32_t last = prefix_len - 1; // index of the last prefix character

    /*----- Lower bound: the prefix itself, NUL-terminated. -----*/
    auto lo_buf = Module::Allocator().raw_malloc<char>(prefix_len + 1);
    for (int32_t pos = 0; pos < prefix_len; ++pos)
        lo_buf[pos] = text[pos];
    lo_buf[prefix_len] = '\0';
    NChar lo(Ptr<Charx1>(lo_buf), false, prefix_len, true);

    /*----- Upper bound: the prefix with its last character incremented by one, NUL-terminated. -----*/
    auto hi_buf = Module::Allocator().raw_malloc<char>(prefix_len + 1);
    for (int32_t pos = 0; pos < last; ++pos)
        hi_buf[pos] = text[pos];
    /* NOTE(review): `+ 1` is not guarded against the last character already being the largest `char` value; the
     * resulting bound looks wrong in that case -- confirm whether such patterns can reach this function. */
    hi_buf[last] = text[last] + 1;
    hi_buf[prefix_len] = '\0';
    NChar hi(Ptr<Charx1>(hi_buf), false, prefix_len, true);

    /*----- The string matches iff it lies in the half-open interval [lo, hi). -----*/
    auto str_copy = str.clone(); // `str` is used in both comparisons, hence clone it for the first one
    return strcmp(str_copy, lo, GE) and strcmp(str, hi, LT);
}
3660+
3661+
/* Evaluates a SQL LIKE whose pattern is a single `%` followed by a literal suffix.  The check is rewritten into an
 * interval test on the reversed string: the lower bound is the suffix, the upper bound is the suffix with its first
 * character incremented, and both comparisons are performed by `strncmp()` in reversed mode. */
_Boolx1 m::wasm::like_suffix(NChar str, const ThreadSafePooledString &pattern)
{
    M_insist(std::regex_match(*pattern, std::regex("%[^_%\\\\]+")), "invalid suffix pattern");

    const char *text = *pattern;
    const char *suffix = text + 1; // skip the leading `%`
    const int32_t suffix_len = strlen(text) - 1; // the leading `%` does not belong to the suffix

    /*----- Lower bound: the suffix itself, NUL-terminated. -----*/
    auto lo_buf = Module::Allocator().raw_malloc<char>(suffix_len + 1);
    for (int32_t pos = 0; pos < suffix_len; ++pos)
        lo_buf[pos] = suffix[pos];
    lo_buf[suffix_len] = '\0';
    NChar lo(Ptr<Charx1>(lo_buf), false, suffix_len, true);

    /*----- Upper bound: the suffix with its first character incremented by one, NUL-terminated. -----*/
    auto hi_buf = Module::Allocator().raw_malloc<char>(suffix_len + 1);
    /* NOTE(review): `+ 1` is not guarded against the first character already being the largest `char` value; the
     * resulting bound looks wrong in that case -- confirm whether such patterns can reach this function. */
    hi_buf[0] = suffix[0] + 1;
    for (int32_t pos = 1; pos < suffix_len; ++pos)
        hi_buf[pos] = suffix[pos];
    hi_buf[suffix_len] = '\0';
    NChar hi(Ptr<Charx1>(hi_buf), false, suffix_len, true);

    /*----- The string matches iff, compared in reverse, it lies in the half-open interval [lo, hi). -----*/
    const auto cmp_len = std::max<uint32_t>(str.length(), suffix_len); // maximal length due to reversed strncmp
    auto str_copy = str.clone(); // `str` is used in both comparisons, hence clone it for the first one
    return strncmp(str_copy, lo, U32x1(cmp_len), GE, true) and
           strncmp(str, hi, U32x1(cmp_len), LT, true);
}
3688+
36273689

36283690
/*======================================================================================================================
36293691
* comparator

src/backend/WasmUtil.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1357,6 +1357,14 @@ _Boolx1 like(NChar str, NChar pattern, const char escape_char = '\\');
13571357
* Knuth–Morris–Pratt algorithm and represents a special case of the SQL LIKE in which the pattern is known at query
13581358
* compile time and has the form `%[^_%\\]+%`. */
13591359
_Boolx1 like_contains(NChar str, const ThreadSafePooledString &pattern);
1360+
/** Checks whether the string \p str starts with the prefix given by \p pattern, i.e. \p pattern without its trailing
 * `%`.  The implementation is based on rewriting to string comparisons against a lower and an upper bound derived
 * from the prefix.  It represents a special case of the SQL LIKE in which the pattern is known at query compile time
 * and has the form `[^_%\\]+%`. */
1363+
_Boolx1 like_prefix(NChar str, const ThreadSafePooledString &pattern);
1364+
/** Checks whether the string \p str ends with the suffix given by \p pattern, i.e. \p pattern without its leading
 * `%`.  The implementation is based on rewriting to reversed string comparisons against a lower and an upper bound
 * derived from the suffix.  It represents a special case of the SQL LIKE in which the pattern is known at query
 * compile time and has the form `%[^_%\\]+`. */
1367+
_Boolx1 like_suffix(NChar str, const ThreadSafePooledString &pattern);
13601368

13611369

13621370
/*======================================================================================================================

0 commit comments

Comments
 (0)