Skip to content

Commit 803cd82

Browse files
authored
Optimizations for mb_trim (#12803)
* Fast path for when there is nothing to trim in mb_trim
* Make mb_trim decide between linear search vs hash table lookup

Using empirical experiments, I noticed that on my i7-4790 the hash table approach becomes faster once there are more than 4 code points in the trim characters, when evaluated on the worst case. This patch changes the logic so that a hash table is used for a large number of trim characters, and linear search is used when the number of trim characters is <= 4.
1 parent fc1b467 commit 803cd82

File tree

2 files changed

+49
-10
lines changed

2 files changed

+49
-10
lines changed

ext/mbstring/mbstring.c

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2951,12 +2951,21 @@ typedef enum {
29512951
MB_BOTH_TRIM = 3
29522952
} mb_trim_mode;
29532953

2954-
static zend_always_inline bool is_trim_wchar(uint32_t w, const HashTable *ht)
2954+
static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
29552955
{
2956-
return zend_hash_index_exists(ht, w);
2956+
if (ht) {
2957+
return zend_hash_index_exists(ht, w);
2958+
} else {
2959+
for (size_t i = 0; i < default_chars_length; i++) {
2960+
if (w == default_chars[i]) {
2961+
return true;
2962+
}
2963+
}
2964+
return false;
2965+
}
29572966
}
29582967

2959-
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, mb_trim_mode mode, const mbfl_encoding *enc)
2968+
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
29602969
{
29612970
unsigned char *in = (unsigned char*)ZSTR_VAL(str);
29622971
uint32_t wchar_buf[128];
@@ -2974,7 +2983,7 @@ static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht,
29742983

29752984
for (size_t i = 0; i < out_len; i++) {
29762985
uint32_t w = wchar_buf[i];
2977-
if (is_trim_wchar(w, what_ht)) {
2986+
if (is_trim_wchar(w, what_ht, default_chars, default_chars_length)) {
29782987
if (mode & MB_LTRIM) {
29792988
left += 1;
29802989
}
@@ -2990,6 +2999,9 @@ static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht,
29902999
}
29913000
}
29923001

3002+
if (left == 0 && right == 0) {
3003+
return zend_string_copy(str);
3004+
}
29933005
return mb_get_substr(str, left, total_len - (right + left), enc);
29943006
}
29953007

@@ -3012,7 +3024,7 @@ static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, c
30123024
for (size_t i = 0; i < trim_default_chars_length; i++) {
30133025
zend_hash_index_add_new(&what_ht, trim_default_chars[i], &val);
30143026
}
3015-
zend_string* retval = trim_each_wchar(str, &what_ht, mode, enc);
3027+
zend_string* retval = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
30163028
zend_hash_destroy(&what_ht);
30173029

30183030
return retval;
@@ -3027,18 +3039,32 @@ static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_t
30273039
size_t what_len = ZSTR_LEN(what);
30283040
HashTable what_ht;
30293041
zval val;
3030-
ZVAL_TRUE(&val);
3031-
zend_hash_init(&what_ht, what_len, NULL, NULL, false);
3042+
bool hash_initialized = false;
30323043

30333044
while (what_len) {
30343045
what_out_len = enc->to_wchar(&what_in, &what_len, what_wchar_buf, 128, &state);
30353046
ZEND_ASSERT(what_out_len <= 128);
3036-
for (size_t i = 0; i < what_out_len; i++) {
3037-
zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
3047+
3048+
if (what_out_len <= 4 && !hash_initialized) {
3049+
return trim_each_wchar(str, NULL, what_wchar_buf, what_out_len, mode, enc);
3050+
} else {
3051+
if (!hash_initialized) {
3052+
hash_initialized = true;
3053+
ZVAL_TRUE(&val);
3054+
zend_hash_init(&what_ht, what_len, NULL, NULL, false);
3055+
}
3056+
for (size_t i = 0; i < what_out_len; i++) {
3057+
zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
3058+
}
30383059
}
30393060
}
30403061

3041-
zend_string *retval = trim_each_wchar(str, &what_ht, mode, enc);
3062+
if (UNEXPECTED(!hash_initialized)) {
3063+
/* This is only possible if what is empty */
3064+
return zend_string_copy(str);
3065+
}
3066+
3067+
zend_string *retval = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
30423068
zend_hash_destroy(&what_ht);
30433069

30443070
return retval;

ext/mbstring/tests/mb_trim.phpt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,15 @@ var_dump(mb_trim(str_repeat(" ", 129)));
4040
var_dump(mb_trim(str_repeat(" ", 129) . "a"));
4141
var_dump(mb_rtrim(str_repeat(" ", 129) . "a"));
4242

43+
echo "== Very long trim characters ==\n";
44+
$trim_chars = "";
45+
for ($i = 1024; $i < 2048; $i++) {
46+
$trim_chars .= mb_chr($i);
47+
}
48+
var_dump(mb_trim($trim_chars . "hello" . $trim_chars, $trim_chars));
49+
var_dump(strlen(mb_ltrim($trim_chars . "hello" . $trim_chars, $trim_chars)));
50+
var_dump(strlen(mb_rtrim($trim_chars . "hello" . $trim_chars, $trim_chars)));
51+
4352
echo "== mb_ltrim ==\n";
4453
var_dump(mb_ltrim("あああああああああああああああああああああああああああああああああいああああ", ""));
4554
echo "== mb_rtrim ==\n";
@@ -103,6 +112,10 @@ string(26) " あいうおえお  a"
103112
string(0) ""
104113
string(1) "a"
105114
string(388) "                                                                                                                                 a"
115+
== Very long trim characters ==
116+
string(5) "hello"
117+
int(2053)
118+
int(2053)
106119
== mb_ltrim ==
107120
string(15) "いああああ"
108121
== mb_rtrim ==

0 commit comments

Comments
 (0)