diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c
index 18c6c909ad672..da56f6cf5b6e2 100644
--- a/ext/dom/html_document.c
+++ b/ext/dom/html_document.c
@@ -27,6 +27,7 @@
 #include "namespace_compat.h"
 #include "private_data.h"
 #include "dom_properties.h"
+#include "swar.h"
 #include
 #include
 #include
@@ -512,6 +513,30 @@ static bool dom_process_parse_chunk(
 	return true;
 }
 
+/* This seeks, using SWAR techniques, to the first non-ASCII byte in a UTF-8 input.
+ * Returns true if the entire input was consumed without encountering non-ASCII, false otherwise. */
+static zend_always_inline bool dom_seek_utf8_non_ascii(const lxb_char_t **data, const lxb_char_t *end)
+{
+	while (*data + sizeof(size_t) <= end) {
+		size_t bytes;
+		memcpy(&bytes, *data, sizeof(bytes));
+		/* If the top bit is set, it's not ASCII. */
+		if ((bytes & SWAR_REPEAT(0x80)) != 0) {
+			return false;
+		}
+		*data += sizeof(size_t);
+	}
+
+	while (*data < end) {
+		/* NOTE(review): must be >= 0x80, not > 0x80 — the byte 0x80 is a UTF-8
+		 * continuation byte and is non-ASCII; this must match the SWAR top-bit test above. */
+		if (**data >= 0x80) {
+			return false;
+		}
+		(*data)++;
+	}
+
+	return true;
+}
+
 static bool dom_decode_encode_fast_path(
 	lexbor_libxml2_bridge_parse_context *ctx,
 	lxb_html_document_t *document,
@@ -527,13 +552,13 @@ static bool dom_decode_encode_fast_path(
 	const lxb_char_t *last_output = buf_ref;
 	while (buf_ref != buf_end) {
 		/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
-		if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) {
+		if (decoding_encoding_ctx->decode.u.utf_8.need == 0) {
 			/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
-			 * need more UTF-8 bytes to complete a sequence.
-			 * It might be tempting to use SIMD here, but it turns out that this is less efficient because
-			 * we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
-			buf_ref++;
-			continue;
+			 * need more UTF-8 bytes to complete a sequence. */
+			if (dom_seek_utf8_non_ascii(&buf_ref, buf_end)) {
+				ZEND_ASSERT(buf_ref == buf_end);
+				break;
+			}
 		}
 		const lxb_char_t *buf_ref_backup = buf_ref;
 		lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
diff --git a/ext/dom/lexbor/lexbor/html/tokenizer.c b/ext/dom/lexbor/lexbor/html/tokenizer.c
index 0bd9aec504f06..1eb2749e4aa3a 100644
--- a/ext/dom/lexbor/lexbor/html/tokenizer.c
+++ b/ext/dom/lexbor/lexbor/html/tokenizer.c
@@ -14,6 +14,7 @@
 #define LXB_HTML_TAG_RES_DATA
 #define LXB_HTML_TAG_RES_SHS_DATA
 #include "lexbor/html/tag_res.h"
+#include "swar.h"
 
 #define LXB_HTML_TKZ_TEMP_SIZE (4096 * 4)
 
@@ -304,6 +305,24 @@ lxb_html_tokenizer_begin(lxb_html_tokenizer_t *tkz)
     return LXB_STATUS_OK;
 }
 
+static inline size_t count_utf8_codepoints(size_t bytes)
+{
+    /* Top 2 bits must not be 10 to increase the count, or if starting from a full count: must be 10 to decrease the count.
+     * We can see that the first bit must not be 1 and second must be 0, i.e. not "first & ~second".
+     * We also have to shift to align the bits on top of each other. */
+    size_t firsts = bytes & SWAR_REPEAT(0b10000000);
+    size_t seconds = bytes & SWAR_REPEAT(0b01000000);
+    size_t matches = firsts & ~(seconds << 1);
+
+    size_t cnt = sizeof(size_t);
+    while (matches) {
+        matches &= matches - 1;
+        cnt--;
+    }
+
+    return cnt;
+}
+
 lxb_status_t
 lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
                          size_t size)
@@ -315,8 +334,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
     tkz->last = end;
 
     while (data < end) {
-        size_t current_column = tkz->current_column;
         const lxb_char_t *new_data = tkz->state(tkz, data, end);
+        size_t current_column = tkz->current_column;
+
+        if (SWAR_IS_LITTLE_ENDIAN) {
+            while (data + sizeof(size_t) <= new_data) {
+                size_t bytes;
+                memcpy(&bytes, data, sizeof(size_t));
+
+                size_t matches = SWAR_HAS_ZERO(bytes ^ SWAR_REPEAT(0x0A));
+                if (matches) {
+                    data += (((matches - 1) & SWAR_ONES) * SWAR_ONES) >> (sizeof(size_t) * 8 - 8);
+                    tkz->current_line++;
+                    current_column = 0;
+                } else {
+                    data += sizeof(size_t);
+                    current_column += count_utf8_codepoints(bytes);
+                }
+            }
+        }
+
        while (data < new_data) {
            /* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
            if (*data == '\n') {