diff --git a/ext/dom/html_document.c b/ext/dom/html_document.c
index 18c6c909ad672..da56f6cf5b6e2 100644
--- a/ext/dom/html_document.c
+++ b/ext/dom/html_document.c
@@ -27,6 +27,7 @@
#include "namespace_compat.h"
#include "private_data.h"
#include "dom_properties.h"
+#include "swar.h"
#include
#include
#include
@@ -512,6 +513,30 @@ static bool dom_process_parse_chunk(
return true;
}
+/* This seeks, using SWAR techniques, to the first non-ASCII byte in a UTF-8 input.
+ * Returns true if the entire input was consumed without encountering non-ASCII, false otherwise. */
+static zend_always_inline bool dom_seek_utf8_non_ascii(const lxb_char_t **data, const lxb_char_t *end)
+{
+ while (*data + sizeof(size_t) <= end) {
+ size_t bytes;
+ memcpy(&bytes, *data, sizeof(bytes));
+ /* If the top bit is set, it's not ASCII. */
+ if ((bytes & SWAR_REPEAT(0x80)) != 0) {
+ return false;
+ }
+ *data += sizeof(size_t);
+ }
+
+ while (*data < end) {
+ if (**data >= 0x80) { /* 0x80 itself is non-ASCII (UTF-8 continuation byte); must match the SWAR top-bit test above */
+ return false;
+ }
+ (*data)++;
+ }
+
+ return true;
+}
+
static bool dom_decode_encode_fast_path(
lexbor_libxml2_bridge_parse_context *ctx,
lxb_html_document_t *document,
@@ -527,13 +552,13 @@ static bool dom_decode_encode_fast_path(
const lxb_char_t *last_output = buf_ref;
while (buf_ref != buf_end) {
/* Fast path converts non-validated UTF-8 -> validated UTF-8 */
- if (decoding_encoding_ctx->decode.u.utf_8.need == 0 && *buf_ref < 0x80) {
+ if (decoding_encoding_ctx->decode.u.utf_8.need == 0) {
/* Fast path within the fast path: try to skip non-mb bytes in bulk if we are not in a state where we
- * need more UTF-8 bytes to complete a sequence.
- * It might be tempting to use SIMD here, but it turns out that this is less efficient because
- * we need to process the same byte multiple times sometimes when mixing ASCII with multibyte. */
- buf_ref++;
- continue;
+ * need more UTF-8 bytes to complete a sequence. */
+ if (dom_seek_utf8_non_ascii(&buf_ref, buf_end)) {
+ ZEND_ASSERT(buf_ref == buf_end);
+ break;
+ }
}
const lxb_char_t *buf_ref_backup = buf_ref;
lxb_codepoint_t codepoint = lxb_encoding_decode_utf_8_single(&decoding_encoding_ctx->decode, &buf_ref, buf_end);
diff --git a/ext/dom/lexbor/lexbor/html/tokenizer.c b/ext/dom/lexbor/lexbor/html/tokenizer.c
index 0bd9aec504f06..1eb2749e4aa3a 100644
--- a/ext/dom/lexbor/lexbor/html/tokenizer.c
+++ b/ext/dom/lexbor/lexbor/html/tokenizer.c
@@ -14,6 +14,7 @@
#define LXB_HTML_TAG_RES_DATA
#define LXB_HTML_TAG_RES_SHS_DATA
#include "lexbor/html/tag_res.h"
+#include "swar.h"
#define LXB_HTML_TKZ_TEMP_SIZE (4096 * 4)
@@ -304,6 +305,24 @@ lxb_html_tokenizer_begin(lxb_html_tokenizer_t *tkz)
return LXB_STATUS_OK;
}
+static inline size_t count_utf8_codepoints(size_t bytes)
+{
+ /* Count codepoints by counting bytes that are NOT UTF-8 continuation bytes (0b10xxxxxx):
+ * start from a full count of sizeof(size_t) bytes and subtract one per continuation byte.
+ * A continuation byte has bit7 == 1 and bit6 == 0; shifting the bit6 mask left by one aligns it with bit7 so "first & ~second" flags exactly those bytes. */
+ size_t firsts = bytes & SWAR_REPEAT(0b10000000);
+ size_t seconds = bytes & SWAR_REPEAT(0b01000000);
+ size_t matches = firsts & ~(seconds << 1); /* one set bit (bit7) per continuation byte */
+
+ size_t cnt = sizeof(size_t);
+ while (matches) {
+ matches &= matches - 1; /* clear lowest set bit (Kernighan popcount loop) */
+ cnt--;
+ }
+
+ return cnt;
+}
+
lxb_status_t
lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
size_t size)
@@ -315,8 +334,26 @@ lxb_html_tokenizer_chunk(lxb_html_tokenizer_t *tkz, const lxb_char_t *data,
tkz->last = end;
while (data < end) {
- size_t current_column = tkz->current_column;
const lxb_char_t *new_data = tkz->state(tkz, data, end);
+ size_t current_column = tkz->current_column;
+
+ if (SWAR_IS_LITTLE_ENDIAN) {
+ while (data + sizeof(size_t) <= new_data) {
+ size_t bytes;
+ memcpy(&bytes, data, sizeof(size_t));
+
+ size_t matches = SWAR_HAS_ZERO(bytes ^ SWAR_REPEAT(0x0A));
+ if (matches) {
+ data += (((matches - 1) & SWAR_ONES) * SWAR_ONES) >> (sizeof(size_t) * 8 - 8);
+ tkz->current_line++;
+ current_column = 0;
+ } else {
+ data += sizeof(size_t);
+ current_column += count_utf8_codepoints(bytes);
+ }
+ }
+ }
+
while (data < new_data) {
/* Codepoints < 0x80 are encoded the same as their ASCII counterpart, so '\n' will uniquely identify a newline. */
if (*data == '\n') {