From 506d360e54b5c539b503982dd422196a1b7ab48c Mon Sep 17 00:00:00 2001
From: LB <LB@MayContainCode.com>
Date: Sun, 27 Jul 2025 18:46:58 +0200
Subject: [PATCH] [WIP] Add utf8proc_isequal_normalized

---
 utf8proc.c | 324 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 utf8proc.h |  73 ++++++++++++
 2 files changed, 397 insertions(+)

diff --git a/utf8proc.c b/utf8proc.c
index 87cec942..91f0f0a8 100644
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -822,3 +822,327 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8
     UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
   return retval;
 }
+
+UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized_custom(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options,
+  utf8proc_custom_func a_custom_func, void *a_custom_data, utf8proc_custom_func b_custom_func, void *b_custom_data
+) {
+  const utf8proc_bool a_len_terminated = (a->str.len >= 0);
+  const utf8proc_bool b_len_terminated = (b->str.len >= 0);
+  /* which source string(s) we need to read more from */
+  utf8proc_bool a_consume = true;
+  utf8proc_bool b_consume = true;
+  /* structure to simplify rollback for combining char multipass processing */
+  const utf8proc_ssize_t decomposed_max_len = 8;
+  struct {
+    /* results of utf8proc_iterate */
+    utf8proc_int32_t codepoint;
+    utf8proc_ssize_t consumed;
+    /* results of utf8proc_decompose_char */
+    utf8proc_int32_t decomposed[8];
+    utf8proc_ssize_t decomposed_len;
+    int last_boundclass;
+    /* combing class tracking state */
+    utf8proc_ssize_t decomposed_pos;
+    utf8proc_propval_t combining_class;
+  } a_decomposing_current = {0},
+    b_decomposing_current = {0},
+    a_decomposing_combining_start = {0},
+    b_decomposing_combining_start = {0},
+    a_decomposing_combining_end = {0},
+    b_decomposing_combining_end = {0};
+  /* combining class tracking state */
+  utf8proc_ssize_t pos = 0;
+  utf8proc_bool combining_initialized = false;
+  utf8proc_propval_t combining_class_current = 0;
+  utf8proc_propval_t combining_class_next = 0;
+  utf8proc_string8_view_t a_combining_start = a->str;
+  utf8proc_string8_view_t b_combining_start = b->str;
+  utf8proc_string8_view_t a_combining_end = a->str;
+  utf8proc_string8_view_t b_combining_end = b->str;
+  utf8proc_uint8_t combining_classes_finished[(UTF8PROC_COMBINING_CLASS_MAX + 1 + CHAR_BIT)/CHAR_BIT] = {0};
+  const utf8proc_ssize_t combining_classes_finished_len = sizeof(combining_classes_finished)/sizeof(combining_classes_finished[0]);
+  /* initialize/clear error state */
+  a->error = 0;
+  b->error = 0;
+  a->str_at_error.ptr = NULL;
+  b->str_at_error.ptr = NULL;
+  a->str_at_error.len = 0;
+  b->str_at_error.len = 0;
+  /* force compatible options:
+     - must use UTF8PROC_DECOMPOSE, not UTF8PROC_COMPOSE.
+     - we choose when to add UTF8PROC_NULLTERM on a case-by-case basis (not needed currently).
+     - can't use UTF8PROC_CHARBOUND because it would break `unsafe_get_property`. */
+  options = (utf8proc_option_t)((options & ~(unsigned int)(UTF8PROC_COMPOSE|UTF8PROC_NULLTERM|UTF8PROC_CHARBOUND))|UTF8PROC_DECOMPOSE);
+  /* primary loop: each iteration pulls data from one or both strings */
+  while (1) {
+    /* read a code point from each - utf8proc_iterate handles null termination with negative length on its own */
+    if (a_consume) a_decomposing_current.consumed = utf8proc_iterate(a->str.ptr, a->str.len, &a_decomposing_current.codepoint);
+    if (b_consume) b_decomposing_current.consumed = utf8proc_iterate(b->str.ptr, b->str.len, &b_decomposing_current.codepoint);
+    /* check for errors, roll back string views if needed */
+    if (a_decomposing_current.consumed < 0) {
+      a->error = a_decomposing_current.consumed;
+      a->str_at_error = a->str;
+    }
+    if (b_decomposing_current.consumed < 0) {
+      b->error = b_decomposing_current.consumed;
+      b->str_at_error = b->str;
+    }
+    if (a->error || b->error) {
+      if (combining_initialized) {
+        a->str = a_combining_start;
+        b->str = b_combining_start;
+      }
+      return;
+    }
+    /* if we reach the end of one string, we may still need to process more
+       of the other due to ignorable sequences, and the combining class code
+       needs to make a judgement upon reaching the end of a combining sequence.
+       so from this point forward code must be guarded against this possibility. */
+    if (!combining_initialized && a_decomposing_current.consumed == 0 && b_decomposing_current.consumed == 0) {
+      /* true end of both strings, must be equal */
+      return;
+    }
+    /* apply each code point filter */
+    if (a_custom_func && a_consume && a_decomposing_current.consumed) a_decomposing_current.codepoint = a_custom_func(a_decomposing_current.codepoint, a_custom_data);
+    if (b_custom_func && b_consume && b_decomposing_current.consumed) b_decomposing_current.codepoint = b_custom_func(b_decomposing_current.codepoint, b_custom_data);
+    /* ASCII fast path is only suitable if we consumed both at once and not in combining mode */
+    if (!combining_initialized && a_consume && b_consume && a_decomposing_current.consumed && b_decomposing_current.consumed
+    && a_decomposing_current.codepoint < 0x80 && b_decomposing_current.codepoint < 0x80) {
+      /* fast path for common ASCII case */
+      if (options & UTF8PROC_CASEFOLD) {
+        if (0x41 <= a_decomposing_current.codepoint && a_decomposing_current.codepoint <= 0x5A) a_decomposing_current.codepoint += 0x20;
+        if (0x41 <= b_decomposing_current.codepoint && b_decomposing_current.codepoint <= 0x5A) b_decomposing_current.codepoint += 0x20;
+      }
+      if (a_decomposing_current.codepoint != b_decomposing_current.codepoint) {
+        /* mismatch detected */
+        return;
+      }
+      /* equal so far */
+      a->str.ptr += a_decomposing_current.consumed;
+      a->str.len -= a_decomposing_current.consumed * a_len_terminated;
+      b->str.ptr += b_decomposing_current.consumed;
+      b->str.len -= b_decomposing_current.consumed * b_len_terminated;
+      a_consume = true;
+      b_consume = true;
+      continue;
+    }
+    /* now time to decompose */
+    #define UTF8PROC_LAMBDA(ab) \
+    if (ab##_consume && ab##_decomposing_current.consumed) { \
+      /* we got a code point, decompose it */ \
+      ab##_decomposing_current.decomposed_len = utf8proc_decompose_char(ab##_decomposing_current.codepoint, \
+        ab##_decomposing_current.decomposed, decomposed_max_len, options, &ab##_decomposing_current.last_boundclass); \
+      ab##_decomposing_current.decomposed_pos = 0; \
+      ab##_consume = false; \
+      /* check for errors */ \
+      if (ab##_decomposing_current.decomposed_len < 0) { \
+        ab->error = ab##_decomposing_current.decomposed_len; \
+        ab->str_at_error = ab->str; \
+      } else if (ab##_decomposing_current.decomposed_len > decomposed_max_len) { \
+        /* should never happen in practice, just for static analysis. */ \
+        ab->error = UTF8PROC_ERROR_OVERFLOW; \
+        ab->str_at_error = ab->str; \
+      } else if (ab##_decomposing_current.decomposed_len == 0) { \
+        /* ignorable sequence, need to consume more */ \
+        ab->str.ptr += ab##_decomposing_current.consumed; \
+        ab->str.len -= ab##_decomposing_current.consumed * ab##_len_terminated; \
+        ab##_consume = true; \
+      } \
+    } else { \
+      ab##_consume = false; \
+    }
+    /* run the above for both strings */
+    UTF8PROC_LAMBDA(a);
+    UTF8PROC_LAMBDA(b);
+    #undef UTF8PROC_LAMBDA
+    /* check for errors, roll back string views if needed */
+    if (a->error || b->error) {
+      if (combining_initialized) {
+        a->str = a_combining_start;
+        b->str = b_combining_start;
+      }
+      return;
+    }
+    /* check for ignorable sequences */
+    if (a_consume || b_consume) {
+      continue;
+    }
+    /* now that ignorable sequences have been handled, check for end of either string */
+    if (!combining_initialized && (a_decomposing_current.consumed == 0 || b_decomposing_current.consumed == 0)) {
+      /* one or both strings ended, either equal or inequal */
+      return;
+    }
+    /* at this point both decomposed buffers need to be compared. when the
+       strings are fully normalized, the decomposed chars are sorted in
+       order of combining class, which could mean having to sort the entire
+       decomposed string in the worst case. since we only need to compare
+       them as-if they are normalized, we can just go one combining class
+       at a time. we have to be careful around ends of strings to make
+       sure the string views are properly updated to NOT FURTHER THAN the
+       first difference in the strings, which may be a large combining seq.
+       */
+    while (1) {
+      /* do we need to decompose more? */
+      if (a_decomposing_current.consumed && a_decomposing_current.decomposed_pos >= a_decomposing_current.decomposed_len) {
+        a_consume = true;
+        a->str.ptr += a_decomposing_current.consumed;
+        a->str.len -= a_decomposing_current.consumed * a_len_terminated;
+      }
+      if (b_decomposing_current.consumed && b_decomposing_current.decomposed_pos >= b_decomposing_current.decomposed_len) {
+        b_consume = true;
+        b->str.ptr += b_decomposing_current.consumed;
+        b->str.len -= b_decomposing_current.consumed * b_len_terminated;
+      }
+      if (a_consume || b_consume) {
+        /* use outer loop to pull more data */
+        break;
+      }
+      /* get the combining class of each current code point, or 0 for end of string */
+      if (a_decomposing_current.consumed) {
+        a_decomposing_current.combining_class = unsafe_get_property(a_decomposing_current.decomposed[a_decomposing_current.decomposed_pos])->combining_class;
+      } else {
+        a_decomposing_current.combining_class = 0;
+      }
+      if (b_decomposing_current.consumed) {
+        b_decomposing_current.combining_class = unsafe_get_property(b_decomposing_current.decomposed[b_decomposing_current.decomposed_pos])->combining_class;
+      } else {
+        b_decomposing_current.combining_class = 0;
+      }
+      /* static analysis guards, always false in practice */
+      if (a_decomposing_current.combining_class/CHAR_BIT >= combining_classes_finished_len) {
+        a->error = UTF8PROC_ERROR_OVERFLOW;
+        a->str_at_error = a->str;
+      }
+      if (b_decomposing_current.combining_class/CHAR_BIT >= combining_classes_finished_len) {
+        b->error = UTF8PROC_ERROR_OVERFLOW;
+        b->str_at_error = b->str;
+      }
+      if (a->error || b->error) {
+        if (combining_initialized) {
+          a->str = a_combining_start;
+          b->str = b_combining_start;
+        }
+        return;
+      }
+      /* do either have a combining class of 0 (non-combining)? */
+      if (a_decomposing_current.combining_class == 0 || b_decomposing_current.combining_class == 0) {
+        if (combining_initialized) {
+          /* we've reached the end of the combining sequence */
+          if (a_decomposing_current.combining_class == 0) {
+            a_combining_end = a->str;
+            a_decomposing_combining_end = a_decomposing_current;
+          }
+          if (b_decomposing_current.combining_class == 0) {
+            b_combining_end = b->str;
+            b_decomposing_combining_end = b_decomposing_current;
+          }
+          if (combining_class_next != 0) {
+            /* prepare for the next pass */
+            utf8proc_uint8_t *elem = &(combining_classes_finished[combining_class_current/CHAR_BIT]);
+            const utf8proc_uint8_t mask = (utf8proc_uint8_t)(1 << (combining_class_current % CHAR_BIT));
+            *elem |= mask;
+            combining_class_current = combining_class_next;
+            combining_class_next = 0;
+            /* roll back for next pass */
+            a->str = a_combining_start;
+            b->str = b_combining_start;
+            a_decomposing_current = a_decomposing_combining_start;
+            b_decomposing_current = b_decomposing_combining_start;
+            continue;
+          }
+          /* else exit combining mode */
+          if (a_combining_end.ptr == a_combining_start.ptr && a_decomposing_combining_end.decomposed_pos == a_decomposing_combining_start.decomposed_pos
+          ||  b_combining_end.ptr == b_combining_start.ptr && b_decomposing_combining_end.decomposed_pos == b_decomposing_combining_start.decomposed_pos) {
+            /* didn't reach the end of one of the sequences yet - mismatch detected */
+            a->str = a_combining_start;
+            b->str = b_combining_start;
+            return;
+          }
+          /* roll forward to the ends of the combining sequence */
+          a->str = a_combining_end;
+          b->str = b_combining_end;
+          a_decomposing_current = a_decomposing_combining_end;
+          b_decomposing_current = b_decomposing_combining_end;
+          /* resume normal processing in outer loop */
+          combining_initialized = false;
+          break;
+        }
+        /* else not in combining mode and at least one is non-combining */
+        if (a_decomposing_current.combining_class != b_decomposing_current.combining_class) {
+          /* mismatch detected */
+          return;
+        }
+        /* both are non-combining,compare the decomposed buffers */
+        if (a_decomposing_current.decomposed[a_decomposing_current.decomposed_pos] != b_decomposing_current.decomposed[b_decomposing_current.decomposed_pos]) {
+          /* mismatch detected */
+          return;
+        }
+        /* equal so far */
+        ++a_decomposing_current.decomposed_pos;
+        ++b_decomposing_current.decomposed_pos;
+        continue;
+      }
+      /* both nonzero combining class, initialize combining mode:
+         we go one combining class at a time, comparing the decomposed chars
+         of that class in order while consuming more from the input strings
+         as needed and noting the next class until we reach a non-combining
+         char. then, if there's another combining class, we roll back and
+         start from the beginning of the sequence again. */
+      if (!combining_initialized) {
+        combining_class_current = a_decomposing_current.combining_class;
+        combining_class_next = ((a_decomposing_current.combining_class == b_decomposing_current.combining_class)? 0 : b_decomposing_current.combining_class);
+        a_combining_start = a->str;
+        b_combining_start = b->str;
+        a_combining_end = a->str;
+        b_combining_end = b->str;
+        a_decomposing_combining_end = a_decomposing_combining_start = a_decomposing_current;
+        b_decomposing_combining_end = b_decomposing_combining_start = b_decomposing_current;
+        for (pos = 0; pos < combining_classes_finished_len; ++pos) {
+          combining_classes_finished[pos] = 0;
+        }
+        combining_initialized = true;
+      }
+      /* pull more data from one or both until we get both to be current class */
+      if (a_decomposing_current.combining_class != combining_class_current) {
+        /* is this an unseen class we can target next? */
+        if (combining_class_next == 0) {
+          const utf8proc_uint8_t elem = combining_classes_finished[a_decomposing_current.combining_class/CHAR_BIT];
+          const utf8proc_uint8_t mask = (utf8proc_uint8_t)(1 << (a_decomposing_current.combining_class % CHAR_BIT));
+          if ((elem & mask) == 0) {
+            combining_class_next = a_decomposing_current.combining_class;
+          }
+        }
+        ++a_decomposing_current.decomposed_pos;
+      }
+      if (b_decomposing_current.combining_class != combining_class_current) {
+        /* is this an unseen class we can target next? */
+        if (combining_class_next == 0) {
+          const utf8proc_uint8_t elem = combining_classes_finished[b_decomposing_current.combining_class/CHAR_BIT];
+          const utf8proc_uint8_t mask = (utf8proc_uint8_t)(1 << (b_decomposing_current.combining_class % CHAR_BIT));
+          if ((elem & mask) == 0) {
+            combining_class_next = b_decomposing_current.combining_class;
+          }
+        }
+        ++b_decomposing_current.decomposed_pos;
+      }
+      if (a_decomposing_current.combining_class != combining_class_current || b_decomposing_current.combining_class != combining_class_current) {
+        continue;
+      }
+      /* both are the current combining class, compare the decomposed buffers */
+      if (a_decomposing_current.decomposed[a_decomposing_current.decomposed_pos] != b_decomposing_current.decomposed[b_decomposing_current.decomposed_pos]) {
+        /* mismatch detected, roll back string views and exit */
+        a->str = a_combining_start;
+        b->str = b_combining_start;
+        return;
+      }
+      /* equal so far */
+      ++a_decomposing_current.decomposed_pos;
+      ++b_decomposing_current.decomposed_pos;
+      continue;
+    }
+  }
+}
+
+UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options) {
+  utf8proc_isequal_normalized_custom(a, b, options, NULL, NULL, NULL, NULL);
+}
diff --git a/utf8proc.h b/utf8proc.h
index 59e53893..abc79f77 100644
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -315,6 +315,48 @@ typedef struct utf8proc_property_struct {
   unsigned indic_conjunct_break:2;
 } utf8proc_property_t;
 
+/** Every `combining_class` in any valid `utf8proc_property_t` is nonnegative and less than or equal to this. */
+#define UTF8PROC_COMBINING_CLASS_MAX 255
+
+/** Struct for a read-only view of a UTF-8 string. A len of -1 means null-terminated. */
+typedef struct utf8proc_string8_view_struct {
+  const utf8proc_uint8_t *ptr;
+  utf8proc_ssize_t len;
+} utf8proc_string8_view_t;
+
+/**
+ * Struct for an in+out view of a string and an associated error.
+ * Use by @ref utf8proc_isequal_normalized to determine equivalence
+ * or longest common sequence, or to report an error and know which
+ * string had the error and where.
+ */
+typedef struct utf8proc_processing_state_struct {
+  /**
+   * The source UTF-8 string which is being read from.
+   * The contents of the string are never modified but it is expected
+   * that the view itself (pointer and length) will be updated to the
+   * remainder of the string which hasn't yet been successfully read.
+   * This will become an empty string once fully read.
+   */
+  utf8proc_string8_view_t str;
+  /**
+   * Any error from string processing. This is set to 0 when reading
+   * starts and left untouched unless there is an error. When set,
+   * the `str` member will be updated to no further than the source
+   * of the error, though there can still be some distance between
+   * the start of `str` and the actual source of the error depending
+   * on algorithm and string contents (e.g. combining chars). Use
+   * the `str_at_error` member instead to know the actual start
+   * of the problematic sequence.
+   */
+  utf8proc_ssize_t error;
+  /**
+   * The actual position of the error (if any) or an empty string
+   * otherwise.
+   */
+  utf8proc_string8_view_t str_at_error;
+} utf8proc_processing_state_t;
+
 /** Unicode categories. */
 typedef enum {
   UTF8PROC_CATEGORY_CN  = 0, /**< Other, not assigned */
@@ -787,6 +829,37 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str);
 /** @} */
 
+/**
+ * Algorithm for efficiently comparing two strings for equality while ignoring
+ * differences in Unicode normalization. If a mismatch is found or an error
+ * occurs, the `str` members are updated to the remainder of each string,
+ * not further than the first difference or error. If the strings are equal,
+ * both `str` members will become empty.
+ *
+ * You can also use this function for finding the longest common starting
+ * sequence, and you can apply your own methodology for handling or skipping
+ * over invalid UTF-8 sequences found in each provided string. Just be aware
+ * that due to how Unicode combining characters work, the error could be
+ * potentially quite far ahead of where the updated string views point. Use
+ * the `str_at_error` members to help with analyzing errors.
+ */
+UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options);
+
+/**
+ * Like utf8proc_isequal_normalized(), but also takes `custom_func` mapping
+ * functions that are called on each codepoint in their corresponding string
+ * before any other transformations (along with a `custom_data` pointer that
+ * is passed through to `custom_func`). The `custom_func` arguments are ignored
+ * if they are `NULL`. See @ref utf8proc_custom_func for more info.
+ * 
+ * Exercise caution: due to how Unicode combining characters work, the algorithm
+ * may backtrack and resume processing at an earlier point of the string without
+ * warning.
+ */
+UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized_custom(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options,
+  utf8proc_custom_func a_custom_func, void *a_custom_data, utf8proc_custom_func b_custom_func, void *b_custom_data
+);
+
 #ifdef __cplusplus
 }
 #endif