[WIP] Add utf8proc_isequal_normalized

LB-- · LB-- · commit 5b31811f08bc · 2025-07-20T18:39:08.000+02:00
diff --git a/utf8proc.c b/utf8proc.c
@@ -822,3 +822,180 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8
     UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
   return retval;
 }
+
+/**
+* Helper function used by utf8proc_isequal_normalized.
+* Reads and decomposes characters from `state->str` into `state->buf` until one
+* with zero combining class (or the end of the string) has been consumed, then
+* sorts them by combining class and stores their count in `state->buf.len_used`.
+* If buf is not large enough, sets `state->buf.ptr` to NULL and calculates the
+* minimum length by processing the whole rest of the string instead.
+*/
+static void utf8proc_decompose_next_chars(utf8proc_processing_state_t *state, const utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
+) {
+  utf8proc_ssize_t buf_needed = 0, buf_needed_max = 1;
+  utf8proc_span32_t buf_remaining = state->buf;
+  int last_boundclass = 0;
+  state->error = 0;
+  while (state->str.len > 0) {
+    /* read a char from `state->str` and decompose it to `buf_remaining` */
+    utf8proc_int32_t c;
+    utf8proc_ssize_t str_consumed, buf_consumed;
+    str_consumed = utf8proc_iterate(state->str.ptr, state->str.len, &c);
+    if (str_consumed < 1) {
+      /* error or end of string */
+      state->error = str_consumed;
+      return;
+    } else if (str_consumed > state->str.len) {
+      /* string ends mid-way */
+      state->error = UTF8PROC_ERROR_INVALIDUTF8;
+      return;
+    }
+    if (custom_func) {
+      c = custom_func(c, custom_data);
+    }
+    /* successfully read from `state->str`, now time to decompose */
+    if (c < 0x80) {
+      /* fast path for common ASCII case */
+      last_boundclass = 0;
+      if (state->error == 0 && buf_remaining.len_available < 1) {
+        /* not enough space - from now on only measure the longest sequence */
+        buf_remaining.len_available = 0;
+        state->buf.ptr = buf_remaining.ptr = NULL;
+        state->error = UTF8PROC_ERROR_NOMEM;
+      }
+      /* consume and count the char on top of any combining chars already read */
+      state->str.ptr += str_consumed;
+      state->str.len -= str_consumed;
+      buf_needed += 1;
+      if (state->error != 0) {
+        /* measuring: ASCII has zero combining class, so the sequence ends here */
+        if (buf_needed > buf_needed_max) buf_needed_max = buf_needed;
+        buf_needed = 0;
+        continue;
+      }
+      if ((options & UTF8PROC_CASEFOLD) && 0x41 <= c && c <= 0x5A) {
+        *buf_remaining.ptr = c + 0x20;
+      } else {
+        *buf_remaining.ptr = c;
+      }
+      /* ASCII characters are all zero combining class */
+      break;
+    } else {
+      buf_consumed = utf8proc_decompose_char(c, buf_remaining.ptr, buf_remaining.len_available, options, &last_boundclass);
+      if (buf_consumed < 0) {
+        /* error */
+        state->error = buf_consumed;
+        return;
+      }
+      buf_needed += buf_consumed;
+      if (state->error == 0 && buf_consumed > buf_remaining.len_available) {
+        /* not enough space */
+        buf_remaining.len_available = 0;
+        state->buf.ptr = buf_remaining.ptr = NULL;
+        state->error = UTF8PROC_ERROR_NOMEM;
+      }
+    }
+    state->str.ptr += str_consumed;
+    state->str.len -= str_consumed;
+    if (buf_needed == 0) {
+      /* ignorable sequence - skip and try next */
+      continue;
+    }
+    if (state->error == 0) {
+      buf_remaining.ptr += buf_consumed;
+      buf_remaining.len_available -= buf_consumed;
+    }
+    /* decomposed chars must be sorted in ascending order of combining class,
+       which means we need to keep fetching chars until we get to non-combining.
+       NOTE(review): tests `c` itself, not its decomposition - verify for precomposed chars */
+    if (buf_consumed == 0 || state->str.len <= 0 || unsafe_get_property(c)->combining_class == 0) {
+      /* done decomposing this sequence */
+      if (state->error == 0) {
+        /* time to finish up and optionally sort it */
+        break;
+      }
+      /* else we're trying to find the longest decomposed sequence */
+      if (buf_needed > buf_needed_max) {
+        buf_needed_max = buf_needed;
+      }
+      /* reset for next sequence */
+      buf_needed = 0;
+    }
+  }
+  if (state->buf.ptr == NULL) {
+    state->buf.len_used = buf_needed_max;
+  } else {
+    state->buf.len_used = buf_needed;
+  }
+  if (buf_needed > 1 && state->error == 0 && buf_needed <= state->buf.len_available) {
+    /* sort by combining class (similar code is in utf8proc_decompose_custom implementation) */
+    utf8proc_ssize_t pos = 0;
+    const utf8proc_ssize_t second_to_last = buf_needed - 1;
+    while (pos < second_to_last) {
+      utf8proc_int32_t uc1, uc2;
+      const utf8proc_property_t *property1, *property2;
+      uc1 = state->buf.ptr[pos];
+      uc2 = state->buf.ptr[pos+1];
+      property1 = unsafe_get_property(uc1);
+      property2 = unsafe_get_property(uc2);
+      if (property1->combining_class > property2->combining_class &&
+          property2->combining_class > 0) {
+        state->buf.ptr[pos] = uc2;
+        state->buf.ptr[pos+1] = uc1;
+        if (pos > 0) pos--; else pos++;
+      } else {
+        pos++;
+      }
+    }
+  }
+}
+
+/* Resolves a negative (null-terminated) length to an explicit byte count. */
+static utf8proc_string8_view_t utf8proc_purify_strlen(utf8proc_string8_view_t str) {
+  utf8proc_ssize_t n = 0;
+  if (str.len >= 0) return str; /* length already known */
+  /* a NULL pointer counts as the empty string */
+  while (str.ptr != NULL && str.ptr[n] != '\0') ++n;
+  str.len = n;
+  return str;
+}
+
+/* See utf8proc.h: compares `a` and `b` one decomposed sequence at a time. */
+UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
+) {
+  a->str = utf8proc_purify_strlen(a->str);
+  b->str = utf8proc_purify_strlen(b->str);
+  /* comparison happens on decomposed code points, so never compose */
+  options = (utf8proc_option_t)((options & ~(unsigned int)UTF8PROC_COMPOSE)|UTF8PROC_DECOMPOSE);
+  while (1) {
+    /* remember where this sequence starts so it can be handed back to the caller */
+    const utf8proc_string8_view_t original_a = a->str;
+    const utf8proc_string8_view_t original_b = b->str;
+    utf8proc_ssize_t pos = 0;
+    if(a->str.len == 0 || b->str.len == 0) {
+      /* end of string */
+      return;
+    }
+    utf8proc_decompose_next_chars(a, options, custom_func, custom_data);
+    utf8proc_decompose_next_chars(b, options, custom_func, custom_data);
+    if (a->error == 0 && b->error == 0 && a->buf.len_used == b->buf.len_used) {
+      /* success - compare the work buffers for equality */
+      while (pos < a->buf.len_used && a->buf.ptr[pos] == b->buf.ptr[pos]) {
+        ++pos;
+      }
+      if (pos == a->buf.len_used) {
+        /* equal so far */
+        continue;
+      }
+    }
+    /* error or mismatch - restore the unprocessed strings so they start at the
+       culprit (otherwise a mismatch in the final sequence would leave both
+       strings empty, which is indistinguishable from equality) and exit */
+    a->str = original_a;
+    b->str = original_b;
+    return;
+  }
+}
diff --git a/utf8proc.h b/utf8proc.h
@@ -315,6 +315,54 @@ typedef struct utf8proc_property_struct {
   unsigned indic_conjunct_break:2;
 } utf8proc_property_t;
 
+/** Struct for a read-only view of a UTF-8 string. A negative len (e.g. -1) means null-terminated. */
+typedef struct utf8proc_string8_view_struct {
+  const utf8proc_uint8_t *ptr; /**< first byte of the string; NULL with a negative len is treated as empty */
+  utf8proc_ssize_t len; /**< length in bytes, or negative if `ptr` is null-terminated */
+} utf8proc_string8_view_t;
+
+/**
+* Read-write view of a buffer of Unicode code points. Both len_* fields must
+* be nonnegative, and len_used must never exceed len_available.
+*/
+typedef struct utf8proc_span32_struct {
+  utf8proc_int32_t *ptr;
+  utf8proc_ssize_t len_used;
+  utf8proc_ssize_t len_available;
+} utf8proc_span32_t;
+
+/**
+* Struct containing information about a string in processing.
+* Used for re-entrant processing algorithms such as
+* @ref utf8proc_isequal_normalized.
+*/
+typedef struct utf8proc_processing_state_struct {
+  /**
+  * The source UTF-8 string being processed; a negative `str.len` is replaced
+  * by the measured length when processing starts. When processing is halted
+  * for any reason, this is updated to the remainder of the string which hasn't
+  * yet been processed. It becomes an empty string once fully processed.
+  */
+  utf8proc_string8_view_t str;
+  /**
+  * A temporary work buffer used during processing, generally for holding
+  * and sorting combining characters. Depending on `str` contents, it may
+  * turn out to be too small: `error` is then UTF8PROC_ERROR_NOMEM, `buf.ptr`
+  * is overwritten with NULL (so keep your own copy of the pointer), and
+  * `buf.len_used` is set to the minimum required length (as far as can be
+  * known before other errors stop processing) while `buf.len_available` is
+  * left untouched for comparison. The contents of this buffer are arbitrary
+  * and do not need to be cleared nor preserved, so the space can be re-used
+  * across multiple processing attempts.
+  */
+  utf8proc_span32_t buf;
+  /**
+  * Any error from string processing (a UTF8PROC_ERROR_* code). This is set
+  * to 0 when processing starts and left untouched unless there is an error.
+  */
+  utf8proc_ssize_t error;
+} utf8proc_processing_state_t;
+
 /** Unicode categories. */
 typedef enum {
   UTF8PROC_CATEGORY_CN  = 0, /**< Other, not assigned */
@@ -787,6 +835,35 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
 UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str);
 /** @} */
 
+/**
+* Re-entrant algorithm for efficiently comparing two strings for equality
+* while ignoring differences in Unicode normalization. If a mismatch is
+* found or an error occurs, the `str` members are updated to the remainder
+* of each string, starting with the culprit(s). If the strings are equal,
+* both `str` members will become empty.
+*
+* As noted on @ref utf8proc_processing_state_t, the `buf.ptr` member will be
+* NULL if the provided buffer is too small; consult `buf.len_used` to see that
+* it is greater than `buf.len_available` and re-allocate as needed to satisfy
+* the requirement. The `buf` buffers must not overlap, but they may coexist
+* within a single memory allocation if desired; they just must be properly
+* aligned in accordance with the alignment requirements of utf8proc_int32_t.
+*
+* Because the `str` members are updated, you can resume comparing from where
+* you left off after growing the `buf` buffers or fixing erroneous UTF-8
+* sequences, without re-processing the parts already deemed equivalent.
+* This also lets you find the longest common starting sequence, or apply
+* your own methodology for skipping over invalid UTF-8 sequences.
+*
+* @param a,b Processing states of the two strings; `str` and `buf` are set by the caller.
+* @param options Processing options; UTF8PROC_COMPOSE is cleared and UTF8PROC_DECOMPOSE is set.
+* @param custom_func Optional, see @ref utf8proc_custom_func for info.
+* @param custom_data Passed unchanged to `custom_func` on every call.
+*/
+UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options,
+  utf8proc_custom_func custom_func, void *custom_data
+);
+
 #ifdef __cplusplus
 }
 #endif