Skip to content

Commit 5b31811

Browse files
committed
[WIP] Add utf8proc_isequal_normalized
1 parent 20d192a commit 5b31811

File tree

2 files changed

+254
-0
lines changed

2 files changed

+254
-0
lines changed

utf8proc.c

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -822,3 +822,180 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8
822822
UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
823823
return retval;
824824
}
825+
826+
/**
827+
* Helper function used by utf8proc_isequal_normalized.
828+
* Reads and sorts the next sequence of combining characters.
829+
* If buf is not large enough, calculates minimum length by processing
830+
* the whole rest of the string instead of just the next combining characters.
831+
*/
832+
static void utf8proc_decompose_next_chars(utf8proc_processing_state_t *state, const utf8proc_option_t options,
833+
utf8proc_custom_func custom_func, void *custom_data
834+
) {
835+
utf8proc_ssize_t buf_needed = 0, buf_needed_max = 1;
836+
utf8proc_span32_t buf_remaining = state->buf;
837+
int last_boundclass = 0;
838+
state->error = 0;
839+
while (state->str.len > 0) {
840+
/* read a char from `state->str` and decompose it to `buf_remaining` */
841+
utf8proc_int32_t c;
842+
utf8proc_ssize_t str_consumed, buf_consumed;
843+
str_consumed = utf8proc_iterate(state->str.ptr, state->str.len, &c);
844+
if (str_consumed < 1) {
845+
/* error or end of string */
846+
state->error = str_consumed;
847+
return;
848+
} else if (str_consumed > state->str.len) {
849+
/* string ends mid-way */
850+
state->error = UTF8PROC_ERROR_INVALIDUTF8;
851+
return;
852+
}
853+
if (custom_func) {
854+
c = custom_func(c, custom_data);
855+
}
856+
/* successfully read from `state->str`, now time to decompose */
857+
if (c < 0x80) {
858+
/* fast path for common ASCII case */
859+
last_boundclass = 0;
860+
if (state->error != 0) {
861+
/* just looking for the longest combining sequence, this isn't it */
862+
continue;
863+
}
864+
if (buf_remaining.len_available < 1) {
865+
/* not enough space */
866+
buf_remaining.len_available = 0;
867+
state->buf.ptr = buf_remaining.ptr = NULL;
868+
state->error = UTF8PROC_ERROR_NOMEM;
869+
/* now just looking for the longest combining sequence, this isn't it */
870+
continue;
871+
}
872+
/* success */
873+
buf_consumed = buf_needed = 1;
874+
if ((options & UTF8PROC_CASEFOLD) && 0x41 <= c && c <= 0x5A) {
875+
*buf_remaining.ptr = c + 0x20;
876+
} else {
877+
*buf_remaining.ptr = c;
878+
}
879+
state->str.ptr += str_consumed;
880+
state->str.len -= str_consumed;
881+
buf_remaining.ptr += 1;
882+
buf_remaining.len_available -= 1;
883+
/* ASCII characters are all zero combining class */
884+
break;
885+
} else {
886+
buf_consumed = utf8proc_decompose_char(c, buf_remaining.ptr, buf_remaining.len_available, options, &last_boundclass);
887+
if (buf_consumed < 0) {
888+
/* error */
889+
state->error = buf_consumed;
890+
return;
891+
}
892+
buf_needed += buf_consumed;
893+
if (state->error == 0 && buf_consumed > buf_remaining.len_available) {
894+
/* not enough space */
895+
buf_remaining.len_available = 0;
896+
state->buf.ptr = buf_remaining.ptr = NULL;
897+
state->error = UTF8PROC_ERROR_NOMEM;
898+
}
899+
}
900+
/* success */
901+
state->str.ptr += str_consumed;
902+
state->str.len -= str_consumed;
903+
if (buf_needed == 0) {
904+
/* ignorable sequence - skip and try next */
905+
continue;
906+
}
907+
if (state->error == 0) {
908+
buf_remaining.ptr += buf_consumed;
909+
buf_remaining.len_available -= buf_consumed;
910+
}
911+
/* decomposed chars must be sorted in ascending order of combining class,
912+
which means we need to keep fetching chars until we get to non-combining */
913+
if (buf_consumed == 0 || state->str.len <= 0 || unsafe_get_property(c)->combining_class == 0) {
914+
/* done decomposing this sequence */
915+
if (state->error == 0) {
916+
/* time to finish up and optionally sort it */
917+
break;
918+
}
919+
/* else we're trying to find the longest decomposed sequence */
920+
if (buf_needed > buf_needed_max) {
921+
buf_needed_max = buf_needed;
922+
}
923+
/* reset for next sequence */
924+
buf_needed = 0;
925+
}
926+
}
927+
if (state->buf.ptr == NULL) {
928+
state->buf.len_used = buf_needed_max;
929+
} else {
930+
state->buf.len_used = buf_needed;
931+
}
932+
if (buf_needed > 1 && state->error == 0 && buf_needed <= state->buf.len_available) {
933+
/* sort by combining class (similar code is in utf8proc_decompose_custom implementation) */
934+
utf8proc_ssize_t pos = 0;
935+
const utf8proc_ssize_t second_to_last = buf_needed - 1;
936+
while (pos < second_to_last) {
937+
utf8proc_int32_t uc1, uc2;
938+
const utf8proc_property_t *property1, *property2;
939+
uc1 = state->buf.ptr[pos];
940+
uc2 = state->buf.ptr[pos+1];
941+
property1 = unsafe_get_property(uc1);
942+
property2 = unsafe_get_property(uc2);
943+
if (property1->combining_class > property2->combining_class &&
944+
property2->combining_class > 0) {
945+
state->buf.ptr[pos] = uc2;
946+
state->buf.ptr[pos+1] = uc1;
947+
if (pos > 0) pos--; else pos++;
948+
} else {
949+
pos++;
950+
}
951+
}
952+
}
953+
}
954+
955+
/**
 * Returns `str` with an explicit length.
 * A view whose `len` is already nonnegative is returned unchanged.
 * A negative `len` is replaced by the number of bytes before the first
 * NUL byte, or by 0 when `ptr` is NULL.
 */
static utf8proc_string8_view_t utf8proc_purify_strlen(utf8proc_string8_view_t str) {
  utf8proc_ssize_t counted = 0;
  if (str.len >= 0) {
    /* length already known */
    return str;
  }
  if (str.ptr != NULL) {
    while (str.ptr[counted] != '\0') {
      ++counted;
    }
  }
  str.len = counted;
  return str;
}
964+
965+
UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options,
966+
utf8proc_custom_func custom_func, void *custom_data
967+
) {
968+
a->str = utf8proc_purify_strlen(a->str);
969+
b->str = utf8proc_purify_strlen(b->str);
970+
options = (utf8proc_option_t)((options & ~(unsigned int)UTF8PROC_COMPOSE)|UTF8PROC_DECOMPOSE);
971+
while (1) {
972+
const utf8proc_string8_view_t original_a = a->str;
973+
const utf8proc_string8_view_t original_b = b->str;
974+
if(a->str.len == 0 || b->str.len == 0) {
975+
/* end of string */
976+
return;
977+
}
978+
utf8proc_decompose_next_chars(a, options, custom_func, custom_data);
979+
utf8proc_decompose_next_chars(b, options, custom_func, custom_data);
980+
if (a->error == 0 && b->error == 0) {
981+
utf8proc_ssize_t pos;
982+
/* success - compare the work buffers for equality */
983+
if (a->buf.len_used != b->buf.len_used) {
984+
/* mismatch found */
985+
return;
986+
}
987+
for(pos = 0; pos < a->buf.len_used; ++pos) {
988+
if(a->buf.ptr[pos] != b->buf.ptr[pos]) {
989+
/* mismatch found */
990+
return;
991+
}
992+
}
993+
/* equal so far */
994+
continue;
995+
}
996+
/* error - restore unprocessed strings and exit */
997+
a->str = original_a;
998+
b->str = original_b;
999+
return;
1000+
}
1001+
}

utf8proc.h

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,54 @@ typedef struct utf8proc_property_struct {
315315
unsigned indic_conjunct_break:2;
316316
} utf8proc_property_t;
317317

318+
/** Struct for a read-only view of a UTF-8 string. A len of -1 means null-terminated. */
319+
typedef struct utf8proc_string8_view_struct {
320+
const utf8proc_uint8_t *ptr;
321+
utf8proc_ssize_t len;
322+
} utf8proc_string8_view_t;
323+
324+
/**
325+
* Struct for a read-write view of a buffer of Unicode code points.
326+
* The len_* fields must be nonnegative and len_available must always
327+
* be greater than or equal to len_used.
328+
*/
329+
typedef struct utf8proc_span32_struct {
330+
utf8proc_int32_t *ptr;
331+
utf8proc_ssize_t len_used, len_available;
332+
} utf8proc_span32_t;
333+
334+
/**
335+
* Struct containing information about a string in processing.
336+
* Used for re-entrant processing algorithms such as
337+
* @ref utf8proc_isequal_normalized
338+
*/
339+
typedef struct utf8proc_processing_state_struct {
340+
/**
341+
* The source UTF-8 string which is being processed.
342+
* When processing is halted for any reason, this will be updated
343+
* to the remainder of the string which hasn't yet been processed.
344+
* This will become an empty string once fully processed.
345+
*/
346+
utf8proc_string8_view_t str;
347+
/**
348+
* A temporary work buffer used during processing, generally
349+
* for holding and sorting combining characters. Depending
350+
* on `str` contents, this may need to be dynamically reallocated.
351+
* In that case, `buf.ptr` is set to NULL and `buf.len_used` is
352+
* updated to the minimum required length (as far as can be known
353+
* before other errors stop processing) while `buf.len_available`
354+
* is left untouched for comparison. The contents of this buffer
355+
* are arbitrary and do not need to be cleared nor preserved, so
356+
* the space can be re-used across multiple processing attempts.
357+
*/
358+
utf8proc_span32_t buf;
359+
/**
360+
* Any error from string processing. This is set to 0 when processing
361+
* starts and left untouched unless there is an error.
362+
*/
363+
utf8proc_ssize_t error;
364+
} utf8proc_processing_state_t;
365+
318366
/** Unicode categories. */
319367
typedef enum {
320368
UTF8PROC_CATEGORY_CN = 0, /**< Other, not assigned */
@@ -787,6 +835,35 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
787835
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str);
788836
/** @} */
789837

838+
/**
839+
* Re-entrant algorithm for efficiently comparing two strings for equality
840+
* while ignoring differences in Unicode normalization. If a mismatch is
841+
* found or an error occurs, the `str` members are updated to the remainder
842+
* of each string, starting with the culprit(s). If the strings are equal,
843+
* both `str` members will become empty.
844+
*
845+
* As noted on @ref utf8proc_processing_state_t the `buf.ptr` member will be
846+
* NULL if the provided buffer is too small, consult `buf.len_used` to see that
847+
* it is greater than `buf.len_available` and re-allocate as needed to satisfy
848+
* the requirement. The `buf` buffers must not overlap, but they may coexist
849+
 * within a single memory allocation if desired; they just must be properly
850+
* aligned in accordance with the alignment requirements of utf8proc_int32_t.
851+
*
852+
* Note also that because the `str` members are updated, you can resume comparing
853+
* the string from where you left off after growing the `buf` buffers or fixing
854+
* erroneous UTF-8 sequences, so there's no wasteful re-processing of parts of
855+
* the string that have already been deemed equivalent.
856+
*
857+
* You can also use this function for finding the longest common starting
858+
* sequence, and you can apply your own methodology for handling or skipping
859+
* over invalid UTF-8 sequences found in each provided string.
860+
*
861+
* @param custom_func Optional, see @ref utf8proc_custom_func for info.
862+
*/
863+
UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options,
864+
utf8proc_custom_func custom_func, void *custom_data
865+
);
866+
790867
#ifdef __cplusplus
791868
}
792869
#endif

0 commit comments

Comments
 (0)