Skip to content

Commit 3d0ff43

Browse files
committed
[WIP] Add utf8proc_isequal_normalized
1 parent 20d192a commit 3d0ff43

File tree

2 files changed

+359
-0
lines changed

2 files changed

+359
-0
lines changed

utf8proc.c

Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -822,3 +822,289 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8
822822
UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
823823
return retval;
824824
}
825+
826+
/*
 * Compares the strings viewed by `a` and `b` code point by code point as if
 * both had been decomposed (UTF8PROC_DECOMPOSE is forced on, see below),
 * without building either normalized string.
 *
 * Parameters:
 *   a, b            in+out processing states. `str` is advanced past everything
 *                   that compared equal; `error` and `str_at_error` are reset
 *                   on entry and filled in if reading or decomposing fails.
 *   options         utf8proc options; UTF8PROC_COMPOSE, UTF8PROC_NULLTERM and
 *                   UTF8PROC_CHARBOUND are stripped and UTF8PROC_DECOMPOSE is
 *                   added before use.
 *   a_custom_func,  optional per-string code point mappers (may be NULL),
 *   b_custom_func   applied to every code point read, before decomposition.
 *   a_custom_data,  opaque pointers handed to the matching custom function.
 *   b_custom_data
 *
 * There is no return value: the outcome is read from the updated states. The
 * function returns as soon as a mismatch, an error, or the end of both strings
 * is found.
 *
 * A string view with a negative `len` is treated as null-terminated.
 */
UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized_custom(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options,
  utf8proc_custom_func a_custom_func, void *a_custom_data, utf8proc_custom_func b_custom_func, void *b_custom_data
) {
  /* a non-negative length means the view is length-terminated; these flags
     double as 0/1 multipliers so `len` is only decremented in that case */
  const utf8proc_bool a_len_terminated = (a->str.len >= 0);
  const utf8proc_bool b_len_terminated = (b->str.len >= 0);
  /* which source string(s) we need to read more from */
  utf8proc_bool a_consume = true;
  utf8proc_bool b_consume = true;
  /* results of utf8proc_iterate for each string */
  utf8proc_int32_t a_codepoint = 0;
  utf8proc_int32_t b_codepoint = 0;
  utf8proc_ssize_t a_consumed = 0;
  utf8proc_ssize_t b_consumed = 0;
  /* structure to simplify rollback for combining char multipass processing */
  const utf8proc_ssize_t decomposed_max_len = 8;
  struct {
    /* results of utf8proc_decompose_char for each string */
    utf8proc_int32_t a_decomposed[8];
    utf8proc_int32_t b_decomposed[8];
    utf8proc_ssize_t a_decomposed_len;
    utf8proc_ssize_t b_decomposed_len;
    int a_last_boundclass;
    int b_last_boundclass;
    /* read position within each decomposed buffer */
    utf8proc_ssize_t a_decomposed_pos;
    utf8proc_ssize_t b_decomposed_pos;
  } decomposing_current = {0}, decomposing_combining_start = {0};
  /* combining class tracking state for each string */
  utf8proc_propval_t a_combining_class = 0;
  utf8proc_propval_t b_combining_class = 0;
  utf8proc_ssize_t pos = 0;
  utf8proc_bool combining_initialized = false;
  utf8proc_propval_t combining_class_current = 0;
  utf8proc_propval_t combining_class_next = 0;
  utf8proc_string8_view_t a_combining_start = a->str;
  utf8proc_string8_view_t b_combining_start = b->str;
  /* bitset with one bit per combining class already compared in the current combining sequence */
  utf8proc_uint8_t combining_classes_finished[(UTF8PROC_COMBINING_CLASS_MAX + 1 + CHAR_BIT)/CHAR_BIT] = {0};
  const utf8proc_ssize_t combining_classes_finished_len = sizeof(combining_classes_finished)/sizeof(combining_classes_finished[0]);
  /* initialize/clear error state */
  a->error = 0;
  b->error = 0;
  a->str_at_error.ptr = NULL;
  b->str_at_error.ptr = NULL;
  a->str_at_error.len = 0;
  b->str_at_error.len = 0;
  /* force compatible options:
     - must use UTF8PROC_DECOMPOSE, not UTF8PROC_COMPOSE.
     - we choose when to add UTF8PROC_NULLTERM on a case-by-case basis (not needed currently).
     - can't use UTF8PROC_CHARBOUND because it would break `unsafe_get_property`. */
  options = (utf8proc_option_t)((options & ~(unsigned int)(UTF8PROC_COMPOSE|UTF8PROC_NULLTERM|UTF8PROC_CHARBOUND))|UTF8PROC_DECOMPOSE);
  /* primary loop: each iteration pulls data from one or both strings */
  while (a_consume || b_consume || combining_initialized) {
    /* read a code point from each. utf8proc_iterate returns 0 only for a zero
       length; with a negative length it decodes the NUL terminator as an
       ordinary one-byte code point, so the end of a null-terminated string
       has to be detected here or we would read past the terminator. */
    if (a_consume) {
      a_consumed = utf8proc_iterate(a->str.ptr, a->str.len, &a_codepoint);
      if (!a_len_terminated && a_consumed > 0 && a_codepoint == 0) a_consumed = 0;
    }
    if (b_consume) {
      b_consumed = utf8proc_iterate(b->str.ptr, b->str.len, &b_codepoint);
      if (!b_len_terminated && b_consumed > 0 && b_codepoint == 0) b_consumed = 0;
    }
    /* check for errors, roll back string views if needed */
    if (a_consumed < 0) {
      a->error = a_consumed;
      a->str_at_error = a->str;
    }
    if (b_consumed < 0) {
      b->error = b_consumed;
      b->str_at_error = b->str;
    }
    if (a->error || b->error) {
      if (combining_initialized) {
        a->str = a_combining_start;
        b->str = b_combining_start;
      }
      return;
    }
    /* if we reach the end of one string, we may still need to process more
       of the other due to ignorable sequences, and the combining class code
       needs to make a judgement upon reaching the end of a combining sequence.
       so from this point forward code must be guarded against this possibility. */
    if (!combining_initialized && a_consumed == 0 && b_consumed == 0) {
      /* true end of both strings, must be equal */
      return;
    }
    /* apply each code point filter */
    if (a_custom_func && a_consume && a_consumed) a_codepoint = a_custom_func(a_codepoint, a_custom_data);
    if (b_custom_func && b_consume && b_consumed) b_codepoint = b_custom_func(b_codepoint, b_custom_data);
    /* ASCII fast path is only suitable if we consumed both at once and not in combining mode */
    if (!combining_initialized && a_consume && b_consume && a_consumed && b_consumed && a_codepoint < 0x80 && b_codepoint < 0x80) {
      /* fast path for common ASCII case */
      if (options & UTF8PROC_CASEFOLD) {
        /* fold 'A'..'Z' onto 'a'..'z' */
        if (0x41 <= a_codepoint && a_codepoint <= 0x5A) a_codepoint += 0x20;
        if (0x41 <= b_codepoint && b_codepoint <= 0x5A) b_codepoint += 0x20;
      }
      if (a_codepoint != b_codepoint) {
        /* mismatch detected */
        return;
      }
      /* equal so far */
      a->str.ptr += a_consumed;
      a->str.len -= a_consumed * a_len_terminated;
      b->str.ptr += b_consumed;
      b->str.len -= b_consumed * b_len_terminated;
      a_consume = true;
      b_consume = true;
      continue;
    }
    /* now time to decompose */
#define UTF8PROC_LAMBDA(ab) \
    if (ab##_consume && ab##_consumed) { \
      /* we got a code point, decompose it */ \
      decomposing_current.ab##_decomposed_len = utf8proc_decompose_char(ab##_codepoint, decomposing_current.ab##_decomposed, decomposed_max_len, options, &decomposing_current.ab##_last_boundclass); \
      decomposing_current.ab##_decomposed_pos = 0; \
      ab##_consume = false; \
      /* check for errors */ \
      if (decomposing_current.ab##_decomposed_len < 0) { \
        ab->error = decomposing_current.ab##_decomposed_len; \
        ab->str_at_error = ab->str; \
      } else if (decomposing_current.ab##_decomposed_len > decomposed_max_len) { \
        /* should never happen in practice, just for static analysis. */ \
        ab->error = UTF8PROC_ERROR_OVERFLOW; \
        ab->str_at_error = ab->str; \
      } else if (decomposing_current.ab##_decomposed_len == 0) { \
        /* ignorable sequence, need to consume more */ \
        ab->str.ptr += ab##_consumed; \
        ab->str.len -= ab##_consumed * ab##_len_terminated; \
        ab##_consume = true; \
      } \
    } else { \
      ab##_consume = false; \
    }
    /* run the above for both strings */
    UTF8PROC_LAMBDA(a);
    UTF8PROC_LAMBDA(b);
#undef UTF8PROC_LAMBDA
    /* check for errors, roll back string views if needed */
    if (a->error || b->error) {
      if (combining_initialized) {
        a->str = a_combining_start;
        b->str = b_combining_start;
      }
      return;
    }
    /* check for ignorable sequences */
    if (a_consume || b_consume) {
      continue;
    }
    /* now that ignorable sequences have been handled, check for end of either string */
    if (!combining_initialized && (a_consumed == 0 || b_consumed == 0)) {
      /* one or both strings ended, either equal or inequal */
      return;
    }
    /* at this point both decomposed buffers need to be compared. when the
       strings are fully normalized, the decomposed chars are sorted in
       order of combining class, which could mean having to sort the entire
       decomposed string in the worst case. since we only need to compare
       them as-if they are normalized, we can just go one combining class
       at a time. we have to be careful around ends of strings to make
       sure the string views are properly updated to NOT FURTHER THAN the
       first difference in the strings, which may be a large combining seq.
    */
    /* NOTE(review): if one string has ended (consumed == 0) while
       combining_initialized is true, the ended side appears to be re-flagged
       for consumption on every pass without either side making progress.
       confirm this terminates, e.g. when only one string has a further
       trailing combining mark. */
    while (!a_consume && !b_consume) {
      /* do we need to decompose more? */
      if (decomposing_current.a_decomposed_pos >= decomposing_current.a_decomposed_len) {
        a_consume = true;
        a->str.ptr += a_consumed;
        a->str.len -= a_consumed * a_len_terminated;
      }
      if (decomposing_current.b_decomposed_pos >= decomposing_current.b_decomposed_len) {
        b_consume = true;
        b->str.ptr += b_consumed;
        b->str.len -= b_consumed * b_len_terminated;
      }
      if (a_consume || b_consume) {
        /* re-tests the inner loop condition, which now fails, handing control back to the primary loop */
        continue;
      }
      /* get the combining class of each current code point */
      a_combining_class = unsafe_get_property(decomposing_current.a_decomposed[decomposing_current.a_decomposed_pos])->combining_class;
      b_combining_class = unsafe_get_property(decomposing_current.b_decomposed[decomposing_current.b_decomposed_pos])->combining_class;
      /* static analysis guards, always false in practice */
      if (a_combining_class/CHAR_BIT >= combining_classes_finished_len) {
        a->error = UTF8PROC_ERROR_OVERFLOW;
        a->str_at_error = a->str;
      }
      if (b_combining_class/CHAR_BIT >= combining_classes_finished_len) {
        b->error = UTF8PROC_ERROR_OVERFLOW;
        b->str_at_error = b->str;
      }
      if (a->error || b->error) {
        if (combining_initialized) {
          a->str = a_combining_start;
          b->str = b_combining_start;
        }
        return;
      }
      /* do either have a combining class of 0 (non-combining)? */
      if (a_combining_class == 0 || b_combining_class == 0) {
        if (combining_initialized) {
          /* we've reached the end of the combining sequence */
          if (combining_class_next != 0) {
            /* prepare for the next pass: mark the current class as finished */
            utf8proc_uint8_t *elem = &(combining_classes_finished[combining_class_current/CHAR_BIT]);
            const utf8proc_uint8_t mask = (utf8proc_uint8_t)(1 << (combining_class_current % CHAR_BIT));
            *elem |= mask;
            combining_class_current = combining_class_next;
            combining_class_next = 0;
            /* NOTE(review): both sides are flagged for re-reading from the
               saved views, so the buffers restored here look like they get
               overwritten by the next decomposition. confirm the behaviour
               when the saved position was in the middle of a decomposed
               buffer. */
            a->str = a_combining_start;
            b->str = b_combining_start;
            decomposing_current = decomposing_combining_start;
            a_consume = true;
            b_consume = true;
            continue;
          }
          /* else exit combining mode and carry on as normal */
          combining_initialized = false;
        }
        if (a_combining_class != b_combining_class) {
          /* mismatch detected */
          return;
        }
        if (decomposing_current.a_decomposed[decomposing_current.a_decomposed_pos] != decomposing_current.b_decomposed[decomposing_current.b_decomposed_pos]) {
          /* mismatch detected */
          return;
        }
        /* equal so far */
        ++decomposing_current.a_decomposed_pos;
        ++decomposing_current.b_decomposed_pos;
        continue;
      }
      /* both nonzero combining class, initialize combining mode:
         we go one combining class at a time, comparing the decomposed chars
         of that class in order while consuming more from the input strings
         as needed and noting the next class until we reach a non-combining
         char. then, if there's another combining class, we roll back and
         start from the beginning of the sequence again. */
      if (!combining_initialized) {
        combining_class_current = a_combining_class;
        combining_class_next = ((a_combining_class == b_combining_class)? 0 : b_combining_class);
        a_combining_start = a->str;
        b_combining_start = b->str;
        decomposing_combining_start = decomposing_current;
        for (pos = 0; pos < combining_classes_finished_len; ++pos) {
          combining_classes_finished[pos] = 0;
        }
        combining_initialized = true;
      }
      /* pull more data from one or both until we get both to be current class */
      if (a_combining_class != combining_class_current) {
        /* is this an unseen class we can target next? */
        if (combining_class_next == 0) {
          const utf8proc_uint8_t elem = combining_classes_finished[a_combining_class/CHAR_BIT];
          const utf8proc_uint8_t mask = (utf8proc_uint8_t)(1 << (a_combining_class % CHAR_BIT));
          if ((elem & mask) == 0) {
            combining_class_next = a_combining_class;
          }
        }
        /* skip this char for now; it belongs to a different pass */
        ++decomposing_current.a_decomposed_pos;
      }
      if (b_combining_class != combining_class_current) {
        /* is this an unseen class we can target next? */
        if (combining_class_next == 0) {
          const utf8proc_uint8_t elem = combining_classes_finished[b_combining_class/CHAR_BIT];
          const utf8proc_uint8_t mask = (utf8proc_uint8_t)(1 << (b_combining_class % CHAR_BIT));
          if ((elem & mask) == 0) {
            combining_class_next = b_combining_class;
          }
        }
        /* skip this char for now; it belongs to a different pass */
        ++decomposing_current.b_decomposed_pos;
      }
      if (a_combining_class != combining_class_current || b_combining_class != combining_class_current) {
        continue;
      }
      /* both are the current combining class, compare the decomposed buffers */
      if (decomposing_current.a_decomposed[decomposing_current.a_decomposed_pos] != decomposing_current.b_decomposed[decomposing_current.b_decomposed_pos]) {
        /* mismatch detected, roll back string views and exit */
        a->str = a_combining_start;
        b->str = b_combining_start;
        return;
      }
      /* equal so far */
      ++decomposing_current.a_decomposed_pos;
      ++decomposing_current.b_decomposed_pos;
      continue;
    }
  }
}
1107+
1108+
/* Convenience wrapper: forwards to utf8proc_isequal_normalized_custom with no
   custom mapping function or custom data for either string. Results are
   reported through the `a` and `b` processing states; nothing is returned. */
UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options) {
1109+
utf8proc_isequal_normalized_custom(a, b, options, NULL, NULL, NULL, NULL);
1110+
}

utf8proc.h

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,48 @@ typedef struct utf8proc_property_struct {
315315
unsigned indic_conjunct_break:2;
316316
} utf8proc_property_t;
317317

318+
/** Every `combining_class` in any valid `utf8proc_property_t` is nonnegative and less than or equal to this. */
319+
#define UTF8PROC_COMBINING_CLASS_MAX 255
320+
321+
/** Struct for a read-only view of a UTF-8 string. A negative len (conventionally -1) means null-terminated. */
322+
typedef struct utf8proc_string8_view_struct {
323+
/** Pointer to the first byte of the viewed string; the bytes are never written through this view. */
const utf8proc_uint8_t *ptr;
324+
/** Number of bytes in the view when non-negative; any negative value marks the string as null-terminated. */
utf8proc_ssize_t len;
325+
} utf8proc_string8_view_t;
326+
327+
/**
328+
* Struct for an in+out view of a string and an associated error.
329+
* Used by @ref utf8proc_isequal_normalized to determine equivalence
330+
* or longest common sequence, or to report an error and know which
331+
* string had the error and where.
332+
*/
333+
typedef struct utf8proc_processing_state_struct {
334+
/**
335+
* The source UTF-8 string which is being read from.
336+
* The contents of the string are never modified but it is expected
337+
* that the view itself (pointer and length) will be updated to the
338+
* remainder of the string which hasn't yet been successfully read.
339+
* This will become an empty string once fully read.
340+
*/
341+
utf8proc_string8_view_t str;
342+
/**
343+
* Any error from string processing. This is set to 0 when reading
344+
* starts and left untouched unless there is an error. When set,
345+
* the `str` member will be updated to no further than the source
346+
* of the error, though there can still be some distance between
347+
* the start of `str` and the actual source of the error depending
348+
* on algorithm and string contents (e.g. combining chars). Use
349+
* the `str_at_error` member instead to know the actual start
350+
* of the problematic sequence.
* Any nonzero value means an error occurred.
351+
*/
352+
utf8proc_ssize_t error;
353+
/**
354+
* The actual position of the error (if any) or an empty string
355+
* otherwise.
* When no error occurred, `ptr` is NULL and `len` is 0.
356+
*/
357+
utf8proc_string8_view_t str_at_error;
358+
} utf8proc_processing_state_t;
359+
318360
/** Unicode categories. */
319361
typedef enum {
320362
UTF8PROC_CATEGORY_CN = 0, /**< Other, not assigned */
@@ -787,6 +829,37 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str);
787829
UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str);
788830
/** @} */
789831

832+
/**
833+
* Algorithm for efficiently comparing two strings for equality while ignoring
834+
* differences in Unicode normalization. If a mismatch is found or an error
835+
* occurs, the `str` members are updated to the remainder of each string,
836+
* not further than the first difference or error. If the strings are equal,
837+
* both `str` members will become empty.
838+
*
839+
* You can also use this function for finding the longest common starting
840+
* sequence, and you can apply your own methodology for handling or skipping
841+
* over invalid UTF-8 sequences found in each provided string. Just be aware
842+
* that due to how Unicode combining characters work, the error could be
843+
* potentially quite far ahead of where the updated string views point. Use
844+
* the `str_at_error` members to help with analyzing errors.
845+
*/
846+
UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options);
847+
848+
/**
849+
* Like utf8proc_isequal_normalized(), but also takes `custom_func` mapping
850+
* functions that are called on each codepoint in their corresponding string
851+
* before any other transformations (along with a `custom_data` pointer that
852+
* is passed through to `custom_func`). The `custom_func` arguments are ignored
853+
* if they are `NULL`. See @ref utf8proc_custom_func for more info.
854+
*
855+
* Exercise caution: due to how Unicode combining characters work, the algorithm
856+
* may backtrack and resume processing at an earlier point of the string without
857+
* warning.
858+
*/
859+
UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized_custom(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options,
860+
utf8proc_custom_func a_custom_func, void *a_custom_data, utf8proc_custom_func b_custom_func, void *b_custom_data
861+
);
862+
790863
#ifdef __cplusplus
791864
}
792865
#endif

0 commit comments

Comments
 (0)