Skip to content

Commit 506d360

Browse files
committed
[WIP] Add utf8proc_isequal_normalized
1 parent 20d192a commit 506d360

File tree

2 files changed

+397
-0
lines changed

2 files changed

+397
-0
lines changed

utf8proc.c

Lines changed: 324 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -822,3 +822,327 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8
822822
UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE);
823823
return retval;
824824
}
825+
826+
/**
 * Walks the two UTF-8 string views held in `a` and `b` in lockstep, decomposing
 * each code point with utf8proc_decompose_char() and comparing the decomposed
 * code points as if both inputs had been normalized, without building the
 * normalized strings.
 *
 * Nothing is returned; all results are left in the two state structs:
 *  - `str` is advanced past input that has been matched. When a mismatch or an
 *    error is found while a combining sequence is being processed, both views
 *    are rolled back to the start of that sequence before returning.
 *  - `error` is cleared on entry and set to the negative result of
 *    utf8proc_iterate() or utf8proc_decompose_char(), or to
 *    UTF8PROC_ERROR_OVERFLOW from the internal bounds guards.
 *  - `str_at_error` is cleared on entry and receives a copy of `str` as it was
 *    when the error was detected (before any rollback).
 * NOTE(review): the function itself does not report "equal" vs "not equal";
 * presumably the caller inspects the remaining views - confirm against the
 * header / call sites.
 *
 * `options` is adjusted before use: UTF8PROC_COMPOSE, UTF8PROC_NULLTERM and
 * UTF8PROC_CHARBOUND are cleared and UTF8PROC_DECOMPOSE is forced on.
 *
 * `a_custom_func` / `b_custom_func` may be NULL. When set, each one is applied
 * (with its `*_custom_data`) to every code point freshly read from its string,
 * before the ASCII fast path and before decomposition.
 */
UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized_custom(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options,
827+
utf8proc_custom_func a_custom_func, void *a_custom_data, utf8proc_custom_func b_custom_func, void *b_custom_data
828+
) {
829+
/* a non-negative len marks a length-delimited view; these flags are 0 or 1
   and are used below as multipliers so that len is only decremented for
   length-delimited views */
const utf8proc_bool a_len_terminated = (a->str.len >= 0);
830+
const utf8proc_bool b_len_terminated = (b->str.len >= 0);
831+
/* which source string(s) we need to read more from */
832+
utf8proc_bool a_consume = true;
833+
utf8proc_bool b_consume = true;
834+
/* structure to simplify rollback for combining char multipass processing */
835+
const utf8proc_ssize_t decomposed_max_len = 8;
836+
struct {
837+
/* results of utf8proc_iterate */
838+
utf8proc_int32_t codepoint;
839+
utf8proc_ssize_t consumed;
840+
/* results of utf8proc_decompose_char */
841+
utf8proc_int32_t decomposed[8];
842+
utf8proc_ssize_t decomposed_len;
843+
int last_boundclass;
844+
/* combining class tracking state */
845+
utf8proc_ssize_t decomposed_pos;
846+
utf8proc_propval_t combining_class;
847+
} a_decomposing_current = {0},
848+
b_decomposing_current = {0},
849+
a_decomposing_combining_start = {0},
850+
b_decomposing_combining_start = {0},
851+
a_decomposing_combining_end = {0},
852+
b_decomposing_combining_end = {0};
853+
/* combining class tracking state */
854+
utf8proc_ssize_t pos = 0;
855+
utf8proc_bool combining_initialized = false;
856+
utf8proc_propval_t combining_class_current = 0;
857+
utf8proc_propval_t combining_class_next = 0;
858+
utf8proc_string8_view_t a_combining_start = a->str;
859+
utf8proc_string8_view_t b_combining_start = b->str;
860+
utf8proc_string8_view_t a_combining_end = a->str;
861+
utf8proc_string8_view_t b_combining_end = b->str;
862+
/* bitset with one bit per combining class: a bit is set once that class has
   been compared for the current combining sequence, and the whole set is
   cleared whenever a new combining sequence is entered */
utf8proc_uint8_t combining_classes_finished[(UTF8PROC_COMBINING_CLASS_MAX + 1 + CHAR_BIT)/CHAR_BIT] = {0};
863+
const utf8proc_ssize_t combining_classes_finished_len = sizeof(combining_classes_finished)/sizeof(combining_classes_finished[0]);
864+
/* initialize/clear error state */
865+
a->error = 0;
866+
b->error = 0;
867+
a->str_at_error.ptr = NULL;
868+
b->str_at_error.ptr = NULL;
869+
a->str_at_error.len = 0;
870+
b->str_at_error.len = 0;
871+
/* force compatible options:
872+
- must use UTF8PROC_DECOMPOSE, not UTF8PROC_COMPOSE.
873+
- we choose when to add UTF8PROC_NULLTERM on a case-by-case basis (not needed currently).
874+
- can't use UTF8PROC_CHARBOUND because it would break `unsafe_get_property`. */
875+
options = (utf8proc_option_t)((options & ~(unsigned int)(UTF8PROC_COMPOSE|UTF8PROC_NULLTERM|UTF8PROC_CHARBOUND))|UTF8PROC_DECOMPOSE);
876+
/* primary loop: each iteration pulls data from one or both strings */
877+
while (1) {
878+
/* read a code point from each - utf8proc_iterate handles null termination with negative length on its own */
879+
if (a_consume) a_decomposing_current.consumed = utf8proc_iterate(a->str.ptr, a->str.len, &a_decomposing_current.codepoint);
880+
if (b_consume) b_decomposing_current.consumed = utf8proc_iterate(b->str.ptr, b->str.len, &b_decomposing_current.codepoint);
881+
/* check for errors, roll back string views if needed */
882+
if (a_decomposing_current.consumed < 0) {
883+
a->error = a_decomposing_current.consumed;
884+
a->str_at_error = a->str;
885+
}
886+
if (b_decomposing_current.consumed < 0) {
887+
b->error = b_decomposing_current.consumed;
888+
b->str_at_error = b->str;
889+
}
890+
if (a->error || b->error) {
891+
if (combining_initialized) {
892+
a->str = a_combining_start;
893+
b->str = b_combining_start;
894+
}
895+
return;
896+
}
897+
/* if we reach the end of one string, we may still need to process more
898+
of the other due to ignorable sequences, and the combining class code
899+
needs to make a judgement upon reaching the end of a combining sequence.
900+
so from this point forward code must be guarded against this possibility. */
901+
if (!combining_initialized && a_decomposing_current.consumed == 0 && b_decomposing_current.consumed == 0) {
902+
/* true end of both strings, must be equal */
903+
return;
904+
}
905+
/* apply each code point filter */
906+
if (a_custom_func && a_consume && a_decomposing_current.consumed) a_decomposing_current.codepoint = a_custom_func(a_decomposing_current.codepoint, a_custom_data);
907+
if (b_custom_func && b_consume && b_decomposing_current.consumed) b_decomposing_current.codepoint = b_custom_func(b_decomposing_current.codepoint, b_custom_data);
908+
/* ASCII fast path is only suitable if we consumed both at once and not in combining mode */
909+
if (!combining_initialized && a_consume && b_consume && a_decomposing_current.consumed && b_decomposing_current.consumed
910+
&& a_decomposing_current.codepoint < 0x80 && b_decomposing_current.codepoint < 0x80) {
911+
/* fast path for common ASCII case */
912+
if (options & UTF8PROC_CASEFOLD) {
913+
/* map 'A'..'Z' (0x41..0x5A) onto 'a'..'z' by adding 0x20 */
if (0x41 <= a_decomposing_current.codepoint && a_decomposing_current.codepoint <= 0x5A) a_decomposing_current.codepoint += 0x20;
914+
if (0x41 <= b_decomposing_current.codepoint && b_decomposing_current.codepoint <= 0x5A) b_decomposing_current.codepoint += 0x20;
915+
}
916+
if (a_decomposing_current.codepoint != b_decomposing_current.codepoint) {
917+
/* mismatch detected */
918+
return;
919+
}
920+
/* equal so far */
921+
a->str.ptr += a_decomposing_current.consumed;
922+
a->str.len -= a_decomposing_current.consumed * a_len_terminated;
923+
b->str.ptr += b_decomposing_current.consumed;
924+
b->str.len -= b_decomposing_current.consumed * b_len_terminated;
925+
a_consume = true;
926+
b_consume = true;
927+
continue;
928+
}
929+
/* now time to decompose */
930+
#define UTF8PROC_LAMBDA(ab) \
931+
if (ab##_consume && ab##_decomposing_current.consumed) { \
932+
/* we got a code point, decompose it */ \
933+
ab##_decomposing_current.decomposed_len = utf8proc_decompose_char(ab##_decomposing_current.codepoint, \
934+
ab##_decomposing_current.decomposed, decomposed_max_len, options, &ab##_decomposing_current.last_boundclass); \
935+
ab##_decomposing_current.decomposed_pos = 0; \
936+
ab##_consume = false; \
937+
/* check for errors */ \
938+
if (ab##_decomposing_current.decomposed_len < 0) { \
939+
ab->error = ab##_decomposing_current.decomposed_len; \
940+
ab->str_at_error = ab->str; \
941+
} else if (ab##_decomposing_current.decomposed_len > decomposed_max_len) { \
942+
/* should never happen in practice, just for static analysis. */ \
943+
ab->error = UTF8PROC_ERROR_OVERFLOW; \
944+
ab->str_at_error = ab->str; \
945+
} else if (ab##_decomposing_current.decomposed_len == 0) { \
946+
/* ignorable sequence, need to consume more */ \
947+
ab->str.ptr += ab##_decomposing_current.consumed; \
948+
ab->str.len -= ab##_decomposing_current.consumed * ab##_len_terminated; \
949+
ab##_consume = true; \
950+
} \
951+
} else { \
952+
ab##_consume = false; \
953+
}
954+
/* run the above for both strings */
955+
UTF8PROC_LAMBDA(a);
956+
UTF8PROC_LAMBDA(b);
957+
#undef UTF8PROC_LAMBDA
958+
/* check for errors, roll back string views if needed */
959+
if (a->error || b->error) {
960+
if (combining_initialized) {
961+
a->str = a_combining_start;
962+
b->str = b_combining_start;
963+
}
964+
return;
965+
}
966+
/* check for ignorable sequences */
967+
if (a_consume || b_consume) {
968+
continue;
969+
}
970+
/* now that ignorable sequences have been handled, check for end of either string */
971+
if (!combining_initialized && (a_decomposing_current.consumed == 0 || b_decomposing_current.consumed == 0)) {
972+
/* one or both strings ended, either equal or inequal */
973+
return;
974+
}
975+
/* at this point both decomposed buffers need to be compared. when the
976+
strings are fully normalized, the decomposed chars are sorted in
977+
order of combining class, which could mean having to sort the entire
978+
decomposed string in the worst case. since we only need to compare
979+
them as-if they are normalized, we can just go one combining class
980+
at a time. we have to be careful around ends of strings to make
981+
sure the string views are properly updated to NOT FURTHER THAN the
982+
first difference in the strings, which may be a large combining seq.
983+
*/
984+
while (1) {
985+
/* do we need to decompose more? */
986+
if (a_decomposing_current.consumed && a_decomposing_current.decomposed_pos >= a_decomposing_current.decomposed_len) {
987+
a_consume = true;
988+
a->str.ptr += a_decomposing_current.consumed;
989+
a->str.len -= a_decomposing_current.consumed * a_len_terminated;
990+
}
991+
if (b_decomposing_current.consumed && b_decomposing_current.decomposed_pos >= b_decomposing_current.decomposed_len) {
992+
b_consume = true;
993+
b->str.ptr += b_decomposing_current.consumed;
994+
b->str.len -= b_decomposing_current.consumed * b_len_terminated;
995+
}
996+
if (a_consume || b_consume) {
997+
/* use outer loop to pull more data */
998+
break;
999+
}
1000+
/* get the combining class of each current code point, or 0 for end of string */
1001+
if (a_decomposing_current.consumed) {
1002+
a_decomposing_current.combining_class = unsafe_get_property(a_decomposing_current.decomposed[a_decomposing_current.decomposed_pos])->combining_class;
1003+
} else {
1004+
a_decomposing_current.combining_class = 0;
1005+
}
1006+
if (b_decomposing_current.consumed) {
1007+
b_decomposing_current.combining_class = unsafe_get_property(b_decomposing_current.decomposed[b_decomposing_current.decomposed_pos])->combining_class;
1008+
} else {
1009+
b_decomposing_current.combining_class = 0;
1010+
}
1011+
/* static analysis guards, always false in practice */
1012+
if (a_decomposing_current.combining_class/CHAR_BIT >= combining_classes_finished_len) {
1013+
a->error = UTF8PROC_ERROR_OVERFLOW;
1014+
a->str_at_error = a->str;
1015+
}
1016+
if (b_decomposing_current.combining_class/CHAR_BIT >= combining_classes_finished_len) {
1017+
b->error = UTF8PROC_ERROR_OVERFLOW;
1018+
b->str_at_error = b->str;
1019+
}
1020+
if (a->error || b->error) {
1021+
if (combining_initialized) {
1022+
a->str = a_combining_start;
1023+
b->str = b_combining_start;
1024+
}
1025+
return;
1026+
}
1027+
/* do either have a combining class of 0 (non-combining)? */
1028+
if (a_decomposing_current.combining_class == 0 || b_decomposing_current.combining_class == 0) {
1029+
if (combining_initialized) {
1030+
/* we've reached the end of the combining sequence */
1031+
if (a_decomposing_current.combining_class == 0) {
1032+
a_combining_end = a->str;
1033+
a_decomposing_combining_end = a_decomposing_current;
1034+
}
1035+
if (b_decomposing_current.combining_class == 0) {
1036+
b_combining_end = b->str;
1037+
b_decomposing_combining_end = b_decomposing_current;
1038+
}
1039+
if (combining_class_next != 0) {
1040+
/* prepare for the next pass: mark the class just compared as finished */
1041+
utf8proc_uint8_t *elem = &(combining_classes_finished[combining_class_current/CHAR_BIT]);
1042+
const utf8proc_uint8_t mask = (utf8proc_uint8_t)(1 << (combining_class_current % CHAR_BIT));
1043+
*elem |= mask;
1044+
combining_class_current = combining_class_next;
1045+
combining_class_next = 0;
1046+
/* roll back for next pass */
1047+
a->str = a_combining_start;
1048+
b->str = b_combining_start;
1049+
a_decomposing_current = a_decomposing_combining_start;
1050+
b_decomposing_current = b_decomposing_combining_start;
1051+
continue;
1052+
}
1053+
/* else exit combining mode */
/* note: && binds tighter than ||, so the test below reads
   (a end == a start) || (b end == b start), where each side compares both
   the view pointer and the position inside the decomposed buffer */
1054+
if (a_combining_end.ptr == a_combining_start.ptr && a_decomposing_combining_end.decomposed_pos == a_decomposing_combining_start.decomposed_pos
1055+
|| b_combining_end.ptr == b_combining_start.ptr && b_decomposing_combining_end.decomposed_pos == b_decomposing_combining_start.decomposed_pos) {
1056+
/* didn't reach the end of one of the sequences yet - mismatch detected */
1057+
a->str = a_combining_start;
1058+
b->str = b_combining_start;
1059+
return;
1060+
}
1061+
/* roll forward to the ends of the combining sequence */
1062+
a->str = a_combining_end;
1063+
b->str = b_combining_end;
1064+
a_decomposing_current = a_decomposing_combining_end;
1065+
b_decomposing_current = b_decomposing_combining_end;
1066+
/* resume normal processing in outer loop */
1067+
combining_initialized = false;
1068+
break;
1069+
}
1070+
/* else not in combining mode and at least one is non-combining */
1071+
if (a_decomposing_current.combining_class != b_decomposing_current.combining_class) {
1072+
/* mismatch detected */
1073+
return;
1074+
}
1075+
/* both are non-combining, compare the decomposed buffers */
1076+
if (a_decomposing_current.decomposed[a_decomposing_current.decomposed_pos] != b_decomposing_current.decomposed[b_decomposing_current.decomposed_pos]) {
1077+
/* mismatch detected */
1078+
return;
1079+
}
1080+
/* equal so far */
1081+
++a_decomposing_current.decomposed_pos;
1082+
++b_decomposing_current.decomposed_pos;
1083+
continue;
1084+
}
1085+
/* both nonzero combining class, initialize combining mode:
1086+
we go one combining class at a time, comparing the decomposed chars
1087+
of that class in order while consuming more from the input strings
1088+
as needed and noting the next class until we reach a non-combining
1089+
char. then, if there's another combining class, we roll back and
1090+
start from the beginning of the sequence again. */
1091+
if (!combining_initialized) {
1092+
/* the first pass targets a's class; b's class is queued as the next pass if it differs */
combining_class_current = a_decomposing_current.combining_class;
1093+
combining_class_next = ((a_decomposing_current.combining_class == b_decomposing_current.combining_class)? 0 : b_decomposing_current.combining_class);
1094+
a_combining_start = a->str;
1095+
b_combining_start = b->str;
1096+
a_combining_end = a->str;
1097+
b_combining_end = b->str;
1098+
a_decomposing_combining_end = a_decomposing_combining_start = a_decomposing_current;
1099+
b_decomposing_combining_end = b_decomposing_combining_start = b_decomposing_current;
1100+
for (pos = 0; pos < combining_classes_finished_len; ++pos) {
1101+
combining_classes_finished[pos] = 0;
1102+
}
1103+
combining_initialized = true;
1104+
}
1105+
/* pull more data from one or both until we get both to be current class */
1106+
if (a_decomposing_current.combining_class != combining_class_current) {
1107+
/* is this an unseen class we can target next? */
1108+
if (combining_class_next == 0) {
1109+
const utf8proc_uint8_t elem = combining_classes_finished[a_decomposing_current.combining_class/CHAR_BIT];
1110+
const utf8proc_uint8_t mask = (utf8proc_uint8_t)(1 << (a_decomposing_current.combining_class % CHAR_BIT));
1111+
if ((elem & mask) == 0) {
1112+
combining_class_next = a_decomposing_current.combining_class;
1113+
}
1114+
}
1115+
/* skip this code point for the current pass */
++a_decomposing_current.decomposed_pos;
1116+
}
1117+
if (b_decomposing_current.combining_class != combining_class_current) {
1118+
/* is this an unseen class we can target next? */
1119+
if (combining_class_next == 0) {
1120+
const utf8proc_uint8_t elem = combining_classes_finished[b_decomposing_current.combining_class/CHAR_BIT];
1121+
const utf8proc_uint8_t mask = (utf8proc_uint8_t)(1 << (b_decomposing_current.combining_class % CHAR_BIT));
1122+
if ((elem & mask) == 0) {
1123+
combining_class_next = b_decomposing_current.combining_class;
1124+
}
1125+
}
1126+
/* skip this code point for the current pass */
++b_decomposing_current.decomposed_pos;
1127+
}
1128+
if (a_decomposing_current.combining_class != combining_class_current || b_decomposing_current.combining_class != combining_class_current) {
1129+
continue;
1130+
}
1131+
/* both are the current combining class, compare the decomposed buffers */
1132+
if (a_decomposing_current.decomposed[a_decomposing_current.decomposed_pos] != b_decomposing_current.decomposed[b_decomposing_current.decomposed_pos]) {
1133+
/* mismatch detected, roll back string views and exit */
1134+
a->str = a_combining_start;
1135+
b->str = b_combining_start;
1136+
return;
1137+
}
1138+
/* equal so far */
1139+
++a_decomposing_current.decomposed_pos;
1140+
++b_decomposing_current.decomposed_pos;
1141+
continue;
1142+
}
1143+
}
1144+
}
1145+
1146+
/**
 * Convenience form of utf8proc_isequal_normalized_custom(): forwards `a`, `b`
 * and `options` unchanged and passes NULL for both custom mapping functions
 * and both custom data pointers. Like the function it calls, it returns
 * nothing.
 */
UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized(utf8proc_processing_state_t *a, utf8proc_processing_state_t *b, utf8proc_option_t options) {
1147+
utf8proc_isequal_normalized_custom(a, b, options, NULL, NULL, NULL, NULL);
1148+
}

0 commit comments

Comments
 (0)