@@ -822,3 +822,180 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8
822
822
UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE );
823
823
return retval ;
824
824
}
825
+
826
+ /**
827
+ * Helper function used by utf8proc_isequal_normalized.
828
+ * Reads and sorts the next sequence of combining characters.
829
+ * If buf is not large enough, calculates minimum length by processing
830
+ * the whole rest of the string instead of just the next combining characters.
831
+ */
832
+ static void utf8proc_decompose_next_chars (utf8proc_processing_state_t * state , const utf8proc_option_t options ,
833
+ utf8proc_custom_func custom_func , void * custom_data
834
+ ) {
835
+ utf8proc_ssize_t buf_needed = 0 , buf_needed_max = 1 ;
836
+ utf8proc_span32_t buf_remaining = state -> buf ;
837
+ int last_boundclass = 0 ;
838
+ state -> error = 0 ;
839
+ while (state -> str .len > 0 ) {
840
+ /* read a char from `state->str` and decompose it to `buf_remaining` */
841
+ utf8proc_int32_t c ;
842
+ utf8proc_ssize_t str_consumed , buf_consumed ;
843
+ str_consumed = utf8proc_iterate (state -> str .ptr , state -> str .len , & c );
844
+ if (str_consumed < 1 ) {
845
+ /* error or end of string */
846
+ state -> error = str_consumed ;
847
+ return ;
848
+ } else if (str_consumed > state -> str .len ) {
849
+ /* string ends mid-way */
850
+ state -> error = UTF8PROC_ERROR_INVALIDUTF8 ;
851
+ return ;
852
+ }
853
+ if (custom_func ) {
854
+ c = custom_func (c , custom_data );
855
+ }
856
+ /* successfully read from `state->str`, now time to decompose */
857
+ if (c < 0x80 ) {
858
+ /* fast path for common ASCII case */
859
+ last_boundclass = 0 ;
860
+ if (state -> error != 0 ) {
861
+ /* just looking for the longest combining sequence, this isn't it */
862
+ continue ;
863
+ }
864
+ if (buf_remaining .len_available < 1 ) {
865
+ /* not enough space */
866
+ buf_remaining .len_available = 0 ;
867
+ state -> buf .ptr = buf_remaining .ptr = NULL ;
868
+ state -> error = UTF8PROC_ERROR_NOMEM ;
869
+ /* now just looking for the longest combining sequence, this isn't it */
870
+ continue ;
871
+ }
872
+ /* success */
873
+ buf_consumed = buf_needed = 1 ;
874
+ if ((options & UTF8PROC_CASEFOLD ) && 0x41 <= c && c <= 0x5A ) {
875
+ * buf_remaining .ptr = c + 0x20 ;
876
+ } else {
877
+ * buf_remaining .ptr = c ;
878
+ }
879
+ state -> str .ptr += str_consumed ;
880
+ state -> str .len -= str_consumed ;
881
+ buf_remaining .ptr += 1 ;
882
+ buf_remaining .len_available -= 1 ;
883
+ /* ASCII characters are all zero combining class */
884
+ break ;
885
+ } else {
886
+ buf_consumed = utf8proc_decompose_char (c , buf_remaining .ptr , buf_remaining .len_available , options , & last_boundclass );
887
+ if (buf_consumed < 0 ) {
888
+ /* error */
889
+ state -> error = buf_consumed ;
890
+ return ;
891
+ }
892
+ buf_needed += buf_consumed ;
893
+ if (state -> error == 0 && buf_consumed > buf_remaining .len_available ) {
894
+ /* not enough space */
895
+ buf_remaining .len_available = 0 ;
896
+ state -> buf .ptr = buf_remaining .ptr = NULL ;
897
+ state -> error = UTF8PROC_ERROR_NOMEM ;
898
+ }
899
+ }
900
+ /* success */
901
+ state -> str .ptr += str_consumed ;
902
+ state -> str .len -= str_consumed ;
903
+ if (buf_needed == 0 ) {
904
+ /* ignorable sequence - skip and try next */
905
+ continue ;
906
+ }
907
+ if (state -> error == 0 ) {
908
+ buf_remaining .ptr += buf_consumed ;
909
+ buf_remaining .len_available -= buf_consumed ;
910
+ }
911
+ /* decomposed chars must be sorted in ascending order of combining class,
912
+ which means we need to keep fetching chars until we get to non-combining */
913
+ if (buf_consumed == 0 || state -> str .len <= 0 || unsafe_get_property (c )-> combining_class == 0 ) {
914
+ /* done decomposing this sequence */
915
+ if (state -> error == 0 ) {
916
+ /* time to finish up and optionally sort it */
917
+ break ;
918
+ }
919
+ /* else we're trying to find the longest decomposed sequence */
920
+ if (buf_needed > buf_needed_max ) {
921
+ buf_needed_max = buf_needed ;
922
+ }
923
+ /* reset for next sequence */
924
+ buf_needed = 0 ;
925
+ }
926
+ }
927
+ if (state -> buf .ptr == NULL ) {
928
+ state -> buf .len_used = buf_needed_max ;
929
+ } else {
930
+ state -> buf .len_used = buf_needed ;
931
+ }
932
+ if (buf_needed > 1 && state -> error == 0 && buf_needed <= state -> buf .len_available ) {
933
+ /* sort by combining class (similar code is in utf8proc_decompose_custom implementation) */
934
+ utf8proc_ssize_t pos = 0 ;
935
+ const utf8proc_ssize_t second_to_last = buf_needed - 1 ;
936
+ while (pos < second_to_last ) {
937
+ utf8proc_int32_t uc1 , uc2 ;
938
+ const utf8proc_property_t * property1 , * property2 ;
939
+ uc1 = state -> buf .ptr [pos ];
940
+ uc2 = state -> buf .ptr [pos + 1 ];
941
+ property1 = unsafe_get_property (uc1 );
942
+ property2 = unsafe_get_property (uc2 );
943
+ if (property1 -> combining_class > property2 -> combining_class &&
944
+ property2 -> combining_class > 0 ) {
945
+ state -> buf .ptr [pos ] = uc2 ;
946
+ state -> buf .ptr [pos + 1 ] = uc1 ;
947
+ if (pos > 0 ) pos -- ; else pos ++ ;
948
+ } else {
949
+ pos ++ ;
950
+ }
951
+ }
952
+ }
953
+ }
954
+
955
+ static utf8proc_string8_view_t utf8proc_purify_strlen (utf8proc_string8_view_t str ) {
956
+ if (str .len < 0 ) {
957
+ if (str .ptr == NULL ) {
958
+ str .len = 0 ;
959
+ }
960
+ else for (str .len = 0 ; str .ptr [str .len ] != '\0' ; ++ str .len ) { }
961
+ }
962
+ return str ;
963
+ }
964
+
965
+ UTF8PROC_DLLEXPORT void utf8proc_isequal_normalized (utf8proc_processing_state_t * a , utf8proc_processing_state_t * b , utf8proc_option_t options ,
966
+ utf8proc_custom_func custom_func , void * custom_data
967
+ ) {
968
+ a -> str = utf8proc_purify_strlen (a -> str );
969
+ b -> str = utf8proc_purify_strlen (b -> str );
970
+ options = (utf8proc_option_t )((options & ~(unsigned int )UTF8PROC_COMPOSE )|UTF8PROC_DECOMPOSE );
971
+ while (1 ) {
972
+ const utf8proc_string8_view_t original_a = a -> str ;
973
+ const utf8proc_string8_view_t original_b = b -> str ;
974
+ if (a -> str .len == 0 || b -> str .len == 0 ) {
975
+ /* end of string */
976
+ return ;
977
+ }
978
+ utf8proc_decompose_next_chars (a , options , custom_func , custom_data );
979
+ utf8proc_decompose_next_chars (b , options , custom_func , custom_data );
980
+ if (a -> error == 0 && b -> error == 0 ) {
981
+ utf8proc_ssize_t pos ;
982
+ /* success - compare the work buffers for equality */
983
+ if (a -> buf .len_used != b -> buf .len_used ) {
984
+ /* mismatch found */
985
+ return ;
986
+ }
987
+ for (pos = 0 ; pos < a -> buf .len_used ; ++ pos ) {
988
+ if (a -> buf .ptr [pos ] != b -> buf .ptr [pos ]) {
989
+ /* mismatch found */
990
+ return ;
991
+ }
992
+ }
993
+ /* equal so far */
994
+ continue ;
995
+ }
996
+ /* error - restore unprocessed strings and exit */
997
+ a -> str = original_a ;
998
+ b -> str = original_b ;
999
+ return ;
1000
+ }
1001
+ }
0 commit comments