@@ -651,19 +651,30 @@ def factorize_(
651
651
652
652
found_groups .append (np .array (expect ))
653
653
else :
654
+ idx , groups = pd .factorize (flat , sort = sort ) # type: ignore[arg-type]
654
655
if expect is not None and reindex :
655
- sorter = np .argsort (expect )
656
- groups = expect [(sorter ,)] if sort else expect
657
- idx = np .searchsorted (expect , flat , sorter = sorter )
658
- mask = ~ np .isin (flat , expect ) | isnull (flat ) | (idx == len (expect ))
659
- if not sort :
660
- # idx is the index in to the sorted array.
661
- # if we didn't want sorting, unsort it back
662
- idx [(idx == len (expect ),)] = - 1
663
- idx = sorter [(idx ,)]
664
- idx [mask ] = - 1
665
- else :
666
- idx , groups = pd .factorize (flat , sort = sort ) # type: ignore[arg-type]
656
+ assert sort
657
+ # https://stackoverflow.com/questions/5036816/numpy-lookup-map-or-point/5036900#5036900
658
+ # sorter = np.argsort(expect)
659
+ # groups = expect[(sorter,)] if sort else expect
660
+ #ii = np.argsort(groups)
661
+ #C = np.digitize(idx, groups[ii]) - 1
662
+ #idx = ii[C]
663
+ # key=np.argsort(groups)
664
+ # idx=key[groups[key].searchsorted(idx)]
665
+ inds = np .searchsorted (expect , groups )
666
+ # print(groups, inds)
667
+ mask = ~ np .isin (groups , expect ) | (inds == len (expect ))
668
+ codes_to_nan_out = np .arange (len (groups ))[mask ]
669
+ print (codes_to_nan_out , groupvar .shape , len (groups ))
670
+ # codes_to_nan_out, groups, groups[codes_to_nan_out]
671
+ # key=np.argsort(expect)
672
+ # key = np.arange(len(expect))
673
+ # idx=key[groups[key].searchsorted(idx)]
674
+ idx = idx [ ]
675
+ idx [np .isin (idx , codes_to_nan_out )] = - 1
676
+ print (np .unique (idx ))
677
+
667
678
668
679
found_groups .append (np .array (groups ))
669
680
factorized .append (idx .reshape (groupvar .shape ))
0 commit comments