@@ -651,19 +651,30 @@ def factorize_(
651651
652652 found_groups .append (np .array (expect ))
653653 else :
654+ idx , groups = pd .factorize (flat , sort = sort ) # type: ignore[arg-type]
654655 if expect is not None and reindex :
655- sorter = np .argsort (expect )
656- groups = expect [(sorter ,)] if sort else expect
657- idx = np .searchsorted (expect , flat , sorter = sorter )
658- mask = ~ np .isin (flat , expect ) | isnull (flat ) | (idx == len (expect ))
659- if not sort :
660- # idx is the index in to the sorted array.
661- # if we didn't want sorting, unsort it back
662- idx [(idx == len (expect ),)] = - 1
663- idx = sorter [(idx ,)]
664- idx [mask ] = - 1
665- else :
666- idx , groups = pd .factorize (flat , sort = sort ) # type: ignore[arg-type]
656+ assert sort
657+ # https://stackoverflow.com/questions/5036816/numpy-lookup-map-or-point/5036900#5036900
658+ # sorter = np.argsort(expect)
659+ # groups = expect[(sorter,)] if sort else expect
660+ #ii = np.argsort(groups)
661+ #C = np.digitize(idx, groups[ii]) - 1
662+ #idx = ii[C]
663+ # key=np.argsort(groups)
664+ # idx=key[groups[key].searchsorted(idx)]
665+ inds = np .searchsorted (expect , groups )
666+ # print(groups, inds)
667+ mask = ~ np .isin (groups , expect ) | (inds == len (expect ))
668+ codes_to_nan_out = np .arange (len (groups ))[mask ]
669+ print (codes_to_nan_out , groupvar .shape , len (groups ))
670+ # codes_to_nan_out, groups, groups[codes_to_nan_out]
671+ # key=np.argsort(expect)
672+ # key = np.arange(len(expect))
673+ # idx=key[groups[key].searchsorted(idx)]
674+ idx = idx [ ]
675+ idx [np .isin (idx , codes_to_nan_out )] = - 1
676+ print (np .unique (idx ))
677+
667678
668679 found_groups .append (np .array (groups ))
669680 factorized .append (idx .reshape (groupvar .shape ))
0 commit comments