@@ -201,7 +201,7 @@ def resolve_feature_engine(
201201 feature_engine : FeatureEngine ,
202202) -> FeatureEngineConcrete : # noqa
203203
204- if feature_engine in ["none" , "pandas" , DIRTY_CAT , "torch" , CUDA_CAT ]:
204+ if feature_engine in ["none" , "pandas" , "dirty_cat" , "torch" , "cu_cat" ]:
205205 return feature_engine # type: ignore
206206 if feature_engine == "auto" :
207207 has_dependancy_text_ , _ , _ = lazy_import_has_dependancy_text ()
@@ -967,19 +967,19 @@ def process_dirty_dataframes(
967967 the data encoder, and the label encoder.
968968 """
969969
970- if feature_engine == CUDA_CAT :
970+ if feature_engine == "cu_cat" :
971971 assert_imported_cucat ()
972- from cu_cat import SuperVectorizer , GapEncoder # , SimilarityEncoder
972+ from cu_cat import SuperVectorizer , GapEncoder
973973 from cuml .preprocessing import FunctionTransformer
974-
975- else : # if feature_engine == "dirty_cat": # DIRTY_CAT
976- from dirty_cat import SuperVectorizer , GapEncoder # , SimilarityEncoder
974+
975+ else :
976+ from dirty_cat import SuperVectorizer , GapEncoder
977977 from sklearn .preprocessing import FunctionTransformer
978978
979979 t = time ()
980980
981981 if not is_dataframe_all_numeric (ndf ):
982- if feature_engine == CUDA_CAT :
982+ if feature_engine == "cu_cat" :
983983 data_encoder = SuperVectorizer (
984984 auto_cast = True ,
985985 cardinality_threshold = cardinality_threshold_target ,
@@ -1010,9 +1010,9 @@ def process_dirty_dataframes(
10101010 features_transformed = data_encoder .get_feature_names_out ()
10111011
10121012 all_transformers = data_encoder .transformers
1013- if feature_engine == CUDA_CAT :
1013+ if feature_engine == "cu_cat" :
10141014 logger .info (f"-Shape of [[cu_cat fit]] data { X_enc .shape } " )
1015- elif feature_engine == DIRTY_CAT :
1015+ else :
10161016 logger .info (f"-Shape of [[dirty_cat fit]] data { X_enc .shape } " )
10171017 logger .debug (f"-Transformers: \n { all_transformers } \n " )
10181018 logger .debug (
@@ -1058,7 +1058,7 @@ def process_dirty_dataframes(
10581058 t2 = time ()
10591059 logger .debug ("-Fitting Targets --\n %s" , y .columns )
10601060
1061- if feature_engine == CUDA_CAT :
1061+ if feature_engine == "cu_cat" :
10621062 label_encoder = SuperVectorizer (
10631063 auto_cast = True ,
10641064 cardinality_threshold = cardinality_threshold_target ,
@@ -1486,10 +1486,17 @@ def process_edge_dataframes(
14861486 other_df , y
14871487 )
14881488 # add the two datasets together
1489- if feature_engine == 'pandas' :
1490- X_enc = pd .concat ([T , X_enc ], axis = 1 )
1491- elif feature_engine == 'cudf' :
1489+ has_dependancy_cudf_ , import_exn , cudf = lazy_import_has_dependancy_cudf ()
1490+ T_type = str (getmodule (T ))
1491+ X_type = str (getmodule (X_enc ))
1492+ if 'cudf' in T_type and 'cudf' in X_type :
14921493 X_enc = cudf .concat ([T , X_enc ], axis = 1 )
1494+ elif 'pd' in T_type and 'pd' in X_type :
1495+ X_enc = pd .concat ([T , X_enc ], axis = 1 )
1496+ elif 'cudf' in T_type and 'pd' in X_type :
1497+ X_enc = cudf .concat ([cudf .from_pandas (T ), X_enc ], axis = 1 )
1498+ elif 'pd' in T_type and 'cudf' in X_type :
1499+ X_enc = cudf .concat ([T , cudf .from_pandas (X_enc )], axis = 1 )
14931500 # then scale them
14941501 X_encs , y_encs , scaling_pipeline , scaling_pipeline_target = smart_scaler ( # noqa
14951502 X_enc ,
@@ -1556,21 +1563,17 @@ def process_edge_dataframes(
15561563 if not X_enc .empty and not T .empty :
15571564 logger .debug ("-" * 60 )
15581565 logger .debug ("<= Found Edges and Dirty_cat encoding =>" )
1566+ has_dependancy_cudf_ , import_exn , cudf = lazy_import_has_dependancy_cudf ()
15591567 T_type = str (getmodule (T ))
15601568 X_type = str (getmodule (X_enc ))
15611569 if 'cudf' in T_type and 'cudf' in X_type :
15621570 X_enc = cudf .concat ([T , X_enc ], axis = 1 )
15631571 elif 'pd' in T_type and 'pd' in X_type :
15641572 X_enc = pd .concat ([T , X_enc ], axis = 1 )
1565- else :
1566- try :
1567- X_enc = cudf .concat ([cudf .from_pandas (T ), X_enc ], axis = 1 )
1568- except :
1569- pass
1570- try :
1571- X_enc = cudf .concat ([T , cudf .from_pandas (X_enc )], axis = 1 )
1572- except :
1573- pass
1573+ elif 'cudf' in T_type and 'pd' in X_type :
1574+ X_enc = cudf .concat ([cudf .from_pandas (T ), X_enc ], axis = 1 )
1575+ elif 'pd' in T_type and 'cudf' in X_type :
1576+ X_enc = cudf .concat ([T , cudf .from_pandas (X_enc )], axis = 1 )
15741577 elif not T .empty and X_enc .empty :
15751578 logger .debug ("-" * 60 )
15761579 logger .debug ("<= Found only Edges =>" )
@@ -1750,7 +1753,18 @@ def transform(
17501753
17511754 # concat text to dirty_cat, with text in front.
17521755 if not tX .empty and not X .empty :
1753- X = pd .concat ([tX , X ], axis = 1 )
1756+ has_dependancy_cudf_ , import_exn , cudf = lazy_import_has_dependancy_cudf ()
1757+ T_type = str (getmodule (tX ))
1758+ X_type = str (getmodule (X ))
1759+ if 'cudf' in T_type and 'cudf' in X_type :
1760+ X = cudf .concat ([tX , X ], axis = 1 )
1761+ elif 'pd' in T_type and 'pd' in X_type :
1762+ X = pd .concat ([tX , X ], axis = 1 )
1763+ elif 'cudf' in T_type and 'pd' in X_type :
1764+ X = cudf .concat ([cudf .from_pandas (tX ), X ], axis = 1 )
1765+ elif 'pd' in T_type and 'cudf' in X_type :
1766+ X = cudf .concat ([tX , cudf .from_pandas (X )], axis = 1 )
1767+ # X = pd.concat([tX, X], axis=1)
17541768 logger .info ("--Combining both Textual and Numeric/Dirty_Cat" )
17551769 elif not tX .empty and X .empty :
17561770 X = tX # textual
@@ -1765,7 +1779,18 @@ def transform(
17651779
17661780 # now if edges, add T at front
17671781 if kind == "edges" :
1768- X = pd .concat ([T , X ], axis = 1 ) # edges, text, dirty_cat
1782+ # X = pd.concat([T, X], axis=1) # edges, text, dirty_cat
1783+ has_dependancy_cudf_ , import_exn , cudf = lazy_import_has_dependancy_cudf ()
1784+ T_type = str (getmodule (T ))
1785+ X_type = str (getmodule (X ))
1786+ if 'cudf' in T_type and 'cudf' in X_type :
1787+ X = cudf .concat ([T , X ], axis = 1 )
1788+ elif 'pd' in T_type and 'pd' in X_type :
1789+ X = pd .concat ([T , X ], axis = 1 )
1790+ elif 'cudf' in T_type and 'pd' in X_type :
1791+ X = cudf .concat ([cudf .from_pandas (T ), X ], axis = 1 )
1792+ elif 'pd' in T_type and 'cudf' in X_type :
1793+ X = cudf .concat ([T , cudf .from_pandas (X )], axis = 1 )
17691794 logger .info ("-Combining MultiLabelBinarizer with previous features" )
17701795
17711796 logger .info ("-" * 40 )
@@ -2656,10 +2681,11 @@ def featurize(
26562681 """
26572682 feature_engine = resolve_feature_engine (feature_engine )
26582683
2659- if feature_engine == 'dirty_cat' :
2660- assert_imported_min ()
2661- elif feature_engine == 'cu_cat' :
2684+
2685+ if feature_engine == "cu_cat" :
26622686 assert_imported_cucat ()
2687+ else :
2688+ assert_imported_min ()
26632689
26642690 if inplace :
26652691 res = self
0 commit comments