diff --git a/skfeaturellm/feature_engineer.py b/skfeaturellm/feature_engineer.py index d9d0259..07016d3 100644 --- a/skfeaturellm/feature_engineer.py +++ b/skfeaturellm/feature_engineer.py @@ -15,7 +15,7 @@ from skfeaturellm.schemas import FeatureEngineeringIdea from skfeaturellm.transformations import TransformationPipeline from skfeaturellm.types import ProblemType -from skfeaturellm.utils.validation import check_is_fitted +from skfeaturellm.utils.validation import check_is_fitted, validate_data class LLMFeatureEngineer( @@ -53,6 +53,14 @@ def __init__( verbose: int = 0, **kwargs, ): + if max_features is not None and ( + not isinstance(max_features, int) or max_features < 1 + ): + raise ValueError( + f"max_features must be a positive integer or None, got {max_features!r}" + ) + if not isinstance(verbose, int) or verbose < 0: + raise ValueError(f"verbose must be a non-negative integer, got {verbose!r}") self.problem_type = ProblemType(problem_type) self.model_name = model_name self.target_col = target_col @@ -87,18 +95,19 @@ def fit( self : LLMFeatureEngineer The fitted transformer """ + validate_data(X, y, estimator_name=self.__class__.__name__) + + self.n_features_in_ = X.shape[1] + self.feature_names_in_ = list(X.columns) + if feature_descriptions is None: - # Extract feature descriptions from DataFrame feature_descriptions = [ {"name": col, "type": str(X[col].dtype), "description": ""} for col in X.columns ] - dataset_statistics = prompt_utils.format_dataset_statistics( X, y, self.problem_type ) - - # Generate feature engineering ideas self.generated_features_ideas_ = ( self.llm_interface.generate_engineered_features( feature_descriptions=feature_descriptions, @@ -108,7 +117,6 @@ def fit( dataset_statistics=dataset_statistics, ).ideas ) - return self def transform(self, X: pd.DataFrame) -> pd.DataFrame: @@ -125,20 +133,20 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: pd.DataFrame Input dataframe with the generated features """ - # if fit has not 
been called, raise an error check_is_fitted(self) - - # Convert LLM output to executor config and apply prefix to feature names + if not hasattr(self, "feature_names_in_"): + self.feature_names_in_ = list(X.columns) + validate_data(X, estimator_name=self.__class__.__name__) + missing_cols = set(self.feature_names_in_) - set(X.columns) + if missing_cols: + raise ValueError( + f"X is missing columns that were present during fit: {sorted(missing_cols)}" + ) executor_config = self._build_executor_config(self.generated_features_ideas_) - - # Create executor with raise_on_error=False to skip failed transformations executor = TransformationPipeline.from_dict( executor_config, raise_on_error=False ) - - # Execute transformations result_df = executor.fit(X).transform(X) - return result_df def to_transformer( @@ -229,12 +237,28 @@ def fit_selective( # pylint: disable=too-many-arguments The fitted transformer. Call ``transform()`` to apply the selected features and ``to_transformer()`` to export them for production. 
""" + validate_data(X, y, estimator_name=self.__class__.__name__) + if not isinstance(n_rounds, int) or n_rounds < 1: + raise ValueError(f"n_rounds must be a positive integer, got {n_rounds!r}") + if eval_set is not None: + if ( + not isinstance(eval_set, tuple) + or len(eval_set) != 2 + or not isinstance(eval_set[0], pd.DataFrame) + or not isinstance(eval_set[1], pd.Series) + ): + raise ValueError( + "eval_set must be a tuple of (pd.DataFrame, pd.Series), " + f"got {type(eval_set)!r}" + ) + self.n_features_in_ = X.shape[1] + self.feature_names_in_ = list(X.columns) + if feature_descriptions is None: feature_descriptions = [ {"name": col, "type": str(X[col].dtype), "description": ""} for col in X.columns ] - dataset_statistics = prompt_utils.format_dataset_statistics( X, y, self.problem_type ) @@ -448,7 +472,6 @@ def _build_executor_config( transformations = [] for idea in ideas: config = idea.to_executor_dict() - # Apply feature prefix config["feature_name"] = f"{self.feature_prefix}{config['feature_name']}" transformations.append(config) @@ -480,14 +503,21 @@ def evaluate_features( check_is_fitted(self) feature_evaluator = FeatureEvaluator(self.problem_type) - X_transformed = self.transform(X) if not is_transformed else X - generated_features_names = [ f"{self.feature_prefix}{idea.feature_name}" for idea in self.generated_features_ideas_ ] - + if is_transformed: + missing = [ + col + for col in generated_features_names + if col not in X_transformed.columns + ] + if missing: + raise ValueError( + f"Expected generated feature columns not found in X: {missing}" + ) return feature_evaluator.evaluate( X_transformed, y, features=generated_features_names ) diff --git a/skfeaturellm/utils/validation.py b/skfeaturellm/utils/validation.py index c4ed329..6080138 100644 --- a/skfeaturellm/utils/validation.py +++ b/skfeaturellm/utils/validation.py @@ -1,5 +1,7 @@ from inspect import isclass +import pandas as pd + from skfeaturellm.exceptions import NotFittedError @@ -24,3 
+26,48 @@ def _is_fitted(estimator): v for v in vars(estimator) if v.endswith("_") and not v.startswith("__") ] return len(fitted_attrs) > 0 + + +def validate_data( + X, + y=None, + *, + estimator_name: str = "estimator", +) -> None: + """Validate input data X and optional target y. + + Parameters + ---------- + X : object + Input features to validate. + y : object, optional + Target variable to validate. + estimator_name : str + Name of the estimator, used in error messages. + + Raises + ------ + ValueError + If X is not a pandas DataFrame or is empty, if y is provided + but is not a pandas Series, or if X and y have different + lengths. + """ + + if not isinstance(X, pd.DataFrame): + raise ValueError( + f"[{estimator_name}] X must be a pandas DataFrame, " + f"got {type(X).__name__!r}" + ) + if X.empty: + raise ValueError(f"[{estimator_name}] X must not be empty.") + if y is not None: + if not isinstance(y, pd.Series): + raise ValueError( + f"[{estimator_name}] y must be a pandas Series or None, " + f"got {type(y).__name__!r}" + ) + if len(y) != len(X): + raise ValueError( + f"[{estimator_name}] X and y must have the same length, " + f"got X={len(X)} and y={len(y)}" + ) diff --git a/tests/test_feature_engineer.py b/tests/test_feature_engineer.py index c4d305f..eda764f 100644 --- a/tests/test_feature_engineer.py +++ b/tests/test_feature_engineer.py @@ -693,3 +693,133 @@ def test_to_transformer_filter_by_unprefixed_name(mocker, sample_data_frame): assert len(transformer.transformations) == 1 assert transformer.transformations[0]["feature_name"] == "llm_feat_age_double" + + +# ============================================================================= +# Test: Input validation (new) +# ============================================================================= + + +def test_init_invalid_max_features(mocker): + """__init__() raises ValueError for invalid max_features.""" + mocker.patch("skfeaturellm.llm_interface.init_chat_model") + with pytest.raises(ValueError,
match="max_features must be a positive integer"): + LLMFeatureEngineer(problem_type="classification", max_features=0) + + +def test_init_invalid_verbose(mocker): + """__init__() raises ValueError for negative verbose.""" + mocker.patch("skfeaturellm.llm_interface.init_chat_model") + with pytest.raises(ValueError, match="verbose must be a non-negative integer"): + LLMFeatureEngineer(problem_type="classification", verbose=-1) + + +def test_fit_invalid_X_not_dataframe(mocker): + """fit() raises ValueError when X is not a DataFrame.""" + mocker.patch("skfeaturellm.llm_interface.init_chat_model") + engineer = LLMFeatureEngineer(problem_type="classification") + with pytest.raises(ValueError, match="X must be a pandas DataFrame"): + engineer.fit([[1, 2], [3, 4]]) + + +def test_fit_invalid_X_empty(mocker): + """fit() raises ValueError when X is empty.""" + mocker.patch("skfeaturellm.llm_interface.init_chat_model") + engineer = LLMFeatureEngineer(problem_type="classification") + with pytest.raises(ValueError, match="X must not be empty"): + engineer.fit(pd.DataFrame()) + + +def test_fit_invalid_y_not_series(mocker, sample_data_frame): + """fit() raises ValueError when y is not a Series.""" + mocker.patch("skfeaturellm.llm_interface.init_chat_model") + engineer = LLMFeatureEngineer(problem_type="classification") + with pytest.raises(ValueError, match="y must be a pandas Series"): + engineer.fit(sample_data_frame, y=[0, 1]) + + +def test_fit_invalid_y_length_mismatch(mocker, sample_data_frame): + """fit() raises ValueError when X and y have different lengths.""" + mocker.patch("skfeaturellm.llm_interface.init_chat_model") + engineer = LLMFeatureEngineer(problem_type="classification") + with pytest.raises(ValueError, match="same length"): + engineer.fit(sample_data_frame, y=pd.Series([0, 1, 2])) + + +def test_fit_stores_n_features_in(mocker, sample_data_frame): + """fit() stores n_features_in_ and feature_names_in_ after fitting.""" + mock_ideas = Mock() + mock_ideas.ideas = 
[] + mocker.patch( + "skfeaturellm.llm_interface.LLMInterface.generate_engineered_features", + return_value=mock_ideas, + ) + mocker.patch("skfeaturellm.llm_interface.init_chat_model") + engineer = LLMFeatureEngineer(problem_type="classification") + engineer.fit(sample_data_frame) + assert engineer.n_features_in_ == 3 + assert engineer.feature_names_in_ == ["age", "income", "city"] + + +def test_transform_raises_missing_columns(mocker, sample_data_frame): + """transform() raises ValueError when X is missing fit-time columns.""" + mock_ideas = Mock() + mock_ideas.ideas = [] + mocker.patch( + "skfeaturellm.llm_interface.LLMInterface.generate_engineered_features", + return_value=mock_ideas, + ) + mocker.patch("skfeaturellm.llm_interface.init_chat_model") + engineer = LLMFeatureEngineer(problem_type="classification") + engineer.fit(sample_data_frame) + X_missing = sample_data_frame.drop(columns=["age"]) + with pytest.raises(ValueError, match="missing columns"): + engineer.transform(X_missing) + + +def test_fit_selective_invalid_n_rounds(mocker, numeric_data_frame): + """fit_selective() raises ValueError for n_rounds < 1.""" + from sklearn.feature_selection import SelectKBest, f_classif + + mocker.patch("skfeaturellm.llm_interface.init_chat_model") + engineer = LLMFeatureEngineer(problem_type="classification") + y = pd.Series([0, 1]) + with pytest.raises(ValueError, match="n_rounds must be a positive integer"): + engineer.fit_selective( + numeric_data_frame, y, SelectKBest(f_classif, k=1), n_rounds=0 + ) + + +def test_fit_selective_invalid_eval_set(mocker, numeric_data_frame): + """fit_selective() raises ValueError for malformed eval_set.""" + from sklearn.feature_selection import SelectKBest, f_classif + + mocker.patch("skfeaturellm.llm_interface.init_chat_model") + engineer = LLMFeatureEngineer(problem_type="classification") + y = pd.Series([0, 1]) + with pytest.raises(ValueError, match="eval_set must be a tuple"): + engineer.fit_selective( + numeric_data_frame, y, 
SelectKBest(f_classif, k=1), eval_set="bad" + ) + + +def test_evaluate_features_missing_columns_raises(mocker, sample_data_frame): + """evaluate_features(is_transformed=True) raises ValueError for missing generated columns.""" + mocker.patch("skfeaturellm.llm_interface.init_chat_model") + engineer = LLMFeatureEngineer( + problem_type="classification", feature_prefix="llm_feat_" + ) + engineer.generated_features_ideas_ = [ + FeatureEngineeringIdea( + type="mul", + feature_name="age_double", + columns=["age"], + parameters={"constant": 2.0}, + description="Double the age", + ) + ] + y = pd.Series([0, 1]) + with pytest.raises( + ValueError, match="Expected generated feature columns not found" + ): + engineer.evaluate_features(sample_data_frame, y, is_transformed=True)