Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 49 additions & 19 deletions skfeaturellm/feature_engineer.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from skfeaturellm.schemas import FeatureEngineeringIdea
from skfeaturellm.transformations import TransformationPipeline
from skfeaturellm.types import ProblemType
from skfeaturellm.utils.validation import check_is_fitted
from skfeaturellm.utils.validation import check_is_fitted, validate_data


class LLMFeatureEngineer(
Expand Down Expand Up @@ -53,6 +53,14 @@ def __init__(
verbose: int = 0,
**kwargs,
):
if max_features is not None and (
not isinstance(max_features, int) or max_features < 1
):
raise ValueError(
f"max_features must be a positive integer or None, got {max_features!r}"
)
if not isinstance(verbose, int) or verbose < 0:
raise ValueError(f"verbose must be a non-negative integer, got {verbose!r}")
self.problem_type = ProblemType(problem_type)
self.model_name = model_name
self.target_col = target_col
Expand Down Expand Up @@ -87,18 +95,19 @@ def fit(
self : LLMFeatureEngineer
The fitted transformer
"""
validate_data(X, y, estimator_name=self.__class__.__name__)

self.n_features_in_ = X.shape[1]
self.feature_names_in_ = list(X.columns)

if feature_descriptions is None:
# Extract feature descriptions from DataFrame
feature_descriptions = [
{"name": col, "type": str(X[col].dtype), "description": ""}
for col in X.columns
]

dataset_statistics = prompt_utils.format_dataset_statistics(
X, y, self.problem_type
)

# Generate feature engineering ideas
self.generated_features_ideas_ = (
self.llm_interface.generate_engineered_features(
feature_descriptions=feature_descriptions,
Expand All @@ -108,7 +117,6 @@ def fit(
dataset_statistics=dataset_statistics,
).ideas
)

return self

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -125,20 +133,20 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
pd.DataFrame
Input dataframe with the generated features
"""
# if fit has not been called, raise an error
check_is_fitted(self)

# Convert LLM output to executor config and apply prefix to feature names
if not hasattr(self, "feature_names_in_"):
self.feature_names_in_ = list(X.columns)
validate_data(X, estimator_name=self.__class__.__name__)
missing_cols = set(self.feature_names_in_) - set(X.columns)
if missing_cols:
raise ValueError(
f"X is missing columns that were present during fit: {sorted(missing_cols)}"
)
executor_config = self._build_executor_config(self.generated_features_ideas_)

# Create executor with raise_on_error=False to skip failed transformations
executor = TransformationPipeline.from_dict(
executor_config, raise_on_error=False
)

# Execute transformations
result_df = executor.fit(X).transform(X)

return result_df

def to_transformer(
Expand Down Expand Up @@ -229,12 +237,28 @@ def fit_selective( # pylint: disable=too-many-arguments
The fitted transformer. Call ``transform()`` to apply the selected
features and ``to_transformer()`` to export them for production.
"""
validate_data(X, y, estimator_name=self.__class__.__name__)
if not isinstance(n_rounds, int) or n_rounds < 1:
raise ValueError(f"n_rounds must be a positive integer, got {n_rounds!r}")
if eval_set is not None:
if (
not isinstance(eval_set, tuple)
or len(eval_set) != 2
or not isinstance(eval_set[0], pd.DataFrame)
or not isinstance(eval_set[1], pd.Series)
):
raise ValueError(
"eval_set must be a tuple of (pd.DataFrame, pd.Series), "
f"got {type(eval_set)!r}"
)
self.n_features_in_ = X.shape[1]
self.feature_names_in_ = list(X.columns)

if feature_descriptions is None:
feature_descriptions = [
{"name": col, "type": str(X[col].dtype), "description": ""}
for col in X.columns
]

dataset_statistics = prompt_utils.format_dataset_statistics(
X, y, self.problem_type
)
Expand Down Expand Up @@ -448,7 +472,6 @@ def _build_executor_config(
transformations = []
for idea in ideas:
config = idea.to_executor_dict()
# Apply feature prefix
config["feature_name"] = f"{self.feature_prefix}{config['feature_name']}"
transformations.append(config)

Expand Down Expand Up @@ -480,14 +503,21 @@ def evaluate_features(
check_is_fitted(self)

feature_evaluator = FeatureEvaluator(self.problem_type)

X_transformed = self.transform(X) if not is_transformed else X

generated_features_names = [
f"{self.feature_prefix}{idea.feature_name}"
for idea in self.generated_features_ideas_
]

if is_transformed:
missing = [
col
for col in generated_features_names
if col not in X_transformed.columns
]
if missing:
raise ValueError(
f"Expected generated feature columns not found in X: {missing}"
)
return feature_evaluator.evaluate(
X_transformed, y, features=generated_features_names
)
47 changes: 47 additions & 0 deletions skfeaturellm/utils/validation.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from inspect import isclass

import pandas as pd

from skfeaturellm.exceptions import NotFittedError


Expand All @@ -24,3 +26,48 @@ def _is_fitted(estimator):
v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")
]
return len(fitted_attrs) > 0


def validate_data(
    X,
    y=None,
    *,
    estimator_name: str = "estimator",
) -> None:
    """Validate input data X and optional target y.

    Parameters
    ----------
    X : object
        Input features to validate. Must be a non-empty pandas DataFrame.
    y : object, optional
        Target variable to validate. Must be a pandas Series with the same
        length as ``X``, or None.
    estimator_name : str
        Name of the estimator, used as a prefix in error messages.

    Raises
    ------
    ValueError
        If X is not a pandas DataFrame, if X is empty, if y is provided but
        is not a pandas Series, or if X and y have different lengths.
    """
    if not isinstance(X, pd.DataFrame):
        raise ValueError(
            f"[{estimator_name}] X must be a pandas DataFrame, "
            f"got {type(X).__name__!r}"
        )
    if X.empty:
        raise ValueError(f"[{estimator_name}] X must not be empty.")
    if y is not None:
        if not isinstance(y, pd.Series):
            raise ValueError(
                f"[{estimator_name}] y must be a pandas Series or None, "
                f"got {type(y).__name__!r}"
            )
        if len(y) != len(X):
            raise ValueError(
                f"[{estimator_name}] X and y must have the same length, "
                f"got X={len(X)} and y={len(y)}"
            )
130 changes: 130 additions & 0 deletions tests/test_feature_engineer.py
Original file line number Diff line number Diff line change
Expand Up @@ -693,3 +693,133 @@ def test_to_transformer_filter_by_unprefixed_name(mocker, sample_data_frame):

assert len(transformer.transformations) == 1
assert transformer.transformations[0]["feature_name"] == "llm_feat_age_double"


# =============================================================================
# Test: Input validation (new)
# =============================================================================


def test_init_invalid_max_features(mocker):
    """Constructing with max_features=0 must raise a ValueError."""
    mocker.patch("skfeaturellm.llm_interface.init_chat_model")
    expected = "max_features must be a positive integer"
    with pytest.raises(ValueError, match=expected):
        LLMFeatureEngineer(problem_type="classification", max_features=0)


def test_init_invalid_verbose(mocker):
    """Constructing with a negative verbose level must raise a ValueError."""
    mocker.patch("skfeaturellm.llm_interface.init_chat_model")
    expected = "verbose must be a non-negative integer"
    with pytest.raises(ValueError, match=expected):
        LLMFeatureEngineer(problem_type="classification", verbose=-1)


def test_fit_invalid_X_not_dataframe(mocker):
    """Passing a plain nested list to fit() instead of a DataFrame fails."""
    mocker.patch("skfeaturellm.llm_interface.init_chat_model")
    engineer = LLMFeatureEngineer(problem_type="classification")
    bad_input = [[1, 2], [3, 4]]
    with pytest.raises(ValueError, match="X must be a pandas DataFrame"):
        engineer.fit(bad_input)


def test_fit_invalid_X_empty(mocker):
    """An empty DataFrame passed to fit() is rejected."""
    mocker.patch("skfeaturellm.llm_interface.init_chat_model")
    engineer = LLMFeatureEngineer(problem_type="classification")
    empty_frame = pd.DataFrame()
    with pytest.raises(ValueError, match="X must not be empty"):
        engineer.fit(empty_frame)


def test_fit_invalid_y_not_series(mocker, sample_data_frame):
    """A plain-list target (not a Series) passed to fit() is rejected."""
    mocker.patch("skfeaturellm.llm_interface.init_chat_model")
    engineer = LLMFeatureEngineer(problem_type="classification")
    bad_target = [0, 1]
    with pytest.raises(ValueError, match="y must be a pandas Series"):
        engineer.fit(sample_data_frame, y=bad_target)


def test_fit_invalid_y_length_mismatch(mocker, sample_data_frame):
    """fit() rejects a target whose length differs from X's row count."""
    mocker.patch("skfeaturellm.llm_interface.init_chat_model")
    engineer = LLMFeatureEngineer(problem_type="classification")
    mismatched_target = pd.Series([0, 1, 2])
    with pytest.raises(ValueError, match="same length"):
        engineer.fit(sample_data_frame, y=mismatched_target)


def test_fit_stores_n_features_in(mocker, sample_data_frame):
    """After fit(), sklearn-style fitted attributes reflect the input frame."""
    fake_result = Mock()
    fake_result.ideas = []
    mocker.patch(
        "skfeaturellm.llm_interface.LLMInterface.generate_engineered_features",
        return_value=fake_result,
    )
    mocker.patch("skfeaturellm.llm_interface.init_chat_model")
    engineer = LLMFeatureEngineer(problem_type="classification")
    # fit() returns self, so assertions can run on its return value directly.
    fitted = engineer.fit(sample_data_frame)
    assert fitted.n_features_in_ == 3
    assert fitted.feature_names_in_ == ["age", "income", "city"]


def test_transform_raises_missing_columns(mocker, sample_data_frame):
    """transform() refuses input lacking columns that were seen at fit time."""
    fake_result = Mock()
    fake_result.ideas = []
    mocker.patch(
        "skfeaturellm.llm_interface.LLMInterface.generate_engineered_features",
        return_value=fake_result,
    )
    mocker.patch("skfeaturellm.llm_interface.init_chat_model")
    engineer = LLMFeatureEngineer(problem_type="classification")
    engineer.fit(sample_data_frame)
    incomplete_frame = sample_data_frame.drop(columns=["age"])
    with pytest.raises(ValueError, match="missing columns"):
        engineer.transform(incomplete_frame)


def test_fit_selective_invalid_n_rounds(mocker, numeric_data_frame):
    """fit_selective() rejects a non-positive n_rounds."""
    from sklearn.feature_selection import SelectKBest, f_classif

    mocker.patch("skfeaturellm.llm_interface.init_chat_model")
    engineer = LLMFeatureEngineer(problem_type="classification")
    target = pd.Series([0, 1])
    selector = SelectKBest(f_classif, k=1)
    with pytest.raises(ValueError, match="n_rounds must be a positive integer"):
        engineer.fit_selective(numeric_data_frame, target, selector, n_rounds=0)


def test_fit_selective_invalid_eval_set(mocker, numeric_data_frame):
    """fit_selective() rejects an eval_set that is not a (DataFrame, Series) pair."""
    from sklearn.feature_selection import SelectKBest, f_classif

    mocker.patch("skfeaturellm.llm_interface.init_chat_model")
    engineer = LLMFeatureEngineer(problem_type="classification")
    target = pd.Series([0, 1])
    selector = SelectKBest(f_classif, k=1)
    with pytest.raises(ValueError, match="eval_set must be a tuple"):
        engineer.fit_selective(numeric_data_frame, target, selector, eval_set="bad")


def test_evaluate_features_missing_columns_raises(mocker, sample_data_frame):
    """evaluate_features(is_transformed=True) fails when generated columns are absent."""
    mocker.patch("skfeaturellm.llm_interface.init_chat_model")
    engineer = LLMFeatureEngineer(
        problem_type="classification", feature_prefix="llm_feat_"
    )
    # Mark the engineer as fitted with one idea whose prefixed output column
    # ("llm_feat_age_double") is absent from the raw input frame.
    idea = FeatureEngineeringIdea(
        type="mul",
        feature_name="age_double",
        columns=["age"],
        parameters={"constant": 2.0},
        description="Double the age",
    )
    engineer.generated_features_ideas_ = [idea]
    target = pd.Series([0, 1])
    expected = "Expected generated feature columns not found"
    with pytest.raises(ValueError, match=expected):
        engineer.evaluate_features(sample_data_frame, target, is_transformed=True)