diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py
index e69de29..6744a7a 100644
--- a/autokaggle/__init__.py
+++ b/autokaggle/__init__.py
@@ -0,0 +1,2 @@
+from autokaggle.auto_ml import Classifier, Regressor
+from autokaggle.ensemblers import *
diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py
new file mode 100644
index 0000000..3cfef69
--- /dev/null
+++ b/autokaggle/auto_ml.py
@@ -0,0 +1,643 @@
+from sklearn.base import BaseEstimator, is_classifier
+from abc import abstractmethod
+import numpy as np
+import os
+import random
+import json
+from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error
+from joblib import dump, load
+
+from autokaggle.preprocessor import Preprocessor
+from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, \
+    read_json
+from lightgbm import LGBMClassifier, LGBMRegressor
+from autokaggle.config import Config, CLASSIFICATION_PREP_HPARAM_SPACE, \
+    REGRESSION_PREP_HPARAM_SPACE, \
+    REGRESSION_BASE_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE, \
+    CLASSIFICATION_HPARAM_SPACE, REGRESSION_HPARAM_SPACE
+from sklearn.model_selection import StratifiedKFold, KFold
+import hyperopt
+from hyperopt import tpe, hp, fmin, Trials, STATUS_OK, STATUS_FAIL
+from sklearn.model_selection import cross_val_score
+from autokaggle.ensemblers import RankedEnsemblingModel, StackedEnsemblingModel
+from imblearn.over_sampling import SMOTE, SMOTENC
+import collections
+
+
+class AutoKaggle(BaseEstimator):
+    """ Automated Machine Learning system class.
+
+    AutoKaggle implements an end to end automated ML system. It initiates and
+    searches for the optimum ML pipeline. The user can use it with the simple
+    `fit()` and `predict()` methods like Sci-kit learn estimators.
+    The user can specify various parameters controlling different components
+    of the system.
+    # Arguments
+        path: String. OS path for storing temporary model parameters.
+        verbose: Bool. Defines the verbosity of the logging.
+        time_limit: Int. Time budget for performing search and fit pipeline.
+        use_ensembling: Bool. Defines whether to use an ensemble of models
+        num_estimators_ensemble: Int. Maximum number of estimators to be used
+            in an ensemble
+        ensemble_strategy: String. Strategy to ensemble models
+        ensemble_method: String. Aggregation method if ensemble_strategy is
+            set to ranked_ensembling
+        random_ensemble: Bool. Whether the ensembling estimators are picked
+            randomly.
+        diverse_ensemble: Bool. Whether estimators from different families are
+            picked.
+        ensembling_search_iter: Int. Search iterations for ensembling
+            hyper-parameter search
+        search_algo: String. Search strategy for hyper-parameter search.
+        search_iter: Int. Number of iterations used for hyper-parameter search.
+        cv_folds: Int. Number of Cross Validation folds.
+        subsample_ratio: Percent of subsample used for hyper-parameter
+            search.
+        data_info: list(String). Lists the datatypes of each feature column.
+        stack_probabilities: Bool. Whether to use class probabilities in
+            ensembling.
+        upsample_classes: Bool. Whether to upsample less represented classes
+        num_p_hparams: Int. Number of preprocessor search spaces.
+    """
+
+    def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True,
+                 num_estimators_ensemble=50, ensemble_strategy='stacking',
+                 ensemble_method='max_voting',
+                 search_iter=500, cv_folds=3, subsample_ratio=0.1,
+                 random_ensemble=False, diverse_ensemble=True,
+                 stack_probabilities=False, data_info=None, upsample_classes=False,
+                 ensembling_search_iter=10,
+                 search_algo='random', num_p_hparams=10):
+        self.is_trained = False
+        if not path:
+            path = rand_temp_folder_generator()
+        self.config = Config(path=path, verbose=verbose, time_limit=time_limit,
+                             use_ensembling=use_ensembling,
+                             num_estimators_ensemble=num_estimators_ensemble,
+                             ensemble_strategy=ensemble_strategy,
+                             ensemble_method=ensemble_method,
+                             search_iter=search_iter, cv_folds=cv_folds,
+                             subsample_ratio=subsample_ratio,
+                             random_ensemble=random_ensemble,
+                             diverse_ensemble=diverse_ensemble,
+                             stack_probabilities=stack_probabilities,
+                             data_info=data_info, upsample_classes=upsample_classes,
+                             ensembling_search_iter=ensembling_search_iter,
+                             search_algo=search_algo,
+                             num_p_hparams=num_p_hparams)
+        self.pipeline = None
+        self.m_hparams = None
+        self.m_hparams_base = None
+        self.p_hparams_base = None
+
+    def fit(self, x, y, time_limit=None, data_info=None):
+        """ Train an autoML system.
+        # Arguments
+            x: A numpy.ndarray instance containing the training data.
+            y: training label vector.
+            time_limit: remaining time budget; defaults to 24 hours when
+                omitted.
+            data_info: meta-features of the dataset, an array-like describing
+                the feature type of each column in x. The feature types are
+                'TIME' for temporal features, 'NUM' for other numerical
+                features and 'CAT' for categorical features. Inferred with
+                `extract_data_info` when omitted.
+        # Returns
+            None
+        # Raises
+            ValueError: if x has no feature columns.
+        The search runs twice: first over preprocessing settings with default
+        estimator parameters, then over estimator hyper-parameters using the
+        best preprocessing settings found.
+        """
+        self.config.time_limit = time_limit if time_limit else 24 * 60 * 60
+
+        # Extract or read data info
+        self.config.data_info = np.asarray(data_info) if data_info is not None \
+            else self.extract_data_info(x)
+
+        if self.config.verbose:
+            print('DATA_INFO: {}'.format(self.config.data_info))
+            print('#TIME features: {}'.format(sum(self.config.data_info == 'TIME')))
+            print('#NUM features: {}'.format(sum(self.config.data_info == 'NUM')))
+            print('#CAT features: {}'.format(sum(self.config.data_info == 'CAT')))
+
+        if x.shape[1] == 0:
+            raise ValueError("No feature exist!")
+
+        x, y = self.resample(x, y)
+
+        # Re-derive the task on every fit so refitting on new labels works
+        if self.config.objective in ('classification', 'binary', 'multiclass'):
+            n_classes = len(set(y))
+            self.config.objective = 'binary' if n_classes == 2 else 'multiclass'
+
+        # Search the top preprocessing setting
+        trials = self.search(x, y, self.p_hparams_base, self.m_hparams_base)
+        p_hparams = self.get_top_prep(trials, self.config.num_p_hparams)
+        # Search the best pipelines
+        trials = self.search(x, y, p_hparams, self.m_hparams)
+        self.pipeline = self.get_best_pipeline(trials)
+        # Fit data
+        self.pipeline.fit(x, y)
+        self.is_trained = True
+
+    def predict(self, x_test):
+        """ Generate prediction on the test data for the given task.
+        # Arguments
+            x_test: A numpy.ndarray instance containing the test data.
+        # Returns
+            A numpy array for the predictions on the x_test.
+        Raises ValueError when no pipeline has been fitted yet.
+        """
+        y = None if self.pipeline is None else self.pipeline.predict(x_test)
+        if y is None:
+            raise ValueError("Tabular predictor does not exist")
+        return y
+
+    def predict_proba(self, x_test):
+        """ Predict label probabilities on the test data for the given
+        classification task.
+        # Arguments
+            x_test: A numpy.ndarray instance containing the test data.
+        # Returns
+            A numpy array for the prediction probabilities on the x_test.
+        Raises ValueError when no pipeline has been fitted yet.
+        """
+        y = None if self.pipeline is None else self.pipeline.predict_proba(x_test)
+        if y is None:
+            raise ValueError("Tabular predictor does not exist")
+        return y
+
+    def evaluate(self, x_test, y_test):
+        """ Score the predictions for x_test against the ground truth, using
+        a metric chosen by the task type.
+        # Arguments
+            x_test: A numpy.ndarray instance containing the test data.
+            y_test: A numpy array with ground truth labels for the test data
+        # Returns
+            ROC AUC (binary), weighted F1 (multiclass) or MSE (regression).
+        """
+        if self.config.verbose:
+            print('objective:', self.config.objective)
+        y_pred = self.predict(x_test)
+        results = None
+        if self.config.objective == 'binary':
+            results = roc_auc_score(y_test, y_pred)
+        elif self.config.objective == 'multiclass':
+            results = f1_score(y_test, y_pred, average='weighted')
+        elif self.config.objective == 'regression':
+            results = mean_squared_error(y_test, y_pred)
+        return results
+
+    def resample(self, x, y):
+        """ Up-samples the input data (rows are duplicated up to at least 60)
+        # Arguments
+            x: A numpy array for features
+            y: A numpy array for target
+        # Returns
+            Up-sampled version of the dataset
+        """
+        if self.config.upsample_classes:
+            x, y = SMOTE(
+                sampling_strategy=self.config.resampling_strategy).fit_resample(x, y)
+        while 0 < x.shape[0] < 60:  # empty input would never reach 60
+            x = np.concatenate([x, x], axis=0)
+            y = np.concatenate([y, y], axis=0)
+        return x, y
+
+    def subsample(self, x, y, sample_percent):
+        """ Takes a sub-sample of the input data, for the hyper-parameter search.
+        # Arguments
+            x: A numpy array for features
+            y: A numpy array for target
+            sample_percent: Minimum percentage of the data to be maintained
+        # Returns
+            Down-sampled dataset
+        """
+        # TODO: Add way to balance the subsample
+        # Set small sample for hyper-param search
+        if x.shape[0] > 600:
+            grid_train_percentage = max(600.0 / x.shape[0], sample_percent)
+        else:
+            grid_train_percentage = 1
+        grid_n = int(x.shape[0] * grid_train_percentage)
+        idx = random.sample(range(x.shape[0]), grid_n)
+        grid_train_x, grid_train_y = x[idx, :], y[idx]
+        return grid_train_x, grid_train_y
+
+    def search(self, x, y, prep_space, model_space):
+        """ Do hyper-parameter search to find optimal machine learning pipeline.
+        # Arguments
+            x: A numpy array for features
+            y: A numpy array for target
+            prep_space: Hyper-parameter search space for preprocessors
+            model_space: Hyper-parameter search space for estimators
+        # Returns
+            List of hyper-parameter trials
+        """
+        grid_train_x, grid_train_y = \
+            self.subsample(x, y, sample_percent=self.config.subsample_ratio)
+        score_metric, skf = self.get_skf(self.config.cv_folds)
+
+        def objective_func(params):
+            model_class = params['estimator']['model']
+            m_params = params['estimator']['param']
+            p_params = params['prep']
+            pipeline = AutoPipe(model_class=model_class, m_params=m_params,
+                                p_params=p_params, config=self.config)
+            try:
+                eval_score = cross_val_score(pipeline, grid_train_x, grid_train_y,
+                                             scoring=score_metric, cv=skf).mean()
+            except ValueError as e:
+                print(e)
+                eval_score = float('-inf')
+            # A failed fold can surface as NaN instead of an exception
+            status = STATUS_OK if np.isfinite(eval_score) else STATUS_FAIL
+            if self.config.verbose:
+                print("CV Score:", eval_score)
+                print("\n=================")
+            loss = 1 - eval_score if status == STATUS_OK else float('inf')
+            return {'loss': loss, 'status': status, 'model_class': model_class,
+                    'm_params': m_params,
+                    'p_params': p_params}
+
+        trials = Trials()
+        search_space = {'prep': prep_space, 'estimator': model_space}
+        _ = fmin(objective_func, search_space, algo=self.config.search_algo,
+                 trials=trials,
+                 max_evals=self.config.search_iter,
+                 rstate=np.random.RandomState(self.config.random_state))
+        return trials
+
+    def get_best_pipeline(self, trials):
+        """ Finds the optimal pipeline from the given list of search trials.
+        # Arguments
+            trials: List of hyper-parameter search trials
+        # Returns
+            Optimal pipeline based on the given list of trials
+        """
+        # Looked up before branching: the verbose report below needs it on the
+        # ensembling path too.
+        opt = trials.best_trial['result']
+        if self.config.use_ensembling:
+            best_pipeline = self.setup_ensemble(trials)
+        else:
+            best_pipeline = AutoPipe(opt['model_class'], opt['m_params'],
+                                     opt['p_params'], self.config)
+        if self.config.verbose:
+            print("The best hyperparameter setting found:")
+            print(opt)
+        return best_pipeline
+
+    @staticmethod
+    def get_top_prep(trials, n):
+        """ Find the list of top N preprocessor settings.
+        # Arguments
+            trials: List of hyper-parameter search trials
+            n: Maximum number of preprocessor settings required
+        # Returns
+            List of the top N optimal preprocessor settings.
+        # Raises
+            ValueError: if no trial finished successfully.
+        """
+        best_trials = [t for t in trials.results if np.isfinite(t['loss'])]
+        best_trials = sorted(best_trials, key=lambda k: k['loss'])
+        top_p_hparams = []
+        for trial in best_trials:
+            # Keep the setting itself (not the whole trial record) and skip
+            # duplicates, so the returned space only yields preprocessor dicts.
+            if trial['p_params'] not in top_p_hparams:
+                top_p_hparams.append(trial['p_params'])
+                if len(top_p_hparams) >= n:
+                    break
+        if not top_p_hparams:
+            raise ValueError("No successful trial to pick preprocessing from")
+        return hp.choice('p_params', top_p_hparams)
+
+    @abstractmethod
+    def get_skf(self, folds):
+        """ Get the scoring metric and the cross validation folds for evaluation.
+        # Arguments
+            folds: Number of cross validation folds
+        # Returns
+            Scoring metric and cross validation folds.
+        """
+        pass
+
+    def pick_diverse_estimators(self, trial_list):
+        """ Selects the best hyper-parameter settings from each estimator family.
+        # Arguments
+            trial_list: List of the hyper-parameter search trials.
+        # Returns
+            List of top hyper-parameter spaces equally selected from each
+            estimator family.
+        """
+        groups = collections.defaultdict(list)
+        for obj in trial_list:
+            groups[obj['model_class']].append(obj)
+        # Cap at the number of trials: asking for more could never be satisfied
+        # and the round-robin below would not terminate.
+        target = min(self.config.num_estimators_ensemble, len(trial_list))
+        estimator_list = []
+        j = 0
+        while len(estimator_list) < target:
+            for grp in groups.values():
+                # Re-check the cap inside the pass so it is never exceeded.
+                if j < len(grp) and len(estimator_list) < target:
+                    est = AutoPipe(grp[j]['model_class'], grp[j]['m_params'],
+                                   grp[j]['p_params'], self.config)
+                    estimator_list.append(est)
+            j += 1
+        return estimator_list
+
+    def setup_ensemble(self, trials):
+        """ Generates the optimal ensembling estimator based on the given setting.
+        # Arguments
+            trials: List of the hyper-parameter search trials.
+        # Returns
+            An ensembling estimator to be trained using the base estimators picked
+            from trials.
+        """
+        # Filter the unsuccessful hparam spaces, i.e. those with non-finite 'loss'
+        best_trials = [t for t in trials.results if np.isfinite(t['loss'])]
+        best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False)
+
+        self.config.num_estimators_ensemble = min(
+            self.config.num_estimators_ensemble, len(best_trials))
+
+        if self.config.random_ensemble:
+            np.random.shuffle(best_trials)
+
+        if self.config.diverse_ensemble:
+            estimator_list = self.pick_diverse_estimators(best_trials)
+        else:
+            estimator_list = []
+            for i in range(self.config.num_estimators_ensemble):
+                est = AutoPipe(best_trials[i]['model_class'],
+                               best_trials[i]['m_params'],
+                               best_trials[i]['p_params'],
+                               self.config)
+                estimator_list.append(est)
+
+        if self.config.ensemble_strategy == 'stacking':
+            best_estimator_ = StackedEnsemblingModel(estimator_list,
+                                                     config=self.config)
+        else:
+            best_estimator_ = RankedEnsemblingModel(estimator_list,
+                                                    config=self.config)
+        return best_estimator_
+
+    @staticmethod
+    def extract_data_info(raw_x):
+        """
+        Extracts the data info automatically based on the type of each feature in
+        raw_x: 'NUM' if the column converts to float, 'CAT' otherwise.
+        # Arguments
+            raw_x: a numpy.ndarray instance containing the training data.
+        # Returns
+            A numpy array of data-types, one per feature in the data.
+        """
+        data_info = []
+        for col_idx in range(raw_x.shape[1]):
+            try:
+                # Builtin float: the np.float alias was removed in NumPy 1.24.
+                raw_x[:, col_idx].astype(float)
+                data_info.append('NUM')
+            except (ValueError, TypeError):
+                data_info.append('CAT')
+        return np.array(data_info)
+
+
+class Classifier(AutoKaggle):
+    """ Extends AutoKaggle for Classification.
+
+    Extends the AutoKaggle specific to the classification requirements.
+    # Arguments
+        path: String. OS path for storing temporary model parameters.
+        verbose: Bool. Defines the verbosity of the logging.
+        time_limit: Int. Time budget for performing search and fit pipeline.
+        use_ensembling: Bool. Defines whether to use an ensemble of models
+        num_estimators_ensemble: Int. Maximum number of estimators to be used
+            in an ensemble
+        ensemble_strategy: String. Strategy to ensemble models
+        ensemble_method: String. Aggregation method if ensemble_strategy is
+            set to ranked_ensembling
+        random_ensemble: Bool. Whether the ensembling estimators are picked
+            randomly.
+        diverse_ensemble: Bool. Whether estimators from different families are
+            picked.
+        ensembling_search_iter: Int. Search iterations for ensembling
+            hyper-parameter search
+        search_algo: String. Search strategy for hyper-parameter search.
+        search_iter: Int. Number of iterations used for hyper-parameter search.
+        cv_folds: Int. Number of Cross Validation folds.
+        subsample_ratio: Percent of subsample used for hyper-parameter
+            search.
+        data_info: list(String). Lists the datatypes of each feature column.
+        stack_probabilities: Bool. Whether to use class probabilities in
+            ensembling.
+        upsample_classes: Bool. Whether to upsample less represented classes
+        num_p_hparams: Int. Number of preprocessor search spaces.
+    """
+
+    def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True,
+                 num_estimators_ensemble=50, ensemble_strategy='stacking',
+                 ensemble_method='max_voting',
+                 search_iter=500, cv_folds=3, subsample_ratio=0.1,
+                 random_ensemble=False, diverse_ensemble=True,
+                 stack_probabilities=False, data_info=None, upsample_classes=False,
+                 ensembling_search_iter=10,
+                 search_algo='random', num_p_hparams=10):
+        super().__init__(path=path, verbose=verbose, time_limit=time_limit,
+                         use_ensembling=use_ensembling,
+                         num_estimators_ensemble=num_estimators_ensemble,
+                         ensemble_strategy=ensemble_strategy,
+                         ensemble_method=ensemble_method, search_iter=search_iter,
+                         cv_folds=cv_folds,
+                         subsample_ratio=subsample_ratio,
+                         random_ensemble=random_ensemble,
+                         diverse_ensemble=diverse_ensemble,
+                         stack_probabilities=stack_probabilities,
+                         data_info=data_info,
+                         upsample_classes=upsample_classes,
+                         ensembling_search_iter=ensembling_search_iter,
+                         search_algo=search_algo, num_p_hparams=num_p_hparams)
+        self.config.objective = 'classification'
+        self.m_hparams = hp.choice('classifier',
+                                   [CLASSIFICATION_HPARAM_SPACE[m] for m in
+                                    self.config.classification_models])
+        self.m_hparams_base = hp.choice('classifier',
+                                        [CLASSIFICATION_BASE_HPARAM_SPACE[m] for m in
+                                         self.config.classification_models])
+        self.p_hparams_base = CLASSIFICATION_PREP_HPARAM_SPACE
+
+    def get_skf(self, folds):
+        """
+        See the base class.
+        """
+        # Binary and multiclass share the stratified splitter; only the
+        # scoring metric differs.
+        if self.config.objective == 'binary':
+            score_metric = 'roc_auc'
+        else:
+            score_metric = 'f1_weighted'
+        skf = StratifiedKFold(n_splits=folds, shuffle=True,
+                              random_state=self.config.random_state)
+        return score_metric, skf
+
+
+class Regressor(AutoKaggle):
+    """ Extends AutoKaggle for Regression
+
+    Extends the AutoKaggle specific to the regression requirements.
+    # Arguments
+        path: String. OS path for storing temporary model parameters.
+        verbose: Bool. Defines the verbosity of the logging.
+        time_limit: Int. Time budget for performing search and fit pipeline.
+        use_ensembling: Bool. Defines whether to use an ensemble of models
+        num_estimators_ensemble: Int. Maximum number of estimators to be used
+            in an ensemble
+        ensemble_strategy: String. Strategy to ensemble models
+        ensemble_method: String. Aggregation method if ensemble_strategy is
+            set to ranked_ensembling
+        random_ensemble: Bool. Whether the ensembling estimators are picked
+            randomly.
+        diverse_ensemble: Bool. Whether estimators from different families are
+            picked.
+        ensembling_search_iter: Int. Search iterations for ensembling
+            hyper-parameter search
+        search_algo: String. Search strategy for hyper-parameter search.
+        search_iter: Int. Number of iterations used for hyper-parameter search.
+        cv_folds: Int. Number of Cross Validation folds.
+        subsample_ratio: Percent of subsample used for hyper-parameter
+            search.
+        data_info: list(String). Lists the datatypes of each feature column.
+        stack_probabilities: Bool. Whether to use class probabilities in
+            ensembling.
+        upsample_classes: Bool. Whether to upsample less represented classes
+        num_p_hparams: Int. Number of preprocessor search spaces.
+    """
+
+    def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True,
+                 num_estimators_ensemble=50, ensemble_strategy='stacking',
+                 ensemble_method='max_voting',
+                 search_iter=500, cv_folds=3, subsample_ratio=0.1,
+                 random_ensemble=False, diverse_ensemble=True,
+                 stack_probabilities=False, data_info=None, upsample_classes=False,
+                 ensembling_search_iter=10,
+                 search_algo='random', num_p_hparams=10):
+        super().__init__(path=path, verbose=verbose, time_limit=time_limit,
+                         use_ensembling=use_ensembling,
+                         num_estimators_ensemble=num_estimators_ensemble,
+                         ensemble_strategy=ensemble_strategy,
+                         ensemble_method=ensemble_method, search_iter=search_iter,
+                         cv_folds=cv_folds,
+                         subsample_ratio=subsample_ratio,
+                         random_ensemble=random_ensemble,
+                         diverse_ensemble=diverse_ensemble,
+                         stack_probabilities=stack_probabilities,
+                         data_info=data_info,
+                         upsample_classes=upsample_classes,
+                         ensembling_search_iter=ensembling_search_iter,
+                         search_algo=search_algo, num_p_hparams=num_p_hparams)
+        self.config.objective = 'regression'
+        self.m_hparams = hp.choice('regressor', [REGRESSION_HPARAM_SPACE[m] for m in
+                                                 self.config.regression_models])
+        self.m_hparams_base = hp.choice('regressor',
+                                        [REGRESSION_BASE_HPARAM_SPACE[m] for m in
+                                         self.config.regression_models])
+        self.p_hparams_base = REGRESSION_PREP_HPARAM_SPACE
+
+    def get_skf(self, folds):
+        """
+        See the base class.
+        """
+        return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True,
+                                               random_state=self.config.random_state)
+
+
+class AutoPipe(BaseEstimator):
+    """ Implements a machine learning pipeline.
+
+    Implements a machine learning pipeline with preprocessor and estimator. A
+    user can call fit(), and predict() methods on it. It is used as a search
+    unit in AutoKaggle's hyper-parameter search.
+    # Arguments
+        config: Config. Defines the configuration of various components of the
+            pipeline.
+        m_params: Dict. Hyper-parameter search space for estimator.
+        p_params: Dict. Hyper-parameter search space for preprocessor.
+        model_class: Estimator. Class name of the estimator used in the pipeline.
+        _estimator_type: String. Denotes if the estimator is 'classifier' or
+            'regressor'
+        prep: Preprocessor. Instance of the Preprocessor class, which does
+            basic feature preprocessing and feature
+            engineering
+        model: Estimator. Instance of the estimator class which learns a
+            machine learning model and predicts on the
+            given data.
+    """
+
+    def __init__(self, model_class, m_params, p_params, config):
+        self.prep = None
+        self.model = None
+        self.config = config
+        self.m_params = m_params
+        self.p_params = p_params
+        self.model_class = model_class
+        self._estimator_type = 'classifier' if is_classifier(
+            model_class) else 'regressor'
+
+    def fit(self, x, y):
+        """ Trains the given pipeline.
+        # Arguments
+            x: A numpy.ndarray instance containing the training data.
+            y: training label vector.
+        # Returns
+            None
+        """
+        self.prep = Preprocessor(self.config, self.p_params)
+        self.model = self.model_class(**self.m_params)
+        x = self.prep.fit_transform(x, y)
+        self.model.fit(x, y)
+
+    def predict(self, x):
+        """ Generate prediction on the test data for the given task.
+        # Arguments
+            x: A numpy.ndarray instance containing the test data.
+        # Returns
+            A numpy array for the predictions on the x.
+        This function provides predictions of labels on (test) data.
+        """
+        x = self.prep.transform(x)
+        return self.model.predict(x)
+
+    def predict_proba(self, x):
+        """ Predict label probabilities on the test data for the given
+        classification task.
+        # Arguments
+            x: A numpy.ndarray instance containing the test data.
+        # Returns
+            A numpy array for the prediction probabilities on the x.
+        Falls back to predict() when the estimator has no predict_proba.
+        """
+        x = self.prep.transform(x)
+        try:
+            return self.model.predict_proba(x)
+        except AttributeError:
+            return self.model.predict(x)
+
+    def decision_function(self, x):
+        """ Returns the decision function learned by the estimator.
+        # Arguments
+            x: A numpy.ndarray instance containing the test data.
+        # Returns
+            Decision function learned by the estimator.
+        This is used by the scorers to evaluate the pipeline.
+        """
+        x = self.prep.transform(x)
+        # Let the estimator's own AttributeError (with its message) propagate
+        # when it has no decision_function; callers that probe for this method
+        # presumably fall back to another one - verify against the scorer used.
+        return self.model.decision_function(x)
diff --git a/autokaggle/config.py b/autokaggle/config.py
new file mode 100644
index 0000000..729c156
--- /dev/null
+++ b/autokaggle/config.py
@@ -0,0 +1,351 @@
+from sklearn.base import BaseEstimator
+from autokaggle.utils import rand_temp_folder_generator, ensure_dir
+import hyperopt
+from hyperopt import hp
+from sklearn.svm import SVC
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
+    RandomForestRegressor, AdaBoostRegressor, \
+    ExtraTreesRegressor
+from sklearn.linear_model import Ridge
+from lightgbm import LGBMClassifier, LGBMRegressor
+from catboost import CatBoostClassifier, Pool, CatBoostRegressor
+import numpy as np
+
+
+class Config:
+    """ Configuration for various autoML components.
+
+    Defines the common configuration of different auto ML components. It is
+    shared between AutoKaggle, AutoPipe, Preprocessor and Ensembling class.
+
+    # Arguments
+        path: String. OS path for storing temporary model parameters.
+        verbose: Bool. Defines the verbosity of the logging.
+        time_limit: Int. Time budget for performing search and fit pipeline.
+        use_ensembling: Bool. Defines whether to use an ensemble of models
+        num_estimators_ensemble: Int. Maximum number of estimators to be used
+            in an ensemble
+        ensemble_strategy: String. Strategy to ensemble models
+        ensemble_method: String. Aggregation method if ensemble_strategy is
+            set to ranked_ensembling
+        random_ensemble: Bool. Whether the ensembling estimators are picked
+            randomly.
+        diverse_ensemble: Bool. Whether estimators from different families are
+            picked.
+        ensembling_search_iter: Int. Search iterations for ensembling
+            hyper-parameter search
+        search_algo: String. Search strategy for hyper-parameter search.
+        search_iter: Int. Number of iterations used for hyper-parameter search.
+        cv_folds: Int. Number of Cross Validation folds.
+        subsample_ratio: Percent of subsample used for hyper-parameter
+            search.
+        data_info: list(String). Lists the datatypes of each feature column.
+        stack_probabilities: Bool. Whether to use class probabilities in
+            ensembling.
+        upsample_classes: Bool. Whether to upsample less represented classes
+        num_p_hparams: Int. Number of preprocessor search spaces.
+    """
+
+    def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True,
+                 num_estimators_ensemble=50,
+                 ensemble_strategy='stacking', ensemble_method='max_voting',
+                 search_iter=500, cv_folds=3,
+                 subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True,
+                 stack_probabilities=False,
+                 data_info=None, upsample_classes=False, ensembling_search_iter=10,
+                 search_algo='random',
+                 num_p_hparams=10):
+        self.verbose = verbose
+        self.path = path if path is not None else rand_temp_folder_generator()
+        ensure_dir(self.path)
+        if self.verbose:
+            print('Path:', self.path)
+        self.time_limit = time_limit
+        self.objective = None
+        self.use_ensembling = use_ensembling
+        self.hparams = None
+        self.num_estimators_ensemble = num_estimators_ensemble
+        self.ensemble_strategy = ensemble_strategy
+        self.ensemble_method = ensemble_method
+        self.random_ensemble = random_ensemble
+        self.search_iter = search_iter
+        self.cv_folds = cv_folds
+        self.subsample_ratio = subsample_ratio
+        self.resampling_strategy = 'auto'
+        self.random_state = 1001
+        self.classification_models = ['knn', 'svm', 'lgbm', 'random_forest',
+                                      'adaboost']
+        self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest',
+                                  'adaboost', 'catboost']
+        self.diverse_ensemble = diverse_ensemble
+        self.stack_probabilities = stack_probabilities
+        self.data_info = data_info
+        self.upsample_classes = upsample_classes
+        self.ensembling_search_iter = ensembling_search_iter
+        self.search_algo = hyperopt.rand.suggest if search_algo == 'random' else \
+            hyperopt.tpe.suggest
+        self.num_p_hparams = num_p_hparams
+
+    def update(self, options):
+        """Set existing attributes from options; unknown keys are ignored."""
+        for k, v in options.items():
+            if hasattr(self, k):
+                setattr(self, k, v)
+
+
+KNN_CLASSIFIER_PARAMS = {
+    'n_neighbors': hp.choice('n_neighbors_knn', [1, 2, 4, 8, 16, 32, 64, 100]),
+    'weights': hp.choice('weight_knn', ['uniform', 'distance']),
+    'metric': hp.choice('metric_knn',
+                        ["euclidean", "manhattan", "chebyshev", "minkowski"]),
+    'p': hp.choice('p_knn', range(1, 3)),
+}
+
+SVM_CLASSIFIER_PARAMS = {
+    'C': hp.loguniform('C_svm', np.log(0.03125), np.log(32768)),
+    'kernel': hp.choice('kernel_svm', ['rbf', 'poly', 'sigmoid']),
+    'degree': hp.choice('degree_svm', range(2, 6)),
+    'gamma': hp.loguniform('gamma_svm', np.log(3e-5), np.log(8)),
+    'max_iter': 50000,
+}
+
+RANDOM_FOREST_CLASSIFIER_PARAMS = {
+    'criterion': hp.choice('criterion_rf', ['entropy', 'gini']),
+    'max_features': hp.uniform('max_features_rf', 0, 1.0),
+    'n_estimators': hp.choice('n_estimators_rf', [100, 50]),
+    'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 20)),
+    'min_samples_split': hp.choice('min_samples_split_rf', range(2, 20)),
+}
+
+LGBM_CLASSIFIER_PARAMS = {
+    'boosting_type': 'gbdt',
+    'min_split_gain': 0.1,
+    'subsample': 0.8,
+    'num_leaves': 80,
+    'colsample_bytree': hp.uniform('colsample_bytree_lgbm', 0.4, 0.8),
+    'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)),
+    'max_depth': hp.choice('max_depth_lgbm', range(5, 10)),
+    'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)),
+    'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-2),
+                                   high=np.log(2)),
+}
+
+ADABOOST_CLASSIFIER_PARAMS = {
+    'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']),
+    'n_estimators': hp.choice('n_estimators_adaboost', range(50, 500)),
+    'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2),
+                                   high=np.log(2)),
+}
+
+CATBOOST_CLASSIFIER_PARAMS = {
+    'iterations': hp.choice('iterations_catboost', [5, 10]),
+    'depth': hp.choice('depth_catboost', range(4, 11)),
+    'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3),
+                                   high=np.log(1)),
+    'loss_function': hp.choice('loss_function_catboost',
+                               ['Logloss', 'CrossEntropy']),
+    'verbose': True,
+    'leaf_estimation_iterations': 10,
+    'l2_leaf_reg': hp.choice('l2_leaf_reg_catboost', np.logspace(-20, -19, 3))
+}
+
+EXTRA_TREES_REGRESSOR_PARAMS = {
+    'n_estimators': hp.choice('n_estimators_extra_trees', [50, 100, 200]),
+    'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']),
+    'max_features': hp.uniform('max_features_extra_trees', 0, 1.0),
+    'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 20)),
+    'min_samples_split': hp.choice('min_samples_split_extra_trees', range(2, 20)),
+    'min_impurity_decrease': 0.0,
+    'bootstrap': hp.choice('bootstrap_extra_trees', [True, False]),
+}
+
+RIDGE_REGRESSOR_PARAMS = {  # hp.loguniform bounds are given in log space
+    'fit_intercept': True,
+    'tol': hp.loguniform('tol_ridge', np.log(1e-5), np.log(1e-1)),
+    'alpha': hp.loguniform('alpha_ridge', np.log(1e-5), np.log(10))
+}
+
+RANDOM_FOREST_REGRESSOR_PARAMS = {
+    'criterion': hp.choice('criterion_rf', ['mse', 'friedman_mse', 'mae']),
+    'max_features': hp.uniform('max_features_rf', 0.1, 1.0),
+    'n_estimators': hp.choice('n_estimators_rf', [50, 100, 200]),
+    'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 10)),
+    'min_samples_split': hp.choice('min_samples_split_rf', range(2, 10)),
+    'bootstrap': hp.choice('bootstrap_rf', [True, False]),
+}
+
+LGBM_REGRESSOR_PARAMS = { + 'boosting_type': 'gbdt', + 'min_split_gain': 0.1, + 'subsample': 0.8, + 'num_leaves': 80, + 'colsample_bytree': hp.uniform('colsample_bytree_lgbm', 0.4, 0.8), + 'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)), + 'max_depth': hp.choice('max_depth_lgbm', range(5, 10)), + 'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)), + 'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-5), + high=np.log(1)), +} + +ADABOOST_REGRESSOR_PARAMS = { + 'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]), + 'n_estimators': hp.choice('n_estimators_adaboost', range(50, 300)), + 'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2), + high=np.log(2)), + # 'max_depth': hp.choice('max_depth_adaboost', range(1, 11)), +} + +CATBOOST_REGRESSOR_PARAMS = { + 'iterations': 2, + 'depth': hp.choice('depth_catboost', range(4, 10)), + 'learning_rate': 1, + 'loss_function': 'RMSE', + 'verbose': True +} + +REGRESSION_HPARAM_SPACE = { + 'extratree': { + 'model': ExtraTreesRegressor, + 'param': EXTRA_TREES_REGRESSOR_PARAMS + }, + 'ridge': { + 'model': Ridge, + 'param': RIDGE_REGRESSOR_PARAMS + }, + 'random_forest': { + 'model': RandomForestRegressor, + 'param': RANDOM_FOREST_REGRESSOR_PARAMS + }, + 'lgbm': { + 'model': LGBMRegressor, + 'param': LGBM_REGRESSOR_PARAMS + }, + 'adaboost': { + 'model': AdaBoostRegressor, + 'param': ADABOOST_REGRESSOR_PARAMS + }, + 'catboost': { + 'model': CatBoostRegressor, + 'param': CATBOOST_REGRESSOR_PARAMS + } +} + +CLASSIFICATION_HPARAM_SPACE = { + 'knn': { + 'model': KNeighborsClassifier, + 'param': KNN_CLASSIFIER_PARAMS + }, + 'svm': { + 'model': SVC, + 'param': SVM_CLASSIFIER_PARAMS + }, + 'random_forest': { + 'model': RandomForestClassifier, + 'param': RANDOM_FOREST_CLASSIFIER_PARAMS + }, + 'lgbm': { + 'model': LGBMClassifier, + 'param': LGBM_CLASSIFIER_PARAMS + }, + 'adaboost': { + 'model': AdaBoostClassifier, + 'param': ADABOOST_CLASSIFIER_PARAMS + 
# Hyperopt search space for the preprocessing stage of regression tasks.
# Each key is looked up by name in the Preprocessor.get_*_pipeline helpers:
#   - 'cat_encoding'                     -> get_categorical_pipeline
#   - 'scaling', 'log_transform', 'power_transform', 'pca', 'binning'
#                                        -> get_numerical_pipeline
#   - 'add_time_offset', 'add_time_diff' -> get_time_pipeline
#   - 'imputation_strategy'              -> get_imputation_pipeline
#   - 'pearson_thresh', 'feat_importance_thresh'
#                                        -> get_filtering_pipeline (a filter
#                                           step is added only when the
#                                           sampled threshold is > 0)
# The two 'cat_*_strategy' keys are commented out, so
# get_higher_order_pipeline falls back to its None defaults and adds no steps.
REGRESSION_PREP_HPARAM_SPACE = {
    'cat_encoding': hp.choice('cat_enc',
                              ['count', 'target+count', 'target+label', 'label']),
    'scaling': hp.choice('scaling', [True, False]),
    'log_transform': hp.choice('log_transform', [True, False]),
    'power_transform': hp.choice('power_transform', [True, False]),
    'pca': hp.choice('pca', [True, False]),
    'binning': hp.choice('binning', [True, False]),
    'add_time_offset': hp.choice('add_time_offset', [True, False]),
    'add_time_diff': hp.choice('add_time_diff', [True, False]),
    # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max',
    # 'min', None]),
    # 'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]),
    'imputation_strategy': hp.choice('imputation_strategy',
                                     ['most_frequent', 'zero']),
    'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01),
    'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01)
}
'target+label']), + 'scaling': hp.choice('scaling', [True, False]), + 'log_transform': hp.choice('log_transform', [True, False]), + 'power_transform': hp.choice('power_transform', [True, False]), + 'pca': hp.choice('pca', [True, False]), + 'binning': hp.choice('binning', [True, False]), + 'add_time_offset': hp.choice('add_time_offset', [True, False]), + 'add_time_diff': hp.choice('add_time_diff', [True, False]), + # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max', + # 'min', None]), + # 'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]), + 'imputation_strategy': hp.choice('imputation_strategy', + ['most_frequent', 'zero']), + 'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01), + 'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01) +} diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py new file mode 100644 index 0000000..95d0435 --- /dev/null +++ b/autokaggle/ensemblers.py @@ -0,0 +1,208 @@ +from sklearn.base import BaseEstimator +from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, \ + read_json +from abc import abstractmethod +import numpy as np +import os +import random +import json +from statistics import mode + +from sklearn.model_selection import RandomizedSearchCV, train_test_split +from sklearn.model_selection import StratifiedKFold, KFold +from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error +from joblib import dump, load +from scipy import stats +from lightgbm import LGBMClassifier, LGBMRegressor +import collections +from sklearn.model_selection import RandomizedSearchCV, cross_val_score +import hyperopt +from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK +from autokaggle.config import REGRESSION_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, \ + CLASSIFICATION_BASE_HPARAM_SPACE, \ + REGRESSION_BASE_HPARAM_SPACE + + +class EnsemblingModel: + """ Base class for ensembling estimators. 
+ + This class creates an ensembling estimator from a given list of estimators. + The user can call fit() and predict() methods, similar to the scikit-learn + estimators. + + # Arguments + config: Config. Defines the configuration of various components of the + autoML pipeline. + estimator_list: List. List of the estimators, to be used for building an + ensemble. + """ + + def __init__(self, estimator_list, config): + self.config = config + self.estimator_list = estimator_list + + @abstractmethod + def fit(self, x, y): + """ Trains the ensemble of estimators on the training data. + # Arguments + X: A numpy array instance containing the training data. + # Returns + None + """ + pass + + @abstractmethod + def predict(self, x): + """ Generate prediction on the test data for the given task. + # Arguments + X: A numpy array instance containing the test data. + # Returns + A numpy array for the predictions on the x_test. + This function provides predicts on the input data using the ensemble of + estimators. + """ + pass + + +class RankedEnsemblingModel(EnsemblingModel): + """ Implements ensembling using ranking based methods. 
def predict(self, x):
    """ Combine the base estimators' predictions into one value per sample.
    # Arguments
        x: Array-like test data; `len(x)` gives the number of samples.
    # Returns
        A 1-D numpy array holding one aggregated prediction per sample.
    # Raises
        ValueError: If `config.ensemble_method` is not one of 'median',
            'mean', 'max', 'min' or 'max_voting'.
    """
    # One column per estimator, one row per sample.
    predictions = np.zeros((len(x), len(self.estimator_list)))
    for i, est in enumerate(self.estimator_list):
        predictions[:, i] = est.predict(x)

    method = self.config.ensemble_method
    if method == 'max_voting':
        # stats.mode returns (modes, counts). Depending on the SciPy version
        # the modes keep a trailing axis of length 1, so flatten them to match
        # the 1-D shape returned by every other method.
        return np.ravel(stats.mode(predictions, axis=1)[0])

    reducers = {
        'median': np.median,
        'mean': np.mean,
        'max': np.max,
        'min': np.min,
    }
    if method not in reducers:
        # Previously an unknown method fell through and returned None.
        raise ValueError(
            "Unknown ensemble_method {!r}; expected one of {}".format(
                method, sorted(list(reducers) + ['max_voting'])))
    return reducers[method](predictions, axis=1)
def fit(self, x, y):
    """ Train the base estimators and then the stacking meta-learner.

    The data is split 80/20: the base estimators are fitted on the larger
    part, and their predictions on the held-out part become the training
    features of the stacking estimator selected by `search()`.
    # Arguments
        x: A numpy array instance containing the training data.
        y: Target values aligned with `x`.
    # Returns
        None
    """
    # Seed the split with the same config.random_state that search() and
    # get_skf() already use, so repeated fits see the same hold-out set.
    x_train, x_val, y_train, y_val = train_test_split(
        x, y, test_size=0.2, random_state=self.config.random_state)
    for est in self.estimator_list:
        est.fit(x_train, y_train)
    predictions = self.get_model_predictions(x_val)
    self.stacking_estimator = self.search(predictions, y_val)
    self.stacking_estimator.fit(predictions, y_val)
def get_skf(self, folds):
    """ Get scoring metric and cross validation folds for the task type
    # Arguments
        folds: Number of cross validation folds
    # Returns
        Tuple `(score_metric, skf)`: the scikit-learn scoring name and the
        (shuffled, seeded) fold splitter for `config.objective`.
    # Raises
        ValueError: If `config.objective` is not 'binary', 'multiclass' or
            'regression'.
    """
    objective = self.config.objective
    if objective == 'binary':
        score_metric = 'roc_auc'
        skf = StratifiedKFold(n_splits=folds, shuffle=True,
                              random_state=self.config.random_state)
    elif objective == 'multiclass':
        score_metric = 'f1_weighted'
        skf = StratifiedKFold(n_splits=folds, shuffle=True,
                              random_state=self.config.random_state)
    elif objective == 'regression':
        score_metric = 'neg_mean_squared_error'
        skf = KFold(n_splits=folds, shuffle=True,
                    random_state=self.config.random_state)
    else:
        # The exception used to be constructed but never raised, so the
        # return below failed with UnboundLocalError instead.
        raise ValueError("Invalid objective: {!r}".format(objective))
    return score_metric, skf
8, 10, 12, 14, 16, 18, 20], "weights": ["uniform", "distance"], "algorithm": ["ball_tree", "kd_tree", "brute"], "leaf_size": [5, 10, 15, 20, 25, 30, 35, 40, 45, 50], "metric": ["minkowski", "euclidean", "manhattan", "chebyshev"], "p": [1, 2, 3]} \ No newline at end of file diff --git a/autokaggle/hparam_space/lgbm_hp.json b/autokaggle/hparam_space/lgbm_hp.json new file mode 100644 index 0000000..b2f6311 --- /dev/null +++ b/autokaggle/hparam_space/lgbm_hp.json @@ -0,0 +1 @@ +[{"boosting_type": ["gbdt"], "min_child_weight": [5, 10, 30, 50, 60, 80, 100], "min_split_gain": [0.1], "subsample": [0.8], "colsample_bytree": [0.6, 0.7], "max_depth": [5, 8, 10], "n_estimators": [50], "num_leaves": [80], "learning_rate": [0.3]}, {"learning_rate": [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12], "n_estimators": [100, 150, 200]}] \ No newline at end of file diff --git a/autokaggle/hparam_space/rf_hp.json b/autokaggle/hparam_space/rf_hp.json new file mode 100644 index 0000000..c23c577 --- /dev/null +++ b/autokaggle/hparam_space/rf_hp.json @@ -0,0 +1 @@ +[{"criterion": ["entropy", "gini"], "max_features": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], "n_estimators": [300], "min_samples_leaf": [1]}, {"min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {"n_estimators": [50, 100, 150, 200]}] \ No newline at end of file diff --git a/autokaggle/hparam_space/svm_hp.json b/autokaggle/hparam_space/svm_hp.json new file mode 100644 index 0000000..8642e7f --- /dev/null +++ b/autokaggle/hparam_space/svm_hp.json @@ -0,0 +1 @@ +{"C": [0.001, 0.1, 1, 10, 100, 1000, 10000], "gamma": [1e-05, 100000.0], "kernel": ["rbf", "poly", "linear", "sigmoid"], "degree": [2, 3, 4, 5], "max_iter": [50000]} \ No newline at end of file diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py new file mode 100644 index 0000000..f5ba361 --- /dev/null +++ b/autokaggle/preprocessor.py @@ -0,0 +1,1296 @@ +import numpy as np +import pandas as pd +import scipy +import itertools +from scipy.stats 
import pearsonr +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler, PowerTransformer, \ + KBinsDiscretizer, OneHotEncoder +from sklearn.base import TransformerMixin +from sklearn.base import BaseEstimator +from sklearn.impute import SimpleImputer +from sklearn.pipeline import Pipeline +from abc import abstractmethod +import collections +from lightgbm import LGBMClassifier, LGBMRegressor + +LEVEL_HIGH = 32 + + +class Preprocessor(TransformerMixin): + """ Implements basic preprocessing and feature engineering class. + + Preprocessor takes care of the basic preprocessing and feature engineering of + the input data. Similar to Scikit-learn transformers,it implements the fit() + and transform() methods. TO acheive this It applies various feature + primitives in a sequence using scikit-learn pipeline. + # Arguments + config: Config. Defines the configuration of various components of the + AutoML pipeline. + params: Dict. Hyper-parameter search space for preprocessor. + pipeline: Pipeline. Sci-kit learn pipeline class to apply the feature + primitives in sequence + """ + + def __init__(self, config, params): + self.config = config + self.params = params + self.pipeline = None + + def fit(self, raw_x, y): + """ This function trains the preprocessor chain + # Arguments + raw_x: A numpy array instance containing the training data data. + y: A numpy array instance containing training label vector. 
@staticmethod
def get_categorical_pipeline(params):
    """ Generate pipeline of primitives for categorical features.
    # Arguments
        params: Hyper-parameter setting for the preprocessors. Only the
            'cat_encoding' key is read here (default: 'target').
    # Returns
        List of `(step_name, primitive)` tuples to be applied, in order.
    # Raises
        ValueError: If 'cat_encoding' is not a supported encoding name.
    """
    choice = params.get('cat_encoding', 'target')
    # Each recipe is an ordered list of (step name, operation). In the
    # combined recipes the target encoder *adds* columns so that the second
    # encoder can still overwrite the original categorical columns.
    recipes = {
        'target': (('target_encoder', 'upd'),),
        'label': (('label_encoder', 'upd'),),
        'count': (('count_encoder', 'upd'),),
        'one_hot': (('one_hot_encoder', 'upd'),),
        'target+count': (('target_encoder', 'add'), ('count_encoder', 'upd')),
        'target+label': (('target_encoder', 'add'), ('label_encoder', 'upd')),
    }
    if choice not in recipes:
        raise ValueError(
            "Unsupported cat_encoding {!r}; expected one of {}".format(
                choice, sorted(recipes)))
    encoders = {
        'target_encoder': TargetEncoder,
        'label_encoder': LabelEncode,
        'count_encoder': CatCount,
        'one_hot_encoder': OneHot,
    }
    return [(step, encoders[step](operation=operation, selected_type='CAT'))
            for step, operation in recipes[choice]]
def get_filtering_pipeline(self, params):
    """ Build the feature-filtering steps of the preprocessing pipeline.
    # Arguments
        params: Hyper-parameter setting for the preprocessors. Reads
            'pearson_thresh' and 'feat_importance_thresh' (both default 0).
    # Returns
        List of `(step_name, primitive)` tuples. The constant-feature filter
        is always present; each threshold-based filter is appended only when
        its threshold is greater than 0.
    """
    corr_cutoff = params.get('pearson_thresh', 0)
    importance_cutoff = params.get('feat_importance_thresh', 0)

    # Every filter deletes columns and inspects all feature types.
    shared = {'operation': 'del', 'selected_type': 'ALL'}
    steps = [('filter', FilterConstant(**shared))]
    if corr_cutoff > 0:
        correlation_filter = FeatureFilter(threshold=corr_cutoff, **shared)
        steps.append(('pearson_corr', correlation_filter))
    if importance_cutoff > 0:
        importance_filter = FeatureImportance(
            threshold=importance_cutoff,
            task_type=self.config.objective,
            **shared)
        steps.append(('lgbm_feat_selection', importance_filter))
    return steps
@staticmethod
def get_higher_order_pipeline(params):
    """ Build the cross-column (higher order) feature generation steps.
    # Arguments
        params: Hyper-parameter setting for the preprocessors. Reads
            'cat_num_strategy' and 'cat_cat_strategy' (both default None).
    # Returns
        List of `(step_name, primitive)` tuples; a step is included only
        when its strategy is truthy.
    """
    num_strategy = params.get('cat_num_strategy', None)
    cat_strategy = params.get('cat_cat_strategy', None)

    steps = []
    if num_strategy:
        cat_num = CatNumEncoder(operation='add', selected_type1='CAT',
                                selected_type2='NUM', strategy=num_strategy)
        steps.append(('cat_num_encoder', cat_num))
    if cat_strategy:
        cat_cat = CatCatEncoder(operation='add', selected_type1='CAT',
                                selected_type2='CAT', strategy=cat_strategy)
        steps.append(('cat_cat_encoder', cat_cat))
    return steps
+ # Arguments + verbose: Bool. Determines the verbosity of the logging. + data_info: Dict. Dictionary mapping the feature names to their data_types + total_samples: Int. Number of samples in the data + cat_col: List. List of the categorical features + num_col: List. List of the numerical features + time_col: List. List of the time features + n_cat: Int. Number of categorical features + n_num: Int. Number of numerical features + n_time: Int. Number of time features + cat_cardinality: Dict. Dictionary mapping categorical feature names of + their cardinality (no. of unique values) + generated_features: List. List of the newly added features. (In + addition to the pre-existing columns) + num_info: Dict. Dictionary mapping numeircal column to their meta info + such as range, std etc. + """ + + def __init__(self, raw_x, data_info, verbose=True): + self.cat_col = None + self.num_col = None + self.time_col = None + self.n_cat = 0 + self.n_time = 0 + self.n_num = 0 + self.cat_cardinality = None + self.generated_features = None + self.num_info = None + self.verbose = verbose + self.data_info = {str(i): data_info[i] for i in range(len(data_info))} + self.total_samples = raw_x.shape[0] + self.refresh_col_types() + + # Convert sparse to dense if needed + raw_x = raw_x.toarray() if type( + raw_x) == scipy.sparse.csr.csr_matrix else raw_x + + # To pandas Dataframe + if type(raw_x) != pd.DataFrame: + raw_x = pd.DataFrame(raw_x, + columns=[str(i) for i in range(raw_x.shape[1])]) + + self.X = raw_x + # self.update_cat_cardinality() + + def update_type(self, columns, new_type): + """ Updates the column datatype. + # Arguments + column: List of columns whose data_type needs update. + new_type: New data_type (either of 'CAT', 'NUM' or 'TIME'). + # Returns + None. + This function updates the data types of given list of columns. + """ + for c in columns: + self.data_info[c] = new_type + + def delete_type(self, columns): + """ Delete the columns from the feature to data_type mapping. 
+ # Arguments + column: List of columns whose data_type needs update. + # Returns + None + This function removes the selected columns from the data_info dictionary. + """ + for c in columns: + _ = self.data_info.pop(c, 0) + + def rename_cols(self, key): + """ Provides a rename function to add new columns without collision. + # Arguments + key: Identifier for renaming + # Returns + Renaming function which takes current column name and outputs a new + unique column name. + """ + + def rename_fn(col_name): + col_name = str(col_name) + col_name += '_' + key + while col_name in self.X.columns: + col_name += '_' + key + return col_name + + return rename_fn + + def update(self, operation, columns, x_tr, new_type=None, key=''): + """ Updates the TabularData after applying primitive. + # Arguments + operation: Primitive operation applied ('add', 'update' or 'delete'). + columns: List of columns affected. + x_tr: Transformed (or newly generated) features + new_type: Data type of the new column + key: Name key for renaming the new columns + # Returns + None + This function takes the transformed (or generated) features after applying + the primitive and updates the + TabularData. + """ + if operation == 'upd': + if x_tr is not None: + self.X[columns] = x_tr + if new_type is not None: + self.update_type(columns, new_type) + elif operation == 'add': + if x_tr is not None: + x_tr = x_tr.rename(columns=self.rename_cols(key)) + self.X = pd.concat([self.X, x_tr], axis=1) + self.update_type(x_tr.columns, new_type) + elif operation == 'del': + if len(columns) != 0: + self.X.drop(columns=columns, inplace=True) + self.delete_type(columns) + else: + print("invalid operation") + self.refresh_col_types() + + def refresh_col_types(self): + """ Updates the column_types based on the data_info + # Arguments + None + # Returns + None + This function updates the cat, num and time column lists based on (any) + updates in the data_info. 
def select_columns(self, data_type):
    """ Look up the feature columns that carry the requested data type.
    # Arguments
        data_type: One of 'CAT', 'NUM', 'TIME' or 'ALL'.
    # Returns
        The list of matching column names ('ALL' yields every key of
        `data_info`). An unrecognised type prints a message and yields an
        empty list.
    """
    # Make sure the per-type lists reflect the current data_info.
    self.refresh_col_types()
    typed_lists = (
        ('CAT', 'cat_col'),
        ('TIME', 'time_col'),
        ('NUM', 'num_col'),
    )
    for tag, attribute in typed_lists:
        if data_type == tag:
            return getattr(self, attribute)
    if data_type == 'ALL':
        return list(self.data_info.keys())
    print('invalid Type')
    return []
def __init__(self, operation='upd', selected_type=None, **kwargs):
    """ Set up the bookkeeping shared by all single-order primitives.
    # Arguments
        operation: One of 'add', 'upd' or 'del'; how the transformed
            features are written back to the TabularData.
        selected_type: Type of features the primitive is applied to.
        kwargs: Primitive specific variables, forwarded to `init_vars()`.
    """
    self.options = None
    self.selected = None
    self.drop_columns = None
    # Subclasses declare their allowed operations as a class attribute
    # (e.g. `supported_ops = ('add', 'upd')`). Assigning the full tuple on
    # the instance unconditionally used to shadow that declaration, so the
    # check in fit() could never reject anything.
    self.supported_ops = getattr(type(self), 'supported_ops',
                                 ('add', 'upd', 'del'))
    self.selected_type = selected_type
    self.operation = operation
    self.init_vars(**kwargs)
    self.name_key = self.__class__.__name__
+ # Returns + None + """ + pass + + @abstractmethod + def _transform(self, data, y=None): + """ Contains the actual implementation of transforming the data using + primitive. (implemented in the child class) + # Arguments + data: Input training/testing data in TabularData form. + y: A numpy array of the target values. + # Returns + A TabularData instance of the transformed data. + """ + pass + + +class PrimitiveHigherOrder: + """ Base class for the cross-order data transformation function. + + PrimitiveHigherOrder learns and applies the data transformation across two + sets of features. The user can use fit() and transform() functions to + apply these transformations. + + # Arguments + options: Dict. Special arguments specific to the given primitive. + selected_type1: 'String'. Specifies the first type of features the + transformation is supposed to be applied to. + selected_type2: 'String'. Specifies the second type of features the + transformation is supposed to be applied to. + operation: 'String'. Specifies the type of operation from 'add', 'update' + or 'delete' + name_key : 'String'. Signature key to rename the column after applying + the primitive. + selected_1: 'List'. List of the selected features in the first set, on + which the transformation will be + applied + selected_2: 'List'. List of the selected features in the second set, on + which the transformation will be + applied + drop_columns: 'List'. List of the features which would be dropped after + applying the transformation. + supported_ops: Tuple. Specifies the allowed list of operations for this + primitive. 
+ """ + + def __init__(self, operation='upd', selected_type1=None, selected_type2=None, + **kwargs): + self.options = None + self.selected_1 = None + self.selected_2 = None + self.drop_columns = None + self.supported_ops = ('add', 'upd', 'del') + self.operation = operation + self.selected_type1 = selected_type1 + self.selected_type2 = selected_type2 + self.init_vars(**kwargs) + self.name_key = self.__class__.__name__ + + def init_vars(self, **kwargs): + """ Initialize the primitive specific variables (which are not defined in the + base class) + # Arguments + kwargs: Dictionary containing primitive specific variables + # Returns + None. + """ + self.options = kwargs + + def fit(self, data, y=None): + """ A wrapper function to train the given primitive on the input training + data. + # Arguments + data: A TabularData instance of training data. + y: A numpy array of the target values. + # Returns + None + """ + self.selected_1 = data.select_columns(self.selected_type1) + self.selected_2 = data.select_columns(self.selected_type2) + + if self.operation not in self.supported_ops: + print("Operation {} not supported for {}".format(self.operation, + self.__class__.__name__) + ) + self.selected_1 = None + self.selected_2 = None + if not self.selected_1 or not self.selected_2: + return self + return self._fit(data, y) + + def transform(self, data, y=None): + """ A wrapper function to generate transformation on the input data based on + pre-trained primitive. + # Arguments + data: Input training/testing data in TabularData form. + y: A numpy array of the target values. + # Returns + A TabularData instance of the transformed data. + """ + if not self.selected_1 or not self.selected_2: + return data + return self._transform(data, y) + + @abstractmethod + def _fit(self, data, y=None): + """ Contains the actual implementation of training the primitive (implemented + in the child class) + # Arguments + data: A TabularData instance of training data. 
class TabScaler(Primitive):
    """ Standard Scaler primitive.

    TabScaler scales the selected numerical features to have 0 mean and unit
    variance.

    # Arguments
        scaler: StandardScaler. Instance of scikit-learn StandardScaler object
    """
    scaler = None
    supported_ops = ('add', 'upd')

    def _fit(self, data, y=None):
        """ Fit the scaler on the selected columns of `data.X`. """
        self.scaler = StandardScaler()
        self.scaler.fit(data.X[self.selected], y)
        return self

    def _transform(self, data, y=None):
        """ Scale the selected columns and write them back into `data`. """
        selected = data.X[self.selected]
        # StandardScaler returns a bare ndarray. Wrap it with the source
        # column names and index so that the 'add' operation (which renames
        # columns and concatenates) works as well as 'upd'.
        x_tr = pd.DataFrame(self.scaler.transform(selected),
                            columns=selected.columns, index=selected.index)
        data.update(self.operation, self.selected, x_tr, new_type='NUM',
                    key=self.name_key)
        return data
Instance of scikit-learn KBinsDiscretizer + object + strategy: String. Strategy used to define width of the bins. Possible + options are: (‘uniform’, ‘quantile’, + ‘kmeans’) + encoding: String. Method used to encode the transformed result. Possible + options are: (‘onehot’, + ‘onehot-dense’, ‘ordinal’) + """ + binner = None + strategy = None + encoding = None + supported_ops = ('add', 'upd') + + def init_vars(self, strategy='quantile', encoding='ordinal'): + self.strategy = strategy + self.encoding = encoding + + def _fit(self, data, y=None): + self.binner = KBinsDiscretizer(strategy=self.strategy, encode=self.encoding) + self.binner.fit(data.X[self.selected], y) + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame(self.binner.transform(data.X[self.selected])) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) + return data + + +class OneHot(Primitive): + """ One Hot Encoder for categorical features. + + The class applies one hot encoding to categorical features, using the + sklearn implementation. + + # Arguments + ohe: OneHotEncoder. Instance of scikit-learn OneHotEncoder object + """ + ohe = None + supported_ops = ('add', 'upd') + + def _fit(self, data, y=None): + self.ohe = OneHotEncoder(sparse=False, handle_unknown='ignore') + self.ohe.fit(data.X[self.selected], y) + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame(self.ohe.transform(data.X[self.selected])) + if self.operation == 'add': + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) + elif self.operation == 'upd': + data.update('add', self.selected, x_tr, new_type='NUM', + key=self.name_key) + data.update('del', self.selected, None, None, key=self.name_key) + return data + + +class LabelEncode(Primitive): + """ Label Encoder for categorical features. + + The class applies Label Encoding to categorical features, By mapping each + category to a numerical value. 
+ + # Arguments + cat_to_int_label: Dict. Mapping from categories to their assigned integer + value + unknown_key_dict: Dict. Mapping for each categorical feature column to + the integer value to replace the previously unseen categories + """ + cat_to_int_label = None + unknown_key_dict = None + supported_ops = ('add', 'upd') + + def _fit(self, data, y=None): + self.cat_to_int_label = {} + self.unknown_key_dict = {} + for col in self.selected: + self.cat_to_int_label[col] = {key: idx for idx, key in + enumerate(set(data.X[col]))} + self.unknown_key_dict[col] = len(self.cat_to_int_label[col]) + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + for col in self.selected: + x_tr[col] = data.X[col].apply( + lambda key: self.cat_to_int_label[col].get(key, + self.unknown_key_dict[ + col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) + return data + + +class TargetEncoder(Primitive): + """ Target Encoder for categorical features. + + The class applies target encoding to categorical features, By learning + the mapping of category to numeric value + based on some aggregation of the target value. + + # Arguments + target_encoding_map: Dict. Mapping from categories to their assigned + numeric value + """ + target_encoding_map = None + supported_ops = ('add', 'upd') + + @staticmethod + def calc_smooth_mean(df, by, on, alpha=5): + """ Calculates the smoothed means on the target value. 
+ # Arguments + df: Input dataframe + by: Groupby column (categorical column) + on: Target column + alpha: smoothing factor + # Returns + smoothed mean and the overall mean + """ + # Compute the global mean + mean = df[on].mean() + + # Compute the number of values and the mean of each group + agg = df.groupby(by)[on].agg(['count', 'mean']) + counts = agg['count'] + means = agg['mean'] + + # Compute the "smoothed" means + smooth = (counts * means + alpha * mean) / (counts + alpha) + return smooth, mean + + def _fit(self, data, y=None): + X = data.X + self.target_encoding_map = {} + X['target'] = y + for col in self.selected: + self.target_encoding_map[col] = self.calc_smooth_mean(X, col, 'target', + alpha=5) + X.drop('target', axis=1, inplace=True) + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + for col in self.selected: + x_tr[col] = data.X[col].map(self.target_encoding_map[col][0], + self.target_encoding_map[col][1]) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) + return data + + +class CatCatEncoder(PrimitiveHigherOrder): + """ Cross column feature generator between categorical and categorical columns. + + The class learns a new features based on the values of selected two + categorical features. + + # Arguments + cat_cat_map: Dict. Mapping from cat-cat combination to the an assigned + numeric value + strategy: String. Aggregation strategy to learn the mapping between + cat-cat combination to numeric value + """ + supported_ops = ('add',) + cat_cat_map = None + strategy = None + + def init_vars(self, strategy='count'): + self.strategy = strategy + + @staticmethod + def cat_cat_count(df, col1, col2, strategy='count'): + """ Generate mapping for cat-cat combination to the numerical value based on + the given strategy. 
+ # Arguments + col1: First categorical column + col2: Second categorical column + strategy: Aggregation strategy + # Returns + Mapping from cat-cat combination to the numeric value.. + """ + if strategy == 'count': + mapping = df.groupby([col1])[col2].count() + elif strategy == 'nunique': + mapping = df.groupby([col1])[col2].nunique() + else: + mapping = df.groupby([col1])[col2].count() // df.groupby([col1])[ + col2].nunique() + return mapping + + def _fit(self, data, y=None): + self.cat_cat_map = {} + self.selected_1 = list(set(self.selected_1 + self.selected_2)) + for col1, col2 in itertools.combinations(self.selected_1, 2): + self.cat_cat_map[col1 + '_cross_' + col2] = \ + self.cat_cat_count(data.X, + col1, + col2, + self.strategy) + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + for col1, col2 in itertools.combinations(self.selected_1, 2): + if col1 + '_cross_' + col2 in self.cat_cat_map: + x_tr[col1 + '_cross_' + col2] = data.X[col1].map( + self.cat_cat_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', + key=self.name_key) + return data + + +class CatNumEncoder(PrimitiveHigherOrder): + """ Cross column feature generator between categorical and numerical columns. + + The class learns a new features based on the values of selected categorical + and numerical features. + + # Arguments + cat_num_map: Dict. Mapping from cat-num combination to the an assigned + numeric value + strategy: String. Aggregation strategy to learn the mapping between + cat-num combination to numeric value + """ + supported_ops = ('add',) + cat_num_map = None + strategy = None + + def init_vars(self, strategy='mean'): + self.strategy = strategy + + @staticmethod + def cat_num_interaction(df, col1, col2, method='mean'): + """ Generate mapping for cat-num combination to the numerical value based on + the given strategy. 
+ # Arguments + col1: categorical column + col2: numerical column + method: Aggregation strategy + # Returns + Mapping from cat-num combination to the numeric value.. + """ + if method == 'mean': + mapping = df.groupby([col1])[col2].mean() + elif method == 'std': + mapping = df.groupby([col1])[col2].std() + elif method == 'max': + mapping = df.groupby([col1])[col2].max() + elif method == 'min': + mapping = df.groupby([col1])[col2].min() + else: + mapping = df.groupby([col1])[col2].mean() + + return mapping + + def _fit(self, data, y=None): + self.cat_num_map = {} + for col1 in self.selected_1: + for col2 in self.selected_2: + self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction( + data.X, col1, col2, self.strategy) + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + for col1 in self.selected_1: + for col2 in self.selected_2: + if col1 + '_cross_' + col2 in self.cat_num_map: + x_tr[col1 + '_cross_' + col2] = data.X[col1].map( + self.cat_num_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', + key=self.name_key) + return data + + +class CatBinEncoder(PrimitiveHigherOrder): + """ Cross column feature generator between categorical and binary columns. + + The class learns a new features based on the values of selected categorical + and binary features. + + # Arguments + cat_bin_map: Dict. Mapping from cat-bin combination to the an assigned + numeric value + strategy: String. Aggregation strategy to learn the mapping between + cat-bin combination to numeric value + """ + supported_ops = ('add',) + cat_bin_map = None + strategy = None + + def init_vars(self, strategy='percent_true'): + self.strategy = strategy + + @staticmethod + def cat_bin_interaction(df, col1, col2, strategy='percent_true'): + """ Generate mapping for cat-bin combination to the numerical value based on + the given strategy. 
+ # Arguments + col1: Categorical column + col2: Binary column + strategy: Aggregation strategy + # Returns + Mapping from cat-bin combination to the numeric value.. + """ + if strategy == 'percent_true': + mapping = df.groupby([col1])[col2].mean() + elif strategy == 'count': + mapping = df.groupby([col1])[col2].count() + else: + mapping = df.groupby([col1])[col2].mean() + return mapping + + def _fit(self, data, y=None): + self.cat_bin_map = {} + for col1 in self.selected_1: + for col2 in self.selected_2: + self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction( + data.X, col1, col2, self.strategy) + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + for col1 in self.selected_1: + for col2 in self.selected_2: + if col1 + '_cross_' + col2 in self.cat_bin_map: + x_tr[col1 + '_cross_' + col2] = data.X[col1].map( + self.cat_bin_map[col1 + '_cross_' + col2]) + data.update(self.operation, self.selected_1, x_tr, new_type='NUM', + key=self.name_key) + return data + + +class FilterConstant(Primitive): + """ Filters the constant or very low variance columns. + + The class finds the non-changing or very low variance columns and marked them + for deletion, so that they are not used by the machine learning estimator. + """ + drop_columns = None + supported_ops = ('del',) + + def _fit(self, data, y=None): + X = data.X[self.selected] + self.drop_columns = X.columns[(X.max(axis=0) - X.min(axis=0) == 0)].tolist() + return self + + def _transform(self, data, y=None): + data.update(self.operation, self.drop_columns, None, new_type=None, + key=self.name_key) + return data + + +class TimeDiff(Primitive): + """ Adds features based on difference of time values. + + This class generates the features as time difference between two selected + time columns. 
+ """ + supported_ops = ('add',) + + def _fit(self, data, y=None): + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + for a, b in itertools.combinations(self.selected, 2): + x_tr[a + '-' + b] = data.X[a] - data.X[b] + data.update(self.operation, self.selected, x_tr, new_type='TIME', + key=self.name_key) + return data + + +class TimeOffset(Primitive): + """ Updates the time features in terms of difference from the start value. + + This class updates the time features such that they are represented as a + difference from the start time. + + # Arguments + start_time: Int. Starting time of the selected time feature. + """ + start_time = None + supported_ops = ('add', 'upd') + + def _fit(self, data, y=None): + self.start_time = data.X[self.selected].min(axis=0) + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + x_tr[self.selected] = data.X[self.selected] - self.start_time + data.update(self.operation, self.selected, x_tr, new_type='TIME', + key=self.name_key) + return data + + +class TabPCA(Primitive): + """ Generates new features by finding PCA of the selected features. + + The class calculates the PCA of the selected features and adds the + transformation as new set of features. + # Arguments + pca: PCA. Scikit-lean PCA class. + """ + pca = None + supported_ops = ('add',) + + def _fit(self, data, y=None): + self.pca = PCA(n_components=0.99, svd_solver='full') + self.pca.fit(data.X[self.selected]) + return self + + def _transform(self, data, y=None): + x_pca = self.pca.transform(data.X[self.selected]) + x_pca = pd.DataFrame(x_pca, columns=['pca_' + str(i) for i in + range(x_pca.shape[1])]) + data.update(self.operation, self.selected, x_pca, new_type='NUM', + key=self.name_key) + return data + + +class CatCount(Primitive): + """ Count Encoding. + + Replaces the cargorical variables by their occrance count. + # Arguments + count_dict: Dict. Mapping of the categories to their respective frequency + count. 
+ unknown_key: Float. Mapping value for previously unseen category. + """ + count_dict = None + unknown_key = 0 + supported_ops = ('add', 'upd') + + def _fit(self, data, y=None): + self.count_dict = {} + for col in self.selected: + self.count_dict[col] = collections.Counter(data.X[col]) + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + for col in self.selected: + x_tr[col] = data.X[col].apply( + lambda key: self.count_dict[col].get(key, self.unknown_key)) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) + return data + + +class LogTransform(Primitive): + """ Calculates the log transformation. + + The class Calculates the log transform value of the given numeric feature. + The formula is: sign(x) * log(1 + mod(x)) + """ + name_key = 'log_' + supported_ops = ('add', 'upd') + + def _fit(self, data, y=None): + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + for col in self.selected: + x_tr[self.name_key + col] = np.sign(data.X[col]) * np.log( + 1 + np.abs(data.X[col])) + data.update(self.operation, self.selected, x_tr, new_type='NUM', + key=self.name_key) + return data + + +class Imputation(Primitive): + """ Filters the features based on Pearson Correlation. + + The class removes the features who have low pearson correlation with the + target. + # Arguments + threshold: Float. Threshold for filtering features. 
+ """ + impute_dict = None + supported_ops = ('add', 'upd') + strategy = None + + def init_vars(self, strategy='most_frequent'): + self.strategy = strategy + + def _fit(self, data, y=None): + self.impute_dict = {} + for col in self.selected: + if self.strategy == 'most_frequent': + value_counts = data.X[col].value_counts() + self.impute_dict[ + col] = value_counts.idxmax() if not value_counts.empty else 0 + elif self.strategy == 'zero': + self.impute_dict[col] = 0 + else: + raise ValueError + return self + + def _transform(self, data, y=None): + x_tr = pd.DataFrame() + for col in self.selected: + x_tr[col] = data.X[col].fillna(self.impute_dict[col]) + data.update(self.operation, self.selected, x_tr, new_type=None, + key=self.name_key) + return data + + +class FeatureFilter(Primitive): + """ Filters the features based on Pearson Correlation. + + The class removes the features who have low pearson correlation with the + target. + # Arguments + threshold: Float. Threshold for filtering features. + """ + threshold = None + supported_ops = ('del',) + + def init_vars(self, threshold=0.001): + if threshold == 0: + self.selected = None + self.threshold = threshold + self.drop_columns = [] + + def _fit(self, data, y=None): + for col in self.selected: + mu = abs(pearsonr(data.X[col], y)[0]) + if np.isnan(mu): + mu = 0 + if mu < self.threshold: + self.drop_columns.append(col) + return self + + def _transform(self, data, y=None): + data.update(self.operation, self.drop_columns, None, new_type=None, + key=self.name_key) + return data + + +class FeatureImportance(Primitive): + """ Filters the features based on feature importance score. + + The class learns a Light GBM estimator for the given data and based on the + feature importance scores, filters the features with importance lower than + the threshold. + # Arguments + threshold: Float. Threshold for filtering features. + task_type: 'String'. 
Specifies the task type amongst: ('classification', + 'regression') + """ + threshold = None + task_type = 'classification' + supported_ops = ('del',) + + def init_vars(self, threshold=0.001, task_type='classification'): + if threshold == 0: + self.selected = None + self.threshold = threshold + self.drop_columns = [] + self.task_type = task_type + + def _fit(self, data, y=None): + if self.task_type == 'classification': + n_classes = len(set(y)) + if n_classes == 2: + estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + objective='binary') + else: + estimator = LGBMClassifier(silent=False, + verbose=-1, + n_jobs=1, + num_class=n_classes, + objective='multiclass') + else: + # self.task_type == 'regression' + estimator = LGBMRegressor(silent=False, + verbose=-1, + n_jobs=1, + objective='regression') + estimator.fit(data.X, y) + feature_importance = estimator.feature_importances_ + feature_importance = feature_importance / feature_importance.mean() + self.drop_columns = data.X.columns[ + np.where(feature_importance < self.threshold)[0]] + return self + + def _transform(self, data, y=None): + data.update(self.operation, self.drop_columns, None, new_type=None, + key=self.name_key) + return data + + +if __name__ == "__main__": + ntime, nnum, ncat = 4, 10, 8 + nsample = 1000 + x_num = np.random.random([nsample, nnum]) + x_time = np.random.random([nsample, ntime]) + x_cat = np.random.randint(0, 10, [nsample, ncat]) + + x_all = np.concatenate([x_num, x_time, x_cat], axis=1) + x_train = x_all[:int(nsample * 0.8), :] + x_test = x_all[int(nsample * 0.8):, :] + + y_all = np.random.randint(0, 2, nsample) + y_train = y_all[:int(nsample * 0.8)] + y_test = y_all[int(nsample * 0.8):] + + datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) + print(x_train[:4, 20]) + prep = Preprocessor() + prep.fit(x_train, y_train, 24 * 60 * 60, datainfo) + x_new = prep.transform(x_train) + + print("-----") + print(x_new[:4, 2]) diff --git 
a/autokaggle/tabular_preprocessor.py b/autokaggle/tabular_preprocessor.py deleted file mode 100644 index 1bf6d76..0000000 --- a/autokaggle/tabular_preprocessor.py +++ /dev/null @@ -1,344 +0,0 @@ -import numpy as np -from pandas import DataFrame -from scipy.stats import pearsonr - -LEVEL_HIGH = 32 - - -def parallel_function(labels, first_batch_keys, task): - if task == 'label': - if min(labels) > first_batch_keys: - labels = labels - np.min(labels) - return labels.reshape(labels.shape[0], 1) - - elif task == 'frequency': - cat_dict = {} - n_rows = labels.shape[0] - labels = np.expand_dims(labels, axis=1) - - if min(labels) > first_batch_keys: - labels = labels - np.min(labels) - - frequencies = np.zeros((n_rows, 1)) - - for row_index in range(n_rows): - key = labels[row_index, 0] - if key in cat_dict: - cat_dict[key] += 1 - else: - cat_dict[key] = 1 - - n_level = len(cat_dict) - key_to_frequency = {} - - for key in cat_dict.keys(): - key_to_frequency[key] = cat_dict[key] / n_rows * n_level - - for row_index in range(n_rows): - key = labels[row_index, 0] - frequencies[row_index][0] = key_to_frequency[key] - - return frequencies - elif task == 'num_cat': - df = DataFrame(data=labels) - return df.join(df.groupby(1)[0].mean(), - rsuffix='r', - on=1).values[:, -1:] - elif task == 'cat_cat': - df = DataFrame(data=labels) - df[3] = list(range(len(labels))) - return df.join(df.groupby([0, 1]).count(), - rsuffix='r', - on=(0, 1)).values[:, -1:] - elif task == 'train_num_cat': - y = first_batch_keys[0] - df = DataFrame(data=labels) - fe = df.join(df.groupby(1)[0].mean(), - rsuffix='r', - on=1).values[:, -1:] - mu = abs(pearsonr(np.squeeze(np.array(fe)), y)[0]) - if np.isnan(mu): - mu = 0 - return [[first_batch_keys[1], first_batch_keys[2], mu, mu], first_batch_keys[3]] - - elif task == 'train_cat_cat': - y = first_batch_keys[0] - df = DataFrame(data=labels) - df[3] = list(range(len(labels))) - fe = df.join(df.groupby([0, 1]).count(), - rsuffix='r', - on=(0, 1)).values[:, -1:] 
- mu = abs(pearsonr(np.squeeze(np.array(fe)), y)[0]) - if np.isnan(mu): - mu = 0 - return [[first_batch_keys[1], first_batch_keys[2], mu], first_batch_keys[3]] - return None - - -def call_parallel(tasks): - results = [] - for t in tasks: - results.append(parallel_function(t[0], t[1], t[2])) - return results - - -class TabularPreprocessor: - def __init__(self): - """ - Initialization function for tabular preprocessor. - """ - self.num_cat_pair = {} - - self.total_samples = 0 - - self.cat_to_int_label = {} - self.n_first_batch_keys = {} - self.high_level_cat_keys = [] - - self.feature_add_high_cat = 0 - self.feature_add_cat_num = 0 - self.feature_add_cat_cat = 0 - self.order_num_cat_pair = {} - - self.rest = None - self.budget = None - self.data_info = None - self.n_time = None - self.n_num = None - self.n_cat = None - - def remove_useless(self, x): - self.rest = np.where(np.max(x, 0) - np.min(x, 0) != 0)[0] - return x[:, self.rest] - - def process_time(self, x): - cols = range(self.n_time) - if len(cols) > 10: - cols = cols[:10] - x_time = x[:, cols] - for i in cols: - for j in range(i + 1, len(cols)): - x = np.append(x, np.expand_dims(x_time[:, i] - x_time[:, j], 1), 1) - return x - - def extract_data(self, raw_x): - # only get numerical variables - ret = np.concatenate([raw_x['TIME'], raw_x['NUM'], raw_x['CAT']], axis=1) - n_rows = ret.shape[0] - n_num_col = ret.shape[1] - self.n_cat - - n_cat_col = self.n_cat - if n_cat_col <= 0: - return ret.astype(np.float64) - - # preprocess (multi-value) categorical data - for col_index in range(n_num_col, n_num_col + n_cat_col): - for row_index in range(n_rows): - key = str(ret[row_index, col_index]) - if key in self.cat_to_int_label[col_index]: - ret[row_index, col_index] = self.cat_to_int_label[col_index][key] - continue - new_value = len(self.cat_to_int_label[col_index]) - self.cat_to_int_label[col_index][key] = new_value - ret[row_index, col_index] = new_value - - return ret.astype(np.float64) - - def cat_to_num(self, x, 
y=None): - if y is not None: - mark = self.n_time + self.n_num - - for col_index in range(self.n_time + self.n_num, self.n_time + self.n_num + self.n_cat): - if self.n_first_batch_keys[col_index] <= LEVEL_HIGH: - self.num_cat_pair[mark] = (col_index,) - mark += 1 - else: - self.num_cat_pair[mark] = (col_index, col_index) - mark += 1 - - mark_1 = 0 - tasks = [] - for i, cat_col_index1 in enumerate(self.high_level_cat_keys): - for cat_col_index2 in self.high_level_cat_keys[i + 1:]: - tasks.append((x[:, (cat_col_index1, cat_col_index2)], - [y, cat_col_index1, cat_col_index2, mark_1], - 'train_cat_cat')) - mark_1 += 1 - - all_results = call_parallel(tasks) - - num_cat_pair_1 = {} - pearsonr_dict_1 = {} - for result in all_results: - if result[0][-1] > 0.001: - pearsonr_dict_1[result[1]] = result[0][-1] - num_cat_pair_1[result[1]] = result[0] - pearsonr_high_1 = sorted(pearsonr_dict_1, key=pearsonr_dict_1.get, reverse=True)[:self.feature_add_cat_cat] - num_cat_pair_1 = {key: num_cat_pair_1[key] for key in pearsonr_high_1} - num_cat_pair_1 = {i + mark: num_cat_pair_1[key] for i, key in enumerate(num_cat_pair_1)} - self.num_cat_pair.update(num_cat_pair_1) - mark += len(pearsonr_high_1) - - mark_2 = 0 - tasks_2 = [] - for cat_col_index in self.high_level_cat_keys: - for num_col_index in range(self.n_time, self.n_time + self.n_num): - tasks_2.append((x[:, (num_col_index, cat_col_index)], - [y, num_col_index, cat_col_index, mark_2], - 'train_num_cat')) - mark_2 += 1 - - all_results = call_parallel(tasks_2) - - num_cat_pair_2 = {} - pearsonr_dict_2 = {} - for result in all_results: - if result[0][-1] > 0.001: - pearsonr_dict_2[result[1]] = result[0][-1] - num_cat_pair_2[result[1]] = result[0] - pearsonr_high_2 = sorted(pearsonr_dict_2, key=pearsonr_dict_2.get, reverse=True)[:self.feature_add_cat_num] - num_cat_pair_2 = {key: num_cat_pair_2[key] for key in pearsonr_high_2} - num_cat_pair_2 = {i + mark: num_cat_pair_2[key] for i, key in enumerate(num_cat_pair_2)} - 
self.num_cat_pair.update(num_cat_pair_2) - self.order_num_cat_pair = sorted(list(self.num_cat_pair.keys())) - print('num_cat_pair_2:', num_cat_pair_2) - - tasks = [] - for key in self.order_num_cat_pair: - if len(self.num_cat_pair[key]) == 1: - (col_index,) = self.num_cat_pair[key] - tasks.append((x[:, col_index], self.n_first_batch_keys[col_index], 'label')) - if len(self.num_cat_pair[key]) == 2: - (col_index, col_index) = self.num_cat_pair[key] - tasks.append((x[:, col_index], self.n_first_batch_keys[col_index], 'frequency')) - if len(self.num_cat_pair[key]) == 3: - (cat_col_index1, cat_col_index2, mu) = self.num_cat_pair[key] - tasks.append((x[:, (cat_col_index1, - cat_col_index2)], self.n_first_batch_keys[cat_col_index1], 'cat_cat')) - elif len(self.num_cat_pair[key]) == 4: - (num_col_index, cat_col_index, mu, a) = self.num_cat_pair[key] - tasks.append((x[:, (num_col_index, cat_col_index)], self.n_first_batch_keys[cat_col_index], 'num_cat')) - - results = call_parallel(tasks) - all_num = x.shape[1] - self.n_cat - results = [x[:, :all_num]] + results - ret = np.concatenate(results, axis=1) - - return ret - - def fit(self, raw_x, y, time_limit, data_info): - """ - This function should train the model parameters. - - Args: - raw_x: a numpy.ndarray instance containing the training data. - y: training label vector. - time_limit: remaining time budget. - data_info: meta-features of the dataset, which is an numpy.ndarray describing the - feature type of each column in raw_x. The feature type include: - 'TIME' for temporal feature, 'NUM' for other numerical feature, - and 'CAT' for categorical feature. 
- """ - # Get Meta-Feature - self.budget = time_limit - self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x) - print('QQ: {}'.format(self.data_info)) - - self.n_time = sum(self.data_info == 'TIME') - self.n_num = sum(self.data_info == 'NUM') - self.n_cat = sum(self.data_info == 'CAT') - - self.total_samples = raw_x.shape[0] - - print('QQ1: {}'.format(self.n_time)) - print('QQ2: {}'.format(self.n_num)) - print('QQ3: {}'.format(self.n_cat)) - raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], - 'NUM': raw_x[:, self.data_info == 'NUM'], - 'CAT': raw_x[:, self.data_info == 'CAT']} - - - for col_index in range(self.n_num + self.n_time, self.n_num + self.n_time + self.n_cat): - self.cat_to_int_label[col_index] = {} - - x = self.extract_data(raw_x) - - d_size = x.shape[0] * x.shape[1] / self.budget - if d_size > 35000: - self.feature_add_high_cat = 0 - else: - self.feature_add_high_cat = 10 - - # Iterate cat features - for col_index in range(self.n_num + self.n_time, self.n_num + self.n_time + self.n_cat): - self.n_first_batch_keys[col_index] = len(self.cat_to_int_label[col_index]) - high_level_cat_keys_tmp = sorted(self.n_first_batch_keys, key=self.n_first_batch_keys.get, reverse=True)[ - :self.feature_add_high_cat] - for i in high_level_cat_keys_tmp: - if self.n_first_batch_keys[i] > 1e2: - self.high_level_cat_keys.append(i) - - # Convert NaN to zeros - x = np.nan_to_num(x) - - # Encode high-order categorical data to numerical with frequency - x = self.cat_to_num(x, y) - - x = self.process_time(x) - x = self.remove_useless(x) - - return x - - def encode(self, raw_x, time_limit=None): - """ - This function should train the model parameters. - - Args: - raw_x: a numpy.ndarray instance containing the training/testing data. - time_limit: remaining time budget. - Both inputs X and y are numpy arrays. - If fit is called multiple times on incremental data (train, test1, test2, etc.) 
- you should warm-start your training from the pre-trained model. Past data will - NOT be available for re-training. - """ - # Get Meta-Feature - if time_limit is None: - if self.budget is None: - time_limit = 24 * 60 * 60 - self.budget = time_limit - else: - self.budget = time_limit - - raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'], - 'NUM': raw_x[:, self.data_info == 'NUM'], - 'CAT': raw_x[:, self.data_info == 'CAT']} - x = self.extract_data(raw_x) - - # Convert NaN to zeros - x = np.nan_to_num(x) - - # Encode high-order categorical data to numerical with frequency - x = self.cat_to_num(x) - - x = self.process_time(x) - if self.rest is not None: - x = x[:, self.rest] - return x - - @staticmethod - def extract_data_info(raw_x): - """ - This function extracts the data info automatically based on the type of each feature in raw_x. - - Args: - raw_x: a numpy.ndarray instance containing the training data. - """ - data_info = [] - row_num, col_num = raw_x.shape - for col_idx in range(col_num): - try: - raw_x[:, col_idx].astype(np.float) - data_info.append('NUM') - except: - data_info.append('CAT') - return np.array(data_info) diff --git a/autokaggle/tabular_supervised.py b/autokaggle/tabular_supervised.py deleted file mode 100644 index 3f74390..0000000 --- a/autokaggle/tabular_supervised.py +++ /dev/null @@ -1,256 +0,0 @@ -from abc import abstractmethod - -import os -from lightgbm import LGBMClassifier, LGBMRegressor -from sklearn.model_selection import RandomizedSearchCV -from sklearn.model_selection import StratifiedKFold, KFold -from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error -import numpy as np -import random - -from autokaggle.tabular_preprocessor import TabularPreprocessor -from autokaggle.utils import rand_temp_folder_generator, ensure_dir - - -class TabularSupervised: - def __init__(self, path=None, verbose=True): - """ - Initialization function for tabular supervised learner. 
- """ - self.verbose = verbose - self.is_trained = False - self.clf = None - self.objective = None - self.tabular_preprocessor = None - self.path = path if path is not None else rand_temp_folder_generator() - ensure_dir(self.path) - if self.verbose: - print('Path:', path) - self.save_filename = os.path.join(self.path, 'lgbm.txt') - self.time_limit = None - self.lgbm = None - - def search(self, search_space, search_iter, n_estimators, x, y): - if 'n_estimators' in search_space: - del search_space['n_estimators'] - params = { - 'boosting_type': ['gbdt'], - 'min_child_weight': [5], - 'min_split_gain': [1.0], - 'subsample': [0.8], - 'colsample_bytree': [0.6], - 'max_depth': [10], - 'n_estimators': n_estimators, - 'num_leaves': [70], - 'learning_rate': [0.04], - } - params.update(search_space) - if self.verbose: - print(params) - folds = 3 - score_metric, skf = self.get_skf(folds) - - random_search = RandomizedSearchCV(self.lgbm, param_distributions=params, n_iter=search_iter, - scoring=score_metric, - n_jobs=1, cv=skf, verbose=0, random_state=1001) - - random_search.fit(x, y) - self.clf = random_search.best_estimator_ - - return random_search.best_params_ - - @abstractmethod - def get_skf(self, folds): - pass - - def fit(self, x, y, time_limit=None, data_info=None): - """ - This function should train the model parameters. - - Args: - x: A numpy.ndarray instance containing the training data. - y: training label vector. - time_limit: remaining time budget. - data_info: meta-features of the dataset, which is an numpy.ndarray describing the - feature type of each column in raw_x. The feature type include: - 'TIME' for temporal feature, 'NUM' for other numerical feature, - and 'CAT' for categorical feature. - Both inputs X and y are numpy arrays. - If fit is called multiple times on incremental data (train, test1, test2, etc.) - you should warm-start your training from the pre-trained model. Past data will - NOT be available for re-training. 
- """ - - if time_limit is None: - time_limit = 24 * 60 * 60 - self.time_limit = time_limit - - self.init_lgbm(y) - - self.tabular_preprocessor = TabularPreprocessor() - - if x.shape[1] == 0: - raise ValueError("No feature exist!") - - x = self.tabular_preprocessor.fit(x, y, self.time_limit, data_info) - - if x.shape[0] > 600: - grid_train_percentage = max(600.0 / x.shape[0], 0.1) - else: - grid_train_percentage = 1 - grid_n = int(x.shape[0] * grid_train_percentage) - idx = random.sample(list(range(x.shape[0])), grid_n) - - grid_train_x = x[idx, :] - grid_train_y = y[idx] - - while x.shape[0] < 60: - x = np.concatenate([x, x], axis=0) - y = np.concatenate([y, y], axis=0) - - response_rate = sum(y) / len(y) - - if not self.is_trained: - # Two-step cross-validation for hyperparameter selection - if self.verbose: - print('-----------------Search Regularization Params---------------------') - if response_rate < 0.005: - depth_choice = [5] - else: - depth_choice = [8, 10] - - params = { - 'min_split_gain': [0.1], - 'max_depth': depth_choice, - 'min_child_weight': [5, 10, 30, 50, 60, 80, 100], - 'colsample_bytree': [0.6, 0.7], - 'learning_rate': [0.3], - 'subsample': [0.8], - 'num_leaves': [80], - } - - search_iter = 14 - n_estimators_choice = [50] - best_param = self.search( - params, - search_iter, - n_estimators_choice, - grid_train_x, grid_train_y) - - if self.verbose: - print('-----------------Search Learning Rate---------------------') - for key, value in best_param.items(): - best_param[key] = [value] - best_param['learning_rate'] = [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12] - n_estimators_choice = [100, 150, 200] - search_iter = 16 - - self.search( - best_param, - search_iter, - n_estimators_choice, - grid_train_x, grid_train_y) - - if self.verbose: - print('self.clf', self.clf) - self.is_trained = True - - # Fit Model - self.clf.fit(x, y) - - self.clf.booster_.save_model(self.save_filename) - - if self.verbose: - print("The whole available data is: ") - 
print("Real-FIT: dim(X)= [{:d}, {:d}]".format(x.shape[0], x.shape[1])) - - print('Feature Importance:') - print(self.clf.feature_importances_) - - @abstractmethod - def init_lgbm(self, y): - pass - - def predict(self, x_test): - """ - This function should provide predictions of labels on (test) data. - The function predict eventually casdn return probabilities or continuous values. - """ - x_test = self.tabular_preprocessor.encode(x_test) - y = self.clf.predict(x_test, ) - if y is None: - raise ValueError("Tabular predictor does not exist") - return y - - @abstractmethod - def evaluate(self, x_test, y_test): - pass - - def final_fit(self, x_train, y_train): - x_train = self.tabular_preprocessor.encode(x_train) - self.clf.fit(x_train, y_train) - - -class TabularRegressor(TabularSupervised): - """TabularRegressor class. - It is used for tabular data regression with lightgbm regressor. - """ - - def __init__(self, path=None): - super().__init__(path) - self.objective = 'regression' - - def evaluate(self, x_test, y_test): - y_pred = self.predict(x_test) - return mean_squared_error(y_test, y_pred) - - def init_lgbm(self, y): - self.lgbm = LGBMRegressor(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - - def get_skf(self, folds): - return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001) - - -class TabularClassifier(TabularSupervised): - """TabularClassifier class. - It is used for tabular data classification with lightgbm classifier. 
- """ - - def init_lgbm(self, y): - n_classes = len(set(y)) - if n_classes == 2: - self.objective = 'binary' - self.lgbm = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - objective=self.objective) - else: - self.objective = 'multiclass' - self.lgbm = LGBMClassifier(silent=False, - verbose=-1, - n_jobs=1, - num_class=n_classes, - objective=self.objective) - - def evaluate(self, x_test, y_test): - if self.verbose: - print('objective:', self.objective) - y_pred = self.predict(x_test) - results = None - if self.objective == 'binary': - results = roc_auc_score(y_test, y_pred) - elif self.objective == 'multiclass': - results = f1_score(y_test, y_pred, average='weighted') - return results - - def get_skf(self, folds): - if self.lgbm.objective == 'binary': - score_metric = 'roc_auc' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) - else: - score_metric = 'f1_weighted' - skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001) - return score_metric, skf diff --git a/autokaggle/utils.py b/autokaggle/utils.py index 62b833f..bb3aa69 100644 --- a/autokaggle/utils.py +++ b/autokaggle/utils.py @@ -2,6 +2,12 @@ import tempfile import string import random +import json + + +def generate_rand_string(size): + chars = string.ascii_uppercase + string.digits + return ''.join(random.choice(chars) for _ in range(size)) def ensure_dir(directory): @@ -17,11 +23,27 @@ def temp_path_generator(): def rand_temp_folder_generator(): - """Create and return a temporary directory with the path name '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras).""" - chars = string.ascii_uppercase + string.digits - size = 6 - random_suffix = ''.join(random.choice(chars) for _ in range(size)) + """ + Create and return a temporary directory with the path name + '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras). 
+ """ sys_temp = temp_path_generator() - path = sys_temp + '_' + random_suffix + path = sys_temp + '_' + generate_rand_string(6) ensure_dir(path) return path + + +def write_json(data, filename): + with open(filename, 'w') as outfile: + json.dump(data, outfile) + + +def read_json(filename): + with open(filename, 'rb') as infile: + return json.load(infile) + + +def write_csv(filename, line): + with open(filename, "a") as f: + f.write(", ".join(map(str, line))) + f.write("\n") diff --git a/examples/benchmarking.py b/examples/benchmarking.py new file mode 100644 index 0000000..57cd47e --- /dev/null +++ b/examples/benchmarking.py @@ -0,0 +1,456 @@ +import string +import random +import sys + +sys.path.append("../") +import numpy as np +import pandas as pd +import sklearn.model_selection +import sklearn.datasets +from sklearn.metrics import r2_score, roc_auc_score, accuracy_score, f1_score, \ + balanced_accuracy_score, \ + mean_absolute_error, mean_squared_error +# from autosklearn.regression import AutoSklearnRegressor +# from autosklearn.classification import AutoSklearnClassifier +from autokaggle import * +from autokaggle.utils import * +import openml + +openml.config.apikey = '3c7196c92a274c3b9405a7e26e9f848e' +import warnings +from abc import abstractmethod +import statistics + + +def generate_rand_string(size): + chars = string.ascii_uppercase + string.digits + return ''.join(random.choice(chars) for _ in range(size)) + + +class BenchmarkingBase: + """ Base class for benchmarking autoML platforms. + + This class benchmarks the performance of the given autoML platform. The + user can call evaluate() method to evaluate the performance on a single + task or run_automation() for the list of the tasks. The tasks are OpenML + tasks, which specify the dataset and the train/test/validation folds etc. + + # Arguments + results: List. List of the results for each evaluation + sess_name: String. Name of the evaluation session, used for storing + the results. + cls_desc: List. 
List of the columns to be added in classification result + rgs_desc: List. List of the columns to be added in regression result + cls_results: DataFrame. Table storing the classification results + rgs_results: DataFrame. Table storing the regression results + """ + results = None + cls_desc = ["automl_model", "task_id", "time_limit", "accuracy", + "balanced_accuracy", "F1_score", "AUC"] + rgs_desc = ["automl_model", "task_id", "time_limit", "MSE", "MAE", "R2_score"] + + def __init__(self, supress_warnings=True, sess_name=""): + if supress_warnings: + warnings.filterwarnings('ignore') + self.results = [] + self.sess_name = generate_rand_string(6) if not sess_name else sess_name + self.cls_results = pd.DataFrame(columns=self.cls_desc) + self.rgs_results = pd.DataFrame(columns=self.rgs_desc) + + def measure_performance_cls(self, y_true, y_pred, binary=False): + """ Calculate the performance of the classification task + # Arguments + y_true: A numpy array containing the ground truth labels + y_pred: A numpy array containing the predicted labels + binary: Boolean specifying if the objective isbinary or multiclass + # Returns + list of the performance scores based on various evaluation metrics. + """ + accuracy = accuracy_score(y_true, y_pred) + ber = balanced_accuracy_score(y_true, y_pred) + f1 = f1_score(y_true, y_pred, average="binary") if binary else f1_score( + y_true, y_pred, average="weighted") + auc = roc_auc_score(y_true, y_pred) if binary else "-" + return [accuracy, ber, f1, auc] + + def measure_performance_rgs(self, y_true, y_pred): + """ Calculate the performance of the regression task + # Arguments + y_true: A numpy array containing the ground truth + y_pred: A numpy array containing the predicted values + # Returns + list of the performance scores based on various evaluation metrics. 
+ """ + mse = mean_squared_error(y_true, y_pred) + mae = mean_absolute_error(y_true, y_pred) + r2 = r2_score(y_true, y_pred) + return [mse, mae, r2] + + def export_results(self): + """ Writes the results to a CSV file. + # Arguments + None + # Returns + None + """ + if len(self.cls_results) > 0: + self.cls_results.to_csv(self.sess_name + "_classification_results.csv", + index=False) + if len(self.rgs_results) > 0: + self.rgs_results.to_csv(self.sess_name + "_regression_results.csv", + index=False) + + @abstractmethod + def evaluate(self, task, time_limit): + """ Evaluates the performance of the single task. + # Arguments + task: Id of the OpenML task flow + time_limit: Budget for the given task + # Returns + List of performance scores of the autoML system on the given task. + """ + pass + + def run_automation(self, task_list, time_limit=10 * 60): + """ Evaluate the list of the tasks in sequence + # Arguments + task_list: List of OpenML task ids + time_limit: Budget for each of the task + # Returns + None + """ + for task in task_list: + try: + self.evaluate(task, time_limit=time_limit) + self.export_results() + except: + print("task: {} didnt work".format(task)) + + def time_lapse(self, task_id, + time_limits=[30, 40, 50, 60, 90, 120, 150, 180, 240, 300]): + """ Evaluate the task on different time_limits + # Arguments + task_id: Id of the OpenML task flow + time_limits: List of the time_limits to test the performance on + # Returns + List of combined results of the autoML on each of the time_limit + This function evaluates and compares the performance of the autoML system + on different time_limits. 
It is helpful to understand the amount of + improvement with increase in time budget + """ + tl_results = [] + for time_limit in time_limits: + tl_results.append(self.evaluate(task_id, time_limit=time_limit)) + return tl_results + + def get_dataset_splits(self, task_id): + """ Get the train/test splits for the given task + # Arguments + task_id: Id of OpenML task flow + # Returns + Train/Test datasets in numpy array format + """ + task = openml.tasks.get_task(task_id) + train_indices, test_indices = task.get_train_test_split_indices() + dataset = task.get_dataset() + X, y, categorical_indicator, attribute_names = dataset.get_data( + target=task.target_name, dataset_format='array') + + x_train, y_train = X[train_indices], y[train_indices] + x_test, y_test = X[test_indices], y[test_indices] + return x_train, y_train, x_test, y_test + + +class BenchmarkingAutoKaggle(BenchmarkingBase): + """ Extends the benchmarking class for evaluating AutoKaggle. + + This class evaluates the performance of AutoKaggle on the input + classification or regression task_list. + """ + + def get_data_info(self, dataset, num_cols): + """ Get the info of each feature data type + # Arguments + dataset: dataset id in OpenML + num_cols: Total number of columns + # Returns + A numpy array containing the data_type of each feature column + """ + nominal_feat = dataset.get_features_by_type('nominal') + numerical_feat = dataset.get_features_by_type('numeric') + string_feat = dataset.get_features_by_type('string') + date_feat = dataset.get_features_by_type('date') + + data_info = [] + for i in range(num_cols): + if i in date_feat: + data_info.append("TIM") + elif i in numerical_feat: + data_info.append("NUM") + else: + data_info.append("CAT") + return np.array(data_info) + + def evaluate(self, task_id, time_limit=10 * 60): + """ + See base class. 
+ """ + task_info = ["autokaggle", task_id, time_limit] + task = openml.tasks.get_task(task_id) + train_indices, test_indices = task.get_train_test_split_indices() + dataset = task.get_dataset() + X, y, categorical_indicator, attribute_names = dataset.get_data( + target=task.target_name, dataset_format='array') + + x_train, y_train = X[train_indices], y[train_indices] + x_test, y_test = X[test_indices], y[test_indices] + + # Create feature type list from openml.org indicator + data_info = self.get_data_info(dataset, len(attribute_names)) + + # Train + if task.task_type == 'Supervised Classification': + automl = Classifier() + elif task.task_type == 'Supervised Regression': + automl = Regressor() + else: + print("UNSUPPORTED TASK_TYPE") + assert (0) + + automl.fit(x_train, y_train, time_limit=time_limit, data_info=data_info) + + # Evaluate + y_hat = automl.predict(x_test) + + if task.task_type == 'Supervised Classification': + is_binary = True if len(task.class_labels) <= 2 else False + result = task_info + self.measure_performance_cls(y_test, y_hat, + binary=is_binary) + self.cls_results.loc[len(self.cls_results)] = result + elif task.task_type == 'Supervised Regression': + result = task_info + self.measure_performance_rgs(y_test, y_hat) + self.rgs_results.loc[len(self.rgs_results)] = result + print(result) + return result + + # + # class BenchmarkingAutoSklearn(BenchmarkingBase): + """ Extends the benchmarking class for evaluating AutoSklearn. + + This class evaluates the performance of AutoKaggle on the input + classification or regression task_list. 
+ """ + + +# def get_data_info(self, categorical_indicator): +# return ['Categorical' if ci else 'Numerical' for ci in categorical +# indicator] +# +# def evaluate(self, task_id, time_limit=10*60): +# task_info = ["autosklearn", task_id, time_limit] +# task = openml.tasks.get_task(task_id) +# train_indices, test_indices = task.get_train_test_split_indices() +# dataset = task.get_dataset() +# X, y, categorical_indicator, attribute_names = dataset.get_data( +# target=task.target_name, dataset_format='array') +# +# x_train, y_train = X[train_indices], y[train_indices] +# x_test, y_test = X[test_indices], y[test_indices] +# +# # Create feature type list from openml.org indicator +# feat_type = self.get_data_info(categorical_indicator) +# +# # Train +# if task.task_type == 'Supervised Classification': +# automl = AutoSklearnClassifier( +# time_left_for_this_task=time_limit, +# per_run_time_limit=time_limit//10, **kwargs) +# elif task.task_type == 'Supervised Regression': +# automl = AutoSklearnRegressor( +# time_left_for_this_task=time_limit, +# per_run_time_limit=time_limit//10, **kwargs) +# else: +# print("UNSUPPORTED TASK_TYPE") +# assert(0) +# +# automl.fit(x_train, y_train, feat_type=feat_type) +# +# y_hat = automl.predict(x_test) +# if task.task_type == 'Supervised Classification': +# is_binary = True if len(task.class_labels) <= 2 else False +# result = task_info + self.measure_performance_cls(y_test, y_hat, +# binary=is_binary) +# self.cls_results.loc[len(self.cls_results)] = result +# elif task.task_type == 'Supervised Regression': +# result = task_info + self.measure_performance_rgs(y_test, y_hat) +# self.rgs_results.loc[len(self.rgs_results)] = result +# self.results.append(result) +# print(result) +# return result + + +def get_dataset_ids(task_ids): + """ Fetches the dataset_ids. 
+ # Arguments + task_ids: List of ids of OpenML task flows + # Returns + dataset_list: List of the dataset Ids + """ + if type(task_ids) == list: + return [openml.tasks.get_task(t_id).dataset_id for t_id in task_ids] + else: + return openml.tasks.get_task(task_ids).dataset_id + + +def get_task_info(task_ids): + """ Fetches the dataset_ids and the task objective. + # Arguments + task_ids: List of ids of OpenML task flows. + # Returns + dataset_list: List of the dataset Ids. + task_types: List of the task type (such as 'binary/multiclass + classification' or 'regression' + """ + task_types = [] + dataset_list = [] + for i, t_id in enumerate(task_ids): + task = openml.tasks.get_task(t_id) + dataset = openml.datasets.get_dataset(task.dataset_id) + if task.task_type_id == 1: + _, y, _, _ = dataset.get_data(target=task.target_name, + dataset_format='array') + task_type = "Binary Classification" if len( + set(y)) <= 2 else "Multiclass classification ({})".format( + len(set(y))) + else: + task_type = "Regression" + task_types.append(task_type) + dataset_list.append(dataset) + return dataset_list, task_types + + +def get_dataset_properties(task_ids): + """ Fetches the properties of the dataset for given task flow id + # Arguments + task_ids: List of ids of OpenML task flows + # Returns + Dataframe containing the info of each of the dataset. + This function provides the dataset info such as number of instances, number of + numeric/nominal/string columns etc. 
+ """ + dataset_list, task_types = get_task_info(task_ids) + df = pd.DataFrame( + columns=["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String", + "#Date"]) + for i, dataset in enumerate(dataset_list): + df.loc[i] = [ + dataset.name, + dataset.qualities["NumberOfInstances"], + task_types[i], + len(dataset.get_features_by_type('numeric')), + len(dataset.get_features_by_type('nominal')), + len(dataset.get_features_by_type('string')), + len(dataset.get_features_by_type('date')), + ] + return df + + +def get_performance_table(filename, metric): + """ Generates a comprehensive report table of AutoML performance. + # Arguments + filename: A csv file containing the results of AutoML runs + metric: Scoring metric to be used for comparison + # Returns + Pandas Dataframe listing the performance of different AutoML systems on + the given datasets. + This function reads the results csv and converts it into the performance table + based on the median of the results for each task. + """ + test = pd.read_csv(filename) + perf = pd.DataFrame(columns=["Name", "AutoKaggle", "AutoSklearn", "H2O.ai"]) + task_ids = list(set(test["task_id"])) + dataset_ids = get_dataset_ids(task_ids) + + test = test.set_index(["task_id", "automl_model"]) + test.sort_index(inplace=True) + for i, t_id in enumerate(task_ids): + try: + name = openml.datasets.get_dataset(dataset_ids[i]).name + auto_kaggle = test.loc[(t_id, "autokaggle")][metric].median()\ + if (t_id, "autokaggle") in test.index else np.nan + auto_sklearn = test.loc[(t_id, "autosklearn")][metric].median()\ + if (t_id, "autosklearn") in test.index else np.nan + h2o_ai = test.loc[(t_id, "autosklearn")][metric].median()\ + if (t_id, "autosklearn") in test.index else np.nan + perf.loc[i] = [name, auto_kaggle, auto_sklearn, h2o_ai] + except Exception as e: + print(e) + return perf + + +def style_results(res): + """ Highlights the best result in the results column + # Arguments + res: Dataframe containing the results of various AutoML 
runs + # Returns + Highlighed data-frame + """ + + def highlight_max(s): + """ + Highlight the maximum in a Series yellow. + """ + is_max = s == s.max() + return ['background-color: yellow' if v else '' for v in is_max] + + res = res.set_index("Name") + res.style.apply(highlight_max, axis=1) + return res + + +def get_box_plot(results, task_id, metric): + """ Generates a box plot of the variance in the result. + # Arguments + results: Results of various runs using AutoML systems + task_id: Id for OpenML task flow + metric: Score metric considered for the box-plot + # Returns + None + Builds and displays the box plot showing the variance in results for the + AutoML performance on the given dataset. + """ + auto_sklearn = list(results.loc[(task_id, "autosklearn")][metric]) + auto_kaggle = list(results.loc[(task_id, "autokaggle")][metric]) + med_sk = statistics.median(auto_sklearn) + med_ak = statistics.median(auto_kaggle) + while len(auto_sklearn) < len(auto_kaggle): + auto_sklearn.append(med_sk) + while len(auto_sklearn) > len(auto_kaggle): + auto_kaggle.append(med_ak) + temp = pd.DataFrame( + data={"Autokaggle": auto_kaggle, "AutoSklearn": auto_sklearn}) + temp.boxplot() + + +if __name__ == "__main__": + regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834, + 4850, 4839] + classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954, + 14951, 59, 24, 146230, 31, 10101, + 9914, 3020, 3524, 3573, 3962] + ak = BenchmarkingAutoKaggle(sess_name='test_perf') + import time + + # t1 = time.time() + # for _ in range(1): + # ak.run_automation(classification_task_list) + # t2 = time.time() + # print(t2-t1) + np.random.seed(1001) + random.seed(1001) + import time + + t1 = time.time() + ak.evaluate(3021) + t2 = time.time() + print(t2 - t1) diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py index df472a5..e5d3b6f 100644 --- a/examples/tabular_classification_binary.py +++ 
b/examples/tabular_classification_binary.py @@ -1,5 +1,8 @@ import numpy as np -from autokaggle import TabularClassifier +import sys + +sys.path.append("..") +from autokaggle import * if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 @@ -16,9 +19,9 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = TabularClassifier() + clf = Classifier() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) AUC = clf.evaluate(x_test, y_test) - print(AUC) + print(AUC) \ No newline at end of file diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py index 7515841..3426dd3 100644 --- a/examples/tabular_classification_multiclass.py +++ b/examples/tabular_classification_multiclass.py @@ -1,6 +1,8 @@ import numpy as np -from autokaggle import TabularClassifier - +import sys +sys.path.append("..") +# print(sys.path) +from autokaggle import * if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 nsample = 10000 @@ -16,9 +18,9 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = TabularClassifier() + clf = Classifier() datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo) F1_score = clf.evaluate(x_test, y_test) - print(F1_score) + print(F1_score) \ No newline at end of file diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py index 8ba95a8..bf97bc7 100644 --- a/examples/tabular_regression.py +++ b/examples/tabular_regression.py @@ -1,5 +1,7 @@ import numpy as np -from autokaggle import TabularRegressor +import sys +sys.path.append("..") +from autokaggle import * if __name__ == '__main__': ntime, nnum, ncat = 4, 10, 8 @@ -16,7 +18,7 @@ y_train = y_all[:int(nsample * 0.8)] y_test = y_all[int(nsample * 0.8):] - clf = TabularRegressor() + clf = Regressor() datainfo = 
np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat) clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo)