diff --git a/autokaggle/__init__.py b/autokaggle/__init__.py
index e69de29..6744a7a 100644
--- a/autokaggle/__init__.py
+++ b/autokaggle/__init__.py
@@ -0,0 +1,2 @@
+from autokaggle.auto_ml import Classifier, Regressor
+from autokaggle.ensemblers import *
diff --git a/autokaggle/auto_ml.py b/autokaggle/auto_ml.py
new file mode 100644
index 0000000..3cfef69
--- /dev/null
+++ b/autokaggle/auto_ml.py
@@ -0,0 +1,635 @@
+from sklearn.base import BaseEstimator, is_classifier
+from abc import abstractmethod
+import numpy as np
+import os
+import random
+import json
+from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error
+from joblib import dump, load
+
+from autokaggle.preprocessor import Preprocessor
+from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, \
+    read_json
+from lightgbm import LGBMClassifier, LGBMRegressor
+from autokaggle.config import Config, CLASSIFICATION_PREP_HPARAM_SPACE, \
+    REGRESSION_PREP_HPARAM_SPACE, \
+    REGRESSION_BASE_HPARAM_SPACE, CLASSIFICATION_BASE_HPARAM_SPACE, \
+    CLASSIFICATION_HPARAM_SPACE, REGRESSION_HPARAM_SPACE
+from sklearn.model_selection import StratifiedKFold, KFold
+import hyperopt
+from hyperopt import tpe, hp, fmin, Trials, STATUS_OK, STATUS_FAIL
+from sklearn.model_selection import cross_val_score
+from autokaggle.ensemblers import RankedEnsemblingModel, StackedEnsemblingModel
+from imblearn.over_sampling import SMOTE, SMOTENC
+import collections
+
+
+class AutoKaggle(BaseEstimator):
+    """ Automated Machine Learning system class.
+
+        AutoKaggle implements an end to end automated ML system. It initiates and
+        searches for the optimum ML pipeline. The user can use it with the simple
+        `fit()` and `predict()` methods like scikit-learn estimators.
+        The user can specify various parameters controlling different components
+        of the system.
+        # Arguments
+            path: String. OS path for storing temporary model parameters.
+            verbose: Bool. Defines the verbosity of the logging.
+            time_limit: Int. Time budget for performing search and fit pipeline.
+            use_ensembling: Bool. Defines whether to use an ensemble of models
+            num_estimators_ensemble: Int. Maximum number of estimators to be used
+            in an ensemble
+            ensemble_strategy: String. Strategy to ensemble models
+            ensemble_method: String. Aggregation method if ensemble_strategy is
+            set to ranked_ensembling
+            random_ensemble: Bool. Whether the ensembling estimators are picked
+            randomly.
+            diverse_ensemble: Bool. Whether estimators from different families are
+            picked.
+            ensembling_search_iter: Int. Search iterations for ensembling
+            hyper-parameter search
+            search_algo: String. Search strategy for hyper-parameter search.
+            search_iter: Int. Number of iterations used for hyper-parameter search.
+            cv_folds: Int. Number of Cross Validation folds.
+            subsample_ratio: Percent of subsample used for hyper-parameter
+            search.
+            data_info: list(String). Lists the datatypes of each feature column.
+            stack_probabilities: Bool. Whether to use class probabilities in
+            ensembling.
+            upsample_classes: Bool. Whether to upsample less represented classes
+            num_p_hparams: Int. Number of preprocessor search spaces.
+    """
+
+    def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True,
+                 num_estimators_ensemble=50, ensemble_strategy='stacking',
+                 ensemble_method='max_voting',
+                 search_iter=500, cv_folds=3, subsample_ratio=0.1,
+                 random_ensemble=False, diverse_ensemble=True,
+                 stack_probabilities=False, data_info=None, upsample_classes=False,
+                 ensembling_search_iter=10,
+                 search_algo='random', num_p_hparams=10):
+        # Nothing is trained yet; fit() sets the pipeline, subclasses the spaces.
+        self.is_trained = False
+        self.pipeline = None
+        self.m_hparams = None
+        self.m_hparams_base = None
+        self.p_hparams_base = None
+        # Use a generated temporary folder when no storage path is given.
+        storage_path = path if path else rand_temp_folder_generator()
+        # All options live on one Config object stored as self.config.
+        self.config = Config(
+            path=storage_path, verbose=verbose, time_limit=time_limit,
+            use_ensembling=use_ensembling,
+            num_estimators_ensemble=num_estimators_ensemble,
+            ensemble_strategy=ensemble_strategy, ensemble_method=ensemble_method,
+            search_iter=search_iter, cv_folds=cv_folds,
+            subsample_ratio=subsample_ratio, random_ensemble=random_ensemble,
+            diverse_ensemble=diverse_ensemble,
+            stack_probabilities=stack_probabilities, data_info=data_info,
+            upsample_classes=upsample_classes,
+            ensembling_search_iter=ensembling_search_iter,
+            search_algo=search_algo, num_p_hparams=num_p_hparams)
+
+    def fit(self, x, y, time_limit=None, data_info=None):
+        """ Search for the best pipeline and train it on the given data.
+        # Arguments
+            x: A numpy.ndarray instance containing the training data.
+            y: training label vector.
+            time_limit: time budget stored on the config; a falsy value falls
+            back to 24 hours.
+            data_info: optional sequence describing the feature type of each
+            column in x: 'TIME' for temporal features, 'NUM' for other
+            numerical features and 'CAT' for categorical features. When omitted
+            it is derived with extract_data_info().
+        # Returns
+            None
+        # Raises
+            ValueError: if x has no feature columns.
+        """
+        self.config.time_limit = time_limit if time_limit else 24 * 60 * 60
+
+        # Extract or read data info. A plain list is converted to an array so
+        # that the element-wise comparisons below work.
+        self.config.data_info = np.asarray(data_info) if data_info is not None \
+            else self.extract_data_info(x)
+
+        if self.config.verbose:
+            print('DATA_INFO: {}'.format(self.config.data_info))
+            print('#TIME features: {}'.format(sum(self.config.data_info == 'TIME')))
+            print('#NUM features: {}'.format(sum(self.config.data_info == 'NUM')))
+            print('#CAT features: {}'.format(sum(self.config.data_info == 'CAT')))
+
+        if x.shape[1] == 0:
+            raise ValueError("No feature exist!")
+
+        x, y = self.resample(x, y)
+
+        # Re-derive the classification flavour on every call: a previous fit()
+        # has already replaced 'classification' with 'binary'/'multiclass'.
+        if self.config.objective in ('classification', 'binary', 'multiclass'):
+            n_classes = len(set(y))
+            self.config.objective = 'binary' if n_classes == 2 else 'multiclass'
+
+        # Search the top preprocessing setting
+        trials = self.search(x, y, self.p_hparams_base, self.m_hparams_base)
+        p_hparams = self.get_top_prep(trials, self.config.num_p_hparams)
+        # Search the best pipelines
+        # NOTE(review): this second search also uses m_hparams_base, so
+        # self.m_hparams is never used -- confirm that is intended.
+        trials = self.search(x, y, p_hparams, self.m_hparams_base)
+        self.pipeline = self.get_best_pipeline(trials)
+        # Fit data
+        self.pipeline.fit(x, y)
+        self.is_trained = True
+
+    def predict(self, x_test):
+        """ Generate predictions on the test data with the fitted pipeline.
+        # Arguments
+            x_test: A numpy.ndarray instance containing the test data.
+        # Returns
+            The pipeline's predictions for x_test; ValueError is raised when
+            no pipeline has been fitted or it produces no prediction.
+        """
+        y = None if self.pipeline is None else self.pipeline.predict(x_test)
+        if y is None:
+            raise ValueError("Tabular predictor does not exist")
+        return y
+
+    def predict_proba(self, x_test):
+        """ Predict label probabilities on the test data for the given
+        classification task.
+        # Arguments
+            x_test: A numpy.ndarray instance containing the test data.
+        # Returns
+            The pipeline's predict_proba output for x_test; ValueError is
+            raised when no pipeline has been fitted or it produces no output.
+        """
+        y = None if self.pipeline is None else self.pipeline.predict_proba(x_test)
+        if y is None:
+            raise ValueError("Tabular predictor does not exist")
+        return y
+
+    def evaluate(self, x_test, y_test):
+        """ Score the fitted pipeline's predictions against ground truth.
+        # Arguments
+            x_test: A numpy.ndarray instance containing the test data.
+            y_test: A numpy array with ground truth labels for the test data
+        # Returns
+            ROC AUC for 'binary', weighted F1 for 'multiclass', mean squared
+            error for 'regression'; None for any other objective.
+        """
+        objective = self.config.objective
+        if self.config.verbose:
+            print('objective:', objective)
+        y_pred = self.predict(x_test)
+        if objective == 'binary':
+            return roc_auc_score(y_test, y_pred)
+        if objective == 'multiclass':
+            return f1_score(y_test, y_pred, average='weighted')
+        if objective == 'regression':
+            return mean_squared_error(y_test, y_pred)
+        return None
+
+    def resample(self, x, y):
+        """ Up-samples the input data
+        # Arguments
+            x: A numpy array for features
+            y: A numpy array for target
+        # Returns
+            x and y, optionally SMOTE-resampled, doubled until at least 60 rows
+        """
+        if self.config.upsample_classes:
+            x, y = SMOTE(
+                sampling_strategy=self.config.resampling_strategy).fit_resample(x, y)
+        # Doubling an empty array never grows it, so skip the loop in that case.
+        while 0 < x.shape[0] < 60:
+            x, y = np.concatenate([x, x], axis=0), np.concatenate([y, y], axis=0)
+        return x, y
+
+    def subsample(self, x, y, sample_percent):
+        """ Draws a random subset of rows for the hyper-parameter search.
+        # Arguments
+            x: A numpy array for features
+            y: A numpy array for target
+            sample_percent: Minimum fraction of the rows to keep
+        # Returns
+            The sampled features and the matching targets
+        """
+        # TODO: Add way to balance the subsample
+        n_rows = x.shape[0]
+        # Up to 600 rows are kept whole; beyond that keep at least 600 rows or
+        # the requested fraction, whichever is larger.
+        keep_fraction = 1
+        if n_rows > 600:
+            keep_fraction = max(600.0 / n_rows, sample_percent)
+        n_keep = int(n_rows * keep_fraction)
+        chosen = random.sample(list(range(n_rows)), n_keep)
+        return x[chosen, :], y[chosen]
+
+    def search(self, x, y, prep_space, model_space):
+        """ Runs the hyper-parameter search over preprocessor and estimator spaces.
+        # Arguments
+            x: A numpy array for features
+            y: A numpy array for target
+            prep_space: Hyper-parameter search space for preprocessors
+            model_space: Hyper-parameter search space for estimators
+        # Returns
+            The hyperopt Trials object holding the result of every evaluation
+        """
+        # Candidates are scored by cross validation on a subsample of the data.
+        sample_x, sample_y = self.subsample(
+            x, y, sample_percent=self.config.subsample_ratio)
+        score_metric, skf = self.get_skf(self.config.cv_folds)
+
+        def objective_func(params):
+            estimator_spec = params['estimator']
+            model_class, m_params = estimator_spec['model'], estimator_spec['param']
+            p_params = params['prep']
+            candidate = AutoPipe(model_class=model_class, m_params=m_params,
+                                 p_params=p_params, config=self.config)
+            try:
+                eval_score = cross_val_score(candidate, sample_x, sample_y,
+                                             scoring=score_metric, cv=skf).mean()
+                status, loss = STATUS_OK, 1 - eval_score
+            except ValueError as e:
+                # A failing candidate is reported with an infinite loss.
+                print(e)
+                eval_score, status, loss = float('-inf'), STATUS_FAIL, float('inf')
+            if self.config.verbose:
+                print("CV Score:", eval_score)
+                print("\n=================")
+            return {'loss': loss, 'status': status, 'model_class': model_class,
+                    'm_params': m_params, 'p_params': p_params}
+
+        # fmin's return value (the best point) is not needed here; the Trials
+        # object records the result of every evaluation.
+        trials = Trials()
+        fmin(objective_func, {'prep': prep_space, 'estimator': model_space},
+             algo=self.config.search_algo, trials=trials,
+             max_evals=self.config.search_iter,
+             rstate=np.random.RandomState(self.config.random_state))
+        return trials
+
+    def get_best_pipeline(self, trials):
+        """ Builds the final pipeline from the search trials.
+        # Arguments
+            trials: List of hyper-parameter search trials
+        # Returns
+            An ensemble when config.use_ensembling is set, otherwise an AutoPipe
+            built from the best trial
+        """
+        if self.config.use_ensembling:
+            return self.setup_ensemble(trials)
+        opt = trials.best_trial['result']
+        best_pipeline = AutoPipe(opt['model_class'], opt['m_params'],
+                                 opt['p_params'], self.config)
+        if self.config.verbose:
+            print("The best hyperparameter setting found:")
+            print(opt)
+        return best_pipeline
+
+    @staticmethod
+    def get_top_prep(trials, n):
+        """ Find the list of top N preprocessor settings.
+        # Arguments
+            trials: List of hyper-parameter search trials
+            n: Maximum number of preprocessor settings required
+        # Returns
+            A hyperopt choice over at most N distinct, lowest-loss settings.
+        """
+        best_trials = [t for t in trials.results if t['loss'] != float('inf')]
+        best_trials = sorted(best_trials, key=lambda k: k['loss'], reverse=False)
+        top_p_hparams = []
+        for trial in best_trials:
+            if len(top_p_hparams) >= n:
+                break
+            # Keep the setting itself (not the whole trial) so duplicates are
+            # detected and the result is a valid preprocessor search space.
+            if trial['p_params'] not in top_p_hparams:
+                top_p_hparams.append(trial['p_params'])
+        return hp.choice('p_params', top_p_hparams)
+
+    @abstractmethod
+    def get_skf(self, folds):
+        """ Get the scoring metric and the cross validation folds for evaluation.
+        # Arguments
+            folds: Number of cross validation folds
+        # Returns
+            Scoring metric and cross validation folds (None in this base stub).
+        """
+        pass
+
+    def pick_diverse_estimators(self, trial_list):
+        """ Selects the best hyper-parameter settings from each estimator family.
+        # Arguments
+            trial_list: List of the hyper-parameter search trials.
+        # Returns
+            List of at most config.num_estimators_ensemble pipelines, picked
+            round-robin across the estimator families in trial_list order.
+        """
+        groups = collections.defaultdict(list)
+        for obj in trial_list:
+            groups[obj['model_class']].append(obj)
+        # Cap at the number of trials so the round-robin always terminates.
+        limit = min(self.config.num_estimators_ensemble, len(trial_list))
+        estimator_list, rank = [], 0
+        while len(estimator_list) < limit:
+            for grp in groups.values():
+                # Stop mid-round so the requested size is never exceeded.
+                if rank < len(grp) and len(estimator_list) < limit:
+                    t = grp[rank]
+                    estimator_list.append(AutoPipe(t['model_class'], t['m_params'],
+                                                   t['p_params'], self.config))
+            rank += 1
+        return estimator_list
+
+    def setup_ensemble(self, trials):
+        """ Builds the ensembling estimator configured on self.config.
+        # Arguments
+            trials: List of the hyper-parameter search trials.
+        # Returns
+            An ensembling estimator to be trained using the base estimators picked
+            from trials.
+        Note: config.num_estimators_ensemble is lowered to the number of
+        successful trials as a side effect.
+        """
+        # Drop the unsuccessful trials ('loss' == float('inf')), best loss first
+        successful = sorted((t for t in trials.results if t['loss'] != float('inf')),
+                            key=lambda k: k['loss'])
+
+        # The ensemble cannot hold more estimators than there are usable trials
+        self.config.num_estimators_ensemble = min(
+            self.config.num_estimators_ensemble, len(successful))
+
+        # Optionally discard the ranking before the estimators are picked
+        if self.config.random_ensemble:
+            np.random.shuffle(successful)
+
+        if self.config.diverse_ensemble:
+            estimator_list = self.pick_diverse_estimators(successful)
+        else:
+            top_trials = successful[:self.config.num_estimators_ensemble]
+            estimator_list = [AutoPipe(t['model_class'], t['m_params'],
+                                       t['p_params'], self.config)
+                              for t in top_trials]
+
+        # 'stacking' selects the stacked model; anything else the ranked one
+        if self.config.ensemble_strategy == 'stacking':
+            ensemble_class = StackedEnsemblingModel
+        else:
+            ensemble_class = RankedEnsemblingModel
+        return ensemble_class(estimator_list, config=self.config)
+
+    @staticmethod
+    def extract_data_info(raw_x):
+        """
+        Labels each column of raw_x as numerical or categorical.
+        # Arguments
+            raw_x: a numpy.ndarray instance containing the training data.
+        # Returns
+            A numpy array holding 'NUM' (column converts to float) or 'CAT'.
+        """
+        data_info = []
+        _, col_num = raw_x.shape
+        for col_idx in range(col_num):
+            try:
+                # builtin float: the np.float alias no longer exists in NumPy
+                raw_x[:, col_idx].astype(float)
+                data_info.append('NUM')
+            except (ValueError, TypeError):
+                data_info.append('CAT')
+        return np.array(data_info)
+
+
+class Classifier(AutoKaggle):
+    """ Extends AutoKaggle for Classification.
+
+        Extends the AutoKaggle specific to the classification requirements.
+        # Arguments
+            path: String. OS path for storing temporary model parameters.
+            verbose: Bool. Defines the verbosity of the logging.
+            time_limit: Int. Time budget for performing search and fit pipeline.
+            use_ensembling: Bool. Defines whether to use an ensemble of models
+            num_estimators_ensemble: Int. Maximum number of estimators to be used
+            in an ensemble
+            ensemble_strategy: String. Strategy to ensemble models
+            ensemble_method: String. Aggregation method if ensemble_strategy is
+            set to ranked_ensembling
+            random_ensemble: Bool. Whether the ensembling estimators are picked
+            randomly.
+            diverse_ensemble: Bool. Whether estimators from different families are
+            picked.
+            ensembling_search_iter: Int. Search iterations for ensembling
+            hyper-parameter search
+            search_algo: String. Search strategy for hyper-parameter search.
+            search_iter: Int. Number of iterations used for hyper-parameter search.
+            cv_folds: Int. Number of Cross Validation folds.
+            subsample_ratio: Percent of subsample used for hyper-parameter
+            search.
+            data_info: list(String). Lists the datatypes of each feature column.
+            stack_probabilities: Bool. Whether to use class probabilities in
+            ensembling.
+            upsample_classes: Bool. Whether to upsample less represented classes
+            num_p_hparams: Int. Number of preprocessor search spaces.
+    """
+
+    def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True,
+                 num_estimators_ensemble=50, ensemble_strategy='stacking',
+                 ensemble_method='max_voting',
+                 search_iter=500, cv_folds=3, subsample_ratio=0.1,
+                 random_ensemble=False, diverse_ensemble=True,
+                 stack_probabilities=False, data_info=None, upsample_classes=False,
+                 ensembling_search_iter=10,
+                 search_algo='random', num_p_hparams=10):
+        # Forward every option unchanged to the AutoKaggle constructor.
+        super().__init__(
+            path=path, verbose=verbose, time_limit=time_limit,
+            use_ensembling=use_ensembling,
+            num_estimators_ensemble=num_estimators_ensemble,
+            ensemble_strategy=ensemble_strategy, ensemble_method=ensemble_method,
+            search_iter=search_iter, cv_folds=cv_folds,
+            subsample_ratio=subsample_ratio, random_ensemble=random_ensemble,
+            diverse_ensemble=diverse_ensemble,
+            stack_probabilities=stack_probabilities, data_info=data_info,
+            upsample_classes=upsample_classes,
+            ensembling_search_iter=ensembling_search_iter,
+            search_algo=search_algo, num_p_hparams=num_p_hparams)
+        self.config.objective = 'classification'
+        # One search-space entry per model name listed in the config.
+        model_names = self.config.classification_models
+        self.m_hparams = hp.choice(
+            'classifier', [CLASSIFICATION_HPARAM_SPACE[m] for m in model_names])
+        self.m_hparams_base = hp.choice(
+            'classifier',
+            [CLASSIFICATION_BASE_HPARAM_SPACE[m] for m in model_names])
+        self.p_hparams_base = CLASSIFICATION_PREP_HPARAM_SPACE
+
+    def get_skf(self, folds):
+        """
+            See the base class.
+        """
+        # Binary tasks are scored with ROC AUC, every other objective with
+        # weighted F1; the stratified, shuffled folds are the same for both.
+        if self.config.objective == 'binary':
+            score_metric = 'roc_auc'
+        else:
+            score_metric = 'f1_weighted'
+        skf = StratifiedKFold(n_splits=folds, shuffle=True,
+                              random_state=self.config.random_state)
+        return score_metric, skf
+
+
+class Regressor(AutoKaggle):
+    """ Extends AutoKaggle for Regression
+
+        Extends the AutoKaggle specific to the regression requirements.
+        # Arguments
+            path: String. OS path for storing temporary model parameters.
+            verbose: Bool. Defines the verbosity of the logging.
+            time_limit: Int. Time budget for performing search and fit pipeline.
+            use_ensembling: Bool. Defines whether to use an ensemble of models
+            num_estimators_ensemble: Int. Maximum number of estimators to be used
+            in an ensemble
+            ensemble_strategy: String. Strategy to ensemble models
+            ensemble_method: String. Aggregation method if ensemble_strategy is
+            set to ranked_ensembling
+            random_ensemble: Bool. Whether the ensembling estimators are picked
+            randomly.
+            diverse_ensemble: Bool. Whether estimators from different families are
+            picked.
+            ensembling_search_iter: Int. Search iterations for ensembling
+            hyper-parameter search
+            search_algo: String. Search strategy for hyper-parameter search.
+            search_iter: Int. Number of iterations used for hyper-parameter search.
+            cv_folds: Int. Number of Cross Validation folds.
+            subsample_ratio: Percent of subsample used for hyper-parameter
+            search.
+            data_info: list(String). Lists the datatypes of each feature column.
+            stack_probabilities: Bool. Whether to use class probabilities in
+            ensembling.
+            upsample_classes: Bool. Whether to upsample less represented classes
+            num_p_hparams: Int. Number of preprocessor search spaces.
+    """
+
+    def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True,
+                 num_estimators_ensemble=50, ensemble_strategy='stacking',
+                 ensemble_method='max_voting',
+                 search_iter=500, cv_folds=3, subsample_ratio=0.1,
+                 random_ensemble=False, diverse_ensemble=True,
+                 stack_probabilities=False, data_info=None, upsample_classes=False,
+                 ensembling_search_iter=10,
+                 search_algo='random', num_p_hparams=10):
+        super().__init__(
+            path=path, verbose=verbose, time_limit=time_limit,
+            use_ensembling=use_ensembling,
+            num_estimators_ensemble=num_estimators_ensemble,
+            ensemble_strategy=ensemble_strategy, ensemble_method=ensemble_method,
+            search_iter=search_iter, cv_folds=cv_folds,
+            subsample_ratio=subsample_ratio, random_ensemble=random_ensemble,
+            diverse_ensemble=diverse_ensemble,
+            stack_probabilities=stack_probabilities, data_info=data_info,
+            upsample_classes=upsample_classes,
+            ensembling_search_iter=ensembling_search_iter,
+            search_algo=search_algo, num_p_hparams=num_p_hparams)
+        self.config.objective = 'regression'
+        # Both model spaces are built from the regression model list; the base
+        # space previously iterated classification_models by mistake.
+        model_names = self.config.regression_models
+        self.m_hparams = hp.choice(
+            'regressor', [REGRESSION_HPARAM_SPACE[m] for m in model_names])
+        self.m_hparams_base = hp.choice(
+            'regressor', [REGRESSION_BASE_HPARAM_SPACE[m] for m in model_names])
+        self.p_hparams_base = REGRESSION_PREP_HPARAM_SPACE
+
+    def get_skf(self, folds):
+        """ See the base class. """
+        # Plain shuffled K-fold; the scoring metric is negative MSE.
+        seed = self.config.random_state
+        skf = KFold(n_splits=folds, shuffle=True, random_state=seed)
+        return 'neg_mean_squared_error', skf
+
+
+class AutoPipe(BaseEstimator):
+    """ Implements a machine learning pipeline.
+
+        Implements a machine learning pipeline with preprocessor and estimator. A
+        user can call the fit() and predict() methods on it. It is used as a search
+        unit in AutoKaggle's hyper-parameter search.
+        # Arguments
+            config: Config. Defines the configuration of various components of the
+            pipeline.
+            m_params: Dict. Hyper-parameter search space for estimator.
+            p_params: Dict. Hyper-parameter search space for preprocessor.
+            model_class: Estimator. Class name of the estimator used in the pipeline.
+            _estimator_type: String. Denotes if the estimator is 'classifier' or
+            'regressor'
+            prep: Preprocessor. Instance of the Preprocessor class, which does
+            basic feature preprocessing and feature
+            engineering
+            model: Estimator. Instance of the estimator class which learns a
+            machine learning model and predicts on the
+            given data.
+    """
+
+    def __init__(self, model_class, m_params, p_params, config):
+        # Preprocessor and estimator instances are created in fit().
+        self.prep, self.model = None, None
+        self.config = config
+        self.model_class = model_class
+        self.m_params, self.p_params = m_params, p_params
+        # Estimator family flag derived from the model class.
+        is_clf = is_classifier(model_class)
+        self._estimator_type = 'classifier' if is_clf else 'regressor'
+
+    def fit(self, x, y):
+        """ Fits a fresh preprocessor and estimator on the given data.
+        # Arguments
+            x: A numpy.ndarray instance containing the training data.
+            y: training label vector.
+        # Returns
+            None
+        """
+        self.prep = Preprocessor(self.config, self.p_params)
+        self.model = self.model_class(**self.m_params)
+        transformed = self.prep.fit_transform(x, y)
+        self.model.fit(transformed, y)
+
+    def predict(self, x):
+        """ Predicts with the fitted estimator on preprocessed data.
+        # Arguments
+            x: A numpy.ndarray instance containing the test data.
+        # Returns
+            The estimator's predictions for the transformed x.
+        """
+        # Apply the transformation fitted in fit() before predicting.
+        features = self.prep.transform(x)
+        return self.model.predict(features)
+
+    def predict_proba(self, x):
+        """ Predicts class probabilities, falling back to plain predictions.
+        # Arguments
+            x: A numpy.ndarray instance containing the test data.
+        # Returns
+            The estimator's predict_proba output for the transformed x, or its
+            predict output when the predict_proba call raises AttributeError.
+        """
+        features = self.prep.transform(x)
+        try:
+            probabilities = self.model.predict_proba(features)
+        except AttributeError:
+            probabilities = self.model.predict(features)
+        return probabilities
+
+    def decision_function(self, x):
+        """ Returns the estimator's decision function values (used by scorers).
+        # Arguments
+            x: A numpy.ndarray instance containing the test data.
+        # Returns
+            Output of the estimator's decision_function on the transformed x.
+            Raises a bare AttributeError if the estimator call raises one.
+        """
+        x = self.prep.transform(x)
+        try:
+            return self.model.decision_function(x)
+        except AttributeError:
+            raise AttributeError
diff --git a/autokaggle/config.py b/autokaggle/config.py
new file mode 100644
index 0000000..729c156
--- /dev/null
+++ b/autokaggle/config.py
@@ -0,0 +1,351 @@
+from sklearn.base import BaseEstimator
+from autokaggle.utils import rand_temp_folder_generator, ensure_dir
+import hyperopt
+from hyperopt import hp
+from sklearn.svm import SVC
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
+    RandomForestRegressor, AdaBoostRegressor, \
+    ExtraTreesRegressor
+from sklearn.linear_model import Ridge
+from lightgbm import LGBMClassifier, LGBMRegressor
+from catboost import CatBoostClassifier, Pool, CatBoostRegressor
+import numpy as np
+
+
+class Config:
+    """ Configuration for various autoML components.
+
+        Defines the common configuration of different auto ML components. It is
+        shared between AutoKaggle, AutoPipe, Preprocessor and Ensembling class.
+
+        # Arguments
+            path: String. Storage path; a random temp folder is used when None.
+            verbose: Bool. Defines the verbosity of the logging.
+            time_limit: Int. Time budget for performing search and fit pipeline.
+            use_ensembling: Bool. Defines whether to use an ensemble of models
+            num_estimators_ensemble: Int. Maximum number of estimators to be used
+            in an ensemble
+            ensemble_strategy: String. Strategy to ensemble models
+            ensemble_method: String. Aggregation method if ensemble_strategy is
+            set to ranked_ensembling
+            random_ensemble: Bool. Whether the ensembling estimators are picked
+            randomly.
+            diverse_ensemble: Bool. Whether estimators from different families are
+            picked.
+            ensembling_search_iter: Int. Search iterations for ensembling
+            hyper-parameter search
+            search_algo: String. 'random' for random search, anything else for TPE.
+            search_iter: Int. Number of iterations used for hyper-parameter search.
+            cv_folds: Int. Number of Cross Validation folds.
+            subsample_ratio: Percent of subsample used for hyper-parameter
+            search.
+            data_info: list(String). Lists the datatypes of each feature column.
+            stack_probabilities: Bool. Whether to use class probabilities in
+            ensembling.
+            upsample_classes: Bool. Whether to upsample less represented classes
+            num_p_hparams: Int. Number of preprocessor search spaces.
+    """
+
+    def __init__(self, path=None, verbose=True, time_limit=None, use_ensembling=True,
+                 num_estimators_ensemble=50,
+                 ensemble_strategy='stacking', ensemble_method='max_voting',
+                 search_iter=500, cv_folds=3,
+                 subsample_ratio=0.1, random_ensemble=False, diverse_ensemble=True,
+                 stack_probabilities=False,
+                 data_info=None, upsample_classes=False, ensembling_search_iter=10,
+                 search_algo='random',
+                 num_p_hparams=10):
+        self.verbose = verbose
+        self.path = path if path is not None else rand_temp_folder_generator()
+        ensure_dir(self.path)  # side effect: runs on every construction
+        if self.verbose:
+            print('Path:', self.path)
+        self.time_limit = time_limit
+        self.objective = None  # not a constructor argument; starts unset
+        self.use_ensembling = use_ensembling
+        self.hparams = None  # not a constructor argument; starts unset
+        self.num_estimators_ensemble = num_estimators_ensemble
+        self.ensemble_strategy = ensemble_strategy
+        self.ensemble_method = ensemble_method
+        self.random_ensemble = random_ensemble
+        self.search_iter = search_iter
+        self.cv_folds = cv_folds
+        self.subsample_ratio = subsample_ratio
+        self.resampling_strategy = 'auto'  # fixed here, not an __init__ argument
+        self.random_state = 1001  # fixed seed, not an __init__ argument
+        self.classification_models = ['knn', 'svm', 'lgbm', 'random_forest',
+                                      'adaboost']
+        # self.classification_models = ['knn', 'lgbm', 'random_forest',]
+        self.regression_models = ['extratree', 'ridge', 'lgbm', 'random_forest',
+                                  'adaboost', 'catboost']
+        self.diverse_ensemble = diverse_ensemble
+        self.stack_probabilities = stack_probabilities
+        self.data_info = data_info
+        self.upsample_classes = upsample_classes
+        self.ensembling_search_iter = ensembling_search_iter
+        self.search_algo = hyperopt.rand.suggest if search_algo == 'random' else \
+            hyperopt.tpe.suggest  # the stored value is the callable, not the name
+        self.num_p_hparams = num_p_hparams
+
+    def update(self, options):
+        for k, v in options.items():
+            if hasattr(self, k):  # keys that are not existing attributes are skipped
+                setattr(self, k, v)
+
+
+KNN_CLASSIFIER_PARAMS = {  # hyperopt search space paired with KNeighborsClassifier
+    'n_neighbors': hp.choice('n_neighbors_knn', [1, 2, 4, 8, 16, 32, 64, 100]),
+    'weights': hp.choice('weight_knn', ['uniform', 'distance']),
+    'metric': hp.choice('metric_knn',
+                        ["euclidean", "manhattan", "chebyshev", "minkowski"]),
+    'p': hp.choice('p_knn', range(1, 3)),  # candidates: 1 and 2
+}
+
+SVM_CLASSIFIER_PARAMS = {  # hyperopt search space paired with SVC
+    'C': hp.loguniform('C_svm', np.log(0.03125), np.log(32768)),  # log bounds
+    'kernel': hp.choice('kernel_svm', ['rbf', 'poly', 'sigmoid']),
+    'degree': hp.choice('degree_svm', range(2, 6)),
+    'gamma': hp.loguniform('gamma_svm', np.log(3e-5), np.log(8)),
+    'max_iter': 50000,  # fixed value, not searched
+}
+
+RANDOM_FOREST_CLASSIFIER_PARAMS = {  # space paired with RandomForestClassifier
+    'criterion': hp.choice('criterion_rf', ['entropy', 'gini']),
+    'max_features': hp.uniform('max_features_rf', 0, 1.0),
+    'n_estimators': hp.choice('n_estimators_rf', [100, 50]),
+    'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 20)),
+    'min_samples_split': hp.choice('min_samples_split_rf', range(2, 20)),
+}
+
+LGBM_CLASSIFIER_PARAMS = {  # hyperopt search space paired with LGBMClassifier
+    'boosting_type': 'gbdt',  # this and the next three entries are fixed
+    'min_split_gain': 0.1,
+    'subsample': 0.8,
+    'num_leaves': 80,
+    'colsample_bytree': hp.uniform('colsample_bytree_lgbm', 0.4, 0.8),
+    'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)),
+    'max_depth': hp.choice('max_depth_lgbm', range(5, 10)),
+    'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)),
+    'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-2),
+                                   high=np.log(2)),
+}
+
+ADABOOST_CLASSIFIER_PARAMS = {  # space paired with AdaBoostClassifier
+    'algorithm': hp.choice('algorithm_adaboost', ['SAMME.R', 'SAMME']),
+    'n_estimators': hp.choice('n_estimators_adaboost', range(50, 500)),
+    'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2),
+                                   high=np.log(2)),
+}
+
+CATBOOST_CLASSIFIER_PARAMS = {  # space paired with CatBoostClassifier
+    'iterations': hp.choice('iterations_catboost', [5, 10]),
+    'depth': hp.choice('depth_catboost', range(4, 11)),
+    'learning_rate': hp.loguniform('learning_rate_catboost', low=np.log(1e-3),
+                                   high=np.log(1)),
+    'loss_function': hp.choice('loss_function_catboost',
+                               ['Logloss', 'CrossEntropy']),
+    'verbose': True,  # fixed value, not searched
+    'leaf_estimation_iterations': 10,  # fixed value, not searched
+    'l2_leaf_reg': hp.choice('l2_leaf_reg_catboost', np.logspace(-20, -19, 3))
+}  # NOTE(review): l2_leaf_reg candidates span 1e-20..1e-19 — confirm intended
+
+EXTRA_TREES_REGRESSOR_PARAMS = {  # space paired with ExtraTreesRegressor
+    'n_estimators': hp.choice('n_estimators_extra_trees', [50, 100, 200]),
+    'criterion': hp.choice('criterion_extra_trees', ['mse', 'friedman_mse', 'mae']),
+    'max_features': hp.uniform('max_features_extra_trees', 0, 1.0),
+    'min_samples_leaf': hp.choice('min_samples_leaf_extra_trees', range(1, 20)),
+    'min_samples_split': hp.choice('min_samples_split_extra_trees', range(2, 20)),
+    'min_impurity_decrease': 0.0,  # fixed value, not searched
+    'bootstrap': hp.choice('bootstrap_extra_trees', [True, False]),
+}  # NOTE(review): 'mse'/'mae' names depend on the scikit-learn version — confirm
+
+RIDGE_REGRESSOR_PARAMS = {  # Ridge space; loguniform bounds are in log space
+    'fit_intercept': True,
+    'tol': hp.loguniform('tol_ridge', np.log(1e-5), np.log(1e-1)),
+    'alpha': hp.loguniform('alpha_ridge', np.log(1e-5), np.log(10))
+}
+
+RANDOM_FOREST_REGRESSOR_PARAMS = {  # space paired with RandomForestRegressor
+    'criterion': hp.choice('criterion_rf', ['mse', 'friedman_mse', 'mae']),
+    'max_features': hp.uniform('max_features_rf', 0.1, 1.0),
+    'n_estimators': hp.choice('n_estimators_rf', [50, 100, 200]),
+    'min_samples_leaf': hp.choice('min_samples_leaf_rf', range(1, 10)),
+    'min_samples_split': hp.choice('min_samples_split_rf', range(2, 10)),
+    'bootstrap': hp.choice('bootstrap_rf', [True, False]),
+}
+
+LGBM_REGRESSOR_PARAMS = {  # hyperopt search space paired with LGBMRegressor
+    'boosting_type': 'gbdt',  # this and the next three entries are fixed
+    'min_split_gain': 0.1,
+    'subsample': 0.8,
+    'num_leaves': 80,
+    'colsample_bytree': hp.uniform('colsample_bytree_lgbm', 0.4, 0.8),
+    'min_child_weight': hp.choice('min_child_weight_lgbm', range(1, 100)),
+    'max_depth': hp.choice('max_depth_lgbm', range(5, 10)),
+    'n_estimators': hp.choice('n_estimators_lgbm', range(50, 200)),
+    'learning_rate': hp.loguniform('learning_rate_lgbm', low=np.log(1e-5),
+                                   high=np.log(1)),
+}
+
+ADABOOST_REGRESSOR_PARAMS = {  # space paired with AdaBoostRegressor
+    'loss': hp.choice('loss_adaboost', ["linear", "square", "exponential"]),
+    'n_estimators': hp.choice('n_estimators_adaboost', range(50, 300)),
+    'learning_rate': hp.loguniform('learning_rate_adaboost', low=np.log(1e-2),
+                                   high=np.log(2)),
+    # 'max_depth': hp.choice('max_depth_adaboost', range(1, 11)),
+}
+
+CATBOOST_REGRESSOR_PARAMS = {  # space paired with CatBoostRegressor
+    'iterations': 2,  # fixed value, not searched
+    'depth': hp.choice('depth_catboost', range(4, 10)),  # the only searched entry
+    'learning_rate': 1,  # fixed value, not searched
+    'loss_function': 'RMSE',
+    'verbose': True
+}
+
+REGRESSION_HPARAM_SPACE = {  # model name -> estimator class + its search space
+    'extratree': {
+        'model': ExtraTreesRegressor,
+        'param': EXTRA_TREES_REGRESSOR_PARAMS
+    },
+    'ridge': {
+        'model': Ridge,
+        'param': RIDGE_REGRESSOR_PARAMS
+    },
+    'random_forest': {
+        'model': RandomForestRegressor,
+        'param': RANDOM_FOREST_REGRESSOR_PARAMS
+    },
+    'lgbm': {
+        'model': LGBMRegressor,
+        'param': LGBM_REGRESSOR_PARAMS
+    },
+    'adaboost': {
+        'model': AdaBoostRegressor,
+        'param': ADABOOST_REGRESSOR_PARAMS
+    },
+    'catboost': {
+        'model': CatBoostRegressor,
+        'param': CATBOOST_REGRESSOR_PARAMS
+    }
+}
+
+CLASSIFICATION_HPARAM_SPACE = {  # model name -> estimator class + search space
+    'knn': {
+        'model': KNeighborsClassifier,
+        'param': KNN_CLASSIFIER_PARAMS
+    },
+    'svm': {
+        'model': SVC,
+        'param': SVM_CLASSIFIER_PARAMS
+    },
+    'random_forest': {
+        'model': RandomForestClassifier,
+        'param': RANDOM_FOREST_CLASSIFIER_PARAMS
+    },
+    'lgbm': {
+        'model': LGBMClassifier,
+        'param': LGBM_CLASSIFIER_PARAMS
+    },
+    'adaboost': {
+        'model': AdaBoostClassifier,
+        'param': ADABOOST_CLASSIFIER_PARAMS
+    },
+    'catboost': {  # not listed in Config.classification_models
+        'model': CatBoostClassifier,
+        'param': CATBOOST_CLASSIFIER_PARAMS
+    }
+}
+
+CLASSIFICATION_BASE_HPARAM_SPACE = {  # same classifiers, empty parameter dicts
+    'knn': {
+        'model': KNeighborsClassifier,
+        'param': {}
+    },
+    'svm': {
+        'model': SVC,
+        'param': {}
+    },
+    'random_forest': {
+        'model': RandomForestClassifier,
+        'param': {}
+    },
+    'lgbm': {  # entry used by StackedEnsemblingModel for classification
+        'model': LGBMClassifier,
+        'param': {}
+    },
+    'adaboost': {
+        'model': AdaBoostClassifier,
+        'param': {}
+    },
+    'catboost': {
+        'model': CatBoostClassifier,
+        'param': {}
+    }
+}
+
+REGRESSION_BASE_HPARAM_SPACE = {  # same regressors, empty parameter dicts
+    'extratree': {
+        'model': ExtraTreesRegressor,
+        'param': {}
+    },
+    'ridge': {
+        'model': Ridge,
+        'param': {}
+    },
+    'random_forest': {
+        'model': RandomForestRegressor,
+        'param': {}
+    },
+    'lgbm': {  # entry used by StackedEnsemblingModel for regression
+        'model': LGBMRegressor,
+        'param': {}
+    },
+    'adaboost': {
+        'model': AdaBoostRegressor,
+        'param': {}
+    },
+    'catboost': {
+        'model': CatBoostRegressor,
+        'param': {}
+    }
+}
+
+REGRESSION_PREP_HPARAM_SPACE = {  # keys match those read by Preprocessor
+    'cat_encoding': hp.choice('cat_enc',
+                              ['count', 'target+count', 'target+label', 'label']),
+    'scaling': hp.choice('scaling', [True, False]),
+    'log_transform': hp.choice('log_transform', [True, False]),
+    'power_transform': hp.choice('power_transform', [True, False]),
+    'pca': hp.choice('pca', [True, False]),
+    'binning': hp.choice('binning', [True, False]),
+    'add_time_offset': hp.choice('add_time_offset', [True, False]),
+    'add_time_diff': hp.choice('add_time_diff', [True, False]),
+    # Disabled: cross-column (higher order) feature strategies.
+    # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max',
+    # 'min', None]),
+    # 'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]),
+    'imputation_strategy': hp.choice('imputation_strategy',
+                                     ['most_frequent', 'zero']),
+    'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01),
+    'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01)
+}
+
+CLASSIFICATION_PREP_HPARAM_SPACE = {  # keys match those read by Preprocessor
+    'cat_encoding': hp.choice('cat_enc',
+                              ['target', 'count', 'target+count', 'target+label']),
+    'scaling': hp.choice('scaling', [True, False]),
+    'log_transform': hp.choice('log_transform', [True, False]),
+    'power_transform': hp.choice('power_transform', [True, False]),
+    'pca': hp.choice('pca', [True, False]),
+    'binning': hp.choice('binning', [True, False]),
+    'add_time_offset': hp.choice('add_time_offset', [True, False]),
+    'add_time_diff': hp.choice('add_time_diff', [True, False]),
+    # Disabled: cross-column (higher order) feature strategies.
+    # 'cat_num_strategy': hp.choice('cat_num_strategy', ['mean', 'std', 'max',
+    # 'min', None]),
+    # 'cat_cat_strategy': hp.choice('cat_cat_strategy', ['count', 'nunique', None]),
+    'imputation_strategy': hp.choice('imputation_strategy',
+                                     ['most_frequent', 'zero']),
+    'pearson_thresh': hp.uniform('pearson_thresh', 0.001, 0.01),
+    'feat_importance_thresh': hp.uniform('feat_importance_thresh', 0.001, 0.01)
+}
diff --git a/autokaggle/ensemblers.py b/autokaggle/ensemblers.py
new file mode 100644
index 0000000..95d0435
--- /dev/null
+++ b/autokaggle/ensemblers.py
@@ -0,0 +1,208 @@
+from sklearn.base import BaseEstimator
+from autokaggle.utils import rand_temp_folder_generator, ensure_dir, write_json, \
+    read_json
+from abc import abstractmethod
+import numpy as np
+import os
+import random
+import json
+from statistics import mode
+
+from sklearn.model_selection import RandomizedSearchCV, train_test_split
+from sklearn.model_selection import StratifiedKFold, KFold
+from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error
+from joblib import dump, load
+from scipy import stats
+from lightgbm import LGBMClassifier, LGBMRegressor
+import collections
+from sklearn.model_selection import RandomizedSearchCV, cross_val_score
+import hyperopt
+from hyperopt import tpe, hp, fmin, space_eval, Trials, STATUS_OK
+from autokaggle.config import REGRESSION_HPARAM_SPACE, CLASSIFICATION_HPARAM_SPACE, \
+    CLASSIFICATION_BASE_HPARAM_SPACE, \
+    REGRESSION_BASE_HPARAM_SPACE
+
+
+class EnsemblingModel:
+    """ Common interface for estimators that combine several base estimators.
+
+        Holds the list of base estimators together with the shared
+        configuration. Subclasses supply fit() and predict(), which are meant
+        to be called the way scikit-learn estimators are.
+
+        # Arguments
+            estimator_list: List. The estimators the ensemble is built from.
+            config: Config. Defines the configuration of various components of
+            the autoML pipeline.
+
+    """
+
+    def __init__(self, estimator_list, config):
+        # Stored as given; nothing is copied or validated here.
+        self.config, self.estimator_list = config, estimator_list
+
+    @abstractmethod
+    def fit(self, x, y):
+        """ Train the ensemble on the training data (placeholder here).
+        # Arguments
+            x: A numpy array instance containing the training data.
+            y: The training label vector.
+        This base implementation does nothing and returns None.
+        """
+        return None
+
+    @abstractmethod
+    def predict(self, x):
+        """ Predict with the ensemble on the given data (placeholder here).
+        # Arguments
+            x: A numpy array instance containing the test data.
+        # Returns
+            Subclasses return the ensemble's predictions for x.
+        This base implementation does nothing and returns None;
+        it only defines the signature.
+        """
+        return None
+
+
+class RankedEnsemblingModel(EnsemblingModel):
+    """ Implements ensembling using ranking based methods.
+
+        Combines the per-estimator predictions row-wise according to
+        `config.ensemble_method`: 'mean', 'median', 'max', 'min' or 'max_voting'.
+    """
+
+    def fit(self, x, y):
+        """ Fit every estimator in the list on the full training data. """
+        for est in self.estimator_list:
+            est.fit(x, y)
+
+    def predict(self, x):
+        """ Aggregate predictions per sample; ValueError on unknown method. """
+        predictions = np.zeros((len(x), len(self.estimator_list)))
+        for i, est in enumerate(self.estimator_list):
+            predictions[:, i] = est.predict(x)
+        method = self.config.ensemble_method
+        if method == 'max_voting':
+            # Flatten: some SciPy versions return the mode with a kept axis.
+            return np.ravel(stats.mode(predictions, axis=1)[0])
+        reducers = {'median': np.median, 'mean': np.mean,
+                    'max': np.max, 'min': np.min}
+        if method not in reducers:
+            raise ValueError("Invalid ensemble_method: %r" % (method,))
+        return reducers[method](predictions, axis=1)
+
+
+class StackedEnsemblingModel(EnsemblingModel):
+    """ Implements a stacking based ensembling estimator.
+
+        This class creates an ensembling estimator using stacking. It trains a
+        LightGBM model on the predictions of the base estimators.
+
+        # Arguments
+            stacking_estimator: LightGBM estimator. Meta-learning algorithm for the
+            stacking estimator.
+    """
+
+    def __init__(self, estimator_list, config):
+        super().__init__(estimator_list, config)
+        self.stacking_estimator = None
+
+        if self.config.objective == 'regression':
+            self.hparams = hp.choice('regressor',
+                                     [REGRESSION_BASE_HPARAM_SPACE['lgbm']])
+            self.config.stack_probabilities = False  # note: mutates shared config
+        else:
+            self.hparams = hp.choice('classifier',
+                                     [CLASSIFICATION_BASE_HPARAM_SPACE['lgbm']])
+
+    def get_model_predictions(self, X):
+        """ Generate the combined predictions from the list of the estimators.
+        # Arguments
+            X: A numpy array instance containing the training/test data.
+        # Returns
+            A numpy array for the predictions of all the estimators in the list.
+        """
+        if self.config.stack_probabilities:
+            predictions = np.zeros((len(X), 1))  # placeholder column, cut below
+            for est in self.estimator_list:
+                try:
+                    new = est.predict_proba(X)[:, :-1]  # last class is dropped
+                    predictions = np.hstack([predictions, new])
+                except AttributeError:
+                    new = np.reshape(est.predict(X), (-1, 1))
+                    predictions = np.hstack([predictions, new])
+            predictions = predictions[:, 1:]
+        else:
+            predictions = np.zeros((len(X), len(self.estimator_list)))
+            for i, est in enumerate(self.estimator_list):
+                predictions[:, i] = est.predict(X)
+        return predictions
+
+    def fit(self, x, y):
+        x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2)
+        for est in self.estimator_list:  # base models see the 80% split only
+            est.fit(x_train, y_train)
+        predictions = self.get_model_predictions(x_val)  # held-out 20%
+        self.stacking_estimator = self.search(predictions, y_val)
+        self.stacking_estimator.fit(predictions, y_val)
+
+    def search(self, x, y):
+        """ Search function to find best hyper-param setting for the stacking model.
+        # Arguments
+            x, y: Base-estimator predictions and the labels they are scored on.
+        # Returns
+            An unfitted estimator built with the best setting that was found.
+        """
+        score_metric, skf = self.get_skf(self.config.cv_folds)
+
+        def objective_func(args):
+            clf = args['model'](**args['param'])
+            try:
+                eval_score = cross_val_score(clf, x, y, scoring=score_metric,
+                                             cv=skf).mean()
+            except ValueError:
+                eval_score = 0  # a failed CV run is scored as 0
+            if self.config.verbose:
+                print("Ensembling CV Score:", eval_score)
+                print("\n=================")
+            return {'loss': 1 - eval_score, 'status': STATUS_OK, 'space': args}
+
+        trials = Trials()
+        best = fmin(objective_func, self.hparams, algo=self.config.search_algo,
+                    trials=trials,
+                    max_evals=self.config.ensembling_search_iter,
+                    rstate=np.random.RandomState(self.config.random_state))
+
+        opt = space_eval(self.hparams, best)
+        best_estimator_ = opt['model'](**opt['param'])
+        if self.config.verbose:
+            print("The best hyperparameter setting found for stacking:")
+            print(opt)
+        return best_estimator_
+
+    def predict(self, x):
+        predictions = self.get_model_predictions(x)
+        return self.stacking_estimator.predict(predictions)
+
+    def get_skf(self, folds):
+        """ Get scoring metric and cross validation folds for the task type
+        # Arguments
+            folds: Number of cross validation folds
+        # Returns
+            Scoring metric and CV folds; ValueError on an unknown objective.
+        """
+        if self.config.objective == 'binary':
+            score_metric = 'roc_auc'
+            skf = StratifiedKFold(n_splits=folds, shuffle=True,
+                                  random_state=self.config.random_state)
+        elif self.config.objective == 'multiclass':
+            score_metric = 'f1_weighted'
+            skf = StratifiedKFold(n_splits=folds, shuffle=True,
+                                  random_state=self.config.random_state)
+        elif self.config.objective == 'regression':
+            score_metric = 'neg_mean_squared_error'
+            skf = KFold(n_splits=folds, shuffle=True,
+                        random_state=self.config.random_state)
+        else:
+            raise ValueError("Invalid objective")
+        return score_metric, skf
diff --git a/autokaggle/hparam_space/knn_hp.json b/autokaggle/hparam_space/knn_hp.json
new file mode 100644
index 0000000..609ff31
--- /dev/null
+++ b/autokaggle/hparam_space/knn_hp.json
@@ -0,0 +1 @@
+{"n_neighbors": [2, 4, 6, 8, 10, 12, 14, 16, 18, 20], "weights": ["uniform", "distance"], "algorithm": ["ball_tree", "kd_tree", "brute"], "leaf_size": [5, 10, 15, 20, 25, 30, 35, 40, 45, 50], "metric": ["minkowski", "euclidean", "manhattan", "chebyshev"], "p": [1, 2, 3]}
\ No newline at end of file
diff --git a/autokaggle/hparam_space/lgbm_hp.json b/autokaggle/hparam_space/lgbm_hp.json
new file mode 100644
index 0000000..b2f6311
--- /dev/null
+++ b/autokaggle/hparam_space/lgbm_hp.json
@@ -0,0 +1 @@
+[{"boosting_type": ["gbdt"], "min_child_weight": [5, 10, 30, 50, 60, 80, 100], "min_split_gain": [0.1], "subsample": [0.8], "colsample_bytree": [0.6, 0.7], "max_depth": [5, 8, 10], "n_estimators": [50], "num_leaves": [80], "learning_rate": [0.3]}, {"learning_rate": [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12], "n_estimators": [100, 150, 200]}]
\ No newline at end of file
diff --git a/autokaggle/hparam_space/rf_hp.json b/autokaggle/hparam_space/rf_hp.json
new file mode 100644
index 0000000..c23c577
--- /dev/null
+++ b/autokaggle/hparam_space/rf_hp.json
@@ -0,0 +1 @@
+[{"criterion": ["entropy", "gini"], "max_features": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], "n_estimators": [300], "min_samples_leaf": [1]}, {"min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}, {"n_estimators": [50, 100, 150, 200]}]
\ No newline at end of file
diff --git a/autokaggle/hparam_space/svm_hp.json b/autokaggle/hparam_space/svm_hp.json
new file mode 100644
index 0000000..8642e7f
--- /dev/null
+++ b/autokaggle/hparam_space/svm_hp.json
@@ -0,0 +1 @@
+{"C": [0.001, 0.1, 1, 10, 100, 1000, 10000], "gamma": [1e-05, 100000.0], "kernel": ["rbf", "poly", "linear", "sigmoid"], "degree": [2, 3, 4, 5], "max_iter": [50000]}
\ No newline at end of file
diff --git a/autokaggle/preprocessor.py b/autokaggle/preprocessor.py
new file mode 100644
index 0000000..f5ba361
--- /dev/null
+++ b/autokaggle/preprocessor.py
@@ -0,0 +1,1296 @@
+import numpy as np
+import pandas as pd
+import scipy
+import itertools
+from scipy.stats import pearsonr
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler, PowerTransformer, \
+    KBinsDiscretizer, OneHotEncoder
+from sklearn.base import TransformerMixin
+from sklearn.base import BaseEstimator
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from abc import abstractmethod
+import collections
+from lightgbm import LGBMClassifier, LGBMRegressor
+
+LEVEL_HIGH = 32
+
+
+class Preprocessor(TransformerMixin):
+    """ Implements basic preprocessing and feature engineering class.
+
+        Preprocessor takes care of the basic preprocessing and feature engineering of
+        the input data. Similar to scikit-learn transformers, it implements
+        the fit() and transform() methods. To achieve this, it applies various
+        feature primitives in sequence using a scikit-learn Pipeline.
+        # Arguments
+            config: Config. Defines the configuration of various components of the
+            AutoML pipeline.
+            params: Dict. Hyper-parameter search space for preprocessor.
+            pipeline: Pipeline. Sci-kit learn pipeline class to apply the feature
+            primitives in sequence
+    """
+
+    def __init__(self, config, params):
+        # The pipeline stays unset until fit() assembles it.
+        self.config, self.params = config, params
+        self.pipeline = None
+
+    def fit(self, raw_x, y):
+        """ Build the primitive pipeline from `self.params` and fit it.
+        # Arguments
+            raw_x: A numpy array instance containing the raw training data.
+            y: A numpy array instance containing training label vector.
+        # Returns
+            self, with the fitted pipeline stored in `self.pipeline`.
+        """
+        tabular = TabularData(raw_x, self.config.data_info, self.config.verbose)
+
+        # Stage order matters: each builder's steps follow the previous ones.
+        builders = (
+            self.get_imputation_pipeline,
+            self.get_higher_order_pipeline,
+            self.get_categorical_pipeline,
+            self.get_numerical_pipeline,
+            self.get_time_pipeline,
+            self.get_filtering_pipeline,
+        )
+        stages = [step for build in builders for step in build(self.params)]
+        self.pipeline = Pipeline(stages)
+        self.pipeline.fit(tabular, y)
+        return self
+
+    def transform(self, raw_x):
+        """ Apply the previously built preprocessing pipeline to the given data.
+        # Arguments
+            raw_x: a numpy array instance containing the training/testing data
+        # Returns
+            The `.values` of the `.X` attribute of the pipeline output.
+        The input is first wrapped in a TabularData built from the config's
+        `data_info` and `verbose` settings.
+        """
+        # Get Meta-Feature
+        tabular = TabularData(raw_x, self.config.data_info, self.config.verbose)
+        transformed = self.pipeline.transform(tabular)
+        return transformed.X.values
+
+    @staticmethod
+    def get_categorical_pipeline(params):
+        """ Build the encoder steps applied to the categorical columns.
+        # Arguments
+            params: Hyper-parameter setting; only 'cat_encoding' is read.
+        # Returns
+            List of (name, primitive) steps; ValueError on an unknown encoding.
+        """
+        def target(operation):
+            return ('target_encoder',
+                    TargetEncoder(operation=operation, selected_type='CAT'))
+
+        def label():
+            return ('label_encoder',
+                    LabelEncode(operation='upd', selected_type='CAT'))
+
+        def count():
+            return ('count_encoder',
+                    CatCount(operation='upd', selected_type='CAT'))
+
+        def one_hot():
+            return ('one_hot_encoder',
+                    OneHot(operation='upd', selected_type='CAT'))
+
+        # Checked in order with ==, mirroring an if/elif chain.
+        recipes = (
+            ('target', lambda: [target('upd')]),
+            ('label', lambda: [label()]),
+            ('count', lambda: [count()]),
+            ('target+count', lambda: [target('add'), count()]),
+            ('one_hot', lambda: [one_hot()]),
+            ('target+label', lambda: [target('add'), label()]),
+        )
+        choice = params.get('cat_encoding', 'target')
+        for name, build in recipes:
+            if choice == name:
+                return build()
+        raise ValueError
+
+    @staticmethod
+    def get_numerical_pipeline(params):
+        """ Build the steps applied to the numerical columns.
+        # Arguments
+            params: Hyper-parameter setting for the preprocessors. The flags
+            'scaling' (default True), 'log_transform', 'power_transform',
+            'pca' and 'binning' (default False) each switch one step on.
+        # Returns
+            List of (name, primitive) steps, in the order the flags are
+            listed above.
+        """
+        # One row per optional step:
+        # (flag in params, default, step name, primitive class, operation)
+        recipe = (
+            ('scaling', True,
+             'scaler', TabScaler, 'upd'),
+            ('log_transform', False,
+             'log_transform', LogTransform, 'upd'),
+            ('power_transform', False,
+             'boxcox', BoxCox, 'upd'),
+            ('pca', False,
+             'pca', TabPCA, 'add'),
+            ('binning', False,
+             'binning', Binning, 'add'),
+        )
+
+        numeric_pipeline = [
+            (step, primitive(operation=operation, selected_type='NUM'))
+            for flag, default, step, primitive, operation in recipe
+            if params.get(flag, default)
+        ]
+        return numeric_pipeline
+
+    def get_filtering_pipeline(self, params):
+        """ Build the steps that drop uninformative features.
+        # Arguments
+            params: Hyper-parameter setting for the preprocessors; reads
+            'pearson_thresh' and 'feat_importance_thresh' (both default 0).
+        # Returns
+            List of (name, primitive) steps. The constant filter is always
+            present; each threshold-based filter is added only when its
+            threshold is greater than zero.
+        """
+        corr_cutoff = params.get('pearson_thresh', 0)
+        importance_cutoff = params.get('feat_importance_thresh', 0)
+
+        steps = [('filter',
+                  FilterConstant(operation='del', selected_type='ALL'))]
+        if corr_cutoff > 0:
+            steps.append(('pearson_corr', FeatureFilter(
+                operation='del', selected_type='ALL', threshold=corr_cutoff)))
+        if importance_cutoff > 0:
+            steps.append(('lgbm_feat_selection', FeatureImportance(
+                operation='del', selected_type='ALL',
+                threshold=importance_cutoff,
+                task_type=self.config.objective)))
+        return steps
+
+    @staticmethod
+    def get_time_pipeline(params):
+        """ Generate pipeline of primitives for time features.
+        # Arguments
+            params: Hyper-parameter setting for the preprocessors.
+        # Returns
+            List of primitives to be applied (based on the given setting)
+        """
+        add_offset = params.get('add_time_offset', False)
+        add_diff = params.get('add_time_diff', False)
+        time_pipeline = []
+        if add_offset:
+            time_pipeline.append(
+                ('time_offset', TimeOffset(operation='upd', selected_type='TIME')))
+        if add_diff:
+            time_pipeline.append(
+                ('time_diff', TimeDiff(operation='add', selected_type='TIME')))
+        return time_pipeline
+
+    @staticmethod
+    def get_imputation_pipeline(params):
+        """ Generate pipeline of primitives to impute the missing values.
+        # Arguments
+            params: Hyper-parameter setting for the preprocessors.
+        # Returns
+            List of primitives to be applied (based on the given setting)
+        """
+        strategy = params.get('imputation_strategy', 'most_frequent')
+        impute_pipeline = [('imputer',
+                            Imputation(operation='upd', selected_type='ALL',
+                                       strategy=strategy))]
+        return impute_pipeline
+
+    @staticmethod
+    def get_higher_order_pipeline(params):
+        """ Generate pipeline of primitives to generate cross-column features.
+        # Arguments
+            params: Hyper-parameter setting for the preprocessors.
+        # Returns
+            List of primitives to be applied (based on the given setting)
+        """
+        cat_num_strategy = params.get('cat_num_strategy', None)
+        cat_cat_strategy = params.get('cat_cat_strategy', None)
+        pipeline = []
+        if cat_num_strategy:
+            pipeline.append(('cat_num_encoder',
+                             CatNumEncoder(operation='add', selected_type1='CAT',
+                                           selected_type2='NUM',
+                                           strategy=cat_num_strategy)))
+        if cat_cat_strategy:
+            pipeline.append(('cat_cat_encoder',
+                             CatCatEncoder(operation='add', selected_type1='CAT',
+                                           selected_type2='CAT',
+                                           strategy=cat_cat_strategy)))
+        return pipeline
+
+
+class TabularData:
+    """ Represents the data and its meta-info.
+
+        TabularData includes the training/testing data along with its meta info such
+        as data types, cardinality etc. The user can update the data and its meta
+        info as well as select the features matching the criteria.
+        # Arguments
+            verbose: Bool. Determines the verbosity of the logging.
+            data_info: Dict. Dictionary mapping the feature names to their data_types
+            total_samples: Int. Number of samples in the data
+            cat_col: List. List of the categorical features
+            num_col: List. List of the numerical features
+            time_col: List. List of the time features
+            n_cat: Int. Number of categorical features
+            n_num: Int. Number of numerical features
+            n_time: Int. Number of time features
+            cat_cardinality: Dict. Dictionary mapping categorical feature names of
+            their cardinality (no. of unique values)
+            generated_features: List. List of the newly added features. (In
+            addition to the pre-existing columns)
+            num_info: Dict. Dictionary mapping numeircal column to their meta info
+            such as range, std etc.
+    """
+
+    def __init__(self, raw_x, data_info, verbose=True):
+        self.cat_col = None
+        self.num_col = None
+        self.time_col = None
+        self.n_cat = 0
+        self.n_time = 0
+        self.n_num = 0
+        self.cat_cardinality = None
+        self.generated_features = None
+        self.num_info = None
+        self.verbose = verbose
+        self.data_info = {str(i): data_info[i] for i in range(len(data_info))}
+        self.total_samples = raw_x.shape[0]
+        self.refresh_col_types()
+
+        # Convert sparse to dense if needed
+        raw_x = raw_x.toarray() if type(
+            raw_x) == scipy.sparse.csr.csr_matrix else raw_x
+
+        # To pandas Dataframe
+        if type(raw_x) != pd.DataFrame:
+            raw_x = pd.DataFrame(raw_x,
+                                 columns=[str(i) for i in range(raw_x.shape[1])])
+
+        self.X = raw_x
+        # self.update_cat_cardinality()
+
+    def update_type(self, columns, new_type):
+        """ Updates the column datatype.
+        # Arguments
+            column: List of columns whose data_type needs update.
+            new_type: New data_type (either of 'CAT', 'NUM' or 'TIME').
+        # Returns
+            None.
+        This function updates the data types of given list of columns.
+        """
+        for c in columns:
+            self.data_info[c] = new_type
+
+    def delete_type(self, columns):
+        """ Delete the columns from the feature to data_type mapping.
+        # Arguments
+            column: List of columns whose data_type needs update.
+        # Returns
+            None
+        This function removes the selected columns from the data_info dictionary.
+        """
+        for c in columns:
+            _ = self.data_info.pop(c, 0)
+
+    def rename_cols(self, key):
+        """ Provides a rename function to add new columns without collision.
+        # Arguments
+            key: Identifier for renaming
+        # Returns
+            Renaming function which takes current column name and outputs a new
+            unique column name.
+        """
+
+        def rename_fn(col_name):
+            col_name = str(col_name)
+            col_name += '_' + key
+            while col_name in self.X.columns:
+                col_name += '_' + key
+            return col_name
+
+        return rename_fn
+
+    def update(self, operation, columns, x_tr, new_type=None, key=''):
+        """ Updates the TabularData after applying primitive.
+        # Arguments
+            operation: Primitive operation applied ('add', 'update' or 'delete').
+            columns: List of columns affected.
+            x_tr: Transformed (or newly generated) features
+            new_type: Data type of the new column
+            key: Name key for renaming the new columns
+        # Returns
+            None
+        This function takes the transformed (or generated) features after applying
+        the primitive and updates the
+        TabularData.
+        """
+        if operation == 'upd':
+            if x_tr is not None:
+                self.X[columns] = x_tr
+            if new_type is not None:
+                self.update_type(columns, new_type)
+        elif operation == 'add':
+            if x_tr is not None:
+                x_tr = x_tr.rename(columns=self.rename_cols(key))
+                self.X = pd.concat([self.X, x_tr], axis=1)
+                self.update_type(x_tr.columns, new_type)
+        elif operation == 'del':
+            if len(columns) != 0:
+                self.X.drop(columns=columns, inplace=True)
+                self.delete_type(columns)
+        else:
+            print("invalid operation")
+        self.refresh_col_types()
+
+    def refresh_col_types(self):
+        """ Updates the column_types based on the data_info
+        # Arguments
+            None
+        # Returns
+            None
+        This function updates the cat, num and time column lists based on (any)
+        updates in the data_info.
+        """
+        self.cat_col = [k for k, v in self.data_info.items() if v == 'CAT']
+        self.num_col = [k for k, v in self.data_info.items() if v == 'NUM']
+        self.time_col = [k for k, v in self.data_info.items() if v == 'TIME']
+        self.n_time = len(self.time_col)
+        self.n_num = len(self.num_col)
+        self.n_cat = len(self.cat_col)
+
+    def update_cat_cardinality(self):
+        """ Update categorical cardinality mapping for all categorical columns.
+        # Arguments
+            None
+        # Returns
+            None
+        """
+        # TODO: too slow make it faster
+        if not self.cat_cardinality:
+            self.cat_cardinality = {}
+        for c in self.cat_col:
+            self.cat_cardinality[c] = len(set(self.X[c]))
+
+    def select_columns(self, data_type):
+        """ Returns all the columns matching the input data_type
+        # Arguments
+            data_type: Required type of the data (either of 'CAT', 'NUM', 'TIME' or
+            'ALL')
+        # Returns
+            List of the feature columns matching the input criteria.
+        """
+        self.refresh_col_types()
+        if data_type == 'CAT':
+            return self.cat_col
+        elif data_type == 'TIME':
+            return self.time_col
+        elif data_type == 'NUM':
+            return self.num_col
+        elif data_type == 'ALL':
+            return list(self.data_info.keys())
+        else:
+            print('invalid Type')
+            return []
+
+
+class Primitive(BaseEstimator, TransformerMixin):
+    """ Base class for the single order data transformation function.
+
+        Primitive learns and applies the data transformation on a given set of
+        features. The user can use fit() and transform() functions to apply these
+        transformations.
+
+        # Arguments
+            options: Dict. Special arguments specific to the given primitive.
+            selected_type: 'String'. Specifies the type of features the
+            transformation is supposed to be applied to.
+            operation: 'String'. Specifies the type of operation from 'add', 'update'
+             or 'delete'
+            name_key : 'String'. Signature key to rename the column after applying
+            the primitive.
+            selected: 'List'. List of the selected features, on which the
+            transformation will be applied
+            drop_columns: 'List'. List of the features which would be dropped after
+            applying the transformation.
+            supported_ops: Tuple. Specifies the allowed list of operations for this
+            primitive.
+    """
+
+    def __init__(self, operation='upd', selected_type=None, **kwargs):
+        self.options = None
+        self.selected = None
+        self.drop_columns = None
+        self.supported_ops = ('add', 'upd', 'del')
+        self.selected_type = selected_type
+        self.operation = operation
+        self.init_vars(**kwargs)
+        self.name_key = self.__class__.__name__
+
+    def init_vars(self, **kwargs):
+        """ Initialize the primitive specific variables (which are not defined in the
+        base class)
+        # Arguments
+            kwargs: Dictionary containing primitive specific variables
+        # Returns
+            None.
+        """
+        self.options = kwargs
+
+    def fit(self, data, y=None):
+        """ A wrapper function to train the given primitive on the input training
+        data.
+        # Arguments
+            data: A TabularData instance of training data.
+            y: A numpy array of the target values.
+        # Returns
+            None
+        """
+        self.selected = data.select_columns(self.selected_type)
+        if self.operation not in self.supported_ops:
+            print("Operation {} not supported for {}".format(self.operation,
+                                                             self.__class__.__name__)
+                  )
+            self.selected = None
+        if not self.selected:
+            return self
+        return self._fit(data, y)
+
+    def transform(self, data, y=None):
+        """ A wrapper function to generate transformation on the input data based on
+        pre-trained primitive.
+        # Arguments
+            data: Input training/testing data in TabularData form.
+            y: A numpy array of the target values.
+        # Returns
+            A TabularData instance of the transformed data.
+        """
+        if not self.selected:
+            return data
+        return self._transform(data, y)
+
+    @abstractmethod
+    def _fit(self, data, y=None):
+        """ Contains the actual implementation of training the primitive (implemented
+        in the child class)
+        # Arguments
+            data: A TabularData instance of training data.
+            y: A numpy array of the target values.
+        # Returns
+            None
+        """
+        pass
+
+    @abstractmethod
+    def _transform(self, data, y=None):
+        """ Contains the actual implementation of transforming the data using
+        primitive. (implemented in the child class)
+        # Arguments
+            data: Input training/testing data in TabularData form.
+            y: A numpy array of the target values.
+        # Returns
+            A TabularData instance of the transformed data.
+        """
+        pass
+
+
+class PrimitiveHigherOrder:
+    """ Base class for the cross-order data transformation function.
+
+        PrimitiveHigherOrder learns and applies the data transformation across two
+        sets of features. The user can use fit() and transform() functions to
+        apply these transformations.
+
+        # Arguments
+            options: Dict. Special arguments specific to the given primitive.
+            selected_type1: 'String'. Specifies the first type of features the
+            transformation is supposed to be applied to.
+            selected_type2: 'String'. Specifies the second type of features the
+            transformation is supposed to be applied to.
+            operation: 'String'. Specifies the type of operation from 'add', 'update'
+             or 'delete'
+            name_key : 'String'. Signature key to rename the column after applying
+            the primitive.
+            selected_1: 'List'. List of the selected features in the first set, on
+            which the transformation will be
+            applied
+            selected_2: 'List'. List of the selected features in the second set, on
+            which the transformation will be
+            applied
+            drop_columns: 'List'. List of the features which would be dropped after
+            applying the transformation.
+            supported_ops: Tuple. Specifies the allowed list of operations for this
+            primitive.
+    """
+
+    def __init__(self, operation='upd', selected_type1=None, selected_type2=None,
+                 **kwargs):
+        self.options = None
+        self.selected_1 = None
+        self.selected_2 = None
+        self.drop_columns = None
+        self.supported_ops = ('add', 'upd', 'del')
+        self.operation = operation
+        self.selected_type1 = selected_type1
+        self.selected_type2 = selected_type2
+        self.init_vars(**kwargs)
+        self.name_key = self.__class__.__name__
+
+    def init_vars(self, **kwargs):
+        """ Initialize the primitive specific variables (which are not defined in the
+        base class)
+        # Arguments
+            kwargs: Dictionary containing primitive specific variables
+        # Returns
+            None.
+        """
+        self.options = kwargs
+
+    def fit(self, data, y=None):
+        """ A wrapper function to train the given primitive on the input training
+        data.
+        # Arguments
+            data: A TabularData instance of training data.
+            y: A numpy array of the target values.
+        # Returns
+            None
+        """
+        self.selected_1 = data.select_columns(self.selected_type1)
+        self.selected_2 = data.select_columns(self.selected_type2)
+
+        if self.operation not in self.supported_ops:
+            print("Operation {} not supported for {}".format(self.operation,
+                                                             self.__class__.__name__)
+                  )
+            self.selected_1 = None
+            self.selected_2 = None
+        if not self.selected_1 or not self.selected_2:
+            return self
+        return self._fit(data, y)
+
+    def transform(self, data, y=None):
+        """ A wrapper function to generate transformation on the input data based on
+        pre-trained primitive.
+        # Arguments
+            data: Input training/testing data in TabularData form.
+            y: A numpy array of the target values.
+        # Returns
+            A TabularData instance of the transformed data.
+        """
+        if not self.selected_1 or not self.selected_2:
+            return data
+        return self._transform(data, y)
+
+    @abstractmethod
+    def _fit(self, data, y=None):
+        """ Contains the actual implementation of training the primitive (implemented
+        in the child class)
+        # Arguments
+            data: A TabularData instance of training data.
+            y: A numpy array of the target values.
+        # Returns
+            None
+        """
+        pass
+
+    @abstractmethod
+    def _transform(self, data, y=None):
+        """ Contains the actual implementation of transforming the data using
+        primitive. (implemented in the child class)
+        # Arguments
+            data: Input training/testing data in TabularData form.
+            y: A numpy array of the target values.
+        # Returns
+            A TabularData instance of the transformed data.
+        """
+        pass
+
+
+class TabScaler(Primitive):
+    """ Standard Scaler primitive.
+
+        TabScaler scales the selected numerical features to have 0 mean and unit
+        variance.
+
+        # Arguments
+            scaler: StandardScaler. Instance of scikit-learn StandardScaler object
+    """
+    scaler = None
+    supported_ops = ('add', 'upd')
+
+    def _fit(self, data, y=None):
+        self.scaler = StandardScaler()
+        self.scaler.fit(data.X[self.selected], y)
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = self.scaler.transform(data.X[self.selected])
+        data.update(self.operation, self.selected, x_tr, new_type='NUM',
+                    key=self.name_key)
+        return data
+
+
+class BoxCox(Primitive):
+    """ Power Transform primitive.
+
+        The class applies BoxCox power transformation to make the selected features
+        have normal distribution.
+
+        # Arguments
+            transformer: PowerTransformer. Instance of scikit-learn PowerTransformer
+            object
+    """
+    transformer = None
+    supported_ops = ('add', 'upd')
+
+    def _fit(self, data, y=None):
+        self.transformer = PowerTransformer()
+        self.transformer.fit(data.X[self.selected], y)
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = self.transformer.transform(data.X[self.selected])
+        data.update(self.operation, self.selected, x_tr, new_type='NUM',
+                    key=self.name_key)
+        return data
+
+
+class Binning(Primitive):
+    """ Numerical binning primitive.
+
+        The class applies divides the given numeric column in the list of buckets,
+        based on the range of their values.
+
+        # Arguments
+            binner: KBinsDiscretizer. Instance of scikit-learn KBinsDiscretizer
+            object
+            strategy: String. Strategy used to define width of the bins. Possible
+            options are: (‘uniform’, ‘quantile’,
+            ‘kmeans’)
+            encoding: String. Method used to encode the transformed result. Possible
+            options are: (‘onehot’,
+            ‘onehot-dense’, ‘ordinal’)
+    """
+    binner = None
+    strategy = None
+    encoding = None
+    supported_ops = ('add', 'upd')
+
+    def init_vars(self, strategy='quantile', encoding='ordinal'):
+        self.strategy = strategy
+        self.encoding = encoding
+
+    def _fit(self, data, y=None):
+        self.binner = KBinsDiscretizer(strategy=self.strategy, encode=self.encoding)
+        self.binner.fit(data.X[self.selected], y)
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame(self.binner.transform(data.X[self.selected]))
+        data.update(self.operation, self.selected, x_tr, new_type='NUM',
+                    key=self.name_key)
+        return data
+
+
+class OneHot(Primitive):
+    """ One Hot Encoder for categorical features.
+
+        The class applies one hot encoding to categorical features, using the
+        sklearn implementation.
+
+        # Arguments
+            ohe: OneHotEncoder. Instance of scikit-learn OneHotEncoder object
+    """
+    ohe = None
+    supported_ops = ('add', 'upd')
+
+    def _fit(self, data, y=None):
+        self.ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
+        self.ohe.fit(data.X[self.selected], y)
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame(self.ohe.transform(data.X[self.selected]))
+        if self.operation == 'add':
+            data.update(self.operation, self.selected, x_tr, new_type='NUM',
+                        key=self.name_key)
+        elif self.operation == 'upd':
+            data.update('add', self.selected, x_tr, new_type='NUM',
+                        key=self.name_key)
+            data.update('del', self.selected, None, None, key=self.name_key)
+        return data
+
+
+class LabelEncode(Primitive):
+    """ Label Encoder for categorical features.
+
+        The class applies Label Encoding to categorical features, By mapping each
+        category to a numerical value.
+
+        # Arguments
+            cat_to_int_label: Dict. Mapping from categories to their assigned integer
+            value
+            unknown_key_dict: Dict. Mapping for each categorical feature column to
+            the integer value to replace the previously unseen categories
+    """
+    cat_to_int_label = None
+    unknown_key_dict = None
+    supported_ops = ('add', 'upd')
+
+    def _fit(self, data, y=None):
+        self.cat_to_int_label = {}
+        self.unknown_key_dict = {}
+        for col in self.selected:
+            self.cat_to_int_label[col] = {key: idx for idx, key in
+                                          enumerate(set(data.X[col]))}
+            self.unknown_key_dict[col] = len(self.cat_to_int_label[col])
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame()
+        for col in self.selected:
+            x_tr[col] = data.X[col].apply(
+                lambda key: self.cat_to_int_label[col].get(key,
+                                                           self.unknown_key_dict[
+                                                               col]))
+        data.update(self.operation, self.selected, x_tr, new_type='NUM',
+                    key=self.name_key)
+        return data
+
+
+class TargetEncoder(Primitive):
+    """ Target Encoder for categorical features.
+
+        The class applies target encoding to categorical features, By learning
+        the mapping of category to numeric value
+        based on some aggregation of the target value.
+
+        # Arguments
+            target_encoding_map: Dict. Mapping from categories to their assigned
+            numeric value
+    """
+    target_encoding_map = None
+    supported_ops = ('add', 'upd')
+
+    @staticmethod
+    def calc_smooth_mean(df, by, on, alpha=5):
+        """ Calculates the smoothed means on the target value.
+        # Arguments
+            df: Input dataframe
+            by: Groupby column (categorical column)
+            on: Target column
+            alpha: smoothing factor
+        # Returns
+            smoothed mean and the overall mean
+        """
+        # Compute the global mean
+        mean = df[on].mean()
+
+        # Compute the number of values and the mean of each group
+        agg = df.groupby(by)[on].agg(['count', 'mean'])
+        counts = agg['count']
+        means = agg['mean']
+
+        # Compute the "smoothed" means
+        smooth = (counts * means + alpha * mean) / (counts + alpha)
+        return smooth, mean
+
+    def _fit(self, data, y=None):
+        X = data.X
+        self.target_encoding_map = {}
+        X['target'] = y
+        for col in self.selected:
+            self.target_encoding_map[col] = self.calc_smooth_mean(X, col, 'target',
+                                                                  alpha=5)
+        X.drop('target', axis=1, inplace=True)
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame()
+        for col in self.selected:
+            x_tr[col] = data.X[col].map(self.target_encoding_map[col][0],
+                                        self.target_encoding_map[col][1])
+        data.update(self.operation, self.selected, x_tr, new_type='NUM',
+                    key=self.name_key)
+        return data
+
+
+class CatCatEncoder(PrimitiveHigherOrder):
+    """ Cross column feature generator between categorical and categorical columns.
+
+        The class learns a new features based on the values of selected two
+        categorical features.
+
+        # Arguments
+            cat_cat_map: Dict. Mapping from cat-cat combination to the an assigned
+            numeric value
+            strategy: String. Aggregation strategy to learn the mapping between
+            cat-cat combination to numeric value
+    """
+    supported_ops = ('add',)
+    cat_cat_map = None
+    strategy = None
+
+    def init_vars(self, strategy='count'):
+        self.strategy = strategy
+
+    @staticmethod
+    def cat_cat_count(df, col1, col2, strategy='count'):
+        """ Generate mapping for cat-cat combination to the numerical value based on
+        the given strategy.
+        # Arguments
+            col1: First categorical column
+            col2: Second categorical column
+            strategy: Aggregation strategy
+        # Returns
+            Mapping from cat-cat combination to the numeric value..
+        """
+        if strategy == 'count':
+            mapping = df.groupby([col1])[col2].count()
+        elif strategy == 'nunique':
+            mapping = df.groupby([col1])[col2].nunique()
+        else:
+            mapping = df.groupby([col1])[col2].count() // df.groupby([col1])[
+                col2].nunique()
+        return mapping
+
+    def _fit(self, data, y=None):
+        self.cat_cat_map = {}
+        self.selected_1 = list(set(self.selected_1 + self.selected_2))
+        for col1, col2 in itertools.combinations(self.selected_1, 2):
+            self.cat_cat_map[col1 + '_cross_' + col2] = \
+                self.cat_cat_count(data.X,
+                                   col1,
+                                   col2,
+                                   self.strategy)
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame()
+        for col1, col2 in itertools.combinations(self.selected_1, 2):
+            if col1 + '_cross_' + col2 in self.cat_cat_map:
+                x_tr[col1 + '_cross_' + col2] = data.X[col1].map(
+                    self.cat_cat_map[col1 + '_cross_' + col2])
+        data.update(self.operation, self.selected_1, x_tr, new_type='NUM',
+                    key=self.name_key)
+        return data
+
+
+class CatNumEncoder(PrimitiveHigherOrder):
+    """ Cross column feature generator between categorical and numerical columns.
+
+        The class learns a new features based on the values of selected categorical
+        and numerical features.
+
+        # Arguments
+            cat_num_map: Dict. Mapping from cat-num combination to the an assigned
+            numeric value
+            strategy: String. Aggregation strategy to learn the mapping between
+            cat-num combination to numeric value
+    """
+    supported_ops = ('add',)
+    cat_num_map = None
+    strategy = None
+
+    def init_vars(self, strategy='mean'):
+        self.strategy = strategy
+
+    @staticmethod
+    def cat_num_interaction(df, col1, col2, method='mean'):
+        """ Generate mapping for cat-num combination to the numerical value based on
+        the given strategy.
+        # Arguments
+            col1: categorical column
+            col2: numerical column
+            method: Aggregation strategy
+        # Returns
+            Mapping from cat-num combination to the numeric value..
+        """
+        if method == 'mean':
+            mapping = df.groupby([col1])[col2].mean()
+        elif method == 'std':
+            mapping = df.groupby([col1])[col2].std()
+        elif method == 'max':
+            mapping = df.groupby([col1])[col2].max()
+        elif method == 'min':
+            mapping = df.groupby([col1])[col2].min()
+        else:
+            mapping = df.groupby([col1])[col2].mean()
+
+        return mapping
+
+    def _fit(self, data, y=None):
+        self.cat_num_map = {}
+        for col1 in self.selected_1:
+            for col2 in self.selected_2:
+                self.cat_num_map[col1 + '_cross_' + col2] = self.cat_num_interaction(
+                    data.X, col1, col2, self.strategy)
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame()
+        for col1 in self.selected_1:
+            for col2 in self.selected_2:
+                if col1 + '_cross_' + col2 in self.cat_num_map:
+                    x_tr[col1 + '_cross_' + col2] = data.X[col1].map(
+                        self.cat_num_map[col1 + '_cross_' + col2])
+        data.update(self.operation, self.selected_1, x_tr, new_type='NUM',
+                    key=self.name_key)
+        return data
+
+
+class CatBinEncoder(PrimitiveHigherOrder):
+    """ Cross column feature generator between categorical and binary columns.
+
+        The class learns new features by aggregating each selected binary column
+        within the groups of each selected categorical column.
+
+        # Arguments
+            cat_bin_map: Dict. Maps '<cat>_cross_<bin>' to the per-category
+            aggregate learned during fit
+            strategy: String. Aggregation strategy: 'percent_true' (group mean)
+            or 'count' (non-null group count); other values use the group mean
+    """
+    supported_ops = ('add',)
+    cat_bin_map = None
+    strategy = None
+
+    def init_vars(self, strategy='percent_true'):
+        self.strategy = strategy
+
+    @staticmethod
+    def cat_bin_interaction(df, col1, col2, strategy='percent_true'):
+        """ Aggregate col2 within each col1 category using the given strategy.
+        # Arguments
+            df: DataFrame containing both columns
+            col1: Categorical column
+            col2: Binary column
+            strategy: Aggregation strategy
+        # Returns
+            Mapping from each col1 category to the aggregated col2 value.
+        """
+        if strategy == 'percent_true':
+            mapping = df.groupby([col1])[col2].mean()
+        elif strategy == 'count':
+            mapping = df.groupby([col1])[col2].count()
+        else:  # unrecognised strategies fall back to the group mean
+            mapping = df.groupby([col1])[col2].mean()
+        return mapping
+
+    def _fit(self, data, y=None):
+        self.cat_bin_map = {}  # rebuilt from scratch on every fit
+        for col1 in self.selected_1:
+            for col2 in self.selected_2:
+                self.cat_bin_map[col1 + '_cross_' + col2] = self.cat_bin_interaction(
+                    data.X, col1, col2, self.strategy)
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame()
+        for col1 in self.selected_1:
+            for col2 in self.selected_2:
+                if col1 + '_cross_' + col2 in self.cat_bin_map:
+                    x_tr[col1 + '_cross_' + col2] = data.X[col1].map(
+                        self.cat_bin_map[col1 + '_cross_' + col2])
+        data.update(self.operation, self.selected_1, x_tr, new_type='NUM',
+                    key=self.name_key)
+        return data
+
+
+class FilterConstant(Primitive):
+    """ Filters the constant columns.
+
+        The class finds the columns whose maximum equals their minimum and marks
+        them for deletion, so that they are not used by the machine learning estimator.
+    """
+    drop_columns = None
+    supported_ops = ('del',)
+
+    def _fit(self, data, y=None):
+        X = data.X[self.selected]  # only the selected columns are checked
+        self.drop_columns = X.columns[(X.max(axis=0) - X.min(axis=0) == 0)].tolist()
+        return self
+
+    def _transform(self, data, y=None):
+        data.update(self.operation, self.drop_columns, None, new_type=None,
+                    key=self.name_key)
+        return data
+
+
+class TimeDiff(Primitive):
+    """ Adds features based on difference of time values.
+
+        This class generates one feature per unordered pair (a, b) of selected
+        time columns, named '<a>-<b>' and holding the difference a - b.
+    """
+    supported_ops = ('add',)
+
+    def _fit(self, data, y=None):
+        return self  # stateless: nothing is learned from the data
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame()
+        for a, b in itertools.combinations(self.selected, 2):
+            x_tr[a + '-' + b] = data.X[a] - data.X[b]
+        data.update(self.operation, self.selected, x_tr, new_type='TIME',
+                    key=self.name_key)
+        return data
+
+
+class TimeOffset(Primitive):
+    """ Updates the time features in terms of difference from the start value.
+
+        This class re-expresses each selected time feature as its offset from
+        the minimum value of that feature seen during fit.
+
+        # Arguments
+            start_time: Per-column minimum of the selected features, set in fit.
+    """
+    start_time = None
+    supported_ops = ('add', 'upd')
+
+    def _fit(self, data, y=None):
+        self.start_time = data.X[self.selected].min(axis=0)  # one per column
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame()
+        x_tr[self.selected] = data.X[self.selected] - self.start_time
+        data.update(self.operation, self.selected, x_tr, new_type='TIME',
+                    key=self.name_key)
+        return data
+
+
+class TabPCA(Primitive):
+    """ Generates new features by finding PCA of the selected features.
+
+        The class fits a PCA keeping 99% of the variance of the selected features
+        and adds the components, indexed like the input rows, as new features.
+        # Arguments
+            pca: PCA. Scikit-learn PCA instance, fitted in fit.
+    """
+    pca = None
+    supported_ops = ('add',)
+
+    def _fit(self, data, y=None):
+        self.pca = PCA(n_components=0.99, svd_solver='full')
+        self.pca.fit(data.X[self.selected])
+        return self
+
+    def _transform(self, data, y=None):
+        x_pca = self.pca.transform(data.X[self.selected])
+        x_pca = pd.DataFrame(x_pca, index=data.X.index, columns=[
+            'pca_' + str(i) for i in range(x_pca.shape[1])])
+        data.update(self.operation, self.selected, x_pca, new_type='NUM',
+                    key=self.name_key)
+        return data
+
+
+class CatCount(Primitive):
+    """ Count Encoding.
+
+        Replaces the categorical variables by their occurrence count.
+        # Arguments
+            count_dict: Dict. Maps each selected column to a Counter of its
+            categories' occurrence counts in the fitted data.
+            unknown_key: Int. Count used for previously unseen categories.
+    """
+    count_dict = None
+    unknown_key = 0
+    supported_ops = ('add', 'upd')
+
+    def _fit(self, data, y=None):
+        self.count_dict = {}  # rebuilt from scratch on every fit
+        for col in self.selected:
+            self.count_dict[col] = collections.Counter(data.X[col])
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame()
+        for col in self.selected:
+            x_tr[col] = data.X[col].apply(
+                lambda key: self.count_dict[col].get(key, self.unknown_key))
+        data.update(self.operation, self.selected, x_tr, new_type='NUM',
+                    key=self.name_key)
+        return data
+
+
+class LogTransform(Primitive):
+    """ Calculates the log transformation.
+
+        The class calculates the signed log transform of each selected numeric
+        feature. The formula is: sign(x) * log(1 + abs(x))
+    """
+    name_key = 'log_'
+    supported_ops = ('add', 'upd')
+
+    def _fit(self, data, y=None):
+        return self  # stateless: nothing is learned from the data
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame()
+        for col in self.selected:
+            x_tr[self.name_key + col] = np.sign(data.X[col]) * np.log(
+                1 + np.abs(data.X[col]))
+        data.update(self.operation, self.selected, x_tr, new_type='NUM',
+                    key=self.name_key)
+        return data
+
+
+class Imputation(Primitive):
+    """ Fills the missing values of the selected features.
+
+        The class learns one fill value per selected column during fit and uses
+        it to replace the missing entries of that column.
+        # Arguments
+            strategy: String. 'most_frequent' (column mode, 0 if empty) or 'zero'.
+    """
+    impute_dict = None
+    supported_ops = ('add', 'upd')
+    strategy = None
+
+    def init_vars(self, strategy='most_frequent'):
+        self.strategy = strategy
+
+    def _fit(self, data, y=None):
+        self.impute_dict = {}  # column name -> fill value
+        for col in self.selected:
+            if self.strategy == 'most_frequent':
+                value_counts = data.X[col].value_counts()
+                self.impute_dict[
+                    col] = value_counts.idxmax() if not value_counts.empty else 0
+            elif self.strategy == 'zero':
+                self.impute_dict[col] = 0
+            else:
+                raise ValueError  # strategy is neither 'most_frequent' nor 'zero'
+        return self
+
+    def _transform(self, data, y=None):
+        x_tr = pd.DataFrame()
+        for col in self.selected:
+            x_tr[col] = data.X[col].fillna(self.impute_dict[col])
+        data.update(self.operation, self.selected, x_tr, new_type=None,
+                    key=self.name_key)
+        return data
+
+
+class FeatureFilter(Primitive):
+    """ Filters the features based on Pearson Correlation.
+
+        The class removes the features whose absolute Pearson correlation with
+        the target is below the threshold (an undefined correlation counts as 0).
+        # Arguments
+            threshold: Float. Threshold for filtering features.
+    """
+    threshold = None
+    supported_ops = ('del',)
+
+    def init_vars(self, threshold=0.001):
+        if threshold == 0:
+            self.selected = None
+        self.threshold = threshold
+        self.drop_columns = []
+
+    def _fit(self, data, y=None):
+        # Start from an empty list so that refitting does not accumulate columns.
+        self.drop_columns = []
+        for col in self.selected:
+            mu = np.nan_to_num(abs(pearsonr(data.X[col], y)[0]))
+            if mu < self.threshold:
+                self.drop_columns.append(col)
+        return self
+
+    def _transform(self, data, y=None):
+        data.update(self.operation, self.drop_columns, None, new_type=None,
+                    key=self.name_key)
+        return data
+
+
+class FeatureImportance(Primitive):
+    """ Filters the features based on feature importance score.
+
+        The class learns a Light GBM estimator for the given data and, based on the
+        feature importance scores divided by their mean, filters the features with
+        relative importance lower than the threshold.
+        # Arguments
+            threshold: Float. Threshold for filtering features.
+            task_type: 'String'. Specifies the task type amongst: ('classification',
+            'regression')
+    """
+    threshold = None
+    task_type = 'classification'
+    supported_ops = ('del',)
+
+    def init_vars(self, threshold=0.001, task_type='classification'):
+        if threshold == 0:
+            self.selected = None
+        self.threshold = threshold
+        self.drop_columns = []
+        self.task_type = task_type
+
+    def _fit(self, data, y=None):
+        if self.task_type == 'classification':
+            n_classes = len(set(y))  # picks the binary or multiclass objective
+            if n_classes == 2:
+                estimator = LGBMClassifier(silent=False,
+                                           verbose=-1,
+                                           n_jobs=1,
+                                           objective='binary')
+            else:
+                estimator = LGBMClassifier(silent=False,
+                                           verbose=-1,
+                                           n_jobs=1,
+                                           num_class=n_classes,
+                                           objective='multiclass')
+        else:
+            # self.task_type == 'regression'
+            estimator = LGBMRegressor(silent=False,
+                                      verbose=-1,
+                                      n_jobs=1,
+                                      objective='regression')
+        estimator.fit(data.X, y)  # fitted on every column of data.X
+        feature_importance = estimator.feature_importances_
+        feature_importance = feature_importance / feature_importance.mean()
+        self.drop_columns = data.X.columns[
+            np.where(feature_importance < self.threshold)[0]]
+        return self
+
+    def _transform(self, data, y=None):
+        data.update(self.operation, self.drop_columns, None, new_type=None,
+                    key=self.name_key)
+        return data
+
+
+if __name__ == "__main__":
+    # Smoke test: fit and apply the preprocessor on random mixed-type data.
+    ntime, nnum, ncat = 4, 10, 8
+    nsample = 1000
+    x_num = np.random.random([nsample, nnum])
+    x_time = np.random.random([nsample, ntime])
+    x_cat = np.random.randint(0, 10, [nsample, ncat])
+    # Column order must match `datainfo` below: TIME, then NUM, then CAT.
+    x_all = np.concatenate([x_time, x_num, x_cat], axis=1)
+    x_train = x_all[:int(nsample * 0.8), :]
+    x_test = x_all[int(nsample * 0.8):, :]
+
+    y_all = np.random.randint(0, 2, nsample)
+    y_train = y_all[:int(nsample * 0.8)]
+    y_test = y_all[int(nsample * 0.8):]
+
+    datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat)
+    print(x_train[:4, 20])
+    prep = Preprocessor()
+    prep.fit(x_train, y_train, 24 * 60 * 60, datainfo)
+    x_new = prep.transform(x_train)
+    print("-----")
+    print(x_new[:4, 2])
diff --git a/autokaggle/tabular_preprocessor.py b/autokaggle/tabular_preprocessor.py
deleted file mode 100644
index 1bf6d76..0000000
--- a/autokaggle/tabular_preprocessor.py
+++ /dev/null
@@ -1,344 +0,0 @@
-import numpy as np
-from pandas import DataFrame
-from scipy.stats import pearsonr
-
-LEVEL_HIGH = 32
-
-
-def parallel_function(labels, first_batch_keys, task):
-    if task == 'label':
-        if min(labels) > first_batch_keys:
-            labels = labels - np.min(labels)
-        return labels.reshape(labels.shape[0], 1)
-
-    elif task == 'frequency':
-        cat_dict = {}
-        n_rows = labels.shape[0]
-        labels = np.expand_dims(labels, axis=1)
-
-        if min(labels) > first_batch_keys:
-            labels = labels - np.min(labels)
-
-        frequencies = np.zeros((n_rows, 1))
-
-        for row_index in range(n_rows):
-            key = labels[row_index, 0]
-            if key in cat_dict:
-                cat_dict[key] += 1
-            else:
-                cat_dict[key] = 1
-
-        n_level = len(cat_dict)
-        key_to_frequency = {}
-
-        for key in cat_dict.keys():
-            key_to_frequency[key] = cat_dict[key] / n_rows * n_level
-
-        for row_index in range(n_rows):
-            key = labels[row_index, 0]
-            frequencies[row_index][0] = key_to_frequency[key]
-
-        return frequencies
-    elif task == 'num_cat':
-        df = DataFrame(data=labels)
-        return df.join(df.groupby(1)[0].mean(),
-                       rsuffix='r',
-                       on=1).values[:, -1:]
-    elif task == 'cat_cat':
-        df = DataFrame(data=labels)
-        df[3] = list(range(len(labels)))
-        return df.join(df.groupby([0, 1]).count(),
-                       rsuffix='r',
-                       on=(0, 1)).values[:, -1:]
-    elif task == 'train_num_cat':
-        y = first_batch_keys[0]
-        df = DataFrame(data=labels)
-        fe = df.join(df.groupby(1)[0].mean(),
-                     rsuffix='r',
-                     on=1).values[:, -1:]
-        mu = abs(pearsonr(np.squeeze(np.array(fe)), y)[0])
-        if np.isnan(mu):
-            mu = 0
-        return [[first_batch_keys[1], first_batch_keys[2], mu, mu], first_batch_keys[3]]
-
-    elif task == 'train_cat_cat':
-        y = first_batch_keys[0]
-        df = DataFrame(data=labels)
-        df[3] = list(range(len(labels)))
-        fe = df.join(df.groupby([0, 1]).count(),
-                     rsuffix='r',
-                     on=(0, 1)).values[:, -1:]
-        mu = abs(pearsonr(np.squeeze(np.array(fe)), y)[0])
-        if np.isnan(mu):
-            mu = 0
-        return [[first_batch_keys[1], first_batch_keys[2], mu], first_batch_keys[3]]
-    return None
-
-
-def call_parallel(tasks):
-    results = []
-    for t in tasks:
-        results.append(parallel_function(t[0], t[1], t[2]))
-    return results
-
-
-class TabularPreprocessor:
-    def __init__(self):
-        """
-        Initialization function for tabular preprocessor.
-        """
-        self.num_cat_pair = {}
-
-        self.total_samples = 0
-
-        self.cat_to_int_label = {}
-        self.n_first_batch_keys = {}
-        self.high_level_cat_keys = []
-
-        self.feature_add_high_cat = 0
-        self.feature_add_cat_num = 0
-        self.feature_add_cat_cat = 0
-        self.order_num_cat_pair = {}
-
-        self.rest = None
-        self.budget = None
-        self.data_info = None
-        self.n_time = None
-        self.n_num = None
-        self.n_cat = None
-
-    def remove_useless(self, x):
-        self.rest = np.where(np.max(x, 0) - np.min(x, 0) != 0)[0]
-        return x[:, self.rest]
-
-    def process_time(self, x):
-        cols = range(self.n_time)
-        if len(cols) > 10:
-            cols = cols[:10]
-        x_time = x[:, cols]
-        for i in cols:
-            for j in range(i + 1, len(cols)):
-                x = np.append(x, np.expand_dims(x_time[:, i] - x_time[:, j], 1), 1)
-        return x
-
-    def extract_data(self, raw_x):
-        # only get numerical variables
-        ret = np.concatenate([raw_x['TIME'], raw_x['NUM'], raw_x['CAT']], axis=1)
-        n_rows = ret.shape[0]
-        n_num_col = ret.shape[1] - self.n_cat
-
-        n_cat_col = self.n_cat
-        if n_cat_col <= 0:
-            return ret.astype(np.float64)
-
-        # preprocess (multi-value) categorical data
-        for col_index in range(n_num_col, n_num_col + n_cat_col):
-            for row_index in range(n_rows):
-                key = str(ret[row_index, col_index])
-                if key in self.cat_to_int_label[col_index]:
-                    ret[row_index, col_index] = self.cat_to_int_label[col_index][key]
-                    continue
-                new_value = len(self.cat_to_int_label[col_index])
-                self.cat_to_int_label[col_index][key] = new_value
-                ret[row_index, col_index] = new_value
-
-        return ret.astype(np.float64)
-
-    def cat_to_num(self, x, y=None):
-        if y is not None:
-            mark = self.n_time + self.n_num
-
-            for col_index in range(self.n_time + self.n_num, self.n_time + self.n_num + self.n_cat):
-                if self.n_first_batch_keys[col_index] <= LEVEL_HIGH:
-                    self.num_cat_pair[mark] = (col_index,)
-                    mark += 1
-                else:
-                    self.num_cat_pair[mark] = (col_index, col_index)
-                    mark += 1
-
-            mark_1 = 0
-            tasks = []
-            for i, cat_col_index1 in enumerate(self.high_level_cat_keys):
-                for cat_col_index2 in self.high_level_cat_keys[i + 1:]:
-                    tasks.append((x[:, (cat_col_index1, cat_col_index2)],
-                                  [y, cat_col_index1, cat_col_index2, mark_1],
-                                  'train_cat_cat'))
-                    mark_1 += 1
-
-            all_results = call_parallel(tasks)
-
-            num_cat_pair_1 = {}
-            pearsonr_dict_1 = {}
-            for result in all_results:
-                if result[0][-1] > 0.001:
-                    pearsonr_dict_1[result[1]] = result[0][-1]
-                    num_cat_pair_1[result[1]] = result[0]
-            pearsonr_high_1 = sorted(pearsonr_dict_1, key=pearsonr_dict_1.get, reverse=True)[:self.feature_add_cat_cat]
-            num_cat_pair_1 = {key: num_cat_pair_1[key] for key in pearsonr_high_1}
-            num_cat_pair_1 = {i + mark: num_cat_pair_1[key] for i, key in enumerate(num_cat_pair_1)}
-            self.num_cat_pair.update(num_cat_pair_1)
-            mark += len(pearsonr_high_1)
-
-            mark_2 = 0
-            tasks_2 = []
-            for cat_col_index in self.high_level_cat_keys:
-                for num_col_index in range(self.n_time, self.n_time + self.n_num):
-                    tasks_2.append((x[:, (num_col_index, cat_col_index)],
-                                    [y, num_col_index, cat_col_index, mark_2],
-                                    'train_num_cat'))
-                    mark_2 += 1
-
-            all_results = call_parallel(tasks_2)
-
-            num_cat_pair_2 = {}
-            pearsonr_dict_2 = {}
-            for result in all_results:
-                if result[0][-1] > 0.001:
-                    pearsonr_dict_2[result[1]] = result[0][-1]
-                    num_cat_pair_2[result[1]] = result[0]
-            pearsonr_high_2 = sorted(pearsonr_dict_2, key=pearsonr_dict_2.get, reverse=True)[:self.feature_add_cat_num]
-            num_cat_pair_2 = {key: num_cat_pair_2[key] for key in pearsonr_high_2}
-            num_cat_pair_2 = {i + mark: num_cat_pair_2[key] for i, key in enumerate(num_cat_pair_2)}
-            self.num_cat_pair.update(num_cat_pair_2)
-            self.order_num_cat_pair = sorted(list(self.num_cat_pair.keys()))
-            print('num_cat_pair_2:', num_cat_pair_2)
-
-        tasks = []
-        for key in self.order_num_cat_pair:
-            if len(self.num_cat_pair[key]) == 1:
-                (col_index,) = self.num_cat_pair[key]
-                tasks.append((x[:, col_index], self.n_first_batch_keys[col_index], 'label'))
-            if len(self.num_cat_pair[key]) == 2:
-                (col_index, col_index) = self.num_cat_pair[key]
-                tasks.append((x[:, col_index], self.n_first_batch_keys[col_index], 'frequency'))
-            if len(self.num_cat_pair[key]) == 3:
-                (cat_col_index1, cat_col_index2, mu) = self.num_cat_pair[key]
-                tasks.append((x[:, (cat_col_index1,
-                                    cat_col_index2)], self.n_first_batch_keys[cat_col_index1], 'cat_cat'))
-            elif len(self.num_cat_pair[key]) == 4:
-                (num_col_index, cat_col_index, mu, a) = self.num_cat_pair[key]
-                tasks.append((x[:, (num_col_index, cat_col_index)], self.n_first_batch_keys[cat_col_index], 'num_cat'))
-
-        results = call_parallel(tasks)
-        all_num = x.shape[1] - self.n_cat
-        results = [x[:, :all_num]] + results
-        ret = np.concatenate(results, axis=1)
-
-        return ret
-
-    def fit(self, raw_x, y, time_limit, data_info):
-        """
-        This function should train the model parameters.
-
-        Args:
-            raw_x: a numpy.ndarray instance containing the training data.
-            y: training label vector.
-            time_limit: remaining time budget.
-            data_info: meta-features of the dataset, which is an numpy.ndarray describing the
-             feature type of each column in raw_x. The feature type include:
-                     'TIME' for temporal feature, 'NUM' for other numerical feature,
-                     and 'CAT' for categorical feature.
-        """
-        # Get Meta-Feature
-        self.budget = time_limit
-        self.data_info = data_info if data_info is not None else self.extract_data_info(raw_x)
-        print('QQ: {}'.format(self.data_info))
-
-        self.n_time = sum(self.data_info == 'TIME')
-        self.n_num = sum(self.data_info == 'NUM')
-        self.n_cat = sum(self.data_info == 'CAT')
-
-        self.total_samples = raw_x.shape[0]
-
-        print('QQ1: {}'.format(self.n_time))
-        print('QQ2: {}'.format(self.n_num))
-        print('QQ3: {}'.format(self.n_cat))
-        raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'],
-                 'NUM': raw_x[:, self.data_info == 'NUM'],
-                 'CAT': raw_x[:, self.data_info == 'CAT']}
-
-
-        for col_index in range(self.n_num + self.n_time, self.n_num + self.n_time + self.n_cat):
-            self.cat_to_int_label[col_index] = {}
-
-        x = self.extract_data(raw_x)
-
-        d_size = x.shape[0] * x.shape[1] / self.budget
-        if d_size > 35000:
-            self.feature_add_high_cat = 0
-        else:
-            self.feature_add_high_cat = 10
-
-        # Iterate cat features
-        for col_index in range(self.n_num + self.n_time, self.n_num + self.n_time + self.n_cat):
-            self.n_first_batch_keys[col_index] = len(self.cat_to_int_label[col_index])
-        high_level_cat_keys_tmp = sorted(self.n_first_batch_keys, key=self.n_first_batch_keys.get, reverse=True)[
-                                  :self.feature_add_high_cat]
-        for i in high_level_cat_keys_tmp:
-            if self.n_first_batch_keys[i] > 1e2:
-                self.high_level_cat_keys.append(i)
-
-        # Convert NaN to zeros
-        x = np.nan_to_num(x)
-
-        # Encode high-order categorical data to numerical with frequency
-        x = self.cat_to_num(x, y)
-
-        x = self.process_time(x)
-        x = self.remove_useless(x)
-
-        return x
-
-    def encode(self, raw_x, time_limit=None):
-        """
-        This function should train the model parameters.
-
-        Args:
-            raw_x: a numpy.ndarray instance containing the training/testing data.
-            time_limit: remaining time budget.
-        Both inputs X and y are numpy arrays.
-        If fit is called multiple times on incremental data (train, test1, test2, etc.)
-        you should warm-start your training from the pre-trained model. Past data will
-        NOT be available for re-training.
-        """
-        # Get Meta-Feature
-        if time_limit is None:
-            if self.budget is None:
-                time_limit = 24 * 60 * 60
-                self.budget = time_limit
-        else:
-            self.budget = time_limit
-
-        raw_x = {'TIME': raw_x[:, self.data_info == 'TIME'],
-                 'NUM': raw_x[:, self.data_info == 'NUM'],
-                 'CAT': raw_x[:, self.data_info == 'CAT']}
-        x = self.extract_data(raw_x)
-
-        # Convert NaN to zeros
-        x = np.nan_to_num(x)
-
-        # Encode high-order categorical data to numerical with frequency
-        x = self.cat_to_num(x)
-
-        x = self.process_time(x)
-        if self.rest is not None:
-            x = x[:, self.rest]
-        return x
-
-    @staticmethod
-    def extract_data_info(raw_x):
-        """
-        This function extracts the data info automatically based on the type of each feature in raw_x.
-
-        Args:
-            raw_x: a numpy.ndarray instance containing the training data.
-        """
-        data_info = []
-        row_num, col_num = raw_x.shape
-        for col_idx in range(col_num):
-            try:
-                raw_x[:, col_idx].astype(np.float)
-                data_info.append('NUM')
-            except:
-                data_info.append('CAT')
-        return np.array(data_info)
diff --git a/autokaggle/tabular_supervised.py b/autokaggle/tabular_supervised.py
deleted file mode 100644
index 3f74390..0000000
--- a/autokaggle/tabular_supervised.py
+++ /dev/null
@@ -1,256 +0,0 @@
-from abc import abstractmethod
-
-import os
-from lightgbm import LGBMClassifier, LGBMRegressor
-from sklearn.model_selection import RandomizedSearchCV
-from sklearn.model_selection import StratifiedKFold, KFold
-from sklearn.metrics import roc_auc_score, f1_score, mean_squared_error
-import numpy as np
-import random
-
-from autokaggle.tabular_preprocessor import TabularPreprocessor
-from autokaggle.utils import rand_temp_folder_generator, ensure_dir
-
-
-class TabularSupervised:
-    def __init__(self, path=None, verbose=True):
-        """
-        Initialization function for tabular supervised learner.
-        """
-        self.verbose = verbose
-        self.is_trained = False
-        self.clf = None
-        self.objective = None
-        self.tabular_preprocessor = None
-        self.path = path if path is not None else rand_temp_folder_generator()
-        ensure_dir(self.path)
-        if self.verbose:
-            print('Path:', path)
-        self.save_filename = os.path.join(self.path, 'lgbm.txt')
-        self.time_limit = None
-        self.lgbm = None
-
-    def search(self, search_space, search_iter, n_estimators, x, y):
-        if 'n_estimators' in search_space:
-            del search_space['n_estimators']
-        params = {
-            'boosting_type': ['gbdt'],
-            'min_child_weight': [5],
-            'min_split_gain': [1.0],
-            'subsample': [0.8],
-            'colsample_bytree': [0.6],
-            'max_depth': [10],
-            'n_estimators': n_estimators,
-            'num_leaves': [70],
-            'learning_rate': [0.04],
-        }
-        params.update(search_space)
-        if self.verbose:
-            print(params)
-        folds = 3
-        score_metric, skf = self.get_skf(folds)
-
-        random_search = RandomizedSearchCV(self.lgbm, param_distributions=params, n_iter=search_iter,
-                                           scoring=score_metric,
-                                           n_jobs=1, cv=skf, verbose=0, random_state=1001)
-
-        random_search.fit(x, y)
-        self.clf = random_search.best_estimator_
-
-        return random_search.best_params_
-
-    @abstractmethod
-    def get_skf(self, folds):
-        pass
-
-    def fit(self, x, y, time_limit=None, data_info=None):
-        """
-        This function should train the model parameters.
-
-        Args:
-            x: A numpy.ndarray instance containing the training data.
-            y: training label vector.
-            time_limit: remaining time budget.
-            data_info: meta-features of the dataset, which is an numpy.ndarray describing the
-             feature type of each column in raw_x. The feature type include:
-                     'TIME' for temporal feature, 'NUM' for other numerical feature,
-                     and 'CAT' for categorical feature.
-        Both inputs X and y are numpy arrays.
-        If fit is called multiple times on incremental data (train, test1, test2, etc.)
-        you should warm-start your training from the pre-trained model. Past data will
-        NOT be available for re-training.
-        """
-
-        if time_limit is None:
-            time_limit = 24 * 60 * 60
-        self.time_limit = time_limit
-
-        self.init_lgbm(y)
-
-        self.tabular_preprocessor = TabularPreprocessor()
-
-        if x.shape[1] == 0:
-            raise ValueError("No feature exist!")
-
-        x = self.tabular_preprocessor.fit(x, y, self.time_limit, data_info)
-
-        if x.shape[0] > 600:
-            grid_train_percentage = max(600.0 / x.shape[0], 0.1)
-        else:
-            grid_train_percentage = 1
-        grid_n = int(x.shape[0] * grid_train_percentage)
-        idx = random.sample(list(range(x.shape[0])), grid_n)
-
-        grid_train_x = x[idx, :]
-        grid_train_y = y[idx]
-
-        while x.shape[0] < 60:
-            x = np.concatenate([x, x], axis=0)
-            y = np.concatenate([y, y], axis=0)
-
-        response_rate = sum(y) / len(y)
-
-        if not self.is_trained:
-            # Two-step cross-validation for hyperparameter selection
-            if self.verbose:
-                print('-----------------Search Regularization Params---------------------')
-            if response_rate < 0.005:
-                depth_choice = [5]
-            else:
-                depth_choice = [8, 10]
-
-            params = {
-                'min_split_gain': [0.1],
-                'max_depth': depth_choice,
-                'min_child_weight': [5, 10, 30, 50, 60, 80, 100],
-                'colsample_bytree': [0.6, 0.7],
-                'learning_rate': [0.3],
-                'subsample': [0.8],
-                'num_leaves': [80],
-            }
-
-            search_iter = 14
-            n_estimators_choice = [50]
-            best_param = self.search(
-                params,
-                search_iter,
-                n_estimators_choice,
-                grid_train_x, grid_train_y)
-
-            if self.verbose:
-                print('-----------------Search Learning Rate---------------------')
-            for key, value in best_param.items():
-                best_param[key] = [value]
-            best_param['learning_rate'] = [0.03, 0.045, 0.06, 0.075, 0.85, 0.95, 0.105, 0.12]
-            n_estimators_choice = [100, 150, 200]
-            search_iter = 16
-
-            self.search(
-                best_param,
-                search_iter,
-                n_estimators_choice,
-                grid_train_x, grid_train_y)
-
-            if self.verbose:
-                print('self.clf', self.clf)
-            self.is_trained = True
-
-        # Fit Model
-        self.clf.fit(x, y)
-
-        self.clf.booster_.save_model(self.save_filename)
-
-        if self.verbose:
-            print("The whole available data is: ")
-            print("Real-FIT: dim(X)= [{:d}, {:d}]".format(x.shape[0], x.shape[1]))
-
-            print('Feature Importance:')
-            print(self.clf.feature_importances_)
-
-    @abstractmethod
-    def init_lgbm(self, y):
-        pass
-
-    def predict(self, x_test):
-        """
-        This function should provide predictions of labels on (test) data.
-        The function predict eventually casdn return probabilities or continuous values.
-        """
-        x_test = self.tabular_preprocessor.encode(x_test)
-        y = self.clf.predict(x_test, )
-        if y is None:
-            raise ValueError("Tabular predictor does not exist")
-        return y
-
-    @abstractmethod
-    def evaluate(self, x_test, y_test):
-        pass
-
-    def final_fit(self, x_train, y_train):
-        x_train = self.tabular_preprocessor.encode(x_train)
-        self.clf.fit(x_train, y_train)
-
-
-class TabularRegressor(TabularSupervised):
-    """TabularRegressor class.
-    It is used for tabular data regression with lightgbm regressor.
-    """
-
-    def __init__(self, path=None):
-        super().__init__(path)
-        self.objective = 'regression'
-
-    def evaluate(self, x_test, y_test):
-        y_pred = self.predict(x_test)
-        return mean_squared_error(y_test, y_pred)
-
-    def init_lgbm(self, y):
-        self.lgbm = LGBMRegressor(silent=False,
-                                  verbose=-1,
-                                  n_jobs=1,
-                                  objective=self.objective)
-
-    def get_skf(self, folds):
-        return 'neg_mean_squared_error', KFold(n_splits=folds, shuffle=True, random_state=1001)
-
-
-class TabularClassifier(TabularSupervised):
-    """TabularClassifier class.
-     It is used for tabular data classification with lightgbm classifier.
-    """
-
-    def init_lgbm(self, y):
-        n_classes = len(set(y))
-        if n_classes == 2:
-            self.objective = 'binary'
-            self.lgbm = LGBMClassifier(silent=False,
-                                       verbose=-1,
-                                       n_jobs=1,
-                                       objective=self.objective)
-        else:
-            self.objective = 'multiclass'
-            self.lgbm = LGBMClassifier(silent=False,
-                                       verbose=-1,
-                                       n_jobs=1,
-                                       num_class=n_classes,
-                                       objective=self.objective)
-
-    def evaluate(self, x_test, y_test):
-        if self.verbose:
-            print('objective:', self.objective)
-        y_pred = self.predict(x_test)
-        results = None
-        if self.objective == 'binary':
-            results = roc_auc_score(y_test, y_pred)
-        elif self.objective == 'multiclass':
-            results = f1_score(y_test, y_pred, average='weighted')
-        return results
-
-    def get_skf(self, folds):
-        if self.lgbm.objective == 'binary':
-            score_metric = 'roc_auc'
-            skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)
-        else:
-            score_metric = 'f1_weighted'
-            skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1001)
-        return score_metric, skf
diff --git a/autokaggle/utils.py b/autokaggle/utils.py
index 62b833f..bb3aa69 100644
--- a/autokaggle/utils.py
+++ b/autokaggle/utils.py
@@ -2,6 +2,12 @@
 import tempfile
 import string
 import random
+import json
+
+
+def generate_rand_string(size):
+    alphabet = string.ascii_uppercase + string.digits  # 36 symbols: A-Z, 0-9
+    return ''.join([random.choice(alphabet) for _ in range(size)])
 
 
 def ensure_dir(directory):
@@ -17,11 +23,27 @@ def temp_path_generator():
 
 
 def rand_temp_folder_generator():
-    """Create and return a temporary directory with the path name '/temp_dir_name/autokeras' (E:g:- /tmp/autokeras)."""
-    chars = string.ascii_uppercase + string.digits
-    size = 6
-    random_suffix = ''.join(random.choice(chars) for _ in range(size))
+    """
+    Return a directory path made of temp_path_generator() + '_' + a random
+    6-character suffix; the path is handed to ensure_dir() before returning.
+    """
     sys_temp = temp_path_generator()
-    path = sys_temp + '_' + random_suffix
+    path = sys_temp + '_' + generate_rand_string(6)
     ensure_dir(path)
     return path
+
+
+def write_json(data, filename):
+    with open(filename, 'w') as json_file:  # 'w' replaces any existing content
+        json.dump(obj=data, fp=json_file)
+
+
+def read_json(filename):
+    with open(filename, 'rb') as json_file:  # bytes in; json detects encoding
+        return json.loads(json_file.read())
+
+
+def write_csv(filename, line):
+    with open(filename, "a") as csv_file:  # append: earlier rows are kept
+        fields = [str(value) for value in line]
+        csv_file.write(", ".join(fields) + "\n")
diff --git a/examples/benchmarking.py b/examples/benchmarking.py
new file mode 100644
index 0000000..57cd47e
--- /dev/null
+++ b/examples/benchmarking.py
@@ -0,0 +1,456 @@
+import string
+import random
+import sys
+
+sys.path.append("../")
+import numpy as np
+import pandas as pd
+import sklearn.model_selection
+import sklearn.datasets
+from sklearn.metrics import r2_score, roc_auc_score, accuracy_score, f1_score, \
+    balanced_accuracy_score, \
+    mean_absolute_error, mean_squared_error
+# from autosklearn.regression import AutoSklearnRegressor
+# from autosklearn.classification import AutoSklearnClassifier
+from autokaggle import *
+from autokaggle.utils import *
+import openml
+
+openml.config.apikey = '3c7196c92a274c3b9405a7e26e9f848e'  # FIXME(security): API key committed in source; load it from the environment/config instead
+import warnings
+from abc import abstractmethod
+import statistics
+
+
+def generate_rand_string(size):
+    pool = string.ascii_uppercase + string.digits  # candidate characters
+    return ''.join(map(random.choice, [pool] * size))
+
+
+class BenchmarkingBase:
+    """ Base class for benchmarking autoML platforms.
+
+        This class benchmarks the performance of the given autoML platform. The
+        user can call evaluate() method to evaluate the performance on a single
+        task or run_automation() for the list of the tasks. The tasks are OpenML
+        tasks, which specify the dataset and the train/test/validation folds etc.
+
+        # Attributes
+            results: List. List of the results for each evaluation
+            sess_name: String. Name of the evaluation session, used as the prefix
+            of the exported CSV file names.
+            cls_desc: List. Column names of the classification result table
+            rgs_desc: List. Column names of the regression result table
+            cls_results: DataFrame. Table storing the classification results
+            rgs_results: DataFrame. Table storing the regression results
+    """
+    results = None
+    cls_desc = ["automl_model", "task_id", "time_limit", "accuracy",
+                "balanced_accuracy", "F1_score", "AUC"]
+    rgs_desc = ["automl_model", "task_id", "time_limit", "MSE", "MAE", "R2_score"]
+
+    def __init__(self, supress_warnings=True, sess_name=""):
+        if supress_warnings:
+            warnings.filterwarnings('ignore')
+        self.results = []
+        self.sess_name = sess_name if sess_name else generate_rand_string(6)
+        self.cls_results = pd.DataFrame(columns=self.cls_desc)
+        self.rgs_results = pd.DataFrame(columns=self.rgs_desc)
+
+    def measure_performance_cls(self, y_true, y_pred, binary=False):
+        """ Calculate the performance of the classification task
+        # Arguments
+            y_true: A numpy array containing the ground truth labels
+            y_pred: A numpy array containing the predicted labels
+            binary: Boolean specifying if the objective is binary or multiclass
+        # Returns
+            [accuracy, balanced accuracy, F1, AUC]; AUC is "-" for multiclass.
+        """
+        accuracy = accuracy_score(y_true, y_pred)
+        balanced_acc = balanced_accuracy_score(y_true, y_pred)
+        # F1 averaging follows the objective: "binary" or else "weighted".
+        f1 = f1_score(y_true, y_pred, average="binary" if binary else "weighted")
+        auc = roc_auc_score(y_true, y_pred) if binary else "-"
+        return [accuracy, balanced_acc, f1, auc]
+
+    def measure_performance_rgs(self, y_true, y_pred):
+        """ Calculate the performance of the regression task
+        # Arguments
+            y_true: A numpy array containing the ground truth
+            y_pred: A numpy array containing the predicted values
+        # Returns
+            [MSE, MAE, R2], in the order of the metric columns of rgs_desc.
+        """
+        mse = mean_squared_error(y_true, y_pred)
+        mae = mean_absolute_error(y_true, y_pred)
+        r2 = r2_score(y_true, y_pred)
+        return [mse, mae, r2]
+
+    def export_results(self):
+        """ Writes the non-empty result tables to CSV files.
+        Files are named "<sess_name>_classification_results.csv" and
+        "<sess_name>_regression_results.csv"; the row index is not written.
+        # Returns
+            None
+        """
+        if len(self.cls_results) > 0:
+            self.cls_results.to_csv(self.sess_name + "_classification_results.csv",
+                                    index=False)
+        if len(self.rgs_results) > 0:
+            self.rgs_results.to_csv(self.sess_name + "_regression_results.csv",
+                                    index=False)
+
+    @abstractmethod
+    def evaluate(self, task, time_limit):
+        """ Evaluates the performance of the single task.
+        # Arguments
+            task: Id of the OpenML task flow
+            time_limit: Budget for the given task
+        # Returns
+            List of performance scores of the autoML system on the given task.
+        """
+        pass
+
+    def run_automation(self, task_list, time_limit=10 * 60):
+        """ Evaluate the list of the tasks in sequence
+        # Arguments
+            task_list: List of OpenML task ids
+            time_limit: Budget for each of the task
+        # Returns
+            None. A task that raises is reported on stdout and skipped.
+        """
+        for task in task_list:
+            try:
+                self.evaluate(task, time_limit=time_limit)
+                self.export_results()
+            except Exception as err:  # Ctrl-C / SystemExit still stop the run
+                print("task: {} didnt work ({!r})".format(task, err))
+
+    def time_lapse(self, task_id,
+                   time_limits=(30, 40, 50, 60, 90, 120, 150, 180, 240, 300)):
+        """ Evaluate the task on different time_limits
+        # Arguments
+            task_id: Id of the OpenML task flow
+            time_limits: Iterable of the time_limits to test the performance on
+        # Returns
+            List of combined results of the autoML on each of the time_limit
+        This function evaluates and compares the performance of the autoML system
+        on different time_limits. It is helpful to understand the amount of
+        improvement with increase in time budget
+        """
+        # The default is a tuple so no mutable object is shared between calls.
+        tl_results = [self.evaluate(task_id, time_limit=time_limit)
+                      for time_limit in time_limits]
+        return tl_results
+
+    def get_dataset_splits(self, task_id):
+        """ Get the train/test splits for the given task
+        # Arguments
+            task_id: Id of OpenML task flow
+        # Returns
+            x_train, y_train, x_test, y_test as arrays
+        """
+        task = openml.tasks.get_task(task_id)
+        train_indices, test_indices = task.get_train_test_split_indices()
+        # Only features and target are used; the two metadata values are not.
+        X, y, _, _ = task.get_dataset().get_data(
+            target=task.target_name, dataset_format='array')
+
+        x_train, y_train = X[train_indices], y[train_indices]
+        x_test, y_test = X[test_indices], y[test_indices]
+        return x_train, y_train, x_test, y_test
+
+
+class BenchmarkingAutoKaggle(BenchmarkingBase):
+    """ Extends the benchmarking class for evaluating AutoKaggle.
+
+        This class evaluates the performance of AutoKaggle on the input
+        classification or regression task_list.
+    """
+
+    def get_data_info(self, dataset, num_cols):
+        """ Get the info of each feature data type
+        # Arguments
+            dataset: OpenML dataset object (must offer get_features_by_type)
+            num_cols: Total number of columns
+        # Returns
+            A numpy array with "TIME", "NUM" or "CAT" for each column index
+        """
+        date_feat = set(dataset.get_features_by_type('date'))
+        numerical_feat = set(dataset.get_features_by_type('numeric'))
+
+        # "TIME" matches the tag used by the bundled examples; nominal, string
+        # and any other column types fall back to "CAT".
+        data_info = []
+        for i in range(num_cols):
+            if i in date_feat:
+                data_info.append("TIME")
+            elif i in numerical_feat:
+                data_info.append("NUM")
+            else:
+                data_info.append("CAT")
+        return np.array(data_info)
+
+    def evaluate(self, task_id, time_limit=10 * 60):
+        """
+            See base class. Raises ValueError for an unsupported task type.
+        """
+        task_info = ["autokaggle", task_id, time_limit]
+        task = openml.tasks.get_task(task_id)
+        train_indices, test_indices = task.get_train_test_split_indices()
+        dataset = task.get_dataset()
+        X, y, _, attribute_names = dataset.get_data(
+            target=task.target_name, dataset_format='array')
+
+        x_train, y_train = X[train_indices], y[train_indices]
+        x_test, y_test = X[test_indices], y[test_indices]
+
+        # Create feature type list from openml.org indicator
+        data_info = self.get_data_info(dataset, len(attribute_names))
+
+        # Train
+        if task.task_type == 'Supervised Classification':
+            automl = Classifier()
+        elif task.task_type == 'Supervised Regression':
+            automl = Regressor()
+        else:
+            print("UNSUPPORTED TASK_TYPE")
+            raise ValueError("unsupported task type: {}".format(task.task_type))
+
+        automl.fit(x_train, y_train, time_limit=time_limit, data_info=data_info)
+
+        # Evaluate
+        y_hat = automl.predict(x_test)
+
+        if task.task_type == 'Supervised Classification':
+            is_binary = len(task.class_labels) <= 2
+            result = task_info + self.measure_performance_cls(y_test, y_hat,
+                                                              binary=is_binary)
+            self.cls_results.loc[len(self.cls_results)] = result
+        elif task.task_type == 'Supervised Regression':
+            result = task_info + self.measure_performance_rgs(y_test, y_hat)
+            self.rgs_results.loc[len(self.rgs_results)] = result
+        print(result)
+        return result
+
+#
+# class BenchmarkingAutoSklearn(BenchmarkingBase):
+#     """ Extends the benchmarking class for evaluating AutoSklearn.
+#
+#         This class evaluates the performance of AutoSklearn on the input
+#         classification or regression task_list.
+#     """
+
+
+#     def get_data_info(self, categorical_indicator):
+#         return ['Categorical' if ci else 'Numerical'
+#                 for ci in categorical_indicator]
+#
+#     def evaluate(self, task_id, time_limit=10*60):
+#         task_info = ["autosklearn", task_id, time_limit]
+#         task = openml.tasks.get_task(task_id)
+#         train_indices, test_indices = task.get_train_test_split_indices()
+#         dataset = task.get_dataset()
+#         X, y, categorical_indicator, attribute_names = dataset.get_data(
+#         target=task.target_name, dataset_format='array')
+#
+#         x_train, y_train = X[train_indices], y[train_indices]
+#         x_test, y_test = X[test_indices], y[test_indices]
+#
+#         # Create feature type list from openml.org indicator
+#         feat_type = self.get_data_info(categorical_indicator)
+#
+#         # Train
+#         if task.task_type == 'Supervised Classification':
+#             automl = AutoSklearnClassifier(
+#                 time_left_for_this_task=time_limit,
+#                 per_run_time_limit=time_limit//10, **kwargs)
+#         elif task.task_type == 'Supervised Regression':
+#             automl = AutoSklearnRegressor(
+#                 time_left_for_this_task=time_limit,
+#                 per_run_time_limit=time_limit//10, **kwargs)
+#         else:
+#             print("UNSUPPORTED TASK_TYPE")
+#             assert(0)
+#
+#         automl.fit(x_train, y_train, feat_type=feat_type)
+#
+#         y_hat = automl.predict(x_test)
+#         if task.task_type == 'Supervised Classification':
+#             is_binary = True if len(task.class_labels) <= 2 else False
+#             result = task_info + self.measure_performance_cls(y_test, y_hat,
+#             binary=is_binary)
+#             self.cls_results.loc[len(self.cls_results)] = result
+#         elif task.task_type == 'Supervised Regression':
+#             result = task_info + self.measure_performance_rgs(y_test, y_hat)
+#             self.rgs_results.loc[len(self.rgs_results)] = result
+#         self.results.append(result)
+#         print(result)
+#         return result
+
+
+def get_dataset_ids(task_ids):
+    """ Fetches the dataset_ids.
+    # Arguments
+        task_ids: A list/tuple of OpenML task ids, or a single task id
+    # Returns
+        List of dataset ids for a list/tuple input, else a single dataset id
+    """
+    # isinstance (not type() ==) so list subclasses and tuples are batches too.
+    if isinstance(task_ids, (list, tuple)):
+        return [openml.tasks.get_task(t_id).dataset_id for t_id in task_ids]
+    return openml.tasks.get_task(task_ids).dataset_id
+
+
+def get_task_info(task_ids):
+    """ Fetches the datasets and the task objective.
+    # Arguments
+        task_ids: List of ids of OpenML task flows.
+    # Returns
+        dataset_list: List of the OpenML dataset objects.
+        task_types: List of the task type (such as 'binary/multiclass
+        classification' or 'regression')
+    """
+    task_types = []
+    dataset_list = []
+    for t_id in task_ids:
+        task = openml.tasks.get_task(t_id)
+        dataset = openml.datasets.get_dataset(task.dataset_id)
+        if task.task_type_id == 1:  # NOTE(review): 1 presumably = classification
+            _, y, _, _ = dataset.get_data(target=task.target_name,
+                                          dataset_format='array')
+            n_classes = len(set(y))  # counted once instead of per use
+            task_type = "Binary Classification" if n_classes <= 2 \
+                else "Multiclass classification ({})".format(n_classes)
+        else:
+            task_type = "Regression"
+        task_types.append(task_type)
+        dataset_list.append(dataset)
+    return dataset_list, task_types
+
+
+def get_dataset_properties(task_ids):
+    """ Summarises the dataset behind each given task flow id
+    # Arguments
+        task_ids: List of ids of OpenML task flows
+    # Returns
+        Dataframe with one row of info per dataset.
+    The columns are the dataset name, its number of instances, the task type and
+    the number of numeric/nominal/string/date features.
+    """
+    datasets, objectives = get_task_info(task_ids)
+    header = ["Name", "#Samples", "Task_Type", "#Numeric", "#Nominal", "#String",
+              "#Date"]
+    summary = pd.DataFrame(columns=header)
+    for row, (dataset, objective) in enumerate(zip(datasets, objectives)):
+        # One feature count per "#<Type>" column, in header order.
+        record = [dataset.name,
+                  dataset.qualities["NumberOfInstances"],
+                  objective]
+        record.extend(
+            len(dataset.get_features_by_type(kind))
+            for kind in ('numeric', 'nominal', 'string', 'date')
+        )
+        summary.loc[row] = record
+    return summary
+
+
+def get_performance_table(filename, metric):
+    """ Generates a comprehensive report table of AutoML performance.
+    # Arguments
+        filename: A csv file containing the results of AutoML runs
+        metric: Scoring metric to be used for comparison
+    # Returns
+        Pandas Dataframe listing the performance of different AutoML systems on
+        the given datasets.
+    This function reads the results csv and converts it into the performance table
+    based on the median of the results for each task (NaN when a system has no run).
+    """
+    # NOTE(review): "h2o" is an assumed automl_model label - confirm it against
+    # the writer of the H2O results; "autosklearn" here duplicated that column.
+    labels = ["autokaggle", "autosklearn", "h2o"]
+    test = pd.read_csv(filename)
+    perf = pd.DataFrame(columns=["Name", "AutoKaggle", "AutoSklearn", "H2O.ai"])
+    task_ids = list(set(test["task_id"]))
+    dataset_ids = get_dataset_ids(task_ids)
+
+    test = test.set_index(["task_id", "automl_model"])
+    test.sort_index(inplace=True)
+    for i, t_id in enumerate(task_ids):
+        try:
+            name = openml.datasets.get_dataset(dataset_ids[i]).name
+            # A key list always selects a Series, so a single run works too.
+            perf.loc[i] = [name] + [
+                test.loc[[(t_id, label)], metric].median()
+                if (t_id, label) in test.index else np.nan for label in labels]
+        except Exception as e:
+            print(e)
+    return perf
+
+
+def style_results(res):
+    """ Highlights the best result in each row of the results table
+    # Arguments
+        res: Dataframe containing the results of various AutoML runs
+    # Returns
+        pandas Styler wrapping `res` indexed by "Name" (frame is in `.data`)
+    """
+
+    def highlight_max(s):
+        """
+        Highlight the maximum in a Series yellow.
+        """
+        is_max = s == s.max()
+        return ['background-color: yellow' if v else '' for v in is_max]
+
+    # Styling lives on the Styler object, not on the DataFrame, so the Styler
+    # returned by apply() has to be handed back for the highlight to be visible.
+    return res.set_index("Name").style.apply(highlight_max, axis=1)
+
+
+def get_box_plot(results, task_id, metric):
+    """ Draws a box plot of the spread of the scores on one task.
+    # Arguments
+        results: Results of various runs using AutoML systems
+        task_id: Id for OpenML task flow
+        metric: Score metric considered for the box-plot
+    # Returns
+        None
+    The shorter of the AutoSklearn/AutoKaggle score lists is padded with its own
+    median so both columns have equal length before plotting.
+    """
+    sk_scores = list(results.loc[(task_id, "autosklearn")][metric])
+    ak_scores = list(results.loc[(task_id, "autokaggle")][metric])
+    sk_median = statistics.median(sk_scores)
+    ak_median = statistics.median(ak_scores)
+    # Pad the shorter list with its own (pre-padding) median.
+    shortfall = len(ak_scores) - len(sk_scores)
+    sk_scores.extend([sk_median] * shortfall)
+    ak_scores.extend([ak_median] * -shortfall)
+    scores = {"Autokaggle": ak_scores, "AutoSklearn": sk_scores}
+    frame = pd.DataFrame(data=scores)
+    frame.boxplot()
+
+
+if __name__ == "__main__":
+    import time
+
+    regression_task_list = [52948, 2295, 4823, 2285, 4729, 4990, 4958, 2280, 4834,
+                            4850, 4839]
+    classification_task_list = [3021, 45, 2071, 2076, 3638, 3780, 3902, 3945, 3954,
+                                14951, 59, 24, 146230, 31, 10101,
+                                9914, 3020, 3524, 3573, 3962]
+    ak = BenchmarkingAutoKaggle(sess_name='test_perf')
+
+    # t1 = time.time()
+    # for _ in range(1):
+    #     ak.run_automation(classification_task_list)
+    # t2 = time.time()
+    # print(t2-t1)
+    # Seed both RNGs before the timed single-task run.
+    np.random.seed(1001)
+    random.seed(1001)
+    t1 = time.time()
+    ak.evaluate(3021)
+    t2 = time.time()
+    print(t2 - t1)
diff --git a/examples/tabular_classification_binary.py b/examples/tabular_classification_binary.py
index df472a5..e5d3b6f 100644
--- a/examples/tabular_classification_binary.py
+++ b/examples/tabular_classification_binary.py
@@ -1,5 +1,8 @@
 import numpy as np
-from autokaggle import TabularClassifier
+import sys
+
+sys.path.append("..")
+from autokaggle import *
 
 if __name__ == '__main__':
     ntime, nnum, ncat = 4, 10, 8
@@ -16,9 +19,9 @@
     y_train = y_all[:int(nsample * 0.8)]
     y_test = y_all[int(nsample * 0.8):]
 
-    clf = TabularClassifier()
+    clf = Classifier()
     datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat)
     clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo)
 
     AUC = clf.evaluate(x_test, y_test)
-    print(AUC)
+    print(AUC)
\ No newline at end of file
diff --git a/examples/tabular_classification_multiclass.py b/examples/tabular_classification_multiclass.py
index 7515841..3426dd3 100644
--- a/examples/tabular_classification_multiclass.py
+++ b/examples/tabular_classification_multiclass.py
@@ -1,6 +1,8 @@
 import numpy as np
-from autokaggle import TabularClassifier
-
+import sys
+sys.path.append("..")
+# print(sys.path)
+from autokaggle import *
 if __name__ == '__main__':
     ntime, nnum, ncat = 4, 10, 8
     nsample = 10000
@@ -16,9 +18,9 @@
     y_train = y_all[:int(nsample * 0.8)]
     y_test = y_all[int(nsample * 0.8):]
 
-    clf = TabularClassifier()
+    clf = Classifier()
     datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat)
     clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo)
 
     F1_score = clf.evaluate(x_test, y_test)
-    print(F1_score)
+    print(F1_score)
\ No newline at end of file
diff --git a/examples/tabular_regression.py b/examples/tabular_regression.py
index 8ba95a8..bf97bc7 100644
--- a/examples/tabular_regression.py
+++ b/examples/tabular_regression.py
@@ -1,5 +1,7 @@
 import numpy as np
-from autokaggle import TabularRegressor
+import sys
+sys.path.append("..")
+from autokaggle import *
 
 if __name__ == '__main__':
     ntime, nnum, ncat = 4, 10, 8
@@ -16,7 +18,7 @@
     y_train = y_all[:int(nsample * 0.8)]
     y_test = y_all[int(nsample * 0.8):]
 
-    clf = TabularRegressor()
+    clf = Regressor()
     datainfo = np.array(['TIME'] * ntime + ['NUM'] * nnum + ['CAT'] * ncat)
     clf.fit(x_train, y_train, time_limit=12 * 60 * 60, data_info=datainfo)