
Commit 6549c75

Completes OPEN-3481 Introduce support for baseline models on the Python API
1 parent 76cb18f commit 6549c75

File tree

6 files changed: +245 -33 lines changed

docs/source/reference/upload.rst

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ Add models and datasets
    OpenlayerClient.add_model
    OpenlayerClient.add_dataset
    OpenlayerClient.add_dataframe
+   OpenlayerClient.add_baseline_model

 Version control flow
 --------------------

openlayer/__init__.py

Lines changed: 109 additions & 18 deletions
@@ -16,6 +16,7 @@
 from .version import __version__  # noqa: F401

 OPENLAYER_DIR = os.path.join(os.path.expanduser("~"), ".openlayer")
+VALID_RESOURCE_NAMES = {"baseline-model", "model", "training", "validation"}


 class OpenlayerClient(object):
@@ -441,6 +442,85 @@ def add_model(
             force=force,
         )

+    def add_baseline_model(
+        self,
+        project_id: int,
+        task_type: TaskType,
+        model_config_file_path: Optional[str] = None,
+        force: bool = False,
+    ):
+        """
+        **Coming soon...**
+
+        Add a baseline model to the project.
+
+        Baseline models should be added together with training and validation
+        sets. A model will then be trained on the platform using AutoML, using
+        the parameters provided in the model config file.
+
+        .. important::
+            This feature is experimental and currently under development. Only
+            tabular classification tasks are supported for now.
+
+        Parameters
+        ----------
+        model_config_file_path : str, optional
+            Path to the model configuration YAML file. If not provided, the default
+            model config will be used.
+
+            .. admonition:: What's in the model config file?
+
+                For baseline models, the YAML file may contain:
+
+                - ``ensembleSize`` : int, default 10
+                    Number of models in the ensemble.
+                - ``randomSeed`` : int, default 42
+                    Random seed to be used for model training.
+                - ``timeout`` : int, default 60
+                    Maximum time (in seconds) to train all the models.
+                - ``perRunLimit`` : int, optional
+                    Maximum time (in seconds) to train each model.
+                - ``metadata`` : Dict[str, any], default {}
+                    Dictionary containing metadata about the model. This is the
+                    metadata that will be displayed on the Openlayer platform.
+        force : bool, optional
+            Whether to force the addition of the baseline model to the project.
+            If set to True, any existing staged baseline model will be overwritten.
+        """
+        if task_type is not TaskType.TabularClassification:
+            raise exceptions.OpenlayerException(
+                "Only tabular classification is supported for baseline models for now."
+            )
+
+        # Validate the baseline model
+        baseline_model_validator = validators.BaselineModelValidator(
+            model_config_file_path=model_config_file_path,
+        )
+        failed_validations = baseline_model_validator.validate()
+
+        if failed_validations:
+            raise exceptions.OpenlayerValidationError(
+                "There are issues with the baseline model. \n"
+                "Make sure to fix all of the issues listed above before the upload.",
+            ) from None
+
+        # Load model config and augment with defaults
+        model_config = {}
+        if model_config_file_path is not None:
+            model_config = utils.read_yaml(model_config_file_path)
+        model_data = schemas.BaselineModelSchema().load(model_config)
+
+        # Copy relevant resources to temp directory
+        with tempfile.TemporaryDirectory() as temp_dir:
+            utils.write_yaml(model_data, f"{temp_dir}/model_config.yaml")
+
+            self._stage_resource(
+                resource_name="baseline-model",
+                resource_dir=temp_dir,
+                project_id=project_id,
+                force=force,
+            )
+
     def add_dataset(
         self,
         file_path: str,
@@ -1034,7 +1114,6 @@ def status(self, project_id: int):
            :obj:`commit` method).
         """
         project_dir = f"{OPENLAYER_DIR}/{project_id}/staging"
-        valid_resource_names = ["model", "training", "validation"]

         if not os.listdir(project_dir):
             print(
@@ -1046,7 +1125,7 @@ def status(self, project_id: int):
         if not os.path.exists(f"{project_dir}/commit.yaml"):
             print("The following resources are staged, waiting to be committed:")
             for file in os.listdir(project_dir):
-                if file in valid_resource_names:
+                if file in VALID_RESOURCE_NAMES:
                     print(f"\t - {file}")
             print("Use the `commit` method to add a commit message to your changes.")
             return
@@ -1055,7 +1134,7 @@ def status(self, project_id: int):
            commit = yaml.safe_load(commit_file)
         print("The following resources are committed, waiting to be pushed:")
         for file in os.listdir(project_dir):
-            if file != "commit.yaml":
+            if file in VALID_RESOURCE_NAMES:
                 print(f"\t - {file}")
         print(f"Commit message from {commit['date']}:")
         print(f"\t {commit['message']}")
@@ -1128,31 +1207,43 @@ def _stage_resource(
         force : bool
             Whether to overwrite the resource if it already exists in the staging area.
         """
-        if resource_name not in ["model", "training", "validation"]:
+        if resource_name not in VALID_RESOURCE_NAMES:
             raise ValueError(
-                f"Resource name must be one of 'model', 'training', or 'validation',"
-                f" but got {resource_name}."
+                f"Resource name must be one of 'baseline-model', 'model', 'training', or 'validation',"
+                f" but got '{resource_name}'."
             )

-        staging_dir = f"{OPENLAYER_DIR}/{project_id}/staging/{resource_name}"
+        project_dir = f"{OPENLAYER_DIR}/{project_id}/staging"
+
+        resources_staged = utils.list_resources_in_bundle(project_dir)

-        # Append 'dataset' to the end of the resource name for the prints
-        if resource_name in ["training", "validation"]:
-            resource_name += " dataset"
+        if resource_name == "model" and "baseline-model" in resources_staged:
+            raise exceptions.OpenlayerException(
+                "Trying to stage a `model` when there is a `baseline-model` already staged."
+                + " You can either add a `model` or a `baseline-model`, but not both at the same time."
+                + " Please remove one of them from the staging area using the `restore` method."
+            ) from None

-        if os.path.exists(staging_dir):
-            print(f"Found an existing {resource_name} staged.")
-            overwrite = "n"
+        if resource_name == "baseline-model" and "model" in resources_staged:
+            raise exceptions.OpenlayerException(
+                "Trying to stage a `baseline-model` when there is a `model` already staged."
+                + " You can either add a `model` or a `baseline-model`, but not both at the same time."
+                + " Please remove one of them from the staging area using the `restore` method."
+            ) from None
+
+        if resource_name in resources_staged:
+            print(f"Found an existing `{resource_name}` resource staged.")

+            overwrite = "n"
             if not force:
                 overwrite = input("Do you want to overwrite it? [y/n] ")
             if overwrite.lower() == "y" or force:
-                print(f"Overwriting previously staged {resource_name}...")
-                shutil.rmtree(staging_dir)
+                print(f"Overwriting previously staged `{resource_name}` resource...")
+                shutil.rmtree(project_dir + "/" + resource_name)
             else:
-                print(f"Keeping the existing {resource_name} staged.")
+                print(f"Keeping the existing `{resource_name}` resource staged.")
                 return

-        shutil.copytree(resource_dir, staging_dir)
+        shutil.copytree(resource_dir, project_dir + "/" + resource_name)

-        print(f"Staged the {resource_name}!")
+        print(f"Staged the `{resource_name}` resource!")

openlayer/projects.py

Lines changed: 9 additions & 0 deletions
@@ -43,6 +43,15 @@ def add_model(
             *args, project_id=self.id, task_type=tasks.TaskType(self.taskType), **kwargs
         )

+    def add_baseline_model(
+        self,
+        *args,
+        **kwargs,
+    ):
+        return self.client.add_baseline_model(
+            *args, project_id=self.id, task_type=tasks.TaskType(self.taskType), **kwargs
+        )
+
     def add_dataset(
         self,
         *args,
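The `Project` wrapper injects `project_id` and `task_type` from the project itself, so the call reduces to the config path. How the project object is obtained is an assumption here (for example via the client's project loading/creation helpers), not something this diff shows.

# Hypothetical sketch of the Project-level convenience call; obtaining `project` is assumed.
project = client.load_project(name="Churn prediction")  # assumed helper, not part of this diff

# project_id and task_type are forwarded automatically by the wrapper.
project.add_baseline_model(model_config_file_path="model_config.yaml")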

openlayer/schemas.py

Lines changed: 10 additions & 0 deletions
@@ -18,6 +18,16 @@
 )

 # ---------------------------------- Schemas --------------------------------- #
+class BaselineModelSchema(ma.Schema):
+    """Schema for baseline models."""
+
+    ensembleSize = ma.fields.Int(load_default=10)
+    metadata = ma.fields.Dict(allow_none=True, load_default={})
+    perRunLimit = ma.fields.Int(load_default=None, allow_none=True)
+    randomSeed = ma.fields.Int(load_default=42)
+    timeout = ma.fields.Int(load_default=60)
+
+
 class CommitSchema(ma.Schema):
     """Schema for commits."""
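A quick sketch of what the schema does on load: missing keys are filled with the `load_default` values, which is how `add_baseline_model` builds the config it stages. The partial config below is illustrative.

# Illustrative sketch of BaselineModelSchema filling in defaults via marshmallow's load_default.
from openlayer import schemas

model_data = schemas.BaselineModelSchema().load({"timeout": 120})
print(model_data)
# Roughly: {'ensembleSize': 10, 'metadata': {}, 'perRunLimit': None, 'randomSeed': 42, 'timeout': 120}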

openlayer/utils.py

Lines changed: 20 additions & 0 deletions
@@ -89,3 +89,23 @@ def get_exception_stacktrace(err: Exception):
        str: the stacktrace of the most recent exception.
    """
    return "".join(traceback.format_exception(type(err), err, err.__traceback__))
+
+
+def list_resources_in_bundle(bundle_path: str) -> list:
+    """Lists the resources in the bundle.
+
+    Args:
+        bundle_path (str): the path to the bundle.
+
+    Returns:
+        list: the list of resources in the bundle.
+    """
+    # TODO: factor out list of valid resources
+    VALID_RESOURCES = {"baseline-model", "model", "training", "validation"}
+
+    resources = []
+
+    for resource in os.listdir(bundle_path):
+        if resource in VALID_RESOURCES:
+            resources.append(resource)
+    return resources
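A small sketch of the new helper's behavior: only recognized resource directories are returned, so files such as `commit.yaml` never show up in the list. The directory layout below is made up for the example.

# Illustrative sketch of list_resources_in_bundle; the staging layout here is fabricated.
import os
import tempfile

from openlayer import utils

with tempfile.TemporaryDirectory() as bundle_path:
    for name in ("training", "validation", "baseline-model"):
        os.mkdir(os.path.join(bundle_path, name))
    open(os.path.join(bundle_path, "commit.yaml"), "w").close()

    # commit.yaml is ignored; only valid resource names are returned.
    print(sorted(utils.list_resources_in_bundle(bundle_path)))
    # ['baseline-model', 'training', 'validation']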

openlayer/validators.py

Lines changed: 96 additions & 15 deletions
@@ -24,6 +24,68 @@
 from . import schemas, utils


+class BaselineModelValidator:
+    """Validates the baseline model.
+
+    Parameters
+    ----------
+    model_config_file_path : Optional[str], optional
+        The path to the model config file, by default None
+    """
+
+    def __init__(self, model_config_file_path: Optional[str] = None):
+        self.model_config_file_path = model_config_file_path
+
+    def _validate_model_config(self):
+        """Validates the model config file."""
+        model_config_failed_validations = []
+
+        # File existence check
+        if self.model_config_file_path:
+            if not os.path.isfile(os.path.expanduser(self.model_config_file_path)):
+                model_config_failed_validations.append(
+                    f"File `{self.model_config_file_path}` does not exist."
+                )
+            else:
+                with open(self.model_config_file_path, "r") as stream:
+                    self.model_config = yaml.safe_load(stream)
+
+                if self.model_config:
+                    baseline_model_schema = schemas.BaselineModelSchema()
+                    try:
+                        baseline_model_schema.load(self.model_config)
+                    except ma.ValidationError as err:
+                        model_config_failed_validations.extend(
+                            _format_marshmallow_error_message(err)
+                        )
+
+        # Print results of the validation
+        if model_config_failed_validations:
+            print("Baseline model config failed validations: \n")
+            _list_failed_validation_messages(model_config_failed_validations)
+
+        # Add the `model_config.yaml` failed validations to the list of all failed validations
+        self.failed_validations.extend(model_config_failed_validations)
+
+    def validate(self) -> List[str]:
+        """Validates the baseline model.
+
+        Returns
+        -------
+        List[str]
+            The list of failed validations.
+        """
+        self.failed_validations = []
+
+        if self.model_config_file_path:
+            self._validate_model_config()
+
+        if not self.failed_validations:
+            print("All baseline model validations passed!")
+
+        return self.failed_validations
+
+
 class CommitBundleValidator:
     """Validates the commit bundle prior to push.

@@ -44,7 +106,7 @@ def __init__(
         skip_dataset_validation: bool = False,
     ):
         self.bundle_path = bundle_path
-        self._bundle_resources = self._list_resources_in_bundle()
+        self._bundle_resources = utils.list_resources_in_bundle(bundle_path)
         self._skip_model_validation = skip_model_validation
         self._skip_dataset_validation = skip_dataset_validation
         self.failed_validations = []
@@ -55,8 +117,10 @@ def _validate_bundle_state(self):
        This includes:
        - When a "model" is included, you always need to provide predictions for both
          "validation" and "training" (regardless of artifact or no artifact).
-        - When a "model" is not included, you always need to NOT upload predictions with
-          one exception:
+        - When a "baseline-model" is included, you always need to provide a "training"
+          and a "validation" set without predictions.
+        - When neither a "model" nor a "baseline-model" is included, you always need to
+          NOT upload predictions, with one exception:
            - "validation" set only in bundle, which means the predictions are for the
              previous model version.
        """
@@ -95,6 +159,24 @@ def _validate_bundle_state(self):
                    "training and a validation sets with predictions in the column "
                    "`predictions_column_name`."
                )
+        elif "baseline-model" in self._bundle_resources:
+            if (
+                "training" not in self._bundle_resources
+                or "validation" not in self._bundle_resources
+            ):
+                bundle_state_failed_validations.append(
+                    "To push a baseline model to the platform, you must provide "
+                    "training and validation sets."
+                )
+            elif (
+                training_predictions_column_name is not None
+                and validation_predictions_column_name is not None
+            ):
+                bundle_state_failed_validations.append(
+                    "To push a baseline model to the platform, you must provide "
+                    "training and validation sets without predictions in the column "
+                    "`predictions_column_name`."
+                )
         else:
             if (
                 "training" in self._bundle_resources
@@ -142,6 +224,17 @@ def _validate_bundle_resources(self):
                validation_set_validator.validate()
            )

+        if (
+            "baseline-model" in self._bundle_resources
+            and not self._skip_model_validation
+        ):
+            baseline_model_validator = BaselineModelValidator(
+                model_config_file_path=f"{self.bundle_path}/baseline-model/model_config.yaml"
+            )
+            bundle_resources_failed_validations.extend(
+                baseline_model_validator.validate()
+            )
+
         if "model" in self._bundle_resources and not self._skip_model_validation:
             model_files = os.listdir(f"{self.bundle_path}/model")
             # Shell model
@@ -183,18 +276,6 @@ def _validate_bundle_resources(self):
        # Add the bundle resources failed validations to the list of all failed validations
        self.failed_validations.extend(bundle_resources_failed_validations)

-    def _list_resources_in_bundle(self) -> List[str]:
-        """Lists the resources in a commit bundle."""
-        # TODO: factor out list of valid resources
-        VALID_RESOURCES = ["model", "training", "validation"]
-
-        resources = []
-
-        for resource in os.listdir(self.bundle_path):
-            if resource in VALID_RESOURCES:
-                resources.append(resource)
-        return resources
-
     def _load_dataset_from_bundle(self, label: str) -> pd.DataFrame:
         """Loads a dataset from a commit bundle.
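To close, a hedged sketch of running the new validator on its own, outside of `add_baseline_model`; the config written here is illustrative.

# Sketch of using BaselineModelValidator directly; the config content is illustrative.
import tempfile

from openlayer import validators

with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as config_file:
    config_file.write("ensembleSize: 5\ntimeout: 120\n")

failed = validators.BaselineModelValidator(
    model_config_file_path=config_file.name
).validate()

# An empty list means the config passed all checks; otherwise each failed validation
# is printed by the validator and also returned here.
print(failed)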
