From 5761f0b2dff702325a957676de683f147b14a5e4 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 9 Sep 2023 12:23:15 -0700
Subject: [PATCH 01/11] feat: add val/test split

---
 protein_lm/configs/train/toy_hf.yaml       |  5 +-
 protein_lm/configs/train/toy_localcsv.yaml |  5 +-
 protein_lm/modeling/getters/dataset.py     | 93 ++++++++++++++++++++--
 protein_lm/modeling/scripts/train.py       |  6 +-
 4 files changed, 97 insertions(+), 12 deletions(-)

diff --git a/protein_lm/configs/train/toy_hf.yaml b/protein_lm/configs/train/toy_hf.yaml
index 255f237..140a5e9 100644
--- a/protein_lm/configs/train/toy_hf.yaml
+++ b/protein_lm/configs/train/toy_hf.yaml
@@ -2,7 +2,10 @@
 dataset:
   dataset_type: "huggingface"
   dataset_loc: "zpn/uniref50"
-  train_sample_size: 100
+  subsample_size: 1000
+  split_seed: 2
+  val_size: 10
+  test_size: 10

   sequence_column_name: "sequence"
   max_sequence_length: 10

diff --git a/protein_lm/configs/train/toy_localcsv.yaml b/protein_lm/configs/train/toy_localcsv.yaml
index 1162af5..d03341c 100644
--- a/protein_lm/configs/train/toy_localcsv.yaml
+++ b/protein_lm/configs/train/toy_localcsv.yaml
@@ -2,7 +2,10 @@
 dataset:
   dataset_type: "csv"
   dataset_loc: "protein_lm/dataset/uniref/uniref50_trimmed.csv"
-  train_sample_size: 100
+  subsample_size: 100
+  split_seed: 2
+  val_size: 10
+  test_size: 10

   sequence_column_name: "sequence"
   max_sequence_length: 10

diff --git a/protein_lm/modeling/getters/dataset.py b/protein_lm/modeling/getters/dataset.py
index 8ac6055..4bb515b 100644
--- a/protein_lm/modeling/getters/dataset.py
+++ b/protein_lm/modeling/getters/dataset.py
@@ -1,6 +1,7 @@
 from typing import Dict, Literal, Optional

 from datasets import Dataset, load_dataset
+from datasets.dataset_dict import DatasetDict
 from pydantic import BaseModel


@@ -10,8 +11,19 @@ class DatasetConfig(BaseModel):
     # The path if local or the huggingface dataset name if huggingface
     dataset_loc: str

-    # train sample size to limit to, if any
-    train_sample_size: Optional[int] = None
+    # sample size to limit to, if any, usually for debugging
+    subsample_size: Optional[int] = None
+
+    """
+    Args for splitting into train, val, test
+    to be updated once we have more options
+    """
+    # split seed
+    split_seed: Optional[int] = None
+    # size of validation dataset
+    val_size: int
+    # size of test dataset
+    test_size: int

     # name of the column that contains the sequence
     sequence_column_name: str
@@ -39,20 +51,85 @@ def set_labels(result):
     return result


-def get_local_dataset(config: DatasetConfig) -> Dataset:
-    train_ds = load_dataset("csv", data_files=config.dataset_loc)["train"]
-    return train_ds
+def train_val_test_split(
+    dataset_dict: DatasetDict,
+    config: DatasetConfig,
+) -> DatasetDict:
+    """
+    Given a dictionary of datasets that only contains the split "train",
+    optionally subsamples it, and then splits it
+    so that it has potentially 3 splits: "train", "val", "test", where
+    "val" and "test" splits do not exist if the specified sizes are 0
+    """
+    assert list(dataset_dict.keys()) == [
+        "train"
+    ], f"{train_val_test_split.__name__} expects its input to have the keys \
+        ['train'] but the input has keys {list(dataset_dict.keys())}"
+
+    dataset = dataset_dict["train"]
+
+    val_size = config.val_size
+    test_size = config.test_size
+
+    assert isinstance(
+        dataset, Dataset
+    ), f"Invalid dataset type {type(dataset)}, only datasets.Dataset allowed"
+
+    dataset = dataset.shuffle(seed=config.split_seed)
+
+    if config.subsample_size is not None:
+        dataset = dataset.select(range(config.subsample_size))
+
+    valtest_size = val_size + test_size
+
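+    # Carve the requested "val" and "test" examples out of the shuffled
+    # "train" split below; when both requested sizes are zero, the data is
+    # kept as a single "train" split.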
+    if valtest_size > 0:
+        train_valtest = dataset.train_test_split(
+            test_size=val_size + test_size,
+            shuffle=False,
+        )
+        split_dict = {
+            "train": train_valtest["train"],
+        }
+        if test_size > 0 and val_size > 0:
+            test_val = train_valtest["test"].train_test_split(
+                test_size=test_size,
+                shuffle=False,
+            )
+            split_dict["val"] = test_val["train"]
+            split_dict["test"] = test_val["test"]
+        elif val_size > 0:
+            split_dict["val"] = train_valtest["test"]
+        else:
+            split_dict["train"] = train_valtest["test"]
+    else:
+        split_dict = {
+            "train": dataset,
+        }
+
+    split_dataset_dict = DatasetDict(split_dict)
+    return split_dataset_dict
+
+
+def get_csv_dataset(config: DatasetConfig) -> Dataset:
+    # note that a csv is read as having just one split "train"
+    dataset_dict = load_dataset("csv", data_files=config.dataset_loc)
+    return train_val_test_split(dataset_dict, config)


 def get_huggingface_dataset(config: DatasetConfig) -> Dataset:
-    train_ds = load_dataset(config.dataset_loc, streaming=True, split="train")
-    return train_ds
+    # Currently, the huggingface datasets we use (e.g., zpn/uniref50) has only
+    # one split "train"
+    dataset_dict = load_dataset(config.dataset_loc)
+    return train_val_test_split(dataset_dict, config)


 def get_dataset(config_dict: Dict, tokenizer) -> Dataset:
     config = DatasetConfig(**config_dict)
+
+    # So far, both datasets we handle have just one split, "train"
+    # so that is the only case we handle for now
     if config.dataset_type == "csv":
-        train_ds = get_local_dataset(config)
+        train_ds = get_csv_dataset(config)
     elif config.dataset_type == "huggingface":
         train_ds = get_huggingface_dataset(config)
     else:

diff --git a/protein_lm/modeling/scripts/train.py b/protein_lm/modeling/scripts/train.py
index 6ab4745..c295f42 100644
--- a/protein_lm/modeling/scripts/train.py
+++ b/protein_lm/modeling/scripts/train.py
@@ -1,6 +1,7 @@
 import argparse
 import math
+
 import yaml

 from transformers import Trainer

@@ -23,7 +24,7 @@ def train(
     tokenizer = get_tokenizer(config_dict=config_dict["tokenizer"])

-    train_ds = get_dataset(
+    dataset = get_dataset(
         config_dict=config_dict["dataset"],
         tokenizer=tokenizer,
     )
@@ -44,7 +45,8 @@ def train(
     trainer = Trainer(
         model=model,
         args=training_args,
-        train_dataset=train_ds,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset.get("val", None),
         data_collator=data_collator,
     )

From 6a3975a5623babe609928fd722b419176fa39523 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 9 Sep 2023 15:18:03 -0700
Subject: [PATCH 02/11] feat: add optional wandb logging and refactor configs

---
 protein_lm/configs/train/toy_hf.yaml         | 22 +++++++-----
 protein_lm/configs/train/toy_localcsv.yaml   |  4 +--
 protein_lm/modeling/getters/training_args.py | 36 ++------------------
 protein_lm/modeling/getters/wandb_log.py     | 28 +++++++++++++++
 protein_lm/modeling/scripts/train.py         |  6 +++-
 5 files changed, 52 insertions(+), 44 deletions(-)
 create mode 100644 protein_lm/modeling/getters/wandb_log.py

diff --git a/protein_lm/configs/train/toy_hf.yaml b/protein_lm/configs/train/toy_hf.yaml
index 140a5e9..74b73b8 100644
--- a/protein_lm/configs/train/toy_hf.yaml
+++ b/protein_lm/configs/train/toy_hf.yaml
@@ -9,21 +9,27 @@ dataset:
   sequence_column_name: "sequence"
   max_sequence_length: 10

-# corresponds to TrainingArgsConfig
-training_args:
-  output_dir: "checkpoints/toy"
-  max_steps: 1
-  num_train_epochs: 1
+# corresponds to HuggingFace's TrainingArguments
+training_arguments:
+  output_dir: "checkpoints/toy_hf"
+  num_train_epochs: 2
   learning_rate: 0.1
   weight_decay: 0.1
   save_strategy: "epoch"
"epoch" - per_device_train_batch_size: 1 - save_steps: 1 - report_to: "none" + per_device_train_batch_size: 10 + save_steps: 5 + evaluation_strategy: "steps" + eval_steps: 5 + report_to: "wandb" label_names: - 'labels' no_cuda: false +# corresponds to WandBConfig +wandb: + name: "toy_hf" + dir: "wandb_files/" + # corresponds to TokenizerConfig tokenizer: tokenizer_type: "APT" diff --git a/protein_lm/configs/train/toy_localcsv.yaml b/protein_lm/configs/train/toy_localcsv.yaml index d03341c..a661fdf 100644 --- a/protein_lm/configs/train/toy_localcsv.yaml +++ b/protein_lm/configs/train/toy_localcsv.yaml @@ -9,8 +9,8 @@ dataset: sequence_column_name: "sequence" max_sequence_length: 10 -# corresponds to TrainingArgsConfig -training_args: +# corresponds to HuggingFace's TrainingArguments +training_arguments: output_dir: "checkpoints/toy" max_steps: 1 num_train_epochs: 1 diff --git a/protein_lm/modeling/getters/training_args.py b/protein_lm/modeling/getters/training_args.py index 5116513..6c5e398 100644 --- a/protein_lm/modeling/getters/training_args.py +++ b/protein_lm/modeling/getters/training_args.py @@ -1,44 +1,14 @@ import os -from typing import Dict, List, Union +from typing import Dict -from pydantic import BaseModel, FieldValidationInfo, field_validator from transformers import TrainingArguments -class TrainingArgsConfig(BaseModel): - per_device_train_batch_size: int - learning_rate: float - weight_decay: float - num_train_epochs: int - max_steps: int - save_steps: int - output_dir: str - save_strategy: str - report_to: str - label_names: List[str] - no_cuda: bool - - @field_validator( - "per_device_train_batch_size", - "num_train_epochs", - "weight_decay", - "learning_rate", - "save_steps", - ) - @classmethod - def check_gt_zero(cls, v: Union[int, float], info: FieldValidationInfo): - if v <= 0: - raise ValueError(f"trainer.{info.field_name} must be greater than 0") - return v - - def get_training_args(config_dict: Dict) -> TrainingArguments: - config = TrainingArgsConfig(**config_dict) + config = TrainingArguments(**config_dict) if not os.path.isdir(config.output_dir): print(f"creating checkpoint directory at {config.output_dir}") os.makedirs(config.output_dir) - return TrainingArguments( - **config_dict, - ) + return config diff --git a/protein_lm/modeling/getters/wandb_log.py b/protein_lm/modeling/getters/wandb_log.py new file mode 100644 index 0000000..2902796 --- /dev/null +++ b/protein_lm/modeling/getters/wandb_log.py @@ -0,0 +1,28 @@ +import wandb +from pydantic import BaseModel +from typing import Dict, Optional +import os + + +class WandBConfig(BaseModel): + project: str = "protein_lm_scaling" + name: str + # directory to save to + dir: Optional[str] = None + + +def setup_wandb(full_config_dict: Dict) -> None: + """ + Sets up logging via wieghts and biases + Args: + full_config_dict: contains the full config, not just + the part corresponding to wandb, so that it can be logged + """ + assert "wandb" in full_config_dict, f"If using wandb, need wandb section in config" + wandb_config = WandBConfig(**full_config_dict["wandb"]) + if wandb_config.dir is not None: + if not os.path.isdir(wandb_config.dir): + print(f"creating wandb directory at {wandb_config.dir}") + os.makedirs(wandb_config.dir) + + wandb.init(**dict(wandb_config), config=full_config_dict) diff --git a/protein_lm/modeling/scripts/train.py b/protein_lm/modeling/scripts/train.py index c295f42..3288921 100644 --- a/protein_lm/modeling/scripts/train.py +++ b/protein_lm/modeling/scripts/train.py @@ -10,6 +10,7 @@ from 
 from protein_lm.modeling.getters.tokenizer import get_tokenizer
 from protein_lm.modeling.getters.training_args import get_training_args
+from protein_lm.modeling.getters.wandb_log import setup_wandb


 def train(
@@ -39,9 +40,12 @@ def train(
     )

     training_args = get_training_args(
-        config_dict=config_dict["training_args"],
+        config_dict=config_dict["training_arguments"],
     )

+    if "wandb" in training_args.report_to:
+        setup_wandb(config_dict)
+
     trainer = Trainer(
         model=model,
         args=training_args,

From 3ed1781cb5bc8786201be597fe0253eef00bad62 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 9 Sep 2023 15:24:00 -0700
Subject: [PATCH 03/11] docs: remove extra newline

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index f104fcf..428d848 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,6 @@ The goal of this project is to uncover the best approach to scale large protein

 ## Installing enviroment

 ```
-
 conda env create -f protein_lm.yml
 conda activate protein_lm_env
 pip install -e .

From 246bfeb1e575d43000857147d00c7943d50e41fe Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 9 Sep 2023 15:25:23 -0700
Subject: [PATCH 04/11] chore: add wandb dependency

---
 protein_lm.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/protein_lm.yml b/protein_lm.yml
index f94266b..a8f7472 100644
--- a/protein_lm.yml
+++ b/protein_lm.yml
@@ -9,6 +9,7 @@ dependencies:
   - numpy
   - pytorch
   - pydantic>=2.0
+  - wandb
   - pip:
     - transformers
     - datasets

From 4d9ca006df7c7b43c29cd415c67182502d277cae Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 9 Sep 2023 20:38:45 -0700
Subject: [PATCH 05/11] chore: setup wandb with env vars

---
 protein_lm/modeling/getters/wandb_log.py | 23 +++++++++--------------
 protein_lm/modeling/scripts/train.py     |  4 +++-
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/protein_lm/modeling/getters/wandb_log.py b/protein_lm/modeling/getters/wandb_log.py
index 2902796..8f4b227 100644
--- a/protein_lm/modeling/getters/wandb_log.py
+++ b/protein_lm/modeling/getters/wandb_log.py
@@ -11,18 +11,13 @@ class WandBConfig(BaseModel):
     dir: Optional[str] = None


-def setup_wandb(full_config_dict: Dict) -> None:
-    """
-    Sets up logging via weights and biases
-    Args:
-        full_config_dict: contains the full config, not just
-        the part corresponding to wandb, so that it can be logged
-    """
-    assert "wandb" in full_config_dict, f"If using wandb, need wandb section in config"
-    wandb_config = WandBConfig(**full_config_dict["wandb"])
-    if wandb_config.dir is not None:
-        if not os.path.isdir(wandb_config.dir):
-            print(f"creating wandb directory at {wandb_config.dir}")
-            os.makedirs(wandb_config.dir)
+def setup_wandb(config_dict: Dict) -> None:
+    config = WandBConfig(**config_dict)
+    if config.dir is not None:
+        if not os.path.isdir(config.dir):
+            print(f"creating wandb directory at {config.dir}")
+            os.makedirs(config.dir)

-    wandb.init(**dict(wandb_config), config=full_config_dict)
+    os.environ["WANDB_PROJECT"] = config.project
+    os.environ["WANDB_NAME"] = config.name
+    os.environ["WANDB_DIR"] = config.dir

diff --git a/protein_lm/modeling/scripts/train.py b/protein_lm/modeling/scripts/train.py
index 3288921..fad3f6c 100644
--- a/protein_lm/modeling/scripts/train.py
+++ b/protein_lm/modeling/scripts/train.py
@@ -44,7 +44,9 @@ def train(
     )

     if "wandb" in training_args.report_to:
-        setup_wandb(config_dict)
+        setup_wandb(
+            config_dict["wandb"],
+        )

     trainer = Trainer(
         model=model,
         args=training_args,

From ebd4efd949292fdf04ba9909a12c436057c20780 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sun, 10 Sep 2023 12:39:06 -0700
Subject: [PATCH 06/11] docs: update README training section

---
 README.md | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 428d848..8d42b7a 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,9 @@ pip install -e protein_lm/tokenizer/rust_trie

 ## Training

-An example data file is provided in the `protein_lm/dataset/uniref` folder and an example toy training config yaml that uses this dataset is provided: `protein_lm/configs/train/toy_localcsv.yaml`. To use this config, at the root project directory (e.g., `protein_lm_scaling/`), run
+### Toy using local dataset
+
+We recommend using a tiny toy dataset for testing and debugging new changes that do not rely on having a large dataset. Such a small dataset is provided in the `protein_lm/dataset/uniref` folder and an example toy training config yaml that uses this dataset is provided in `protein_lm/configs/train/toy_localcsv.yaml`. To use this config, at the root project directory (e.g., `protein_lm_scaling/`), run

 ```
 python protein_lm/modeling/scripts/train.py --config-file protein_lm/configs/train/toy_localcsv.yaml
@@ -33,12 +35,33 @@ This config is actually the default, so the above is equivalent to
 python protein_lm/modeling/scripts/train.py
 ```

-An example config yaml of using a dataset from huggingface is `protein_lm/configs/train/toy_hf.yaml`, which you can run with
+### Toy using a HuggingFace dataset
+
+For testing with a HuggingFace dataset, we have an example config yaml in `protein_lm/configs/train/toy_hf.yaml`. Note that training with this config is a little more involved than the above `protein_lm/configs/train/toy_localcsv.yaml`:
+
+* When first run, the script will download the [processed uniref50 dataset](https://huggingface.co/datasets/zpn/uniref50), which could take some time.
+* This config will log the loss values and other metrics to Weights and Biases. This will require you to create a wandb account.
+
+You can run with this config by:

 ```
 python protein_lm/modeling/scripts/train.py --config-file protein_lm/configs/train/toy_hf.yaml
 ```

+### Running on multiple gpus
+
+We can run on a single node with multiple gpus by
+
+```
+torchrun --standalone --nnodes=1 --nproc-per-node <number of gpus> protein_lm/modeling/scripts/train.py --config-file <path to config file>
+```
+
+For example, to run on a single node with 3 gpus with the provided `protein_lm/configs/train/toy_hf.yaml` config file, we can run with
+
+```
+torchrun --standalone --nnodes=1 --nproc-per-node 3 protein_lm/modeling/scripts/train.py --config-file protein_lm/configs/train/toy_hf.yaml
+```
+
 ## Getting involved
 Your involvement is welcome! If you are interested, you can
 - Join the `#protein-lm-scaling` channel on the [OpenBioML discord server](https://discord.com/invite/GgDBFP8ZEt).
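A minimal sketch of sanity-checking the split sizes produced by the new dataset code, assuming the conda environment above is installed, the repository root is the working directory, and the `toy_hf` config (which downloads the uniref50 data on first use); the helper functions and config keys are the ones introduced in the patches above:

```
import yaml

from protein_lm.modeling.getters.dataset import get_dataset
from protein_lm.modeling.getters.tokenizer import get_tokenizer

# Load the same toy config used in the README examples.
with open("protein_lm/configs/train/toy_hf.yaml") as f:
    config_dict = yaml.safe_load(f)

tokenizer = get_tokenizer(config_dict=config_dict["tokenizer"])
dataset = get_dataset(config_dict=config_dict["dataset"], tokenizer=tokenizer)

# With subsample_size=1000, val_size=10, test_size=10 this should report
# roughly {'train': 980, 'val': 10, 'test': 10}.
print({split: len(ds) for split, ds in dataset.items()})
```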
From 3ab8f5e39124d7543cf07221c7d65144a835de41 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sun, 10 Sep 2023 12:41:52 -0700
Subject: [PATCH 07/11] fix: do not manually move model to gpu

---
 protein_lm/modeling/getters/model.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/protein_lm/modeling/getters/model.py b/protein_lm/modeling/getters/model.py
index 0acc031..2ed70c1 100644
--- a/protein_lm/modeling/getters/model.py
+++ b/protein_lm/modeling/getters/model.py
@@ -33,7 +33,4 @@ def get_model(config_dict: Dict):
         config=model_config,
     )

-    if torch.cuda.is_available():
-        model.cuda()
-
     return model

From fb5dca1232943d0d23a9c1e15ee6ec5251a75b11 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sun, 10 Sep 2023 13:01:55 -0700
Subject: [PATCH 08/11] chore: set ddp_find_unused_parameters to false in config

---
 protein_lm/configs/train/toy_hf.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/protein_lm/configs/train/toy_hf.yaml b/protein_lm/configs/train/toy_hf.yaml
index 74b73b8..94a2714 100644
--- a/protein_lm/configs/train/toy_hf.yaml
+++ b/protein_lm/configs/train/toy_hf.yaml
@@ -24,6 +24,7 @@ training_arguments:
   label_names:
     - 'labels'
   no_cuda: false
+  ddp_find_unused_parameters: false

 # corresponds to WandBConfig
 wandb:
   name: "toy_hf"
   dir: "wandb_files/"

From b8e728a6fdd6a0ac5d911e1158c8e030e9f39443 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sun, 10 Sep 2023 13:16:14 -0700
Subject: [PATCH 09/11] chore: remove manual logging

---
 protein_lm/modeling/scripts/train.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/protein_lm/modeling/scripts/train.py b/protein_lm/modeling/scripts/train.py
index fad3f6c..58b65e2 100644
--- a/protein_lm/modeling/scripts/train.py
+++ b/protein_lm/modeling/scripts/train.py
@@ -56,17 +56,8 @@ def train(
         data_collator=data_collator,
     )

-    train_result = trainer.train()
-    trainer.save_model()  # Saves the tokenizer too for easy upload
-    metrics = train_result.metrics
-    try:
-        perplexity = math.exp(metrics["train_loss"])
-    except OverflowError:
-        perplexity = float("inf")
-    metrics["perplexity"] = perplexity
-    print("metrics:", metrics)
-    trainer.log_metrics("train", metrics)
-    trainer.save_metrics("train", metrics)
+    trainer.train()
+    trainer.save_model()

     trainer.save_state()

From cc5f98cfd20638ab58cb975565c2576731d637cb Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 16 Sep 2023 10:43:13 -0700
Subject: [PATCH 10/11] feat: handle case where huggingface dataset has 'train','val','test' splits

---
 protein_lm/modeling/getters/dataset.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/protein_lm/modeling/getters/dataset.py b/protein_lm/modeling/getters/dataset.py
index 4bb515b..a32656f 100644
--- a/protein_lm/modeling/getters/dataset.py
+++ b/protein_lm/modeling/getters/dataset.py
@@ -61,9 +61,9 @@ def train_val_test_split(
     so that it has potentially 3 splits: "train", "val", "test", where
     "val" and "test" splits do not exist if the specified sizes are 0
     """
-    assert list(dataset_dict.keys()) == [
+    assert set(dataset_dict.keys()) == {
         "train"
-    ], f"{train_val_test_split.__name__} expects its input to have the keys \
+    }, f"{train_val_test_split.__name__} expects its input to have the keys \
        ['train'] but the input has keys {list(dataset_dict.keys())}"

     dataset = dataset_dict["train"]

@@ -117,17 +117,21 @@ def get_csv_dataset(config: DatasetConfig) -> Dataset:


 def get_huggingface_dataset(config: DatasetConfig) -> Dataset:
-    # Currently, the huggingface datasets we use (e.g., zpn/uniref50) has only
-    # one split "train"
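+    # The dataset may ship either as a single "train" split or with ready-made
+    # "train"/"val"/"test" splits; both cases are handled below.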
     dataset_dict = load_dataset(config.dataset_loc)
+    if set(dataset_dict.keys()) == {"train", "val", "test"}:
+        return dataset_dict
+
+    assert set(dataset_dict.keys()) == {
+        "train"
+    }, f"Huggingface DatasetDicts should have the keys {{'train'}} or \
+        {{'train', 'val', 'test'}} but this DatasetDict has keys \
+        {set(dataset_dict.keys())}"
     return train_val_test_split(dataset_dict, config)


 def get_dataset(config_dict: Dict, tokenizer) -> Dataset:
     config = DatasetConfig(**config_dict)

-    # So far, both datasets we handle have just one split, "train"
-    # so that is the only case we handle for now
     if config.dataset_type == "csv":
         train_ds = get_csv_dataset(config)
     elif config.dataset_type == "huggingface":
         train_ds = get_huggingface_dataset(config)
     else:

From c9e223789bf05dc1fb0ff623e1b3d598a164a88c Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 23 Sep 2023 08:26:19 -0700
Subject: [PATCH 11/11] fix: correctly set test split if val_size is zero

---
 protein_lm/modeling/getters/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protein_lm/modeling/getters/dataset.py b/protein_lm/modeling/getters/dataset.py
index a32656f..54e5d5a 100644
--- a/protein_lm/modeling/getters/dataset.py
+++ b/protein_lm/modeling/getters/dataset.py
@@ -100,7 +100,7 @@ def train_val_test_split(
         elif val_size > 0:
             split_dict["val"] = train_valtest["test"]
         else:
-            split_dict["train"] = train_valtest["test"]
+            split_dict["test"] = train_valtest["test"]
     else:
         split_dict = {
             "train": dataset,