From 5761f0b2dff702325a957676de683f147b14a5e4 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 9 Sep 2023 12:23:15 -0700
Subject: [PATCH 01/11] feat: add val/test split

---
 protein_lm/configs/train/toy_hf.yaml       |  5 +-
 protein_lm/configs/train/toy_localcsv.yaml |  5 +-
 protein_lm/modeling/getters/dataset.py     | 93 ++++++++++++++++++++--
 protein_lm/modeling/scripts/train.py       |  6 +-
 4 files changed, 97 insertions(+), 12 deletions(-)

diff --git a/protein_lm/configs/train/toy_hf.yaml b/protein_lm/configs/train/toy_hf.yaml
index 255f237..140a5e9 100644
--- a/protein_lm/configs/train/toy_hf.yaml
+++ b/protein_lm/configs/train/toy_hf.yaml
@@ -2,7 +2,10 @@
 dataset:
   dataset_type: "huggingface"
   dataset_loc: "zpn/uniref50"
-  train_sample_size: 100
+  subsample_size: 1000
+  split_seed: 2
+  val_size: 10
+  test_size: 10

   sequence_column_name: "sequence"
   max_sequence_length: 10

diff --git a/protein_lm/configs/train/toy_localcsv.yaml b/protein_lm/configs/train/toy_localcsv.yaml
index 1162af5..d03341c 100644
--- a/protein_lm/configs/train/toy_localcsv.yaml
+++ b/protein_lm/configs/train/toy_localcsv.yaml
@@ -2,7 +2,10 @@
 dataset:
   dataset_type: "csv"
   dataset_loc: "protein_lm/dataset/uniref/uniref50_trimmed.csv"
-  train_sample_size: 100
+  subsample_size: 100
+  split_seed: 2
+  val_size: 10
+  test_size: 10

   sequence_column_name: "sequence"
   max_sequence_length: 10

diff --git a/protein_lm/modeling/getters/dataset.py b/protein_lm/modeling/getters/dataset.py
index 8ac6055..4bb515b 100644
--- a/protein_lm/modeling/getters/dataset.py
+++ b/protein_lm/modeling/getters/dataset.py
@@ -1,6 +1,7 @@
 from typing import Dict, Literal, Optional

 from datasets import Dataset, load_dataset
+from datasets.dataset_dict import DatasetDict
 from pydantic import BaseModel


@@ -10,8 +11,19 @@ class DatasetConfig(BaseModel):
     # The path if local or the huggingface dataset name if huggingface
     dataset_loc: str

-    # train sample size to limit to, if any
-    train_sample_size: Optional[int] = None
+    # sample size to limit to, if any, usually for debugging
+    subsample_size: Optional[int] = None
+
+    """
+    Args for splitting into train, val, test
+    to be updated once we have more options
+    """
+    # split seed
+    split_seed: Optional[int] = None
+    # size of validation dataset
+    val_size: int
+    # size of test dataset
+    test_size: int

     # name of the column that contains the sequence
     sequence_column_name: str
@@ -39,20 +51,85 @@ def set_labels(result):
     return result


-def get_local_dataset(config: DatasetConfig) -> Dataset:
-    train_ds = load_dataset("csv", data_files=config.dataset_loc)["train"]
-    return train_ds
+def train_val_test_split(
+    dataset_dict: DatasetDict,
+    config: DatasetConfig,
+) -> DatasetDict:
+    """
+    Given a dictionary of datasets that only contains the split "train",
+    optionally subsamples it, and then splits it
+    so that it has potentially 3 splits: "train", "val", "test", where
+    "val" and "test" splits do not exist if the specified sizes are 0
+    """
+    assert list(dataset_dict.keys()) == [
+        "train"
+    ], f"{train_val_test_split.__name__} expects its input to have the keys \
+        ['train'] but the input has keys {list(dataset_dict.keys())}"
+
+    dataset = dataset_dict["train"]
+
+    val_size = config.val_size
+    test_size = config.test_size
+
+    assert isinstance(
+        dataset, Dataset
+    ), f"Invalid dataset type {type(dataset)}, only datasets.Dataset allowed"
+
+    dataset = dataset.shuffle(seed=config.split_seed)
+
+    if config.subsample_size is not None:
+        dataset = dataset.select(range(config.subsample_size))
+
+    valtest_size = val_size + test_size
+
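+    # Carve the requested "val" and "test" examples out of the shuffled
+    # "train" split below; when both requested sizes are zero, the data is
+    # kept as a single "train" split.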
+    if valtest_size > 0:
+        train_valtest = dataset.train_test_split(
+            test_size=val_size + test_size,
+            shuffle=False,
+        )
+        split_dict = {
+            "train": train_valtest["train"],
+        }
+        if test_size > 0 and val_size > 0:
+            test_val = train_valtest["test"].train_test_split(
+                test_size=test_size,
+                shuffle=False,
+            )
+            split_dict["val"] = test_val["train"]
+            split_dict["test"] = test_val["test"]
+        elif val_size > 0:
+            split_dict["val"] = train_valtest["test"]
+        else:
+            split_dict["train"] = train_valtest["test"]
+    else:
+        split_dict = {
+            "train": dataset,
+        }
+
+    split_dataset_dict = DatasetDict(split_dict)
+    return split_dataset_dict
+
+
+def get_csv_dataset(config: DatasetConfig) -> Dataset:
+    # note that a csv is read as having just one split "train"
+    dataset_dict = load_dataset("csv", data_files=config.dataset_loc)
+    return train_val_test_split(dataset_dict, config)


 def get_huggingface_dataset(config: DatasetConfig) -> Dataset:
-    train_ds = load_dataset(config.dataset_loc, streaming=True, split="train")
-    return train_ds
+    # Currently, the huggingface datasets we use (e.g., zpn/uniref50) has only
+    # one split "train"
+    dataset_dict = load_dataset(config.dataset_loc)
+    return train_val_test_split(dataset_dict, config)


 def get_dataset(config_dict: Dict, tokenizer) -> Dataset:
     config = DatasetConfig(**config_dict)
+
+    # So far, both datasets we handle have just one split, "train"
+    # so that is the only case we handle for now
     if config.dataset_type == "csv":
-        train_ds = get_local_dataset(config)
+        train_ds = get_csv_dataset(config)
     elif config.dataset_type == "huggingface":
         train_ds = get_huggingface_dataset(config)
     else:

diff --git a/protein_lm/modeling/scripts/train.py b/protein_lm/modeling/scripts/train.py
index 6ab4745..c295f42 100644
--- a/protein_lm/modeling/scripts/train.py
+++ b/protein_lm/modeling/scripts/train.py
@@ -1,6 +1,7 @@
 import argparse
 import math
+
 import yaml

 from transformers import Trainer

@@ -23,7 +24,7 @@ def train(
     tokenizer = get_tokenizer(config_dict=config_dict["tokenizer"])

-    train_ds = get_dataset(
+    dataset = get_dataset(
         config_dict=config_dict["dataset"],
         tokenizer=tokenizer,
     )
@@ -44,7 +45,8 @@ def train(
     trainer = Trainer(
         model=model,
         args=training_args,
-        train_dataset=train_ds,
+        train_dataset=dataset["train"],
+        eval_dataset=dataset.get("val", None),
         data_collator=data_collator,
     )

From 6a3975a5623babe609928fd722b419176fa39523 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 9 Sep 2023 15:18:03 -0700
Subject: [PATCH 02/11] feat: add optional wandb logging and refactor configs

---
 protein_lm/configs/train/toy_hf.yaml         | 22 +++++++-----
 protein_lm/configs/train/toy_localcsv.yaml   |  4 +--
 protein_lm/modeling/getters/training_args.py | 36 ++------------------
 protein_lm/modeling/getters/wandb_log.py     | 28 +++++++++++++++
 protein_lm/modeling/scripts/train.py         |  6 +++-
 5 files changed, 52 insertions(+), 44 deletions(-)
 create mode 100644 protein_lm/modeling/getters/wandb_log.py

diff --git a/protein_lm/configs/train/toy_hf.yaml b/protein_lm/configs/train/toy_hf.yaml
index 140a5e9..74b73b8 100644
--- a/protein_lm/configs/train/toy_hf.yaml
+++ b/protein_lm/configs/train/toy_hf.yaml
@@ -9,21 +9,27 @@ dataset:
   sequence_column_name: "sequence"
   max_sequence_length: 10

-# corresponds to TrainingArgsConfig
-training_args:
-  output_dir: "checkpoints/toy"
-  max_steps: 1
-  num_train_epochs: 1
+# corresponds to HuggingFace's TrainingArguments
+training_arguments:
+  output_dir: "checkpoints/toy_hf"
+  num_train_epochs: 2
   learning_rate: 0.1
   weight_decay: 0.1
   save_strategy: "epoch"
"epoch" - per_device_train_batch_size: 1 - save_steps: 1 - report_to: "none" + per_device_train_batch_size: 10 + save_steps: 5 + evaluation_strategy: "steps" + eval_steps: 5 + report_to: "wandb" label_names: - 'labels' no_cuda: false +# corresponds to WandBConfig +wandb: + name: "toy_hf" + dir: "wandb_files/" + # corresponds to TokenizerConfig tokenizer: tokenizer_type: "APT" diff --git a/protein_lm/configs/train/toy_localcsv.yaml b/protein_lm/configs/train/toy_localcsv.yaml index d03341c..a661fdf 100644 --- a/protein_lm/configs/train/toy_localcsv.yaml +++ b/protein_lm/configs/train/toy_localcsv.yaml @@ -9,8 +9,8 @@ dataset: sequence_column_name: "sequence" max_sequence_length: 10 -# corresponds to TrainingArgsConfig -training_args: +# corresponds to HuggingFace's TrainingArguments +training_arguments: output_dir: "checkpoints/toy" max_steps: 1 num_train_epochs: 1 diff --git a/protein_lm/modeling/getters/training_args.py b/protein_lm/modeling/getters/training_args.py index 5116513..6c5e398 100644 --- a/protein_lm/modeling/getters/training_args.py +++ b/protein_lm/modeling/getters/training_args.py @@ -1,44 +1,14 @@ import os -from typing import Dict, List, Union +from typing import Dict -from pydantic import BaseModel, FieldValidationInfo, field_validator from transformers import TrainingArguments -class TrainingArgsConfig(BaseModel): - per_device_train_batch_size: int - learning_rate: float - weight_decay: float - num_train_epochs: int - max_steps: int - save_steps: int - output_dir: str - save_strategy: str - report_to: str - label_names: List[str] - no_cuda: bool - - @field_validator( - "per_device_train_batch_size", - "num_train_epochs", - "weight_decay", - "learning_rate", - "save_steps", - ) - @classmethod - def check_gt_zero(cls, v: Union[int, float], info: FieldValidationInfo): - if v <= 0: - raise ValueError(f"trainer.{info.field_name} must be greater than 0") - return v - - def get_training_args(config_dict: Dict) -> TrainingArguments: - config = TrainingArgsConfig(**config_dict) + config = TrainingArguments(**config_dict) if not os.path.isdir(config.output_dir): print(f"creating checkpoint directory at {config.output_dir}") os.makedirs(config.output_dir) - return TrainingArguments( - **config_dict, - ) + return config diff --git a/protein_lm/modeling/getters/wandb_log.py b/protein_lm/modeling/getters/wandb_log.py new file mode 100644 index 0000000..2902796 --- /dev/null +++ b/protein_lm/modeling/getters/wandb_log.py @@ -0,0 +1,28 @@ +import wandb +from pydantic import BaseModel +from typing import Dict, Optional +import os + + +class WandBConfig(BaseModel): + project: str = "protein_lm_scaling" + name: str + # directory to save to + dir: Optional[str] = None + + +def setup_wandb(full_config_dict: Dict) -> None: + """ + Sets up logging via wieghts and biases + Args: + full_config_dict: contains the full config, not just + the part corresponding to wandb, so that it can be logged + """ + assert "wandb" in full_config_dict, f"If using wandb, need wandb section in config" + wandb_config = WandBConfig(**full_config_dict["wandb"]) + if wandb_config.dir is not None: + if not os.path.isdir(wandb_config.dir): + print(f"creating wandb directory at {wandb_config.dir}") + os.makedirs(wandb_config.dir) + + wandb.init(**dict(wandb_config), config=full_config_dict) diff --git a/protein_lm/modeling/scripts/train.py b/protein_lm/modeling/scripts/train.py index c295f42..3288921 100644 --- a/protein_lm/modeling/scripts/train.py +++ b/protein_lm/modeling/scripts/train.py @@ -10,6 +10,7 @@ from 
 from protein_lm.modeling.getters.tokenizer import get_tokenizer
 from protein_lm.modeling.getters.training_args import get_training_args
+from protein_lm.modeling.getters.wandb_log import setup_wandb


 def train(
@@ -39,9 +40,12 @@ def train(
     )

     training_args = get_training_args(
-        config_dict=config_dict["training_args"],
+        config_dict=config_dict["training_arguments"],
     )

+    if "wandb" in training_args.report_to:
+        setup_wandb(config_dict)
+
     trainer = Trainer(
         model=model,
         args=training_args,

From 3ed1781cb5bc8786201be597fe0253eef00bad62 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 9 Sep 2023 15:24:00 -0700
Subject: [PATCH 03/11] docs: remove extra newline

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index f104fcf..428d848 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,6 @@ The goal of this project is to uncover the best approach to scale large protein

 ## Installing enviroment

 ```
-
 conda env create -f protein_lm.yml
 conda activate protein_lm_env
 pip install -e .

From 246bfeb1e575d43000857147d00c7943d50e41fe Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 9 Sep 2023 15:25:23 -0700
Subject: [PATCH 04/11] chore: add wandb dependency

---
 protein_lm.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/protein_lm.yml b/protein_lm.yml
index f94266b..a8f7472 100644
--- a/protein_lm.yml
+++ b/protein_lm.yml
@@ -9,6 +9,7 @@ dependencies:
   - numpy
   - pytorch
   - pydantic>=2.0
+  - wandb
   - pip:
     - transformers
     - datasets

From 4d9ca006df7c7b43c29cd415c67182502d277cae Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 9 Sep 2023 20:38:45 -0700
Subject: [PATCH 05/11] chore: setup wandb with env vars

---
 protein_lm/modeling/getters/wandb_log.py | 23 +++++++++--------------
 protein_lm/modeling/scripts/train.py     |  4 +++-
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/protein_lm/modeling/getters/wandb_log.py b/protein_lm/modeling/getters/wandb_log.py
index 2902796..8f4b227 100644
--- a/protein_lm/modeling/getters/wandb_log.py
+++ b/protein_lm/modeling/getters/wandb_log.py
@@ -11,18 +11,13 @@ class WandBConfig(BaseModel):
     dir: Optional[str] = None


-def setup_wandb(full_config_dict: Dict) -> None:
-    """
-    Sets up logging via weights and biases
-    Args:
-        full_config_dict: contains the full config, not just
-        the part corresponding to wandb, so that it can be logged
-    """
-    assert "wandb" in full_config_dict, f"If using wandb, need wandb section in config"
-    wandb_config = WandBConfig(**full_config_dict["wandb"])
-    if wandb_config.dir is not None:
-        if not os.path.isdir(wandb_config.dir):
-            print(f"creating wandb directory at {wandb_config.dir}")
-            os.makedirs(wandb_config.dir)
+def setup_wandb(config_dict: Dict) -> None:
+    config = WandBConfig(**config_dict)
+    if config.dir is not None:
+        if not os.path.isdir(config.dir):
+            print(f"creating wandb directory at {config.dir}")
+            os.makedirs(config.dir)

-    wandb.init(**dict(wandb_config), config=full_config_dict)
+    os.environ["WANDB_PROJECT"] = config.project
+    os.environ["WANDB_NAME"] = config.name
+    os.environ["WANDB_DIR"] = config.dir

diff --git a/protein_lm/modeling/scripts/train.py b/protein_lm/modeling/scripts/train.py
index 3288921..fad3f6c 100644
--- a/protein_lm/modeling/scripts/train.py
+++ b/protein_lm/modeling/scripts/train.py
@@ -44,7 +44,9 @@ def train(
     )

     if "wandb" in training_args.report_to:
-        setup_wandb(config_dict)
+        setup_wandb(
+            config_dict["wandb"],
+        )

     trainer = Trainer(
         model=model,
         args=training_args,

From ebd4efd949292fdf04ba9909a12c436057c20780 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sun, 10 Sep 2023 12:39:06 -0700
Subject: [PATCH 06/11] docs: update README training section

---
 README.md | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 428d848..8d42b7a 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,9 @@ pip install -e protein_lm/tokenizer/rust_trie

 ## Training

-An example data file is provided in the `protein_lm/dataset/uniref` folder and an example toy training config yaml that uses this dataset is provided: `protein_lm/configs/train/toy_localcsv.yaml`. To use this config, at the root project directory (e.g., `protein_lm_scaling/`), run
+### Toy using local dataset
+
+We recommend using a tiny toy dataset for testing and debugging new changes that do not rely on having a large dataset. Such a small dataset is provided in the `protein_lm/dataset/uniref` folder and an example toy training config yaml that uses this dataset is provided in `protein_lm/configs/train/toy_localcsv.yaml`. To use this config, at the root project directory (e.g., `protein_lm_scaling/`), run

 ```
 python protein_lm/modeling/scripts/train.py --config-file protein_lm/configs/train/toy_localcsv.yaml
@@ -33,12 +35,33 @@ This config is actually the default, so the above is equivalent to
 python protein_lm/modeling/scripts/train.py
 ```

-An example config yaml of using a dataset from huggingface is `protein_lm/configs/train/toy_hf.yaml`, which you can run with
+### Toy using a HuggingFace dataset
+
+For testing with a HuggingFace dataset, we have an example config yaml in `protein_lm/configs/train/toy_hf.yaml`. Note that training with this config is a little more involved than the above `protein_lm/configs/train/toy_localcsv.yaml`:
+
+* When first run, the script will download the [processed uniref50 dataset](https://huggingface.co/datasets/zpn/uniref50), which could take some time.
+* This config will log the loss values and other metrics to Weights and Biases. This will require you to create a wandb account.
+
+You can run with this config by:

 ```
 python protein_lm/modeling/scripts/train.py --config-file protein_lm/configs/train/toy_hf.yaml
 ```

+### Running on multiple gpus
+
+We can run on a single node with multiple gpus by
+
+```
+torchrun --standalone --nnodes=1 --nproc-per-node <number of gpus> protein_lm/modeling/scripts/train.py --config-file <path to config file>
+```
+
+For example, to run on a single node with 3 gpus with the provided `protein_lm/configs/train/toy_hf.yaml` config file, we can run with
+
+```
+torchrun --standalone --nnodes=1 --nproc-per-node 3 protein_lm/modeling/scripts/train.py --config-file protein_lm/configs/train/toy_hf.yaml
+```
+
 ## Getting involved
 Your involvement is welcome! If you are interested, you can
 - Join the `#protein-lm-scaling` channel on the [OpenBioML discord server](https://discord.com/invite/GgDBFP8ZEt).
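A minimal sketch of sanity-checking the split sizes produced by the new dataset code, assuming the conda environment above is installed, the repository root is the working directory, and the `toy_hf` config (which downloads the uniref50 data on first use); the helper functions and config keys are the ones introduced in the patches above:

```
import yaml

from protein_lm.modeling.getters.dataset import get_dataset
from protein_lm.modeling.getters.tokenizer import get_tokenizer

# Load the same toy config used in the README examples.
with open("protein_lm/configs/train/toy_hf.yaml") as f:
    config_dict = yaml.safe_load(f)

tokenizer = get_tokenizer(config_dict=config_dict["tokenizer"])
dataset = get_dataset(config_dict=config_dict["dataset"], tokenizer=tokenizer)

# With subsample_size=1000, val_size=10, test_size=10 this should report
# roughly {'train': 980, 'val': 10, 'test': 10}.
print({split: len(ds) for split, ds in dataset.items()})
```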
From 3ab8f5e39124d7543cf07221c7d65144a835de41 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sun, 10 Sep 2023 12:41:52 -0700
Subject: [PATCH 07/11] fix: do not manually move model to gpu

---
 protein_lm/modeling/getters/model.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/protein_lm/modeling/getters/model.py b/protein_lm/modeling/getters/model.py
index 0acc031..2ed70c1 100644
--- a/protein_lm/modeling/getters/model.py
+++ b/protein_lm/modeling/getters/model.py
@@ -33,7 +33,4 @@ def get_model(config_dict: Dict):
         config=model_config,
     )

-    if torch.cuda.is_available():
-        model.cuda()
-
     return model

From fb5dca1232943d0d23a9c1e15ee6ec5251a75b11 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sun, 10 Sep 2023 13:01:55 -0700
Subject: [PATCH 08/11] chore: set ddp_find_unused_parameters to false in config

---
 protein_lm/configs/train/toy_hf.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/protein_lm/configs/train/toy_hf.yaml b/protein_lm/configs/train/toy_hf.yaml
index 74b73b8..94a2714 100644
--- a/protein_lm/configs/train/toy_hf.yaml
+++ b/protein_lm/configs/train/toy_hf.yaml
@@ -24,6 +24,7 @@ training_arguments:
   label_names:
     - 'labels'
   no_cuda: false
+  ddp_find_unused_parameters: false

 # corresponds to WandBConfig
 wandb:
   name: "toy_hf"
   dir: "wandb_files/"

From b8e728a6fdd6a0ac5d911e1158c8e030e9f39443 Mon Sep 17 00:00:00 2001
From: othertea
Date: Sun, 10 Sep 2023 13:16:14 -0700
Subject: [PATCH 09/11] chore: remove manual logging

---
 protein_lm/modeling/scripts/train.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/protein_lm/modeling/scripts/train.py b/protein_lm/modeling/scripts/train.py
index fad3f6c..58b65e2 100644
--- a/protein_lm/modeling/scripts/train.py
+++ b/protein_lm/modeling/scripts/train.py
@@ -56,17 +56,8 @@ def train(
         data_collator=data_collator,
     )

-    train_result = trainer.train()
-    trainer.save_model()  # Saves the tokenizer too for easy upload
-    metrics = train_result.metrics
-    try:
-        perplexity = math.exp(metrics["train_loss"])
-    except OverflowError:
-        perplexity = float("inf")
-    metrics["perplexity"] = perplexity
-    print("metrics:", metrics)
-    trainer.log_metrics("train", metrics)
-    trainer.save_metrics("train", metrics)
+    trainer.train()
+    trainer.save_model()

     trainer.save_state()

From cc5f98cfd20638ab58cb975565c2576731d637cb Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 16 Sep 2023 10:43:13 -0700
Subject: [PATCH 10/11] feat: handle case where huggingface dataset has 'train','val','test' splits

---
 protein_lm/modeling/getters/dataset.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/protein_lm/modeling/getters/dataset.py b/protein_lm/modeling/getters/dataset.py
index 4bb515b..a32656f 100644
--- a/protein_lm/modeling/getters/dataset.py
+++ b/protein_lm/modeling/getters/dataset.py
@@ -61,9 +61,9 @@ def train_val_test_split(
     so that it has potentially 3 splits: "train", "val", "test", where
     "val" and "test" splits do not exist if the specified sizes are 0
     """
-    assert list(dataset_dict.keys()) == [
+    assert set(dataset_dict.keys()) == {
         "train"
-    ], f"{train_val_test_split.__name__} expects its input to have the keys \
+    }, f"{train_val_test_split.__name__} expects its input to have the keys \
        ['train'] but the input has keys {list(dataset_dict.keys())}"

     dataset = dataset_dict["train"]

@@ -117,17 +117,21 @@ def get_csv_dataset(config: DatasetConfig) -> Dataset:


 def get_huggingface_dataset(config: DatasetConfig) -> Dataset:
-    # Currently, the huggingface datasets we use (e.g., zpn/uniref50) has only
-    # one split "train"
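+    # The dataset may ship either as a single "train" split or with ready-made
+    # "train"/"val"/"test" splits; both cases are handled below.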
     dataset_dict = load_dataset(config.dataset_loc)
+    if set(dataset_dict.keys()) == {"train", "val", "test"}:
+        return dataset_dict
+
+    assert set(dataset_dict.keys()) == {
+        "train"
+    }, f"Huggingface DatasetDicts should have the keys {{'train'}} or \
+        {{'train', 'val', 'test'}} but this DatasetDict has keys \
+        {set(dataset_dict.keys())}"
     return train_val_test_split(dataset_dict, config)


 def get_dataset(config_dict: Dict, tokenizer) -> Dataset:
     config = DatasetConfig(**config_dict)

-    # So far, both datasets we handle have just one split, "train"
-    # so that is the only case we handle for now
     if config.dataset_type == "csv":
         train_ds = get_csv_dataset(config)
     elif config.dataset_type == "huggingface":
         train_ds = get_huggingface_dataset(config)
     else:

From c9e223789bf05dc1fb0ff623e1b3d598a164a88c Mon Sep 17 00:00:00 2001
From: othertea
Date: Sat, 23 Sep 2023 08:26:19 -0700
Subject: [PATCH 11/11] fix: correctly set test split if val_size is zero

---
 protein_lm/modeling/getters/dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/protein_lm/modeling/getters/dataset.py b/protein_lm/modeling/getters/dataset.py
index a32656f..54e5d5a 100644
--- a/protein_lm/modeling/getters/dataset.py
+++ b/protein_lm/modeling/getters/dataset.py
@@ -100,7 +100,7 @@ def train_val_test_split(
         elif val_size > 0:
             split_dict["val"] = train_valtest["test"]
         else:
-            split_dict["train"] = train_valtest["test"]
+            split_dict["test"] = train_valtest["test"]
     else:
         split_dict = {
             "train": dataset,