diff --git a/README.md b/README.md
index c4a87e3..8d42b7a 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,6 @@ The goal of this project is to uncover the best approach to scale large protein
 ## Installing enviroment
 
 ```
-
 conda env create -f protein_lm.yml
 conda activate protein_lm_env
 pip install -e .
@@ -22,7 +21,9 @@ pip install -e protein_lm/tokenizer/rust_trie
 
 ## Training
 
-An example data file is provided in the `protein_lm/dataset/uniref` folder and an example toy training config yaml that uses this dataset is provided: `protein_lm/configs/train/toy_localcsv.yaml`. To use this config, at the root project directory (e.g., `protein_lm_scaling/`), run
+### Toy using a local dataset
+
+We recommend using a tiny toy dataset for testing and debugging new changes that do not rely on having a large dataset. Such a small dataset is provided in the `protein_lm/dataset/uniref` folder, and an example toy training config yaml that uses this dataset is provided in `protein_lm/configs/train/toy_localcsv.yaml`. To use this config, at the root project directory (e.g., `protein_lm_scaling/`), run
 
 ```
 python protein_lm/modeling/scripts/train.py --config-file protein_lm/configs/train/toy_localcsv.yaml
@@ -34,12 +35,32 @@ This config is actually the default, so the above is equivalent to
 python protein_lm/modeling/scripts/train.py
 ```
 
-An example config yaml of using a dataset from huggingface is `protein_lm/configs/train/toy_hf.yaml`, which you can run with
+### Toy using a HuggingFace dataset
+
+For testing with a HuggingFace dataset, we have an example config yaml in `protein_lm/configs/train/toy_hf.yaml`. Note that training with this config is a little more involved than with the above `protein_lm/configs/train/toy_localcsv.yaml`:
+
+* When first run, the script will download the [processed uniref50 dataset](https://huggingface.co/datasets/zpn/uniref50), which could take some time.
+* This config logs the loss values and other metrics to Weights and Biases, which requires you to create a wandb account.
+
+You can run with this config as follows:
 
 ```
 python protein_lm/modeling/scripts/train.py --config-file protein_lm/configs/train/toy_hf.yaml
 ```
 
+### Running on multiple GPUs
+
+We can run on a single node with multiple GPUs using
+
+```
+torchrun --standalone --nnodes=1 --nproc-per-node <number of gpus> protein_lm/modeling/scripts/train.py --config-file <config file>
+```
+
+For example, to run on a single node with 3 GPUs using the provided `protein_lm/configs/train/toy_hf.yaml` config file, we can run
+
+```
+torchrun --standalone --nnodes=1 --nproc-per-node 3 protein_lm/modeling/scripts/train.py --config-file protein_lm/configs/train/toy_hf.yaml
+```
 ## Getting involved
 Your involvement is welcome! 
 If you are interested, you can
diff --git a/protein_lm.yml b/protein_lm.yml
index d17ee1e..9353534 100644
--- a/protein_lm.yml
+++ b/protein_lm.yml
@@ -9,6 +9,7 @@ dependencies:
   - numpy
   - pytorch
   - pydantic>=2.0
+  - wandb
   - rust
   - pip:
     - transformers
diff --git a/protein_lm/configs/train/toy_hf.yaml b/protein_lm/configs/train/toy_hf.yaml
index a32efbd..d959dc4 100644
--- a/protein_lm/configs/train/toy_hf.yaml
+++ b/protein_lm/configs/train/toy_hf.yaml
@@ -2,24 +2,34 @@ dataset:
   dataset_type: "huggingface"
   dataset_loc: "zpn/uniref50"
-  train_sample_size: 100
+  subsample_size: 1000
+  split_seed: 2
+  val_size: 10
+  test_size: 10
   sequence_column_name: "sequence"
   max_sequence_length: 10
 
-# corresponds to TrainingArgsConfig
-training_args:
-  output_dir: "checkpoints/toy"
-  max_steps: 1
-  num_train_epochs: 1
+# corresponds to HuggingFace's TrainingArguments
+training_arguments:
+  output_dir: "checkpoints/toy_hf"
+  num_train_epochs: 2
   learning_rate: 0.1
   weight_decay: 0.1
   save_strategy: "epoch"
-  per_device_train_batch_size: 1
-  save_steps: 1
-  report_to: "none"
+  per_device_train_batch_size: 10
+  save_steps: 5
+  evaluation_strategy: "steps"
+  eval_steps: 5
+  report_to: "wandb"
   label_names:
     - 'labels'
   no_cuda: false
+  ddp_find_unused_parameters: false
+
+# corresponds to WandBConfig
+wandb:
+  name: "toy_hf"
+  dir: "wandb_files/"
 
 # corresponds to TokenizerConfig
 tokenizer:
diff --git a/protein_lm/configs/train/toy_localcsv.yaml b/protein_lm/configs/train/toy_localcsv.yaml
index 90175fb..4e06a25 100644
--- a/protein_lm/configs/train/toy_localcsv.yaml
+++ b/protein_lm/configs/train/toy_localcsv.yaml
@@ -2,12 +2,15 @@ dataset:
   dataset_type: "csv"
   dataset_loc: "protein_lm/dataset/uniref/uniref50_trimmed.csv"
-  train_sample_size: 100
+  subsample_size: 100
+  split_seed: 2
+  val_size: 10
+  test_size: 10
   sequence_column_name: "sequence"
   max_sequence_length: 10
 
-# corresponds to TrainingArgsConfig
-training_args:
+# corresponds to HuggingFace's TrainingArguments
+training_arguments:
   output_dir: "checkpoints/toy"
   max_steps: 1
   num_train_epochs: 1
diff --git a/protein_lm/modeling/getters/dataset.py b/protein_lm/modeling/getters/dataset.py
index 8ac6055..54e5d5a 100644
--- a/protein_lm/modeling/getters/dataset.py
+++ b/protein_lm/modeling/getters/dataset.py
@@ -1,6 +1,7 @@
 from typing import Dict, Literal, Optional
 
 from datasets import Dataset, load_dataset
+from datasets.dataset_dict import DatasetDict
 from pydantic import BaseModel
 
 
@@ -10,8 +11,19 @@ class DatasetConfig(BaseModel):
     # The path if local or the huggingface dataset name if huggingface
     dataset_loc: str
 
-    # train sample size to limit to, if any
-    train_sample_size: Optional[int] = None
+    # sample size to limit to, if any, usually for debugging
+    subsample_size: Optional[int] = None
+
+    """
+    Args for splitting into train, val, test
+    to be updated once we have more options
+    """
+    # split seed
+    split_seed: Optional[int] = None
+    # size of validation dataset
+    val_size: int
+    # size of test dataset
+    test_size: int
 
     # name of the column that contains the sequence
     sequence_column_name: str
@@ -39,20 +51,89 @@ def set_labels(result):
     return result
 
 
-def get_local_dataset(config: DatasetConfig) -> Dataset:
-    train_ds = load_dataset("csv", data_files=config.dataset_loc)["train"]
-    return train_ds
+def train_val_test_split(
+    dataset_dict: DatasetDict,
+    config: DatasetConfig,
+) -> DatasetDict:
+    """
+    Given a dictionary of datasets that only contains the split "train",
+    optionally subsamples it, and then splits it
+    so that it has potentially 3 splits:
"train", "val", "test", where + "val" and "test" splits do not exist if the specified sizes are 0 + """ + assert set(dataset_dict.keys()) == { + "train" + }, f"{train_val_test_split.__name__} expects its input to have the keys \ + ['train'] but the input has keys {list(dataset_dict.keys())}" + + dataset = dataset_dict["train"] + + val_size = config.val_size + test_size = config.test_size + + assert isinstance( + dataset, Dataset + ), f"Invalid dataset type {type(dataset)}, only datasets.Dataset allowed" + + dataset = dataset.shuffle(seed=config.split_seed) + + if config.subsample_size is not None: + dataset = dataset.select(range(config.subsample_size)) + + valtest_size = val_size + test_size + + if valtest_size > 0: + train_valtest = dataset.train_test_split( + test_size=val_size + test_size, + shuffle=False, + ) + split_dict = { + "train": train_valtest["train"], + } + if test_size > 0 and val_size > 0: + test_val = train_valtest["test"].train_test_split( + test_size=test_size, + shuffle=False, + ) + split_dict["val"] = test_val["train"] + split_dict["test"] = test_val["test"] + elif val_size > 0: + split_dict["val"] = train_valtest["test"] + else: + split_dict["test"] = train_valtest["test"] + else: + split_dict = { + "train": dataset, + } + + split_dataset_dict = DatasetDict(split_dict) + return split_dataset_dict + + +def get_csv_dataset(config: DatasetConfig) -> Dataset: + # note that a csv is read as having just one split "train" + dataset_dict = load_dataset("csv", data_files=config.dataset_loc) + return train_val_test_split(dataset_dict, config) def get_huggingface_dataset(config: DatasetConfig) -> Dataset: - train_ds = load_dataset(config.dataset_loc, streaming=True, split="train") - return train_ds + dataset_dict = load_dataset(config.dataset_loc) + if set(dataset_dict.keys()) == {"train", "val", "test"}: + return dataset_dict + + assert set(dataset_dict.keys()) == { + "train" + }, f"Huggingface DatasetDicts should have the keys {{'train'}} or \ + {{'train', 'val', 'split'}} but this DatasetDict has keys \ + {set(dataset_dict.keys())}" + return train_val_test_split(dataset_dict, config) def get_dataset(config_dict: Dict, tokenizer) -> Dataset: config = DatasetConfig(**config_dict) + if config.dataset_type == "csv": - train_ds = get_local_dataset(config) + train_ds = get_csv_dataset(config) elif config.dataset_type == "huggingface": train_ds = get_huggingface_dataset(config) else: diff --git a/protein_lm/modeling/getters/model.py b/protein_lm/modeling/getters/model.py index 0acc031..2ed70c1 100644 --- a/protein_lm/modeling/getters/model.py +++ b/protein_lm/modeling/getters/model.py @@ -33,7 +33,4 @@ def get_model(config_dict: Dict): config=model_config, ) - if torch.cuda.is_available(): - model.cuda() - return model diff --git a/protein_lm/modeling/getters/training_args.py b/protein_lm/modeling/getters/training_args.py index 5116513..6c5e398 100644 --- a/protein_lm/modeling/getters/training_args.py +++ b/protein_lm/modeling/getters/training_args.py @@ -1,44 +1,14 @@ import os -from typing import Dict, List, Union +from typing import Dict -from pydantic import BaseModel, FieldValidationInfo, field_validator from transformers import TrainingArguments -class TrainingArgsConfig(BaseModel): - per_device_train_batch_size: int - learning_rate: float - weight_decay: float - num_train_epochs: int - max_steps: int - save_steps: int - output_dir: str - save_strategy: str - report_to: str - label_names: List[str] - no_cuda: bool - - @field_validator( - "per_device_train_batch_size", - 
"num_train_epochs", - "weight_decay", - "learning_rate", - "save_steps", - ) - @classmethod - def check_gt_zero(cls, v: Union[int, float], info: FieldValidationInfo): - if v <= 0: - raise ValueError(f"trainer.{info.field_name} must be greater than 0") - return v - - def get_training_args(config_dict: Dict) -> TrainingArguments: - config = TrainingArgsConfig(**config_dict) + config = TrainingArguments(**config_dict) if not os.path.isdir(config.output_dir): print(f"creating checkpoint directory at {config.output_dir}") os.makedirs(config.output_dir) - return TrainingArguments( - **config_dict, - ) + return config diff --git a/protein_lm/modeling/getters/wandb_log.py b/protein_lm/modeling/getters/wandb_log.py new file mode 100644 index 0000000..8f4b227 --- /dev/null +++ b/protein_lm/modeling/getters/wandb_log.py @@ -0,0 +1,23 @@ +import wandb +from pydantic import BaseModel +from typing import Dict, Optional +import os + + +class WandBConfig(BaseModel): + project: str = "protein_lm_scaling" + name: str + # directory to save to + dir: Optional[str] = None + + +def setup_wandb(config_dict: Dict) -> None: + config = WandBConfig(**config_dict) + if config.dir is not None: + if not os.path.isdir(config.dir): + print(f"creating wandb directory at {config.dir}") + os.makedirs(config.dir) + + os.environ["WANDB_PROJECT"] = config.project + os.environ["WANDB_NAME"] = config.name + os.environ["WANDB_DIR"] = config.dir diff --git a/protein_lm/modeling/scripts/train.py b/protein_lm/modeling/scripts/train.py index 6ab4745..58b65e2 100644 --- a/protein_lm/modeling/scripts/train.py +++ b/protein_lm/modeling/scripts/train.py @@ -1,6 +1,7 @@ import argparse import math + import yaml from transformers import Trainer @@ -9,6 +10,7 @@ from protein_lm.modeling.getters.model import get_model from protein_lm.modeling.getters.tokenizer import get_tokenizer from protein_lm.modeling.getters.training_args import get_training_args +from protein_lm.modeling.getters.wandb_log import setup_wandb def train( @@ -23,7 +25,7 @@ def train( tokenizer = get_tokenizer(config_dict=config_dict["tokenizer"]) - train_ds = get_dataset( + dataset = get_dataset( config_dict=config_dict["dataset"], tokenizer=tokenizer, ) @@ -38,27 +40,24 @@ def train( ) training_args = get_training_args( - config_dict=config_dict["training_args"], + config_dict=config_dict["training_arguments"], ) + if "wandb" in training_args.report_to: + setup_wandb( + config_dict["wandb"], + ) + trainer = Trainer( model=model, args=training_args, - train_dataset=train_ds, + train_dataset=dataset["train"], + eval_dataset=dataset.get("val", None), data_collator=data_collator, ) - train_result = trainer.train() - trainer.save_model() # Saves the tokenizer too for easy upload - metrics = train_result.metrics - try: - perplexity = math.exp(metrics["train_loss"]) - except OverflowError: - perplexity = float("inf") - metrics["perplexity"] = perplexity - print("metrics:", metrics) - trainer.log_metrics("train", metrics) - trainer.save_metrics("train", metrics) + trainer.train() + trainer.save_model() trainer.save_state()