From b4f3c3d8ddcd1033ded73fde3e27847e89ffcca0 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Mon, 27 Jun 2022 21:02:32 -0400 Subject: [PATCH 1/5] composable-sft first pass --- .../efficient_ft/composable_sft/__init__.py | 1 + .../efficient_ft/composable_sft/sft_args.py | 116 ++++++++++++++++ scripts/lang_adapt/madx_run_clm.py | 131 +++++++++++++----- scripts/lang_adapt/run_clm_sft.sh | 64 +++++++++ 4 files changed, 275 insertions(+), 37 deletions(-) create mode 100644 scripts/lang_adapt/efficient_ft/composable_sft/__init__.py create mode 100644 scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py create mode 100755 scripts/lang_adapt/run_clm_sft.sh diff --git a/scripts/lang_adapt/efficient_ft/composable_sft/__init__.py b/scripts/lang_adapt/efficient_ft/composable_sft/__init__.py new file mode 100644 index 0000000..a8771c6 --- /dev/null +++ b/scripts/lang_adapt/efficient_ft/composable_sft/__init__.py @@ -0,0 +1 @@ +from .sft_args import SftArguments \ No newline at end of file diff --git a/scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py b/scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py new file mode 100644 index 0000000..23e3bee --- /dev/null +++ b/scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py @@ -0,0 +1,116 @@ +from dataclasses import dataclass, field +from typing import Optional +# TODO: add attribution to Composable SFT authors/paper/code + +"""A modified version of the SftArguments class that drops unused args and changes default values for language adaptation.""" + +@dataclass +class SftArguments: + "Arguments pertaining to sparse fine-tuning configuration.""" + + train_sft: bool = field( + default=False, metadata={"help": "Whether to train sparse fine-tuning."} + ) # newly added + + lang_ft: Optional[str] = field( + default=None, metadata={"help": "Path to saved language SparseFineTuning."} + ) + task_ft: Optional[str] = field( + default=None, metadata={"help": "Path to saved task SparseFineTuning."} + ) + + sparse_ft_method: Optional[str] = field( + default='LotteryTicket', + metadata={"help": 'Sparse fine-tuning method. Can be LotteryTicket or Random.'}, + ) + + full_l1_reg: Optional[float] = field( + default=0.1, metadata={"help": "Coefficient of L1 regularisation during full fine-tuning."} + ) # changed from 0.0 to 0.1 + sparse_l1_reg: Optional[float] = field( + default=0.1, metadata={"help": "Coefficient of L1 regularisation during sparse fine-tuning."} + ) # changed from 0.0 to 0.1 + apply_reg_to_sparse_only: bool = field( + default=False, + metadata={ + "help": "If true, only applies regularisation to those parameters which are eligible for sparse fine-tuning." + }, + ) + + freeze_embeddings: bool = field( + default=False, + metadata={ + "help": "Whether to freeze embeddings." + }, + ) + freeze_head: bool = field( + default=False, + metadata={"help": "Whether to freeze language modeling head."}, + ) + untie_embeddings: bool = field( + default=False, + metadata={"help": "Whether to untie input and output embeddings."}, + ) + freeze_decoder: bool = field( + default=False, + metadata={"help": "Whether to freeze only output embeddings."}, + ) + freeze_layer_norm: bool = field( + default=True, + metadata={"help": "Whether to freeze layer normalisation parameters."}, + ) # changed from False to True + + ft_params_proportion: Optional[float] = field( + default=0.01, + metadata={ + "help": "The proportion of model parameters for which to learn non-zero differences during fine-tuning." 
+ }, + ) + ft_params_num: Optional[int] = field( + default=None, + metadata={ + "help": "The number of model parameters for which to learn non-zero differences during fine-tuning." + }, + ) + n_ft_iterations: Optional[int] = field( + default=5, + metadata={ + "help": "The number of parameter selection iterations during fine-tuning." + }, + ) + full_ft_min_steps_per_iteration: Optional[int] = field( + default=10, + metadata={ + "help": "Minimum number of steps per parameter selection iteration during full fine-tuning." + }, + ) + sparse_ft_min_steps_per_iteration: Optional[int] = field( + default=10, + metadata={ + "help": "Minimum of steps per parameter selection iteration during sparse fine-tuning." + }, + ) + full_ft_max_steps_per_iteration: Optional[int] = field( + default=100, + metadata={ + "help": "Maximum number of steps per parameter selection iteration during full fine-tuning." + }, + ) + sparse_ft_max_steps_per_iteration: Optional[int] = field( + default=100, + metadata={ + "help": "Maximum of steps per parameter selection iteration during sparse fine-tuning." + }, + ) + full_ft_max_epochs_per_iteration: Optional[int] = field( + default=None, + metadata={ + "help": "Maximum number of epochs per parameter selection iteration during full fine-tuning." + }, + ) + sparse_ft_max_epochs_per_iteration: Optional[int] = field( + default=None, + metadata={ + "help": "Maximum number of epochs per parameter selection iteration during sparse fine-tuning." + }, + ) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index 2f5cf5c..d50ccce 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -1,7 +1,11 @@ """ Source: https://github.com/Adapter-Hub/adapter-transformers/blob/master/examples/language-modeling/run_clm.py """ - +#TODO: hailey composable sft impl. (this comment shouldn't make it into main!) + # use the LT Trainer class from composable-sft + # see how this interacts with adapters + # see how this interacts with the embeddings being trained + # todo: computations for changing import logging import math import os @@ -41,6 +45,13 @@ from transformers.utils import check_min_version from transformers.utils.versions import require_version +from sft import ( + LotteryTicketSparseFineTuner, + SFT, +) + +from efficient_ft.composable_sft import SftArguments + # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
check_min_version("4.11.0") @@ -53,6 +64,8 @@ MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) +trainer_class_mapping = {'emb': Trainer, 'emb-and-adpt': AdapterTrainer, 'emb-then-adpt': AdapterTrainer, 'emb-and-sft': LotteryTicketSparseFineTuner} + @dataclass class ModelArguments: @@ -105,7 +118,7 @@ class ModelArguments: ) lang_adapt_strategies: str = field( default="", - metadata={"help": "choose one of the three strategies - 'emb', 'emb-and-adpt', 'emb-then-adpt'"}, + metadata={"help": "choose one of the three strategies - 'emb', 'emb-and-adpt', 'emb-then-adpt', 'emb-and-sft'"}, ) embedding_strategies: str = field( default="", @@ -432,7 +445,7 @@ def group_texts(examples): logger.info(f"✅ saved lm_data to {saved_lm_datasets_fp}") return lm_datasets -def modify_model(adapter_args, data_args, model_args, tokenizer, model): +def modify_model(adapter_args, sft_args, data_args, model_args, tokenizer, model): #if "emb" in model_args.lang_adapt_strategies: # if "replace" in model_args.embedding_strategies: # for name, param in model.named_parameters(): @@ -448,10 +461,7 @@ def get_adapter_config(adapter_args, model_args): adapter_config = PrefixTuningConfig(bottleneck_size = 800, leave_out = [i for i in range(0,24) if not i in adapters2use] ) - - else: - if model_args.adapter_placement == "all": adapter_config = AdapterConfig.load( adapter_args.adapter_config, @@ -513,7 +523,7 @@ def get_adapter_config(adapter_args, model_args): raise ValueError( "Adapters can only be loaded in adapters training mode." "Use --train_adapter to enable adapter training" - ) + ) print(f"✅ Use Embedding Strategy: {model_args.embedding_strategies}") @@ -575,47 +585,82 @@ def zero_grad(grad): # model.tie_weights() #elif model_args.embedding_strategies == "replace": # model.resize_token_embeddings(len(tokenizer)) + + if sft_args.train_sft: # Hailey: might need to put some more args here. 
+ lm_head = model.lm_head + + if sft_args.freeze_head: + for param in lm_head.parameters(): + param.requires_grad = False + # if sft_args.load_sft: + # model.load_sft(sft_args.load_sft) + if sft_args.freeze_layer_norm: + for name, param in model.named_parameters(): + if "layer_norm" in name or "ln_f" in name: + param.requires_grad = False + + trainable_params = 0 frozen_params = 0 emb_params = 0 - for name, param in model.named_parameters(): - if "word_embeddings" in name or "wte" in name or "wpe" in name or "lm_head" in name: - param.requires_grad = True - emb_params += param.numel() - elif model_args.lang_adapt_strategies == "emb": - param.requires_grad = False - - if not param.requires_grad: - print(f"🥶 Frozen layer '{name}'") - frozen_params += param.numel() - else: - print(f"🚀 Trainable layer '{name}'") - trainable_params += param.numel() + if adapter_args.train_adapter: + for name, param in model.named_parameters(): + if "word_embeddings" in name or "wte" in name or "wpe" in name or "lm_head" in name: + param.requires_grad = True + emb_params += param.numel() + elif model_args.lang_adapt_strategies == "emb": + param.requires_grad = False + + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + frozen_params += param.numel() + else: + print(f"🚀 Trainable layer '{name}'") + trainable_params += param.numel() + elif sft_args.train_sft: + for name, param in model.named_parameters(): + if "word_embeddings" in name or "wte" in name or "wpe" in name or "lm_head" in name: + param.requires_grad = True + emb_params += param.numel() + elif model_args.lang_adapt_strategies == "emb": + param.requires_grad = True + + if not param.requires_grad: + print(f"🥶 Frozen layer '{name}'") + frozen_params += param.numel() + elif "word_embeddings" in name or "wte" in name or "wpe" in name and param.requires_grad: + print(f"🚀 Trainable layer '{name}'") + trainable_params += param.numel() + else: + print(f"🚀 Sparsely Trainable layer '{name}'") + trainable_params += param.numel() print(f"Total frozen parameters: {frozen_params}") print(f"Total emb parameters (wte, wpe): {emb_params}") print(f"Total trainable parameters: {trainable_params}") + + def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments)) + parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, MultiLingAdapterArguments, SftArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. 
- model_args, data_args, training_args, adapter_args = parser.parse_json_file( + model_args, data_args, training_args, adapter_args, sft_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1]) ) else: - model_args, data_args, training_args, adapter_args = parser.parse_args_into_dataclasses() + model_args, data_args, training_args, adapter_args, sft_args = parser.parse_args_into_dataclasses() training_args.data_dir = f'{training_args.output_dir}' - assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt') + assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt', 'emb-and-sft') assert model_args.embedding_strategies in ('replace', 'extend', 'overlap-replace') # Setup logging @@ -663,7 +708,7 @@ def main(): tokenizer = load_tokenizer(model_args) model = load_model(model_args, tokenizer) - modify_model(adapter_args, data_args, model_args, tokenizer, model) + modify_model(adapter_args, sft_args, data_args, model_args, tokenizer, model) # Preprocessing the datasets. lm_datasets = get_lm_dataset(training_args, data_args, model_args, tokenizer) @@ -673,11 +718,20 @@ def main(): if training_args.do_eval: eval_dataset = lm_datasets["validation"] + # only needed for composable sft + maskable_params = [ + n for n, p in model.named_parameters() + if n.startswith(model.base_model_prefix) and p.requires_grad and not + ("wte" in n or "wpe" in n or "word_embedding" in n or "lm_head" in n or "ln_f") + ] + # Initialize our Trainer - trainer_class = AdapterTrainer if adapter_args.train_adapter else Trainer + trainer_class = trainer_class_mapping[model_args.lang_adapt_strategies] trainer = trainer_class( model=model, args=training_args, + **{'sft_args': sft_args} if 'sft' in model_args.lang_adapt_strategies else {}, + **{'maskable_params': maskable_params} if 'sft' in model_args.lang_adapt_strategies else {}, train_dataset=train_dataset if training_args.do_train else None, eval_dataset=eval_dataset if training_args.do_eval else None, tokenizer=tokenizer, @@ -690,7 +744,7 @@ def main(): # print("Embeddings at start of run:", model.get_input_embeddings().weight[250880:,:]) # get original weight for embedding layer - # orig_embeddings = model.get_input_embeddings().weight.detach().clone() # clone original weight for embedding layer + orig_embeddings = model.get_input_embeddings().weight.detach().clone() # clone original weight for embedding layer # Training if training_args.do_train: checkpoint = None @@ -725,17 +779,20 @@ def main(): trainer.log_metrics("train", metrics) trainer.save_metrics("train", metrics) trainer.save_state() - + + if 'sft' in model_args.lang_adapt_strategies: + trainer.sft().save(f'{training_args.output_dir}/') + # uncomment to test whether extending vocab gradient masking is working correctly. 
- # if model_args.embedding_strategies == "extend": - # print("Unsliced, post-training:", model.get_input_embeddings().weight) # get updated weight - # if not torch.equal(orig_embeddings[:250880, :], model.get_input_embeddings().weight[:250880, :]): - # raise ValueError("embedding layer is updated where it shouldn't....") - - # if torch.equal(orig_embeddings[250880:, :], model.get_input_embeddings().weight[250880:, :]): - # print("original embeddings:", orig_embeddings[250880:, :]) - # print("updated embeddings:", model.get_input_embeddings().weight[250880:, :]) - # raise ValueError("embedding layer is not updated where it should....") + if model_args.embedding_strategies == "extend": + print("Unsliced, post-training:", model.get_input_embeddings().weight) # get updated weight + if not torch.equal(orig_embeddings[:250880, :], model.get_input_embeddings().weight[:250880, :]): + raise ValueError("embedding layer is updated where it shouldn't....") + + if torch.equal(orig_embeddings[250880:, :], model.get_input_embeddings().weight[250880:, :]): + print("original embeddings:", orig_embeddings[250880:, :]) + print("updated embeddings:", model.get_input_embeddings().weight[250880:, :]) + raise ValueError("embedding layer is not updated where it should....") # Evaluation diff --git a/scripts/lang_adapt/run_clm_sft.sh b/scripts/lang_adapt/run_clm_sft.sh new file mode 100755 index 0000000..ccf0edc --- /dev/null +++ b/scripts/lang_adapt/run_clm_sft.sh @@ -0,0 +1,64 @@ +# axis +LANG="th" +MAX_TRAIN_SAMPLES=100000 +BIGS_MODEL="bigscience/bloom-350m" +ADPT_REDUCTION_FACTOR=16 +ADPT_STRATEGY="emb-and-sft" +EMB_STRATEGY="extend" + +tokenizer_dir=./tokenizers/tok_bloom-350m_th_oscar_10000samples_5000vocab_extend/ +cache_dir="./cache" +output_dir="./sft_testing" +logging_dir="./sft_testing" +mkdir -p $output_dir +mkdir -p $logging_dir + +CUDA_VISIBLE_DEVICES=4 python madx_run_clm.py \ + --seed 0 \ + --fp16 \ + --model_name_or_path $BIGS_MODEL \ + --tokenizer_name $tokenizer_dir \ + --dataset_name oscar \ + --dataset_config_name "unshuffled_deduplicated_$LANG" \ + --cache_dir $cache_dir \ + --logging_dir $logging_dir \ + --report_to "tensorboard" \ + --learning_rate 0.001 \ + --do_train \ + --do_eval \ + --train_sft \ + --load_best_model_at_end \ + --output_dir $output_dir \ + --preprocessing_num_workers 8 \ + --overwrite_output_dir \ + --per_device_train_batch_size 1 \ + --gradient_accumulation_steps 8 \ + --per_device_eval_batch_size 1 \ + --eval_accumulation_steps 8 \ + --eval_steps 1000 \ + --evaluation_strategy "steps" \ + --max_eval_samples 5000 \ + --logging_steps 100 \ + --save_steps 5000 \ + --save_strategy "steps" \ + --max_train_samples $MAX_TRAIN_SAMPLES \ + --max_steps 50000 \ + --lang_adapt_strategies "$ADPT_STRATEGY" \ + --embedding_strategies "$EMB_STRATEGY" \ + --adapter_reduction_factor $ADPT_REDUCTION_FACTOR \ + --language $LANG \ + --full_ft_min_steps_per_iteration 10000 \ + --sparse_ft_min_steps_per_iteration 10000 \ + --full_ft_max_steps_per_iteration 10000 \ + --sparse_ft_max_steps_per_iteration 10000 \ + --n_ft_iterations 5 \ + # --full_ft_max_epochs_per_iteration 100 \ + # --sparse_ft_max_epochs_per_iteration 100 \ + +# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.1/lib64 +# export PATH=$PATH:/usr/local/cuda-11.1/bin +# export CUDA_HOME=/usr/local/cuda-11.1 + + + + From d8638c94133347c983b5555b9bf06321c2bf3bd4 Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Wed, 29 Jun 2022 15:39:19 -0400 Subject: [PATCH 2/5] add K calculation --- 
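Note on the K calculation below (format-patch comment area, not part of the commit message): the K value sizes the sparse fine-tuning budget (sft_args.ft_params_num) so that it roughly matches the parameter count of the adapter setup it replaces -- two bottleneck projections of hidden_size x (hidden_size / reduction_factor) per transformer layer, plus hidden_size^2 / 2 as an estimate for the invertible adapters. A minimal sketch of that arithmetic (bias terms ignored, matching the hunk); the 24-layer / 1024-hidden dimensions are illustrative assumptions for bloom-350m, whereas the script itself reads them from the loaded model:

    def sft_budget(num_layers: int, hidden_size: int, reduction_factor: int) -> int:
        # each bottleneck adapter layer: down-projection (d x d/r) + up-projection (d/r x d)
        per_layer = 2 * hidden_size * (hidden_size // reduction_factor)
        # invertible adapters on the embedding side, approximated as d^2 / 2
        inv_adapters = hidden_size ** 2 // 2
        return num_layers * per_layer + inv_adapters

    print(sft_budget(num_layers=24, hidden_size=1024, reduction_factor=16))
    # 3670016 -> the value assigned to sft_args.ft_params_num

As the TODO in the hunk notes, this estimate only tracks the default adapter configuration; other adapter configs would need a different formula.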
scripts/lang_adapt/madx_run_clm.py | 32 +++++++++++++++++++++++++++++- scripts/lang_adapt/run_clm_sft.sh | 21 +++++++------------- 2 files changed, 38 insertions(+), 15 deletions(-) diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index d50ccce..f557acb 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -718,11 +718,41 @@ def main(): if training_args.do_eval: eval_dataset = lm_datasets["validation"] + # compute K value for SFT (https://arxiv.org/pdf/2110.07560.pdf) + if sft_args.train_sft and not adapter_args.train_adapter: + # override the K value if adapter_reduction_factor is set + if adapter_args.adapter_reduction_factor: + logger.info(f"Overriding K value for SFT with adapter_reduction_factor: {adapter_args.train_adapter}") + # calc appropriate K value + num_layers = len(model.transformer.h) + sft_k = num_layers * model.transformer.word_embeddings.weight.shape[1] ** 2 // adapter_args.adapter_reduction_factor * 2 #* 2 for the up and down proj + + sft_k += model.transformer.word_embeddings.weight.shape[1] ** 2 // 2 # inv adapters. TODO: if we use other adapter configs, this breaks (code works, but K no longer matches adapter budget) + + sft_args.ft_params_num = int(sft_k) + logger.info(f"K value for SFT is {sft_args.ft_params_num}") + + if adapter_args.train_adapter: + trainable_params = 0 + for name, param in model.named_parameters(): + if "adapter" in name: + print(f"🚀 Trainable layer '{name}'") + trainable_params += param.numel() + logger.info(f"adapter elements: {trainable_params}") + + num_layers = len(model.transformer.h) + sft_k = num_layers * model.transformer.word_embeddings.weight.shape[1] ** 2 // adapter_args.adapter_reduction_factor * 2 #* 2 for the up and down proj + + sft_k += model.transformer.word_embeddings.weight.shape[1] ** 2 // 2 # inv adapters. 
TODO: if we use other adapter configs, this breaks (code works, but K no longer matches adapter budget) + + sft_args.ft_params_num = int(sft_k) + logger.info(f"K value for SFT is {sft_args.ft_params_num}") + # only needed for composable sft maskable_params = [ n for n, p in model.named_parameters() if n.startswith(model.base_model_prefix) and p.requires_grad and not - ("wte" in n or "wpe" in n or "word_embedding" in n or "lm_head" in n or "ln_f") + ("wte" in n or "wpe" in n or "word_embedding" in n or "lm_head" in n) ] # Initialize our Trainer diff --git a/scripts/lang_adapt/run_clm_sft.sh b/scripts/lang_adapt/run_clm_sft.sh index ccf0edc..b1c6178 100755 --- a/scripts/lang_adapt/run_clm_sft.sh +++ b/scripts/lang_adapt/run_clm_sft.sh @@ -8,8 +8,8 @@ EMB_STRATEGY="extend" tokenizer_dir=./tokenizers/tok_bloom-350m_th_oscar_10000samples_5000vocab_extend/ cache_dir="./cache" -output_dir="./sft_testing" -logging_dir="./sft_testing" +output_dir="./sft_testing_short" +logging_dir="./sft_testing_short" mkdir -p $output_dir mkdir -p $logging_dir @@ -38,7 +38,7 @@ CUDA_VISIBLE_DEVICES=4 python madx_run_clm.py \ --eval_steps 1000 \ --evaluation_strategy "steps" \ --max_eval_samples 5000 \ - --logging_steps 100 \ + --logging_steps 10 \ --save_steps 5000 \ --save_strategy "steps" \ --max_train_samples $MAX_TRAIN_SAMPLES \ @@ -47,17 +47,10 @@ CUDA_VISIBLE_DEVICES=4 python madx_run_clm.py \ --embedding_strategies "$EMB_STRATEGY" \ --adapter_reduction_factor $ADPT_REDUCTION_FACTOR \ --language $LANG \ - --full_ft_min_steps_per_iteration 10000 \ - --sparse_ft_min_steps_per_iteration 10000 \ - --full_ft_max_steps_per_iteration 10000 \ - --sparse_ft_max_steps_per_iteration 10000 \ - --n_ft_iterations 5 \ - # --full_ft_max_epochs_per_iteration 100 \ - # --sparse_ft_max_epochs_per_iteration 100 \ - -# export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.1/lib64 -# export PATH=$PATH:/usr/local/cuda-11.1/bin -# export CUDA_HOME=/usr/local/cuda-11.1 + --full_ft_max_steps_per_iteration 200 \ + --sparse_ft_max_steps_per_iteration 200 \ + --n_ft_iterations 5 + From ed085e29ca10b5c597c9881ecd041e072c9d084f Mon Sep 17 00:00:00 2001 From: haileyschoelkopf Date: Thu, 30 Jun 2022 10:41:44 -0400 Subject: [PATCH 3/5] commit most recent sft changes --- .../efficient_ft/composable_sft/sft_args.py | 10 +++++----- scripts/lang_adapt/madx_run_clm.py | 4 ++++ scripts/lang_adapt/run_clm_sft.sh | 18 +++++++++--------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py b/scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py index 23e3bee..170c149 100644 --- a/scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py +++ b/scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py @@ -61,7 +61,7 @@ class SftArguments: ) # changed from False to True ft_params_proportion: Optional[float] = field( - default=0.01, + default=None, metadata={ "help": "The proportion of model parameters for which to learn non-zero differences during fine-tuning." }, @@ -79,25 +79,25 @@ class SftArguments: }, ) full_ft_min_steps_per_iteration: Optional[int] = field( - default=10, + default=None, metadata={ "help": "Minimum number of steps per parameter selection iteration during full fine-tuning." }, ) sparse_ft_min_steps_per_iteration: Optional[int] = field( - default=10, + default=None, metadata={ "help": "Minimum of steps per parameter selection iteration during sparse fine-tuning." 
}, ) full_ft_max_steps_per_iteration: Optional[int] = field( - default=100, + default=None, metadata={ "help": "Maximum number of steps per parameter selection iteration during full fine-tuning." }, ) sparse_ft_max_steps_per_iteration: Optional[int] = field( - default=100, + default=None, metadata={ "help": "Maximum of steps per parameter selection iteration during sparse fine-tuning." }, diff --git a/scripts/lang_adapt/madx_run_clm.py b/scripts/lang_adapt/madx_run_clm.py index f557acb..26a653a 100644 --- a/scripts/lang_adapt/madx_run_clm.py +++ b/scripts/lang_adapt/madx_run_clm.py @@ -659,6 +659,10 @@ def main(): model_args, data_args, training_args, adapter_args, sft_args = parser.parse_args_into_dataclasses() training_args.data_dir = f'{training_args.output_dir}' + + if sft_args.train_sft and training_args.max_steps: + # override sparse_ft_max_steps_per_iteration if training_args.max_steps is set + sft_args.sparse_ft_max_steps_per_iteration = training_args.max_steps assert model_args.lang_adapt_strategies in ('emb', 'emb-and-adpt', 'emb-then-adpt', 'emb-and-sft') assert model_args.embedding_strategies in ('replace', 'extend', 'overlap-replace') diff --git a/scripts/lang_adapt/run_clm_sft.sh b/scripts/lang_adapt/run_clm_sft.sh index b1c6178..78c846a 100755 --- a/scripts/lang_adapt/run_clm_sft.sh +++ b/scripts/lang_adapt/run_clm_sft.sh @@ -4,16 +4,16 @@ MAX_TRAIN_SAMPLES=100000 BIGS_MODEL="bigscience/bloom-350m" ADPT_REDUCTION_FACTOR=16 ADPT_STRATEGY="emb-and-sft" -EMB_STRATEGY="extend" +EMB_STRATEGY="replace" tokenizer_dir=./tokenizers/tok_bloom-350m_th_oscar_10000samples_5000vocab_extend/ cache_dir="./cache" -output_dir="./sft_testing_short" -logging_dir="./sft_testing_short" +output_dir="./sft_testing_save" +logging_dir="./sft_testing_save" mkdir -p $output_dir mkdir -p $logging_dir -CUDA_VISIBLE_DEVICES=4 python madx_run_clm.py \ +CUDA_VISIBLE_DEVICES=5 python madx_run_clm.py \ --seed 0 \ --fp16 \ --model_name_or_path $BIGS_MODEL \ @@ -35,10 +35,10 @@ CUDA_VISIBLE_DEVICES=4 python madx_run_clm.py \ --gradient_accumulation_steps 8 \ --per_device_eval_batch_size 1 \ --eval_accumulation_steps 8 \ - --eval_steps 1000 \ + --eval_steps 500 \ --evaluation_strategy "steps" \ --max_eval_samples 5000 \ - --logging_steps 10 \ + --logging_steps 100 \ --save_steps 5000 \ --save_strategy "steps" \ --max_train_samples $MAX_TRAIN_SAMPLES \ @@ -47,9 +47,9 @@ CUDA_VISIBLE_DEVICES=4 python madx_run_clm.py \ --embedding_strategies "$EMB_STRATEGY" \ --adapter_reduction_factor $ADPT_REDUCTION_FACTOR \ --language $LANG \ - --full_ft_max_steps_per_iteration 200 \ - --sparse_ft_max_steps_per_iteration 200 \ - --n_ft_iterations 5 + --full_ft_max_steps_per_iteration 2500 \ + --sparse_ft_max_steps_per_iteration 10000 \ + --n_ft_iterations 1 From ee1a9d154108ea11d83ef094bb3d67c3efd1abe7 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Fri, 1 Jul 2022 12:10:36 -0400 Subject: [PATCH 4/5] Delete unnecessary extra files --- scripts/lang_adapt/efficient_ft/composable_sft/__init__.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 scripts/lang_adapt/efficient_ft/composable_sft/__init__.py diff --git a/scripts/lang_adapt/efficient_ft/composable_sft/__init__.py b/scripts/lang_adapt/efficient_ft/composable_sft/__init__.py deleted file mode 100644 index a8771c6..0000000 --- a/scripts/lang_adapt/efficient_ft/composable_sft/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .sft_args import SftArguments \ No newline at end of file From 
02885679f69a9e37a29f08dbd9704e11220aa53f Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Fri, 1 Jul 2022 12:10:48 -0400 Subject: [PATCH 5/5] Delete sft_args.py --- .../efficient_ft/composable_sft/sft_args.py | 116 ------------------ 1 file changed, 116 deletions(-) delete mode 100644 scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py diff --git a/scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py b/scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py deleted file mode 100644 index 170c149..0000000 --- a/scripts/lang_adapt/efficient_ft/composable_sft/sft_args.py +++ /dev/null @@ -1,116 +0,0 @@ -from dataclasses import dataclass, field -from typing import Optional -# TODO: add attribution to Composable SFT authors/paper/code - -"""A modified version of the SftArguments class that drops unused args and changes default values for language adaptation.""" - -@dataclass -class SftArguments: - "Arguments pertaining to sparse fine-tuning configuration.""" - - train_sft: bool = field( - default=False, metadata={"help": "Whether to train sparse fine-tuning."} - ) # newly added - - lang_ft: Optional[str] = field( - default=None, metadata={"help": "Path to saved language SparseFineTuning."} - ) - task_ft: Optional[str] = field( - default=None, metadata={"help": "Path to saved task SparseFineTuning."} - ) - - sparse_ft_method: Optional[str] = field( - default='LotteryTicket', - metadata={"help": 'Sparse fine-tuning method. Can be LotteryTicket or Random.'}, - ) - - full_l1_reg: Optional[float] = field( - default=0.1, metadata={"help": "Coefficient of L1 regularisation during full fine-tuning."} - ) # changed from 0.0 to 0.1 - sparse_l1_reg: Optional[float] = field( - default=0.1, metadata={"help": "Coefficient of L1 regularisation during sparse fine-tuning."} - ) # changed from 0.0 to 0.1 - apply_reg_to_sparse_only: bool = field( - default=False, - metadata={ - "help": "If true, only applies regularisation to those parameters which are eligible for sparse fine-tuning." - }, - ) - - freeze_embeddings: bool = field( - default=False, - metadata={ - "help": "Whether to freeze embeddings." - }, - ) - freeze_head: bool = field( - default=False, - metadata={"help": "Whether to freeze language modeling head."}, - ) - untie_embeddings: bool = field( - default=False, - metadata={"help": "Whether to untie input and output embeddings."}, - ) - freeze_decoder: bool = field( - default=False, - metadata={"help": "Whether to freeze only output embeddings."}, - ) - freeze_layer_norm: bool = field( - default=True, - metadata={"help": "Whether to freeze layer normalisation parameters."}, - ) # changed from False to True - - ft_params_proportion: Optional[float] = field( - default=None, - metadata={ - "help": "The proportion of model parameters for which to learn non-zero differences during fine-tuning." - }, - ) - ft_params_num: Optional[int] = field( - default=None, - metadata={ - "help": "The number of model parameters for which to learn non-zero differences during fine-tuning." - }, - ) - n_ft_iterations: Optional[int] = field( - default=5, - metadata={ - "help": "The number of parameter selection iterations during fine-tuning." - }, - ) - full_ft_min_steps_per_iteration: Optional[int] = field( - default=None, - metadata={ - "help": "Minimum number of steps per parameter selection iteration during full fine-tuning." 
- }, - ) - sparse_ft_min_steps_per_iteration: Optional[int] = field( - default=None, - metadata={ - "help": "Minimum of steps per parameter selection iteration during sparse fine-tuning." - }, - ) - full_ft_max_steps_per_iteration: Optional[int] = field( - default=None, - metadata={ - "help": "Maximum number of steps per parameter selection iteration during full fine-tuning." - }, - ) - sparse_ft_max_steps_per_iteration: Optional[int] = field( - default=None, - metadata={ - "help": "Maximum of steps per parameter selection iteration during sparse fine-tuning." - }, - ) - full_ft_max_epochs_per_iteration: Optional[int] = field( - default=None, - metadata={ - "help": "Maximum number of epochs per parameter selection iteration during full fine-tuning." - }, - ) - sparse_ft_max_epochs_per_iteration: Optional[int] = field( - default=None, - metadata={ - "help": "Maximum number of epochs per parameter selection iteration during sparse fine-tuning." - }, - )