From cd3b981f6160d570db0fa29593d2468bc5813408 Mon Sep 17 00:00:00 2001 From: shengdinghu Date: Fri, 22 Apr 2022 19:30:05 +0800 Subject: [PATCH] update examples --- .gitignore | 9 +- examples/examples_prompt/README.md | 49 +- .../examples_prompt/configs/gen_albert.py | 116 +++++ examples/examples_prompt/configs/gen_beit.py | 450 ++++++++++++++++ examples/examples_prompt/configs/gen_bert.py | 116 +++++ examples/examples_prompt/configs/gen_gpt.py | 433 ++++++++++++++++ .../examples_prompt/configs/gen_roberta.py | 143 ++++++ examples/examples_prompt/configs/gen_t5.py | 444 ++++++++++++++++ examples/examples_prompt/run.py | 482 ------------------ examples/examples_prompt/run.sh | 7 - examples/examples_prompt/run_mlm.sh | 11 - .../examples_seq2seq/configs/config_gen_bs.py | 411 +++++++++++++++ .../configs/config_gen_bs1.py | 411 +++++++++++++++ .../configs/config_gen_bs64.py | 411 +++++++++++++++ .../configs/config_gen_bs8.py | 411 +++++++++++++++ 15 files changed, 3355 insertions(+), 549 deletions(-) create mode 100644 examples/examples_prompt/configs/gen_albert.py create mode 100644 examples/examples_prompt/configs/gen_beit.py create mode 100644 examples/examples_prompt/configs/gen_bert.py create mode 100644 examples/examples_prompt/configs/gen_gpt.py create mode 100644 examples/examples_prompt/configs/gen_roberta.py create mode 100644 examples/examples_prompt/configs/gen_t5.py delete mode 100644 examples/examples_prompt/run.py delete mode 100644 examples/examples_prompt/run.sh delete mode 100644 examples/examples_prompt/run_mlm.sh create mode 100644 examples/examples_seq2seq/configs/config_gen_bs.py create mode 100644 examples/examples_seq2seq/configs/config_gen_bs1.py create mode 100644 examples/examples_seq2seq/configs/config_gen_bs64.py create mode 100644 examples/examples_seq2seq/configs/config_gen_bs8.py diff --git a/.gitignore b/.gitignore index bd8f570..9bb4b23 100644 --- a/.gitignore +++ b/.gitignore @@ -17,18 +17,17 @@ _build/ outputs/ log.txt **/DeltaHub/ 
+**/sfs_scripts/ *beans/ -**/examples/*/configs/ - +**/examples/*/configs/* !examples/*/configs/config_gen.py **/jupyter_notebook_examples/ !examples/jupyter_notebook_examples/*.py - - -!**/examples/*/configs/*.py +!examples/*/configs/*.py **/outputs_search/**/*.bin **/outputs_search/**/*.pt + *.db **/nohup.out **/examples/examples_bmtrain/BigModels/down_data diff --git a/examples/examples_prompt/README.md b/examples/examples_prompt/README.md index 38c5b22..fdb2ded 100644 --- a/examples/examples_prompt/README.md +++ b/examples/examples_prompt/README.md @@ -10,55 +10,16 @@ This will add `examples_seq2seq` to the environment path of the python lib. ## Generating the json configuration file +```shell +python configs/gen_$BACKBONETYPE.py --job $YOURJOB +#e.g. python configs/gen_beit.py --job lora_beit-base-patch16-224 ``` -python config_gen.py --job $job_name - -``` -The available job configuration (e.g., `--job lora_t5-base`) can be seen from `config_gen.py`. You can also +The available job configuration (e.g., `--job lora_beit-base-patch16-224`) can be seen from the scripts. You can also create your only configuration. ## Run the code ``` -python run_seq2seq.py configs/$job_name/$dataset.json +CUDA_VISIBLE_DEVICES=1 python src/run.py configs/lora_beit-base-patch16-224/beans.json ``` - -## Possible Errors - -1. -``` -ValueError: You must login to the Hugging Face hub on this computer by typing `transformers-cli login` and entering your credentials to use `use_auth_token=Tr -ue`. Alternatively, you can pass your own token as the `use_auth_token` argument. -``` -- Solution 1: Please register an account on [HuggingFace](https://huggingface.co/) -Then run transformers-cli login on your command line to enter the username and password. - -- Solution 2: Disable push_to_hub by modifying in the config.json : "push_to_hub": False - -2. -``` -OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. 
Then run `git lfs install` (you only have to do this once). -``` - -- Solution 1: -``` -wget -P ~ https://github.com/git-lfs/git-lfs/releases/download/v3.0.2/git-lfs-linux-amd64-v3.0.2.tar.gz -cd ~ -tar -xvzf git-lfs-linux-amd64-v3.0.2.tar.gz -export PATH=~:$PATH -git-lfs install -``` - -- Solution 2: Disable push_to_hub by modifying in the config.json : "push_to_hub": False - - -3. dataset connection error - -Solution 1: open a python console, running the error command again, may not be useful - -Solution 2: download the dataset by yourself on a internect connected machine, saved to disk and transfer to your server, at last load_from_disk. - - -## Link to the original training scripts -This example repo is based on the [compacter training scripts](https://github.com/rabeehk/compacter), with compacter-related lines removed. Thanks to the authors of the original repo. In addition, in private correspondence with the authors, they shared the codes to create the json configs. Thanks again for their efforts. 
diff --git a/examples/examples_prompt/configs/gen_albert.py b/examples/examples_prompt/configs/gen_albert.py new file mode 100644 index 0000000..be9af6d --- /dev/null +++ b/examples/examples_prompt/configs/gen_albert.py @@ -0,0 +1,116 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} + + +#### ROBERTA###### +BaseConfigs['albert-xlarge-v2'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": 
f"{PATHBASE}albert-xlarge-v2", + "tokenizer_name": f"{PATHBASE}albert-xlarge-v2", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2']) +AllConfigs['prefix_albert-xlarge-v2'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/albert-xlarge-v2/", + }) + +AllConfigs['soft_prompt_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2']) +AllConfigs['soft_prompt_albert-xlarge-v2'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/albert-xlarge-v2/", + }) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + 
if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/examples/examples_prompt/configs/gen_beit.py b/examples/examples_prompt/configs/gen_beit.py new file mode 100644 index 0000000..9b61108 --- /dev/null +++ b/examples/examples_prompt/configs/gen_beit.py @@ -0,0 +1,450 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['beit-base-patch16-224'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip( + ["beans"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20], + [256], + [ 32], + [ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0], # *7 +[0] *8, + [200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [ 3], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + 
"model_name_or_path": f"{PATHBASE}beit-base-patch16-224", + "tokenizer_name": f"{PATHBASE}beit-base-patch16-224", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps", + "datasets_load_from_disk":False, + } + +AllConfigs['bitfit_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['bitfit_beit-base-patch16-224'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/beit-base-patch16-224/", + }) + +AllConfigs['adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['adapter_beit-base-patch16-224'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/beit-base-patch16-224/", + }) + +AllConfigs['lora_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['lora_beit-base-patch16-224'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layernorm_after", + "classifier" + ], + "modified_modules":[ + "query", + "value", + ], + "lora_r": 8, + "output_dir": "outputs/lora/beit-base-patch16-224/", + }) + +AllConfigs['compacter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['compacter_beit-base-patch16-224'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + 
"layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/beit-base-patch16-224/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['compacter++_beit-base-patch16-224'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/beit-base-patch16-224/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['low_rank_adapter_beit-base-patch16-224'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/beit-base-patch16-224/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['soft_prompt_beit-base-patch16-224'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/beit-base-patch16-224/", + }) + +AllConfigs['prefix_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['prefix_beit-base-patch16-224'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/beit-base-patch16-224/", + }) + +AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224']) +AllConfigs['soft_prompt_beit-base-patch16-224'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/beit-base-patch16-224/", + }) +#### beit-base-patch16-224 
+BaseConfigs['t5-small'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}t5-small", + "tokenizer_name": f"{PATHBASE}t5-small", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small']) +AllConfigs['prefix_t5-small'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-small/", + }) + + + + +#### ROBERTA###### +BaseConfigs['roberta-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 
100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}roberta-base", + "tokenizer_name": f"{PATHBASE}roberta-base", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + + + +AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['bitfit_roberta-base'].update({ + "delta_type": "bitfit", + "learning_rate": 1e-3, + "output_dir": "outputs/bitfit/roberta-base/", + }) + +AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['none_roberta-base'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/roberta-base/", + }) + + +AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['lora_roberta-base'].update({ + "delta_type": "lora", + "learning_rate": 1e-3, + "output_dir": "outputs/lora/roberta-base/", + }) + +AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['adapter_roberta-base'].update({ + "delta_type": "adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/adapter/roberta-base/", + }) + +AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['low_rank_adapter_roberta-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/low_rank_adapter/roberta-base/", + }) + +#### ROBERTA###### +BaseConfigs['bert-base-cased'] = { + ("job_name", "task_name", 
"eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}bert-base-cased", + "tokenizer_name": f"{PATHBASE}bert-base-cased", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['prefix_bert-base-cased'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/bert-base-cased/", + }) + +AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['soft_prompt_bert-base-cased'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bert-base-cased/", + }) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git 
a/examples/examples_prompt/configs/gen_bert.py b/examples/examples_prompt/configs/gen_bert.py new file mode 100644 index 0000000..2fbaba8 --- /dev/null +++ b/examples/examples_prompt/configs/gen_bert.py @@ -0,0 +1,116 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} + + +#### ROBERTA###### +BaseConfigs['bert-base-cased'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}bert-base-cased", + 
"tokenizer_name": f"{PATHBASE}bert-base-cased", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['prefix_bert-base-cased'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/bert-base-cased/", + }) + +AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['soft_prompt_bert-base-cased'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bert-base-cased/", + }) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + 
all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/examples/examples_prompt/configs/gen_gpt.py b/examples/examples_prompt/configs/gen_gpt.py new file mode 100644 index 0000000..d33e355 --- /dev/null +++ b/examples/examples_prompt/configs/gen_gpt.py @@ -0,0 +1,433 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['t5-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] 
* 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}t5-base", + "tokenizer_name": f"{PATHBASE}t5-base", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['bitfit_t5-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-base/", + }) + +AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['adapter_t5-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-base/", + }) + +AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['lora_t5-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-base/", + }) + +AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": 
"outputs/compacter/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter++_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['low_rank_adapter_t5-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/t5-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/t5-base/", + }) + +AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['prefix_t5-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-base/", + }) + +#### T5-base +BaseConfigs['t5-small'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", 
"superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}t5-small", + "tokenizer_name": f"{PATHBASE}t5-small", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small']) +AllConfigs['prefix_t5-small'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-small/", + }) + + + + +#### ROBERTA###### +BaseConfigs['roberta-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + 
"do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}roberta-base", + "tokenizer_name": f"{PATHBASE}roberta-base", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + + + +AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['bitfit_roberta-base'].update({ + "delta_type": "bitfit", + "learning_rate": 1e-3, + "output_dir": "outputs/bitfit/roberta-base/", + }) + +AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['none_roberta-base'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/roberta-base/", + }) + + +AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['lora_roberta-base'].update({ + "delta_type": "lora", + "learning_rate": 1e-3, + "output_dir": "outputs/lora/roberta-base/", + }) + +AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['adapter_roberta-base'].update({ + "delta_type": "adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/adapter/roberta-base/", + }) + +AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['low_rank_adapter_roberta-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/low_rank_adapter/roberta-base/", + }) + +#### ROBERTA###### +BaseConfigs['bert-base-cased'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", 
"num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}bert-base-cased", + "tokenizer_name": f"{PATHBASE}bert-base-cased", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['prefix_bert-base-cased'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/bert-base-cased/", + }) + +AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['soft_prompt_bert-base-cased'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bert-base-cased/", + }) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git 
a/examples/examples_prompt/configs/gen_roberta.py b/examples/examples_prompt/configs/gen_roberta.py new file mode 100644 index 0000000..c21baa5 --- /dev/null +++ b/examples/examples_prompt/configs/gen_roberta.py @@ -0,0 +1,143 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} + +#### ROBERTA###### +BaseConfigs['roberta-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}roberta-base", + 
"tokenizer_name": f"{PATHBASE}roberta-base", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + + + +AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['bitfit_roberta-base'].update({ + "delta_type": "bitfit", + "learning_rate": 1e-3, + "output_dir": "outputs/bitfit/roberta-base/", + }) + +AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['none_roberta-base'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/roberta-base/", + }) + + +AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['lora_roberta-base'].update({ + "delta_type": "lora", + "learning_rate": 1e-3, + "output_dir": "outputs/lora/roberta-base/", + }) + +AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['adapter_roberta-base'].update({ + "delta_type": "adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/adapter/roberta-base/", + }) + +AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['low_rank_adapter_roberta-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/low_rank_adapter/roberta-base/", + }) + + +AllConfigs['soft_prompt_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['soft_prompt_roberta-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + 
"deltas", + ], + "output_dir": "outputs/soft_prompt/roberta-base/", + }) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/examples/examples_prompt/configs/gen_t5.py b/examples/examples_prompt/configs/gen_t5.py new file mode 100644 index 0000000..b2e15a5 --- /dev/null +++ b/examples/examples_prompt/configs/gen_t5.py @@ -0,0 +1,444 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['t5-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", 
"mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}t5-base", + "tokenizer_name": f"{PATHBASE}t5-base", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['bitfit_t5-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-base/", + }) + +AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['adapter_t5-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-base/", + }) + +AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['lora_t5-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-base/", + }) + +AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter++_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['low_rank_adapter_t5-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/t5-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + 
"soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/t5-base/", + }) + +AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['prefix_t5-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-base/", + }) + +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/t5-base/", + }) +#### T5-base +BaseConfigs['t5-small'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 
200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}t5-small", + "tokenizer_name": f"{PATHBASE}t5-small", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small']) +AllConfigs['prefix_t5-small'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-small/", + }) + + + + +#### ROBERTA###### +BaseConfigs['roberta-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + 
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}roberta-base", + "tokenizer_name": f"{PATHBASE}roberta-base", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + + + +AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['bitfit_roberta-base'].update({ + "delta_type": "bitfit", + "learning_rate": 1e-3, + "output_dir": "outputs/bitfit/roberta-base/", + }) + +AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['none_roberta-base'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/roberta-base/", + }) + + +AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['lora_roberta-base'].update({ + "delta_type": "lora", + "learning_rate": 1e-3, + "output_dir": "outputs/lora/roberta-base/", + }) + +AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['adapter_roberta-base'].update({ + "delta_type": "adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/adapter/roberta-base/", + }) + +AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['low_rank_adapter_roberta-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/low_rank_adapter/roberta-base/", + }) + +#### ROBERTA###### +BaseConfigs['bert-base-cased'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", 
"superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}bert-base-cased", + "tokenizer_name": f"{PATHBASE}bert-base-cased", + "save_total_limit": 1, + # For glue datasets. + "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['prefix_bert-base-cased'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/bert-base-cased/", + }) + +AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) +AllConfigs['soft_prompt_bert-base-cased'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bert-base-cased/", + }) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git 
a/examples/examples_prompt/run.py b/examples/examples_prompt/run.py deleted file mode 100644 index 962f608..0000000 --- a/examples/examples_prompt/run.py +++ /dev/null @@ -1,482 +0,0 @@ -# coding=utf-8 -# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Fine-tuning the library models for sequence to sequence. -""" -# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments. 
-import functools -import logging -from opendelta.utils.delta_center import create_hub_repo_name -import torch -import os -os.environ['MKL_THREADING_LAYER'] = 'GNU' -os.environ['MKL_SERVICE_FORCE_INTEL'] = '1' -import sys -import subprocess -from typing import Optional, List - -from datasets import load_dataset, load_metric, concatenate_datasets -import transformers -from transformers import ( - AutoConfig, - AutoModelForSeq2SeqLM, - AutoTokenizer, - HfArgumentParser, - MBartTokenizer, - default_data_collator, - set_seed, -) -from transformers.trainer_utils import is_main_process, get_last_checkpoint -# from ..seq2seq.utils import get_adapter_config -from examples_prompt.data_processors import AutoTask, TaskDataCollatorForSeq2Seq, AutoPostProcessor -from examples_prompt.seq2seq_trainer import Seq2SeqTrainer -# from training_args import AdapterTrainingArguments -from examples_prompt.trainers.trainer_utils import save_training_config -from dataclasses import dataclass, field - -from transformers.models.t5.modeling_t5 import T5Config, T5ForConditionalGeneration -from examples_prompt.utils.args import ModelArguments -from examples_prompt.trainers.trainer_args import TrainingArguments, DataTrainingArguments -from transformers.trainer import Trainer -from examples_prompt.metrics.metrics import transform_for_generation -import json -logger = logging.getLogger(__name__) - - - -TASK_TO_METRICS = {"mrpc": ["accuracy", "f1"], - "cola": ['matthews_correlation'], - "stsb": ['pearson', 'spearmanr'], - 'sst2': ['accuracy'], - "mnli": ["accuracy"], - "mnli_mismatched": ["accuracy"], - "mnli_matched": ["accuracy"], - "qnli": ["accuracy"], - "rte": ["accuracy"], - "wnli": ["accuracy"], - "qqp": ["accuracy", "f1"], - "superglue-boolq": ["accuracy"], - "superglue-rte": ["accuracy"], - "superglue-cb": ["f1_multiclass", "accuracy"], - "superglue-copa": ["accuracy"], - "superglue-multirc": ["f1", "em"], - "superglue-wic": ["accuracy"], - "superglue-wsc.fixed": ["accuracy"], - 
"superglue-record": ["f1", "em"] - } - - -class RemainArgHfArgumentParser(HfArgumentParser): - def parse_json_file(self, json_file: str, return_remaining_args=True ): - """ - Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the - dataclass types. - """ - import argparse - import json - from pathlib import Path - import dataclasses - - data = json.loads(Path(json_file).read_text()) - outputs = [] - for dtype in self.dataclass_types: - keys = {f.name for f in dataclasses.fields(dtype) if f.init} - inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys} - obj = dtype(**inputs) - outputs.append(obj) - - remain_args = argparse.ArgumentParser() - remain_args.__dict__.update(data) - if return_remaining_args: - return (*outputs, remain_args) - else: - return (*outputs,) - - - -def main(): - - # See all possible arguments in src/transformers/training_args.py - # or by passing the --help flag to this script. - # We now keep distinct sets of args, for a cleaner separation of concerns. - parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses(return_remaining_strings=True) - - - # Detecting last checkpoint. 
- last_checkpoint = None - if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: - last_checkpoint = get_last_checkpoint(training_args.output_dir) - print("#### last_checkpoint ", last_checkpoint) - if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: - ''' - raise ValueError( - f"Output directory ({training_args.output_dir}) already exists and is not empty. " - "Use --overwrite_output_dir to overcome." - ) - ''' - pass - elif last_checkpoint is not None: - logger.info( - f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " - "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." - ) - - # Setup logging - logging.basicConfig( - format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", - datefmt="%m/%d/%Y %H:%M:%S", - handlers=[logging.StreamHandler(sys.stdout)], - ) - logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) - - # Log on each process the small summary: - logger.warning( - f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" - + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" - ) - # Set the verbosity to info of the Transformers logger (on main process only): - if is_main_process(training_args.local_rank): - transformers.utils.logging.set_verbosity_info() - logger.info("Training/evaluation parameters %s", training_args) - - # Set seed before initializing model. - set_seed(training_args.seed) - - # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below) - # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ - # (the dataset will be downloaded automatically from the datasets Hub). 
- # - # For CSV/JSON files in the summarization task, this script will use the first column for the full texts and the - # second column for the summaries (unless you specify column names for this with the `text_column` and - # `summary_column` arguments). - # For translation, only JSON files are supported, with one field named "translation" containing two keys for the - # source and target languages (unless you adapt what follows). - # - # In distributed training, the load_dataset function guarantee that only one local process can concurrently - # download the dataset. - # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at - # https://huggingface.co/docs/datasets/loading_datasets.html. - - # Load pretrained model and tokenizer - # - # Distributed training: - # The .from_pretrained methods guarantee that only one local process can concurrently - # download model & vocab. - config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - config.dropout_rate = 0.0 - tokenizer = AutoTokenizer.from_pretrained( - model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, - use_fast=model_args.use_fast_tokenizer, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_args.model_name_or_path, - from_tf=bool(".ckpt" in model_args.model_name_or_path), - config=config, - cache_dir=model_args.cache_dir, - revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) - model.resize_token_embeddings(len(tokenizer)) - - - if delta_args.delta_type.lower() != "none": - from opendelta import 
AutoDeltaConfig,AutoDeltaModel - delta_config = AutoDeltaConfig.from_dict(vars(delta_args)) - delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model) - delta_model.freeze_module(set_state_dict = True) - delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) - - - # model parallelize - if hasattr(training_args, "model_parallel") and training_args.model_parallel: - logger.info('parallelize model!') - model.parallelize() - - data_args.dataset_name = [data_args.task_name] - data_args.eval_dataset_name = [data_args.eval_dataset_name] - data_args.test_dataset_name = [data_args.test_dataset_name] - data_args.dataset_config_name = [data_args.dataset_config_name] - data_args.eval_dataset_config_name = [data_args.eval_dataset_config_name] - data_args.test_dataset_config_name = [data_args.test_dataset_config_name] - assert len(data_args.dataset_name) == len(data_args.dataset_config_name) - if data_args.eval_dataset_name is not None: - assert len(data_args.eval_dataset_name) == len(data_args.eval_dataset_config_name) - if data_args.test_dataset_name is not None: - assert len(data_args.test_dataset_name) == len(data_args.test_dataset_config_name) - - # Temporarily set max_target_length for training. 
- #max_target_length = data_args.max_target_length - padding = "max_length" if data_args.pad_to_max_length else False - - def preprocess_function(examples, max_target_length): - # max_target_length += 1 - model_inputs = tokenizer([s+"" for s in examples['source']], max_length=data_args.max_source_length, - padding=padding, truncation=True) - # # Setup the tokenizer for targets - with tokenizer.as_target_tokenizer(): - labels = tokenizer([t for t in examples['target']], max_length=max_target_length, padding=padding, truncation=True) - # model_inputs = tokenizer([s for s in examples['source']], max_length=data_args.max_source_length, - # padding=padding, truncation=True) - # Setup the tokenizer for targets - # with tokenizer.as_target_tokenizer(): - # labels = tokenizer([t for t in examples['target']], max_length=max_target_length, padding=padding, truncation=True) - # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore - # padding in the loss. 
- if padding == "max_length" and data_args.ignore_pad_token_for_loss: - labels["input_ids"] = [ - [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] - ] - model_inputs["labels"] = labels["input_ids"] - model_inputs["extra_fields"] = examples['extra_fields'] - return model_inputs - - column_names = ['source', 'target', 'label', 'extra_fields'] - performance_metrics = {} - if training_args.do_train: - train_datasets = [AutoTask.get(dataset_name, - dataset_config_name, - seed=data_args.data_seed).get( - split="train", - split_validation_test=training_args.split_validation_test, - add_prefix=True, - n_obs=data_args.max_train_samples) - for dataset_name, dataset_config_name\ - in zip(data_args.dataset_name, data_args.dataset_config_name)] - max_target_lengths = [AutoTask.get(dataset_name, dataset_config_name).get_max_target_length(\ - tokenizer=tokenizer, default_max_length=data_args.max_target_length)\ - for dataset_name, dataset_config_name in zip(data_args.dataset_name, data_args.dataset_config_name)] - for i, train_dataset in enumerate(train_datasets): - train_datasets[i] = train_datasets[i].map( - functools.partial(preprocess_function, max_target_length=max_target_lengths[i]), - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, # if train_dataset != "superglue-record" else column_names+["answers"], - load_from_cache_file=not data_args.overwrite_cache, - ) - train_dataset = concatenate_datasets(train_datasets) - print(f"Train dataset size {len(train_dataset)}") - - if training_args.do_eval: - eval_datasets = {eval_dataset: AutoTask.get(eval_dataset, eval_dataset_config, - seed=data_args.data_seed).get( - split="validation", - split_validation_test=training_args.split_validation_test, - add_prefix=True, - n_obs=data_args.max_val_samples) - for eval_dataset, eval_dataset_config in zip(data_args.eval_dataset_name, data_args.eval_dataset_config_name)} - max_target_lengths = 
[AutoTask.get(dataset_name, dataset_config_name).get_max_target_length( \ - tokenizer=tokenizer, default_max_length=data_args.max_target_length) \ - for dataset_name, dataset_config_name in zip(data_args.eval_dataset_name, data_args.eval_dataset_config_name)] - for k, name in enumerate(eval_datasets): - eval_datasets[name] = eval_datasets[name].map( - functools.partial(preprocess_function, max_target_length=max_target_lengths[k]), - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, # if name != "superglue-record" else column_names+["answers"], - load_from_cache_file=not data_args.overwrite_cache, - ) - - if training_args.do_test: - test_datasets = {test_dataset: AutoTask.get(test_dataset, test_dataset_config, - seed=data_args.data_seed).get( - split="test", - split_validation_test=training_args.split_validation_test, - add_prefix=True, - n_obs=data_args.max_test_samples) - for test_dataset, test_dataset_config in zip(data_args.test_dataset_name, data_args.test_dataset_config_name)} - max_target_lengths = [AutoTask.get(dataset_name, dataset_config_name).get_max_target_length( \ - tokenizer=tokenizer, default_max_length=data_args.max_target_length) \ - for dataset_name, dataset_config_name in zip(data_args.test_dataset_name, data_args.test_dataset_config_name)] - for k, name in enumerate(test_datasets): - test_datasets[name] = test_datasets[name].map( - functools.partial(preprocess_function, max_target_length=max_target_lengths[k]), - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=column_names, - load_from_cache_file=not data_args.overwrite_cache, - ) - - # Data collator - label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id - if data_args.pad_to_max_length: - data_collator = default_data_collator - else: - data_collator = TaskDataCollatorForSeq2Seq( - tokenizer, - label_pad_token_id=label_pad_token_id, - pad_to_multiple_of=8 if training_args.fp16 else 
None, - ) - - - # Metric, we assume we have only one training task. - eval_metrics = [AutoTask.get(dataset_name, dataset_config_name).metric\ - for dataset_name, dataset_config_name in zip(data_args.dataset_name, data_args.dataset_config_name)][0] - - # Extracts the extra information needed to evaluate on each dataset. - # These information are only used in the compute_metrics. - # We will assume that the test/eval dataloader does not change the order of - # the data. - data_info = {"eval": eval_datasets[data_args.eval_dataset_name[0]]['extra_fields'], - "test": test_datasets[data_args.test_dataset_name[0]]['extra_fields'], - "train": train_dataset['extra_fields']} - def compute_metrics(eval_preds): - preds, labels, data_info = eval_preds - post_processor = AutoPostProcessor.get(data_args.dataset_name[0], tokenizer, - data_args.ignore_pad_token_for_loss) - decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info) - decoded_preds, decoded_labels = transform_for_generation(decoded_preds, decoded_labels) - result = {} - for metric in eval_metrics: - result.update(metric(decoded_preds, decoded_labels)) - return result - - - # Initialize our Trainer - if training_args.is_seq2seq == True: - trainer = Seq2SeqTrainer( - model=model, - args=training_args, - delta_args=delta_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=list(eval_datasets.values())[0] if training_args.do_eval else None, - data_info = data_info, - tokenizer=tokenizer, - data_collator=data_collator, - compute_metrics=compute_metrics if training_args.predict_with_generate else None, - evaluation_metrics = TASK_TO_METRICS[data_args.dataset_name[0]], - ) - else: - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_dataset if training_args.do_train else None, - eval_dataset=eval_dataset if training_args.do_eval else None, - compute_metrics=compute_metrics, - tokenizer=tokenizer, - data_collator=data_collator, - ) - - - # 
Saves training config. - if trainer.is_world_process_zero(): - os.makedirs(training_args.output_dir, exist_ok=True) - save_training_config(sys.argv[1], training_args.output_dir) - - # Training - if training_args.do_train: - checkpoint = None - if training_args.resume_from_checkpoint is not None: - checkpoint = training_args.resume_from_checkpoint - elif last_checkpoint is not None: - checkpoint = last_checkpoint - - if training_args.compute_time: - torch.cuda.synchronize() # wait for move to complete - start = torch.cuda.Event(enable_timing=True) - end = torch.cuda.Event(enable_timing=True) - start.record() - - train_result = trainer.train(resume_from_checkpoint=checkpoint) - - if training_args.compute_time: - end.record() - torch.cuda.synchronize() # wait for all_reduce to complete - total_time = start.elapsed_time(end)/(1000*60) - performance_metrics.update({"total_time in minutes ": total_time}) - - trainer.save_model() # Saves the tokenizer too for easy upload - train_metrics = train_result.metrics - max_train_samples = ( - data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) - ) - train_metrics["train_samples"] = min(max_train_samples, len(train_dataset)) - trainer.log_metrics("train", train_metrics) - trainer.save_metrics("train", train_metrics) - trainer.save_state() - - if torch.cuda.is_available() and training_args.compute_memory: - peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2)/1000 - print( - "Memory utilization", - peak_memory, - "GB" - ) - performance_metrics.update({"peak_memory": peak_memory}) - if training_args.compute_memory or training_args.compute_time: - print(performance_metrics) - trainer.save_metrics("performance", performance_metrics) - - # Evaluation - results = {} - if training_args.do_eval: - logger.info("*** Evaluate ***") - for task, eval_dataset in eval_datasets.items(): - metrics = trainer.evaluate(eval_dataset=eval_dataset, - max_length=data_args.val_max_target_length, 
num_beams=data_args.num_beams, - ) - trainer.log_metrics("eval", metrics) - trainer.save_metrics("eval", metrics) - results['evaluate'] = metrics - - # Test - if training_args.do_test: - logger.info("*** Test ***") - for task, test_dataset in test_datasets.items(): - metrics = trainer.evaluate(eval_dataset=test_dataset, - max_length=data_args.test_max_target_length, num_beams=data_args.num_beams, - metric_key_prefix="test" - ) - trainer.log_metrics("test", metrics) - trainer.save_metrics("test", metrics) - results['test'] = metrics - - repo_name = create_hub_repo_name(root="DeltaHub", - dataset=data_args.task_name, - delta_type = delta_args.delta_type, - model_name_or_path= model_args.model_name_or_path) - results['repo_name'] = repo_name - if delta_args.delta_type.lower() != "none": - if training_args.push_to_hub: # TODO add description here - delta_model.save_finetuned(push_to_hub=True, save_directory=repo_name, use_auth_token=True) - # trainer.push_to_hub(**kwargs) - else: - delta_model.save_finetuned(push_to_hub=False, save_directory=repo_name, use_auth_token=True) - - with open(f"{training_args.output_dir}/results.json", 'w') as fout: - string = json.dumps(results, indent=4,sort_keys=True) - fout.write(string+"\n") - - return results - - - - -if __name__ == "__main__": - result = main() - diff --git a/examples/examples_prompt/run.sh b/examples/examples_prompt/run.sh deleted file mode 100644 index 9b2781e..0000000 --- a/examples/examples_prompt/run.sh +++ /dev/null @@ -1,7 +0,0 @@ -files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed) -for ((i=$1; i<=$2; i++)) -do - dataset=${files[i]} - echo "id$i:$dataset" - TOKENIZERS_PARALLELISM=false python run.py configs/$3/$dataset.json -done \ No newline at end of file diff --git a/examples/examples_prompt/run_mlm.sh b/examples/examples_prompt/run_mlm.sh deleted file mode 100644 index 0836f20..0000000 --- 
a/examples/examples_prompt/run_mlm.sh +++ /dev/null @@ -1,11 +0,0 @@ - -python configs/config_gen.py --job $3 -echo "Regenerate config" - -files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed) -for ((i=$1; i<=$2; i++)) -do - dataset=${files[i]} - echo "id$i:$dataset" - TOKENIZERS_PARALLELISM=false python run_mlm.py configs/$3/$dataset.json -done \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/config_gen_bs.py b/examples/examples_seq2seq/configs/config_gen_bs.py new file mode 100644 index 0000000..4cf3c8e --- /dev/null +++ b/examples/examples_seq2seq/configs/config_gen_bs.py @@ -0,0 +1,411 @@ +import collections +import copy + +BS = 1 +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['t5-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 
128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "t5-base", + "tokenizer_name": "t5-base", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +BaseConfigs['t5-large'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", 
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "/home/hushengding/plm_cache/t5-large", + "tokenizer_name": "/home/hushengding/plm_cache/t5-large", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +BaseConfigs['t5-3b'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "/home/hushengding/plm_cache/t5-3b", + "tokenizer_name": "/home/hushengding/plm_cache/t5-3b", + "save_total_limit": 1, + # For glue 
datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['bitfit_t5-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-base/", + }) + + + +AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['adapter_t5-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-base/", + }) + +AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['lora_t5-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-base/", + }) + +AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter++_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['low_rank_adapter_t5-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/t5-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + 
"soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/t5-base/", + }) + +AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['prefix_t5-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-base/", + }) + +AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['none_t5-base'].update({ + "delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-base/", + }) + +AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['bitfit_t5-large'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-large/", + }) + +AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['none_t5-large'].update({ + "delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-large/", + }) + + +AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['bitfit_t5-3b'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-3b/", + }) + +AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['none_t5-3b'].update({ + "delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-3b/", + }) + +AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['adapter_t5-3b'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-3b/", + }) + +AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['adapter_t5-large'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + 
"output_dir": "outputs/adapter/t5-large/", + }) + +AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['lora_t5-large'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-large/", + }) + +AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['lora_t5-3b'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-3b/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"./{args.job}_{BS}/"): + os.mkdir(f"./{args.job}_{BS}/") + + for job_name in all_config_jsons: + with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + + \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/config_gen_bs1.py b/examples/examples_seq2seq/configs/config_gen_bs1.py new file mode 100644 index 0000000..4cf3c8e --- /dev/null +++ 
b/examples/examples_seq2seq/configs/config_gen_bs1.py @@ -0,0 +1,411 @@ +import collections +import copy + +BS = 1 +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['t5-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "t5-base", + "tokenizer_name": "t5-base", + "save_total_limit": 1, + # For glue datasets. 
+ "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +BaseConfigs['t5-large'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 
200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "/home/hushengding/plm_cache/t5-large", + "tokenizer_name": "/home/hushengding/plm_cache/t5-large", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +BaseConfigs['t5-3b'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 
32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "/home/hushengding/plm_cache/t5-3b", + "tokenizer_name": "/home/hushengding/plm_cache/t5-3b", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['bitfit_t5-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-base/", + }) + + + +AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['adapter_t5-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-base/", + }) + +AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['lora_t5-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-base/", + }) + +AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + 
"unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter++_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['low_rank_adapter_t5-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/t5-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/t5-base/", + }) + +AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['prefix_t5-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-base/", + }) + +AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['none_t5-base'].update({ + "delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-base/", + }) + +AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['bitfit_t5-large'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-large/", + }) + +AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['none_t5-large'].update({ + 
"delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-large/", + }) + + +AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['bitfit_t5-3b'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-3b/", + }) + +AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['none_t5-3b'].update({ + "delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-3b/", + }) + +AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['adapter_t5-3b'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-3b/", + }) + +AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['adapter_t5-large'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-large/", + }) + +AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['lora_t5-large'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-large/", + }) + +AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['lora_t5-3b'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-3b/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in 
config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"./{args.job}_{BS}/"): + os.mkdir(f"./{args.job}_{BS}/") + + for job_name in all_config_jsons: + with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + + \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/config_gen_bs64.py b/examples/examples_seq2seq/configs/config_gen_bs64.py new file mode 100644 index 0000000..90426fe --- /dev/null +++ b/examples/examples_seq2seq/configs/config_gen_bs64.py @@ -0,0 +1,411 @@ +import collections +import copy + +BS = 64 +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['t5-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", 
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "t5-base", + "tokenizer_name": "t5-base", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +BaseConfigs['t5-large'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "/home/hushengding/plm_cache/t5-large", + "tokenizer_name": "/home/hushengding/plm_cache/t5-large", + "save_total_limit": 1, + # 
For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +BaseConfigs['t5-3b'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 
100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": "/home/hushengding/plm_cache/t5-3b", + "tokenizer_name": "/home/hushengding/plm_cache/t5-3b", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['bitfit_t5-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-base/", + }) + + + +AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['adapter_t5-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-base/", + }) + +AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['lora_t5-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-base/", + }) + +AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['compacter++_t5-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/t5-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['low_rank_adapter_t5-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/t5-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + 
"soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/t5-base/", + }) + +AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['prefix_t5-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-base/", + }) + +AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['none_t5-base'].update({ + "delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-base/", + }) + +AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['bitfit_t5-large'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-large/", + }) + +AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['none_t5-large'].update({ + "delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-large/", + }) + + +AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['bitfit_t5-3b'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/t5-3b/", + }) + +AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['none_t5-3b'].update({ + "delta_type": "none", + "learning_rate": 3e-5, + "output_dir": "outputs/none/t5-3b/", + }) + +AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['adapter_t5-3b'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/t5-3b/", + }) + +AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['adapter_t5-large'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + 
"output_dir": "outputs/adapter/t5-large/", + }) + +AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) +AllConfigs['lora_t5-large'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-large/", + }) + +AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) +AllConfigs['lora_t5-3b'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/t5-3b/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"./{args.job}_{BS}/"): + os.mkdir(f"./{args.job}_{BS}/") + + for job_name in all_config_jsons: + with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + + \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/config_gen_bs8.py b/examples/examples_seq2seq/configs/config_gen_bs8.py new file mode 100644 index 0000000..5e48edb --- /dev/null +++ 
"""Generate per-task JSON training configurations (config_gen_bs8.py).

Each entry of ``AllConfigs`` describes one job (a delta-tuning method applied
to a T5 backbone).  A job holds two kinds of keys:

* a single *tuple* key naming the per-task fields, whose value is a list of
  rows (one row per task, aligned with the names in the tuple);
* plain string keys whose values are shared by every task of the job.

Running the script with ``--job NAME`` expands the job into one JSON file per
task under ``./NAME_<BS>/``.
"""
import argparse
import collections
import copy
import json
import os

# Batch size applied to the last eight tasks of the sweep (mrpc ... stsb).
BS = 8
AllConfigs = {}

BaseConfigs = {}

# Names of the per-task fields; the first one ("job_name") identifies the task
# and is used as the output file name rather than being written into the JSON.
_SWEEP_FIELDS = (
    "job_name", "task_name", "eval_dataset_name", "test_dataset_name",
    "num_train_epochs", "max_source_length",
    "per_device_train_batch_size", "per_device_eval_batch_size",
    "warmup_steps", "save_steps", "eval_steps",
)

_TASKS = [
    "superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic",
    "superglue-multirc", "superglue-record", "superglue-wsc.fixed",
    "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb",
]

# Unfrozen module set shared by the adapter-like methods.
_NORM_UNFROZEN = ("deltas", "layer_norm", "final_layer_norm")

# Settings shared by the "compacter" and "compacter++" jobs.
_COMPACTER_SETTINGS = {
    "non_linearity": "gelu_new",

    # Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
}


def _task_sweep():
    """Return the per-task rows aligned with ``_SWEEP_FIELDS``.

    The rows are materialized into a list (instead of a one-shot ``zip``
    iterator) so that a configuration can be expanded more than once.
    """
    batch_sizes = [32, 32, 32, 32, 32, 16, 32] + [BS] * 8
    steps = [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100]
    return list(zip(
        _TASKS,  # job_name
        _TASKS,  # task_name
        _TASKS,  # eval_dataset_name
        _TASKS,  # test_dataset_name
        [20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],  # num_train_epochs
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],  # max_source_length
        batch_sizes,  # per_device_train_batch_size
        batch_sizes,  # per_device_eval_batch_size
        [0] * 7 + [0] * 8,  # warmup_steps
        steps,  # save_steps
        steps,  # eval_steps
    ))


def _base_config(model_path):
    """Build the backbone-level configuration for the model at *model_path*.

    *model_path* is used both as ``model_name_or_path`` and ``tokenizer_name``.
    """
    return {
        _SWEEP_FIELDS: _task_sweep(),
        "do_train": True,
        "do_eval": True,
        "do_test": True,

        "model_name_or_path": model_path,
        "tokenizer_name": model_path,
        "save_total_limit": 1,
        # For glue datasets.
        "split_validation_test": True,
        "seed": 42,
        "dataset_config_name": ["en"],
        "eval_dataset_config_name": ["en"],
        "test_dataset_config_name": ["en"],
        # other configurations.
        "predict_with_generate": True,
        # To evaluate during training.
        "load_best_model_at_end": True,
        "metric_for_best_model": "average_metrics",
        "greater_is_better": True,
        "evaluation_strategy": "steps",
        "overwrite_output_dir": True,
        "push_to_hub": True,
        "save_strategy": "steps",
    }


def _register(job, backbone, overrides):
    """Store ``AllConfigs[job]`` as a deep copy of the backbone config plus *overrides*."""
    job_config = copy.deepcopy(BaseConfigs[backbone])
    job_config.update(overrides)
    AllConfigs[job] = job_config


BaseConfigs['t5-base'] = _base_config("t5-base")
# NOTE(review): the large backbones point at a user-specific local cache path.
BaseConfigs['t5-large'] = _base_config("/home/hushengding/plm_cache/t5-large")
BaseConfigs['t5-3b'] = _base_config("/home/hushengding/plm_cache/t5-3b")

_register('bitfit_t5-base', 't5-base', {
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-base/",
})

_register('adapter_t5-base', 't5-base', {
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": list(_NORM_UNFROZEN),
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-base/",
})

_register('lora_t5-base', 't5-base', {
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": list(_NORM_UNFROZEN),
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-base/",
})

_register('compacter_t5-base', 't5-base', {
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "unfrozen_modules": list(_NORM_UNFROZEN),
    "output_dir": "outputs/compacter/t5-base/",
    **_COMPACTER_SETTINGS,
})

_register('compacter++_t5-base', 't5-base', {
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "do_train": True,
    "do_eval": True,
    "do_test": True,
    "modified_modules": [
        "DenseReluDense"
    ],
    "unfrozen_modules": list(_NORM_UNFROZEN),
    "output_dir": "outputs/compacter++/t5-base/",
    **_COMPACTER_SETTINGS,
})

_register('low_rank_adapter_t5-base', 't5-base', {
    "delta_type": "low_rank_adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": list(_NORM_UNFROZEN),
    "output_dir": "outputs/low_rank_adapter/t5-base/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})

_register('soft_prompt_t5-base', 't5-base', {
    "delta_type": "soft_prompt",
    "learning_rate": 3e-2,
    "soft_token_num": 100,
    "token_init": False,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/t5-base/",
})

_register('prefix_t5-base', 't5-base', {
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/t5-base/",
})

_register('none_t5-base', 't5-base', {
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-base/",
})

_register('bitfit_t5-large', 't5-large', {
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-large/",
})

_register('none_t5-large', 't5-large', {
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-large/",
})

_register('bitfit_t5-3b', 't5-3b', {
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-3b/",
})

_register('none_t5-3b', 't5-3b', {
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-3b/",
})

_register('adapter_t5-3b', 't5-3b', {
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": list(_NORM_UNFROZEN),
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-3b/",
})

_register('adapter_t5-large', 't5-large', {
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": list(_NORM_UNFROZEN),
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-large/",
})

_register('lora_t5-large', 't5-large', {
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": list(_NORM_UNFROZEN),
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-large/",
})

_register('lora_t5-3b', 't5-3b', {
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": list(_NORM_UNFROZEN),
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-3b/",
})


def build_job_configs(config):
    """Expand one job configuration into a dict of per-task configurations.

    Args:
        config: a job entry such as ``AllConfigs["lora_t5-base"]``.  Tuple keys
            hold per-task rows; string keys hold values shared by all tasks.

    Returns:
        A dict mapping each task's job name (the first element of its row) to
        the flat configuration dict for that task.  ``output_dir`` has the job
        name appended; the ``job_name`` field itself is not included.
    """
    sweep_keys = [key for key in config if isinstance(key, tuple)]

    job_configs = {}
    for key_tuple in sweep_keys:
        for row in config[key_tuple]:
            # The first column of every row is the task's job name.
            job_name = row[0]
            job_configs[job_name] = {
                field: value
                for field, value in zip(key_tuple, row)
                if field != 'job_name'
            }

    for key, value in config.items():
        if isinstance(key, tuple):
            continue
        for job_name, job_config in job_configs.items():
            if key == "output_dir":
                # Give every task its own sub-directory of the job output dir.
                job_config[key] = value + job_name
            else:
                job_config[key] = value
    return job_configs


def main(argv=None):
    """Parse ``--job`` and write one JSON file per task into ``./<job>_<BS>/``.

    Args:
        argv: optional argument list; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Parser to generate configuration")
    parser.add_argument("--job", type=str, required=True, choices=sorted(AllConfigs),
                        help="name of the job configuration to expand")
    args = parser.parse_args(argv)

    all_config_jsons = build_job_configs(AllConfigs[args.job])

    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(f"./{args.job}_{BS}/", exist_ok=True)

    for job_name, job_config in all_config_jsons.items():
        with open(f"./{args.job}_{BS}/{job_name}.json", 'w', encoding="utf-8") as fout:
            json.dump(job_config, fout, indent=4, sort_keys=True)


if __name__ == "__main__":
    main()