update examples

parent 42747f2b81
commit cd3b981f61
@@ -17,18 +17,17 @@ _build/
outputs/
log.txt
**/DeltaHub/
**/sfs_scripts/
*beans/
**/examples/*/configs/

**/examples/*/configs/*
!examples/*/configs/config_gen.py
**/jupyter_notebook_examples/
!examples/jupyter_notebook_examples/*.py

!**/examples/*/configs/*.py
!examples/*/configs/*.py
**/outputs_search/**/*.bin
**/outputs_search/**/*.pt

*.db
**/nohup.out
**/examples/examples_bmtrain/BigModels/down_data
@@ -10,55 +10,16 @@ This will add `examples_seq2seq` to the environment path of the python lib.

## Generating the json configuration file

```shell
python configs/gen_$BACKBONETYPE.py --job $YOURJOB
# e.g. python configs/gen_beit.py --job lora_beit-base-patch16-224
```
```shell
python config_gen.py --job $job_name
```

The available job configurations (e.g., `--job lora_t5-base`) can be seen in `config_gen.py`. You can also
The available job configurations (e.g., `--job lora_beit-base-patch16-224`) can be seen in the generation scripts. You can also
create your own configuration, as shown in the sketch below.
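For example, a custom job can be registered inside the relevant generation script (e.g. `configs/gen_beit.py`) by deep-copying a base config and overriding only the delta-specific fields, following the same pattern as the existing `AllConfigs` entries. A minimal sketch, assuming it is appended after `BaseConfigs['beit-base-patch16-224']` is defined; the job name is illustrative and not shipped with the repo:

```python
import copy

# Hypothetical custom job: an adapter variant derived from the shipped beit-base config.
AllConfigs['my_adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['my_adapter_beit-base-patch16-224'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "bottleneck_dim": 24,
    "unfrozen_modules": ["deltas", "layer_norm", "final_layer_norm"],
    "output_dir": "outputs/my_adapter/beit-base-patch16-224/",
})
```

Running the script with `--job my_adapter_beit-base-patch16-224` then writes one JSON file per task listed in the zipped tuple key into `configs/my_adapter_beit-base-patch16-224/`.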

## Run the code

```
python run_seq2seq.py configs/$job_name/$dataset.json
CUDA_VISIBLE_DEVICES=1 python src/run.py configs/lora_beit-base-patch16-224/beans.json
```

## Possible Errors

1.
```
ValueError: You must login to the Hugging Face hub on this computer by typing `transformers-cli login` and entering your credentials to use `use_auth_token=True`. Alternatively, you can pass your own token as the `use_auth_token` argument.
```
- Solution 1: Register an account on [HuggingFace](https://huggingface.co/), then run `transformers-cli login` on your command line and enter your username and password.

- Solution 2: Disable pushing to the hub by setting `"push_to_hub": false` in the config JSON.

2.
```
OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).
```

- Solution 1:
```
wget -P ~ https://github.com/git-lfs/git-lfs/releases/download/v3.0.2/git-lfs-linux-amd64-v3.0.2.tar.gz
cd ~
tar -xvzf git-lfs-linux-amd64-v3.0.2.tar.gz
export PATH=~:$PATH
git-lfs install
```

- Solution 2: Disable pushing to the hub by setting `"push_to_hub": false` in the config JSON.

3. Dataset connection error

Solution 1: Open a Python console and rerun the command that raised the error; this may not always help.

Solution 2: Download the dataset yourself on an internet-connected machine, save it to disk, transfer it to your server, and finally load it with `load_from_disk` (see the sketch below).
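A minimal sketch of that offline workflow with the Hugging Face `datasets` API (the dataset name and the paths below are placeholders):

```python
# On a machine WITH internet access: download once and serialize to disk.
from datasets import load_dataset, load_from_disk

dataset = load_dataset("beans")            # any dataset required by your config
dataset.save_to_disk("/tmp/beans_local")   # copy this directory to the offline server

# On the offline server: load the serialized copy instead of downloading.
dataset = load_from_disk("/data/beans_local")
print(dataset)                             # shows the available splits
```

Some of the generated configs in this commit (e.g. the beit one) also carry a `datasets_load_from_disk` flag, which suggests the training scripts can be pointed at such an on-disk copy; check the data-loading code for the exact behavior.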

## Link to the original training scripts

This example repo is based on the [compacter training scripts](https://github.com/rabeehk/compacter), with the compacter-related lines removed. Thanks to the authors of the original repo. In addition, in private correspondence the authors shared the code used to create the json configs. Thanks again for their efforts.
@@ -0,0 +1,116 @@
import collections
import copy

# PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"  # alternative (cluster) cache path, superseded below
PATHBASE="/home/hushengding/plm_cache/"     # local cache of the pretrained checkpoints

AllConfigs = {}

BaseConfigs = {}


#### ALBERT ######
# The tuple key lists the per-task fields; zip() pairs them with the per-task values below.
BaseConfigs['albert-xlarge-v2'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32,  32,  32,  32,  32,  16,  32] + [32] * 8,
        [ 32,  32,  32,  32,  32,  16,  32] + [32] * 8,
        [0] * 7 + [0] * 8,
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}albert-xlarge-v2",
    "tokenizer_name": f"{PATHBASE}albert-xlarge-v2",
    "save_total_limit": 1,
    # For glue datasets.
    "is_seq2seq": False,
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": False,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": False,
    "push_to_delta_center": True,
    "save_strategy": "steps"
}

AllConfigs['prefix_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2'])
AllConfigs['prefix_albert-xlarge-v2'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/albert-xlarge-v2/",
})

AllConfigs['soft_prompt_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2'])
AllConfigs['soft_prompt_albert-xlarge-v2'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/albert-xlarge-v2/",
})

if __name__ == "__main__":
    import argparse
    import json
    import os

    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    # Tuple keys hold the zipped per-task settings.
    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)

    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem
    # Non-tuple keys are shared by every task; output_dir gets the task name appended.
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]

    if not os.path.exists(f"configs/{args.job}/"):
        os.mkdir(f"configs/{args.job}/")

    for job_name in all_config_jsons:
        with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
@@ -0,0 +1,450 @@
import collections
|
||||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
BaseConfigs['beit-base-patch16-224'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip(
|
||||
["beans"],
|
||||
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20],
|
||||
[256],
|
||||
[ 32],
|
||||
[ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0], # *7 +[0] *8,
|
||||
[200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[ 3],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}beit-base-patch16-224",
|
||||
"tokenizer_name": f"{PATHBASE}beit-base-patch16-224",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"push_to_delta_center": True,
|
||||
"save_strategy": "steps",
|
||||
"datasets_load_from_disk":False,
|
||||
}
|
||||
|
||||
AllConfigs['bitfit_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['bitfit_beit-base-patch16-224'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 3e-4,
|
||||
"output_dir": "outputs/bitfit/beit-base-patch16-224/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['adapter_beit-base-patch16-224'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"bottleneck_dim":24,
|
||||
"output_dir": "outputs/adapter/beit-base-patch16-224/",
|
||||
})
|
||||
|
||||
AllConfigs['lora_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['lora_beit-base-patch16-224'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layernorm_after",
|
||||
"classifier"
|
||||
],
|
||||
"modified_modules":[
|
||||
"query",
|
||||
"value",
|
||||
],
|
||||
"lora_r": 8,
|
||||
"output_dir": "outputs/lora/beit-base-patch16-224/",
|
||||
})
|
||||
|
||||
AllConfigs['compacter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['compacter_beit-base-patch16-224'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter/beit-base-patch16-224/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
AllConfigs['compacter++_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['compacter++_beit-base-patch16-224'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
"modified_modules": [
|
||||
"DenseReluDense"
|
||||
],
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter++/beit-base-patch16-224/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['low_rank_adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['low_rank_adapter_beit-base-patch16-224'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/low_rank_adapter/beit-base-patch16-224/",
|
||||
"non_linearity": "gelu_new",
|
||||
"low_rank_w_init": "glorot-uniform",
|
||||
"low_rank_rank": 1,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['soft_prompt_beit-base-patch16-224'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-2,
|
||||
"soft_token_num":100,
|
||||
"token_init": False,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/beit-base-patch16-224/",
|
||||
})
|
||||
|
||||
AllConfigs['prefix_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['prefix_beit-base-patch16-224'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/beit-base-patch16-224/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['soft_prompt_beit-base-patch16-224'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/beit-base-patch16-224/",
|
||||
})
|
||||
#### T5-small ######
|
||||
BaseConfigs['t5-small'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}t5-small",
|
||||
"tokenizer_name": f"{PATHBASE}t5-small",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"push_to_delta_center": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
|
||||
AllConfigs['prefix_t5-small'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-small/",
|
||||
})
|
||||
|
||||
|
||||
|
||||
|
||||
#### ROBERTA######
|
||||
BaseConfigs['roberta-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}roberta-base",
|
||||
"tokenizer_name": f"{PATHBASE}roberta-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
|
||||
|
||||
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['bitfit_roberta-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/bitfit/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['none_roberta-base'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 1e-5,
|
||||
"output_dir": "outputs/none/roberta-base/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['lora_roberta-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/lora/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['adapter_roberta-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/adapter/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['low_rank_adapter_roberta-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/low_rank_adapter/roberta-base/",
|
||||
})
|
||||
|
||||
#### BERT ######
|
||||
BaseConfigs['bert-base-cased'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}bert-base-cased",
|
||||
"tokenizer_name": f"{PATHBASE}bert-base-cased",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['prefix_bert-base-cased'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/bert-base-cased/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['soft_prompt_bert-base-cased'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/bert-base-cased/",
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
parser = argparse.ArgumentParser("Parser to generate configuration")
|
||||
parser.add_argument("--job", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
config = AllConfigs[args.job]
|
||||
|
||||
Cartesian_product = []
|
||||
for key in config:
|
||||
if isinstance(key, tuple):
|
||||
Cartesian_product.append(key)
|
||||
all_config_jsons = {}
|
||||
for key_tuple in Cartesian_product:
|
||||
for zipped in config[key_tuple]:
|
||||
job_name = zipped[0]
|
||||
all_config_jsons[job_name] = {}
|
||||
for key_name, zipped_elem in zip(key_tuple, zipped):
|
||||
if key_name != 'job_name':
|
||||
all_config_jsons[job_name][key_name] = zipped_elem
|
||||
for key in config:
|
||||
if not isinstance(key, tuple):
|
||||
for job_name in all_config_jsons:
|
||||
if key == "output_dir":
|
||||
all_config_jsons[job_name][key] = config[key] + job_name
|
||||
else:
|
||||
all_config_jsons[job_name][key] = config[key]
|
||||
|
||||
|
||||
if not os.path.exists(f"configs/{args.job}/"):
|
||||
os.mkdir(f"configs/{args.job}/")
|
||||
|
||||
for job_name in all_config_jsons:
|
||||
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
|
||||
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
|
||||
|
||||
|
||||
|
|
@@ -0,0 +1,116 @@
import collections
import copy

# PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"  # alternative (cluster) cache path, superseded below
PATHBASE="/home/hushengding/plm_cache/"     # local cache of the pretrained checkpoints

AllConfigs = {}

BaseConfigs = {}


#### BERT ######
# The tuple key lists the per-task fields; zip() pairs them with the per-task values below.
BaseConfigs['bert-base-cased'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32,  32,  32,  32,  32,  16,  32] + [32] * 8,
        [ 32,  32,  32,  32,  32,  16,  32] + [32] * 8,
        [0] * 7 + [0] * 8,
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}bert-base-cased",
    "tokenizer_name": f"{PATHBASE}bert-base-cased",
    "save_total_limit": 1,
    # For glue datasets.
    "is_seq2seq": False,
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": False,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": False,
    "push_to_delta_center": True,
    "save_strategy": "steps"
}

AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/bert-base-cased/",
})

AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/bert-base-cased/",
})

if __name__ == "__main__":
    import argparse
    import json
    import os

    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    # Tuple keys hold the zipped per-task settings.
    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)

    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem
    # Non-tuple keys are shared by every task; output_dir gets the task name appended.
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]

    if not os.path.exists(f"configs/{args.job}/"):
        os.mkdir(f"configs/{args.job}/")

    for job_name in all_config_jsons:
        with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
@@ -0,0 +1,433 @@
import collections
|
||||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
BaseConfigs['t5-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}t5-base",
|
||||
"tokenizer_name": f"{PATHBASE}t5-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['bitfit_t5-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 3e-4,
|
||||
"output_dir": "outputs/bitfit/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['adapter_t5-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"bottleneck_dim":24,
|
||||
"output_dir": "outputs/adapter/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['lora_t5-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"lora_r": 8,
|
||||
"output_dir": "outputs/lora/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['compacter_t5-base'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['compacter++_t5-base'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
"modified_modules": [
|
||||
"DenseReluDense"
|
||||
],
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter++/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['low_rank_adapter_t5-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/low_rank_adapter/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
"low_rank_w_init": "glorot-uniform",
|
||||
"low_rank_rank": 1,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['soft_prompt_t5-base'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-2,
|
||||
"soft_token_num":100,
|
||||
"token_init": False,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['prefix_t5-base'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-base/",
|
||||
})
|
||||
|
||||
#### T5-small ######
|
||||
BaseConfigs['t5-small'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}t5-small",
|
||||
"tokenizer_name": f"{PATHBASE}t5-small",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
|
||||
AllConfigs['prefix_t5-small'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-small/",
|
||||
})
|
||||
|
||||
|
||||
|
||||
|
||||
#### ROBERTA######
|
||||
BaseConfigs['roberta-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}roberta-base",
|
||||
"tokenizer_name": f"{PATHBASE}roberta-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
|
||||
|
||||
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['bitfit_roberta-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/bitfit/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['none_roberta-base'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 1e-5,
|
||||
"output_dir": "outputs/none/roberta-base/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['lora_roberta-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/lora/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['adapter_roberta-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/adapter/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['low_rank_adapter_roberta-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/low_rank_adapter/roberta-base/",
|
||||
})
|
||||
|
||||
#### BERT ######
|
||||
BaseConfigs['bert-base-cased'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}bert-base-cased",
|
||||
"tokenizer_name": f"{PATHBASE}bert-base-cased",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['prefix_bert-base-cased'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/bert-base-cased/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['soft_prompt_bert-base-cased'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/bert-base-cased/",
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
parser = argparse.ArgumentParser("Parser to generate configuration")
|
||||
parser.add_argument("--job", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
config = AllConfigs[args.job]
|
||||
|
||||
Cartesian_product = []
|
||||
for key in config:
|
||||
if isinstance(key, tuple):
|
||||
Cartesian_product.append(key)
|
||||
all_config_jsons = {}
|
||||
for key_tuple in Cartesian_product:
|
||||
for zipped in config[key_tuple]:
|
||||
job_name = zipped[0]
|
||||
all_config_jsons[job_name] = {}
|
||||
for key_name, zipped_elem in zip(key_tuple, zipped):
|
||||
if key_name != 'job_name':
|
||||
all_config_jsons[job_name][key_name] = zipped_elem
|
||||
for key in config:
|
||||
if not isinstance(key, tuple):
|
||||
for job_name in all_config_jsons:
|
||||
if key == "output_dir":
|
||||
all_config_jsons[job_name][key] = config[key] + job_name
|
||||
else:
|
||||
all_config_jsons[job_name][key] = config[key]
|
||||
|
||||
|
||||
if not os.path.exists(f"configs/{args.job}/"):
|
||||
os.mkdir(f"configs/{args.job}/")
|
||||
|
||||
for job_name in all_config_jsons:
|
||||
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
|
||||
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
|
||||
|
||||
|
||||
|
|
@@ -0,0 +1,143 @@
import collections
|
||||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
|
||||
#### ROBERTA######
|
||||
BaseConfigs['roberta-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}roberta-base",
|
||||
"tokenizer_name": f"{PATHBASE}roberta-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
|
||||
|
||||
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['bitfit_roberta-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/bitfit/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['none_roberta-base'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 1e-5,
|
||||
"output_dir": "outputs/none/roberta-base/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['lora_roberta-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/lora/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['adapter_roberta-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/adapter/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['low_rank_adapter_roberta-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/low_rank_adapter/roberta-base/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['soft_prompt_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['soft_prompt_roberta-base'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/roberta-base/",
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
parser = argparse.ArgumentParser("Parser to generate configuration")
|
||||
parser.add_argument("--job", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
config = AllConfigs[args.job]
|
||||
|
||||
Cartesian_product = []
|
||||
for key in config:
|
||||
if isinstance(key, tuple):
|
||||
Cartesian_product.append(key)
|
||||
all_config_jsons = {}
|
||||
for key_tuple in Cartesian_product:
|
||||
for zipped in config[key_tuple]:
|
||||
job_name = zipped[0]
|
||||
all_config_jsons[job_name] = {}
|
||||
for key_name, zipped_elem in zip(key_tuple, zipped):
|
||||
if key_name != 'job_name':
|
||||
all_config_jsons[job_name][key_name] = zipped_elem
|
||||
for key in config:
|
||||
if not isinstance(key, tuple):
|
||||
for job_name in all_config_jsons:
|
||||
if key == "output_dir":
|
||||
all_config_jsons[job_name][key] = config[key] + job_name
|
||||
else:
|
||||
all_config_jsons[job_name][key] = config[key]
|
||||
|
||||
|
||||
if not os.path.exists(f"configs/{args.job}/"):
|
||||
os.mkdir(f"configs/{args.job}/")
|
||||
|
||||
for job_name in all_config_jsons:
|
||||
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
|
||||
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
|
||||
|
||||
|
||||
|
|
@@ -0,0 +1,444 @@
import collections
|
||||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
BaseConfigs['t5-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}t5-base",
|
||||
"tokenizer_name": f"{PATHBASE}t5-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"push_to_delta_center": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['bitfit_t5-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 3e-4,
|
||||
"output_dir": "outputs/bitfit/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['adapter_t5-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"bottleneck_dim":24,
|
||||
"output_dir": "outputs/adapter/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['lora_t5-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"lora_r": 8,
|
||||
"output_dir": "outputs/lora/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['compacter_t5-base'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['compacter++_t5-base'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
"modified_modules": [
|
||||
"DenseReluDense"
|
||||
],
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter++/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['low_rank_adapter_t5-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/low_rank_adapter/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
"low_rank_w_init": "glorot-uniform",
|
||||
"low_rank_rank": 1,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['soft_prompt_t5-base'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-2,
|
||||
"soft_token_num":100,
|
||||
"token_init": False,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['prefix_t5-base'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['soft_prompt_t5-base'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/t5-base/",
|
||||
})
|
||||
#### T5-small
|
||||
BaseConfigs['t5-small'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}t5-small",
|
||||
"tokenizer_name": f"{PATHBASE}t5-small",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"push_to_delta_center": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
|
||||
AllConfigs['prefix_t5-small'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-small/",
|
||||
})
|
||||
|
||||
|
||||
|
||||
|
||||
#### ROBERTA######
|
||||
BaseConfigs['roberta-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}roberta-base",
|
||||
"tokenizer_name": f"{PATHBASE}roberta-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
|
||||
|
||||
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['bitfit_roberta-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/bitfit/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['none_roberta-base'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 1e-5,
|
||||
"output_dir": "outputs/none/roberta-base/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['lora_roberta-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/lora/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['adapter_roberta-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/adapter/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['low_rank_adapter_roberta-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/low_rank_adapter/roberta-base/",
|
||||
})
|
||||
|
||||
#### BERT ######
|
||||
BaseConfigs['bert-base-cased'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}bert-base-cased",
|
||||
"tokenizer_name": f"{PATHBASE}bert-base-cased",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['prefix_bert-base-cased'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/bert-base-cased/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['soft_prompt_bert-base-cased'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/bert-base-cased/",
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
parser = argparse.ArgumentParser("Parser to generate configuration")
|
||||
parser.add_argument("--job", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
config = AllConfigs[args.job]
|
||||
|
||||
Cartesian_product = []
|
||||
for key in config:
|
||||
if isinstance(key, tuple):
|
||||
Cartesian_product.append(key)
|
||||
all_config_jsons = {}
|
||||
for key_tuple in Cartesian_product:
|
||||
for zipped in config[key_tuple]:
|
||||
job_name = zipped[0]
|
||||
all_config_jsons[job_name] = {}
|
||||
for key_name, zipped_elem in zip(key_tuple, zipped):
|
||||
if key_name != 'job_name':
|
||||
all_config_jsons[job_name][key_name] = zipped_elem
|
||||
for key in config:
|
||||
if not isinstance(key, tuple):
|
||||
for job_name in all_config_jsons:
|
||||
if key == "output_dir":
|
||||
all_config_jsons[job_name][key] = config[key] + job_name
|
||||
else:
|
||||
all_config_jsons[job_name][key] = config[key]
|
||||
|
||||
|
||||
if not os.path.exists(f"configs/{args.job}/"):
|
||||
os.mkdir(f"configs/{args.job}/")
|
||||
|
||||
for job_name in all_config_jsons:
|
||||
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
|
||||
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
|
||||
|
||||
|
||||
|
|
@@ -1,482 +0,0 @@
|
|||
# coding=utf-8
|
||||
# Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Fine-tuning the library models for sequence to sequence.
|
||||
"""
|
||||
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
|
||||
import functools
|
||||
import logging
|
||||
from opendelta.utils.delta_center import create_hub_repo_name
|
||||
import torch
|
||||
import os
|
||||
os.environ['MKL_THREADING_LAYER'] = 'GNU'
|
||||
os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
|
||||
import sys
|
||||
import subprocess
|
||||
from typing import Optional, List
|
||||
|
||||
from datasets import load_dataset, load_metric, concatenate_datasets
|
||||
import transformers
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoTokenizer,
|
||||
HfArgumentParser,
|
||||
MBartTokenizer,
|
||||
default_data_collator,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.trainer_utils import is_main_process, get_last_checkpoint
|
||||
# from ..seq2seq.utils import get_adapter_config
|
||||
from examples_prompt.data_processors import AutoTask, TaskDataCollatorForSeq2Seq, AutoPostProcessor
|
||||
from examples_prompt.seq2seq_trainer import Seq2SeqTrainer
|
||||
# from training_args import AdapterTrainingArguments
|
||||
from examples_prompt.trainers.trainer_utils import save_training_config
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
from transformers.models.t5.modeling_t5 import T5Config, T5ForConditionalGeneration
|
||||
from examples_prompt.utils.args import ModelArguments
|
||||
from examples_prompt.trainers.trainer_args import TrainingArguments, DataTrainingArguments
|
||||
from transformers.trainer import Trainer
|
||||
from examples_prompt.metrics.metrics import transform_for_generation
|
||||
import json
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
TASK_TO_METRICS = {"mrpc": ["accuracy", "f1"],
|
||||
"cola": ['matthews_correlation'],
|
||||
"stsb": ['pearson', 'spearmanr'],
|
||||
'sst2': ['accuracy'],
|
||||
"mnli": ["accuracy"],
|
||||
"mnli_mismatched": ["accuracy"],
|
||||
"mnli_matched": ["accuracy"],
|
||||
"qnli": ["accuracy"],
|
||||
"rte": ["accuracy"],
|
||||
"wnli": ["accuracy"],
|
||||
"qqp": ["accuracy", "f1"],
|
||||
"superglue-boolq": ["accuracy"],
|
||||
"superglue-rte": ["accuracy"],
|
||||
"superglue-cb": ["f1_multiclass", "accuracy"],
|
||||
"superglue-copa": ["accuracy"],
|
||||
"superglue-multirc": ["f1", "em"],
|
||||
"superglue-wic": ["accuracy"],
|
||||
"superglue-wsc.fixed": ["accuracy"],
|
||||
"superglue-record": ["f1", "em"]
|
||||
}
|
||||
|
||||
|
||||
class RemainArgHfArgumentParser(HfArgumentParser):
|
||||
def parse_json_file(self, json_file: str, return_remaining_args=True ):
|
||||
"""
|
||||
Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the
|
||||
dataclass types.
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
import dataclasses
|
||||
|
||||
data = json.loads(Path(json_file).read_text())
|
||||
outputs = []
|
||||
for dtype in self.dataclass_types:
|
||||
keys = {f.name for f in dataclasses.fields(dtype) if f.init}
|
||||
inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys}
|
||||
obj = dtype(**inputs)
|
||||
outputs.append(obj)
|
||||
|
||||
remain_args = argparse.ArgumentParser()
|
||||
remain_args.__dict__.update(data)
|
||||
if return_remaining_args:
|
||||
return (*outputs, remain_args)
|
||||
else:
|
||||
return (*outputs,)
|
||||
|
||||
|
||||
|
||||
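# Rough, self-contained illustration of what parse_json_file does (the toy dataclasses
# below are hypothetical stand-ins, not the real argument classes): each dataclass pops
# its own fields out of the JSON dict, and the leftover keys play the role of the
# remaining/delta arguments. Not called anywhere; provided for readability only.
def _parse_json_file_demo():
    import dataclasses
    import json

    @dataclasses.dataclass
    class ToyModelArguments:
        model_name_or_path: str = "t5-base"

    @dataclasses.dataclass
    class ToyTrainingArguments:
        learning_rate: float = 3e-4

    data = json.loads('{"model_name_or_path": "roberta-base", "learning_rate": 0.001, "delta_type": "lora"}')
    outputs = []
    for dtype in (ToyModelArguments, ToyTrainingArguments):
        keys = {f.name for f in dataclasses.fields(dtype) if f.init}
        inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys}
        outputs.append(dtype(**inputs))

    # outputs -> [ToyModelArguments(model_name_or_path='roberta-base'),
    #             ToyTrainingArguments(learning_rate=0.001)]
    # data    -> {'delta_type': 'lora'}  (consumed later as the delta arguments)
    return outputs, data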
def main():
|
||||
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
||||
parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
||||
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
|
||||
# If we pass only one argument to the script and it's the path to a json file,
|
||||
# let's parse it to get our arguments.
|
||||
model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
|
||||
else:
|
||||
model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)
|
||||
|
||||
|
||||
# Detecting last checkpoint.
|
||||
last_checkpoint = None
|
||||
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
|
||||
last_checkpoint = get_last_checkpoint(training_args.output_dir)
|
||||
print("#### last_checkpoint ", last_checkpoint)
|
||||
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
|
||||
'''
|
||||
raise ValueError(
|
||||
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
|
||||
"Use --overwrite_output_dir to overcome."
|
||||
)
|
||||
'''
|
||||
pass
|
||||
elif last_checkpoint is not None:
|
||||
logger.info(
|
||||
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
|
||||
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
|
||||
)
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
handlers=[logging.StreamHandler(sys.stdout)],
|
||||
)
|
||||
logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
|
||||
|
||||
# Log on each process the small summary:
|
||||
logger.warning(
|
||||
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
|
||||
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
|
||||
)
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(training_args.local_rank):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
logger.info("Training/evaluation parameters %s", training_args)
|
||||
|
||||
# Set seed before initializing model.
|
||||
set_seed(training_args.seed)
|
||||
|
||||
# Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
|
||||
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
|
||||
# (the dataset will be downloaded automatically from the datasets Hub).
|
||||
#
|
||||
# For CSV/JSON files in the summarization task, this script will use the first column for the full texts and the
|
||||
# second column for the summaries (unless you specify column names for this with the `text_column` and
|
||||
# `summary_column` arguments).
|
||||
# For translation, only JSON files are supported, with one field named "translation" containing two keys for the
|
||||
# source and target languages (unless you adapt what follows).
|
||||
#
|
||||
# In distributed training, the load_dataset function guarantee that only one local process can concurrently
|
||||
# download the dataset.
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
# Distributed training:
|
||||
# The .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# download model & vocab.
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
config.dropout_rate = 0.0
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_fast=model_args.use_fast_tokenizer,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
|
||||
if delta_args.delta_type.lower() != "none":
|
||||
from opendelta import AutoDeltaConfig,AutoDeltaModel
|
||||
delta_config = AutoDeltaConfig.from_dict(vars(delta_args))
|
||||
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
|
||||
delta_model.freeze_module(set_state_dict = True)
|
||||
delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True)
|
||||
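# Rough sketch of what freeze_module amounts to (an illustration, not OpenDelta's
# actual implementation): every backbone parameter is frozen unless its name matches
# one of the "unfrozen_modules" patterns from the config. Not called anywhere.
def _freeze_except_demo():
    import torch.nn as nn

    def freeze_except(module, unfrozen_substrings):
        for name, param in module.named_parameters():
            param.requires_grad = any(s in name for s in unfrozen_substrings)

    toy = nn.Sequential(nn.Linear(4, 4), nn.LayerNorm(4))
    freeze_except(toy, ["1."])  # keep only the LayerNorm ("1.") trainable
    # -> [('0.weight', False), ('0.bias', False), ('1.weight', True), ('1.bias', True)]
    return [(n, p.requires_grad) for n, p in toy.named_parameters()]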
|
||||
|
||||
# model parallelize
|
||||
if hasattr(training_args, "model_parallel") and training_args.model_parallel:
|
||||
logger.info('parallelize model!')
|
||||
model.parallelize()
|
||||
|
||||
data_args.dataset_name = [data_args.task_name]
|
||||
data_args.eval_dataset_name = [data_args.eval_dataset_name]
|
||||
data_args.test_dataset_name = [data_args.test_dataset_name]
|
||||
data_args.dataset_config_name = [data_args.dataset_config_name]
|
||||
data_args.eval_dataset_config_name = [data_args.eval_dataset_config_name]
|
||||
data_args.test_dataset_config_name = [data_args.test_dataset_config_name]
|
||||
assert len(data_args.dataset_name) == len(data_args.dataset_config_name)
|
||||
if data_args.eval_dataset_name is not None:
|
||||
assert len(data_args.eval_dataset_name) == len(data_args.eval_dataset_config_name)
|
||||
if data_args.test_dataset_name is not None:
|
||||
assert len(data_args.test_dataset_name) == len(data_args.test_dataset_config_name)
|
||||
|
||||
# Temporarily set max_target_length for training.
|
||||
#max_target_length = data_args.max_target_length
|
||||
padding = "max_length" if data_args.pad_to_max_length else False
|
||||
|
||||
def preprocess_function(examples, max_target_length):
|
||||
# max_target_length += 1
|
||||
model_inputs = tokenizer([s+"<extra_id_0>" for s in examples['source']], max_length=data_args.max_source_length,
|
||||
padding=padding, truncation=True)
|
||||
# # Setup the tokenizer for targets
|
||||
with tokenizer.as_target_tokenizer():
|
||||
labels = tokenizer([t for t in examples['target']], max_length=max_target_length, padding=padding, truncation=True)
|
||||
# model_inputs = tokenizer([s for s in examples['source']], max_length=data_args.max_source_length,
|
||||
# padding=padding, truncation=True)
|
||||
# Setup the tokenizer for targets
|
||||
# with tokenizer.as_target_tokenizer():
|
||||
# labels = tokenizer([t for t in examples['target']], max_length=max_target_length, padding=padding, truncation=True)
|
||||
# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
|
||||
# padding in the loss.
|
||||
if padding == "max_length" and data_args.ignore_pad_token_for_loss:
|
||||
labels["input_ids"] = [
|
||||
[(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
|
||||
]
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
model_inputs["extra_fields"] = examples['extra_fields']
|
||||
return model_inputs
|
||||
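# Small illustration of the label-masking step above (illustrative values only): once
# labels are padded to max_target_length, pad positions are replaced by -100 so the
# cross-entropy loss ignores them. Not called anywhere.
def _label_masking_demo():
    pad_token_id = 0  # example value; the real id comes from the tokenizer
    label_input_ids = [
        [100, 200, 300, 1, 0, 0],
        [400, 1, 0, 0, 0, 0],
    ]
    masked = [
        [(l if l != pad_token_id else -100) for l in label] for label in label_input_ids
    ]
    # masked == [[100, 200, 300, 1, -100, -100], [400, 1, -100, -100, -100, -100]]
    return masked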
|
||||
column_names = ['source', 'target', 'label', 'extra_fields']
|
||||
performance_metrics = {}
|
||||
if training_args.do_train:
|
||||
train_datasets = [AutoTask.get(dataset_name,
|
||||
dataset_config_name,
|
||||
seed=data_args.data_seed).get(
|
||||
split="train",
|
||||
split_validation_test=training_args.split_validation_test,
|
||||
add_prefix=True,
|
||||
n_obs=data_args.max_train_samples)
|
||||
for dataset_name, dataset_config_name\
|
||||
in zip(data_args.dataset_name, data_args.dataset_config_name)]
|
||||
max_target_lengths = [AutoTask.get(dataset_name, dataset_config_name).get_max_target_length(\
|
||||
tokenizer=tokenizer, default_max_length=data_args.max_target_length)\
|
||||
for dataset_name, dataset_config_name in zip(data_args.dataset_name, data_args.dataset_config_name)]
|
||||
for i, train_dataset in enumerate(train_datasets):
|
||||
train_datasets[i] = train_datasets[i].map(
|
||||
functools.partial(preprocess_function, max_target_length=max_target_lengths[i]),
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
remove_columns=column_names, # if train_dataset != "superglue-record" else column_names+["answers"],
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
)
|
||||
train_dataset = concatenate_datasets(train_datasets)
|
||||
print(f"Train dataset size {len(train_dataset)}")
|
||||
|
||||
if training_args.do_eval:
|
||||
eval_datasets = {eval_dataset: AutoTask.get(eval_dataset, eval_dataset_config,
|
||||
seed=data_args.data_seed).get(
|
||||
split="validation",
|
||||
split_validation_test=training_args.split_validation_test,
|
||||
add_prefix=True,
|
||||
n_obs=data_args.max_val_samples)
|
||||
for eval_dataset, eval_dataset_config in zip(data_args.eval_dataset_name, data_args.eval_dataset_config_name)}
|
||||
max_target_lengths = [AutoTask.get(dataset_name, dataset_config_name).get_max_target_length( \
|
||||
tokenizer=tokenizer, default_max_length=data_args.max_target_length) \
|
||||
for dataset_name, dataset_config_name in zip(data_args.eval_dataset_name, data_args.eval_dataset_config_name)]
|
||||
for k, name in enumerate(eval_datasets):
|
||||
eval_datasets[name] = eval_datasets[name].map(
|
||||
functools.partial(preprocess_function, max_target_length=max_target_lengths[k]),
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
remove_columns=column_names, # if name != "superglue-record" else column_names+["answers"],
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
)
|
||||
|
||||
if training_args.do_test:
|
||||
test_datasets = {test_dataset: AutoTask.get(test_dataset, test_dataset_config,
|
||||
seed=data_args.data_seed).get(
|
||||
split="test",
|
||||
split_validation_test=training_args.split_validation_test,
|
||||
add_prefix=True,
|
||||
n_obs=data_args.max_test_samples)
|
||||
for test_dataset, test_dataset_config in zip(data_args.test_dataset_name, data_args.test_dataset_config_name)}
|
||||
max_target_lengths = [AutoTask.get(dataset_name, dataset_config_name).get_max_target_length( \
|
||||
tokenizer=tokenizer, default_max_length=data_args.max_target_length) \
|
||||
for dataset_name, dataset_config_name in zip(data_args.test_dataset_name, data_args.test_dataset_config_name)]
|
||||
for k, name in enumerate(test_datasets):
|
||||
test_datasets[name] = test_datasets[name].map(
|
||||
functools.partial(preprocess_function, max_target_length=max_target_lengths[k]),
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
)
|
||||
|
||||
# Data collator
|
||||
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
||||
if data_args.pad_to_max_length:
|
||||
data_collator = default_data_collator
|
||||
else:
|
||||
data_collator = TaskDataCollatorForSeq2Seq(
|
||||
tokenizer,
|
||||
label_pad_token_id=label_pad_token_id,
|
||||
pad_to_multiple_of=8 if training_args.fp16 else None,
|
||||
)
|
||||
|
||||
|
||||
# Metrics: we assume there is only one training task.
|
||||
eval_metrics = [AutoTask.get(dataset_name, dataset_config_name).metric\
|
||||
for dataset_name, dataset_config_name in zip(data_args.dataset_name, data_args.dataset_config_name)][0]
|
||||
|
||||
# Extracts the extra information needed to evaluate on each dataset.
|
||||
# These information are only used in the compute_metrics.
|
||||
# We will assume that the test/eval dataloader does not change the order of
|
||||
# the data.
|
||||
data_info = {"eval": eval_datasets[data_args.eval_dataset_name[0]]['extra_fields'],
|
||||
"test": test_datasets[data_args.test_dataset_name[0]]['extra_fields'],
|
||||
"train": train_dataset['extra_fields']}
|
||||
def compute_metrics(eval_preds):
|
||||
preds, labels, data_info = eval_preds
|
||||
post_processor = AutoPostProcessor.get(data_args.dataset_name[0], tokenizer,
|
||||
data_args.ignore_pad_token_for_loss)
|
||||
decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info)
|
||||
decoded_preds, decoded_labels = transform_for_generation(decoded_preds, decoded_labels)
|
||||
result = {}
|
||||
for metric in eval_metrics:
|
||||
result.update(metric(decoded_preds, decoded_labels))
|
||||
return result
|
||||
|
||||
|
||||
# Initialize our Trainer
|
||||
if training_args.is_seq2seq:
|
||||
trainer = Seq2SeqTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
delta_args=delta_args,
|
||||
train_dataset=train_dataset if training_args.do_train else None,
|
||||
eval_dataset=list(eval_datasets.values())[0] if training_args.do_eval else None,
|
||||
data_info = data_info,
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
compute_metrics=compute_metrics if training_args.predict_with_generate else None,
|
||||
evaluation_metrics = TASK_TO_METRICS[data_args.dataset_name[0]],
|
||||
)
|
||||
else:
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset if training_args.do_train else None,
|
||||
eval_dataset=list(eval_datasets.values())[0] if training_args.do_eval else None,  # mirror the seq2seq branch; a bare eval_dataset is not defined at this point
|
||||
compute_metrics=compute_metrics,
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
)
|
||||
|
||||
|
||||
# Saves training config.
|
||||
if trainer.is_world_process_zero():
|
||||
os.makedirs(training_args.output_dir, exist_ok=True)
|
||||
save_training_config(sys.argv[1], training_args.output_dir)
|
||||
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
checkpoint = None
|
||||
if training_args.resume_from_checkpoint is not None:
|
||||
checkpoint = training_args.resume_from_checkpoint
|
||||
elif last_checkpoint is not None:
|
||||
checkpoint = last_checkpoint
|
||||
|
||||
if training_args.compute_time:
|
||||
torch.cuda.synchronize() # wait for move to complete
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
start.record()
|
||||
|
||||
train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
||||
|
||||
if training_args.compute_time:
|
||||
end.record()
|
||||
torch.cuda.synchronize() # wait for all_reduce to complete
|
||||
total_time = start.elapsed_time(end)/(1000*60)
|
||||
performance_metrics.update({"total_time in minutes ": total_time})
|
||||
|
||||
trainer.save_model() # Saves the tokenizer too for easy upload
|
||||
train_metrics = train_result.metrics
|
||||
max_train_samples = (
|
||||
data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
|
||||
)
|
||||
train_metrics["train_samples"] = min(max_train_samples, len(train_dataset))
|
||||
trainer.log_metrics("train", train_metrics)
|
||||
trainer.save_metrics("train", train_metrics)
|
||||
trainer.save_state()
|
||||
|
||||
if torch.cuda.is_available() and training_args.compute_memory:
|
||||
peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2) / 1000  # roughly GB (MiB / 1000)
|
||||
print(
|
||||
"Memory utilization",
|
||||
peak_memory,
|
||||
"GB"
|
||||
)
|
||||
performance_metrics.update({"peak_memory": peak_memory})
|
||||
if training_args.compute_memory or training_args.compute_time:
|
||||
print(performance_metrics)
|
||||
trainer.save_metrics("performance", performance_metrics)
|
||||
|
||||
# Evaluation
|
||||
results = {}
|
||||
if training_args.do_eval:
|
||||
logger.info("*** Evaluate ***")
|
||||
for task, eval_dataset in eval_datasets.items():
|
||||
metrics = trainer.evaluate(eval_dataset=eval_dataset,
|
||||
max_length=data_args.val_max_target_length, num_beams=data_args.num_beams,
|
||||
)
|
||||
trainer.log_metrics("eval", metrics)
|
||||
trainer.save_metrics("eval", metrics)
|
||||
results['evaluate'] = metrics
|
||||
|
||||
# Test
|
||||
if training_args.do_test:
|
||||
logger.info("*** Test ***")
|
||||
for task, test_dataset in test_datasets.items():
|
||||
metrics = trainer.evaluate(eval_dataset=test_dataset,
|
||||
max_length=data_args.test_max_target_length, num_beams=data_args.num_beams,
|
||||
metric_key_prefix="test"
|
||||
)
|
||||
trainer.log_metrics("test", metrics)
|
||||
trainer.save_metrics("test", metrics)
|
||||
results['test'] = metrics
|
||||
|
||||
repo_name = create_hub_repo_name(root="DeltaHub",
|
||||
dataset=data_args.task_name,
|
||||
delta_type = delta_args.delta_type,
|
||||
model_name_or_path= model_args.model_name_or_path)
|
||||
results['repo_name'] = repo_name
|
||||
if delta_args.delta_type.lower() != "none":
|
||||
if training_args.push_to_hub: # TODO add description here
|
||||
delta_model.save_finetuned(push_to_hub=True, save_directory=repo_name, use_auth_token=True)
|
||||
# trainer.push_to_hub(**kwargs)
|
||||
else:
|
||||
delta_model.save_finetuned(push_to_hub=False, save_directory=repo_name, use_auth_token=True)
|
||||
|
||||
with open(f"{training_args.output_dir}/results.json", 'w') as fout:
|
||||
string = json.dumps(results, indent=4,sort_keys=True)
|
||||
fout.write(string+"\n")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
result = main()
|
||||
|
|
@@ -1,7 +0,0 @@
|
|||
files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed)
for ((i=$1; i<=$2; i++))
do
    dataset=${files[i]}
    echo "id$i:$dataset"
    TOKENIZERS_PARALLELISM=false python run.py configs/$3/$dataset.json
done
|
|
@@ -1,11 +0,0 @@
|
|||
|
||||
python configs/config_gen.py --job $3
echo "Regenerate config"

files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed)
for ((i=$1; i<=$2; i++))
do
    dataset=${files[i]}
    echo "id$i:$dataset"
    TOKENIZERS_PARALLELISM=false python run_mlm.py configs/$3/$dataset.json
done
|
|
@@ -0,0 +1,411 @@
|
|||
import collections
|
||||
import copy
|
||||
|
||||
BS = 1
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
BaseConfigs['t5-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": "t5-base",
|
||||
"tokenizer_name": "t5-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
BaseConfigs['t5-large'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": "/home/hushengding/plm_cache/t5-large",
|
||||
"tokenizer_name": "/home/hushengding/plm_cache/t5-large",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
BaseConfigs['t5-3b'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
|
||||
"tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['bitfit_t5-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 3e-4,
|
||||
"output_dir": "outputs/bitfit/t5-base/",
|
||||
})
|
||||
|
||||
|
||||
|
||||
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['adapter_t5-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"bottleneck_dim":24,
|
||||
"output_dir": "outputs/adapter/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['lora_t5-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"lora_r": 8,
|
||||
"output_dir": "outputs/lora/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['compacter_t5-base'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['compacter++_t5-base'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
"modified_modules": [
|
||||
"DenseReluDense"
|
||||
],
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter++/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['low_rank_adapter_t5-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/low_rank_adapter/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
"low_rank_w_init": "glorot-uniform",
|
||||
"low_rank_rank": 1,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['soft_prompt_t5-base'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-2,
|
||||
"soft_token_num":100,
|
||||
"token_init": False,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['prefix_t5-base'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['none_t5-base'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 3e-5,
|
||||
"output_dir": "outputs/none/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
|
||||
AllConfigs['bitfit_t5-large'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 3e-4,
|
||||
"output_dir": "outputs/bitfit/t5-large/",
|
||||
})
|
||||
|
||||
AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
|
||||
AllConfigs['none_t5-large'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 3e-5,
|
||||
"output_dir": "outputs/none/t5-large/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
|
||||
AllConfigs['bitfit_t5-3b'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 3e-4,
|
||||
"output_dir": "outputs/bitfit/t5-3b/",
|
||||
})
|
||||
|
||||
AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
|
||||
AllConfigs['none_t5-3b'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 3e-5,
|
||||
"output_dir": "outputs/none/t5-3b/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
|
||||
AllConfigs['adapter_t5-3b'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"bottleneck_dim":24,
|
||||
"output_dir": "outputs/adapter/t5-3b/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
|
||||
AllConfigs['adapter_t5-large'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"bottleneck_dim":24,
|
||||
"output_dir": "outputs/adapter/t5-large/",
|
||||
})
|
||||
|
||||
AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
|
||||
AllConfigs['lora_t5-large'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"lora_r": 8,
|
||||
"output_dir": "outputs/lora/t5-large/",
|
||||
})
|
||||
|
||||
AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
|
||||
AllConfigs['lora_t5-3b'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"lora_r": 8,
|
||||
"output_dir": "outputs/lora/t5-3b/",
|
||||
})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
parser = argparse.ArgumentParser("Parser to generate configuration")
|
||||
parser.add_argument("--job", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
config = AllConfigs[args.job]
|
||||
|
||||
Cartesian_product = []
|
||||
for key in config:
|
||||
if isinstance(key, tuple):
|
||||
Cartesian_product.append(key)
|
||||
all_config_jsons = {}
|
||||
for key_tuple in Cartesian_product:
|
||||
for zipped in config[key_tuple]:
|
||||
job_name = zipped[0]
|
||||
all_config_jsons[job_name] = {}
|
||||
for key_name, zipped_elem in zip(key_tuple, zipped):
|
||||
if key_name != 'job_name':
|
||||
all_config_jsons[job_name][key_name] = zipped_elem
|
||||
for key in config:
|
||||
if not isinstance(key, tuple):
|
||||
for job_name in all_config_jsons:
|
||||
if key == "output_dir":
|
||||
all_config_jsons[job_name][key] = config[key] + job_name
|
||||
else:
|
||||
all_config_jsons[job_name][key] = config[key]
|
||||
|
||||
|
||||
if not os.path.exists(f"./{args.job}_{BS}/"):
|
||||
os.mkdir(f"./{args.job}_{BS}/")
|
||||
|
||||
for job_name in all_config_jsons:
|
||||
with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout:
|
||||
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
|
||||
|
||||
|
||||
|
||||
|
|
@@ -0,0 +1,411 @@
|
|||
import collections
|
||||
import copy
|
||||
|
||||
BS = 1
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
BaseConfigs['t5-base'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] *7 +[0] *8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "t5-base",
    "tokenizer_name": "t5-base",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

BaseConfigs['t5-large'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] *7 +[0] *8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-large",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-large",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

BaseConfigs['t5-3b'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] *7 +[0] *8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-base/",
})



AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim":24,
    "output_dir": "outputs/adapter/t5-base/",
})

AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-base/",
})

AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter/t5-base/",
    "non_linearity": "gelu_new",

    #Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})

AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "do_train": True,
    "do_eval": True,
    "do_test": True,
    "modified_modules": [
        "DenseReluDense"
    ],
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter++/t5-base/",
    "non_linearity": "gelu_new",

    #Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})


AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/low_rank_adapter/t5-base/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})


AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-2,
    "soft_token_num":100,
    "token_init": False,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/t5-base/",
})

AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/t5-base/",
})

AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['none_t5-base'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-base/",
})

AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-large/",
})

AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['none_t5-large'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-large/",
})


AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-3b/",
})

AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['none_t5-3b'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-3b/",
})

AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['adapter_t5-3b'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim":24,
    "output_dir": "outputs/adapter/t5-3b/",
})

AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['adapter_t5-large'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim":24,
    "output_dir": "outputs/adapter/t5-large/",
})

AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['lora_t5-large'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-large/",
})

AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['lora_t5-3b'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-3b/",
})


if __name__ == "__main__":
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)
    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]


    if not os.path.exists(f"./{args.job}_{BS}/"):
        os.mkdir(f"./{args.job}_{BS}/")

    for job_name in all_config_jsons:
        with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
@@ -0,0 +1,411 @@
import collections
import copy

BS = 64
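# BS is the per-device train/eval batch size applied to the eight GLUE tasks;
# the companion generator scripts in this commit appear to differ only in this constant.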
AllConfigs = {}

BaseConfigs = {}
BaseConfigs['t5-base'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] *7 +[0] *8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "t5-base",
    "tokenizer_name": "t5-base",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

BaseConfigs['t5-large'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] *7 +[0] *8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-large",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-large",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

BaseConfigs['t5-3b'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] *7 +[0] *8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-base/",
})



AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim":24,
    "output_dir": "outputs/adapter/t5-base/",
})

AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-base/",
})

AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter/t5-base/",
    "non_linearity": "gelu_new",

    #Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})

AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "do_train": True,
    "do_eval": True,
    "do_test": True,
    "modified_modules": [
        "DenseReluDense"
    ],
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter++/t5-base/",
    "non_linearity": "gelu_new",

    #Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})


AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/low_rank_adapter/t5-base/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})


AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-2,
    "soft_token_num":100,
    "token_init": False,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/t5-base/",
})

AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/t5-base/",
})

AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['none_t5-base'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-base/",
})

AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-large/",
})

AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['none_t5-large'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-large/",
})


AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-3b/",
})

AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['none_t5-3b'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-3b/",
})

AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['adapter_t5-3b'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim":24,
    "output_dir": "outputs/adapter/t5-3b/",
})

AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['adapter_t5-large'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim":24,
    "output_dir": "outputs/adapter/t5-large/",
})

AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['lora_t5-large'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-large/",
})

AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['lora_t5-3b'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-3b/",
})


if __name__ == "__main__":
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)
    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]


    if not os.path.exists(f"./{args.job}_{BS}/"):
        os.mkdir(f"./{args.job}_{BS}/")

    for job_name in all_config_jsons:
        with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
@@ -0,0 +1,411 @@
import collections
import copy

BS = 8
AllConfigs = {}

BaseConfigs = {}
BaseConfigs['t5-base'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] *7 +[0] *8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "t5-base",
    "tokenizer_name": "t5-base",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

BaseConfigs['t5-large'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] *7 +[0] *8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-large",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-large",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

BaseConfigs['t5-3b'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] *7 +[0] *8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-base/",
})



AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim":24,
    "output_dir": "outputs/adapter/t5-base/",
})

AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-base/",
})

AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter/t5-base/",
    "non_linearity": "gelu_new",

    #Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})

AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "do_train": True,
    "do_eval": True,
    "do_test": True,
    "modified_modules": [
        "DenseReluDense"
    ],
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter++/t5-base/",
    "non_linearity": "gelu_new",

    #Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})


AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/low_rank_adapter/t5-base/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})


AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-2,
    "soft_token_num":100,
    "token_init": False,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/t5-base/",
})

AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/t5-base/",
})

AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['none_t5-base'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-base/",
})

AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-large/",
})

AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['none_t5-large'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-large/",
})


AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-3b/",
})

AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['none_t5-3b'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-3b/",
})

AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['adapter_t5-3b'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim":24,
    "output_dir": "outputs/adapter/t5-3b/",
})

AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['adapter_t5-large'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim":24,
    "output_dir": "outputs/adapter/t5-large/",
})

AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['lora_t5-large'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-large/",
})

AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['lora_t5-3b'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-3b/",
})


if __name__ == "__main__":
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)
    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]


    if not os.path.exists(f"./{args.job}_{BS}/"):
        os.mkdir(f"./{args.job}_{BS}/")

    for job_name in all_config_jsons:
        with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)