2022-04-22 19:30:05 +08:00
|
|
|
import collections
|
|
|
|
import copy
|
|
|
|
|
|
|
|
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
2022-06-06 16:21:55 +08:00
|
|
|
# PATHBASE="/home/hushengding/plm_cache/"
|
2022-04-22 19:30:05 +08:00
|
|
|
|
|
|
|
AllConfigs = {}
|
|
|
|
|
|
|
|
BaseConfigs = {}
|
|
|
|
BaseConfigs['t5-base'] = {
|
|
|
|
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
|
|
|
"max_source_length",
|
|
|
|
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
|
|
|
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
|
|
|
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
|
|
|
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
|
|
|
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
|
|
|
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
|
|
|
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
|
|
|
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
|
|
|
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
|
|
|
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
|
|
|
[0] *7 +[0] *8,
|
|
|
|
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
|
|
|
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
|
|
|
),
|
|
|
|
"do_train": True,
|
|
|
|
"do_eval": True,
|
|
|
|
"do_test": True,
|
|
|
|
|
|
|
|
"model_name_or_path": f"{PATHBASE}t5-base",
|
|
|
|
"tokenizer_name": f"{PATHBASE}t5-base",
|
|
|
|
"save_total_limit": 1,
|
|
|
|
# For glue datasets.
|
|
|
|
"split_validation_test": True,
|
|
|
|
"seed": 42,
|
|
|
|
"dataset_config_name": ["en"],
|
|
|
|
"eval_dataset_config_name": ["en"],
|
|
|
|
"test_dataset_config_name": ["en"],
|
|
|
|
# other configurations.
|
|
|
|
"predict_with_generate": True,
|
|
|
|
# To evaluate during training.
|
|
|
|
"load_best_model_at_end": True,
|
|
|
|
"metric_for_best_model": "average_metrics",
|
|
|
|
"greater_is_better": True,
|
|
|
|
"evaluation_strategy": "steps",
|
|
|
|
"overwrite_output_dir": True,
|
2022-06-08 16:55:02 +08:00
|
|
|
"push_to_hf": False,
|
|
|
|
"push_to_dc": True,
|
2022-06-06 16:21:55 +08:00
|
|
|
"save_strategy": "steps",
|
|
|
|
"datasets_load_from_disk": True,
|
2022-06-08 16:55:02 +08:00
|
|
|
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
|
|
|
"backbone_model": "t5", # use in delta center,
|
|
|
|
"model_path_public": "t5-base", # use in delta center,
|
|
|
|
|
2022-04-22 19:30:05 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
|
|
|
AllConfigs['bitfit_t5-base'].update({
|
|
|
|
"delta_type": "bitfit",
|
|
|
|
"learning_rate": 3e-4,
|
|
|
|
"output_dir": "outputs/bitfit/t5-base/",
|
|
|
|
})
|
|
|
|
|
|
|
|
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
|
|
|
AllConfigs['adapter_t5-base'].update({
|
|
|
|
"delta_type": "adapter",
|
|
|
|
"learning_rate": 3e-4,
|
|
|
|
"unfrozen_modules": [
|
|
|
|
"deltas",
|
|
|
|
"layer_norm",
|
|
|
|
"final_layer_norm"
|
|
|
|
],
|
|
|
|
"bottleneck_dim":24,
|
|
|
|
"output_dir": "outputs/adapter/t5-base/",
|
|
|
|
})
|
|
|
|
|
|
|
|
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
|
|
|
AllConfigs['lora_t5-base'].update({
|
|
|
|
"delta_type": "lora",
|
|
|
|
"learning_rate": 3e-4,
|
|
|
|
"unfrozen_modules": [
|
|
|
|
"deltas",
|
|
|
|
],
|
|
|
|
"lora_r": 8,
|
|
|
|
"output_dir": "outputs/lora/t5-base/",
|
|
|
|
})
|
|
|
|
|
|
|
|
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
|
|
|
AllConfigs['compacter_t5-base'].update({
|
|
|
|
"delta_type": "compacter",
|
|
|
|
"learning_rate": 3e-3,
|
|
|
|
"unfrozen_modules": [
|
|
|
|
"deltas",
|
|
|
|
"layer_norm",
|
|
|
|
"final_layer_norm"
|
|
|
|
],
|
|
|
|
"output_dir": "outputs/compacter/t5-base/",
|
|
|
|
"non_linearity": "gelu_new",
|
|
|
|
|
|
|
|
#Compacter.
|
|
|
|
"hypercomplex_division": 4,
|
|
|
|
"hypercomplex_adapters": True,
|
|
|
|
"hypercomplex_nonlinearity": "glorot-uniform",
|
|
|
|
# gradient clip and clamp
|
|
|
|
"gradient_clip": False,
|
|
|
|
"phm_clamp": False,
|
|
|
|
"normalize_phm_weight": False,
|
|
|
|
"learn_phm": True,
|
|
|
|
# shared one side
|
|
|
|
"factorized_phm": True,
|
|
|
|
"shared_phm_rule": False,
|
|
|
|
"factorized_phm_rule": False,
|
|
|
|
"phm_c_init": "normal",
|
|
|
|
"phm_init_range": 0.0001,
|
|
|
|
"use_bias_down_sampler": True,
|
|
|
|
"use_bias_up_sampler": True,
|
|
|
|
})
|
|
|
|
|
|
|
|
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
|
|
|
AllConfigs['compacter++_t5-base'].update({
|
|
|
|
"delta_type": "compacter",
|
|
|
|
"learning_rate": 3e-3,
|
|
|
|
"do_train": True,
|
|
|
|
"do_eval": True,
|
|
|
|
"do_test": True,
|
|
|
|
"modified_modules": [
|
|
|
|
"DenseReluDense"
|
|
|
|
],
|
|
|
|
"unfrozen_modules": [
|
|
|
|
"deltas",
|
|
|
|
"layer_norm",
|
|
|
|
"final_layer_norm"
|
|
|
|
],
|
|
|
|
"output_dir": "outputs/compacter++/t5-base/",
|
|
|
|
"non_linearity": "gelu_new",
|
|
|
|
|
|
|
|
#Compacter.
|
|
|
|
"hypercomplex_division": 4,
|
|
|
|
"hypercomplex_adapters": True,
|
|
|
|
"hypercomplex_nonlinearity": "glorot-uniform",
|
|
|
|
# gradient clip and clamp
|
|
|
|
"gradient_clip": False,
|
|
|
|
"phm_clamp": False,
|
|
|
|
"normalize_phm_weight": False,
|
|
|
|
"learn_phm": True,
|
|
|
|
# shared one side
|
|
|
|
"factorized_phm": True,
|
|
|
|
"shared_phm_rule": False,
|
|
|
|
"factorized_phm_rule": False,
|
|
|
|
"phm_c_init": "normal",
|
|
|
|
"phm_init_range": 0.0001,
|
|
|
|
"use_bias_down_sampler": True,
|
|
|
|
"use_bias_up_sampler": True,
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
|
|
|
AllConfigs['low_rank_adapter_t5-base'].update({
|
|
|
|
"delta_type": "low_rank_adapter",
|
|
|
|
"learning_rate": 3e-4,
|
|
|
|
"unfrozen_modules": [
|
|
|
|
"deltas",
|
|
|
|
"layer_norm",
|
|
|
|
"final_layer_norm"
|
|
|
|
],
|
|
|
|
"output_dir": "outputs/low_rank_adapter/t5-base/",
|
|
|
|
"non_linearity": "gelu_new",
|
|
|
|
"low_rank_w_init": "glorot-uniform",
|
|
|
|
"low_rank_rank": 1,
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
|
|
|
AllConfigs['soft_prompt_t5-base'].update({
|
|
|
|
"delta_type": "soft_prompt",
|
|
|
|
"learning_rate": 3e-2,
|
|
|
|
"soft_token_num":100,
|
|
|
|
"token_init": False,
|
|
|
|
"unfrozen_modules": [
|
|
|
|
"deltas",
|
|
|
|
],
|
|
|
|
"output_dir": "outputs/soft_prompt/t5-base/",
|
|
|
|
})
|
|
|
|
|
|
|
|
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
|
|
|
AllConfigs['prefix_t5-base'].update({
|
|
|
|
"delta_type": "prefix",
|
|
|
|
"learning_rate": 3e-4,
|
|
|
|
"unfrozen_modules": [
|
|
|
|
"deltas",
|
|
|
|
],
|
2022-06-06 21:10:49 +08:00
|
|
|
"reparameterize": False,
|
2022-04-22 19:30:05 +08:00
|
|
|
"output_dir": "outputs/prefix/t5-base/",
|
|
|
|
})
|
|
|
|
|
|
|
|
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
|
|
|
AllConfigs['soft_prompt_t5-base'].update({
|
|
|
|
"delta_type": "soft_prompt",
|
|
|
|
"learning_rate": 3e-4,
|
|
|
|
"unfrozen_modules": [
|
|
|
|
"deltas",
|
|
|
|
],
|
|
|
|
"output_dir": "outputs/soft_prompt/t5-base/",
|
|
|
|
})
|
|
|
|
#### T5-base
|
|
|
|
BaseConfigs['t5-small'] = {
|
|
|
|
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
|
|
|
"max_source_length",
|
|
|
|
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
|
|
|
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
|
|
|
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
|
|
|
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
|
|
|
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
|
|
|
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
|
|
|
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
|
|
|
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
|
|
|
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
|
|
|
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
|
|
|
[0] *7 +[0] *8,
|
|
|
|
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
|
|
|
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
|
|
|
),
|
|
|
|
"do_train": True,
|
|
|
|
"do_eval": True,
|
|
|
|
"do_test": True,
|
|
|
|
|
|
|
|
"model_name_or_path": f"{PATHBASE}t5-small",
|
|
|
|
"tokenizer_name": f"{PATHBASE}t5-small",
|
|
|
|
"save_total_limit": 1,
|
|
|
|
# For glue datasets.
|
|
|
|
"split_validation_test": True,
|
|
|
|
"seed": 42,
|
|
|
|
"dataset_config_name": ["en"],
|
|
|
|
"eval_dataset_config_name": ["en"],
|
|
|
|
"test_dataset_config_name": ["en"],
|
|
|
|
# other configurations.
|
|
|
|
"predict_with_generate": True,
|
|
|
|
# To evaluate during training.
|
|
|
|
"load_best_model_at_end": True,
|
|
|
|
"metric_for_best_model": "average_metrics",
|
|
|
|
"greater_is_better": True,
|
|
|
|
"evaluation_strategy": "steps",
|
|
|
|
"overwrite_output_dir": True,
|
|
|
|
"push_to_hub": False,
|
|
|
|
"push_to_delta_center": True,
|
|
|
|
"save_strategy": "steps"
|
|
|
|
}
|
|
|
|
|
|
|
|
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
|
|
|
|
AllConfigs['prefix_t5-small'].update({
|
|
|
|
"delta_type": "prefix",
|
|
|
|
"learning_rate": 3e-4,
|
|
|
|
"unfrozen_modules": [
|
|
|
|
"deltas",
|
|
|
|
],
|
|
|
|
"output_dir": "outputs/prefix/t5-small/",
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
import argparse
|
|
|
|
import json
|
|
|
|
import os
|
|
|
|
parser = argparse.ArgumentParser("Parser to generate configuration")
|
|
|
|
parser.add_argument("--job", type=str)
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
|
|
config = AllConfigs[args.job]
|
|
|
|
|
|
|
|
Cartesian_product = []
|
|
|
|
for key in config:
|
|
|
|
if isinstance(key, tuple):
|
|
|
|
Cartesian_product.append(key)
|
|
|
|
all_config_jsons = {}
|
|
|
|
for key_tuple in Cartesian_product:
|
|
|
|
for zipped in config[key_tuple]:
|
|
|
|
job_name = zipped[0]
|
|
|
|
all_config_jsons[job_name] = {}
|
|
|
|
for key_name, zipped_elem in zip(key_tuple, zipped):
|
|
|
|
if key_name != 'job_name':
|
|
|
|
all_config_jsons[job_name][key_name] = zipped_elem
|
|
|
|
for key in config:
|
|
|
|
if not isinstance(key, tuple):
|
|
|
|
for job_name in all_config_jsons:
|
|
|
|
if key == "output_dir":
|
|
|
|
all_config_jsons[job_name][key] = config[key] + job_name
|
|
|
|
else:
|
|
|
|
all_config_jsons[job_name][key] = config[key]
|
|
|
|
|
|
|
|
|
|
|
|
if not os.path.exists(f"configs/{args.job}/"):
|
|
|
|
os.mkdir(f"configs/{args.job}/")
|
|
|
|
|
|
|
|
for job_name in all_config_jsons:
|
|
|
|
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
|
|
|
|
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
|
|
|
|
|
|
|
|
|
|
|
|
|