From 3468eb872c967fe546997ae5636a13f733731f90 Mon Sep 17 00:00:00 2001 From: Achazwl <323163497@qq.com> Date: Thu, 24 Feb 2022 23:21:31 +0800 Subject: [PATCH 1/3] PA init --- .gitignore | 3 +- .../examples_seq2seq/configs/config_gen.py | 13 ++ .../configs/adapter_roberta-base/cola.json | 46 ++++ .../configs/adapter_roberta-base/mnli.json | 46 ++++ .../configs/adapter_roberta-base/mrpc.json | 46 ++++ .../configs/adapter_roberta-base/qnli.json | 46 ++++ .../configs/adapter_roberta-base/qqp.json | 46 ++++ .../configs/adapter_roberta-base/rte.json | 46 ++++ .../configs/adapter_roberta-base/sst2.json | 46 ++++ .../configs/adapter_roberta-base/stsb.json | 46 ++++ .../adapter_roberta-base/superglue-boolq.json | 46 ++++ .../adapter_roberta-base/superglue-cb.json | 46 ++++ .../adapter_roberta-base/superglue-copa.json | 46 ++++ .../superglue-multirc.json | 46 ++++ .../superglue-record.json | 46 ++++ .../adapter_roberta-base/superglue-wic.json | 46 ++++ .../superglue-wsc.fixed.json | 46 ++++ .../configs/config_gen.py | 14 ++ .../parallel_adapter_roberta-base/cola.json | 46 ++++ .../parallel_adapter_roberta-base/mnli.json | 46 ++++ .../parallel_adapter_roberta-base/mrpc.json | 46 ++++ .../parallel_adapter_roberta-base/qnli.json | 46 ++++ .../parallel_adapter_roberta-base/qqp.json | 46 ++++ .../parallel_adapter_roberta-base/rte.json | 46 ++++ .../parallel_adapter_roberta-base/sst2.json | 46 ++++ .../parallel_adapter_roberta-base/stsb.json | 46 ++++ .../superglue-boolq.json | 46 ++++ .../superglue-cb.json | 46 ++++ .../superglue-copa.json | 46 ++++ .../superglue-multirc.json | 46 ++++ .../superglue-record.json | 46 ++++ .../superglue-wic.json | 46 ++++ .../superglue-wsc.fixed.json | 46 ++++ opendelta/auto_delta.py | 2 + opendelta/delta_models/adapter.py | 22 +- opendelta/delta_models/lora_old.py | 126 ----------- opendelta/delta_models/parallel_adapter.py | 196 ++++++++++++++++++ 37 files changed, 1609 insertions(+), 147 deletions(-) create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/cola.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/mnli.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/mrpc.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/qnli.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/qqp.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/rte.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/sst2.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/stsb.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/superglue-boolq.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/superglue-cb.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/superglue-copa.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/superglue-multirc.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/superglue-record.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/superglue-wic.json create mode 100644 examples/examples_text-classification/configs/adapter_roberta-base/superglue-wsc.fixed.json create mode 100644 
examples/examples_text-classification/configs/parallel_adapter_roberta-base/cola.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/mnli.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/mrpc.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/qnli.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/qqp.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/rte.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/sst2.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/stsb.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-boolq.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-cb.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-copa.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-multirc.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-record.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wic.json create mode 100644 examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wsc.fixed.json delete mode 100644 opendelta/delta_models/lora_old.py create mode 100644 opendelta/delta_models/parallel_adapter.py diff --git a/.gitignore b/.gitignore index e099ba4..2009b66 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,5 @@ _build/ outputs/ log.txt **/DeltaHub/ -*beans \ No newline at end of file +*beans +transformers diff --git a/examples/examples_seq2seq/configs/config_gen.py b/examples/examples_seq2seq/configs/config_gen.py index 073a112..70b1291 100644 --- a/examples/examples_seq2seq/configs/config_gen.py +++ b/examples/examples_seq2seq/configs/config_gen.py @@ -66,6 +66,19 @@ AllConfigs['adapter_t5-base'].update({ "output_dir": "outputs/adapter/t5-base/", }) +AllConfigs['parallel_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['parallel_adapter_t5-base'].update({ + "delta_type": "parallel_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/parallel_adapter/t5-base/", + }) + AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) AllConfigs['lora_t5-base'].update({ "delta_type": "lora", diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/cola.json b/examples/examples_text-classification/configs/adapter_roberta-base/cola.json new file mode 100644 index 0000000..f2b7146 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/cola.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cola", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": 
"eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/cola", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "cola", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cola", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/mnli.json b/examples/examples_text-classification/configs/adapter_roberta-base/mnli.json new file mode 100644 index 0000000..91ecb3e --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/mnli.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/mrpc.json b/examples/examples_text-classification/configs/adapter_roberta-base/mrpc.json new file mode 100644 index 0000000..df7a01e --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/mrpc.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mrpc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/mrpc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mrpc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mrpc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git 
a/examples/examples_text-classification/configs/adapter_roberta-base/qnli.json b/examples/examples_text-classification/configs/adapter_roberta-base/qnli.json new file mode 100644 index 0000000..5292173 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/qnli.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/qnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/qqp.json b/examples/examples_text-classification/configs/adapter_roberta-base/qqp.json new file mode 100644 index 0000000..471844c --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/qqp.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qqp", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/qqp", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qqp", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qqp", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/rte.json b/examples/examples_text-classification/configs/adapter_roberta-base/rte.json new file mode 100644 index 0000000..01bef33 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/rte.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + 
"metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/sst2.json b/examples/examples_text-classification/configs/adapter_roberta-base/sst2.json new file mode 100644 index 0000000..8638837 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/sst2.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/stsb.json b/examples/examples_text-classification/configs/adapter_roberta-base/stsb.json new file mode 100644 index 0000000..751ccc1 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/stsb.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "stsb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/stsb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "stsb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "stsb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git 
a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-boolq.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-boolq.json new file mode 100644 index 0000000..37fcc44 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-boolq.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-cb.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-cb.json new file mode 100644 index 0000000..5a7c2f8 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-cb.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-cb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/superglue-cb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-cb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-cb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-copa.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-copa.json new file mode 100644 index 0000000..c7af0f7 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-copa.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-copa", + 
"eval_steps": 50, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 40, + "output_dir": "outputs/adapter/roberta-base/superglue-copa", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 50, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-copa", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-copa", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-multirc.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-multirc.json new file mode 100644 index 0000000..8625c6c --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-multirc.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-multirc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/superglue-multirc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-multirc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-multirc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-record.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-record.json new file mode 100644 index 0000000..9326a30 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-record.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-record", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/roberta-base/superglue-record", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", 
+ "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-record", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-record", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wic.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wic.json new file mode 100644 index 0000000..f561411 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wic.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wic", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wic", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wic", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wsc.fixed.json b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wsc.fixed.json new file mode 100644 index 0000000..a017357 --- /dev/null +++ b/examples/examples_text-classification/configs/adapter_roberta-base/superglue-wsc.fixed.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wsc.fixed", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/adapter/roberta-base/superglue-wsc.fixed", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-wsc.fixed", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wsc.fixed", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/config_gen.py b/examples/examples_text-classification/configs/config_gen.py index 14101ce..554fd5e 100644 --- 
a/examples/examples_text-classification/configs/config_gen.py +++ b/examples/examples_text-classification/configs/config_gen.py @@ -161,6 +161,20 @@ AllConfigs['adapter_roberta-base'].update({ "output_dir": "outputs/adapter/roberta-base/", }) +AllConfigs['parallel_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) +AllConfigs['parallel_adapter_roberta-base'].update({ + "delta_type": "parallel_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier", + ], + "bottleneck_dim":24, + "output_dir": "outputs/parallel_adapter/roberta-base/", + }) + AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) AllConfigs['lora_roberta-base'].update({ "delta_type": "lora", diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/cola.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/cola.json new file mode 100644 index 0000000..093e646 --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/cola.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cola", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/cola", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "cola", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cola", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/mnli.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/mnli.json new file mode 100644 index 0000000..a0dc9ec --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/mnli.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "roberta-base", + 
"unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/mrpc.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/mrpc.json new file mode 100644 index 0000000..9c9c060 --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/mrpc.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mrpc", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/mrpc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "mrpc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mrpc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/qnli.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/qnli.json new file mode 100644 index 0000000..021ee0e --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/qnli.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qnli", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/qnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qnli", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/qqp.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/qqp.json new file mode 100644 index 0000000..be3afde --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/qqp.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": 
true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "qqp", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/qqp", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "qqp", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "qqp", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/rte.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/rte.json new file mode 100644 index 0000000..3a1710f --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/rte.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/sst2.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/sst2.json new file mode 100644 index 0000000..21b6f89 --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/sst2.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + 
"save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/stsb.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/stsb.json new file mode 100644 index 0000000..5845f4f --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/stsb.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "stsb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/stsb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "stsb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "stsb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-boolq.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-boolq.json new file mode 100644 index 0000000..48747fe --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-boolq.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-cb.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-cb.json new 
file mode 100644 index 0000000..2e8a874 --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-cb.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-cb", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-cb", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-cb", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-cb", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-copa.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-copa.json new file mode 100644 index 0000000..46c7216 --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-copa.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-copa", + "eval_steps": 50, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 40, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-copa", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 50, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-copa", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-copa", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-multirc.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-multirc.json new file mode 100644 index 0000000..60ba873 --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-multirc.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-multirc", + "eval_steps": 200, + "evaluation_strategy": "steps", + 
"greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-multirc", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-multirc", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-multirc", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-record.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-record.json new file mode 100644 index 0000000..4ce9097 --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-record.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-record", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 3, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-record", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "task_name": "superglue-record", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-record", + "tokenizer_name": "roberta-base", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm", + "classifier" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wic.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wic.json new file mode 100644 index 0000000..c920a7a --- /dev/null +++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wic.json @@ -0,0 +1,46 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "parallel_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wic", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "eval_accuracy", + "model_name_or_path": "roberta-base", + "num_train_epochs": 20, + "output_dir": "outputs/parallel_adapter/roberta-base/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_hub": true, + "save_steps": 
100,
+    "save_strategy": "steps",
+    "save_total_limit": 1,
+    "seed": 42,
+    "task_name": "superglue-wic",
+    "test_dataset_config_name": [
+        "en"
+    ],
+    "test_dataset_name": "superglue-wic",
+    "tokenizer_name": "roberta-base",
+    "unfrozen_modules": [
+        "deltas",
+        "layer_norm",
+        "final_layer_norm",
+        "classifier"
+    ],
+    "warmup_steps": 0
+}
\ No newline at end of file
diff --git a/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wsc.fixed.json b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wsc.fixed.json
new file mode 100644
index 0000000..563af04
--- /dev/null
+++ b/examples/examples_text-classification/configs/parallel_adapter_roberta-base/superglue-wsc.fixed.json
@@ -0,0 +1,46 @@
+{
+    "bottleneck_dim": 24,
+    "dataset_config_name": [
+        "en"
+    ],
+    "delta_type": "parallel_adapter",
+    "do_eval": true,
+    "do_test": true,
+    "do_train": true,
+    "eval_dataset_config_name": [
+        "en"
+    ],
+    "eval_dataset_name": "superglue-wsc.fixed",
+    "eval_steps": 100,
+    "evaluation_strategy": "steps",
+    "greater_is_better": true,
+    "learning_rate": 0.0003,
+    "load_best_model_at_end": true,
+    "max_source_length": 256,
+    "metric_for_best_model": "eval_accuracy",
+    "model_name_or_path": "roberta-base",
+    "num_train_epochs": 20,
+    "output_dir": "outputs/parallel_adapter/roberta-base/superglue-wsc.fixed",
+    "overwrite_output_dir": true,
+    "per_device_eval_batch_size": 32,
+    "per_device_train_batch_size": 32,
+    "predict_with_generate": true,
+    "push_to_hub": true,
+    "save_steps": 100,
+    "save_strategy": "steps",
+    "save_total_limit": 1,
+    "seed": 42,
+    "task_name": "superglue-wsc.fixed",
+    "test_dataset_config_name": [
+        "en"
+    ],
+    "test_dataset_name": "superglue-wsc.fixed",
+    "tokenizer_name": "roberta-base",
+    "unfrozen_modules": [
+        "deltas",
+        "layer_norm",
+        "final_layer_norm",
+        "classifier"
+    ],
+    "warmup_steps": 0
+}
\ No newline at end of file
diff --git a/opendelta/auto_delta.py b/opendelta/auto_delta.py
index 9680c36..ae24419 100644
--- a/opendelta/auto_delta.py
+++ b/opendelta/auto_delta.py
@@ -18,6 +18,7 @@ DELTA_CONFIG_MAPPING = {
     "compacter":"CompacterConfig",
     "prefix": "PrefixConfig",
     "soft_prompt": "SoftPromptConfig",
+    "parallel_adapter": "ParallelAdapterConfig",
 }
 
 DELTA_MODEL_MAPPING = {
@@ -28,6 +29,7 @@
     "compacter": "CompacterModel",
     "prefix": "PrefixModel",
     "soft_prompt": "SoftPromptModel",
+    "parallel_adapter": "ParallelAdapterModel",
 }
 
 class _LazyConfigMapping(OrderedDict):
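With these two registry entries in place, the new delta type can be built by name. A minimal usage sketch follows; this is a hypothetical snippet for illustration, not part of this patch, and it assumes the `AutoDeltaConfig.from_dict` / `AutoDeltaModel.from_config` API together with a roberta-base backbone as in the configs above:

```python
from transformers import AutoModelForSequenceClassification
from opendelta import AutoDeltaConfig, AutoDeltaModel

backbone = AutoModelForSequenceClassification.from_pretrained("roberta-base")

# "parallel_adapter" now resolves to ParallelAdapterConfig / ParallelAdapterModel
delta_config = AutoDeltaConfig.from_dict({
    "delta_type": "parallel_adapter",
    "bottleneck_dim": 24,
})
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=backbone)

# train only the delta parameters plus the modules the configs leave unfrozen
delta_model.freeze_module(exclude=["deltas", "layer_norm", "classifier"])
delta_model.log()  # inspect which modules were modified and what stays trainable
```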
diff --git a/opendelta/delta_models/adapter.py b/opendelta/delta_models/adapter.py
index cf7822e..7f9bf56 100644
--- a/opendelta/delta_models/adapter.py
+++ b/opendelta/delta_models/adapter.py
@@ -105,7 +105,6 @@ class AdapterConfig(BaseDeltaConfig):
         self,
         bottleneck_dim: Optional[int]=24,
         non_linearity: Optional[str]='gelu_new',
-        sequential: Optional[str] = True,
         **kwargs
     ):
         super().__init__(**kwargs)
@@ -136,13 +135,8 @@ class AdapterModel(DeltaBase):
         backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified.
         bottleneck_dim (:obj:`int`): The dimension of the adapter's bottleneck.
         non_linearity (:obj:`str`): The non linearity of the adapter.
-        sequential (:obj:`str`): Whether insert the adapter in a sequential manner, as opposed to a parallel manner.
-            See `Towards a Unified View of Parameter-Efficient Transfer Learning <https://arxiv.org/abs/2110.04366>`_
-            for detail.
-        modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only
-            the implemented ones)
-        unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen
-            together with the prefix parameters.
+        modified_modules (:obj:`List[str]`): the modules after which the adapter is inserted.
+        unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen together with the adapter parameters.
         common_structure (:obj:`bool`): whether using name-based addressing witha common structure mapping.
 
     """
@@ -153,7 +147,6 @@
                  backbone_model: nn.Module,
                  bottleneck_dim: Optional[int]=24,
                  non_linearity: Optional[str]='gelu_new',
-                 sequential: Optional[str] = True,
                  modified_modules: Optional[bool] = None,
                  unfrozen_modules: Optional[bool] = None,
                  common_structure: Optional[bool] = None,
@@ -177,17 +170,6 @@
                                        self.modified_modules,
                                        )
-
-    def add_all_delta_to_backbone(self,
-                                  module: nn.Module,
-                                  modified_modules: List[str],
-                                  ) -> nn.Module:
-        for key, _ in module.named_modules():
-            if self.find_key(key, modified_modules):
-                self.update_module(module, key)
-        self._pseudo_data_to_instantiate(module)
-        self.mark_as_delta()
-        return module
 
     def update_module(self, module: nn.Module, key: str):
         _, _, ref = self.find_module(module, key)
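The deleted `add_all_delta_to_backbone` override appears to duplicate the base-class implementation, so `AdapterModel` now simply inherits it from `DeltaBase`. With the `sequential` flag gone (the parallel variant now lives in its own `ParallelAdapterModel` below), direct usage reduces to the sketch here; this is a hypothetical snippet for illustration, not part of this patch:

```python
from transformers import AutoModelForSequenceClassification
from opendelta import AdapterModel

backbone = AutoModelForSequenceClassification.from_pretrained("roberta-base")

# insert a bottleneck adapter after every attention and feed-forward sublayer;
# "attn" and "ff" are resolved through the common structure mapping
delta_model = AdapterModel(
    backbone_model=backbone,
    bottleneck_dim=24,
    non_linearity="gelu_new",
    modified_modules=["attn", "ff"],
    common_structure=True,
)
delta_model.freeze_module(exclude=["deltas", "layer_norm", "classifier"])
```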
diff --git a/opendelta/delta_models/lora_old.py b/opendelta/delta_models/lora_old.py
deleted file mode 100644
index d4954dc..0000000
--- a/opendelta/delta_models/lora_old.py
+++ /dev/null
@@ -1,126 +0,0 @@
-from typing import Optional, Union
-
-from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func
-from opendelta.utils.name_based_addressing import *
-from opendelta.basemodel import DeltaBase
-from transformers.models.t5 import T5ForConditionalGeneration
-import loralib as lora
-import torch.nn as nn
-from opendelta import BaseDeltaConfig
-
-class LoraConfig(BaseDeltaConfig):
-    r"""
-    This is the configuration class to store the configuration of a :py:class:`~LoraModel`
-
-    """
-    def __init__(
-        self,
-        lora_r=8,
-        lora_alpha=16,
-        lora_dropout=0.0,
-        **kwargs
-    ):
-        super().__init__(**kwargs)
-        arg_names = get_arg_names_inside_func(self.__init__)
-        for arg_name in arg_names:
-            if not hasattr(self, arg_name): # the arg has not been registered in parent config
-                setattr(self, arg_name, locals()[arg_name])
-
-
-class LoraModel(DeltaBase):
-    r""" The implementation of `LoRA: Low-Rank Adaptation of Large Language Models <https://arxiv.org/abs/2106.09685>`_ .
-    Thanks for their `loralib <https://github.com/microsoft/LoRA>`_, we use loralib.linear
-    to replace the linear layer of the backbone model.
-
-    class attributes:
-        - default_modified_modules = ['attn.q', 'attn.v'] According to the paper, they modify q and v matrix in the
-        attention layer. However, other linears can also be modified, and may lead to better performance.
-
-    .. note::
-        modified_modules should point to linear layer. We currently don't support broadcast to all linears in
-        a module's child modules.
-
-        - delta_type = "lora"
-
-
-    Args:
-        backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified.
-        lora_r (:obj:`int`, *optional*): the rank of the lora parameters. The smaller lora_r is , the fewer parameters lora has.
-        lora_alpha (:obj:`bool`, *optional*): A hyper-parameter to control the init scale of loralib.linear .
-        lora_dropout (:obj:`bool`, *optional*): The dropout rate in lora.linear.
-        modified_modules (:obj:`List[str]`): For prefix tuning, the it must refer to an attention layer (Currently, only
-            the implemented ones)
-        unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen
-            together with the prefix parameters.
-        common_structure (:obj:`bool`): whether using name-based addressing witha common structure mapping.
-
-    """
-
-    config_class = LoraConfig
-    delta_type = "lora"
-    default_modified_modules = ['attn.q', 'attn.v']
-    def __init__(self,
-                 backbone_model: nn.Module,
-                 lora_r=8,
-                 lora_alpha=16,
-                 lora_dropout=0.0,
-                 modified_modules: Optional[bool] = None,
-                 unfrozen_modules: Optional[bool] = None,
-                 common_structure: Optional[bool] = None,
-                 interactive_modify: Optional[Union[bool, int]] = False,
-                 ):
-        DeltaBase.__init__(self,
-                           backbone_model,
-                           modified_modules=modified_modules,
-                           unfrozen_modules=unfrozen_modules,
-                           common_structure=common_structure,
-                           interactive_modify=interactive_modify,
-                           )
-        arg_names = get_arg_names_inside_func(self.__init__)
-        for arg_name in arg_names:
-            if not hasattr(self, arg_name): # not registered in parent class
-                setattr(self, arg_name, locals()[arg_name])
-
-        self.delta_modules = nn.ModuleList()
-
-        self.add_all_delta_to_backbone(self.backbone_model,
-                                       self.modified_modules,
-                                       )
-
-
-
-    def update_module(self, module: nn.Module, key: str):
-        parent_ref, child_name, child_ref = self.find_module(module, key)
-        new_module = self.new_module_like(child_module=child_ref)
-        self.replace_module(parent_ref, child_name, child_ref, new_module, delta_name="lora")
-
-    def _pseudo_data_to_instantiate(self, module):
-        # no need to pass pseudo input, so overwrite it
-        pass
-
-    def new_module_like(self, child_module):
-        if isinstance(child_module, nn.Linear):
-            in_features, out_features = child_module.in_features, child_module.out_features
-            new_module = lora.Linear(in_features=in_features,
-                                     out_features=out_features,
-                                     r=self.lora_r,
-                                     lora_alpha=self.lora_alpha,
-                                     lora_dropout=self.lora_dropout)
-            new_module.weight = child_module.weight
-            new_module.bias = child_module.bias # if bias is None, also copy
-        else:
-            raise NotImplementedError
-        return new_module
-
-
-
-    def mark_as_delta(self, module: nn.Module = None):
-        if module is None:
-            module=self
-        for n, p in module.named_parameters():
-            param_name = n.split(".")[-1]
-            if "lora_A" in param_name or "lora_B" in param_name: # only lora_A, lora_B is the delta parameter.
-                setattr(p, "_is_delta", True)
-
-
-    
\ No newline at end of file
diff --git a/opendelta/delta_models/parallel_adapter.py b/opendelta/delta_models/parallel_adapter.py
new file mode 100644
index 0000000..7290614
--- /dev/null
+++ b/opendelta/delta_models/parallel_adapter.py
@@ -0,0 +1,196 @@
+from functools import partial
+from random import random
+from typing import Optional, Union
+from opendelta.utils.signature import get_arg_names_inside_func
+from opendelta.utils.name_based_addressing import *
+from opendelta.utils.cuda import get_device
+from opendelta.basemodel import DeltaBase
+import loralib as lora
+import torch.nn as nn
+import torch
+import math
+from opendelta.delta_models.layers.activations import Activations
+import inspect
+from opendelta import BaseDeltaConfig
+import opendelta.utils.logging as logging
+logger = logging.get_logger(__name__)
+ """ + layer_count = 0 + + @classmethod + def count_layer(cls): + cls.layer_count += 1 + + @classmethod + def get_layer_count(cls): + return cls.layer_count + + def __init__(self, bottleneck_dim=24, non_linearity='gelu_new', scaled=1, device=None): + super().__init__() + self.bottleneck_dim = bottleneck_dim + self.device = device + self.instantiated = False + self.non_linearity = non_linearity + self.scaled = scaled + + self.layer_id = ParallelAdapterLayer.get_layer_count() + ParallelAdapterLayer.count_layer() + + + def instantiate(self, hidden_dim): + self.modulelist = nn.Sequential() + self.modulelist.add_module("down_proj",nn.Linear(hidden_dim, self.bottleneck_dim, device=self.device)) + + # select non-linearity + self.modulelist.add_module("non_linear", Activations(self.non_linearity.lower())) + + self.modulelist.add_module("up_proj", nn.Linear(self.bottleneck_dim, self.hidden_dim, device=self.device)) + + # TODO: + # If we want to have a layer norm on output, we apply it later after a separate residual connection + # This means that we learn a new output layer norm, which replaces another layer norm learned in the bert layer + # if self.add_layer_norm_after: + # self.adapter_norm_after = nn.LayerNorm(self.input_size) + + self.instantiated = True + # initialize the weight, which is important for fast convergence and better performance. + self.apply(self._init_weight) + + def _init_weight(self, module): + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=0.01) + if module.bias is not None: + module.bias.data.zero_() + + + def pre_forward(self, *args, **kwargs): + r""" Get the hidden_states from the PLM's layer output, pass it into the adapter, + then combined with the main hidden_states. Finally pass it into the subsequent layer. + + """ + if isinstance(args, tuple): + hiddens = args[0] + elif isinstance(args, torch.Tensor): + hiddens = args + else: + raise TypeError + + + if not self.instantiated: + self.hidden_dim = hiddens.shape[-1] + logger.debug(f"Got hidden dim hidden_dim {self.hidden_dim}") + self.instantiate(hidden_dim=self.hidden_dim) + + + self.adapter_output = self.modulelist(hiddens) * self.scaled + hiddens # TODO add hiddens? + return args, kwargs + + def post_forward(self, *args, **kwargs): + if isinstance(args, tuple): + output = args[0] + elif isinstance(args, torch.Tensor): + output = args + else: + raise TypeError + + modified_output = self.adapter_output + output + return modified_output + + + +class ParallelAdapterConfig(BaseDeltaConfig): + r""" + This is the configuration class to store the configuration of a :py:class:`~ParallelAdapterModel` + + """ + def __init__( + self, + bottleneck_dim: Optional[int]=24, + non_linearity: Optional[str]='gelu_new', + scaled: Optional[float]=1., + **kwargs + ): + super().__init__(**kwargs) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # the arg has not been registered in parent config + setattr(self, arg_name, locals()[arg_name]) + + + +class ParallelAdapterModel(DeltaBase): + r""" The implementation of Parallel Adapter(`TOWARDS A UNIFIED VIEW OF PARAMETER-EFFICIENT TRANSFER LEARNING `_ ) . + Add adapter to the designated ``modified_modules``. In parallel paradigm, The modules' output is then passed into the adapter's + post_forward. + + .. note:: + We **assume** the output of the modified module is the hidden state or a tuple where hidden state is the + first element. This is true for most PLMs. 
+class ParallelAdapterModel(DeltaBase): + r""" The implementation of Parallel Adapter (`Towards a Unified View of Parameter-Efficient Transfer Learning <https://arxiv.org/abs/2110.04366>`_) . + Add adapters alongside the designated ``modified_modules``. In the parallel paradigm, the module's output is then passed into the adapter's + post_forward, where the cached adapter output is added to it. + + .. note:: + We **assume** the output of the modified module is the hidden state or a tuple where hidden state is the + first element. This is true for most PLMs. However, we admit that currently it's not rigorous, and we will improve + it in the next version. For now, if you encounter an error here for your backbone, you can modify the code to + extract the hidden state. + + class attributes: + - default_modified_modules = ["attn", "attn", "ff.w1", "ff.w2"] Following the paper, we add one parallel adapter + across the attention layer and another across the feed-forward layer. + - delta_type = "adapter" + + Args: + backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. + bottleneck_dim (:obj:`int`): The dimension of the adapter's bottleneck. + non_linearity (:obj:`str`): The non-linearity of the adapter. + modified_modules (:obj:`List[str]`): The modules around which to add the parallel adapter. Must be paired. For example, ["attn", "attn", "ff.w1", "ff.w2"] adds one parallel adapter from attn's input to attn's output, and another from ff.w1's input to ff.w2's output. + unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen together with the parallel adapter parameters. + common_structure (:obj:`bool`): whether to use name-based addressing with a common structure mapping. + + """ + config_class = ParallelAdapterConfig + delta_type = "adapter" + default_modified_modules = ["attn", "attn", "ff.w1", "ff.w2"] + def __init__(self, + backbone_model: nn.Module, + bottleneck_dim: Optional[int]=24, + non_linearity: Optional[str]='gelu_new', + modified_modules: Optional[List[str]] = None, + unfrozen_modules: Optional[List[str]] = None, + common_structure: Optional[bool] = None, + interactive_modify: Optional[Union[bool, int]] = False, + ): + DeltaBase.__init__(self, + backbone_model, + modified_modules=modified_modules, + unfrozen_modules=unfrozen_modules, + common_structure=common_structure, + interactive_modify=interactive_modify, + ) + arg_names = get_arg_names_inside_func(self.__init__) + for arg_name in arg_names: + if not hasattr(self, arg_name): # not registered in parent class + setattr(self, arg_name, locals()[arg_name]) + + self.delta_modules = nn.ModuleList() + + self.ith = 0 + self.add_all_delta_to_backbone(self.backbone_model, + self.modified_modules, + ) + + + def update_module(self, module: nn.Module, key: str): + _, _, ref = self.find_module(module, key) + if self.ith % 2 == 0: + adapterlayer = self.new_module_like(ref) + self.insert_before_module(ref, delta_module=adapterlayer, delta_name="parallel_adapter") + else: + adapterlayer = self.delta_modules[-1] + self.insert_after_module(ref, delta_module=adapterlayer, delta_name="parallel_adapter") + self.ith += 1 + + def new_module_like(self, module): + module_device = get_device(module) + adapterlayer = ParallelAdapterLayer(bottleneck_dim=self.bottleneck_dim, non_linearity=self.non_linearity, device=module_device) + self.delta_modules.append(adapterlayer) + return adapterlayer + \ No newline at end of file
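A note on the pairing convention consumed by update_module above, sketched under the assumption that the backbone exposes submodules named attn, ff.w1 and ff.w2 through the common structure mapping:

    # modified_modules = ["attn", "attn", "ff.w1", "ff.w2"]
    # pair ("attn", "attn"):   pre_forward hooks before attn (the adapter reads attn's input);
    #                          post_forward hooks after the same attn (the adapter output is added to attn's output)
    # pair ("ff.w1", "ff.w2"): pre_forward hooks before ff.w1, post_forward hooks after ff.w2,
    #                          so a single adapter spans the whole feed-forward block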
From 0d38e6509f7281e20a3e92c3a62b9a4db9da3e9d Mon Sep 17 00:00:00 2001 From: Achazwl <323163497@qq.com> Date: Sat, 26 Feb 2022 09:00:12 +0800 Subject: [PATCH 2/3] init --- opendelta/basemodel.py | 113 +++++++-------------- opendelta/delta_models/lora.py | 6 ++ opendelta/delta_models/parallel_adapter.py | 40 ++++---- opendelta/utils/data_parallel.py | 63 ++++++++---- 4 files changed, 108 insertions(+), 114 deletions(-) diff --git a/opendelta/basemodel.py index ac21334..16d3af7 100644 --- a/opendelta/basemodel.py +++ b/opendelta/basemodel.py @@ -22,6 +22,7 @@ from opendelta import logging from opendelta.utils.structure_mapping import CommonStructureMap from opendelta.utils.interactive.web import interactive from opendelta.utils.data_parallel import new_replicate_for_data_parallel +from opendelta.utils.data_parallel import caller_map logger = logging.get_logger(__name__) def is_leaf_module(module): @@ -480,7 +481,41 @@ class DeltaBase(nn.Module, SaveLoadMixin): """ raise NotImplementedError - def insert_sequential_module(self, module, delta_module=None, delta_name='delta', strict=False, _delta_info=None): + def insert_module(self, module, method, delta_module=None, delta_name='delta', strict=False, _delta_info=None): + if strict: + if hasattr(module.forward, "__wrapped__"): + raise RuntimeWarning("The forward function might have been wrapped by a decorator, is it intended?") + + # record info for plug and unplug and nested wrap + if _delta_info is None: + if delta_module is None: + raise RuntimeError("delta module can't be none to ensure successful replicate of the parent module.") + + _delta_info = {"method": method, + "delta_module": delta_module, + "delta_name": delta_name, + "delta_belong": self, + "state": "on"} + self._register_delta_infos(parent_module=module, + _delta_info = _delta_info) + else: + delta_module = _delta_info["delta_module"] + delta_name = _delta_info["delta_name"] + + setattr(module, _delta_info['delta_name'], _delta_info["delta_module"]) + + if _delta_info["method"] in caller_map.keys(): + caller = caller_map[_delta_info["method"]] + new_forward = decorate(module.forward, caller, extras=(module, _delta_info['delta_name']), kwsyntax=True) # decorator.decorate helps preserve the function's metadata (signature, etc.). + module.forward = new_forward.__get__(module, type(module)) # func.__get__(object, type(object)) registers a function as an object's method + # for DataParallel's copy behavior. Experimental: + # may have bugs when module.forward is nestedly wrapped. + module._replicate_for_data_parallel = new_replicate_for_data_parallel.__get__(module, type(module)) + else: + raise NotImplementedError(f"_delta_info['method']=='{_delta_info['method']}' is not supported") + + + def insert_sequential_module(self, module, delta_module=None, delta_name='delta', strict=False, _delta_info=None): r"""insert a module (which previously did not exist in the code base) before/after a module. Specifically, it modifies the forward function of the original module so that the arguments are first passed into the new module's forward function and then into the original one. The new module can also be inserted after the original module with a similar mechanism. @@ -496,46 +531,7 @@ class DeltaBase(nn.Module, SaveLoadMixin): original delta is passed through ``_delta_info``.
""" - def _caller(_org_func, org_module, delta_name, *args, **kwargs): - args = args[1:] # the first argument here is ``self`` - delta_module = getattr(org_module, delta_name) - if hasattr(delta_module, "pre_forward"):# is not None: - args, kwargs = delta_module.pre_forward(*args, **kwargs) - # from IPython import embed - # embed(header = "true") - ret = _org_func(*args, **kwargs) - if hasattr(delta_module, "post_forward"):# is not None: - ret = delta_module.post_forward(ret) - return ret - - - if strict: - if hasattr(module.forward, "__wrapped__"): - raise RuntimeWarning("The forward function might have been wrapped by a decorator, is it intended?") - - # record info for plug and unplug and nested wrap - if _delta_info is None: - if delta_module is None: - raise RuntimeError("delta module can't be none to ensure successful replicate of the parent module.") - - _delta_info = {"method": "insert_sequential", - "delta_module": delta_module, - "delta_name": delta_name, - "delta_belong": self, - "state": "on"} - self._register_delta_infos(parent_module=module, - _delta_info = _delta_info) - else: - delta_module = _delta_info["delta_module"] - delta_name = _delta_info["delta_name"] - - setattr(module, _delta_info['delta_name'], _delta_info["delta_module"]) - - new_forward = decorate(module.forward, _caller, extras=(module, _delta_info['delta_name']), kwsyntax=True) # decorator.decorate helps preserving the functions metadata (signature, etc.). - module.forward = new_forward.__get__(module, type(module)) # func.__get__(object, type(object)) register a function as an object's method - # for DataParallel's copy behavior. Experimental: - # may have bugs when module.forward is nestedly wrapped. - module._replicate_for_data_parallel = new_replicate_for_data_parallel.__get__(module, type(module)) + self.insert_module(module, "sequential", delta_module, delta_name, strict, _delta_info) def insert_parallel_module(self, module, delta_module=None, delta_name='delta', strict=False, _delta_info=None): @@ -555,40 +551,7 @@ class DeltaBase(nn.Module, SaveLoadMixin): """ - def _caller(_org_func, org_module, delta_name, *args, **kwargs): - args = args[1:] # the first argument here is ``self`` - delta_module = getattr(org_module, delta_name) - ret_1 = _org_func(*args, **kwargs) - ret_2 = delta_module.forward(*args, **kwargs) - return ret_1 + ret_2 - - if strict: - if hasattr(module.forward, "__wrapped__"): - raise RuntimeWarning("The forward function might have been wrapped by a decorator, is it intended?") - - # record info for plug and unplug and nested wrap - if _delta_info is None: - if delta_module is None: - raise RuntimeError("delta module can't be none to ensure successful replicate of the parent module.") - - _delta_info = {"method": "insert_parallel", - "delta_module": delta_module, - "delta_name": delta_name, - "delta_belong": self, - "state": "on"} - self._register_delta_infos(parent_module=module, - _delta_info = _delta_info) - else: - delta_module = _delta_info["delta_module"] - delta_name = _delta_info["delta_name"] - - setattr(module, _delta_info['delta_name'], _delta_info["delta_module"]) - - new_forward = decorate(module.forward, _caller, extras=(module, _delta_info['delta_name']), kwsyntax=True) # decorator.decorate helps preserving the functions metadata (signature, etc.). - module.forward = new_forward.__get__(module, type(module)) # func.__get__(object, type(object)) register a function as an object's method - # for DataParallel's copy behavior. 
diff --git a/opendelta/delta_models/lora.py index 492fea6..09399eb 100644 --- a/opendelta/delta_models/lora.py +++ b/opendelta/delta_models/lora.py @@ -44,6 +44,12 @@ class LowRankLinear(nn.Module): def forward(self, x): return (self.lora_dropout(x) @ self.lora_A.T @ self.lora_B.T) * self.scaling + # def pre_forward(self, *args, **kwargs): + # return (args[0] + (self.lora_dropout(args[0]) @ self.lora_A.T @ self.lora_B.T) * self.scaling,), {} + + # def post_forward(self, *args, **kwargs): + # return args[0] + (self.lora_dropout(args[0]) @ self.lora_A.T @ self.lora_B.T) * self.scaling + class LoraConfig(BaseDeltaConfig): r""" diff --git a/opendelta/delta_models/parallel_adapter.py index 7290614..b6530f3 100644 --- a/opendelta/delta_models/parallel_adapter.py +++ b/opendelta/delta_models/parallel_adapter.py @@ -49,12 +49,6 @@ class ParallelAdapterLayer(nn.Module): self.modulelist.add_module("up_proj", nn.Linear(self.bottleneck_dim, self.hidden_dim, device=self.device)) - # TODO: - # If we want to have a layer norm on output, we apply it later after a separate residual connection - # This means that we learn a new output layer norm, which replaces another layer norm learned in the bert layer - # if self.add_layer_norm_after: - # self.adapter_norm_after = nn.LayerNorm(self.input_size) - self.instantiated = True # initialize the weight, which is important for fast convergence and better performance. self.apply(self._init_weight) @@ -85,19 +79,25 @@ class ParallelAdapterLayer(nn.Module): self.instantiate(hidden_dim=self.hidden_dim) - self.adapter_output = self.modulelist(hiddens) * self.scaled + hiddens # TODO add hiddens? + self.adapter_output = self.modulelist(hiddens) * self.scaled return args, kwargs - def post_forward(self, *args, **kwargs): - if isinstance(args, tuple): - output = args[0] - elif isinstance(args, torch.Tensor): - output = args + def post_forward(self, output, **kwargs): + if isinstance(output, tuple): + hidden = output[0] + elif isinstance(output, torch.Tensor): + hidden = output else: raise TypeError - modified_output = self.adapter_output + output - return modified_output + modified_output = self.adapter_output + hidden + if isinstance(output, tuple): + output = (modified_output,) + output[1:] + elif isinstance(output, torch.Tensor): + output = modified_output + else: + raise TypeError + return output @@ -141,7 +141,7 @@ class ParallelAdapterModel(DeltaBase): backbone_model (:obj:`transformers.PretrainedModels`): The backbone model to be modified. bottleneck_dim (:obj:`int`): The dimension of the adapter's bottleneck. non_linearity (:obj:`str`): The non-linearity of the adapter. - modified_modules (:obj:`List[str]`): The modules around which to add the parallel adapter. Must be paired. For example, ["attn", "attn", "ff.w1", "ff.w2"] adds one parallel adapter from attn's input to attn's output, and another from ff.w1's input to ff.w2's output. + modified_modules (:obj:`List[str]`): The modules around which to add the parallel adapter. Must be paired and appear in the same order as in the layer.
For examples, ["attn", "attn", "ff.w1", "ff.w2"] add one parallel adapter from attn's input to attn's output, and another one from ff.w1's input to ff.w2's output. unfrozen_modules (:obj:`List[str]`, *optional*, default to :obj:`None`): The modules that should be unfrozen together with the parallel adapter parameters. common_structure (:obj:`bool`): whether using name-based addressing witha common structure mapping. @@ -182,11 +182,13 @@ class ParallelAdapterModel(DeltaBase): _, _, ref = self.find_module(module, key) if self.ith % 2 == 0: adapterlayer = self.new_module_like(ref) - self.insert_before_module(ref, delta_module=adapterlayer, delta_name="parallel_adapter") - else: - adapterlayer = self.delta_moduels[-1] - self.insert_after_module(ref, delta_module=adapterlayer, delta_name="parallel_adapter") + self.insert_module(ref, "before", delta_module=adapterlayer, delta_name="parallel_adapter") + if self.ith % 2 == 1 or self.modified_modules[self.ith] == self.modified_modules[self.ith + 1]: + adapterlayer = self.delta_modules[-1] + self.insert_module(ref, "after", delta_module=adapterlayer, delta_name="parallel_adapter") + self.ith |= 1 self.ith += 1 + self.ith %= len(self.modified_modules) def new_module_like(self, module): module_device = get_device(module) diff --git a/opendelta/utils/data_parallel.py b/opendelta/utils/data_parallel.py index ca0c4c0..8c32297 100644 --- a/opendelta/utils/data_parallel.py +++ b/opendelta/utils/data_parallel.py @@ -4,26 +4,50 @@ from opendelta.utils.decorate import decorate from collections import OrderedDict +def sequential_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + if hasattr(delta_module, "pre_forward"): + args, kwargs = delta_module.pre_forward(*args, **kwargs) + ret = _org_func(*args, **kwargs) + if hasattr(delta_module, "post_forward"): + ret = delta_module.post_forward(ret) + return ret + +def before_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + if hasattr(delta_module, "pre_forward"): + args, kwargs = delta_module.pre_forward(*args, **kwargs) + ret = _org_func(*args, **kwargs) + return ret + +def after_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + ret = _org_func(*args, **kwargs) + if hasattr(delta_module, "post_forward"): + ret = delta_module.post_forward(ret) + return ret + +def parallel_caller(_org_func, org_module, delta_name, *args, **kwargs): + args = args[1:] # the first argument here is ``self`` + delta_module = getattr(org_module, delta_name) + ret_1 = _org_func(*args, **kwargs) + ret_2 = delta_module.forward(*args, **kwargs) + return ret_1 + ret_2 + +caller_map = { + "sequential": sequential_caller, + "parallel": parallel_caller, + "before": before_caller, + "after": after_caller, +} + def new_replicate_for_data_parallel(self): r""" self is the parent module. """ # rewrite the replicate in DataParallel. 
- def _sequential_caller(_org_func, org_module, delta_name, *args, **kwargs): - args = args[1:] # the first argument here is ``self`` - delta_module = getattr(org_module, delta_name) - if hasattr(delta_module, "pre_forward"): - args, kwargs = delta_module.pre_forward(*args, **kwargs) - ret = _org_func(*args, **kwargs) - if hasattr(delta_module, "post_forward"): - ret = delta_module.post_forward(ret) - return ret - - def _parallel_caller(_org_func, org_module, delta_name, *args, **kwargs): - args = args[1:] # the first argument here is ``self`` - delta_module = getattr(org_module, delta_name) - ret_1 = _org_func(*args, **kwargs) - ret_2 = delta_module.forward(*args, **kwargs) - return ret_1 + ret_2 replica = self.__new__(type(self)) org_forward = replica.forward replica.__dict__ = self.__dict__.copy() @@ -33,10 +57,9 @@ def new_replicate_for_data_parallel(self): for _delta_info in self._delta_infos: if _delta_info['state'] == 'on': - if _delta_info['method'] == "insert_sequential": - new_forward = decorate(replica.forward, _sequential_caller, extras=(replica, _delta_info['delta_name']), kwsyntax=True) - elif _delta_info['method'] == "insert_parallel": - new_forward = decorate(replica.forward, _parallel_caller, extras=(replica, _delta_info['delta_name']), kwsyntax=True) + if _delta_info['method'] in caller_map.keys(): + caller = caller_map[_delta_info['method']] + new_forward = decorate(replica.forward, caller, extras=(replica, _delta_info['delta_name']), kwsyntax=True) else: raise NotImplementedError(f"data_parallel for _delta_info['method']=='{_delta_info['method']}' is not supported") replica.__dict__['forward'] = new_forward.__get__(replica, type(replica))
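A minimal end-to-end sketch of the feature these patches build, with assumptions flagged: it presumes that ParallelAdapterModel is exported at the package level, that roberta-base is covered by the common structure mapping, and that freeze_module and log are the usual DeltaBase utilities for freezing and inspecting the backbone.

    from transformers import AutoModelForSequenceClassification
    from opendelta import ParallelAdapterModel

    backbone = AutoModelForSequenceClassification.from_pretrained("roberta-base")
    # pairs pre/post hooks around the attention and feed-forward blocks
    delta = ParallelAdapterModel(backbone, bottleneck_dim=24, non_linearity='gelu_new')
    # train only the adapters and layer norms, mirroring the unfrozen_modules settings above
    delta.freeze_module(exclude=["deltas", "layer_norm"])
    delta.log()  # inspect which parameters remain trainable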
From 3867c0d8dc9f6ed7aa1754f39cd6578c7067110b Mon Sep 17 00:00:00 2001 From: shengdinghu Date: Mon, 17 Oct 2022 08:44:44 +0000 Subject: [PATCH 3/3] merge parallel-adapter succeed --- docs/source/notes/faq.md | 5 +++++ opendelta/basemodel.py | 10 ++-------- opendelta/delta_models/parallel_adapter.py | 11 ++++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/notes/faq.md index 056399e..164c3a0 100644 --- a/docs/source/notes/faq.md +++ b/docs/source/notes/faq.md @@ -7,3 +7,8 @@ 2. **Available Models with default configurations are ..., Please manually add the delta models by specifying 'modified_modules' based on the visualization of your model structure** Although most pre-trained models (PTMs) use the transformers architecture, they are implemented differently. For example, the attention module in GPT2 and BERT is not only named differently, but also implemented in different ways. Common structure mapping maps the different name conventions of different PTMs into a unified name convention. But there are many PTMs that we do not currently cover. Don't worry! For these models, you can figure out which modules you should modify by simply [visualizing the PTMs](visualization), and then specify the `modified_modules` manually (See [name-based addressing](namebasedaddr)). + + +3. **Requires dummy_inputs to be passed through the model to understand the dimensionality of each tensor in the computation graph. The {module.__class__.__name__} class has no dummy_inputs, and the automatically created dummy_inputs failed.** + + The `dummy_inputs` can be any data that makes `backbone_model.forward(**dummy_inputs)` succeed. Only the form and shape of the `dummy_inputs` matter. To set dummy_inputs for your model, please use: `setattr(backbone_model, 'dummy_inputs', some_dummy_inputs)` before initializing `{self.__class__.__name__}`. \ No newline at end of file
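To make the new FAQ entry concrete, a hedged example (backbone_model and the input shape are placeholders; any inputs that make backbone_model.forward(**dummy_inputs) succeed will do):

    import torch
    # only the form and shape matter, not the values
    some_dummy_inputs = {"input_ids": torch.zeros(1, 8, dtype=torch.long)}
    setattr(backbone_model, 'dummy_inputs', some_dummy_inputs)
    # ...then initialize the delta model, e.g. ParallelAdapterModel(backbone_model)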
diff --git a/opendelta/basemodel.py index 6ed815e..b7bf648 100644 --- a/opendelta/basemodel.py +++ b/opendelta/basemodel.py @@ -372,7 +372,7 @@ class DeltaBase(nn.Module, SaveLoadMixin): except: _auto_dummy_fail = True if _auto_dummy_fail: - raise AttributeError(f"\nThe {self.__class__.__name__} requires a pseudo-data to be passed through the model to understand the dimensionality of each tensor in the computation graph. \nThe automatically created dummy inputs failed.\nThe `dummy_inputs` can be any data that make `backbone_model.forward(**dummy_inputs)` succeed. Only the form and shape of the `dummy_inputs` matter.\n\tTo set dummy_inputs for your model, please use: `setattr(backbone_model, 'dummy_inputs', some_dummy_inputs)` before initializing `{self.__class__.__name__}` ") + raise AttributeError(f"\n\tThe {self.__class__.__name__} requires dummy_inputs to be passed through the model to understand the dimensionality of each tensor in the computation graph. \n\t The {module.__class__.__name__} class has no dummy_inputs, and the automatically created dummy_inputs failed.\n\t Refer to `https://opendelta.readthedocs.io/en/latest/notes/faq.html` for details.") @@ -804,13 +804,7 @@ class DeltaBase(nn.Module, SaveLoadMixin): if _delta_info['method'] == "replace": setattr(submodule, _delta_info["child_name"], _delta_info['org_module']) - elif _delta_info['method'] == "insert_sequential": - if hasattr(submodule.forward, "__wrapped__"): - submodule.forward = submodule.forward.__wrapped__ - delattr(submodule, _delta_info["delta_name"]) - else: - raise AttributeError("submodule {}'s forward has no attribute __wrapped__. It's not a wrapped function.".format(name)) - elif _delta_info['method'] == "insert_parallel": + elif _delta_info['method'] in ["sequential", "before", "after", "parallel"]: if hasattr(submodule.forward, "__wrapped__"): submodule.forward = submodule.forward.__wrapped__ delattr(submodule, _delta_info["delta_name"]) diff --git a/opendelta/delta_models/parallel_adapter.py index b6530f3..1024394 100644 --- a/opendelta/delta_models/parallel_adapter.py +++ b/opendelta/delta_models/parallel_adapter.py @@ -5,12 +5,9 @@ from opendelta.utils.signature import get_arg_names_inside_func from opendelta.utils.name_based_addressing import * from opendelta.utils.cuda import get_device from opendelta.basemodel import DeltaBase -import loralib as lora import torch.nn as nn import torch -import math from opendelta.delta_models.layers.activations import Activations -import inspect from opendelta import BaseDeltaConfig import opendelta.utils.logging as logging logger = logging.get_logger(__name__) @@ -147,13 +144,16 @@ class ParallelAdapterModel(DeltaBase): """ config_class = ParallelAdapterConfig - delta_type = "adapter" - default_modified_modules = ["attn", "attn", "ff.w1", "ff.w2"] + delta_type = "parallel_adapter" + default_modified_modules = ["attn@", "attn@", "ff@.w1@", "ff@.w2@"] + # default_modified_modules = ["attn", "attn", "ff.w1", "ff.w2"] + _need_pseudo_data = True def __init__(self, backbone_model: nn.Module, bottleneck_dim: Optional[int]=24, non_linearity: Optional[str]='gelu_new', modified_modules: Optional[List[str]] = None, + exclude_modules: Optional[List[str]] = None, unfrozen_modules: Optional[List[str]] = None, common_structure: Optional[bool] = None, interactive_modify: Optional[Union[bool, int]] = False, ): DeltaBase.__init__(self, backbone_model, modified_modules=modified_modules, + exclude_modules=exclude_modules, unfrozen_modules=unfrozen_modules, common_structure=common_structure, interactive_modify=interactive_modify,