From 76db56c812cfb99eef9081c6797a5a4bf50a0d96 Mon Sep 17 00:00:00 2001 From: namezhenzhang Date: Fri, 29 Jul 2022 21:26:51 +0800 Subject: [PATCH] backbones and corresponding configs --- examples/examples_prompt/backbones/bart.py | 3 +- examples/examples_prompt/backbones/beit.py | 8 +- .../examples_prompt/backbones/bigbird_.py | 169 +++++++++++++++++ .../examples_prompt/backbones/blenderbot.py | 2 +- examples/examples_prompt/backbones/opt.py | 171 ++++++++++++++++++ examples/examples_prompt/backbones/vit.py | 0 .../adapter_clip-vit-base-patch32/beans.json | 48 +++++ .../configs/adapter_opt-350m/wikitext.json | 53 ++++++ .../beans.json | 53 ++++++ .../configs/bitfit_t5-large/rte.json | 51 ++++++ .../configs/compacter_blenderbot-3b/sst2.json | 66 +++++++ .../compacter_deberta-v2-xlarge/mnli.json | 51 ++++++ .../compacter_long-t5-tglobal-large/rte.json | 51 ++++++ examples/examples_prompt/configs/gen_clip.py | 2 +- .../lora_beit-large-patch16-224/cifar10.json | 52 ++++++ .../configs/lora_gpt-j-6B/wikitext.json | 52 ++++++ .../lora_roberta-large/superglue-boolq.json | 52 ++++++ .../lora_xlm-roberta-large/superglue-wic.json | 52 ++++++ .../low_rank_adapter_gpt2/wikitext.json | 52 ++++++ .../configs/prefix_bert-large-cased/rte.json | 51 ++++++ .../superglue-boolq.json | 51 ++++++ .../data_processors/processor.py | 8 +- .../examples_prompt/data_processors/tasks.py | 69 ++++++- examples/examples_prompt/metrics/metrics.py | 12 +- examples/examples_prompt/src/run.py | 13 +- examples/examples_prompt/utils/args.py | 54 +++++- 26 files changed, 1229 insertions(+), 17 deletions(-) create mode 100644 examples/examples_prompt/backbones/bigbird_.py create mode 100644 examples/examples_prompt/backbones/opt.py create mode 100644 examples/examples_prompt/backbones/vit.py create mode 100644 examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json create mode 100644 examples/examples_prompt/configs/adapter_opt-350m/wikitext.json create mode 100644 examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json create mode 100644 examples/examples_prompt/configs/bitfit_t5-large/rte.json create mode 100644 examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json create mode 100644 examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json create mode 100644 examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json create mode 100644 examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json create mode 100644 examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json create mode 100644 examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json create mode 100644 examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json create mode 100644 examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json create mode 100644 examples/examples_prompt/configs/prefix_bert-large-cased/rte.json create mode 100644 examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json diff --git a/examples/examples_prompt/backbones/bart.py b/examples/examples_prompt/backbones/bart.py index bab8303..9a30a9a 100644 --- a/examples/examples_prompt/backbones/bart.py +++ b/examples/examples_prompt/backbones/bart.py @@ -43,7 +43,8 @@ def preprocess_function(raw_example, **kwargs): def get_backbone(model_args, **kwargs): config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, + # model_args.config_name if model_args.config_name else 
model_args.model_name_or_path, + model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, diff --git a/examples/examples_prompt/backbones/beit.py b/examples/examples_prompt/backbones/beit.py index 4494fed..c35bd4e 100644 --- a/examples/examples_prompt/backbones/beit.py +++ b/examples/examples_prompt/backbones/beit.py @@ -8,7 +8,6 @@ from transformers import ( AutoFeatureExtractor, AutoModelForImageClassification, ) -from transformers import ViTFeatureExtractor from transformers import Trainer as HfTrainer import torch.nn as nn @@ -26,9 +25,10 @@ def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): def preprocess_function(raw_example, **kwargs): # from IPython import embed; embed(header="Therefa") tokenizer = kwargs['tokenizer'] - model_inputs = tokenizer(raw_example['image'], return_tensors='pt') + # print(np.array(raw_example['img']).shape) + model_inputs = tokenizer(np.array(raw_example['image']), return_tensors='pt') model_inputs['pixel_values'] = model_inputs['pixel_values'].squeeze() - model_inputs['labels'] = raw_example['labels'] + model_inputs['labels'] = raw_example['label'] return model_inputs def compute_metrics(eval_preds, dataset_name, eval_metric): @@ -55,7 +55,7 @@ def mask_token_func(tokenizer, ith_mask=0): def get_remove_columns(dataset_features): # dataset_features.pop("label") - print("remove_columns: {}".format(dataset_features)) + # print("remove_columns: {}".format(dataset_features)) return dataset_features class DataCollator(HfDataCollatorMixin): diff --git a/examples/examples_prompt/backbones/bigbird_.py b/examples/examples_prompt/backbones/bigbird_.py new file mode 100644 index 0000000..8945103 --- /dev/null +++ b/examples/examples_prompt/backbones/bigbird_.py @@ -0,0 +1,169 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import numpy as np +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, +) + +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +import copy +from torch.nn import CrossEntropyLoss + +def preprocess_function(raw_example, **kwargs): + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + + example = InputExample(**raw_example) + # example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + return model_inputs + + + +def compute_metrics(eval_preds, dataset_name, eval_metric): + pass + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.pad_token + +def get_remove_columns(dataset_features): + # dataset_features.remove("label") + return dataset_features + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + 
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + return config, tokenizer, model + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1)) + + return (loss, outputs) if return_outputs else loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + inputs = self._prepare_inputs(inputs) + with torch.no_grad(): + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().long() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu() + loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss) + + if prediction_loss_only: + return (loss, None, None) + else: + # non pad label + shift_labels = shift_labels.view(-1).detach().cpu() + nonpad_idx = shift_labels!=self.tokenizer.pad_token_id + shift_labels = shift_labels[nonpad_idx] + # the probability at the corresponding position + shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu() + target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device) + shift_logits = shift_logits.softmax(dim=-1)[target_position] + + + return (loss, shift_logits, shift_labels) + + def _compute_metrics(self, eval_preds): + + preds, labels = eval_preds + + result = {} + for metric in self.eval_task.metric: + result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result \ No newline at end of file diff --git a/examples/examples_prompt/backbones/blenderbot.py b/examples/examples_prompt/backbones/blenderbot.py index c1e8876..20702fa 100644 --- a/examples/examples_prompt/backbones/blenderbot.py +++ b/examples/examples_prompt/backbones/blenderbot.py @@ -165,7 +165,7 @@ class Trainer(HfSeq2SeqTrainer): return (loss, generated_tokens, labels) def _compute_metrics(self, eval_preds): - from IPython import embed; embed(header="In compute metrics") + # from IPython import embed; embed(header="In compute metrics") preds, labels = eval_preds decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) diff --git a/examples/examples_prompt/backbones/opt.py b/examples/examples_prompt/backbones/opt.py new file mode 100644 index 0000000..5902bc9 --- /dev/null +++ b/examples/examples_prompt/backbones/opt.py @@ -0,0 +1,171 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import numpy as np +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, +) + +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +import copy +from torch.nn import CrossEntropyLoss + +def preprocess_function(raw_example, **kwargs): + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + + example = InputExample(**raw_example) + # example = verbalizer.wrap_one_example(example) + example, other = 
template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + return model_inputs + + + +def compute_metrics(eval_preds, dataset_name, eval_metric): + pass + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.pad_token + +def get_remove_columns(dataset_features): + # dataset_features.remove("label") + return dataset_features + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="tail", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + if not hasattr(tokenizer,"pad_token") or (hasattr(tokenizer,"pad_token") and tokenizer.pad_token==None): + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + return config, tokenizer, model + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1)) + + return (loss, outputs) if return_outputs else loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. 
+ + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + inputs = self._prepare_inputs(inputs) + with torch.no_grad(): + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().long() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu() + loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss) + + if prediction_loss_only: + return (loss, None, None) + else: + # non pad label + shift_labels = shift_labels.view(-1).detach().cpu() + nonpad_idx = shift_labels!=self.tokenizer.pad_token_id + shift_labels = shift_labels[nonpad_idx] + # the probability at the corresponding position + shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu() + target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device) + shift_logits = shift_logits.softmax(dim=-1)[target_position] + + + return (loss, shift_logits, shift_labels) + + def _compute_metrics(self, eval_preds): + + preds, labels = eval_preds + + result = {} + for metric in self.eval_task.metric: + result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result \ No newline at end of file diff --git a/examples/examples_prompt/backbones/vit.py b/examples/examples_prompt/backbones/vit.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json b/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json new file mode 100644 index 0000000..5f46495 --- /dev/null +++ b/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json @@ -0,0 +1,48 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "beans", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32", + "num_classes": 3, + "num_train_epochs": 20, + "output_dir": "outputs/adapter/clip-vit-base-patch32/beans", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_delta_center": true, + "push_to_hub": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": 
true, + "task_name": "beans", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "beans", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json b/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json new file mode 100644 index 0000000..af141ff --- /dev/null +++ b/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json @@ -0,0 +1,53 @@ +{ + "backbone_model": "opt", + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 200, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":2, + "greater_is_better": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 900, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m", + "model_path_public": "opt-350m", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/opt-350m/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 6, + "per_device_train_batch_size": 6, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["self_attn"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json b/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json new file mode 100644 index 0000000..ff7551a --- /dev/null +++ b/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json @@ -0,0 +1,53 @@ +{ + "backbone_model": "vit", + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": false, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "beans", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k", + "model_path_public": "vit-large-patch16-224-in21k", + "num_classes": 3, + "num_train_epochs": 20, + "output_dir": "outputs/adapter/vit-large-patch16-224-in21k/beans", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "beans", + 
"test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "beans", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["output"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/bitfit_t5-large/rte.json b/examples/examples_prompt/configs/bitfit_t5-large/rte.json new file mode 100644 index 0000000..04e7f77 --- /dev/null +++ b/examples/examples_prompt/configs/bitfit_t5-large/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "t5-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/t5-large", + "model_path_public": "t5-large", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-large/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/t5-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attn", "ff", "layer_norm"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json b/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json new file mode 100644 index 0000000..2862f6e --- /dev/null +++ b/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json @@ -0,0 +1,66 @@ +{ + "backbone_model": "blenderbot", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "factorized_phm": true, + "factorized_phm_rule": false, + "gradient_clip": false, + "greater_is_better": true, + "hypercomplex_adapters": true, + "hypercomplex_division": 4, + "hypercomplex_nonlinearity": "glorot-uniform", + "learn_phm": true, + "learning_rate": 0.003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b", + "model_path_public": "blenderbot-3b", + "non_linearity": "gelu_new", + "normalize_phm_weight": false, + "num_train_epochs": 3, + "output_dir": "outputs/compacter/blenderbot-3b/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "phm_c_init": "normal", + "phm_clamp": false, + "phm_init_range": 0.0001, + "predict_with_generate": true, + "push_to_dc": true, + 
"push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "shared_phm_rule": false, + "split_validation_test": true, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "use_bias_down_sampler": true, + "use_bias_up_sampler": true, + "warmup_steps": 0, + "modified_modules":["fc2"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json b/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json new file mode 100644 index 0000000..23c38d7 --- /dev/null +++ b/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "deberta-v2-xlarge", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 500, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge", + "num_train_epochs": 3, + "output_dir": "outputs/compacter/deberta-v2-xlarge/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attention"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json b/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json new file mode 100644 index 0000000..eb3d7c1 --- /dev/null +++ b/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "long-t5", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large", + "model_path_public": "long-t5-tglobal-large", + "num_train_epochs": 20, + "output_dir": "outputs/compacter/long-t5-tglobal-large/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": 
false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attn", "ff", "layer_norm"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/gen_clip.py b/examples/examples_prompt/configs/gen_clip.py index e7cb94d..41a59c5 100644 --- a/examples/examples_prompt/configs/gen_clip.py +++ b/examples/examples_prompt/configs/gen_clip.py @@ -2,7 +2,7 @@ import collections import copy PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" -PATHBASE="/home/hushengding/plm_cache/" +# PATHBASE="/home/hushengding/plm_cache/" AllConfigs = {} diff --git a/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json b/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json new file mode 100644 index 0000000..1a4d789 --- /dev/null +++ b/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "beit", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cifar10", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224", + "model_path_public": "beit-large-patch16-224", + "num_classes": 10, + "num_train_epochs": 20, + "output_dir": "outputs/lora/beit-large-patch16-224/cifar10", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "cifar10", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "cifar10", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["query","value"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json b/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json new file mode 100644 index 0000000..11ebfde --- /dev/null +++ b/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "gpt-j", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 500, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":4, + "greater_is_better": false, + "learning_rate": 0.00003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": 
"average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B", + "model_path_public": "gpt-j-6B", + "num_train_epochs": 2, + "output_dir": "outputs/lora/gpt-j-6B/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 2, + "per_device_train_batch_size": 2, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["20.attn.q_proj","21.attn.q_proj","22.attn.q_proj","23.attn.q_proj","24.attn.q_proj","25.attn.q_proj","26.attn.q_proj","27.attn.q_proj"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json b/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json new file mode 100644 index 0000000..9ef9cff --- /dev/null +++ b/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "roberta-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0001, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large", + "model_path_public": "roberta-large", + "num_train_epochs": 20, + "output_dir": "outputs/lora/roberta-large/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_hub": false, + "push_to_dc": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["query","value"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json b/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json new file mode 100644 index 0000000..35a42f1 --- /dev/null +++ b/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "xlm-roberta-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wic", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, 
+ "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large", + "model_path_public": "xlm-roberta-large", + "num_train_epochs": 20, + "output_dir": "outputs/lora/xlm-roberta-large/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-wic", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wic", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["query","value"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json b/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json new file mode 100644 index 0000000..3a60852 --- /dev/null +++ b/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "gpt2", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "low_rank_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 200, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":1, + "greater_is_better": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 768, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt2", + "model_path_public": "gpt2", + "num_train_epochs": 2, + "output_dir": "outputs/low_rank_adapter/gpt2/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt2", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attn","mlp"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json b/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json new file mode 100644 index 0000000..5d67563 --- /dev/null +++ b/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "bert-large-cased", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + 
"metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/bert-large-cased/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attention"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json b/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json new file mode 100644 index 0000000..19cbbba --- /dev/null +++ b/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "bart", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "soft_prompt", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 500, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":1, + "greater_is_better": true, + "learning_rate": 0.1, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bart-large", + "model_path_public": "bart-large", + "num_train_epochs": 50, + "output_dir": "outputs/soft_prompt/bart-large/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "soft_token_num":100, + "split_validation_test": true, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bart-large", + "token_init": true, + "unfrozen_modules": [ + "deltas" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_prompt/data_processors/processor.py b/examples/examples_prompt/data_processors/processor.py index 035bc5d..9986100 100644 --- a/examples/examples_prompt/data_processors/processor.py +++ b/examples/examples_prompt/data_processors/processor.py @@ -93,4 +93,10 @@ class AbstractTask(abc.ABC): # shuffles the data and samples it. 
         if n_obs is not None:
             dataset = self.subsample(dataset, n_obs)
-        return dataset.map(self.preprocessor)
+
+        this_method = getattr(self.__class__, 'preprocessor')
+        base_method = getattr(AbstractTask, 'preprocessor')
+        if this_method is not base_method:
+            return dataset.map(self.preprocessor)
+        else:
+            return dataset
diff --git a/examples/examples_prompt/data_processors/tasks.py b/examples/examples_prompt/data_processors/tasks.py
index 55048fe..044f419 100644
--- a/examples/examples_prompt/data_processors/tasks.py
+++ b/examples/examples_prompt/data_processors/tasks.py
@@ -545,12 +545,74 @@ class Beans(AbstractTask):
     def load_dataset(self, split):
         # from IPython import embed; embed(header="beans")
         if self.data_args.datasets_load_from_disk:
-            return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split]
+            return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/beans")[split]
         else:
             return datasets.load_dataset('beans', split=split, script_version="master")
 
+class Wikitext(AbstractTask):
+    # Uses the wikitext-2-v1 configuration.
+    name = "wikitext"
+    split_to_data_split = {"train": "train",
+                           "validation": "validation",
+                           "test": "validation"}
+    metric = [metrics.perplexity]
+    metric_names = ["perplexity"]
+    # Language modeling needs no verbalizer; the template is the raw text field.
+    verbalizers = {
+        "0": {}
+    }
+    templates_text = {
+        "0": """{"meta":"text"}"""
+    }
+    split_valid_to_make_test = True
+
+    def load_dataset(self, split):
+        if self.data_args.datasets_load_from_disk:
+            return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/wikitext")[split]
+        else:
+            return datasets.load_dataset('wikitext', 'wikitext-2-v1', split=split, script_version="master")
+
+class Cifar10(AbstractTask):
+    name = "cifar10"
+    split_to_data_split = {"train": "train",
+                           "validation": "test",
+                           "test": "test"}
+    metric = [metrics.accuracy]
+    metric_names = ["accuracy"]
+
+    def load_dataset(self, split):
+        if self.data_args.datasets_load_from_disk:
+            return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/cifar10")[split]
+        else:
+            return datasets.load_dataset('cifar10', split=split, script_version="master")
+
+class Fashion_MNIST(AbstractTask):
+    name = "Fashion-MNIST"
+    split_to_data_split = {"train": "train",
+                           "validation": "test",
+                           "test": "test"}
+    metric = [metrics.accuracy]
+    metric_names = ["accuracy"]
+
+    def load_dataset(self, split):
+        if self.data_args.datasets_load_from_disk:
+            return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/fashion_mnist")[split]
+        else:
+            return datasets.load_dataset('fashion_mnist', split=split, script_version="master")
 
 TASK_MAPPING = OrderedDict(
     [
@@ -570,7 +632,10 @@
     ('superglue-multirc', SuperGLUEMultiRC),
     ('superglue-wic', SuperGLUEWIC),
     # ('superglue-record', SuperGLUERecord)
-    ('beans', Beans)
+    ('beans', Beans),
+    ('wikitext', Wikitext),
+    ('cifar10', Cifar10),
+    ('fashion_mnist', Fashion_MNIST)
     ]
 )
diff --git a/examples/examples_prompt/metrics/metrics.py b/examples/examples_prompt/metrics/metrics.py
index 1b8125b..94267b0 100644
--- a/examples/examples_prompt/metrics/metrics.py
+++ b/examples/examples_prompt/metrics/metrics.py
@@ -11,6 +11,14 @@ import sklearn.metrics
 
 logger = getLogger(__name__)
 
+def perplexity(outputs, targets, ignore_index=-100):
+    """Computes perplexity from the probabilities of the gold tokens.
+
+    `outputs` holds the probability the model assigned to each non-pad target
+    token (see prediction_step in the opt/bigbird backbones); `targets` and
+    `ignore_index` are kept only for a uniform metric interface.
+    """
+    ce = -np.log(outputs).mean()
+    # Equivalent computation starting from raw logits:
+    # ce = F.cross_entropy(torch.Tensor(outputs).view(-1, outputs.shape[-1]), torch.Tensor(targets).view(-1).long(), ignore_index=ignore_index)
+    return {"perplexity": float(np.exp(ce))}
+
 def accuracy(predictions, targets) -> dict:
     """Computes the average accuracy."""
     return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())}
@@ -102,8 +110,8 @@ def f1_score(predictions, targets) -> dict:
     Returns:
         F1 score, where any prediction != 0 or 1 is counted as wrong.
     """
-    targets = targets.astype(np.int32)
-    predictions = predictions.astype(np.int32)
+    targets = np.array(targets).astype(np.int32)
+    predictions = np.array(predictions).astype(np.int32)
     return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}
 
 # TODO: maybe guard against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
diff --git a/examples/examples_prompt/src/run.py b/examples/examples_prompt/src/run.py
index 81a3de2..aca5210 100644
--- a/examples/examples_prompt/src/run.py
+++ b/examples/examples_prompt/src/run.py
@@ -31,6 +31,7 @@ os.environ['MKL_THREADING_LAYER'] = 'GNU'
 os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 sys.path.append(os.path.join(os.getcwd(), "../"))
+# sys.path.append(os.path.join(os.getcwd(), "/mnt/sfs_turbo/zhangzhen/OpenDelta"))
 sys.path.append(os.path.join(os.getcwd()))
 
 import functools
@@ -120,7 +121,8 @@ def main():
 
-    if os.path.basename(model_args.model_name_or_path).startswith("t5"):
+    if os.path.basename(model_args.model_name_or_path).startswith("t5") \
+        or os.path.basename(model_args.model_name_or_path).startswith("long-t5"):
         from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
         from examples_prompt.backbones.t5 import Trainer, DataCollator
     elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"):
@@ -128,7 +130,9 @@ def main():
         from examples_prompt.backbones.blenderbot import Trainer, DataCollator
     elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \
         or os.path.basename(model_args.model_name_or_path).startswith("bert") \
-        or os.path.basename(model_args.model_name_or_path).startswith("albert"):
+        or os.path.basename(model_args.model_name_or_path).startswith("albert") \
+        or os.path.basename(model_args.model_name_or_path).startswith("xlm-roberta") \
+        or os.path.basename(model_args.model_name_or_path).startswith("deberta"):
         from examples_prompt.backbones.bert import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
         from examples_prompt.backbones.bert import Trainer, DataCollator
     elif os.path.basename(model_args.model_name_or_path).startswith("beit"):
@@ -143,6 +147,10 @@ def main():
     elif os.path.basename(model_args.model_name_or_path).startswith("clip"):
         from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
         from examples_prompt.backbones.clip import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("opt") \
+        or os.path.basename(model_args.model_name_or_path).startswith("gpt"):
+        from examples_prompt.backbones.opt import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.opt import Trainer, DataCollator
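Note: GPT-2 and GPT-J deliberately route through backbones/opt.py here, since all three families share the same decoder-only causal-LM path. As this prefix chain grows, a table-driven dispatch is one alternative; a sketch only, with a hypothetical helper that is not part of the patch:

import os

# Map basename prefixes to backbone modules; str.startswith accepts a tuple,
# so each model family needs a single entry.
BACKBONE_BY_PREFIX = {
    ("t5", "long-t5"): "t5",
    ("blenderbot",): "blenderbot",
    ("roberta", "bert", "albert", "xlm-roberta", "deberta"): "bert",
    ("opt", "gpt"): "opt",  # gpt2 / gpt-j reuse the decoder-only opt backbone
    # beit / bit / vit / clip entries omitted for brevity
}

def backbone_module_name(model_name_or_path):
    base = os.path.basename(model_name_or_path)
    for prefixes, module in BACKBONE_BY_PREFIX.items():
        if base.startswith(prefixes):
            return module
    raise ValueError(f"no backbone registered for {base}")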
@@ -329,6 +337,7 @@ def main():
                 list_tags = ['NLI'],
                 dict_tags = {'purpose':'for testing'},
                 delay_push=True,
+                test_result=all_results['test']
             )
diff --git a/examples/examples_prompt/utils/args.py b/examples/examples_prompt/utils/args.py
index 7d20750..c898a66 100644
--- a/examples/examples_prompt/utils/args.py
+++ b/examples/examples_prompt/utils/args.py
@@ -256,10 +256,57 @@ class AdapterArguments:
     bottleneck_dim: Optional[int] = field(
         default=24, metadata={"help": "the dimension of the bottleneck layer"}
     )
+
+@dataclass
+class LoRAArguments:
+    lora_r: Optional[int] = field(
+        default=8, metadata={"help": "the rank of the LoRA matrices."}
+    )
+
+@dataclass
+class PrefixArguments:
+    pass
+
+@dataclass
+class BitFitArguments:
+    pass
+
+@dataclass
+class SoftPromptArguments:
+    soft_token_num: Optional[int] = field(
+        default=100, metadata={"help": "the number of soft tokens."}
+    )
+
+@dataclass
+class CompacterArguments:
+    pass
+
+@dataclass
+class LowRankAdapterArguments:
+    pass
+
+# from opendelta.delta_models.adapter import AdapterConfig
+# from opendelta.delta_models.bitfit import BitFitConfig
+# from opendelta.delta_models.compacter import CompacterConfig
+# from opendelta.delta_models.lora import LoraArguments
+# from opendelta.delta_models.low_rank_adapter import LowRankAdapterConfig
+# from opendelta.delta_models.prefix import PrefixConfig
+# from opendelta.delta_models.soft_prompt import SoftPromptConfig
+# DELTAARGMAP = {
+#     "adapter": AdapterConfig,
+#     "lora": LoraArguments,
+#     "prefix": PrefixConfig,
+#     "bitfit": BitFitConfig,
+#     "soft_prompt": SoftPromptConfig,
+#     "compacter": CompacterConfig,
+#     "low_rank_adapter": LowRankAdapterConfig
+# }
 
 DELTAARGMAP = {
-    "adapter": AdapterArguments
+    "adapter": AdapterArguments,
+    "lora": LoRAArguments,
+    "prefix": PrefixArguments,
+    "bitfit": BitFitArguments,
+    "soft_prompt": SoftPromptArguments,
+    "compacter": CompacterArguments,
+    "low_rank_adapter": LowRankAdapterArguments
 }
 
 # TODO: add more specific delta arguments
@@ -310,13 +357,14 @@ class RemainArgHfArgumentParser(HfArgumentParser):
             for d in outputs:
                 if isinstance(d, DeltaArguments): # merge the specific delta arguments
                     d.merge_arguments(outputs[-1])
-            return *(outputs[:-1]), remain_args
+
+            return [*(outputs[:-1]), remain_args]
         else:
             outputs = self.parse_args_into_dataclasses(args=data_list, return_remaining_strings=return_remaining_args)
             for d in outputs:
                 if isinstance(d, DeltaArguments):
                     d.merge_arguments(outputs[-1])
-            return (*(outputs[:-1]),)
+            return [*(outputs[:-1])]
 
     def parse_args_into_dataclasses(
         self, args=None, return_remaining_strings=False, look_for_args_file=True, args_filename=None
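Note on the decoder-only evaluation path added in backbones/opt.py and backbones/bigbird_.py: prediction_step keeps, for every non-pad position, the softmax probability the model assigned to the gold next token, and metrics.perplexity is exp of the mean negative log of those probabilities. A minimal self-contained sketch of that pipeline follows; the toy shapes and helper names are illustrative, not part of the patch:

import numpy as np
import torch
from torch.nn import CrossEntropyLoss

def shifted_loss_and_target_probs(logits, input_ids, pad_token_id):
    # Next-token prediction: position t predicts token t+1, so align by
    # dropping the last logit and the first label.
    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = input_ids[..., 1:].contiguous().long()

    loss_fct = CrossEntropyLoss(ignore_index=pad_token_id)
    loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]),
                    shift_labels.view(-1))

    # Keep non-pad positions, then read off the probability assigned to
    # each gold token; these are what feed metrics.perplexity at eval time.
    flat_labels = shift_labels.view(-1)
    nonpad = flat_labels != pad_token_id
    probs = shift_logits.view(-1, shift_logits.shape[-1])[nonpad].softmax(dim=-1)
    one_hot = torch.nn.functional.one_hot(flat_labels[nonpad], probs.shape[-1])
    return loss, probs[one_hot.bool()]

def perplexity(target_probs):
    # exp of the mean negative log-probability, as in metrics/metrics.py.
    return float(np.exp(-np.log(np.asarray(target_probs)).mean()))

# Toy usage: random "model output" over a 100-token vocabulary, pad id 1.
logits = torch.randn(2, 8, 100)
input_ids = torch.randint(2, 100, (2, 8))
loss, target_probs = shifted_loss_and_target_probs(logits, input_ids, pad_token_id=1)
print(loss.item(), perplexity(target_probs.numpy()))

One caveat worth noting: compute_loss in both backbones copies input_ids into labels without masking, relying solely on CrossEntropyLoss(ignore_index=pad_token_id). That works, but when the tokenizer has no pad token and opt.py's get_backbone sets tokenizer.pad_token = tokenizer.eos_token (as for GPT-2), genuine eos positions are also excluded from the loss and from the perplexity statistics.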