diff --git a/.gitignore b/.gitignore index 9bb4b23..cba2a5c 100644 --- a/.gitignore +++ b/.gitignore @@ -35,4 +35,21 @@ log.txt **/examples/examples_bmtrain/BMPretrain **/examples/examples_bmtrain/BigModels/BigModels/results **/Delta_Memory/ +**/output/ +**/thunlp/ +**/saved_ckpts/ + +DeltaCenter-Python-Client/ +backbone_structure +delta_checkpoints +gitop.sh +load_dataset_and_model.ipynb +load_model.py +scripts +t.py +t.sh +!examples/examples_prompt/configs/*/*.json +!examples/examples_prompt/configs/** +**/delta_checkpoints/ +**/outputs/ diff --git a/README.md b/README.md index 1f84490..8739407 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,11 @@ python setup.py install python setup.py develop ``` +If you encounter a network error when using setup.py, please first install the dependencies via +```shell +pip install -r requirements.txt && python setup.py develop +``` + ## Must Try ```python diff --git a/dist/opendelta-0.2.0-py3-none-any.whl b/dist/opendelta-0.2.0-py3-none-any.whl new file mode 100644 index 0000000..c00ffc5 Binary files /dev/null and b/dist/opendelta-0.2.0-py3-none-any.whl differ diff --git a/dist/opendelta-0.2.0.tar.gz b/dist/opendelta-0.2.0.tar.gz new file mode 100644 index 0000000..c7468a2 Binary files /dev/null and b/dist/opendelta-0.2.0.tar.gz differ diff --git a/dist/opendelta-0.2.1-py3-none-any.whl b/dist/opendelta-0.2.1-py3-none-any.whl new file mode 100644 index 0000000..6fbe1ca Binary files /dev/null and b/dist/opendelta-0.2.1-py3-none-any.whl differ diff --git a/dist/opendelta-0.2.1.tar.gz b/dist/opendelta-0.2.1.tar.gz new file mode 100644 index 0000000..a915207 Binary files /dev/null and b/dist/opendelta-0.2.1.tar.gz differ diff --git a/dist/opendelta-0.2.2-py3-none-any.whl b/dist/opendelta-0.2.2-py3-none-any.whl new file mode 100644 index 0000000..f0d580e Binary files /dev/null and b/dist/opendelta-0.2.2-py3-none-any.whl differ diff --git a/dist/opendelta-0.2.2.tar.gz b/dist/opendelta-0.2.2.tar.gz new file mode 100644 index 0000000..5400092 Binary files /dev/null and b/dist/opendelta-0.2.2.tar.gz differ diff --git a/dist/opendelta-0.2.3-py3-none-any.whl b/dist/opendelta-0.2.3-py3-none-any.whl new file mode 100644 index 0000000..ac3e3d9 Binary files /dev/null and b/dist/opendelta-0.2.3-py3-none-any.whl differ diff --git a/dist/opendelta-0.2.3.tar.gz b/dist/opendelta-0.2.3.tar.gz new file mode 100644 index 0000000..2a2d57e Binary files /dev/null and b/dist/opendelta-0.2.3.tar.gz differ diff --git a/dist/opendelta-0.2.4-py3-none-any.whl b/dist/opendelta-0.2.4-py3-none-any.whl new file mode 100644 index 0000000..6f82355 Binary files /dev/null and b/dist/opendelta-0.2.4-py3-none-any.whl differ diff --git a/dist/opendelta-0.2.4.tar.gz b/dist/opendelta-0.2.4.tar.gz new file mode 100644 index 0000000..456ad09 Binary files /dev/null and b/dist/opendelta-0.2.4.tar.gz differ diff --git a/examples/examples_prompt/README.md b/examples/examples_prompt/README.md index e9d5249..d6b3329 100644 --- a/examples/examples_prompt/README.md +++ b/examples/examples_prompt/README.md @@ -1,24 +1,59 @@ -# !!!!This example collection is still under develop, please wait for some time to use it. +# Examples of using OpenDelta together with 🤗 transformers -## install the repo +In this repo, we construct a very general pipeline to train and test a PLM using +🤗 transformers. + +The pipeline was constructed together with [openpromptu](https://pypi.org/project/openpromptu/), which is a lightweight and +model-agnostic version of [openprompt](https://github.com/thunlp/OpenPrompt). 
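+
+As a rough mental model, everything below boils down to attaching a parameter-efficient delta module to a frozen 🤗 transformers backbone. A minimal sketch of that core step (the argument values are illustrative and assume the standard OpenDelta interface; the scripts below are the actual entry points):
+
+```python
+from transformers import AutoModelForSeq2SeqLM
+from opendelta import AdapterModel
+
+# load a backbone PLM from 🤗 transformers
+backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+
+# attach an adapter delta (bottleneck_dim=24 matches the example configs in this repo)
+delta_model = AdapterModel(backbone_model=backbone, bottleneck_dim=24)
+
+# freeze everything except the delta parameters and layer norms, then inspect the result
+delta_model.freeze_module(exclude=["deltas", "layer_norm"])
+delta_model.log()
+```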
+ +## Pool of PLMs +We are going to adapt most of the models in 🤗 transformers +in this repo. The different pipelines, processing functions, and configurations are specified +in `./backbones/`. You can add your own model in this directory to support customized models. + + +### An example script to run the repo in offline mode ```bash -cd ../ -python setup_seq2seq.py develop +conda activate [YOURENV] +PATHBASE=[YOURPATH] + +JOBNAME="adapter_t5-base" +DATASET="superglue-cb" + +cd $PATHBASE/OpenDelta/examples/examples_prompt/ +python configs/gen_t5.py --job $JOBNAME + +export TRANSFORMERS_OFFLINE=1 +export HF_DATASETS_OFFLINE=1 +python src/run.py configs/$JOBNAME/$DATASET.json \ +--model_name_or_path [YOURPATH_TO_T5_BASE] \ +--tokenizer_name [YOURPATH_TO_T5_BASE] \ +--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \ +--finetuned_delta_path ${PATHBASE}/delta_checkpoints/ \ +--num_train_epochs 20 \ +--bottleneck_dim 24 \ +--delay_push True ``` -This will add `examples_seq2seq` to the environment path of the python lib. -## Generating the json configuration file +## An example of quickly testing the repo -```shell -python configs/gen_$BACKBONETYPE.py --job $YOURJOB -#e.g. python configs/gen_beit.py --job lora_beit-base-patch16-224 -``` -The available job configuration (e.g., `--job lora_beit-base-patch16-224`) can be seen from the scripts. You can also -create your only configuration. +```bash +conda activate [YOURENV] +PATHBASE=[YOURPATH] +JOBNAME="adapter_t5-base" +DATASET="superglue-cb" -## Run the code +cd $PATHBASE/OpenDelta/examples/examples_prompt/ -``` -CUDA_VISIBLE_DEVICES=1 python src/run.py configs/lora_beit-base-patch16-224/beans.json -``` +export TRANSFORMERS_OFFLINE=1 +export HF_DATASETS_OFFLINE=1 +export DELTACENTER_OFFLINE=0 +python src/test.py configs/$JOBNAME/$DATASET.json \ +--model_name_or_path [YOURPATH_TO_T5_BASE] \ +--tokenizer_name [YOURPATH_TO_T5_BASE] \ +--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \ +--finetuned_delta_path thunlp/t5-base_adapter_superglue-cb_20220701171436c80 \ +--delta_cache_dir "./delta_checkpoints/" \ +--force_download True +``` \ No newline at end of file diff --git a/examples/examples_prompt/backbones/bart.py b/examples/examples_prompt/backbones/bart.py index bab8303..6b9dd92 100644 --- a/examples/examples_prompt/backbones/bart.py +++ b/examples/examples_prompt/backbones/bart.py @@ -26,14 +26,14 @@ def preprocess_function(raw_example, **kwargs): example = InputExample(**raw_example) - try: - example = verbalizer.wrap_one_example(example) - example, other = template.wrap_one_example(example) - input_sentence = tokenizer_wrapper.merge_wrapped_example(example) - model_inputs = tokenizer(input_sentence, max_length=256, - padding="max_length", truncation=True) - except: - from IPython import embed; embed(header="Therer") + + example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=256, + padding="max_length", truncation=True) + + with tokenizer.as_target_tokenizer(): label = tokenizer(other['tgt_text']).input_ids @@ -43,7 +43,8 @@ def preprocess_function(raw_example, **kwargs): def get_backbone(model_args, **kwargs): config = AutoConfig.from_pretrained( - model_args.config_name if model_args.config_name else model_args.model_name_or_path, + # model_args.config_name if model_args.config_name else model_args.model_name_or_path, + model_args.model_name_or_path, cache_dir=model_args.cache_dir, 
revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, diff --git a/examples/examples_prompt/backbones/beit.py b/examples/examples_prompt/backbones/beit.py index 4494fed..c35bd4e 100644 --- a/examples/examples_prompt/backbones/beit.py +++ b/examples/examples_prompt/backbones/beit.py @@ -8,7 +8,6 @@ from transformers import ( AutoFeatureExtractor, AutoModelForImageClassification, ) -from transformers import ViTFeatureExtractor from transformers import Trainer as HfTrainer import torch.nn as nn @@ -26,9 +25,10 @@ def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): def preprocess_function(raw_example, **kwargs): # from IPython import embed; embed(header="Therefa") tokenizer = kwargs['tokenizer'] - model_inputs = tokenizer(raw_example['image'], return_tensors='pt') + # print(np.array(raw_example['img']).shape) + model_inputs = tokenizer(np.array(raw_example['image']), return_tensors='pt') model_inputs['pixel_values'] = model_inputs['pixel_values'].squeeze() - model_inputs['labels'] = raw_example['labels'] + model_inputs['labels'] = raw_example['label'] return model_inputs def compute_metrics(eval_preds, dataset_name, eval_metric): @@ -55,7 +55,7 @@ def mask_token_func(tokenizer, ith_mask=0): def get_remove_columns(dataset_features): # dataset_features.pop("label") - print("remove_columns: {}".format(dataset_features)) + # print("remove_columns: {}".format(dataset_features)) return dataset_features class DataCollator(HfDataCollatorMixin): diff --git a/examples/examples_prompt/backbones/bigbird_.py b/examples/examples_prompt/backbones/bigbird_.py new file mode 100644 index 0000000..8945103 --- /dev/null +++ b/examples/examples_prompt/backbones/bigbird_.py @@ -0,0 +1,169 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import numpy as np +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, +) + +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +import copy +from torch.nn import CrossEntropyLoss + +def preprocess_function(raw_example, **kwargs): + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + + example = InputExample(**raw_example) + # example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + return model_inputs + + + +def compute_metrics(eval_preds, dataset_name, eval_metric): + pass + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.pad_token + +def get_remove_columns(dataset_features): + # dataset_features.remove("label") + return dataset_features + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None) + 
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + return config, tokenizer, model + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1)) + + return (loss, outputs) if return_outputs else loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). 
+ """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + inputs = self._prepare_inputs(inputs) + with torch.no_grad(): + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().long() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu() + loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss) + + if prediction_loss_only: + return (loss, None, None) + else: + # non pad label + shift_labels = shift_labels.view(-1).detach().cpu() + nonpad_idx = shift_labels!=self.tokenizer.pad_token_id + shift_labels = shift_labels[nonpad_idx] + # the probability at the corresponding position + shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu() + target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device) + shift_logits = shift_logits.softmax(dim=-1)[target_position] + + + return (loss, shift_logits, shift_labels) + + def _compute_metrics(self, eval_preds): + + preds, labels = eval_preds + + result = {} + for metric in self.eval_task.metric: + result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result \ No newline at end of file diff --git a/examples/examples_prompt/backbones/blenderbot.py b/examples/examples_prompt/backbones/blenderbot.py index c1e8876..54e4ec8 100644 --- a/examples/examples_prompt/backbones/blenderbot.py +++ b/examples/examples_prompt/backbones/blenderbot.py @@ -26,14 +26,13 @@ def preprocess_function(raw_example, **kwargs): example = InputExample(**raw_example) - try: - example = verbalizer.wrap_one_example(example) - example, other = template.wrap_one_example(example) - input_sentence = tokenizer_wrapper.merge_wrapped_example(example) - model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, - padding="max_length", truncation=True) - except: - from IPython import embed; embed(header="Therer") + + example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + with tokenizer.as_target_tokenizer(): label = tokenizer(other['tgt_text']).input_ids @@ -165,7 +164,7 @@ class Trainer(HfSeq2SeqTrainer): return (loss, generated_tokens, labels) def _compute_metrics(self, eval_preds): - from IPython import embed; embed(header="In compute metrics") + # from IPython import embed; embed(header="In compute metrics") preds, labels = eval_preds decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) diff --git a/examples/examples_prompt/backbones/opt.py b/examples/examples_prompt/backbones/opt.py new file mode 100644 index 0000000..5902bc9 --- /dev/null +++ b/examples/examples_prompt/backbones/opt.py @@ -0,0 +1,171 @@ +from openpromptu.data_utils import InputExample 
+import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import numpy as np +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, +) + +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +import copy +from torch.nn import CrossEntropyLoss + +def preprocess_function(raw_example, **kwargs): + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + + example = InputExample(**raw_example) + # example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + return model_inputs + + + +def compute_metrics(eval_preds, dataset_name, eval_metric): + pass + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.pad_token + +def get_remove_columns(dataset_features): + # dataset_features.remove("label") + return dataset_features + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = None, label_words=None) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="tail", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + if not hasattr(tokenizer,"pad_token") or (hasattr(tokenizer,"pad_token") and tokenizer.pad_token==None): + tokenizer.pad_token = tokenizer.eos_token + + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + return config, tokenizer, model + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + 
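# shift so that the logits at position i are scored against input token i+1 (standard causal-LM next-token loss) +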
shift_labels = labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.long().view(-1)) + + return (loss, outputs) if return_outputs else loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + inputs = self._prepare_inputs(inputs) + with torch.no_grad(): + labels=copy.deepcopy(inputs['input_ids']) + # labels[labels==self.tokenizer.pad_token_id]=-100 + outputs = model(**inputs) + logits = outputs.logits + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().long() + loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) + loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1)).detach().cpu() + loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss) + + if prediction_loss_only: + return (loss, None, None) + else: + # non pad label + shift_labels = shift_labels.view(-1).detach().cpu() + nonpad_idx = shift_labels!=self.tokenizer.pad_token_id + shift_labels = shift_labels[nonpad_idx] + # the probability at the corresponding position + shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu() + target_position = torch.nn.functional.one_hot(shift_labels,shift_logits.shape[-1]).bool().to(shift_labels.device) + shift_logits = shift_logits.softmax(dim=-1)[target_position] + + + return (loss, shift_logits, shift_labels) + + def _compute_metrics(self, eval_preds): + + preds, labels = eval_preds + + result = {} + for metric in self.eval_task.metric: + result.update(metric(preds, labels,ignore_index=self.tokenizer.pad_token_id)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result \ No newline at end of file diff --git a/examples/examples_prompt/backbones/t5.py b/examples/examples_prompt/backbones/t5.py index 7a6edf0..15e7f21 100644 --- a/examples/examples_prompt/backbones/t5.py +++ b/examples/examples_prompt/backbones/t5.py @@ -26,14 +26,13 @@ def preprocess_function(raw_example, **kwargs): example = InputExample(**raw_example) - try: - example = verbalizer.wrap_one_example(example) - example, other = template.wrap_one_example(example) - input_sentence = tokenizer_wrapper.merge_wrapped_example(example) - model_inputs = tokenizer(input_sentence, max_length=256, - padding="max_length", 
truncation=True) - except: - from IPython import embed; embed(header="Therer") + + example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=256, + padding="max_length", truncation=True) + with tokenizer.as_target_tokenizer(): label = tokenizer(other['tgt_text']).input_ids diff --git a/examples/examples_seq2seq/__init__.py b/examples/examples_prompt/backbones/vit.py similarity index 100% rename from examples/examples_seq2seq/__init__.py rename to examples/examples_prompt/backbones/vit.py diff --git a/examples/examples_prompt/collect_result.jsonl b/examples/examples_prompt/collect_result.jsonl deleted file mode 100644 index 990a2d9..0000000 --- a/examples/examples_prompt/collect_result.jsonl +++ /dev/null @@ -1,59 +0,0 @@ -# the final results will be populated here.{ - "evaluate": { - "epoch": 20.0, - "eval_accuracy": 89.2156862745098, - "eval_average_metrics": 90.76168929110105, - "eval_f1": 92.3076923076923, - "eval_loss": 0.16493959724903107, - "eval_runtime": 1.6391, - "eval_samples_per_second": 124.455 - }, - "repo_name": "DeltaHub/bitfit_t5-base_mrpc", - "test": { - "epoch": 20.0, - "test_accuracy": 88.23529411764706, - "test_average_metrics": 89.97971602434077, - "test_f1": 91.72413793103448, - "test_loss": 0.14968213438987732, - "test_runtime": 1.6344, - "test_samples_per_second": 124.82 - } -} -{ - "evaluate": { - "epoch": 20.0, - "eval_average_metrics": 52.10265668831534, - "eval_loss": 0.3603779077529907, - "eval_matthews_correlation": 52.10265668831534, - "eval_runtime": 1.0808, - "eval_samples_per_second": 482.046 - }, - "repo_name": "DeltaHub/bitfit_t5-base_cola", - "test": { - "epoch": 20.0, - "test_average_metrics": 54.209563471221934, - "test_loss": 0.2853100299835205, - "test_matthews_correlation": 54.209563471221934, - "test_runtime": 1.056, - "test_samples_per_second": 494.304 - } -} -{ - "evaluate": { - "epoch": 20.0, - "eval_average_metrics": 53.80613287067274, - "eval_loss": 0.25723716616630554, - "eval_matthews_correlation": 53.80613287067274, - "eval_runtime": 1.0583, - "eval_samples_per_second": 492.299 - }, - "repo_name": "DeltaHub/bitfit_t5-base_cola", - "test": { - "epoch": 20.0, - "test_average_metrics": 54.32497579543861, - "test_loss": 0.22327613830566406, - "test_matthews_correlation": 54.32497579543861, - "test_runtime": 1.0556, - "test_samples_per_second": 494.507 - } -} diff --git a/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json b/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json new file mode 100644 index 0000000..5f46495 --- /dev/null +++ b/examples/examples_prompt/configs/adapter_clip-vit-base-patch32/beans.json @@ -0,0 +1,48 @@ +{ + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "beans", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32", + "num_classes": 3, + "num_train_epochs": 20, + "output_dir": "outputs/adapter/clip-vit-base-patch32/beans", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + 
"per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_delta_center": true, + "push_to_hub": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "beans", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "beans", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json b/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json new file mode 100644 index 0000000..af141ff --- /dev/null +++ b/examples/examples_prompt/configs/adapter_opt-350m/wikitext.json @@ -0,0 +1,53 @@ +{ + "backbone_model": "opt", + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 200, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":2, + "greater_is_better": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 900, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m", + "model_path_public": "opt-350m", + "num_train_epochs": 3, + "output_dir": "outputs/adapter/opt-350m/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 6, + "per_device_train_batch_size": 6, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["self_attn"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json b/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json new file mode 100644 index 0000000..ff7551a --- /dev/null +++ b/examples/examples_prompt/configs/adapter_vit-large-patch16-224-in21k/beans.json @@ -0,0 +1,53 @@ +{ + "backbone_model": "vit", + "bottleneck_dim": 24, + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": false, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "beans", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k", + "model_path_public": "vit-large-patch16-224-in21k", + "num_classes": 3, + "num_train_epochs": 20, + "output_dir": "outputs/adapter/vit-large-patch16-224-in21k/beans", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + 
"per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "beans", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "beans", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["output"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/bitfit_t5-large/rte.json b/examples/examples_prompt/configs/bitfit_t5-large/rte.json new file mode 100644 index 0000000..04e7f77 --- /dev/null +++ b/examples/examples_prompt/configs/bitfit_t5-large/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "t5-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "bitfit", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/t5-large", + "model_path_public": "t5-large", + "num_train_epochs": 20, + "output_dir": "outputs/bitfit/t5-large/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/t5-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attn", "ff", "layer_norm"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json b/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json new file mode 100644 index 0000000..2862f6e --- /dev/null +++ b/examples/examples_prompt/configs/compacter_blenderbot-3b/sst2.json @@ -0,0 +1,66 @@ +{ + "backbone_model": "blenderbot", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "sst2", + "eval_steps": 200, + "evaluation_strategy": "steps", + "factorized_phm": true, + "factorized_phm_rule": false, + "gradient_clip": false, + "greater_is_better": true, + "hypercomplex_adapters": true, + "hypercomplex_division": 4, + "hypercomplex_nonlinearity": "glorot-uniform", + "learn_phm": true, + "learning_rate": 0.003, + "load_best_model_at_end": true, + "max_source_length": 128, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b", + "model_path_public": "blenderbot-3b", + "non_linearity": "gelu_new", + "normalize_phm_weight": false, + "num_train_epochs": 3, + "output_dir": 
"outputs/compacter/blenderbot-3b/sst2", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "phm_c_init": "normal", + "phm_clamp": false, + "phm_init_range": 0.0001, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "shared_phm_rule": false, + "split_validation_test": true, + "task_name": "sst2", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "sst2", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "use_bias_down_sampler": true, + "use_bias_up_sampler": true, + "warmup_steps": 0, + "modified_modules":["fc2"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json b/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json new file mode 100644 index 0000000..23c38d7 --- /dev/null +++ b/examples/examples_prompt/configs/compacter_deberta-v2-xlarge/mnli.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "deberta-v2-xlarge", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "mnli", + "eval_steps": 500, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge", + "num_train_epochs": 3, + "output_dir": "outputs/compacter/deberta-v2-xlarge/mnli", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "mnli", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "mnli", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attention"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json b/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json new file mode 100644 index 0000000..eb3d7c1 --- /dev/null +++ b/examples/examples_prompt/configs/compacter_long-t5-tglobal-large/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "long-t5", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "compacter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large", + "model_path_public": 
"long-t5-tglobal-large", + "num_train_epochs": 20, + "output_dir": "outputs/compacter/long-t5-tglobal-large/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attn", "ff", "layer_norm"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/gen_clip.py b/examples/examples_prompt/configs/gen_clip.py index e7cb94d..41a59c5 100644 --- a/examples/examples_prompt/configs/gen_clip.py +++ b/examples/examples_prompt/configs/gen_clip.py @@ -2,7 +2,7 @@ import collections import copy PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" -PATHBASE="/home/hushengding/plm_cache/" +# PATHBASE="/home/hushengding/plm_cache/" AllConfigs = {} diff --git a/examples/examples_prompt/configs/gen_t5.py b/examples/examples_prompt/configs/gen_t5.py index 8876197..7040fb6 100644 --- a/examples/examples_prompt/configs/gen_t5.py +++ b/examples/examples_prompt/configs/gen_t5.py @@ -45,11 +45,14 @@ BaseConfigs['t5-base'] = { "greater_is_better": True, "evaluation_strategy": "steps", "overwrite_output_dir": True, - "push_to_hub": False, - "push_to_delta_center": True, + "push_to_hf": False, + "push_to_dc": True, "save_strategy": "steps", "datasets_load_from_disk": True, - "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/" + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "backbone_model": "t5", # use in delta center, + "model_path_public": "t5-base", # use in delta center, + } AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) diff --git a/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json b/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json new file mode 100644 index 0000000..1a4d789 --- /dev/null +++ b/examples/examples_prompt/configs/lora_beit-large-patch16-224/cifar10.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "beit", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "cifar10", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224", + "model_path_public": "beit-large-patch16-224", + "num_classes": 10, + "num_train_epochs": 20, + "output_dir": "outputs/lora/beit-large-patch16-224/cifar10", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "cifar10", + "test_dataset_config_name": [ + "en" 
+ ], + "test_dataset_name": "cifar10", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["query","value"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json b/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json new file mode 100644 index 0000000..11ebfde --- /dev/null +++ b/examples/examples_prompt/configs/lora_gpt-j-6B/wikitext.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "gpt-j", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 500, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":4, + "greater_is_better": false, + "learning_rate": 0.00003, + "load_best_model_at_end": true, + "max_source_length": 512, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B", + "model_path_public": "gpt-j-6B", + "num_train_epochs": 2, + "output_dir": "outputs/lora/gpt-j-6B/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 2, + "per_device_train_batch_size": 2, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["20.attn.q_proj","21.attn.q_proj","22.attn.q_proj","23.attn.q_proj","24.attn.q_proj","25.attn.q_proj","26.attn.q_proj","27.attn.q_proj"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json b/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json new file mode 100644 index 0000000..9ef9cff --- /dev/null +++ b/examples/examples_prompt/configs/lora_roberta-large/superglue-boolq.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "roberta-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 200, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0001, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large", + "model_path_public": "roberta-large", + "num_train_epochs": 20, + "output_dir": "outputs/lora/roberta-large/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": false, + "push_to_hub": false, + "push_to_dc": true, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": 
"superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-boolq", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["query","value"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json b/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json new file mode 100644 index 0000000..35a42f1 --- /dev/null +++ b/examples/examples_prompt/configs/lora_xlm-roberta-large/superglue-wic.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "xlm-roberta-large", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "lora", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-wic", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large", + "model_path_public": "xlm-roberta-large", + "num_train_epochs": 20, + "output_dir": "outputs/lora/xlm-roberta-large/superglue-wic", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "superglue-wic", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "superglue-wic", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["query","value"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json b/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json new file mode 100644 index 0000000..3a60852 --- /dev/null +++ b/examples/examples_prompt/configs/low_rank_adapter_gpt2/wikitext.json @@ -0,0 +1,52 @@ +{ + "backbone_model": "gpt2", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "low_rank_adapter", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "wikitext", + "eval_steps": 200, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":1, + "greater_is_better": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 768, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt2", + "model_path_public": "gpt2", + "num_train_epochs": 2, + "output_dir": "outputs/low_rank_adapter/gpt2/wikitext", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 200, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": 
"wikitext", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "wikitext", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt2", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attn","mlp"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json b/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json new file mode 100644 index 0000000..5d67563 --- /dev/null +++ b/examples/examples_prompt/configs/prefix_bert-large-cased/rte.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "bert-large-cased", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "prefix", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "rte", + "eval_steps": 100, + "evaluation_strategy": "steps", + "greater_is_better": true, + "is_seq2seq": false, + "learning_rate": 0.0003, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased", + "num_train_epochs": 20, + "output_dir": "outputs/prefix/bert-large-cased/rte", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 16, + "per_device_train_batch_size": 16, + "predict_with_generate": false, + "push_to_dc": true, + "push_to_hub": false, + "save_steps": 100, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "split_validation_test": true, + "task_name": "rte", + "test_dataset_config_name": [ + "en" + ], + "test_dataset_name": "rte", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased", + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "warmup_steps": 0, + "modified_modules":["attention"] +} \ No newline at end of file diff --git a/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json b/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json new file mode 100644 index 0000000..19cbbba --- /dev/null +++ b/examples/examples_prompt/configs/soft_prompt_bart-large/superglue-boolq.json @@ -0,0 +1,51 @@ +{ + "backbone_model": "bart", + "dataset_config_name": [ + "en" + ], + "datasets_load_from_disk": true, + "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/", + "delta_type": "soft_prompt", + "do_eval": true, + "do_test": true, + "do_train": true, + "eval_dataset_config_name": [ + "en" + ], + "eval_dataset_name": "superglue-boolq", + "eval_steps": 500, + "evaluation_strategy": "steps", + "gradient_accumulation_steps":1, + "greater_is_better": true, + "learning_rate": 0.1, + "load_best_model_at_end": true, + "max_source_length": 256, + "metric_for_best_model": "average_metrics", + "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bart-large", + "model_path_public": "bart-large", + "num_train_epochs": 50, + "output_dir": "outputs/soft_prompt/bart-large/superglue-boolq", + "overwrite_output_dir": true, + "per_device_eval_batch_size": 32, + "per_device_train_batch_size": 32, + "predict_with_generate": true, + "push_to_dc": true, + "push_to_hf": false, + "save_steps": 500, + "save_strategy": "steps", + "save_total_limit": 1, + "seed": 42, + "soft_token_num":100, + "split_validation_test": true, + "task_name": "superglue-boolq", + "test_dataset_config_name": [ + "en" + ], + 
"test_dataset_name": "superglue-boolq", + "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bart-large", + "token_init": true, + "unfrozen_modules": [ + "deltas" + ], + "warmup_steps": 0 +} \ No newline at end of file diff --git a/examples/examples_prompt/data_processors/processor.py b/examples/examples_prompt/data_processors/processor.py index 035bc5d..9986100 100644 --- a/examples/examples_prompt/data_processors/processor.py +++ b/examples/examples_prompt/data_processors/processor.py @@ -93,4 +93,10 @@ class AbstractTask(abc.ABC): # shuffles the data and samples it. if n_obs is not None: dataset = self.subsample(dataset, n_obs) - return dataset.map(self.preprocessor) + + this_method = getattr(self.__class__, 'preprocessor') + base_method = getattr(AbstractTask, 'preprocessor') + if this_method is not base_method: + return dataset.map(self.preprocessor) + else: + return dataset diff --git a/examples/examples_prompt/data_processors/tasks.py b/examples/examples_prompt/data_processors/tasks.py index aee5478..7d0402a 100644 --- a/examples/examples_prompt/data_processors/tasks.py +++ b/examples/examples_prompt/data_processors/tasks.py @@ -12,22 +12,16 @@ import logging import numpy as np import torch import re -from openprompt.prompts import ManualTemplate, ManualVerbalizer -from openprompt.plms.utils import TokenizerWrapper -from openprompt.data_utils import InputExample -from openprompt.prompts import GenerationVerbalizer import itertools - +import os logger = logging.getLogger(__name__) - from transformers.models.auto.tokenization_auto import tokenizer_class_from_name from typing import List, Dict from collections import defaultdict -from openprompt.utils import round_list import warnings @@ -68,7 +62,8 @@ class COLA(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.cola")[split] else: return datasets.load_dataset('glue', 'cola', @@ -96,7 +91,8 @@ class SST2(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.sst2")[split] else: return datasets.load_dataset('glue', 'sst2', @@ -123,10 +119,9 @@ class MRPC(AbstractTask): } - - def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mrpc")[split] else: return datasets.load_dataset('glue', 'mrpc', split=split, script_version="master") @@ -152,7 +147,8 @@ class QQP(AbstractTask): def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qqp")[split] else: return datasets.load_dataset('glue', 'qqp', @@ -208,7 +204,8 @@ class MNLI(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mnli")[split] else: return datasets.load_dataset('glue', 'mnli', split=split, script_version="master") @@ -243,7 +240,8 @@ class QNLI(AbstractTask): def 
load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qnli")[split] else: return datasets.load_dataset('glue', 'qnli', split=split, script_version="master") @@ -279,7 +277,8 @@ class RTE(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.rte")[split] else: return datasets.load_dataset('glue', 'rte', @@ -306,7 +305,8 @@ class WNLI(AbstractTask): def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.wnli")[split] else: return datasets.load_dataset('glue', 'wnli', split=split, script_version="master") @@ -334,7 +334,8 @@ class SuperGLUEBoolQ(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.boolq")[split] else: return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master") @@ -347,8 +348,8 @@ class SuperGLUECB(AbstractTask): split_to_data_split = {"train": "train", "validation": "validation", "test": "validation"} - metric = [metrics.mean_multiclass_f1(num_classes=3), metrics.accuracy] - metric_names = ["f1_multiclass", "accuracy"] + metric = [metrics.accuracy] + metric_names = ["accuracy"] verbalizers = { "0":{"0": "yes", @@ -361,7 +362,8 @@ class SuperGLUECB(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.cb")[split] else: return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master") @@ -387,7 +389,8 @@ class SuperGLUECOPA(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.copa")[split] else: return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master") @@ -416,7 +419,8 @@ class SuperGLUEMultiRC(AbstractTask): def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.multirc")[split] else: return datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master") @@ -459,7 +463,8 @@ class SuperGLUEWIC(AbstractTask): } def load_dataset(self, split): - if self.data_args.datasets_load_from_disk: + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split] else: return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master") @@ -549,13 +554,76 @@ class Beans(AbstractTask): def load_dataset(self, split): # from IPython import embed; embed(header="beans") - if 
self.data_args.datasets_load_from_disk: - return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split] + offline = os.environ.get("HF_DATASETS_OFFLINE", "0") + if offline == '1': + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/beans")[split] else: return datasets.load_dataset('beans', split=split, script_version="master") +class Wikitext(AbstractTask): + #wikitext-2-v1 + name = "wikitext" + # labels_list = ['angular_leaf_spot', 'bean_rust', "healthy"] + split_to_data_split = {"train": "train", + "validation": "validation", + "test": "validation"} + metric = [metrics.perplexity] + metric_names = ["perplexity"] + verbalizers = { + "0": { + } + } + templates_text = { + "0": """{"meta":"text"}""" + } + split_valid_to_make_test = True + def load_dataset(self, split): + # from IPython import embed; embed(header="beans") + if self.data_args.datasets_load_from_disk: + return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/wikitext")[split] + else: + return datasets.load_dataset('wikitext','wikitext-2-v1', split=split, script_version="master") + +class Cifar10(AbstractTask): + name = "cifar10" + + split_to_data_split = {"train": "train", + "validation": "test", + "test": "test"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + if self.data_args.datasets_load_from_disk: + d = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/cifar10")[split].select(range(100)) + print(d) + return d + else: + return datasets.load_dataset('cifar10', split=split, script_version="master") + # def preprocessor(self, example): + # example_ = {} + # example_["image"] = example["image"] + # example_["labels"] = example["label"] + + # return example_ +class Fashion_MNIST(AbstractTask): + name = "Fashion-MNIST" + + split_to_data_split = {"train": "train", + "validation": "test", + "test": "test"} + metric = [metrics.accuracy] + metric_names = ["accuracy"] + + def load_dataset(self, split): + if self.data_args.datasets_load_from_disk: + d = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/fashion_mnist")[split] + print(d) + return d + else: + return datasets.load_dataset('fashion_mnist', split=split, script_version="master") TASK_MAPPING = OrderedDict( [ @@ -575,7 +643,10 @@ TASK_MAPPING = OrderedDict( ('superglue-multirc', SuperGLUEMultiRC), ('superglue-wic', SuperGLUEWIC), # ('superglue-record', SuperGLUERecord) - ('beans', Beans) + ('beans', Beans), + ('wikitext',Wikitext), + ('cifar10',Cifar10), + ('fashion_mnist',Fashion_MNIST) ] ) diff --git a/examples/examples_prompt/metrics/metrics.py b/examples/examples_prompt/metrics/metrics.py index b9c7cb0..94267b0 100644 --- a/examples/examples_prompt/metrics/metrics.py +++ b/examples/examples_prompt/metrics/metrics.py @@ -11,6 +11,14 @@ import sklearn.metrics logger = getLogger(__name__) +def perplexity(outputs, targets,ignore_index=-100): + """Computes the perplexity accuracy.""" + + ce = -np.log(outputs).mean() + # ce = F.cross_entropy(torch.Tensor(outputs).view(-1, outputs.shape[-1]), torch.Tensor(targets).view(-1).long(),ignore_index=ignore_index) + + return {"perplexity":float(np.exp(ce))} + def accuracy(predictions, targets) -> dict: """Computes the average accuracy.""" return {"accuracy": 100 * ((np.array(predictions) == np.array(targets)).mean())} @@ -47,20 +55,20 @@ def spearman_corrcoef(predictions, targets) -> dict: -def spearman_corrcoef(predictions, targets) -> dict: - """Computes Spearman correlation coefficient.""" - # 
TODO: we need to do postprocessors in a clean way for each dataset. - from examples_seq2seq.data_processors.postprocessors import string_to_float - targets = [string_to_float(target) for target in targets] - predictions= [string_to_float(prediction) for prediction in predictions] - spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0] +# def spearman_corrcoef(predictions, targets) -> dict: +# """Computes Spearman correlation coefficient.""" +# # TODO: we need to do postprocessors in a clean way for each dataset. +# from examples_seq2seq.data_processors.postprocessors import string_to_float +# targets = [string_to_float(target) for target in targets] +# predictions= [string_to_float(prediction) for prediction in predictions] +# spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0] - # Note that if all the predictions will be the same, spearman - # correlation is nan, to gaurad against this, we check the output - # and return 0 in this case. - if math.isnan(spearman_corrcoef): - spearman_corrcoef = 0 - return {"spearmanr": spearman_corrcoef} +# # Note that if all the predictions will be the same, spearman +# # correlation is nan, to gaurad against this, we check the output +# # and return 0 in this case. +# if math.isnan(spearman_corrcoef): +# spearman_corrcoef = 0 +# return {"spearmanr": spearman_corrcoef} def f1_score_with_invalid(predictions, targets) -> dict: @@ -102,8 +110,8 @@ def f1_score(predictions, targets) -> dict: Returns: F1 score, where any prediction != 0 or 1 is counted as wrong. """ - targets = targets.astype(np.int32) - predictions = predictions.astype(np.int32) + targets = np.array(targets).astype(np.int32) + predictions = np.array(predictions).astype(np.int32) return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)} # TODO: maybe gaurd against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow diff --git a/examples/examples_prompt/src/run.py b/examples/examples_prompt/src/run.py index 5d29700..aca5210 100644 --- a/examples/examples_prompt/src/run.py +++ b/examples/examples_prompt/src/run.py @@ -26,10 +26,12 @@ You can also adapt this script on your own tasks. import os import sys + os.environ['MKL_THREADING_LAYER'] = 'GNU' os.environ['MKL_SERVICE_FORCE_INTEL'] = '1' os.environ["TOKENIZERS_PARALLELISM"] = "false" sys.path.append(os.path.join(os.getcwd(), "../")) +# sys.path.append(os.path.join(os.getcwd(), "/mnt/sfs_turbo/zhangzhen/OpenDelta")) sys.path.append(os.path.join(os.getcwd())) import functools @@ -56,7 +58,7 @@ from transformers.trainer_utils import is_main_process, get_last_checkpoint from data_processors import AutoTask #, #TaskDataCollatorForSeq2Seq, AutoPostProcessor, data_collator from utils import read_json, save_json -from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, RemainArgHfArgumentParser +from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, DeltaArguments, RemainArgHfArgumentParser logger = logging.getLogger(__name__) @@ -66,16 +68,14 @@ def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. 
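Returning briefly to the `perplexity` helper added to `metrics.py` above: it exponentiates the mean negative log of `outputs`, which only yields a meaningful perplexity if `outputs` already holds the probabilities the model assigned to the reference tokens. A minimal numeric sketch of that relationship (illustrative values only, not taken from any run):

```python
import numpy as np

# Hypothetical per-token probabilities assigned to the reference tokens
# of a short sequence (made-up numbers, purely for illustration).
token_probs = np.array([0.42, 0.07, 0.31, 0.55])

# Mean negative log-likelihood (cross-entropy in nats), then exponentiate;
# this mirrors what the perplexity() helper above does with its `outputs`.
ce = -np.log(token_probs).mean()
ppl = float(np.exp(ce))
print({"perplexity": ppl})  # ~3.76 for these numbers
```

If `outputs` were logits or log-probabilities instead, a softmax or sign adjustment would be needed before this step.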
- parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) - if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): - # If we pass only one argument to the script and it's the path to a json file, - # let's parse it to get our arguments. - model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) - else: - model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses(return_remaining_strings=True) + parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, DeltaArguments)) + # You can provide a json file with contains the arguments and use the --argument some_arg to override or append to the json file. + json_file, cmd_args = (os.path.abspath(sys.argv[1]), sys.argv[2:]) if sys.argv[1].endswith(".json") else (None, sys.argv[1:]) + model_args, data_args, training_args, delta_args, remain_args = parser.parse_json_file_with_cmd_args(json_file=json_file, command_line_args=cmd_args) + logger.warning("The following arguments not used! {}".format(remain_args)) - print(f"{training_args.output_dir}/results.json") + logger.info(f"The results will be used in {training_args.output_dir}/results.json") # exit() # Detecting last checkpoint. last_checkpoint = None @@ -121,7 +121,8 @@ def main(): - if os.path.basename(model_args.model_name_or_path).startswith("t5"): + if os.path.basename(model_args.model_name_or_path).startswith("t5") \ + or os.path.basename(model_args.model_name_or_path).startswith("long-t5") : from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts from examples_prompt.backbones.t5 import Trainer, DataCollator elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"): @@ -129,7 +130,9 @@ def main(): from examples_prompt.backbones.blenderbot import Trainer, DataCollator elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \ or os.path.basename(model_args.model_name_or_path).startswith("bert") \ - or os.path.basename(model_args.model_name_or_path).startswith("albert") : + or os.path.basename(model_args.model_name_or_path).startswith("albert") \ + or os.path.basename(model_args.model_name_or_path).startswith("xlm-roberta") \ + or os.path.basename(model_args.model_name_or_path).startswith("deberta") : from examples_prompt.backbones.bert import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts from examples_prompt.backbones.bert import Trainer, DataCollator elif os.path.basename(model_args.model_name_or_path).startswith("beit"): @@ -144,6 +147,10 @@ def main(): elif os.path.basename(model_args.model_name_or_path).startswith("clip"): from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts from examples_prompt.backbones.clip import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("opt") \ + or os.path.basename(model_args.model_name_or_path).startswith("gpt"): + from examples_prompt.backbones.opt import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.opt import Trainer, DataCollator @@ -161,7 +168,8 @@ def main(): if delta_args.delta_type.lower() != "none": from opendelta import AutoDeltaConfig,AutoDeltaModel - delta_config = AutoDeltaConfig.from_dict(vars(delta_args)) + from dataclasses import asdict + delta_config = 
AutoDeltaConfig.from_dict(asdict(delta_args)) delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model) delta_model.freeze_module(set_state_dict = True) delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) @@ -278,14 +286,9 @@ def main(): if torch.cuda.is_available() and training_args.compute_memory: peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2)/1000 - print( - "Memory utilization", - peak_memory, - "GB" - ) performance_metrics.update({"peak_memory": peak_memory}) if training_args.compute_memory or training_args.compute_time: - print("Efficiency Statistics {}".format(performance_metrics)) + logger.info("Efficiency Statistics {}".format(performance_metrics)) trainer.save_metrics("performance", performance_metrics) # Evaluation @@ -313,17 +316,30 @@ def main(): trainer.save_metrics(f"{data_args.task_name}_test", metrics) all_results['test'][data_args.task_name] = metrics + # from opendelta.utils.delta_hub import create_hub_repo_name + # from opendelta.utils.delta_center import create_delta_center_args, create_repo_name + # repo_name = create_hub_repo_name(root="DeltaHub", # dataset=data_args.task_name, # delta_type = delta_args.delta_type, # model_name_or_path= model_args.model_name_or_path) - # results['repo_name'] = repo_name - # if delta_args.delta_type.lower() != "none": - # if training_args.push_to_hub: # TODO add description here - # delta_model.save_finetuned(push_to_hub=True, save_directory=repo_name, use_auth_token=True) - # # trainer.push_to_hub(**kwargs) - # else: - # delta_model.save_finetuned(push_to_hub=False, save_directory=repo_name, use_auth_token=True) + + # center_args = + # repo_name = create_repo_name(prefix="", center_args=center_args) + # all_results['repo_name'] = repo_name + + + delta_model.save_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path, + push_to_dc=training_args.push_to_dc, + center_args={"test_performance":all_results['test'][data_args.task_name]['test_average_metrics'], + }, + center_args_pool = {**vars(model_args), **vars(data_args), **vars(training_args), **vars(delta_args)}, + list_tags = ['NLI'], + dict_tags = {'purpose':'for testing'}, + delay_push=True, + test_result=all_results['test'] + ) + with open(f"{training_args.output_dir}/results.json", 'w') as fout: diff --git a/examples/examples_prompt/src/test.py b/examples/examples_prompt/src/test.py new file mode 100644 index 0000000..fb17494 --- /dev/null +++ b/examples/examples_prompt/src/test.py @@ -0,0 +1,344 @@ +# coding=utf-8 +# Copyright OpenDelta Team and THUNLP lab. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A unified runing scripts for most models to do down stream tasks in a +prompt learning fashion, i.e., No classification head, all tasks are casted +to mask prediction or span prediction tasks. + +Processing relevant to different backbone models are stored in ../backbones/ + +Adding A few lines to integrate the Delta tuning methods. + +You can also adapt this script on your own tasks. 
+""" + +import os +import sys +os.environ['MKL_THREADING_LAYER'] = 'GNU' +os.environ['MKL_SERVICE_FORCE_INTEL'] = '1' +os.environ["TOKENIZERS_PARALLELISM"] = "false" +sys.path.append(os.path.join(os.getcwd(), "../")) +sys.path.append(os.path.join(os.getcwd())) + +import functools +import logging +import torch +import json +import numpy as np + +import transformers +from transformers import ( + AutoConfig, + AutoModelForMaskedLM, + AutoModelForSeq2SeqLM, + AutoTokenizer, + DataCollatorForSeq2Seq, + # HfArgumentParser, + # MBartTokenizer, + # default_data_collator, + Trainer, + Seq2SeqTrainer, + set_seed, +) +from transformers.trainer_utils import is_main_process, get_last_checkpoint + +from data_processors import AutoTask #, #TaskDataCollatorForSeq2Seq, AutoPostProcessor, data_collator +from utils import read_json, save_json +from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, RemainArgHfArgumentParser, DeltaArguments + + +logger = logging.getLogger(__name__) + + +def main(): + parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, DeltaArguments)) + + # You can provide a json file with contains the arguments and use the --argument some_arg to override or append to the json file. + json_file, cmd_args = (os.path.abspath(sys.argv[1]), sys.argv[2:]) if sys.argv[1].endswith(".json") else (None, sys.argv[1:]) + model_args, data_args, training_args, delta_args, remain_args = parser.parse_json_file_with_cmd_args(json_file=json_file, command_line_args=cmd_args) + logger.warning("The following arguments not used! {}".format(remain_args)) + + # # exit() + # # Detecting last checkpoint. + # last_checkpoint = None + # if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + # last_checkpoint = get_last_checkpoint(training_args.output_dir) + # print("#### last_checkpoint ", last_checkpoint) + # if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + # ''' + # raise ValueError( + # f"Output directory ({training_args.output_dir}) already exists and is not empty. " + # "Use --overwrite_output_dir to overcome." + # ) + # ''' + # pass + # elif last_checkpoint is not None: + # logger.info( + # f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + # "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + # ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + # logger.info("Training/evaluation parameters %s", training_args, model_args, data_args, delta_args) + logger.info("{}\n{}\n{}\n{}".format(training_args, model_args, data_args, delta_args)) + + + # Set seed before initializing model. 
+ set_seed(training_args.seed) + + + + if os.path.basename(model_args.model_name_or_path).startswith("t5"): + from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.t5 import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"): + from examples_prompt.backbones.blenderbot import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.blenderbot import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \ + or os.path.basename(model_args.model_name_or_path).startswith("bert") \ + or os.path.basename(model_args.model_name_or_path).startswith("albert") : + from examples_prompt.backbones.bert import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bert import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("beit"): + from examples_prompt.backbones.beit import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.beit import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("bart"): + from examples_prompt.backbones.bart import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bart import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("bigbird"): + from examples_prompt.backbones.bigbird import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bigbird import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("clip"): + from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.clip import Trainer, DataCollator + + + + config, tokenizer, model = get_backbone(model_args=model_args) + + # model parallelize + if hasattr(training_args, "model_parallel") and training_args.model_parallel: + logger.info('parallelize model!') + model.parallelize() + + from opendelta import Visualization + Visualization(model).structure_graph() + + if delta_args.delta_type.lower() != "none": + from opendelta.delta_models.adapter import AdapterConfig, AdapterModel + delta_config = AdapterConfig.from_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path) + delta_model = AdapterModel.from_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path, + delta_config=delta_config, + backbone_model=model, + force_download=delta_args.force_download, + cache_dir=delta_args.delta_cache_dir) + # delta_model.freeze_module(set_state_dict = True) + delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True) + + + performance_metrics = {} + + + + + non_empty_splits_names = [] + # if training_args.do_train: + # non_empty_splits_names.append("train") + # if training_args.do_eval: + # non_empty_splits_names.append("eval") + if training_args.do_test: + non_empty_splits_names.append("test") + splits = {} + for split_name in ['test']: + if split_name not in non_empty_splits_names: + splits[split_name] = None + continue + + task = AutoTask.get(data_args.task_name, + data_args.dataset_config_name, + data_args=data_args, + 
seed=data_args.data_sample_seed) + + dataset = task.get(split=split_name, + split_validation_test=training_args.split_validation_test, + n_obs=data_args.max_train_samples) + + + + template, _verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, data_args) + + + dataset = dataset.map( + functools.partial(preprocess_function, + data_args=data_args, + tokenizer=tokenizer, + template=template, + verbalizer=_verbalizer, + tokenizer_wrapper=tokenizer_wrapper, + split=split_name), + batched=False, + num_proc=data_args.preprocessing_num_workers, + remove_columns=get_remove_columns(list(dataset.features.keys())), + load_from_cache_file=not data_args.overwrite_cache, + ) + # from IPython import embed; embed() + splits[split_name] = dataset + if split_name == "test": + eval_task = task + verbalizer = _verbalizer + + + + trainer = Trainer( + model=model, + verbalizer=verbalizer, + eval_task=eval_task, + args=training_args, + # train_dataset=splits['train'], + # eval_dataset=splits['eval'], + tokenizer=tokenizer, + data_collator=DataCollator(tokenizer), + ) + + + def save_training_config(config_file, output_dir): + json_data = read_json(config_file) + save_json(os.path.join(output_dir, "training_config.json"), json_data) + + + # Saves training config. + if trainer.is_world_process_zero(): + save_training_config(sys.argv[1], training_args.output_dir) + + # # Training + # if training_args.do_train: + # checkpoint = None + # if training_args.resume_from_checkpoint is not None: + # checkpoint = training_args.resume_from_checkpoint + # elif last_checkpoint is not None: + # checkpoint = last_checkpoint + + # if training_args.compute_time: + # torch.cuda.synchronize() # wait for move to complete + # start = torch.cuda.Event(enable_timing=True) + # end = torch.cuda.Event(enable_timing=True) + # start.record() + + # train_result = trainer.train(resume_from_checkpoint=checkpoint) + + # if training_args.compute_time: + # end.record() + # torch.cuda.synchronize() # wait for all_reduce to complete + # total_time = start.elapsed_time(end)/(1000*60) + # performance_metrics.update({"total_time in minutes ": total_time}) + + # trainer.save_model() # Saves the tokenizer too for easy upload + # train_metrics = train_result.metrics + # max_train_samples = ( + # data_args.max_train_samples if data_args.max_train_samples is not None else len(splits['train']) + # ) + # train_metrics["train_samples"] = min(max_train_samples, len(splits['train'])) + # trainer.log_metrics("train", train_metrics) + # trainer.save_metrics("train", train_metrics) + # trainer.save_state() + + # if torch.cuda.is_available() and training_args.compute_memory: + # peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2)/1000 + # print( + # "Memory utilization", + # peak_memory, + # "GB" + # ) + # performance_metrics.update({"peak_memory": peak_memory}) + # if training_args.compute_memory or training_args.compute_time: + # print("Efficiency Statistics {}".format(performance_metrics)) + # trainer.save_metrics("performance", performance_metrics) + + # Evaluation + all_results = {} + + # all_results['evaluate'] = {} + + # if training_args.do_eval: + # logger.info("*** Evaluate ***") + + # metrics = trainer.evaluate(eval_dataset=splits['eval'], + # ) + # trainer.log_metrics(f"{data_args.task_name}_eval", metrics) + # trainer.save_metrics(f"{data_args.task_name}_eval", metrics) + # all_results['evaluate'][data_args.task_name] = metrics + + # Test + all_results['test'] = {} + if training_args.do_test: + logger.info("*** Test ***") + metrics = 
trainer.evaluate(eval_dataset=splits['test'], + metric_key_prefix="test" + ) + trainer.log_metrics(f"{data_args.task_name}_test", metrics) + trainer.save_metrics(f"{data_args.task_name}_test", metrics) + all_results['test'][data_args.task_name] = metrics + + # from opendelta.utils.delta_hub import create_hub_repo_name + # from opendelta.utils.delta_center import create_delta_center_args, create_repo_name + + # repo_name = create_hub_repo_name(root="DeltaHub", + # dataset=data_args.task_name, + # delta_type = delta_args.delta_type, + # model_name_or_path= model_args.model_name_or_path) + + # center_args = + # repo_name = create_repo_name(prefix="", center_args=center_args) + # all_results['repo_name'] = repo_name + + + # delta_model.save_finetuned(push_to_hf=training_args.push_to_hf, + # push_to_dc=training_args.push_to_dc, + # center_args={}, + # center_args_pool = {**vars(model_args), **vars(data_args), **vars(training_args), **vars(delta_args)}, + # delay_push=True, + # ) + + print(all_results) + + + + # with open(f"{training_args.output_dir}/results.json", 'w') as fout: + # string = json.dumps(all_results, indent=4,sort_keys=True) + # fout.write(string+"\n") + + return all_results + + + + +if __name__ == "__main__": + result = main() + diff --git a/examples/examples_prompt/utils/args.py b/examples/examples_prompt/utils/args.py index aefec9a..23bdab8 100644 --- a/examples/examples_prompt/utils/args.py +++ b/examples/examples_prompt/utils/args.py @@ -1,6 +1,10 @@ from dataclasses import dataclass, field from typing import Optional, List from transformers import HfArgumentParser +from pathlib import Path +import sys + + @dataclass class ModelArguments: @@ -81,6 +85,10 @@ class TrainingArguments(HfTrainingArguments): remove_unused_columns: Optional[bool] = field( default=False, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."} ) + push_to_hf: Optional[bool] = field(default=False, metadata={"help": "Push the model to huggingface model hub."}) + push_to_dc: Optional[bool] = field(default=True, metadata={"help": "Push the model to delta center."}) + + @@ -211,28 +219,254 @@ class DataTrainingArguments: self.test_max_target_length = self.max_target_length + +import dataclasses + +@dataclass +class DeltaArguments: + """ + Arguments pertaining to what data we are going to input our model for training and eval. 
+ """ + delta_type: str= field(default="", metadata={"help": "the type of delta"}) + backbone_model: Optional[str] = field( + default="", metadata={"help": "the backbone model"} + ) + model_path_public: Optional[str] = field( + default="", metadata={"help": "the path (url) of the publicly available backbone model"} + ) + modified_modules: Optional[List[str]] = field( + default_factory=lambda: None, metadata={"help": "the modules inside the backbone to be modified"} + ) + unfrozen_modules: Optional[List[str]] = field( + default_factory=lambda:["deltas"], metadata={"help": "the modules inside the backbone or in the delta modules that need to be unfrozen"} + ) + finetuned_delta_path: Optional[str] = field( + default=None, metadata={"help": "the path of the finetuned delta model"} + ) + force_download: Optional[bool] = field( + default=False, metadata={"help": "whether to download the checkpoint form delta center no matter whether it exists"} + ) + local_files_only: Optional[bool] = field( + default=False, metadata={"help": "whether not to look for file in delta center"} + ) + delta_cache_dir: Optional[str] = field( + default=None, metadata={"help": "The cache path defined by user. If not set, we will firstly look into the"+ + " working directory and then into the default cache path (ususally ~/.cache/delta_center)."} + ) + delay_push: Optional[bool] = field( + default=True, metadata={ + 'help':'whether push the checkpoint to delta center later.' + } + ) + + def merge_arguments(self, objb): + print(objb) + self.__class__ = dataclasses.make_dataclass('DeltaArgument', fields=[(s.name, s.type, getattr(objb, s.name)) for s in dataclasses.fields(objb)], bases=(DeltaArguments,)) + + + + +@dataclass +class AdapterArguments: + bottleneck_dim: Optional[int] = field( + default=24, metadata={"help": "the dimension of the bottleneck layer"} + ) +@dataclass +class LoRAArguments: + lora_r: Optional[int] = field( + default=8, metadata={"help": "the rank of the LoRA metrics."} + ) +@dataclass +class PrefixArguments: + pass +@dataclass +class BitFitArguments: + pass +@dataclass +class SoftPromptArguments: + soft_token_num: Optional[int] = field( + default=100, metadata={"help": "the num of soft tokens."} + ) + +@dataclass +class CompacterArguments: + pass +@dataclass +class LowRankAdapterArguments: + pass + +# from opendelta.delta_models.adapter import AdapterConfig +# from opendelta.delta_models.bitfit import BitFitConfig +# from opendelta.delta_models.compacter import CompacterConfig +# from opendelta.delta_models.lora import LoraArguments +# from opendelta.delta_models.low_rank_adapter import LowRankAdapterConfig +# from opendelta.delta_models.prefix import PrefixConfig +# from opendelta.delta_models.soft_prompt import SoftPromptConfig +# DELTAARGMAP = { +# "adapter": AdapterConfig, +# "lora":LoraArguments, +# "prefix":PrefixConfig, +# "bitfit":BitFitConfig, +# "soft_prompt":SoftPromptConfig, +# "compacter":CompacterConfig, +# "low_rank_adapter":LowRankAdapterConfig + +# } + +DELTAARGMAP = { + "adapter": AdapterArguments, + "lora":LoRAArguments, + "prefix":PrefixArguments, + "bitfit":BitFitArguments, + "soft_prompt":SoftPromptArguments, + "compacter":CompacterArguments, + "low_rank_adapter":LowRankAdapterArguments + +} + +# TODO: add more specific delta arguments + + + class RemainArgHfArgumentParser(HfArgumentParser): - def parse_json_file(self, json_file: str, return_remaining_args=True ): + '''This is a more powerful version of argument parser. 
+ It can receiven both command line arguments and json file arguments. + The command line arguments will override the json file arguments. + The parser will load the specific delta arguments (e.g. Adapter's) + according to the delta_type argument. And merge the specific delta arguments + with the common delta arguments. + ''' + def parse_json_file_with_cmd_args(self, json_file: str, command_line_args=None, return_remaining_args=True ): """ Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the dataclass types. """ - import argparse + import json from pathlib import Path - import dataclasses + + data = json.loads(Path(json_file).read_text()) + + + data_str = "" + if command_line_args is None: + command_line_args = [] + for key in data: + if "--"+key not in command_line_args: + if isinstance(data[key], list): + data_str += "--"+key + for elem in data[key]: + data_str+=" "+ str(elem) + data_str += " " + else: + data_str+= "--" + key + " " + str(data[key]) + " " + + data_list = data_str.split() + data_list += command_line_args + + + if return_remaining_args: + outputs, remain_args = self.parse_args_into_dataclasses(args=data_list, return_remaining_strings=return_remaining_args) + for d in outputs: + if isinstance(d, DeltaArguments): # merge the specific delta arguments + d.merge_arguments(outputs[-1]) + + return [*(outputs[:-1]), remain_args] + else: + outputs = self.parse_args_into_dataclasses(args=data_list, return_remaining_strings=return_remaining_args) + for d in outputs: + if isinstance(d, DeltaArguments): + d.merge_arguments(outputs[-1]) + return [*(outputs[:-1]),] + + def parse_args_into_dataclasses( + self, args=None, return_remaining_strings=False, look_for_args_file=True, args_filename=None + ): + """ + Parse command-line args into instances of the specified dataclass types. + + This relies on argparse's `ArgumentParser.parse_known_args`. See the doc at: + docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_args + + Args: + args: + List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser) + return_remaining_strings: + If true, also return a list of remaining argument strings. + look_for_args_file: + If true, will look for a ".args" file with the same base name as the entry point script for this + process, and will append its potential content to the command line args. + args_filename: + If not None, will uses this file instead of the ".args" file specified in the previous argument. + + Returns: + Tuple consisting of: + + - the dataclass instances in the same order as they were passed to the initializer.abspath + - if applicable, an additional namespace for more (non-dataclass backed) arguments added to the parser + after initialization. + - The potential list of remaining argument strings. (same as argparse.ArgumentParser.parse_known_args) + """ + if args_filename or (look_for_args_file and len(sys.argv)): + if args_filename: + args_file = Path(args_filename) + else: + args_file = Path(sys.argv[0]).with_suffix(".args") + + if args_file.exists(): + fargs = args_file.read_text().split() + args = fargs + args if args is not None else fargs + sys.argv[1:] + # in case of duplicate arguments the first one has precedence + # so we append rather than prepend. 
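For orientation before the delta-specific argument handling that follows, here is a hedged sketch of how `run.py` and `test.py` (earlier in this diff) drive this parser. The config path only mirrors the naming convention of the example configs and the override value is a placeholder; running it assumes the `examples_prompt` directory is on `sys.path` and the JSON file exists.

```python
from utils.args import (ModelArguments, DataTrainingArguments, TrainingArguments,
                        DeltaArguments, RemainArgHfArgumentParser)

parser = RemainArgHfArgumentParser(
    (ModelArguments, DataTrainingArguments, TrainingArguments, DeltaArguments))

# The JSON file supplies defaults; explicit command-line flags win, because a
# JSON key is only appended when "--key" is absent from command_line_args.
json_file = "configs/adapter_t5-base/superglue-cb.json"  # placeholder path
cmd_args = ["--num_train_epochs", "5"]                   # overrides the JSON value

model_args, data_args, training_args, delta_args, remain_args = \
    parser.parse_json_file_with_cmd_args(json_file=json_file,
                                         command_line_args=cmd_args)

# With delta_type set to "adapter" in the JSON, the adapter-specific dataclass
# is appended via DELTAARGMAP and merged into delta_args, so fields such as
# bottleneck_dim should become available alongside the common fields.
print(delta_args.delta_type, getattr(delta_args, "bottleneck_dim", None))
```

Keys that no dataclass claims end up in `remain_args`, which both scripts surface with a warning rather than an error.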
+ namespace, remaining_args = self.parse_known_args(args=args) + + # conditionally add delta arguments + deltatype_args = DELTAARGMAP[namespace.delta_type] + self.dataclass_types.append(deltatype_args) + self._add_dataclass_arguments(deltatype_args) + + # parse the arguments again, this time with the specific delta type's arguments + namespace, remaining_args = self.parse_known_args(args=args) + + outputs = [] for dtype in self.dataclass_types: keys = {f.name for f in dataclasses.fields(dtype) if f.init} - inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys} + inputs = {k: v for k, v in vars(namespace).items() if k in keys} + for k in keys: + delattr(namespace, k) obj = dtype(**inputs) outputs.append(obj) - - remain_args = argparse.ArgumentParser() - remain_args.__dict__.update(data) - if return_remaining_args: - return (*outputs, remain_args) + if len(namespace.__dict__) > 0: + # additional namespace. + outputs.append(namespace) + if return_remaining_strings: + return (outputs, remaining_args) else: - return (*outputs,) \ No newline at end of file + if remaining_args: + raise ValueError(f"Some specified arguments are not used by the HfArgumentParser: {remaining_args}") + + return outputs + + # namespace, remaining_args = self.parse_known_args(args=data_list) + + # print("Here", command_line_args, data_list,namespace, remaining_args) + # data.update(remain_args) + + # outputs = [] + # for dtype in self.dataclass_types: + # keys = {f.name for f in dataclasses.fields(dtype) if f.init} + # inputs = {k: namespace.get(k) for k in list(data.keys()) if k in keys} + # obj = dtype(**inputs) + # outputs.append(obj) + + # # remain_args = argparse.ArgumentParser() + # remain_args.__dict__.update(remain_args) + # if return_remaining_args: + # return (*outputs, remain_args) + # else: + # return (*outputs,) + + diff --git a/examples/examples_seq2seq/README.md b/examples/legacies/examples_seq2seq/README.md similarity index 100% rename from examples/examples_seq2seq/README.md rename to examples/legacies/examples_seq2seq/README.md diff --git a/examples/examples_seq2seq/metrics/__init__.py b/examples/legacies/examples_seq2seq/__init__.py similarity index 100% rename from examples/examples_seq2seq/metrics/__init__.py rename to examples/legacies/examples_seq2seq/__init__.py diff --git a/examples/examples_seq2seq/configs/config_gen_bs.py b/examples/legacies/examples_seq2seq/configs/config_gen_bs.py similarity index 100% rename from examples/examples_seq2seq/configs/config_gen_bs.py rename to examples/legacies/examples_seq2seq/configs/config_gen_bs.py diff --git a/examples/examples_seq2seq/data_processors/__init__.py b/examples/legacies/examples_seq2seq/data_processors/__init__.py similarity index 100% rename from examples/examples_seq2seq/data_processors/__init__.py rename to examples/legacies/examples_seq2seq/data_processors/__init__.py diff --git a/examples/examples_seq2seq/data_processors/data_collator.py b/examples/legacies/examples_seq2seq/data_processors/data_collator.py similarity index 100% rename from examples/examples_seq2seq/data_processors/data_collator.py rename to examples/legacies/examples_seq2seq/data_processors/data_collator.py diff --git a/examples/examples_seq2seq/data_processors/postprocessors.py b/examples/legacies/examples_seq2seq/data_processors/postprocessors.py similarity index 100% rename from examples/examples_seq2seq/data_processors/postprocessors.py rename to examples/legacies/examples_seq2seq/data_processors/postprocessors.py diff --git 
a/examples/examples_seq2seq/data_processors/tasks.py b/examples/legacies/examples_seq2seq/data_processors/tasks.py similarity index 100% rename from examples/examples_seq2seq/data_processors/tasks.py rename to examples/legacies/examples_seq2seq/data_processors/tasks.py diff --git a/examples/examples_seq2seq/data_processors/utils.py b/examples/legacies/examples_seq2seq/data_processors/utils.py similarity index 100% rename from examples/examples_seq2seq/data_processors/utils.py rename to examples/legacies/examples_seq2seq/data_processors/utils.py diff --git a/examples/examples_seq2seq/utils/__init__.py b/examples/legacies/examples_seq2seq/metrics/__init__.py similarity index 100% rename from examples/examples_seq2seq/utils/__init__.py rename to examples/legacies/examples_seq2seq/metrics/__init__.py diff --git a/examples/examples_seq2seq/metrics/metrics.py b/examples/legacies/examples_seq2seq/metrics/metrics.py similarity index 100% rename from examples/examples_seq2seq/metrics/metrics.py rename to examples/legacies/examples_seq2seq/metrics/metrics.py diff --git a/examples/examples_seq2seq/metrics/qa_utils.py b/examples/legacies/examples_seq2seq/metrics/qa_utils.py similarity index 100% rename from examples/examples_seq2seq/metrics/qa_utils.py rename to examples/legacies/examples_seq2seq/metrics/qa_utils.py diff --git a/examples/examples_seq2seq/run_seq2seq.py b/examples/legacies/examples_seq2seq/run_seq2seq.py similarity index 100% rename from examples/examples_seq2seq/run_seq2seq.py rename to examples/legacies/examples_seq2seq/run_seq2seq.py diff --git a/examples/examples_seq2seq/seq2seq_trainer.py b/examples/legacies/examples_seq2seq/seq2seq_trainer.py similarity index 100% rename from examples/examples_seq2seq/seq2seq_trainer.py rename to examples/legacies/examples_seq2seq/seq2seq_trainer.py diff --git a/examples/examples_seq2seq/trainers/__init__.py b/examples/legacies/examples_seq2seq/trainers/__init__.py similarity index 100% rename from examples/examples_seq2seq/trainers/__init__.py rename to examples/legacies/examples_seq2seq/trainers/__init__.py diff --git a/examples/examples_seq2seq/trainers/model_args.py b/examples/legacies/examples_seq2seq/trainers/model_args.py similarity index 100% rename from examples/examples_seq2seq/trainers/model_args.py rename to examples/legacies/examples_seq2seq/trainers/model_args.py diff --git a/examples/examples_seq2seq/trainers/seq2seq_trainer.py b/examples/legacies/examples_seq2seq/trainers/seq2seq_trainer.py similarity index 100% rename from examples/examples_seq2seq/trainers/seq2seq_trainer.py rename to examples/legacies/examples_seq2seq/trainers/seq2seq_trainer.py diff --git a/examples/examples_seq2seq/trainers/trainer.py b/examples/legacies/examples_seq2seq/trainers/trainer.py similarity index 100% rename from examples/examples_seq2seq/trainers/trainer.py rename to examples/legacies/examples_seq2seq/trainers/trainer.py diff --git a/examples/examples_seq2seq/trainers/trainer_args.py b/examples/legacies/examples_seq2seq/trainers/trainer_args.py similarity index 100% rename from examples/examples_seq2seq/trainers/trainer_args.py rename to examples/legacies/examples_seq2seq/trainers/trainer_args.py diff --git a/examples/examples_seq2seq/trainers/trainer_utils.py b/examples/legacies/examples_seq2seq/trainers/trainer_utils.py similarity index 100% rename from examples/examples_seq2seq/trainers/trainer_utils.py rename to examples/legacies/examples_seq2seq/trainers/trainer_utils.py diff --git a/examples/legacies/examples_seq2seq/utils/__init__.py 
b/examples/legacies/examples_seq2seq/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/examples/examples_seq2seq/utils/utils.py b/examples/legacies/examples_seq2seq/utils/utils.py similarity index 100% rename from examples/examples_seq2seq/utils/utils.py rename to examples/legacies/examples_seq2seq/utils/utils.py diff --git a/examples/examples_text-classification/README.md b/examples/legacies/examples_text-classification/README.md similarity index 100% rename from examples/examples_text-classification/README.md rename to examples/legacies/examples_text-classification/README.md diff --git a/examples/examples_text-classification/configs/config_gen.py b/examples/legacies/examples_text-classification/configs/config_gen.py similarity index 100% rename from examples/examples_text-classification/configs/config_gen.py rename to examples/legacies/examples_text-classification/configs/config_gen.py diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_cola.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_cola.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_cola.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_cola.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_mnli.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mnli.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_mnli.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mnli.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_mrpc.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mrpc.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_mrpc.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_mrpc.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_qnli.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qnli.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_qnli.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qnli.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_qqp.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qqp.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_qqp.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_qqp.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_rte.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_rte.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_rte.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_rte.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_sst2.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_sst2.json similarity index 100% rename from 
examples/examples_text-classification/configs/lora_roberta-base/lora_sst2.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_sst2.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_stsb.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_stsb.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_stsb.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_stsb.json diff --git a/examples/examples_text-classification/configs/lora_roberta-base/lora_wnli.json b/examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_wnli.json similarity index 100% rename from examples/examples_text-classification/configs/lora_roberta-base/lora_wnli.json rename to examples/legacies/examples_text-classification/configs/lora_roberta-base/lora_wnli.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/cola.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/cola.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/cola.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/cola.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/mnli.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/mnli.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/mnli.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/mnli.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/mrpc.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/mrpc.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/mrpc.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/mrpc.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/qnli.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/qnli.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/qnli.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/qnli.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/qqp.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/qqp.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/qqp.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/qqp.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/rte.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/rte.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/rte.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/rte.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/sst2.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/sst2.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/sst2.json rename to 
examples/legacies/examples_text-classification/configs/prefix_roberta-base/sst2.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/stsb.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/stsb.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/stsb.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/stsb.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-boolq.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-cb.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-copa.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-multirc.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-record.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-record.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-record.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-record.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wic.json diff --git a/examples/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json b/examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json similarity index 100% rename from examples/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json rename to examples/legacies/examples_text-classification/configs/prefix_roberta-base/superglue-wsc.fixed.json diff --git a/examples/examples_text-classification/metrics/glue.py 
b/examples/legacies/examples_text-classification/metrics/glue.py similarity index 100% rename from examples/examples_text-classification/metrics/glue.py rename to examples/legacies/examples_text-classification/metrics/glue.py diff --git a/examples/examples_text-classification/requirements.txt b/examples/legacies/examples_text-classification/requirements.txt similarity index 100% rename from examples/examples_text-classification/requirements.txt rename to examples/legacies/examples_text-classification/requirements.txt diff --git a/examples/examples_text-classification/run.sh b/examples/legacies/examples_text-classification/run.sh similarity index 100% rename from examples/examples_text-classification/run.sh rename to examples/legacies/examples_text-classification/run.sh diff --git a/examples/examples_text-classification/run_glue.py b/examples/legacies/examples_text-classification/run_glue.py similarity index 100% rename from examples/examples_text-classification/run_glue.py rename to examples/legacies/examples_text-classification/run_glue.py diff --git a/examples/examples_text-classification/util.py b/examples/legacies/examples_text-classification/util.py similarity index 100% rename from examples/examples_text-classification/util.py rename to examples/legacies/examples_text-classification/util.py diff --git a/examples/setup_seq2seq.py b/examples/legacies/setup_seq2seq.py similarity index 100% rename from examples/setup_seq2seq.py rename to examples/legacies/setup_seq2seq.py diff --git a/examples/setup_prompt.py b/examples/setup_prompt.py deleted file mode 100755 index 5a9c74d..0000000 --- a/examples/setup_prompt.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Install Compacter.""" -import os -import setuptools -from torch.utils.cpp_extension import BuildExtension, CUDAExtension - -#os.environ['TORCH_CUDA_ARCH_LIST']="3.5;3.7;6.1;7.0;7.5;8.6+PTX" - -def setup_package(): - long_description = "examples_prompt" - setuptools.setup( - name='examples_prompt', - version='0.0.1', - description='textual prompt example', - long_description=long_description, - long_description_content_type='text/markdown', - author='Shengding Hu', - license='MIT License', - packages=setuptools.find_packages( - exclude=['docs', 'tests', 'scripts']), - dependency_links=[ - 'https://download.pytorch.org/whl/torch_stable.html', - ], - classifiers=[ - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Topic :: Scientific/Engineering :: Artificial Intelligence', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7.10', - ], - keywords='text nlp machinelearning', - # ext_modules=[ - # CUDAExtension('seq2seq.projections.fwh_cuda', - # sources=[ - # 'seq2seq/projections/fwh_cuda/fwh_cpp.cpp', - # 'seq2seq/projections/fwh_cuda/fwh_cu.cu', - # ] - # ) - # ] - # , - cmdclass={"build_ext": BuildExtension}, - install_requires=[ - ], - ) - - -if __name__ == '__main__': - setup_package() diff --git a/opendelta/__init__.py b/opendelta/__init__.py index f9301d2..6d38799 100644 --- a/opendelta/__init__.py +++ b/opendelta/__init__.py @@ -1,5 +1,5 @@ -__version__ = "0.1.0" +__version__ = "0.2.4" class GlobalSetting: def __init__(self): diff --git a/opendelta/auto_delta.py b/opendelta/auto_delta.py index 6240d0c..ac9fa61 100644 --- a/opendelta/auto_delta.py +++ b/opendelta/auto_delta.py @@ -2,16 +2,14 @@ from copy import deepcopy from typing import Any, Dict, OrderedDict from opendelta.utils.visualization import Visualization import torch.nn as nn 
-from transformers.file_utils import PushToHubMixin from opendelta.utils.logging import get_logger import importlib from opendelta.delta_configs import BaseDeltaConfig -from opendelta.basemodel import DeltaBase logger = get_logger(__name__) DELTA_CONFIG_MAPPING = { - "lora": "LoraConfig", + "lora": "LoraConfig", "low_rank_adapter": "LowRankAdapterConfig", "bitfit": "BitFitConfig", "adapter":"AdapterConfig", @@ -91,18 +89,18 @@ class AutoDeltaConfig: "AutoConfig is designed to be instantiated " "using the ``AutoConfig.from_pretrained(pretrained_model_name_or_path)`` method." ) - + @classmethod def from_dict(cls, config_dict: Dict[str, Any], **kwargs): - r""" Instantiate a DeltaConfig according to the dict. Automatically load the config specified by + r""" Instantiate a DeltaConfig according to the dict. Automatically load the config specified by :obj:`delta_type`. Args: config_dict (:obj:`dict`): The dict of configs of delta model. - kwargs: Other keyword argument pass to initialize the config. + kwargs: Other keyword argument pass to initialize the config. >>> config = AutoDeltaConfig.from_dict({"delta_type":"lora"}) # This will load the dault lora config. - >>> config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r":5}) # Will load the default lora config, with lora_r = 5 + >>> config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r":5}) # Will load the default lora config, with lora_r = 5 """ config_dict = deepcopy(config_dict) @@ -114,7 +112,7 @@ class AutoDeltaConfig: @classmethod - def from_finetuned(cls, finetuned_model_name_or_path, **kwargs): + def from_finetuned(cls, finetuned_delta_path, **kwargs): r""" Instantiate one of the configuration classes of the library from a finetuned delta model configuration. The configuration class to instantiate is selected based on the ``delta_type`` property of the config object that @@ -122,18 +120,18 @@ class AutoDeltaConfig: Parameters: - finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, *optional*): + finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*): Can be either: - A string, the *model id* of a finetuned delta model configuration hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``. - A path to a *directory* containing a configuration file saved using the - :py:meth:`DeltaBase.save_finetuned` method, + :py:meth:`DeltaBase.save_finetuned` method, e.g., ``./my_model_directory/``. - A path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``. - The last two option are not tested but inherited from huggingface. + The last two option are not tested but inherited from huggingface. cache_dir (:obj:`str` or :obj:`os.PathLike`, *optional*): Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used. @@ -163,9 +161,9 @@ class AutoDeltaConfig: The values in kwargs of any keys which are configuration attributes will be used to override the loaded values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the ``return_unused_kwargs`` keyword parameter. - + Examples: - + .. 
code-block:: python from transformers import AutoConfig @@ -173,25 +171,24 @@ class AutoDeltaConfig: """ - kwargs["name_or_path"] = finetuned_model_name_or_path - config_dict, _ = BaseDeltaConfig.get_config_dict(finetuned_model_name_or_path, **kwargs) + config_dict, kwargs = BaseDeltaConfig.get_config_dict(finetuned_delta_path, **kwargs) if "delta_type" in config_dict: config_class = LAZY_CONFIG_MAPPING[config_dict["delta_type"]] return config_class.from_dict(config_dict, **kwargs) else: # Fallback: use pattern matching on the string. for pattern, config_class in LAZY_CONFIG_MAPPING.items(): - if pattern in str(finetuned_model_name_or_path): + if pattern in str(finetuned_delta_path): return config_class.from_dict(config_dict, **kwargs) raise ValueError( - f"Unrecognized model in {finetuned_model_name_or_path}. " + f"Unrecognized model in {finetuned_delta_path}. " f"Should have a `delta_type` key in the loaded config, or contain one of the following strings " f"in its name: {', '.join(LAZY_CONFIG_MAPPING.keys())}" ) -### AutoModels below +### AutoModels below class _LazyAutoMapping(OrderedDict): """ @@ -323,20 +320,20 @@ class AutoDeltaModel: f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " f"`{self.__class__.__name__}.from_config(config)` methods." ) - + @classmethod def from_config(cls, config, backbone_model, **kwargs): #-> "DeltaBase": r"""Automatically instantiates a delta model based on the :obj:`config`. The delta model correspond to the delta - :obj:`config` will be loaded and initialized using the arguments in :obj:`config`. + :obj:`config` will be loaded and initialized using the arguments in :obj:`config`. .. note:: - Only using :meth:`from_config` method will not load the finetuned weight file (e.g., pytorch_model.bin). - Please use from_finetuned directly. + Only using :meth:`from_config` method will not load the finetuned weight file (e.g., pytorch_model.bin). + Please use from_finetuned directly. Args: config (:obj:`BaseDeltaConfig`): backbone_model (:obj:`nn.Module`): - + Examples: .. code-block:: python @@ -355,53 +352,54 @@ class AutoDeltaModel: ) @classmethod - def from_finetuned(cls, finetuned_model_name_or_path, backbone_model, *model_args, **kwargs): - r""" Automatically instantiated a delta model and load the finetuned checkpoints based on the - :obj:`finetuned_model_name_or_path`, which can either be a string pointing to a local path or a url pointint to - the delta hub. It will check the hash after loading the delta model to see whether the correct backbone and - delta checkpoint are used. + def from_finetuned(cls, finetuned_delta_path, backbone_model, *model_args, **kwargs): + r""" Automatically instantiated a delta model and load the finetuned checkpoints based on the + :obj:`finetuned_delta_path`, which can either be a string pointing to a local path or a url pointint to + the delta hub. It will check the hash after loading the delta model to see whether the correct backbone and + delta checkpoint are used. Args: - finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, *optional*): + finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*): Can be either: - A string, the *model id* of a finetuned delta model configuration hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``. 
- A path to a *directory* containing a configuration file saved using the - :py:meth:`DeltaBase.save_finetuned` method, + :py:meth:`DeltaBase.save_finetuned` method, e.g., ``./my_model_directory/``. - A path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``. - The last two option are not tested but inherited from huggingface. + The last two option are not tested but inherited from huggingface. backbone_model (:obj:`nn.Module`): The backbone model to be modified. model_args: Other argument for initialize the model. + kwargs: Other kwargs that will be passed into DeltaBase.from_finetuned. Example: - + .. code-block:: python delta_model = AutoDeltaModel.from_finetuned("DeltaHub/lora_t5-base-mrpc", backbone_model) """ - config = kwargs.pop("config", None) + delta_config = kwargs.pop("delta_config", None) - if not isinstance(config, BaseDeltaConfig): - config, kwargs = AutoDeltaConfig.from_finetuned( - finetuned_model_name_or_path, return_unused_kwargs=True, **kwargs + if not isinstance(delta_config, BaseDeltaConfig): + delta_config, kwargs = AutoDeltaConfig.from_finetuned( + finetuned_delta_path, return_unused_kwargs=True, **kwargs ) - if type(config) in cls._delta_model_mapping.keys(): - model_class = cls._delta_model_mapping[type(config)] - return model_class.from_finetuned(finetuned_model_name_or_path, backbone_model, *model_args, **kwargs) + if type(delta_config) in cls._delta_model_mapping.keys(): + model_class = cls._delta_model_mapping[type(delta_config)] + return model_class.from_finetuned(finetuned_delta_path, backbone_model, *model_args, delta_config=delta_config, **kwargs) raise ValueError( f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." ) - - + + if __name__ == "__main__": diff --git a/opendelta/basemodel.py b/opendelta/basemodel.py index 0f6cbc6..1bb7f2b 100644 --- a/opendelta/basemodel.py +++ b/opendelta/basemodel.py @@ -671,21 +671,46 @@ class DeltaBase(nn.Module, SaveLoadMixin): if visualization: from opendelta import Visualization Visualization(module).structure_graph() + + self.get_statistics(module) if trainable_ratio: - n_trainable = self.num_trainable_parameters(module) - n_total = self.num_total_parameters(module) - logger.info("Trainable Ratio: {:2f}%".format(n_trainable/n_total*100)) + logger.info("Trainable Ratio: {:2f}%".format(self.stat['trainable_ratio']*100)) if delta_ratio: - n_delta = self.num_delta_parameters(module) - n_total = self.num_total_parameters(module) - logger.info("Delta Parameter Ratio: {:2f}%".format(n_delta/n_total*100)) + logger.info("Delta Parameter Ratio: {:2f}%".format(self.stat['delta_ratio']*100)) if cuda_memory: - cudamem = 0 - maxcudamem = 0 - for device_id in range(torch.cuda.device_count()): - cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3 - maxcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3 - logger.info("Static Memory {:.2f} GB, Max Memory {:.2f} GB".format(cudamem, maxcudamem)) + logger.info("Static Memory {:.2f} GB, Max Memory {:.2f} GB".format(self.stat['cudamem'], self.stat['maxcudamem'])) + + + def get_statistics(self, module=None): + r"""Get the statistics of the parameters in the delta modules. + + Args: + module (:obj:`nn.Module`, *optional*): The module to compute the statistics. + + Returns: + :obj:`dict`: The statistics of the parameters in the delta modules. 
+ + """ + if module is None: + module = self.backbone_model + + self.stat = {} + n_trainable = self.num_trainable_parameters(module) + n_total = self.num_total_parameters(module) + + self.stat['trainable_ratio'] = n_trainable/n_total + + n_delta = self.num_delta_parameters(module) + n_total = self.num_total_parameters(module) + self.stat['delta_ratio'] = n_delta/n_total + + cudamem = 0 + maxcudamem = 0 + for device_id in range(torch.cuda.device_count()): + cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3 + maxcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3 + self.stat['cudamem'] = cudamem + self.stat['maxcudamem'] = maxcudamem diff --git a/opendelta/delta_configs.py b/opendelta/delta_configs.py index 5e789ef..b84644b 100644 --- a/opendelta/delta_configs.py +++ b/opendelta/delta_configs.py @@ -5,15 +5,6 @@ from opendelta import __version__ as opendelta_version from opendelta.utils import logging from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func import transformers -from transformers.file_utils import ( - PushToHubMixin, - is_offline_mode, - cached_path, - is_remote_url, - get_list_of_files, - hf_bucket_url, -) -from packaging import version import json import copy @@ -26,7 +17,7 @@ logger = logging.get_logger(__name__) FULL_CONFIGURATION_FILE = "config.json" _re_configuration_file = re.compile(r"config\.(.*)\.json") -class BaseDeltaConfig(PushToHubMixin): +class BaseDeltaConfig: r"""Base class for all configuration classes. Handles a few parameters common to all delta models' configurations as well as methods for loading/downloading/saving configurations. @@ -108,7 +99,7 @@ class BaseDeltaConfig(PushToHubMixin): @classmethod - def from_finetuned(cls, finetuned_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "BaseDeltaConfig": + def from_finetuned(cls, finetuned_delta_path: Union[str, os.PathLike], **kwargs) -> "BaseDeltaConfig": r""" Instantiate a :obj:`BaseDeltaConfig` (or a derived class) from a finetined delta module configuration. 
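The `log()` refactor above routes all bookkeeping through the new `get_statistics()` helper, which caches its numbers in `self.stat` (the same cache the DeltaCenter upload path reads later in this patch). A minimal sketch of consuming that cache, assuming a LoRA delta on a locally available `t5-base`; the model and delta choices are illustrative, not prescribed by the patch:

```python
from transformers import AutoModelForSeq2SeqLM
from opendelta import AutoDeltaConfig, AutoDeltaModel

backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
delta_config = AutoDeltaConfig.from_dict({"delta_type": "lora"})   # default LoRA config
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=backbone)

# log() now fills delta_model.stat before printing, so the same numbers can be
# reused programmatically instead of being recomputed from the module tree.
delta_model.log(trainable_ratio=True, delta_ratio=True, cuda_memory=True)
print(delta_model.stat["trainable_ratio"], delta_model.stat["delta_ratio"])
print(delta_model.stat["cudamem"], delta_model.stat["maxcudamem"])  # both in GB
```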
@@ -132,7 +123,7 @@ class BaseDeltaConfig(PushToHubMixin): delta_config = LoraConfig.from_finetuned("DeltaHub/lora_t5-base_mrpc") """ - config_dict, kwargs = cls.get_config_dict(finetuned_model_name_or_path, **kwargs) + config_dict, kwargs = cls.get_config_dict(finetuned_delta_path, **kwargs) if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: logger.warn( f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " @@ -202,6 +193,7 @@ class BaseDeltaConfig(PushToHubMixin): config_dict.pop(config_key) unused_config_keys.append(config_key) logger.warning(f"The following keys are not used by {cls}.__init__ function: {unused_config_keys}") + config = cls(**config_dict) @@ -215,7 +207,7 @@ class BaseDeltaConfig(PushToHubMixin): to_remove.append(key) for key in to_remove: kwargs.pop(key, None) - logger.info(f"Model config {config}") + logger.info(f"Model config\n{config}") if return_unused_kwargs: return config, kwargs @@ -224,96 +216,58 @@ class BaseDeltaConfig(PushToHubMixin): @classmethod def get_config_dict( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + cls, finetuned_delta_path: Union[str, os.PathLike], **kwargs ) -> Tuple[Dict[str, Any], Dict[str, Any]]: """[NODOC] - From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a + From a ``finetuned_delta_path``, resolve to a dictionary of parameters, to be used for instantiating a [``PretrainedConfig``] using ``from_dict``. Parameters: - pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): + finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`): The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. Returns: :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object. 
""" - cache_dir = kwargs.pop("cache_dir", None) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - use_auth_token = kwargs.pop("use_auth_token", None) - local_files_only = kwargs.pop("local_files_only", False) - revision = kwargs.pop("revision", None) + cache_dir = kwargs.get("cache_dir", None) + force_download = kwargs.get("force_download", False) + # resume_download = kwargs.pop("resume_download", False) + # proxies = kwargs.pop("proxies", None) + # use_auth_token = kwargs.pop("use_auth_token", None) + local_files_only = kwargs.get("local_files_only", False) + # revision = kwargs.pop("revision", None) # from_pipeline = kwargs.pop("_from_pipeline", None) - from_auto_class = kwargs.pop("_from_auto", False) + # from_auto_class = kwargs.pop("_from_auto", False) - user_agent = {"file_type": "config", "from_auto_class": from_auto_class} + # user_agent = {"file_type": "config", "from_auto_class": from_auto_class} # if from_pipeline is not None: # user_agent["using_pipeline"] = from_pipeline - if is_offline_mode() and not local_files_only: - logger.info("Offline mode: forcing local_files_only=True") + if os.environ.get("DELTACENTER_OFFLINE", '0') == '1': + logger.info("Delta Center offline mode!") local_files_only = True - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): - config_file = pretrained_model_name_or_path + finetuned_delta_path = str(finetuned_delta_path) + + if cache_dir is not None: + cached_finetuned_delta_path = os.path.join(cache_dir, finetuned_delta_path) else: - configuration_file = get_configuration_file( - pretrained_model_name_or_path, - revision=revision, - use_auth_token=use_auth_token, - local_files_only=local_files_only, - ) + cached_finetuned_delta_path = finetuned_delta_path + if os.path.isfile(cached_finetuned_delta_path): + local_files_only = True + elif os.path.isdir(cached_finetuned_delta_path): + # cached_finetuned_delta_path = os.path.join(cached_finetuned_delta_path, 'config.json') + local_files_only = True - if os.path.isdir(pretrained_model_name_or_path): - config_file = os.path.join(pretrained_model_name_or_path, configuration_file) - else: - config_file = hf_bucket_url( - pretrained_model_name_or_path, filename=configuration_file, revision=revision, mirror=None - ) - - try: - # Load from URL or cache if already cached - resolved_config_file = cached_path( - config_file, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - ) - # Load config dict - config_dict = cls._dict_from_json_file(resolved_config_file) - - except EnvironmentError as err: - logger.error(err) - msg = ( - f"Can't load config for '{pretrained_model_name_or_path}'. 
Make sure that:\n\n" - f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n" - f" (make sure '{pretrained_model_name_or_path}' is not a path to a local directory with something else, in that case)\n\n" - f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n" - ) - - if revision is not None: - msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" - - raise EnvironmentError(msg) - - except (json.JSONDecodeError, UnicodeDecodeError): - msg = ( - f"Couldn't reach server at '{config_file}' to download configuration file or " - "configuration file is not a valid JSON file. " - f"Please check network or file content here: {resolved_config_file}." - ) - raise EnvironmentError(msg) - - if resolved_config_file == config_file: - logger.info(f"loading configuration file {config_file}") - else: - logger.info(f"loading configuration file {config_file} from cache at {resolved_config_file}") + # if local_files_only: + # config_dict = cls._dict_from_json_file(cached_finetuned_delta_path) + if not local_files_only or force_download: + from .utils.delta_center import download as dcdownload + # try to download from DeltaCenter + cached_finetuned_delta_path = dcdownload(finetuned_delta_path, force_download=force_download, cache_dir=cache_dir) + kwargs['force_download'] = False # Has been downloaded, not more forcing + cached_finetuned_delta_path = os.path.join(cached_finetuned_delta_path, 'config.json') + config_dict = cls._dict_from_json_file(cached_finetuned_delta_path) return config_dict, kwargs @classmethod @@ -427,53 +381,6 @@ class BaseDeltaConfig(PushToHubMixin): -def get_configuration_file( - path_or_repo: Union[str, os.PathLike], - revision: Optional[str] = None, - use_auth_token: Optional[Union[bool, str]] = None, - local_files_only: bool = False, -) -> str: - """ - Get the configuration file to use for this version of transformers. - Args: - path_or_repo (`:obj:str` or `:obj:os.PathLike`): - Can be either the id of a repo on huggingface.co or a path to a *directory*. - revision(`:obj:str`, *optional*, defaults to ``"main"``): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any - identifier allowed by git. - use_auth_token (:obj:`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token generated - when running ``transformers-cli login`` (stored in ``~/.huggingface``). - local_files_only (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to only rely on local files and not to attempt to download any files. - Returns: - :obj:`str`: The configuration file to use. - """ - # Inspect all files from the repo/folder. 
- all_files = get_list_of_files( - path_or_repo, revision=revision, use_auth_token=use_auth_token, local_files_only=local_files_only - ) - configuration_files_map = {} - for file_name in all_files: - search = _re_configuration_file.search(file_name) - if search is not None: - v = search.groups()[0] - configuration_files_map[v] = os.path.split(file_name)[-1] - available_versions = sorted(configuration_files_map.keys()) - # Defaults to FULL_CONFIGURATION_FILE and then try to look at some newer versions. - configuration_file = FULL_CONFIGURATION_FILE - # transformers_version_ = version.parse(transformers_version) - for v in available_versions: - # if version.parse(v) <= transformers_version_: - configuration_file = configuration_files_map[v] - # else: - # # No point going further since the versions are sorted. - # break - - return configuration_file - - if __name__ == "__main__": myconfig = BaseDeltaConfig.from_pretrained("../ckpts/lora/") myconfig.save_pretrained("../ckpts/lora.1/") diff --git a/opendelta/delta_models/adapter.py b/opendelta/delta_models/adapter.py index a1b821f..e706026 100644 --- a/opendelta/delta_models/adapter.py +++ b/opendelta/delta_models/adapter.py @@ -11,6 +11,8 @@ from opendelta import BaseDeltaConfig import opendelta.utils.logging as logging import numpy as np from opendelta import global_setting +from dataclasses import dataclass, field + logger = logging.get_logger(__name__) @@ -20,10 +22,18 @@ class InterFaceMixin: self._reverse_axis_order = np.argsort(self._axis_order).tolist() def _transpose(self, tensor): - return tensor.permute(*self._axis_order) + if tensor.dim() == 3: + return tensor.permute(*self._axis_order) + else: + return tensor + + def _reverse_transpose(self, tensor): - return tensor.permute(*self._reverse_axis_order).contiguous() + if tensor.dim() == 3: + return tensor.permute(*self._reverse_axis_order).contiguous() + else: + return tensor def _convert_data_type(self, tensor): self._data_type_record = tensor.dtype @@ -35,6 +45,8 @@ class InterFaceMixin: + + class AdapterLayer(nn.Module, InterFaceMixin): r"""A layer of adapter tuning module. 
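The `InterFaceMixin` change above makes both transpose helpers no-ops for anything that is not a 3-D activation, instead of always permuting. A standalone sketch of that guard; the axis order below is illustrative only, since the real order comes from the mixin's configuration, which this hunk does not show:

```python
import torch

axis_order = [0, 2, 1]  # illustrative only

def transpose(tensor):
    # Mirrors the new guard: only 3-D tensors are permuted; 2-D (or 4-D)
    # outputs are passed through unchanged rather than raising.
    if tensor.dim() == 3:
        return tensor.permute(*axis_order)
    return tensor

print(transpose(torch.randn(2, 5, 8)).shape)  # torch.Size([2, 8, 5])
print(transpose(torch.randn(2, 8)).shape)     # torch.Size([2, 8])
```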
""" @@ -139,7 +151,7 @@ class AdapterConfig(BaseDeltaConfig): self, bottleneck_dim: Optional[int]=24, non_linearity: Optional[str]='gelu_new', - sequential: Optional[str] = True, + sequential: Optional[bool] = True, **kwargs ): super().__init__(**kwargs) diff --git a/opendelta/delta_models/lora.py b/opendelta/delta_models/lora.py index 38c629a..0b285dc 100644 --- a/opendelta/delta_models/lora.py +++ b/opendelta/delta_models/lora.py @@ -3,10 +3,10 @@ from typing import Optional, Union from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func from opendelta.utils.name_based_addressing import * from opendelta.basemodel import DeltaBase -from transformers.models.t5 import T5ForConditionalGeneration import torch.nn as nn from opendelta import BaseDeltaConfig import math +from dataclasses import dataclass, field class LowRankLinear(nn.Module): # ------------------------------------------------------------------------------------------ @@ -40,6 +40,11 @@ class LowRankLinear(nn.Module): def forward(self, x): return (self.lora_dropout(x) @ self.lora_A.T @ self.lora_B.T) * self.scaling +@dataclass +class LoraArguments: + r: int = 8 + lora_alpha: int = 16 + lora_dropout: float = 0.0 class LoraConfig(BaseDeltaConfig): r""" diff --git a/opendelta/utils/delta_center.py b/opendelta/utils/delta_center.py new file mode 100644 index 0000000..9bf185d --- /dev/null +++ b/opendelta/utils/delta_center.py @@ -0,0 +1,10 @@ +from DeltaCenter import OssClient +from .file_utils import default_cache_path + + +def download(finetuned_delta_path, cache_dir=None, force_download=False): + if cache_dir is None: + cache_dir = default_cache_path + path_to_unzip_file = OssClient.download(finetuned_delta_path, dest=cache_dir, force_download=force_download) + return path_to_unzip_file + diff --git a/opendelta/utils/delta_hub.py b/opendelta/utils/delta_hub.py index d0da33e..504fc54 100644 --- a/opendelta/utils/delta_hub.py +++ b/opendelta/utils/delta_hub.py @@ -4,6 +4,8 @@ def create_hub_repo_name(root = "DeltaHub", dataset = None, delta_type = None, model_name_or_path = None, + center_value_only_tags = None, + center_key_value_tags = None ): r"""Currently, it's only a simple concatenation of the arguments. 
""" @@ -14,6 +16,9 @@ def create_hub_repo_name(root = "DeltaHub", repo_name.append(f"{model_name_or_path}") repo_name.append(f"{dataset}") + repo_name.extend(list(center_value_only_tags) if center_value_only_tags else [None]) + repo_name.extend([f"{k}-{v}" for k,v in center_key_value_tags.items()] if center_key_value_tags else [None]) + repo_name = "_".join(repo_name) repo_name = root+"/"+repo_name diff --git a/opendelta/utils/file_utils.py b/opendelta/utils/file_utils.py new file mode 100644 index 0000000..2e82768 --- /dev/null +++ b/opendelta/utils/file_utils.py @@ -0,0 +1,3 @@ +import os +default_cache_path = "{}/.cache/delta_center/".format(os.path.expanduser('~')) +WEIGHTS_NAME = 'pytorch_model.bin' \ No newline at end of file diff --git a/opendelta/utils/saving_loading_utils.py b/opendelta/utils/saving_loading_utils.py index eaeac58..d8633b5 100644 --- a/opendelta/utils/saving_loading_utils.py +++ b/opendelta/utils/saving_loading_utils.py @@ -1,27 +1,93 @@ -from io import RawIOBase -from tarfile import HeaderError -from typing import Union, Optional, Callable +from typing import Dict, List, Union, Optional, Callable from opendelta.delta_configs import BaseDeltaConfig -from opendelta.utils.model_md5 import gen_model_hash +from opendelta.utils.model_md5 import gen_model_hash, gen_parameter_hash import torch import os from opendelta import logging import torch.nn as nn -from transformers.file_utils import ( - WEIGHTS_NAME, - PushToHubMixin, - is_offline_mode, - is_remote_url, - hf_bucket_url, - cached_path, - ) -from transformers.utils.dummy_pt_objects import PreTrainedModel -import hashlib +from DeltaCenter import OssClient +import yaml +from dataclasses import dataclass, field, fields +import datetime +from .file_utils import WEIGHTS_NAME logger = logging.get_logger(__name__) -class SaveLoadMixin(PushToHubMixin): + + +alternative_names = { + "train_tasks": ["train_tasks", "train_task", "task_name"], +} + + +@dataclass +class DeltaCenterArguments: + """ + The arguments that are used to distinguish between different delta models on the DeltaCenter + """ + name: str = field(default="", + metadata={"help": "The name of the delta model checkpoint"} + ) + backbone_model: str = field(default="", + metadata={"help": "The backbone model of the delta model"} + ) + backbone_model_path_public: str = field( + default = None, + metadata={"help": "Publicly available path (url) to pretrained model or model identifier from huggingface.co/models"} + ) + delta_type: str = field( + default=None, + metadata={"help": "the type of type model, e.g., adapter, lora, etc."} + ) + train_tasks: Optional[Union[List[str], str]]= field( + default=None, + metadata={"help": "the task(s) that the delta is trained on"} + ) + train_datasets: Optional[Union[List[str], str]]= field( + default=None, + metadata={"help": "the datasets(s) that the delta is trained on"} + ) + checkpoint_size: Optional[float] = field( + default=None, + metadata={"help": "the size of the checkpoint, in MB"} + ) + test_tasks: Optional[Union[List[str], str]] = field( + default=None, + metadata={"help": "the task(s) that the delta is tested on"} + ) + test_datasets: Optional[Union[List[str], str]] = field( + default=None, + metadata={"help": "the dataset(s) that the delta is tested on"} + ) + test_performance: Optional[float] = field( + default=None, + metadata={"help": "the performance of the model on the test set"} + ) + test_metrics: Optional[str] = field( + default=None, + metadata={"help": "the metrics used by the model"} + ) + trainable_ratio: 
Optional[float] = field( + default=None, + metadata={"help": "the ratio of trainable parameters in the model"} + ) + delta_ratio: Optional[float] = field( + default=None, + metadata={"help": "the ratio of delta parameters in the model"} + ) + usage: Optional[str] = field( + default="", + metadata={"help": "the usage code of the model"} + ) + license: Optional[str] = field( + default="apache-2.0", + metadata={"help": "the license of the model"} + ) + + + +class SaveLoadMixin: def add_configs_when_saving(self,): self.config.backbone_class = self.backbone_model.__class__.__name__ self.config.backbone_checkpoint_name = os.path.split(self.backbone_model.config._name_or_path.strip("/"))[-1] @@ -32,365 +98,319 @@ class SaveLoadMixin(PushToHubMixin): def save_finetuned( self, - save_directory: Optional[Union[str, os.PathLike]] = "./output/", + finetuned_delta_path: Optional[Union[str, os.PathLike]] = "./delta_checkpoints/", save_config: bool = True, state_dict: Optional[dict] = None, save_function: Callable = torch.save, - push_to_hub: bool = False, - **kwargs, + push_to_dc: bool = True, + center_args: Optional[Union[DeltaCenterArguments, dict]] = dict(), + center_args_pool: Optional[dict] = dict(), + list_tags: Optional[List] = list(), + dict_tags: Optional[Dict] = dict(), + delay_push: bool = False, + test_result = None, + usage: Optional[str] = "", ): r""" Save a model and its configuration file to a directory, so that it can be re-loaded using the - :py:meth:`~DeltaBase.from_finetuned` class method. + :py:meth:`~DeltaBase.save_finetuned` class method. Arguments: - save_directory (:obj:`str` or :obj:`os.PathLike`): - Directory to which to save. Will be created if it doesn't exist. - save_config (:obj:`bool`, *optional*, defaults to :obj:`True`): - Whether or not to save the config of the model. Useful when in distributed training like TPUs and need - to call this function on all processes. In this case, set ``save_config=True`` only on the main process - to avoid race conditions. - state_dict (nested dictionary of :obj:`torch.Tensor`): - The state dictionary of the model to save. Will default to ``self.state_dict()``, but can be used to only - save parts of the model or if special precautions need to be taken when recovering the state dictionary - of a model (like when using model parallelism). - save_function (:obj:`Callable`): - The function to use to save the state dictionary. Useful on distributed training like TPUs when one - need to replace ``torch.save`` by another method. - push_to_hub (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to push your model to the HuggingFace model hub after saving it. - - .. tip:: - - Using ``push_to_hub=True`` will synchronize the repository you are pushing to with ``save_directory``, - which requires ``save_directory`` to be a local clone of the repo you are pushing to if it's an existing - folder. Pass along ``temp_dir=True`` to use a temporary directory instead. - - kwargs: - Additional key word arguments passed along to the :py:meth:`~file_utils.PushToHubMixin.push_to_hub` method. - - .. note:: - - You may need to install git-lfs on your machine. - - .. code-block:: bash - - wget -P ~ https://github.com/git-lfs/git-lfs/releases/download/v3.0.2/git-lfs-linux-amd64-v3.0.2.tar.gz - cd ~ - tar -xvzf git-lfs-linux-amd64-v3.0.2.tar.gz - export PATH=~:$PATH - git-lfs install + finetuned_delta_path: (optional) path to the directory where the model and its configuration file will be saved. 
+ If not specified, the model will be saved in the directory ``./delta_checkpoints/``, + which is a subdirectory of the current working directory. + save_config: (optional) if ``True``, the configuration file will be saved in the same directory as the + model file. if ``False``, only the state dict will be saved. + state_dict: (optional) a dictionary containing the model's state_dict. If not specified, the + state_dict is loaded from the backbone model's trainable parameters. + save_function: (optional) the function used to save the model. Defaults to ``torch.save``. + state_dict_only: (optional) if ``True``, only the state_dict will be saved. + push_to_dc: (optional) if ``True``, the model will prepare things to pushed to the DeltaCenter. + This includes: + - creating a configuration file for the model + - creating a directory for the model + - saving the model's trainable parameters + - pushing the model to the DeltaCenter + center_args: (optional) the arguments that are used to distinguish between different delta models on the DeltaCenter + center_args_pool: (optional) a dictionary containing the arguments that are used to distinguish between different delta models on the DeltaCenter + list_tags: (optional) a list of tags that will be added to the model's configuration file + dict_tags: (optional) a dictionary of tags that will be added to the model's configuration file + delay_push: (optional) if ``True``, the model will not be pushed to the DeltaCenter. This is useful if you want to + push the model later. """ + + # create the config to save, including model hash, etc. + if save_config: + if not hasattr(self, "config"): + self.create_config_from_model() + self.add_configs_when_saving() + + if push_to_dc: + final_center_args = self.create_delta_center_args(center_args=center_args, + center_args_pool=center_args_pool) + + save_directory = finetuned_delta_path if os.path.isfile(save_directory): logger.error(f"Provided path ({save_directory}) should be a directory, not a file") return - if push_to_hub: - commit_message = kwargs.pop("commit_message", None) - repo = self._create_or_get_repo(save_directory, **kwargs) - os.makedirs(save_directory, exist_ok=True) - # Only save the model itself if we are using distributed training - + if push_to_dc: + save_directory = os.path.join(save_directory, final_center_args.name) + os.makedirs(save_directory, exist_ok=True) + model_to_save = self.backbone_model# unwrap_model(self) # Save the model if state_dict is None: state_dict = model_to_save.state_dict() - - # Save the config - if save_config: - if not hasattr(self, "config"): - self.create_config_from_model() - self.add_configs_when_saving() - self.config.save_finetuned(save_directory) - # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(save_directory, WEIGHTS_NAME) save_function(state_dict, output_model_file) - logger.info(f"Model weights saved in {output_model_file}") + # Save the config + if save_config: + self.config.save_finetuned(save_directory) - if push_to_hub: - url = self._push_to_hub(repo, commit_message=commit_message) - logger.info(f"Model pushed to the hub in this commit: {url}") + + logger.info("\n"+"*"*30+f"\nYou delta models has been saved locally to:\n\t{os.path.abspath(save_directory)}" + ) + + state_dict_total_params = sum(p.numel() for p in state_dict.values()) + other_tags={} + other_tags.update({'state_dict_total_params(M)':state_dict_total_params/1024/1024}) + other_tags.update({'test_result':test_result}) + if 
push_to_dc: + logger.info("Creating yaml file for delta center") + self.create_yml(save_directory, final_center_args, list_tags, dict_tags, other_tags) + + if not delay_push: + OssClient.upload(base_dir=save_directory) + else: + logger.info(f"Delay push: you can push it to the delta center later using \n\tpython -m DeltaCenter upload {os.path.abspath(save_directory)}\n" + +"*"*30) + + + + + def create_yml(self, save_dir, config, list_tags=list(), dict_tags=dict(),other_tags=None): + f = open("{}/config.yml".format(save_dir), 'w') + config_dict = vars(config) + config_dict['dict_tags'] = dict_tags + config_dict['list_tags'] = list_tags + if other_tags is not None: + config_dict.update(other_tags) + yaml.safe_dump(config_dict, f) + f.close() + + def load_checkpoint(self, path, load_func=torch.load, backbone_model=None): + r"""Simple method for loading only the checkpoint + """ + if backbone_model is None: + backbone_model = self.backbone_model + self.backbone_model.load_state_dict(load_func(f"{path}/{WEIGHTS_NAME}"), strict=False) + + def save_checkpoint(self, path, save_func=torch.save, backbone_model=None): + r"""Simple method for saving only the checkpoint""" + if backbone_model is None: + backbone_model = self.backbone_model + save_func(backbone_model.state_dict(), f"{path}/{WEIGHTS_NAME}") @classmethod - def from_finetuned(cls, - finetuned_model_name_or_path: Optional[Union[str, os.PathLike]], - backbone_model: nn.Module, - *model_args, - check_hash: Optional[bool] = True, - **kwargs): + def from_finetuned(cls, + finetuned_delta_path: Optional[Union[str, os.PathLike]], + backbone_model: nn.Module, + delta_config = None, + cache_dir: Optional[Union[str, os.PathLike]] = None, + state_dict: Optional[dict] = None, + *model_args, + force_download: Optional[bool] = False, + check_hash: Optional[bool] = True, + local_files_only: Optional[bool] = False, + **kwargs): r""" Instantiate a finetuned delta model from a path. - The backbone_model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). + The backbone_model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). To further train the model, you can use the :meth:`freeze_module ` method. Parameters: - - finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, *optional*): - Can be either: - - - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. - Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a - user or organization name, like ``dbmdz/bert-base-german-cased``. - - A path to a *directory* containing model weights saved using - :meth:`SaveLoadMixin.save_finetuned`, e.g., ``./my_model_directory/``. - - A path or url to a *tensorflow index checkpoint file* (e.g, ``./tf_model/model.ckpt.index``). In - this case, ``from_tf`` should be set to ``True`` and a configuration object should be provided as - ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a - PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g, - ``./flax_model/`` containing ``flax_model.msgpack``). In this case, ``from_flax`` should be set to - ``True``. - - ``None`` if you are both providing the configuration and state dictionary (resp. with keyword - arguments ``config`` and ``state_dict``). 
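The rewritten `save_finetuned` above replaces the Hugging Face `push_to_hub` path with a DeltaCenter layout: the weights and `config.json` go under `finetuned_delta_path/<name>`, a `config.yml` built from `DeltaCenterArguments` and tags is written, and the actual upload can be deferred. A hedged sketch of a typical call, assuming `delta_model` is an already trained delta model; every value below is illustrative:

```python
delta_model.log()  # optional: fills delta_model.stat, which the DeltaCenter
                   # metadata falls back to for trainable_ratio / delta_ratio

delta_model.save_finetuned(
    finetuned_delta_path="./delta_checkpoints/",
    push_to_dc=True,                 # build the DeltaCenter directory + config.yml
    delay_push=True,                 # skip OssClient.upload for now
    center_args={"name": "lora_t5-base_mrpc_demo",   # illustrative metadata
                 "delta_type": "lora",
                 "backbone_model": "t5-base"},
    list_tags=["demo"],              # free-form tags recorded in config.yml
    dict_tags={"dataset": "mrpc"},
)
# With delay_push=True the checkpoint stays local; it can be uploaded later with
#   python -m DeltaCenter upload ./delta_checkpoints/lora_t5-base_mrpc_demo
```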
- backbone_model (:obj:`torch.nn.Module`): The backbone model to be modified. - model_args (sequence of positional arguments, *optional*): - All remaining positional arguments will be passed to the underlying model's ``__init__`` method. - config (Union[:obj:`BaseDeltaConfig`, :obj:`str`, :obj:`os.PathLike`], *optional*): Can be either: - - an instance of a class derived from :class:`~PretrainedConfig`, - - a string or path valid as input to :py:meth:`~PretrainedConfig.from_pretrained`. - - Configuration for the model to use instead of an automatically loaded configuration. Configuration can - be automatically loaded when: - - - The model is a model provided by the library (loaded with the *model id* string of a pretrained - model). - - The model was saved using :py:meth:`~PreTrainedModel.save_pretrained` and is reloaded by supplying the - save directory. - - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a - configuration JSON file named *config.json* is found in the directory. - state_dict (Dict[:obj:`str`, :obj:`torch.Tensor`], *optional*): - A state dictionary to use instead of a state dictionary loaded from saved weights file. - This option can be used if you want to create a model from a pretrained configuration but load your own - weights. In this case though, you should check if using :py:meth:`~PreTrainedModel.save_pretrained` and - :py:meth:`~PreTrainedModel.from_pretrained` is not a simpler option. - cache_dir (:obj:`Union[str, os.PathLike]`, *optional*): - Path to a directory in which a downloaded pretrained model configuration should be cached if the - standard cache should not be used. - force_download (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - resume_download (:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to delete incompletely received files. Will attempt to resume the download if such a - file exists. - proxies (:obj:`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - local_files_only(:obj:`bool`, *optional*, defaults to :obj:`False`): - Whether or not to only look at local files (i.e., do not try to download the model). - use_auth_token (:obj:`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token generated - when running ``transformers-cli login`` (stored in ``~/.huggingface``). - revision(:obj:`str`, *optional*, defaults to ``"main"``): - The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a - git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any - identifier allowed by git. - mirror(:obj:`str`, *optional*): - Mirror source to accelerate downloads in China. If you are from China and have an accessibility - problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. - Please refer to the mirror site for more information. - torch_dtype (:obj:`str` or :obj:`torch.dtype`, *optional*): - Override the default :obj:`torch.dtype` and load the model under this dtype. If ``"auto"`` is passed the dtype - will be automatically derived from the model's weights. - - .. 
warning:: - - This feature is inherited from HuggingFace. We do not guarantee its usefulness currently. - One should only disable *_fast_init* to ensure backwards compatibility with `transformers.__version__ < - 4.6.0` for seeded model initialization. This argument will be removed at the next major version. See - `pull request 11471 `_ for more information. - kwargs (remaining dictionary of keyword arguments, *optional*): - Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., - ``output_attentions=True``). Behaves differently depending on whether a ``config`` is provided or - automatically loaded: - - - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the - underlying model's ``__init__`` method (we assume all relevant updates to the configuration have - already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class - initialization function (:py:meth:`~PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that - corresponds to a configuration attribute will be used to override said attribute with the - supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute - will be passed to the underlying model's ``__init__`` function. - - .. tip:: - Passing ``use_auth_token=True`` is required when you want to use a private model. - - .. code-block:: python - - from transformers import AutoModelForSeq2SeqLM - t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-base") - from opendelta import AutoDeltaModel - delta = AutoDeltaModel.from_finetuned("DeltaHub/lora_t5-base_mrpc", backbone_model=t5) - delta.log() - - - + finetuned_delta_path: (optional) path to the directory where the model and its configuration file will be saved. + If not specified, the model will be loaded from the directory cahce directory. (see ``cache_dir``), + backbone_model: the backbone model that will be used to instantiate the finetuned delta model. + delta_config: (optional) the configuration file of the finetuned delta model. If not specified, the configuration file + is loaded from the directory ``finetuned_delta_path``. + cache_dir: (optional) path to the directory where the model and its configuration file will be saved. + If not specified, we will first look into current working directory, then the cache directory of your system, e.g., ~/.cache/delta_center/, + state_dict: (optional) a dictionary containing the model's state_dict. If not specified, the + state_dict is loaded from the ``finetuned_delta_path``. + force_download: (optional) if ``True``, the model will be downloaded from the internet even if it is already + present in the cache directory. + check_hash: (optional) if ``True``, check whether the hash of the model once it's trained differs from what we load now. + local_files_only: (optional) if ``True``, the model will be loaded from the local cache directory. 
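For the loading side, the new `from_finetuned` resolves `finetuned_delta_path` first as a local or cached directory and otherwise fetches it from DeltaCenter via `opendelta.utils.delta_center.download` (default cache `~/.cache/delta_center/`), then attaches the delta to the backbone and optionally checks the backbone hash. A hedged sketch mirroring the parameters documented above; the checkpoint id is a placeholder and `t5-base` is assumed to be available:

```python
import os
from transformers import AutoModelForSeq2SeqLM
from opendelta import AutoDeltaModel

os.environ["DELTACENTER_OFFLINE"] = "0"   # "1" forces local-files-only loading

backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
delta_model = AutoDeltaModel.from_finetuned(
    "DeltaHub/lora_t5-base_mrpc",   # placeholder: local dir, cached dir, or DeltaCenter id
    backbone_model=backbone,
    cache_dir=None,                 # None -> ~/.cache/delta_center/ for DeltaCenter downloads
    force_download=False,
    check_hash=True,                # warn if the backbone differs from the one used at training time
)
delta_model.freeze_module()         # only needed if you plan to train it further
delta_model.log()
```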
""" - config = kwargs.pop("config", None) - state_dict = kwargs.pop("state_dict", None) - cache_dir = kwargs.pop("cache_dir", None) - - # ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) - force_download = kwargs.pop("force_download", False) - resume_download = kwargs.pop("resume_download", False) - proxies = kwargs.pop("proxies", None) - # output_loading_info = kwargs.pop("output_loading_info", False) - local_files_only = kwargs.pop("local_files_only", False) - use_auth_token = kwargs.pop("use_auth_token", None) - revision = kwargs.pop("revision", None) - mirror = kwargs.pop("mirror", None) - from_pipeline = kwargs.pop("_from_pipeline", None) - from_auto_class = kwargs.pop("_from_auto", False) - # _fast_init = kwargs.pop("_fast_init", True) - torch_dtype = kwargs.pop("torch_dtype", None) - # low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", False) - user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} - - if is_offline_mode() and not local_files_only: - logger.info("Offline mode: forcing local_files_only=True") + if os.environ.get("DELTACENTER_OFFLINE", '0') == '1': + logger.info("Delta Center offline mode!") local_files_only = True # Load config if we don't provide a configuration - if not isinstance(config, BaseDeltaConfig): - config_path = config if config is not None else finetuned_model_name_or_path - config, model_kwargs = cls.config_class.from_finetuned( - config_path, - cache_dir=cache_dir, + + + finetuned_delta_path = str(finetuned_delta_path) + + if cache_dir is not None: + cached_finetuned_delta_path = os.path.join(cache_dir, finetuned_delta_path) + else: + cached_finetuned_delta_path = finetuned_delta_path + + download_from_dc = False + if os.path.isfile(cached_finetuned_delta_path): + raise RuntimeError( + f"You should pass a directory to load a delta checkpoint instead of a file, " + f"since we need the delta's configuration file." + ) + elif os.path.isdir(cached_finetuned_delta_path): + if os.path.isfile(os.path.join(cached_finetuned_delta_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + weight_file = os.path.join(cached_finetuned_delta_path, WEIGHTS_NAME) + else: + raise EnvironmentError( + f"Error no file named {WEIGHTS_NAME} found in " + f"directory {cached_finetuned_delta_path}." 
+ ) + + else: + # try to download from DeltaCenter + from .delta_center import download as dcdownload + cached_finetuned_delta_path = dcdownload(finetuned_delta_path, cache_dir=cache_dir, force_download=force_download) + download_from_dc = True + weight_file = os.path.join(cached_finetuned_delta_path, WEIGHTS_NAME) + + if state_dict is None: + state_dict = torch.load(weight_file, map_location="cpu") + + if not isinstance(delta_config, BaseDeltaConfig): + delta_config, model_kwargs = cls.config_class.from_finetuned( + cached_finetuned_delta_path, + cache_dir=None, return_unused_kwargs=True, - force_download=force_download, - resume_download=resume_download, - proxies=proxies, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - _from_auto=from_auto_class, - _from_pipeline=from_pipeline, + local_files_only=True if download_from_dc else local_files_only, # has been downloaded **kwargs, ) else: model_kwargs = kwargs - # Load model - if finetuned_model_name_or_path is not None: - finetuned_model_name_or_path = str(finetuned_model_name_or_path) - if os.path.isdir(finetuned_model_name_or_path): - if os.path.isfile(os.path.join(finetuned_model_name_or_path, WEIGHTS_NAME)): - # Load from a PyTorch checkpoint - archive_file = os.path.join(finetuned_model_name_or_path, WEIGHTS_NAME) - else: - raise EnvironmentError( - f"Error no file named {WEIGHTS_NAME} found in " - f"directory {finetuned_model_name_or_path}." - ) - elif os.path.isfile(finetuned_model_name_or_path) or is_remote_url(finetuned_model_name_or_path): - archive_file = finetuned_model_name_or_path - else: - archive_file = hf_bucket_url( - finetuned_model_name_or_path, - filename=WEIGHTS_NAME, - revision=revision, - mirror=mirror, - ) - try: - # Load from URL or cache if already cached #TODO + # Initialize the model from config and attach the delta model to the backbone_model. + delta_model = cls.from_config(delta_config, backbone_model, *model_args, **model_kwargs, ) - resolved_archive_file = cached_path( - archive_file, - cache_dir=cache_dir, - force_download=force_download, - proxies=proxies, - resume_download=resume_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - user_agent=user_agent, - ) - except EnvironmentError as err: - logger.error(err) - msg = ( - f"Can't load weights for '{finetuned_model_name_or_path}'. Make sure that:\n\n" - ) - - if revision is not None: - msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n" - - raise EnvironmentError(msg) - - if resolved_archive_file == archive_file: - logger.info(f"loading weights file {archive_file}") - else: - logger.info(f"loading weights file {archive_file} from cache at {resolved_archive_file}") - else: - resolved_archive_file = None - - # load pt weights early so that we know which dtype to init the model under - - if state_dict is None: - try: - state_dict = torch.load(resolved_archive_file, map_location="cpu") - except Exception as e: - try: - with open(resolved_archive_file) as f: - if f.read().startswith("version"): - raise OSError( - "You seem to have cloned a repository without having git-lfs installed. Please install " - "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " - "you cloned." 
- ) - else: - raise ValueError from e - except (UnicodeDecodeError, ValueError): - raise OSError( - f"Unable to load weights from pytorch checkpoint file for '{finetuned_model_name_or_path}' " - f"at '{resolved_archive_file}'. " - "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True." - ) - - # set dtype to instantiate the model under: - # 1. If torch_dtype is not None, we use that dtype - # 2. If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first - # weights entry - we assume all weights are of the same dtype - # we also may have config.torch_dtype available, but we won't rely on it till v5 - dtype_orig = None - if torch_dtype is not None: - if isinstance(torch_dtype, str): - if torch_dtype == "auto": - torch_dtype = next(iter(state_dict.values())).dtype - else: - raise ValueError( - f"`torch_dtype` can be either a `torch.dtype` or `auto`, but received {torch_dtype}" - ) - dtype_orig = cls._set_default_torch_dtype(torch_dtype) - - - # Initialize the model from config and attach the delta model to the backbone_model. - delta_model = cls.from_config(config, backbone_model, *model_args, **model_kwargs, ) - - # load the state_dict into the backbone_model. As the delta model's parameter + # load the state_dict into the backbone_model. As the delta model's parameter # is the same object as the deltas in the backbone model with different reference name, # the state_dict will also be loaded into the delta model. delta_model._load_state_dict_into_backbone(backbone_model, state_dict) backbone_hash = gen_model_hash(backbone_model) - if check_hash and hasattr(config, "backbone_hash") and \ - config.backbone_hash is not None and \ - config.backbone_hash != backbone_hash: - logger.warning("The config has an hash of the backbone model, and is" - "different from the hash of the loaded model. This indicates a mismatch" - "between the backbone model that the delta checkpoint is based on and" - "the one you loaded. You propobability need to Train the model instead of" - "directly inference. ") + + if check_hash: + if hasattr(delta_config, "backbone_hash") and \ + delta_config.backbone_hash is not None and \ + delta_config.backbone_hash != backbone_hash: + logger.warning("The config has an hash of the backbone model, and is" + "different from the hash of the loaded model. This indicates a mismatch" + "between the backbone model that the delta checkpoint is based on and" + "the one you loaded. You propobability need to Train the model instead of" + "directly inference. ") + else: + logger.info("Hash-check passed. You can safely use this checkpoint directly.") + else: + logger.warning("Parameters' hash has not been checked!") + # Set model in evaluation mode to deactivate DropOut modules by default backbone_model.eval() return delta_model - + + + def create_delta_center_args(self, center_args, center_args_pool): + """ + Create the delta center args for the center model. + center_args has higher priority than center_args_pool. 
+ + """ + mdict = {} + field = fields(DeltaCenterArguments) + + + for f in field: + exist = False + # first is center_args, exact match + if f.name in center_args: + mdict[f.name] = center_args[f.name] + continue + # second is center_args_pool, can use alternative names + if f.name in center_args_pool: + mdict[f.name] = center_args_pool[f.name] + exist = True + elif f.name in alternative_names: + for altername in alternative_names[f.name]: + if altername in center_args_pool: + mdict[f.name] = center_args_pool[altername] + exist = True + break + # if not exist, find from self.stat or set to default + if not exist: + if f.name in self.stat: + mdict[f.name] = self.stat[f.name] + else: + mdict[f.name] = f.default + + # if eventualy name is not set, create a default one + if mdict['name'] is None or mdict['name'] == '': + logger.info("Name is not set, use default name.") + mdict['name'] = self.create_default_name(**mdict) + + if len(mdict['usage']) == 0: + logger.info("Usage is not set, use default usage.") + mdict['usage'] = self.create_default_usage(mdict['name']) + + + center_args = DeltaCenterArguments(**mdict) + return center_args + + def create_default_usage(self, name): + usage_str = """from opendelta import AutoDeltaModel\n""" + \ + """delta_model = AutoDeltaModel.from_finetuned('{name_with_userid}', backbone_model=model)\n""" + \ + """delta_model.freeze_module() # if you are going to further train it \n""" + \ + """delta_model.log()""" + return usage_str + + def create_default_name(self, **kwargs): + r"""Currently, it's only a simple concatenation of the arguments. + """ + + reponame = "" + reponame += kwargs["backbone_model_path_public"].split("/")[-1]+"_" if kwargs['backbone_model_path_public'] is not None else kwargs['backbone_model'] + reponame += kwargs["delta_type"]+"_" if kwargs["delta_type"] is not None else "" + + # tasks + if isinstance(kwargs["train_tasks"], list): + train_tasks = "+".join(kwargs["train_tasks"]) + elif kwargs["train_tasks"] is not None: + train_tasks = kwargs["train_tasks"] + else: + logger.warning("train_tasks are not find in all arguments. Do you miss them?") + train_tasks = None + reponame += train_tasks+"_" if train_tasks is not None else "" + + # time + reponame += datetime.datetime.now().strftime("%Y%m%d%H%M%S") #+ gen_model_hash(model=self.backbone_model) + + # model hash + if hasattr(self.config, "backbone_hash"): + reponame += self.config.backbone_hash[:3] + return reponame + diff --git a/opendelta/utils/signature.py b/opendelta/utils/signature.py index b559f92..41aa95e 100644 --- a/opendelta/utils/signature.py +++ b/opendelta/utils/signature.py @@ -4,10 +4,10 @@ from collections import namedtuple def signature(f): r"""Get the function f 's input arguments. A useful gadget when some function slot might be instantiated into multiple functions. - + Args: f (:obj:`function`) : the function to get the input arguments. - + Returns: namedtuple : of args, default, varargs, keywords, respectively.s @@ -34,7 +34,7 @@ def signature(f): ] or None argspec = namedtuple('Signature', ['args', 'defaults', 'varargs', 'keywords']) - return argspec(args, defaults, varargs, keywords) + return argspec(args, defaults, varargs, keywords) def get_arg_names(f): r""" Get a functions argument name, remove the ``self`` argument @@ -45,6 +45,7 @@ def get_arg_names(f): return args + def get_arg_names_inside_func(func): r""" Get the functions argument name inside the function itself. Remove ``self`` argument. 
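`create_delta_center_args` resolves each `DeltaCenterArguments` field with a fixed priority: explicit `center_args`, then `center_args_pool` (including the `alternative_names` aliases), then the statistics cached by `get_statistics()`, and finally the dataclass default. A standalone sketch of that resolution order with illustrative values; it re-implements the loop rather than calling the method, so it runs without a delta model:

```python
from dataclasses import fields
from opendelta.utils.saving_loading_utils import DeltaCenterArguments, alternative_names

center_args = {"delta_type": "adapter"}                    # highest priority
center_args_pool = {"task_name": "mrpc",                   # alias of train_tasks
                    "delta_type": "lora"}                   # overridden by center_args
stat = {"trainable_ratio": 0.012}                          # e.g. cached by log()

resolved = {}
for f in fields(DeltaCenterArguments):
    if f.name in center_args:
        resolved[f.name] = center_args[f.name]
    elif f.name in center_args_pool:
        resolved[f.name] = center_args_pool[f.name]
    elif any(a in center_args_pool for a in alternative_names.get(f.name, [])):
        alias = next(a for a in alternative_names[f.name] if a in center_args_pool)
        resolved[f.name] = center_args_pool[alias]
    elif f.name in stat:
        resolved[f.name] = stat[f.name]
    else:
        resolved[f.name] = f.default

print(resolved["delta_type"], resolved["train_tasks"], resolved["trainable_ratio"])
# -> adapter mrpc 0.012
```

If `name` resolves to an empty string, `create_default_name` then builds one from the backbone, delta type, training tasks, a timestamp, and (when available) a prefix of the backbone hash.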
""" diff --git a/opendelta/utils/structure_mapping.py b/opendelta/utils/structure_mapping.py index 4cdc507..8772c08 100644 --- a/opendelta/utils/structure_mapping.py +++ b/opendelta/utils/structure_mapping.py @@ -3,6 +3,29 @@ import copy import opendelta.utils.logging as logging from opendelta.utils.visualization import Visualization logger = logging.get_logger(__name__) +opt_mapping = { + "model.decoder.embed_tokens": {"__name__":"embeddings"}, + "model.decoder.embed_positions": {"__name__":""}, + "model.decoder.project_out": {"__name__":""}, + "model.decoder.project_in": {"__name__":""}, + "model.decoder": {"__name__":"decoder", + "layer": {"__name__":"block", + "$": {"__name__":"$", + "self_attn": {"__name__":"attn", + "q_proj": {"__name__":"q"}, + "k_proj": {"__name__":"k"}, + "v_proj": {"__name__":"v"}, + "out_proj": {"__name__":"proj"} + }, + "self_attn_layer_norm": {"__name__":"layer_norm"}, + "fc1": {"__name__":"ff.w1"}, + "fc2": {"__name__":"ff.w2"}, + "final_layer_norm": {"__name__":"layer_norm"}, + } + } + } +} + t5_mapping = { "shared": {"__name__":"embeddings"}, "encoder": {"__name__":"encoder", @@ -24,7 +47,7 @@ t5_mapping = { } } }, - "final_layer_norm": {"__name__":"layer_norm"}, + "final_layer_norm": {"__name__":"layer_norm"}, }, "decoder": {"__name__":"decoder", "embed_tokens": {"__name__":"embeddings"}, @@ -199,8 +222,14 @@ distilbert_mapping = { } } + +MAPPINGERROR_MSG = "We haven't provide common structure mapping for this backbone model." + \ + " If it is a common enough PLM, please check whether it is wrapped by other wrapper model, e.g., XXXForSequenceClassification." +\ + "Please manually add the "+\ + "delta models by speicifying 'modified_modules' based on the visualization of model structure. Refer to `https://opendelta.readthedocs.io/en/latest/notes/faq.html` for detail." + def transform(org_key, mapping, strict=True, warning=False, verbose=False): - + chain = org_key.split(".") query = "" node = mapping @@ -215,7 +244,7 @@ def transform(org_key, mapping, strict=True, warning=False, verbose=False): if strict: if warning: print(f"'{org_key}' has no common mapping.") - return + return else: new_chain.append(query) else: @@ -226,19 +255,19 @@ def transform(org_key, mapping, strict=True, warning=False, verbose=False): new_chain.append(query) query = "" else: - query += "." + query += "." 
if query!="": if strict: if warning: print("A part of the orginial key hasn't been matched!") - return + return else: new_chain.append(query.strip(".")) # tailing query new_key = ".".join(new_chain) if verbose: print(f"{org_key} => {new_key}") return new_key - + @@ -255,7 +284,7 @@ def mapping_for_SequenceClassification(mapping, type): mapping["classifier"] = {"__name__": "classifier"} elif type == "deberta": mapping.pop("lm_predictions.lm_head") - mapping["pooler"] = {"__name__": "classifier"} + mapping["pooler"] = {"__name__": "classifier"} mapping["classifier"] = {"__name__": "classifier"} else: raise NotImplementedError @@ -265,6 +294,14 @@ def mapping_for_ConditionalGeneration(mapping, type): mapping = copy.deepcopy(mapping) if type == "t5": mapping["lm_head"] = {"__name__":"lm_head.proj"} + else: + raise NotImplementedError(MAPPINGERROR_MSG.format()) + return mapping + +def mapping_for_CausalLM(mapping, type): + mapping = copy.deepcopy(mapping) + if type == "opt": + mapping["lm_head"] = {"__name__":"lm_head.proj"} else: raise NotImplementedError return mapping @@ -273,22 +310,23 @@ class _LazyLoading(OrderedDict): def __init__(self, mapping): self._mapping_string = mapping self._mapping = {} - + def __getitem__(self, key): if key not in self._mapping_string: - raise KeyError(key) + raise KeyError(MAPPINGERROR_MSG) value = self._mapping_string[key] self._mapping[key] = eval(value) - return self._mapping[key] - + return self._mapping[key] + def keys(self): return list(self._mapping_string.keys()) - + def __contains__(self, item): return item in self._mapping_string + class CommonStructureMap(object): r""" A lazy loading structure map. """ @@ -296,9 +334,10 @@ class CommonStructureMap(object): "RobertaForSequenceClassification": """mapping_for_SequenceClassification(roberta_mapping, "roberta")""", "RobertaForMaskedLM": "roberta_mapping", "BertForMaskedLM": "bert_mapping", - "BertForSequenceClassification": """mapping_for_SequenceClassification(bert_mapping, "bert")""", "T5ForConditionalGeneration": """mapping_for_ConditionalGeneration(t5_mapping, "t5")""", - "DebertaV2ForSequenceClassification": """mapping_for_SequenceClassification(debertav2_mapping, "deberta")""" + "DebertaV2ForSequenceClassification": """mapping_for_SequenceClassification(debertav2_mapping, "deberta")""", + "CLIPModel":"""""", + "OPTForCausalLM":"""mapping_for_CausalLM(opt_mapping,"opt")""" }) SpecialModelInverseMaps = { @@ -315,8 +354,17 @@ class CommonStructureMap(object): """ backbone_class = type(backbone_model).__name__ if backbone_class not in cls.Mappings: - raise KeyError(backbone_class) - mapping = cls.Mappings[backbone_class] + raise KeyError(MAPPINGERROR_MSG) + + try: + mapping = cls.Mappings[backbone_class] + except KeyError: + logger.error(MAPPINGERROR_MSG) + exit(-1) + + + + if visualize: logger.info("Since you are using the common structure mapping, draw the transformed parameter structure for checking.") vis = Visualization(backbone_model) @@ -346,4 +394,4 @@ if __name__ == "__main__": for name, _ in plm.named_modules(): transform(name, t5_mapping, strict=True, warning=False) - + diff --git a/requirements.txt b/requirements.txt index a53c347..5439382 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,6 @@ decorator rich web.py gitpython +scipy +sklearn +delta_center_client==0.0.4 diff --git a/setup.py b/setup.py index 31e0d58..cbaef15 100644 --- a/setup.py +++ b/setup.py @@ -3,24 +3,34 @@ import setuptools import os import os -def get_requirements(path): - print("path is :", path) - ret = 
[] - with open(os.path.join(path, "requirements.txt"), encoding="utf-8") as freq: - for line in freq.readlines(): - ret.append( line.strip() ) +requires = """torch>=1.8.0 +transformers>=4.10.0 +datasets==1.17.0 +sentencepiece>=0.1.96 +tqdm>=4.62.2 +# loralib +decorator +rich +web.py +gitpython +delta_center_client==0.0.4 +""" + +def get_requirements(): + ret = [x for x in requires.split("\n") if len(x)>0] + print("requirements:", ret) return ret -path = os.path.dirname(os.path.abspath(__file__)) -requires = get_requirements(path) -print(requires) + +# path = os.path.dirname(os.path.abspath(__file__)) +# requires = get_requirements(path) with open('README.md', 'r') as f: setuptools.setup( name = 'opendelta', - version = "0.1.0", + version = "0.2.4", description = "An open source framework for delta learning (parameter efficient learning).", long_description=open("README.md", "r", encoding="utf-8").read(), long_description_content_type="text/markdown", @@ -30,10 +40,10 @@ with open('README.md', 'r') as f: url="https://github.com/thunlp/OpenDelta", keywords = ['PLM', 'Parameter-efficient-Learning', 'AI', 'NLP'], python_requires=">=3.6.0", - install_requires=requires, + install_requires=get_requirements(), package_dir={'opendelta':'opendelta'}, package_data= { - 'opendelta':["utils/interactive/templates/*.html"], + 'opendelta':["utils/interactive/templates/*.html", 'requirments.txt'], }, include_package_data=True, packages=setuptools.find_packages(),
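setup.py now inlines the dependency list instead of reading `requirements.txt` at build time. Note that the inline string still contains the commented-out `# loralib` line; the sketch below (not the project's code) parses the same string while explicitly dropping blank and comment-only lines, in case the consumer of `install_requires` does not filter them:

```python
requires = """torch>=1.8.0
transformers>=4.10.0
datasets==1.17.0
sentencepiece>=0.1.96
tqdm>=4.62.2
# loralib
decorator
rich
web.py
gitpython
delta_center_client==0.0.4
"""

def get_requirements(text: str = requires):
    # Keep only non-empty lines that are not pure comments such as "# loralib".
    lines = (line.strip() for line in text.splitlines())
    return [line for line in lines if line and not line.startswith("#")]

print(get_requirements())
```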