Merge branch 'delta_center_dev' into check_pr_33
This commit is contained in:
commit
f6f3b01916
|
@ -35,4 +35,21 @@ log.txt
|
|||
**/examples/examples_bmtrain/BMPretrain
|
||||
**/examples/examples_bmtrain/BigModels/BigModels/results
|
||||
**/Delta_Memory/
|
||||
**/output/
|
||||
**/thunlp/
|
||||
**/saved_ckpts/
|
||||
|
||||
|
||||
DeltaCenter-Python-Client/
|
||||
backbone_structure
|
||||
delta_checkpoints
|
||||
gitop.sh
|
||||
load_dataset_and_model.ipynb
|
||||
load_model.py
|
||||
scripts
|
||||
t.py
|
||||
t.sh
|
||||
!examples/examples_prompt/configs/*/*.json
|
||||
!examples/examples_prompt/configs/**
|
||||
**/delta_checkpoints/
|
||||
**/outputs/
|
||||
|
|
|
@ -72,6 +72,11 @@ python setup.py install
|
|||
python setup.py develop
|
||||
```
|
||||
|
||||
If you encounter a network error when using setup.py, please first install the dependencies via
|
||||
```shell
|
||||
pip install -r requirements.txt && python setup.py develop
|
||||
```
|
||||
|
||||
## Must Try
|
||||
|
||||
```python
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -1,24 +1,59 @@
|
|||
# !!!!This example collection is still under development, please wait for some time before using it.
|
||||
# Examples of using opendelta together with 🤗 transformers.
|
||||
|
||||
## install the repo
|
||||
In this repo, we construct a very general pipeline to train and test a PLM using
|
||||
🤗 transformers.
|
||||
|
||||
The pipeline was constructed together with [openpromptu](https://pypi.org/project/openpromptu/), which is a light and
|
||||
model-agnostic version of [openprompt](https://github.com/thunlp/OpenPrompt).
|
||||
|
||||
## Pool of PLMs
|
||||
We are going to adapt most of the models in 🤗 transformers
|
||||
in this repo. The different pipelines, processing steps, and configurations are specified
|
||||
in `./backbones/`. You can add your own model in this file to support customized models.
|
||||
|
||||
|
||||
### An example script to run the repo in offline mode
|
||||
```bash
|
||||
cd ../
|
||||
python setup_seq2seq.py develop
|
||||
conda activate [YOURENV]
|
||||
PATHBASE=[YOURPATH]
|
||||
|
||||
JOBNAME="adapter_t5-base"
|
||||
DATASET="superglue-cb"
|
||||
|
||||
cd $PATHBASE/OpenDelta/examples/examples_prompt/
|
||||
python configs/gen_t5.py --job $JOBNAME
|
||||
|
||||
export TRANSFORMERS_OFFLINE=1
|
||||
export HF_DATASETS_OFFLINE=1
|
||||
python src/run.py configs/$JOBNAME/$DATASET.json \
|
||||
--model_name_or_path [YOURPATH_TO_T5_BASE] \
|
||||
--tokenizer_name [YOURPATH_TO_T5_BASE] \
|
||||
--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \
|
||||
--finetuned_delta_path ${PATHBASE}/delta_checkpoints/ \
|
||||
--num_train_epochs 20 \
|
||||
--bottleneck_dim 24 \
|
||||
--delay_push True
|
||||
```
|
||||
This will add `examples_seq2seq` to the environment path of the python lib.
|
||||
|
||||
## Generating the json configuration file
|
||||
## An example of quickly testing the repo.
|
||||
|
||||
```shell
|
||||
python configs/gen_$BACKBONETYPE.py --job $YOURJOB
|
||||
#e.g. python configs/gen_beit.py --job lora_beit-base-patch16-224
|
||||
```
|
||||
The available job configuration (e.g., `--job lora_beit-base-patch16-224`) can be seen from the scripts. You can also
|
||||
create your own configuration.
|
||||
```bash
|
||||
conda activate [YOURENV]
|
||||
PATHBASE=[YOURPATH]
|
||||
|
||||
JOBNAME="adapter_t5-base"
|
||||
DATASET="superglue-cb"
|
||||
|
||||
## Run the code
|
||||
cd $PATHBASE/OpenDelta/examples/examples_prompt/
|
||||
|
||||
```
|
||||
CUDA_VISIBLE_DEVICES=1 python src/run.py configs/lora_beit-base-patch16-224/beans.json
|
||||
```
|
||||
export TRANSFORMERS_OFFLINE=1
|
||||
export HF_DATASETS_OFFLINE=1
|
||||
export DELTACENTER_OFFLINE=0
|
||||
python src/test.py configs/$JOBNAME/$DATASET.json \
|
||||
--model_name_or_path [YOURPATH_TO_T5_BASE] \
|
||||
--tokenizer_name [YOURPATH_TO_T5_BASE] \
|
||||
--datasets_saved_path [YOURPATH_TO_CB_DATASETS] \
|
||||
--finetuned_delta_path thunlp/t5-base_adapter_superglue-cb_20220701171436c80 \
|
||||
--delta_cache_dir "./delta_checkpoints/" \
|
||||
--force_download True
|
||||
```
|
|
@ -26,14 +26,14 @@ def preprocess_function(raw_example, **kwargs):
|
|||
example = InputExample(**raw_example)
|
||||
|
||||
|
||||
try:
|
||||
example = verbalizer.wrap_one_example(example)
|
||||
example, other = template.wrap_one_example(example)
|
||||
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
|
||||
model_inputs = tokenizer(input_sentence, max_length=256,
|
||||
padding="max_length", truncation=True)
|
||||
except:
|
||||
from IPython import embed; embed(header="Therer")
|
||||
|
||||
example = verbalizer.wrap_one_example(example)
|
||||
example, other = template.wrap_one_example(example)
|
||||
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
|
||||
model_inputs = tokenizer(input_sentence, max_length=256,
|
||||
padding="max_length", truncation=True)
|
||||
|
||||
|
||||
|
||||
with tokenizer.as_target_tokenizer():
|
||||
label = tokenizer(other['tgt_text']).input_ids
|
||||
|
@ -43,7 +43,8 @@ def preprocess_function(raw_example, **kwargs):
|
|||
|
||||
def get_backbone(model_args, **kwargs):
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
# model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
|
|
|
@ -8,7 +8,6 @@ from transformers import (
|
|||
AutoFeatureExtractor,
|
||||
AutoModelForImageClassification,
|
||||
)
|
||||
from transformers import ViTFeatureExtractor
|
||||
|
||||
from transformers import Trainer as HfTrainer
|
||||
import torch.nn as nn
|
||||
|
@ -26,9 +25,10 @@ def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
|
|||
def preprocess_function(raw_example, **kwargs):
|
||||
# from IPython import embed; embed(header="Therefa")
|
||||
tokenizer = kwargs['tokenizer']
|
||||
model_inputs = tokenizer(raw_example['image'], return_tensors='pt')
|
||||
# print(np.array(raw_example['img']).shape)
|
||||
model_inputs = tokenizer(np.array(raw_example['image']), return_tensors='pt')
|
||||
model_inputs['pixel_values'] = model_inputs['pixel_values'].squeeze()
|
||||
model_inputs['labels'] = raw_example['labels']
|
||||
model_inputs['labels'] = raw_example['label']
|
||||
return model_inputs
|
||||
|
||||
def compute_metrics(eval_preds, dataset_name, eval_metric):
|
||||
|
@ -55,7 +55,7 @@ def mask_token_func(tokenizer, ith_mask=0):
|
|||
|
||||
def get_remove_columns(dataset_features):
|
||||
# dataset_features.pop("label")
|
||||
print("remove_columns: {}".format(dataset_features))
|
||||
# print("remove_columns: {}".format(dataset_features))
|
||||
return dataset_features
|
||||
|
||||
class DataCollator(HfDataCollatorMixin):
|
||||
|
|
|
@ -0,0 +1,169 @@
|
|||
from openpromptu.data_utils import InputExample
|
||||
import torch
|
||||
from transformers.data.data_collator import torch_default_data_collator
|
||||
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
|
||||
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
|
||||
import numpy as np
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
)
|
||||
|
||||
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
|
||||
import copy
|
||||
from torch.nn import CrossEntropyLoss
|
||||
|
||||
def preprocess_function(raw_example, **kwargs):
    """Convert one raw dataset record into tokenized model inputs.

    The record is wrapped in an ``InputExample``, run through the prompt
    template, merged into a single sentence by the tokenizer wrapper, and then
    tokenized with padding/truncation to ``data_args.max_source_length``.

    Args:
        raw_example: Mapping whose items are forwarded as keyword arguments
            to ``InputExample``.
        **kwargs: Must provide ``tokenizer``, ``data_args``, ``template``,
            ``verbalizer`` and ``tokenizer_wrapper``.

    Returns:
        The result of calling the tokenizer on the merged input sentence.

    Raises:
        KeyError: If one of the required keyword arguments is missing.
    """
    required = ('tokenizer', 'data_args', 'template', 'verbalizer', 'tokenizer_wrapper')
    # The verbalizer is still looked up (a missing key raises) but it is not
    # applied to the example in this backbone.
    tok, args, prompt_template, _verbalizer, wrapper = (kwargs[key] for key in required)

    wrapped, _extra = prompt_template.wrap_one_example(InputExample(**raw_example))
    merged_sentence = wrapper.merge_wrapped_example(wrapped)
    return tok(
        merged_sentence,
        max_length=args.max_source_length,
        padding="max_length",
        truncation=True,
    )
|
||||
|
||||
|
||||
|
||||
def compute_metrics(eval_preds, dataset_name, eval_metric):
    """Placeholder: accept the arguments and return ``None`` without computing anything."""
    return None
|
||||
|
||||
def mask_token_func(tokenizer, ith_mask=0):
    """Return the tokenizer's pad token to be used as the mask token.

    Args:
        tokenizer: Object exposing a ``pad_token`` attribute.
        ith_mask: Accepted but not used.

    Returns:
        ``tokenizer.pad_token``.
    """
    pad = tokenizer.pad_token
    return pad
|
||||
|
||||
def get_remove_columns(dataset_features):
    """Return ``dataset_features`` unchanged (the very same object).

    No column is filtered out here; the argument is handed back as-is.
    """
    columns_to_remove = dataset_features
    return columns_to_remove
|
||||
|
||||
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
    """Build the prompt template, verbalizer and tokenizer wrapper for a task.

    Args:
        task: Object whose ``templates_text`` is indexed by ``template_id``.
        tokenizer: Tokenizer handed to the verbalizer and the wrapper.
        data_args: Object providing ``max_source_length``.
        template_id: Key of the template text to use.
        verbalizer_id: Accepted but not used.

    Returns:
        A ``(template, verbalizer, tokenizer_wrapper)`` tuple.
    """
    from openpromptu.prompts import GenerationVerbalizer, ManualTemplate
    from openpromptu import TokenizerWrapper

    return (
        ManualTemplate(text=task.templates_text[template_id]),
        GenerationVerbalizer(tokenizer=tokenizer, classes=None, label_words=None),
        TokenizerWrapper(
            max_seq_length=data_args.max_source_length,
            tokenizer=tokenizer,
            truncate_method="balanced",
            mask_token_func=mask_token_func,
        ),
    )
|
||||
|
||||
|
||||
def get_backbone(model_args, **kwargs):
    """Load the config, tokenizer and causal-LM model described by ``model_args``.

    Args:
        model_args: Object providing ``config_name``, ``tokenizer_name``,
            ``model_name_or_path``, ``cache_dir``, ``model_revision``,
            ``use_auth_token`` and ``use_fast_tokenizer``.
        **kwargs: Accepted but not used.

    Returns:
        A ``(config, tokenizer, model)`` tuple.
    """
    auth_token = True if model_args.use_auth_token else None

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )
    # The Trainer in this file passes tokenizer.pad_token_id to
    # CrossEntropyLoss(ignore_index=...) and mask_token_func returns
    # tokenizer.pad_token, so a tokenizer without a pad token cannot be used
    # as-is. Fall back to the EOS token when one is available.
    if getattr(tokenizer, "pad_token", None) is None and getattr(tokenizer, "eos_token", None) is not None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        # A path containing ".ckpt" is loaded as a TensorFlow checkpoint.
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )
    return config, tokenizer, model
|
||||
|
||||
class Trainer(HfSeq2SeqTrainer):
    """Seq2SeqTrainer subclass that scores a causal LM with a shifted next-token loss.

    The labels are the input ids themselves, shifted by one position, and
    positions whose label equals ``tokenizer.pad_token_id`` are ignored.
    """

    def __init__(self, verbalizer=None, eval_task=None, **kwargs):
        """Create the trainer.

        Args:
            verbalizer: Accepted but not used by this class.
            eval_task: Object whose ``metric`` attribute is iterated in
                ``_compute_metrics``.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        self.eval_task = eval_task
        # Always compute metrics with this class's own implementation.
        self.compute_metrics = self._compute_metrics

    def _next_token_loss(self, model, inputs):
        """Run the model and compute the shifted next-token cross-entropy.

        Returns:
            ``(loss, outputs, shift_logits, shift_labels)`` where
            ``shift_logits`` drops the last position, ``shift_labels`` drops
            the first position (cast to long), and ``loss`` ignores labels
            equal to ``tokenizer.pad_token_id``.
        """
        # Copy the ids before the forward pass so the labels are independent
        # of anything the model does with its inputs.
        labels = copy.deepcopy(inputs['input_ids'])
        outputs = model(**inputs)
        logits = outputs.logits
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous().long()
        loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))
        return loss, outputs, shift_logits, shift_labels

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return the next-token loss, optionally together with the model outputs.

        Args:
            model: The model to run.
            inputs: Mapping unpacked into the model call; must contain
                ``input_ids``.
            return_outputs: When true, return ``(loss, outputs)``.
        """
        loss, outputs, _, _ = self._next_token_loss(model, inputs)
        return (loss, outputs) if return_outputs else loss

    def prediction_step(
        self,
        model,
        inputs,
        prediction_loss_only,
        ignore_keys=None,
    ):
        """Perform one evaluation step on ``model`` using ``inputs``.

        When ``args.predict_with_generate`` is false or
        ``prediction_loss_only`` is true, the parent implementation is used.
        Otherwise the next-token loss is computed and, for every non-pad
        target position, the softmax probability assigned to the target
        token is returned.

        Args:
            model: The model to evaluate.
            inputs: Mapping unpacked into the model call; must contain
                ``input_ids``.
            prediction_loss_only: Whether only the loss is wanted.
            ignore_keys: Forwarded to the parent implementation; defaults to
                ``None``.

        Returns:
            Either the parent's result, or a tuple
            ``(loss, target_probabilities, target_labels)`` of CPU tensors,
            the last two being flat over all non-pad target positions.
        """
        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(
                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
            )

        # From here on prediction_loss_only is necessarily false, so the
        # logits/labels are always returned.
        inputs = self._prepare_inputs(inputs)
        with torch.no_grad():
            loss, _, shift_logits, shift_labels = self._next_token_loss(model, inputs)
            loss = loss.detach().cpu()
            # A batch whose targets are all padding yields NaN; report 0 instead.
            loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss)

            # Keep only the non-pad target positions.
            flat_labels = shift_labels.view(-1).detach().cpu()
            nonpad_idx = flat_labels != self.tokenizer.pad_token_id
            flat_labels = flat_labels[nonpad_idx]
            flat_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu()

            # Probability of the target token at each kept position. gather
            # picks one entry per row, avoiding a vocab-sized one-hot mask.
            target_probs = flat_logits.softmax(dim=-1).gather(-1, flat_labels.unsqueeze(-1)).squeeze(-1)

        return (loss, target_probs, flat_labels)

    def _compute_metrics(self, eval_preds):
        """Evaluate every metric of ``eval_task`` and add their average.

        Args:
            eval_preds: A ``(preds, labels)`` pair.

        Returns:
            Dict with the merged outputs of every metric plus
            ``"average_metrics"``, the mean of all collected values.

        Raises:
            ZeroDivisionError: If no metric produced any value.
        """
        preds, labels = eval_preds

        result = {}
        for metric in self.eval_task.metric:
            result.update(metric(preds, labels, ignore_index=self.tokenizer.pad_token_id))

        average_metric = sum(result.values()) / len(result)
        result.update({"average_metrics": average_metric})
        return result
|
|
@ -26,14 +26,13 @@ def preprocess_function(raw_example, **kwargs):
|
|||
example = InputExample(**raw_example)
|
||||
|
||||
|
||||
try:
|
||||
example = verbalizer.wrap_one_example(example)
|
||||
example, other = template.wrap_one_example(example)
|
||||
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
|
||||
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
|
||||
padding="max_length", truncation=True)
|
||||
except:
|
||||
from IPython import embed; embed(header="Therer")
|
||||
|
||||
example = verbalizer.wrap_one_example(example)
|
||||
example, other = template.wrap_one_example(example)
|
||||
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
|
||||
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
|
||||
padding="max_length", truncation=True)
|
||||
|
||||
|
||||
with tokenizer.as_target_tokenizer():
|
||||
label = tokenizer(other['tgt_text']).input_ids
|
||||
|
@ -165,7 +164,7 @@ class Trainer(HfSeq2SeqTrainer):
|
|||
return (loss, generated_tokens, labels)
|
||||
|
||||
def _compute_metrics(self, eval_preds):
|
||||
from IPython import embed; embed(header="In compute metrics")
|
||||
# from IPython import embed; embed(header="In compute metrics")
|
||||
preds, labels = eval_preds
|
||||
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
|
||||
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
|
||||
|
|
|
@ -0,0 +1,171 @@
|
|||
from openpromptu.data_utils import InputExample
|
||||
import torch
|
||||
from transformers.data.data_collator import torch_default_data_collator
|
||||
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
|
||||
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
|
||||
import numpy as np
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
)
|
||||
|
||||
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
|
||||
import copy
|
||||
from torch.nn import CrossEntropyLoss
|
||||
|
||||
def preprocess_function(raw_example, **kwargs):
    """Convert one raw dataset record into tokenized model inputs.

    The record is wrapped in an ``InputExample``, run through the prompt
    template, merged into a single sentence by the tokenizer wrapper, and then
    tokenized with padding/truncation to ``data_args.max_source_length``.

    Args:
        raw_example: Mapping whose items are forwarded as keyword arguments
            to ``InputExample``.
        **kwargs: Must provide ``tokenizer``, ``data_args``, ``template``,
            ``verbalizer`` and ``tokenizer_wrapper``.

    Returns:
        The result of calling the tokenizer on the merged input sentence.

    Raises:
        KeyError: If one of the required keyword arguments is missing.
    """
    required = ('tokenizer', 'data_args', 'template', 'verbalizer', 'tokenizer_wrapper')
    # The verbalizer is still looked up (a missing key raises) but it is not
    # applied to the example in this backbone.
    tok, args, prompt_template, _verbalizer, wrapper = (kwargs[key] for key in required)

    wrapped, _extra = prompt_template.wrap_one_example(InputExample(**raw_example))
    merged_sentence = wrapper.merge_wrapped_example(wrapped)
    return tok(
        merged_sentence,
        max_length=args.max_source_length,
        padding="max_length",
        truncation=True,
    )
|
||||
|
||||
|
||||
|
||||
def compute_metrics(eval_preds, dataset_name, eval_metric):
    """Placeholder: accept the arguments and return ``None`` without computing anything."""
    return None
|
||||
|
||||
def mask_token_func(tokenizer, ith_mask=0):
    """Return the tokenizer's pad token to be used as the mask token.

    Args:
        tokenizer: Object exposing a ``pad_token`` attribute.
        ith_mask: Accepted but not used.

    Returns:
        ``tokenizer.pad_token``.
    """
    pad = tokenizer.pad_token
    return pad
|
||||
|
||||
def get_remove_columns(dataset_features):
    """Return ``dataset_features`` unchanged (the very same object).

    No column is filtered out here; the argument is handed back as-is.
    """
    columns_to_remove = dataset_features
    return columns_to_remove
|
||||
|
||||
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
    """Build the prompt template, verbalizer and tokenizer wrapper for a task.

    Args:
        task: Object whose ``templates_text`` is indexed by ``template_id``.
        tokenizer: Tokenizer handed to the verbalizer and the wrapper.
        data_args: Object providing ``max_source_length``.
        template_id: Key of the template text to use.
        verbalizer_id: Accepted but not used.

    Returns:
        A ``(template, verbalizer, tokenizer_wrapper)`` tuple.
    """
    from openpromptu.prompts import GenerationVerbalizer, ManualTemplate
    from openpromptu import TokenizerWrapper

    return (
        ManualTemplate(text=task.templates_text[template_id]),
        GenerationVerbalizer(tokenizer=tokenizer, classes=None, label_words=None),
        TokenizerWrapper(
            max_seq_length=data_args.max_source_length,
            tokenizer=tokenizer,
            truncate_method="tail",
            mask_token_func=mask_token_func,
        ),
    )
|
||||
|
||||
|
||||
def get_backbone(model_args, **kwargs):
    """Load the config, tokenizer and causal-LM model described by ``model_args``.

    If the tokenizer has no pad token, its EOS token is used as the pad token.

    Args:
        model_args: Object providing ``config_name``, ``tokenizer_name``,
            ``model_name_or_path``, ``cache_dir``, ``model_revision``,
            ``use_auth_token`` and ``use_fast_tokenizer``.
        **kwargs: Accepted but not used.

    Returns:
        A ``(config, tokenizer, model)`` tuple.
    """
    auth_token = True if model_args.use_auth_token else None

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )
    # Single identity check replaces the original double hasattr / "== None"
    # test; the fallback is skipped when there is no EOS token to fall back to.
    if getattr(tokenizer, "pad_token", None) is None and getattr(tokenizer, "eos_token", None) is not None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        # A path containing ".ckpt" is loaded as a TensorFlow checkpoint.
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )
    return config, tokenizer, model
|
||||
|
||||
class Trainer(HfSeq2SeqTrainer):
    """Seq2SeqTrainer subclass that scores a causal LM with a shifted next-token loss.

    The labels are the input ids themselves, shifted by one position, and
    positions whose label equals ``tokenizer.pad_token_id`` are ignored.
    """

    def __init__(self, verbalizer=None, eval_task=None, **kwargs):
        """Create the trainer.

        Args:
            verbalizer: Accepted but not used by this class.
            eval_task: Object whose ``metric`` attribute is iterated in
                ``_compute_metrics``.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        self.eval_task = eval_task
        # Always compute metrics with this class's own implementation.
        self.compute_metrics = self._compute_metrics

    def _next_token_loss(self, model, inputs):
        """Run the model and compute the shifted next-token cross-entropy.

        Returns:
            ``(loss, outputs, shift_logits, shift_labels)`` where
            ``shift_logits`` drops the last position, ``shift_labels`` drops
            the first position (cast to long), and ``loss`` ignores labels
            equal to ``tokenizer.pad_token_id``.
        """
        # Copy the ids before the forward pass so the labels are independent
        # of anything the model does with its inputs.
        labels = copy.deepcopy(inputs['input_ids'])
        outputs = model(**inputs)
        logits = outputs.logits
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous().long()
        loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        loss = loss_fct(shift_logits.view(-1, shift_logits.shape[-1]), shift_labels.view(-1))
        return loss, outputs, shift_logits, shift_labels

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return the next-token loss, optionally together with the model outputs.

        Args:
            model: The model to run.
            inputs: Mapping unpacked into the model call; must contain
                ``input_ids``.
            return_outputs: When true, return ``(loss, outputs)``.
        """
        loss, outputs, _, _ = self._next_token_loss(model, inputs)
        return (loss, outputs) if return_outputs else loss

    def prediction_step(
        self,
        model,
        inputs,
        prediction_loss_only,
        ignore_keys=None,
    ):
        """Perform one evaluation step on ``model`` using ``inputs``.

        When ``args.predict_with_generate`` is false or
        ``prediction_loss_only`` is true, the parent implementation is used.
        Otherwise the next-token loss is computed and, for every non-pad
        target position, the softmax probability assigned to the target
        token is returned.

        Args:
            model: The model to evaluate.
            inputs: Mapping unpacked into the model call; must contain
                ``input_ids``.
            prediction_loss_only: Whether only the loss is wanted.
            ignore_keys: Forwarded to the parent implementation; defaults to
                ``None``.

        Returns:
            Either the parent's result, or a tuple
            ``(loss, target_probabilities, target_labels)`` of CPU tensors,
            the last two being flat over all non-pad target positions.
        """
        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(
                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
            )

        # From here on prediction_loss_only is necessarily false, so the
        # logits/labels are always returned.
        inputs = self._prepare_inputs(inputs)
        with torch.no_grad():
            loss, _, shift_logits, shift_labels = self._next_token_loss(model, inputs)
            loss = loss.detach().cpu()
            # A batch whose targets are all padding yields NaN; report 0 instead.
            loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss)

            # Keep only the non-pad target positions.
            flat_labels = shift_labels.view(-1).detach().cpu()
            nonpad_idx = flat_labels != self.tokenizer.pad_token_id
            flat_labels = flat_labels[nonpad_idx]
            flat_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu()

            # Probability of the target token at each kept position. gather
            # picks one entry per row, avoiding a vocab-sized one-hot mask.
            target_probs = flat_logits.softmax(dim=-1).gather(-1, flat_labels.unsqueeze(-1)).squeeze(-1)

        return (loss, target_probs, flat_labels)

    def _compute_metrics(self, eval_preds):
        """Evaluate every metric of ``eval_task`` and add their average.

        Args:
            eval_preds: A ``(preds, labels)`` pair.

        Returns:
            Dict with the merged outputs of every metric plus
            ``"average_metrics"``, the mean of all collected values.

        Raises:
            ZeroDivisionError: If no metric produced any value.
        """
        preds, labels = eval_preds

        result = {}
        for metric in self.eval_task.metric:
            result.update(metric(preds, labels, ignore_index=self.tokenizer.pad_token_id))

        average_metric = sum(result.values()) / len(result)
        result.update({"average_metrics": average_metric})
        return result
|
|
@ -26,14 +26,13 @@ def preprocess_function(raw_example, **kwargs):
|
|||
example = InputExample(**raw_example)
|
||||
|
||||
|
||||
try:
|
||||
example = verbalizer.wrap_one_example(example)
|
||||
example, other = template.wrap_one_example(example)
|
||||
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
|
||||
model_inputs = tokenizer(input_sentence, max_length=256,
|
||||
padding="max_length", truncation=True)
|
||||
except:
|
||||
from IPython import embed; embed(header="Therer")
|
||||
|
||||
example = verbalizer.wrap_one_example(example)
|
||||
example, other = template.wrap_one_example(example)
|
||||
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
|
||||
model_inputs = tokenizer(input_sentence, max_length=256,
|
||||
padding="max_length", truncation=True)
|
||||
|
||||
|
||||
with tokenizer.as_target_tokenizer():
|
||||
label = tokenizer(other['tgt_text']).input_ids
|
||||
|
|
|
@ -1,59 +0,0 @@
|
|||
# the final results will be populated here.{
|
||||
"evaluate": {
|
||||
"epoch": 20.0,
|
||||
"eval_accuracy": 89.2156862745098,
|
||||
"eval_average_metrics": 90.76168929110105,
|
||||
"eval_f1": 92.3076923076923,
|
||||
"eval_loss": 0.16493959724903107,
|
||||
"eval_runtime": 1.6391,
|
||||
"eval_samples_per_second": 124.455
|
||||
},
|
||||
"repo_name": "DeltaHub/bitfit_t5-base_mrpc",
|
||||
"test": {
|
||||
"epoch": 20.0,
|
||||
"test_accuracy": 88.23529411764706,
|
||||
"test_average_metrics": 89.97971602434077,
|
||||
"test_f1": 91.72413793103448,
|
||||
"test_loss": 0.14968213438987732,
|
||||
"test_runtime": 1.6344,
|
||||
"test_samples_per_second": 124.82
|
||||
}
|
||||
}
|
||||
{
|
||||
"evaluate": {
|
||||
"epoch": 20.0,
|
||||
"eval_average_metrics": 52.10265668831534,
|
||||
"eval_loss": 0.3603779077529907,
|
||||
"eval_matthews_correlation": 52.10265668831534,
|
||||
"eval_runtime": 1.0808,
|
||||
"eval_samples_per_second": 482.046
|
||||
},
|
||||
"repo_name": "DeltaHub/bitfit_t5-base_cola",
|
||||
"test": {
|
||||
"epoch": 20.0,
|
||||
"test_average_metrics": 54.209563471221934,
|
||||
"test_loss": 0.2853100299835205,
|
||||
"test_matthews_correlation": 54.209563471221934,
|
||||
"test_runtime": 1.056,
|
||||
"test_samples_per_second": 494.304
|
||||
}
|
||||
}
|
||||
{
|
||||
"evaluate": {
|
||||
"epoch": 20.0,
|
||||
"eval_average_metrics": 53.80613287067274,
|
||||
"eval_loss": 0.25723716616630554,
|
||||
"eval_matthews_correlation": 53.80613287067274,
|
||||
"eval_runtime": 1.0583,
|
||||
"eval_samples_per_second": 492.299
|
||||
},
|
||||
"repo_name": "DeltaHub/bitfit_t5-base_cola",
|
||||
"test": {
|
||||
"epoch": 20.0,
|
||||
"test_average_metrics": 54.32497579543861,
|
||||
"test_loss": 0.22327613830566406,
|
||||
"test_matthews_correlation": 54.32497579543861,
|
||||
"test_runtime": 1.0556,
|
||||
"test_samples_per_second": 494.507
|
||||
}
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
{
|
||||
"bottleneck_dim": 24,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "adapter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "beans",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32",
|
||||
"num_classes": 3,
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/adapter/clip-vit-base-patch32/beans",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_delta_center": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "beans",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "beans",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
{
|
||||
"backbone_model": "opt",
|
||||
"bottleneck_dim": 24,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "adapter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "wikitext",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps":2,
|
||||
"greater_is_better": false,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 900,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m",
|
||||
"model_path_public": "opt-350m",
|
||||
"num_train_epochs": 3,
|
||||
"output_dir": "outputs/adapter/opt-350m/wikitext",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 6,
|
||||
"per_device_train_batch_size": 6,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "wikitext",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "wikitext",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["self_attn"]
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
{
|
||||
"backbone_model": "vit",
|
||||
"bottleneck_dim": 24,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": false,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "adapter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "beans",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k",
|
||||
"model_path_public": "vit-large-patch16-224-in21k",
|
||||
"num_classes": 3,
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/adapter/vit-large-patch16-224-in21k/beans",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": false,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "beans",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "beans",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["output"]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"backbone_model": "t5-large",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 100,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/t5-large",
|
||||
"model_path_public": "t5-large",
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/bitfit/t5-large/rte",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 100,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/t5-large",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["attn", "ff", "layer_norm"]
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
{
|
||||
"backbone_model": "blenderbot",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "compacter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "sst2",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"factorized_phm": true,
|
||||
"factorized_phm_rule": false,
|
||||
"gradient_clip": false,
|
||||
"greater_is_better": true,
|
||||
"hypercomplex_adapters": true,
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
"learn_phm": true,
|
||||
"learning_rate": 0.003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
|
||||
"model_path_public": "blenderbot-3b",
|
||||
"non_linearity": "gelu_new",
|
||||
"normalize_phm_weight": false,
|
||||
"num_train_epochs": 3,
|
||||
"output_dir": "outputs/compacter/blenderbot-3b/sst2",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"phm_c_init": "normal",
|
||||
"phm_clamp": false,
|
||||
"phm_init_range": 0.0001,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"shared_phm_rule": false,
|
||||
"split_validation_test": true,
|
||||
"task_name": "sst2",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "sst2",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"use_bias_down_sampler": true,
|
||||
"use_bias_up_sampler": true,
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["fc2"]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"backbone_model": "deberta-v2-xlarge",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "compacter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mnli",
|
||||
"eval_steps": 500,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"is_seq2seq": false,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge",
|
||||
"num_train_epochs": 3,
|
||||
"output_dir": "outputs/compacter/deberta-v2-xlarge/mnli",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": false,
|
||||
"push_to_dc": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 500,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mnli",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mnli",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["attention"]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"backbone_model": "long-t5",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "compacter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 100,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large",
|
||||
"model_path_public": "long-t5-tglobal-large",
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/compacter/long-t5-tglobal-large/rte",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 100,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["attn", "ff", "layer_norm"]
|
||||
}
|
|
@ -2,7 +2,7 @@ import collections
|
|||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
# PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
|
|
|
@ -45,11 +45,14 @@ BaseConfigs['t5-base'] = {
|
|||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"push_to_delta_center": True,
|
||||
"push_to_hf": False,
|
||||
"push_to_dc": True,
|
||||
"save_strategy": "steps",
|
||||
"datasets_load_from_disk": True,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/"
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"backbone_model": "t5", # use in delta center,
|
||||
"model_path_public": "t5-base", # use in delta center,
|
||||
|
||||
}
|
||||
|
||||
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"backbone_model": "beit",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk",
|
||||
"delta_type": "lora",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "cifar10",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224",
|
||||
"model_path_public": "beit-large-patch16-224",
|
||||
"num_classes": 10,
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/lora/beit-large-patch16-224/cifar10",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": false,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "cifar10",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "cifar10",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["query","value"]
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"backbone_model": "gpt-j",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "lora",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "wikitext",
|
||||
"eval_steps": 500,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps":4,
|
||||
"greater_is_better": false,
|
||||
"learning_rate": 0.00003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 512,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B",
|
||||
"model_path_public": "gpt-j-6B",
|
||||
"num_train_epochs": 2,
|
||||
"output_dir": "outputs/lora/gpt-j-6B/wikitext",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 2,
|
||||
"per_device_train_batch_size": 2,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 500,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "wikitext",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "wikitext",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["20.attn.q_proj","21.attn.q_proj","22.attn.q_proj","23.attn.q_proj","24.attn.q_proj","25.attn.q_proj","26.attn.q_proj","27.attn.q_proj"]
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"backbone_model": "roberta-large",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "lora",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "superglue-boolq",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"is_seq2seq": false,
|
||||
"learning_rate": 0.0001,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large",
|
||||
"model_path_public": "roberta-large",
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/lora/roberta-large/superglue-boolq",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": false,
|
||||
"push_to_hub": false,
|
||||
"push_to_dc": true,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "superglue-boolq",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "superglue-boolq",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["query","value"]
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"backbone_model": "xlm-roberta-large",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "lora",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "superglue-wic",
|
||||
"eval_steps": 100,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"is_seq2seq": false,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large",
|
||||
"model_path_public": "xlm-roberta-large",
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/lora/xlm-roberta-large/superglue-wic",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": false,
|
||||
"push_to_dc": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 100,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "superglue-wic",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "superglue-wic",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["query","value"]
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"backbone_model": "gpt2",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "low_rank_adapter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "wikitext",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps":1,
|
||||
"greater_is_better": false,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 768,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt2",
|
||||
"model_path_public": "gpt2",
|
||||
"num_train_epochs": 2,
|
||||
"output_dir": "outputs/low_rank_adapter/gpt2/wikitext",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "wikitext",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "wikitext",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt2",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["attn","mlp"]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"backbone_model": "bert-large-cased",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "prefix",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 100,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"is_seq2seq": false,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased",
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/prefix/bert-large-cased/rte",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": false,
|
||||
"push_to_dc": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 100,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["attention"]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"backbone_model": "bart",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "soft_prompt",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "superglue-boolq",
|
||||
"eval_steps": 500,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps":1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.1,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bart-large",
|
||||
"model_path_public": "bart-large",
|
||||
"num_train_epochs": 50,
|
||||
"output_dir": "outputs/soft_prompt/bart-large/superglue-boolq",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 500,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"soft_token_num":100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "superglue-boolq",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "superglue-boolq",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bart-large",
|
||||
"token_init": true,
|
||||
"unfrozen_modules": [
|
||||
"deltas"
|
||||
],
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -93,4 +93,10 @@ class AbstractTask(abc.ABC):
|
|||
# shuffles the data and samples it.
|
||||
if n_obs is not None:
|
||||
dataset = self.subsample(dataset, n_obs)
|
||||
return dataset.map(self.preprocessor)
|
||||
|
||||
this_method = getattr(self.__class__, 'preprocessor')
|
||||
base_method = getattr(AbstractTask, 'preprocessor')
|
||||
if this_method is not base_method:
|
||||
return dataset.map(self.preprocessor)
|
||||
else:
|
||||
return dataset
|
||||
|
|
|
@ -12,22 +12,16 @@ import logging
|
|||
import numpy as np
|
||||
import torch
|
||||
import re
|
||||
from openprompt.prompts import ManualTemplate, ManualVerbalizer
|
||||
from openprompt.plms.utils import TokenizerWrapper
|
||||
from openprompt.data_utils import InputExample
|
||||
from openprompt.prompts import GenerationVerbalizer
|
||||
import itertools
|
||||
|
||||
import os
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
from transformers.models.auto.tokenization_auto import tokenizer_class_from_name
|
||||
|
||||
from typing import List, Dict
|
||||
from collections import defaultdict
|
||||
from openprompt.utils import round_list
|
||||
import warnings
|
||||
|
||||
|
||||
|
@ -68,7 +62,8 @@ class COLA(AbstractTask):
|
|||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.cola")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'cola',
|
||||
|
@ -96,7 +91,8 @@ class SST2(AbstractTask):
|
|||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.sst2")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'sst2',
|
||||
|
@ -123,10 +119,9 @@ class MRPC(AbstractTask):
|
|||
}
|
||||
|
||||
|
||||
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mrpc")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'mrpc', split=split, script_version="master")
|
||||
|
@ -152,7 +147,8 @@ class QQP(AbstractTask):
|
|||
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qqp")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'qqp',
|
||||
|
@ -208,7 +204,8 @@ class MNLI(AbstractTask):
|
|||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mnli")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'mnli', split=split, script_version="master")
|
||||
|
@ -243,7 +240,8 @@ class QNLI(AbstractTask):
|
|||
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qnli")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'qnli', split=split, script_version="master")
|
||||
|
@ -279,7 +277,8 @@ class RTE(AbstractTask):
|
|||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.rte")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'rte',
|
||||
|
@ -306,7 +305,8 @@ class WNLI(AbstractTask):
|
|||
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.wnli")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'wnli', split=split, script_version="master")
|
||||
|
@ -334,7 +334,8 @@ class SuperGLUEBoolQ(AbstractTask):
|
|||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.boolq")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master")
|
||||
|
@ -347,8 +348,8 @@ class SuperGLUECB(AbstractTask):
|
|||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.mean_multiclass_f1(num_classes=3), metrics.accuracy]
|
||||
metric_names = ["f1_multiclass", "accuracy"]
|
||||
metric = [metrics.accuracy]
|
||||
metric_names = ["accuracy"]
|
||||
|
||||
verbalizers = {
|
||||
"0":{"0": "yes",
|
||||
|
@ -361,7 +362,8 @@ class SuperGLUECB(AbstractTask):
|
|||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.cb")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master")
|
||||
|
@ -387,7 +389,8 @@ class SuperGLUECOPA(AbstractTask):
|
|||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.copa")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master")
|
||||
|
@ -416,7 +419,8 @@ class SuperGLUEMultiRC(AbstractTask):
|
|||
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.multirc")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master")
|
||||
|
@ -459,7 +463,8 @@ class SuperGLUEWIC(AbstractTask):
|
|||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master")
|
||||
|
@ -549,13 +554,76 @@ class Beans(AbstractTask):
|
|||
|
||||
def load_dataset(self, split):
|
||||
# from IPython import embed; embed(header="beans")
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split]
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/beans")[split]
|
||||
else:
|
||||
return datasets.load_dataset('beans', split=split, script_version="master")
|
||||
|
||||
class Wikitext(AbstractTask):
    """Task wrapper for the ``wikitext`` dataset (config ``wikitext-2-v1``).

    The task is scored with ``metrics.perplexity``. Both the "validation"
    and the "test" split names map to the dataset's "validation" split.
    """
    name = "wikitext"

    split_to_data_split = dict(train="train",
                               validation="validation",
                               test="validation")
    metric = [metrics.perplexity]
    metric_names = ["perplexity"]

    # A single verbalizer entry with no label words.
    verbalizers = {"0": {}}

    # A single template that emits the example's "text" field.
    templates_text = {"0": """{"meta":"text"}"""}

    split_valid_to_make_test = True

    def load_dataset(self, split):
        """Return the requested split, read from disk or fetched by name.

        When ``data_args.datasets_load_from_disk`` is false the dataset is
        loaded with ``datasets.load_dataset``; otherwise it is read from
        ``<datasets_saved_path>/wikitext`` and indexed by ``split``.
        """
        if not self.data_args.datasets_load_from_disk:
            return datasets.load_dataset('wikitext','wikitext-2-v1', split=split, script_version="master")
        saved_root = f"{self.data_args.datasets_saved_path}/wikitext"
        return datasets.load_from_disk(saved_root)[split]
|
||||
|
||||
class Cifar10(AbstractTask):
    """Task wrapper for the ``cifar10`` dataset, scored by accuracy.

    Both the "validation" and the "test" split names map to the dataset's
    "test" split.
    """
    name = "cifar10"

    split_to_data_split = {"train": "train",
                           "validation": "test",
                           "test": "test"}
    metric = [metrics.accuracy]
    metric_names = ["accuracy"]

    # Maximum number of leading examples kept when the dataset is read from
    # disk. The value 100 reproduces the previous hard-coded behaviour; set
    # it to None on the class (or a subclass) to use the full split.
    # NOTE(review): a 100-example cap looks like a debugging leftover -
    # confirm whether the full split should be the default.
    disk_subset_size = 100

    def load_dataset(self, split):
        """Return the requested split, read from disk or fetched by name.

        When ``data_args.datasets_load_from_disk`` is true the dataset is read
        from ``<datasets_saved_path>/cifar10``, indexed by ``split`` and
        truncated to at most ``disk_subset_size`` examples. Otherwise it is
        loaded with ``datasets.load_dataset`` and returned in full.
        """
        if self.data_args.datasets_load_from_disk:
            d = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/cifar10")[split]
            if self.disk_subset_size is not None:
                # Clamp to the split length so a split smaller than the cap
                # does not make select() fail on out-of-range indices.
                d = d.select(range(min(self.disk_subset_size, len(d))))
            print(d)
            return d
        else:
            return datasets.load_dataset('cifar10', split=split, script_version="master")
|
||||
# def preprocessor(self, example):
|
||||
# example_ = {}
|
||||
# example_["image"] = example["image"]
|
||||
# example_["labels"] = example["label"]
|
||||
|
||||
# return example_
|
||||
class Fashion_MNIST(AbstractTask):
    """Task wrapper for the ``fashion_mnist`` dataset, scored by accuracy.

    Both the "validation" and the "test" split names map to the dataset's
    "test" split.
    """
    name = "Fashion-MNIST"

    split_to_data_split = dict(train="train", validation="test", test="test")
    metric = [metrics.accuracy]
    metric_names = ["accuracy"]

    def load_dataset(self, split):
        """Return the requested split, read from disk or fetched by name.

        The on-disk variant lives under ``<datasets_saved_path>/fashion_mnist``
        and is printed before being returned.
        """
        if not self.data_args.datasets_load_from_disk:
            return datasets.load_dataset('fashion_mnist', split=split, script_version="master")
        saved = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/fashion_mnist")
        d = saved[split]
        print(d)
        return d
|
||||
|
||||
TASK_MAPPING = OrderedDict(
|
||||
[
|
||||
|
@ -575,7 +643,10 @@ TASK_MAPPING = OrderedDict(
|
|||
('superglue-multirc', SuperGLUEMultiRC),
|
||||
('superglue-wic', SuperGLUEWIC),
|
||||
# ('superglue-record', SuperGLUERecord)
|
||||
('beans', Beans)
|
||||
('beans', Beans),
|
||||
('wikitext',Wikitext),
|
||||
('cifar10',Cifar10),
|
||||
('fashion_mnist',Fashion_MNIST)
|
||||
]
|
||||
)
|
||||
|
||||
|
|
|
@ -11,6 +11,14 @@ import sklearn.metrics
|
|||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
def perplexity(outputs, targets,ignore_index=-100):
    """Compute perplexity as ``exp(mean(-log(outputs)))``.

    Args:
        outputs: array-like of positive numbers; presumably the probability
            the model assigned to each target token - TODO confirm against
            the caller.
        targets: unused; kept so the signature stays compatible with callers.
        ignore_index: unused; kept so the signature stays compatible with
            callers.

    Returns:
        A dict with the single key ``"perplexity"`` holding a Python float.
        The value is NaN when ``outputs`` is empty.
    """
    probs = np.asarray(outputs)
    if probs.size == 0:
        # The mean of an empty array is NaN and emits a RuntimeWarning;
        # return the same value explicitly without the warning.
        return {"perplexity": float("nan")}

    ce = -np.log(probs).mean()
    return {"perplexity":float(np.exp(ce))}
|
||||
|
||||
def accuracy(predictions, targets) -> dict:
    """Return the percentage of predictions that equal their targets."""
    predicted = np.array(predictions)
    expected = np.array(targets)
    hit_rate = (predicted == expected).mean()
    return {"accuracy": 100 * hit_rate}
|
||||
|
@ -47,20 +55,20 @@ def spearman_corrcoef(predictions, targets) -> dict:
|
|||
|
||||
|
||||
|
||||
def spearman_corrcoef(predictions, targets) -> dict:
|
||||
"""Computes Spearman correlation coefficient."""
|
||||
# TODO: we need to do postprocessors in a clean way for each dataset.
|
||||
from examples_seq2seq.data_processors.postprocessors import string_to_float
|
||||
targets = [string_to_float(target) for target in targets]
|
||||
predictions= [string_to_float(prediction) for prediction in predictions]
|
||||
spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]
|
||||
# def spearman_corrcoef(predictions, targets) -> dict:
|
||||
# """Computes Spearman correlation coefficient."""
|
||||
# # TODO: we need to do postprocessors in a clean way for each dataset.
|
||||
# from examples_seq2seq.data_processors.postprocessors import string_to_float
|
||||
# targets = [string_to_float(target) for target in targets]
|
||||
# predictions= [string_to_float(prediction) for prediction in predictions]
|
||||
# spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]
|
||||
|
||||
# Note that if all the predictions will be the same, spearman
|
||||
# correlation is nan; to guard against this, we check the output
|
||||
# and return 0 in this case.
|
||||
if math.isnan(spearman_corrcoef):
|
||||
spearman_corrcoef = 0
|
||||
return {"spearmanr": spearman_corrcoef}
|
||||
# # Note that if all the predictions will be the same, spearman
|
||||
# # correlation is nan, to gaurad against this, we check the output
|
||||
# # and return 0 in this case.
|
||||
# if math.isnan(spearman_corrcoef):
|
||||
# spearman_corrcoef = 0
|
||||
# return {"spearmanr": spearman_corrcoef}
|
||||
|
||||
|
||||
def f1_score_with_invalid(predictions, targets) -> dict:
|
||||
|
@ -102,8 +110,8 @@ def f1_score(predictions, targets) -> dict:
|
|||
Returns:
|
||||
F1 score, where any prediction != 0 or 1 is counted as wrong.
|
||||
"""
|
||||
targets = targets.astype(np.int32)
|
||||
predictions = predictions.astype(np.int32)
|
||||
targets = np.array(targets).astype(np.int32)
|
||||
predictions = np.array(predictions).astype(np.int32)
|
||||
return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}
|
||||
|
||||
# TODO: maybe guard against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
|
||||
|
|
|
@ -26,10 +26,12 @@ You can also adapt this script on your own tasks.
|
|||
|
||||
import os
|
||||
import sys
|
||||
|
||||
os.environ['MKL_THREADING_LAYER'] = 'GNU'
|
||||
os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
sys.path.append(os.path.join(os.getcwd(), "../"))
|
||||
# sys.path.append(os.path.join(os.getcwd(), "/mnt/sfs_turbo/zhangzhen/OpenDelta"))
|
||||
sys.path.append(os.path.join(os.getcwd()))
|
||||
|
||||
import functools
|
||||
|
@ -56,7 +58,7 @@ from transformers.trainer_utils import is_main_process, get_last_checkpoint
|
|||
|
||||
from data_processors import AutoTask #, #TaskDataCollatorForSeq2Seq, AutoPostProcessor, data_collator
|
||||
from utils import read_json, save_json
|
||||
from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, RemainArgHfArgumentParser
|
||||
from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, DeltaArguments, RemainArgHfArgumentParser
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
@ -66,16 +68,14 @@ def main():
|
|||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
||||
parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
||||
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
|
||||
# If we pass only one argument to the script and it's the path to a json file,
|
||||
# let's parse it to get our arguments.
|
||||
model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
|
||||
else:
|
||||
model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)
|
||||
parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, DeltaArguments))
|
||||
|
||||
# You can provide a json file with contains the arguments and use the --argument some_arg to override or append to the json file.
|
||||
json_file, cmd_args = (os.path.abspath(sys.argv[1]), sys.argv[2:]) if sys.argv[1].endswith(".json") else (None, sys.argv[1:])
|
||||
model_args, data_args, training_args, delta_args, remain_args = parser.parse_json_file_with_cmd_args(json_file=json_file, command_line_args=cmd_args)
|
||||
logger.warning("The following arguments not used! {}".format(remain_args))
|
||||
|
||||
print(f"{training_args.output_dir}/results.json")
|
||||
logger.info(f"The results will be used in {training_args.output_dir}/results.json")
|
||||
# exit()
|
||||
# Detecting last checkpoint.
|
||||
last_checkpoint = None
|
||||
|
@ -121,7 +121,8 @@ def main():
|
|||
|
||||
|
||||
|
||||
if os.path.basename(model_args.model_name_or_path).startswith("t5"):
|
||||
if os.path.basename(model_args.model_name_or_path).startswith("t5") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("long-t5") :
|
||||
from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
|
||||
from examples_prompt.backbones.t5 import Trainer, DataCollator
|
||||
elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"):
|
||||
|
@ -129,7 +130,9 @@ def main():
|
|||
from examples_prompt.backbones.blenderbot import Trainer, DataCollator
|
||||
elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("bert") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("albert") :
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("albert") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("xlm-roberta") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("deberta") :
|
||||
from examples_prompt.backbones.bert import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
|
||||
from examples_prompt.backbones.bert import Trainer, DataCollator
|
||||
elif os.path.basename(model_args.model_name_or_path).startswith("beit"):
|
||||
|
@ -144,6 +147,10 @@ def main():
|
|||
elif os.path.basename(model_args.model_name_or_path).startswith("clip"):
|
||||
from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
|
||||
from examples_prompt.backbones.clip import Trainer, DataCollator
|
||||
elif os.path.basename(model_args.model_name_or_path).startswith("opt") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("gpt"):
|
||||
from examples_prompt.backbones.opt import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
|
||||
from examples_prompt.backbones.opt import Trainer, DataCollator
|
||||
|
||||
|
||||
|
||||
|
@ -161,7 +168,8 @@ def main():
|
|||
|
||||
if delta_args.delta_type.lower() != "none":
|
||||
from opendelta import AutoDeltaConfig,AutoDeltaModel
|
||||
delta_config = AutoDeltaConfig.from_dict(vars(delta_args))
|
||||
from dataclasses import asdict
|
||||
delta_config = AutoDeltaConfig.from_dict(asdict(delta_args))
|
||||
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
|
||||
delta_model.freeze_module(set_state_dict = True)
|
||||
delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True)
|
||||
|
@ -278,14 +286,9 @@ def main():
|
|||
|
||||
if torch.cuda.is_available() and training_args.compute_memory:
|
||||
peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2)/1000
|
||||
print(
|
||||
"Memory utilization",
|
||||
peak_memory,
|
||||
"GB"
|
||||
)
|
||||
performance_metrics.update({"peak_memory": peak_memory})
|
||||
if training_args.compute_memory or training_args.compute_time:
|
||||
print("Efficiency Statistics {}".format(performance_metrics))
|
||||
logger.info("Efficiency Statistics {}".format(performance_metrics))
|
||||
trainer.save_metrics("performance", performance_metrics)
|
||||
|
||||
# Evaluation
|
||||
|
@ -313,17 +316,30 @@ def main():
|
|||
trainer.save_metrics(f"{data_args.task_name}_test", metrics)
|
||||
all_results['test'][data_args.task_name] = metrics
|
||||
|
||||
# from opendelta.utils.delta_hub import create_hub_repo_name
|
||||
# from opendelta.utils.delta_center import create_delta_center_args, create_repo_name
|
||||
|
||||
# repo_name = create_hub_repo_name(root="DeltaHub",
|
||||
# dataset=data_args.task_name,
|
||||
# delta_type = delta_args.delta_type,
|
||||
# model_name_or_path= model_args.model_name_or_path)
|
||||
# results['repo_name'] = repo_name
|
||||
# if delta_args.delta_type.lower() != "none":
|
||||
# if training_args.push_to_hub: # TODO add description here
|
||||
# delta_model.save_finetuned(push_to_hub=True, save_directory=repo_name, use_auth_token=True)
|
||||
# # trainer.push_to_hub(**kwargs)
|
||||
# else:
|
||||
# delta_model.save_finetuned(push_to_hub=False, save_directory=repo_name, use_auth_token=True)
|
||||
|
||||
# center_args =
|
||||
# repo_name = create_repo_name(prefix="", center_args=center_args)
|
||||
# all_results['repo_name'] = repo_name
|
||||
|
||||
|
||||
delta_model.save_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path,
|
||||
push_to_dc=training_args.push_to_dc,
|
||||
center_args={"test_performance":all_results['test'][data_args.task_name]['test_average_metrics'],
|
||||
},
|
||||
center_args_pool = {**vars(model_args), **vars(data_args), **vars(training_args), **vars(delta_args)},
|
||||
list_tags = ['NLI'],
|
||||
dict_tags = {'purpose':'for testing'},
|
||||
delay_push=True,
|
||||
test_result=all_results['test']
|
||||
)
|
||||
|
||||
|
||||
|
||||
with open(f"{training_args.output_dir}/results.json", 'w') as fout:
|
||||
|
|
|
@ -0,0 +1,344 @@
|
|||
# coding=utf-8
|
||||
# Copyright OpenDelta Team and THUNLP lab. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
A unified running script for most models to do downstream tasks in a
|
||||
prompt learning fashion, i.e., No classification head, all tasks are casted
|
||||
to mask prediction or span prediction tasks.
|
||||
|
||||
Processing relevant to different backbone models are stored in ../backbones/
|
||||
|
||||
Adding A few lines to integrate the Delta tuning methods.
|
||||
|
||||
You can also adapt this script on your own tasks.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
os.environ['MKL_THREADING_LAYER'] = 'GNU'
|
||||
os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
sys.path.append(os.path.join(os.getcwd(), "../"))
|
||||
sys.path.append(os.path.join(os.getcwd()))
|
||||
|
||||
import functools
|
||||
import logging
|
||||
import torch
|
||||
import json
|
||||
import numpy as np
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForMaskedLM,
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoTokenizer,
|
||||
DataCollatorForSeq2Seq,
|
||||
# HfArgumentParser,
|
||||
# MBartTokenizer,
|
||||
# default_data_collator,
|
||||
Trainer,
|
||||
Seq2SeqTrainer,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.trainer_utils import is_main_process, get_last_checkpoint
|
||||
|
||||
from data_processors import AutoTask #, #TaskDataCollatorForSeq2Seq, AutoPostProcessor, data_collator
|
||||
from utils import read_json, save_json
|
||||
from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, RemainArgHfArgumentParser, DeltaArguments
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main():
    """Evaluate a fine-tuned delta checkpoint on the test split of one task.

    This is the evaluation-only counterpart of the training script: it parses
    the arguments (an optional leading ``*.json`` file plus command-line
    overrides), loads the backbone selected by the basename of
    ``model_args.model_name_or_path``, attaches the delta stored at
    ``delta_args.finetuned_delta_path``, preprocesses the test split and runs
    ``trainer.evaluate`` on it.

    Returns:
        ``{"test": {<task_name>: <metrics dict>}}``. The same dict is printed.

    Raises:
        ValueError: if ``do_test`` is not enabled, or if no backbone
            integration matches the model name.
    """
    import importlib

    parser = RemainArgHfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, DeltaArguments)
    )

    # A leading ``*.json`` argument is the base configuration; everything
    # after it overrides or extends the JSON content. Guard the index so that
    # running the script without arguments does not die with an IndexError.
    if len(sys.argv) > 1 and sys.argv[1].endswith(".json"):
        json_file, cmd_args = os.path.abspath(sys.argv[1]), sys.argv[2:]
    else:
        json_file, cmd_args = None, sys.argv[1:]
    model_args, data_args, training_args, delta_args, remain_args = parser.parse_json_file_with_cmd_args(
        json_file=json_file, command_line_args=cmd_args
    )
    logger.warning("The following arguments not used! {}".format(remain_args))

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("{}\n{}\n{}\n{}".format(training_args, model_args, data_args, delta_args))

    # Only the test split is handled here. Without this check the script
    # would load the whole model and then fail on unbound local variables.
    if not training_args.do_test:
        raise ValueError("This script only evaluates the test split; enable `do_test` to run it.")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Model-name prefixes -> module under ``examples_prompt.backbones`` that
    # provides get_backbone / preprocess_function / get_remove_columns /
    # get_prompts / Trainer / DataCollator. Kept in line with the prefixes
    # accepted by the training script.
    backbone_modules = (
        (("t5", "long-t5"), "t5"),
        (("blenderbot",), "blenderbot"),
        (("roberta", "bert", "albert", "xlm-roberta", "deberta"), "bert"),
        (("beit",), "beit"),
        (("bart",), "bart"),
        (("bigbird",), "bigbird"),
        (("clip",), "clip"),
        (("opt", "gpt"), "opt"),
    )
    model_basename = os.path.basename(model_args.model_name_or_path)
    for prefixes, module_name in backbone_modules:
        if model_basename.startswith(prefixes):
            backbone = importlib.import_module(f"examples_prompt.backbones.{module_name}")
            break
    else:
        raise ValueError(
            f"No backbone integration found for model '{model_basename}' "
            f"(from model_name_or_path='{model_args.model_name_or_path}')."
        )

    _config, tokenizer, model = backbone.get_backbone(model_args=model_args)

    # model parallelize
    if hasattr(training_args, "model_parallel") and training_args.model_parallel:
        logger.info('parallelize model!')
        model.parallelize()

    from opendelta import Visualization
    Visualization(model).structure_graph()

    if delta_args.delta_type.lower() != "none":
        # NOTE(review): the adapter classes are used regardless of the value
        # of ``delta_type``; checkpoints of other delta types are presumably
        # not loadable through this path -- confirm before relying on it.
        from opendelta.delta_models.adapter import AdapterConfig, AdapterModel
        delta_config = AdapterConfig.from_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path)
        delta_model = AdapterModel.from_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path,
                                                  delta_config=delta_config,
                                                  backbone_model=model,
                                                  force_download=delta_args.force_download,
                                                  cache_dir=delta_args.delta_cache_dir)
        delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True)

    # Build and preprocess the test split.
    split_name = "test"
    eval_task = AutoTask.get(data_args.task_name,
                             data_args.dataset_config_name,
                             data_args=data_args,
                             seed=data_args.data_sample_seed)
    dataset = eval_task.get(split=split_name,
                            split_validation_test=training_args.split_validation_test,
                            n_obs=data_args.max_train_samples)

    template, verbalizer, tokenizer_wrapper = backbone.get_prompts(eval_task, tokenizer, data_args)

    dataset = dataset.map(
        functools.partial(backbone.preprocess_function,
                          data_args=data_args,
                          tokenizer=tokenizer,
                          template=template,
                          verbalizer=verbalizer,
                          tokenizer_wrapper=tokenizer_wrapper,
                          split=split_name),
        batched=False,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=backbone.get_remove_columns(list(dataset.features.keys())),
        load_from_cache_file=not data_args.overwrite_cache,
    )

    trainer = backbone.Trainer(
        model=model,
        verbalizer=verbalizer,
        eval_task=eval_task,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=backbone.DataCollator(tokenizer),
    )

    # Keep a copy of the JSON configuration next to the results. There is
    # nothing to copy when the run was configured from the command line only.
    if json_file is not None and trainer.is_world_process_zero():
        save_json(os.path.join(training_args.output_dir, "training_config.json"), read_json(json_file))

    # Test
    logger.info("*** Test ***")
    metrics = trainer.evaluate(eval_dataset=dataset, metric_key_prefix="test")
    trainer.log_metrics(f"{data_args.task_name}_test", metrics)
    trainer.save_metrics(f"{data_args.task_name}_test", metrics)

    all_results = {"test": {data_args.task_name: metrics}}
    print(all_results)
    return all_results


if __name__ == "__main__":
    result = main()
|
||||
|
|
@ -1,6 +1,10 @@
|
|||
from dataclasses import dataclass, field
|
||||
from typing import Optional, List
|
||||
from transformers import HfArgumentParser
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
|
@ -81,6 +85,10 @@ class TrainingArguments(HfTrainingArguments):
|
|||
remove_unused_columns: Optional[bool] = field(
|
||||
default=False, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."}
|
||||
)
|
||||
push_to_hf: Optional[bool] = field(default=False, metadata={"help": "Push the model to huggingface model hub."})
|
||||
push_to_dc: Optional[bool] = field(default=True, metadata={"help": "Push the model to delta center."})
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -211,28 +219,254 @@ class DataTrainingArguments:
|
|||
self.test_max_target_length = self.max_target_length
|
||||
|
||||
|
||||
|
||||
import dataclasses
|
||||
|
||||
@dataclass
class DeltaArguments:
    """
    Arguments shared by all delta types: which delta method to use, which
    modules of the backbone it modifies or unfreezes, and where a fine-tuned
    delta checkpoint is loaded from or cached.
    """
    delta_type: str= field(default="", metadata={"help": "the type of delta"})
    backbone_model: Optional[str] = field(
        default="", metadata={"help": "the backbone model"}
    )
    model_path_public: Optional[str] = field(
        default="", metadata={"help": "the path (url) of the publicly available backbone model"}
    )
    modified_modules: Optional[List[str]] = field(
        default_factory=lambda: None, metadata={"help": "the modules inside the backbone to be modified"}
    )
    unfrozen_modules: Optional[List[str]] = field(
        default_factory=lambda:["deltas"], metadata={"help": "the modules inside the backbone or in the delta modules that need to be unfrozen"}
    )
    finetuned_delta_path: Optional[str] = field(
        default=None, metadata={"help": "the path of the finetuned delta model"}
    )
    force_download: Optional[bool] = field(
        default=False, metadata={"help": "whether to download the checkpoint form delta center no matter whether it exists"}
    )
    local_files_only: Optional[bool] = field(
        default=False, metadata={"help": "whether not to look for file in delta center"}
    )
    delta_cache_dir: Optional[str] = field(
        default=None, metadata={"help": "The cache path defined by user. If not set, we will firstly look into the"+
        " working directory and then into the default cache path (ususally ~/.cache/delta_center)."}
    )
    delay_push: Optional[bool] = field(
        default=True, metadata={
            'help':'whether push the checkpoint to delta center later.'
        }
    )

    def merge_arguments(self, objb):
        """Graft the fields of the dataclass instance ``objb`` onto this object.

        A new dataclass deriving from ``DeltaArguments`` is created whose
        extra fields are the fields of ``objb``, each defaulting to the value
        currently held by ``objb``. This instance is then re-classed to it, so
        the merged values are visible as attributes of ``self`` and are
        included by ``dataclasses.fields`` / ``dataclasses.asdict``.

        NOTE: the values become dataclass defaults, so ``dataclasses`` rejects
        ``list``/``dict``/``set`` values with a ``ValueError``.
        """
        merged_fields = [
            (spec.name, spec.type, getattr(objb, spec.name))
            for spec in dataclasses.fields(objb)
        ]
        self.__class__ = dataclasses.make_dataclass(
            'DeltaArgument', fields=merged_fields, bases=(DeltaArguments,)
        )
|
||||
|
||||
|
||||
|
||||
|
||||
@dataclass
class AdapterArguments:
    """Extra arguments used when ``delta_type`` is ``adapter``."""

    bottleneck_dim: Optional[int] = field(
        default=24,
        metadata={"help": "the dimension of the bottleneck layer"},
    )
|
||||
@dataclass
class LoRAArguments:
    """Extra arguments used when ``delta_type`` is ``lora``."""

    lora_r: Optional[int] = field(
        default=8,
        metadata={"help": "the rank of the LoRA metrics."},
    )
|
||||
@dataclass
class PrefixArguments:
    """Extra arguments used when ``delta_type`` is ``prefix`` (none defined yet)."""
|
||||
@dataclass
class BitFitArguments:
    """Extra arguments used when ``delta_type`` is ``bitfit`` (none defined yet)."""
|
||||
@dataclass
class SoftPromptArguments:
    """Extra arguments used when ``delta_type`` is ``soft_prompt``."""

    soft_token_num: Optional[int] = field(
        default=100,
        metadata={"help": "the num of soft tokens."},
    )
|
||||
|
||||
@dataclass
class CompacterArguments:
    """Extra arguments used when ``delta_type`` is ``compacter`` (none defined yet)."""
|
||||
@dataclass
class LowRankAdapterArguments:
    """Extra arguments used when ``delta_type`` is ``low_rank_adapter`` (none defined yet)."""
|
||||
|
||||
# from opendelta.delta_models.adapter import AdapterConfig
|
||||
# from opendelta.delta_models.bitfit import BitFitConfig
|
||||
# from opendelta.delta_models.compacter import CompacterConfig
|
||||
# from opendelta.delta_models.lora import LoraArguments
|
||||
# from opendelta.delta_models.low_rank_adapter import LowRankAdapterConfig
|
||||
# from opendelta.delta_models.prefix import PrefixConfig
|
||||
# from opendelta.delta_models.soft_prompt import SoftPromptConfig
|
||||
# DELTAARGMAP = {
|
||||
# "adapter": AdapterConfig,
|
||||
# "lora":LoraArguments,
|
||||
# "prefix":PrefixConfig,
|
||||
# "bitfit":BitFitConfig,
|
||||
# "soft_prompt":SoftPromptConfig,
|
||||
# "compacter":CompacterConfig,
|
||||
# "low_rank_adapter":LowRankAdapterConfig
|
||||
|
||||
# }
|
||||
|
||||
# Lookup table from a ``delta_type`` name to the dataclass that declares the
# arguments specific to that delta method.
DELTAARGMAP = dict(
    adapter=AdapterArguments,
    lora=LoRAArguments,
    prefix=PrefixArguments,
    bitfit=BitFitArguments,
    soft_prompt=SoftPromptArguments,
    compacter=CompacterArguments,
    low_rank_adapter=LowRankAdapterArguments,
)
|
||||
|
||||
# TODO: add more specific delta arguments
|
||||
|
||||
|
||||
|
||||
class RemainArgHfArgumentParser(HfArgumentParser):
|
||||
def parse_json_file(self, json_file: str, return_remaining_args=True ):
|
||||
'''This is a more powerful version of argument parser.
|
||||
It can receive both command line arguments and json file arguments.
|
||||
The command line arguments will override the json file arguments.
|
||||
The parser will load the specific delta arguments (e.g. Adapter's)
|
||||
according to the delta_type argument. And merge the specific delta arguments
|
||||
with the common delta arguments.
|
||||
'''
|
||||
def parse_json_file_with_cmd_args(self, json_file: str, command_line_args=None, return_remaining_args=True ):
    """
    Parse arguments given as a JSON file combined with command-line strings.

    Every entry of the JSON file is turned into ``--key value ...`` tokens
    (list values become several value tokens) unless ``--key`` is already
    present in ``command_line_args``; the command-line strings are then
    appended and the result is handed to ``parse_args_into_dataclasses``.

    Args:
        json_file: Path of the JSON file, or ``None`` to parse only
            ``command_line_args``.
        command_line_args: Extra argument strings; ``None`` means no extra
            arguments.
        return_remaining_args: Forwarded as ``return_remaining_strings``; when
            true the unparsed strings are appended to the returned list.

    Returns:
        A list with the parsed dataclass instances except the last one, which
        is merged into the ``DeltaArguments`` instance instead, followed by
        the remaining argument strings when ``return_remaining_args`` is true.
    """
    import json
    from pathlib import Path

    # Callers pass ``None`` when no JSON file was given on the command line.
    data = {} if json_file is None else json.loads(Path(json_file).read_text())
    if command_line_args is None:
        command_line_args = []

    data_list = []
    for key, value in data.items():
        flag = "--" + key
        if flag in command_line_args:
            # The command line overrides the JSON entry.
            continue
        data_list.append(flag)
        elements = value if isinstance(value, list) else [value]
        for elem in elements:
            # Whitespace inside a value separates tokens, as it would on a
            # command line.
            data_list.extend(str(elem).split())
    data_list += command_line_args

    parsed = self.parse_args_into_dataclasses(args=data_list, return_remaining_strings=return_remaining_args)
    if return_remaining_args:
        outputs, remain_args = parsed
    else:
        outputs = parsed

    # The last parsed object is folded into the DeltaArguments instance.
    # NOTE(review): presumably it holds the delta-type specific arguments
    # appended by ``parse_args_into_dataclasses`` -- confirm there.
    for d in outputs:
        if isinstance(d, DeltaArguments):
            d.merge_arguments(outputs[-1])

    merged = [*(outputs[:-1])]
    if return_remaining_args:
        merged.append(remain_args)
    return merged
|
||||
|
||||
def parse_args_into_dataclasses(
|
||||
self, args=None, return_remaining_strings=False, look_for_args_file=True, args_filename=None
|
||||
):
|
||||
"""
|
||||
Parse command-line args into instances of the specified dataclass types.
|
||||
|
||||
This relies on argparse's `ArgumentParser.parse_known_args`. See the doc at:
|
||||
docs.python.org/3.7/library/argparse.html#argparse.ArgumentParser.parse_args
|
||||
|
||||
Args:
|
||||
args:
|
||||
List of strings to parse. The default is taken from sys.argv. (same as argparse.ArgumentParser)
|
||||
return_remaining_strings:
|
||||
If true, also return a list of remaining argument strings.
|
||||
look_for_args_file:
|
||||
If true, will look for a ".args" file with the same base name as the entry point script for this
|
||||
process, and will append its potential content to the command line args.
|
||||
args_filename:
|
||||
If not None, will uses this file instead of the ".args" file specified in the previous argument.
|
||||
|
||||
Returns:
|
||||
Tuple consisting of:
|
||||
|
||||
- the dataclass instances in the same order as they were passed to the initializer.abspath
|
||||
- if applicable, an additional namespace for more (non-dataclass backed) arguments added to the parser
|
||||
after initialization.
|
||||
- The potential list of remaining argument strings. (same as argparse.ArgumentParser.parse_known_args)
|
||||
"""
|
||||
if args_filename or (look_for_args_file and len(sys.argv)):
|
||||
if args_filename:
|
||||
args_file = Path(args_filename)
|
||||
else:
|
||||
args_file = Path(sys.argv[0]).with_suffix(".args")
|
||||
|
||||
if args_file.exists():
|
||||
fargs = args_file.read_text().split()
|
||||
args = fargs + args if args is not None else fargs + sys.argv[1:]
|
||||
# in case of duplicate arguments the first one has precedence
|
||||
# so we append rather than prepend.
|
||||
namespace, remaining_args = self.parse_known_args(args=args)
|
||||
|
||||
# conditionally add delta arguments
|
||||
deltatype_args = DELTAARGMAP[namespace.delta_type]
|
||||
self.dataclass_types.append(deltatype_args)
|
||||
self._add_dataclass_arguments(deltatype_args)
|
||||
|
||||
# parse the arguments again, this time with the specific delta type's arguments
|
||||
namespace, remaining_args = self.parse_known_args(args=args)
|
||||
|
||||
|
||||
outputs = []
|
||||
for dtype in self.dataclass_types:
|
||||
keys = {f.name for f in dataclasses.fields(dtype) if f.init}
|
||||
inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys}
|
||||
inputs = {k: v for k, v in vars(namespace).items() if k in keys}
|
||||
for k in keys:
|
||||
delattr(namespace, k)
|
||||
obj = dtype(**inputs)
|
||||
outputs.append(obj)
|
||||
|
||||
remain_args = argparse.ArgumentParser()
|
||||
remain_args.__dict__.update(data)
|
||||
if return_remaining_args:
|
||||
return (*outputs, remain_args)
|
||||
if len(namespace.__dict__) > 0:
|
||||
# additional namespace.
|
||||
outputs.append(namespace)
|
||||
if return_remaining_strings:
|
||||
return (outputs, remaining_args)
|
||||
else:
|
||||
return (*outputs,)
|
||||
if remaining_args:
|
||||
raise ValueError(f"Some specified arguments are not used by the HfArgumentParser: {remaining_args}")
|
||||
|
||||
return outputs
|
||||
|
||||
# namespace, remaining_args = self.parse_known_args(args=data_list)
|
||||
|
||||
# print("Here", command_line_args, data_list,namespace, remaining_args)
|
||||
# data.update(remain_args)
|
||||
|
||||
# outputs = []
|
||||
# for dtype in self.dataclass_types:
|
||||
# keys = {f.name for f in dataclasses.fields(dtype) if f.init}
|
||||
# inputs = {k: namespace.get(k) for k in list(data.keys()) if k in keys}
|
||||
# obj = dtype(**inputs)
|
||||
# outputs.append(obj)
|
||||
|
||||
# # remain_args = argparse.ArgumentParser()
|
||||
# remain_args.__dict__.update(remain_args)
|
||||
# if return_remaining_args:
|
||||
# return (*outputs, remain_args)
|
||||
# else:
|
||||
# return (*outputs,)
|
||||
|
||||
|
||||
|
|
|
@ -1,48 +0,0 @@
|
|||
"""Install Compacter."""
|
||||
import os
|
||||
import setuptools
|
||||
from torch.utils.cpp_extension import BuildExtension, CUDAExtension
|
||||
|
||||
#os.environ['TORCH_CUDA_ARCH_LIST']="3.5;3.7;6.1;7.0;7.5;8.6+PTX"
|
||||
|
||||
def setup_package():
    """Register the ``examples_prompt`` package through ``setuptools.setup``.

    No extension modules are built; the CUDA extension that used to be listed
    under ``ext_modules`` is disabled, while the ``build_ext`` command class is
    still registered.
    """
    long_description = "examples_prompt"
    setuptools.setup(
        name='examples_prompt',
        version='0.0.1',
        description='textual prompt example',
        long_description=long_description,
        long_description_content_type='text/markdown',
        author='Shengding Hu',
        license='MIT License',
        packages=setuptools.find_packages(
            exclude=['docs', 'tests', 'scripts']),
        # NOTE(review): `dependency_links` is deprecated in setuptools and is
        # believed to be ignored by current pip -- confirm before relying on it.
        dependency_links=[
            'https://download.pytorch.org/whl/torch_stable.html',
        ],
        classifiers=[
            'Intended Audience :: Developers',
            'Intended Audience :: Science/Research',
            'License :: OSI Approved :: MIT License',
            'Topic :: Scientific/Engineering :: Artificial Intelligence',
            'Programming Language :: Python :: 3',
            # Trove classifiers only exist per minor version; the former
            # "3.7.10" entry is not a valid classifier.
            'Programming Language :: Python :: 3.7',
        ],
        keywords='text nlp machinelearning',
        cmdclass={"build_ext": BuildExtension},
        install_requires=[
        ],
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
setup_package()
|
|
@ -1,5 +1,5 @@
|
|||
|
||||
__version__ = "0.1.0"
|
||||
__version__ = "0.2.4"
|
||||
|
||||
class GlobalSetting:
|
||||
def __init__(self):
|
||||
|
|
|
@ -2,16 +2,14 @@ from copy import deepcopy
|
|||
from typing import Any, Dict, OrderedDict
|
||||
from opendelta.utils.visualization import Visualization
|
||||
import torch.nn as nn
|
||||
from transformers.file_utils import PushToHubMixin
|
||||
from opendelta.utils.logging import get_logger
|
||||
import importlib
|
||||
from opendelta.delta_configs import BaseDeltaConfig
|
||||
from opendelta.basemodel import DeltaBase
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
DELTA_CONFIG_MAPPING = {
|
||||
"lora": "LoraConfig",
|
||||
"lora": "LoraConfig",
|
||||
"low_rank_adapter": "LowRankAdapterConfig",
|
||||
"bitfit": "BitFitConfig",
|
||||
"adapter":"AdapterConfig",
|
||||
|
@ -91,18 +89,18 @@ class AutoDeltaConfig:
|
|||
"AutoConfig is designed to be instantiated "
|
||||
"using the ``AutoConfig.from_pretrained(pretrained_model_name_or_path)`` method."
|
||||
)
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, config_dict: Dict[str, Any], **kwargs):
|
||||
r""" Instantiate a DeltaConfig according to the dict. Automatically load the config specified by
|
||||
r""" Instantiate a DeltaConfig according to the dict. Automatically load the config specified by
|
||||
:obj:`delta_type`.
|
||||
|
||||
Args:
|
||||
config_dict (:obj:`dict`): The dict of configs of delta model.
|
||||
kwargs: Other keyword argument pass to initialize the config.
|
||||
kwargs: Other keyword argument pass to initialize the config.
|
||||
|
||||
>>> config = AutoDeltaConfig.from_dict({"delta_type":"lora"}) # This will load the default lora config.
|
||||
>>> config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r":5}) # Will load the default lora config, with lora_r = 5
|
||||
>>> config = AutoDeltaConfig.from_dict({"delta_type":"lora", "lora_r":5}) # Will load the default lora config, with lora_r = 5
|
||||
|
||||
"""
|
||||
config_dict = deepcopy(config_dict)
|
||||
|
@ -114,7 +112,7 @@ class AutoDeltaConfig:
|
|||
|
||||
|
||||
@classmethod
|
||||
def from_finetuned(cls, finetuned_model_name_or_path, **kwargs):
|
||||
def from_finetuned(cls, finetuned_delta_path, **kwargs):
|
||||
r"""
|
||||
Instantiate one of the configuration classes of the library from a finetuned delta model configuration.
|
||||
The configuration class to instantiate is selected based on the ``delta_type`` property of the config object that
|
||||
|
@ -122,18 +120,18 @@ class AutoDeltaConfig:
|
|||
|
||||
Parameters:
|
||||
|
||||
finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, *optional*):
|
||||
finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*):
|
||||
Can be either:
|
||||
|
||||
- A string, the *model id* of a finetuned delta model configuration hosted inside a model repo on
|
||||
huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or
|
||||
namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``.
|
||||
- A path to a *directory* containing a configuration file saved using the
|
||||
:py:meth:`DeltaBase.save_finetuned` method,
|
||||
:py:meth:`DeltaBase.save_finetuned` method,
|
||||
e.g., ``./my_model_directory/``.
|
||||
- A path or url to a saved configuration JSON *file*, e.g.,
|
||||
``./my_model_directory/configuration.json``.
|
||||
The last two option are not tested but inherited from huggingface.
|
||||
The last two option are not tested but inherited from huggingface.
|
||||
cache_dir (:obj:`str` or :obj:`os.PathLike`, *optional*):
|
||||
Path to a directory in which a downloaded pretrained model configuration should be cached if the
|
||||
standard cache should not be used.
|
||||
|
@ -163,9 +161,9 @@ class AutoDeltaConfig:
|
|||
The values in kwargs of any keys which are configuration attributes will be used to override the loaded
|
||||
values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
|
||||
by the ``return_unused_kwargs`` keyword parameter.
|
||||
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from transformers import AutoConfig
|
||||
|
@ -173,25 +171,24 @@ class AutoDeltaConfig:
|
|||
|
||||
"""
|
||||
|
||||
kwargs["name_or_path"] = finetuned_model_name_or_path
|
||||
|
||||
config_dict, _ = BaseDeltaConfig.get_config_dict(finetuned_model_name_or_path, **kwargs)
|
||||
config_dict, kwargs = BaseDeltaConfig.get_config_dict(finetuned_delta_path, **kwargs)
|
||||
if "delta_type" in config_dict:
|
||||
config_class = LAZY_CONFIG_MAPPING[config_dict["delta_type"]]
|
||||
return config_class.from_dict(config_dict, **kwargs)
|
||||
else:
|
||||
# Fallback: use pattern matching on the string.
|
||||
for pattern, config_class in LAZY_CONFIG_MAPPING.items():
|
||||
if pattern in str(finetuned_model_name_or_path):
|
||||
if pattern in str(finetuned_delta_path):
|
||||
return config_class.from_dict(config_dict, **kwargs)
|
||||
|
||||
raise ValueError(
|
||||
f"Unrecognized model in {finetuned_model_name_or_path}. "
|
||||
f"Unrecognized model in {finetuned_delta_path}. "
|
||||
f"Should have a `delta_type` key in the loaded config, or contain one of the following strings "
|
||||
f"in its name: {', '.join(LAZY_CONFIG_MAPPING.keys())}"
|
||||
)
|
||||
|
||||
### AutoModels below
|
||||
### AutoModels below
|
||||
|
||||
class _LazyAutoMapping(OrderedDict):
|
||||
"""
|
||||
|
@ -323,20 +320,20 @@ class AutoDeltaModel:
|
|||
f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or "
|
||||
f"`{self.__class__.__name__}.from_config(config)` methods."
|
||||
)
|
||||
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, config, backbone_model, **kwargs): #-> "DeltaBase":
|
||||
r"""Automatically instantiates a delta model based on the :obj:`config`. The delta model correspond to the delta
|
||||
:obj:`config` will be loaded and initialized using the arguments in :obj:`config`.
|
||||
:obj:`config` will be loaded and initialized using the arguments in :obj:`config`.
|
||||
|
||||
.. note::
|
||||
Only using :meth:`from_config` method will not load the finetuned weight file (e.g., pytorch_model.bin).
|
||||
Please use from_finetuned directly.
|
||||
Only using :meth:`from_config` method will not load the finetuned weight file (e.g., pytorch_model.bin).
|
||||
Please use from_finetuned directly.
|
||||
|
||||
Args:
|
||||
config (:obj:`BaseDeltaConfig`):
|
||||
backbone_model (:obj:`nn.Module`):
|
||||
|
||||
|
||||
Examples:
|
||||
|
||||
.. code-block:: python
|
||||
|
@ -355,53 +352,54 @@ class AutoDeltaModel:
|
|||
)
|
||||
|
||||
@classmethod
|
||||
def from_finetuned(cls, finetuned_model_name_or_path, backbone_model, *model_args, **kwargs):
|
||||
r""" Automatically instantiated a delta model and load the finetuned checkpoints based on the
|
||||
:obj:`finetuned_model_name_or_path`, which can either be a string pointing to a local path or a url pointint to
|
||||
the delta hub. It will check the hash after loading the delta model to see whether the correct backbone and
|
||||
delta checkpoint are used.
|
||||
def from_finetuned(cls, finetuned_delta_path, backbone_model, *model_args, **kwargs):
|
||||
r""" Automatically instantiated a delta model and load the finetuned checkpoints based on the
|
||||
:obj:`finetuned_delta_path`, which can either be a string pointing to a local path or a url pointing to
|
||||
the delta hub. It will check the hash after loading the delta model to see whether the correct backbone and
|
||||
delta checkpoint are used.
|
||||
|
||||
Args:
|
||||
finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, *optional*):
|
||||
finetuned_delta_path (:obj:`str` or :obj:`os.PathLike`, *optional*):
|
||||
Can be either:
|
||||
|
||||
- A string, the *model id* of a finetuned delta model configuration hosted inside a model repo on
|
||||
huggingface.co. Valid model ids can be located at the root-level, like ``Davin/lora``, or
|
||||
namespaced under a user or organization name, like ``DeltaHub/lora_t5-base_mrpc``.
|
||||
- A path to a *directory* containing a configuration file saved using the
|
||||
:py:meth:`DeltaBase.save_finetuned` method,
|
||||
:py:meth:`DeltaBase.save_finetuned` method,
|
||||
e.g., ``./my_model_directory/``.
|
||||
- A path or url to a saved configuration JSON *file*, e.g.,
|
||||
``./my_model_directory/configuration.json``.
|
||||
The last two option are not tested but inherited from huggingface.
|
||||
The last two option are not tested but inherited from huggingface.
|
||||
|
||||
backbone_model (:obj:`nn.Module`): The backbone model to be modified.
|
||||
model_args: Other argument for initialize the model.
|
||||
kwargs: Other kwargs that will be passed into DeltaBase.from_finetuned.
|
||||
|
||||
Example:
|
||||
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
delta_model = AutoDeltaModel.from_finetuned("DeltaHub/lora_t5-base-mrpc", backbone_model)
|
||||
|
||||
"""
|
||||
config = kwargs.pop("config", None)
|
||||
delta_config = kwargs.pop("delta_config", None)
|
||||
|
||||
if not isinstance(config, BaseDeltaConfig):
|
||||
config, kwargs = AutoDeltaConfig.from_finetuned(
|
||||
finetuned_model_name_or_path, return_unused_kwargs=True, **kwargs
|
||||
if not isinstance(delta_config, BaseDeltaConfig):
|
||||
delta_config, kwargs = AutoDeltaConfig.from_finetuned(
|
||||
finetuned_delta_path, return_unused_kwargs=True, **kwargs
|
||||
)
|
||||
if type(config) in cls._delta_model_mapping.keys():
|
||||
model_class = cls._delta_model_mapping[type(config)]
|
||||
return model_class.from_finetuned(finetuned_model_name_or_path, backbone_model, *model_args, **kwargs)
|
||||
if type(delta_config) in cls._delta_model_mapping.keys():
|
||||
model_class = cls._delta_model_mapping[type(delta_config)]
|
||||
return model_class.from_finetuned(finetuned_delta_path, backbone_model, *model_args, delta_config=delta_config, **kwargs)
|
||||
raise ValueError(
|
||||
f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
|
||||
f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
|
|
|
@ -671,21 +671,46 @@ class DeltaBase(nn.Module, SaveLoadMixin):
|
|||
if visualization:
|
||||
from opendelta import Visualization
|
||||
Visualization(module).structure_graph()
|
||||
|
||||
self.get_statistics(module)
|
||||
if trainable_ratio:
|
||||
n_trainable = self.num_trainable_parameters(module)
|
||||
n_total = self.num_total_parameters(module)
|
||||
logger.info("Trainable Ratio: {:2f}%".format(n_trainable/n_total*100))
|
||||
logger.info("Trainable Ratio: {:2f}%".format(self.stat['trainable_ratio']*100))
|
||||
if delta_ratio:
|
||||
n_delta = self.num_delta_parameters(module)
|
||||
n_total = self.num_total_parameters(module)
|
||||
logger.info("Delta Parameter Ratio: {:2f}%".format(n_delta/n_total*100))
|
||||
logger.info("Delta Parameter Ratio: {:2f}%".format(self.stat['delta_ratio']*100))
|
||||
if cuda_memory:
|
||||
cudamem = 0
|
||||
maxcudamem = 0
|
||||
for device_id in range(torch.cuda.device_count()):
|
||||
cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3
|
||||
maxcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3
|
||||
logger.info("Static Memory {:.2f} GB, Max Memory {:.2f} GB".format(cudamem, maxcudamem))
|
||||
logger.info("Static Memory {:.2f} GB, Max Memory {:.2f} GB".format(self.stat['cudamem'], self.stat['maxcudamem']))
|
||||
|
||||
|
||||
def get_statistics(self, module=None):
    r"""Get the statistics of the parameters in the delta modules.

    The result is stored in ``self.stat`` and also returned.

    Args:
        module (:obj:`nn.Module`, *optional*): The module to compute the statistics.
            Defaults to ``self.backbone_model``.

    Returns:
        :obj:`dict`: The statistics of the parameters in the delta modules, with keys

        - ``trainable_ratio``: trainable parameters / total parameters,
        - ``delta_ratio``: delta parameters / total parameters,
        - ``cudamem``: currently allocated CUDA memory in GB, summed over all visible devices,
        - ``maxcudamem``: peak allocated CUDA memory in GB, summed over all visible devices.

        Both ratios are ``0.0`` when the module reports no parameters.
    """
    if module is None:
        module = self.backbone_model

    n_trainable = self.num_trainable_parameters(module)
    # Counted once and reused for both ratios (assumes the count is a pure
    # read of the module's parameters).
    n_total = self.num_total_parameters(module)
    n_delta = self.num_delta_parameters(module)

    self.stat = {}
    # Guard against modules without parameters instead of raising ZeroDivisionError.
    self.stat['trainable_ratio'] = n_trainable/n_total if n_total else 0.0
    self.stat['delta_ratio'] = n_delta/n_total if n_total else 0.0

    cudamem = 0
    maxcudamem = 0
    # The loop body never runs when no CUDA device is visible, leaving both at 0.
    for device_id in range(torch.cuda.device_count()):
        cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3
        maxcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3
    self.stat['cudamem'] = cudamem
    self.stat['maxcudamem'] = maxcudamem

    # The docstring has always promised a dict; actually return it.
    return self.stat
|
||||
|
||||
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue