Merge pull request #16 from ShengdingHu/v1.0.0

update seq2seq examples
This commit is contained in:
DingDing 2022-05-11 06:49:44 +08:00 committed by GitHub
commit 211dce1b9d
43 changed files with 1846 additions and 2619 deletions

View File

@ -0,0 +1,178 @@
from openpromptu.data_utils import InputExample
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
from transformers import (
AutoConfig,
AutoModelForSeq2SeqLM,
AutoTokenizer,
)
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import torch
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
return dataset_features
def preprocess_function(raw_example, **kwargs):
# max_target_length += 1
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
split = kwargs['split']
example = InputExample(**raw_example)
    example = verbalizer.wrap_one_example(example)
    example, other = template.wrap_one_example(example)
    input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
    model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
                             padding="max_length", truncation=True)
with tokenizer.as_target_tokenizer():
label = tokenizer(other['tgt_text']).input_ids
model_inputs["labels"] = label
return model_inputs
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
return config, tokenizer, model
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
outputs = model(**inputs)
if return_outputs:
return (outputs.loss, outputs)
else:
return outputs.loss
    def prediction_step(
        self,
        model,  # nn.Module
        inputs,  # Dict[str, Union[torch.Tensor, Any]]
        prediction_loss_only,  # bool
        ignore_keys=None,  # Optional[List[str]]
    ):  # -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]
"""
        Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
has_labels = "labels" in inputs
inputs = self._prepare_inputs(inputs)
gen_kwargs = {
"max_length": 10, # self._max_length if s is not None else self.model.config.max_length,
"num_beams": 1 #self._num_beams if self._num_beams is not None else self.model.config.num_beams,
}
generated_tokens = self.model.generate(
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**gen_kwargs,
)
# in case the batch is shorter than max length, the output should be padded
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
with torch.no_grad():
outputs = model(**inputs)
if has_labels:
if self.label_smoother is not None:
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
else:
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
else:
loss = None
if self.args.prediction_loss_only:
return (loss, None, None)
labels = inputs["labels"]
if labels.shape[-1] < gen_kwargs["max_length"]:
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
# from IPython import embed; embed(header="In seqseqtrainer")
return (loss, generated_tokens, labels)
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# post_processor = .get(data_args.dataset_name[0], tokenizer,
# data_args.ignore_pad_token_for_loss)
# decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info)
result = {}
for metric in self.eval_task.metric:
result.update(metric(decoded_preds, decoded_labels))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result
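
# Illustrative sketch (not part of this commit) of the metric protocol that
# _compute_metrics assumes: each entry of eval_task.metric is a callable taking
# (decoded_preds, decoded_labels) and returning a dict of named scores, which
# are then averaged into "average_metrics". The _exact_match function below is
# a hypothetical stand-in used only for demonstration.
def _exact_match(preds, labels):
    correct = sum(p.strip() == l.strip() for p, l in zip(preds, labels))
    return {"exact_match": correct / max(len(labels), 1)}

if __name__ == "__main__":
    demo_preds = ["positive", "negative"]
    demo_labels = ["positive", "positive"]
    demo_result = {}
    for metric in [_exact_match]:
        demo_result.update(metric(demo_preds, demo_labels))
    demo_result["average_metrics"] = sum(demo_result.values()) / len(demo_result)
    print(demo_result)  # {'exact_match': 0.5, 'average_metrics': 0.5}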

View File

@ -13,11 +13,6 @@ from transformers import ViTFeatureExtractor
from transformers import Trainer as HfTrainer
import torch.nn as nn
def process_example(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
inputs = tokenizer(raw_example['image'], return_tensors='pt')
inputs['labels'] = raw_example['labels']
return inputs
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
# from openpromptu.prompts import ManualVerbalizer

View File

@ -49,7 +49,7 @@ def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
dataset_features.pop("label")
dataset_features.remove("label")
return dataset_features
@ -60,6 +60,7 @@ def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
    # from IPython import embed; embed()
return template, verbalizer, tokenizer_wrapper
class DataCollator(HfDataCollatorMixin):

View File

@ -0,0 +1,143 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
)
from transformers import Trainer as HfTrainer
def preprocess_function(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
# from IPython import embed; embed(header="get_remove_columns")
dataset_features.remove("label")
return dataset_features
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import ManualVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
# from IPython import embed; embed()
return template, verbalizer, tokenizer_wrapper
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
return torch_default_data_collator(features=features)
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForMaskedLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model.resize_token_embeddings(len(tokenizer))
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop('labels')
outputs = model(**inputs)
logits = outputs.get("logits")
input_ids = inputs['input_ids']
verbalizer = self.verbalizer.cuda()
logits_at_mask = logits[torch.where(input_ids == verbalizer.tokenizer.mask_token_id)]
label_logits = verbalizer.process_logits(logits_at_mask)
loss_fct = torch.nn.CrossEntropyLoss()
loss = loss_fct(label_logits, labels)
outputs.logits = label_logits
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
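
# Self-contained sketch (not part of this commit) of the mask-position gather
# used in compute_loss above: logits[torch.where(input_ids == mask_token_id)]
# selects one logits row per mask token in the batch. The ids below are dummy
# values; 103 stands in for tokenizer.mask_token_id.
if __name__ == "__main__":
    demo_input_ids = torch.tensor([[101, 7592, 103, 102],
                                   [101, 103, 2088, 102]])
    demo_logits = torch.randn(2, 4, 30522)  # (batch_size, seq_len, vocab_size)
    demo_logits_at_mask = demo_logits[torch.where(demo_input_ids == 103)]
    print(demo_logits_at_mask.shape)  # torch.Size([2, 30522]): one row per mask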

View File

@ -0,0 +1,182 @@
from openpromptu.data_utils import InputExample
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
from transformers import (
AutoConfig,
BlenderbotForConditionalGeneration,
AutoTokenizer,
)
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import torch
def mask_token_func(tokenizer, ith_mask=0):
return ""
def get_remove_columns(dataset_features):
return dataset_features
def preprocess_function(raw_example, **kwargs):
# max_target_length += 1
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
split = kwargs['split']
example = InputExample(**raw_example)
    example = verbalizer.wrap_one_example(example)
    example, other = template.wrap_one_example(example)
    input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
    model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
                             padding="max_length", truncation=True)
with tokenizer.as_target_tokenizer():
label = tokenizer(other['tgt_text']).input_ids
model_inputs["labels"] = label
# from IPython import embed; embed()
return model_inputs
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = BlenderbotForConditionalGeneration.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# from IPython import embed; embed()
return config, tokenizer, model
def get_prompts(task, tokenizer, data_args, template_id="blenderbot", verbalizer_id="blenderbot"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
# from IPython import embed; embed()
outputs = model(**inputs)
if return_outputs:
return (outputs.loss, outputs)
else:
return outputs.loss
    def prediction_step(
        self,
        model,  # nn.Module
        inputs,  # Dict[str, Union[torch.Tensor, Any]]
        prediction_loss_only,  # bool
        ignore_keys=None,  # Optional[List[str]]
    ):  # -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]
"""
        Perform an evaluation step on :obj:`model` using :obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
has_labels = "labels" in inputs
inputs = self._prepare_inputs(inputs)
gen_kwargs = {
"max_length": 10, # self._max_length if s is not None else self.model.config.max_length,
"num_beams": 1, #self._num_beams if self._num_beams is not None else self.model.config.num_beams,
"min_length": 1 # for blenderbot, generally we set it to be a large number. But in classification, we set it to 1
}
generated_tokens = self.model.generate(
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**gen_kwargs,
)
# in case the batch is shorter than max length, the output should be padded
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
with torch.no_grad():
outputs = model(**inputs)
if has_labels:
if self.label_smoother is not None:
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
else:
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
else:
loss = None
if self.args.prediction_loss_only:
return (loss, None, None)
labels = inputs["labels"]
if labels.shape[-1] < gen_kwargs["max_length"]:
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
# from IPython import embed; embed(header="In seqseqtrainer")
return (loss, generated_tokens, labels)
def _compute_metrics(self, eval_preds):
        # from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# post_processor = .get(data_args.dataset_name[0], tokenizer,
# data_args.ignore_pad_token_for_loss)
# decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info)
result = {}
for metric in self.eval_task.metric:
result.update(metric(decoded_preds, decoded_labels))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result
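
# Hedged sketch (not part of this commit) of what padding generated ids up to
# gen_kwargs["max_length"] amounts to, mirroring the role of the inherited
# _pad_tensors_to_max_len helper: right-pad with a pad id so every batch has the
# same width before metrics are computed. pad_token_id=0 is an assumption here.
def _pad_to_max_len_demo(tensor, max_length, pad_token_id=0):
    if tensor.shape[-1] >= max_length:
        return tensor
    padded = torch.full((tensor.shape[0], max_length), pad_token_id, dtype=tensor.dtype)
    padded[:, : tensor.shape[-1]] = tensor
    return padded

if __name__ == "__main__":
    print(_pad_to_max_len_demo(torch.tensor([[5, 6, 7]]), 10))
    # tensor([[5, 6, 7, 0, 0, 0, 0, 0, 0, 0]])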

View File

@ -0,0 +1,172 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
CLIPConfig,
CLIPProcessor,
CLIPModel,
)
from transformers import ViTFeatureExtractor
from PIL import Image
from transformers import Trainer as HfTrainer
import torch.nn as nn
def get_prompts(task, tokenizer, data_args, template_id="clip", verbalizer_id="clip"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer.tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def preprocess_function(raw_example, **kwargs):
# from IPython import embed; embed(header="Therefa")
tokenizer = kwargs['tokenizer']
# ["a photo of {}" for i in range()]
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
    example = InputExample(**raw_example)
texts = []
for candidate_label in range(verbalizer.num_classes):
tgt_text = verbalizer.wrap_one_example(label=candidate_label)
wrapped_example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(wrapped_example, tgt_texts=[tgt_text])
texts.append(input_sentence)
    # from IPython import embed; embed()
image = Image.open(raw_example['image_file_path'])
model_inputs = tokenizer(images=image, text=texts, max_length=16, padding="max_length", truncation=True, return_tensors='pt')
# from IPython import embed; embed()
model_inputs["pixel_values"] = model_inputs["pixel_values"].squeeze()
model_inputs["label"] = example.label
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def get_remove_columns(dataset_features):
# from IPython import embed; embed(header="in remoev")
dataset_features.remove("labels")
print("remove_columns: {}".format(dataset_features))
return dataset_features
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
# from IPython import embed; embed(header="in data collator")
a = torch_default_data_collator(features=features)
# from IPython import embed; embed(header="in data collator")
a["input_ids"] = a["input_ids"][0]
a["attention_mask"] = a["attention_mask"][0]
return a
def get_backbone(model_args, **kwargs):
config = CLIPConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# config.dropout_rate = 0.0
tokenizer = CLIPProcessor.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = CLIPModel.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
# config.num_labels = model_args.num_classes
# old_classifier = model.classifier
# model.classifier = nn.Linear(old_classifier.in_features, config.num_labels)
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
self.loss_fn = nn.CrossEntropyLoss()
def compute_loss(self, model, inputs, return_outputs=False):
# from IPython import embed; embed()
labels = inputs.pop('labels')
outputs = model(**inputs)
# logits = outputs.get("logits")
logits_per_image = outputs.logits_per_image
loss = self.loss_fn(logits_per_image, labels)
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
        # from IPython import embed; embed(header="In compute metrics")
return result
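
# Minimal illustration (not part of this commit) of the objective computed in
# compute_loss above: logits_per_image has shape (batch_size, num_candidate_texts)
# and is scored against the integer index of the correct prompt with
# cross-entropy. All tensors below are random/dummy values for demonstration.
if __name__ == "__main__":
    demo_logits_per_image = torch.randn(4, 3)  # 4 images, 3 candidate prompt texts
    demo_labels = torch.tensor([0, 2, 1, 0])   # index of the correct prompt per image
    demo_loss = nn.CrossEntropyLoss()(demo_logits_per_image, demo_labels)
    print(demo_loss.item())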

View File

@ -1,25 +1,33 @@
import collections
import collections
import copy
BS = 1
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
parser.add_argument("--")
args = parser.parse_args()
BaseConfigs = {}
BaseConfigs['t5-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
@ -27,53 +35,9 @@ BaseConfigs['t5-base'] = {
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "t5-base",
"tokenizer_name": "t5-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
BaseConfigs['t5-large'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "/home/hushengding/plm_cache/t5-large",
"tokenizer_name": "/home/hushengding/plm_cache/t5-large",
"model_name_or_path": f"{PATHBASE}t5-base",
"tokenizer_name": f"{PATHBASE}t5-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
@ -89,63 +53,18 @@ BaseConfigs['t5-large'] = {
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
BaseConfigs['t5-3b'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
"tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-base/",
})
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
"delta_type": "adapter",
@ -185,16 +104,16 @@ AllConfigs['compacter_t5-base'].update({
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
@ -222,16 +141,16 @@ AllConfigs['compacter++_t5-base'].update({
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
@ -252,7 +171,7 @@ AllConfigs['low_rank_adapter_t5-base'].update({
],
"output_dir": "outputs/low_rank_adapter/t5-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
@ -279,102 +198,71 @@ AllConfigs['prefix_t5-base'].update({
"output_dir": "outputs/prefix/t5-base/",
})
AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['none_t5-base'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-base/",
})
AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-large/",
})
AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['none_t5-large'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-large/",
})
AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-3b/",
})
AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['none_t5-3b'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-3b/",
})
AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['adapter_t5-3b'].update({
"delta_type": "adapter",
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-3b/",
"output_dir": "outputs/soft_prompt/t5-base/",
})
#### t5-small
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['adapter_t5-large'].update({
"delta_type": "adapter",
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-large/",
"output_dir": "outputs/prefix/t5-small/",
})
AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['lora_t5-large'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-large/",
})
AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['lora_t5-3b'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-3b/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
@ -399,13 +287,12 @@ if __name__ == "__main__":
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"./{args.job}_{BS}/"):
os.mkdir(f"./{args.job}_{BS}/")
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,248 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
# PATHBASE=""
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['bart-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bart-base",
"tokenizer_name": f"{PATHBASE}bart-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['bitfit_bart-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/bart-base/",
})
AllConfigs['adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['adapter_bart-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/bart-base/",
})
AllConfigs['lora_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['lora_bart-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"modified_modules": [
"q_proj",
"v_proj",
],
"lora_r": 8,
"output_dir": "outputs/lora/bart-base/",
})
AllConfigs['compacter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['compacter_bart-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/bart-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['compacter++_bart-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/bart-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['low_rank_adapter_bart-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/bart-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['soft_prompt_bart-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bart-base/",
})
AllConfigs['prefix_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['prefix_bart-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bart-base/",
})
AllConfigs['soft_prompt_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['soft_prompt_bart-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bart-base/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
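
# Self-contained illustration (not part of this commit) of the tuple-key / zip
# convention the BaseConfigs dicts rely on: each position of the zipped lists
# becomes one per-task JSON config, and scalar keys are copied into every job,
# with "output_dir" suffixed by the job name. The two-task demo below is made up.
def _expand_demo():
    demo = {
        ("job_name", "num_train_epochs", "per_device_train_batch_size"): zip(
            ["mrpc", "rte"], [20, 20], [32, 32]),
        "do_train": True,
        "output_dir": "outputs/demo/",
    }
    jobs = {}
    for key, value in demo.items():
        if isinstance(key, tuple):
            for row in value:
                row = dict(zip(key, row))
                jobs[row.pop("job_name")] = row
    for key, value in demo.items():
        if not isinstance(key, tuple):
            for job_name in jobs:
                jobs[job_name][key] = value + job_name if key == "output_dir" else value
    return jobs
# _expand_demo()["mrpc"] == {"num_train_epochs": 20, "per_device_train_batch_size": 32,
#                            "do_train": True, "output_dir": "outputs/demo/mrpc"}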

View File

@ -205,209 +205,9 @@ AllConfigs['soft_prompt_beit-base-patch16-224'].update({
],
"output_dir": "outputs/soft_prompt/beit-base-patch16-224/",
})
#### beit-base-patch16-224
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-small/",
})
#### ROBERTA######
BaseConfigs['roberta-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}roberta-base",
"tokenizer_name": f"{PATHBASE}roberta-base",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['bitfit_roberta-base'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/roberta-base/",
})
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['none_roberta-base'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/roberta-base/",
})
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['lora_roberta-base'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"output_dir": "outputs/lora/roberta-base/",
})
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['adapter_roberta-base'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/roberta-base/",
})
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['low_rank_adapter_roberta-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/roberta-base/",
})
#### ROBERTA######
BaseConfigs['bert-base-cased'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bert-base-cased",
"tokenizer_name": f"{PATHBASE}bert-base-cased",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bert-base-cased/",
})
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bert-base-cased/",
})
if __name__ == "__main__":
import argparse
import json

View File

@ -0,0 +1,147 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### ROBERTA ######
BaseConfigs['bigbird-roberta-large'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bigbird-roberta-large",
"tokenizer_name": f"{PATHBASE}bigbird-roberta-large",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['bitfit_bigbird-roberta-large'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/bigbird-roberta-large/",
})
AllConfigs['none_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['none_bigbird-roberta-large'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/bigbird-roberta-large/",
})
AllConfigs['lora_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['lora_bigbird-roberta-large'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"modified_modules": [
"query",
"key",
],
"output_dir": "outputs/lora/bigbird-roberta-large/",
})
AllConfigs['adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['adapter_bigbird-roberta-large'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/bigbird-roberta-large/",
})
AllConfigs['low_rank_adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['low_rank_adapter_bigbird-roberta-large'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/bigbird-roberta-large/",
})
AllConfigs['soft_prompt_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['soft_prompt_bigbird-roberta-large'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bigbird-roberta-large/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
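# Editorial sketch (not part of the repository): a minimal, self-contained
# illustration of how the tuple-keyed zip configs above are expanded into
# per-task JSON dicts by the __main__ block. All names and values are made up.
import json as _json_demo

demo_config = {
    ("job_name", "task_name", "eval_steps"): zip(
        ["rte", "mrpc"],   # job_name
        ["rte", "mrpc"],   # task_name
        [100, 200],        # eval_steps
    ),
    "learning_rate": 3e-4,
    "output_dir": "outputs/demo/",
}

demo_jobs = {}
for demo_key in list(demo_config):
    if isinstance(demo_key, tuple):                  # zipped, per-task columns
        for demo_row in demo_config[demo_key]:
            demo_job = demo_row[0]
            demo_jobs[demo_job] = {k: v for k, v in zip(demo_key, demo_row) if k != "job_name"}
for demo_key, demo_value in demo_config.items():
    if not isinstance(demo_key, tuple):              # scalar keys are broadcast to every job
        for demo_job in demo_jobs:
            demo_jobs[demo_job][demo_key] = demo_value + demo_job if demo_key == "output_dir" else demo_value

print(_json_demo.dumps(demo_jobs, indent=4, sort_keys=True))
# e.g. demo_jobs["rte"] == {"eval_steps": 100, "learning_rate": 0.0003,
#                           "output_dir": "outputs/demo/rte", "task_name": "rte"}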

View File

@ -0,0 +1,254 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['blenderbot-400M-distill'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}blenderbot-400M-distill",
"tokenizer_name": f"{PATHBASE}blenderbot-400M-distill",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['bitfit_blenderbot-400M-distill'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/blenderbot-400M-distill/",
})
AllConfigs['adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['adapter_blenderbot-400M-distill'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/blenderbot-400M-distill/",
})
AllConfigs['lora_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['lora_blenderbot-400M-distill'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"modified_modules":[
"q_proj",
"v_proj",
],
"lora_r": 8,
"output_dir": "outputs/lora/blenderbot-400M-distill/",
})
AllConfigs['compacter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['compacter_blenderbot-400M-distill'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/blenderbot-400M-distill/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['compacter++_blenderbot-400M-distill'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/blenderbot-400M-distill/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['low_rank_adapter_blenderbot-400M-distill'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/blenderbot-400M-distill/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['none_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['none_blenderbot-400M-distill'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/blenderbot-400M-distill/",
})
AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['soft_prompt_blenderbot-400M-distill'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/blenderbot-400M-distill/",
})
AllConfigs['prefix_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['prefix_blenderbot-400M-distill'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/blenderbot-400M-distill/",
})
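# NOTE: the block below re-assigns AllConfigs['soft_prompt_blenderbot-400M-distill'],
# silently overriding the soft_prompt config defined earlier in this file
# (learning_rate 3e-2, soft_token_num=100, token_init=False).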
AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['soft_prompt_blenderbot-400M-distill'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/blenderbot-400M-distill/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
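# Editorial sketch (not repository code): why the variants above are built with
# copy.deepcopy(BaseConfigs[...]) rather than a shallow copy. With a shallow
# copy, mutating a nested list leaks back into the shared base config; deepcopy
# keeps every variant independent. Keys below are made up.
import copy as _copy_demo

demo_base = {"unfrozen_modules": ["deltas"], "learning_rate": 1e-5}

shallow = dict(demo_base)
shallow["unfrozen_modules"].append("layer_norm")       # also mutates demo_base!
assert demo_base["unfrozen_modules"] == ["deltas", "layer_norm"]

demo_base = {"unfrozen_modules": ["deltas"], "learning_rate": 1e-5}
deep = _copy_demo.deepcopy(demo_base)
deep["unfrozen_modules"].append("layer_norm")          # demo_base stays untouched
assert demo_base["unfrozen_modules"] == ["deltas"]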

View File

@ -0,0 +1,303 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['clip-vit-base-patch32'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip(
["beans"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20],
[256],
[ 32],
[ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0], # *7 +[0] *8,
[200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[ 3],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}clip-vit-base-patch32",
"tokenizer_name": f"{PATHBASE}clip-vit-base-patch32",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['bitfit_clip-vit-base-patch32'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/clip-vit-base-patch32/",
})
AllConfigs['none_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['none_clip-vit-base-patch32'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/clip-vit-base-patch32/",
})
AllConfigs['adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['adapter_clip-vit-base-patch32'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/clip-vit-base-patch32/",
})
AllConfigs['lora_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['lora_clip-vit-base-patch32'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/clip-vit-base-patch32/",
})
AllConfigs['compacter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['compacter_clip-vit-base-patch32'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/clip-vit-base-patch32/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['compacter++_clip-vit-base-patch32'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/clip-vit-base-patch32/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['low_rank_adapter_clip-vit-base-patch32'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/clip-vit-base-patch32/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['soft_prompt_clip-vit-base-patch32'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/clip-vit-base-patch32/",
})
AllConfigs['prefix_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['prefix_clip-vit-base-patch32'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/clip-vit-base-patch32/",
})
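# NOTE: the block below re-assigns AllConfigs['soft_prompt_clip-vit-base-patch32'],
# silently overriding the soft_prompt config defined earlier in this file
# (learning_rate 3e-2, soft_token_num=100, token_init=False).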
AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['soft_prompt_clip-vit-base-patch32'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/clip-vit-base-patch32/",
})
#### T5-SMALL ######
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-small/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"configs/{args.job}/"):
os.mkdir(f"configs/{args.job}/")
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -76,8 +76,6 @@ AllConfigs['lora_t5-base'].update({
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-base/",
@ -254,154 +252,6 @@ AllConfigs['prefix_t5-small'].update({
})
#### ROBERTA######
BaseConfigs['roberta-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}roberta-base",
"tokenizer_name": f"{PATHBASE}roberta-base",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['bitfit_roberta-base'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/roberta-base/",
})
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['none_roberta-base'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/roberta-base/",
})
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['lora_roberta-base'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"output_dir": "outputs/lora/roberta-base/",
})
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['adapter_roberta-base'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/roberta-base/",
})
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['low_rank_adapter_roberta-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/roberta-base/",
})
#### BERT ######
BaseConfigs['bert-base-cased'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bert-base-cased",
"tokenizer_name": f"{PATHBASE}bert-base-cased",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bert-base-cased/",
})
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bert-base-cased/",
})
if __name__ == "__main__":
import argparse
import json

View File

@ -85,11 +85,14 @@ class SST2(AbstractTask):
"test": "validation"}
verbalizers = {
"0":{"0":"negative","1":"positive"}
"0":{"0":"negative","1":"positive"},
"blenderbot":{"0":"negative","1":"positive"}
}
templates_text = {
"0":"""The sentiment of sentence: "{"meta":"sentence", "shortenable":True} is {"mask"}."""
"0":"""The sentiment of sentence: "{"meta":"sentence", "shortenable":True}" is {"mask"}.""",
"blenderbot": """{"meta":"sentence", "shortenable":True} what is the sentiment?"""
}
def load_dataset(self, split):
@ -533,14 +536,15 @@ class Beans(AbstractTask):
metric_names = ["accuracy"]
verbalizers = {
"0": {
"0": "No",
"1": "Yes",
"clip": {
"angular_leaf_spot": "angular_leaf_spot",
"bean_rust": "bean_rust",
"healthy": "healthy",
}
}
templates_text = {
"0": """{"meta":"sentence1"}"""
"clip":"""a photo of {"mask"} leaf."""
}
def load_dataset(self, split):
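# Editorial sketch (plain Python, not the openpromptu API): how a verbalizer and
# a template of the kind added above work together. The verbalizer maps dataset
# labels to label words; the template turns an example into a prompt with a mask
# slot where the label word is expected. Names below are illustrative.
sst2_verbalizer = {"0": "negative", "1": "positive"}
sst2_template = 'The sentiment of sentence: "{sentence}" is {mask}.'

def wrap_example(example, mask_token="<mask>"):
    # Fill the template with the raw sentence, leaving the mask for the model.
    return sst2_template.format(sentence=example["sentence"], mask=mask_token)

demo_example = {"sentence": "A charming and moving film.", "label": "1"}
print(wrap_example(demo_example))                    # prompt fed to the model
print(sst2_verbalizer[demo_example["label"]])        # gold label word: "positive"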

View File

@ -124,6 +124,9 @@ def main():
if os.path.basename(model_args.model_name_or_path).startswith("t5"):
from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
from examples_prompt.backbones.t5 import Trainer, DataCollator
elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"):
from examples_prompt.backbones.blenderbot import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
from examples_prompt.backbones.blenderbot import Trainer, DataCollator
elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \
or os.path.basename(model_args.model_name_or_path).startswith("bert") \
or os.path.basename(model_args.model_name_or_path).startswith("albert") :
@ -132,6 +135,15 @@ def main():
elif os.path.basename(model_args.model_name_or_path).startswith("beit"):
from examples_prompt.backbones.beit import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
from examples_prompt.backbones.beit import Trainer, DataCollator
elif os.path.basename(model_args.model_name_or_path).startswith("bart"):
from examples_prompt.backbones.bart import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
from examples_prompt.backbones.bart import Trainer, DataCollator
elif os.path.basename(model_args.model_name_or_path).startswith("bigbird"):
from examples_prompt.backbones.bigbird import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
from examples_prompt.backbones.bigbird import Trainer, DataCollator
elif os.path.basename(model_args.model_name_or_path).startswith("clip"):
from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
from examples_prompt.backbones.clip import Trainer, DataCollator
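# Editorial sketch (not repository code): the prefix-based if/elif dispatch in
# main() above could equivalently be written as a lookup table. The module paths
# are the ones imported above; everything else is illustrative.
import os as _os_demo

BACKBONE_MODULE_BY_PREFIX = {
    "t5": "examples_prompt.backbones.t5",
    "blenderbot": "examples_prompt.backbones.blenderbot",
    "bart": "examples_prompt.backbones.bart",
    "bigbird": "examples_prompt.backbones.bigbird",
    "clip": "examples_prompt.backbones.clip",
}

def resolve_backbone_module(model_name_or_path):
    # Mirrors the startswith() checks above; the first matching prefix wins.
    name = _os_demo.path.basename(model_name_or_path)
    for prefix, module_path in BACKBONE_MODULE_BY_PREFIX.items():
        if name.startswith(prefix):
            return module_path  # the caller would importlib.import_module(...) this
    raise ValueError(f"no backbone registered for {name!r}")

print(resolve_backbone_module("/home/user/plm_cache/blenderbot-400M-distill"))
# -> examples_prompt.backbones.blenderbot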
@ -139,6 +151,9 @@ def main():
config, tokenizer, model = get_backbone(model_args=model_args)
from opendelta import Visualization
Visualization(model).structure_graph()
if delta_args.delta_type.lower() != "none":
from opendelta import AutoDeltaConfig,AutoDeltaModel
delta_config = AutoDeltaConfig.from_dict(vars(delta_args))
@ -174,7 +189,7 @@ def main():
task = AutoTask.get(data_args.task_name,
data_args.dataset_config_name,
data_args=data_args,
seed=data_args.data_seed)
seed=data_args.data_sample_seed)
dataset = task.get(split=split_name,
split_validation_test=training_args.split_validation_test,
@ -182,7 +197,7 @@ def main():
template, _verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, training_args)
template, _verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, data_args)
dataset = dataset.map(

View File

@ -197,7 +197,7 @@ class DataTrainingArguments:
datasets_saved_path: Optional[str] = field(
default=None, metadata={"help": "the path of the saved datasets"}
)
data_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."})
data_sample_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."})
model_parallel: Optional[bool] = field(default=False, metadata={"help": "whether apply model parallelization"})
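# Editorial sketch: the renamed field is a standard dataclasses field and is read
# back as data_args.data_sample_seed (see the seed= call in main() above). The
# class below is an illustrative subset, not the repo's full DataTrainingArguments.
from dataclasses import dataclass as _dataclass_demo, field as _field_demo
from typing import Optional as _Optional_demo

@_dataclass_demo
class DemoDataArguments:
    data_sample_seed: _Optional_demo[int] = _field_demo(default=42, metadata={"help": "seed used to shuffle the data."})

print(DemoDataArguments().data_sample_seed)   # 42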

View File

@ -1,21 +0,0 @@
# the final results will be populated here.
{
"evaluate": {
"epoch": 20.0,
"eval_accuracy": 89.2156862745098,
"eval_average_metrics": 90.76168929110105,
"eval_f1": 92.3076923076923,
"eval_loss": 0.16493959724903107,
"eval_runtime": 1.6391,
"eval_samples_per_second": 124.455
},
"repo_name": "DeltaHub/bitfit_t5-base_mrpc",
"test": {
"epoch": 20.0,
"test_accuracy": 88.23529411764706,
"test_average_metrics": 89.97971602434077,
"test_f1": 91.72413793103448,
"test_loss": 0.14968213438987732,
"test_runtime": 1.6344,
"test_samples_per_second": 124.82
}
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "cola",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 128,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 20,
"output_dir": "outputs/bitfit/t5-base/cola",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "cola",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "cola",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mnli",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 128,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 3,
"output_dir": "outputs/bitfit/t5-base/mnli",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "mnli",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mnli",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 128,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 20,
"output_dir": "outputs/bitfit/t5-base/mrpc",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "qnli",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 128,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 3,
"output_dir": "outputs/bitfit/t5-base/qnli",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "qnli",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "qnli",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "qqp",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 128,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 3,
"output_dir": "outputs/bitfit/t5-base/qqp",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "qqp",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "qqp",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 128,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 20,
"output_dir": "outputs/bitfit/t5-base/rte",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "sst2",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 128,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 3,
"output_dir": "outputs/bitfit/t5-base/sst2",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "sst2",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "sst2",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "stsb",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 128,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 20,
"output_dir": "outputs/bitfit/t5-base/stsb",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "stsb",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "stsb",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-boolq",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 20,
"output_dir": "outputs/bitfit/t5-base/superglue-boolq",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-boolq",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-boolq",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-cb",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 20,
"output_dir": "outputs/bitfit/t5-base/superglue-cb",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-cb",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-cb",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-copa",
"eval_steps": 50,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 40,
"output_dir": "outputs/bitfit/t5-base/superglue-copa",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 50,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-copa",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-copa",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-multirc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 3,
"output_dir": "outputs/bitfit/t5-base/superglue-multirc",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-multirc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-multirc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-record",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 512,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 3,
"output_dir": "outputs/bitfit/t5-base/superglue-record",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-record",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-record",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-wic",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 20,
"output_dir": "outputs/bitfit/t5-base/superglue-wic",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-wic",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-wic",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,40 +0,0 @@
{
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-wsc.fixed",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"num_train_epochs": 20,
"output_dir": "outputs/bitfit/t5-base/superglue-wsc.fixed",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": true,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-wsc.fixed",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-wsc.fixed",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,426 +0,0 @@
import collections
import copy
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['t5-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "t5-base",
"tokenizer_name": "t5-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
BaseConfigs['t5-large'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "/home/hushengding/plm_cache/t5-large",
"tokenizer_name": "/home/hushengding/plm_cache/t5-large",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
BaseConfigs['t5-3b'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
"tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-base/",
})
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-base/",
})
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-base/",
})
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/t5-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
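# NOTE: BaseConfigs['t5-xxl'] is never defined in this file, so requesting the
# 'low_rank_adapter_t5-xxl' job below would raise a KeyError.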
AllConfigs['low_rank_adapter_t5-xxl'] = copy.deepcopy(BaseConfigs['t5-xxl'])
AllConfigs['low_rank_adapter_t5-xxl'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/t5-xxl/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-base/",
})
AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['none_t5-base'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-base/",
})
AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-large/",
})
AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['none_t5-large'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-large/",
})
AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-3b/",
})
AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['none_t5-3b'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-3b/",
})
AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['adapter_t5-3b'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-3b/",
})
AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['adapter_t5-large'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-large/",
})
AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['lora_t5-large'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-large/",
})
AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['lora_t5-3b'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-3b/",
})
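# Note (editor): AllConfigs keys follow the pattern "<delta_type>_<model>"; pass one of
# them via --job below to materialize its per-task JSON configs.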
if __name__ == "__main__":
import argparse
import json
import os
    parser = argparse.ArgumentParser(description="Generate per-task configuration JSON files.")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
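    # The tuple key of each BaseConfig bundles per-task settings (job_name, task_name,
    # epochs, sequence lengths, batch sizes, ...) whose values are zipped task-by-task;
    # despite the list name below, the values are zipped rather than crossed.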
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
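    # Scalar (non-tuple) keys apply to every job; "output_dir" additionally gets the
    # job name appended so each task writes to its own directory.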
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"./{args.job}/"):
os.mkdir(f"./{args.job}/")
for job_name in all_config_jsons:
with open(f"./{args.job}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
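    # Usage sketch (editor's note; "config_gen.py" is an assumed filename for this
    # script and "lora_t5-base" an example job name):
    #     python config_gen.py --job lora_t5-base
    # writes one JSON per task into ./lora_t5-base/ (e.g. ./lora_t5-base/rte.json),
    # which run_seq2seq.py then consumes as its single config argument.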

View File

@ -1,4 +1,4 @@
import collections
import collections
import copy
BS = 1
@ -6,13 +6,13 @@ AllConfigs = {}
BaseConfigs = {}
BaseConfigs['t5-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
@ -27,7 +27,7 @@ BaseConfigs['t5-base'] = {
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "t5-base",
"tokenizer_name": "t5-base",
"save_total_limit": 1,
@ -50,13 +50,13 @@ BaseConfigs['t5-base'] = {
}
BaseConfigs['t5-large'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
@ -71,7 +71,7 @@ BaseConfigs['t5-large'] = {
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "/home/hushengding/plm_cache/t5-large",
"tokenizer_name": "/home/hushengding/plm_cache/t5-large",
"save_total_limit": 1,
@ -94,13 +94,13 @@ BaseConfigs['t5-large'] = {
}
BaseConfigs['t5-3b'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
@ -115,7 +115,7 @@ BaseConfigs['t5-3b'] = {
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
"tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
"save_total_limit": 1,
@ -139,8 +139,8 @@ BaseConfigs['t5-3b'] = {
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-base/",
})
@ -185,16 +185,16 @@ AllConfigs['compacter_t5-base'].update({
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
@ -222,16 +222,16 @@ AllConfigs['compacter++_t5-base'].update({
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
@ -252,7 +252,7 @@ AllConfigs['low_rank_adapter_t5-base'].update({
],
"output_dir": "outputs/low_rank_adapter/t5-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
@ -288,8 +288,8 @@ AllConfigs['none_t5-base'].update({
AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-large/",
})
@ -303,8 +303,8 @@ AllConfigs['none_t5-large'].update({
AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-3b/",
})
@ -367,7 +367,7 @@ AllConfigs['lora_t5-3b'].update({
"output_dir": "outputs/lora/t5-3b/",
})
if __name__ == "__main__":
import argparse
import json
@ -405,7 +405,6 @@ if __name__ == "__main__":
for job_name in all_config_jsons:
with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -1,411 +0,0 @@
import collections
import copy
BS = 64
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['t5-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "t5-base",
"tokenizer_name": "t5-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
BaseConfigs['t5-large'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "/home/hushengding/plm_cache/t5-large",
"tokenizer_name": "/home/hushengding/plm_cache/t5-large",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
BaseConfigs['t5-3b'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
"tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-base/",
})
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-base/",
})
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-base/",
})
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/t5-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-base/",
})
AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['none_t5-base'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-base/",
})
AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-large/",
})
AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['none_t5-large'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-large/",
})
AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-3b/",
})
AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['none_t5-3b'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-3b/",
})
AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['adapter_t5-3b'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-3b/",
})
AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['adapter_t5-large'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-large/",
})
AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['lora_t5-large'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-large/",
})
AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['lora_t5-3b'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-3b/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"./{args.job}_{BS}/"):
os.mkdir(f"./{args.job}_{BS}/")
for job_name in all_config_jsons:
with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -1,411 +0,0 @@
import collections
import copy
BS = 8
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['t5-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "t5-base",
"tokenizer_name": "t5-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
BaseConfigs['t5-large'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "/home/hushengding/plm_cache/t5-large",
"tokenizer_name": "/home/hushengding/plm_cache/t5-large",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
BaseConfigs['t5-3b'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
"tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-base/",
})
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-base/",
})
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-base/",
})
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/t5-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-base/",
})
AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['none_t5-base'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-base/",
})
AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-large/",
})
AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['none_t5-large'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-large/",
})
AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-3b/",
})
AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['none_t5-3b'].update({
"delta_type": "none",
"learning_rate": 3e-5,
"output_dir": "outputs/none/t5-3b/",
})
AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['adapter_t5-3b'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-3b/",
})
AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['adapter_t5-large'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-large/",
})
AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['lora_t5-large'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-large/",
})
AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['lora_t5-3b'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-3b/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
if not os.path.exists(f"./{args.job}_{BS}/"):
os.mkdir(f"./{args.job}_{BS}/")
for job_name in all_config_jsons:
with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -1,26 +0,0 @@
lora 32 0.8396220207214355 3.7825655937194824
lora 32 2.773350238800049 10.523799419403076
lora 32 10.683510303497314 32.6428017616272
lora 32 0.8396220207214355 3.7825236320495605
lora 32 2.773350238800049 10.523311138153076
adapter 32 0.8578410148620605 3.986640453338623
adapter 32 2.821873188018799 11.039577007293701
adapter 32 10.696877002716064 33.12049341201782
adapter 8 0.8578410148620605 1.6147065162658691
adapter 8 2.821873188018799 4.828186511993408
adapter 8 10.696877002716064 16.09417200088501
lora 8 0.8396220207214355 1.5540986061096191
lora 8 2.773350238800049 4.664810657501221
lora 1 0.8396220207214355 0.9107160568237305
lora 8 10.683510303497314 15.965403079986572
lora 64 0.8396220207214355 6.777950763702393
lora 1 2.773350238800049 2.9350662231445312
lora 64 2.773350238800049 18.340473651885986
lora 1 10.683510303497314 11.131460189819336
adapter 1 0.8578410148620605 0.9334897994995117
lora 64 10.683510303497314 54.61024713516235
adapter 1 2.821873188018799 2.9950332641601562
adapter 64 0.8578410148620605 7.167330265045166
adapter 1 10.696877002716064 11.156260967254639
adapter 64 2.821873188018799 19.32366418838501
adapter 64 10.696877002716064 55.56023454666138

View File

@ -1,7 +0,0 @@
files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed)
for ((i=$1; i<=$2; i++))
do
dataset=${files[i]}
echo "id$i:$dataset"
TOKENIZERS_PARALLELISM=false python run_seq2seq.py configs/$3/$dataset.json
done

View File

@ -1,34 +0,0 @@
# files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed)
# for ((i=$1; i<=$2; i++))
# do
# dataset=${files[i]}
# echo "id$i:$dataset"
# TOKENIZERS_PARALLELISM=false python run_seq2seq.py configs/$3/$dataset.json
# done
cd configs
for deltatype in "lora" "adapter"
do
for modeltype in "t5-base" "t5-large" "t5-3b"
do
echo $deltatype
python config_gen_bs$2.py --job $deltatype\_$modeltype
done
done
ls
cd ..
for deltatype in "lora" "adapter"
do
for modeltype in "t5-base" "t5-large" "t5-3b"
do
CUDA_VISIBLE_DEVICES=$1 bash run.sh 2 2 $deltatype\_$modeltype\_$2
done
done

View File

@ -19,9 +19,9 @@ Fine-tuning the library models for sequence to sequence.
import functools
import logging
# from opendelta.utils.delta_center import create_hub_repo_name
import torch
import torch
import os
os.environ['MKL_THREADING_LAYER'] = 'GNU'
os.environ['MKL_THREADING_LAYER'] = 'GNU'
os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
import sys
import subprocess
@ -43,14 +43,14 @@ from transformers.trainer_utils import is_main_process, get_last_checkpoint
from examples_seq2seq.data_processors import AutoTask, TaskDataCollatorForSeq2Seq, AutoPostProcessor
from examples_seq2seq.seq2seq_trainer import Seq2SeqTrainer
# from training_args import AdapterTrainingArguments
from examples_seq2seq.trainers.trainer_utils import save_training_config
from examples_seq2seq.trainers.trainer_utils import save_training_config
from dataclasses import dataclass, field
from transformers.models.t5.modeling_t5 import T5Config, T5ForConditionalGeneration
from examples_seq2seq.trainers.model_args import ModelArguments
from examples_seq2seq.trainers.trainer_args import TrainingArguments, DataTrainingArguments
import tensorboardX
import tensorboardX
tb_writer = tensorboardX.SummaryWriter("Delta_Memory")
logger = logging.getLogger(__name__)
@ -100,7 +100,7 @@ class RemainArgHfArgumentParser(HfArgumentParser):
inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys}
obj = dtype(**inputs)
outputs.append(obj)
remain_args = argparse.ArgumentParser()
remain_args.__dict__.update(data)
if return_remaining_args:
@ -108,41 +108,41 @@ class RemainArgHfArgumentParser(HfArgumentParser):
else:
return (*outputs,)
from transformers.trainer_callback import TrainerCallback
# from transformers.trainer_callback import TrainerCallback
class MyCallback(TrainerCallback):
def __init__(self, *args, **kwargs):
self.delta_args = kwargs.pop("delta_args")
self.trainer_args = kwargs.pop("trainer_args")
self.model_args = kwargs.pop("model_args")
super(MyCallback, self).__init__(*args, **kwargs)
maxcudamem = 0
def on_step_end(self, args, state, control, **kwargs ):
glb_step = state.global_step
cudamem = 0
realcudamem =0
for device_id in range(torch.cuda.device_count()):
cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3
realcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3
torch.cuda.reset_peak_memory_stats(f"cuda:{device_id}")
self.maxcudamem = max(self.maxcudamem, realcudamem)
self.cudamem = cudamem
# self.tb_writer.add_scalar("Static Memory (GB)", cudamem, glb_step)
# class MyCallback(TrainerCallback):
# def __init__(self, *args, **kwargs):
# self.delta_args = kwargs.pop("delta_args")
# self.trainer_args = kwargs.pop("trainer_args")
# self.model_args = kwargs.pop("model_args")
# super(MyCallback, self).__init__(*args, **kwargs)
# maxcudamem = 0
# def on_step_end(self, args, state, control, **kwargs ):
# glb_step = state.global_step
# cudamem = 0
# realcudamem =0
# for device_id in range(torch.cuda.device_count()):
# cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3
# realcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3
# torch.cuda.reset_peak_memory_stats(f"cuda:{device_id}")
# self.maxcudamem = max(self.maxcudamem, realcudamem)
# self.cudamem = cudamem
# # self.tb_writer.add_scalar("Static Memory (GB)", cudamem, glb_step)
# self.tb_writer.add_scalar("Runtime Memory (GB)", realcudamem, glb_step)
# self.tb_writer.add_scalar("Peak Memory (GB)", self.maxcudamem, glb_step)
if glb_step > 50:
content = f"{self.delta_args.delta_type}\t{self.trainer_args.per_device_train_batch_size}\t{self.model_args.model_name_or_path}\t{self.cudamem}\t{self.maxcudamem}\n"
with open("memory_data.txt", 'a') as fout:
fout.write(content)
exit()
# if glb_step > 50:
# content = f"{self.delta_args.delta_type}\t{self.trainer_args.per_device_train_batch_size}\t{self.model_args.model_name_or_path}\t{self.cudamem}\t{self.maxcudamem}\n"
# with open("memory_data.txt", 'a') as fout:
# fout.write(content)
# exit()
@ -172,7 +172,7 @@ def main():
"Use --overwrite_output_dir to overcome."
)
'''
pass
pass
elif last_checkpoint is not None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
@ -273,7 +273,7 @@ def main():
# Temporarily set max_target_length for training.
#max_target_length = data_args.max_target_length
padding = "max_length" if data_args.pad_to_max_length else False
def preprocess_function(examples, max_target_length):
# max_target_length += 1
# model_inputs = tokenizer([s+"<extra_id_0>" for s in examples['source']], max_length=data_args.max_source_length,
@ -301,7 +301,7 @@ def main():
if training_args.do_train:
train_datasets = [AutoTask.get(dataset_name,
dataset_config_name,
seed=data_args.data_seed).get(
seed=data_args.data_sample_seed).get(
split="train",
split_validation_test=training_args.split_validation_test,
add_prefix=True,
@ -320,11 +320,11 @@ def main():
load_from_cache_file=not data_args.overwrite_cache,
)
train_dataset = concatenate_datasets(train_datasets)
if training_args.do_eval:
eval_datasets = {eval_dataset: AutoTask.get(eval_dataset, eval_dataset_config,
seed=data_args.data_seed).get(
split="validation",
seed=data_args.data_sample_seed).get(
split="validation",
split_validation_test=training_args.split_validation_test,
add_prefix=True,
n_obs=data_args.max_val_samples)
@ -343,8 +343,8 @@ def main():
if training_args.do_test:
test_datasets = {test_dataset: AutoTask.get(test_dataset, test_dataset_config,
seed=data_args.data_seed).get(
split="test",
seed=data_args.data_sample_seed).get(
split="test",
split_validation_test=training_args.split_validation_test,
add_prefix=True,
n_obs=data_args.max_test_samples)
@ -379,10 +379,10 @@ def main():
# Extracts the extra information needed to evaluate on each dataset.
# These information are only used in the compute_metrics.
# We will assume that the test/eval dataloader does not change the order of
# We will assume that the test/eval dataloader does not change the order of
# the data.
data_info = {"eval": eval_datasets[data_args.eval_dataset_name[0]]['extra_fields'],
"test": test_datasets[data_args.test_dataset_name[0]]['extra_fields'],
"test": test_datasets[data_args.test_dataset_name[0]]['extra_fields'],
"train": train_dataset['extra_fields']}
def compute_metrics(eval_preds):
preds, labels, data_info = eval_preds
@ -409,10 +409,10 @@ def main():
evaluation_metrics = TASK_TO_METRICS[data_args.dataset_name[0]],
)
trainer.add_callback(MyCallback(trainer_args=training_args, delta_args=delta_args, model_args=model_args))
# trainer.add_callback(MyCallback(trainer_args=training_args, delta_args=delta_args, model_args=model_args))
# Saves training config.
# Saves training config.
if trainer.is_world_process_zero():
os.makedirs(training_args.output_dir, exist_ok=True)
save_training_config(sys.argv[1], training_args.output_dir)
@ -430,15 +430,15 @@ def main():
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
train_result = trainer.train(resume_from_checkpoint=checkpoint)
if training_args.compute_time:
end.record()
torch.cuda.synchronize() # wait for all_reduce to complete
total_time = start.elapsed_time(end)/(1000*60)
performance_metrics.update({"total_time in minutes ": total_time})
trainer.save_model() # Saves the tokenizer too for easy upload
train_metrics = train_result.metrics
max_train_samples = (
@ -460,7 +460,7 @@ def main():
if training_args.compute_memory or training_args.compute_time:
print(performance_metrics)
trainer.save_metrics("performance", performance_metrics)
# Evaluation
results = {}
if training_args.do_eval:
@ -484,9 +484,9 @@ def main():
trainer.log_metrics("test", metrics)
trainer.save_metrics("test", metrics)
results['test'] = metrics
repo_name = create_hub_repo_name(root="DeltaHub",
dataset=data_args.task_name,
dataset=data_args.task_name,
delta_type = delta_args.delta_type,
model_name_or_path= model_args.model_name_or_path)
results['repo_name'] = repo_name

View File

@ -5,21 +5,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union
from torch.utils.data.dataset import Dataset
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainner
from examples_seq2seq.trainers.trainer import BaseTrainer
from examples_seq2seq.trainers.trainer import BaseTrainer
# if is_sagemaker_mp_enabled():
# import smdistributed.modelparallel.torch as smp
# from transformers.trainer_utils import ShardedDDPOption
# if is_fairscale_available():
# dep_version_check("fairscale")
# import fairscale
# from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP
# from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP
# from fairscale.nn.wrap import auto_wrap
# from fairscale.optim import OSS
# from fairscale.optim.grad_scaler import ShardedGradScaler
from transformers.optimization import Adafactor, AdamW, get_scheduler
from transformers.trainer_pt_utils import get_parameter_names, is_sagemaker_mp_enabled
@ -121,7 +108,7 @@ class Seq2SeqTrainer(HfSeq2SeqTrainner, BaseTrainer):
return (loss, generated_tokens, labels)

View File

@ -1,6 +1,6 @@
from dataclasses import dataclass, field
from typing import Optional, List
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainingArguments
# run_seq2seq parameters.
@dataclass
@ -127,8 +127,9 @@ class DataTrainingArguments:
default=None,
metadata={"help": "Defines a dictionary from tasks to the tasks embeddings."}
)
data_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."})
data_sample_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."})
model_parallel: Optional[bool] = field(default=False, metadata={"help": "whether apply model parallelization"})
def __post_init__(self):

View File

@ -1,4 +1,3 @@
from examples_prompt.metrics.metrics import exact_match
from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func
from opendelta.utils.name_based_addressing import *
from opendelta.utils.cuda import get_device
@ -7,8 +6,6 @@ from typing import *
import torch
import torch.nn as nn
from opendelta import BaseDeltaConfig
from decorator import decorate
import torch.nn.functional as F
from opendelta import logging
logger = logging.get_logger(__name__)