commit 211dce1b9d

@ -0,0 +1,178 @@
from typing import Any, Dict, List, Optional, Tuple, Union

from openpromptu.data_utils import InputExample
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
)
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import torch
import torch.nn as nn


def mask_token_func(tokenizer, ith_mask=0):
    return tokenizer.mask_token


def get_remove_columns(dataset_features):
    return dataset_features


def preprocess_function(raw_example, **kwargs):
    tokenizer = kwargs['tokenizer']
    data_args = kwargs['data_args']
    template = kwargs['template']
    verbalizer = kwargs['verbalizer']
    tokenizer_wrapper = kwargs['tokenizer_wrapper']
    split = kwargs['split']
    example = InputExample(**raw_example)

    example = verbalizer.wrap_one_example(example)
    example, other = template.wrap_one_example(example)
    input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
    model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
                             padding="max_length", truncation=True)

    # Tokenize the verbalized target text as the labels.
    with tokenizer.as_target_tokenizer():
        label = tokenizer(other['tgt_text']).input_ids

    model_inputs["labels"] = label
    return model_inputs
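

# A hedged usage sketch, not part of the commit: how preprocess_function is
# typically mapped over a datasets split. The dataset handle and task object
# here are hypothetical placeholders.
def _example_map_preprocess(raw_dataset, task, tokenizer, data_args):
    template, verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, data_args)
    return raw_dataset.map(
        preprocess_function,
        fn_kwargs=dict(tokenizer=tokenizer, data_args=data_args, template=template,
                       verbalizer=verbalizer, tokenizer_wrapper=tokenizer_wrapper,
                       split="train"),
        remove_columns=get_remove_columns(list(raw_dataset.features)),
    )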


def get_backbone(model_args, **kwargs):
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # Disable dropout so tuning sees a deterministic backbone.
    config.dropout_rate = 0.0
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    return config, tokenizer, model
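

# Hedged sketch, not part of the commit: the attributes get_backbone expects on
# model_args, shown with argparse.Namespace as a stand-in container.
def _example_get_backbone():
    from argparse import Namespace
    model_args = Namespace(
        model_name_or_path="t5-base", config_name=None, tokenizer_name=None,
        cache_dir=None, model_revision="main", use_auth_token=False,
        use_fast_tokenizer=True,
    )
    return get_backbone(model_args)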


def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
    from openpromptu.prompts import GenerationVerbalizer
    from openpromptu.prompts import ManualTemplate
    from openpromptu import TokenizerWrapper
    template = ManualTemplate(text=task.templates_text[template_id])
    verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes=task.labels_list,
                                      label_words=task.verbalizers[verbalizer_id])
    tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length,
                                         tokenizer=tokenizer, truncate_method="balanced",
                                         mask_token_func=mask_token_func)
    return template, verbalizer, tokenizer_wrapper


class Trainer(HfSeq2SeqTrainer):
    def __init__(self, verbalizer=None, eval_task=None, **kwargs):
        super().__init__(**kwargs)
        self.verbalizer = verbalizer
        self.eval_task = eval_task
        self.compute_metrics = self._compute_metrics

    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        if return_outputs:
            return (outputs.loss, outputs)
        else:
            return outputs.loss

    def prediction_step(
        self,
        model: nn.Module,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform an evaluation step on :obj:`model` using :obj:`inputs`.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to evaluate.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
            prediction_loss_only (:obj:`bool`):
                Whether or not to return the loss only.

        Return:
            Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
            labels (each being optional).
        """
        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(
                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
            )

        has_labels = "labels" in inputs
        inputs = self._prepare_inputs(inputs)
        gen_kwargs = {
            # Classification targets are short, so a small fixed budget is used
            # instead of self._max_length / self.model.config.max_length.
            "max_length": 10,
            "num_beams": 1,
        }
        generated_tokens = self.model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            **gen_kwargs,
        )
        # in case the batch is shorter than max length, the output should be padded
        if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])

        with torch.no_grad():
            outputs = model(**inputs)
            if has_labels:
                if self.label_smoother is not None:
                    loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
                else:
                    loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
            else:
                loss = None

        if self.args.prediction_loss_only:
            return (loss, None, None)

        labels = inputs["labels"]
        if labels.shape[-1] < gen_kwargs["max_length"]:
            labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])

        return (loss, generated_tokens, labels)

    def _compute_metrics(self, eval_preds):
        preds, labels = eval_preds
        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
        result = {}
        for metric in self.eval_task.metric:
            result.update(metric(decoded_preds, decoded_labels))

        average_metric = sum(result.values()) / len(result)
        result.update({"average_metrics": average_metric})
        return result
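

# Hedged sketch, not part of the commit: how this Trainer is typically
# constructed. Seq2SeqTrainingArguments with predict_with_generate=True is what
# routes evaluation through the generate-based prediction_step above.
def _example_build_trainer(model, tokenizer, train_dataset, eval_dataset, eval_task):
    from transformers import Seq2SeqTrainingArguments
    args = Seq2SeqTrainingArguments(output_dir="outputs/example", predict_with_generate=True)
    return Trainer(model=model, args=args, tokenizer=tokenizer,
                   train_dataset=train_dataset, eval_dataset=eval_dataset,
                   eval_task=eval_task, data_collator=DataCollator(tokenizer, model=model))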


@ -13,11 +13,6 @@ from transformers import ViTFeatureExtractor
from transformers import Trainer as HfTrainer
import torch.nn as nn


def process_example(raw_example, **kwargs):
    tokenizer = kwargs['tokenizer']
    inputs = tokenizer(raw_example['image'], return_tensors='pt')
    inputs['labels'] = raw_example['labels']
    return inputs


def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
    # from openpromptu.prompts import ManualVerbalizer

@ -49,7 +49,7 @@ def mask_token_func(tokenizer, ith_mask=0):
    return tokenizer.mask_token

def get_remove_columns(dataset_features):
    dataset_features.remove("label")
    return dataset_features


@ -60,6 +60,7 @@ def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
    template = ManualTemplate(text=task.templates_text[template_id])
    verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes=task.labels_list, label_words=task.verbalizers[verbalizer_id])
    tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
    return template, verbalizer, tokenizer_wrapper

class DataCollator(HfDataCollatorMixin):


@ -0,0 +1,143 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
    AutoConfig,
    AutoModelForMaskedLM,
    AutoTokenizer,
)

from transformers import Trainer as HfTrainer


def preprocess_function(raw_example, **kwargs):
    tokenizer = kwargs['tokenizer']
    data_args = kwargs['data_args']
    template = kwargs['template']
    verbalizer = kwargs['verbalizer']
    tokenizer_wrapper = kwargs['tokenizer_wrapper']

    example = InputExample(**raw_example)
    example, other = template.wrap_one_example(example)
    input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
    model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
                             padding="max_length", truncation=True)
    return model_inputs


def compute_metrics(eval_preds, dataset_name, eval_metric):
    preds, labels = eval_preds.predictions, eval_preds.label_ids
    preds = np.argmax(preds, axis=-1)

    result = {}
    average_metrics = []
    for metric in eval_metric:
        metric_item = metric(preds, labels)
        metric_value = list(metric_item.values())
        result.update(metric_item)
        average_metrics.extend(metric_value)
    average_metric = sum(average_metrics) / len(average_metrics)
    result.update({"average_metrics": average_metric})
    return result
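

# Hedged sketch, not part of the commit: the metric callables iterated above
# are assumed to map (preds, labels) arrays to a {name: value} dict, e.g.:
def _example_accuracy_metric(preds, labels):
    return {"accuracy": float((np.asarray(preds) == np.asarray(labels)).mean())}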


def mask_token_func(tokenizer, ith_mask=0):
    return tokenizer.mask_token


def get_remove_columns(dataset_features):
    dataset_features.remove("label")
    return dataset_features


def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
    from openpromptu.prompts import ManualVerbalizer
    from openpromptu.prompts import ManualTemplate
    from openpromptu import TokenizerWrapper
    template = ManualTemplate(text=task.templates_text[template_id])
    verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes=task.labels_list, label_words=task.verbalizers[verbalizer_id])
    tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
    return template, verbalizer, tokenizer_wrapper


class DataCollator(HfDataCollatorMixin):
    def __init__(self, *args, **kwargs):
        self.return_tensors = 'pt'

    def torch_call(self, features):
        return torch_default_data_collator(features=features)


def get_backbone(model_args, **kwargs):
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    config.dropout_rate = 0.0
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForMaskedLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    model.resize_token_embeddings(len(tokenizer))
    return config, tokenizer, model


class Trainer(HfTrainer):
    def __init__(self, verbalizer=None, eval_task=None, **kwargs):
        super().__init__(**kwargs)
        self.verbalizer = verbalizer
        self.eval_task = eval_task
        self.compute_metrics = self._compute_metrics

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.get("logits")
        input_ids = inputs['input_ids']
        # Move the verbalizer onto the same device as the logits instead of
        # assuming CUDA is available.
        verbalizer = self.verbalizer.to(logits.device)
        # Pick out the logits at every mask position, then let the verbalizer
        # map them onto the label words.
        logits_at_mask = logits[torch.where(input_ids == verbalizer.tokenizer.mask_token_id)]
        label_logits = verbalizer.process_logits(logits_at_mask)
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(label_logits, labels)
        outputs.logits = label_logits
        return (loss, outputs) if return_outputs else loss

    def _compute_metrics(self, eval_preds):
        preds, labels = eval_preds.predictions, eval_preds.label_ids
        preds = np.argmax(preds, axis=-1)

        result = {}
        average_metrics = []
        for metric in self.eval_task.metric:
            metric_item = metric(preds, labels)
            metric_value = list(metric_item.values())
            result.update(metric_item)
            average_metrics.extend(metric_value)
        average_metric = sum(average_metrics) / len(average_metrics)
        result.update({"average_metrics": average_metric})
        return result
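

# Hedged illustration, not part of the commit, of the mask lookup used in
# compute_loss above: torch.where on (input_ids == mask_token_id) selects the
# position of every mask token, and indexing the logits with it yields one
# vocabulary distribution per mask. The mask id 103 is a made-up example.
def _example_mask_lookup():
    input_ids = torch.tensor([[5, 103, 7], [103, 8, 9]])
    logits = torch.randn(2, 3, 10)
    logits_at_mask = logits[torch.where(input_ids == 103)]
    return logits_at_mask.shape  # torch.Size([2, 10]): one row per mask token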


@ -0,0 +1,182 @@
from typing import Any, Dict, List, Optional, Tuple, Union

from openpromptu.data_utils import InputExample
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
from transformers import (
    AutoConfig,
    BlenderbotForConditionalGeneration,
    AutoTokenizer,
)
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import torch
import torch.nn as nn


def mask_token_func(tokenizer, ith_mask=0):
    # Blenderbot templates use no mask token, so insert nothing.
    return ""


def get_remove_columns(dataset_features):
    return dataset_features


def preprocess_function(raw_example, **kwargs):
    tokenizer = kwargs['tokenizer']
    data_args = kwargs['data_args']
    template = kwargs['template']
    verbalizer = kwargs['verbalizer']
    tokenizer_wrapper = kwargs['tokenizer_wrapper']
    split = kwargs['split']
    example = InputExample(**raw_example)

    example = verbalizer.wrap_one_example(example)
    example, other = template.wrap_one_example(example)
    input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
    model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
                             padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        label = tokenizer(other['tgt_text']).input_ids

    model_inputs["labels"] = label
    return model_inputs


def get_backbone(model_args, **kwargs):
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    config.dropout_rate = 0.0
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    model = BlenderbotForConditionalGeneration.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    return config, tokenizer, model


def get_prompts(task, tokenizer, data_args, template_id="blenderbot", verbalizer_id="blenderbot"):
    from openpromptu.prompts import GenerationVerbalizer
    from openpromptu.prompts import ManualTemplate
    from openpromptu import TokenizerWrapper
    template = ManualTemplate(text=task.templates_text[template_id])
    verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes=task.labels_list,
                                      label_words=task.verbalizers[verbalizer_id])
    tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length,
                                         tokenizer=tokenizer, truncate_method="balanced",
                                         mask_token_func=mask_token_func)
    return template, verbalizer, tokenizer_wrapper


class Trainer(HfSeq2SeqTrainer):
    def __init__(self, verbalizer=None, eval_task=None, **kwargs):
        super().__init__(**kwargs)
        self.verbalizer = verbalizer
        self.eval_task = eval_task
        self.compute_metrics = self._compute_metrics

    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        if return_outputs:
            return (outputs.loss, outputs)
        else:
            return outputs.loss

    def prediction_step(
        self,
        model: nn.Module,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        prediction_loss_only: bool,
        ignore_keys: Optional[List[str]] = None,
    ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
        """
        Perform an evaluation step on :obj:`model` using :obj:`inputs`.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to evaluate.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
            prediction_loss_only (:obj:`bool`):
                Whether or not to return the loss only.

        Return:
            Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
            labels (each being optional).
        """
        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(
                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
            )

        has_labels = "labels" in inputs
        inputs = self._prepare_inputs(inputs)
        gen_kwargs = {
            "max_length": 10,
            "num_beams": 1,
            # For open-ended Blenderbot chat a large min_length is usual, but in
            # classification the verbalized answers are short, so use 1.
            "min_length": 1,
        }
        generated_tokens = self.model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            **gen_kwargs,
        )
        # in case the batch is shorter than max length, the output should be padded
        if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
            generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])

        with torch.no_grad():
            outputs = model(**inputs)
            if has_labels:
                if self.label_smoother is not None:
                    loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
                else:
                    loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
            else:
                loss = None

        if self.args.prediction_loss_only:
            return (loss, None, None)

        labels = inputs["labels"]
        if labels.shape[-1] < gen_kwargs["max_length"]:
            labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])

        return (loss, generated_tokens, labels)

    def _compute_metrics(self, eval_preds):
        preds, labels = eval_preds
        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
        result = {}
        for metric in self.eval_task.metric:
            result.update(metric(decoded_preds, decoded_labels))

        average_metric = sum(result.values()) / len(result)
        result.update({"average_metrics": average_metric})
        return result
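

# Hedged sketch, not part of the commit: the right-padding performed by
# _pad_tensors_to_max_len above, so generated ids of different lengths can be
# concatenated across batches. The pad id 0 is an assumption for illustration.
def _example_pad_to_max_len(generated_tokens, max_length, pad_id=0):
    pad = generated_tokens.new_full(
        (generated_tokens.shape[0], max_length - generated_tokens.shape[-1]), pad_id)
    return torch.cat([generated_tokens, pad], dim=-1)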


@ -0,0 +1,172 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
    CLIPConfig,
    CLIPProcessor,
    CLIPModel,
)
from transformers import ViTFeatureExtractor
from PIL import Image
from transformers import Trainer as HfTrainer
import torch.nn as nn


def get_prompts(task, tokenizer, data_args, template_id="clip", verbalizer_id="clip"):
    from openpromptu.prompts import GenerationVerbalizer
    from openpromptu.prompts import ManualTemplate
    from openpromptu import TokenizerWrapper
    template = ManualTemplate(text=task.templates_text[template_id])
    verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes=task.labels_list, label_words=task.verbalizers[verbalizer_id])
    # CLIPProcessor bundles an image processor and a tokenizer; the wrapper
    # only needs the text side, hence tokenizer.tokenizer.
    tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer.tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
    return template, verbalizer, tokenizer_wrapper


def mask_token_func(tokenizer, ith_mask=0):
    return tokenizer.mask_token


def preprocess_function(raw_example, **kwargs):
    tokenizer = kwargs['tokenizer']
    data_args = kwargs['data_args']
    template = kwargs['template']
    verbalizer = kwargs['verbalizer']
    tokenizer_wrapper = kwargs['tokenizer_wrapper']

    example = InputExample(**raw_example)

    # Build one candidate sentence per class (e.g. "a photo of <label>") so
    # CLIP can score the image against every label.
    texts = []
    for candidate_label in range(verbalizer.num_classes):
        tgt_text = verbalizer.wrap_one_example(label=candidate_label)
        wrapped_example, other = template.wrap_one_example(example)
        input_sentence = tokenizer_wrapper.merge_wrapped_example(wrapped_example, tgt_texts=[tgt_text])
        texts.append(input_sentence)

    image = Image.open(raw_example['image_file_path'])

    model_inputs = tokenizer(images=image, text=texts, max_length=16, padding="max_length", truncation=True, return_tensors='pt')
    model_inputs["pixel_values"] = model_inputs["pixel_values"].squeeze()
    model_inputs["label"] = example.label
    return model_inputs


def compute_metrics(eval_preds, dataset_name, eval_metric):
    preds, labels = eval_preds.predictions, eval_preds.label_ids
    preds = np.argmax(preds, axis=-1)

    result = {}
    average_metrics = []
    for metric in eval_metric:
        metric_item = metric(preds, labels)
        metric_value = list(metric_item.values())
        result.update(metric_item)
        average_metrics.extend(metric_value)
    average_metric = sum(average_metrics) / len(average_metrics)
    result.update({"average_metrics": average_metric})
    return result


def get_remove_columns(dataset_features):
    dataset_features.remove("labels")
    return dataset_features


class DataCollator(HfDataCollatorMixin):
    def __init__(self, *args, **kwargs):
        self.return_tensors = 'pt'

    def torch_call(self, features):
        a = torch_default_data_collator(features=features)
        a["input_ids"] = a["input_ids"][0]
        a["attention_mask"] = a["attention_mask"][0]
        return a
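

# Hedged shape note, not part of the commit: each feature holds num_classes
# tokenized candidate texts, so torch_default_data_collator stacks them to
# (batch, num_classes, seq_len); DataCollator.torch_call's [0] indexing assumes
# batch size 1 and restores the (num_classes, seq_len) layout CLIP expects.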


def get_backbone(model_args, **kwargs):
    config = CLIPConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = CLIPProcessor.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    model = CLIPModel.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    return config, tokenizer, model


class Trainer(HfTrainer):
    def __init__(self, verbalizer=None, eval_task=None, **kwargs):
        super().__init__(**kwargs)
        self.verbalizer = verbalizer
        self.eval_task = eval_task
        self.compute_metrics = self._compute_metrics
        self.loss_fn = nn.CrossEntropyLoss()

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        # CLIP returns image-to-text similarity scores; treat them as class
        # logits over the candidate sentences.
        logits_per_image = outputs.logits_per_image
        loss = self.loss_fn(logits_per_image, labels)
        return (loss, outputs) if return_outputs else loss

    def _compute_metrics(self, eval_preds):
        preds, labels = eval_preds.predictions, eval_preds.label_ids
        preds = np.argmax(preds, axis=-1)

        result = {}
        average_metrics = []
        for metric in self.eval_task.metric:
            metric_item = metric(preds, labels)
            metric_value = list(metric_item.values())
            result.update(metric_item)
            average_metrics.extend(metric_value)
        average_metric = sum(average_metrics) / len(average_metrics)
        result.update({"average_metrics": average_metric})
        return result
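

# Hedged sketch, not part of the commit: zero-shot-style prediction with the
# inputs produced above. logits_per_image has shape (1, num_classes), so the
# argmax over the last axis is the predicted class index.
def _example_clip_predict(model, model_inputs):
    with torch.no_grad():
        outputs = model(input_ids=model_inputs["input_ids"],
                        attention_mask=model_inputs["attention_mask"],
                        pixel_values=model_inputs["pixel_values"].unsqueeze(0))
    return outputs.logits_per_image.argmax(dim=-1)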


@ -1,25 +1,33 @@
import collections
import copy

BS = 1
# PATHBASE = "/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE = "/home/hushengding/plm_cache/"

AllConfigs = {}

BaseConfigs = {}
BaseConfigs['t5-base'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32,  32,  32,  32,  32,  16,  32] + [BS] * 8,
        [ 32,  32,  32,  32,  32,  16,  32] + [BS] * 8,
        [0] * 7 + [0] * 8,
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),

@ -27,53 +35,9 @@ BaseConfigs['t5-base'] = {
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "t5-base",
    "tokenizer_name": "t5-base",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}


BaseConfigs['t5-large'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32,  32,  32,  32,  32,  16,  32] + [BS] * 8,
        [ 32,  32,  32,  32,  32,  16,  32] + [BS] * 8,
        [0] * 7 + [0] * 8,
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}t5-large",
    "tokenizer_name": f"{PATHBASE}t5-large",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,

@ -89,63 +53,18 @@ BaseConfigs['t5-large'] = {
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

BaseConfigs['t5-3b'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32,  32,  32,  32,  32,  16,  32] + [BS] * 8,
        [ 32,  32,  32,  32,  32,  16,  32] + [BS] * 8,
        [0] * 7 + [0] * 8,
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}t5-3b",
    "tokenizer_name": f"{PATHBASE}t5-3b",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": False,
    "push_to_delta_center": True,
    "save_strategy": "steps"
}


AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-base/",
})


AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
    "delta_type": "adapter",
@ -185,16 +104,16 @@ AllConfigs['compacter_t5-base'].update({
    "non_linearity": "gelu_new",

    #Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",

@ -222,16 +141,16 @@ AllConfigs['compacter++_t5-base'].update({
    "non_linearity": "gelu_new",

    #Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",

@ -252,7 +171,7 @@ AllConfigs['low_rank_adapter_t5-base'].update({
    ],
    "output_dir": "outputs/low_rank_adapter/t5-base/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})

@ -279,102 +198,71 @@ AllConfigs['prefix_t5-base'].update({
    "output_dir": "outputs/prefix/t5-base/",
})

AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['none_t5-base'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-base/",
})

AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-large/",
})

AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['none_t5-large'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-large/",
})


AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-3b/",
})

AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['none_t5-3b'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-3b/",
})

AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['adapter_t5-3b'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-3b/",
})

AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/t5-base/",
})

#### t5-small
BaseConfigs['t5-small'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32,  32,  32,  32,  32,  16,  32] + [32] * 8,
        [ 32,  32,  32,  32,  32,  16,  32] + [32] * 8,
        [0] * 7 + [0] * 8,
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}t5-small",
    "tokenizer_name": f"{PATHBASE}t5-small",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": False,
    "push_to_delta_center": True,
    "save_strategy": "steps"
}

AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['adapter_t5-large'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-large/",
})

AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/t5-small/",
})


AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['lora_t5-large'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-large/",
})

AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['lora_t5-3b'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-3b/",
})


if __name__ == "__main__":
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

@ -399,13 +287,12 @@ if __name__ == "__main__":
                    all_config_jsons[job_name][key] = config[key]

    # makedirs also creates the parent "configs" directory if it is missing.
    if not os.path.exists(f"configs/{args.job}/"):
        os.makedirs(f"configs/{args.job}/")

    for job_name in all_config_jsons:
        with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
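
    # Hedged sketch, not part of the commit, of the expansion performed above:
    # a tuple-valued key zips per-task columns into one JSON per job_name, e.g.
    #   {("job_name", "learning_rate"): zip(["rte", "mrpc"], [3e-4, 3e-4])}
    # becomes configs/<job>/rte.json and configs/<job>/mrpc.json, each holding
    # its own "learning_rate" plus every scalar key from the config.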
@ -0,0 +1,248 @@
import collections
import copy

# The first assignment is kept for reference; the second one takes effect.
# PATHBASE = "/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE = "/home/hushengding/plm_cache/"
# PATHBASE = ""

AllConfigs = {}

BaseConfigs = {}
BaseConfigs['bart-base'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32,  32,  32,  32,  32,  16,  32] + [32] * 8,
        [ 32,  32,  32,  32,  32,  16,  32] + [32] * 8,
        [0] * 7 + [0] * 8,
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}bart-base",
    "tokenizer_name": f"{PATHBASE}bart-base",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": False,
    "push_to_delta_center": True,
    "save_strategy": "steps"
}

AllConfigs['bitfit_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['bitfit_bart-base'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/bart-base/",
})

AllConfigs['adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['adapter_bart-base'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/bart-base/",
})

AllConfigs['lora_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['lora_bart-base'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "modified_modules": [
        "q_proj",
        "v_proj",
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/bart-base/",
})
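
# Hedged sketch, not part of the commit, of how these generated configs are
# consumed on the OpenDelta side; the exact runner wiring may differ.
#   from opendelta import AutoDeltaConfig, AutoDeltaModel
#   delta_config = AutoDeltaConfig.from_dict(
#       {"delta_type": "lora", "lora_r": 8, "modified_modules": ["q_proj", "v_proj"]})
#   delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
#   delta_model.freeze_module(exclude=["deltas", "layer_norm", "final_layer_norm"])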

AllConfigs['compacter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['compacter_bart-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter/bart-base/",
    "non_linearity": "gelu_new",

    #Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})

AllConfigs['compacter++_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['compacter++_bart-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "do_train": True,
    "do_eval": True,
    "do_test": True,
    "modified_modules": [
        "DenseReluDense"
    ],
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter++/bart-base/",
    "non_linearity": "gelu_new",

    #Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})


AllConfigs['low_rank_adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['low_rank_adapter_bart-base'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/low_rank_adapter/bart-base/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})


AllConfigs['soft_prompt_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['soft_prompt_bart-base'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-2,
    "soft_token_num": 100,
    "token_init": False,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/bart-base/",
})

AllConfigs['prefix_bart-base'] = copy.deepcopy(BaseConfigs['bart-base'])
AllConfigs['prefix_bart-base'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/bart-base/",
})


if __name__ == "__main__":
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    # Collect the tuple-valued keys: each one zips parallel per-task columns.
    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)
    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem
    # Scalar keys are shared across jobs; output_dir gets the job name appended.
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]

    if not os.path.exists(f"configs/{args.job}/"):
        os.makedirs(f"configs/{args.job}/")

    for job_name in all_config_jsons:
        with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
@ -205,209 +205,9 @@ AllConfigs['soft_prompt_beit-base-patch16-224'].update({
    ],
    "output_dir": "outputs/soft_prompt/beit-base-patch16-224/",
})

#### t5-small
BaseConfigs['t5-small'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32,  32,  32,  32,  32,  16,  32] + [32] * 8,
        [ 32,  32,  32,  32,  32,  16,  32] + [32] * 8,
        [0] * 7 + [0] * 8,
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100,  50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}t5-small",
    "tokenizer_name": f"{PATHBASE}t5-small",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": False,
    "push_to_delta_center": True,
    "save_strategy": "steps"
}
|
||||
|
||||
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
|
||||
AllConfigs['prefix_t5-small'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-small/",
|
||||
})
|
||||
|
||||
|
||||
|
||||
|
||||
#### ROBERTA######
|
||||
BaseConfigs['roberta-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}roberta-base",
|
||||
"tokenizer_name": f"{PATHBASE}roberta-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
|
||||
|
||||
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['bitfit_roberta-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/bitfit/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['none_roberta-base'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 1e-5,
|
||||
"output_dir": "outputs/none/roberta-base/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['lora_roberta-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/lora/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['adapter_roberta-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/adapter/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['low_rank_adapter_roberta-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/low_rank_adapter/roberta-base/",
|
||||
})
|
||||
|
||||
#### ROBERTA######
|
||||
BaseConfigs['bert-base-cased'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}bert-base-cased",
|
||||
"tokenizer_name": f"{PATHBASE}bert-base-cased",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['prefix_bert-base-cased'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/bert-base-cased/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['soft_prompt_bert-base-cased'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/bert-base-cased/",
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
|
|
|
@@ -0,0 +1,147 @@
import collections
import copy

# PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"  # overridden by the line below
PATHBASE="/home/hushengding/plm_cache/"

AllConfigs = {}

BaseConfigs = {}

#### BIGBIRD-ROBERTA ######
BaseConfigs['bigbird-roberta-large'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}bigbird-roberta-large",
    "tokenizer_name": f"{PATHBASE}bigbird-roberta-large",
    "save_total_limit": 1,
    # For glue datasets.
    "is_seq2seq": False,
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": False,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}


AllConfigs['bitfit_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['bitfit_bigbird-roberta-large'].update({
    "delta_type": "bitfit",
    "learning_rate": 1e-3,
    "output_dir": "outputs/bitfit/bigbird-roberta-large/",
})

AllConfigs['none_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['none_bigbird-roberta-large'].update({
    "delta_type": "none",
    "learning_rate": 1e-5,
    "output_dir": "outputs/none/bigbird-roberta-large/",
})

AllConfigs['lora_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['lora_bigbird-roberta-large'].update({
    "delta_type": "lora",
    "learning_rate": 1e-3,
    "modified_modules": [
        "query",
        "key",
    ],
    "output_dir": "outputs/lora/bigbird-roberta-large/",
})

AllConfigs['adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['adapter_bigbird-roberta-large'].update({
    "delta_type": "adapter",
    "learning_rate": 1e-3,
    "output_dir": "outputs/adapter/bigbird-roberta-large/",
})

AllConfigs['low_rank_adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['low_rank_adapter_bigbird-roberta-large'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 1e-3,
    "output_dir": "outputs/low_rank_adapter/bigbird-roberta-large/",
})


AllConfigs['soft_prompt_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large'])
AllConfigs['soft_prompt_bigbird-roberta-large'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/bigbird-roberta-large/",
})

if __name__ == "__main__":
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)
    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]


    if not os.path.exists(f"configs/{args.job}/"):
        os.mkdir(f"configs/{args.job}/")

    for job_name in all_config_jsons:
        with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
@@ -0,0 +1,254 @@
import collections
import copy

# PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"  # overridden by the line below
PATHBASE="/home/hushengding/plm_cache/"

AllConfigs = {}

BaseConfigs = {}
BaseConfigs['blenderbot-400M-distill'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}blenderbot-400M-distill",
    "tokenizer_name": f"{PATHBASE}blenderbot-400M-distill",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": False,
    "push_to_delta_center": True,
    "save_strategy": "steps"
}

AllConfigs['bitfit_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['bitfit_blenderbot-400M-distill'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/blenderbot-400M-distill/",
})

AllConfigs['adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['adapter_blenderbot-400M-distill'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/blenderbot-400M-distill/",
})

AllConfigs['lora_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['lora_blenderbot-400M-distill'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "modified_modules": [
        "q_proj",
        "v_proj",
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/blenderbot-400M-distill/",
})

AllConfigs['compacter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['compacter_blenderbot-400M-distill'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter/blenderbot-400M-distill/",
    "non_linearity": "gelu_new",

    # Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})
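# A minimal sketch of the PHM ("hypercomplex") parameterization the flags
# above configure, assuming torch is available and the adapter dims are
# divisible by the division factor n; names here are illustrative, not part
# of this diff:
import torch

def phm_weight(A, B):
    # A: (n, n, n) rule factors; B: (n, d_in // n, d_out // n) layer factors.
    # The dense (d_in, d_out) projection is a sum of n Kronecker products,
    # so parameters scale roughly as n^3 + d_in * d_out / n rather than
    # d_in * d_out.
    return sum(torch.kron(A[i], B[i]) for i in range(A.shape[0]))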

AllConfigs['compacter++_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['compacter++_blenderbot-400M-distill'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "do_train": True,
    "do_eval": True,
    "do_test": True,
    "modified_modules": [
        "DenseReluDense"  # NOTE: this is the T5 FFN module name; verify it matches Blenderbot's modules.
    ],
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter++/blenderbot-400M-distill/",
    "non_linearity": "gelu_new",

    # Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})


AllConfigs['low_rank_adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['low_rank_adapter_blenderbot-400M-distill'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/low_rank_adapter/blenderbot-400M-distill/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})

AllConfigs['none_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['none_blenderbot-400M-distill'].update({
    "delta_type": "none",
    "learning_rate": 1e-5,
    "output_dir": "outputs/none/blenderbot-400M-distill/",
})


AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['soft_prompt_blenderbot-400M-distill'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-2,
    "soft_token_num": 100,
    "token_init": False,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/blenderbot-400M-distill/",
})

AllConfigs['prefix_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['prefix_blenderbot-400M-distill'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/blenderbot-400M-distill/",
})

# NOTE: this second 'soft_prompt' entry overwrites the one defined above
# (learning_rate 3e-2 with 100 soft tokens), leaving learning_rate 3e-4.
AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill'])
AllConfigs['soft_prompt_blenderbot-400M-distill'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/blenderbot-400M-distill/",
})


if __name__ == "__main__":
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)
    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]


    if not os.path.exists(f"configs/{args.job}/"):
        os.mkdir(f"configs/{args.job}/")

    for job_name in all_config_jsons:
        with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
@@ -0,0 +1,303 @@
import collections
import copy

# PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"  # overridden by the line below
PATHBASE="/home/hushengding/plm_cache/"

AllConfigs = {}

BaseConfigs = {}
BaseConfigs['clip-vit-base-patch32'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps", "num_classes"): zip(
        ["beans"],
        ["beans"],  # "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["beans"],  # "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["beans"],  # "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20],
        [256],
        [ 32],
        [ 32],  # , 32, 32, 32, 32, 16, 32] + [32] * 8,
        [0],  # *7 + [0] *8,
        [200],  # 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200],  # , 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [ 3],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}clip-vit-base-patch32",
    "tokenizer_name": f"{PATHBASE}clip-vit-base-patch32",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": False,
    "push_to_delta_center": True,
    "save_strategy": "steps"
}

AllConfigs['bitfit_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['bitfit_clip-vit-base-patch32'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/clip-vit-base-patch32/",
})

AllConfigs['none_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['none_clip-vit-base-patch32'].update({
    "delta_type": "none",
    "learning_rate": 1e-5,
    "output_dir": "outputs/none/clip-vit-base-patch32/",
})

AllConfigs['adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['adapter_clip-vit-base-patch32'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/clip-vit-base-patch32/",
})

AllConfigs['lora_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['lora_clip-vit-base-patch32'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/clip-vit-base-patch32/",
})

AllConfigs['compacter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['compacter_clip-vit-base-patch32'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter/clip-vit-base-patch32/",
    "non_linearity": "gelu_new",

    # Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})

AllConfigs['compacter++_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['compacter++_clip-vit-base-patch32'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "do_train": True,
    "do_eval": True,
    "do_test": True,
    "modified_modules": [
        "DenseReluDense"  # NOTE: this is the T5 FFN module name; verify it matches CLIP's modules.
    ],
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter++/clip-vit-base-patch32/",
    "non_linearity": "gelu_new",

    # Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})


AllConfigs['low_rank_adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['low_rank_adapter_clip-vit-base-patch32'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/low_rank_adapter/clip-vit-base-patch32/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})


AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['soft_prompt_clip-vit-base-patch32'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-2,
    "soft_token_num": 100,
    "token_init": False,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/clip-vit-base-patch32/",
})

AllConfigs['prefix_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['prefix_clip-vit-base-patch32'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/clip-vit-base-patch32/",
})

# NOTE: this second 'soft_prompt' entry overwrites the one defined above
# (learning_rate 3e-2 with 100 soft tokens), leaving learning_rate 3e-4.
AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32'])
AllConfigs['soft_prompt_clip-vit-base-patch32'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/clip-vit-base-patch32/",
})
#### clip-vit-base-patch32
BaseConfigs['t5-small'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}t5-small",
    "tokenizer_name": f"{PATHBASE}t5-small",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": False,
    "push_to_delta_center": True,
    "save_strategy": "steps"
}

AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/t5-small/",
})


if __name__ == "__main__":
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)
    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]


    if not os.path.exists(f"configs/{args.job}/"):
        os.mkdir(f"configs/{args.job}/")

    for job_name in all_config_jsons:
        with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
@@ -76,8 +76,6 @@ AllConfigs['lora_t5-base'].update({
     "learning_rate": 3e-4,
     "unfrozen_modules": [
         "deltas",
-        "layer_norm",
-        "final_layer_norm"
     ],
     "lora_r": 8,
     "output_dir": "outputs/lora/t5-base/",
@@ -254,154 +252,6 @@ AllConfigs['prefix_t5-small'].update({
})


#### ROBERTA ######
BaseConfigs['roberta-base'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}roberta-base",
    "tokenizer_name": f"{PATHBASE}roberta-base",
    "save_total_limit": 1,
    # For glue datasets.
    "is_seq2seq": False,
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": False,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}


AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['bitfit_roberta-base'].update({
    "delta_type": "bitfit",
    "learning_rate": 1e-3,
    "output_dir": "outputs/bitfit/roberta-base/",
})

AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['none_roberta-base'].update({
    "delta_type": "none",
    "learning_rate": 1e-5,
    "output_dir": "outputs/none/roberta-base/",
})


AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['lora_roberta-base'].update({
    "delta_type": "lora",
    "learning_rate": 1e-3,
    "output_dir": "outputs/lora/roberta-base/",
})

AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['adapter_roberta-base'].update({
    "delta_type": "adapter",
    "learning_rate": 1e-3,
    "output_dir": "outputs/adapter/roberta-base/",
})

AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['low_rank_adapter_roberta-base'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 1e-3,
    "output_dir": "outputs/low_rank_adapter/roberta-base/",
})

#### BERT ######
BaseConfigs['bert-base-cased'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": f"{PATHBASE}bert-base-cased",
    "tokenizer_name": f"{PATHBASE}bert-base-cased",
    "save_total_limit": 1,
    # For glue datasets.
    "is_seq2seq": False,
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": False,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/bert-base-cased/",
})

AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/bert-base-cased/",
})

if __name__ == "__main__":
    import argparse
    import json
@@ -85,11 +85,14 @@ class SST2(AbstractTask):
                            "test": "validation"}

     verbalizers = {
-        "0":{"0":"negative","1":"positive"}
+        "0":{"0":"negative","1":"positive"},
+        "blenderbot":{"0":"negative","1":"positive"}
+
     }

     templates_text = {
-        "0":"""The sentiment of sentence: "{"meta":"sentence", "shortenable":True} is {"mask"}."""
+        "0":"""The sentiment of sentence: "{"meta":"sentence", "shortenable":True}" is {"mask"}.""",
+        "blenderbot": """{"meta":"sentence", "shortenable":True} what is the sentiment?"""
     }

     def load_dataset(self, split):
@@ -533,14 +536,15 @@ class Beans(AbstractTask):
     metric_names = ["accuracy"]

     verbalizers = {
-        "0": {
-            "0": "No",
-            "1": "Yes",
+        "clip": {
+            "angular_leaf_spot": "angular_leaf_spot",
+            "bean_rust": "bean_rust",
+            "healthy": "healthy",
         }
     }

     templates_text = {
-        "0": """{"meta":"sentence1"}"""
+        "clip":"""a photo of {"mask"} leaf."""
     }

     def load_dataset(self, split):
@@ -124,6 +124,9 @@ def main():
     if os.path.basename(model_args.model_name_or_path).startswith("t5"):
         from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
         from examples_prompt.backbones.t5 import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"):
+        from examples_prompt.backbones.blenderbot import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.blenderbot import Trainer, DataCollator
     elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \
         or os.path.basename(model_args.model_name_or_path).startswith("bert") \
         or os.path.basename(model_args.model_name_or_path).startswith("albert"):

@@ -132,6 +135,15 @@ def main():
     elif os.path.basename(model_args.model_name_or_path).startswith("beit"):
         from examples_prompt.backbones.beit import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
         from examples_prompt.backbones.beit import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("bart"):
+        from examples_prompt.backbones.bart import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.bart import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("bigbird"):
+        from examples_prompt.backbones.bigbird import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.bigbird import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("clip"):
+        from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.clip import Trainer, DataCollator

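# The chain above dispatches on the checkpoint basename's prefix. A minimal
# table-driven sketch of the same idea (module paths follow the imports above;
# resolve_backbone is an illustrative name, not part of this diff):
import importlib
import os

BACKBONE_PREFIXES = ("t5", "blenderbot", "roberta", "bert", "albert", "beit", "bart", "bigbird", "clip")

def resolve_backbone(model_name_or_path):
    base = os.path.basename(model_name_or_path)
    for prefix in BACKBONE_PREFIXES:
        if base.startswith(prefix):
            # Each backbone module exports get_backbone, preprocess_function,
            # mask_token_func, get_remove_columns, get_prompts, Trainer, DataCollator.
            return importlib.import_module(f"examples_prompt.backbones.{prefix}")
    raise ValueError(f"No backbone registered for checkpoint {base!r}")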
@@ -139,6 +151,9 @@ def main():

     config, tokenizer, model = get_backbone(model_args=model_args)

+    from opendelta import Visualization
+    Visualization(model).structure_graph()
+
     if delta_args.delta_type.lower() != "none":
         from opendelta import AutoDeltaConfig, AutoDeltaModel
         delta_config = AutoDeltaConfig.from_dict(vars(delta_args))
@@ -174,7 +189,7 @@ def main():
     task = AutoTask.get(data_args.task_name,
                         data_args.dataset_config_name,
                         data_args=data_args,
-                        seed=data_args.data_seed)
+                        seed=data_args.data_sample_seed)

     dataset = task.get(split=split_name,
                        split_validation_test=training_args.split_validation_test,
@@ -182,7 +197,7 @@ def main():



-    template, _verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, training_args)
+    template, _verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, data_args)


     dataset = dataset.map(
@@ -197,7 +197,7 @@ class DataTrainingArguments:
     datasets_saved_path: Optional[str] = field(
         default=None, metadata={"help": "the path of the saved datasets"}
     )
-    data_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."})
+    data_sample_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."})

     model_parallel: Optional[bool] = field(default=False, metadata={"help": "whether apply model parallelization"})
@@ -1,21 +0,0 @@
# the final results will be populated here.
{
    "evaluate": {
        "epoch": 20.0,
        "eval_accuracy": 89.2156862745098,
        "eval_average_metrics": 90.76168929110105,
        "eval_f1": 92.3076923076923,
        "eval_loss": 0.16493959724903107,
        "eval_runtime": 1.6391,
        "eval_samples_per_second": 124.455
    },
    "repo_name": "DeltaHub/bitfit_t5-base_mrpc",
    "test": {
        "epoch": 20.0,
        "test_accuracy": 88.23529411764706,
        "test_average_metrics": 89.97971602434077,
        "test_f1": 91.72413793103448,
        "test_loss": 0.14968213438987732,
        "test_runtime": 1.6344,
        "test_samples_per_second": 124.82
    }
}
@@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "cola",
    "eval_steps": 100,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 20,
    "output_dir": "outputs/bitfit/t5-base/cola",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 100,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "cola",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "cola",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mnli",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 3,
    "output_dir": "outputs/bitfit/t5-base/mnli",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "mnli",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mnli",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 20,
    "output_dir": "outputs/bitfit/t5-base/mrpc",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "qnli",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 3,
    "output_dir": "outputs/bitfit/t5-base/qnli",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "qnli",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "qnli",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "qqp",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 3,
    "output_dir": "outputs/bitfit/t5-base/qqp",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "qqp",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "qqp",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "rte",
    "eval_steps": 100,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 20,
    "output_dir": "outputs/bitfit/t5-base/rte",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 100,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "rte",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "rte",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "sst2",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 3,
    "output_dir": "outputs/bitfit/t5-base/sst2",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "sst2",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "sst2",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "stsb",
    "eval_steps": 100,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 20,
    "output_dir": "outputs/bitfit/t5-base/stsb",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 100,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "stsb",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "stsb",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "superglue-boolq",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 256,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 20,
    "output_dir": "outputs/bitfit/t5-base/superglue-boolq",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 32,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "superglue-boolq",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "superglue-boolq",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "superglue-cb",
    "eval_steps": 100,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 256,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 20,
    "output_dir": "outputs/bitfit/t5-base/superglue-cb",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 32,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 100,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "superglue-cb",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "superglue-cb",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "superglue-copa",
    "eval_steps": 50,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 256,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 40,
    "output_dir": "outputs/bitfit/t5-base/superglue-copa",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 32,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 50,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "superglue-copa",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "superglue-copa",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "superglue-multirc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 256,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 3,
    "output_dir": "outputs/bitfit/t5-base/superglue-multirc",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 32,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "superglue-multirc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "superglue-multirc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "superglue-record",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 512,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 3,
    "output_dir": "outputs/bitfit/t5-base/superglue-record",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 16,
    "per_device_train_batch_size": 16,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "superglue-record",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "superglue-record",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "superglue-wic",
    "eval_steps": 100,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 256,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 20,
    "output_dir": "outputs/bitfit/t5-base/superglue-wic",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 32,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 100,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "superglue-wic",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "superglue-wic",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,40 +0,0 @@
{
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "superglue-wsc.fixed",
    "eval_steps": 100,
    "evaluation_strategy": "steps",
    "greater_is_better": true,
    "learning_rate": 0.0003,
    "load_best_model_at_end": true,
    "max_source_length": 256,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "num_train_epochs": 20,
    "output_dir": "outputs/bitfit/t5-base/superglue-wsc.fixed",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 32,
    "predict_with_generate": true,
    "push_to_hub": true,
    "save_steps": 100,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 42,
    "split_validation_test": true,
    "task_name": "superglue-wsc.fixed",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "superglue-wsc.fixed",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,426 +0,0 @@
# Generates one JSON training config per (delta type, task) pair:
# tuple-valued keys are zipped into per-task columns, scalar keys are
# shared across every task of a job.
import collections
import copy

AllConfigs = {}

BaseConfigs = {}
BaseConfigs['t5-base'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "t5-base",
    "tokenizer_name": "t5-base",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}


BaseConfigs['t5-large'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-large",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-large",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

BaseConfigs['t5-3b'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,
    "model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-base/",
})


AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-base/",
})

AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-base/",
})

AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter/t5-base/",
    "non_linearity": "gelu_new",

    # Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})

AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "do_train": True,
    "do_eval": True,
    "do_test": True,
    "modified_modules": [
        "DenseReluDense"
    ],
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter++/t5-base/",
    "non_linearity": "gelu_new",

    # Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})


AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/low_rank_adapter/t5-base/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})

# NOTE: BaseConfigs['t5-xxl'] is never defined in this file, so the
# deepcopy below raises a KeyError as soon as the module is imported.
AllConfigs['low_rank_adapter_t5-xxl'] = copy.deepcopy(BaseConfigs['t5-xxl'])
AllConfigs['low_rank_adapter_t5-xxl'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/low_rank_adapter/t5-xxl/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})


AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-2,
    "soft_token_num": 100,
    "token_init": False,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/t5-base/",
})

AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/t5-base/",
})

AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['none_t5-base'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-base/",
})

AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-large/",
})

AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['none_t5-large'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-large/",
})


AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-3b/",
})

AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['none_t5-3b'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-3b/",
})

AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['adapter_t5-3b'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-3b/",
})

AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['adapter_t5-large'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-large/",
})

AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['lora_t5-large'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-large/",
})

AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['lora_t5-3b'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-3b/",
})


if __name__ == "__main__":
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    # (Despite the name, this collects the zipped tuple keys rather than
    # taking a Cartesian product.)
    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)
    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]

    if not os.path.exists(f"./{args.job}/"):
        os.mkdir(f"./{args.job}/")

    for job_name in all_config_jsons:
        with open(f"./{args.job}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
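To make the expansion step concrete, here is a minimal self-contained sketch of the same pattern with toy values (the real script above is driven via --job, e.g. python <script>.py --job bitfit_t5-base, though the script's filename is not visible in this diff):

import json

config = {
    ("job_name", "num_train_epochs", "eval_steps"): zip(
        ["rte", "sst2"], [20, 3], [100, 200]),
    "model_name_or_path": "t5-base",          # scalar keys are copied into every job
    "output_dir": "outputs/bitfit/t5-base/",  # output_dir additionally gets the job name appended
}

jobs = {}
for key in list(config):
    if isinstance(key, tuple):
        for row in config[key]:               # one row per task
            job = dict(zip(key, row))
            jobs[job.pop("job_name")] = job
for key, value in config.items():
    if not isinstance(key, tuple):
        for name in jobs:
            jobs[name][key] = value + name if key == "output_dir" else value

print(json.dumps(jobs, indent=4, sort_keys=True))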
@ -1,4 +1,4 @@
import collections
import copy

BS = 1

@ -6,13 +6,13 @@ AllConfigs = {}

BaseConfigs = {}
BaseConfigs['t5-base'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],

@ -27,7 +27,7 @@ BaseConfigs['t5-base'] = {
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "t5-base",
    "tokenizer_name": "t5-base",
    "save_total_limit": 1,

@ -50,13 +50,13 @@ BaseConfigs['t5-base'] = {
}

BaseConfigs['t5-large'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],

@ -71,7 +71,7 @@ BaseConfigs['t5-large'] = {
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-large",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-large",
    "save_total_limit": 1,

@ -94,13 +94,13 @@ BaseConfigs['t5-large'] = {
}

BaseConfigs['t5-3b'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],

@ -115,7 +115,7 @@ BaseConfigs['t5-3b'] = {
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
    "save_total_limit": 1,

@ -139,8 +139,8 @@ BaseConfigs['t5-3b'] = {

AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-base/",
})


@ -185,16 +185,16 @@ AllConfigs['compacter_t5-base'].update({
    "non_linearity": "gelu_new",

    # Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",

@ -222,16 +222,16 @@ AllConfigs['compacter++_t5-base'].update({
    "non_linearity": "gelu_new",

    # Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",

@ -252,7 +252,7 @@ AllConfigs['low_rank_adapter_t5-base'].update({
    ],
    "output_dir": "outputs/low_rank_adapter/t5-base/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})


@ -288,8 +288,8 @@ AllConfigs['none_t5-base'].update({

AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-large/",
})


@ -303,8 +303,8 @@ AllConfigs['none_t5-large'].update({

AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-3b/",
})


@ -367,7 +367,7 @@ AllConfigs['lora_t5-3b'].update({
    "output_dir": "outputs/lora/t5-3b/",
})


if __name__ == "__main__":
    import argparse
    import json

@ -405,7 +405,6 @@ if __name__ == "__main__":
    for job_name in all_config_jsons:
        with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
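The two deleted generator variants that follow differ from the generator above only in a batch-size constant and in where the generated JSON files land. Sketched illustratively, using the values visible in this diff:

BS = 64  # 64 in the first variant below, 8 in the second, 1 in the diff above

# The seven SuperGLUE tasks keep fixed per-device batch sizes; the eight
# GLUE tasks take the BS constant:
per_device_batch_sizes = [32, 32, 32, 32, 32, 16, 32] + [BS] * 8

# The output directory is suffixed with BS so sweeps over different batch
# sizes do not overwrite one another, e.g. ./bitfit_t5-base_64/rte.json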
@ -1,411 +0,0 @@
import collections
import copy

BS = 64
AllConfigs = {}

BaseConfigs = {}
BaseConfigs['t5-base'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "t5-base",
    "tokenizer_name": "t5-base",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

BaseConfigs['t5-large'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-large",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-large",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

BaseConfigs['t5-3b'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20,  20,  40,  20,   3,   3,  20,  20,  20,   3,   3,  20,   3,   3,  20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
    "save_total_limit": 1,
    # For glue datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-base/",
})


AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-base/",
})

AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-base/",
})

AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter/t5-base/",
    "non_linearity": "gelu_new",

    # Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})

AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "do_train": True,
    "do_eval": True,
    "do_test": True,
    "modified_modules": [
        "DenseReluDense"
    ],
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter++/t5-base/",
    "non_linearity": "gelu_new",

    # Compacter.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # gradient clip and clamp
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # shared one side
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})


AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/low_rank_adapter/t5-base/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})


AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-2,
    "soft_token_num": 100,
    "token_init": False,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/t5-base/",
})

AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/t5-base/",
})

AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['none_t5-base'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-base/",
})

AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-large/",
})

AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['none_t5-large'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-large/",
})


AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-3b/",
})

AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['none_t5-3b'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-3b/",
})

AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['adapter_t5-3b'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-3b/",
})

AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['adapter_t5-large'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-large/",
})

AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['lora_t5-large'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-large/",
})

AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['lora_t5-3b'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-3b/",
})


if __name__ == "__main__":
    import argparse
    import json
    import os
    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)
    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]

    if not os.path.exists(f"./{args.job}_{BS}/"):
        os.mkdir(f"./{args.job}_{BS}/")

    for job_name in all_config_jsons:
        with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
@ -1,411 +0,0 @@
|
|||
import collections
|
||||
import copy
|
||||
|
||||
BS = 8
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
BaseConfigs['t5-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
# [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": "t5-base",
|
||||
"tokenizer_name": "t5-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
BaseConfigs['t5-large'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-large",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-large",
    "save_total_limit": 1,
    # For GLUE datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # Other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

BaseConfigs['t5-3b'] = {
    ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
     "max_source_length",
     "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps", "save_steps", "eval_steps"): zip(
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
         "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
        [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
        [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8,
        [0] * 7 + [0] * 8,
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
        [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
    ),
    "do_train": True,
    "do_eval": True,
    "do_test": True,

    "model_name_or_path": "/home/hushengding/plm_cache/t5-3b",
    "tokenizer_name": "/home/hushengding/plm_cache/t5-3b",
    "save_total_limit": 1,
    # For GLUE datasets.
    "split_validation_test": True,
    "seed": 42,
    "dataset_config_name": ["en"],
    "eval_dataset_config_name": ["en"],
    "test_dataset_config_name": ["en"],
    # Other configurations.
    "predict_with_generate": True,
    # To evaluate during training.
    "load_best_model_at_end": True,
    "metric_for_best_model": "average_metrics",
    "greater_is_better": True,
    "evaluation_strategy": "steps",
    "overwrite_output_dir": True,
    "push_to_hub": True,
    "save_strategy": "steps"
}

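# Derive one runnable config per delta type from the shared bases; each update()
# overrides only the delta-specific fields (delta_type, learning rate,
# unfrozen modules, and output_dir).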
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-base/",
})


AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-base/",
})

AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-base/",
})

AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter/t5-base/",
    "non_linearity": "gelu_new",

    # Compacter-specific hyperparameters.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # Gradient clip and clamp.
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # Shared on one side.
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})

AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
    "delta_type": "compacter",
    "learning_rate": 3e-3,
    "do_train": True,
    "do_eval": True,
    "do_test": True,
    # Compacter++: insert the compacter modules only into the feed-forward sublayers.
    "modified_modules": [
        "DenseReluDense"
    ],
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/compacter++/t5-base/",
    "non_linearity": "gelu_new",

    # Compacter-specific hyperparameters.
    "hypercomplex_division": 4,
    "hypercomplex_adapters": True,
    "hypercomplex_nonlinearity": "glorot-uniform",
    # Gradient clip and clamp.
    "gradient_clip": False,
    "phm_clamp": False,
    "normalize_phm_weight": False,
    "learn_phm": True,
    # Shared on one side.
    "factorized_phm": True,
    "shared_phm_rule": False,
    "factorized_phm_rule": False,
    "phm_c_init": "normal",
    "phm_init_range": 0.0001,
    "use_bias_down_sampler": True,
    "use_bias_up_sampler": True,
})


AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
    "delta_type": "low_rank_adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "output_dir": "outputs/low_rank_adapter/t5-base/",
    "non_linearity": "gelu_new",
    "low_rank_w_init": "glorot-uniform",
    "low_rank_rank": 1,
})


AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
    "delta_type": "soft_prompt",
    "learning_rate": 3e-2,
    "soft_token_num": 100,
    "token_init": False,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/soft_prompt/t5-base/",
})

AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
    "delta_type": "prefix",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
    ],
    "output_dir": "outputs/prefix/t5-base/",
})

AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['none_t5-base'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-base/",
})

AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['bitfit_t5-large'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-large/",
})

AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['none_t5-large'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-large/",
})


AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['bitfit_t5-3b'].update({
    "delta_type": "bitfit",
    "learning_rate": 3e-4,
    "output_dir": "outputs/bitfit/t5-3b/",
})

AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['none_t5-3b'].update({
    "delta_type": "none",
    "learning_rate": 3e-5,
    "output_dir": "outputs/none/t5-3b/",
})

AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['adapter_t5-3b'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-3b/",
})

AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['adapter_t5-large'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "bottleneck_dim": 24,
    "output_dir": "outputs/adapter/t5-large/",
})

AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large'])
AllConfigs['lora_t5-large'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-large/",
})

AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b'])
AllConfigs['lora_t5-3b'].update({
    "delta_type": "lora",
    "learning_rate": 3e-4,
    "unfrozen_modules": [
        "deltas",
        "layer_norm",
        "final_layer_norm"
    ],
    "lora_r": 8,
    "output_dir": "outputs/lora/t5-3b/",
})


if __name__ == "__main__":
    import argparse
    import json
    import os

    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

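    # Expand each zipped tuple key into one JSON config per job_name; scalar
    # keys are copied into every job, and output_dir additionally gets the
    # job name appended.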
    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)

    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem

    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]

    if not os.path.exists(f"./{args.job}_{BS}/"):
        os.mkdir(f"./{args.job}_{BS}/")

    for job_name in all_config_jsons:
        with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)

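# Example invocation (script name assumed; the sweep script later in this diff
# calls these generators as config_gen_bs<batch>.py):
#   python config_gen_bs32.py --job lora_t5-base
# which writes one JSON per task into ./lora_t5-base_32/ (BS = 32 in that file).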
@ -1,26 +0,0 @@
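# Columns are unlabeled in the source; they appear to be: delta type,
# per-device batch size, and two CUDA memory readings in GB (cf. the
# cudamem/maxcudamem fields written by MyCallback in run_seq2seq.py below).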
lora 32 0.8396220207214355 3.7825655937194824
lora 32 2.773350238800049 10.523799419403076
lora 32 10.683510303497314 32.6428017616272
lora 32 0.8396220207214355 3.7825236320495605
lora 32 2.773350238800049 10.523311138153076
adapter 32 0.8578410148620605 3.986640453338623
adapter 32 2.821873188018799 11.039577007293701
adapter 32 10.696877002716064 33.12049341201782
adapter 8 0.8578410148620605 1.6147065162658691
adapter 8 2.821873188018799 4.828186511993408
adapter 8 10.696877002716064 16.09417200088501
lora 8 0.8396220207214355 1.5540986061096191
lora 8 2.773350238800049 4.664810657501221
lora 1 0.8396220207214355 0.9107160568237305
lora 8 10.683510303497314 15.965403079986572
lora 64 0.8396220207214355 6.777950763702393
lora 1 2.773350238800049 2.9350662231445312
lora 64 2.773350238800049 18.340473651885986
lora 1 10.683510303497314 11.131460189819336
adapter 1 0.8578410148620605 0.9334897994995117
lora 64 10.683510303497314 54.61024713516235
adapter 1 2.821873188018799 2.9950332641601562
adapter 64 0.8578410148620605 7.167330265045166
adapter 1 10.696877002716064 11.156260967254639
adapter 64 2.821873188018799 19.32366418838501
adapter 64 10.696877002716064 55.56023454666138
@ -1,7 +0,0 @@
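# Usage: bash run.sh <start_idx> <end_idx> <config_dir>
# Runs run_seq2seq.py on tasks files[start_idx..end_idx] using configs/<config_dir>/<task>.json.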
files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed)
for ((i=$1; i<=$2; i++))
do
    dataset=${files[i]}
    echo "id$i:$dataset"
    TOKENIZERS_PARALLELISM=false python run_seq2seq.py configs/$3/$dataset.json
done
@ -1,34 +0,0 @@
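# Memory-benchmark sweep (usage inferred from the variables below): $1 = GPU id,
# $2 = batch-size suffix. Generates configs via config_gen_bs$2.py for each
# delta type and model size, then runs a single task per combination
# (index 2, i.e. mrpc in run.sh's files array) on the given GPU.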
# files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed)
# for ((i=$1; i<=$2; i++))
# do
#     dataset=${files[i]}
#     echo "id$i:$dataset"
#     TOKENIZERS_PARALLELISM=false python run_seq2seq.py configs/$3/$dataset.json
# done

cd configs

for deltatype in "lora" "adapter"
do
    for modeltype in "t5-base" "t5-large" "t5-3b"
    do
        echo $deltatype
        python config_gen_bs$2.py --job $deltatype\_$modeltype
    done
done

ls
cd ..

for deltatype in "lora" "adapter"
do
    for modeltype in "t5-base" "t5-large" "t5-3b"
    do
        CUDA_VISIBLE_DEVICES=$1 bash run.sh 2 2 $deltatype\_$modeltype\_$2
    done
done

@ -19,9 +19,9 @@ Fine-tuning the library models for sequence to sequence.
import functools
import logging
# from opendelta.utils.delta_center import create_hub_repo_name
import torch
import os
os.environ['MKL_THREADING_LAYER'] = 'GNU'
os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
import sys
import subprocess

@ -43,14 +43,14 @@ from transformers.trainer_utils import is_main_process, get_last_checkpoint
from examples_seq2seq.data_processors import AutoTask, TaskDataCollatorForSeq2Seq, AutoPostProcessor
from examples_seq2seq.seq2seq_trainer import Seq2SeqTrainer
# from training_args import AdapterTrainingArguments
from examples_seq2seq.trainers.trainer_utils import save_training_config
from dataclasses import dataclass, field

from transformers.models.t5.modeling_t5 import T5Config, T5ForConditionalGeneration
from examples_seq2seq.trainers.model_args import ModelArguments
from examples_seq2seq.trainers.trainer_args import TrainingArguments, DataTrainingArguments

import tensorboardX
tb_writer = tensorboardX.SummaryWriter("Delta_Memory")

logger = logging.getLogger(__name__)

@ -100,7 +100,7 @@ class RemainArgHfArgumentParser(HfArgumentParser):
        inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys}
        obj = dtype(**inputs)
        outputs.append(obj)

        remain_args = argparse.ArgumentParser()
        remain_args.__dict__.update(data)
        if return_remaining_args:
@ -108,41 +108,41 @@ class RemainArgHfArgumentParser(HfArgumentParser):
        else:
            return (*outputs,)

from transformers.trainer_callback import TrainerCallback
# from transformers.trainer_callback import TrainerCallback

# Logs allocated and peak CUDA memory each step; after step 50 it appends one
# record to memory_data.txt and exits.
class MyCallback(TrainerCallback):
    def __init__(self, *args, **kwargs):
        self.delta_args = kwargs.pop("delta_args")
        self.trainer_args = kwargs.pop("trainer_args")
        self.model_args = kwargs.pop("model_args")
        super(MyCallback, self).__init__(*args, **kwargs)

    maxcudamem = 0

    def on_step_end(self, args, state, control, **kwargs):
        glb_step = state.global_step
        cudamem = 0
        realcudamem = 0
        for device_id in range(torch.cuda.device_count()):
            cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}") / 1024**3
            realcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}") / 1024**3
            torch.cuda.reset_peak_memory_stats(f"cuda:{device_id}")
        self.maxcudamem = max(self.maxcudamem, realcudamem)
        self.cudamem = cudamem
        # self.tb_writer.add_scalar("Static Memory (GB)", cudamem, glb_step)
        if glb_step > 50:
            content = f"{self.delta_args.delta_type}\t{self.trainer_args.per_device_train_batch_size}\t{self.model_args.model_name_or_path}\t{self.cudamem}\t{self.maxcudamem}\n"
            with open("memory_data.txt", 'a') as fout:
                fout.write(content)
            exit()

# class MyCallback(TrainerCallback):
#     def __init__(self, *args, **kwargs):
#         self.delta_args = kwargs.pop("delta_args")
#         self.trainer_args = kwargs.pop("trainer_args")
#         self.model_args = kwargs.pop("model_args")
#         super(MyCallback, self).__init__(*args, **kwargs)
#
#     maxcudamem = 0
#
#     def on_step_end(self, args, state, control, **kwargs):
#         glb_step = state.global_step
#         cudamem = 0
#         realcudamem = 0
#         for device_id in range(torch.cuda.device_count()):
#             cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}") / 1024**3
#             realcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}") / 1024**3
#             torch.cuda.reset_peak_memory_stats(f"cuda:{device_id}")
#         self.maxcudamem = max(self.maxcudamem, realcudamem)
#         self.cudamem = cudamem
#         # self.tb_writer.add_scalar("Static Memory (GB)", cudamem, glb_step)
#         self.tb_writer.add_scalar("Runtime Memory (GB)", realcudamem, glb_step)
#         self.tb_writer.add_scalar("Peak Memory (GB)", self.maxcudamem, glb_step)
#
#         if glb_step > 50:
#             content = f"{self.delta_args.delta_type}\t{self.trainer_args.per_device_train_batch_size}\t{self.model_args.model_name_or_path}\t{self.cudamem}\t{self.maxcudamem}\n"
#             with open("memory_data.txt", 'a') as fout:
#                 fout.write(content)
#             exit()

@ -172,7 +172,7 @@ def main():
            "Use --overwrite_output_dir to overcome."
        )
        '''
        pass
    elif last_checkpoint is not None:
        logger.info(
            f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
@ -273,7 +273,7 @@ def main():
    # Temporarily set max_target_length for training.
    # max_target_length = data_args.max_target_length
    padding = "max_length" if data_args.pad_to_max_length else False

    def preprocess_function(examples, max_target_length):
        # max_target_length += 1
        # model_inputs = tokenizer([s+"<extra_id_0>" for s in examples['source']], max_length=data_args.max_source_length,
@ -301,7 +301,7 @@ def main():
    if training_args.do_train:
        train_datasets = [AutoTask.get(dataset_name,
            dataset_config_name,
            seed=data_args.data_seed).get(
            seed=data_args.data_sample_seed).get(
            split="train",
            split_validation_test=training_args.split_validation_test,
            add_prefix=True,
@ -320,11 +320,11 @@ def main():
            load_from_cache_file=not data_args.overwrite_cache,
        )
        train_dataset = concatenate_datasets(train_datasets)

    if training_args.do_eval:
        eval_datasets = {eval_dataset: AutoTask.get(eval_dataset, eval_dataset_config,
            seed=data_args.data_seed).get(
            split="validation",
            seed=data_args.data_sample_seed).get(
            split="validation",
            split_validation_test=training_args.split_validation_test,
            add_prefix=True,
            n_obs=data_args.max_val_samples)
@ -343,8 +343,8 @@ def main():

    if training_args.do_test:
        test_datasets = {test_dataset: AutoTask.get(test_dataset, test_dataset_config,
            seed=data_args.data_seed).get(
            split="test",
            seed=data_args.data_sample_seed).get(
            split="test",
            split_validation_test=training_args.split_validation_test,
            add_prefix=True,
            n_obs=data_args.max_test_samples)
@ -379,10 +379,10 @@ def main():

    # Extracts the extra information needed to evaluate on each dataset.
    # This information is only used in compute_metrics.
    # We will assume that the test/eval dataloader does not change the order of
    # the data.
    data_info = {"eval": eval_datasets[data_args.eval_dataset_name[0]]['extra_fields'],
                 "test": test_datasets[data_args.test_dataset_name[0]]['extra_fields'],
                 "train": train_dataset['extra_fields']}

    def compute_metrics(eval_preds):
        preds, labels, data_info = eval_preds
@ -409,10 +409,10 @@ def main():
        evaluation_metrics = TASK_TO_METRICS[data_args.dataset_name[0]],
    )

    trainer.add_callback(MyCallback(trainer_args=training_args, delta_args=delta_args, model_args=model_args))
    # trainer.add_callback(MyCallback(trainer_args=training_args, delta_args=delta_args, model_args=model_args))

    # Saves training config.
    if trainer.is_world_process_zero():
        os.makedirs(training_args.output_dir, exist_ok=True)
        save_training_config(sys.argv[1], training_args.output_dir)
@ -430,15 +430,15 @@ def main():
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()

        train_result = trainer.train(resume_from_checkpoint=checkpoint)

        if training_args.compute_time:
            end.record()
            torch.cuda.synchronize()  # wait for all_reduce to complete
            # elapsed_time returns milliseconds; convert to minutes.
            total_time = start.elapsed_time(end) / (1000 * 60)
            performance_metrics.update({"total_time in minutes ": total_time})

        trainer.save_model()  # Saves the tokenizer too for easy upload
        train_metrics = train_result.metrics
        max_train_samples = (
@ -460,7 +460,7 @@ def main():
    if training_args.compute_memory or training_args.compute_time:
        print(performance_metrics)
        trainer.save_metrics("performance", performance_metrics)

    # Evaluation
    results = {}
    if training_args.do_eval:
@ -484,9 +484,9 @@ def main():
        trainer.log_metrics("test", metrics)
        trainer.save_metrics("test", metrics)
        results['test'] = metrics

    repo_name = create_hub_repo_name(root="DeltaHub",
                                     dataset=data_args.task_name,
                                     delta_type=delta_args.delta_type,
                                     model_name_or_path=model_args.model_name_or_path)
    results['repo_name'] = repo_name
@ -5,21 +5,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union

from torch.utils.data.dataset import Dataset
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainner
from examples_seq2seq.trainers.trainer import BaseTrainer

# if is_sagemaker_mp_enabled():
#     import smdistributed.modelparallel.torch as smp

# from transformers.trainer_utils import ShardedDDPOption

# if is_fairscale_available():
#     dep_version_check("fairscale")
#     import fairscale
#     from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP
#     from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP
#     from fairscale.nn.wrap import auto_wrap
#     from fairscale.optim import OSS
#     from fairscale.optim.grad_scaler import ShardedGradScaler

from transformers.optimization import Adafactor, AdamW, get_scheduler
from transformers.trainer_pt_utils import get_parameter_names, is_sagemaker_mp_enabled
@ -121,7 +108,7 @@ class Seq2SeqTrainer(HfSeq2SeqTrainner, BaseTrainer):

        return (loss, generated_tokens, labels)

@ -1,6 +1,6 @@
from dataclasses import dataclass, field
from typing import Optional, List
from transformers import Seq2SeqTrainingArguments
# run_seq2seq parameters.

@dataclass
@ -127,8 +127,9 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "Defines a dictionary from tasks to the task embeddings."}
    )
    data_seed: Optional[int] = field(default=42, metadata={"help": "Seed used to shuffle the data."})
    data_sample_seed: Optional[int] = field(default=42, metadata={"help": "Seed used to shuffle the data."})

    model_parallel: Optional[bool] = field(default=False, metadata={"help": "Whether to apply model parallelization."})

    def __post_init__(self):
@ -1,4 +1,3 @@
from examples_prompt.metrics.metrics import exact_match
from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func
from opendelta.utils.name_based_addressing import *
from opendelta.utils.cuda import get_device
@ -7,8 +6,6 @@ from typing import *
import torch
import torch.nn as nn
from opendelta import BaseDeltaConfig
from decorator import decorate
import torch.nn.functional as F
from opendelta import logging
logger = logging.get_logger(__name__)