commit
f8cb2d3138
|
@ -38,5 +38,18 @@ log.txt
|
|||
**/output/
|
||||
**/thunlp/
|
||||
**/saved_ckpts/
|
||||
|
||||
|
||||
DeltaCenter-Python-Client/
|
||||
backbone_structure
|
||||
delta_checkpoints
|
||||
gitop.sh
|
||||
load_dataset_and_model.ipynb
|
||||
load_model.py
|
||||
scripts
|
||||
t.py
|
||||
t.sh
|
||||
!examples/examples_prompt/configs/*/*.json
|
||||
!examples/examples_prompt/configs/**
|
||||
**/delta_checkpoints/
|
||||
**/outputs/
|
||||
|
|
|
@ -43,7 +43,8 @@ def preprocess_function(raw_example, **kwargs):
|
|||
|
||||
def get_backbone(model_args, **kwargs):
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
# model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
|
|
|
@ -8,7 +8,6 @@ from transformers import (
|
|||
AutoFeatureExtractor,
|
||||
AutoModelForImageClassification,
|
||||
)
|
||||
from transformers import ViTFeatureExtractor
|
||||
|
||||
from transformers import Trainer as HfTrainer
|
||||
import torch.nn as nn
|
||||
|
@ -26,9 +25,10 @@ def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
|
|||
def preprocess_function(raw_example, **kwargs):
    """Convert one raw image record into model inputs.

    Args:
        raw_example: Mapping with an ``'image'`` entry and a ``'label'`` entry.
        **kwargs: Must contain ``'tokenizer'``, a callable invoked as
            ``tokenizer(array, return_tensors='pt')`` whose result supports
            item access and holds a ``'pixel_values'`` entry.

    Returns:
        The tokenizer output, with ``'pixel_values'`` squeezed and the
        example's label stored under ``'labels'``.
    """
    tokenizer = kwargs['tokenizer']
    # The image is converted to an array once and processed once; the earlier
    # call on the raw image was immediately overwritten and only cost time.
    model_inputs = tokenizer(np.array(raw_example['image']), return_tensors='pt')
    # squeeze() removes every size-1 axis -- presumably the batch axis added
    # by the tokenizer for a single image (TODO confirm).
    model_inputs['pixel_values'] = model_inputs['pixel_values'].squeeze()
    # Only the 'label' column is read; the former 'labels' lookup was a dead
    # store that could raise KeyError before being overwritten.
    model_inputs['labels'] = raw_example['label']
    return model_inputs
|
||||
|
||||
def compute_metrics(eval_preds, dataset_name, eval_metric):
|
||||
|
@ -55,7 +55,7 @@ def mask_token_func(tokenizer, ith_mask=0):
|
|||
|
||||
def get_remove_columns(dataset_features):
    """Return the columns to drop from the dataset after preprocessing.

    Args:
        dataset_features: Collection of column names.

    Returns:
        ``dataset_features`` itself, unchanged -- every column is removed.
    """
    # No debug output here: the function is a pure pass-through.
    return dataset_features
|
||||
|
||||
class DataCollator(HfDataCollatorMixin):
|
||||
|
|
|
@ -0,0 +1,169 @@
|
|||
from openpromptu.data_utils import InputExample
|
||||
import torch
|
||||
from transformers.data.data_collator import torch_default_data_collator
|
||||
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
|
||||
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
|
||||
import numpy as np
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
)
|
||||
|
||||
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
|
||||
import copy
|
||||
from torch.nn import CrossEntropyLoss
|
||||
|
||||
def preprocess_function(raw_example, **kwargs):
    """Wrap one raw record in the prompt template and tokenize it.

    Args:
        raw_example: Mapping expanded as keyword arguments into ``InputExample``.
        **kwargs: Must contain ``'tokenizer'``, ``'data_args'``, ``'template'``,
            ``'verbalizer'`` and ``'tokenizer_wrapper'``.

    Returns:
        The tokenizer output for the merged prompt text, padded to and
        truncated at ``data_args.max_source_length``.
    """
    # Every entry is looked up eagerly, so a missing one raises KeyError up
    # front -- including 'verbalizer', which is otherwise unused here.
    tokenizer, data_args, template, _verbalizer, tokenizer_wrapper = (
        kwargs[key]
        for key in ("tokenizer", "data_args", "template", "verbalizer", "tokenizer_wrapper")
    )

    # The second value returned by the template is not needed.
    wrapped_example, _other = template.wrap_one_example(InputExample(**raw_example))
    merged_text = tokenizer_wrapper.merge_wrapped_example(wrapped_example)
    return tokenizer(
        merged_text,
        max_length=data_args.max_source_length,
        padding="max_length",
        truncation=True,
    )
|
||||
|
||||
|
||||
|
||||
def compute_metrics(eval_preds, dataset_name, eval_metric):
    """Placeholder metric hook: ignores its arguments and returns ``None``.

    The ``Trainer`` class in this module installs its own
    ``_compute_metrics`` method instead.
    """
    return None
|
||||
|
||||
def mask_token_func(tokenizer, ith_mask=0):
    """Return the token used wherever a mask token is requested.

    Args:
        tokenizer: Object exposing a ``pad_token`` attribute.
        ith_mask: Index of the mask being requested; not used.

    Returns:
        ``tokenizer.pad_token``.
    """
    placeholder = tokenizer.pad_token
    return placeholder
|
||||
|
||||
def get_remove_columns(dataset_features):
    """Return the columns to drop from the dataset after preprocessing.

    The argument is handed back untouched, so every listed column is removed.
    """
    columns_to_drop = dataset_features
    return columns_to_drop
|
||||
|
||||
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
    """Build the prompt template, verbalizer and tokenizer wrapper for a task.

    Args:
        task: Object whose ``templates_text`` is indexed by ``template_id``.
        tokenizer: Tokenizer handed to the verbalizer and the wrapper.
        data_args: Object providing ``max_source_length``.
        template_id: Key of the template text to use.
        verbalizer_id: Not used by this function.

    Returns:
        A ``(template, verbalizer, tokenizer_wrapper)`` tuple.
    """
    from openpromptu.prompts import GenerationVerbalizer
    from openpromptu.prompts import ManualTemplate
    from openpromptu import TokenizerWrapper

    template_text = task.templates_text[template_id]
    prompt_template = ManualTemplate(text=template_text)
    generation_verbalizer = GenerationVerbalizer(
        tokenizer=tokenizer,
        classes=None,
        label_words=None,
    )
    wrapper = TokenizerWrapper(
        max_seq_length=data_args.max_source_length,
        tokenizer=tokenizer,
        truncate_method="balanced",
        mask_token_func=mask_token_func,
    )
    return prompt_template, generation_verbalizer, wrapper
|
||||
|
||||
|
||||
def get_backbone(model_args, **kwargs):
    """Load the config, tokenizer and causal-LM backbone named by ``model_args``.

    Args:
        model_args: Object providing ``config_name``, ``tokenizer_name``,
            ``model_name_or_path``, ``cache_dir``, ``model_revision``,
            ``use_auth_token`` and ``use_fast_tokenizer``.
        **kwargs: Accepted and ignored.

    Returns:
        A ``(config, tokenizer, model)`` tuple.
    """
    auth_token = True if model_args.use_auth_token else None

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )
    # config.dropout_rate = 0.0
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )
    # preprocess_function pads to max_length and Trainer uses pad_token_id as
    # the loss ignore index, so a tokenizer without a pad token cannot be used
    # as-is; fall back to the end-of-sequence token.
    if getattr(tokenizer, "pad_token", None) is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        # A path containing ".ckpt" is loaded as a TensorFlow checkpoint.
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )
    return config, tokenizer, model
|
||||
|
||||
class Trainer(HfSeq2SeqTrainer):
    """Seq2SeqTrainer subclass that scores a causal LM against its own inputs.

    Labels are never read from the batch: they are a copy of ``input_ids``
    shifted by one position, and positions whose label equals the tokenizer's
    pad token id are excluded from the loss.
    """

    def __init__(self, verbalizer=None, eval_task=None, **kwargs):
        """Build the trainer.

        Args:
            verbalizer: Accepted but not used by this class.
            eval_task: Object whose ``metric`` attribute is iterated in
                ``_compute_metrics``.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        self.eval_task = eval_task
        # Assigned after the parent constructor so this is the value left on
        # the instance.
        self.compute_metrics = self._compute_metrics

    def _shifted_lm_loss(self, model, inputs):
        """Run the model and score each position against the next input token.

        Returns:
            A ``(loss, outputs, shift_logits, shift_labels)`` tuple.
        """
        # Copy so the targets are independent of the tensor given to the model.
        labels = inputs['input_ids'].clone()
        # labels[labels==self.tokenizer.pad_token_id]=-100
        outputs = model(**inputs)
        logits = outputs.logits
        # Position t predicts token t+1: drop the last logit and the first label.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous().long()
        loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.shape[-1]),
            shift_labels.view(-1),
        )
        return loss, outputs, shift_logits, shift_labels

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return the next-token cross-entropy loss for one batch.

        Args:
            model: The model to run.
            inputs: Batch mapping unpacked into the model call; must contain
                ``'input_ids'``.
            return_outputs: When true, return ``(loss, outputs)``.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        loss, outputs, _, _ = self._shifted_lm_loss(model, inputs)
        return (loss, outputs) if return_outputs else loss

    def prediction_step(
        self,
        model,
        inputs,
        prediction_loss_only,
        ignore_keys=None,
    ):
        """Perform an evaluation step on ``model`` using ``inputs``.

        When ``self.args.predict_with_generate`` is off, or only the loss is
        requested, the call is delegated to the parent implementation.

        Args:
            model: The model to evaluate.
            inputs: Batch mapping unpacked into the model call; must contain
                ``'input_ids'``.
            prediction_loss_only: Whether to return the loss only.
            ignore_keys: Passed through to the parent implementation; unused
                on the path handled here.

        Returns:
            A ``(loss, target_probs, target_ids)`` tuple on the path handled
            here: the loss on CPU with NaN replaced by 0, the softmax
            probability assigned to each non-pad target token, and the
            flattened non-pad target token ids.
        """
        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(
                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
            )

        # From here on prediction_loss_only is falsy, so the full tuple is
        # always produced.
        inputs = self._prepare_inputs(inputs)
        with torch.no_grad():
            loss, _, shift_logits, shift_labels = self._shifted_lm_loss(model, inputs)
            loss = loss.detach().cpu()
            # Report 0 rather than NaN.
            loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss)

            # Keep only the positions whose target is not the pad token.
            shift_labels = shift_labels.view(-1).detach().cpu()
            nonpad_idx = shift_labels != self.tokenizer.pad_token_id
            shift_labels = shift_labels[nonpad_idx]
            shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu()

            # Probability assigned to the target token at each kept position.
            # gather picks one column per row, which avoids building a dense
            # (positions x vocabulary) one-hot mask.
            target_probs = (
                shift_logits.softmax(dim=-1)
                .gather(-1, shift_labels.unsqueeze(-1))
                .squeeze(-1)
            )

        return (loss, target_probs, shift_labels)

    def _compute_metrics(self, eval_preds):
        """Evaluate every metric of ``self.eval_task`` and add their mean.

        Args:
            eval_preds: A ``(preds, labels)`` pair.

        Returns:
            Dict of all metric values plus their mean under
            ``'average_metrics'``.
        """
        preds, labels = eval_preds

        result = {}
        for metric in self.eval_task.metric:
            result.update(metric(preds, labels, ignore_index=self.tokenizer.pad_token_id))

        # NOTE: raises ZeroDivisionError when no metric produced a value.
        average_metric = sum(result.values()) / len(result)
        result.update({"average_metrics": average_metric})
        return result
|
|
@ -164,7 +164,7 @@ class Trainer(HfSeq2SeqTrainer):
|
|||
return (loss, generated_tokens, labels)
|
||||
|
||||
def _compute_metrics(self, eval_preds):
|
||||
from IPython import embed; embed(header="In compute metrics")
|
||||
# from IPython import embed; embed(header="In compute metrics")
|
||||
preds, labels = eval_preds
|
||||
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
|
||||
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
|
||||
|
|
|
@ -0,0 +1,171 @@
|
|||
from openpromptu.data_utils import InputExample
|
||||
import torch
|
||||
from transformers.data.data_collator import torch_default_data_collator
|
||||
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
|
||||
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
|
||||
import numpy as np
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
)
|
||||
|
||||
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
|
||||
import copy
|
||||
from torch.nn import CrossEntropyLoss
|
||||
|
||||
def preprocess_function(raw_example, **kwargs):
    """Wrap one raw record in the prompt template and tokenize it.

    Args:
        raw_example: Mapping expanded as keyword arguments into ``InputExample``.
        **kwargs: Must contain ``'tokenizer'``, ``'data_args'``, ``'template'``,
            ``'verbalizer'`` and ``'tokenizer_wrapper'``.

    Returns:
        The tokenizer output for the merged prompt text, padded to and
        truncated at ``data_args.max_source_length``.
    """
    # Every entry is looked up eagerly, so a missing one raises KeyError up
    # front -- including 'verbalizer', which is otherwise unused here.
    tokenizer, data_args, template, _verbalizer, tokenizer_wrapper = (
        kwargs[key]
        for key in ("tokenizer", "data_args", "template", "verbalizer", "tokenizer_wrapper")
    )

    # The second value returned by the template is not needed.
    wrapped_example, _other = template.wrap_one_example(InputExample(**raw_example))
    merged_text = tokenizer_wrapper.merge_wrapped_example(wrapped_example)
    return tokenizer(
        merged_text,
        max_length=data_args.max_source_length,
        padding="max_length",
        truncation=True,
    )
|
||||
|
||||
|
||||
|
||||
def compute_metrics(eval_preds, dataset_name, eval_metric):
    """Placeholder metric hook: ignores its arguments and returns ``None``.

    The ``Trainer`` class in this module installs its own
    ``_compute_metrics`` method instead.
    """
    return None
|
||||
|
||||
def mask_token_func(tokenizer, ith_mask=0):
    """Return the token used wherever a mask token is requested.

    Args:
        tokenizer: Object exposing a ``pad_token`` attribute.
        ith_mask: Index of the mask being requested; not used.

    Returns:
        ``tokenizer.pad_token``.
    """
    placeholder = tokenizer.pad_token
    return placeholder
|
||||
|
||||
def get_remove_columns(dataset_features):
    """Return the columns to drop from the dataset after preprocessing.

    The argument is handed back untouched, so every listed column is removed.
    """
    columns_to_drop = dataset_features
    return columns_to_drop
|
||||
|
||||
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
    """Build the prompt template, verbalizer and tokenizer wrapper for a task.

    Args:
        task: Object whose ``templates_text`` is indexed by ``template_id``.
        tokenizer: Tokenizer handed to the verbalizer and the wrapper.
        data_args: Object providing ``max_source_length``.
        template_id: Key of the template text to use.
        verbalizer_id: Not used by this function.

    Returns:
        A ``(template, verbalizer, tokenizer_wrapper)`` tuple.
    """
    from openpromptu.prompts import GenerationVerbalizer
    from openpromptu.prompts import ManualTemplate
    from openpromptu import TokenizerWrapper

    template_text = task.templates_text[template_id]
    prompt_template = ManualTemplate(text=template_text)
    generation_verbalizer = GenerationVerbalizer(
        tokenizer=tokenizer,
        classes=None,
        label_words=None,
    )
    wrapper = TokenizerWrapper(
        max_seq_length=data_args.max_source_length,
        tokenizer=tokenizer,
        truncate_method="tail",
        mask_token_func=mask_token_func,
    )
    return prompt_template, generation_verbalizer, wrapper
|
||||
|
||||
|
||||
def get_backbone(model_args, **kwargs):
    """Load the config, tokenizer and causal-LM backbone named by ``model_args``.

    Args:
        model_args: Object providing ``config_name``, ``tokenizer_name``,
            ``model_name_or_path``, ``cache_dir``, ``model_revision``,
            ``use_auth_token`` and ``use_fast_tokenizer``.
        **kwargs: Accepted and ignored.

    Returns:
        A ``(config, tokenizer, model)`` tuple. The tokenizer's pad token is
        set to its end-of-sequence token when it had none.
    """
    auth_token = True if model_args.use_auth_token else None

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )
    # config.dropout_rate = 0.0
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )
    # A missing attribute and an unset pad token are handled the same way:
    # fall back to the end-of-sequence token so padding is possible.
    if getattr(tokenizer, "pad_token", None) is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        # A path containing ".ckpt" is loaded as a TensorFlow checkpoint.
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=auth_token,
    )

    return config, tokenizer, model
|
||||
|
||||
class Trainer(HfSeq2SeqTrainer):
    """Seq2SeqTrainer subclass that scores a causal LM against its own inputs.

    Labels are never read from the batch: they are a copy of ``input_ids``
    shifted by one position, and positions whose label equals the tokenizer's
    pad token id are excluded from the loss.
    """

    def __init__(self, verbalizer=None, eval_task=None, **kwargs):
        """Build the trainer.

        Args:
            verbalizer: Accepted but not used by this class.
            eval_task: Object whose ``metric`` attribute is iterated in
                ``_compute_metrics``.
            **kwargs: Forwarded unchanged to the parent constructor.
        """
        super().__init__(**kwargs)
        self.eval_task = eval_task
        # Assigned after the parent constructor so this is the value left on
        # the instance.
        self.compute_metrics = self._compute_metrics

    def _shifted_lm_loss(self, model, inputs):
        """Run the model and score each position against the next input token.

        Returns:
            A ``(loss, outputs, shift_logits, shift_labels)`` tuple.
        """
        # Copy so the targets are independent of the tensor given to the model.
        labels = inputs['input_ids'].clone()
        # labels[labels==self.tokenizer.pad_token_id]=-100
        outputs = model(**inputs)
        logits = outputs.logits
        # Position t predicts token t+1: drop the last logit and the first label.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous().long()
        loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.shape[-1]),
            shift_labels.view(-1),
        )
        return loss, outputs, shift_logits, shift_labels

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return the next-token cross-entropy loss for one batch.

        Args:
            model: The model to run.
            inputs: Batch mapping unpacked into the model call; must contain
                ``'input_ids'``.
            return_outputs: When true, return ``(loss, outputs)``.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        loss, outputs, _, _ = self._shifted_lm_loss(model, inputs)
        return (loss, outputs) if return_outputs else loss

    def prediction_step(
        self,
        model,
        inputs,
        prediction_loss_only,
        ignore_keys=None,
    ):
        """Perform an evaluation step on ``model`` using ``inputs``.

        When ``self.args.predict_with_generate`` is off, or only the loss is
        requested, the call is delegated to the parent implementation.

        Args:
            model: The model to evaluate.
            inputs: Batch mapping unpacked into the model call; must contain
                ``'input_ids'``.
            prediction_loss_only: Whether to return the loss only.
            ignore_keys: Passed through to the parent implementation; unused
                on the path handled here.

        Returns:
            A ``(loss, target_probs, target_ids)`` tuple on the path handled
            here: the loss on CPU with NaN replaced by 0, the softmax
            probability assigned to each non-pad target token, and the
            flattened non-pad target token ids.
        """
        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(
                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
            )

        # From here on prediction_loss_only is falsy, so the full tuple is
        # always produced.
        inputs = self._prepare_inputs(inputs)
        with torch.no_grad():
            loss, _, shift_logits, shift_labels = self._shifted_lm_loss(model, inputs)
            loss = loss.detach().cpu()
            # Report 0 rather than NaN.
            loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss)

            # Keep only the positions whose target is not the pad token.
            shift_labels = shift_labels.view(-1).detach().cpu()
            nonpad_idx = shift_labels != self.tokenizer.pad_token_id
            shift_labels = shift_labels[nonpad_idx]
            shift_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu()

            # Probability assigned to the target token at each kept position.
            # gather picks one column per row, which avoids building a dense
            # (positions x vocabulary) one-hot mask.
            target_probs = (
                shift_logits.softmax(dim=-1)
                .gather(-1, shift_labels.unsqueeze(-1))
                .squeeze(-1)
            )

        return (loss, target_probs, shift_labels)

    def _compute_metrics(self, eval_preds):
        """Evaluate every metric of ``self.eval_task`` and add their mean.

        Args:
            eval_preds: A ``(preds, labels)`` pair.

        Returns:
            Dict of all metric values plus their mean under
            ``'average_metrics'``.
        """
        preds, labels = eval_preds

        result = {}
        for metric in self.eval_task.metric:
            result.update(metric(preds, labels, ignore_index=self.tokenizer.pad_token_id))

        # NOTE: raises ZeroDivisionError when no metric produced a value.
        average_metric = sum(result.values()) / len(result)
        result.update({"average_metrics": average_metric})
        return result
|
|
@ -0,0 +1,48 @@
|
|||
{
|
||||
"bottleneck_dim": 24,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "adapter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "beans",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32",
|
||||
"num_classes": 3,
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/adapter/clip-vit-base-patch32/beans",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_delta_center": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "beans",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "beans",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
{
|
||||
"backbone_model": "opt",
|
||||
"bottleneck_dim": 24,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "adapter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "wikitext",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps":2,
|
||||
"greater_is_better": false,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 900,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m",
|
||||
"model_path_public": "opt-350m",
|
||||
"num_train_epochs": 3,
|
||||
"output_dir": "outputs/adapter/opt-350m/wikitext",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 6,
|
||||
"per_device_train_batch_size": 6,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "wikitext",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "wikitext",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["self_attn"]
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
{
|
||||
"backbone_model": "vit",
|
||||
"bottleneck_dim": 24,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": false,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "adapter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "beans",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k",
|
||||
"model_path_public": "vit-large-patch16-224-in21k",
|
||||
"num_classes": 3,
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/adapter/vit-large-patch16-224-in21k/beans",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": false,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "beans",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "beans",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["output"]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"backbone_model": "t5-large",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 100,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/t5-large",
|
||||
"model_path_public": "t5-large",
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/bitfit/t5-large/rte",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 100,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/t5-large",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["attn", "ff", "layer_norm"]
|
||||
}
|
|
@ -0,0 +1,66 @@
|
|||
{
|
||||
"backbone_model": "blenderbot",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "compacter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "sst2",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"factorized_phm": true,
|
||||
"factorized_phm_rule": false,
|
||||
"gradient_clip": false,
|
||||
"greater_is_better": true,
|
||||
"hypercomplex_adapters": true,
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
"learn_phm": true,
|
||||
"learning_rate": 0.003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
|
||||
"model_path_public": "blenderbot-3b",
|
||||
"non_linearity": "gelu_new",
|
||||
"normalize_phm_weight": false,
|
||||
"num_train_epochs": 3,
|
||||
"output_dir": "outputs/compacter/blenderbot-3b/sst2",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"phm_c_init": "normal",
|
||||
"phm_clamp": false,
|
||||
"phm_init_range": 0.0001,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"shared_phm_rule": false,
|
||||
"split_validation_test": true,
|
||||
"task_name": "sst2",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "sst2",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"use_bias_down_sampler": true,
|
||||
"use_bias_up_sampler": true,
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["fc2"]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"backbone_model": "deberta-v2-xlarge",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "compacter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mnli",
|
||||
"eval_steps": 500,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"is_seq2seq": false,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge",
|
||||
"num_train_epochs": 3,
|
||||
"output_dir": "outputs/compacter/deberta-v2-xlarge/mnli",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": false,
|
||||
"push_to_dc": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 500,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mnli",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mnli",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["attention"]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"backbone_model": "long-t5",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "compacter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 100,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large",
|
||||
"model_path_public": "long-t5-tglobal-large",
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/compacter/long-t5-tglobal-large/rte",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 100,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["attn", "ff", "layer_norm"]
|
||||
}
|
|
@ -2,7 +2,7 @@ import collections
|
|||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
# PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"backbone_model": "beit",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk",
|
||||
"delta_type": "lora",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "cifar10",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224",
|
||||
"model_path_public": "beit-large-patch16-224",
|
||||
"num_classes": 10,
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/lora/beit-large-patch16-224/cifar10",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": false,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "cifar10",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "cifar10",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["query","value"]
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"backbone_model": "gpt-j",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "lora",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "wikitext",
|
||||
"eval_steps": 500,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps":4,
|
||||
"greater_is_better": false,
|
||||
"learning_rate": 0.00003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 512,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B",
|
||||
"model_path_public": "gpt-j-6B",
|
||||
"num_train_epochs": 2,
|
||||
"output_dir": "outputs/lora/gpt-j-6B/wikitext",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 2,
|
||||
"per_device_train_batch_size": 2,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 500,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "wikitext",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "wikitext",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["20.attn.q_proj","21.attn.q_proj","22.attn.q_proj","23.attn.q_proj","24.attn.q_proj","25.attn.q_proj","26.attn.q_proj","27.attn.q_proj"]
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"backbone_model": "roberta-large",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "lora",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "superglue-boolq",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"is_seq2seq": false,
|
||||
"learning_rate": 0.0001,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large",
|
||||
"model_path_public": "roberta-large",
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/lora/roberta-large/superglue-boolq",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": false,
|
||||
"push_to_hub": false,
|
||||
"push_to_dc": true,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "superglue-boolq",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "superglue-boolq",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["query","value"]
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"backbone_model": "xlm-roberta-large",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "lora",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "superglue-wic",
|
||||
"eval_steps": 100,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"is_seq2seq": false,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large",
|
||||
"model_path_public": "xlm-roberta-large",
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/lora/xlm-roberta-large/superglue-wic",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": false,
|
||||
"push_to_dc": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 100,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "superglue-wic",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "superglue-wic",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["query","value"]
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
{
|
||||
"backbone_model": "gpt2",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "low_rank_adapter",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "wikitext",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps":1,
|
||||
"greater_is_better": false,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 768,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt2",
|
||||
"model_path_public": "gpt2",
|
||||
"num_train_epochs": 2,
|
||||
"output_dir": "outputs/low_rank_adapter/gpt2/wikitext",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "wikitext",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "wikitext",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt2",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["attn","mlp"]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"backbone_model": "bert-large-cased",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "prefix",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 100,
|
||||
"evaluation_strategy": "steps",
|
||||
"greater_is_better": true,
|
||||
"is_seq2seq": false,
|
||||
"learning_rate": 0.0003,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased",
|
||||
"num_train_epochs": 20,
|
||||
"output_dir": "outputs/prefix/bert-large-cased/rte",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": false,
|
||||
"push_to_dc": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 100,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased",
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"warmup_steps": 0,
|
||||
"modified_modules":["attention"]
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
{
|
||||
"backbone_model": "bart",
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"datasets_load_from_disk": true,
|
||||
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
|
||||
"delta_type": "soft_prompt",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "superglue-boolq",
|
||||
"eval_steps": 500,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps":1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.1,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 256,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bart-large",
|
||||
"model_path_public": "bart-large",
|
||||
"num_train_epochs": 50,
|
||||
"output_dir": "outputs/soft_prompt/bart-large/superglue-boolq",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_dc": true,
|
||||
"push_to_hf": false,
|
||||
"save_steps": 500,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 42,
|
||||
"soft_token_num":100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "superglue-boolq",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "superglue-boolq",
|
||||
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bart-large",
|
||||
"token_init": true,
|
||||
"unfrozen_modules": [
|
||||
"deltas"
|
||||
],
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -93,4 +93,10 @@ class AbstractTask(abc.ABC):
|
|||
# shuffles the data and samples it.
|
||||
if n_obs is not None:
|
||||
dataset = self.subsample(dataset, n_obs)
|
||||
return dataset.map(self.preprocessor)
|
||||
|
||||
this_method = getattr(self.__class__, 'preprocessor')
|
||||
base_method = getattr(AbstractTask, 'preprocessor')
|
||||
if this_method is not base_method:
|
||||
return dataset.map(self.preprocessor)
|
||||
else:
|
||||
return dataset
|
||||
|
|
|
@ -556,12 +556,74 @@ class Beans(AbstractTask):
|
|||
# from IPython import embed; embed(header="beans")
|
||||
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
|
||||
if offline == '1':
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split]
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/beans")[split]
|
||||
else:
|
||||
return datasets.load_dataset('beans', split=split, script_version="master")
|
||||
|
||||
class Wikitext(AbstractTask):
    """Task definition for the ``wikitext`` dataset (``wikitext-2-v1`` config).

    The task is scored with ``metrics.perplexity`` and exposes one template
    that reads the ``text`` field of each example.
    """

    name = "wikitext"
    metric_names = ["perplexity"]
    metric = [metrics.perplexity]

    # Both "validation" and "test" read the dataset's "validation" split.
    # NOTE(review): presumably `split_valid_to_make_test` tells the base class
    # to carve the test set out of validation -- confirm in AbstractTask.
    split_to_data_split = {
        "train": "train",
        "validation": "validation",
        "test": "validation",
    }
    split_valid_to_make_test = True

    # A single verbalizer entry with no label words.
    verbalizers = {
        "0": {},
    }

    templates_text = {
        "0": """{"meta":"text"}""",
    }

    def load_dataset(self, split):
        """Return ``split`` from the saved copy on disk or via ``datasets.load_dataset``."""
        if not self.data_args.datasets_load_from_disk:
            return datasets.load_dataset('wikitext','wikitext-2-v1', split=split, script_version="master")
        saved = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/wikitext")
        return saved[split]
|
||||
|
||||
class Cifar10(AbstractTask):
    """Task definition for the ``cifar10`` dataset, scored with ``metrics.accuracy``."""

    name = "cifar10"

    # "validation" and "test" both read the dataset's "test" split.
    split_to_data_split = {"train": "train",
                           "validation": "test",
                           "test": "test"}
    metric = [metrics.accuracy]
    metric_names = ["accuracy"]

    def load_dataset(self, split):
        """Return the complete ``split`` of cifar10.

        Args:
            split: name of the dataset split to load.

        Returns:
            The split read from ``data_args.datasets_saved_path`` when
            ``data_args.datasets_load_from_disk`` is set, otherwise the split
            obtained through ``datasets.load_dataset``.
        """
        if self.data_args.datasets_load_from_disk:
            # Return the whole split so that results do not depend on whether
            # the data came from disk or from a download; no subsetting or
            # printing happens here.
            return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/cifar10")[split]
        return datasets.load_dataset('cifar10', split=split, script_version="master")
|
||||
class Fashion_MNIST(AbstractTask):
    """Task definition for the ``fashion_mnist`` dataset, scored with ``metrics.accuracy``."""

    name = "Fashion-MNIST"
    metric_names = ["accuracy"]
    metric = [metrics.accuracy]

    # "validation" and "test" both read the dataset's "test" split.
    split_to_data_split = {
        "train": "train",
        "validation": "test",
        "test": "test",
    }

    def load_dataset(self, split):
        """Return ``split`` from the saved copy on disk or via ``datasets.load_dataset``."""
        if not self.data_args.datasets_load_from_disk:
            return datasets.load_dataset('fashion_mnist', split=split, script_version="master")
        saved_split = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/fashion_mnist")[split]
        # The loaded split is echoed to stdout before being returned.
        print(saved_split)
        return saved_split
|
||||
|
||||
TASK_MAPPING = OrderedDict(
|
||||
[
|
||||
|
@ -581,7 +643,10 @@ TASK_MAPPING = OrderedDict(
|
|||
('superglue-multirc', SuperGLUEMultiRC),
|
||||
('superglue-wic', SuperGLUEWIC),
|
||||
# ('superglue-record', SuperGLUERecord)
|
||||
('beans', Beans)
|
||||
('beans', Beans),
|
||||
('wikitext',Wikitext),
|
||||
('cifar10',Cifar10),
|
||||
('fashion_mnist',Fashion_MNIST)
|
||||
]
|
||||
)
|
||||
|
||||
|
|
|
@ -11,6 +11,14 @@ import sklearn.metrics
|
|||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
def perplexity(outputs, targets,ignore_index=-100):
    """Return ``exp(mean(-log(outputs)))`` wrapped as ``{"perplexity": value}``.

    ``targets`` and ``ignore_index`` are accepted but do not take part in the
    computation.

    NOTE(review): presumably ``outputs`` holds the probabilities assigned to
    the reference tokens -- confirm against the caller.
    """
    log_values = np.log(outputs)
    # Mean negative log of the inputs, i.e. the quantity that is exponentiated.
    cross_entropy = -log_values.mean()
    score = float(np.exp(cross_entropy))
    return {"perplexity": score}
|
||||
|
||||
def accuracy(predictions, targets) -> dict:
    """Return the percentage of positions where ``predictions`` equals ``targets``.

    Both arguments are converted to NumPy arrays and compared element-wise;
    the result is ``{"accuracy": percentage}``.
    """
    predicted = np.array(predictions)
    expected = np.array(targets)
    matches = predicted == expected
    return {"accuracy": 100 * (matches.mean())}
|
||||
|
@ -102,8 +110,8 @@ def f1_score(predictions, targets) -> dict:
|
|||
Returns:
|
||||
F1 score, where any prediction != 0 or 1 is counted as wrong.
|
||||
"""
|
||||
targets = targets.astype(np.int32)
|
||||
predictions = predictions.astype(np.int32)
|
||||
targets = np.array(targets).astype(np.int32)
|
||||
predictions = np.array(predictions).astype(np.int32)
|
||||
return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}
|
||||
|
||||
# TODO: maybe guard against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow
|
||||
|
|
|
@ -31,6 +31,7 @@ os.environ['MKL_THREADING_LAYER'] = 'GNU'
|
|||
os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
sys.path.append(os.path.join(os.getcwd(), "../"))
|
||||
# sys.path.append(os.path.join(os.getcwd(), "/mnt/sfs_turbo/zhangzhen/OpenDelta"))
|
||||
sys.path.append(os.path.join(os.getcwd()))
|
||||
|
||||
import functools
|
||||
|
@ -120,7 +121,8 @@ def main():
|
|||
|
||||
|
||||
|
||||
if os.path.basename(model_args.model_name_or_path).startswith("t5"):
|
||||
if os.path.basename(model_args.model_name_or_path).startswith("t5") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("long-t5") :
|
||||
from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
|
||||
from examples_prompt.backbones.t5 import Trainer, DataCollator
|
||||
elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"):
|
||||
|
@ -128,7 +130,9 @@ def main():
|
|||
from examples_prompt.backbones.blenderbot import Trainer, DataCollator
|
||||
elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("bert") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("albert") :
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("albert") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("xlm-roberta") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("deberta") :
|
||||
from examples_prompt.backbones.bert import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
|
||||
from examples_prompt.backbones.bert import Trainer, DataCollator
|
||||
elif os.path.basename(model_args.model_name_or_path).startswith("beit"):
|
||||
|
@ -143,6 +147,10 @@ def main():
|
|||
elif os.path.basename(model_args.model_name_or_path).startswith("clip"):
|
||||
from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
|
||||
from examples_prompt.backbones.clip import Trainer, DataCollator
|
||||
elif os.path.basename(model_args.model_name_or_path).startswith("opt") \
|
||||
or os.path.basename(model_args.model_name_or_path).startswith("gpt"):
|
||||
from examples_prompt.backbones.opt import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
|
||||
from examples_prompt.backbones.opt import Trainer, DataCollator
|
||||
|
||||
|
||||
|
||||
|
@ -324,11 +332,12 @@ def main():
|
|||
delta_model.save_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path,
|
||||
push_to_dc=training_args.push_to_dc,
|
||||
center_args={"test_performance":all_results['test'][data_args.task_name]['test_average_metrics'],
|
||||
"backbone_model_path_public":model_args.model_name_or_path.split("/")[-1]},
|
||||
},
|
||||
center_args_pool = {**vars(model_args), **vars(data_args), **vars(training_args), **vars(delta_args)},
|
||||
list_tags = ['NLI'],
|
||||
dict_tags = {'purpose':'for testing'},
|
||||
delay_push=delta_args.delay_push,
|
||||
delay_push=True,
|
||||
test_result=all_results['test']
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -271,10 +271,57 @@ class AdapterArguments:
|
|||
bottleneck_dim: Optional[int] = field(
|
||||
default=24, metadata={"help": "the dimension of the bottleneck layer"}
|
||||
)
|
||||
@dataclass
class LoRAArguments:
    # Command-line options for the "lora" delta type.
    # lora_r: rank of the low-rank update matrices (default 8).
    lora_r: Optional[int] = field(
        default=8, metadata={"help": "the rank of the LoRA matrices."}
    )
|
||||
@dataclass
class PrefixArguments:
    # Placeholder: no prefix-specific options are declared.
    pass
|
||||
@dataclass
class BitFitArguments:
    # Placeholder: no bitfit-specific options are declared.
    pass
|
||||
@dataclass
class SoftPromptArguments:
    # soft_token_num: number of soft tokens (default 100).
    soft_token_num: Optional[int] = field(
        default=100,
        metadata={"help": "the num of soft tokens."},
    )
|
||||
|
||||
@dataclass
class CompacterArguments:
    # Placeholder: no compacter-specific options are declared.
    pass
|
||||
@dataclass
class LowRankAdapterArguments:
    # Placeholder: no low-rank-adapter-specific options are declared.
    pass
|
||||
|
||||
# from opendelta.delta_models.adapter import AdapterConfig
|
||||
# from opendelta.delta_models.bitfit import BitFitConfig
|
||||
# from opendelta.delta_models.compacter import CompacterConfig
|
||||
# from opendelta.delta_models.lora import LoraArguments
|
||||
# from opendelta.delta_models.low_rank_adapter import LowRankAdapterConfig
|
||||
# from opendelta.delta_models.prefix import PrefixConfig
|
||||
# from opendelta.delta_models.soft_prompt import SoftPromptConfig
|
||||
# DELTAARGMAP = {
|
||||
# "adapter": AdapterConfig,
|
||||
# "lora":LoraArguments,
|
||||
# "prefix":PrefixConfig,
|
||||
# "bitfit":BitFitConfig,
|
||||
# "soft_prompt":SoftPromptConfig,
|
||||
# "compacter":CompacterConfig,
|
||||
# "low_rank_adapter":LowRankAdapterConfig
|
||||
|
||||
# }
|
||||
|
||||
DELTAARGMAP = {
|
||||
"adapter": AdapterArguments
|
||||
"adapter": AdapterArguments,
|
||||
"lora":LoRAArguments,
|
||||
"prefix":PrefixArguments,
|
||||
"bitfit":BitFitArguments,
|
||||
"soft_prompt":SoftPromptArguments,
|
||||
"compacter":CompacterArguments,
|
||||
"low_rank_adapter":LowRankAdapterArguments
|
||||
|
||||
}
|
||||
|
||||
# TODO: add more specific delta arguments
|
||||
|
@ -325,13 +372,14 @@ class RemainArgHfArgumentParser(HfArgumentParser):
|
|||
for d in outputs:
|
||||
if isinstance(d, DeltaArguments): # merge the specific delta arguments
|
||||
d.merge_arguments(outputs[-1])
|
||||
return *(outputs[:-1]), remain_args
|
||||
|
||||
return [*(outputs[:-1]), remain_args]
|
||||
else:
|
||||
outputs = self.parse_args_into_dataclasses(args=data_list, return_remaining_strings=return_remaining_args)
|
||||
for d in outputs:
|
||||
if isinstance(d, DeltaArguments):
|
||||
d.merge_arguments(outputs[-1])
|
||||
return (*(outputs[:-1]),)
|
||||
return [*(outputs[:-1]),]
|
||||
|
||||
def parse_args_into_dataclasses(
|
||||
self, args=None, return_remaining_strings=False, look_for_args_file=True, args_filename=None
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
|
||||
|
||||
def create_hub_repo_name(root = "DeltaHub",
                         dataset = None,
                         delta_type = None,
                         model_name_or_path = None,
                         center_value_only_tags = None,
                         center_key_value_tags = None
                         ):
    r"""Build a hub repository name by concatenating the arguments.

    The result has the form
    ``{root}/{delta_type}_{model}_{dataset}[_{tag}...][_{key}-{value}...]``,
    where ``model`` is the last path component of ``model_name_or_path``.

    Args:
        root: prefix placed before the ``/``.
        dataset: dataset name; formatted with ``str``.
        delta_type: delta type name; formatted with ``str``.
        model_name_or_path: model identifier or filesystem path.
        center_value_only_tags: optional iterable of tags, each appended as-is.
        center_key_value_tags: optional mapping, each item appended as ``key-value``.

    Returns:
        The repository name as a string.

    Raises:
        AttributeError: if ``model_name_or_path`` is not a string (e.g. ``None``).
    """
    # Keep only the last path component; trailing separators are stripped so
    # that a directory path such as ".../roberta-large/" still yields
    # "roberta-large" rather than an empty string.
    model_name = model_name_or_path.rstrip("/").split("/")[-1]

    parts = [f"{delta_type}", f"{model_name}", f"{dataset}"]

    # Absent or empty tag collections contribute nothing.  Every part is
    # formatted to str so that the join below cannot fail on a non-string.
    if center_value_only_tags:
        parts.extend(f"{tag}" for tag in center_value_only_tags)
    if center_key_value_tags:
        parts.extend(f"{k}-{v}" for k, v in center_key_value_tags.items())

    return root + "/" + "_".join(parts)
|
||||
|
||||
|
||||
|
||||
|
|
@ -108,6 +108,7 @@ class SaveLoadMixin:
|
|||
list_tags: Optional[List] = list(),
|
||||
dict_tags: Optional[Dict] = dict(),
|
||||
delay_push: bool = False,
|
||||
test_result = None,
|
||||
usage: Optional[str] = "",
|
||||
):
|
||||
r"""
|
||||
|
@ -177,9 +178,13 @@ class SaveLoadMixin:
|
|||
logger.info("\n"+"*"*30+f"\nYou delta models has been saved locally to:\n\t{os.path.abspath(save_directory)}"
|
||||
)
|
||||
|
||||
state_dict_total_params = sum(p.numel() for p in state_dict.values())
|
||||
other_tags={}
|
||||
other_tags.update({'state_dict_total_params(M)':state_dict_total_params/1024/1024})
|
||||
other_tags.update({'test_result':test_result})
|
||||
if push_to_dc:
|
||||
logger.info("Creating yaml file for delta center")
|
||||
self.create_yml(save_directory, final_center_args, list_tags, dict_tags)
|
||||
self.create_yml(save_directory, final_center_args, list_tags, dict_tags, other_tags)
|
||||
|
||||
if not delay_push:
|
||||
OssClient.upload(base_dir=save_directory)
|
||||
|
@ -190,11 +195,13 @@ class SaveLoadMixin:
|
|||
|
||||
|
||||
|
||||
def create_yml(self, save_dir, config, list_tags=list(), dict_tags=dict()):
|
||||
def create_yml(self, save_dir, config, list_tags=list(), dict_tags=dict(),other_tags=None):
    """Write ``{save_dir}/config.yml`` describing ``config`` and its tags.

    Args:
        save_dir: directory in which ``config.yml`` is created.
        config: object whose attributes (``vars(config)``) are dumped.
        list_tags: stored under the ``list_tags`` key.
        dict_tags: stored under the ``dict_tags`` key.
        other_tags: optional mapping merged into the top level of the dump;
            its keys override attributes of the same name.
    """
    # Work on a shallow copy: vars(config) is the object's own attribute
    # dict, so writing into it directly would add the tag entries to the
    # caller's config object as a side effect.
    config_dict = dict(vars(config))
    config_dict['dict_tags'] = dict_tags
    config_dict['list_tags'] = list_tags
    if other_tags is not None:
        config_dict.update(other_tags)
    # The context manager closes the file even if dumping raises.
    with open("{}/config.yml".format(save_dir), 'w') as f:
        yaml.safe_dump(config_dict, f)
|
||||
|
||||
|
|
|
@ -3,6 +3,29 @@ import copy
|
|||
import opendelta.utils.logging as logging
|
||||
from opendelta.utils.visualization import Visualization
|
||||
logger = logging.get_logger(__name__)
|
||||
# Maps OPT module names (keys) to the names given under "__name__"; nested
# dicts describe child modules, and "$" is the per-layer placeholder key.
# NOTE(review): the child key under "model.decoder" is "layer"; verify it
# against the backbone's actual attribute name (possibly "layers").
opt_mapping = {
    "model.decoder.embed_tokens": {"__name__": "embeddings"},
    "model.decoder.embed_positions": {"__name__": ""},
    "model.decoder.project_out": {"__name__": ""},
    "model.decoder.project_in": {"__name__": ""},
    "model.decoder": {
        "__name__": "decoder",
        "layer": {
            "__name__": "block",
            "$": {
                "__name__": "$",
                "self_attn": {
                    "__name__": "attn",
                    "q_proj": {"__name__": "q"},
                    "k_proj": {"__name__": "k"},
                    "v_proj": {"__name__": "v"},
                    "out_proj": {"__name__": "proj"},
                },
                # Both layer norms map to the same name, "layer_norm".
                "self_attn_layer_norm": {"__name__": "layer_norm"},
                "fc1": {"__name__": "ff.w1"},
                "fc2": {"__name__": "ff.w2"},
                "final_layer_norm": {"__name__": "layer_norm"},
            },
        },
    },
}
|
||||
|
||||
t5_mapping = {
|
||||
"shared": {"__name__":"embeddings"},
|
||||
"encoder": {"__name__":"encoder",
|
||||
|
@ -275,6 +298,14 @@ def mapping_for_ConditionalGeneration(mapping, type):
|
|||
raise NotImplementedError(MAPPINGERROR_MSG.format())
|
||||
return mapping
|
||||
|
||||
def mapping_for_CausalLM(mapping, type):
    """Return a deep copy of ``mapping`` extended with an ``lm_head`` entry.

    Only ``type == "opt"`` is handled; any other value raises
    ``NotImplementedError``.  The input mapping is left untouched.
    """
    extended = copy.deepcopy(mapping)
    if type != "opt":
        raise NotImplementedError
    extended["lm_head"] = {"__name__":"lm_head.proj"}
    return extended
|
||||
|
||||
class _LazyLoading(OrderedDict):
|
||||
def __init__(self, mapping):
|
||||
self._mapping_string = mapping
|
||||
|
@ -304,7 +335,9 @@ class CommonStructureMap(object):
|
|||
"RobertaForMaskedLM": "roberta_mapping",
|
||||
"BertForMaskedLM": "bert_mapping",
|
||||
"T5ForConditionalGeneration": """mapping_for_ConditionalGeneration(t5_mapping, "t5")""",
|
||||
"DebertaV2ForSequenceClassification": """mapping_for_SequenceClassification(debertav2_mapping, "deberta")"""
|
||||
"DebertaV2ForSequenceClassification": """mapping_for_SequenceClassification(debertav2_mapping, "deberta")""",
|
||||
"CLIPModel":"""""",
|
||||
"OPTForCausalLM":"""mapping_for_CausalLM(opt_mapping,"opt")"""
|
||||
})
|
||||
|
||||
SpecialModelInverseMaps = {
|
||||
|
|
|
@ -8,4 +8,6 @@ decorator
|
|||
rich
|
||||
web.py
|
||||
gitpython
|
||||
scipy
|
||||
sklearn
|
||||
delta_center_client==0.0.4
|
||||
|
|
Loading…
Reference in New Issue