Merge pull request #28 from thunlp/new_backbones

New backbones
This commit is contained in:
StingNing 2022-08-06 16:17:39 +08:00 committed by GitHub
commit f8cb2d3138
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
31 changed files with 1318 additions and 22 deletions

13
.gitignore vendored
View File

@ -38,5 +38,18 @@ log.txt
**/output/
**/thunlp/
**/saved_ckpts/
DeltaCenter-Python-Client/
backbone_structure
delta_checkpoints
gitop.sh
load_dataset_and_model.ipynb
load_model.py
scripts
t.py
t.sh
!examples/examples_prompt/configs/*/*.json
!examples/examples_prompt/configs/**
**/delta_checkpoints/
**/outputs/

View File

@ -43,7 +43,8 @@ def preprocess_function(raw_example, **kwargs):
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
# model_args.config_name if model_args.config_name else model_args.model_name_or_path,
model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,

View File

@ -8,7 +8,6 @@ from transformers import (
AutoFeatureExtractor,
AutoModelForImageClassification,
)
from transformers import ViTFeatureExtractor
from transformers import Trainer as HfTrainer
import torch.nn as nn
@ -26,9 +25,10 @@ def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
def preprocess_function(raw_example, **kwargs):
# from IPython import embed; embed(header="Therefa")
tokenizer = kwargs['tokenizer']
model_inputs = tokenizer(raw_example['image'], return_tensors='pt')
# print(np.array(raw_example['img']).shape)
model_inputs = tokenizer(np.array(raw_example['image']), return_tensors='pt')
model_inputs['pixel_values'] = model_inputs['pixel_values'].squeeze()
model_inputs['labels'] = raw_example['labels']
model_inputs['labels'] = raw_example['label']
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
@ -55,7 +55,7 @@ def mask_token_func(tokenizer, ith_mask=0):
def get_remove_columns(dataset_features):
# dataset_features.pop("label")
print("remove_columns: {}".format(dataset_features))
# print("remove_columns: {}".format(dataset_features))
return dataset_features
class DataCollator(HfDataCollatorMixin):

View File

@ -0,0 +1,169 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import numpy as np
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
)
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
import copy
from torch.nn import CrossEntropyLoss
def preprocess_function(raw_example, **kwargs):
    """Convert one raw dataset record into tokenizer features.

    The record is turned into an ``InputExample``, wrapped by the prompt
    template, merged by the tokenizer wrapper, and then tokenized with
    padding and truncation to ``data_args.max_source_length``.

    Args:
        raw_example: Mapping whose items are passed as keyword arguments to
            ``InputExample``.
        **kwargs: Must contain ``tokenizer``, ``data_args``, ``template`` and
            ``tokenizer_wrapper``. Other entries (for example ``verbalizer``)
            are accepted and ignored.

    Returns:
        The tokenizer output for the merged prompt.

    Raises:
        KeyError: If one of the required keyword arguments is missing.
    """
    tokenizer = kwargs['tokenizer']
    data_args = kwargs['data_args']
    template = kwargs['template']
    tokenizer_wrapper = kwargs['tokenizer_wrapper']

    example = InputExample(**raw_example)
    # wrap_one_example returns a pair; only the wrapped example is used here.
    wrapped_example, _ = template.wrap_one_example(example)
    input_sentence = tokenizer_wrapper.merge_wrapped_example(wrapped_example)
    return tokenizer(
        input_sentence,
        max_length=data_args.max_source_length,
        padding="max_length",
        truncation=True,
    )
def compute_metrics(eval_preds, dataset_name, eval_metric):
    """Do nothing and return ``None``.

    All three arguments are ignored; the ``Trainer`` class in this module
    installs its own ``_compute_metrics`` instead.
    """
    return None
def mask_token_func(tokenizer, ith_mask=0):
    """Return ``tokenizer.pad_token`` as the mask token.

    Args:
        tokenizer: Object exposing a ``pad_token`` attribute.
        ith_mask: Accepted but ignored; the same token is returned for every
            mask position.
    """
    pad_token = tokenizer.pad_token
    return pad_token
def get_remove_columns(dataset_features):
    """Return ``dataset_features`` itself, unchanged (no entry is filtered out)."""
    columns = dataset_features
    return columns
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
    """Build the prompt template, verbalizer and tokenizer wrapper for ``task``.

    Args:
        task: Object whose ``templates_text`` is indexed by ``template_id``.
        tokenizer: Tokenizer handed to the verbalizer and the wrapper.
        data_args: Provides ``max_source_length`` for the wrapper.
        template_id: Key of the template text to use.
        verbalizer_id: Accepted but not used by this function.

    Returns:
        Tuple ``(template, verbalizer, tokenizer_wrapper)``.
    """
    from openpromptu.prompts import GenerationVerbalizer
    from openpromptu.prompts import ManualTemplate
    from openpromptu import TokenizerWrapper

    prompt_text = task.templates_text[template_id]
    prompt_template = ManualTemplate(text=prompt_text)
    generation_verbalizer = GenerationVerbalizer(
        tokenizer=tokenizer,
        classes=None,
        label_words=None,
    )
    wrapper = TokenizerWrapper(
        max_seq_length=data_args.max_source_length,
        tokenizer=tokenizer,
        truncate_method="balanced",
        mask_token_func=mask_token_func,
    )
    return prompt_template, generation_verbalizer, wrapper
def get_backbone(model_args, **kwargs):
    """Load the config, tokenizer and causal language model for a backbone.

    Args:
        model_args: Provides ``model_name_or_path``, ``config_name``,
            ``tokenizer_name``, ``cache_dir``, ``model_revision``,
            ``use_auth_token`` and ``use_fast_tokenizer``.
        **kwargs: Accepted but not used.

    Returns:
        Tuple ``(config, tokenizer, model)``.
    """
    # Arguments shared by all three from_pretrained calls.
    hub_kwargs = dict(
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        **hub_kwargs,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        use_fast=model_args.use_fast_tokenizer,
        **hub_kwargs,
    )
    # preprocess_function pads to max_length and Trainer uses pad_token_id as
    # the loss ignore_index, so a tokenizer without a pad token would fail
    # later. Fall back to the EOS token in that case.
    if getattr(tokenizer, "pad_token", None) is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        **hub_kwargs,
    )
    return config, tokenizer, model
class Trainer(HfSeq2SeqTrainer):
    """Trainer that optimizes and evaluates a causal LM on next-token prediction.

    Targets are the ``input_ids`` shifted by one position. Positions whose
    target equals ``tokenizer.pad_token_id`` are ignored by the loss and are
    removed from the evaluation outputs.
    """

    def __init__(self, verbalizer=None, eval_task=None, **kwargs):
        """Create the trainer.

        Args:
            verbalizer: Accepted but not used by this trainer.
            eval_task: Object whose ``metric`` attribute is iterated in
                ``_compute_metrics``.
            **kwargs: Forwarded unchanged to the parent trainer.
        """
        super().__init__(**kwargs)
        self.eval_task = eval_task
        # Replace whatever compute_metrics the parent stored with the
        # task-driven implementation below.
        self.compute_metrics = self._compute_metrics

    def _next_token_loss(self, model, inputs):
        """Run ``model`` on ``inputs`` and compute the shifted cross-entropy.

        Returns:
            Tuple ``(loss, outputs, shift_logits, shift_labels)`` where
            ``shift_logits`` drops the last position of ``outputs.logits`` and
            ``shift_labels`` drops the first position of ``input_ids``.
        """
        outputs = model(**inputs)
        # The logit at position t is scored against the token at position t+1.
        shift_logits = outputs.logits[..., :-1, :].contiguous()
        # input_ids is only read here, so no defensive copy is required.
        shift_labels = inputs['input_ids'][..., 1:].contiguous().long()
        loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.shape[-1]),
            shift_labels.view(-1),
        )
        return loss, outputs, shift_logits, shift_labels

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return the next-token loss, optionally together with the model outputs.

        Args:
            model: The model to run.
            inputs: Keyword arguments for ``model``; must contain ``input_ids``.
            return_outputs: When true, return ``(loss, outputs)``.
        """
        loss, outputs, _, _ = self._next_token_loss(model, inputs)
        return (loss, outputs) if return_outputs else loss

    def prediction_step(
        self,
        model,
        inputs,
        prediction_loss_only,
        ignore_keys=None,
    ):
        """Perform one evaluation step.

        When ``args.predict_with_generate`` is false or only the loss is
        requested, the parent implementation is used. Otherwise the model is
        run once without gradients and per-token results are returned.

        Args:
            model: The model to evaluate.
            inputs: Keyword arguments for ``model``; must contain ``input_ids``.
            prediction_loss_only: Whether only the loss is wanted.
            ignore_keys: Passed through to the parent implementation.

        Returns:
            Tuple ``(loss, target_probs, target_ids)``: the detached CPU loss
            (NaN replaced by 0), the softmax probability the model assigned to
            each non-pad target token, and those target token ids, both
            flattened over batch and sequence.
        """
        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(
                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
            )

        # From here on prediction_loss_only is known to be false.
        inputs = self._prepare_inputs(inputs)
        with torch.no_grad():
            loss, _, shift_logits, shift_labels = self._next_token_loss(model, inputs)
        loss = loss.detach().cpu()
        # The loss can be NaN (e.g. when every target is ignored); report 0 then.
        loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss)

        # Keep only the positions whose target is not the pad token.
        flat_labels = shift_labels.view(-1).detach().cpu()
        nonpad_idx = flat_labels != self.tokenizer.pad_token_id
        target_ids = flat_labels[nonpad_idx]
        flat_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu()

        # Pick, per position, the probability of the target token. gather
        # avoids materializing a dense (positions, vocab) one-hot mask.
        target_probs = flat_logits.softmax(dim=-1).gather(-1, target_ids.unsqueeze(-1)).squeeze(-1)
        return (loss, target_probs, target_ids)

    def _compute_metrics(self, eval_preds):
        """Evaluate every metric of ``eval_task`` and add their mean.

        Args:
            eval_preds: Pair ``(preds, labels)``.

        Returns:
            Dict with the merged metric results plus ``"average_metrics"``,
            the mean of all metric values.

        Raises:
            ZeroDivisionError: If the metrics produce no values at all.
        """
        preds, labels = eval_preds
        result = {}
        for metric in self.eval_task.metric:
            result.update(metric(preds, labels, ignore_index=self.tokenizer.pad_token_id))
        result["average_metrics"] = sum(result.values()) / len(result)
        return result

View File

@ -164,7 +164,7 @@ class Trainer(HfSeq2SeqTrainer):
return (loss, generated_tokens, labels)
def _compute_metrics(self, eval_preds):
from IPython import embed; embed(header="In compute metrics")
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

View File

@ -0,0 +1,171 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import numpy as np
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoTokenizer,
)
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
import copy
from torch.nn import CrossEntropyLoss
def preprocess_function(raw_example, **kwargs):
    """Convert one raw dataset record into tokenizer features.

    The record is turned into an ``InputExample``, wrapped by the prompt
    template, merged by the tokenizer wrapper, and then tokenized with
    padding and truncation to ``data_args.max_source_length``.

    Args:
        raw_example: Mapping whose items are passed as keyword arguments to
            ``InputExample``.
        **kwargs: Must contain ``tokenizer``, ``data_args``, ``template`` and
            ``tokenizer_wrapper``. Other entries (for example ``verbalizer``)
            are accepted and ignored.

    Returns:
        The tokenizer output for the merged prompt.

    Raises:
        KeyError: If one of the required keyword arguments is missing.
    """
    tokenizer = kwargs['tokenizer']
    data_args = kwargs['data_args']
    template = kwargs['template']
    tokenizer_wrapper = kwargs['tokenizer_wrapper']

    example = InputExample(**raw_example)
    # wrap_one_example returns a pair; only the wrapped example is used here.
    wrapped_example, _ = template.wrap_one_example(example)
    input_sentence = tokenizer_wrapper.merge_wrapped_example(wrapped_example)
    return tokenizer(
        input_sentence,
        max_length=data_args.max_source_length,
        padding="max_length",
        truncation=True,
    )
def compute_metrics(eval_preds, dataset_name, eval_metric):
    """Do nothing and return ``None``.

    All three arguments are ignored; the ``Trainer`` class in this module
    installs its own ``_compute_metrics`` instead.
    """
    return None
def mask_token_func(tokenizer, ith_mask=0):
    """Return ``tokenizer.pad_token`` as the mask token.

    Args:
        tokenizer: Object exposing a ``pad_token`` attribute.
        ith_mask: Accepted but ignored; the same token is returned for every
            mask position.
    """
    pad_token = tokenizer.pad_token
    return pad_token
def get_remove_columns(dataset_features):
    """Return ``dataset_features`` itself, unchanged (no entry is filtered out)."""
    columns = dataset_features
    return columns
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
    """Build the prompt template, verbalizer and tokenizer wrapper for ``task``.

    Args:
        task: Object whose ``templates_text`` is indexed by ``template_id``.
        tokenizer: Tokenizer handed to the verbalizer and the wrapper.
        data_args: Provides ``max_source_length`` for the wrapper.
        template_id: Key of the template text to use.
        verbalizer_id: Accepted but not used by this function.

    Returns:
        Tuple ``(template, verbalizer, tokenizer_wrapper)``.
    """
    from openpromptu.prompts import GenerationVerbalizer
    from openpromptu.prompts import ManualTemplate
    from openpromptu import TokenizerWrapper

    prompt_text = task.templates_text[template_id]
    prompt_template = ManualTemplate(text=prompt_text)
    generation_verbalizer = GenerationVerbalizer(
        tokenizer=tokenizer,
        classes=None,
        label_words=None,
    )
    wrapper = TokenizerWrapper(
        max_seq_length=data_args.max_source_length,
        tokenizer=tokenizer,
        truncate_method="tail",
        mask_token_func=mask_token_func,
    )
    return prompt_template, generation_verbalizer, wrapper
def get_backbone(model_args, **kwargs):
    """Load the config, tokenizer and causal language model for a backbone.

    If the tokenizer has no pad token, its EOS token is used as pad token.

    Args:
        model_args: Provides ``model_name_or_path``, ``config_name``,
            ``tokenizer_name``, ``cache_dir``, ``model_revision``,
            ``use_auth_token`` and ``use_fast_tokenizer``.
        **kwargs: Accepted but not used.

    Returns:
        Tuple ``(config, tokenizer, model)``.
    """
    # Arguments shared by all three from_pretrained calls.
    hub_kwargs = dict(
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        **hub_kwargs,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        use_fast=model_args.use_fast_tokenizer,
        **hub_kwargs,
    )
    # Covers both a missing attribute and an unset (None) pad token.
    if getattr(tokenizer, "pad_token", None) is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name_or_path,
        from_tf=".ckpt" in model_args.model_name_or_path,
        config=config,
        **hub_kwargs,
    )
    return config, tokenizer, model
class Trainer(HfSeq2SeqTrainer):
    """Trainer that optimizes and evaluates a causal LM on next-token prediction.

    Targets are the ``input_ids`` shifted by one position. Positions whose
    target equals ``tokenizer.pad_token_id`` are ignored by the loss and are
    removed from the evaluation outputs.
    """

    def __init__(self, verbalizer=None, eval_task=None, **kwargs):
        """Create the trainer.

        Args:
            verbalizer: Accepted but not used by this trainer.
            eval_task: Object whose ``metric`` attribute is iterated in
                ``_compute_metrics``.
            **kwargs: Forwarded unchanged to the parent trainer.
        """
        super().__init__(**kwargs)
        self.eval_task = eval_task
        # Replace whatever compute_metrics the parent stored with the
        # task-driven implementation below.
        self.compute_metrics = self._compute_metrics

    def _next_token_loss(self, model, inputs):
        """Run ``model`` on ``inputs`` and compute the shifted cross-entropy.

        Returns:
            Tuple ``(loss, outputs, shift_logits, shift_labels)`` where
            ``shift_logits`` drops the last position of ``outputs.logits`` and
            ``shift_labels`` drops the first position of ``input_ids``.
        """
        outputs = model(**inputs)
        # The logit at position t is scored against the token at position t+1.
        shift_logits = outputs.logits[..., :-1, :].contiguous()
        # input_ids is only read here, so no defensive copy is required.
        shift_labels = inputs['input_ids'][..., 1:].contiguous().long()
        loss_fct = CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
        loss = loss_fct(
            shift_logits.view(-1, shift_logits.shape[-1]),
            shift_labels.view(-1),
        )
        return loss, outputs, shift_logits, shift_labels

    def compute_loss(self, model, inputs, return_outputs=False):
        """Return the next-token loss, optionally together with the model outputs.

        Args:
            model: The model to run.
            inputs: Keyword arguments for ``model``; must contain ``input_ids``.
            return_outputs: When true, return ``(loss, outputs)``.
        """
        loss, outputs, _, _ = self._next_token_loss(model, inputs)
        return (loss, outputs) if return_outputs else loss

    def prediction_step(
        self,
        model,
        inputs,
        prediction_loss_only,
        ignore_keys=None,
    ):
        """Perform one evaluation step.

        When ``args.predict_with_generate`` is false or only the loss is
        requested, the parent implementation is used. Otherwise the model is
        run once without gradients and per-token results are returned.

        Args:
            model: The model to evaluate.
            inputs: Keyword arguments for ``model``; must contain ``input_ids``.
            prediction_loss_only: Whether only the loss is wanted.
            ignore_keys: Passed through to the parent implementation.

        Returns:
            Tuple ``(loss, target_probs, target_ids)``: the detached CPU loss
            (NaN replaced by 0), the softmax probability the model assigned to
            each non-pad target token, and those target token ids, both
            flattened over batch and sequence.
        """
        if not self.args.predict_with_generate or prediction_loss_only:
            return super().prediction_step(
                model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
            )

        # From here on prediction_loss_only is known to be false.
        inputs = self._prepare_inputs(inputs)
        with torch.no_grad():
            loss, _, shift_logits, shift_labels = self._next_token_loss(model, inputs)
        loss = loss.detach().cpu()
        # The loss can be NaN (e.g. when every target is ignored); report 0 then.
        loss = torch.where(torch.isnan(loss), torch.full_like(loss, 0), loss)

        # Keep only the positions whose target is not the pad token.
        flat_labels = shift_labels.view(-1).detach().cpu()
        nonpad_idx = flat_labels != self.tokenizer.pad_token_id
        target_ids = flat_labels[nonpad_idx]
        flat_logits = shift_logits.view(-1, shift_logits.shape[-1])[nonpad_idx].detach().cpu()

        # Pick, per position, the probability of the target token. gather
        # avoids materializing a dense (positions, vocab) one-hot mask.
        target_probs = flat_logits.softmax(dim=-1).gather(-1, target_ids.unsqueeze(-1)).squeeze(-1)
        return (loss, target_probs, target_ids)

    def _compute_metrics(self, eval_preds):
        """Evaluate every metric of ``eval_task`` and add their mean.

        Args:
            eval_preds: Pair ``(preds, labels)``.

        Returns:
            Dict with the merged metric results plus ``"average_metrics"``,
            the mean of all metric values.

        Raises:
            ZeroDivisionError: If the metrics produce no values at all.
        """
        preds, labels = eval_preds
        result = {}
        for metric in self.eval_task.metric:
            result.update(metric(preds, labels, ignore_index=self.tokenizer.pad_token_id))
        result["average_metrics"] = sum(result.values()) / len(result)
        return result

View File

@ -0,0 +1,48 @@
{
"bottleneck_dim": 24,
"dataset_config_name": [
"en"
],
"delta_type": "adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "beans",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32",
"num_classes": 3,
"num_train_epochs": 20,
"output_dir": "outputs/adapter/clip-vit-base-patch32/beans",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_delta_center": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "beans",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "beans",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/clip-vit-base-patch32",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0
}

View File

@ -0,0 +1,53 @@
{
"backbone_model": "opt",
"bottleneck_dim": 24,
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "wikitext",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":2,
"greater_is_better": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 900,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m",
"model_path_public": "opt-350m",
"num_train_epochs": 3,
"output_dir": "outputs/adapter/opt-350m/wikitext",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 6,
"per_device_train_batch_size": 6,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "wikitext",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "wikitext",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/opt-350m",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["self_attn"]
}

View File

@ -0,0 +1,53 @@
{
"backbone_model": "vit",
"bottleneck_dim": 24,
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": false,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "beans",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k",
"model_path_public": "vit-large-patch16-224-in21k",
"num_classes": 3,
"num_train_epochs": 20,
"output_dir": "outputs/adapter/vit-large-patch16-224-in21k/beans",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "beans",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "beans",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/vit-large-patch16-224-in21k",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["output"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "t5-large",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/t5-large",
"model_path_public": "t5-large",
"num_train_epochs": 20,
"output_dir": "outputs/bitfit/t5-large/rte",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/t5-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attn", "ff", "layer_norm"]
}

View File

@ -0,0 +1,66 @@
{
"backbone_model": "blenderbot",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "compacter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "sst2",
"eval_steps": 200,
"evaluation_strategy": "steps",
"factorized_phm": true,
"factorized_phm_rule": false,
"gradient_clip": false,
"greater_is_better": true,
"hypercomplex_adapters": true,
"hypercomplex_division": 4,
"hypercomplex_nonlinearity": "glorot-uniform",
"learn_phm": true,
"learning_rate": 0.003,
"load_best_model_at_end": true,
"max_source_length": 128,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
"model_path_public": "blenderbot-3b",
"non_linearity": "gelu_new",
"normalize_phm_weight": false,
"num_train_epochs": 3,
"output_dir": "outputs/compacter/blenderbot-3b/sst2",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"phm_c_init": "normal",
"phm_clamp": false,
"phm_init_range": 0.0001,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"shared_phm_rule": false,
"split_validation_test": true,
"task_name": "sst2",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "sst2",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"use_bias_down_sampler": true,
"use_bias_up_sampler": true,
"warmup_steps": 0,
"modified_modules":["fc2"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "deberta-v2-xlarge",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "compacter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mnli",
"eval_steps": 500,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge",
"num_train_epochs": 3,
"output_dir": "outputs/compacter/deberta-v2-xlarge/mnli",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hub": false,
"save_steps": 500,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "mnli",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mnli",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/deberta-v2-xlarge",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attention"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "long-t5",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "compacter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large",
"model_path_public": "long-t5-tglobal-large",
"num_train_epochs": 20,
"output_dir": "outputs/compacter/long-t5-tglobal-large/rte",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/long-t5-tglobal-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attn", "ff", "layer_norm"]
}

View File

@ -2,7 +2,7 @@ import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
# PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}

View File

@ -0,0 +1,52 @@
{
"backbone_model": "beit",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "cifar10",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224",
"model_path_public": "beit-large-patch16-224",
"num_classes": 10,
"num_train_epochs": 20,
"output_dir": "outputs/lora/beit-large-patch16-224/cifar10",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "cifar10",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "cifar10",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/beit-large-patch16-224",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["query","value"]
}

View File

@ -0,0 +1,52 @@
{
"backbone_model": "gpt-j",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "wikitext",
"eval_steps": 500,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":4,
"greater_is_better": false,
"learning_rate": 0.00003,
"load_best_model_at_end": true,
"max_source_length": 512,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B",
"model_path_public": "gpt-j-6B",
"num_train_epochs": 2,
"output_dir": "outputs/lora/gpt-j-6B/wikitext",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 2,
"per_device_train_batch_size": 2,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 500,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "wikitext",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "wikitext",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt-j-6B",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["20.attn.q_proj","21.attn.q_proj","22.attn.q_proj","23.attn.q_proj","24.attn.q_proj","25.attn.q_proj","26.attn.q_proj","27.attn.q_proj"]
}

View File

@ -0,0 +1,52 @@
{
"backbone_model": "roberta-large",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-boolq",
"eval_steps": 200,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0001,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large",
"model_path_public": "roberta-large",
"num_train_epochs": 20,
"output_dir": "outputs/lora/roberta-large/superglue-boolq",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": false,
"push_to_hub": false,
"push_to_dc": true,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-boolq",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-boolq",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/roberta-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["query","value"]
}

View File

@ -0,0 +1,52 @@
{
"backbone_model": "xlm-roberta-large",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "lora",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-wic",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large",
"model_path_public": "xlm-roberta-large",
"num_train_epochs": 20,
"output_dir": "outputs/lora/xlm-roberta-large/superglue-wic",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hub": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "superglue-wic",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-wic",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/xlm-roberta-large",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["query","value"]
}

View File

@ -0,0 +1,52 @@
{
"backbone_model": "gpt2",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "low_rank_adapter",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "wikitext",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":1,
"greater_is_better": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 768,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/gpt2",
"model_path_public": "gpt2",
"num_train_epochs": 2,
"output_dir": "outputs/low_rank_adapter/gpt2/wikitext",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "wikitext",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "wikitext",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/gpt2",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attn","mlp"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "bert-large-cased",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "prefix",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 100,
"evaluation_strategy": "steps",
"greater_is_better": true,
"is_seq2seq": false,
"learning_rate": 0.0003,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased",
"num_train_epochs": 20,
"output_dir": "outputs/prefix/bert-large-cased/rte",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": false,
"push_to_dc": true,
"push_to_hub": false,
"save_steps": 100,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bert-large-cased",
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"warmup_steps": 0,
"modified_modules":["attention"]
}

View File

@ -0,0 +1,51 @@
{
"backbone_model": "bart",
"dataset_config_name": [
"en"
],
"datasets_load_from_disk": true,
"datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
"delta_type": "soft_prompt",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "superglue-boolq",
"eval_steps": 500,
"evaluation_strategy": "steps",
"gradient_accumulation_steps":1,
"greater_is_better": true,
"learning_rate": 0.1,
"load_best_model_at_end": true,
"max_source_length": 256,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/bart-large",
"model_path_public": "bart-large",
"num_train_epochs": 50,
"output_dir": "outputs/soft_prompt/bart-large/superglue-boolq",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_dc": true,
"push_to_hf": false,
"save_steps": 500,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 42,
"soft_token_num":100,
"split_validation_test": true,
"task_name": "superglue-boolq",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "superglue-boolq",
"tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/bart-large",
"token_init": true,
"unfrozen_modules": [
"deltas"
],
"warmup_steps": 0
}

View File

@ -93,4 +93,10 @@ class AbstractTask(abc.ABC):
# shuffles the data and samples it.
if n_obs is not None:
dataset = self.subsample(dataset, n_obs)
return dataset.map(self.preprocessor)
this_method = getattr(self.__class__, 'preprocessor')
base_method = getattr(AbstractTask, 'preprocessor')
if this_method is not base_method:
return dataset.map(self.preprocessor)
else:
return dataset

View File

@ -556,12 +556,74 @@ class Beans(AbstractTask):
# from IPython import embed; embed(header="beans")
offline = os.environ.get("HF_DATASETS_OFFLINE", "0")
if offline == '1':
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split]
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/beans")[split]
else:
return datasets.load_dataset('beans', split=split, script_version="master")
class Wikitext(AbstractTask):
    """WikiText language-modelling task (hub config ``wikitext-2-v1``), scored by perplexity."""

    name = "wikitext"
    # Both "validation" and "test" read the dataset's "validation" split.
    split_to_data_split = {
        "train": "train",
        "validation": "validation",
        "test": "validation",
    }
    metric = [metrics.perplexity]
    metric_names = ["perplexity"]
    # A single, empty verbalizer: no label words are declared for this task.
    verbalizers = {"0": {}}
    templates_text = {"0": '{"meta":"text"}'}
    # Presumably tells the base class to carve the test set out of the
    # validation split -- TODO confirm against AbstractTask.
    split_valid_to_make_test = True

    def load_dataset(self, split):
        """Return the requested split, from disk or from the hub.

        When ``data_args.datasets_load_from_disk`` is truthy the dataset is
        read from ``<datasets_saved_path>/wikitext``; otherwise it is fetched
        with ``datasets.load_dataset``.
        """
        if not self.data_args.datasets_load_from_disk:
            return datasets.load_dataset('wikitext','wikitext-2-v1', split=split, script_version="master")
        saved = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/wikitext")
        return saved[split]
class Cifar10(AbstractTask):
    """CIFAR-10 image-classification task, scored by accuracy."""

    name = "cifar10"
    # Both "validation" and "test" read the dataset's "test" split.
    split_to_data_split = {
        "train": "train",
        "validation": "test",
        "test": "test",
    }
    metric = [metrics.accuracy]
    metric_names = ["accuracy"]

    # No ``preprocessor`` override is defined for this task.

    def load_dataset(self, split):
        """Return the requested split, from disk or from the hub."""
        if not self.data_args.datasets_load_from_disk:
            return datasets.load_dataset('cifar10', split=split, script_version="master")
        on_disk = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/cifar10")
        # NOTE(review): this path keeps only the first 100 examples and prints
        # the subset, while the hub path above returns the full split. This
        # looks like a debugging leftover -- confirm before relying on results.
        subset = on_disk[split].select(range(100))
        print(subset)
        return subset
class Fashion_MNIST(AbstractTask):
    """Fashion-MNIST image-classification task, scored by accuracy."""

    # NOTE(review): the task is registered in TASK_MAPPING as 'fashion_mnist'
    # while ``name`` is "Fashion-MNIST" -- confirm the mismatch is intended.
    name = "Fashion-MNIST"
    # Both "validation" and "test" read the dataset's "test" split.
    split_to_data_split = {
        "train": "train",
        "validation": "test",
        "test": "test",
    }
    metric = [metrics.accuracy]
    metric_names = ["accuracy"]

    def load_dataset(self, split):
        """Return the requested split, from disk or from the hub."""
        if not self.data_args.datasets_load_from_disk:
            return datasets.load_dataset('fashion_mnist', split=split, script_version="master")
        on_disk = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/fashion_mnist")
        selected = on_disk[split]
        # The loaded split is echoed to stdout before being returned.
        print(selected)
        return selected
TASK_MAPPING = OrderedDict(
[
@ -581,7 +643,10 @@ TASK_MAPPING = OrderedDict(
('superglue-multirc', SuperGLUEMultiRC),
('superglue-wic', SuperGLUEWIC),
# ('superglue-record', SuperGLUERecord)
('beans', Beans)
('beans', Beans),
('wikitext',Wikitext),
('cifar10',Cifar10),
('fashion_mnist',Fashion_MNIST)
]
)

View File

@ -11,6 +11,14 @@ import sklearn.metrics
logger = getLogger(__name__)
def perplexity(outputs, targets, ignore_index=-100):
    """Return ``exp`` of the mean negative log of ``outputs``.

    Args:
        outputs: array-like of positive values; presumably the probabilities
            assigned to the reference tokens -- TODO confirm with the caller.
        targets: accepted for signature compatibility; not used here.
        ignore_index: accepted for signature compatibility; not used here.

    Returns:
        ``{"perplexity": <float>}``.
    """
    mean_log = np.log(outputs).mean()
    cross_entropy = -mean_log
    return {"perplexity": float(np.exp(cross_entropy))}
def accuracy(predictions, targets) -> dict:
    """Return the percentage of positions where predictions equal targets.

    Both inputs are converted with ``np.array`` and compared element-wise;
    the result is ``{"accuracy": <value in [0, 100]>}``.
    """
    matches = np.array(predictions) == np.array(targets)
    return {"accuracy": 100 * matches.mean()}
@ -102,8 +110,8 @@ def f1_score(predictions, targets) -> dict:
Returns:
F1 score, where any prediction != 0 or 1 is counted as wrong.
"""
targets = targets.astype(np.int32)
predictions = predictions.astype(np.int32)
targets = np.array(targets).astype(np.int32)
predictions = np.array(predictions).astype(np.int32)
return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}
# TODO: maybe gaurd against invalid values https://stackoverflow.com/questions/56865344/how-do-i-calculate-the-matthews-correlation-coefficient-in-tensorflow

View File

@ -31,6 +31,7 @@ os.environ['MKL_THREADING_LAYER'] = 'GNU'
os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
os.environ["TOKENIZERS_PARALLELISM"] = "false"
sys.path.append(os.path.join(os.getcwd(), "../"))
# sys.path.append(os.path.join(os.getcwd(), "/mnt/sfs_turbo/zhangzhen/OpenDelta"))
sys.path.append(os.path.join(os.getcwd()))
import functools
@ -120,7 +121,8 @@ def main():
if os.path.basename(model_args.model_name_or_path).startswith("t5"):
if os.path.basename(model_args.model_name_or_path).startswith("t5") \
or os.path.basename(model_args.model_name_or_path).startswith("long-t5") :
from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
from examples_prompt.backbones.t5 import Trainer, DataCollator
elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"):
@ -128,7 +130,9 @@ def main():
from examples_prompt.backbones.blenderbot import Trainer, DataCollator
elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \
or os.path.basename(model_args.model_name_or_path).startswith("bert") \
or os.path.basename(model_args.model_name_or_path).startswith("albert") :
or os.path.basename(model_args.model_name_or_path).startswith("albert") \
or os.path.basename(model_args.model_name_or_path).startswith("xlm-roberta") \
or os.path.basename(model_args.model_name_or_path).startswith("deberta") :
from examples_prompt.backbones.bert import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
from examples_prompt.backbones.bert import Trainer, DataCollator
elif os.path.basename(model_args.model_name_or_path).startswith("beit"):
@ -143,6 +147,10 @@ def main():
elif os.path.basename(model_args.model_name_or_path).startswith("clip"):
from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
from examples_prompt.backbones.clip import Trainer, DataCollator
elif os.path.basename(model_args.model_name_or_path).startswith("opt") \
or os.path.basename(model_args.model_name_or_path).startswith("gpt"):
from examples_prompt.backbones.opt import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
from examples_prompt.backbones.opt import Trainer, DataCollator
@ -324,11 +332,12 @@ def main():
delta_model.save_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path,
push_to_dc=training_args.push_to_dc,
center_args={"test_performance":all_results['test'][data_args.task_name]['test_average_metrics'],
"backbone_model_path_public":model_args.model_name_or_path.split("/")[-1]},
},
center_args_pool = {**vars(model_args), **vars(data_args), **vars(training_args), **vars(delta_args)},
list_tags = ['NLI'],
dict_tags = {'purpose':'for testing'},
delay_push=delta_args.delay_push,
delay_push=True,
test_result=all_results['test']
)

View File

@ -271,10 +271,57 @@ class AdapterArguments:
bottleneck_dim: Optional[int] = field(
default=24, metadata={"help": "the dimension of the bottleneck layer"}
)
@dataclass
class LoRAArguments:
    """Arguments specific to the ``lora`` delta type."""

    # Rank of the low-rank update matrices; defaults to 8.
    lora_r: Optional[int] = field(
        default=8, metadata={"help": "the rank of the LoRA matrices."}
    )
@dataclass
class PrefixArguments:
    """Placeholder for ``prefix``-specific arguments; declares no fields."""
@dataclass
class BitFitArguments:
    """Placeholder for ``bitfit``-specific arguments; declares no fields."""
@dataclass
class SoftPromptArguments:
    """Arguments specific to the ``soft_prompt`` delta type."""

    # Number of soft tokens; defaults to 100.
    soft_token_num: Optional[int] = field(
        default=100,
        metadata=dict(help="the num of soft tokens."),
    )
@dataclass
class CompacterArguments:
    """Placeholder for ``compacter``-specific arguments; declares no fields."""
@dataclass
class LowRankAdapterArguments:
    """Placeholder for ``low_rank_adapter``-specific arguments; declares no fields."""
# from opendelta.delta_models.adapter import AdapterConfig
# from opendelta.delta_models.bitfit import BitFitConfig
# from opendelta.delta_models.compacter import CompacterConfig
# from opendelta.delta_models.lora import LoraArguments
# from opendelta.delta_models.low_rank_adapter import LowRankAdapterConfig
# from opendelta.delta_models.prefix import PrefixConfig
# from opendelta.delta_models.soft_prompt import SoftPromptConfig
# DELTAARGMAP = {
# "adapter": AdapterConfig,
# "lora":LoraArguments,
# "prefix":PrefixConfig,
# "bitfit":BitFitConfig,
# "soft_prompt":SoftPromptConfig,
# "compacter":CompacterConfig,
# "low_rank_adapter":LowRankAdapterConfig
# }
DELTAARGMAP = {
"adapter": AdapterArguments
"adapter": AdapterArguments,
"lora":LoRAArguments,
"prefix":PrefixArguments,
"bitfit":BitFitArguments,
"soft_prompt":SoftPromptArguments,
"compacter":CompacterArguments,
"low_rank_adapter":LowRankAdapterArguments
}
# TODO: add more specific delta arguments
@ -325,13 +372,14 @@ class RemainArgHfArgumentParser(HfArgumentParser):
for d in outputs:
if isinstance(d, DeltaArguments): # merge the specific delta arguments
d.merge_arguments(outputs[-1])
return *(outputs[:-1]), remain_args
return [*(outputs[:-1]), remain_args]
else:
outputs = self.parse_args_into_dataclasses(args=data_list, return_remaining_strings=return_remaining_args)
for d in outputs:
if isinstance(d, DeltaArguments):
d.merge_arguments(outputs[-1])
return (*(outputs[:-1]),)
return [*(outputs[:-1]),]
def parse_args_into_dataclasses(
self, args=None, return_remaining_strings=False, look_for_args_file=True, args_filename=None

View File

@ -0,0 +1,29 @@
def create_hub_repo_name(root = "DeltaHub",
                         dataset = None,
                         delta_type = None,
                         model_name_or_path = None,
                         center_value_only_tags = None,
                         center_key_value_tags = None
                         ):
    r"""Build a hub repository name by concatenating the arguments.

    The result has the form
    ``<root>/<delta_type>_<model>_<dataset>[_<tag>...][_<key>-<value>...]``,
    where ``<model>`` is the last path component of ``model_name_or_path``.

    Args:
        root: namespace placed before the ``/``.
        dataset: dataset name; formatted with ``str``.
        delta_type: delta method name; formatted with ``str``.
        model_name_or_path: model identifier or filesystem path (must be a
            string).
        center_value_only_tags: optional iterable of tags appended verbatim.
        center_key_value_tags: optional mapping appended as ``key-value``.

    Returns:
        The repository name as a string.
    """
    # Keep only the final path component; stripping a trailing "/" first
    # prevents a path such as "cache/gpt2/" from yielding an empty model name.
    model_name = model_name_or_path.rstrip("/").split("/")[-1]
    parts = [f"{delta_type}", f"{model_name}", f"{dataset}"]
    # Missing or empty tag collections contribute nothing. Appending ``None``
    # here would make ``"_".join`` raise TypeError.
    if center_value_only_tags:
        parts.extend(str(tag) for tag in center_value_only_tags)
    if center_key_value_tags:
        parts.extend(f"{k}-{v}" for k, v in center_key_value_tags.items())
    return root + "/" + "_".join(parts)

View File

@ -108,6 +108,7 @@ class SaveLoadMixin:
list_tags: Optional[List] = list(),
dict_tags: Optional[Dict] = dict(),
delay_push: bool = False,
test_result = None,
usage: Optional[str] = "",
):
r"""
@ -177,9 +178,13 @@ class SaveLoadMixin:
logger.info("\n"+"*"*30+f"\nYou delta models has been saved locally to:\n\t{os.path.abspath(save_directory)}"
)
state_dict_total_params = sum(p.numel() for p in state_dict.values())
other_tags={}
other_tags.update({'state_dict_total_params(M)':state_dict_total_params/1024/1024})
other_tags.update({'test_result':test_result})
if push_to_dc:
logger.info("Creating yaml file for delta center")
self.create_yml(save_directory, final_center_args, list_tags, dict_tags)
self.create_yml(save_directory, final_center_args, list_tags, dict_tags, other_tags)
if not delay_push:
OssClient.upload(base_dir=save_directory)
@ -190,11 +195,13 @@ class SaveLoadMixin:
def create_yml(self, save_dir, config, list_tags=list(), dict_tags=dict(),other_tags=None):
    """Write ``config`` plus the given tags to ``<save_dir>/config.yml``.

    Args:
        save_dir: directory in which ``config.yml`` is created.
        config: object whose attributes (``vars(config)``) are dumped.
        list_tags: stored under the ``list_tags`` key.
        dict_tags: stored under the ``dict_tags`` key.
        other_tags: optional mapping merged into the top level of the dump.
    """
    # ``vars`` returns the object's own ``__dict__``, so the tag entries
    # added below also become attributes of ``config``.
    config_dict = vars(config)
    config_dict['dict_tags'] = dict_tags
    config_dict['list_tags'] = list_tags
    if other_tags is not None:
        config_dict.update(other_tags)
    # The context manager closes the file even if dumping raises.
    with open("{}/config.yml".format(save_dir), 'w') as f:
        yaml.safe_dump(config_dict, f)

View File

@ -3,6 +3,29 @@ import copy
import opendelta.utils.logging as logging
from opendelta.utils.visualization import Visualization
logger = logging.get_logger(__name__)
# Structure mapping for OPT-style causal language models. Each key names a
# backbone sub-module and the nested "__name__" entry gives the name it is
# mapped to; an empty "__name__" is given to modules without a mapped name.
# "$" presumably stands for the per-layer index -- TODO confirm.
# NOTE(review): Hugging Face's OPTDecoder registers its blocks as `layers`;
# confirm that the "layer" key below matches the real module name.
# NOTE(review): "self_attn_layer_norm" and "final_layer_norm" both map to
# "layer_norm" -- confirm the duplicate target name is intended.
opt_mapping = {
"model.decoder.embed_tokens": {"__name__":"embeddings"},
"model.decoder.embed_positions": {"__name__":""},
"model.decoder.project_out": {"__name__":""},
"model.decoder.project_in": {"__name__":""},
"model.decoder": {"__name__":"decoder",
"layer": {"__name__":"block",
"$": {"__name__":"$",
"self_attn": {"__name__":"attn",
"q_proj": {"__name__":"q"},
"k_proj": {"__name__":"k"},
"v_proj": {"__name__":"v"},
"out_proj": {"__name__":"proj"}
},
"self_attn_layer_norm": {"__name__":"layer_norm"},
"fc1": {"__name__":"ff.w1"},
"fc2": {"__name__":"ff.w2"},
"final_layer_norm": {"__name__":"layer_norm"},
}
}
}
}
t5_mapping = {
"shared": {"__name__":"embeddings"},
"encoder": {"__name__":"encoder",
@ -275,6 +298,14 @@ def mapping_for_ConditionalGeneration(mapping, type):
raise NotImplementedError(MAPPINGERROR_MSG.format())
return mapping
def mapping_for_CausalLM(mapping, type):
    """Return a deep copy of ``mapping`` extended with the causal-LM head.

    Args:
        mapping: base structure mapping; it is not modified.
        type: backbone family; only ``"opt"`` is supported.

    Returns:
        The copied mapping with an added ``"lm_head"`` entry.

    Raises:
        NotImplementedError: if ``type`` is anything other than ``"opt"``.
    """
    extended = copy.deepcopy(mapping)
    if type != "opt":
        raise NotImplementedError
    extended["lm_head"] = {"__name__":"lm_head.proj"}
    return extended
class _LazyLoading(OrderedDict):
def __init__(self, mapping):
self._mapping_string = mapping
@ -304,7 +335,9 @@ class CommonStructureMap(object):
"RobertaForMaskedLM": "roberta_mapping",
"BertForMaskedLM": "bert_mapping",
"T5ForConditionalGeneration": """mapping_for_ConditionalGeneration(t5_mapping, "t5")""",
"DebertaV2ForSequenceClassification": """mapping_for_SequenceClassification(debertav2_mapping, "deberta")"""
"DebertaV2ForSequenceClassification": """mapping_for_SequenceClassification(debertav2_mapping, "deberta")""",
"CLIPModel":"""""",
"OPTForCausalLM":"""mapping_for_CausalLM(opt_mapping,"opt")"""
})
SpecialModelInverseMaps = {

View File

@ -8,4 +8,6 @@ decorator
rich
web.py
gitpython
scipy
sklearn
delta_center_client==0.0.4