move must_try to unittest, unitest to examples
This commit is contained in:
parent
085d388102
commit
62f03f0068

@ -0,0 +1,70 @@
# Use transformers as usual.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
t5_tokenizer = AutoTokenizer.from_pretrained("t5-large")

# A running example. The input is deliberately misspelled, since the delta
# model loaded below is a spelling-correction demo.
input_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt")
t5_tokenizer.decode(t5.generate(input_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'


# Use existing delta models.
from opendelta import AutoDeltaModel, AutoDeltaConfig

# Load an existing delta model from DeltaCenter and attach it to the backbone.
delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5)

# Freeze the whole backbone model except the delta modules.
delta.freeze_module()
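
# After freezing, only the delta parameters should require gradients. A quick
# sanity check (a minimal sketch using the objects created above):
trainable = sum(p.numel() for p in t5.parameters() if p.requires_grad)
total = sum(p.numel() for p in t5.parameters())
print(f"trainable parameters: {trainable} / {total} ({100 * trainable / total:.2f}%)")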

# Visualize the change.
delta.log()


t5_tokenizer.decode(t5.generate(input_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>


# Now save only the delta modules, not the whole backbone model, to .tmp/
delta.save_finetuned(".tmp")
import os; os.listdir(".tmp")
# >>> The state dict size is 1.443 MB
# >>> We encourage users to push their final and public models to DeltaCenter to share them with the community!
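
# To confirm that only the delta weights were serialized, inspect the saved
# state dict (a sketch; it assumes the checkpoint uses the usual
# `pytorch_model.bin` file name, which may differ across OpenDelta versions):
import torch
state_dict = torch.load(".tmp/pytorch_model.bin", map_location="cpu")
print(len(state_dict), "tensors,", sum(t.numel() for t in state_dict.values()), "parameters")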


# Reload the delta from the local path and attach it to a fresh pre-trained T5.
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
delta1 = AutoDeltaModel.from_finetuned(".tmp", backbone_model=t5)
import shutil; shutil.rmtree(".tmp")  # don't forget to remove the tmp files.
t5_tokenizer.decode(t5.generate(input_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>


# Detach the delta modules; the model returns to its unmodified state.
delta1.detach()
t5_tokenizer.decode(t5.generate(input_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'


# Use the default configuration on a customized wrapper model that has a PLM
# inside. This is a common need for users.
import torch.nn as nn

class WrappedModel(nn.Module):
    def __init__(self, inner_model):
        super().__init__()
        self.inner = inner_model

    def forward(self, *args, **kwargs):
        return self.inner(*args, **kwargs)

wrapped_model = WrappedModel(WrappedModel(t5))


# Say we use LoRA.
delta_config = AutoDeltaConfig.from_dict({"delta_type": "lora"})
delta2 = AutoDeltaModel.from_config(delta_config, backbone_model=wrapped_model)
delta2.log()
# >>> root
#     -- inner
#        -- inner
#           ...
#           ... lora_A:[8,1024], lora_B:[1024,8]
delta2.detach()


# Use a non-default configuration.
# Say we add LoRA with rank 5 to the last four layers of the T5 decoder.
delta_config3 = AutoDeltaConfig.from_dict({
    "delta_type": "lora",
    "modified_modules": [r"[r]decoder.*((20)|(21)|(22)|(23)).*DenseReluDense\.wi"],
    "lora_r": 5,
})
delta3 = AutoDeltaModel.from_config(delta_config3, backbone_model=wrapped_model)
delta3.log()
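
# To preview which submodules a "[r]"-prefixed regex will hit, you can match
# the pattern against the backbone's module names yourself (a minimal sketch;
# the pattern is the one from delta_config3 with the "[r]" marker stripped):
import re
pattern = re.compile(r"decoder.*((20)|(21)|(22)|(23)).*DenseReluDense\.wi")
print([name for name, _ in wrapped_model.named_modules() if pattern.search(name)])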

@ -0,0 +1,200 @@
import time
import random
import torch
import bmtrain as bmt
import numpy as np
import os
import csv

from model_center import get_args
from model_center.model import CPM2
from model_center.tokenizer import CPM2Tokenizer
from model_center.dataset.cpm2dataset import DATASET
from model_center.utils import print_inspect
from model_center.dataset import DistributedDataLoader


def get_tokenizer(args):
    tokenizer = CPM2Tokenizer.from_pretrained(args.model_config)
    return tokenizer


def get_model(args):
    model = CPM2.from_pretrained(args.model_config)
    return model


def get_optimizer(args, model):
    optimizer = bmt.optim.AdamOffloadOptimizer(model.parameters(), weight_decay=args.weight_decay)
    return optimizer


def get_learning_rate_scheduler(args, optimizer):
    if args.lr_decay_iters is None:
        args.lr_decay_iters = args.train_iters * args.epochs
    if args.lr_decay_style == "noam":
        lr_scheduler = bmt.lr_scheduler.Noam(optimizer,
                                             start_lr=args.lr,
                                             warmup_iter=args.warmup_iters,
                                             end_iter=args.lr_decay_iters,
                                             num_iter=args.start_step)
    elif args.lr_decay_style == "constant":
        lr_scheduler = bmt.lr_scheduler.NoDecay(optimizer,
                                                start_lr=args.lr,
                                                warmup_iter=args.warmup_iters,
                                                end_iter=-1,
                                                num_iter=args.start_step)
    elif args.lr_decay_style == "linear":
        lr_scheduler = bmt.lr_scheduler.Linear(optimizer,
                                               start_lr=args.lr,
                                               warmup_iter=args.warmup_iters,
                                               end_iter=args.lr_decay_iters,
                                               num_iter=args.start_step)
    elif args.lr_decay_style == "exponential":
        lr_scheduler = bmt.lr_scheduler.Exponential(optimizer,
                                                    start_lr=args.lr,
                                                    warmup_iter=args.warmup_iters,
                                                    end_iter=args.lr_decay_iters,
                                                    num_iter=args.start_step)
    elif args.lr_decay_style == "cosine":
        lr_scheduler = bmt.lr_scheduler.Cosine(optimizer,
                                               start_lr=args.lr,
                                               warmup_iter=args.warmup_iters,
                                               end_iter=args.lr_decay_iters,
                                               num_iter=args.start_step)
    else:
        raise ValueError(f"lr_scheduler of type {args.lr_decay_style} is not supported yet.")

    return lr_scheduler
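

# For intuition, the "noam" style above is the schedule from "Attention Is All
# You Need": linear warmup followed by inverse-square-root decay. A reference
# implementation of that standard form (a sketch, defined for illustration
# only; bmtrain's exact scaling may differ):
def _noam_lr_reference(step, start_lr, warmup_iter):
    step = max(step, 1)
    return start_lr * min(step ** -0.5, step * warmup_iter ** -1.5) * warmup_iter ** 0.5
# e.g. _noam_lr_reference(100, 1e-3, 1000) == 1e-4  (still warming up)
#      _noam_lr_reference(1000, 1e-3, 1000) == 1e-3 (peak at the end of warmup)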


def setup_model_and_optimizer(args):
    # get the tokenizer
    tokenizer = get_tokenizer(args)
    # get the model
    model = get_model(args)
    bmt.synchronize()
    # get the optimizer and lr_scheduler
    optimizer = get_optimizer(args, model)
    lr_scheduler = get_learning_rate_scheduler(args, optimizer)
    bmt.synchronize()
    # report the memory usage
    bmt.print_rank("Model mem\n", torch.cuda.memory_summary())
    bmt.synchronize()
    return tokenizer, model, optimizer, lr_scheduler


def initialize():
    # get arguments
    args = get_args()
    # init bmtrain
    bmt.init_distributed(seed=args.seed)
    # init save folder
    if args.save is not None:
        os.makedirs(args.save, exist_ok=True)
    return args


def prepare_dataset(args, tokenizer, base_path, dataset_name, rank, world_size):
    splits = ['train', 'dev', 'test']
    dataset = {}
    for split in splits:
        dataset[split] = DATASET[dataset_name](base_path, split, rank, world_size, tokenizer,
                                               args.max_encoder_length, args.max_decoder_length)
    verbalizer = torch.LongTensor(DATASET[dataset_name].get_verbalizer(tokenizer)).cuda()
    return dataset, verbalizer
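

# The verbalizer maps each class label to a token id, so classification reduces
# to comparing the logits of just those tokens. A minimal self-contained sketch
# of the idea (defined for illustration only; shapes and token ids are
# hypothetical, not CPM2's actual values):
def _verbalizer_demo():
    vocab_logits = torch.randn(2, 4, 30000)   # (batch, seq_len, vocab_size)
    verbalizer = torch.tensor([1037, 2053])   # hypothetical "yes"/"no" token ids
    label_logits = vocab_logits.index_select(dim=-1, index=verbalizer)
    assert label_logits.shape == (2, 4, 2)    # vocab axis reduced to the two labels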


def finetune(args, tokenizer, model, optimizer, lr_scheduler, dataset, verbalizer):
    loss_func = bmt.loss.FusedCrossEntropy(ignore_index=-100)

    optim_manager = bmt.optim.OptimManager(loss_scale=args.loss_scale)
    optim_manager.add_optimizer(optimizer, lr_scheduler)

    dataloader = {
        "train": DistributedDataLoader(dataset['train'], batch_size=args.batch_size, shuffle=True),
        "dev": DistributedDataLoader(dataset['dev'], batch_size=args.batch_size, shuffle=False),
        "test": DistributedDataLoader(dataset['test'], batch_size=args.batch_size, shuffle=False),
    }

    for epoch in range(5):  # note: the epoch count is hard-coded; args.epochs only affects the lr schedule
        model.train()
        for it, data in enumerate(dataloader['train']):
            enc_input = data["enc_input"]
            enc_length = data["enc_length"]
            dec_input = data["dec_input"]
            dec_length = data["dec_length"]
            targets = data["targets"]
            index = data["index"]

            logits = model(enc_input, enc_length, dec_input, dec_length)
            # keep only the logits of the verbalizer tokens, at the positions marked by `index`
            logits = logits.index_select(dim=-1, index=verbalizer)
            logits = logits[torch.where(index == 1)]

            loss = loss_func(logits, targets)
            global_loss = bmt.sum_loss(loss).item()

            optim_manager.zero_grad()

            optim_manager.backward(loss)
            grad_norm = optim_manager.clip_grad_norm(optimizer.param_groups, args.clip_grad, norm_type=2)

            optim_manager.step()

            bmt.print_rank(
                "train | epoch {:3d} | Iter: {:6d}/{:6d} | loss: {:.4f} | lr: {:.4e}, scale: {:10.4f} | grad_norm: {:.4f} |".format(
                    epoch,
                    it,
                    len(dataloader["train"]),
                    global_loss,
                    lr_scheduler.current_lr,
                    int(optim_manager.loss_scale),
                    grad_norm,
                )
            )
            # if it % args.inspect_iters == 0: print_inspect(model, "*")
            # if args.save is not None and it % args.save_iters == 0:
            #     bmt.save(model, os.path.join(args.save, args.save_name + ("-%d.pt" % it)))

        model.eval()
        with torch.no_grad():
            acc = 0
            total = 0
            for it, data in enumerate(dataloader['dev']):
                enc_input = data["enc_input"]
                enc_length = data["enc_length"]
                dec_input = data["dec_input"]
                dec_length = data["dec_length"]
                targets = data["targets"]
                index = data["index"]

                logits = model(enc_input, enc_length, dec_input, dec_length)
                logits = logits.index_select(dim=-1, index=verbalizer)
                logits = logits[torch.where(index == 1)]
                logits = logits.argmax(dim=-1)

                acc += torch.sum(logits == targets).item()
                total += logits.shape[0]
                bmt.print_rank(
                    "dev | epoch {:3d} | Iter: {:6d}/{:6d} | acc: {:6d} | total: {:6d} |".format(
                        epoch,
                        it,
                        len(dataloader["dev"]),
                        acc,
                        total,
                    )
                )
            # aggregate the accuracy across all ranks
            acc = torch.tensor(acc / total).cuda()
            acc = bmt.sum_loss(acc).cpu().item()
            bmt.print_rank(f"dev epoch {epoch}: accuracy: {acc}")


def main():
    args = initialize()
    tokenizer, model, optimizer, lr_scheduler = setup_model_and_optimizer(args)
    dataset, verbalizer = prepare_dataset(
        args,
        tokenizer,
        f"{args.base_path}/down_data/paraphrase",
        args.dataset_name,
        bmt.rank(), bmt.world_size(),
    )
    finetune(args, tokenizer, model, optimizer, lr_scheduler, dataset, verbalizer)


if __name__ == "__main__":
    main()
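
# A hypothetical launch command (the script name and every flag spelling below
# are assumptions; check `get_args` in model_center for the real flag names).
# bmtrain runs one process per GPU, launched e.g. via torchrun:
#
#   torchrun --nproc_per_node=4 finetune_cpm2.py \
#       --model-config cpm2-large --dataset-name <task> --base-path ./data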

@ -0,0 +1,2 @@


@ -0,0 +1,182 @@
# Adapted from Tevatron (https://github.com/texttron/tevatron)

from argparse import ArgumentParser
import logging
import os
import sys
import torch.nn as nn

logger = logging.getLogger(__name__)


class UnitTest:
    def __init__(self, models):
        self.models = models

        # one config per delta type under test
        self.Configs = {
            0: {"delta_type": "lora"},
            1: {"delta_type": "bitfit"},
            2: {"delta_type": "adapter"},
            3: {"delta_type": "compacter"},
            4: {"delta_type": "prefix"},
            5: {"delta_type": "soft_prompt"},
            6: {"delta_type": "low_rank_adapter"},
        }

    def get_delta_config(self, config_id):
        return self.Configs[config_id]
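
    # For example (a sketch; `models` is the list passed to __init__):
    #   ut = UnitTest(models)
    #   ut.get_delta_config(3)   # -> {"delta_type": "compacter"}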

    def unitTest0(self, delta_config_dict):
        model = self.models[0]
        from opendelta import Visualization
        Visualization(model).structure_graph()

        from opendelta import AutoDeltaConfig, AutoDeltaModel

        delta_config = AutoDeltaConfig.from_dict(delta_config_dict)
        delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)

        # visualize the structure again to see the inserted delta modules
        Visualization(model).structure_graph()

    def unitTest1(self, delta_config_dict):
        # a wrapper holding two backbones, to test delta injection through
        # user-defined module trees
        class Mymodel(nn.Module):
            def __init__(self, a, b):
                super().__init__()
                self.a = a
                self.b = b

        model = Mymodel(self.models[0], self.models[1])
        from opendelta import Visualization
        Visualization(model).structure_graph()

        from opendelta import AutoDeltaConfig, AutoDeltaModel

        delta_config = AutoDeltaConfig.from_dict(delta_config_dict)
        delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)

        Visualization(model).structure_graph()
        # save once before and once after freezing the backbone
        delta_model.save_finetuned("./tmp")

        delta_model.freeze_module(exclude=['deltas'])
        delta_model.save_finetuned("./tmp")

        # rebuild the backbone and reload the delta from the saved checkpoint
        model = Mymodel(self.models[0], self.models[1])
        Visualization(model).structure_graph()
        delta_model = AutoDeltaModel.from_finetuned("./tmp", backbone_model=model)
        Visualization(model).structure_graph()

    def unit_test(self, test_id, config_id):
        delta_config_dict = self.Configs[config_id]
        if test_id == 0:
            self.unitTest0(delta_config_dict)
        elif test_id == 1:
            self.unitTest1(delta_config_dict)
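
# A minimal end-to-end usage sketch (the model name is just an example):
#   from transformers import AutoModel
#   import copy
#   base = AutoModel.from_pretrained("bert-base-cased")
#   ut = UnitTest([base, copy.deepcopy(base)])
#   ut.unit_test(test_id=1, config_id=0)   # wrap two backbones, apply LoRA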


from dataclasses import dataclass, field


@dataclass
class UnitTestArguments:
    """
    Arguments selecting which unit test to run and which backbone model to run it on.
    """
    config_id: int = field(
        default=0,
    )
    test_id: int = field(
        default=0,
    )
    model_name_or_path: str = field(
        default='bert-base-cased',
        metadata={"help": "tested: bert-base-cased, roberta-base, rinna/japanese-gpt2-small, t5-small, facebook/opt-125m"}
    )


from transformers import HfArgumentParser, TrainingArguments, AutoModel, GPT2Model


def main():
    parser = HfArgumentParser((TrainingArguments, UnitTestArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        training_args, unit_test_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        training_args, unit_test_args = parser.parse_args_into_dataclasses()
    training_args: TrainingArguments

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    model = AutoModel.from_pretrained(unit_test_args.model_name_or_path)

    import copy
    # two copies of the backbone, so tests can exercise multi-backbone wrappers
    models = [model, copy.deepcopy(model)]

    unit_test = UnitTest(models)

    unit_test.unit_test(unit_test_args.test_id, unit_test_args.config_id)


if __name__ == "__main__":
    main()
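
# A hypothetical invocation (the script name is an assumption; --output_dir
# comes from TrainingArguments, the rest from UnitTestArguments):
#
#   python unit_test.py --output_dir ./out --test_id 1 --config_id 0 \
#       --model_name_or_path t5-small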