move must_try to unittest, unitest to examples

shengdinghu 2022-10-16 13:46:24 +00:00
parent 085d388102
commit 62f03f0068
6 changed files with 454 additions and 0 deletions

View File

@@ -0,0 +1,70 @@
# use transformers as usual.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
t5_tokenizer = AutoTokenizer.from_pretrained("t5-large")
# A running example
input_ids = t5_tokenizer.encode("Is Harry Poter wrtten by JKrowling", return_tensors="pt")
t5_tokenizer.decode(t5.generate(input_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'
# use existing delta models
from opendelta import AutoDeltaModel, AutoDeltaConfig
# load an existing delta model from DeltaCenter
delta = AutoDeltaModel.from_finetuned("thunlp/Spelling_Correction_T5_LRAdapter_demo", backbone_model=t5)
# freeze the whole backbone model except the delta models.
delta.freeze_module()
# visualize the change
delta.log()
t5_tokenizer.decode(t5.generate(input_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>
# now save only the delta parameters, not the whole backbone model, to .tmp/
delta.save_finetuned(".tmp")
import os; os.listdir(".tmp")
# >>> The state dict size is 1.443 MB
# >>> We encourage users to push their final and public models to delta center to share them with the community!
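# (a hedged sketch: recent OpenDelta versions also let save_finetuned upload the delta
#  directly to DeltaCenter via push_to_dc=True; check your installed version first)
# delta.save_finetuned(".tmp", push_to_dc=True)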
# reload the delta model from the local path and attach it to a freshly loaded pre-trained T5.
t5 = AutoModelForSeq2SeqLM.from_pretrained("t5-large")
delta1 = AutoDeltaModel.from_finetuned(".tmp", backbone_model=t5)
import shutil; shutil.rmtree(".tmp") # don't forget to remove the tmp files.
t5_tokenizer.decode(t5.generate(input_ids)[0])
# >>> <pad> Is Harry Potter written by JK Rowling?</s>
# detach the delta models; the backbone returns to its unmodified state.
delta1.detach()
t5_tokenizer.decode(t5.generate(input_ids)[0])
# >>> '<pad><extra_id_0>? Is it Harry Potter?</s>'
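# (a hedged aside: OpenDelta deltas also expose attach() as the inverse of detach(),
#  so the same delta can be re-applied later without reloading from disk; verify
#  availability in your installed version)
# delta1.attach()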
# use the default configuration for customized wrapped models that contain PLMs inside. This is a common need for users.
import torch.nn as nn
class WrappedModel(nn.Module):
    def __init__(self, inner_model):
        super().__init__()
        self.inner = inner_model
    def forward(self, *args, **kwargs):
        return self.inner(*args, **kwargs)
wrapped_model = WrappedModel(WrappedModel(t5))
# say we use LoRA
delta_config = AutoDeltaConfig.from_dict({"delta_type":"lora"})
delta2 = AutoDeltaModel.from_config(delta_config, backbone_model=wrapped_model)
delta2.log()
# >>> root
# -- inner
#    -- inner
#       ...
#       ... lora_A:[8,1024], lora_B:[1024,8]
delta2.detach()
# use a non-default configuration
# say we add lora to the last four layers of the decoder of t5, with lora rank 5
delta_config3 = AutoDeltaConfig.from_dict({"delta_type":"lora", "modified_modules":[r"[r]decoder.*((20)|(21)|(22)|(23)).*DenseReluDense\.wi"], "lora_r":5})
delta3 = AutoDeltaModel.from_config(delta_config3, backbone_model=wrapped_model)
delta3.log()
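# note on the pattern above: the "[r]" prefix tells OpenDelta to interpret the rest of
# the string as a regular expression; here it matches the DenseReluDense.wi submodule of
# decoder blocks 20-23, i.e. the last four of t5-large's 24 decoder layers.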

View File

View File

@@ -0,0 +1,200 @@
import time
import random
import torch
import bmtrain as bmt
import numpy as np
import os
import csv
from model_center import get_args
from model_center.model import CPM2
from model_center.tokenizer import CPM2Tokenizer
from model_center.dataset.cpm2dataset import DATASET
from model_center.utils import print_inspect
from model_center.dataset import DistributedDataLoader
def get_tokenizer(args):
    tokenizer = CPM2Tokenizer.from_pretrained(args.model_config)
    return tokenizer

def get_model(args):
    model = CPM2.from_pretrained(args.model_config)
    return model

def get_optimizer(args, model):
    optimizer = bmt.optim.AdamOffloadOptimizer(model.parameters(), weight_decay=args.weight_decay)
    return optimizer
def get_learning_rate_scheduler(args, optimizer):
    if args.lr_decay_iters is None:
        args.lr_decay_iters = args.train_iters * args.epochs
    if args.lr_decay_style == "noam":
        lr_scheduler = bmt.lr_scheduler.Noam(optimizer,
                                             start_lr=args.lr,
                                             warmup_iter=args.warmup_iters,
                                             end_iter=args.lr_decay_iters,
                                             num_iter=args.start_step)
    elif args.lr_decay_style == "constant":
        lr_scheduler = bmt.lr_scheduler.NoDecay(optimizer,
                                                start_lr=args.lr,
                                                warmup_iter=args.warmup_iters,
                                                end_iter=-1,
                                                num_iter=args.start_step)
    elif args.lr_decay_style == "linear":
        lr_scheduler = bmt.lr_scheduler.Linear(optimizer,
                                               start_lr=args.lr,
                                               warmup_iter=args.warmup_iters,
                                               end_iter=args.lr_decay_iters,
                                               num_iter=args.start_step)
    elif args.lr_decay_style == "exponential":
        lr_scheduler = bmt.lr_scheduler.Exponential(optimizer,
                                                    start_lr=args.lr,
                                                    warmup_iter=args.warmup_iters,
                                                    end_iter=args.lr_decay_iters,
                                                    num_iter=args.start_step)
    elif args.lr_decay_style == "cosine":
        lr_scheduler = bmt.lr_scheduler.Cosine(optimizer,
                                               start_lr=args.lr,
                                               warmup_iter=args.warmup_iters,
                                               end_iter=args.lr_decay_iters,
                                               num_iter=args.start_step)
    else:
        raise ValueError(f"lr_scheduler of type {args.lr_decay_style} is not supported yet.")
    return lr_scheduler
def setup_model_and_optimizer(args):
    # get the tokenizer
    tokenizer = get_tokenizer(args)
    # get the model
    model = get_model(args)
    bmt.synchronize()
    # get the optimizer and lr_scheduler
    optimizer = get_optimizer(args, model)
    lr_scheduler = get_learning_rate_scheduler(args, optimizer)
    bmt.synchronize()
    # report the memory usage
    bmt.print_rank("Model mem\n", torch.cuda.memory_summary())
    bmt.synchronize()
    return tokenizer, model, optimizer, lr_scheduler
def initialize():
    # get arguments
    args = get_args()
    # init bmt
    bmt.init_distributed(seed=args.seed)
    # init save folder
    if args.save is not None:
        os.makedirs(args.save, exist_ok=True)
    return args
def prepare_dataset(args, tokenizer, base_path, dataset_name, rank, world_size):
    splits = ['train', 'dev', 'test']
    dataset = {}
    for split in splits:
        dataset[split] = DATASET[dataset_name](base_path, split, rank, world_size, tokenizer, args.max_encoder_length, args.max_decoder_length)
    verbalizer = torch.LongTensor(DATASET[dataset_name].get_verbalizer(tokenizer)).cuda()
    return dataset, verbalizer
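# note: the verbalizer holds the vocabulary ids of the dataset's label words;
# finetune() below restricts the logits to these ids (via index_select), so
# classification reduces to comparing the logits of the label words.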
def finetune(args, tokenizer, model, optimizer, lr_scheduler, dataset, verbalizer):
    loss_func = bmt.loss.FusedCrossEntropy(ignore_index=-100)
    optim_manager = bmt.optim.OptimManager(loss_scale=args.loss_scale)
    optim_manager.add_optimizer(optimizer, lr_scheduler)

    dataloader = {
        "train": DistributedDataLoader(dataset['train'], batch_size=args.batch_size, shuffle=True),
        "dev": DistributedDataLoader(dataset['dev'], batch_size=args.batch_size, shuffle=False),
        "test": DistributedDataLoader(dataset['test'], batch_size=args.batch_size, shuffle=False),
    }

    for epoch in range(args.epochs):
        model.train()
        for it, data in enumerate(dataloader['train']):
            enc_input = data["enc_input"]
            enc_length = data["enc_length"]
            dec_input = data["dec_input"]
            dec_length = data["dec_length"]
            targets = data["targets"]
            index = data["index"]

            logits = model(enc_input, enc_length, dec_input, dec_length)
            # keep only the logits of the verbalizer's label-word ids
            logits = logits.index_select(dim=-1, index=verbalizer)
            logits = logits[torch.where(index == 1)]
            loss = loss_func(logits, targets)
            global_loss = bmt.sum_loss(loss).item()

            optim_manager.zero_grad()
            optim_manager.backward(loss)
            grad_norm = optim_manager.clip_grad_norm(optimizer.param_groups, args.clip_grad, norm_type=2)
            optim_manager.step()

            bmt.print_rank(
                "train | epoch {:3d} | Iter: {:6d}/{:6d} | loss: {:.4f} | lr: {:.4e}, scale: {:10.4f} | grad_norm: {:.4f} |".format(
                    epoch,
                    it,
                    len(dataloader["train"]),
                    global_loss,
                    lr_scheduler.current_lr,
                    int(optim_manager.loss_scale),
                    grad_norm,
                )
            )
            # if it % args.inspect_iters == 0: print_inspect(model, "*")
            # if args.save is not None and it % args.save_iters == 0:
            #     bmt.save(model, os.path.join(args.save, args.save_name + ("-%d.pt" % it)))

        model.eval()
        with torch.no_grad():
            acc = 0
            total = 0
            for it, data in enumerate(dataloader['dev']):
                enc_input = data["enc_input"]
                enc_length = data["enc_length"]
                dec_input = data["dec_input"]
                dec_length = data["dec_length"]
                targets = data["targets"]
                index = data["index"]

                logits = model(enc_input, enc_length, dec_input, dec_length)
                logits = logits.index_select(dim=-1, index=verbalizer)
                logits = logits[torch.where(index == 1)]
                logits = logits.argmax(dim=-1)

                acc += torch.sum(logits == targets).item()
                total += logits.shape[0]
                bmt.print_rank(
                    "dev | epoch {:3d} | Iter: {:6d}/{:6d} | acc: {:6d} | total: {:6d} |".format(
                        epoch,
                        it,
                        len(dataloader["dev"]),
                        acc,
                        total,
                    )
                )
            # aggregate the per-rank accuracy across data-parallel workers
            acc = torch.tensor(acc / total).cuda()
            acc = bmt.sum_loss(acc).cpu().item()
            bmt.print_rank(f"dev epoch {epoch}: accuracy: {acc}")
def main():
    args = initialize()
    tokenizer, model, optimizer, lr_scheduler = setup_model_and_optimizer(args)
    dataset, verbalizer = prepare_dataset(
        args,
        tokenizer,
        f"{args.base_path}/down_data/paraphrase",
        args.dataset_name,
        bmt.rank(), bmt.world_size(),
    )
    finetune(args, tokenizer, model, optimizer, lr_scheduler, dataset, verbalizer)

if __name__ == "__main__":
    main()
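# a hedged launch sketch (hypothetical: the script name, config name, dataset name and
# flag spellings depend on model_center's get_args and your setup; bmtrain scripts are
# typically launched with torchrun):
#   torchrun --nnodes=1 --nproc_per_node=4 finetune_cpm2.py \
#       --model-config cpm2-large --dataset-name LCQMC \
#       --base-path ./data --batch-size 32 --lr 1e-3 --lr-decay-style cosine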

View File

View File

@@ -0,0 +1,2 @@

View File

@@ -0,0 +1,182 @@
# Adapted from Tevatron (https://github.com/texttron/tevatron)
import logging
import os
import sys
import torch.nn as nn

logger = logging.getLogger(__name__)
class UnitTest:
    def __init__(self, models):
        self.models = models
        self.Configs = {}
        self.Configs[0] = {
            "delta_type": "lora",
        }
        self.Configs[1] = {
            "delta_type": "bitfit",
        }
        self.Configs[2] = {
            "delta_type": "adapter",
        }
        self.Configs[3] = {
            "delta_type": "compacter",
        }
        self.Configs[4] = {
            "delta_type": "prefix",
        }
        self.Configs[5] = {
            "delta_type": "soft_prompt",
        }
        self.Configs[6] = {
            "delta_type": "low_rank_adapter",
        }

    def get_delta_config(self, config_id):
        return self.Configs[config_id]
    def unitTest0(self, delta_config_dict):
        model = self.models[0]
        from opendelta import Visualization
        Visualization(model).structure_graph()

        from opendelta import AutoDeltaConfig, AutoDeltaModel
        delta_config = AutoDeltaConfig.from_dict(delta_config_dict)
        delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
        Visualization(model).structure_graph()
    def unitTest1(self, delta_config_dict):
        class Mymodel(nn.Module):
            def __init__(self, a, b):
                super().__init__()
                self.a = a
                self.b = b

        model = Mymodel(self.models[0], self.models[1])
        from opendelta import Visualization
        Visualization(model).structure_graph()

        from opendelta import AutoDeltaConfig, AutoDeltaModel
        delta_config = AutoDeltaConfig.from_dict(delta_config_dict)
        delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
        Visualization(model).structure_graph()

        delta_model.save_finetuned("./tmp")
        delta_model.freeze_module(exclude=['deltas'])
        delta_model.save_finetuned("./tmp")

        model = Mymodel(self.models[0], self.models[1])
        Visualization(model).structure_graph()
        delta_model = AutoDeltaModel.from_finetuned("./tmp", backbone_model=model)
        Visualization(model).structure_graph()
    def unit_test(self, test_id, config_id):
        delta_config_dict = self.Configs[config_id]
        if test_id == 0:
            self.unitTest0(delta_config_dict)
        elif test_id == 1:
            self.unitTest1(delta_config_dict)
from dataclasses import dataclass, field

@dataclass
class UnitTestArguments:
    """
    Arguments specifying which delta configuration and unit test to run, and which backbone model to load.
    """
    config_id: int = field(
        default=0,
    )
    test_id: int = field(
        default=0,
    )
    model_name_or_path: str = field(
        default='bert-base-cased',
        metadata={"help": "tested: bert-base-cased, roberta-base, rinna/japanese-gpt2-small, t5-small, facebook/opt-125m"}
    )
from transformers import HfArgumentParser, TrainingArguments, AutoModel

def main():
    parser = HfArgumentParser((TrainingArguments, UnitTestArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        training_args, unit_test_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        training_args, unit_test_args = parser.parse_args_into_dataclasses()
    training_args: TrainingArguments

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    model = AutoModel.from_pretrained(unit_test_args.model_name_or_path)
    import copy
    # two identical backbones: unitTest1 wraps both inside a custom nn.Module
    models = [model, copy.deepcopy(model)]

    unit_test = UnitTest(models)
    unit_test.unit_test(unit_test_args.test_id, unit_test_args.config_id)

if __name__ == "__main__":
    main()
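# example invocation (a sketch: "unit_test.py" stands for whatever this file is saved as;
# --output_dir is required by HfArgumentParser's TrainingArguments, the remaining flags
# come from UnitTestArguments above):
#   python unit_test.py --output_dir ./unittest_output \
#       --model_name_or_path t5-small --test_id 1 --config_id 0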