npu adapted
parent 067eed2304
commit 561c1de0e5
@@ -3,6 +3,7 @@ import copy
 PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
 # PATHBASE="/home/hushengding/plm_cache/"
+PATHBASE="/home/guozr/Downloads/"
 
 AllConfigs = {}
 
@@ -50,6 +51,7 @@ BaseConfigs['t5-base'] = {
     "save_strategy": "steps",
     "datasets_load_from_disk": True,
     "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
+    "datasets_saved_path": f"{PATHBASE}huggingface_datasets/saved_to_disk/",
     "backbone_model": "t5", # use in delta center,
     "model_path_public": "t5-base", # use in delta center,
@@ -366,7 +366,7 @@ class SuperGLUECB(AbstractTask):
         if offline == '1':
             return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.cb")[split]
         else:
-            return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master")
+            return datasets.load_dataset('super_glue', 'cb', split=split)
 
 
 class SuperGLUECOPA(AbstractTask):
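
The offline branch above expects a dataset snapshot that was previously produced with the `datasets` library's save_to_disk. A minimal sketch of preparing such a snapshot on a machine with internet access (illustrative, not from this repository; the target path should match datasets_saved_path):

    from datasets import load_dataset

    raw = load_dataset("super_glue", "cb")  # DatasetDict with train/validation/test splits; needs internet once
    raw.save_to_disk("huggingface_datasets/saved_to_disk/super_glue.cb")  # later read back with load_from_disk(...)[split]
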
@@ -0,0 +1,357 @@
+# coding=utf-8
+# Copyright OpenDelta Team and THUNLP lab. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""
|
||||||
|
A unified runing scripts for most models to do down stream tasks in a
|
||||||
|
prompt learning fashion, i.e., No classification head, all tasks are casted
|
||||||
|
to mask prediction or span prediction tasks.
|
||||||
|
|
||||||
|
Processing relevant to different backbone models are stored in ../backbones/
|
||||||
|
|
||||||
|
Adding A few lines to integrate the Delta tuning methods.
|
||||||
|
|
||||||
|
You can also adapt this script on your own tasks.
|
||||||
|
"""
|
||||||
|
|
||||||
|
+
+import os
+import sys
+
+os.environ['MKL_THREADING_LAYER'] = 'GNU'
+os.environ['MKL_SERVICE_FORCE_INTEL'] = '1'
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+sys.path.append(os.path.join(os.getcwd(), "../"))
+# sys.path.append(os.path.join(os.getcwd(), "/mnt/sfs_turbo/zhangzhen/OpenDelta"))
+sys.path.append(os.path.join(os.getcwd()))
+os.environ['ASCEND_RT_VISIBLE_DEVICES'] = '0'  # https://support.huaweicloud.com/bestpractice-modelarts/modelarts_10_4007.html
+
+import functools
+import logging
+import torch
+import json
+import numpy as np
+
+import transformers
+from transformers import (
+    AutoConfig,
+    AutoModelForMaskedLM,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    DataCollatorForSeq2Seq,
+    # HfArgumentParser,
+    # MBartTokenizer,
+    # default_data_collator,
+    Trainer,
+    Seq2SeqTrainer,
+    set_seed,
+)
+from transformers.trainer_utils import is_main_process, get_last_checkpoint
+
+from data_processors import AutoTask  # , TaskDataCollatorForSeq2Seq, AutoPostProcessor, data_collator
+from utils import read_json, save_json
+from utils.args import ModelArguments, TrainingArguments, DataTrainingArguments, DeltaArguments, RemainArgHfArgumentParser
+import torch_npu
+import transfer_to_npu
+
+logger = logging.getLogger(__name__)
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+    parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, DeltaArguments))
+
+    # You can provide a json file which contains the arguments and use --argument some_arg to override or append to the json file.
+    json_file, cmd_args = (os.path.abspath(sys.argv[1]), sys.argv[2:]) if sys.argv[1].endswith(".json") else (None, sys.argv[1:])
+    model_args, data_args, training_args, delta_args, remain_args = parser.parse_json_file_with_cmd_args(json_file=json_file, command_line_args=cmd_args)
+    logger.warning("The following arguments not used! {}".format(remain_args))
+
+    logger.info(f"The results will be used in {training_args.output_dir}/results.json")
+    # exit()
+    # Detecting last checkpoint.
+    last_checkpoint = None
+    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
+        last_checkpoint = get_last_checkpoint(training_args.output_dir)
+        print("#### last_checkpoint ", last_checkpoint)
+        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
+            '''
+            raise ValueError(
+                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
+                "Use --overwrite_output_dir to overcome."
+            )
+            '''
+            pass
+        elif last_checkpoint is not None:
+            logger.info(
+                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
+                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
+            )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        handlers=[logging.StreamHandler(sys.stdout)],
+    )
+    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)
+
+    # Log on each process the small summary:
+    logger.warning(
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
+    )
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    if is_main_process(training_args.local_rank):
+        transformers.utils.logging.set_verbosity_info()
+    # logger.info("Training/evaluation parameters %s", training_args, model_args, data_args, delta_args)
+    logger.info("{}\n{}\n{}\n{}".format(training_args, model_args, data_args, delta_args))
+
+
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+
+
+    if os.path.basename(model_args.model_name_or_path).startswith("t5") \
+        or os.path.basename(model_args.model_name_or_path).startswith("long-t5"):
+        from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.t5 import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"):
+        from examples_prompt.backbones.blenderbot import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.blenderbot import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \
+        or os.path.basename(model_args.model_name_or_path).startswith("bert") \
+        or os.path.basename(model_args.model_name_or_path).startswith("albert") \
+        or os.path.basename(model_args.model_name_or_path).startswith("xlm-roberta") \
+        or os.path.basename(model_args.model_name_or_path).startswith("deberta"):
+        from examples_prompt.backbones.bert import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.bert import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("beit"):
+        from examples_prompt.backbones.beit import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.beit import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("bart"):
+        from examples_prompt.backbones.bart import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.bart import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("bigbird"):
+        from examples_prompt.backbones.bigbird import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.bigbird import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("clip"):
+        from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.clip import Trainer, DataCollator
+    elif os.path.basename(model_args.model_name_or_path).startswith("opt") \
+        or os.path.basename(model_args.model_name_or_path).startswith("gpt"):
+        from examples_prompt.backbones.opt import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts
+        from examples_prompt.backbones.opt import Trainer, DataCollator
+
+
+    config, tokenizer, model = get_backbone(model_args=model_args)
+
+    # model parallelize
+    if hasattr(training_args, "model_parallel") and training_args.model_parallel:
+        logger.info('parallelize model!')
+        model.parallelize()
+
+    from bigmodelvis import Visualization
+    Visualization(model).structure_graph()
+
+    if delta_args.delta_type.lower() != "none":
+        from opendelta import AutoDeltaConfig, AutoDeltaModel
+        from dataclasses import asdict
+        delta_config = AutoDeltaConfig.from_dict(asdict(delta_args))
+        delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
+        delta_model.freeze_module(set_state_dict=True)
+        delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True)
+
+
+    performance_metrics = {}
+
+
+    non_empty_splits_names = []
+    if training_args.do_train:
+        non_empty_splits_names.append("train")
+    if training_args.do_eval:
+        non_empty_splits_names.append("eval")
+    if training_args.do_test:
+        non_empty_splits_names.append("test")
+    splits = {}
+    for split_name in ['train', 'eval', 'test']:
+        if split_name not in non_empty_splits_names:
+            splits[split_name] = None
+            continue
+
+        task = AutoTask.get(data_args.task_name,
+                            data_args.dataset_config_name,
+                            data_args=data_args,
+                            seed=data_args.data_sample_seed)
+
+        dataset = task.get(split=split_name,
+                           split_validation_test=training_args.split_validation_test,
+                           n_obs=data_args.max_train_samples)
+
+        template, _verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, data_args)
+
+        dataset = dataset.map(
+            functools.partial(preprocess_function,
+                              data_args=data_args,
+                              tokenizer=tokenizer,
+                              template=template,
+                              verbalizer=_verbalizer,
+                              tokenizer_wrapper=tokenizer_wrapper,
+                              split=split_name),
+            batched=False,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=get_remove_columns(list(dataset.features.keys())),
+            load_from_cache_file=not data_args.overwrite_cache,
+        )
+        # from IPython import embed; embed()
+        splits[split_name] = dataset
+        if split_name == "eval":
+            eval_task = task
+            verbalizer = _verbalizer
+
+
+    trainer = Trainer(
+        model=model,
+        verbalizer=verbalizer,
+        eval_task=eval_task,
+        args=training_args,
+        train_dataset=splits['train'],
+        eval_dataset=splits['eval'],
+        tokenizer=tokenizer,
+        data_collator=DataCollator(tokenizer),
+    )
+
+
+    def save_training_config(config_file, output_dir):
+        json_data = read_json(config_file)
+        save_json(os.path.join(output_dir, "training_config.json"), json_data)
+
+    # Saves training config.
+    if trainer.is_world_process_zero():
+        save_training_config(sys.argv[1], training_args.output_dir)
+
+    # Training
+    if training_args.do_train:
+        checkpoint = None
+        if training_args.resume_from_checkpoint is not None:
+            checkpoint = training_args.resume_from_checkpoint
+        elif last_checkpoint is not None:
+            checkpoint = last_checkpoint
+
+        if training_args.compute_time:
+            torch.cuda.synchronize()  # wait for move to complete
+            start = torch.cuda.Event(enable_timing=True)
+            end = torch.cuda.Event(enable_timing=True)
+            start.record()
+
+        train_result = trainer.train(resume_from_checkpoint=checkpoint)
+
+        if training_args.compute_time:
+            end.record()
+            torch.cuda.synchronize()  # wait for all_reduce to complete
+            total_time = start.elapsed_time(end)/(1000*60)
+            performance_metrics.update({"total_time in minutes ": total_time})
+
+        trainer.save_model()  # Saves the tokenizer too for easy upload
+        train_metrics = train_result.metrics
+        max_train_samples = (
+            data_args.max_train_samples if data_args.max_train_samples is not None else len(splits['train'])
+        )
+        train_metrics["train_samples"] = min(max_train_samples, len(splits['train']))
+        trainer.log_metrics("train", train_metrics)
+        trainer.save_metrics("train", train_metrics)
+        trainer.save_state()
+
+        if torch.cuda.is_available() and training_args.compute_memory:
+            peak_memory = (torch.cuda.max_memory_allocated() / 1024 ** 2)/1000
+            performance_metrics.update({"peak_memory": peak_memory})
+        if training_args.compute_memory or training_args.compute_time:
+            logger.info("Efficiency Statistics {}".format(performance_metrics))
+            trainer.save_metrics("performance", performance_metrics)
+
+    # Evaluation
+    all_results = {}
+
+    all_results['evaluate'] = {}
+
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        metrics = trainer.evaluate(eval_dataset=splits['eval'],
+                                   )
+        trainer.log_metrics(f"{data_args.task_name}_eval", metrics)
+        trainer.save_metrics(f"{data_args.task_name}_eval", metrics)
+        all_results['evaluate'][data_args.task_name] = metrics
+
+    # Test
+    all_results['test'] = {}
+    if training_args.do_test:
+        logger.info("*** Test ***")
+        metrics = trainer.evaluate(eval_dataset=splits['test'],
+                                   metric_key_prefix="test"
+                                   )
+        trainer.log_metrics(f"{data_args.task_name}_test", metrics)
+        trainer.save_metrics(f"{data_args.task_name}_test", metrics)
+        all_results['test'][data_args.task_name] = metrics
+
+    # from opendelta.utils.delta_hub import create_hub_repo_name
+    # from opendelta.utils.delta_center import create_delta_center_args, create_repo_name
+
+    # repo_name = create_hub_repo_name(root="DeltaHub",
+    #                                  dataset=data_args.task_name,
+    #                                  delta_type=delta_args.delta_type,
+    #                                  model_name_or_path=model_args.model_name_or_path)
+
+    # center_args =
+    # repo_name = create_repo_name(prefix="", center_args=center_args)
+    # all_results['repo_name'] = repo_name
+
+
+    delta_model.save_finetuned(finetuned_delta_path=delta_args.finetuned_delta_path,
+                               push_to_dc=training_args.push_to_dc,
+                               center_args={"test_performance": all_results['test'][data_args.task_name]['test_average_metrics'],
+                                            },
+                               center_args_pool={**vars(model_args), **vars(data_args), **vars(training_args), **vars(delta_args)},
+                               list_tags=['NLI'],
+                               dict_tags={'purpose': 'for testing'},
+                               delay_push=True,
+                               test_result=all_results['test']
+                               )
+
+
+    with open(f"{training_args.output_dir}/results.json", 'w') as fout:
+        string = json.dumps(all_results, indent=4, sort_keys=True)
+        fout.write(string+"\n")
+
+    return all_results
+
+
+if __name__ == "__main__":
+    result = main()
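
The new script targets Ascend hardware: it pins the visible device with ASCEND_RT_VISIBLE_DEVICES and imports torch_npu together with a local transfer_to_npu module whose contents are not shown in this diff. A hedged sketch of the usual torch_npu pattern, assuming the torch_npu.contrib.transfer_to_npu shim that remaps torch.cuda calls to the NPU backend (availability depends on your torch_npu version):

    import os
    os.environ.setdefault("ASCEND_RT_VISIBLE_DEVICES", "0")   # analogous to CUDA_VISIBLE_DEVICES

    import torch
    import torch_npu                                # Ascend PyTorch adapter; registers the "npu" device
    from torch_npu.contrib import transfer_to_npu   # assumption: patches torch.cuda.* calls to run on NPU

    print(torch.npu.is_available())                 # sanity check that an Ascend device is visible
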
@@ -2,7 +2,7 @@ from transformers import AutoModelForSequenceClassification
 model = AutoModelForSequenceClassification.from_pretrained("roberta-base")
 # suppose we load BART
 
-from opendelta import Visualization
+from bigmodelvis import Visualization
 print("before modify")
 Visualization(model).structure_graph()
 
@@ -0,0 +1,162 @@
+"""
+This tutorial is a copy of OpenPrompt's tutorial/1.1_mixed_template.py
+The only modification is in lines 98 to 102
+
+1. OpenPrompt provides pre-processing of data, such as prompt template formatting
+2. OpenPrompt pre-processes the model input, such as: prompt soft embedding
+3. OpenDelta modifies the backbone model, such as: Adapter, Lora, Compactor, etc.
+4. OpenPrompt post-processes the model output, such as: extracting logits at the <mask> position, applying the prompt verbalizer
+"""
+
+# load dataset
+from datasets import load_dataset
+from datasets import load_from_disk
+raw_dataset = load_dataset('super_glue', 'cb',
+                           # cache_dir="../datasets/.cache/huggingface_datasets"
+                           )
+# raw_dataset = load_from_disk("/home/hx/huggingface_datasets/saved_to_disk/super_glue.cb")
+# Note that if you are running this script inside a GPU cluster, chances are you are not able to connect to the huggingface website directly.
+# In this case, we recommend you to run `raw_dataset = load_dataset(...)` on some machine that has an internet connection.
+# Then use the `raw_dataset.save_to_disk(path)` method to save to a local path.
+# Thirdly, upload the saved content onto the machine in the cluster.
+# Then use the `load_from_disk` method to load the dataset.
+
+from openprompt.data_utils import InputExample
+
+dataset = {}
+for split in ['train', 'validation', 'test']:
+    dataset[split] = []
+    for data in raw_dataset[split]:
+        input_example = InputExample(text_a=data['premise'], text_b=data['hypothesis'], label=int(data['label']), guid=data['idx'])
+        dataset[split].append(input_example)
+print(dataset['train'][0])
+
+# You can load the plm related things provided by openprompt simply by calling:
+from openprompt.plms import load_plm
+plm, tokenizer, model_config, WrapperClass = load_plm("t5", "t5-base")
+
+# Constructing Template
+# A template can be constructed from the yaml config, but it can also be constructed by directly passing arguments.
+from openprompt.prompts import MixedTemplate
+template_text = '{"placeholder":"text_a"} {"soft"} {"soft"} {"soft"} {"placeholder":"text_b"}? {"soft"} {"soft"} {"soft"} {"mask"}.'
+mytemplate = MixedTemplate(model=plm, tokenizer=tokenizer, text=template_text)
+
+# To better understand how the template wraps the example, we visualize one instance.
+
+wrapped_example = mytemplate.wrap_one_example(dataset['train'][0])
+print(wrapped_example)
+
+# Now, the wrapped example is ready to be passed into the tokenizer, hence producing the input for the language model.
+# You can use the tokenizer to tokenize the input by yourself, but we recommend using our wrapped tokenizer, which is a wrapped tokenizer tailored for InputExample.
+# The wrapper has been given if you use our `load_plm` function; otherwise, you should choose the suitable wrapper based on
+# the configuration in `openprompt.plms.__init__.py`.
+# Note that when t5 is used for classification, we only need to pass <pad> <extra_id_0> <eos> to the decoder.
+# The loss is calculated at <extra_id_0>. Thus passing decoder_max_length=3 saves space.
+wrapped_t5tokenizer = WrapperClass(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer, truncate_method="head")
+# or
+from openprompt.plms import T5TokenizerWrapper
+wrapped_t5tokenizer = T5TokenizerWrapper(max_seq_length=128, decoder_max_length=3, tokenizer=tokenizer, truncate_method="head")
+
+# You can see what a tokenized example looks like by
+tokenized_example = wrapped_t5tokenizer.tokenize_one_example(wrapped_example, teacher_forcing=False)
+print(tokenized_example)
+print(tokenizer.convert_ids_to_tokens(tokenized_example['input_ids']))
+print(tokenizer.convert_ids_to_tokens(tokenized_example['decoder_input_ids']))
+
+# Now it's time to convert the whole dataset into the input format!
+# Simply loop over the dataset to achieve it!
+
+model_inputs = {}
+for split in ['train', 'validation', 'test']:
+    model_inputs[split] = []
+    for sample in dataset[split]:
+        tokenized_example = wrapped_t5tokenizer.tokenize_one_example(mytemplate.wrap_one_example(sample), teacher_forcing=False)
+        model_inputs[split].append(tokenized_example)
+
+
+# We provide a `PromptDataLoader` class to help you do all the above matters and wrap them into a `torch.DataLoader`-style iterator.
+from openprompt import PromptDataLoader
+
+train_dataloader = PromptDataLoader(dataset=dataset["train"], template=mytemplate, tokenizer=tokenizer,
+                                    tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
+                                    batch_size=4, shuffle=True, teacher_forcing=False, predict_eos_token=False,
+                                    truncate_method="head")
+
+
+# Define the verbalizer
+# In classification, you need to define your verbalizer, which is a mapping from logits on the vocabulary to the final label probability. Let's have a look at the verbalizer details:
+
+from openprompt.prompts import ManualVerbalizer
+import torch
+
+# for example the verbalizer contains multiple label words in each class
+myverbalizer = ManualVerbalizer(tokenizer, num_classes=3, label_words=[["yes"], ["no"], ["maybe"]])
+
+print("label_words_ids", myverbalizer.label_words_ids)
+
+# Although you can manually combine the plm, template, verbalizer together, we provide a pipeline
+# model which takes the batched data from the PromptDataLoader and produces class-wise logits
+
+from opendelta import LoraModel
+# delta_model = LoraModel(backbone_model=plm, modified_modules=[])
+delta_model = LoraModel(backbone_model=plm, modified_modules=["SelfAttention.q", "SelfAttention.v"])
+delta_model.freeze_module(exclude=["deltas"], set_state_dict=True)
+delta_model.log()
+
+from openprompt import PromptForClassification
+
+use_npu = True
+prompt_model = PromptForClassification(plm=plm, template=mytemplate, verbalizer=myverbalizer)
+if use_npu:
+    prompt_model = prompt_model.npu()
+
+# Now the training is standard
+from transformers import AdamW, get_linear_schedule_with_warmup
+loss_func = torch.nn.CrossEntropyLoss()
+no_decay = ['bias', 'LayerNorm.weight']
+# it's always good practice to set no decay to bias and LayerNorm parameters
+optimizer_grouped_parameters = [
+    {'params': [p for n, p in prompt_model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+    {'params': [p for n, p in prompt_model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+]
+print([n for n, p in prompt_model.named_parameters()])
+
+optimizer = AdamW(optimizer_grouped_parameters, lr=1e-4)
+
+for epoch in range(30):
+    tot_loss = 0
+    for step, inputs in enumerate(train_dataloader):
+        if use_npu:
+            # The inputs instance is of type InputFeature, which inherits from dict.
+            # The to() method can move it to other devices. The cuda() method is a wrapper for to(), specifically moving to CUDA devices.
+            # If you want to move it to an NPU device, you can directly use the underlying to() method.
+            inputs = inputs.to("npu")
+        delta_model.log()
+        logits = prompt_model(inputs)
+        labels = inputs['label']
+        loss = loss_func(logits, labels)
+        loss.backward()
+        tot_loss += loss.item()
+        optimizer.step()
+        optimizer.zero_grad()
+        if step % 100 == 1:
+            print("Epoch {}, average loss: {}".format(epoch, tot_loss/(step+1)), flush=True)
+
+# Evaluate
+validation_dataloader = PromptDataLoader(dataset=dataset["validation"], template=mytemplate, tokenizer=tokenizer,
+                                         tokenizer_wrapper_class=WrapperClass, max_seq_length=256, decoder_max_length=3,
+                                         batch_size=4, shuffle=False, teacher_forcing=False, predict_eos_token=False,
+                                         truncate_method="head")
+
+allpreds = []
+alllabels = []
+for step, inputs in enumerate(validation_dataloader):
+    if use_npu:
+        inputs = inputs.to("npu")
+    logits = prompt_model(inputs)
+    labels = inputs['label']
+    alllabels.extend(labels.cpu().tolist())
+    allpreds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
+
+acc = sum([int(i == j) for i, j in zip(allpreds, alllabels)])/len(allpreds)
+print(acc)
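
As the comments in the training loop above note, an OpenPrompt InputFeature inherits from dict and .cuda() is just a wrapper over .to(), so NPU tensors are obtained by calling .to("npu") directly. A device-agnostic sketch of the same move (illustrative, reusing the names defined in the tutorial):

    # Pick one device string and reuse it for the model and every batch.
    device = "npu" if use_npu else ("cuda" if torch.cuda.is_available() else "cpu")
    prompt_model = prompt_model.to(device)

    for step, inputs in enumerate(train_dataloader):
        inputs = inputs.to(device)   # InputFeature supports .to(); .cuda() is a thin wrapper around it
        logits = prompt_model(inputs)
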
@@ -3,9 +3,20 @@ import torch
 import torch.nn as nn
 from typing import Optional
 import opendelta.utils.logging as logging
+import importlib
+
 logger = logging.get_logger(__name__)
+
+
+def is_torch_npu_available():
+    if importlib.util.find_spec("torch_npu") is None:
+        return False
+
+    import torch
+    import torch_npu
+
+    return hasattr(torch, "npu") and torch.npu.is_available()
+
+
 def inspect_module_statistics(module: Optional[nn.Module]=None, verbose=True):
     r"""Get the statistics of the parameters in the delta modules.
@@ -34,9 +45,14 @@ def inspect_module_statistics(module: Optional[nn.Module]=None, verbose=True):
 
     cudamem = 0
     maxcudamem = 0
-    for device_id in range(torch.cuda.device_count()):
-        cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3
-        maxcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3
+    if is_torch_npu_available():
+        for device_id in range(torch.npu.device_count()):
+            cudamem += torch.npu.memory_allocated(f"npu:{device_id}")/1024**3
+            maxcudamem += torch.npu.max_memory_allocated(f"npu:{device_id}")/1024**3
+    else:
+        for device_id in range(torch.cuda.device_count()):
+            cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3
+            maxcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3
 stat['cudamem'] = cudamem
 stat['maxcudamem'] = maxcudamem
 
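
With is_torch_npu_available() in place, the statistics above read accelerator memory from torch.npu when an Ascend device is usable and from torch.cuda otherwise. A compact sketch of the same dispatch, using only the calls that appear in the hunk (illustrative, not the committed code):

    def accelerator_memory_gib():
        # Prefer Ascend counters when torch_npu is usable, otherwise fall back to CUDA.
        mod, prefix = (torch.npu, "npu") if is_torch_npu_available() else (torch.cuda, "cuda")
        used = sum(mod.memory_allocated(f"{prefix}:{i}") for i in range(mod.device_count())) / 1024**3
        peak = sum(mod.max_memory_allocated(f"{prefix}:{i}") for i in range(mod.device_count())) / 1024**3
        return used, peak
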
@@ -1,5 +1,5 @@
 torch>=1.8.0
-transformers>=4.10.0
+transformers>=4.10.0,<=4.27.1
 datasets>=1.17.0
 sentencepiece>=0.1.96
 tqdm>=4.62.2