diff --git a/examples/examples_prompt/backbones/bart.py b/examples/examples_prompt/backbones/bart.py new file mode 100644 index 0000000..bab8303 --- /dev/null +++ b/examples/examples_prompt/backbones/bart.py @@ -0,0 +1,178 @@ + +from openpromptu.data_utils import InputExample +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +from transformers import ( + AutoConfig, + AutoModelForSeq2SeqLM, + AutoTokenizer, +) +from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import torch + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.mask_token + +def get_remove_columns(dataset_features): + return dataset_features + +def preprocess_function(raw_example, **kwargs): + # max_target_length += 1 + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + split = kwargs['split'] + example = InputExample(**raw_example) + + + try: + example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=256, + padding="max_length", truncation=True) + except: + from IPython import embed; embed(header="Therer") + + with tokenizer.as_target_tokenizer(): + label = tokenizer(other['tgt_text']).input_ids + + model_inputs["labels"] = label + return model_inputs + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + + model = AutoModelForSeq2SeqLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + return config, tokenizer, model + + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + outputs = model(**inputs) + if return_outputs: + return (outputs.loss, outputs) + else: + return outputs.loss + + def prediction_step( + self, + model, 
#nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + gen_kwargs = { + "max_length": 10, # self._max_length if s is not None else self.model.config.max_length, + "num_beams": 1 #self._num_beams if self._num_beams is not None else self.model.config.num_beams, + } + generated_tokens = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + + with torch.no_grad(): + + outputs = model(**inputs) + if has_labels: + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + labels = inputs["labels"] + if labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + + # from IPython import embed; embed(header="In seqseqtrainer") + return (loss, generated_tokens, labels) + + def _compute_metrics(self, eval_preds): + # from IPython import embed; embed(header="In compute metrics") + preds, labels = eval_preds + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + # post_processor = .get(data_args.dataset_name[0], tokenizer, + # data_args.ignore_pad_token_for_loss) + # decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info) + result = {} + for metric in self.eval_task.metric: + result.update(metric(decoded_preds, decoded_labels)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result + diff --git a/examples/examples_prompt/backbones/beit.py b/examples/examples_prompt/backbones/beit.py index 288fe10..4494fed 100644 --- a/examples/examples_prompt/backbones/beit.py +++ b/examples/examples_prompt/backbones/beit.py @@ -13,11 +13,6 @@ from transformers import ViTFeatureExtractor from transformers import Trainer as HfTrainer import torch.nn as nn -def process_example(raw_example, **kwargs): - tokenizer = 
kwargs['tokenizer'] - inputs = tokenizer(raw_example['image'], return_tensors='pt') - inputs['labels'] = raw_example['labels'] - return inputs def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): # from openpromptu.prompts import ManualVerbalizer diff --git a/examples/examples_prompt/backbones/bert.py b/examples/examples_prompt/backbones/bert.py index 4a3036d..8cd5612 100644 --- a/examples/examples_prompt/backbones/bert.py +++ b/examples/examples_prompt/backbones/bert.py @@ -49,7 +49,7 @@ def mask_token_func(tokenizer, ith_mask=0): return tokenizer.mask_token def get_remove_columns(dataset_features): - dataset_features.pop("label") + dataset_features.remove("label") return dataset_features @@ -60,6 +60,7 @@ def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): template = ManualTemplate(text = task.templates_text[template_id]) verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + # from IPython import embed; embed() return template, verbalizer, tokenizer_wrapper class DataCollator(HfDataCollatorMixin): diff --git a/examples/examples_prompt/backbones/bigbird.py b/examples/examples_prompt/backbones/bigbird.py new file mode 100644 index 0000000..b1dabcb --- /dev/null +++ b/examples/examples_prompt/backbones/bigbird.py @@ -0,0 +1,143 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +import numpy as np +from transformers import ( + AutoConfig, + AutoModelForMaskedLM, + AutoTokenizer, +) + +from transformers import Trainer as HfTrainer + + +def preprocess_function(raw_example, **kwargs): + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + + example = InputExample(**raw_example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + return model_inputs + +def compute_metrics(eval_preds, dataset_name, eval_metric): + # from IPython import embed; embed(header="In compute metrics") + + preds, labels = eval_preds.predictions, eval_preds.label_ids + + preds = np.argmax(preds, axis=-1) + + result = {} + average_metrics = [] + for metric in eval_metric: + metric_item = metric(preds, labels) + metric_value = list(metric_item.values()) + result.update(metric_item) + average_metrics.extend(metric_value) + print("average:",average_metrics) + average_metric = sum(average_metrics)/len(average_metrics) + result.update({"average_metrics":average_metric}) + return result + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.mask_token + +def get_remove_columns(dataset_features): + # from IPython import embed; embed(header="get_remove_columns") + dataset_features.remove("label") + return dataset_features + + +def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"): + from openpromptu.prompts import ManualVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import 
TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + # from IPython import embed; embed() + return template, verbalizer, tokenizer_wrapper + +class DataCollator(HfDataCollatorMixin): + def __init__(self, *args, **kwargs): + self.return_tensors='pt' + + def torch_call(self, features): + return torch_default_data_collator(features=features) + + + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + model = AutoModelForMaskedLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + model.resize_token_embeddings(len(tokenizer)) + return config, tokenizer, model + +class Trainer(HfTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.verbalizer=verbalizer + self.eval_task=eval_task + self.compute_metrics = self._compute_metrics + + + def compute_loss(self, model, inputs, return_outputs=False): + labels = inputs.pop('labels') + outputs = model(**inputs) + logits = outputs.get("logits") + input_ids = inputs['input_ids'] + verbalizer = self.verbalizer.cuda() + logits_at_mask = logits[torch.where(input_ids == verbalizer.tokenizer.mask_token_id)] + label_logits = verbalizer.process_logits(logits_at_mask) + loss_fct = torch.nn.CrossEntropyLoss() + loss = loss_fct(label_logits, labels) + outputs.logits = label_logits + return (loss, outputs) if return_outputs else loss + + def _compute_metrics(self, eval_preds): + # from IPython import embed; embed(header="In compute metrics") + + preds, labels = eval_preds.predictions, eval_preds.label_ids + + preds = np.argmax(preds, axis=-1) + + result = {} + average_metrics = [] + for metric in self.eval_task.metric: + metric_item = metric(preds, labels) + metric_value = list(metric_item.values()) + result.update(metric_item) + average_metrics.extend(metric_value) + print("average:",average_metrics) + average_metric = sum(average_metrics)/len(average_metrics) + result.update({"average_metrics":average_metric}) + return result + + diff --git a/examples/examples_prompt/backbones/blenderbot.py b/examples/examples_prompt/backbones/blenderbot.py new file mode 100644 index 0000000..c1e8876 --- /dev/null +++ b/examples/examples_prompt/backbones/blenderbot.py @@ -0,0 +1,182 @@ + +from openpromptu.data_utils import InputExample +from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer +from transformers import ( + AutoConfig, + BlenderbotForConditionalGeneration, + AutoTokenizer, +) +from 
transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator +import torch + +def mask_token_func(tokenizer, ith_mask=0): + return "" + +def get_remove_columns(dataset_features): + return dataset_features + +def preprocess_function(raw_example, **kwargs): + # max_target_length += 1 + tokenizer = kwargs['tokenizer'] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + split = kwargs['split'] + example = InputExample(**raw_example) + + + try: + example = verbalizer.wrap_one_example(example) + example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(example) + model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length, + padding="max_length", truncation=True) + except: + from IPython import embed; embed(header="Therer") + + with tokenizer.as_target_tokenizer(): + label = tokenizer(other['tgt_text']).input_ids + + model_inputs["labels"] = label + # from IPython import embed; embed() + return model_inputs + +def get_backbone(model_args, **kwargs): + config = AutoConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + config.dropout_rate = 0.0 + tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + + model = BlenderbotForConditionalGeneration.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # from IPython import embed; embed() + return config, tokenizer, model + + +def get_prompts(task, tokenizer, data_args, template_id="blenderbot", verbalizer_id="blenderbot"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + +class Trainer(HfSeq2SeqTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.eval_task = eval_task + self.compute_metrics = self._compute_metrics + + def compute_loss(self, model, inputs, return_outputs=False): + # from IPython import embed; embed() + outputs = model(**inputs) + if return_outputs: + return (outputs.loss, outputs) + else: + return outputs.loss + + def prediction_step( + self, + model, #nn.Module, + inputs, #Dict[str, Union[torch.Tensor, Any]], + prediction_loss_only, #: bool, + ignore_keys, #: Optional[List[str]] = None, + ): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on :obj:`model` using 
obj:`inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (:obj:`nn.Module`): + The model to evaluate. + inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument :obj:`labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (:obj:`bool`): + Whether or not to return the loss only. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + gen_kwargs = { + "max_length": 10, # self._max_length if s is not None else self.model.config.max_length, + "num_beams": 1, #self._num_beams if self._num_beams is not None else self.model.config.num_beams, + "min_length": 1 # for blenderbot, generally we set it to be a large number. But in classification, we set it to 1 + } + generated_tokens = self.model.generate( + inputs["input_ids"], + attention_mask=inputs["attention_mask"], + **gen_kwargs, + ) + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_kwargs["max_length"]: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"]) + + with torch.no_grad(): + + outputs = model(**inputs) + if has_labels: + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).mean().detach() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach() + else: + loss = None + + if self.args.prediction_loss_only: + return (loss, None, None) + + labels = inputs["labels"] + if labels.shape[-1] < gen_kwargs["max_length"]: + labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"]) + + # from IPython import embed; embed(header="In seqseqtrainer") + return (loss, generated_tokens, labels) + + def _compute_metrics(self, eval_preds): + from IPython import embed; embed(header="In compute metrics") + preds, labels = eval_preds + decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True) + decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True) + # post_processor = .get(data_args.dataset_name[0], tokenizer, + # data_args.ignore_pad_token_for_loss) + # decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info) + result = {} + for metric in self.eval_task.metric: + result.update(metric(decoded_preds, decoded_labels)) + + average_metric = sum(result.values())/len(result) + result.update({"average_metrics":average_metric}) + return result + diff --git a/examples/examples_prompt/backbones/clip.py b/examples/examples_prompt/backbones/clip.py new file mode 100644 index 0000000..4889b97 --- /dev/null +++ b/examples/examples_prompt/backbones/clip.py @@ -0,0 +1,172 @@ +from openpromptu.data_utils import InputExample +import torch +from transformers.data.data_collator import torch_default_data_collator +from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin +import numpy as np +from transformers import ( + CLIPConfig, + CLIPProcessor, + CLIPModel, +) +from transformers import ViTFeatureExtractor +from 
PIL import Image +from transformers import Trainer as HfTrainer +import torch.nn as nn + + + +def get_prompts(task, tokenizer, data_args, template_id="clip", verbalizer_id="clip"): + from openpromptu.prompts import GenerationVerbalizer + from openpromptu.prompts import ManualTemplate + from openpromptu import TokenizerWrapper + template = ManualTemplate(text = task.templates_text[template_id]) + verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id]) + tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer.tokenizer, truncate_method="balanced", mask_token_func=mask_token_func) + return template, verbalizer, tokenizer_wrapper + +def mask_token_func(tokenizer, ith_mask=0): + return tokenizer.mask_token + +def preprocess_function(raw_example, **kwargs): + # from IPython import embed; embed(header="Therefa") + tokenizer = kwargs['tokenizer'] + + # ["a photo of {}" for i in range()] + data_args = kwargs['data_args'] + template = kwargs['template'] + verbalizer = kwargs['verbalizer'] + tokenizer_wrapper = kwargs['tokenizer_wrapper'] + + example = InputExample(raw_example) + + texts = [] + + for candidate_label in range(verbalizer.num_classes): + tgt_text = verbalizer.wrap_one_example(label=candidate_label) + wrapped_example, other = template.wrap_one_example(example) + input_sentence = tokenizer_wrapper.merge_wrapped_example(wrapped_example, tgt_texts=[tgt_text]) + texts.append(input_sentence) + + # from IPython import embed; embed()/ + + image = Image.open(raw_example['image_file_path']) + + model_inputs = tokenizer(images=image, text=texts, max_length=16, padding="max_length", truncation=True, return_tensors='pt') + + # from IPython import embed; embed() + model_inputs["pixel_values"] = model_inputs["pixel_values"].squeeze() + model_inputs["label"] = example.label + return model_inputs + +def compute_metrics(eval_preds, dataset_name, eval_metric): + # from IPython import embed; embed(header="In compute metrics") + + preds, labels = eval_preds.predictions, eval_preds.label_ids + + preds = np.argmax(preds, axis=-1) + + result = {} + average_metrics = [] + for metric in eval_metric: + metric_item = metric(preds, labels) + metric_value = list(metric_item.values()) + result.update(metric_item) + average_metrics.extend(metric_value) + print("average:",average_metrics) + average_metric = sum(average_metrics)/len(average_metrics) + result.update({"average_metrics":average_metric}) + return result + + + +def get_remove_columns(dataset_features): + # from IPython import embed; embed(header="in remoev") + dataset_features.remove("labels") + print("remove_columns: {}".format(dataset_features)) + return dataset_features + +class DataCollator(HfDataCollatorMixin): + def __init__(self, *args, **kwargs): + self.return_tensors='pt' + + def torch_call(self, features): + # from IPython import embed; embed(header="in data collator") + a = torch_default_data_collator(features=features) + # from IPython import embed; embed(header="in data collator") + a["input_ids"] = a["input_ids"][0] + a["attention_mask"] = a["attention_mask"][0] + return a + + +def get_backbone(model_args, **kwargs): + config = CLIPConfig.from_pretrained( + model_args.config_name if model_args.config_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # config.dropout_rate = 0.0 + tokenizer = 
CLIPProcessor.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, + cache_dir=model_args.cache_dir, + + use_fast=model_args.use_fast_tokenizer, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + + model = CLIPModel.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + # config.num_labels = model_args.num_classes + # old_classifier = model.classifier + # model.classifier = nn.Linear(old_classifier.in_features, config.num_labels) + + + return config, tokenizer, model + +class Trainer(HfTrainer): + def __init__(self, verbalizer=None, eval_task=None, **kwargs): + super().__init__(**kwargs) + self.verbalizer=verbalizer + self.eval_task=eval_task + self.compute_metrics = self._compute_metrics + self.loss_fn = nn.CrossEntropyLoss() + + def compute_loss(self, model, inputs, return_outputs=False): + # from IPython import embed; embed() + labels = inputs.pop('labels') + outputs = model(**inputs) + # logits = outputs.get("logits") + + + logits_per_image = outputs.logits_per_image + loss = self.loss_fn(logits_per_image, labels) + return (loss, outputs) if return_outputs else loss + + def _compute_metrics(self, eval_preds): + # from IPython import embed; embed(header="In compute metrics") + + preds, labels = eval_preds.predictions, eval_preds.label_ids + + preds = np.argmax(preds, axis=-1) + + result = {} + average_metrics = [] + for metric in self.eval_task.metric: + metric_item = metric(preds, labels) + metric_value = list(metric_item.values()) + result.update(metric_item) + average_metrics.extend(metric_value) + print("average:",average_metrics) + average_metric = sum(average_metrics)/len(average_metrics) + result.update({"average_metrics":average_metric}) + from IPython import embed; embed(header="In compute metrics") + return result + + diff --git a/examples/examples_seq2seq/configs/config_gen_bs1.py b/examples/examples_prompt/configs/gen.py similarity index 58% rename from examples/examples_seq2seq/configs/config_gen_bs1.py rename to examples/examples_prompt/configs/gen.py index 4cf3c8e..1ccdcdb 100644 --- a/examples/examples_seq2seq/configs/config_gen_bs1.py +++ b/examples/examples_prompt/configs/gen.py @@ -1,25 +1,33 @@ -import collections +import collections import copy -BS = 1 +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + AllConfigs = {} +import argparse +import json +import os +parser = argparse.ArgumentParser("Parser to generate configuration") +parser.add_argument("--job", type=str) +parser.add_argument("--") +args = parser.parse_args() + BaseConfigs = {} BaseConfigs['t5-base'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", "max_source_length", "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", 
"sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, [0] *7 +[0] *8, [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], @@ -27,53 +35,9 @@ BaseConfigs['t5-base'] = { "do_train": True, "do_eval": True, "do_test": True, - - "model_name_or_path": "t5-base", - "tokenizer_name": "t5-base", - "save_total_limit": 1, - # For glue datasets. - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. 
- "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } -BaseConfigs['t5-large'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": "/home/hushengding/plm_cache/t5-large", - "tokenizer_name": "/home/hushengding/plm_cache/t5-large", + "model_name_or_path": f"{PATHBASE}t5-base", + "tokenizer_name": f"{PATHBASE}t5-base", "save_total_limit": 1, # For glue datasets. 
"split_validation_test": True, @@ -89,63 +53,18 @@ BaseConfigs['t5-large'] = { "greater_is_better": True, "evaluation_strategy": "steps", "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - -BaseConfigs['t5-3b'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": "/home/hushengding/plm_cache/t5-3b", - "tokenizer_name": "/home/hushengding/plm_cache/t5-3b", - "save_total_limit": 1, - # For glue datasets. - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. - "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, + "push_to_hub": False, + "push_to_delta_center": True, "save_strategy": "steps" } AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) AllConfigs['bitfit_t5-base'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, + "delta_type": "bitfit", + "learning_rate": 3e-4, "output_dir": "outputs/bitfit/t5-base/", }) - - AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) AllConfigs['adapter_t5-base'].update({ "delta_type": "adapter", @@ -185,16 +104,16 @@ AllConfigs['compacter_t5-base'].update({ "non_linearity": "gelu_new", #Compacter. 
- "hypercomplex_division": 4, + "hypercomplex_division": 4, "hypercomplex_adapters": True, "hypercomplex_nonlinearity": "glorot-uniform", - # gradient clip and clamp + # gradient clip and clamp "gradient_clip": False, "phm_clamp": False, - "normalize_phm_weight": False, + "normalize_phm_weight": False, "learn_phm": True, - # shared one side - "factorized_phm": True, + # shared one side + "factorized_phm": True, "shared_phm_rule": False, "factorized_phm_rule": False, "phm_c_init": "normal", @@ -222,16 +141,16 @@ AllConfigs['compacter++_t5-base'].update({ "non_linearity": "gelu_new", #Compacter. - "hypercomplex_division": 4, + "hypercomplex_division": 4, "hypercomplex_adapters": True, "hypercomplex_nonlinearity": "glorot-uniform", - # gradient clip and clamp + # gradient clip and clamp "gradient_clip": False, "phm_clamp": False, - "normalize_phm_weight": False, + "normalize_phm_weight": False, "learn_phm": True, - # shared one side - "factorized_phm": True, + # shared one side + "factorized_phm": True, "shared_phm_rule": False, "factorized_phm_rule": False, "phm_c_init": "normal", @@ -252,7 +171,7 @@ AllConfigs['low_rank_adapter_t5-base'].update({ ], "output_dir": "outputs/low_rank_adapter/t5-base/", "non_linearity": "gelu_new", - "low_rank_w_init": "glorot-uniform", + "low_rank_w_init": "glorot-uniform", "low_rank_rank": 1, }) @@ -279,102 +198,71 @@ AllConfigs['prefix_t5-base'].update({ "output_dir": "outputs/prefix/t5-base/", }) -AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['none_t5-base'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-base/", - }) - -AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['bitfit_t5-large'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, - "output_dir": "outputs/bitfit/t5-large/", - }) - -AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['none_t5-large'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-large/", - }) - - -AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['bitfit_t5-3b'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, - "output_dir": "outputs/bitfit/t5-3b/", - }) - -AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['none_t5-3b'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-3b/", - }) - -AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['adapter_t5-3b'].update({ - "delta_type": "adapter", +AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) +AllConfigs['soft_prompt_t5-base'].update({ + "delta_type": "soft_prompt", "learning_rate": 3e-4, "unfrozen_modules": [ "deltas", - "layer_norm", - "final_layer_norm" ], - "bottleneck_dim":24, - "output_dir": "outputs/adapter/t5-3b/", + "output_dir": "outputs/soft_prompt/t5-base/", }) +#### T5-base +BaseConfigs['t5-small'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", 
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, -AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['adapter_t5-large'].update({ - "delta_type": "adapter", + "model_name_or_path": f"{PATHBASE}t5-small", + "tokenizer_name": f"{PATHBASE}t5-small", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small']) +AllConfigs['prefix_t5-small'].update({ + "delta_type": "prefix", "learning_rate": 3e-4, "unfrozen_modules": [ "deltas", - "layer_norm", - "final_layer_norm" ], - "bottleneck_dim":24, - "output_dir": "outputs/adapter/t5-large/", + "output_dir": "outputs/prefix/t5-small/", }) -AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['lora_t5-large'].update({ - "delta_type": "lora", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "lora_r": 8, - "output_dir": "outputs/lora/t5-large/", - }) -AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['lora_t5-3b'].update({ - "delta_type": "lora", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "lora_r": 8, - "output_dir": "outputs/lora/t5-3b/", - }) - - if __name__ == "__main__": - import argparse - import json - import os - parser = argparse.ArgumentParser("Parser to generate configuration") - parser.add_argument("--job", type=str) - args = parser.parse_args() config = AllConfigs[args.job] @@ -399,13 +287,12 @@ if __name__ == "__main__": all_config_jsons[job_name][key] = config[key] - if not os.path.exists(f"./{args.job}_{BS}/"): - os.mkdir(f"./{args.job}_{BS}/") + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") for job_name in all_config_jsons: - with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) - - - \ No newline at end of file + + diff --git a/examples/examples_prompt/configs/gen_bart.py b/examples/examples_prompt/configs/gen_bart.py 
new file mode 100644 index 0000000..0008afc --- /dev/null +++ b/examples/examples_prompt/configs/gen_bart.py @@ -0,0 +1,248 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" +# PATHBASE="" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['bart-base'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}bart-base", + "tokenizer_name": f"{PATHBASE}bart-base", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['bitfit_bart-base'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/bart-base/", + }) + +AllConfigs['adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['adapter_bart-base'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/bart-base/", + }) + +AllConfigs['lora_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['lora_bart-base'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "modified_modules": [ + "q_proj", + "v_proj", + ], + "lora_r": 8, + "output_dir": "outputs/lora/bart-base/", + }) + +AllConfigs['compacter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['compacter_bart-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/bart-base/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['compacter++_bart-base'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/bart-base/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['low_rank_adapter_bart-base'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/bart-base/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['soft_prompt_bart-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bart-base/", + }) + +AllConfigs['prefix_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['prefix_bart-base'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/bart-base/", + }) + +AllConfigs['soft_prompt_bart-base'] = copy.deepcopy(BaseConfigs['bart-base']) +AllConfigs['soft_prompt_bart-base'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bart-base/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/examples/examples_prompt/configs/gen_beit.py b/examples/examples_prompt/configs/gen_beit.py index 9b61108..15550ef 100644 --- a/examples/examples_prompt/configs/gen_beit.py +++ b/examples/examples_prompt/configs/gen_beit.py @@ -205,209 +205,9 @@ AllConfigs['soft_prompt_beit-base-patch16-224'].update({ ], "output_dir": "outputs/soft_prompt/beit-base-patch16-224/", }) -#### beit-base-patch16-224 -BaseConfigs['t5-small'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", 
"warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": f"{PATHBASE}t5-small", - "tokenizer_name": f"{PATHBASE}t5-small", - "save_total_limit": 1, - # For glue datasets. - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. - "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": False, - "push_to_delta_center": True, - "save_strategy": "steps" - } - -AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small']) -AllConfigs['prefix_t5-small'].update({ - "delta_type": "prefix", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - ], - "output_dir": "outputs/prefix/t5-small/", - }) - -#### ROBERTA###### -BaseConfigs['roberta-base'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [0] *7 +[0] *8, - [200, 
100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": f"{PATHBASE}roberta-base", - "tokenizer_name": f"{PATHBASE}roberta-base", - "save_total_limit": 1, - # For glue datasets. - "is_seq2seq": False, - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": False, - # To evaluate during training. - "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - - - -AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) -AllConfigs['bitfit_roberta-base'].update({ - "delta_type": "bitfit", - "learning_rate": 1e-3, - "output_dir": "outputs/bitfit/roberta-base/", - }) - -AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) -AllConfigs['none_roberta-base'].update({ - "delta_type": "none", - "learning_rate": 1e-5, - "output_dir": "outputs/none/roberta-base/", - }) - - -AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) -AllConfigs['lora_roberta-base'].update({ - "delta_type": "lora", - "learning_rate": 1e-3, - "output_dir": "outputs/lora/roberta-base/", - }) - -AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) -AllConfigs['adapter_roberta-base'].update({ - "delta_type": "adapter", - "learning_rate": 1e-3, - "output_dir": "outputs/adapter/roberta-base/", - }) - -AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) -AllConfigs['low_rank_adapter_roberta-base'].update({ - "delta_type": "low_rank_adapter", - "learning_rate": 1e-3, - "output_dir": "outputs/low_rank_adapter/roberta-base/", - }) - -#### ROBERTA###### -BaseConfigs['bert-base-cased'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, 
- "do_eval": True, - "do_test": True, - - "model_name_or_path": f"{PATHBASE}bert-base-cased", - "tokenizer_name": f"{PATHBASE}bert-base-cased", - "save_total_limit": 1, - # For glue datasets. - "is_seq2seq": False, - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": False, - # To evaluate during training. - "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - -AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) -AllConfigs['prefix_bert-base-cased'].update({ - "delta_type": "prefix", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - ], - "output_dir": "outputs/prefix/bert-base-cased/", - }) - -AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) -AllConfigs['soft_prompt_bert-base-cased'].update({ - "delta_type": "soft_prompt", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - ], - "output_dir": "outputs/soft_prompt/bert-base-cased/", - }) - if __name__ == "__main__": import argparse import json diff --git a/examples/examples_prompt/configs/gen_bigbird.py b/examples/examples_prompt/configs/gen_bigbird.py new file mode 100644 index 0000000..b5a41e0 --- /dev/null +++ b/examples/examples_prompt/configs/gen_bigbird.py @@ -0,0 +1,147 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} + +#### ROBERTA ###### +BaseConfigs['bigbird-roberta-large'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}bigbird-roberta-large", + "tokenizer_name": f"{PATHBASE}bigbird-roberta-large", + "save_total_limit": 1, + # For glue datasets. 
+ "is_seq2seq": False, + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": False, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": True, + "save_strategy": "steps" + } + + + +AllConfigs['bitfit_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['bitfit_bigbird-roberta-large'].update({ + "delta_type": "bitfit", + "learning_rate": 1e-3, + "output_dir": "outputs/bitfit/bigbird-roberta-large/", + }) + +AllConfigs['none_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['none_bigbird-roberta-large'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/bigbird-roberta-large/", + }) + + +AllConfigs['lora_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['lora_bigbird-roberta-large'].update({ + "delta_type": "lora", + "learning_rate": 1e-3, + "modified_modules": [ + "query", + "key", + ], + "output_dir": "outputs/lora/bigbird-roberta-large/", + }) + +AllConfigs['adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['adapter_bigbird-roberta-large'].update({ + "delta_type": "adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/adapter/bigbird-roberta-large/", + }) + +AllConfigs['low_rank_adapter_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['low_rank_adapter_bigbird-roberta-large'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 1e-3, + "output_dir": "outputs/low_rank_adapter/bigbird-roberta-large/", + }) + + +AllConfigs['soft_prompt_bigbird-roberta-large'] = copy.deepcopy(BaseConfigs['bigbird-roberta-large']) +AllConfigs['soft_prompt_bigbird-roberta-large'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/bigbird-roberta-large/", + }) + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/examples/examples_prompt/configs/gen_blenderbot.py b/examples/examples_prompt/configs/gen_blenderbot.py new file mode 100644 index 0000000..c0f9653 
--- /dev/null +++ b/examples/examples_prompt/configs/gen_blenderbot.py @@ -0,0 +1,254 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['blenderbot-400M-distill'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}blenderbot-400M-distill", + "tokenizer_name": f"{PATHBASE}blenderbot-400M-distill", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
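+    # "predict_with_generate" makes the Seq2SeqTrainer decode generated token ids back
+    # to text during evaluation, so the text-to-text verbalizers can be scored directly.
+    # The keys below then keep the checkpoint with the best averaged task metric.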
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['bitfit_blenderbot-400M-distill'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/blenderbot-400M-distill/", + }) + +AllConfigs['adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['adapter_blenderbot-400M-distill'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/blenderbot-400M-distill/", + }) + +AllConfigs['lora_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['lora_blenderbot-400M-distill'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "modified_modules":[ + "q_proj", + "v_proj", + ], + "lora_r": 8, + "output_dir": "outputs/lora/blenderbot-400M-distill/", + }) + +AllConfigs['compacter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['compacter_blenderbot-400M-distill'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/blenderbot-400M-distill/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['compacter++_blenderbot-400M-distill'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/blenderbot-400M-distill/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['low_rank_adapter_blenderbot-400M-distill'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/blenderbot-400M-distill/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + +AllConfigs['none_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['none_blenderbot-400M-distill'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/blenderbot-400M-distill/", + }) + + +AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['soft_prompt_blenderbot-400M-distill'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/blenderbot-400M-distill/", + }) + +AllConfigs['prefix_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['prefix_blenderbot-400M-distill'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/blenderbot-400M-distill/", + }) + +AllConfigs['soft_prompt_blenderbot-400M-distill'] = copy.deepcopy(BaseConfigs['blenderbot-400M-distill']) +AllConfigs['soft_prompt_blenderbot-400M-distill'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/blenderbot-400M-distill/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/examples/examples_prompt/configs/gen_clip.py b/examples/examples_prompt/configs/gen_clip.py new file mode 100644 index 
0000000..e7cb94d --- /dev/null +++ b/examples/examples_prompt/configs/gen_clip.py @@ -0,0 +1,303 @@ +import collections +import copy + +PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/" +PATHBASE="/home/hushengding/plm_cache/" + +AllConfigs = {} + +BaseConfigs = {} +BaseConfigs['clip-vit-base-patch32'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip( + ["beans"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20], + [256], + [ 32], + [ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0], # *7 +[0] *8, + [200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [ 3], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}clip-vit-base-patch32", + "tokenizer_name": f"{PATHBASE}clip-vit-base-patch32", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. 
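+    # "average_metrics" is the mean over the task's metrics reported by the trainer's
+    # compute_metrics, so "greater_is_better" is True; with matching eval/save steps,
+    # "load_best_model_at_end" restores the best checkpoint after training.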
+ "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['bitfit_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['bitfit_clip-vit-base-patch32'].update({ + "delta_type": "bitfit", + "learning_rate": 3e-4, + "output_dir": "outputs/bitfit/clip-vit-base-patch32/", + }) + +AllConfigs['none_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['none_clip-vit-base-patch32'].update({ + "delta_type": "none", + "learning_rate": 1e-5, + "output_dir": "outputs/none/clip-vit-base-patch32/", + }) + +AllConfigs['adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['adapter_clip-vit-base-patch32'].update({ + "delta_type": "adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "bottleneck_dim":24, + "output_dir": "outputs/adapter/clip-vit-base-patch32/", + }) + +AllConfigs['lora_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['lora_clip-vit-base-patch32'].update({ + "delta_type": "lora", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "lora_r": 8, + "output_dir": "outputs/lora/clip-vit-base-patch32/", + }) + +AllConfigs['compacter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['compacter_clip-vit-base-patch32'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter/clip-vit-base-patch32/", + "non_linearity": "gelu_new", + + #Compacter. + "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + +AllConfigs['compacter++_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['compacter++_clip-vit-base-patch32'].update({ + "delta_type": "compacter", + "learning_rate": 3e-3, + "do_train": True, + "do_eval": True, + "do_test": True, + "modified_modules": [ + "DenseReluDense" + ], + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/compacter++/clip-vit-base-patch32/", + "non_linearity": "gelu_new", + + #Compacter. 
+ "hypercomplex_division": 4, + "hypercomplex_adapters": True, + "hypercomplex_nonlinearity": "glorot-uniform", + # gradient clip and clamp + "gradient_clip": False, + "phm_clamp": False, + "normalize_phm_weight": False, + "learn_phm": True, + # shared one side + "factorized_phm": True, + "shared_phm_rule": False, + "factorized_phm_rule": False, + "phm_c_init": "normal", + "phm_init_range": 0.0001, + "use_bias_down_sampler": True, + "use_bias_up_sampler": True, + }) + + +AllConfigs['low_rank_adapter_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['low_rank_adapter_clip-vit-base-patch32'].update({ + "delta_type": "low_rank_adapter", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + "layer_norm", + "final_layer_norm" + ], + "output_dir": "outputs/low_rank_adapter/clip-vit-base-patch32/", + "non_linearity": "gelu_new", + "low_rank_w_init": "glorot-uniform", + "low_rank_rank": 1, + }) + + +AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['soft_prompt_clip-vit-base-patch32'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-2, + "soft_token_num":100, + "token_init": False, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/clip-vit-base-patch32/", + }) + +AllConfigs['prefix_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['prefix_clip-vit-base-patch32'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/clip-vit-base-patch32/", + }) + +AllConfigs['soft_prompt_clip-vit-base-patch32'] = copy.deepcopy(BaseConfigs['clip-vit-base-patch32']) +AllConfigs['soft_prompt_clip-vit-base-patch32'].update({ + "delta_type": "soft_prompt", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/soft_prompt/clip-vit-base-patch32/", + }) +#### clip-vit-base-patch32 +BaseConfigs['t5-small'] = { + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + "max_source_length", + "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], + [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, + [0] *7 +[0] *8, + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], + ), + "do_train": True, + "do_eval": True, + "do_test": True, + + "model_name_or_path": f"{PATHBASE}t5-small", 
+ "tokenizer_name": f"{PATHBASE}t5-small", + "save_total_limit": 1, + # For glue datasets. + "split_validation_test": True, + "seed": 42, + "dataset_config_name": ["en"], + "eval_dataset_config_name": ["en"], + "test_dataset_config_name": ["en"], + # other configurations. + "predict_with_generate": True, + # To evaluate during training. + "load_best_model_at_end": True, + "metric_for_best_model": "average_metrics", + "greater_is_better": True, + "evaluation_strategy": "steps", + "overwrite_output_dir": True, + "push_to_hub": False, + "push_to_delta_center": True, + "save_strategy": "steps" + } + +AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small']) +AllConfigs['prefix_t5-small'].update({ + "delta_type": "prefix", + "learning_rate": 3e-4, + "unfrozen_modules": [ + "deltas", + ], + "output_dir": "outputs/prefix/t5-small/", + }) + + +if __name__ == "__main__": + import argparse + import json + import os + parser = argparse.ArgumentParser("Parser to generate configuration") + parser.add_argument("--job", type=str) + args = parser.parse_args() + + config = AllConfigs[args.job] + + Cartesian_product = [] + for key in config: + if isinstance(key, tuple): + Cartesian_product.append(key) + all_config_jsons = {} + for key_tuple in Cartesian_product: + for zipped in config[key_tuple]: + job_name = zipped[0] + all_config_jsons[job_name] = {} + for key_name, zipped_elem in zip(key_tuple, zipped): + if key_name != 'job_name': + all_config_jsons[job_name][key_name] = zipped_elem + for key in config: + if not isinstance(key, tuple): + for job_name in all_config_jsons: + if key == "output_dir": + all_config_jsons[job_name][key] = config[key] + job_name + else: + all_config_jsons[job_name][key] = config[key] + + + if not os.path.exists(f"configs/{args.job}/"): + os.mkdir(f"configs/{args.job}/") + + for job_name in all_config_jsons: + with open(f"configs/{args.job}/{job_name}.json", 'w') as fout: + json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) + + + diff --git a/examples/examples_prompt/configs/gen_t5.py b/examples/examples_prompt/configs/gen_t5.py index b2e15a5..1caeb41 100644 --- a/examples/examples_prompt/configs/gen_t5.py +++ b/examples/examples_prompt/configs/gen_t5.py @@ -76,8 +76,6 @@ AllConfigs['lora_t5-base'].update({ "learning_rate": 3e-4, "unfrozen_modules": [ "deltas", - "layer_norm", - "final_layer_norm" ], "lora_r": 8, "output_dir": "outputs/lora/t5-base/", @@ -254,154 +252,6 @@ AllConfigs['prefix_t5-small'].update({ }) - - -#### ROBERTA###### -BaseConfigs['roberta-base'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", 
"rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": f"{PATHBASE}roberta-base", - "tokenizer_name": f"{PATHBASE}roberta-base", - "save_total_limit": 1, - # For glue datasets. - "is_seq2seq": False, - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": False, - # To evaluate during training. - "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - - - -AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) -AllConfigs['bitfit_roberta-base'].update({ - "delta_type": "bitfit", - "learning_rate": 1e-3, - "output_dir": "outputs/bitfit/roberta-base/", - }) - -AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) -AllConfigs['none_roberta-base'].update({ - "delta_type": "none", - "learning_rate": 1e-5, - "output_dir": "outputs/none/roberta-base/", - }) - - -AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) -AllConfigs['lora_roberta-base'].update({ - "delta_type": "lora", - "learning_rate": 1e-3, - "output_dir": "outputs/lora/roberta-base/", - }) - -AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) -AllConfigs['adapter_roberta-base'].update({ - "delta_type": "adapter", - "learning_rate": 1e-3, - "output_dir": "outputs/adapter/roberta-base/", - }) - -AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base']) -AllConfigs['low_rank_adapter_roberta-base'].update({ - "delta_type": "low_rank_adapter", - "learning_rate": 1e-3, - "output_dir": "outputs/low_rank_adapter/roberta-base/", - }) - -#### ROBERTA###### -BaseConfigs['bert-base-cased'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - [ 
32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": f"{PATHBASE}bert-base-cased", - "tokenizer_name": f"{PATHBASE}bert-base-cased", - "save_total_limit": 1, - # For glue datasets. - "is_seq2seq": False, - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": False, - # To evaluate during training. - "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - -AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) -AllConfigs['prefix_bert-base-cased'].update({ - "delta_type": "prefix", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - ], - "output_dir": "outputs/prefix/bert-base-cased/", - }) - -AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased']) -AllConfigs['soft_prompt_bert-base-cased'].update({ - "delta_type": "soft_prompt", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - ], - "output_dir": "outputs/soft_prompt/bert-base-cased/", - }) - if __name__ == "__main__": import argparse import json diff --git a/examples/examples_prompt/data_processors/tasks.py b/examples/examples_prompt/data_processors/tasks.py index 4823eae..aee5478 100644 --- a/examples/examples_prompt/data_processors/tasks.py +++ b/examples/examples_prompt/data_processors/tasks.py @@ -85,11 +85,14 @@ class SST2(AbstractTask): "test": "validation"} verbalizers = { - "0":{"0":"negative","1":"positive"} + "0":{"0":"negative","1":"positive"}, + "blenderbot":{"0":"negative","1":"positive"} + } templates_text = { - "0":"""The sentiment of sentence: "{"meta":"sentence", "shortenable":True} is {"mask"}.""" + "0":"""The sentiment of sentence: "{"meta":"sentence", "shortenable":True}" is {"mask"}.""", + "blenderbot": """{"meta":"sentence", "shortenable":True} what is the sentiment?""" } def load_dataset(self, split): @@ -533,14 +536,15 @@ class Beans(AbstractTask): metric_names = ["accuracy"] verbalizers = { - "0": { - "0": "No", - "1": "Yes", + "clip": { + "angular_leaf_spot": "angular_leaf_spot", + "bean_rust": "bean_rust", + "healthy": "healthy", } } templates_text = { - "0": """{"meta":"sentence1"}""" + "clip":"""a photo of {"mask"} leaf.""" } def load_dataset(self, split): diff --git a/examples/examples_prompt/src/run.py b/examples/examples_prompt/src/run.py index ba019ad..28f045d 100644 --- a/examples/examples_prompt/src/run.py +++ b/examples/examples_prompt/src/run.py @@ -124,6 +124,9 @@ def main(): if os.path.basename(model_args.model_name_or_path).startswith("t5"): from examples_prompt.backbones.t5 import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts from examples_prompt.backbones.t5 import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("blenderbot"): + from examples_prompt.backbones.blenderbot import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.blenderbot import Trainer, 
DataCollator elif os.path.basename(model_args.model_name_or_path).startswith("roberta") \ or os.path.basename(model_args.model_name_or_path).startswith("bert") \ or os.path.basename(model_args.model_name_or_path).startswith("albert") : @@ -132,6 +135,15 @@ def main(): elif os.path.basename(model_args.model_name_or_path).startswith("beit"): from examples_prompt.backbones.beit import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts from examples_prompt.backbones.beit import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("bart"): + from examples_prompt.backbones.bart import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bart import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("bigbird"): + from examples_prompt.backbones.bigbird import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.bigbird import Trainer, DataCollator + elif os.path.basename(model_args.model_name_or_path).startswith("clip"): + from examples_prompt.backbones.clip import get_backbone, preprocess_function, mask_token_func, get_remove_columns, get_prompts + from examples_prompt.backbones.clip import Trainer, DataCollator @@ -139,6 +151,9 @@ def main(): config, tokenizer, model = get_backbone(model_args=model_args) + from opendelta import Visualization + Visualization(model).structure_graph() + if delta_args.delta_type.lower() != "none": from opendelta import AutoDeltaConfig,AutoDeltaModel delta_config = AutoDeltaConfig.from_dict(vars(delta_args)) @@ -174,7 +189,7 @@ def main(): task = AutoTask.get(data_args.task_name, data_args.dataset_config_name, data_args=data_args, - seed=data_args.data_seed) + seed=data_args.data_sample_seed) dataset = task.get(split=split_name, split_validation_test=training_args.split_validation_test, @@ -182,7 +197,7 @@ def main(): - template, _verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, training_args) + template, _verbalizer, tokenizer_wrapper = get_prompts(task, tokenizer, data_args) dataset = dataset.map( diff --git a/examples/examples_prompt/utils/args.py b/examples/examples_prompt/utils/args.py index 4a9de18..aefec9a 100644 --- a/examples/examples_prompt/utils/args.py +++ b/examples/examples_prompt/utils/args.py @@ -197,7 +197,7 @@ class DataTrainingArguments: datasets_saved_path: Optional[str] = field( default=None, metadata={"help": "the path of the saved datasets"} ) - data_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."}) + data_sample_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."}) model_parallel: Optional[bool] = field(default=False, metadata={"help": "whether apply model parallelization"}) diff --git a/examples/examples_seq2seq/collect_result.jsonl b/examples/examples_seq2seq/collect_result.jsonl deleted file mode 100644 index 3ab2a93..0000000 --- a/examples/examples_seq2seq/collect_result.jsonl +++ /dev/null @@ -1,21 +0,0 @@ -# the final results will be populated here.{ - "evaluate": { - "epoch": 20.0, - "eval_accuracy": 89.2156862745098, - "eval_average_metrics": 90.76168929110105, - "eval_f1": 92.3076923076923, - "eval_loss": 0.16493959724903107, - "eval_runtime": 1.6391, - "eval_samples_per_second": 124.455 - }, - "repo_name": "DeltaHub/bitfit_t5-base_mrpc", - "test": { - "epoch": 20.0, - "test_accuracy": 88.23529411764706, - 
"test_average_metrics": 89.97971602434077, - "test_f1": 91.72413793103448, - "test_loss": 0.14968213438987732, - "test_runtime": 1.6344, - "test_samples_per_second": 124.82 - } -} diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/cola.json b/examples/examples_seq2seq/configs/bitfit_t5-base/cola.json deleted file mode 100644 index a28c9a6..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/cola.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "cola", - "eval_steps": 100, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 128, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 20, - "output_dir": "outputs/bitfit/t5-base/cola", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 64, - "per_device_train_batch_size": 64, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 100, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "cola", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "cola", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/mnli.json b/examples/examples_seq2seq/configs/bitfit_t5-base/mnli.json deleted file mode 100644 index d2ef00b..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/mnli.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "mnli", - "eval_steps": 200, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 128, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 3, - "output_dir": "outputs/bitfit/t5-base/mnli", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 64, - "per_device_train_batch_size": 64, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 200, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "mnli", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "mnli", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/mrpc.json b/examples/examples_seq2seq/configs/bitfit_t5-base/mrpc.json deleted file mode 100644 index 85538e0..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/mrpc.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "mrpc", - "eval_steps": 200, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 128, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 20, - "output_dir": "outputs/bitfit/t5-base/mrpc", - "overwrite_output_dir": true, - 
"per_device_eval_batch_size": 64, - "per_device_train_batch_size": 64, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 200, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "mrpc", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "mrpc", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/qnli.json b/examples/examples_seq2seq/configs/bitfit_t5-base/qnli.json deleted file mode 100644 index 47d850f..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/qnli.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "qnli", - "eval_steps": 200, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 128, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 3, - "output_dir": "outputs/bitfit/t5-base/qnli", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 64, - "per_device_train_batch_size": 64, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 200, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "qnli", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "qnli", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/qqp.json b/examples/examples_seq2seq/configs/bitfit_t5-base/qqp.json deleted file mode 100644 index 856d638..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/qqp.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "qqp", - "eval_steps": 200, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 128, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 3, - "output_dir": "outputs/bitfit/t5-base/qqp", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 64, - "per_device_train_batch_size": 64, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 200, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "qqp", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "qqp", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/rte.json b/examples/examples_seq2seq/configs/bitfit_t5-base/rte.json deleted file mode 100644 index f05ae43..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/rte.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "rte", - "eval_steps": 100, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - 
"load_best_model_at_end": true, - "max_source_length": 128, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 20, - "output_dir": "outputs/bitfit/t5-base/rte", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 64, - "per_device_train_batch_size": 64, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 100, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "rte", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "rte", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/sst2.json b/examples/examples_seq2seq/configs/bitfit_t5-base/sst2.json deleted file mode 100644 index db382a5..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/sst2.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "sst2", - "eval_steps": 200, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 128, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 3, - "output_dir": "outputs/bitfit/t5-base/sst2", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 64, - "per_device_train_batch_size": 64, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 200, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "sst2", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "sst2", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/stsb.json b/examples/examples_seq2seq/configs/bitfit_t5-base/stsb.json deleted file mode 100644 index 71f9969..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/stsb.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "stsb", - "eval_steps": 100, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 128, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 20, - "output_dir": "outputs/bitfit/t5-base/stsb", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 64, - "per_device_train_batch_size": 64, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 100, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "stsb", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "stsb", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-boolq.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-boolq.json deleted file mode 100644 index b733416..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-boolq.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - 
], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "superglue-boolq", - "eval_steps": 200, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 256, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 20, - "output_dir": "outputs/bitfit/t5-base/superglue-boolq", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 32, - "per_device_train_batch_size": 32, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 200, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "superglue-boolq", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "superglue-boolq", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-cb.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-cb.json deleted file mode 100644 index a801550..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-cb.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "superglue-cb", - "eval_steps": 100, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 256, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 20, - "output_dir": "outputs/bitfit/t5-base/superglue-cb", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 32, - "per_device_train_batch_size": 32, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 100, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "superglue-cb", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "superglue-cb", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-copa.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-copa.json deleted file mode 100644 index c69b62d..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-copa.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "superglue-copa", - "eval_steps": 50, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 256, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 40, - "output_dir": "outputs/bitfit/t5-base/superglue-copa", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 32, - "per_device_train_batch_size": 32, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 50, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "superglue-copa", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": 
"superglue-copa", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-multirc.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-multirc.json deleted file mode 100644 index fd694c2..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-multirc.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "superglue-multirc", - "eval_steps": 200, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 256, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 3, - "output_dir": "outputs/bitfit/t5-base/superglue-multirc", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 32, - "per_device_train_batch_size": 32, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 200, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "superglue-multirc", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "superglue-multirc", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-record.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-record.json deleted file mode 100644 index b9f79c5..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-record.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "superglue-record", - "eval_steps": 200, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 512, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 3, - "output_dir": "outputs/bitfit/t5-base/superglue-record", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 16, - "per_device_train_batch_size": 16, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 200, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "superglue-record", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "superglue-record", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wic.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wic.json deleted file mode 100644 index 900067f..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wic.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "superglue-wic", - "eval_steps": 100, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 256, - "metric_for_best_model": "average_metrics", - 
"model_name_or_path": "t5-base", - "num_train_epochs": 20, - "output_dir": "outputs/bitfit/t5-base/superglue-wic", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 32, - "per_device_train_batch_size": 32, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 100, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "superglue-wic", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "superglue-wic", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wsc.fixed.json b/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wsc.fixed.json deleted file mode 100644 index d6a7b64..0000000 --- a/examples/examples_seq2seq/configs/bitfit_t5-base/superglue-wsc.fixed.json +++ /dev/null @@ -1,40 +0,0 @@ -{ - "dataset_config_name": [ - "en" - ], - "delta_type": "bitfit", - "do_eval": true, - "do_test": true, - "do_train": true, - "eval_dataset_config_name": [ - "en" - ], - "eval_dataset_name": "superglue-wsc.fixed", - "eval_steps": 100, - "evaluation_strategy": "steps", - "greater_is_better": true, - "learning_rate": 0.0003, - "load_best_model_at_end": true, - "max_source_length": 256, - "metric_for_best_model": "average_metrics", - "model_name_or_path": "t5-base", - "num_train_epochs": 20, - "output_dir": "outputs/bitfit/t5-base/superglue-wsc.fixed", - "overwrite_output_dir": true, - "per_device_eval_batch_size": 32, - "per_device_train_batch_size": 32, - "predict_with_generate": true, - "push_to_hub": true, - "save_steps": 100, - "save_strategy": "steps", - "save_total_limit": 1, - "seed": 42, - "split_validation_test": true, - "task_name": "superglue-wsc.fixed", - "test_dataset_config_name": [ - "en" - ], - "test_dataset_name": "superglue-wsc.fixed", - "tokenizer_name": "t5-base", - "warmup_steps": 0 -} \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/config_gen.py b/examples/examples_seq2seq/configs/config_gen.py deleted file mode 100644 index 33613d3..0000000 --- a/examples/examples_seq2seq/configs/config_gen.py +++ /dev/null @@ -1,426 +0,0 @@ -import collections -import copy - -AllConfigs = {} - -BaseConfigs = {} -BaseConfigs['t5-base'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 
32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": "t5-base", - "tokenizer_name": "t5-base", - "save_total_limit": 1, - # For glue datasets. - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. - "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - - -BaseConfigs['t5-large'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": "/home/hushengding/plm_cache/t5-large", - "tokenizer_name": "/home/hushengding/plm_cache/t5-large", - "save_total_limit": 1, - # For glue datasets. - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. 
- "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - -BaseConfigs['t5-3b'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [8] * 8, - - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - "model_name_or_path": "/home/hushengding/plm_cache/t5-3b", - "tokenizer_name": "/home/hushengding/plm_cache/t5-3b", - "save_total_limit": 1, - # For glue datasets. - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. 
- "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - -AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['bitfit_t5-base'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, - "output_dir": "outputs/bitfit/t5-base/", - }) - - - -AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['adapter_t5-base'].update({ - "delta_type": "adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "bottleneck_dim":24, - "output_dir": "outputs/adapter/t5-base/", - }) - -AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['lora_t5-base'].update({ - "delta_type": "lora", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "lora_r": 8, - "output_dir": "outputs/lora/t5-base/", - }) - -AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['compacter_t5-base'].update({ - "delta_type": "compacter", - "learning_rate": 3e-3, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "output_dir": "outputs/compacter/t5-base/", - "non_linearity": "gelu_new", - - #Compacter. - "hypercomplex_division": 4, - "hypercomplex_adapters": True, - "hypercomplex_nonlinearity": "glorot-uniform", - # gradient clip and clamp - "gradient_clip": False, - "phm_clamp": False, - "normalize_phm_weight": False, - "learn_phm": True, - # shared one side - "factorized_phm": True, - "shared_phm_rule": False, - "factorized_phm_rule": False, - "phm_c_init": "normal", - "phm_init_range": 0.0001, - "use_bias_down_sampler": True, - "use_bias_up_sampler": True, - }) - -AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['compacter++_t5-base'].update({ - "delta_type": "compacter", - "learning_rate": 3e-3, - "do_train": True, - "do_eval": True, - "do_test": True, - "modified_modules": [ - "DenseReluDense" - ], - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "output_dir": "outputs/compacter++/t5-base/", - "non_linearity": "gelu_new", - - #Compacter. 
- "hypercomplex_division": 4, - "hypercomplex_adapters": True, - "hypercomplex_nonlinearity": "glorot-uniform", - # gradient clip and clamp - "gradient_clip": False, - "phm_clamp": False, - "normalize_phm_weight": False, - "learn_phm": True, - # shared one side - "factorized_phm": True, - "shared_phm_rule": False, - "factorized_phm_rule": False, - "phm_c_init": "normal", - "phm_init_range": 0.0001, - "use_bias_down_sampler": True, - "use_bias_up_sampler": True, - }) - - -AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['low_rank_adapter_t5-base'].update({ - "delta_type": "low_rank_adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "output_dir": "outputs/low_rank_adapter/t5-base/", - "non_linearity": "gelu_new", - "low_rank_w_init": "glorot-uniform", - "low_rank_rank": 1, - }) - -AllConfigs['low_rank_adapter_t5-xxl'] = copy.deepcopy(BaseConfigs['t5-xxl']) -AllConfigs['low_rank_adapter_t5-xxl'].update({ - "delta_type": "low_rank_adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "output_dir": "outputs/low_rank_adapter/t5-xxl/", - "non_linearity": "gelu_new", - "low_rank_w_init": "glorot-uniform", - "low_rank_rank": 1, - }) - - -AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['soft_prompt_t5-base'].update({ - "delta_type": "soft_prompt", - "learning_rate": 3e-2, - "soft_token_num":100, - "token_init": False, - "unfrozen_modules": [ - "deltas", - ], - "output_dir": "outputs/soft_prompt/t5-base/", - }) - -AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['prefix_t5-base'].update({ - "delta_type": "prefix", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - ], - "output_dir": "outputs/prefix/t5-base/", - }) - -AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['none_t5-base'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-base/", - }) - -AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['bitfit_t5-large'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, - "output_dir": "outputs/bitfit/t5-large/", - }) - -AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['none_t5-large'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-large/", - }) - - -AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['bitfit_t5-3b'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, - "output_dir": "outputs/bitfit/t5-3b/", - }) - -AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['none_t5-3b'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-3b/", - }) - -AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['adapter_t5-3b'].update({ - "delta_type": "adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "bottleneck_dim":24, - "output_dir": "outputs/adapter/t5-3b/", - }) - -AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['adapter_t5-large'].update({ - "delta_type": "adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "bottleneck_dim":24, - "output_dir": "outputs/adapter/t5-large/", - }) - 
-AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['lora_t5-large'].update({ - "delta_type": "lora", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "lora_r": 8, - "output_dir": "outputs/lora/t5-large/", - }) - -AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['lora_t5-3b'].update({ - "delta_type": "lora", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "lora_r": 8, - "output_dir": "outputs/lora/t5-3b/", - }) - - -if __name__ == "__main__": - import argparse - import json - import os - parser = argparse.ArgumentParser("Parser to generate configuration") - parser.add_argument("--job", type=str) - args = parser.parse_args() - - config = AllConfigs[args.job] - - Cartesian_product = [] - for key in config: - if isinstance(key, tuple): - Cartesian_product.append(key) - all_config_jsons = {} - for key_tuple in Cartesian_product: - for zipped in config[key_tuple]: - job_name = zipped[0] - all_config_jsons[job_name] = {} - for key_name, zipped_elem in zip(key_tuple, zipped): - if key_name != 'job_name': - all_config_jsons[job_name][key_name] = zipped_elem - for key in config: - if not isinstance(key, tuple): - for job_name in all_config_jsons: - if key == "output_dir": - all_config_jsons[job_name][key] = config[key] + job_name - else: - all_config_jsons[job_name][key] = config[key] - - - if not os.path.exists(f"./{args.job}/"): - os.mkdir(f"./{args.job}/") - - for job_name in all_config_jsons: - with open(f"./{args.job}/{job_name}.json", 'w') as fout: - json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) - - - - \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/config_gen_bs.py b/examples/examples_seq2seq/configs/config_gen_bs.py index 4cf3c8e..1d9f238 100644 --- a/examples/examples_seq2seq/configs/config_gen_bs.py +++ b/examples/examples_seq2seq/configs/config_gen_bs.py @@ -1,4 +1,4 @@ -import collections +import collections import copy BS = 1 @@ -6,13 +6,13 @@ AllConfigs = {} BaseConfigs = {} BaseConfigs['t5-base'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", "max_source_length", "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", 
"stsb"], ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], @@ -27,7 +27,7 @@ BaseConfigs['t5-base'] = { "do_train": True, "do_eval": True, "do_test": True, - + "model_name_or_path": "t5-base", "tokenizer_name": "t5-base", "save_total_limit": 1, @@ -50,13 +50,13 @@ BaseConfigs['t5-base'] = { } BaseConfigs['t5-large'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", "max_source_length", "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], @@ -71,7 +71,7 @@ BaseConfigs['t5-large'] = { "do_train": True, "do_eval": True, "do_test": True, - + "model_name_or_path": "/home/hushengding/plm_cache/t5-large", "tokenizer_name": "/home/hushengding/plm_cache/t5-large", "save_total_limit": 1, @@ -94,13 +94,13 @@ BaseConfigs['t5-large'] = { } BaseConfigs['t5-3b'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", + ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", "max_source_length", "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", 
"qqp", "stsb"], + "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], + ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], @@ -115,7 +115,7 @@ BaseConfigs['t5-3b'] = { "do_train": True, "do_eval": True, "do_test": True, - + "model_name_or_path": "/home/hushengding/plm_cache/t5-3b", "tokenizer_name": "/home/hushengding/plm_cache/t5-3b", "save_total_limit": 1, @@ -139,8 +139,8 @@ BaseConfigs['t5-3b'] = { AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) AllConfigs['bitfit_t5-base'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, + "delta_type": "bitfit", + "learning_rate": 3e-4, "output_dir": "outputs/bitfit/t5-base/", }) @@ -185,16 +185,16 @@ AllConfigs['compacter_t5-base'].update({ "non_linearity": "gelu_new", #Compacter. - "hypercomplex_division": 4, + "hypercomplex_division": 4, "hypercomplex_adapters": True, "hypercomplex_nonlinearity": "glorot-uniform", - # gradient clip and clamp + # gradient clip and clamp "gradient_clip": False, "phm_clamp": False, - "normalize_phm_weight": False, + "normalize_phm_weight": False, "learn_phm": True, - # shared one side - "factorized_phm": True, + # shared one side + "factorized_phm": True, "shared_phm_rule": False, "factorized_phm_rule": False, "phm_c_init": "normal", @@ -222,16 +222,16 @@ AllConfigs['compacter++_t5-base'].update({ "non_linearity": "gelu_new", #Compacter. 
- "hypercomplex_division": 4, + "hypercomplex_division": 4, "hypercomplex_adapters": True, "hypercomplex_nonlinearity": "glorot-uniform", - # gradient clip and clamp + # gradient clip and clamp "gradient_clip": False, "phm_clamp": False, - "normalize_phm_weight": False, + "normalize_phm_weight": False, "learn_phm": True, - # shared one side - "factorized_phm": True, + # shared one side + "factorized_phm": True, "shared_phm_rule": False, "factorized_phm_rule": False, "phm_c_init": "normal", @@ -252,7 +252,7 @@ AllConfigs['low_rank_adapter_t5-base'].update({ ], "output_dir": "outputs/low_rank_adapter/t5-base/", "non_linearity": "gelu_new", - "low_rank_w_init": "glorot-uniform", + "low_rank_w_init": "glorot-uniform", "low_rank_rank": 1, }) @@ -288,8 +288,8 @@ AllConfigs['none_t5-base'].update({ AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) AllConfigs['bitfit_t5-large'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, + "delta_type": "bitfit", + "learning_rate": 3e-4, "output_dir": "outputs/bitfit/t5-large/", }) @@ -303,8 +303,8 @@ AllConfigs['none_t5-large'].update({ AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) AllConfigs['bitfit_t5-3b'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, + "delta_type": "bitfit", + "learning_rate": 3e-4, "output_dir": "outputs/bitfit/t5-3b/", }) @@ -367,7 +367,7 @@ AllConfigs['lora_t5-3b'].update({ "output_dir": "outputs/lora/t5-3b/", }) - + if __name__ == "__main__": import argparse import json @@ -405,7 +405,6 @@ if __name__ == "__main__": for job_name in all_config_jsons: with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout: json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) - - - \ No newline at end of file + + diff --git a/examples/examples_seq2seq/configs/config_gen_bs64.py b/examples/examples_seq2seq/configs/config_gen_bs64.py deleted file mode 100644 index 90426fe..0000000 --- a/examples/examples_seq2seq/configs/config_gen_bs64.py +++ /dev/null @@ -1,411 +0,0 @@ -import collections -import copy - -BS = 64 -AllConfigs = {} - -BaseConfigs = {} -BaseConfigs['t5-base'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 
100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": "t5-base", - "tokenizer_name": "t5-base", - "save_total_limit": 1, - # For glue datasets. - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. - "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - -BaseConfigs['t5-large'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": "/home/hushengding/plm_cache/t5-large", - "tokenizer_name": "/home/hushengding/plm_cache/t5-large", - "save_total_limit": 1, - # For glue datasets. - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. 
- "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - -BaseConfigs['t5-3b'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": "/home/hushengding/plm_cache/t5-3b", - "tokenizer_name": "/home/hushengding/plm_cache/t5-3b", - "save_total_limit": 1, - # For glue datasets. - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. 
- "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - -AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['bitfit_t5-base'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, - "output_dir": "outputs/bitfit/t5-base/", - }) - - - -AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['adapter_t5-base'].update({ - "delta_type": "adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "bottleneck_dim":24, - "output_dir": "outputs/adapter/t5-base/", - }) - -AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['lora_t5-base'].update({ - "delta_type": "lora", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "lora_r": 8, - "output_dir": "outputs/lora/t5-base/", - }) - -AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['compacter_t5-base'].update({ - "delta_type": "compacter", - "learning_rate": 3e-3, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "output_dir": "outputs/compacter/t5-base/", - "non_linearity": "gelu_new", - - #Compacter. - "hypercomplex_division": 4, - "hypercomplex_adapters": True, - "hypercomplex_nonlinearity": "glorot-uniform", - # gradient clip and clamp - "gradient_clip": False, - "phm_clamp": False, - "normalize_phm_weight": False, - "learn_phm": True, - # shared one side - "factorized_phm": True, - "shared_phm_rule": False, - "factorized_phm_rule": False, - "phm_c_init": "normal", - "phm_init_range": 0.0001, - "use_bias_down_sampler": True, - "use_bias_up_sampler": True, - }) - -AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['compacter++_t5-base'].update({ - "delta_type": "compacter", - "learning_rate": 3e-3, - "do_train": True, - "do_eval": True, - "do_test": True, - "modified_modules": [ - "DenseReluDense" - ], - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "output_dir": "outputs/compacter++/t5-base/", - "non_linearity": "gelu_new", - - #Compacter. 
- "hypercomplex_division": 4, - "hypercomplex_adapters": True, - "hypercomplex_nonlinearity": "glorot-uniform", - # gradient clip and clamp - "gradient_clip": False, - "phm_clamp": False, - "normalize_phm_weight": False, - "learn_phm": True, - # shared one side - "factorized_phm": True, - "shared_phm_rule": False, - "factorized_phm_rule": False, - "phm_c_init": "normal", - "phm_init_range": 0.0001, - "use_bias_down_sampler": True, - "use_bias_up_sampler": True, - }) - - -AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['low_rank_adapter_t5-base'].update({ - "delta_type": "low_rank_adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "output_dir": "outputs/low_rank_adapter/t5-base/", - "non_linearity": "gelu_new", - "low_rank_w_init": "glorot-uniform", - "low_rank_rank": 1, - }) - - -AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['soft_prompt_t5-base'].update({ - "delta_type": "soft_prompt", - "learning_rate": 3e-2, - "soft_token_num":100, - "token_init": False, - "unfrozen_modules": [ - "deltas", - ], - "output_dir": "outputs/soft_prompt/t5-base/", - }) - -AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['prefix_t5-base'].update({ - "delta_type": "prefix", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - ], - "output_dir": "outputs/prefix/t5-base/", - }) - -AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['none_t5-base'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-base/", - }) - -AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['bitfit_t5-large'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, - "output_dir": "outputs/bitfit/t5-large/", - }) - -AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['none_t5-large'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-large/", - }) - - -AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['bitfit_t5-3b'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, - "output_dir": "outputs/bitfit/t5-3b/", - }) - -AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['none_t5-3b'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-3b/", - }) - -AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['adapter_t5-3b'].update({ - "delta_type": "adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "bottleneck_dim":24, - "output_dir": "outputs/adapter/t5-3b/", - }) - -AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['adapter_t5-large'].update({ - "delta_type": "adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "bottleneck_dim":24, - "output_dir": "outputs/adapter/t5-large/", - }) - -AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['lora_t5-large'].update({ - "delta_type": "lora", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "lora_r": 8, - "output_dir": "outputs/lora/t5-large/", - }) - -AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['lora_t5-3b'].update({ - "delta_type": "lora", - "learning_rate": 
3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "lora_r": 8, - "output_dir": "outputs/lora/t5-3b/", - }) - - -if __name__ == "__main__": - import argparse - import json - import os - parser = argparse.ArgumentParser("Parser to generate configuration") - parser.add_argument("--job", type=str) - args = parser.parse_args() - - config = AllConfigs[args.job] - - Cartesian_product = [] - for key in config: - if isinstance(key, tuple): - Cartesian_product.append(key) - all_config_jsons = {} - for key_tuple in Cartesian_product: - for zipped in config[key_tuple]: - job_name = zipped[0] - all_config_jsons[job_name] = {} - for key_name, zipped_elem in zip(key_tuple, zipped): - if key_name != 'job_name': - all_config_jsons[job_name][key_name] = zipped_elem - for key in config: - if not isinstance(key, tuple): - for job_name in all_config_jsons: - if key == "output_dir": - all_config_jsons[job_name][key] = config[key] + job_name - else: - all_config_jsons[job_name][key] = config[key] - - - if not os.path.exists(f"./{args.job}_{BS}/"): - os.mkdir(f"./{args.job}_{BS}/") - - for job_name in all_config_jsons: - with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout: - json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) - - - - \ No newline at end of file diff --git a/examples/examples_seq2seq/configs/config_gen_bs8.py b/examples/examples_seq2seq/configs/config_gen_bs8.py deleted file mode 100644 index 5e48edb..0000000 --- a/examples/examples_seq2seq/configs/config_gen_bs8.py +++ /dev/null @@ -1,411 +0,0 @@ -import collections -import copy - -BS = 8 -AllConfigs = {} - -BaseConfigs = {} -BaseConfigs['t5-base'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": "t5-base", - "tokenizer_name": "t5-base", - "save_total_limit": 1, - # For glue datasets. 
- "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. - "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - -BaseConfigs['t5-large'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": "/home/hushengding/plm_cache/t5-large", - "tokenizer_name": "/home/hushengding/plm_cache/t5-large", - "save_total_limit": 1, - # For glue datasets. - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. 
- "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - -BaseConfigs['t5-3b'] = { - ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs", - "max_source_length", - "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip( - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", - "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"], - [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20], - [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128], - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - # [ 32, 32, 32, 32, 32, 16, 32] + [32] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [ 32, 32, 32, 32, 32, 16, 32] + [BS] * 8, - [0] *7 +[0] *8, - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100], - ), - "do_train": True, - "do_eval": True, - "do_test": True, - - "model_name_or_path": "/home/hushengding/plm_cache/t5-3b", - "tokenizer_name": "/home/hushengding/plm_cache/t5-3b", - "save_total_limit": 1, - # For glue datasets. - "split_validation_test": True, - "seed": 42, - "dataset_config_name": ["en"], - "eval_dataset_config_name": ["en"], - "test_dataset_config_name": ["en"], - # other configurations. - "predict_with_generate": True, - # To evaluate during training. 
- "load_best_model_at_end": True, - "metric_for_best_model": "average_metrics", - "greater_is_better": True, - "evaluation_strategy": "steps", - "overwrite_output_dir": True, - "push_to_hub": True, - "save_strategy": "steps" - } - -AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['bitfit_t5-base'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, - "output_dir": "outputs/bitfit/t5-base/", - }) - - - -AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['adapter_t5-base'].update({ - "delta_type": "adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "bottleneck_dim":24, - "output_dir": "outputs/adapter/t5-base/", - }) - -AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['lora_t5-base'].update({ - "delta_type": "lora", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "lora_r": 8, - "output_dir": "outputs/lora/t5-base/", - }) - -AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['compacter_t5-base'].update({ - "delta_type": "compacter", - "learning_rate": 3e-3, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "output_dir": "outputs/compacter/t5-base/", - "non_linearity": "gelu_new", - - #Compacter. - "hypercomplex_division": 4, - "hypercomplex_adapters": True, - "hypercomplex_nonlinearity": "glorot-uniform", - # gradient clip and clamp - "gradient_clip": False, - "phm_clamp": False, - "normalize_phm_weight": False, - "learn_phm": True, - # shared one side - "factorized_phm": True, - "shared_phm_rule": False, - "factorized_phm_rule": False, - "phm_c_init": "normal", - "phm_init_range": 0.0001, - "use_bias_down_sampler": True, - "use_bias_up_sampler": True, - }) - -AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['compacter++_t5-base'].update({ - "delta_type": "compacter", - "learning_rate": 3e-3, - "do_train": True, - "do_eval": True, - "do_test": True, - "modified_modules": [ - "DenseReluDense" - ], - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "output_dir": "outputs/compacter++/t5-base/", - "non_linearity": "gelu_new", - - #Compacter. 
- "hypercomplex_division": 4, - "hypercomplex_adapters": True, - "hypercomplex_nonlinearity": "glorot-uniform", - # gradient clip and clamp - "gradient_clip": False, - "phm_clamp": False, - "normalize_phm_weight": False, - "learn_phm": True, - # shared one side - "factorized_phm": True, - "shared_phm_rule": False, - "factorized_phm_rule": False, - "phm_c_init": "normal", - "phm_init_range": 0.0001, - "use_bias_down_sampler": True, - "use_bias_up_sampler": True, - }) - - -AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['low_rank_adapter_t5-base'].update({ - "delta_type": "low_rank_adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "output_dir": "outputs/low_rank_adapter/t5-base/", - "non_linearity": "gelu_new", - "low_rank_w_init": "glorot-uniform", - "low_rank_rank": 1, - }) - - -AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['soft_prompt_t5-base'].update({ - "delta_type": "soft_prompt", - "learning_rate": 3e-2, - "soft_token_num":100, - "token_init": False, - "unfrozen_modules": [ - "deltas", - ], - "output_dir": "outputs/soft_prompt/t5-base/", - }) - -AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['prefix_t5-base'].update({ - "delta_type": "prefix", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - ], - "output_dir": "outputs/prefix/t5-base/", - }) - -AllConfigs['none_t5-base'] = copy.deepcopy(BaseConfigs['t5-base']) -AllConfigs['none_t5-base'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-base/", - }) - -AllConfigs['bitfit_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['bitfit_t5-large'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, - "output_dir": "outputs/bitfit/t5-large/", - }) - -AllConfigs['none_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['none_t5-large'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-large/", - }) - - -AllConfigs['bitfit_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['bitfit_t5-3b'].update({ - "delta_type": "bitfit", - "learning_rate": 3e-4, - "output_dir": "outputs/bitfit/t5-3b/", - }) - -AllConfigs['none_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['none_t5-3b'].update({ - "delta_type": "none", - "learning_rate": 3e-5, - "output_dir": "outputs/none/t5-3b/", - }) - -AllConfigs['adapter_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['adapter_t5-3b'].update({ - "delta_type": "adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "bottleneck_dim":24, - "output_dir": "outputs/adapter/t5-3b/", - }) - -AllConfigs['adapter_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['adapter_t5-large'].update({ - "delta_type": "adapter", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "bottleneck_dim":24, - "output_dir": "outputs/adapter/t5-large/", - }) - -AllConfigs['lora_t5-large'] = copy.deepcopy(BaseConfigs['t5-large']) -AllConfigs['lora_t5-large'].update({ - "delta_type": "lora", - "learning_rate": 3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "lora_r": 8, - "output_dir": "outputs/lora/t5-large/", - }) - -AllConfigs['lora_t5-3b'] = copy.deepcopy(BaseConfigs['t5-3b']) -AllConfigs['lora_t5-3b'].update({ - "delta_type": "lora", - "learning_rate": 
3e-4, - "unfrozen_modules": [ - "deltas", - "layer_norm", - "final_layer_norm" - ], - "lora_r": 8, - "output_dir": "outputs/lora/t5-3b/", - }) - - -if __name__ == "__main__": - import argparse - import json - import os - parser = argparse.ArgumentParser("Parser to generate configuration") - parser.add_argument("--job", type=str) - args = parser.parse_args() - - config = AllConfigs[args.job] - - Cartesian_product = [] - for key in config: - if isinstance(key, tuple): - Cartesian_product.append(key) - all_config_jsons = {} - for key_tuple in Cartesian_product: - for zipped in config[key_tuple]: - job_name = zipped[0] - all_config_jsons[job_name] = {} - for key_name, zipped_elem in zip(key_tuple, zipped): - if key_name != 'job_name': - all_config_jsons[job_name][key_name] = zipped_elem - for key in config: - if not isinstance(key, tuple): - for job_name in all_config_jsons: - if key == "output_dir": - all_config_jsons[job_name][key] = config[key] + job_name - else: - all_config_jsons[job_name][key] = config[key] - - - if not os.path.exists(f"./{args.job}_{BS}/"): - os.mkdir(f"./{args.job}_{BS}/") - - for job_name in all_config_jsons: - with open(f"./{args.job}_{BS}/{job_name}.json", 'w') as fout: - json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True) - - - - \ No newline at end of file diff --git a/examples/examples_seq2seq/memory_data.txt b/examples/examples_seq2seq/memory_data.txt deleted file mode 100644 index e7688b5..0000000 --- a/examples/examples_seq2seq/memory_data.txt +++ /dev/null @@ -1,26 +0,0 @@ -lora 32 0.8396220207214355 3.7825655937194824 -lora 32 2.773350238800049 10.523799419403076 -lora 32 10.683510303497314 32.6428017616272 -lora 32 0.8396220207214355 3.7825236320495605 -lora 32 2.773350238800049 10.523311138153076 -adapter 32 0.8578410148620605 3.986640453338623 -adapter 32 2.821873188018799 11.039577007293701 -adapter 32 10.696877002716064 33.12049341201782 -adapter 8 0.8578410148620605 1.6147065162658691 -adapter 8 2.821873188018799 4.828186511993408 -adapter 8 10.696877002716064 16.09417200088501 -lora 8 0.8396220207214355 1.5540986061096191 -lora 8 2.773350238800049 4.664810657501221 -lora 1 0.8396220207214355 0.9107160568237305 -lora 8 10.683510303497314 15.965403079986572 -lora 64 0.8396220207214355 6.777950763702393 -lora 1 2.773350238800049 2.9350662231445312 -lora 64 2.773350238800049 18.340473651885986 -lora 1 10.683510303497314 11.131460189819336 -adapter 1 0.8578410148620605 0.9334897994995117 -lora 64 10.683510303497314 54.61024713516235 -adapter 1 2.821873188018799 2.9950332641601562 -adapter 64 0.8578410148620605 7.167330265045166 -adapter 1 10.696877002716064 11.156260967254639 -adapter 64 2.821873188018799 19.32366418838501 -adapter 64 10.696877002716064 55.56023454666138 diff --git a/examples/examples_seq2seq/run.sh b/examples/examples_seq2seq/run.sh deleted file mode 100644 index fe2c981..0000000 --- a/examples/examples_seq2seq/run.sh +++ /dev/null @@ -1,7 +0,0 @@ -files=(cola mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed) -for ((i=$1; i<=$2; i++)) -do - dataset=${files[i]} - echo "id$i:$dataset" - TOKENIZERS_PARALLELISM=false python run_seq2seq.py configs/$3/$dataset.json -done \ No newline at end of file diff --git a/examples/examples_seq2seq/run_memory.sh b/examples/examples_seq2seq/run_memory.sh deleted file mode 100644 index ad18ffa..0000000 --- a/examples/examples_seq2seq/run_memory.sh +++ /dev/null @@ -1,34 +0,0 @@ -# files=(cola 
mnli mrpc qnli qqp rte sst2 stsb superglue-boolq superglue-cb superglue-copa superglue-multirc superglue-record superglue-wic superglue-wsc.fixed) -# for ((i=$1; i<=$2; i++)) -# do -# dataset=${files[i]} -# echo "id$i:$dataset" -# TOKENIZERS_PARALLELISM=false python run_seq2seq.py configs/$3/$dataset.json -# done - -cd configs - -for deltatype in "lora" "adapter" -do - for modeltype in "t5-base" "t5-large" "t5-3b" - do - echo $deltatype - python config_gen_bs$2.py --job $deltatype\_$modeltype - done -done - -ls -cd .. - -for deltatype in "lora" "adapter" -do - for modeltype in "t5-base" "t5-large" "t5-3b" - do - CUDA_VISIBLE_DEVICES=$1 bash run.sh 2 2 $deltatype\_$modeltype\_$2 - done -done - - - - - diff --git a/examples/examples_seq2seq/run_seq2seq.py b/examples/examples_seq2seq/run_seq2seq.py index 0062150..defa655 100644 --- a/examples/examples_seq2seq/run_seq2seq.py +++ b/examples/examples_seq2seq/run_seq2seq.py @@ -19,9 +19,9 @@ Fine-tuning the library models for sequence to sequence. import functools import logging # from opendelta.utils.delta_center import create_hub_repo_name -import torch +import torch import os -os.environ['MKL_THREADING_LAYER'] = 'GNU' +os.environ['MKL_THREADING_LAYER'] = 'GNU' os.environ['MKL_SERVICE_FORCE_INTEL'] = '1' import sys import subprocess @@ -43,14 +43,14 @@ from transformers.trainer_utils import is_main_process, get_last_checkpoint from examples_seq2seq.data_processors import AutoTask, TaskDataCollatorForSeq2Seq, AutoPostProcessor from examples_seq2seq.seq2seq_trainer import Seq2SeqTrainer # from training_args import AdapterTrainingArguments -from examples_seq2seq.trainers.trainer_utils import save_training_config +from examples_seq2seq.trainers.trainer_utils import save_training_config from dataclasses import dataclass, field from transformers.models.t5.modeling_t5 import T5Config, T5ForConditionalGeneration from examples_seq2seq.trainers.model_args import ModelArguments from examples_seq2seq.trainers.trainer_args import TrainingArguments, DataTrainingArguments -import tensorboardX +import tensorboardX tb_writer = tensorboardX.SummaryWriter("Delta_Memory") logger = logging.getLogger(__name__) @@ -100,7 +100,7 @@ class RemainArgHfArgumentParser(HfArgumentParser): inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys} obj = dtype(**inputs) outputs.append(obj) - + remain_args = argparse.ArgumentParser() remain_args.__dict__.update(data) if return_remaining_args: @@ -108,41 +108,41 @@ class RemainArgHfArgumentParser(HfArgumentParser): else: return (*outputs,) -from transformers.trainer_callback import TrainerCallback +# from transformers.trainer_callback import TrainerCallback -class MyCallback(TrainerCallback): - def __init__(self, *args, **kwargs): - self.delta_args = kwargs.pop("delta_args") - self.trainer_args = kwargs.pop("trainer_args") - self.model_args = kwargs.pop("model_args") - super(MyCallback, self).__init__(*args, **kwargs) - - - maxcudamem = 0 - def on_step_end(self, args, state, control, **kwargs ): - glb_step = state.global_step - cudamem = 0 - realcudamem =0 - for device_id in range(torch.cuda.device_count()): - cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3 - realcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3 - torch.cuda.reset_peak_memory_stats(f"cuda:{device_id}") - self.maxcudamem = max(self.maxcudamem, realcudamem) - self.cudamem = cudamem - # self.tb_writer.add_scalar("Static Memory (GB)", cudamem, glb_step) +# class MyCallback(TrainerCallback): +# def __init__(self, 
*args, **kwargs): +# self.delta_args = kwargs.pop("delta_args") +# self.trainer_args = kwargs.pop("trainer_args") +# self.model_args = kwargs.pop("model_args") +# super(MyCallback, self).__init__(*args, **kwargs) + + +# maxcudamem = 0 +# def on_step_end(self, args, state, control, **kwargs ): +# glb_step = state.global_step +# cudamem = 0 +# realcudamem =0 +# for device_id in range(torch.cuda.device_count()): +# cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3 +# realcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3 +# torch.cuda.reset_peak_memory_stats(f"cuda:{device_id}") +# self.maxcudamem = max(self.maxcudamem, realcudamem) +# self.cudamem = cudamem +# # self.tb_writer.add_scalar("Static Memory (GB)", cudamem, glb_step) # self.tb_writer.add_scalar("Runtime Memory (GB)", realcudamem, glb_step) # self.tb_writer.add_scalar("Peak Memory (GB)", self.maxcudamem, glb_step) - if glb_step > 50: - content = f"{self.delta_args.delta_type}\t{self.trainer_args.per_device_train_batch_size}\t{self.model_args.model_name_or_path}\t{self.cudamem}\t{self.maxcudamem}\n" - with open("memory_data.txt", 'a') as fout: - fout.write(content) - exit() - - + # if glb_step > 50: + # content = f"{self.delta_args.delta_type}\t{self.trainer_args.per_device_train_batch_size}\t{self.model_args.model_name_or_path}\t{self.cudamem}\t{self.maxcudamem}\n" + # with open("memory_data.txt", 'a') as fout: + # fout.write(content) + # exit() + + + + - - @@ -172,7 +172,7 @@ def main(): "Use --overwrite_output_dir to overcome." ) ''' - pass + pass elif last_checkpoint is not None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " @@ -273,7 +273,7 @@ def main(): # Temporarily set max_target_length for training. #max_target_length = data_args.max_target_length padding = "max_length" if data_args.pad_to_max_length else False - + def preprocess_function(examples, max_target_length): # max_target_length += 1 # model_inputs = tokenizer([s+"" for s in examples['source']], max_length=data_args.max_source_length, @@ -301,7 +301,7 @@ def main(): if training_args.do_train: train_datasets = [AutoTask.get(dataset_name, dataset_config_name, - seed=data_args.data_seed).get( + seed=data_args.data_sample_seed).get( split="train", split_validation_test=training_args.split_validation_test, add_prefix=True, @@ -320,11 +320,11 @@ def main(): load_from_cache_file=not data_args.overwrite_cache, ) train_dataset = concatenate_datasets(train_datasets) - + if training_args.do_eval: eval_datasets = {eval_dataset: AutoTask.get(eval_dataset, eval_dataset_config, - seed=data_args.data_seed).get( - split="validation", + seed=data_args.data_sample_seed).get( + split="validation", split_validation_test=training_args.split_validation_test, add_prefix=True, n_obs=data_args.max_val_samples) @@ -343,8 +343,8 @@ def main(): if training_args.do_test: test_datasets = {test_dataset: AutoTask.get(test_dataset, test_dataset_config, - seed=data_args.data_seed).get( - split="test", + seed=data_args.data_sample_seed).get( + split="test", split_validation_test=training_args.split_validation_test, add_prefix=True, n_obs=data_args.max_test_samples) @@ -379,10 +379,10 @@ def main(): # Extracts the extra information needed to evaluate on each dataset. # These information are only used in the compute_metrics. - # We will assume that the test/eval dataloader does not change the order of + # We will assume that the test/eval dataloader does not change the order of # the data. 
     data_info = {"eval": eval_datasets[data_args.eval_dataset_name[0]]['extra_fields'],
-                 "test": test_datasets[data_args.test_dataset_name[0]]['extra_fields'],
+                 "test": test_datasets[data_args.test_dataset_name[0]]['extra_fields'],
                  "train": train_dataset['extra_fields']}
     def compute_metrics(eval_preds):
         preds, labels, data_info = eval_preds
@@ -409,10 +409,10 @@ def main():
         evaluation_metrics = TASK_TO_METRICS[data_args.dataset_name[0]],
     )
-    trainer.add_callback(MyCallback(trainer_args=training_args, delta_args=delta_args, model_args=model_args))
+    # trainer.add_callback(MyCallback(trainer_args=training_args, delta_args=delta_args, model_args=model_args))
-    # Saves training config.
+    # Saves training config.
     if trainer.is_world_process_zero():
         os.makedirs(training_args.output_dir, exist_ok=True)
         save_training_config(sys.argv[1], training_args.output_dir)
@@ -430,15 +430,15 @@ def main():
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
-
+
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
-
+
        if training_args.compute_time:
            end.record()
            torch.cuda.synchronize()  # wait for all_reduce to complete
            total_time = start.elapsed_time(end)/(1000*60)
            performance_metrics.update({"total_time in minutes ": total_time})
-
+
        trainer.save_model()  # Saves the tokenizer too for easy upload
        train_metrics = train_result.metrics
        max_train_samples = (
@@ -460,7 +460,7 @@ def main():
    if training_args.compute_memory or training_args.compute_time:
        print(performance_metrics)
        trainer.save_metrics("performance", performance_metrics)
-
+
    # Evaluation
    results = {}
    if training_args.do_eval:
@@ -484,9 +484,9 @@ def main():
        trainer.log_metrics("test", metrics)
        trainer.save_metrics("test", metrics)
        results['test'] = metrics
-
+
    repo_name = create_hub_repo_name(root="DeltaHub",
-                        dataset=data_args.task_name,
+                        dataset=data_args.task_name,
                         delta_type = delta_args.delta_type,
                         model_name_or_path= model_args.model_name_or_path)
    results['repo_name'] = repo_name
diff --git a/examples/examples_seq2seq/seq2seq_trainer.py b/examples/examples_seq2seq/seq2seq_trainer.py
index 8f31e54..e557844 100644
--- a/examples/examples_seq2seq/seq2seq_trainer.py
+++ b/examples/examples_seq2seq/seq2seq_trainer.py
@@ -5,21 +5,8 @@ from typing import Any, Dict, List, Optional, Tuple, Union
 from torch.utils.data.dataset import Dataset
 from transformers import Seq2SeqTrainer as HfSeq2SeqTrainner
-from examples_seq2seq.trainers.trainer import BaseTrainer
+from examples_seq2seq.trainers.trainer import BaseTrainer
- # if is_sagemaker_mp_enabled():
-# import smdistributed.modelparallel.torch as smp
-
-# from transformers.trainer_utils import ShardedDDPOption
-
-# if is_fairscale_available():
-# dep_version_check("fairscale")
-# import fairscale
-# from fairscale.nn.data_parallel import FullyShardedDataParallel as FullyShardedDDP
-# from fairscale.nn.data_parallel import ShardedDataParallel as ShardedDDP
-# from fairscale.nn.wrap import auto_wrap
-# from fairscale.optim import OSS
-# from fairscale.optim.grad_scaler import ShardedGradScaler
 from transformers.optimization import Adafactor, AdamW, get_scheduler
 from transformers.trainer_pt_utils import get_parameter_names, is_sagemaker_mp_enabled
@@ -121,7 +108,7 @@ class Seq2SeqTrainer(HfSeq2SeqTrainner, BaseTrainer):
         return (loss, generated_tokens, labels)
-
-
-
-
+
+
+
+
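For reference only (not part of the patch): the compute_time branch in the run_seq2seq.py diff above measures training wall-clock time with CUDA events. A minimal sketch of that timing pattern, assuming a CUDA device is available:

# Hedged sketch: timing GPU work with CUDA events (illustrative, not part of this patch).
import torch

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()
# ... GPU work (e.g. a training loop) goes here ...
end.record()

torch.cuda.synchronize()  # wait for all queued kernels to finish before reading the timer
minutes = start.elapsed_time(end) / (1000 * 60)  # elapsed_time returns milliseconds
print(f"total time: {minutes:.2f} minutes")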
diff --git a/examples/examples_seq2seq/trainers/trainer_args.py b/examples/examples_seq2seq/trainers/trainer_args.py
index 4f30e79..7da768d 100644
--- a/examples/examples_seq2seq/trainers/trainer_args.py
+++ b/examples/examples_seq2seq/trainers/trainer_args.py
@@ -1,6 +1,6 @@
 from dataclasses import dataclass, field
 from typing import Optional, List
-from transformers import Seq2SeqTrainingArguments
+from transformers import Seq2SeqTrainingArguments
 # run_seq2seq parameters.
 @dataclass
@@ -127,8 +127,9 @@ class DataTrainingArguments:
         default=None,
         metadata={"help": "Defines a dictionary from tasks to the tasks embeddings."}
     )
-    data_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."})
-
+    data_sample_seed: Optional[int] = field(default=42, metadata={"help": "seed used to shuffle the data."})
+
+
     model_parallel: Optional[bool] = field(default=False, metadata={"help": "whether apply model parallelization"})
     def __post_init__(self):
diff --git a/opendelta/delta_models/soft_prompt.py b/opendelta/delta_models/soft_prompt.py
index d3baa9b..c682132 100644
--- a/opendelta/delta_models/soft_prompt.py
+++ b/opendelta/delta_models/soft_prompt.py
@@ -1,4 +1,3 @@
-from examples_prompt.metrics.metrics import exact_match
 from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func
 from opendelta.utils.name_based_addressing import *
 from opendelta.utils.cuda import get_device
@@ -7,8 +6,6 @@ from typing import *
 import torch
 import torch.nn as nn
 from opendelta import BaseDeltaConfig
-from decorator import decorate
-import torch.nn.functional as F
 from opendelta import logging
 logger = logging.get_logger(__name__)
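For reference only (not part of the patch): renaming the dataclass field data_seed to data_sample_seed in trainer_args.py also renames the command-line flag that HfArgumentParser derives from it. A minimal sketch of that mechanism, using a stripped-down stand-in for DataTrainingArguments:

# Hedged sketch: dataclass fields become CLI flags via HfArgumentParser (illustrative, not part of this patch).
from dataclasses import dataclass, field
from typing import Optional
from transformers import HfArgumentParser

@dataclass
class DataArgs:  # hypothetical stand-in for DataTrainingArguments
    data_sample_seed: Optional[int] = field(
        default=42, metadata={"help": "seed used to shuffle the data."}
    )

parser = HfArgumentParser(DataArgs)
(args,) = parser.parse_args_into_dataclasses(["--data_sample_seed", "7"])
print(args.data_sample_seed)  # 7; the old --data_seed flag no longer exists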