Merge pull request #13 from ShengdingHu/v1.0.0

V1.0.0 prepare1
This commit is contained in:
DingDing 2022-04-22 19:38:20 +08:00 committed by GitHub
commit b3935927dc
730 changed files with 5630 additions and 502680 deletions

.gitignore vendored

@@ -3,7 +3,7 @@ data/
logs/*
experiments/logs
!logs/.gitkeep
datasets/*
datasets/*
!datasets/*.sh
.vscode/
*.egg-info/
@@ -17,17 +17,17 @@ _build/
outputs/
log.txt
**/DeltaHub/
**/sfs_scripts/
*beans/
**/examples/*/configs/
**/examples/*/configs/*
!examples/*/configs/config_gen.py
**/jupyter_notebook_examples/
!examples/jupyter_notebook_examples/*.py
!**/examples/*/configs/config_gen.py
!examples/*/configs/*.py
**/outputs_search/**/*.bin
**/outputs_search/**/*.pt
*.db
**/nohup.out
**/examples/examples_bmtrain/BigModels/down_data

BIN
dist/opendelta-0.0.4-py3-none-any.whl vendored Normal file

Binary file not shown.

BIN
dist/opendelta-0.0.4.tar.gz vendored Normal file

Binary file not shown.

@@ -1 +0,0 @@
Subproject commit 058e5f25c898a1f956e3f17a0db6d62f08173e7f

@@ -1 +0,0 @@
Subproject commit 3a5083d61e73bae607574a3047deafaa76b97646


@@ -1,50 +0,0 @@
<!---
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Using OpenDelta with the Vision Transformer (ViT)
This example adapts the [HuggingFace image classification example](https://github.com/huggingface/transformers/tree/master/examples/pytorch/image-classification) by adding a few lines to the original scripts.
## Usage
### 1. Install the necessary packages
```shell
pip install Pillow
pip install torchvision
pip install transformers==4.16.2
pip install datasets==1.18.0
```
### 2. Run
```bash
python run_image_classification.py configs/lora_beans.json
```
Do not forget to reinstall `datasets==1.17.0` afterwards for the other examples. :)
## Possible Errors
1. Dataset connection error
Solution 1: open a Python console and run the failing command again; this may or may not help.
Solution 2: download the dataset yourself on a machine with internet access, save it to disk, transfer it to your server, and finally load it with `load_from_disk`, as sketched below.
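A minimal sketch of the offline workaround, assuming the `beans` dataset used in this example (paths are illustrative):
```python
from datasets import load_dataset, load_from_disk

# On a machine with internet access: download and save the dataset.
ds = load_dataset("beans")
ds.save_to_disk("./beans_saved")  # copy this folder to your server

# On the offline server: load the saved copy instead of calling load_dataset.
ds = load_from_disk("./beans_saved")
```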
## Link to original training scripts
You may find solutions to other questions about the scripts that are unrelated to OpenDelta at
https://github.com/huggingface/transformers/tree/master/examples/pytorch/image-classification


@@ -1,30 +0,0 @@
{
"report_to": "none",
"dataset_name": "beans",
"output_dir": "./beans_outputs/",
"do_train": true,
"do_eval": true,
"num_train_epochs": 5,
"remove_unused_columns": false,
"per_device_train_batch_size": 8,
"per_device_eval_batch_size": 8,
"logging_strategy": "steps",
"logging_steps": 10,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"load_best_model_at_end": true,
"save_total_limit": 3,
"seed": 1337,
"delta_type": "lora",
"modified_modules": [
"attention.query",
"attention.value"
],
"unfrozen_modules": [
"classifier",
"deltas"
],
"overwrite_output_dir": true,
"learning_rate": 5e-4
}
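For context, the `delta_type`, `modified_modules`, and `unfrozen_modules` fields above are consumed by the training script roughly as follows (a sketch mirroring the OpenDelta lines in `run_image_classification.py` shown later in this diff; `model` and `delta_args` come from that script):
```python
from opendelta import AutoDeltaConfig, AutoDeltaModel

# delta_args carries the delta-related keys from the JSON config above.
delta_config = AutoDeltaConfig.from_dict(vars(delta_args))
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
delta_model.freeze_module(set_state_dict=True)  # freeze everything except unfrozen_modules
delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True)
```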


@@ -1,89 +0,0 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Accuracy metric."""
from sklearn.metrics import accuracy_score
import datasets
_DESCRIPTION = """
Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
TP: True positive
TN: True negative
FP: False positive
FN: False negative
"""
_KWARGS_DESCRIPTION = """
Args:
predictions: Predicted labels, as returned by a model.
references: Ground truth labels.
normalize: If False, return the number of correctly classified samples.
Otherwise, return the fraction of correctly classified samples.
sample_weight: Sample weights.
Returns:
accuracy: Accuracy score.
Examples:
>>> accuracy_metric = datasets.load_metric("accuracy")
>>> results = accuracy_metric.compute(references=[0, 1], predictions=[0, 1])
>>> print(results)
{'accuracy': 1.0}
"""
_CITATION = """\
@article{scikit-learn,
title={Scikit-learn: Machine Learning in {P}ython},
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
journal={Journal of Machine Learning Research},
volume={12},
pages={2825--2830},
year={2011}
}
"""
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Accuracy(datasets.Metric):
def _info(self):
return datasets.MetricInfo(
description=_DESCRIPTION,
citation=_CITATION,
inputs_description=_KWARGS_DESCRIPTION,
features=datasets.Features(
{
"predictions": datasets.Sequence(datasets.Value("int32")),
"references": datasets.Sequence(datasets.Value("int32")),
}
if self.config_name == "multilabel"
else {
"predictions": datasets.Value("int32"),
"references": datasets.Value("int32"),
}
),
reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"],
)
def _compute(self, predictions, references, normalize=True, sample_weight=None):
return {
"accuracy": float(
accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight)
)
}


@@ -1,3 +0,0 @@
# torch>=1.5.0
torchvision>=0.6.0
datasets>=1.8.0


@@ -1,392 +0,0 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
import numpy as np
import torch
from datasets import load_dataset
from PIL import Image
from torchvision.transforms import (
CenterCrop,
Compose,
Normalize,
RandomHorizontalFlip,
RandomResizedCrop,
Resize,
ToTensor,
)
import transformers
from transformers import (
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
AutoConfig,
AutoFeatureExtractor,
AutoModelForImageClassification,
HfArgumentParser,
Trainer,
TrainingArguments,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version
""" Fine-tuning a 🤗 Transformers model for image classification"""
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.16.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
MODEL_CONFIG_CLASSES = list(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
def pil_loader(path: str):
with open(path, "rb") as f:
im = Image.open(f)
return im.convert("RGB")
@dataclass
class DataTrainingArguments:
"""
Arguments pertaining to what data we are going to input our model for training and eval.
Using ``HfArgumentParser`` we can turn this class
into argparse arguments to be able to specify them on
the command line.
"""
dataset_name: Optional[str] = field(
default="nateraw/image-folder", metadata={"help": "Name of a dataset from the datasets package"}
)
dataset_config_name: Optional[str] = field(
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
)
train_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the training data."})
validation_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the validation data."})
train_val_split: Optional[float] = field(
default=0.15, metadata={"help": "Percent to split off of train for validation."}
)
max_train_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
"value if set."
},
)
max_eval_samples: Optional[int] = field(
default=None,
metadata={
"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
"value if set."
},
)
def __post_init__(self):
data_files = dict()
if self.train_dir is not None:
data_files["train"] = self.train_dir
if self.validation_dir is not None:
data_files["val"] = self.validation_dir
self.data_files = data_files if data_files else None
class RemainArgHfArgumentParser(HfArgumentParser):
def parse_json_file(self, json_file: str, return_remaining_args=True):
"""
Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the
dataclass types.
"""
import argparse
import json
from pathlib import Path
import dataclasses
data = json.loads(Path(json_file).read_text())
outputs = []
for dtype in self.dataclass_types:
keys = {f.name for f in dataclasses.fields(dtype) if f.init}
inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys}
obj = dtype(**inputs)
outputs.append(obj)
remain_args = argparse.ArgumentParser()  # used here as a plain attribute container for the leftover config keys
remain_args.__dict__.update(data)
if return_remaining_args:
return (*outputs, remain_args)
else:
return (*outputs,)
@dataclass
class ModelArguments:
"""
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
"""
model_name_or_path: str = field(
default="google/vit-base-patch16-224-in21k",
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
)
model_type: Optional[str] = field(
default=None,
metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
)
config_name: Optional[str] = field(
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
)
model_revision: str = field(
default="main",
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
)
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
use_auth_token: bool = field(
default=False,
metadata={
"help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
"with private models)."
},
)
def collate_fn(examples):
pixel_values = torch.stack([example["pixel_values"] for example in examples])
labels = torch.tensor([example["labels"] for example in examples])
return {"pixel_values": pixel_values, "labels": labels}
def main():
# See all possible arguments in src/transformers/training_args.py
# or by passing the --help flag to this script.
# We now keep distinct sets of args, for a cleaner separation of concerns.
parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
# If we pass only one argument to the script and it's the path to a json file,
# let's parse it to get our arguments.
model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
else:
model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses()
# Setup logging
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
handlers=[logging.StreamHandler(sys.stdout)],
)
log_level = training_args.get_process_log_level()
logger.setLevel(log_level)
transformers.utils.logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()
# Log on each process the small summary:
logger.warning(
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
)
logger.info(f"Training/evaluation parameters {training_args}")
# Detecting last checkpoint.
last_checkpoint = None
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
last_checkpoint = get_last_checkpoint(training_args.output_dir)
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
raise ValueError(
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
"Use --overwrite_output_dir to overcome."
)
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
logger.info(
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
)
# Initialize our dataset and prepare it for the 'image-classification' task.
ds = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
data_files=data_args.data_files,
cache_dir=model_args.cache_dir,
task="image-classification",
)
# If you encounter an error here, try downloading the dataset yourself and loading it
# from disk, as in the following two lines:
# from datasets import load_from_disk
# ds = load_from_disk(f"../../../../huggingface_datasets/saved_to_disk/{data_args.dataset_name}")
# If we don't have a validation split, split off a percentage of train as validation.
data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split
if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
split = ds["train"].train_test_split(data_args.train_val_split)
ds["train"] = split["train"]
ds["validation"] = split["test"]
# Prepare label mappings.
# We'll include these in the model's config to get human readable labels in the Inference API.
labels = ds["train"].features["labels"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
label2id[label] = str(i)
id2label[str(i)] = label
# Load the accuracy metric from the datasets package
# metric = datasets.load_metric("accuracy")
metric = datasets.load_metric("metric.py")
# Define our compute_metrics function. It takes an ``EvalPrediction`` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p):
"""Computes accuracy on a batch of predictions"""
return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)
config = AutoConfig.from_pretrained(
model_args.config_name or model_args.model_name_or_path,
num_labels=len(labels),
label2id=label2id,
id2label=id2label,
finetuning_task="image-classification",
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForImageClassification.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
feature_extractor = AutoFeatureExtractor.from_pretrained(
model_args.feature_extractor_name or model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
if delta_args.delta_type.lower() != "none":
from opendelta import AutoDeltaConfig,AutoDeltaModel
delta_config = AutoDeltaConfig.from_dict(vars(delta_args))
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
delta_model.freeze_module(set_state_dict = True)
delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True)
# Define torchvision transforms to be applied to each image.
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
_train_transforms = Compose(
[
RandomResizedCrop(feature_extractor.size),
RandomHorizontalFlip(),
ToTensor(),
normalize,
]
)
_val_transforms = Compose(
[
Resize(feature_extractor.size),
CenterCrop(feature_extractor.size),
ToTensor(),
normalize,
]
)
def train_transforms(example_batch):
"""Apply _train_transforms across a batch."""
example_batch["pixel_values"] = [
_train_transforms(pil_img.convert("RGB")) for pil_img in example_batch["image"]
]
return example_batch
def val_transforms(example_batch):
"""Apply _val_transforms across a batch."""
example_batch["pixel_values"] = [_val_transforms(pil_img.convert("RGB")) for pil_img in example_batch["image"]]
return example_batch
if training_args.do_train:
if "train" not in ds:
raise ValueError("--do_train requires a train dataset")
if data_args.max_train_samples is not None:
ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples))
# Set the training transforms
ds["train"].set_transform(train_transforms)
if training_args.do_eval:
if "validation" not in ds:
raise ValueError("--do_eval requires a validation dataset")
if data_args.max_eval_samples is not None:
ds["validation"] = (
ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
)
# Set the validation transforms
ds["validation"].set_transform(val_transforms)
# Initialize our trainer
trainer = Trainer(
model=model,
args=training_args,
train_dataset=ds["train"] if training_args.do_train else None,
eval_dataset=ds["validation"] if training_args.do_eval else None,
compute_metrics=compute_metrics,
tokenizer=feature_extractor,
data_collator=collate_fn,
)
# Training
if training_args.do_train:
checkpoint = None
if training_args.resume_from_checkpoint is not None:
checkpoint = training_args.resume_from_checkpoint
elif last_checkpoint is not None:
checkpoint = last_checkpoint
train_result = trainer.train(resume_from_checkpoint=checkpoint)
trainer.save_model()
trainer.log_metrics("train", train_result.metrics)
trainer.save_metrics("train", train_result.metrics)
trainer.save_state()
# Evaluation
if training_args.do_eval:
metrics = trainer.evaluate()
trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)
# Write model card and (optionally) push to hub
kwargs = {
"finetuned_from": model_args.model_name_or_path,
"tasks": "image-classification",
"dataset": data_args.dataset_name,
"tags": ["image-classification"],
}
if training_args.push_to_hub:
trainer.push_to_hub(**kwargs)
else:
trainer.create_model_card(**kwargs)
if __name__ == "__main__":
main()


@@ -10,55 +10,16 @@ This will add `examples_seq2seq` to the environment path of the python lib.
## Generating the json configuration file
```shell
python configs/gen_$BACKBONETYPE.py --job $YOURJOB
#e.g. python configs/gen_beit.py --job lora_beit-base-patch16-224
```
The available job configurations (e.g., `--job lora_beit-base-patch16-224`) can be seen in the scripts. You can also
create your own configuration, as sketched below.
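The simplest route is to mirror how the `gen_*.py` scripts build `AllConfigs` entries: deep-copy a base config and override fields (a sketch; the job name and learning rate here are illustrative):
```python
import copy

# Hypothetical custom job built from the beit base config defined in configs/gen_beit.py.
AllConfigs['my_lora_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['my_lora_beit-base-patch16-224'].update({
    "delta_type": "lora",
    "learning_rate": 1e-4,  # illustrative value
    "modified_modules": ["query", "value"],
    "unfrozen_modules": ["deltas", "layernorm_after", "classifier"],
    "output_dir": "outputs/my_lora/beit-base-patch16-224/",
})
```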
## Run the code
```
CUDA_VISIBLE_DEVICES=1 python src/run.py configs/lora_beit-base-patch16-224/beans.json
```
## Possible Errors
1.
```
ValueError: You must login to the Hugging Face hub on this computer by typing `transformers-cli login` and entering your credentials to use `use_auth_token=True`. Alternatively, you can pass your own token as the `use_auth_token` argument.
```
- Solution 1: Register an account on [HuggingFace](https://huggingface.co/),
then run `transformers-cli login` on your command line and enter your username and password.
- Solution 2: Disable pushing by setting `"push_to_hub": false` in the config JSON.
2.
```
OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).
```
- Solution 1:
```
wget -P ~ https://github.com/git-lfs/git-lfs/releases/download/v3.0.2/git-lfs-linux-amd64-v3.0.2.tar.gz
cd ~
tar -xvzf git-lfs-linux-amd64-v3.0.2.tar.gz
export PATH=~:$PATH
git-lfs install
```
- Solution 2: Disable pushing by setting `"push_to_hub": false` in the config JSON.
3. Dataset connection error
Solution 1: open a Python console and run the failing command again; this may or may not help.
Solution 2: download the dataset yourself on a machine with internet access, save it to disk, transfer it to your server, and finally load it with `load_from_disk` (see the sketch in the image-classification README above).
## Link to the original training scripts
This example repo is based on the [compacter training scripts](https://github.com/rabeehk/compacter), with compacter-related lines removed. Thanks to the authors of the original repo. In addition, in private correspondence, the authors shared the code used to create the json configs. Thanks again for their efforts.


@@ -0,0 +1,145 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
AutoConfig,
AutoFeatureExtractor,
AutoModelForImageClassification,
)
from transformers import ViTFeatureExtractor
from transformers import Trainer as HfTrainer
import torch.nn as nn
def process_example(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
inputs = tokenizer(raw_example['image'], return_tensors='pt')
inputs['labels'] = raw_example['labels']
return inputs
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
# from openpromptu.prompts import ManualVerbalizer
# from openpromptu.prompts import ManualTemplate
# from openpromptu import TokenizerWrapper
# template = ManualTemplate(text = task.templates_text[template_id])
# verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
# tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return None, None, None
def preprocess_function(raw_example, **kwargs):
# from IPython import embed; embed(header="Therefa")
tokenizer = kwargs['tokenizer']
model_inputs = tokenizer(raw_example['image'], return_tensors='pt')
model_inputs['pixel_values'] = model_inputs['pixel_values'].squeeze()
model_inputs['labels'] = raw_example['labels']
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
# dataset_features.pop("label")
print("remove_columns: {}".format(dataset_features))
return dataset_features
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
# from IPython import embed; embed(header="in data collator")
a = torch_default_data_collator(features=features)
# from IPython import embed; embed(header="in data collator")
return a
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoFeatureExtractor.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForImageClassification.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.num_labels = model_args.num_classes
old_classifier = model.classifier
model.classifier = nn.Linear(old_classifier.in_features, config.num_labels)
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
self.loss_fn = nn.CrossEntropyLoss()
def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop('labels')
outputs = model(**inputs)
logits = outputs.get("logits")
loss = self.loss_fn(logits, labels)
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
# from IPython import embed; embed(header="In compute metrics")
return result


@@ -0,0 +1,141 @@
from openpromptu.data_utils import InputExample
import torch
from transformers.data.data_collator import torch_default_data_collator
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
import numpy as np
from transformers import (
AutoConfig,
AutoModelForMaskedLM,
AutoTokenizer,
)
from transformers import Trainer as HfTrainer
def preprocess_function(raw_example, **kwargs):
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
example = InputExample(**raw_example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
padding="max_length", truncation=True)
return model_inputs
def compute_metrics(eval_preds, dataset_name, eval_metric):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in eval_metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result
def mask_token_func(tokenizer, ith_mask=0):
return tokenizer.mask_token
def get_remove_columns(dataset_features):
dataset_features.pop("label")
return dataset_features
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import ManualVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
class DataCollator(HfDataCollatorMixin):
def __init__(self, *args, **kwargs):
self.return_tensors='pt'
def torch_call(self, features):
return torch_default_data_collator(features=features)
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForMaskedLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model.resize_token_embeddings(len(tokenizer))
return config, tokenizer, model
class Trainer(HfTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.verbalizer=verbalizer
self.eval_task=eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
labels = inputs.pop('labels')
outputs = model(**inputs)
logits = outputs.get("logits")
input_ids = inputs['input_ids']
verbalizer = self.verbalizer.cuda()
logits_at_mask = logits[torch.where(input_ids == verbalizer.tokenizer.mask_token_id)]
label_logits = verbalizer.process_logits(logits_at_mask)
loss_fct = torch.nn.CrossEntropyLoss()
loss = loss_fct(label_logits, labels)
outputs.logits = label_logits
return (loss, outputs) if return_outputs else loss
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds.predictions, eval_preds.label_ids
preds = np.argmax(preds, axis=-1)
result = {}
average_metrics = []
for metric in self.eval_task.metric:
metric_item = metric(preds, labels)
metric_value = list(metric_item.values())
result.update(metric_item)
average_metrics.extend(metric_value)
print("average:",average_metrics)
average_metric = sum(average_metrics)/len(average_metrics)
result.update({"average_metrics":average_metric})
return result


@@ -0,0 +1,178 @@
from openpromptu.data_utils import InputExample
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
from transformers import (
AutoConfig,
AutoModelForSeq2SeqLM,
AutoTokenizer,
)
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
import torch
def mask_token_func(tokenizer, ith_mask):
return tokenizer.additional_special_tokens[ith_mask]
def get_remove_columns(dataset_features):
return dataset_features
def preprocess_function(raw_example, **kwargs):
# max_target_length += 1
tokenizer = kwargs['tokenizer']
data_args = kwargs['data_args']
template = kwargs['template']
verbalizer = kwargs['verbalizer']
tokenizer_wrapper = kwargs['tokenizer_wrapper']
split = kwargs['split']
example = InputExample(**raw_example)
try:
example = verbalizer.wrap_one_example(example)
example, other = template.wrap_one_example(example)
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
model_inputs = tokenizer(input_sentence, max_length=256,
padding="max_length", truncation=True)
except Exception:
# from IPython import embed; embed(header="Therer")
raise  # re-raise instead of dropping into a debug shell
with tokenizer.as_target_tokenizer():
label = tokenizer(other['tgt_text']).input_ids
model_inputs["labels"] = label
return model_inputs
def get_backbone(model_args, **kwargs):
config = AutoConfig.from_pretrained(
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
config.dropout_rate = 0.0
tokenizer = AutoTokenizer.from_pretrained(
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
use_fast=model_args.use_fast_tokenizer,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
use_auth_token=True if model_args.use_auth_token else None,
)
return config, tokenizer, model
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
from openpromptu.prompts import GenerationVerbalizer
from openpromptu.prompts import ManualTemplate
from openpromptu import TokenizerWrapper
template = ManualTemplate(text = task.templates_text[template_id])
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
return template, verbalizer, tokenizer_wrapper
class Trainer(HfSeq2SeqTrainer):
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
super().__init__(**kwargs)
self.eval_task = eval_task
self.compute_metrics = self._compute_metrics
def compute_loss(self, model, inputs, return_outputs=False):
outputs = model(**inputs)
if return_outputs:
return (outputs.loss, outputs)
else:
return outputs.loss
def prediction_step(
self,
model, #nn.Module,
inputs, #Dict[str, Union[torch.Tensor, Any]],
prediction_loss_only, #: bool,
ignore_keys, #: Optional[List[str]] = None,
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
"""
Perform an evaluation step on :obj:`model` using obj:`inputs`.
Subclass and override to inject custom behavior.
Args:
model (:obj:`nn.Module`):
The model to evaluate.
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
The inputs and targets of the model.
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
prediction_loss_only (:obj:`bool`):
Whether or not to return the loss only.
Return:
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
labels (each being optional).
"""
if not self.args.predict_with_generate or prediction_loss_only:
return super().prediction_step(
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
)
has_labels = "labels" in inputs
inputs = self._prepare_inputs(inputs)
gen_kwargs = {
"max_length": 10, # self._max_length if s is not None else self.model.config.max_length,
"num_beams": 1 #self._num_beams if self._num_beams is not None else self.model.config.num_beams,
}
generated_tokens = self.model.generate(
inputs["input_ids"],
attention_mask=inputs["attention_mask"],
**gen_kwargs,
)
# in case the batch is shorter than max length, the output should be padded
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
with torch.no_grad():
outputs = model(**inputs)
if has_labels:
if self.label_smoother is not None:
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
else:
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
else:
loss = None
if self.args.prediction_loss_only:
return (loss, None, None)
labels = inputs["labels"]
if labels.shape[-1] < gen_kwargs["max_length"]:
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
# from IPython import embed; embed(header="In seqseqtrainer")
return (loss, generated_tokens, labels)
def _compute_metrics(self, eval_preds):
# from IPython import embed; embed(header="In compute metrics")
preds, labels = eval_preds
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# post_processor = .get(data_args.dataset_name[0], tokenizer,
# data_args.ignore_pad_token_for_loss)
# decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info)
result = {}
for metric in self.eval_task.metric:
result.update(metric(decoded_preds, decoded_labels))
average_metric = sum(result.values())/len(result)
result.update({"average_metrics":average_metric})
return result


@@ -0,0 +1,116 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### ALBERT ######
BaseConfigs['albert-xlarge-v2'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}albert-xlarge-v2",
"tokenizer_name": f"{PATHBASE}albert-xlarge-v2",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2'])
AllConfigs['prefix_albert-xlarge-v2'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/albert-xlarge-v2/",
})
AllConfigs['soft_prompt_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2'])
AllConfigs['soft_prompt_albert-xlarge-v2'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/albert-xlarge-v2/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)  # makedirs also creates the parent "configs/" directory
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
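For reference, a quick way to list the per-dataset config files a job will expand to, following the same zip logic as the `__main__` block above (a sketch; `gen_albert` is a hypothetical module name for this script, and `prefix_albert-xlarge-v2` is one of the jobs it defines):
```python
from gen_albert import AllConfigs  # hypothetical module name; the real filename is not shown in this diff

job = "prefix_albert-xlarge-v2"
for key, value in AllConfigs[job].items():
    if isinstance(key, tuple):   # the zipped hyperparameter table
        for row in value:        # one row per dataset; row[0] is the job_name
            print(f"configs/{job}/{row[0]}.json")
# Note: iterating the zip consumes it, so run this separately from the actual generation.
```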


@@ -0,0 +1,450 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['beit-base-patch16-224'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip(
["beans"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20],
[256],
[ 32],
[ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0], # *7 +[0] *8,
[200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[ 3],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}beit-base-patch16-224",
"tokenizer_name": f"{PATHBASE}beit-base-patch16-224",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps",
"datasets_load_from_disk":False,
}
AllConfigs['bitfit_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['bitfit_beit-base-patch16-224'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/beit-base-patch16-224/",
})
AllConfigs['adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['adapter_beit-base-patch16-224'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/beit-base-patch16-224/",
})
AllConfigs['lora_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['lora_beit-base-patch16-224'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layernorm_after",
"classifier"
],
"modified_modules":[
"query",
"value",
],
"lora_r": 8,
"output_dir": "outputs/lora/beit-base-patch16-224/",
})
AllConfigs['compacter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['compacter_beit-base-patch16-224'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/beit-base-patch16-224/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['compacter++_beit-base-patch16-224'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/beit-base-patch16-224/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['low_rank_adapter_beit-base-patch16-224'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/beit-base-patch16-224/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['soft_prompt_beit-base-patch16-224'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/beit-base-patch16-224/",
})
AllConfigs['prefix_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['prefix_beit-base-patch16-224'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/beit-base-patch16-224/",
})
AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['soft_prompt_beit-base-patch16-224'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/beit-base-patch16-224/",
})
#### T5 ######
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-small/",
})
#### ROBERTA ######
BaseConfigs['roberta-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}roberta-base",
"tokenizer_name": f"{PATHBASE}roberta-base",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['bitfit_roberta-base'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/roberta-base/",
})
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['none_roberta-base'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/roberta-base/",
})
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['lora_roberta-base'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"output_dir": "outputs/lora/roberta-base/",
})
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['adapter_roberta-base'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/roberta-base/",
})
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['low_rank_adapter_roberta-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/roberta-base/",
})
#### BERT ######
BaseConfigs['bert-base-cased'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bert-base-cased",
"tokenizer_name": f"{PATHBASE}bert-base-cased",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bert-base-cased/",
})
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bert-base-cased/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
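The tuple-keyed entries above are the core trick of these generator scripts: a tuple of field names maps to a `zip(...)` of equally long per-task lists, the `__main__` block unzips each row into one per-task JSON, and scalar keys are copied into every job (with the job name appended to `output_dir`). A minimal, self-contained sketch of that expansion pattern, with illustrative field names and values:

```python
import json

# Toy version of the tuple-key expansion used above (values are illustrative).
config = {
    ("job_name", "num_train_epochs", "eval_steps"): zip(
        ["mrpc", "rte"],   # job_name doubles as the task identifier
        [20, 20],
        [200, 100],
    ),
    "seed": 42,                      # scalar keys are copied into every job
    "output_dir": "outputs/demo/",   # the job name is appended to this prefix
}

jobs = {}
for key, value in config.items():
    if isinstance(key, tuple):
        for row in value:            # one zipped row per task
            job = dict(zip(key, row))
            jobs[job.pop("job_name")] = job
for key, value in config.items():
    if not isinstance(key, tuple):
        for name in jobs:
            jobs[name][key] = value + name if key == "output_dir" else value

print(json.dumps(jobs, indent=2))
# jobs["mrpc"] == {"num_train_epochs": 20, "eval_steps": 200,
#                  "seed": 42, "output_dir": "outputs/demo/mrpc"}
```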

View File

@ -0,0 +1,116 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### BERT ######
BaseConfigs['bert-base-cased'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bert-base-cased",
"tokenizer_name": f"{PATHBASE}bert-base-cased",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bert-base-cased/",
})
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bert-base-cased/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,433 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['t5-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-base",
"tokenizer_name": f"{PATHBASE}t5-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-base/",
})
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-base/",
})
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-base/",
})
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
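For reference, the PHM settings above follow the Compacter design: with `hypercomplex_division` n = 4, each adapter projection is parameterized as a sum of Kronecker products instead of a dense matrix, and `factorized_phm` additionally factorizes the task-specific factors into low-rank pairs. In the paper's notation (a summary for orientation, not part of this commit):

```latex
W = \sum_{i=1}^{n} A_i \otimes B_i,
\qquad A_i \in \mathbb{R}^{n \times n},
\quad B_i \in \mathbb{R}^{\frac{k}{n} \times \frac{d}{n}}
```

Here `shared_phm_rule: False` means the rule matrices A_i are kept per layer rather than shared across layers, and `phm_c_init: "normal"` with `phm_init_range: 0.0001` sets their initialization.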
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/t5-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-base/",
})
#### T5-small ######
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-small/",
})
#### ROBERTA######
BaseConfigs['roberta-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}roberta-base",
"tokenizer_name": f"{PATHBASE}roberta-base",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['bitfit_roberta-base'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/roberta-base/",
})
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['none_roberta-base'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/roberta-base/",
})
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['lora_roberta-base'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"output_dir": "outputs/lora/roberta-base/",
})
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['adapter_roberta-base'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/roberta-base/",
})
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['low_rank_adapter_roberta-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/roberta-base/",
})
#### BERT ######
BaseConfigs['bert-base-cased'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bert-base-cased",
"tokenizer_name": f"{PATHBASE}bert-base-cased",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bert-base-cased/",
})
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bert-base-cased/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,143 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
#### ROBERTA######
BaseConfigs['roberta-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}roberta-base",
"tokenizer_name": f"{PATHBASE}roberta-base",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['bitfit_roberta-base'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/roberta-base/",
})
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['none_roberta-base'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/roberta-base/",
})
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['lora_roberta-base'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"output_dir": "outputs/lora/roberta-base/",
})
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['adapter_roberta-base'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/roberta-base/",
})
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['low_rank_adapter_roberta-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/roberta-base/",
})
AllConfigs['soft_prompt_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['soft_prompt_roberta-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/roberta-base/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)

View File

@ -0,0 +1,444 @@
import collections
import copy
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
PATHBASE="/home/hushengding/plm_cache/"
AllConfigs = {}
BaseConfigs = {}
BaseConfigs['t5-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-base",
"tokenizer_name": f"{PATHBASE}t5-base",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['bitfit_t5-base'].update({
"delta_type": "bitfit",
"learning_rate": 3e-4,
"output_dir": "outputs/bitfit/t5-base/",
})
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['adapter_t5-base'].update({
"delta_type": "adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"bottleneck_dim":24,
"output_dir": "outputs/adapter/t5-base/",
})
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['lora_t5-base'].update({
"delta_type": "lora",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"lora_r": 8,
"output_dir": "outputs/lora/t5-base/",
})
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['compacter++_t5-base'].update({
"delta_type": "compacter",
"learning_rate": 3e-3,
"do_train": True,
"do_eval": True,
"do_test": True,
"modified_modules": [
"DenseReluDense"
],
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/compacter++/t5-base/",
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
"phm_init_range": 0.0001,
"use_bias_down_sampler": True,
"use_bias_up_sampler": True,
})
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['low_rank_adapter_t5-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
"layer_norm",
"final_layer_norm"
],
"output_dir": "outputs/low_rank_adapter/t5-base/",
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": 1,
})
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-2,
"soft_token_num":100,
"token_init": False,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
AllConfigs['prefix_t5-base'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-base/",
})
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])  # NOTE: redefines (and overrides) the soft_prompt_t5-base entry above
AllConfigs['soft_prompt_t5-base'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/t5-base/",
})
#### T5-small ######
BaseConfigs['t5-small'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}t5-small",
"tokenizer_name": f"{PATHBASE}t5-small",
"save_total_limit": 1,
# For glue datasets.
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"push_to_delta_center": True,
"save_strategy": "steps"
}
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
AllConfigs['prefix_t5-small'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/t5-small/",
})
#### ROBERTA######
BaseConfigs['roberta-base'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}roberta-base",
"tokenizer_name": f"{PATHBASE}roberta-base",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['bitfit_roberta-base'].update({
"delta_type": "bitfit",
"learning_rate": 1e-3,
"output_dir": "outputs/bitfit/roberta-base/",
})
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['none_roberta-base'].update({
"delta_type": "none",
"learning_rate": 1e-5,
"output_dir": "outputs/none/roberta-base/",
})
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['lora_roberta-base'].update({
"delta_type": "lora",
"learning_rate": 1e-3,
"output_dir": "outputs/lora/roberta-base/",
})
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['adapter_roberta-base'].update({
"delta_type": "adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/adapter/roberta-base/",
})
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
AllConfigs['low_rank_adapter_roberta-base'].update({
"delta_type": "low_rank_adapter",
"learning_rate": 1e-3,
"output_dir": "outputs/low_rank_adapter/roberta-base/",
})
#### BERT ######
BaseConfigs['bert-base-cased'] = {
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
"max_source_length",
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
[0] *7 +[0] *8,
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
),
"do_train": True,
"do_eval": True,
"do_test": True,
"model_name_or_path": f"{PATHBASE}bert-base-cased",
"tokenizer_name": f"{PATHBASE}bert-base-cased",
"save_total_limit": 1,
# For glue datasets.
"is_seq2seq": False,
"split_validation_test": True,
"seed": 42,
"dataset_config_name": ["en"],
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": False,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
"greater_is_better": True,
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": True,
"save_strategy": "steps"
}
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['prefix_bert-base-cased'].update({
"delta_type": "prefix",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/prefix/bert-base-cased/",
})
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
AllConfigs['soft_prompt_bert-base-cased'].update({
"delta_type": "soft_prompt",
"learning_rate": 3e-4,
"unfrozen_modules": [
"deltas",
],
"output_dir": "outputs/soft_prompt/bert-base-cased/",
})
if __name__ == "__main__":
import argparse
import json
import os
parser = argparse.ArgumentParser("Parser to generate configuration")
parser.add_argument("--job", type=str)
args = parser.parse_args()
config = AllConfigs[args.job]
Cartesian_product = []
for key in config:
if isinstance(key, tuple):
Cartesian_product.append(key)
all_config_jsons = {}
for key_tuple in Cartesian_product:
for zipped in config[key_tuple]:
job_name = zipped[0]
all_config_jsons[job_name] = {}
for key_name, zipped_elem in zip(key_tuple, zipped):
if key_name != 'job_name':
all_config_jsons[job_name][key_name] = zipped_elem
for key in config:
if not isinstance(key, tuple):
for job_name in all_config_jsons:
if key == "output_dir":
all_config_jsons[job_name][key] = config[key] + job_name
else:
all_config_jsons[job_name][key] = config[key]
os.makedirs(f"configs/{args.job}/", exist_ok=True)
for job_name in all_config_jsons:
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
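Once generated (for example via `python config_gen.py --job prefix_t5-base`, run from the directory containing `configs/`), each task gets its own JSON that can be passed to the training entry point. A quick sanity check of one generated file; the paths and the chosen task are illustrative:

```python
import json

# Assumes config_gen.py was already run with --job prefix_t5-base.
with open("configs/prefix_t5-base/rte.json") as fin:
    cfg = json.load(fin)

assert cfg["delta_type"] == "prefix"
assert cfg["output_dir"] == "outputs/prefix/t5-base/rte"  # job name appended
assert cfg["num_train_epochs"] == 20                      # rte's slot in the zipped lists
```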

View File

@ -1,3 +1,3 @@
from .tasks import TASK_MAPPING, AutoTask
from .data_collator import TaskDataCollatorForSeq2Seq
from .postprocessors import AutoPostProcessor
# from .data_collator import TaskDataCollatorForSeq2Seq
# from .postprocessors import AutoPostProcessor

View File

@ -1,16 +0,0 @@
import numpy as np
from dataclasses import dataclass
from transformers import DataCollatorForSeq2Seq
@dataclass
class TaskDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
def check_uniqueness(self, samples):
assert len(np.unique(samples)) == 1
def __call__(self, features):
# tasks = [d.pop('task') for d in features]
# self.check_uniqueness(tasks)
output = super().__call__(features)
# output["task"] = tasks[0]
return output

View File

@ -1,67 +0,0 @@
import abc
from collections import OrderedDict
import numpy as np
"""Defines functions to process the outputs to make them ready for the evaluation."""
def string_to_float(string, default=-1., **unused_kwargs):
"""Converts string to float, using default when conversion not possible."""
try:
return float(string)
except ValueError:
return default
class PostProcessor(abc.ABC):
"""Postprocess the predictions and labels to make them suitable for
evaluation."""
def __init__(self, tokenizer, ignore_pad_token_for_loss):
self.tokenizer = tokenizer
self.ignore_pad_token_for_loss = ignore_pad_token_for_loss
def process(self, preds, labels, data_info=None):
if isinstance(preds, tuple):
preds = preds[0]
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
if self.ignore_pad_token_for_loss:
# Replace -100 in the labels as we can't decode them.
labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
# Some simple post-processing
decoded_preds = [pred.strip() for pred in decoded_preds]
decoded_labels = [label.strip() for label in decoded_labels]
return decoded_preds, decoded_labels
class MultiRC(PostProcessor):
def process(self, preds, labels, data_info):
preds, labels = super().process(preds, labels, data_info)
preds = [{"group": info["group"], "value":pred} \
for info, pred in zip(data_info, preds)]
labels = [{"group": info["group"], "value": label}\
for info, label in zip(data_info, labels)]
return preds, labels
class Record(PostProcessor):
def process(self, preds, labels, data_info):
preds, labels = super().process(preds, labels, data_info)
labels = [info["answers"] for info in data_info]
return preds, labels
POSTPROCESSOR_MAPPING = OrderedDict(
[
('superglue-record', Record),
('superglue-multirc', MultiRC)
]
)
class AutoPostProcessor:
@classmethod
def get(self, task, tokenizer, ignore_pad_token_for_loss):
if task in POSTPROCESSOR_MAPPING:
return POSTPROCESSOR_MAPPING[task](tokenizer, ignore_pad_token_for_loss)
return PostProcessor(tokenizer, ignore_pad_token_for_loss)

View File

@ -0,0 +1,96 @@
import abc
from typing import Callable, List, Mapping, Dict
import datasets
import logging
import numpy as np
import torch
logger = logging.getLogger(__name__)
class AbstractTask(abc.ABC):
name = NotImplemented
config = NotImplemented
prefix = NotImplemented
metric = NotImplemented
metric_names = NotImplemented
split_map = None
labels_list = None
split_to_data_split: Mapping[str, str] = \
{"train": "train", "validation": "validation", "test": "test"}
split_valid_to_make_test = True
split_train_to_make_test = False
keep_fields_after_preprocess = ["label"] # The fields that should be kept even after preprocessing
def __init__(self, config, data_args, seed=42, default_max_length=1):
self.config = config
self.seed = seed
self.data_args = data_args
self.default_max_length = default_max_length
def check_n_obs(self, n_obs, total_size):
if n_obs is not None and n_obs > total_size:
n_obs = total_size
logger.warning("n_obs is set to %s", n_obs)
return n_obs
def shuffled_indices(self, dataset):
num_samples = len(dataset)
generator = torch.Generator()
generator.manual_seed(self.seed)
return torch.randperm(num_samples, generator=generator).tolist()
def subsample(self, dataset, n_obs=None, indices=None):
"""
Given a dataset returns the subsampled dataset.
:param n_obs: the number of samples of the subsampled dataset.
:param indices: indices to select the samples from; if not given, indices are computed
by shuffling the given dataset.
:return: subsampled dataset.
"""
num_samples = len(dataset)
n_obs = self.check_n_obs(n_obs, num_samples)
if indices is None:
indices = self.shuffled_indices(dataset)
indices = indices[:n_obs]
return dataset.select(indices)
def load_dataset(self, split: str):
return datasets.load_dataset(self.name, self.config, split=split, script_version="master")
def get_split_indices(self, split, dataset, validation_size):
indices = self.shuffled_indices(dataset)
if split == "validation":
return indices[:validation_size]
else:
return indices[validation_size:]
def preprocessor(self, example):
return example
def get(self, split, n_obs=None, split_validation_test=False):
# For small datasets (n_samples < 10K) without test set, we divide validation set to
# half, use one half as test set and one half as validation set.
if split in ["eval", "dev", "valid"]:
split = "validation"
if split_validation_test and self.split_valid_to_make_test \
and split != "train":
mapped_split = self.split_to_data_split["validation"]
dataset = self.load_dataset(split=mapped_split)
indices = self.get_split_indices(split, dataset, validation_size=len(dataset)//2)
dataset = self.subsample(dataset, n_obs, indices)
# For larger datasets (n_samples > 10K), we divide training set into 1K as
# validation and the rest as training set, keeping the original validation
# set as the test set.
elif split_validation_test and self.split_train_to_make_test \
and split != "test":
dataset = self.load_dataset(split="train")
indices = self.get_split_indices(split, dataset, validation_size=1000)
dataset = self.subsample(dataset, n_obs, indices)
else:
mapped_split = self.split_to_data_split[split]
dataset = self.load_dataset(split=mapped_split)
# shuffles the data and samples it.
if n_obs is not None:
dataset = self.subsample(dataset, n_obs)
return dataset.map(self.preprocessor)
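A minimal concrete subclass of the `AbstractTask` above, to show where `load_dataset` and `preprocessor` plug in; the GLUE subset and field names are illustrative and not part of this commit:

```python
import datasets

class SST2Demo(AbstractTask):
    name = "sst2"
    labels_list = ["0", "1"]

    def load_dataset(self, split):
        # Override the default loader with an explicit GLUE subset.
        return datasets.load_dataset("glue", "sst2", split=split)

    def preprocessor(self, example):
        # get() maps every example through this hook.
        example["source"] = "sentence: " + example["sentence"]
        return example

# task = SST2Demo(config=None, data_args=None, seed=42)
# With split_validation_test=True, "validation" and "test" are carved out of
# the original validation split (first and second half, respectively):
# dev = task.get("validation", split_validation_test=True)
# test = task.get("test", split_validation_test=True)
```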

View File

@ -1,4 +1,4 @@
# from openprompt.prompts import ManualTemplate
class BasePrompt(object):
def __init__(self, template_id=0, verbalizer_id=0, generation=True):
@ -9,26 +9,28 @@ class BasePrompt(object):
self.verbalizer = self.mlmhead_verbalizers[verbalizer_id]
def __call__(self, example):
def eval_syntax(syntaxlist, example):
composed = []
for x in syntaxlist:
if x.startswith("[_eval_]"):
t = eval(x[len("[_eval_]"):])
else:
t = x
composed.append(t)
return composed
src_texts = eval_syntax(self.template,example)
tgt_texts = self.verbalizer[str(example['label'])]
if isinstance(tgt_texts, list):
tgt_texts = eval_syntax(tgt_texts, example)
else:
tgt_texts = [tgt_texts]
return src_texts, tgt_texts
@ -48,7 +50,7 @@ class MRPCPrompt(BasePrompt):
"1": "same"
}
textual_templates = [
["sentence1:", """[_eval_]example['sentence1']""",
["sentence1:", """[_eval_]example['sentence1']""",
"sentence2:", """[_eval_]example["sentence2"]""", "Meanings different of same? Answer: " ]
]
@ -68,7 +70,7 @@ class BoolQPrompt(BasePrompt):
"1": "same"
}
textual_templates = [
["sentence1:", """[_eval_]example['sentence1']""",
["sentence1:", """[_eval_]example['sentence1']""",
"sentence2:", """[_eval_]example["sentence2"]""", "Meanings different of same? Answer: " ]
]
@ -84,7 +86,7 @@ class BoolQPrompt(BasePrompt):
"1": "yes"
}
textual_templates = [
["hypothesis:", """[_eval_]example['hypothesis']""",
["hypothesis:", """[_eval_]example['hypothesis']""",
"premise:", """[_eval_]example["premise"]""", "The answer was " ]
]
@ -100,7 +102,7 @@ class COLAPrompt(BasePrompt):
"1": "Yes"
}
textual_templates = [
["sentence:", """[_eval_]example['sentence']""",
["sentence:", """[_eval_]example['sentence']""",
"grammar correct? " ]
]
@ -119,7 +121,7 @@ class RTEPrompt(BasePrompt):
textual_templates = [
["sentence1:", """[_eval_]example['premise']""", "sentence2:",
"""[_eval_]example['hypothesis']""",
"The answer was " ]
"The answer was "]
]
class CBPrompt(BasePrompt):
@ -147,6 +149,5 @@ PromptCollections = {
"superglue-boolq": BoolQPrompt,
"cb": CBPrompt,
}

View File

@ -1,10 +1,10 @@
from collections import OrderedDict
import collections
import abc
import functools
# from selectors import EpollSelector  # unused auto-import; safe to drop
from typing import Callable, List, Mapping
from examples_prompt.trainers.trainer_utils import pad_punctuation
from .utils import pad_punctuation
from examples_prompt.metrics import metrics
from .utils import round_stsb_target
import datasets
@ -12,119 +12,26 @@ import logging
import numpy as np
import torch
import re
from examples_prompt.data_processors.prompt import PromptCollections
from openprompt.prompts import ManualTemplate, ManualVerbalizer
from openprompt.plms.utils import TokenizerWrapper
from openprompt.data_utils import InputExample
from openprompt.prompts import GenerationVerbalizer
import itertools
logger = logging.getLogger(__name__)
class AbstractTask(abc.ABC):
name = NotImplemented
config = NotImplemented
prefix = NotImplemented
metric = NotImplemented
metric_names = NotImplemented
split_map = None
labels_list = None
split_to_data_split: Mapping[str, str] = \
{"train": "train", "validation": "validation", "test": "test"}
small_datasets_without_all_splits = ["cola", "wnli", "rte", "superglue-cb", "superglue-copa", "superglue-multirc",
"superglue-wic", "superglue-wsc.fixed", "superglue-rte", "mrpc", "stsb",
"superglue-boolq"]
large_data_without_all_splits = ["qqp", "qnli", "superglue-record", "sst2"]
def __init__(self, config, seed=42):
self.config = config
self.seed = seed
tid = getattr(config, "template_id", 0)
vid = getattr(config, "verbalizer_id", 0)
generation_paradigm = getattr(config, "generation_paradigm", True)
self.prompt = PromptCollections[self.name](tid, vid, generation_paradigm)
from transformers.models.auto.tokenization_auto import tokenizer_class_from_name
def get_max_target_length(self, tokenizer, default_max_length):
if self.prompt.verbalizer is not None:
return max([len(tokenizer.encode(label)) for key, label in self.prompt.verbalizer.items()])
return default_max_length
from typing import List, Dict
from collections import defaultdict
from openprompt.utils import round_list
import warnings
def seq2seq_format(self, source, target, extra_fields={}):
return {'source': ' '.join(source),
'target': ' '.join(target),
'task': self.name,
'extra_fields': extra_fields
}
def check_n_obs(self, n_obs, total_size):
if n_obs is not None and n_obs > total_size:
n_obs = total_size
logger.warning("n_obs is set to %s", n_obs)
return n_obs
def shuffled_indices(self, dataset):
num_samples = len(dataset)
generator = torch.Generator()
generator.manual_seed(self.seed)
return torch.randperm(num_samples, generator=generator).tolist()
def subsample(self, dataset, n_obs=None, indices=None):
"""
Given a dataset returns the subsampled dataset.
:param n_obs: the number of samples of the subsampled dataset.
:param indices: indices to select the samples from; if not given, indices are computed
by shuffling the given dataset.
:return: subsampled dataset.
"""
num_samples = len(dataset)
n_obs = self.check_n_obs(n_obs, num_samples)
if indices is None:
indices = self.shuffled_indices(dataset)
indices = indices[:n_obs]
return dataset.select(indices)
def load_dataset(self, split: int):
return datasets.load_dataset(self.name, self.config, split=split, script_version="master")
def get_split_indices(self, split, dataset, validation_size):
indices = self.shuffled_indices(dataset)
if split == "validation":
return indices[:validation_size]
else:
return indices[validation_size:]
def map_dataset(self, dataset, add_prefix):
# from IPython import embed; embed(header="in get target length")
return dataset.map(self.preprocessor)
def preprocessor(self, example):
source, target = self.prompt(example)
return self.seq2seq_format(source, target, extra_fields={})
def get(self, split, add_prefix=True, n_obs=None, split_validation_test=False):
# For small datasets (n_samples < 10K) without test set, we divide validation set to
# half, use one half as test set and one half as validation set.
if split_validation_test and self.name in self.small_datasets_without_all_splits \
and split != "train":
mapped_split = self.split_to_data_split["validation"]
dataset = self.load_dataset(split=mapped_split)
indices = self.get_split_indices(split, dataset, validation_size=len(dataset)//2)
dataset = self.subsample(dataset, n_obs, indices)
# For larger datasets (n_samples > 10K), we divide training set into 1K as
# validation and the rest as training set, keeping the original validation
# set as the test set.
elif split_validation_test and self.name in self.large_data_without_all_splits \
and split != "test":
dataset = self.load_dataset(split="train")
indices = self.get_split_indices(split, dataset, validation_size=1000)
dataset = self.subsample(dataset, n_obs, indices)
else:
mapped_split = self.split_to_data_split[split]
dataset = self.load_dataset(split=mapped_split)
# shuffles the data and samples it.
if n_obs is not None:
dataset = self.subsample(dataset, n_obs)
return self.map_dataset(dataset, add_prefix)
from .processor import AbstractTask
class Squad(AbstractTask):
name = "squad"
@ -143,25 +50,7 @@ class Squad(AbstractTask):
return self.seq2seq_format(source, target, add_prefix)
class MRPC(AbstractTask):
name = "mrpc"
labels_list = ["0", "1"]
metric = [metrics.f1_score, metrics.accuracy]
metric_names = ["f1", "accuracy"]
split_to_data_split = {"train": "train",
"validation": "validation",
"test": "validation"}
def load_dataset(self, split):
return datasets.load_dataset('glue', 'mrpc', split=split, script_version="master")
# def preprocessor(self, example, add_prefix=True):
# src_texts = ["sentence1:", example['sentence1'],
# "sentence2:", example["sentence2"]]
# tgt_texts = [str(example['label'])]
# return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
##GLUE
class COLA(AbstractTask):
name = "cola"
labels_list = ["0", "1"]
@ -171,14 +60,19 @@ class COLA(AbstractTask):
"validation": "validation",
"test": "validation"}
def load_dataset(self, split):
return datasets.load_dataset('glue', 'cola',
split=split, script_version="master")
templates_text = {"0": """sentence: {"meta": 'sentence', "shortenable":True} Are there any error in the sentence? {"mask"}""",
}
# def preprocessor(self, example, add_prefix=True):
# src_texts = ["sentence:", example['sentence']]
# tgt_texts = [str(example['label'])]
# return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
verbalizers = {
"0":{ "0": "yes", "1": "no"}
}
def load_dataset(self, split):
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.cola")[split]
else:
return datasets.load_dataset('glue', 'cola',
split=split, script_version="master")
class SST2(AbstractTask):
@ -190,34 +84,50 @@ class SST2(AbstractTask):
"validation": "validation",
"test": "validation"}
verbalizers = {
"0":{"0":"negative","1":"positive"}
}
templates_text = {
"0":"""The sentiment of sentence: "{"meta":"sentence", "shortenable":True} is {"mask"}."""
}
def load_dataset(self, split):
return datasets.load_dataset('glue', 'sst2',
split=split, script_version="master")
def preprocessor(self, example, add_prefix=True):
src_texts = ["sentence:", example['sentence']]
tgt_texts = [str(example['label'])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.sst2")[split]
else:
return datasets.load_dataset('glue', 'sst2',
split=split, script_version="master")
class STSB(AbstractTask):
name = "stsb"
labels_list = [str(np.round(label, decimals=1)) for label in np.arange(0, 5.2, 0.2)]
metric = [metrics.pearson_corrcoef, metrics.spearman_corrcoef]
metric_names = ["pearson", "spearmanr"]
class MRPC(AbstractTask):
name = "mrpc"
labels_list = ["0", "1"]
metric = [metrics.f1_score, metrics.accuracy]
metric_names = ["f1", "accuracy"]
split_to_data_split = {"train": "train",
"validation": "validation",
"test": "validation"}
def load_dataset(self, split):
return datasets.load_dataset('glue', 'stsb',
split=split, script_version="master")
def preprocessor(self, example, add_prefix=True):
src_texts = ["sentence1:", example['sentence1'],
"sentence2:", example["sentence2"]]
tgt_texts = [str(round_stsb_target(example['label']))]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
templates_text = {
"0": """sentence1: {"meta": 'sentence1', "shortenable":True}. sentence2: {"meta":"sentence2", "shortenable":True}. Are sentence1 and sentence2 equivalent? {"mask"}.""",
}
verbalizers = {
"0":{"0": "no","1": "yes"}
}
def load_dataset(self, split):
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mrpc")[split]
else:
return datasets.load_dataset('glue', 'mrpc', split=split, script_version="master")
class QQP(AbstractTask):
@ -229,14 +139,46 @@ class QQP(AbstractTask):
"validation": "validation",
"test": "validation"}
templates_text = {"0":
"""question1: {"meta": 'question1', "shortenable":True}. question2: {"meta": 'question2', "shortenable":True} Are question1 and question2 equivalent? {"mask"}."""
}
verbalizers = {
"0":{"0": "no","1": "yes"}
}
def load_dataset(self, split):
return datasets.load_dataset('glue', 'qqp',
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qqp")[split]
else:
return datasets.load_dataset('glue', 'qqp',
split=split, script_version="master")
class STSB(AbstractTask):
name = "stsb"
labels_list = [str(np.round(label, decimals=1)) for label in np.arange(0, 5.2, 0.2)]
metric = [metrics.pearson_corrcoef, metrics.spearman_corrcoef]
metric_names = ["pearson", "spearmanr"]
split_to_data_split = {"train": "train",
"validation": "validation",
"test": "validation"}
verbalizers = {
""
}
def load_dataset(self, split):
return datasets.load_dataset('glue', 'stsb',
split=split, script_version="master")
def preprocessor(self, example, add_prefix=True):
src_texts = ["question1:", example['question1'],
"question2:", example["question2"]]
tgt_texts = [str(example['label'])]
src_texts = ["sentence1:", example['sentence1'],
"sentence2:", example["sentence2"]]
tgt_texts = [str(round_stsb_target(example['label']))]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
@ -250,14 +192,29 @@ class MNLI(AbstractTask):
metric_names = ["accuracy"]
def load_dataset(self, split):
return datasets.load_dataset('glue', 'mnli', split=split, script_version="master")
templates_text = {
"0":"""premise: {"meta": 'premise', "shortenable":True}. hypothesis: {"meta": 'hypothesis', "shortenable":True} Does the premise entails the hypothesis? {"mask"}.""",
}
def preprocessor(self, example, add_prefix=True):
src_texts = ["premise:", example['premise'],
"hypothesis", example["hypothesis"]]
tgt_texts = [str(example['label'])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
verbalizers = {
"0":{
"0": "yes",
"1": "neutral",
"2": "no",
}
}
def load_dataset(self, split):
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mnli")[split]
else:
return datasets.load_dataset('glue', 'mnli', split=split, script_version="master")
# def preprocessor(self, example, add_prefix=True):
# src_texts = ["premise:", example['premise'],
# "hypothesis", example["hypothesis"]]
# tgt_texts = [str(example['label'])]
# return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
class QNLI(AbstractTask):
@ -269,15 +226,35 @@ class QNLI(AbstractTask):
"validation": "validation",
"test": "validation"}
templates_text = {
"0": """premise: {"meta": 'sentence', "shortenable":True}. hypothesis: {"meta": 'question', "shortenable":True}"""+
"""Does the premise entails the hypothesis? {"mask"}.""",
}
verbalizers = {
"0":{
"0": "yes",
"1": "no",
}
}
def load_dataset(self, split):
return datasets.load_dataset('glue', 'qnli', split=split, script_version="master")
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qnli")[split]
else:
return datasets.load_dataset('glue', 'qnli', split=split, script_version="master")
def preprocessor(self, example, add_prefix=True):
src_texts = ["question:", example['question'],
"sentence:", example["sentence"]]
tgt_texts = [str(example['label'])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
# def load_dataset(self, split):
# return datasets.load_dataset('glue', 'qnli', split=split, script_version="master")
# def preprocessor(self, example, add_prefix=True):
# src_texts = ["question:", example['question'],
# "sentence:", example["sentence"]]
# tgt_texts = [str(example['label'])]
# return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
# Tested
class RTE(AbstractTask):
name = "rte"
labels_list = ["0", "1"]
@ -287,15 +264,24 @@ class RTE(AbstractTask):
"validation": "validation",
"test": "validation"}
templates_text = {
"0": """sentence1: {"meta": 'sentence1', "shortenable":True} sentence2: {"meta":"sentence2", "shortenable":True} The answer was {"mask"}.""",
}
verbalizers = {
"0":{"0": "yes",
"1": "no"
}
}
def load_dataset(self, split):
return datasets.load_dataset('glue', 'rte',
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.rte")[split]
else:
return datasets.load_dataset('glue', 'rte',
split=split, script_version="master")
def preprocessor(self, example, add_prefix=True):
src_texts = ["sentence1:", example['sentence1'],
"sentence2:", example["sentence2"]]
tgt_texts = [str(example['label'])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
class WNLI(AbstractTask):
@ -307,16 +293,23 @@ class WNLI(AbstractTask):
"validation": "validation",
"test": "validation"}
verbalizers = {
"0":{"0": "True",
"1": "False",
}
}
templates_text = {"0": """{"meta": 'sentence1',"shortenable":True} Does it mean the following: "{"meta":'sentence2'}"? {"mask"}."""
}
def load_dataset(self, split):
return datasets.load_dataset('glue', 'wnli', split=split, script_version="master")
def preprocessor(self, example, add_prefix=True):
src_texts = ["sentence1:", example['sentence1'],
"sentence2:", example["sentence2"]]
tgt_texts = [str(example['label'])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.wnli")[split]
else:
return datasets.load_dataset('glue', 'wnli', split=split, script_version="master")
#SuperGLUE
class SuperGLUEBoolQ(AbstractTask):
name="superglue-boolq"
labels_list = ['0', '1']
@ -326,34 +319,25 @@ class SuperGLUEBoolQ(AbstractTask):
"validation": "validation",
"test": "validation"}
def load_dataset(self, split):
return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master")
verbalizers = {
"0": {
"0": "no",
"1": "yes"
},
}
def preprocessor(self, example, add_prefix=True):
src_texts = ["question:", example["question"], "passage:", example["passage"]]
tgt_texts = [str(example["label"])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
class SuperGLUERTE(AbstractTask):
name="superglue-rte"
labels_list = ['0', '1']
split_to_data_split = {"train": "train",
"validation": "validation",
"test": "validation"}
metric = [metrics.accuracy]
metric_names = ["accuracy"]
templates_text = {
"0": """hypothesis: {"meta": "question", "shortenable":True} premise: {"meta":"passage", "shortenable":True} The answer was {"mask"}."""
}
def load_dataset(self, split):
return datasets.load_dataset('super_glue', 'rte', split=split, script_version="master")
def preprocessor(self, example, add_prefix=True):
src_texts = ["premise:", example["premise"],
"hypothesis:", example["hypothesis"]]
tgt_texts = [str(example["label"])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.boolq")[split]
else:
return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master")
#
class SuperGLUECB(AbstractTask):
name = "superglue-cb"
labels_list = ['0', '1', '2']
@ -363,13 +347,21 @@ class SuperGLUECB(AbstractTask):
metric = [metrics.mean_multiclass_f1(num_classes=3), metrics.accuracy]
metric_names = ["f1_multiclass", "accuracy"]
def load_dataset(self, split):
return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master")
verbalizers = {
"0":{"0": "yes",
"1": "no",
"2": "maybe"
}
}
templates_text = {
"0": """hypothesis: {"meta": 'hypothesis',"shortenable":True} premise: {"meta":'premise', "shortenable":True} The answer was {"mask"}."""
}
def preprocessor(self, example, add_prefix=True):
src_texts = ["premise:", example["premise"], "hypothesis:", example["hypothesis"]]
tgt_texts = [str(example["label"])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
def load_dataset(self, split):
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.cb")[split]
else:
return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master")
class SuperGLUECOPA(AbstractTask):
@ -379,17 +371,23 @@ class SuperGLUECOPA(AbstractTask):
"validation": "validation",
"test": "validation"}
metric = [metrics.accuracy]
metric_names = ["accuracy"]
metric_names = ["accuracy"]
verbalizers = {
"0":{
"0": "1",
"1": "2",
}
}
templates_text = {
"0": """choice1: {"meta":"choice1"} choice2: {"meta":"choice2"} premise: {"meta":"premise", "shortenable":True} The {"meta":"question"} answer was choice{"mask"}."""
}
def load_dataset(self, split):
return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master")
def preprocessor(self, example, add_prefix=True):
src_texts = ["premise:", example["premise"],
"choice1:", example["choice1"],
"choice2:", example["choice2"]]
tgt_texts = [str(example["label"])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.copa")[split]
else:
return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master")
class SuperGLUEMultiRC(AbstractTask):
@ -398,31 +396,44 @@ class SuperGLUEMultiRC(AbstractTask):
split_to_data_split = {"train": "train",
"validation": "validation",
"test": "validation"}
metric = [metrics.multirc_f1_over_all_answers,
metrics.mean_group_metric(metrics.exact_match)]
metric = [metrics.f1_score,
metrics.accuracy]
metric_names = ["f1", "em"]
verbalizers = {
"0": {
"0": "no",
"1": "yes",
}
}
templates_text = {
"0": """question: {"meta":"question", "shortenable":False} answer: {"meta":"answer", "shortenable":False, "post_processing": lambda x:x+"."} paragraph: {"meta":"paragraph", "shortenable":True} The answer was {"mask"}."""
}
def load_dataset(self, split):
return datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master")
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.multirc")[split]
else:
return datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master")
def remove_markup(self, text):
"""Removes the HTML markup."""
text = re.sub('<br>', ' ', text)
text = re.sub('<(/)?b>', '', text)
return text
return text
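# e.g. remove_markup("<b>Sent 1</b>.<br>Sent 2") -> "Sent 1. Sent 2"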
def preprocessor(self, example, add_prefix=True):
group = example['idx']['question']
# T5 applies remove_markup to the joined string, but this should not make
def preprocessor(self, example):
# T5 applies remove_markup to the joined string, but this should not make
# any difference here either.
# https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/data/preprocessors.py#L797
src_texts = ["question:", self.remove_markup(example["question"]),
"answer:", self.remove_markup(example["answer"]),
"paragraph:", self.remove_markup(example["paragraph"])]
tgt_texts = [str(example["label"])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix, extra_fields={"group": group})
# https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/data/preprocessors.py#L797
example["question"] = self.remove_markup(example["question"])
example["answer"] = self.remove_markup(example["answer"])
example["paragraph"] = self.remove_markup(example["paragraph"])
return example
class SuperGLUEWIC(AbstractTask):
name = "superglue-wic"
@ -431,130 +442,115 @@ class SuperGLUEWIC(AbstractTask):
"validation": "validation",
"test": "validation"}
metric = [metrics.accuracy]
metric_names = ["accuracy"]
metric_names = ["accuracy"]
verbalizers = {
"0": {
"0": "No",
"1": "Yes",
}
}
templates_text = {
"0": """sentence1: {"meta":"sentence1"} sentence2: {"meta":"sentence2", "shortenable": True} word: {"meta":"word"} {"mask"}."""
}
def load_dataset(self, split):
return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master")
def preprocessor(self, example, add_prefix=True):
src_texts = ["sentence1:", example["sentence1"],
"sentence2:", example["sentence2"],
"word:", example["word"]]
tgt_texts = [str(example["label"])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split]
else:
return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master")
class SuperGLUEWSCFixed(AbstractTask):
# source: https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py
"""Convert WSC examples to text2text format.
WSC includes a sentence along with 2 'spans': the first denoting a noun and
the other a pronoun. The 'label' specifies whether or not the pronoun is
referencing the noun. This preprocessor puts ' * ' around the noun and ' # '
around the pronoun.
For example, a typical example from WSC might look like
{
'text': 'This is a test sentence .',
'span1_text': 'test',
'span1_index': 3,
'span2_text': 'This',
'span2_index': 0,
'label': 0
}
This example would be transformed to
{
'inputs': 'wsc text: # This # is a * test * sentence .',
'targets': 'False'
}
"""
name = "superglue-wsc.fixed"
labels_list = ['0', '1']
# class SuperGLUERecord(AbstractTask):
# """Convert ReCoRD examples to text2text examples.
# ReCoRD contains a passage, query containing a '@placeholder' string, and a set
# of entities that are the possible values of the placeholder. Each train and
# validation example will have a list of answers, any of which would be
# considered correct.
# For example, a typical example from ReCoRD might look like
# {
# 'passage': 'This is the passage.',
# 'query': 'A @placeholder is a bird.',
# 'entities': ['penguin', 'potato', 'pigeon'],
# 'answers': ['penguin', 'pigeon'],
# }
# which this preprocessor would turn into the following two examples:
# {
# 'inputs': 'record query: A @placeholder is a bird. entities: penguin, '
# 'potato, pigeon passage: This is the passage.',
# 'targets': 'penguin',
# }
# and
# {
# 'inputs': 'record query: A @placeholder is a bird. entities: penguin, '
# 'potato, pigeon passage: This is the passage.',
# 'targets': 'pigeon',
# }
# """
# name = "superglue-record"
# split_to_data_split = {"train": "train",
# "validation": "validation",
# "test": "validation"}
# metric = [metrics.squad]
# metric_names = ["squad"]
# def load_dataset(self, split):
# return datasets.load_dataset('super_glue', 'record', split=split, script_version="master")
# def preprocessor(self, batch, add_prefix=True):
# new_batch = collections.defaultdict(list)
# keys = batch.keys()
# for values in zip(*batch.values()):
# ex = {k: v for k, v in zip(keys, values)}
# # updates the passage.
# passage = ex['passage']
# passage = re.sub(r'(\.|\?|\!|\"|\')\n@highlight\n', r'\1 ', passage)
# passage = re.sub(r'\n@highlight\n', '. ', passage)
# inputs = f"record query: {ex['query']} entities: {', '.join(ex['entities'])} passage: {passage}"
# if add_prefix:
# inputs = self.name + " " + inputs
# # duplicates the samples based on number of answers.
# num_answers = len(ex["answers"])
# num_duplicates = np.maximum(1, num_answers)
# new_batch["source"].extend([inputs] * num_duplicates)
# new_batch["target"].extend(ex["answers"] if num_answers > 0 else ["<unk>"])
# new_batch["task"].extend([self.name] * num_duplicates)
# new_batch["extra_fields"].extend([{"answers": ex["answers"]}]*num_duplicates)
# return new_batch
# def map_dataset(self, dataset, add_prefix=True):
# return dataset.map(functools.partial(self.preprocessor, add_prefix=add_prefix),
# batched=True, remove_columns=dataset.column_names)
class Beans(AbstractTask):
name = "beans"
labels_list = ['angular_leaf_spot', 'bean_rust', "healthy"]
split_to_data_split = {"train": "train",
"validation": "validation",
"test": "validation"}
metric = [metrics.accuracy]
metric_names = ["accuracy"]
metric_names = ["accuracy"]
verbalizers = {
"0": {
"0": "No",
"1": "Yes",
}
}
templates_text = {
"0": """{"meta":"sentence1"}"""
}
def load_dataset(self, split):
return datasets.load_dataset('super_glue', 'wsc.fixed', split=split, script_version="master")
def _mark_span(self, text, span_str, span_idx, mark):
pattern_tmpl = r'^((?:\S+\s){N})(W)'
pattern = re.sub('N', str(span_idx), pattern_tmpl)
pattern = re.sub('W', span_str, pattern)
return re.sub(pattern, r'\1{0} \2 {0}'.format(mark), text)
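# e.g. _mark_span("Mark told Pete he won", "Pete", 2, "#")
# -> "Mark told # Pete # he won" (span_idx counts whitespace-separated tokens)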
def preprocessor(self, example, add_prefix=True):
# converts text as done in T5.
text = example['text']
text = self._mark_span(text, example['span1_text'], example['span1_index'], '*')
# Compensate for 2 added "words" added in previous step.
span2_index = example['span2_index'] + 2 * int(example['span1_index'] < example['span2_index'])
text = self._mark_span(text, example['span2_text'], span2_index, '#')
src_texts = ["text:", text]
tgt_texts = [str(example["label"])]
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
# from IPython import embed; embed(header="beans")
if self.data_args.datasets_load_from_disk:
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split]
else:
return datasets.load_dataset('beans', split=split, script_version="master")
class SuperGLUERecord(AbstractTask):
"""Convert ReCoRD examples to text2text examples.
ReCoRD contains a passage, query containing a '@placeholder' string, and a set
of entities that are the possible values of the placeholder. Each train and
validation example will have a list of answers, any of which would be
considered correct.
For example, a typical example from ReCoRD might look like
{
'passage': 'This is the passage.',
'query': 'A @placeholder is a bird.',
'entities': ['penguin', 'potato', 'pigeon'],
'answers': ['penguin', 'pigeon'],
}
which this preprocessor would turn into the following two examples:
{
'inputs': 'record query: A @placeholder is a bird. entities: penguin, '
'potato, pigeon passage: This is the passage.',
'targets': 'penguin',
}
and
{
'inputs': 'record query: A @placeholder is a bird. entities: penguin, '
'potato, pigeon passage: This is the passage.',
'targets': 'pigeon',
}
"""
name = "superglue-record"
split_to_data_split = {"train": "train",
"validation": "validation",
"test": "validation"}
metric = [metrics.squad]
metric_names = ["squad"]
def load_dataset(self, split):
return datasets.load_dataset('super_glue', 'record', split=split, script_version="master")
def preprocessor(self, batch, add_prefix=True):
new_batch = collections.defaultdict(list)
keys = batch.keys()
for values in zip(*batch.values()):
ex = {k: v for k, v in zip(keys, values)}
# updates the passage.
passage = ex['passage']
passage = re.sub(r'(\.|\?|\!|\"|\')\n@highlight\n', r'\1 ', passage)
passage = re.sub(r'\n@highlight\n', '. ', passage)
inputs = f"record query: {ex['query']} entities: {', '.join(ex['entities'])} passage: {passage}"
if add_prefix:
inputs = self.name + " " + inputs
# duplicates the samples based on number of answers.
num_answers = len(ex["answers"])
num_duplicates = np.maximum(1, num_answers)
new_batch["source"].extend([inputs] * num_duplicates)
new_batch["target"].extend(ex["answers"] if num_answers > 0 else ["<unk>"])
new_batch["task"].extend([self.name] * num_duplicates)
new_batch["extra_fields"].extend([{"answers": ex["answers"]}]*num_duplicates)
return new_batch
def map_dataset(self, dataset, add_prefix=True):
return dataset.map(functools.partial(self.preprocessor, add_prefix=add_prefix),
batched=True, remove_columns=dataset.column_names)
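# Because the preprocessor is batched and may emit more rows than it receives,
# one ReCoRD example with answers ['penguin', 'pigeon'] becomes two
# source/target rows (one per answer), while answer-less test examples fall
# back to a single "<unk>" target.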
TASK_MAPPING = OrderedDict(
@ -570,21 +566,20 @@ TASK_MAPPING = OrderedDict(
('qqp', QQP),
('stsb', STSB),
('superglue-boolq', SuperGLUEBoolQ),
('superglue-rte', SuperGLUERTE),
('superglue-cb', SuperGLUECB),
('superglue-copa', SuperGLUECOPA),
('superglue-multirc', SuperGLUEMultiRC),
('superglue-wic', SuperGLUEWIC),
('superglue-wsc.fixed', SuperGLUEWSCFixed),
('superglue-record', SuperGLUERecord)
# ('superglue-record', SuperGLUERecord)
('beans', Beans)
]
)
class AutoTask:
@classmethod
def get(self, task, config, seed=42):
def get(self, task, config, data_args, seed=42):
if task in TASK_MAPPING:
return TASK_MAPPING[task](config, seed)
return TASK_MAPPING[task](config, data_args, seed)
raise ValueError(
"Unrecognized task {} for AutoTask Model: {}.\n"
"Task name should be one of {}.".format(

View File

@ -1,4 +1,5 @@
import numpy as np
import regex as re  # the stdlib re module lacks the \p{...} Unicode property escapes used in pad_punctuation
def round_stsb_target(label):
"""STSB maps two sentences to a floating point number between 1 and 5
@ -15,3 +16,15 @@ def round_stsb_target(label):
"""
return np.round(np.round(label * 5) / 5, decimals=1)  # snap to the nearest multiple of 0.2
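# e.g. round_stsb_target(2.73) -> 2.8 and round_stsb_target(2.69) -> 2.6,
# matching the 0.2-spaced labels_list declared on the STSB tasks above.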
def pad_punctuation(text):
"""Re-implementation of _pad_punctuation in t5. This function adds spaces
around punctuation. While this pads punctuation as expected, it has the
unexpected effected of padding certain unicode characters with accents, with
spaces as well. For instance: "François" becomes "Fran ç ois"""
# Pad everything except for: underscores (_), whitespace (\s),
# numbers (\p{N}), letters (\p{L}) and accent characters (\p{M}).
text = re.sub(r'([^_\s\p{N}\p{L}\p{M}])', r' \1 ', text)
# Collapse consecutive whitespace into one space.
text = re.sub(r'\s+', ' ', text)
return text
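# e.g. pad_punctuation("don't stop") -> "don ' t stop"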

View File

@ -0,0 +1,44 @@
PATHBASE=/mnt/sfs_turbo/hsd/officialod/OpenDelta-1/examples/examples_prompt/
PYTHONPATH=/mnt/sfs_turbo/zhangshudan/anaconda3/envs/officialod/bin/python # python interpreter used to launch the scripts below, not the module-search PYTHONPATH
PLMPATHBASE=/mnt/sfs_turbo/hsd/plm_cache/ # must be empty string or dir that ends with /
DATASETSPATHBASE=/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/
RUNTIME=$(date +%m%d%H%M%S)
MODELNAME="roberta-base"
DATASET=$1
DELTATYPES=("none" "bitfit" "lora" "adapter")
CUDAIDS=("0 1" "2 3" "4 5" "6 7")
NUMTRIALS=50
CONTINUESTUDY=${2:-'0'}
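# usage sketch (the script name here is illustrative): bash run_search.sh <dataset> [continue_study]
# e.g. `bash run_search.sh rte 1` resumes an existing study on the rte task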
echo $RUNTIME
echo $MODELNAME
echo $DATASET
echo "${DELTATYPES[@]}"
echo "${CUDAIDS[@]}"
echo $NUMTRIALS
echo $CONTINUESTUDY
cd $PATHBASE
for expid in 0 1 2 3
do
( $PYTHONPATH search_distributed.py \
--model_name $MODELNAME \
--dataset $DATASET \
--delta_type ${DELTATYPES[$expid]} \
--cuda_ids ${CUDAIDS[$expid]} \
--num_trials $NUMTRIALS \
--mode run \
--repeat_time 1 \
--main_file_name run_mlm.py \
--pathbase $PATHBASE \
--pythonpath $PYTHONPATH \
--plm_path_base $PLMPATHBASE \
--datasets_saved_path $DATASETSPATHBASE \
--datasets_load_from_disk \
--continue_study $CONTINUESTUDY >>/mnt/sfs_turbo/hsd/officialod/OpenDelta-1/examples/examples_prompt/out_sfs/$RUNTIME.txt 2>&1
) &
done
wait

View File

@ -15,9 +15,16 @@ if __name__=="__main__":
parser.add_argument("--study_name", type=str, default=None)
parser.add_argument("--cuda_ids", nargs='+', help="list")
parser.add_argument("--mode", type=str, default="run", help="select from 'run' and 'read' ")
parser.add_argument("--continue_study", type=bool, default=False)
parser.add_argument("--continue_study", type=int, default=0)
parser.add_argument("--substudy_prefix", type=str, default="")
parser.add_argument("--main_file_name", type=str)
parser.add_argument("--num_trials", type=int)
parser.add_argument("--pathbase", type=str, default="")
parser.add_argument("--pythonpath", type=str, default="python")
parser.add_argument("--plm_path_base", type=str, default="", help="The path where we cache the plms. Must be empty string or dir that ends with /")
parser.add_argument("--datasets_load_from_disk", action="store_true")
parser.add_argument("--datasets_saved_path", type=str)
parser.add_argument("--repeat_time", type=int, default=1)
args = parser.parse_args()
@ -26,13 +33,13 @@ if __name__=="__main__":
args.study_name = pardir
else:
args.study_name += pardir
setattr(args, "output_dir", f"outputs_search/{args.study_name}")
setattr(args, "output_dir", f"{args.pathbase}/outputs_search/{args.study_name}")
if args.mode == "run":
if args.continue_study:
if args.continue_study==1:
print("Continue study!")
else:
print("Creat new study!")
@ -41,7 +48,8 @@ if __name__=="__main__":
os.mkdir(f"{args.output_dir}")
else:
if not args.continue_study:
user_cmd = input("Detected existing study, are you sure to create new by removing old? [Yes/No]")
user_cmd = "yes" #input("Detected existing study, are you sure to create new by removing old? [Yes/No]")
while user_cmd.lower() not in ["yes", "no"]:
print("Please input Yes/No")
user_cmd = input("Detected existing study, are you sure to create new by removing old? [Yes/No]")
@ -62,23 +70,39 @@ if __name__=="__main__":
tot_chunk_num = len(args.cuda_ids)
subprocesses = []
for id, cudas in enumerate(args.cuda_ids):
if id+1 < tot_chunk_num:
sub_n_trials = args.num_trials//tot_chunk_num
else:
sub_n_trials = args.num_trials//tot_chunk_num + args.num_trials%tot_chunk_num
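# e.g. num_trials=50 over 4 cuda groups -> 12, 12, 12, 14 (remainder to the last)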
command = "nohup python search_single.py "
command = f"{args.pythonpath} search_single.py "
command += f"--cuda_id {cudas} "
command += f"--model_name {args.model_name} "
command += f"--dataset {args.dataset} "
command += f"--delta_type {args.delta_type} "
command += f"--study_name {args.study_name} "
command += f"--optuna_seed 10{id} "
command += f"--main_file_name {args.main_file_name} "
command += f"--num_trials {sub_n_trials} "
command += f">{args.output_dir}/{args.substudy_prefix}{id}.log 2>&1 &"
p = subprocess.Popen(command, cwd="./", shell=True)
print("id {} on cuda:{}, pid {}\n {}\n".format(id, cudas, p.pid, command))
command += f"--pythonpath {args.pythonpath} "
command += f"--pathbase {args.pathbase} "
command += f"--repeat_time {args.repeat_time} "
command += f"--plm_path_base {args.plm_path_base} "
command += f"--datasets_saved_path {args.datasets_saved_path} "
if args.datasets_load_from_disk:
command += f"--datasets_load_from_disk "
command += f"> {args.output_dir}/{args.substudy_prefix}{id}.log 2>&1"
p = subprocess.Popen(command, cwd=f"{args.pathbase}", shell=True)
subprocesses.append(p)
print("id {} on cuda:{}, pid {}".format(id, cudas, p.pid))
print(command)
print()
print("Wait for subprocesses to complete")
exit_codes = [p.wait() for p in subprocesses]
print("All complete!")
elif args.mode == 'read':
study = optuna.load_study(study_name=args.study_name, storage=f"sqlite:///{args.study_name}.db")
@ -96,17 +120,17 @@ if __name__=="__main__":
plot_contour = optuna.visualization.plot_contour(study, params=['learning_rate', 'batch_size_base'])
plot_contour2 = optuna.visualization.plot_contour(study, params=['learning_rate', 'warmup_steps'])
plot_history.write_image(f"{args.output_dir}/history.png")
plot_slice.write_image(f"{args.output_dir}/slice.png")
plot_contour.write_image(f"{args.output_dir}/contour.png")
plot_contour2.write_image(f"{args.output_dir}/contour2.png")

View File

@ -10,23 +10,29 @@ from optuna.samplers import TPESampler
import shutil
import time
import subprocess
def objective_singleseed(args, unicode, search_space_sample ):
os.mkdir(f"{args.output_dir}/{unicode}")
search_space_sample.update({"output_dir": f"{args.output_dir}/{unicode}"})
with open(f"{args.output_dir}/{unicode}/this_configs.json", 'w') as fout:
json.dump(search_space_sample, fout, indent=4,sort_keys=True)
command = "CUDA_VISIBLE_DEVICES={} ".format(args.cuda_id)
command += "python run.py "
command += f"{args.output_dir}/{unicode}/this_configs.json"
status_code = os.system(command)
print("status_code",status_code)
command = "CUDA_VISIBLE_DEVICES={} ".format(args.cuda_id)
command += f"{args.pythonpath} {args.main_file_name} "
command += f"{args.output_dir}/{unicode}/this_configs.json"
command += f" >> {args.output_dir}/{unicode}/output.log 2>&1"
print("======"*5+"\n"+command)
p = subprocess.Popen(command, cwd=f"{args.pathbase}", shell=True)
print(f"wait for subprocess \"{command}\" to complete")
p.wait()
# if status_code != 0:
# with open(f"{args.output_dir}/{args.cuda_id}.log",'r') as flog:
# lastlines = " ".join(flog.readlines()[-100:])
@ -50,8 +56,13 @@ def objective_singleseed(args, unicode, search_space_sample ):
else:
os.remove(full_file_name)
return results['test']['test_average_metrics']
results_all_test_datasets = []
print("results:", results)
for datasets in results['test']:
results_all_test_datasets.append(results['test'][datasets]['test_average_metrics'])
return sum(results_all_test_datasets) / len(results_all_test_datasets)  # mean test metric across all test datasets
def objective(trial, args=None):
@ -61,7 +72,7 @@ def objective(trial, args=None):
search_space_sample.update(DatasetSearchSpace(args.dataset).get_config(trial, args))
search_space_sample.update(AllDeltaSearchSpace[args.delta_type]().get_config(trial, args))
results = []
for seed in [100]:
for seed in range(42, 42+args.repeat_time):
search_space_sample.update({"seed": seed})
unicode = random.randint(0, 100000000)
while os.path.exists(f"{args.output_dir}/{unicode}"):
@ -74,23 +85,33 @@ def objective(trial, args=None):
if __name__=="__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--delta_type")
parser.add_argument("--dataset")
parser.add_argument("--model_name")
parser.add_argument("--cuda_id", type=int)
parser.add_argument("--main_file_name", type=str)
parser.add_argument("--study_name")
parser.add_argument("--num_trials", type=int)
parser.add_argument("--repeat_time", type=int)
parser.add_argument("--optuna_seed", type=int, default="the seed to sample suggest point")
parser.add_argument("--pathbase", type=str, default="")
parser.add_argument("--pythonpath", type=str, default="")
parser.add_argument("--plm_path_base", type=str, default="")
parser.add_argument("--datasets_load_from_disk", action="store_true")
parser.add_argument("--datasets_saved_path", type=str)
args = parser.parse_args()
setattr(args, "output_dir", f"outputs_search/{args.study_name}")
setattr(args, "output_dir", f"{args.pathbase}/outputs_search/{args.study_name}")
study = optuna.load_study(study_name=args.study_name, storage=f'sqlite:///{args.study_name}.db', sampler=TPESampler(seed=args.optuna_seed))
study.optimize(partial(objective, args=args), n_trials=args.num_trials)
print("complete single!")

View File

@ -1,4 +1,4 @@
import collections
import collections
import copy
@ -10,7 +10,7 @@ class BaseSearchSpace:
"do_train": True,
"do_eval": True,
"do_test": True,
"save_total_limit": 1,
# For glue datasets.
@ -19,7 +19,6 @@ class BaseSearchSpace:
"eval_dataset_config_name": ["en"],
"test_dataset_config_name": ["en"],
# other configurations.
"predict_with_generate": True,
# To evaluate during training.
"load_best_model_at_end": True,
"metric_for_best_model": "average_metrics",
@ -27,7 +26,10 @@ class BaseSearchSpace:
"evaluation_strategy": "steps",
"overwrite_output_dir": True,
"push_to_hub": False,
"save_strategy": "steps"
"save_strategy": "steps",
"datasets_load_from_disk": args.datasets_load_from_disk,
"datasets_saved_path": args.datasets_saved_path
}
@ -37,7 +39,7 @@ class BitFitSearchSpace:
learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1)
return {
"delta_type": "bitfit",
'learning_rate': learning_rate,
'learning_rate': learning_rate,
}
class AdapterSearchSpace:
@ -68,7 +70,7 @@ class FinetuneSearchSpace:
learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1)
return {
"delta_type": "none",
'learning_rate': learning_rate,
'learning_rate': learning_rate,
}
class LoRASearchSpace:
@ -100,16 +102,16 @@ class CompacterSearchSpace:
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
@ -140,16 +142,16 @@ class CompacterppSearchSpace:
"non_linearity": "gelu_new",
#Compacter.
"hypercomplex_division": 4,
"hypercomplex_division": 4,
"hypercomplex_adapters": True,
"hypercomplex_nonlinearity": "glorot-uniform",
# gradient clip and clamp
# gradient clip and clamp
"gradient_clip": False,
"phm_clamp": False,
"normalize_phm_weight": False,
"normalize_phm_weight": False,
"learn_phm": True,
# shared one side
"factorized_phm": True,
# shared one side
"factorized_phm": True,
"shared_phm_rule": False,
"factorized_phm_rule": False,
"phm_c_init": "normal",
@ -171,7 +173,7 @@ class LowRankAdapterSearchSpace:
"final_layer_norm"
],
"non_linearity": "gelu_new",
"low_rank_w_init": "glorot-uniform",
"low_rank_w_init": "glorot-uniform",
"low_rank_rank": low_rank_rank,
}
@ -201,8 +203,8 @@ class T5BaseSearchSpace:
batch_size = int(16 * 2**(min(batch_size_base,3)-1))
warmup_steps = trail.suggest_categorical('warmup_steps', [0, 500])
return {
"model_name_or_path": "t5-base", # change here for loading from custom path
"tokenizer_name": "t5-base", # change here for loading from custom path
"model_name_or_path": f"{args.plm_path_base}t5-base", # change here for loading from custom path
"tokenizer_name": f"{args.plm_path_base}t5-base", # change here for loading from custom path
'batch_size':batch_size,
"per_device_train_batch_size": batch_size,
"per_device_eval_batch_size": batch_size,
@ -211,17 +213,43 @@ class T5BaseSearchSpace:
"save_steps": 200,
"eval_steps": 200,
"max_steps": 5000,
"predict_with_generate": True,
}
class RobertaBaseSearchSpace:
def get_config(self, trail, args=None):
batch_size_base = trail.suggest_int('batch_size_base', 1, 4)
if batch_size_base >= 4:
gradient_accumulation_steps = 2**(batch_size_base-3)
else:
gradient_accumulation_steps = 1
batch_size = int(16 * 2**(min(batch_size_base,3)-1))
warmup_steps = trail.suggest_categorical('warmup_steps', [0, 500])
return {
"model_name_or_path": f"{args.plm_path_base}roberta-base", # change here for loading from custom path
"tokenizer_name": f"{args.plm_path_base}roberta-base", # change here for loading from custom path
'batch_size':batch_size,
"per_device_train_batch_size": batch_size,
"per_device_eval_batch_size": batch_size,
"warmup_steps": warmup_steps,
"gradient_accumulation_steps": gradient_accumulation_steps,
"save_steps": 200,
"eval_steps": 200,
"max_steps": 5000,
"predict_with_generate": False,
}
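# Effective train batch sizes implied by the settings above: batch_size_base
# 1 -> 16, 2 -> 32, 3 -> 64, 4 -> 64 with gradient_accumulation_steps=2
# (i.e. 128 examples per update).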
class DatasetSearchSpace:
dataset_order = ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"]
dataset_config = {("task_name", "eval_dataset_name", "test_dataset_name",
dataset_order = ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", "wnli"]
dataset_config = {("task_name", "eval_dataset_name", "test_dataset_name",
"max_source_length"): list(zip(
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", "wnli"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", "wnli"],
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", "wnli"],
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128, 128],
))}
def __init__(self, dataset_name):
self.dataset_name = dataset_name
@ -250,6 +278,7 @@ AllDeltaSearchSpace = {
}
AllBackboneSearchSpace = {
"t5-base": T5BaseSearchSpace
"t5-base": T5BaseSearchSpace,
"roberta-base": RobertaBaseSearchSpace,
}

View File

@ -45,12 +45,51 @@ def spearman_corrcoef(predictions, targets) -> dict:
spearman_corrcoef = 0
return {"spearmanr": spearman_corrcoef}
def spearman_corrcoef(predictions, targets) -> dict:
"""Computes Spearman correlation coefficient."""
# TODO: we need to do postprocessors in a clean way for each dataset.
from examples_seq2seq.data_processors.postprocessors import string_to_float
targets = [string_to_float(target) for target in targets]
predictions= [string_to_float(prediction) for prediction in predictions]
spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]
# Note that if all the predictions are the same, the spearman
# correlation is nan; to guard against this, we check the output
# and return 0 in this case.
if math.isnan(spearman_corrcoef):
spearman_corrcoef = 0
return {"spearmanr": spearman_corrcoef}
def f1_score_with_invalid(predictions, targets) -> dict:
"""Computes F1 score, with any prediction != 0 or 1 is counted as incorrect.
Args:
targets: list of targets, either 0 or 1
predictions: list of predictions, any integer value
Returns:
F1 score, where any prediction != 0 or 1 is counted as wrong.
"""
def binary_reverse(labels):
return ['0' if label == '1' else '1' for label in labels]
targets, predictions = np.asarray(targets), np.asarray(predictions)
# Get indices of invalid predictions.
invalid_idx_mask = np.logical_and(predictions != '0', predictions != '1')
# For any prediction != 0 or 1, we set the prediction to the opposite of its corresponding target.
predictions[invalid_idx_mask] = binary_reverse(targets[invalid_idx_mask])
targets = targets.astype(np.int32)
predictions = predictions.astype(np.int32)
return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}
def transform_for_generation(predictions, targets):
mapping = {k: i for i, k in enumerate(set(targets))}
targets = np.asarray([mapping[k] for k in targets])
predictions = np.asarray([mapping[k] if k in mapping else (t+1)%len(mapping) for t, k in zip(targets, predictions)])
return predictions, targets
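# e.g. transform_for_generation(["yes", "maybe", "no"], ["yes", "no", "no"])
# remaps the string labels to integer ids built from the target set; the
# unseen prediction "maybe" is replaced by a value guaranteed to differ from
# its target, so integer metrics downstream count it as an error.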

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0002561697332863371,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/10940816",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0017750209757755706,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/1107862",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 16,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 8.499916262600587e-05,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/15328099",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0006091646696452159,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/15991793",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 16,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.020109951371648067,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/19489534",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.005159882530578781,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/2281342",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.006869610954981632,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/26349674",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0002723799659564822,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28219263",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0018605158382269157,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28244173",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.0001248231069039661,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28313708",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.0009490000624893097,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28844651",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 3.5602209401278214e-05,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28881946",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.004220683008677483,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/29695566",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.004159184883370181,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/304080",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.0009353172054773991,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/33594301",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0037650265946582574,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/37208828",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 16,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 6.867655291394631e-05,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/38351436",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.0022951686429675895,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/42338278",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.0011474682877585407,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/43419391",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.009965694572181888,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/45030088",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0020236592832077785,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/50851153",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1 +0,0 @@
{"batch_size": 64, "dataset_config_name": ["en"], "delta_type": "bitfit", "do_eval": true, "do_test": true, "do_train": true, "eval_dataset_config_name": ["en"], "eval_dataset_name": "mrpc", "eval_steps": 200, "evaluation_strategy": "steps", "gradient_accumulation_steps": 1, "greater_is_better": true, "learning_rate": 0.0020236592832077785, "load_best_model_at_end": true, "max_source_length": 128, "max_steps": 5000, "metric_for_best_model": "average_metrics", "model_name_or_path": "t5-base", "output_dir": "outputs_search/bitfit.mrpc.t5-base/50851153", "overwrite_output_dir": true, "per_device_eval_batch_size": 64, "per_device_train_batch_size": 64, "predict_with_generate": true, "push_to_hub": false, "save_steps": 200, "save_strategy": "steps", "save_total_limit": 1, "seed": 100, "split_validation_test": true, "task_name": "mrpc", "test_dataset_config_name": ["en"], "test_dataset_name": "mrpc", "tokenizer_name": "t5-base", "warmup_steps": 0}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.011098597581779427,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/57783553",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.0005414844782319124,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/6060488",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.016927560240899083,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/61860753",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 1.0141082015912518e-05,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/63232091",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.0018137027382556477,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/6329472",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.023938918670661075,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/64753972",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 16,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.08212873599011565,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/65221118",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 4.8538530604501934e-05,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/66798551",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0056649657801790786,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/67615376",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.03495857107255486,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/6773136",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.00039059864620439417,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/68027569",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.0002642938525995798,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/68314189",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.037536374095955345,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/71501650",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.008866400032296955,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/73962149",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.01086484610816823,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/83260414",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 1.2611496517588744e-05,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/83839551",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0010110776655071255,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/85624941",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0005414844782319124,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/86039549",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0027955533792956614,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/89676181",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.0012573200149141731,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/91446644",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.001152480984285531,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/92427532",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 16,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.002464124578330328,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/93923515",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 16,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.000127337205276883,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/96799644",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 16,
"per_device_train_batch_size": 16,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.017304287780519442,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/97118516",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.057233123182472576,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/97177600",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.041620230849224296,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/97660529",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.0005420479832650441,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/98459622",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.0026938134462562973,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/99566760",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "mrpc",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 0.00702408842393251,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.mrpc.t5-base/99826259",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "mrpc",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "mrpc",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}
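
Every trial above fixes "metric_for_best_model": "average_metrics" with "greater_is_better": true, so selecting the winning MRPC trial reduces to an argmax over the per-trial eval results. A sketch, assuming each output_dir holds an all_results.json in the usual HuggingFace Trainer layout with an eval_average_metrics entry (both names are assumptions):

```python
import json
from pathlib import Path

best_trial, best_score = None, float("-inf")
for results_path in Path("outputs_search/bitfit.mrpc.t5-base").glob("*/all_results.json"):
    score = json.loads(results_path.read_text()).get("eval_average_metrics", float("-inf"))
    if score > best_score:  # greater_is_better is true in these configs
        best_trial, best_score = results_path.parent.name, score

print(best_trial, best_score)
```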

Binary files not shown. (Four deleted images: 103 KiB, 186 KiB, 34 KiB, and 56 KiB.)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 1.1032607780913182e-05,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.rte.t5-base/1123702",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 9.869021064463024e-05,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.rte.t5-base/12173417",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 0.000913136097576348,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.rte.t5-base/14983360",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 64,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 2,
"greater_is_better": true,
"learning_rate": 1.1605972169428286e-05,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.rte.t5-base/17148549",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 64,
"per_device_train_batch_size": 64,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "t5-base",
"warmup_steps": 500
}

View File

@ -1,42 +0,0 @@
{
"batch_size": 32,
"dataset_config_name": [
"en"
],
"delta_type": "bitfit",
"do_eval": true,
"do_test": true,
"do_train": true,
"eval_dataset_config_name": [
"en"
],
"eval_dataset_name": "rte",
"eval_steps": 200,
"evaluation_strategy": "steps",
"gradient_accumulation_steps": 1,
"greater_is_better": true,
"learning_rate": 2.8707127478048054e-05,
"load_best_model_at_end": true,
"max_source_length": 128,
"max_steps": 5000,
"metric_for_best_model": "average_metrics",
"model_name_or_path": "t5-base",
"output_dir": "outputs_search/bitfit.rte.t5-base/18069491",
"overwrite_output_dir": true,
"per_device_eval_batch_size": 32,
"per_device_train_batch_size": 32,
"predict_with_generate": true,
"push_to_hub": false,
"save_steps": 200,
"save_strategy": "steps",
"save_total_limit": 1,
"seed": 100,
"split_validation_test": true,
"task_name": "rte",
"test_dataset_config_name": [
"en"
],
"test_dataset_name": "rte",
"tokenizer_name": "t5-base",
"warmup_steps": 0
}

Some files were not shown because too many files have changed in this diff.