commit
b3935927dc
|
@ -3,7 +3,7 @@ data/
|
|||
logs/*
|
||||
experiments/logs
|
||||
!logs/.gitkeep
|
||||
datasets/*
|
||||
datasets/*
|
||||
!datasets/*.sh
|
||||
.vscode/
|
||||
*.egg-info/
|
||||
|
@ -17,17 +17,17 @@ _build/
|
|||
outputs/
|
||||
log.txt
|
||||
**/DeltaHub/
|
||||
**/sfs_scripts/
|
||||
*beans/
|
||||
**/examples/*/configs/
|
||||
|
||||
**/examples/*/configs/*
|
||||
!examples/*/configs/config_gen.py
|
||||
**/jupyter_notebook_examples/
|
||||
!examples/jupyter_notebook_examples/*.py
|
||||
|
||||
|
||||
!**/examples/*/configs/config_gen.py
|
||||
!examples/*/configs/*.py
|
||||
**/outputs_search/**/*.bin
|
||||
**/outputs_search/**/*.pt
|
||||
|
||||
|
||||
*.db
|
||||
**/nohup.out
|
||||
**/examples/examples_bmtrain/BigModels/down_data
|
||||
|
|
Binary file not shown.
Binary file not shown.
|
@ -1 +0,0 @@
|
|||
Subproject commit 058e5f25c898a1f956e3f17a0db6d62f08173e7f
|
|
@ -1 +0,0 @@
|
|||
Subproject commit 3a5083d61e73bae607574a3047deafaa76b97646
|
|
@ -1,50 +0,0 @@
|
|||
<!---
|
||||
Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
# Use OpenDelta with the Vision Transformer (ViT)
|
||||
|
||||
This example adapts the [Hugging Face image classification example](https://github.com/huggingface/transformers/tree/master/examples/pytorch/image-classification) by adding several
|
||||
lines to the original scripts.
|
||||
|
||||
## Usage
|
||||
### 1. Install the necessary packages
|
||||
```shell
|
||||
pip install Pillow
|
||||
pip install torchvision
|
||||
pip install transformers==4.16.2
|
||||
pip install datasets==1.18.0
|
||||
```
|
||||
|
||||
### 2. Run
|
||||
```bash
|
||||
python run_image_classification.py configs/lora_beans.json
|
||||
```
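
The JSON file bundles the usual `TrainingArguments` fields with the OpenDelta-specific ones (`delta_type`, `modified_modules`, `unfrozen_modules`). A minimal sketch of what the script does with the delta-related fields (API names follow `run_image_classification.py`; the checkpoint is the script's default and the values mirror `configs/lora_beans.json`):

```python
from transformers import AutoModelForImageClassification
from opendelta import AutoDeltaConfig, AutoDeltaModel

# Load the ViT backbone (the example script's default checkpoint).
model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224-in21k")

# The same delta-related fields as in configs/lora_beans.json.
delta_config = AutoDeltaConfig.from_dict({
    "delta_type": "lora",
    "modified_modules": ["attention.query", "attention.value"],
    "unfrozen_modules": ["classifier", "deltas"],
})
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
delta_model.freeze_module(set_state_dict=True)  # freeze everything except the unfrozen modules
delta_model.log(trainable_ratio=True)           # report how few parameters remain trainable
```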
|
||||
|
||||
Do not forget to reinstall datasets 1.17.0 afterwards for the other examples. :)
|
||||
|
||||
|
||||
## Possible Errors
|
||||
1. Dataset connection error
|
||||
|
||||
Solution 1: open a Python console and run the failing command again (this may not always help).
|
||||
|
||||
Solution 2: download the dataset yourself on an internet-connected machine, save it to disk, transfer it to your server, and finally load it with `load_from_disk` (see the sketch below).
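
A minimal sketch of Solution 2, assuming the `datasets` library and the `beans` dataset used in this example (the folder name is illustrative):

```python
# On a machine with internet access:
from datasets import load_dataset
ds = load_dataset("beans")
ds.save_to_disk("beans_saved")   # copy this folder to the offline server

# On the offline server:
from datasets import load_from_disk
ds = load_from_disk("beans_saved")
```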
|
||||
|
||||
|
||||
## Link to original training scripts
|
||||
You may find solutions to other questions about the scripts that are unrelated to OpenDelta at
|
||||
https://github.com/huggingface/transformers/tree/master/examples/pytorch/image-classification
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
{
|
||||
"report_to": "none",
|
||||
"dataset_name": "beans",
|
||||
"output_dir": "./beans_outputs/",
|
||||
"do_train": true,
|
||||
"do_eval": true,
|
||||
"num_train_epochs": 5,
|
||||
"remove_unused_columns": false,
|
||||
"per_device_train_batch_size": 8,
|
||||
"per_device_eval_batch_size": 8,
|
||||
"logging_strategy": "steps",
|
||||
"logging_steps": 10,
|
||||
"evaluation_strategy": "epoch",
|
||||
"save_strategy": "epoch",
|
||||
"load_best_model_at_end": true,
|
||||
"save_total_limit": 3,
|
||||
"seed": 1337,
|
||||
"delta_type": "lora",
|
||||
"modified_modules": [
|
||||
"attention.query",
|
||||
"attention.value"
|
||||
],
|
||||
"unfrozen_modules": [
|
||||
"classifier",
|
||||
"deltas"
|
||||
],
|
||||
"overwrite_output_dir": true,
|
||||
"learning_rate": 5e-4
|
||||
|
||||
}
|
|
@ -1,89 +0,0 @@
|
|||
# coding=utf-8
|
||||
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Accuracy metric."""
|
||||
|
||||
from sklearn.metrics import accuracy_score
|
||||
|
||||
import datasets
|
||||
|
||||
|
||||
_DESCRIPTION = """
|
||||
Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
|
||||
Accuracy = (TP + TN) / (TP + TN + FP + FN)
|
||||
TP: True positive
|
||||
TN: True negative
|
||||
FP: False positive
|
||||
FN: False negative
|
||||
"""
|
||||
|
||||
_KWARGS_DESCRIPTION = """
|
||||
Args:
|
||||
predictions: Predicted labels, as returned by a model.
|
||||
references: Ground truth labels.
|
||||
normalize: If False, return the number of correctly classified samples.
|
||||
Otherwise, return the fraction of correctly classified samples.
|
||||
sample_weight: Sample weights.
|
||||
Returns:
|
||||
accuracy: Accuracy score.
|
||||
Examples:
|
||||
|
||||
>>> accuracy_metric = datasets.load_metric("accuracy")
|
||||
>>> results = accuracy_metric.compute(references=[0, 1], predictions=[0, 1])
|
||||
>>> print(results)
|
||||
{'accuracy': 1.0}
|
||||
"""
|
||||
|
||||
_CITATION = """\
|
||||
@article{scikit-learn,
|
||||
title={Scikit-learn: Machine Learning in {P}ython},
|
||||
author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
|
||||
and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
|
||||
and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
|
||||
Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
|
||||
journal={Journal of Machine Learning Research},
|
||||
volume={12},
|
||||
pages={2825--2830},
|
||||
year={2011}
|
||||
}
|
||||
"""
|
||||
|
||||
|
||||
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
||||
class Accuracy(datasets.Metric):
|
||||
def _info(self):
|
||||
return datasets.MetricInfo(
|
||||
description=_DESCRIPTION,
|
||||
citation=_CITATION,
|
||||
inputs_description=_KWARGS_DESCRIPTION,
|
||||
features=datasets.Features(
|
||||
{
|
||||
"predictions": datasets.Sequence(datasets.Value("int32")),
|
||||
"references": datasets.Sequence(datasets.Value("int32")),
|
||||
}
|
||||
if self.config_name == "multilabel"
|
||||
else {
|
||||
"predictions": datasets.Value("int32"),
|
||||
"references": datasets.Value("int32"),
|
||||
}
|
||||
),
|
||||
reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html"],
|
||||
)
|
||||
|
||||
def _compute(self, predictions, references, normalize=True, sample_weight=None):
|
||||
return {
|
||||
"accuracy": float(
|
||||
accuracy_score(references, predictions, normalize=normalize, sample_weight=sample_weight)
|
||||
)
|
||||
}
|
|
@ -1,3 +0,0 @@
|
|||
# torch>=1.5.0
|
||||
torchvision>=0.6.0
|
||||
datasets>=1.8.0
|
|
@ -1,392 +0,0 @@
|
|||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from PIL import Image
|
||||
from torchvision.transforms import (
|
||||
CenterCrop,
|
||||
Compose,
|
||||
Normalize,
|
||||
RandomHorizontalFlip,
|
||||
RandomResizedCrop,
|
||||
Resize,
|
||||
ToTensor,
|
||||
)
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
|
||||
AutoConfig,
|
||||
AutoFeatureExtractor,
|
||||
AutoModelForImageClassification,
|
||||
HfArgumentParser,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
)
|
||||
from transformers.trainer_utils import get_last_checkpoint
|
||||
from transformers.utils import check_min_version
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
""" Fine-tuning a 🤗 Transformers model for image classification"""
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.16.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
|
||||
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
|
||||
def pil_loader(path: str):
|
||||
with open(path, "rb") as f:
|
||||
im = Image.open(f)
|
||||
return im.convert("RGB")
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataTrainingArguments:
|
||||
"""
|
||||
Arguments pertaining to what data we are going to input our model for training and eval.
|
||||
Using ``HfArgumentParser`` we can turn this class
|
||||
into argparse arguments to be able to specify them on
|
||||
the command line.
|
||||
"""
|
||||
|
||||
dataset_name: Optional[str] = field(
|
||||
default="nateraw/image-folder", metadata={"help": "Name of a dataset from the datasets package"}
|
||||
)
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
train_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the training data."})
|
||||
validation_dir: Optional[str] = field(default=None, metadata={"help": "A folder containing the validation data."})
|
||||
train_val_split: Optional[float] = field(
|
||||
default=0.15, metadata={"help": "Percent to split off of train for validation."}
|
||||
)
|
||||
max_train_samples: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
|
||||
"value if set."
|
||||
},
|
||||
)
|
||||
max_eval_samples: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
|
||||
"value if set."
|
||||
},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
data_files = dict()
|
||||
if self.train_dir is not None:
|
||||
data_files["train"] = self.train_dir
|
||||
if self.validation_dir is not None:
|
||||
data_files["val"] = self.validation_dir
|
||||
self.data_files = data_files if data_files else None
|
||||
|
||||
class RemainArgHfArgumentParser(HfArgumentParser):
|
||||
def parse_json_file(self, json_file: str, return_remaining_args=True):
|
||||
"""
|
||||
Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the
|
||||
dataclass types.
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
from pathlib import Path
|
||||
import dataclasses
|
||||
|
||||
data = json.loads(Path(json_file).read_text())
|
||||
outputs = []
|
||||
for dtype in self.dataclass_types:
|
||||
keys = {f.name for f in dataclasses.fields(dtype) if f.init}
|
||||
inputs = {k: data.pop(k) for k in list(data.keys()) if k in keys}
|
||||
obj = dtype(**inputs)
|
||||
outputs.append(obj)
|
||||
|
||||
remain_args = argparse.Namespace()  # plain namespace to hold the remaining (delta-related) arguments
|
||||
remain_args.__dict__.update(data)
|
||||
if return_remaining_args:
|
||||
return (*outputs, remain_args)
|
||||
else:
|
||||
return (*outputs,)
|
||||
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
"""
|
||||
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
|
||||
"""
|
||||
|
||||
model_name_or_path: str = field(
|
||||
default="google/vit-base-patch16-224-in21k",
|
||||
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
|
||||
)
|
||||
model_type: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
|
||||
)
|
||||
config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
||||
)
|
||||
cache_dir: Optional[str] = field(
|
||||
default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
|
||||
)
|
||||
model_revision: str = field(
|
||||
default="main",
|
||||
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
|
||||
)
|
||||
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
|
||||
use_auth_token: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
|
||||
"with private models)."
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def collate_fn(examples):
|
||||
pixel_values = torch.stack([example["pixel_values"] for example in examples])
|
||||
labels = torch.tensor([example["labels"] for example in examples])
|
||||
return {"pixel_values": pixel_values, "labels": labels}
|
||||
|
||||
|
||||
def main():
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
||||
|
||||
parser = RemainArgHfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
||||
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
|
||||
# If we pass only one argument to the script and it's the path to a json file,
|
||||
# let's parse it to get our arguments.
|
||||
model_args, data_args, training_args, delta_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
|
||||
else:
|
||||
model_args, data_args, training_args, delta_args = parser.parse_args_into_dataclasses()
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
handlers=[logging.StreamHandler(sys.stdout)],
|
||||
)
|
||||
|
||||
log_level = training_args.get_process_log_level()
|
||||
logger.setLevel(log_level)
|
||||
transformers.utils.logging.set_verbosity(log_level)
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
|
||||
# Log on each process the small summary:
|
||||
logger.warning(
|
||||
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
|
||||
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
|
||||
)
|
||||
logger.info(f"Training/evaluation parameters {training_args}")
|
||||
|
||||
# Detecting last checkpoint.
|
||||
last_checkpoint = None
|
||||
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
|
||||
last_checkpoint = get_last_checkpoint(training_args.output_dir)
|
||||
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
|
||||
raise ValueError(
|
||||
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
|
||||
"Use --overwrite_output_dir to overcome."
|
||||
)
|
||||
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
|
||||
logger.info(
|
||||
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
|
||||
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
|
||||
)
|
||||
|
||||
# Initialize our dataset and prepare it for the 'image-classification' task.
|
||||
ds = load_dataset(
|
||||
data_args.dataset_name,
|
||||
data_args.dataset_config_name,
|
||||
data_files=data_args.data_files,
|
||||
cache_dir=model_args.cache_dir,
|
||||
task="image-classification",
|
||||
)
|
||||
# If you encounter an error here, try to download the dataset yourself and load it from disk
|
||||
# like the following two lines
|
||||
# from datasets import load_from_disk
|
||||
# ds = load_from_disk(f"../../../../huggingface_datasets/saved_to_disk/{data_args.dataset_name}")
|
||||
|
||||
# If we don't have a validation split, split off a percentage of train as validation.
|
||||
data_args.train_val_split = None if "validation" in ds.keys() else data_args.train_val_split
|
||||
if isinstance(data_args.train_val_split, float) and data_args.train_val_split > 0.0:
|
||||
split = ds["train"].train_test_split(data_args.train_val_split)
|
||||
ds["train"] = split["train"]
|
||||
ds["validation"] = split["test"]
|
||||
|
||||
# Prepare label mappings.
|
||||
# We'll include these in the model's config to get human readable labels in the Inference API.
|
||||
labels = ds["train"].features["labels"].names
|
||||
label2id, id2label = dict(), dict()
|
||||
for i, label in enumerate(labels):
|
||||
label2id[label] = str(i)
|
||||
id2label[str(i)] = label
|
||||
|
||||
# Load the accuracy metric from the datasets package
|
||||
# metric = datasets.load_metric("accuracy")
|
||||
metric = datasets.load_metric("metric.py")
|
||||
|
||||
# Define our compute_metrics function. It takes an ``EvalPrediction`` object (a namedtuple with a
|
||||
# predictions and label_ids field) and has to return a dictionary string to float.
|
||||
def compute_metrics(p):
|
||||
"""Computes accuracy on a batch of predictions"""
|
||||
return metric.compute(predictions=np.argmax(p.predictions, axis=1), references=p.label_ids)
|
||||
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name or model_args.model_name_or_path,
|
||||
num_labels=len(labels),
|
||||
label2id=label2id,
|
||||
id2label=id2label,
|
||||
finetuning_task="image-classification",
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
model = AutoModelForImageClassification.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
feature_extractor = AutoFeatureExtractor.from_pretrained(
|
||||
model_args.feature_extractor_name or model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
|
||||
|
||||
if delta_args.delta_type.lower() != "none":
|
||||
from opendelta import AutoDeltaConfig,AutoDeltaModel
|
||||
delta_config = AutoDeltaConfig.from_dict(vars(delta_args))
|
||||
delta_model = AutoDeltaModel.from_config(delta_config, backbone_model=model)
|
||||
delta_model.freeze_module(set_state_dict = True)
|
||||
delta_model.log(delta_ratio=True, trainable_ratio=True, visualization=True)
|
||||
|
||||
# Define torchvision transforms to be applied to each image.
|
||||
normalize = Normalize(mean=feature_extractor.image_mean, std=feature_extractor.image_std)
|
||||
_train_transforms = Compose(
|
||||
[
|
||||
RandomResizedCrop(feature_extractor.size),
|
||||
RandomHorizontalFlip(),
|
||||
ToTensor(),
|
||||
normalize,
|
||||
]
|
||||
)
|
||||
_val_transforms = Compose(
|
||||
[
|
||||
Resize(feature_extractor.size),
|
||||
CenterCrop(feature_extractor.size),
|
||||
ToTensor(),
|
||||
normalize,
|
||||
]
|
||||
)
|
||||
|
||||
def train_transforms(example_batch):
|
||||
"""Apply _train_transforms across a batch."""
|
||||
example_batch["pixel_values"] = [
|
||||
_train_transforms(pil_img.convert("RGB")) for pil_img in example_batch["image"]
|
||||
]
|
||||
return example_batch
|
||||
|
||||
def val_transforms(example_batch):
|
||||
"""Apply _val_transforms across a batch."""
|
||||
example_batch["pixel_values"] = [_val_transforms(pil_img.convert("RGB")) for pil_img in example_batch["image"]]
|
||||
return example_batch
|
||||
|
||||
if training_args.do_train:
|
||||
if "train" not in ds:
|
||||
raise ValueError("--do_train requires a train dataset")
|
||||
if data_args.max_train_samples is not None:
|
||||
ds["train"] = ds["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples))
|
||||
# Set the training transforms
|
||||
ds["train"].set_transform(train_transforms)
|
||||
|
||||
if training_args.do_eval:
|
||||
if "validation" not in ds:
|
||||
raise ValueError("--do_eval requires a validation dataset")
|
||||
if data_args.max_eval_samples is not None:
|
||||
ds["validation"] = (
|
||||
ds["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples))
|
||||
)
|
||||
# Set the validation transforms
|
||||
ds["validation"].set_transform(val_transforms)
|
||||
|
||||
# Initialize our trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=ds["train"] if training_args.do_train else None,
|
||||
eval_dataset=ds["validation"] if training_args.do_eval else None,
|
||||
compute_metrics=compute_metrics,
|
||||
tokenizer=feature_extractor,
|
||||
data_collator=collate_fn,
|
||||
)
|
||||
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
checkpoint = None
|
||||
if training_args.resume_from_checkpoint is not None:
|
||||
checkpoint = training_args.resume_from_checkpoint
|
||||
elif last_checkpoint is not None:
|
||||
checkpoint = last_checkpoint
|
||||
train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
||||
trainer.save_model()
|
||||
trainer.log_metrics("train", train_result.metrics)
|
||||
trainer.save_metrics("train", train_result.metrics)
|
||||
trainer.save_state()
|
||||
|
||||
# Evaluation
|
||||
if training_args.do_eval:
|
||||
metrics = trainer.evaluate()
|
||||
trainer.log_metrics("eval", metrics)
|
||||
trainer.save_metrics("eval", metrics)
|
||||
|
||||
# Write model card and (optionally) push to hub
|
||||
kwargs = {
|
||||
"finetuned_from": model_args.model_name_or_path,
|
||||
"tasks": "image-classification",
|
||||
"dataset": data_args.dataset_name,
|
||||
"tags": ["image-classification"],
|
||||
}
|
||||
if training_args.push_to_hub:
|
||||
trainer.push_to_hub(**kwargs)
|
||||
else:
|
||||
trainer.create_model_card(**kwargs)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -10,55 +10,16 @@ This will add `examples_seq2seq` to the environment path of the python lib.
|
|||
|
||||
## Generating the json configuration file
|
||||
|
||||
```shell
|
||||
python configs/gen_$BACKBONETYPE.py --job $YOURJOB
|
||||
#e.g. python configs/gen_beit.py --job lora_beit-base-patch16-224
|
||||
```
|
||||
python config_gen.py --job $job_name
|
||||
|
||||
```
|
||||
The available job configuration (e.g., `--job lora_t5-base`) can be seen from `config_gen.py`. You can also
|
||||
The available job configuration (e.g., `--job lora_beit-base-patch16-224`) can be seen from the scripts. You can also
|
||||
create your own configuration (see the sketch below).
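
A new job is just another entry in `AllConfigs` inside the generation script; a minimal sketch, with an illustrative job name and placeholder hyperparameters:

```python
import copy

# Start from the shared beit-base-patch16-224 settings and override the delta-specific fields.
AllConfigs['my_adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
AllConfigs['my_adapter_beit-base-patch16-224'].update({
    "delta_type": "adapter",
    "learning_rate": 3e-4,  # placeholder value
    "unfrozen_modules": ["deltas", "layer_norm", "final_layer_norm"],
    "output_dir": "outputs/my_adapter/beit-base-patch16-224/",
})
```

Re-running `python configs/gen_beit.py --job my_adapter_beit-base-patch16-224` then writes the corresponding JSON files under `configs/my_adapter_beit-base-patch16-224/`.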
|
||||
|
||||
|
||||
## Run the code
|
||||
|
||||
```
|
||||
python run_seq2seq.py configs/$job_name/$dataset.json
|
||||
CUDA_VISIBLE_DEVICES=1 python src/run.py configs/lora_beit-base-patch16-224/beans.json
|
||||
```
|
||||
|
||||
## Possible Errors
|
||||
|
||||
1.
|
||||
```
|
||||
ValueError: You must login to the Hugging Face hub on this computer by typing `transformers-cli login` and entering your credentials to use `use_auth_token=True`. Alternatively, you can pass your own token as the `use_auth_token` argument.
|
||||
```
|
||||
- Solution 1: Please register an account on [HuggingFace](https://huggingface.co/)
|
||||
Then run `transformers-cli login` on the command line and enter your username and password.
|
||||
|
||||
- Solution 2: Disable pushing to the hub by setting `"push_to_hub": false` in the config json.
|
||||
|
||||
2.
|
||||
```
|
||||
OSError: Looks like you do not have git-lfs installed, please install. You can install from https://git-lfs.github.com/. Then run `git lfs install` (you only have to do this once).
|
||||
```
|
||||
|
||||
- Solution 1:
|
||||
```
|
||||
wget -P ~ https://github.com/git-lfs/git-lfs/releases/download/v3.0.2/git-lfs-linux-amd64-v3.0.2.tar.gz
|
||||
cd ~
|
||||
tar -xvzf git-lfs-linux-amd64-v3.0.2.tar.gz
|
||||
export PATH=~:$PATH
|
||||
git-lfs install
|
||||
```
|
||||
|
||||
- Solution 2: Disable pushing to the hub by setting `"push_to_hub": false` in the config json.
|
||||
|
||||
|
||||
3. Dataset connection error
|
||||
|
||||
Solution 1: open a Python console and run the failing command again (this may not always help).
|
||||
|
||||
Solution 2: download the dataset yourself on an internet-connected machine, save it to disk, transfer it to your server, and finally load it with `load_from_disk`.
|
||||
|
||||
|
||||
## Link to the original training scripts
|
||||
This example repo is based on the [compacter training scripts](https://github.com/rabeehk/compacter), with compacter-related lines removed. Thanks to the authors of the original repo. In addition, in private correspondence, the authors shared the code used to create the json configs. Thanks again for their efforts.
|
||||
|
|
|
@ -0,0 +1,145 @@
|
|||
from openpromptu.data_utils import InputExample
|
||||
import torch
|
||||
from transformers.data.data_collator import torch_default_data_collator
|
||||
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
|
||||
import numpy as np
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoFeatureExtractor,
|
||||
AutoModelForImageClassification,
|
||||
)
|
||||
from transformers import ViTFeatureExtractor
|
||||
|
||||
from transformers import Trainer as HfTrainer
|
||||
import torch.nn as nn
|
||||
|
||||
def process_example(raw_example, **kwargs):
|
||||
tokenizer = kwargs['tokenizer']
|
||||
inputs = tokenizer(raw_example['image'], return_tensors='pt')
|
||||
inputs['labels'] = raw_example['labels']
|
||||
return inputs
|
||||
|
||||
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
|
||||
# from openpromptu.prompts import ManualVerbalizer
|
||||
# from openpromptu.prompts import ManualTemplate
|
||||
# from openpromptu import TokenizerWrapper
|
||||
# template = ManualTemplate(text = task.templates_text[template_id])
|
||||
# verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
|
||||
# tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
|
||||
return None, None, None
|
||||
|
||||
def preprocess_function(raw_example, **kwargs):
|
||||
# from IPython import embed; embed(header="Therefa")
|
||||
tokenizer = kwargs['tokenizer']
|
||||
model_inputs = tokenizer(raw_example['image'], return_tensors='pt')
|
||||
model_inputs['pixel_values'] = model_inputs['pixel_values'].squeeze()
|
||||
model_inputs['labels'] = raw_example['labels']
|
||||
return model_inputs
|
||||
|
||||
def compute_metrics(eval_preds, dataset_name, eval_metric):
|
||||
# from IPython import embed; embed(header="In compute metrics")
|
||||
|
||||
preds, labels = eval_preds.predictions, eval_preds.label_ids
|
||||
|
||||
preds = np.argmax(preds, axis=-1)
|
||||
|
||||
result = {}
|
||||
average_metrics = []
|
||||
for metric in eval_metric:
|
||||
metric_item = metric(preds, labels)
|
||||
metric_value = list(metric_item.values())
|
||||
result.update(metric_item)
|
||||
average_metrics.extend(metric_value)
|
||||
print("average:",average_metrics)
|
||||
average_metric = sum(average_metrics)/len(average_metrics)
|
||||
result.update({"average_metrics":average_metric})
|
||||
return result
|
||||
|
||||
def mask_token_func(tokenizer, ith_mask=0):
|
||||
return tokenizer.mask_token
|
||||
|
||||
def get_remove_columns(dataset_features):
|
||||
# dataset_features.pop("label")
|
||||
print("remove_columns: {}".format(dataset_features))
|
||||
return dataset_features
|
||||
|
||||
class DataCollator(HfDataCollatorMixin):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.return_tensors='pt'
|
||||
|
||||
def torch_call(self, features):
|
||||
# from IPython import embed; embed(header="in data collator")
|
||||
a = torch_default_data_collator(features=features)
|
||||
# from IPython import embed; embed(header="in data collator")
|
||||
return a
|
||||
|
||||
|
||||
def get_backbone(model_args, **kwargs):
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
config.dropout_rate = 0.0
|
||||
tokenizer = AutoFeatureExtractor.from_pretrained(
|
||||
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
|
||||
use_fast=model_args.use_fast_tokenizer,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
|
||||
model = AutoModelForImageClassification.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
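# Resize the classification head: set the number of labels and swap in a freshly initialized linear layer.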
config.num_labels = model_args.num_classes
|
||||
old_classifier = model.classifier
|
||||
model.classifier = nn.Linear(old_classifier.in_features, config.num_labels)
|
||||
|
||||
|
||||
return config, tokenizer, model
|
||||
|
||||
class Trainer(HfTrainer):
|
||||
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.verbalizer=verbalizer
|
||||
self.eval_task=eval_task
|
||||
self.compute_metrics = self._compute_metrics
|
||||
self.loss_fn = nn.CrossEntropyLoss()
|
||||
|
||||
def compute_loss(self, model, inputs, return_outputs=False):
|
||||
labels = inputs.pop('labels')
|
||||
outputs = model(**inputs)
|
||||
logits = outputs.get("logits")
|
||||
|
||||
loss = self.loss_fn(logits, labels)
|
||||
return (loss, outputs) if return_outputs else loss
|
||||
|
||||
def _compute_metrics(self, eval_preds):
|
||||
# from IPython import embed; embed(header="In compute metrics")
|
||||
|
||||
preds, labels = eval_preds.predictions, eval_preds.label_ids
|
||||
|
||||
preds = np.argmax(preds, axis=-1)
|
||||
|
||||
result = {}
|
||||
average_metrics = []
|
||||
for metric in self.eval_task.metric:
|
||||
metric_item = metric(preds, labels)
|
||||
metric_value = list(metric_item.values())
|
||||
result.update(metric_item)
|
||||
average_metrics.extend(metric_value)
|
||||
print("average:",average_metrics)
|
||||
average_metric = sum(average_metrics)/len(average_metrics)
|
||||
result.update({"average_metrics":average_metric})
|
||||
from IPython import embed; embed(header="In compute metrics")
|
||||
return result
|
||||
|
||||
|
|
@ -0,0 +1,141 @@
|
|||
from openpromptu.data_utils import InputExample
|
||||
import torch
|
||||
from transformers.data.data_collator import torch_default_data_collator
|
||||
from transformers.data.data_collator import DataCollatorMixin as HfDataCollatorMixin
|
||||
import numpy as np
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForMaskedLM,
|
||||
AutoTokenizer,
|
||||
)
|
||||
|
||||
from transformers import Trainer as HfTrainer
|
||||
|
||||
|
||||
def preprocess_function(raw_example, **kwargs):
|
||||
tokenizer = kwargs['tokenizer']
|
||||
data_args = kwargs['data_args']
|
||||
template = kwargs['template']
|
||||
verbalizer = kwargs['verbalizer']
|
||||
tokenizer_wrapper = kwargs['tokenizer_wrapper']
|
||||
|
||||
example = InputExample(**raw_example)
|
||||
example, other = template.wrap_one_example(example)
|
||||
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
|
||||
model_inputs = tokenizer(input_sentence, max_length=data_args.max_source_length,
|
||||
padding="max_length", truncation=True)
|
||||
return model_inputs
|
||||
|
||||
def compute_metrics(eval_preds, dataset_name, eval_metric):
|
||||
# from IPython import embed; embed(header="In compute metrics")
|
||||
|
||||
preds, labels = eval_preds.predictions, eval_preds.label_ids
|
||||
|
||||
preds = np.argmax(preds, axis=-1)
|
||||
|
||||
result = {}
|
||||
average_metrics = []
|
||||
for metric in eval_metric:
|
||||
metric_item = metric(preds, labels)
|
||||
metric_value = list(metric_item.values())
|
||||
result.update(metric_item)
|
||||
average_metrics.extend(metric_value)
|
||||
print("average:",average_metrics)
|
||||
average_metric = sum(average_metrics)/len(average_metrics)
|
||||
result.update({"average_metrics":average_metric})
|
||||
return result
|
||||
|
||||
def mask_token_func(tokenizer, ith_mask=0):
|
||||
return tokenizer.mask_token
|
||||
|
||||
def get_remove_columns(dataset_features):
|
||||
dataset_features.pop("label")
|
||||
return dataset_features
|
||||
|
||||
|
||||
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
|
||||
from openpromptu.prompts import ManualVerbalizer
|
||||
from openpromptu.prompts import ManualTemplate
|
||||
from openpromptu import TokenizerWrapper
|
||||
template = ManualTemplate(text = task.templates_text[template_id])
|
||||
verbalizer = ManualVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
|
||||
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
|
||||
return template, verbalizer, tokenizer_wrapper
|
||||
|
||||
class DataCollator(HfDataCollatorMixin):
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.return_tensors='pt'
|
||||
|
||||
def torch_call(self, features):
|
||||
return torch_default_data_collator(features=features)
|
||||
|
||||
|
||||
|
||||
def get_backbone(model_args, **kwargs):
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
config.dropout_rate = 0.0
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_fast=model_args.use_fast_tokenizer,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
model = AutoModelForMaskedLM.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
return config, tokenizer, model
|
||||
|
||||
class Trainer(HfTrainer):
|
||||
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.verbalizer=verbalizer
|
||||
self.eval_task=eval_task
|
||||
self.compute_metrics = self._compute_metrics
|
||||
|
||||
|
||||
def compute_loss(self, model, inputs, return_outputs=False):
|
||||
labels = inputs.pop('labels')
|
||||
outputs = model(**inputs)
|
||||
logits = outputs.get("logits")
|
||||
input_ids = inputs['input_ids']
|
||||
verbalizer = self.verbalizer.cuda()
|
||||
logits_at_mask = logits[torch.where(input_ids == verbalizer.tokenizer.mask_token_id)]
|
||||
label_logits = verbalizer.process_logits(logits_at_mask)
|
||||
loss_fct = torch.nn.CrossEntropyLoss()
|
||||
loss = loss_fct(label_logits, labels)
|
||||
outputs.logits = label_logits
|
||||
return (loss, outputs) if return_outputs else loss
|
||||
|
||||
def _compute_metrics(self, eval_preds):
|
||||
# from IPython import embed; embed(header="In compute metrics")
|
||||
|
||||
preds, labels = eval_preds.predictions, eval_preds.label_ids
|
||||
|
||||
preds = np.argmax(preds, axis=-1)
|
||||
|
||||
result = {}
|
||||
average_metrics = []
|
||||
for metric in self.eval_task.metric:
|
||||
metric_item = metric(preds, labels)
|
||||
metric_value = list(metric_item.values())
|
||||
result.update(metric_item)
|
||||
average_metrics.extend(metric_value)
|
||||
print("average:",average_metrics)
|
||||
average_metric = sum(average_metrics)/len(average_metrics)
|
||||
result.update({"average_metrics":average_metric})
|
||||
return result
|
||||
|
||||
|
|
@ -0,0 +1,178 @@
|
|||
|
||||
from openpromptu.data_utils import InputExample
|
||||
from transformers import Seq2SeqTrainer as HfSeq2SeqTrainer
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoTokenizer,
|
||||
)
|
||||
from transformers.data.data_collator import DataCollatorForSeq2Seq as DataCollator
|
||||
import torch
|
||||
|
||||
def mask_token_func(tokenizer, ith_mask):
|
||||
return tokenizer.additional_special_tokens[ith_mask]
|
||||
|
||||
def get_remove_columns(dataset_features):
|
||||
return dataset_features
|
||||
|
||||
def preprocess_function(raw_example, **kwargs):
|
||||
# max_target_length += 1
|
||||
tokenizer = kwargs['tokenizer']
|
||||
data_args = kwargs['data_args']
|
||||
template = kwargs['template']
|
||||
verbalizer = kwargs['verbalizer']
|
||||
tokenizer_wrapper = kwargs['tokenizer_wrapper']
|
||||
split = kwargs['split']
|
||||
example = InputExample(**raw_example)
|
||||
|
||||
|
||||
try:
|
||||
example = verbalizer.wrap_one_example(example)
|
||||
example, other = template.wrap_one_example(example)
|
||||
input_sentence = tokenizer_wrapper.merge_wrapped_example(example)
|
||||
model_inputs = tokenizer(input_sentence, max_length=256,
|
||||
padding="max_length", truncation=True)
|
||||
except:
|
||||
from IPython import embed; embed(header="Therer")
|
||||
|
||||
with tokenizer.as_target_tokenizer():
|
||||
label = tokenizer(other['tgt_text']).input_ids
|
||||
|
||||
model_inputs["labels"] = label
|
||||
return model_inputs
|
||||
|
||||
def get_backbone(model_args, **kwargs):
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
config.dropout_rate = 0.0
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_fast=model_args.use_fast_tokenizer,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
|
||||
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
return config, tokenizer, model
|
||||
|
||||
|
||||
def get_prompts(task, tokenizer, data_args, template_id="0", verbalizer_id="0"):
|
||||
from openpromptu.prompts import GenerationVerbalizer
|
||||
from openpromptu.prompts import ManualTemplate
|
||||
from openpromptu import TokenizerWrapper
|
||||
template = ManualTemplate(text = task.templates_text[template_id])
|
||||
verbalizer = GenerationVerbalizer(tokenizer=tokenizer, classes = task.labels_list, label_words=task.verbalizers[verbalizer_id])
|
||||
tokenizer_wrapper = TokenizerWrapper(max_seq_length=data_args.max_source_length, tokenizer=tokenizer, truncate_method="balanced", mask_token_func=mask_token_func)
|
||||
return template, verbalizer, tokenizer_wrapper
|
||||
|
||||
class Trainer(HfSeq2SeqTrainer):
|
||||
def __init__(self, verbalizer=None, eval_task=None, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.eval_task = eval_task
|
||||
self.compute_metrics = self._compute_metrics
|
||||
|
||||
def compute_loss(self, model, inputs, return_outputs=False):
|
||||
outputs = model(**inputs)
|
||||
if return_outputs:
|
||||
return (outputs.loss, outputs)
|
||||
else:
|
||||
return outputs.loss
|
||||
|
||||
def prediction_step(
|
||||
self,
|
||||
model, #nn.Module,
|
||||
inputs, #Dict[str, Union[torch.Tensor, Any]],
|
||||
prediction_loss_only, #: bool,
|
||||
ignore_keys, #: Optional[List[str]] = None,
|
||||
): #-> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
|
||||
"""
|
||||
Perform an evaluation step on :obj:`model` using obj:`inputs`.
|
||||
|
||||
Subclass and override to inject custom behavior.
|
||||
|
||||
Args:
|
||||
model (:obj:`nn.Module`):
|
||||
The model to evaluate.
|
||||
inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
|
||||
The inputs and targets of the model.
|
||||
|
||||
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
|
||||
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
|
||||
prediction_loss_only (:obj:`bool`):
|
||||
Whether or not to return the loss only.
|
||||
|
||||
Return:
|
||||
Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and
|
||||
labels (each being optional).
|
||||
"""
|
||||
if not self.args.predict_with_generate or prediction_loss_only:
|
||||
return super().prediction_step(
|
||||
model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys
|
||||
)
|
||||
|
||||
|
||||
has_labels = "labels" in inputs
|
||||
inputs = self._prepare_inputs(inputs)
|
||||
gen_kwargs = {
|
||||
"max_length": 10, # self._max_length if s is not None else self.model.config.max_length,
|
||||
"num_beams": 1 #self._num_beams if self._num_beams is not None else self.model.config.num_beams,
|
||||
}
|
||||
generated_tokens = self.model.generate(
|
||||
inputs["input_ids"],
|
||||
attention_mask=inputs["attention_mask"],
|
||||
**gen_kwargs,
|
||||
)
|
||||
# in case the batch is shorter than max length, the output should be padded
|
||||
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
|
||||
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
|
||||
|
||||
with torch.no_grad():
|
||||
|
||||
outputs = model(**inputs)
|
||||
if has_labels:
|
||||
if self.label_smoother is not None:
|
||||
loss = self.label_smoother(outputs, inputs["labels"]).mean().detach()
|
||||
else:
|
||||
loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).mean().detach()
|
||||
else:
|
||||
loss = None
|
||||
|
||||
if self.args.prediction_loss_only:
|
||||
return (loss, None, None)
|
||||
|
||||
labels = inputs["labels"]
|
||||
if labels.shape[-1] < gen_kwargs["max_length"]:
|
||||
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
|
||||
|
||||
# from IPython import embed; embed(header="In seqseqtrainer")
|
||||
return (loss, generated_tokens, labels)
|
||||
|
||||
def _compute_metrics(self, eval_preds):
|
||||
# from IPython import embed; embed(header="In compute metrics")
|
||||
preds, labels = eval_preds
|
||||
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
|
||||
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
|
||||
# post_processor = .get(data_args.dataset_name[0], tokenizer,
|
||||
# data_args.ignore_pad_token_for_loss)
|
||||
# decoded_preds, decoded_labels = post_processor.process(preds, labels, data_info)
|
||||
result = {}
|
||||
for metric in self.eval_task.metric:
|
||||
result.update(metric(decoded_preds, decoded_labels))
|
||||
|
||||
average_metric = sum(result.values())/len(result)
|
||||
result.update({"average_metrics":average_metric})
|
||||
return result
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
import collections
|
||||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
|
||||
|
||||
#### ALBERT ######
|
||||
BaseConfigs['albert-xlarge-v2'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}albert-xlarge-v2",
|
||||
"tokenizer_name": f"{PATHBASE}albert-xlarge-v2",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"push_to_delta_center": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2'])
|
||||
AllConfigs['prefix_albert-xlarge-v2'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/albert-xlarge-v2/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_albert-xlarge-v2'] = copy.deepcopy(BaseConfigs['albert-xlarge-v2'])
|
||||
AllConfigs['soft_prompt_albert-xlarge-v2'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/albert-xlarge-v2/",
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
parser = argparse.ArgumentParser("Parser to generate configuration")
|
||||
parser.add_argument("--job", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
config = AllConfigs[args.job]
|
||||
|
||||
Cartesian_product = []
|
||||
for key in config:
|
||||
if isinstance(key, tuple):
|
||||
Cartesian_product.append(key)
|
||||
all_config_jsons = {}
|
||||
for key_tuple in Cartesian_product:
|
||||
for zipped in config[key_tuple]:
|
||||
job_name = zipped[0]
|
||||
all_config_jsons[job_name] = {}
|
||||
for key_name, zipped_elem in zip(key_tuple, zipped):
|
||||
if key_name != 'job_name':
|
||||
all_config_jsons[job_name][key_name] = zipped_elem
|
||||
for key in config:
|
||||
if not isinstance(key, tuple):
|
||||
for job_name in all_config_jsons:
|
||||
if key == "output_dir":
|
||||
all_config_jsons[job_name][key] = config[key] + job_name
|
||||
else:
|
||||
all_config_jsons[job_name][key] = config[key]
|
||||
|
||||
|
||||
if not os.path.exists(f"configs/{args.job}/"):
|
||||
os.mkdir(f"configs/{args.job}/")
|
||||
|
||||
for job_name in all_config_jsons:
|
||||
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
|
||||
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,450 @@
|
|||
import collections
|
||||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
BaseConfigs['beit-base-patch16-224'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps", "num_classes"): zip(
|
||||
["beans"],
|
||||
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["beans"], #"superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20],
|
||||
[256],
|
||||
[ 32],
|
||||
[ 32],#, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0], # *7 +[0] *8,
|
||||
[200],# 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200],#, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[ 3],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}beit-base-patch16-224",
|
||||
"tokenizer_name": f"{PATHBASE}beit-base-patch16-224",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"push_to_delta_center": True,
|
||||
"save_strategy": "steps",
|
||||
"datasets_load_from_disk":False,
|
||||
}
|
||||
|
||||
AllConfigs['bitfit_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['bitfit_beit-base-patch16-224'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 3e-4,
|
||||
"output_dir": "outputs/bitfit/beit-base-patch16-224/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['adapter_beit-base-patch16-224'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"bottleneck_dim":24,
|
||||
"output_dir": "outputs/adapter/beit-base-patch16-224/",
|
||||
})
|
||||
|
||||
AllConfigs['lora_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['lora_beit-base-patch16-224'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layernorm_after",
|
||||
"classifier"
|
||||
],
|
||||
"modified_modules":[
|
||||
"query",
|
||||
"value",
|
||||
],
|
||||
"lora_r": 8,
|
||||
"output_dir": "outputs/lora/beit-base-patch16-224/",
|
||||
})
|
||||
|
||||
AllConfigs['compacter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['compacter_beit-base-patch16-224'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter/beit-base-patch16-224/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
AllConfigs['compacter++_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['compacter++_beit-base-patch16-224'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
"modified_modules": [
|
||||
"DenseReluDense"
|
||||
],
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter++/beit-base-patch16-224/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['low_rank_adapter_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['low_rank_adapter_beit-base-patch16-224'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/low_rank_adapter/beit-base-patch16-224/",
|
||||
"non_linearity": "gelu_new",
|
||||
"low_rank_w_init": "glorot-uniform",
|
||||
"low_rank_rank": 1,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['soft_prompt_beit-base-patch16-224'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-2,
|
||||
"soft_token_num":100,
|
||||
"token_init": False,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/beit-base-patch16-224/",
|
||||
})
|
||||
|
||||
AllConfigs['prefix_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['prefix_beit-base-patch16-224'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/beit-base-patch16-224/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_beit-base-patch16-224'] = copy.deepcopy(BaseConfigs['beit-base-patch16-224'])
|
||||
AllConfigs['soft_prompt_beit-base-patch16-224'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/beit-base-patch16-224/",
|
||||
})
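# NOTE: this entry reuses the key 'soft_prompt_beit-base-patch16-224' and therefore
# overrides the earlier definition above (lr 3e-2, 100 soft tokens); only this last
# definition (lr 3e-4) takes effect when the configs are generated.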
|
||||
#### t5-small ####
|
||||
BaseConfigs['t5-small'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}t5-small",
|
||||
"tokenizer_name": f"{PATHBASE}t5-small",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"push_to_delta_center": True,
|
||||
"save_strategy": "steps"
|
||||
}
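# The tuple key in BaseConfigs['t5-small'] above bundles per-task settings
# (epochs, max source length, batch sizes, warmup/save/eval steps); the
# __main__ block at the bottom of this file unzips it into one JSON config
# per task, while the plain keys are shared by every task.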
|
||||
|
||||
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
|
||||
AllConfigs['prefix_t5-small'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-small/",
|
||||
})
|
||||
|
||||
|
||||
|
||||
|
||||
#### ROBERTA######
|
||||
BaseConfigs['roberta-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}roberta-base",
|
||||
"tokenizer_name": f"{PATHBASE}roberta-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
|
||||
|
||||
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['bitfit_roberta-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/bitfit/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['none_roberta-base'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 1e-5,
|
||||
"output_dir": "outputs/none/roberta-base/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['lora_roberta-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/lora/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['adapter_roberta-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/adapter/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['low_rank_adapter_roberta-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/low_rank_adapter/roberta-base/",
|
||||
})
|
||||
|
||||
#### BERT ####
|
||||
BaseConfigs['bert-base-cased'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}bert-base-cased",
|
||||
"tokenizer_name": f"{PATHBASE}bert-base-cased",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['prefix_bert-base-cased'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/bert-base-cased/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['soft_prompt_bert-base-cased'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/bert-base-cased/",
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
    import argparse
    import json
    import os

    parser = argparse.ArgumentParser("Parser to generate configuration")
    parser.add_argument("--job", type=str)
    args = parser.parse_args()

    config = AllConfigs[args.job]

    # Tuple keys bundle per-task hyperparameters; collect them first.
    Cartesian_product = []
    for key in config:
        if isinstance(key, tuple):
            Cartesian_product.append(key)

    # Unzip each tuple key into one config dict per task (the first element
    # of every zipped row is the job/task name).
    all_config_jsons = {}
    for key_tuple in Cartesian_product:
        for zipped in config[key_tuple]:
            job_name = zipped[0]
            all_config_jsons[job_name] = {}
            for key_name, zipped_elem in zip(key_tuple, zipped):
                if key_name != 'job_name':
                    all_config_jsons[job_name][key_name] = zipped_elem

    # Copy the shared (non-tuple) keys into every per-task config; output_dir
    # additionally gets the task name appended.
    for key in config:
        if not isinstance(key, tuple):
            for job_name in all_config_jsons:
                if key == "output_dir":
                    all_config_jsons[job_name][key] = config[key] + job_name
                else:
                    all_config_jsons[job_name][key] = config[key]

    if not os.path.exists(f"configs/{args.job}/"):
        os.mkdir(f"configs/{args.job}/")

    # Write one JSON file per task under configs/<job>/.
    for job_name in all_config_jsons:
        with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
            json.dump(all_config_jsons[job_name], fout, indent=4, sort_keys=True)
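    # Usage sketch (any key defined in AllConfigs above works as the job name):
    #   python config_gen.py --job prefix_beit-base-patch16-224
    # emits configs/prefix_beit-base-patch16-224/<task>.json for every task in the
    # base config; each generated JSON can then be passed to the example's training
    # script, as shown in the example README.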
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,116 @@
|
|||
import collections
|
||||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
|
||||
|
||||
#### BERT ####
|
||||
BaseConfigs['bert-base-cased'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}bert-base-cased",
|
||||
"tokenizer_name": f"{PATHBASE}bert-base-cased",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"push_to_delta_center": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['prefix_bert-base-cased'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/bert-base-cased/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['soft_prompt_bert-base-cased'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/bert-base-cased/",
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
parser = argparse.ArgumentParser("Parser to generate configuration")
|
||||
parser.add_argument("--job", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
config = AllConfigs[args.job]
|
||||
|
||||
Cartesian_product = []
|
||||
for key in config:
|
||||
if isinstance(key, tuple):
|
||||
Cartesian_product.append(key)
|
||||
all_config_jsons = {}
|
||||
for key_tuple in Cartesian_product:
|
||||
for zipped in config[key_tuple]:
|
||||
job_name = zipped[0]
|
||||
all_config_jsons[job_name] = {}
|
||||
for key_name, zipped_elem in zip(key_tuple, zipped):
|
||||
if key_name != 'job_name':
|
||||
all_config_jsons[job_name][key_name] = zipped_elem
|
||||
for key in config:
|
||||
if not isinstance(key, tuple):
|
||||
for job_name in all_config_jsons:
|
||||
if key == "output_dir":
|
||||
all_config_jsons[job_name][key] = config[key] + job_name
|
||||
else:
|
||||
all_config_jsons[job_name][key] = config[key]
|
||||
|
||||
|
||||
if not os.path.exists(f"configs/{args.job}/"):
|
||||
os.mkdir(f"configs/{args.job}/")
|
||||
|
||||
for job_name in all_config_jsons:
|
||||
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
|
||||
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,433 @@
|
|||
import collections
|
||||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
BaseConfigs['t5-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}t5-base",
|
||||
"tokenizer_name": f"{PATHBASE}t5-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['bitfit_t5-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 3e-4,
|
||||
"output_dir": "outputs/bitfit/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['adapter_t5-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"bottleneck_dim":24,
|
||||
"output_dir": "outputs/adapter/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['lora_t5-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"lora_r": 8,
|
||||
"output_dir": "outputs/lora/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['compacter_t5-base'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['compacter++_t5-base'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
"modified_modules": [
|
||||
"DenseReluDense"
|
||||
],
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter++/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['low_rank_adapter_t5-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/low_rank_adapter/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
"low_rank_w_init": "glorot-uniform",
|
||||
"low_rank_rank": 1,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['soft_prompt_t5-base'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-2,
|
||||
"soft_token_num":100,
|
||||
"token_init": False,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['prefix_t5-base'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-base/",
|
||||
})
|
||||
|
||||
#### t5-small ####
|
||||
BaseConfigs['t5-small'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}t5-small",
|
||||
"tokenizer_name": f"{PATHBASE}t5-small",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
|
||||
AllConfigs['prefix_t5-small'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-small/",
|
||||
})
|
||||
|
||||
|
||||
|
||||
|
||||
#### ROBERTA######
|
||||
BaseConfigs['roberta-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}roberta-base",
|
||||
"tokenizer_name": f"{PATHBASE}roberta-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
|
||||
|
||||
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['bitfit_roberta-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/bitfit/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['none_roberta-base'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 1e-5,
|
||||
"output_dir": "outputs/none/roberta-base/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['lora_roberta-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/lora/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['adapter_roberta-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/adapter/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['low_rank_adapter_roberta-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/low_rank_adapter/roberta-base/",
|
||||
})
|
||||
|
||||
#### BERT ####
|
||||
BaseConfigs['bert-base-cased'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}bert-base-cased",
|
||||
"tokenizer_name": f"{PATHBASE}bert-base-cased",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['prefix_bert-base-cased'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/bert-base-cased/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['soft_prompt_bert-base-cased'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/bert-base-cased/",
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
parser = argparse.ArgumentParser("Parser to generate configuration")
|
||||
parser.add_argument("--job", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
config = AllConfigs[args.job]
|
||||
|
||||
Cartesian_product = []
|
||||
for key in config:
|
||||
if isinstance(key, tuple):
|
||||
Cartesian_product.append(key)
|
||||
all_config_jsons = {}
|
||||
for key_tuple in Cartesian_product:
|
||||
for zipped in config[key_tuple]:
|
||||
job_name = zipped[0]
|
||||
all_config_jsons[job_name] = {}
|
||||
for key_name, zipped_elem in zip(key_tuple, zipped):
|
||||
if key_name != 'job_name':
|
||||
all_config_jsons[job_name][key_name] = zipped_elem
|
||||
for key in config:
|
||||
if not isinstance(key, tuple):
|
||||
for job_name in all_config_jsons:
|
||||
if key == "output_dir":
|
||||
all_config_jsons[job_name][key] = config[key] + job_name
|
||||
else:
|
||||
all_config_jsons[job_name][key] = config[key]
|
||||
|
||||
|
||||
if not os.path.exists(f"configs/{args.job}/"):
|
||||
os.mkdir(f"configs/{args.job}/")
|
||||
|
||||
for job_name in all_config_jsons:
|
||||
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
|
||||
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,143 @@
|
|||
import collections
|
||||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
|
||||
#### ROBERTA######
|
||||
BaseConfigs['roberta-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}roberta-base",
|
||||
"tokenizer_name": f"{PATHBASE}roberta-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
|
||||
|
||||
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['bitfit_roberta-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/bitfit/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['none_roberta-base'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 1e-5,
|
||||
"output_dir": "outputs/none/roberta-base/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['lora_roberta-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/lora/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['adapter_roberta-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/adapter/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['low_rank_adapter_roberta-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/low_rank_adapter/roberta-base/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['soft_prompt_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['soft_prompt_roberta-base'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/roberta-base/",
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
parser = argparse.ArgumentParser("Parser to generate configuration")
|
||||
parser.add_argument("--job", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
config = AllConfigs[args.job]
|
||||
|
||||
Cartesian_product = []
|
||||
for key in config:
|
||||
if isinstance(key, tuple):
|
||||
Cartesian_product.append(key)
|
||||
all_config_jsons = {}
|
||||
for key_tuple in Cartesian_product:
|
||||
for zipped in config[key_tuple]:
|
||||
job_name = zipped[0]
|
||||
all_config_jsons[job_name] = {}
|
||||
for key_name, zipped_elem in zip(key_tuple, zipped):
|
||||
if key_name != 'job_name':
|
||||
all_config_jsons[job_name][key_name] = zipped_elem
|
||||
for key in config:
|
||||
if not isinstance(key, tuple):
|
||||
for job_name in all_config_jsons:
|
||||
if key == "output_dir":
|
||||
all_config_jsons[job_name][key] = config[key] + job_name
|
||||
else:
|
||||
all_config_jsons[job_name][key] = config[key]
|
||||
|
||||
|
||||
if not os.path.exists(f"configs/{args.job}/"):
|
||||
os.mkdir(f"configs/{args.job}/")
|
||||
|
||||
for job_name in all_config_jsons:
|
||||
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
|
||||
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,444 @@
|
|||
import collections
|
||||
import copy
|
||||
|
||||
PATHBASE="/mnt/sfs_turbo/hsd/plm_cache/"
|
||||
PATHBASE="/home/hushengding/plm_cache/"
|
||||
|
||||
AllConfigs = {}
|
||||
|
||||
BaseConfigs = {}
|
||||
BaseConfigs['t5-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}t5-base",
|
||||
"tokenizer_name": f"{PATHBASE}t5-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"push_to_delta_center": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['bitfit_t5-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 3e-4,
|
||||
"output_dir": "outputs/bitfit/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['adapter_t5-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"bottleneck_dim":24,
|
||||
"output_dir": "outputs/adapter/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['lora_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['lora_t5-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"lora_r": 8,
|
||||
"output_dir": "outputs/lora/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['compacter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['compacter_t5-base'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
AllConfigs['compacter++_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['compacter++_t5-base'].update({
|
||||
"delta_type": "compacter",
|
||||
"learning_rate": 3e-3,
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
"modified_modules": [
|
||||
"DenseReluDense"
|
||||
],
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/compacter++/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
"phm_init_range": 0.0001,
|
||||
"use_bias_down_sampler": True,
|
||||
"use_bias_up_sampler": True,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['low_rank_adapter_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['low_rank_adapter_t5-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
"layer_norm",
|
||||
"final_layer_norm"
|
||||
],
|
||||
"output_dir": "outputs/low_rank_adapter/t5-base/",
|
||||
"non_linearity": "gelu_new",
|
||||
"low_rank_w_init": "glorot-uniform",
|
||||
"low_rank_rank": 1,
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['soft_prompt_t5-base'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-2,
|
||||
"soft_token_num":100,
|
||||
"token_init": False,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['prefix_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['prefix_t5-base'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-base/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
|
||||
AllConfigs['soft_prompt_t5-base'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/t5-base/",
|
||||
})
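# NOTE: this entry reuses the key 'soft_prompt_t5-base' and overrides the earlier
# definition (lr 3e-2, 100 soft tokens); only this last one (lr 3e-4) takes effect.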
|
||||
#### t5-small ####
|
||||
BaseConfigs['t5-small'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}t5-small",
|
||||
"tokenizer_name": f"{PATHBASE}t5-small",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"push_to_delta_center": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_t5-small'] = copy.deepcopy(BaseConfigs['t5-small'])
|
||||
AllConfigs['prefix_t5-small'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/t5-small/",
|
||||
})
|
||||
|
||||
|
||||
|
||||
|
||||
#### ROBERTA######
|
||||
BaseConfigs['roberta-base'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}roberta-base",
|
||||
"tokenizer_name": f"{PATHBASE}roberta-base",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
|
||||
|
||||
AllConfigs['bitfit_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['bitfit_roberta-base'].update({
|
||||
"delta_type": "bitfit",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/bitfit/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['none_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['none_roberta-base'].update({
|
||||
"delta_type": "none",
|
||||
"learning_rate": 1e-5,
|
||||
"output_dir": "outputs/none/roberta-base/",
|
||||
})
|
||||
|
||||
|
||||
AllConfigs['lora_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['lora_roberta-base'].update({
|
||||
"delta_type": "lora",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/lora/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['adapter_roberta-base'].update({
|
||||
"delta_type": "adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/adapter/roberta-base/",
|
||||
})
|
||||
|
||||
AllConfigs['low_rank_adapter_roberta-base'] = copy.deepcopy(BaseConfigs['roberta-base'])
|
||||
AllConfigs['low_rank_adapter_roberta-base'].update({
|
||||
"delta_type": "low_rank_adapter",
|
||||
"learning_rate": 1e-3,
|
||||
"output_dir": "outputs/low_rank_adapter/roberta-base/",
|
||||
})
|
||||
|
||||
#### BERT ####
|
||||
BaseConfigs['bert-base-cased'] = {
|
||||
("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
|
||||
"max_source_length",
|
||||
"per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
|
||||
"superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[ 32, 32, 32, 32, 32, 16, 32] + [32] * 8,
|
||||
[0] *7 +[0] *8,
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
[200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
|
||||
),
|
||||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
"model_name_or_path": f"{PATHBASE}bert-base-cased",
|
||||
"tokenizer_name": f"{PATHBASE}bert-base-cased",
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
"is_seq2seq": False,
|
||||
"split_validation_test": True,
|
||||
"seed": 42,
|
||||
"dataset_config_name": ["en"],
|
||||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": False,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"greater_is_better": True,
|
||||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": True,
|
||||
"save_strategy": "steps"
|
||||
}
|
||||
|
||||
AllConfigs['prefix_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['prefix_bert-base-cased'].update({
|
||||
"delta_type": "prefix",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/prefix/bert-base-cased/",
|
||||
})
|
||||
|
||||
AllConfigs['soft_prompt_bert-base-cased'] = copy.deepcopy(BaseConfigs['bert-base-cased'])
|
||||
AllConfigs['soft_prompt_bert-base-cased'].update({
|
||||
"delta_type": "soft_prompt",
|
||||
"learning_rate": 3e-4,
|
||||
"unfrozen_modules": [
|
||||
"deltas",
|
||||
],
|
||||
"output_dir": "outputs/soft_prompt/bert-base-cased/",
|
||||
})
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
parser = argparse.ArgumentParser("Parser to generate configuration")
|
||||
parser.add_argument("--job", type=str)
|
||||
args = parser.parse_args()
|
||||
|
||||
config = AllConfigs[args.job]
|
||||
|
||||
Cartesian_product = []
|
||||
for key in config:
|
||||
if isinstance(key, tuple):
|
||||
Cartesian_product.append(key)
|
||||
all_config_jsons = {}
|
||||
for key_tuple in Cartesian_product:
|
||||
for zipped in config[key_tuple]:
|
||||
job_name = zipped[0]
|
||||
all_config_jsons[job_name] = {}
|
||||
for key_name, zipped_elem in zip(key_tuple, zipped):
|
||||
if key_name != 'job_name':
|
||||
all_config_jsons[job_name][key_name] = zipped_elem
|
||||
for key in config:
|
||||
if not isinstance(key, tuple):
|
||||
for job_name in all_config_jsons:
|
||||
if key == "output_dir":
|
||||
all_config_jsons[job_name][key] = config[key] + job_name
|
||||
else:
|
||||
all_config_jsons[job_name][key] = config[key]
|
||||
|
||||
|
||||
if not os.path.exists(f"configs/{args.job}/"):
|
||||
os.mkdir(f"configs/{args.job}/")
|
||||
|
||||
for job_name in all_config_jsons:
|
||||
with open(f"configs/{args.job}/{job_name}.json", 'w') as fout:
|
||||
json.dump(all_config_jsons[job_name], fout, indent=4,sort_keys=True)
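For readers unfamiliar with the tuple-keyed configs above, here is a minimal, self-contained sketch of the same expansion logic with hypothetical values (not the real task lists), showing how one JSON per job_name is produced:

# Sketch only: a toy tuple-keyed config, expanded the same way as in config_gen's __main__ block.
import json

config = {
    ("job_name", "learning_rate", "num_train_epochs"): zip(
        ["rte", "mrpc"],
        [1e-3, 3e-4],
        [20, 20],
    ),
    "delta_type": "lora",
    "output_dir": "outputs/lora/roberta-base/",
}

jobs = {}
for key in list(config):
    if isinstance(key, tuple):
        # each zipped row becomes one job; the first element is the job name
        for row in config[key]:
            job = dict(zip(key, row))
            jobs[job.pop("job_name")] = job

for name, cfg in jobs.items():
    for key, value in config.items():
        if isinstance(key, tuple):
            continue
        # output_dir gets the job name appended, every other scalar key is copied verbatim
        cfg[key] = value + name if key == "output_dir" else value
    print(name, json.dumps(cfg, sort_keys=True))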
|
||||
|
||||
|
||||
|
|
@ -1,3 +1,3 @@
|
|||
from .tasks import TASK_MAPPING, AutoTask
|
||||
from .data_collator import TaskDataCollatorForSeq2Seq
|
||||
from .postprocessors import AutoPostProcessor
|
||||
# from .data_collator import TaskDataCollatorForSeq2Seq
|
||||
# from .postprocessors import AutoPostProcessor
|
||||
|
|
|
@ -1,16 +0,0 @@
|
|||
import numpy as np
|
||||
from dataclasses import dataclass
|
||||
from transformers import DataCollatorForSeq2Seq
|
||||
|
||||
|
||||
@dataclass
|
||||
class TaskDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
|
||||
def check_uniqueness(self, samples):
|
||||
assert len(np.unique(samples)) == 1
|
||||
|
||||
def __call__(self, features):
|
||||
# tasks = [d.pop('task') for d in features]
|
||||
# self.check_uniqueness(tasks)
|
||||
output = super().__call__(features)
|
||||
# output["task"] = tasks[0]
|
||||
return output
|
|
@ -1,67 +0,0 @@
|
|||
import abc
|
||||
from collections import OrderedDict
|
||||
import numpy as np
|
||||
|
||||
"""Defines functions to process the outputs to make them ready for the evaluation."""
|
||||
|
||||
def string_to_float(string, default=-1., **unused_kwargs):
|
||||
"""Converts string to float, using default when conversion not possible."""
|
||||
try:
|
||||
return float(string)
|
||||
except ValueError:
|
||||
return default
|
||||
|
||||
|
||||
class PostProcessor(abc.ABC):
|
||||
"""Postprocess the predictions and labels to make them suitable for
|
||||
evaluation."""
|
||||
def __init__(self, tokenizer, ignore_pad_token_for_loss):
|
||||
self.tokenizer = tokenizer
|
||||
self.ignore_pad_token_for_loss = ignore_pad_token_for_loss
|
||||
|
||||
|
||||
def process(self, preds, labels, data_info=None):
|
||||
if isinstance(preds, tuple):
|
||||
preds = preds[0]
|
||||
decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
|
||||
if self.ignore_pad_token_for_loss:
|
||||
# Replace -100 in the labels as we can't decode them.
|
||||
labels = np.where(labels != -100, labels, self.tokenizer.pad_token_id)
|
||||
decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)
|
||||
# Some simple post-processing
|
||||
decoded_preds = [pred.strip() for pred in decoded_preds]
|
||||
decoded_labels = [label.strip() for label in decoded_labels]
|
||||
return decoded_preds, decoded_labels
|
||||
|
||||
|
||||
class MultiRC(PostProcessor):
|
||||
def process(self, preds, labels, data_info):
|
||||
preds, labels = super().process(preds, labels, data_info)
|
||||
preds = [{"group": info["group"], "value":pred} \
|
||||
for info, pred in zip(data_info, preds)]
|
||||
labels = [{"group": info["group"], "value": label}\
|
||||
for info, label in zip(data_info, labels)]
|
||||
return preds, labels
|
||||
|
||||
class Record(PostProcessor):
|
||||
def process(self, preds, labels, data_info):
|
||||
preds, labels = super().process(preds, labels, data_info)
|
||||
labels = [info["answers"] for info in data_info]
|
||||
return preds, labels
|
||||
|
||||
|
||||
POSTPROCESSOR_MAPPING = OrderedDict(
|
||||
[
|
||||
('superglue-record', Record),
|
||||
('superglue-multirc', MultiRC)
|
||||
]
|
||||
)
|
||||
|
||||
class AutoPostProcessor:
|
||||
@classmethod
|
||||
def get(self, task, tokenizer, ignore_pad_token_for_loss):
|
||||
if task in POSTPROCESSOR_MAPPING:
|
||||
return POSTPROCESSOR_MAPPING[task](tokenizer, ignore_pad_token_for_loss)
|
||||
return PostProcessor(tokenizer, ignore_pad_token_for_loss)
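A small usage sketch of the dispatch above; the tokenizer here is a toy stand-in (the real one comes from transformers), chosen only so the example runs without extra dependencies:

# Sketch only: AutoPostProcessor picks MultiRC for 'superglue-multirc' and the generic
# PostProcessor for anything else.
import numpy as np

class ToyTokenizer:
    pad_token_id = 0
    def batch_decode(self, ids, skip_special_tokens=True):
        return [" ".join(str(i) for i in row if i != 0) for row in ids]

post = AutoPostProcessor.get("superglue-multirc", ToyTokenizer(), ignore_pad_token_for_loss=True)
preds, labels = post.process([[5, 6]], np.array([[7, -100]]), data_info=[{"group": 0}])
print(preds)   # [{'group': 0, 'value': '5 6'}]
print(labels)  # [{'group': 0, 'value': '7'}]  (-100 was replaced by pad before decoding)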
|
||||
|
||||
|
|
@ -0,0 +1,96 @@
|
|||
import abc
|
||||
from typing import Callable, List, Mapping, Dict
|
||||
import datasets
|
||||
import logging
|
||||
import numpy as np
|
||||
import torch
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AbstractTask(abc.ABC):
|
||||
name = NotImplemented
|
||||
config = NotImplemented
|
||||
prefix = NotImplemented
|
||||
metric = NotImplemented
|
||||
metric_names = NotImplemented
|
||||
split_map = None
|
||||
labels_list = None
|
||||
split_to_data_split: Mapping[str, str] = \
|
||||
{"train": "train", "validation": "validation", "test": "test"}
|
||||
split_valid_to_make_test = True
|
||||
split_train_to_make_test = False
|
||||
keep_fields_after_preprocess = ["label"] # The fields that should be kept even after preprocessing
|
||||
|
||||
def __init__(self, config, data_args, seed=42, default_max_length=1):
|
||||
self.config = config
|
||||
self.seed = seed
|
||||
self.data_args = data_args
|
||||
|
||||
self.default_max_length = default_max_length
|
||||
|
||||
def check_n_obs(self, n_obs, total_size):
|
||||
if n_obs is not None and n_obs > total_size:
|
||||
n_obs = total_size
|
||||
logger.warning("n_obs is set to %s", n_obs)
|
||||
return n_obs
|
||||
|
||||
def shuffled_indices(self, dataset):
|
||||
num_samples = len(dataset)
|
||||
generator = torch.Generator()
|
||||
generator.manual_seed(self.seed)
|
||||
return torch.randperm(num_samples, generator=generator).tolist()
|
||||
|
||||
def subsample(self, dataset, n_obs=None, indices=None):
|
||||
"""
|
||||
Given a dataset returns the subsampled dataset.
|
||||
:param n_obs: the number of samples of the subsampled dataset.
|
||||
:param indices: indices to select the samples from, if not given, indices are computed
|
||||
by shuffling the given dataset.
|
||||
:return: subsampled dataset.
|
||||
"""
|
||||
num_samples = len(dataset)
|
||||
n_obs = self.check_n_obs(n_obs, num_samples)
|
||||
if indices is None:
|
||||
indices = self.shuffled_indices(dataset)
|
||||
indices = indices[:n_obs]
|
||||
return dataset.select(indices)
|
||||
|
||||
def load_dataset(self, split: int):
|
||||
return datasets.load_dataset(self.name, self.config, split=split, script_version="master")
|
||||
|
||||
def get_split_indices(self, split, dataset, validation_size):
|
||||
indices = self.shuffled_indices(dataset)
|
||||
if split == "validation":
|
||||
return indices[:validation_size]
|
||||
else:
|
||||
return indices[validation_size:]
|
||||
|
||||
def preprocessor(self, example):
|
||||
return example
|
||||
|
||||
def get(self, split, n_obs=None, split_validation_test=False):
|
||||
# For small datasets (n_samples < 10K) without a test set, we split the validation set in
|
||||
# half, using one half as the test set and the other half as the validation set.
|
||||
if split in ["eval", "dev", "valid"]:
|
||||
split = "validation"
|
||||
if split_validation_test and self.split_valid_to_make_test \
|
||||
and split != "train":
|
||||
mapped_split = self.split_to_data_split["validation"]
|
||||
dataset = self.load_dataset(split=mapped_split)
|
||||
indices = self.get_split_indices(split, dataset, validation_size=len(dataset)//2)
|
||||
dataset = self.subsample(dataset, n_obs, indices)
|
||||
# For larger datasets (n_samples > 10K), we divide training set into 1K as
|
||||
# validation and the rest as training set, keeping the original validation
|
||||
# set as the test set.
|
||||
elif split_validation_test and self.split_train_to_make_test \
|
||||
and split != "test":
|
||||
dataset = self.load_dataset(split="train")
|
||||
indices = self.get_split_indices(split, dataset, validation_size=1000)
|
||||
dataset = self.subsample(dataset, n_obs, indices)
|
||||
else:
|
||||
mapped_split = self.split_to_data_split[split]
|
||||
dataset = self.load_dataset(split=mapped_split)
|
||||
# shuffles the data and samples it.
|
||||
if n_obs is not None:
|
||||
dataset = self.subsample(dataset, n_obs)
|
||||
return dataset.map(self.preprocessor)
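A rough illustration of the validation-splitting rule in get(): because the derived validation and test halves come from the same seeded permutation, they never overlap. The sizes below are hypothetical:

# Sketch of the split rule with a toy "dataset" of 10 validation examples.
import torch

def shuffled_indices(n, seed=42):
    g = torch.Generator()
    g.manual_seed(seed)
    return torch.randperm(n, generator=g).tolist()

n_validation = 10
perm = shuffled_indices(n_validation)
validation_half = perm[:n_validation // 2]   # what get("validation", split_validation_test=True) keeps
test_half = perm[n_validation // 2:]         # what get("test", split_validation_test=True) keeps
assert not set(validation_half) & set(test_half)
print(validation_half, test_half)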
|
|
@ -1,4 +1,4 @@
|
|||
|
||||
# from openprompt.prompts import ManualTemplate
|
||||
|
||||
class BasePrompt(object):
|
||||
def __init__(self, template_id=0, verbalizer_id=0, generation=True):
|
||||
|
@ -9,26 +9,28 @@ class BasePrompt(object):
|
|||
self.verbalizer = self.mlmhead_verbalizers[verbalizer_id]
|
||||
|
||||
|
||||
|
||||
def __call__(self, example):
|
||||
|
||||
def eval_syntax(syntaxlist, example):
|
||||
composed = []
|
||||
for x in syntaxlist:
|
||||
if x.startswith("[_eval_]"):
|
||||
t = eval(x[len("[_eval_]"):])
|
||||
t = eval(x[len("[_eval_]"):])
|
||||
else:
|
||||
t = x
|
||||
composed.append(t)
|
||||
return composed
|
||||
src_texts = eval_syntax(self.template,example)
|
||||
|
||||
|
||||
tgt_texts = self.verbalizer[str(example['label'])]
|
||||
if isinstance(tgt_texts, list):
|
||||
tgt_texts = eval_syntax(tgt_texts, example)
|
||||
else:
|
||||
tgt_texts = [tgt_texts]
|
||||
return src_texts, tgt_texts
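A toy sketch of the [_eval_] convention used by eval_syntax above: entries prefixed with [_eval_] are evaluated as Python expressions against the current example, everything else is copied verbatim (the template and example here are made up, not the real MRPC ones):

# Toy demonstration of the [_eval_] template convention.
example = {"sentence1": "The cat sat.", "sentence2": "A cat was sitting.", "label": 1}

template = ["sentence1:", "[_eval_]example['sentence1']",
            "sentence2:", "[_eval_]example['sentence2']",
            "Meanings different or same? Answer:"]

def eval_syntax(syntaxlist, example):
    composed = []
    for x in syntaxlist:
        if x.startswith("[_eval_]"):
            composed.append(eval(x[len("[_eval_]"):]))   # evaluated against the local `example`
        else:
            composed.append(x)
    return composed

print(eval_syntax(template, example))
# ['sentence1:', 'The cat sat.', 'sentence2:', 'A cat was sitting.', 'Meanings different or same? Answer:']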
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -48,7 +50,7 @@ class MRPCPrompt(BasePrompt):
|
|||
"1": "same"
|
||||
}
|
||||
textual_templates = [
|
||||
["sentence1:", """[_eval_]example['sentence1']""",
|
||||
["sentence1:", """[_eval_]example['sentence1']""",
|
||||
"sentence2:", """[_eval_]example["sentence2"]""", "Meanings different of same? Answer: " ]
|
||||
]
|
||||
|
||||
|
@ -68,7 +70,7 @@ class BoolQPrompt(BasePrompt):
|
|||
"1": "same"
|
||||
}
|
||||
textual_templates = [
|
||||
["sentence1:", """[_eval_]example['sentence1']""",
|
||||
["sentence1:", """[_eval_]example['sentence1']""",
|
||||
"sentence2:", """[_eval_]example["sentence2"]""", "Meanings different of same? Answer: " ]
|
||||
]
|
||||
|
||||
|
@ -84,7 +86,7 @@ class BoolQPrompt(BasePrompt):
|
|||
"1": "yes"
|
||||
}
|
||||
textual_templates = [
|
||||
["hypothesis:", """[_eval_]example['hypothesis']""",
|
||||
["hypothesis:", """[_eval_]example['hypothesis']""",
|
||||
"premise:", """[_eval_]example["premise"]""", "The answer was " ]
|
||||
]
|
||||
|
||||
|
@ -100,7 +102,7 @@ class COLAPrompt(BasePrompt):
|
|||
"1": "Yes"
|
||||
}
|
||||
textual_templates = [
|
||||
["sentence:", """[_eval_]example['sentence']""",
|
||||
["sentence:", """[_eval_]example['sentence']""",
|
||||
"grammar correct? " ]
|
||||
]
|
||||
|
||||
|
@ -119,7 +121,7 @@ class RTEPrompt(BasePrompt):
|
|||
textual_templates = [
|
||||
["sentence1:", """[_eval_]example['premise']""", "sentence2:",
|
||||
"""[_eval_]example['hypothesis']""",
|
||||
"The answer was " ]
|
||||
"The answer was "]
|
||||
]
|
||||
|
||||
class CBPrompt(BasePrompt):
|
||||
|
@ -147,6 +149,5 @@ PromptCollections = {
|
|||
"superglue-boolq": BoolQPrompt,
|
||||
"cb": CBPrompt,
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
from collections import OrderedDict
|
||||
import collections
|
||||
import collections
|
||||
import abc
|
||||
import functools
|
||||
from typing import Callable, List, Mapping
|
||||
from examples_prompt.trainers.trainer_utils import pad_punctuation
|
||||
from .utils import pad_punctuation
|
||||
from examples_prompt.metrics import metrics
|
||||
from .utils import round_stsb_target
|
||||
import datasets
|
||||
|
@ -12,119 +12,26 @@ import logging
|
|||
import numpy as np
|
||||
import torch
|
||||
import re
|
||||
from examples_prompt.data_processors.prompt import PromptCollections
|
||||
from openprompt.prompts import ManualTemplate, ManualVerbalizer
|
||||
from openprompt.plms.utils import TokenizerWrapper
|
||||
from openprompt.data_utils import InputExample
|
||||
from openprompt.prompts import GenerationVerbalizer
|
||||
import itertools
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class AbstractTask(abc.ABC):
|
||||
name = NotImplemented
|
||||
config = NotImplemented
|
||||
prefix = NotImplemented
|
||||
metric = NotImplemented
|
||||
metric_names = NotImplemented
|
||||
split_map = None
|
||||
labels_list = None
|
||||
split_to_data_split: Mapping[str, str] = \
|
||||
{"train": "train", "validation": "validation", "test": "test"}
|
||||
small_datasets_without_all_splits = ["cola", "wnli", "rte", "superglue-cb", "superglue-copa", "superglue-multirc",
|
||||
"superglue-wic", "superglue-wsc.fixed", "superglue-rte", "mrpc", "stsb",
|
||||
"superglue-boolq"]
|
||||
large_data_without_all_splits = ["qqp", "qnli", "superglue-record", "sst2"]
|
||||
|
||||
def __init__(self, config, seed=42):
|
||||
self.config = config
|
||||
self.seed = seed
|
||||
|
||||
tid = getattr(config, "template_id", 0)
|
||||
vid = getattr(config, "verbalizer_id", 0)
|
||||
generation_paradigm = getattr(config, "generation_paradigm", True)
|
||||
self.prompt = PromptCollections[self.name](tid, vid, generation_paradigm)
|
||||
from transformers.models.auto.tokenization_auto import tokenizer_class_from_name
|
||||
|
||||
def get_max_target_length(self, tokenizer, default_max_length):
|
||||
if self.prompt.verbalizer is not None:
|
||||
return max([len(tokenizer.encode(label)) for key, label in self.prompt.verbalizer.items()])
|
||||
return default_max_length
|
||||
from typing import List, Dict
|
||||
from collections import defaultdict
|
||||
from openprompt.utils import round_list
|
||||
import warnings
|
||||
|
||||
def seq2seq_format(self, source, target, extra_fields={}
|
||||
):
|
||||
|
||||
return {'source': ' '.join(source),
|
||||
'target': ' '.join(target),
|
||||
'task': self.name,
|
||||
'extra_fields': extra_fields
|
||||
}
|
||||
|
||||
def check_n_obs(self, n_obs, total_size):
|
||||
if n_obs is not None and n_obs > total_size:
|
||||
n_obs = total_size
|
||||
logger.warning("n_obs is set to %s", n_obs)
|
||||
return n_obs
|
||||
|
||||
def shuffled_indices(self, dataset):
|
||||
num_samples = len(dataset)
|
||||
generator = torch.Generator()
|
||||
generator.manual_seed(self.seed)
|
||||
return torch.randperm(num_samples, generator=generator).tolist()
|
||||
|
||||
def subsample(self, dataset, n_obs=None, indices=None):
|
||||
"""
|
||||
Given a dataset returns the subsampled dataset.
|
||||
:param n_obs: the number of samples of the subsampled dataset.
|
||||
:param indices: indices to select the samples from, if not given, indices are computed
|
||||
by shuffling the given dataset.
|
||||
:return: subsampled dataset.
|
||||
"""
|
||||
num_samples = len(dataset)
|
||||
n_obs = self.check_n_obs(n_obs, num_samples)
|
||||
if indices is None:
|
||||
indices = self.shuffled_indices(dataset)
|
||||
indices = indices[:n_obs]
|
||||
return dataset.select(indices)
|
||||
|
||||
def load_dataset(self, split: int):
|
||||
return datasets.load_dataset(self.name, self.config, split=split, script_version="master")
|
||||
|
||||
def get_split_indices(self, split, dataset, validation_size):
|
||||
indices = self.shuffled_indices(dataset)
|
||||
if split == "validation":
|
||||
return indices[:validation_size]
|
||||
else:
|
||||
return indices[validation_size:]
|
||||
|
||||
|
||||
def map_dataset(self, dataset, add_prefix):
|
||||
# from IPython import embed; embed(header="in get target length")
|
||||
return dataset.map(self.preprocessor)
|
||||
|
||||
|
||||
def preprocessor(self, example):
|
||||
source, target = self.prompt(example)
|
||||
return self.seq2seq_format(source, target, extra_fields={})
|
||||
|
||||
def get(self, split, add_prefix=True, n_obs=None, split_validation_test=False):
|
||||
# For small datasets (n_samples < 10K) without a test set, we split the validation set in
|
||||
# half, using one half as the test set and the other half as the validation set.
|
||||
if split_validation_test and self.name in self.small_datasets_without_all_splits \
|
||||
and split != "train":
|
||||
mapped_split = self.split_to_data_split["validation"]
|
||||
dataset = self.load_dataset(split=mapped_split)
|
||||
indices = self.get_split_indices(split, dataset, validation_size=len(dataset)//2)
|
||||
dataset = self.subsample(dataset, n_obs, indices)
|
||||
# For larger datasets (n_samples > 10K), we divide training set into 1K as
|
||||
# validation and the rest as training set, keeping the original validation
|
||||
# set as the test set.
|
||||
elif split_validation_test and self.name in self.large_data_without_all_splits \
|
||||
and split != "test":
|
||||
dataset = self.load_dataset(split="train")
|
||||
indices = self.get_split_indices(split, dataset, validation_size=1000)
|
||||
dataset = self.subsample(dataset, n_obs, indices)
|
||||
else:
|
||||
mapped_split = self.split_to_data_split[split]
|
||||
dataset = self.load_dataset(split=mapped_split)
|
||||
# shuffles the data and samples it.
|
||||
if n_obs is not None:
|
||||
dataset = self.subsample(dataset, n_obs)
|
||||
return self.map_dataset(dataset, add_prefix)
|
||||
from .processor import AbstractTask
|
||||
|
||||
class Squad(AbstractTask):
|
||||
name = "squad"
|
||||
|
@ -143,25 +50,7 @@ class Squad(AbstractTask):
|
|||
return self.seq2seq_format(source, target, add_prefix)
|
||||
|
||||
|
||||
class MRPC(AbstractTask):
|
||||
name = "mrpc"
|
||||
labels_list = ["0", "1"]
|
||||
metric = [metrics.f1_score, metrics.accuracy]
|
||||
metric_names = ["f1", "accuracy"]
|
||||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'mrpc', split=split, script_version="master")
|
||||
|
||||
# def preprocessor(self, example, add_prefix=True):
|
||||
# src_texts = ["sentence1:", example['sentence1'],
|
||||
# "sentence2:", example["sentence2"]]
|
||||
# tgt_texts = [str(example['label'])]
|
||||
# return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
|
||||
|
||||
##GLUE
|
||||
class COLA(AbstractTask):
|
||||
name = "cola"
|
||||
labels_list = ["0", "1"]
|
||||
|
@ -171,14 +60,19 @@ class COLA(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'cola',
|
||||
split=split, script_version="master")
|
||||
templates_text = {"0": """sentence: {"meta": 'sentence', "shortenable":True} Are there any error in the sentence? {"mask"}""",
|
||||
}
|
||||
|
||||
# def preprocessor(self, example, add_prefix=True):
|
||||
# src_texts = ["sentence:", example['sentence']]
|
||||
# tgt_texts = [str(example['label'])]
|
||||
# return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
verbalizers = {
|
||||
"0":{ "0": "yes", "1": "no"}
|
||||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.cola")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'cola',
|
||||
split=split, script_version="master")
|
||||
|
||||
|
||||
class SST2(AbstractTask):
|
||||
|
@ -190,34 +84,50 @@ class SST2(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
verbalizers = {
|
||||
"0":{"0":"negative","1":"positive"}
|
||||
}
|
||||
|
||||
templates_text = {
|
||||
"0":"""The sentiment of sentence: "{"meta":"sentence", "shortenable":True} is {"mask"}."""
|
||||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'sst2',
|
||||
split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["sentence:", example['sentence']]
|
||||
tgt_texts = [str(example['label'])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.sst2")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'sst2',
|
||||
split=split, script_version="master")
|
||||
|
||||
|
||||
class STSB(AbstractTask):
|
||||
name = "stsb"
|
||||
labels_list = [str(np.round(label, decimals=1)) for label in np.arange(0, 5.2, 0.2)]
|
||||
metric = [metrics.pearson_corrcoef, metrics.spearman_corrcoef]
|
||||
metric_names = ["pearson", "spearmanr"]
|
||||
|
||||
class MRPC(AbstractTask):
|
||||
name = "mrpc"
|
||||
labels_list = ["0", "1"]
|
||||
metric = [metrics.f1_score, metrics.accuracy]
|
||||
metric_names = ["f1", "accuracy"]
|
||||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'stsb',
|
||||
split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["sentence1:", example['sentence1'],
|
||||
"sentence2:", example["sentence2"]]
|
||||
tgt_texts = [str(round_stsb_target(example['label']))]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
templates_text = {
|
||||
"0": """sentence1: {"meta": 'sentence1', "shortenable":True}. sentence2: {"meta":"sentence2", "shortenable":True}. Are sentence1 and sentence2 equivalent? {"mask"}.""",
|
||||
}
|
||||
|
||||
verbalizers = {
|
||||
"0":{"0": "no","1": "yes"}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mrpc")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'mrpc', split=split, script_version="master")
|
||||
|
||||
|
||||
|
||||
class QQP(AbstractTask):
|
||||
|
@ -229,14 +139,46 @@ class QQP(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
templates_text = {"0":
|
||||
"""question1: {"meta": 'question1', "shortenable":True}. question2: {"meta": 'question2', "shortenable":True} Are question1 and question2 equivalent? {"mask"}."""
|
||||
}
|
||||
|
||||
verbalizers = {
|
||||
"0":{"0": "no","1": "yes"}
|
||||
}
|
||||
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'qqp',
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qqp")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'qqp',
|
||||
split=split, script_version="master")
|
||||
|
||||
|
||||
|
||||
class STSB(AbstractTask):
|
||||
name = "stsb"
|
||||
labels_list = [str(np.round(label, decimals=1)) for label in np.arange(0, 5.2, 0.2)]
|
||||
metric = [metrics.pearson_corrcoef, metrics.spearman_corrcoef]
|
||||
metric_names = ["pearson", "spearmanr"]
|
||||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
|
||||
verbalizers = {
|
||||
""
|
||||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'stsb',
|
||||
split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["question1:", example['question1'],
|
||||
"question2:", example["question2"]]
|
||||
tgt_texts = [str(example['label'])]
|
||||
src_texts = ["sentence1:", example['sentence1'],
|
||||
"sentence2:", example["sentence2"]]
|
||||
tgt_texts = [str(round_stsb_target(example['label']))]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
|
||||
|
||||
|
@ -250,14 +192,29 @@ class MNLI(AbstractTask):
|
|||
metric_names = ["accuracy"]
|
||||
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'mnli', split=split, script_version="master")
|
||||
templates_text = {
|
||||
"0":"""premise: {"meta": 'premise', "shortenable":True}. hypothesis: {"meta": 'hypothesis', "shortenable":True} Does the premise entails the hypothesis? {"mask"}.""",
|
||||
}
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["premise:", example['premise'],
|
||||
"hypothesis", example["hypothesis"]]
|
||||
tgt_texts = [str(example['label'])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
verbalizers = {
|
||||
"0":{
|
||||
"0": "yes",
|
||||
"1": "neutral",
|
||||
"2": "no",
|
||||
}
|
||||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.mnli")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'mnli', split=split, script_version="master")
|
||||
|
||||
# def preprocessor(self, example, add_prefix=True):
|
||||
# src_texts = ["premise:", example['premise'],
|
||||
# "hypothesis", example["hypothesis"]]
|
||||
# tgt_texts = [str(example['label'])]
|
||||
# return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
|
||||
|
||||
class QNLI(AbstractTask):
|
||||
|
@ -269,15 +226,35 @@ class QNLI(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
templates_text = {
|
||||
"0": """premise: {"meta": 'sentence', "shortenable":True}. hypothesis: {"meta": 'question', "shortenable":True}"""+
|
||||
"""Does the premise entails the hypothesis? {"mask"}.""",
|
||||
}
|
||||
|
||||
verbalizers = {
|
||||
"0":{
|
||||
"0": "yes",
|
||||
"1": "no",
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'qnli', split=split, script_version="master")
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.qnli")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'qnli', split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["question:", example['question'],
|
||||
"sentence:", example["sentence"]]
|
||||
tgt_texts = [str(example['label'])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
# def load_dataset(self, split):
|
||||
# return datasets.load_dataset('glue', 'qnli', split=split, script_version="master")
|
||||
|
||||
# def preprocessor(self, example, add_prefix=True):
|
||||
# src_texts = ["question:", example['question'],
|
||||
# "sentence:", example["sentence"]]
|
||||
# tgt_texts = [str(example['label'])]
|
||||
# return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
|
||||
#Tested
|
||||
class RTE(AbstractTask):
|
||||
name = "rte"
|
||||
labels_list = ["0", "1"]
|
||||
|
@ -287,15 +264,24 @@ class RTE(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
|
||||
templates_text = {
|
||||
"0": """sentence1: {"meta": 'sentence1', "shortenable":True} sentence2: {"meta":"sentence2", "shortenable":True} The answer was {"mask"}.""",
|
||||
}
|
||||
|
||||
verbalizers = {
|
||||
"0":{"0": "yes",
|
||||
"1": "no"
|
||||
}
|
||||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'rte',
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.rte")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'rte',
|
||||
split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["sentence1:", example['sentence1'],
|
||||
"sentence2:", example["sentence2"]]
|
||||
tgt_texts = [str(example['label'])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
|
||||
|
||||
class WNLI(AbstractTask):
|
||||
|
@ -307,16 +293,23 @@ class WNLI(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
verbalizers = {
|
||||
"0":{"0": "True",
|
||||
"1": "False",
|
||||
}
|
||||
}
|
||||
templates_text = {"0": """{"meta": 'sentence1',"shortenable":True} Does it mean the following: "{"meta":'sentence2'}"? {"mask"}."""
|
||||
}
|
||||
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'wnli', split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["sentence1:", example['sentence1'],
|
||||
"sentence2:", example["sentence2"]]
|
||||
tgt_texts = [str(example['label'])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.wnli")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'wnli', split=split, script_version="master")
|
||||
|
||||
|
||||
#SuperGLUE
|
||||
class SuperGLUEBoolQ(AbstractTask):
|
||||
name="superglue-boolq"
|
||||
labels_list = ['0', '1']
|
||||
|
@ -326,34 +319,25 @@ class SuperGLUEBoolQ(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master")
|
||||
verbalizers = {
|
||||
"0": {
|
||||
"0": "no",
|
||||
"1": "yes"
|
||||
},
|
||||
}
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["question:", example["question"], "passage:", example["passage"]]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
|
||||
|
||||
class SuperGLUERTE(AbstractTask):
|
||||
name="superglue-rte"
|
||||
labels_list = ['0', '1']
|
||||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.accuracy]
|
||||
metric_names = ["accuracy"]
|
||||
templates_text = {
|
||||
"0": """hypothesis: {"meta": "question", "shortenable":True} premise: {"meta":"passage", "shortenable":True} The answer was {"mask"}."""
|
||||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'rte', split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["premise:", example["premise"],
|
||||
"hypothesis:", example["hypothesis"]]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.boolq")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master")
|
||||
|
||||
|
||||
#
|
||||
class SuperGLUECB(AbstractTask):
|
||||
name = "superglue-cb"
|
||||
labels_list = ['0', '1', '2']
|
||||
|
@ -363,13 +347,21 @@ class SuperGLUECB(AbstractTask):
|
|||
metric = [metrics.mean_multiclass_f1(num_classes=3), metrics.accuracy]
|
||||
metric_names = ["f1_multiclass", "accuracy"]
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master")
|
||||
verbalizers = {
|
||||
"0":{"0": "yes",
|
||||
"1": "no",
|
||||
"2": "maybe"
|
||||
}
|
||||
}
|
||||
templates_text = {
|
||||
"0": """hypothesis: {"meta": 'hypothesis',"shortenable":True} premise: {"meta":'premise', "shortenable":True} The answer was {"mask"}."""
|
||||
}
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["premise:", example["premise"], "hypothesis:", example["hypothesis"]]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.cb")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master")
|
||||
|
||||
|
||||
class SuperGLUECOPA(AbstractTask):
|
||||
|
@ -379,17 +371,23 @@ class SuperGLUECOPA(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.accuracy]
|
||||
metric_names = ["accuracy"]
|
||||
metric_names = ["accuracy"]
|
||||
|
||||
verbalizers = {
|
||||
"0":{
|
||||
"0": "1",
|
||||
"1": "2",
|
||||
}
|
||||
}
|
||||
templates_text = {
|
||||
"0": """choice1: {"meta":"choice1"} choice2: {"meta":"choice2"} premise: {"meta":"premise", "shortenable":True} The {"meta":"question"} answer was choice{"mask"}."""
|
||||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["premise:", example["premise"],
|
||||
"choice1:", example["choice1"],
|
||||
"choice2:", example["choice2"]]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.copa")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master")
|
||||
|
||||
|
||||
class SuperGLUEMultiRC(AbstractTask):
|
||||
|
@ -398,31 +396,44 @@ class SuperGLUEMultiRC(AbstractTask):
|
|||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.multirc_f1_over_all_answers,
|
||||
metrics.mean_group_metric(metrics.exact_match)]
|
||||
metric = [metrics.f1_score,
|
||||
metrics.accuracy]
|
||||
metric_names = ["f1", "em"]
|
||||
|
||||
|
||||
verbalizers = {
|
||||
"0": {
|
||||
"0": "no",
|
||||
"1": "yes",
|
||||
}
|
||||
}
|
||||
templates_text = {
|
||||
"0": """question: {"meta":"question", "shortenable":False} answer: {"meta":"answer", "shortenable":False, "post_processing": lambda x:x+"."} paragraph: {"meta":"paragraph", "shortenable":True} The answer was {"mask"}."""
|
||||
}
|
||||
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master")
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.multirc")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master")
|
||||
|
||||
def remove_markup(self, text):
|
||||
"""Removes the HTML markup."""
|
||||
text = re.sub('<br>', ' ', text)
|
||||
text = re.sub('<(/)?b>', '', text)
|
||||
return text
|
||||
return text
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
group = example['idx']['question']
|
||||
# T5 applies remove_markup to the joined string, but this should not make
|
||||
def preprocessor(self, example):
|
||||
# T5 applies remove_markup to the joined string, but this should not make
|
||||
# any difference as well.
|
||||
# https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/data/preprocessors.py#L797
|
||||
src_texts = ["question:", self.remove_markup(example["question"]),
|
||||
"answer:", self.remove_markup(example["answer"]),
|
||||
"paragraph:", self.remove_markup(example["paragraph"])]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix, extra_fields={"group": group})
|
||||
# https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/data/preprocessors.py#L797
|
||||
example["question"] = self.remove_markup(example["question"])
|
||||
example["answer"] = self.remove_markup(example["answer"])
|
||||
example["paragraph"] = self.remove_markup(example["paragraph"])
|
||||
return example
|
||||
|
||||
|
||||
|
||||
|
||||
class SuperGLUEWIC(AbstractTask):
|
||||
name = "superglue-wic"
|
||||
|
@ -431,130 +442,115 @@ class SuperGLUEWIC(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.accuracy]
|
||||
metric_names = ["accuracy"]
|
||||
metric_names = ["accuracy"]
|
||||
|
||||
verbalizers = {
|
||||
"0": {
|
||||
"0": "No",
|
||||
"1": "Yes",
|
||||
}
|
||||
}
|
||||
|
||||
templates_text = {
|
||||
"0": """sentence1: {"meta":"sentence1"} sentence2: {"meta":"sentence2", "shortenable": True} word: {"meta":"word"} {"mask"}."""
|
||||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["sentence1:", example["sentence1"],
|
||||
"sentence2:", example["sentence2"],
|
||||
"word:", example["word"]]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master")
|
||||
|
||||
|
||||
class SuperGLUEWSCFixed(AbstractTask):
|
||||
# source: https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py
|
||||
"""Convert WSC examples to text2text format.
|
||||
WSC includes a sentence along with 2 'spans': the first denoting a noun and
|
||||
the other a pronoun. The 'label' specifies whether or not the pronoun is
|
||||
referencing the noun. This preprocessor puts ' * ' around the noun and ' # '
|
||||
around the pronoun.
|
||||
For example, a typical example from WSC might look like
|
||||
{
|
||||
'text': 'This is a test sentence .',
|
||||
'span1_text': 'test',
|
||||
'span1_index': 3,
|
||||
'span2_text': 'This',
|
||||
'span2_index': 0,
|
||||
'label': 0
|
||||
}
|
||||
This example would be transformed to
|
||||
{
|
||||
'inputs': 'wsc text: # This # is a * test * sentence .',
|
||||
'targets': 'False'
|
||||
}
|
||||
"""
|
||||
name = "superglue-wsc.fixed"
|
||||
labels_list = ['0', '1']
|
||||
# class SuperGLUERecord(AbstractTask):
|
||||
# """Convert ReCoRD examples to text2text examples.
|
||||
# ReCoRD contains a passage, query containing a '@placeholder' string, and a set
|
||||
# of entities that are the possible values of the placeholder. Each train and
|
||||
# validation example will have a list of answers, any of which would be
|
||||
# considered correct.
|
||||
# For example, a typical example from ReCoRD might look like
|
||||
# {
|
||||
# 'passage': 'This is the passage.',
|
||||
# 'query': 'A @placeholder is a bird.',
|
||||
# 'entities': ['penguin', 'potato', 'pigeon'],
|
||||
# 'answers': ['penguin', 'pigeon'],
|
||||
# }
|
||||
# which this preprocessor would turn into the following two examples:
|
||||
# {
|
||||
# 'inputs': 'record query: A @placeholder is a bird. entities: penguin, '
|
||||
# 'potato, pigeon passage: This is the passage.',
|
||||
# 'targets': 'penguin',
|
||||
# }
|
||||
# and
|
||||
# {
|
||||
# 'inputs': 'record query: A @placeholder is a bird. entities: penguin, '
|
||||
# 'potato, pigeon passage: This is the passage.',
|
||||
# 'targets': 'pigeon',
|
||||
# }
|
||||
# """
|
||||
# name = "superglue-record"
|
||||
# split_to_data_split = {"train": "train",
|
||||
# "validation": "validation",
|
||||
# "test": "validation"}
|
||||
# metric = [metrics.squad]
|
||||
# metric_names = ["squad"]
|
||||
|
||||
# def load_dataset(self, split):
|
||||
# return datasets.load_dataset('super_glue', 'record', split=split, script_version="master")
|
||||
|
||||
# def preprocessor(self, batch, add_prefix=True):
|
||||
# new_batch = collections.defaultdict(list)
|
||||
# keys = batch.keys()
|
||||
# for values in zip(*batch.values()):
|
||||
# ex = {k: v for k, v in zip(keys, values)}
|
||||
# # updates the passage.
|
||||
# passage = ex['passage']
|
||||
# passage = re.sub(r'(\.|\?|\!|\"|\')\n@highlight\n', r'\1 ', passage)
|
||||
# passage = re.sub(r'\n@highlight\n', '. ', passage)
|
||||
# inputs = f"record query: {ex['query']} entities: {', '.join(ex['entities'])} passage: {passage}"
|
||||
# if add_prefix:
|
||||
# inputs = self.name + " " + inputs
|
||||
# # duplicates the samples based on number of answers.
|
||||
# num_answers = len(ex["answers"])
|
||||
# num_duplicates = np.maximum(1, num_answers)
|
||||
# new_batch["source"].extend([inputs] * num_duplicates)
|
||||
# new_batch["target"].extend(ex["answers"] if num_answers > 0 else ["<unk>"])
|
||||
# new_batch["task"].extend([self.name] * num_duplicates)
|
||||
# new_batch["extra_fields"].extend([{"answers": ex["answers"]}]*num_duplicates)
|
||||
# return new_batch
|
||||
|
||||
# def map_dataset(self, dataset, add_prefix=True):
|
||||
# return dataset.map(functools.partial(self.preprocessor, add_prefix=add_prefix),
|
||||
# batched=True, remove_columns=dataset.column_names)
|
||||
|
||||
class Beans(AbstractTask):
|
||||
name = "beans"
|
||||
labels_list = ['angular_leaf_spot', 'bean_rust', "healthy"]
|
||||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.accuracy]
|
||||
metric_names = ["accuracy"]
|
||||
metric_names = ["accuracy"]
|
||||
|
||||
verbalizers = {
|
||||
"0": {
|
||||
"0": "No",
|
||||
"1": "Yes",
|
||||
}
|
||||
}
|
||||
|
||||
templates_text = {
|
||||
"0": """{"meta":"sentence1"}"""
|
||||
}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'wsc.fixed', split=split, script_version="master")
|
||||
|
||||
def _mark_span(self, text, span_str, span_idx, mark):
|
||||
pattern_tmpl = r'^((?:\S+\s){N})(W)'
|
||||
pattern = re.sub('N', str(span_idx), pattern_tmpl)
|
||||
pattern = re.sub('W', span_str, pattern)
|
||||
return re.sub(pattern, r'\1{0} \2 {0}'.format(mark), text)
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
# converts text as done in T5.
|
||||
text = example['text']
|
||||
text = self._mark_span(text, example['span1_text'], example['span1_index'], '*')
|
||||
# Compensate for 2 added "words" added in previous step.
|
||||
span2_index = example['span2_index'] + 2 * int(example['span1_index'] < example['span2_index'])
|
||||
text = self._mark_span(text, example['span2_text'], span2_index, '#')
|
||||
src_texts = ["text:", text]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
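A quick sketch of the span-marking step above on the docstring's own example (a standalone copy of _mark_span so it runs in isolation):

# Sketch: put ' * ' around the noun span and ' # ' around the pronoun span.
# Span indices count whitespace-separated tokens from the start of the text.
import re

def mark_span(text, span_str, span_idx, mark):
    pattern_tmpl = r'^((?:\S+\s){N})(W)'
    pattern = re.sub('N', str(span_idx), pattern_tmpl)
    pattern = re.sub('W', span_str, pattern)
    return re.sub(pattern, r'\1{0} \2 {0}'.format(mark), text)

text = "This is a test sentence ."
text = mark_span(text, "test", 3, "*")
# span1_index (3) does not come before span2_index (0), so no +2 shift is needed here
text = mark_span(text, "This", 0, "#")
print(text)  # '# This # is a * test * sentence .'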
|
||||
# from IPython import embed; embed(header="beans")
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/beans")[split]
|
||||
else:
|
||||
return datasets.load_dataset('beans', split=split, script_version="master")
|
||||
|
||||
|
||||
class SuperGLUERecord(AbstractTask):
|
||||
"""Convert ReCoRD examples to text2text examples.
|
||||
ReCoRD contains a passage, query containing a '@placeholder' string, and a set
|
||||
of entities that are the possible values of the placeholder. Each train and
|
||||
validation example will have a list of answers, any of which would be
|
||||
considered correct.
|
||||
For example, a typical example from ReCoRD might look like
|
||||
{
|
||||
'passage': 'This is the passage.',
|
||||
'query': 'A @placeholder is a bird.',
|
||||
'entities': ['penguin', 'potato', 'pigeon'],
|
||||
'answers': ['penguin', 'pigeon'],
|
||||
}
|
||||
which this preprocessor would turn into the following two examples:
|
||||
{
|
||||
'inputs': 'record query: A @placeholder is a bird. entities: penguin, '
|
||||
'potato, pigeon passage: This is the passage.',
|
||||
'targets': 'penguin',
|
||||
}
|
||||
and
|
||||
{
|
||||
'inputs': 'record query: A @placeholder is a bird. entities: penguin, '
|
||||
'potato, pigeon passage: This is the passage.',
|
||||
'targets': 'pigeon',
|
||||
}
|
||||
"""
|
||||
name = "superglue-record"
|
||||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.squad]
|
||||
metric_names = ["squad"]
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'record', split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, batch, add_prefix=True):
|
||||
new_batch = collections.defaultdict(list)
|
||||
keys = batch.keys()
|
||||
for values in zip(*batch.values()):
|
||||
ex = {k: v for k, v in zip(keys, values)}
|
||||
# updates the passage.
|
||||
passage = ex['passage']
|
||||
passage = re.sub(r'(\.|\?|\!|\"|\')\n@highlight\n', r'\1 ', passage)
|
||||
passage = re.sub(r'\n@highlight\n', '. ', passage)
|
||||
inputs = f"record query: {ex['query']} entities: {', '.join(ex['entities'])} passage: {passage}"
|
||||
if add_prefix:
|
||||
inputs = self.name + " " + inputs
|
||||
# duplicates the samples based on number of answers.
|
||||
num_answers = len(ex["answers"])
|
||||
num_duplicates = np.maximum(1, num_answers)
|
||||
new_batch["source"].extend([inputs] * num_duplicates)
|
||||
new_batch["target"].extend(ex["answers"] if num_answers > 0 else ["<unk>"])
|
||||
new_batch["task"].extend([self.name] * num_duplicates)
|
||||
new_batch["extra_fields"].extend([{"answers": ex["answers"]}]*num_duplicates)
|
||||
return new_batch
|
||||
|
||||
def map_dataset(self, dataset, add_prefix=True):
|
||||
return dataset.map(functools.partial(self.preprocessor, add_prefix=add_prefix),
|
||||
batched=True, remove_columns=dataset.column_names)
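A hedged walk-through of what the batched ReCoRD preprocessor above produces for a single example with two answers (toy data, no datasets dependency):

# Sketch: how one ReCoRD example with two answers becomes two text2text examples.
import re

ex = {
    "passage": "This is the passage.\n@highlight\nA second highlight.",
    "query": "A @placeholder is a bird.",
    "entities": ["penguin", "potato", "pigeon"],
    "answers": ["penguin", "pigeon"],
}

# same passage clean-up as in SuperGLUERecord.preprocessor
passage = re.sub(r'(\.|\?|\!|\"|\')\n@highlight\n', r'\1 ', ex["passage"])
passage = re.sub(r'\n@highlight\n', '. ', passage)
inputs = f"record query: {ex['query']} entities: {', '.join(ex['entities'])} passage: {passage}"

# one duplicate of the source per answer; unanswered examples get the '<unk>' target
num_duplicates = max(1, len(ex["answers"]))
sources = [inputs] * num_duplicates
targets = ex["answers"] if ex["answers"] else ["<unk>"]
for source, target in zip(sources, targets):
    print(target, "<-", source)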
|
||||
|
||||
|
||||
TASK_MAPPING = OrderedDict(
|
||||
|
@ -570,21 +566,20 @@ TASK_MAPPING = OrderedDict(
|
|||
('qqp', QQP),
|
||||
('stsb', STSB),
|
||||
('superglue-boolq', SuperGLUEBoolQ),
|
||||
('superglue-rte', SuperGLUERTE),
|
||||
('superglue-cb', SuperGLUECB),
|
||||
('superglue-copa', SuperGLUECOPA),
|
||||
('superglue-multirc', SuperGLUEMultiRC),
|
||||
('superglue-wic', SuperGLUEWIC),
|
||||
('superglue-wsc.fixed', SuperGLUEWSCFixed),
|
||||
('superglue-record', SuperGLUERecord)
|
||||
# ('superglue-record', SuperGLUERecord)
|
||||
('beans', Beans)
|
||||
]
|
||||
)
|
||||
|
||||
class AutoTask:
|
||||
@classmethod
|
||||
def get(self, task, config, seed=42):
|
||||
def get(self, task, config, data_args, seed=42):
|
||||
if task in TASK_MAPPING:
|
||||
return TASK_MAPPING[task](config, seed)
|
||||
return TASK_MAPPING[task](config, data_args, seed)
|
||||
raise ValueError(
|
||||
"Unrecognized task {} for AutoTask Model: {}.\n"
|
||||
"Task name should be one of {}.".format(
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
import numpy as np
|
||||
import regex as re  # pad_punctuation's \p{...} classes need the third-party regex module, not re
|
||||
|
||||
def round_stsb_target(label):
|
||||
"""STSB maps two sentences to a floating point number between 1 and 5
|
||||
|
@ -15,3 +16,15 @@ def round_stsb_target(label):
|
|||
"""
|
||||
return np.round((label * 5) / 5, decimals=1)
|
||||
|
||||
|
||||
def pad_punctuation(text):
|
||||
"""Re-implementation of _pad_punctuation in t5. This function adds spaces
|
||||
around punctuation. While this pads punctuation as expected, it has the
|
||||
unexpected effect of padding certain unicode characters with accents, with
|
||||
spaces as well. For instance: "François" becomes "Fran ç ois"""
|
||||
# Pad everything except for: underscores (_), whitespace (\s),
|
||||
# numbers (\p{N}), letters (\p{L}) and accent characters (\p{M}).
|
||||
text = re.sub(r'([^_\s\p{N}\p{L}\p{M}])', r' \1 ', text)
|
||||
# Collapse consecutive whitespace into one space.
|
||||
text = re.sub(r'\s+', ' ', text)
|
||||
return text
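Usage sketch for pad_punctuation (run after the definitions above; the \p{...} character classes require the third-party regex package, imported above as re):

# Sketch only: punctuation gets padded with spaces, runs of whitespace are collapsed.
print(pad_punctuation("Hello, world! (v2.0)"))
# roughly: 'Hello , world ! ( v2 . 0 ) '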
|
|
@ -0,0 +1,44 @@
|
|||
|
||||
PATHBASE=/mnt/sfs_turbo/hsd/officialod/OpenDelta-1/examples/examples_prompt/
|
||||
PYTHONPATH=/mnt/sfs_turbo/zhangshudan/anaconda3/envs/officialod/bin/python
|
||||
PLMPATHBASE=/mnt/sfs_turbo/hsd/plm_cache/ # must be empty string or dir that ends with /
|
||||
DATASETSPATHBASE=/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/
|
||||
RUNTIME=$(date +%m%d%H%M%S)
|
||||
MODELNAME="roberta-base"
|
||||
DATASET=$1
|
||||
DELTATYPES=("none" "bitfit" "lora" "adapter")
|
||||
CUDAIDS=("0 1" "2 3" "4 5" "6 7")
|
||||
NUMTRIALS=50
|
||||
CONTINUESTUDY=${2:-'0'}
|
||||
|
||||
echo $RUNTIME
|
||||
echo $MODELNAME
|
||||
echo $DATASET
|
||||
echo ${DELTATYPES[@]}
|
||||
echo ${CUDAIDS[@]}
|
||||
echo $NUMTRIALS
|
||||
echo $CONTINUESTUDY
|
||||
cd $PATHBASE
|
||||
|
||||
|
||||
|
||||
for expid in 0 1 2 3
|
||||
do
|
||||
( $PYTHONPATH search_distributed.py \
|
||||
--model_name $MODELNAME \
|
||||
--dataset $DATASET \
|
||||
--delta_type ${DELTATYPES[$expid]} \
|
||||
--cuda_ids ${CUDAIDS[$expid]} \
|
||||
--num_trials $NUMTRIALS \
|
||||
--mode run \
|
||||
--repeat_time 1 \
|
||||
--main_file_name run_mlm.py \
|
||||
--pathbase $PATHBASE \
|
||||
--pythonpath $PYTHONPATH \
|
||||
--plm_path_base $PLMPATHBASE \
|
||||
--datasets_saved_path $DATASETSPATHBASE \
|
||||
--datasets_load_from_disk \
|
||||
--continue_study $CONTINUESTUDY >>/mnt/sfs_turbo/hsd/officialod/OpenDelta-1/examples/examples_prompt/out_sfs/$RUNTIME.txt 2>&1
|
||||
) &
|
||||
done
|
||||
wait
|
|
@ -15,9 +15,16 @@ if __name__=="__main__":
|
|||
parser.add_argument("--study_name", type=str, default=None)
|
||||
parser.add_argument("--cuda_ids", nargs='+', help="list")
|
||||
parser.add_argument("--mode", type=str, default="run", help="select from 'run' and 'read' ")
|
||||
parser.add_argument("--continue_study", type=bool, default=False)
|
||||
parser.add_argument("--continue_study", type=int, default=0)
|
||||
parser.add_argument("--substudy_prefix", type=str, default="")
|
||||
parser.add_argument("--main_file_name", type=str)
|
||||
parser.add_argument("--num_trials", type=int)
|
||||
parser.add_argument("--pathbase", type=str, default="")
|
||||
parser.add_argument("--pythonpath", type=str, default="python")
|
||||
parser.add_argument("--plm_path_base", type=str, default="", help="The path where we cache the plms. Must be empty string or dir that ends with /")
|
||||
parser.add_argument("--datasets_load_from_disk", action="store_true")
|
||||
parser.add_argument("--datasets_saved_path", type=str)
|
||||
parser.add_argument("--repeat_time", type=int, default=1)
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
|
@ -26,13 +33,13 @@ if __name__=="__main__":
|
|||
args.study_name = pardir
|
||||
else:
|
||||
args.study_name += pardir
|
||||
|
||||
setattr(args, "output_dir", f"outputs_search/{args.study_name}")
|
||||
|
||||
|
||||
|
||||
setattr(args, "output_dir", f"{args.pathbase}/outputs_search/{args.study_name}")
|
||||
|
||||
|
||||
|
||||
if args.mode == "run":
|
||||
if args.continue_study:
|
||||
if args.continue_study==1:
|
||||
print("Continue study!")
|
||||
else:
|
||||
print("Creat new study!")
|
||||
|
@ -41,7 +48,8 @@ if __name__=="__main__":
|
|||
os.mkdir(f"{args.output_dir}")
|
||||
else:
|
||||
if not args.continue_study:
|
||||
user_cmd = input("Detected existing study, are you sure to create new by removing old? [Yes/No]")
|
||||
user_cmd = "yes" #input("Detected existing study, are you sure to create new by removing old? [Yes/No]")
|
||||
|
||||
while user_cmd.lower() not in ["yes", "no"]:
|
||||
print("Please input Yes/No")
|
||||
user_cmd = input("Detected existing study, are you sure to create new by removing old? [Yes/No]")
|
||||
|
@ -62,23 +70,39 @@ if __name__=="__main__":
|
|||
|
||||
tot_chunk_num = len(args.cuda_ids)
|
||||
|
||||
subprocesses = []
|
||||
for id, cudas in enumerate(args.cuda_ids):
|
||||
if id+1 < tot_chunk_num:
|
||||
sub_n_trials = args.num_trials//tot_chunk_num
|
||||
else:
|
||||
sub_n_trials = args.num_trials//tot_chunk_num + args.num_trials%tot_chunk_num
|
||||
|
||||
command = "nohup python search_single.py "
|
||||
command = f"{args.pythonpath} search_single.py "
|
||||
command += f"--cuda_id {cudas} "
|
||||
command += f"--model_name {args.model_name} "
|
||||
command += f"--dataset {args.dataset} "
|
||||
command += f"--delta_type {args.delta_type} "
|
||||
command += f"--study_name {args.study_name} "
|
||||
command += f"--optuna_seed 10{id} "
|
||||
command += f"--main_file_name {args.main_file_name} "
|
||||
command += f"--num_trials {sub_n_trials} "
|
||||
command += f">{args.output_dir}/{args.substudy_prefix}{id}.log 2>&1 &"
|
||||
p = subprocess.Popen(command, cwd="./", shell=True)
|
||||
print("id {} on cuda:{}, pid {}\n {}\n".format(id, cudas, p.pid, command))
|
||||
command += f"--pythonpath {args.pythonpath} "
|
||||
command += f"--pathbase {args.pathbase} "
|
||||
command += f"--repeat_time {args.repeat_time} "
|
||||
command += f"--plm_path_base {args.plm_path_base} "
|
||||
command += f"--datasets_saved_path {args.datasets_saved_path} "
|
||||
if args.datasets_load_from_disk:
|
||||
command += f"--datasets_load_from_disk "
|
||||
command += f"> {args.output_dir}/{args.substudy_prefix}{id}.log 2>&1"
|
||||
p = subprocess.Popen(command, cwd=f"{args.pathbase}", shell=True)
|
||||
subprocesses.append(p)
|
||||
print("id {} on cuda:{}, pid {}".format(id, cudas, p.pid))
|
||||
print(command)
|
||||
print()
|
||||
|
||||
print("Wait for subprocesses to complete")
|
||||
exit_codes = [p.wait() for p in subprocesses]
|
||||
print("All complete!")
|
||||
|
||||
elif args.mode == 'read':
|
||||
study = optuna.load_study(study_name=args.study_name, storage=f"sqlite:///{args.study_name}.db")
|
||||
|
@ -96,17 +120,17 @@ if __name__=="__main__":
|
|||
plot_contour = optuna.visualization.plot_contour(study, params=['learning_rate', 'batch_size_base'])
|
||||
plot_contour2 = optuna.visualization.plot_contour(study, params=['learning_rate', 'warmup_steps'])
|
||||
|
||||
|
||||
|
||||
plot_history.write_image(f"{args.output_dir}/history.png")
|
||||
plot_slice.write_image(f"{args.output_dir}/slice.png")
|
||||
plot_contour.write_image(f"{args.output_dir}/contour.png")
|
||||
plot_contour2.write_image(f"{args.output_dir}/contour2.png")
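For clarity, the per-chunk trial split in the run branch gives every chunk num_trials // len(cuda_ids) trials and lets the last chunk absorb the remainder; a one-off sketch:

# Sketch: 50 trials split across 4 cuda-id chunks -> [12, 12, 12, 14].
num_trials, tot_chunk_num = 50, 4
sub = [num_trials // tot_chunk_num] * (tot_chunk_num - 1)
sub.append(num_trials // tot_chunk_num + num_trials % tot_chunk_num)
print(sub, sum(sub))  # [12, 12, 12, 14] 50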
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -10,23 +10,29 @@ from optuna.samplers import TPESampler
|
|||
import shutil
|
||||
import time
|
||||
|
||||
import subprocess
|
||||
|
||||
|
||||
def objective_singleseed(args, unicode, search_space_sample ):
|
||||
os.mkdir(f"{args.output_dir}/{unicode}")
|
||||
search_space_sample.update({"output_dir": f"{args.output_dir}/{unicode}"})
|
||||
|
||||
|
||||
|
||||
with open(f"{args.output_dir}/{unicode}/this_configs.json", 'w') as fout:
|
||||
json.dump(search_space_sample, fout, indent=4,sort_keys=True)
|
||||
|
||||
command = "CUDA_VISIBLE_DEVICES={} ".format(args.cuda_id)
|
||||
command += "python run.py "
|
||||
command += f"{args.output_dir}/{unicode}/this_configs.json"
|
||||
|
||||
|
||||
status_code = os.system(command)
|
||||
print("status_code",status_code)
|
||||
|
||||
command = "CUDA_VISIBLE_DEVICES={} ".format(args.cuda_id)
|
||||
command += f"{args.pythonpath} {args.main_file_name} "
|
||||
command += f"{args.output_dir}/{unicode}/this_configs.json"
|
||||
command += f" >> {args.output_dir}/{unicode}/output.log 2>&1"
|
||||
|
||||
|
||||
print("======"*5+"\n"+command)
|
||||
p = subprocess.Popen(command, cwd=f"{args.pathbase}", shell=True)
|
||||
print(f"wait for subprocess \"{command}\" to complete")
|
||||
p.wait()
|
||||
|
||||
# if status_code != 0:
|
||||
# with open(f"{args.output_dir}/{args.cuda_id}.log",'r') as flog:
|
||||
# lastlines = " ".join(flog.readlines()[-100:])
|
||||
|
@ -50,8 +56,13 @@ def objective_singleseed(args, unicode, search_space_sample ):
|
|||
else:
|
||||
os.remove(full_file_name)
|
||||
|
||||
return results['test']['test_average_metrics']
|
||||
|
||||
results_all_test_datasets = []
|
||||
print("results:", results)
|
||||
for datasets in results['test']:
|
||||
results_all_test_datasets.append(results['test'][datasets]['test_average_metrics'])
|
||||
|
||||
return sum(results_all_test_datasets)/len(results_all_test_datasets)#results['test']['average_metrics']
|
||||
|
||||
|
||||
|
||||
def objective(trial, args=None):
|
||||
|
@ -61,7 +72,7 @@ def objective(trial, args=None):
|
|||
search_space_sample.update(DatasetSearchSpace(args.dataset).get_config(trial, args))
|
||||
search_space_sample.update(AllDeltaSearchSpace[args.delta_type]().get_config(trial, args))
|
||||
results = []
|
||||
for seed in [100]:
|
||||
for seed in range(42, 42+args.repeat_time):
|
||||
search_space_sample.update({"seed": seed})
|
||||
unicode = random.randint(0, 100000000)
|
||||
while os.path.exists(f"{args.output_dir}/{unicode}"):
|
||||
|
@ -74,23 +85,33 @@ def objective(trial, args=None):
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__=="__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--delta_type")
|
||||
parser.add_argument("--dataset")
|
||||
parser.add_argument("--model_name")
|
||||
parser.add_argument("--cuda_id", type=int)
|
||||
parser.add_argument("--main_file_name", type=str)
|
||||
parser.add_argument("--study_name")
|
||||
parser.add_argument("--num_trials", type=int)
|
||||
parser.add_argument("--repeat_time", type=int)
|
||||
parser.add_argument("--optuna_seed", type=int, default="the seed to sample suggest point")
|
||||
parser.add_argument("--pathbase", type=str, default="")
|
||||
parser.add_argument("--pythonpath", type=str, default="")
|
||||
parser.add_argument("--plm_path_base", type=str, default="")
|
||||
parser.add_argument("--datasets_load_from_disk", action="store_true")
|
||||
parser.add_argument("--datasets_saved_path", type=str)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
setattr(args, "output_dir", f"outputs_search/{args.study_name}")
|
||||
|
||||
setattr(args, "output_dir", f"{args.pathbase}/outputs_search/{args.study_name}")
|
||||
|
||||
study = optuna.load_study(study_name=args.study_name, storage=f'sqlite:///{args.study_name}.db', sampler=TPESampler(seed=args.optuna_seed))
|
||||
study.optimize(partial(objective, args=args), n_trials=args.num_trials)
|
||||
|
||||
print("complete single!")
|
||||
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
import collections
|
||||
import collections
|
||||
import copy
|
||||
|
||||
|
||||
|
@ -10,7 +10,7 @@ class BaseSearchSpace:
|
|||
"do_train": True,
|
||||
"do_eval": True,
|
||||
"do_test": True,
|
||||
|
||||
|
||||
|
||||
"save_total_limit": 1,
|
||||
# For glue datasets.
|
||||
|
@ -19,7 +19,6 @@ class BaseSearchSpace:
|
|||
"eval_dataset_config_name": ["en"],
|
||||
"test_dataset_config_name": ["en"],
|
||||
# other configurations.
|
||||
"predict_with_generate": True,
|
||||
# To evaluate during training.
|
||||
"load_best_model_at_end": True,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
|
@ -27,7 +26,10 @@ class BaseSearchSpace:
|
|||
"evaluation_strategy": "steps",
|
||||
"overwrite_output_dir": True,
|
||||
"push_to_hub": False,
|
||||
"save_strategy": "steps"
|
||||
"save_strategy": "steps",
|
||||
"datasets_load_from_disk": args.datasets_load_from_disk,
|
||||
"datasets_saved_path": args.datasets_saved_path
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -37,7 +39,7 @@ class BitFitSearchSpace:
|
|||
learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1)
|
||||
return {
|
||||
"delta_type": "bitfit",
|
||||
'learning_rate': learning_rate,
|
||||
'learning_rate': learning_rate,
|
||||
}
|
||||
|
||||
class AdapterSearchSpace:
|
||||
|
@ -68,7 +70,7 @@ class FinetuneSearchSpace:
|
|||
learning_rate = trail.suggest_loguniform('learning_rate', 1e-5, 1e-1)
|
||||
return {
|
||||
"delta_type": "none",
|
||||
'learning_rate': learning_rate,
|
||||
'learning_rate': learning_rate,
|
||||
}
|
||||
|
||||
class LoRASearchSpace:
|
||||
|
@ -100,16 +102,16 @@ class CompacterSearchSpace:
|
|||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
|
@ -140,16 +142,16 @@ class CompacterppSearchSpace:
|
|||
"non_linearity": "gelu_new",
|
||||
|
||||
#Compacter.
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_division": 4,
|
||||
"hypercomplex_adapters": True,
|
||||
"hypercomplex_nonlinearity": "glorot-uniform",
|
||||
# gradient clip and clamp
|
||||
# gradient clip and clamp
|
||||
"gradient_clip": False,
|
||||
"phm_clamp": False,
|
||||
"normalize_phm_weight": False,
|
||||
"normalize_phm_weight": False,
|
||||
"learn_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
# shared one side
|
||||
"factorized_phm": True,
|
||||
"shared_phm_rule": False,
|
||||
"factorized_phm_rule": False,
|
||||
"phm_c_init": "normal",
|
||||
|
@ -171,7 +173,7 @@ class LowRankAdapterSearchSpace:
|
|||
"final_layer_norm"
|
||||
],
|
||||
"non_linearity": "gelu_new",
|
||||
"low_rank_w_init": "glorot-uniform",
|
||||
"low_rank_w_init": "glorot-uniform",
|
||||
"low_rank_rank": low_rank_rank,
|
||||
}
|
||||
|
||||
|
@ -201,8 +203,8 @@ class T5BaseSearchSpace:
|
|||
batch_size = int(16 * 2**(min(batch_size_base,3)-1))
|
||||
warmup_steps = trail.suggest_categorical('warmup_steps', [0, 500])
|
||||
return {
|
||||
"model_name_or_path": "t5-base", # change here for loading from custom path
|
||||
"tokenizer_name": "t5-base", # change here for loading from custom path
|
||||
"model_name_or_path": f"{args.plm_path_base}t5-base", # change here for loading from custom path
|
||||
"tokenizer_name": f"{args.plm_path_base}t5-base", # change here for loading from custom path
|
||||
'batch_size':batch_size,
|
||||
"per_device_train_batch_size": batch_size,
|
||||
"per_device_eval_batch_size": batch_size,
|
||||
|
@ -211,17 +213,43 @@ class T5BaseSearchSpace:
|
|||
"save_steps": 200,
|
||||
"eval_steps": 200,
|
||||
"max_steps": 5000,
|
||||
"predict_with_generate": True,
|
||||
}
|
||||
|
||||
|
||||
class RobertaBaseSearchSpace:
|
||||
def get_config(self, trail, args=None):
|
||||
batch_size_base = trail.suggest_int('batch_size_base', 1, 4)
|
||||
if batch_size_base >= 4:
|
||||
gradient_accumulation_steps = 2**(batch_size_base-3)
|
||||
else:
|
||||
gradient_accumulation_steps = 1
|
||||
batch_size = int(16 * 2**(min(batch_size_base,3)-1))
|
||||
warmup_steps = trail.suggest_categorical('warmup_steps', [0, 500])
|
||||
return {
|
||||
"model_name_or_path": f"{args.plm_path_base}roberta-base", # change here for loading from custom path
|
||||
"tokenizer_name": f"{args.plm_path_base}roberta-base", # change here for loading from custom path
|
||||
'batch_size':batch_size,
|
||||
"per_device_train_batch_size": batch_size,
|
||||
"per_device_eval_batch_size": batch_size,
|
||||
"warmup_steps": warmup_steps,
|
||||
"gradient_accumulation_steps": gradient_accumulation_steps,
|
||||
"save_steps": 200,
|
||||
"eval_steps": 200,
|
||||
"max_steps": 5000,
|
||||
"predict_with_generate": False,
|
||||
}
|
||||
|
||||
|
||||
|
||||
class DatasetSearchSpace:
|
||||
dataset_order = ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"]
|
||||
dataset_config = {("task_name", "eval_dataset_name", "test_dataset_name",
|
||||
dataset_order = ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", "wnli"]
|
||||
dataset_config = {("task_name", "eval_dataset_name", "test_dataset_name",
|
||||
"max_source_length"): list(zip(
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", "wnli"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", "wnli"],
|
||||
["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb", "wnli"],
|
||||
[256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128, 128],
|
||||
))}
|
||||
def __init__(self, dataset_name):
|
||||
self.dataset_name = dataset_name
|
||||
|
@ -250,6 +278,7 @@ AllDeltaSearchSpace = {
|
|||
}
|
||||
|
||||
AllBackboneSearchSpace = {
|
||||
"t5-base": T5BaseSearchSpace
|
||||
"t5-base": T5BaseSearchSpace,
|
||||
"roberta-base": RobertaBaseSearchSpace,
|
||||
}
|
||||
|
|
@ -45,12 +45,51 @@ def spearman_corrcoef(predictions, targets) -> dict:
|
|||
spearman_corrcoef = 0
|
||||
return {"spearmanr": spearman_corrcoef}
|
||||
|
||||
|
||||
|
||||
def spearman_corrcoef(predictions, targets) -> dict:
|
||||
"""Computes Spearman correlation coefficient."""
|
||||
# TODO: we need to do postprocessors in a clean way for each dataset.
|
||||
from examples_seq2seq.data_processors.postprocessors import string_to_float
|
||||
targets = [string_to_float(target) for target in targets]
|
||||
predictions = [string_to_float(prediction) for prediction in predictions]
|
||||
spearman_corrcoef = 100 * scipy.stats.spearmanr(targets, predictions)[0]
|
||||
|
||||
# Note that if all the predictions will be the same, spearman
|
||||
# correlation is nan; to guard against this, we check the output
|
||||
# and return 0 in this case.
|
||||
if math.isnan(spearman_corrcoef):
|
||||
spearman_corrcoef = 0
|
||||
return {"spearmanr": spearman_corrcoef}
|
||||
|
||||
|
||||
def f1_score_with_invalid(predictions, targets) -> dict:
|
||||
"""Computes F1 score, with any prediction != 0 or 1 is counted as incorrect.
|
||||
Args:
|
||||
targets: list of targets, either 0 or 1
|
||||
predictions: list of predictions, any integer value
|
||||
Returns:
|
||||
F1 score, where any prediction != 0 or 1 is counted as wrong.
|
||||
"""
|
||||
def binary_reverse(labels):
|
||||
return ['0' if label == '1' else '1' for label in labels]
|
||||
targets, predictions = np.asarray(targets), np.asarray(predictions)
|
||||
# Get indices of invalid predictions.
|
||||
invalid_idx_mask = np.logical_and(predictions != '0', predictions != '1')
|
||||
# For any prediction != 0 or 1, we set the prediction to the opposite of its corresponding target.
|
||||
predictions[invalid_idx_mask] = binary_reverse(targets[invalid_idx_mask])
|
||||
targets = targets.astype(np.int32)
|
||||
predictions = predictions.astype(np.int32)
|
||||
return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}
|
||||
|
||||
|
||||
|
||||
def transform_for_generation(predictions, targets):
|
||||
mapping = {k: i for i, k in enumerate(set(targets))}
|
||||
|
||||
targets = np.asarray([mapping[k] for k in targets])
|
||||
predictions = np.asarray([mapping[k] if k in mapping else (t+1)%len(mapping) for t, k in zip(targets, predictions)])
|
||||
|
||||
|
||||
return predictions, targets
|
||||
|
||||
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0002561697332863371,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/10940816",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0017750209757755706,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/1107862",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 8.499916262600587e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/15328099",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0006091646696452159,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/15991793",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.020109951371648067,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/19489534",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.005159882530578781,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/2281342",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.006869610954981632,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/26349674",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0002723799659564822,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28219263",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0018605158382269157,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28244173",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0001248231069039661,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28313708",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0009490000624893097,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28844651",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 3.5602209401278214e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28881946",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.004220683008677483,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/29695566",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.004159184883370181,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/304080",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0009353172054773991,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/33594301",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0037650265946582574,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/37208828",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 6.867655291394631e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/38351436",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0022951686429675895,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/42338278",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0011474682877585407,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/43419391",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.009965694572181888,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/45030088",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
Binary file not shown.
Binary file not shown.
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0020236592832077785,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/50851153",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1 +0,0 @@
|
|||
{"batch_size": 64, "dataset_config_name": ["en"], "delta_type": "bitfit", "do_eval": true, "do_test": true, "do_train": true, "eval_dataset_config_name": ["en"], "eval_dataset_name": "mrpc", "eval_steps": 200, "evaluation_strategy": "steps", "gradient_accumulation_steps": 1, "greater_is_better": true, "learning_rate": 0.0020236592832077785, "load_best_model_at_end": true, "max_source_length": 128, "max_steps": 5000, "metric_for_best_model": "average_metrics", "model_name_or_path": "t5-base", "output_dir": "outputs_search/bitfit.mrpc.t5-base/50851153", "overwrite_output_dir": true, "per_device_eval_batch_size": 64, "per_device_train_batch_size": 64, "predict_with_generate": true, "push_to_hub": false, "save_steps": 200, "save_strategy": "steps", "save_total_limit": 1, "seed": 100, "split_validation_test": true, "task_name": "mrpc", "test_dataset_config_name": ["en"], "test_dataset_name": "mrpc", "tokenizer_name": "t5-base", "warmup_steps": 0}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.011098597581779427,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/57783553",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0005414844782319124,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/6060488",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.016927560240899083,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/61860753",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 1.0141082015912518e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/63232091",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0018137027382556477,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/6329472",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.023938918670661075,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/64753972",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.08212873599011565,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/65221118",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 4.8538530604501934e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/66798551",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0056649657801790786,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/67615376",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.03495857107255486,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/6773136",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.00039059864620439417,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/68027569",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0002642938525995798,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/68314189",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.037536374095955345,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/71501650",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
{
    "batch_size": 32,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 0.008866400032296955,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/73962149",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 32,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 0.01086484610816823,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/83260414",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 2,
    "greater_is_better": true,
    "learning_rate": 1.2611496517588744e-05,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/83839551",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,42 +0,0 @@
{
    "batch_size": 32,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 0.0010110776655071255,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/85624941",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 32,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,42 +0,0 @@
{
    "batch_size": 32,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 0.0005414844782319124,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/86039549",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 32,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 0.0027955533792956614,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/89676181",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 2,
    "greater_is_better": true,
    "learning_rate": 0.0012573200149141731,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/91446644",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 2,
    "greater_is_better": true,
    "learning_rate": 0.001152480984285531,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/92427532",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,42 +0,0 @@
{
    "batch_size": 16,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 0.002464124578330328,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/93923515",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 16,
    "per_device_train_batch_size": 16,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 16,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 0.000127337205276883,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/96799644",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 16,
    "per_device_train_batch_size": 16,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 2,
    "greater_is_better": true,
    "learning_rate": 0.017304287780519442,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/97118516",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 0.057233123182472576,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/97177600",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 0.041620230849224296,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/97660529",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 2,
    "greater_is_better": true,
    "learning_rate": 0.0005420479832650441,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/98459622",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 32,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 0.0026938134462562973,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/99566760",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 32,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "mrpc",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 2,
    "greater_is_better": true,
    "learning_rate": 0.00702408842393251,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.mrpc.t5-base/99826259",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "mrpc",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "mrpc",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "rte",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 2,
    "greater_is_better": true,
    "learning_rate": 1.1032607780913182e-05,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.rte.t5-base/1123702",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "rte",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "rte",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "rte",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 9.869021064463024e-05,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.rte.t5-base/12173417",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "rte",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "rte",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "rte",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 0.000913136097576348,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.rte.t5-base/14983360",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "rte",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "rte",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 64,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "rte",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 2,
    "greater_is_better": true,
    "learning_rate": 1.1605972169428286e-05,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.rte.t5-base/17148549",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 64,
    "per_device_train_batch_size": 64,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "rte",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "rte",
    "tokenizer_name": "t5-base",
    "warmup_steps": 500
}
@ -1,42 +0,0 @@
{
    "batch_size": 32,
    "dataset_config_name": [
        "en"
    ],
    "delta_type": "bitfit",
    "do_eval": true,
    "do_test": true,
    "do_train": true,
    "eval_dataset_config_name": [
        "en"
    ],
    "eval_dataset_name": "rte",
    "eval_steps": 200,
    "evaluation_strategy": "steps",
    "gradient_accumulation_steps": 1,
    "greater_is_better": true,
    "learning_rate": 2.8707127478048054e-05,
    "load_best_model_at_end": true,
    "max_source_length": 128,
    "max_steps": 5000,
    "metric_for_best_model": "average_metrics",
    "model_name_or_path": "t5-base",
    "output_dir": "outputs_search/bitfit.rte.t5-base/18069491",
    "overwrite_output_dir": true,
    "per_device_eval_batch_size": 32,
    "per_device_train_batch_size": 32,
    "predict_with_generate": true,
    "push_to_hub": false,
    "save_steps": 200,
    "save_strategy": "steps",
    "save_total_limit": 1,
    "seed": 100,
    "split_validation_test": true,
    "task_name": "rte",
    "test_dataset_config_name": [
        "en"
    ],
    "test_dataset_name": "rte",
    "tokenizer_name": "t5-base",
    "warmup_steps": 0
}
Some files were not shown because too many files have changed in this diff