search
This commit is contained in:
parent
5d2dab3284
commit
6ebfab4a12
|
@ -1,4 +1,4 @@
|
|||
import numpy as np
|
||||
import numpy as np
|
||||
from dataclasses import dataclass
|
||||
from transformers import DataCollatorForSeq2Seq
|
||||
|
||||
|
@ -6,11 +6,23 @@ from transformers import DataCollatorForSeq2Seq
|
|||
@dataclass
class TaskDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    """Seq2seq collator that hands all batching work to its parent class.

    The per-batch ``task`` bookkeeping is currently switched off (see the
    commented lines in ``__call__``), so calling this collator returns
    exactly what ``DataCollatorForSeq2Seq.__call__`` returns.
    """

    def check_uniqueness(self, samples):
        """Assert that all entries of ``samples`` share one single value."""
        distinct_values = np.unique(samples)
        assert len(distinct_values) == 1

    def __call__(self, features):
        """Collate ``features`` with the parent collator and return the batch."""
        # Disabled task bookkeeping, kept for reference:
        # tasks = [d.pop('task') for d in features]
        # self.check_uniqueness(tasks)
        batch = super().__call__(features)
        # batch["task"] = tasks[0]
        return batch
|
||||
|
||||
# class CustomDataCollator(DefaultDataCollator):
|
||||
# def check_uniqueness(self, samples):
|
||||
# assert len(np.unique(samples)) == 1
|
||||
|
||||
# def __call__(self, features):
|
||||
# mask_positions = [d.pop('mask_positions') for d in features]
|
||||
# # self.check_uniqueness(tasks)
|
||||
# output = super().__call__(features)
|
||||
|
||||
# # output["task"] = tasks[0]
|
||||
# return output
|
|
@ -1,4 +1,4 @@
|
|||
|
||||
# from openprompt.prompts import ManualTemplate
|
||||
|
||||
class BasePrompt(object):
|
||||
def __init__(self, template_id=0, verbalizer_id=0, generation=True):
|
||||
|
@ -9,26 +9,28 @@ class BasePrompt(object):
|
|||
self.verbalizer = self.mlmhead_verbalizers[verbalizer_id]
|
||||
|
||||
|
||||
|
||||
def __call__(self, example):
|
||||
|
||||
def eval_syntax(syntaxlist, example):
|
||||
composed = []
|
||||
for x in syntaxlist:
|
||||
if x.startswith("[_eval_]"):
|
||||
t = eval(x[len("[_eval_]"):])
|
||||
t = eval(x[len("[_eval_]"):])
|
||||
else:
|
||||
t = x
|
||||
composed.append(t)
|
||||
return composed
|
||||
src_texts = eval_syntax(self.template,example)
|
||||
|
||||
|
||||
tgt_texts = self.verbalizer[str(example['label'])]
|
||||
if isinstance(tgt_texts, list):
|
||||
tgt_texts = eval_syntax(tgt_texts, example)
|
||||
else:
|
||||
tgt_texts = [tgt_texts]
|
||||
return src_texts, tgt_texts
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
@ -48,7 +50,7 @@ class MRPCPrompt(BasePrompt):
|
|||
"1": "same"
|
||||
}
|
||||
textual_templates = [
|
||||
["sentence1:", """[_eval_]example['sentence1']""",
|
||||
["sentence1:", """[_eval_]example['sentence1']""",
|
||||
"sentence2:", """[_eval_]example["sentence2"]""", "Meanings different of same? Answer: " ]
|
||||
]
|
||||
|
||||
|
@ -68,7 +70,7 @@ class BoolQPrompt(BasePrompt):
|
|||
"1": "same"
|
||||
}
|
||||
textual_templates = [
|
||||
["sentence1:", """[_eval_]example['sentence1']""",
|
||||
["sentence1:", """[_eval_]example['sentence1']""",
|
||||
"sentence2:", """[_eval_]example["sentence2"]""", "Meanings different of same? Answer: " ]
|
||||
]
|
||||
|
||||
|
@ -84,7 +86,7 @@ class BoolQPrompt(BasePrompt):
|
|||
"1": "yes"
|
||||
}
|
||||
textual_templates = [
|
||||
["hypothesis:", """[_eval_]example['hypothesis']""",
|
||||
["hypothesis:", """[_eval_]example['hypothesis']""",
|
||||
"premise:", """[_eval_]example["premise"]""", "The answer was " ]
|
||||
]
|
||||
|
||||
|
@ -100,7 +102,7 @@ class COLAPrompt(BasePrompt):
|
|||
"1": "Yes"
|
||||
}
|
||||
textual_templates = [
|
||||
["sentence:", """[_eval_]example['sentence']""",
|
||||
["sentence:", """[_eval_]example['sentence']""",
|
||||
"grammar correct? " ]
|
||||
]
|
||||
|
||||
|
@ -119,7 +121,7 @@ class RTEPrompt(BasePrompt):
|
|||
textual_templates = [
|
||||
["sentence1:", """[_eval_]example['premise']""", "sentence2:",
|
||||
"""[_eval_]example['hypothesis']""",
|
||||
"The answer was " ]
|
||||
"The answer was "]
|
||||
]
|
||||
|
||||
class CBPrompt(BasePrompt):
|
||||
|
@ -147,6 +149,5 @@ PromptCollections = {
|
|||
"superglue-boolq": BoolQPrompt,
|
||||
"cb": CBPrompt,
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
from collections import OrderedDict
|
||||
import collections
|
||||
import collections
|
||||
import abc
|
||||
import functools
|
||||
from selectors import EpollSelector
|
||||
|
@ -12,10 +12,213 @@ import logging
|
|||
import numpy as np
|
||||
import torch
|
||||
import re
|
||||
from examples_prompt.data_processors.prompt import PromptCollections
|
||||
from openprompt.prompts import ManualTemplate, ManualVerbalizer
|
||||
from openprompt.plms.utils import TokenizerWrapper
|
||||
from openprompt.data_utils import InputExample
|
||||
import itertools
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
|
||||
from transformers.models.auto.tokenization_auto import tokenizer_class_from_name
|
||||
|
||||
from typing import List, Dict
|
||||
from collections import defaultdict
|
||||
from openprompt.utils import round_list
|
||||
import warnings
|
||||
class MLMTokenizerWrapper:
    """Tokenize the pieces of a template-wrapped example, shorten them so the
    whole input fits ``max_seq_length``, and decode the result back to text.

    Each piece carries a ``shortenable_ids`` flag that is replicated once per
    token. A part whose first flag is ``0`` is never shortened; any other
    value marks the part as shortenable (flags are presumably only 0 or 1 --
    confirm against the template implementation).

    Args:
        max_seq_length: Maximum number of tokens, special tokens included.
        tokenizer: Callable tokenizer that also offers ``encode``/``decode``.
        truncate_method: One of ``'tail'``, ``'head'`` or ``'balanced'``.

    Raises:
        NotImplementedError: If ``truncate_method`` is not one of the above.
    """

    def __init__(self, max_seq_length, tokenizer, truncate_method):
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        # Tokens the tokenizer produces for an empty string, i.e. the special
        # tokens that must be budgeted for on top of the text itself.
        self.num_special_tokens_to_add = len(tokenizer("")['input_ids'])
        self.truncate_method = truncate_method
        # Running statistics over all sentences passed through ``truncate``.
        self.total_passed_sentences = 0
        self.num_truncated_sentences = 0
        if truncate_method == 'tail':
            self.truncate_fct = self.truncate_from_tail
        elif truncate_method == 'head':
            self.truncate_fct = self.truncate_from_head
        elif truncate_method == 'balanced':
            self.truncate_fct = self.balanced_truncate
        else:
            raise NotImplementedError(
                f"Unknown truncate_method: {truncate_method!r}")

    def merge_wrapped_example(self, wrapped_example):
        """Encode, truncate and re-decode one wrapped example.

        TODO: doesn't consider the situation where the input has two parts.

        Args:
            wrapped_example: A pair whose first element is an iterable of
                pieces (mappings with ``'text'`` and ``'shortenable_ids'``
                entries); the second element is not used here.

        Returns:
            The string produced by decoding the concatenated, truncated ids.
        """
        wrapped_example, _others = wrapped_example

        encoder_inputs = defaultdict(list)
        for piece in wrapped_example:
            encode_text = self.tokenizer.encode(
                piece['text'], add_special_tokens=False,
                return_special_tokens_mask=True)
            encoder_inputs['input_ids'].append(encode_text)
            # One flag per token so the flags can be truncated in lockstep.
            encoder_inputs['shortenable_ids'].append(
                [piece['shortenable_ids']] * len(encode_text))

        encoder_inputs = self.truncate(encoder_inputs=encoder_inputs)
        # The flags are only needed for truncation; drop them before joining.
        encoder_inputs.pop("shortenable_ids")
        encoder_inputs = self.concate_parts(input_dict=encoder_inputs)
        decoded_inputs = self.tokenizer.decode(
            encoder_inputs['input_ids'], clean_up_tokenization_spaces=False)
        return decoded_inputs

    @staticmethod
    def _is_shortenable(flags) -> bool:
        """Tell whether a part may be shortened, judging by its first flag.

        Empty parts (a piece that tokenized to nothing) are never shortenable.
        """
        return len(flags) > 0 and flags[0] != 0

    @staticmethod
    def _sequential_cuts(shortenable_ids, num_tokens_to_truncate, from_head):
        """Plan how many tokens to drop from each part, one part after another.

        Shortenable parts are consumed in order (first to last when
        ``from_head`` is true, last to first otherwise) until the requested
        number of tokens has been accounted for.
        """
        cuts = [0] * len(shortenable_ids)
        remaining = num_tokens_to_truncate
        indices = range(len(shortenable_ids))
        for idx in (indices if from_head else reversed(indices)):
            if remaining <= 0:
                break
            flags = shortenable_ids[idx]
            if not MLMTokenizerWrapper._is_shortenable(flags):
                continue
            cuts[idx] = min(remaining, len(flags))
            remaining -= cuts[idx]
        return cuts

    @staticmethod
    def balanced_truncate(input_dict: Dict,
                          num_tokens_to_truncate: int = 0) -> Dict:
        """Truncate with balance: each shortenable part loses a number of
        tokens proportional to its length (always cut from the part's end).

        Returns the input unchanged (as new lists) when there is nothing to
        cut or no shortenable part exists; ``truncate`` rejects unsatisfiable
        requests before this method is reached.
        """
        shortenable_lens = [
            len(flags) if MLMTokenizerWrapper._is_shortenable(flags) else 0
            for flags in input_dict['shortenable_ids']]
        total_shortenable_len = sum(shortenable_lens)

        if num_tokens_to_truncate <= 0 or total_shortenable_len == 0:
            # Avoids the division by zero and the pointless rounding pass.
            cuts = [0] * len(shortenable_lens)
        else:
            cuts = [part_len / total_shortenable_len * num_tokens_to_truncate
                    for part_len in shortenable_lens]
            # round_list turns the float quotas into integers in place.
            round_list(cuts, num_tokens_to_truncate)

        truncated_example = defaultdict(list)
        for key, parts in input_dict.items():
            for cut, part in zip(cuts, parts):
                truncated_example[key].append(part[:len(part) - cut])
        return truncated_example

    @staticmethod
    def truncate_from_tail(input_dict: Dict,
                           num_tokens_to_truncate: int = 0) -> Dict:
        r"""Truncate the inputs from the rear.

        Starting with the last shortenable part, tokens are removed from the
        end of each part until ``num_tokens_to_truncate`` tokens are gone.
        All keys of ``input_dict`` are assumed to hold parallel lists of
        parts; the input lists themselves are left untouched.
        """
        cuts = MLMTokenizerWrapper._sequential_cuts(
            input_dict['shortenable_ids'], num_tokens_to_truncate,
            from_head=False)
        truncated_example = defaultdict(list)
        for key, parts in input_dict.items():
            # len(part) - cut instead of -cut: a cut of 0 must keep the part.
            truncated_example[key] = [part[:len(part) - cut]
                                      for cut, part in zip(cuts, parts)]
        return truncated_example

    @staticmethod
    def truncate_from_head(input_dict: Dict,
                           num_tokens_to_truncate: int = 0) -> Dict:
        r"""Truncate the inputs from the head.

        Starting with the first shortenable part, tokens are removed from the
        beginning of each part until ``num_tokens_to_truncate`` tokens are
        gone. All keys of ``input_dict`` are assumed to hold parallel lists
        of parts; the input lists themselves are left untouched.
        """
        cuts = MLMTokenizerWrapper._sequential_cuts(
            input_dict['shortenable_ids'], num_tokens_to_truncate,
            from_head=True)
        truncated_example = defaultdict(list)
        for key, parts in input_dict.items():
            truncated_example[key] = [part[cut:]
                                      for cut, part in zip(cuts, parts)]
        return truncated_example

    @staticmethod
    def concate_parts(input_dict: Dict) -> Dict:
        """Flatten every list of parts in ``input_dict`` into one flat list.

        The dictionary is modified in place and also returned.
        """
        for key in input_dict:
            input_dict[key] = list(itertools.chain(*input_dict[key]))
        return input_dict

    def truncate(self, encoder_inputs):
        """Shorten ``encoder_inputs`` so tokens plus specials fit the limit.

        Updates the ``total_passed_sentences`` / ``num_truncated_sentences``
        counters as a side effect.

        Raises:
            RuntimeError: If more tokens must go than the shortenable parts
                contain.
        """
        total_tokens = sum(len(part) for part in encoder_inputs['input_ids'])
        num_tokens_to_truncate = (total_tokens - self.max_seq_length
                                  + self.num_special_tokens_to_add)
        self.total_passed_sentences += 1
        if num_tokens_to_truncate > 0:
            self.num_truncated_sentences += 1
            # Only tokens of shortenable parts can actually be removed.
            num_shortenable = sum(
                len(flags) for flags in encoder_inputs['shortenable_ids']
                if self._is_shortenable(flags))
            if num_tokens_to_truncate > num_shortenable:
                raise RuntimeError("num_tokens_to_truncate larger than number of shortenable tokens.")
            encoder_inputs = self.truncate_fct(
                input_dict=encoder_inputs,
                num_tokens_to_truncate=num_tokens_to_truncate)
        return encoder_inputs
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
class AbstractTask(abc.ABC):
|
||||
name = NotImplemented
|
||||
config = NotImplemented
|
||||
|
@ -28,26 +231,44 @@ class AbstractTask(abc.ABC):
|
|||
{"train": "train", "validation": "validation", "test": "test"}
|
||||
small_datasets_without_all_splits = ["cola", "wnli", "rte", "superglue-cb", "superglue-copa", "superglue-multirc",
|
||||
"superglue-wic", "superglue-wsc.fixed", "superglue-rte", "mrpc", "stsb",
|
||||
"superglue-boolq"]
|
||||
large_data_without_all_splits = ["qqp", "qnli", "superglue-record", "sst2"]
|
||||
"superglue-boolq", "qqp", "qnli", "superglue-record", "sst2"]
|
||||
large_data_without_all_splits = [] #["qqp", "qnli", "superglue-record", "sst2"]
|
||||
|
||||
def __init__(self, config, seed=42):
|
||||
def __init__(self, config, data_args, tokenizer, predict_with_generate, seed=42, default_max_length=1):
|
||||
self.config = config
|
||||
self.seed = seed
|
||||
self.data_args = data_args
|
||||
self.tokenizer = tokenizer
|
||||
self.predict_with_generate = predict_with_generate
|
||||
self.default_max_length = default_max_length
|
||||
self.truncate_method = getattr(data_args, "truncate_method", "balanced")
|
||||
|
||||
tid = getattr(config, "template_id", 0)
|
||||
vid = getattr(config, "verbalizer_id", 0)
|
||||
generation_paradigm = getattr(config, "generation_paradigm", True)
|
||||
self.prompt = PromptCollections[self.name](tid, vid, generation_paradigm)
|
||||
|
||||
def get_max_target_length(self, tokenizer, default_max_length):
|
||||
if self.prompt.verbalizer is not None:
|
||||
return max([len(tokenizer.encode(label)) for key, label in self.prompt.verbalizer.items()])
|
||||
return default_max_length
|
||||
self.template = ManualTemplate(tokenizer=self.tokenizer, text = self.templates_text[tid])
|
||||
self.verbalizer = ManualVerbalizer(tokenizer=self.tokenizer, classes = self.labels_list, label_words=self.verbalizers[vid])
|
||||
|
||||
# if self.predict_with_generate:
|
||||
# self.reverse_verbalizer = {(int(x) for x in self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(self.verbalizer[label]))): label for label in self.labels_list}
|
||||
# else:
|
||||
# self.reverse_verbalizer = {int(self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(self.verbalizer[label]))[0]): label for label in self.labels_list}
|
||||
|
||||
self.tokenizer_wrapper = MLMTokenizerWrapper(max_seq_length=self.data_args.max_source_length, tokenizer=self.tokenizer, truncate_method=self.truncate_method)
|
||||
|
||||
generation_paradigm = getattr(config, "generation_paradigm", True)
|
||||
# self.prompt = PromptCollections[self.name](tid, vid, generation_paradigm)
|
||||
self.max_target_length = self.get_max_target_length(self.default_max_length)
|
||||
|
||||
def get_max_target_length(self, default_max_length):
    """Return the target length to use for this task.

    When ``self.predict_with_generate`` is false, ``default_max_length`` is
    returned as is. Otherwise the result is the length of the longest entry
    in ``self.verbalizer.label_words_ids``.
    """
    if not self.predict_with_generate:
        return default_max_length
    return max([len(word_ids)
                for _label, word_ids in self.verbalizer.label_words_ids.items()])
|
||||
|
||||
def seq2seq_format(self, source, target, extra_fields={}
|
||||
):
|
||||
|
||||
|
||||
return {'source': ' '.join(source),
|
||||
'target': ' '.join(target),
|
||||
'task': self.name,
|
||||
|
@ -59,7 +280,7 @@ class AbstractTask(abc.ABC):
|
|||
n_obs = total_size
|
||||
logger.warning("n_obs is set to %s", n_obs)
|
||||
return n_obs
|
||||
|
||||
|
||||
def shuffled_indices(self, dataset):
|
||||
num_samples = len(dataset)
|
||||
generator = torch.Generator()
|
||||
|
@ -91,19 +312,42 @@ class AbstractTask(abc.ABC):
|
|||
else:
|
||||
return indices[validation_size:]
|
||||
|
||||
|
||||
def map_dataset(self, dataset, add_prefix):
|
||||
# from IPython import embed; embed(header="in get target length")
|
||||
return dataset.map(self.preprocessor)
|
||||
|
||||
|
||||
|
||||
def map_dataset(self, dataset):
|
||||
# from IPython import embed; embed(header="in get target length")
|
||||
return dataset.map(self.preprocessor).map(self.tokenizer_preprocessor)
|
||||
|
||||
def preprocessor(self, example):
|
||||
source, target = self.prompt(example)
|
||||
return self.seq2seq_format(source, target, extra_fields={})
|
||||
|
||||
def get(self, split, add_prefix=True, n_obs=None, split_validation_test=False):
|
||||
return example
|
||||
|
||||
def tokenizer_preprocessor(self, example):
    """Wrap one dataset row with the task template and encode it as text.

    ``example['label']`` and ``example['idx']`` become the label and guid of
    an ``InputExample``; every other field is passed along as its ``meta``.
    The example is wrapped by ``self.template`` and merged into one string
    by ``self.tokenizer_wrapper``.

    Raises:
        NotImplementedError: When ``self.predict_with_generate`` is set.
    """
    label = example['label']
    guid = example['idx']
    # Everything except the label and the index travels as metadata.
    meta = {field: value for field, value in dict(example).items()
            if field not in ("label", "idx")}

    input_example = InputExample(meta=meta, label=label, guid=guid)
    wrapped = self.template.wrap_one_example(input_example)
    encoded_sentence = self.tokenizer_wrapper.merge_wrapped_example(wrapped)

    if self.predict_with_generate:
        # The generation-style output format has not been written yet.
        raise NotImplementedError
    return {
        "source": encoded_sentence,
        "label": label,
        'target': '',
        'extra_fields': {'dataset_name': self.name},
    }
|
||||
|
||||
|
||||
def get(self, split, n_obs=None, split_validation_test=False):
|
||||
# For small datasets (n_samples < 10K) without test set, we divide validation set to
|
||||
# half, use one half as test set and one half as validation set.
|
||||
if split in ["eval", "dev", "valid"]:
|
||||
split = "validation"
|
||||
if split_validation_test and self.name in self.small_datasets_without_all_splits \
|
||||
and split != "train":
|
||||
mapped_split = self.split_to_data_split["validation"]
|
||||
|
@ -124,7 +368,7 @@ class AbstractTask(abc.ABC):
|
|||
# shuffles the data and samples it.
|
||||
if n_obs is not None:
|
||||
dataset = self.subsample(dataset, n_obs)
|
||||
return self.map_dataset(dataset, add_prefix)
|
||||
return self.map_dataset(dataset)
|
||||
|
||||
class Squad(AbstractTask):
|
||||
name = "squad"
|
||||
|
@ -190,6 +434,10 @@ class SST2(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
verbalizers = [
|
||||
|
||||
]
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'sst2',
|
||||
split=split, script_version="master")
|
||||
|
@ -278,6 +526,7 @@ class QNLI(AbstractTask):
|
|||
tgt_texts = [str(example['label'])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
|
||||
#Tested
|
||||
class RTE(AbstractTask):
|
||||
name = "rte"
|
||||
labels_list = ["0", "1"]
|
||||
|
@ -287,15 +536,56 @@ class RTE(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
|
||||
templates_text = [
|
||||
"""sentence1: {"meta": 'sentence1', "shortenable":True}. sentence2:,"""+
|
||||
"""{"meta":"sentence2", "shortenable":True}. The answer was {"mask"}.""",
|
||||
]
|
||||
|
||||
verbalizers = [{
|
||||
"0": "yes",
|
||||
"1": "no"
|
||||
}]
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'rte',
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.rte")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'rte',
|
||||
split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["sentence1:", example['sentence1'],
|
||||
"sentence2:", example["sentence2"]]
|
||||
tgt_texts = [str(example['label'])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
|
||||
#Tested
|
||||
class SuperGLUEBoolQ(AbstractTask):
    """SuperGLUE BoolQ task: configuration constants plus dataset loading."""

    name = "superglue-boolq"
    labels_list = ['0', '1']
    metric = [metrics.accuracy]
    metric_names = ["accuracy"]
    # The "test" split is served from the validation data.
    split_to_data_split = {
        "train": "train",
        "validation": "validation",
        "test": "validation",
    }

    # Label id -> label word, one mapping per verbalizer id.
    verbalizers = [
        {"0": "no", "1": "yes"},
    ]
    mlmhead_verbalizers = {"0": "no", "1": "yes"}
    # Prompt templates, one per template id.
    templates_text = [
        """hypothesis: {"meta": "question", "shortenable":True} premise: {"meta":"passage", "shortenable":True} The answer was {"mask"}."""
    ]

    def load_dataset(self, split):
        """Return ``split`` of BoolQ, read from disk when
        ``data_args.datasets_load_from_disk`` is set, else via ``datasets``."""
        if not self.data_args.datasets_load_from_disk:
            return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master")
        saved = datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.boolq")
        return saved[split]
|
||||
|
||||
|
||||
|
||||
|
||||
class WNLI(AbstractTask):
|
||||
|
@ -307,53 +597,23 @@ class WNLI(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('glue', 'wnli', split=split, script_version="master")
|
||||
verbalizers = [{
|
||||
"0": "True",
|
||||
"1": "False",
|
||||
}]
|
||||
templates_text = [
|
||||
"""{"meta": 'sentence1',"shortenable":True} Does it mean the following: "{"meta":'sentence2'}"? {"mask"}."""
|
||||
]
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["sentence1:", example['sentence1'],
|
||||
"sentence2:", example["sentence2"]]
|
||||
tgt_texts = [str(example['label'])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
|
||||
|
||||
class SuperGLUEBoolQ(AbstractTask):
|
||||
name="superglue-boolq"
|
||||
labels_list = ['0', '1']
|
||||
metric = [metrics.accuracy]
|
||||
metric_names = ["accuracy"]
|
||||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'boolq', split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["question:", example["question"], "passage:", example["passage"]]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
|
||||
|
||||
class SuperGLUERTE(AbstractTask):
|
||||
name="superglue-rte"
|
||||
labels_list = ['0', '1']
|
||||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.accuracy]
|
||||
metric_names = ["accuracy"]
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'rte', split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["premise:", example["premise"],
|
||||
"hypothesis:", example["hypothesis"]]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/glue.wnli")[split]
|
||||
else:
|
||||
return datasets.load_dataset('glue', 'wnli', split=split, script_version="master")
|
||||
|
||||
|
||||
#
|
||||
class SuperGLUECB(AbstractTask):
|
||||
name = "superglue-cb"
|
||||
labels_list = ['0', '1', '2']
|
||||
|
@ -363,13 +623,20 @@ class SuperGLUECB(AbstractTask):
|
|||
metric = [metrics.mean_multiclass_f1(num_classes=3), metrics.accuracy]
|
||||
metric_names = ["f1_multiclass", "accuracy"]
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master")
|
||||
verbalizers = [{
|
||||
"0": "yes",
|
||||
"1": "no",
|
||||
"2": "maybe"
|
||||
}]
|
||||
templates_text = [
|
||||
"""hypothesis: {"meta": 'hypothesis',"shortenable":True} premise: {"meta":'premise', "shortenable":True} The answer was {"mask"}."""
|
||||
]
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["premise:", example["premise"], "hypothesis:", example["hypothesis"]]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
def load_dataset(self, split):
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.cb")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'cb', split=split, script_version="master")
|
||||
|
||||
|
||||
class SuperGLUECOPA(AbstractTask):
|
||||
|
@ -379,17 +646,21 @@ class SuperGLUECOPA(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.accuracy]
|
||||
metric_names = ["accuracy"]
|
||||
metric_names = ["accuracy"]
|
||||
|
||||
verbalizers = [{
|
||||
"0": "1",
|
||||
"1": "2",
|
||||
}]
|
||||
templates_text = [
|
||||
"""choice1: {"meta":"choice1"} choice2: {"meta":"choice2"} premise: {"meta":"premise", "shortenable":True} The {"meta":"question"} answer was choice{"mask"}."""
|
||||
]
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["premise:", example["premise"],
|
||||
"choice1:", example["choice1"],
|
||||
"choice2:", example["choice2"]]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.copa")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'copa', split=split, script_version="master")
|
||||
|
||||
|
||||
class SuperGLUEMultiRC(AbstractTask):
|
||||
|
@ -398,31 +669,47 @@ class SuperGLUEMultiRC(AbstractTask):
|
|||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.multirc_f1_over_all_answers,
|
||||
metrics.mean_group_metric(metrics.exact_match)]
|
||||
metric = [metrics.f1_score,
|
||||
metrics.accuracy]
|
||||
metric_names = ["f1", "em"]
|
||||
|
||||
# generation_verbalizers = [{
|
||||
# "0": "no",
|
||||
# "1": "yes",
|
||||
# },
|
||||
# ]
|
||||
|
||||
verbalizers = [{
|
||||
"0": "no",
|
||||
"1": "yes",
|
||||
}]
|
||||
templates_text = [
|
||||
"""question: {"meta":"question", "shortenable":False} answer: {"meta":"answer", "shortenable":False, "post_processing": lambda x:x+"."} paragraph: {"meta":"paragraph", "shortenable":True} The answer was {"mask"}."""
|
||||
]
|
||||
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master")
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.multirc")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'multirc', split=split, script_version="master")
|
||||
|
||||
def remove_markup(self, text):
    """Strip HTML markup: ``<br>`` becomes a space, ``<b>``/``</b>`` vanish."""
    without_breaks = re.sub('<br>', ' ', text)
    return re.sub('<(/)?b>', '', without_breaks)
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
group = example['idx']['question']
|
||||
# T5 applies remove_markup to the joined string, but this should not make
|
||||
def preprocessor(self, example):
|
||||
# T5 applies remove_markup to the joined string, but this should not make
|
||||
# any difference as well.
|
||||
# https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/data/preprocessors.py#L797
|
||||
src_texts = ["question:", self.remove_markup(example["question"]),
|
||||
"answer:", self.remove_markup(example["answer"]),
|
||||
"paragraph:", self.remove_markup(example["paragraph"])]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix, extra_fields={"group": group})
|
||||
# https://github.com/google-research/text-to-text-transfer-transformer/blob/a1352e625db7ec114062f99d99b0565b9e45c155/t5/data/preprocessors.py#L797
|
||||
example["question"] = self.remove_markup(example["question"])
|
||||
example["answer"] = self.remove_markup(example["answer"])
|
||||
example["paragraph"] = self.remove_markup(example["paragraph"])
|
||||
return example
|
||||
|
||||
|
||||
|
||||
|
||||
class SuperGLUEWIC(AbstractTask):
|
||||
name = "superglue-wic"
|
||||
|
@ -431,68 +718,75 @@ class SuperGLUEWIC(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.accuracy]
|
||||
metric_names = ["accuracy"]
|
||||
metric_names = ["accuracy"]
|
||||
|
||||
verbalizers = [{
|
||||
"0": "No",
|
||||
"1": "Yes",
|
||||
}]
|
||||
|
||||
templates_text = [
|
||||
"""sentence1: {"meta":"sentence1"} sentence2: {"meta":"sentence2", "shortenable": True} word: {"meta":"word"} {"mask"}.
|
||||
"""
|
||||
]
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
|
||||
src_texts = ["sentence1:", example["sentence1"],
|
||||
"sentence2:", example["sentence2"],
|
||||
"word:", example["word"]]
|
||||
tgt_texts = [str(example["label"])]
|
||||
return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
if self.data_args.datasets_load_from_disk:
|
||||
return datasets.load_from_disk(f"{self.data_args.datasets_saved_path}/super_glue.wic")[split]
|
||||
else:
|
||||
return datasets.load_dataset('super_glue', 'wic', split=split, script_version="master")
|
||||
|
||||
|
||||
class SuperGLUEWSCFixed(AbstractTask):
|
||||
# source: https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py
|
||||
"""Convert WSC examples to text2text format.
|
||||
WSC includes a sentence along with 2 'spans': the first denoting a noun and
|
||||
the other a pronoun. The 'label' specifies whether or not the pronoun is
|
||||
referencing the noun. This preprocessor puts ' * ' around the noun and ' # '
|
||||
around the pronoun.
|
||||
For example, a typical example from WSC might look like
|
||||
{
|
||||
'text': 'This is a test sentence .',
|
||||
'span1_text': 'test',
|
||||
'span1_index': 3,
|
||||
'span2_text': 'This',
|
||||
'span2_index': 0,
|
||||
'label': 0
|
||||
}
|
||||
This example would be transformed to
|
||||
{
|
||||
'inputs': 'wsc text: # This # is a * test * sentence .',
|
||||
'targets': 'False'
|
||||
}
|
||||
"""
|
||||
name = "superglue-wsc.fixed"
|
||||
labels_list = ['0', '1']
|
||||
split_to_data_split = {"train": "train",
|
||||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.accuracy]
|
||||
metric_names = ["accuracy"]
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'wsc.fixed', split=split, script_version="master")
|
||||
# class SuperGLUEWSCFixed(AbstractTask):
|
||||
# # source: https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py
|
||||
# """Convert WSC examples to text2text format.
|
||||
# WSC includes a sentence along with 2 'spans': the first denoting a noun and
|
||||
# the other a pronoun. The 'label' specifies whether or not the pronoun is
|
||||
# referencing the noun. This preprocessor puts ' * ' around the noun and ' # '
|
||||
# around the pronoun.
|
||||
# For example, a typical example from WSC might look like
|
||||
# {
|
||||
# 'text': 'This is a test sentence .',
|
||||
# 'span1_text': 'test',
|
||||
# 'span1_index': 3,
|
||||
# 'span2_text': 'This',
|
||||
# 'span2_index': 0,
|
||||
# 'label': 0
|
||||
# }
|
||||
# This example would be transformed to
|
||||
# {
|
||||
# 'inputs': 'wsc text: # This # is a * test * sentence .',
|
||||
# 'targets': 'False'
|
||||
# }
|
||||
# """
|
||||
# name = "superglue-wsc.fixed"
|
||||
# labels_list = ['0', '1']
|
||||
# split_to_data_split = {"train": "train",
|
||||
# "validation": "validation",
|
||||
# "test": "validation"}
|
||||
# metric = [metrics.accuracy]
|
||||
# metric_names = ["accuracy"]
|
||||
|
||||
def _mark_span(self, text, span_str, span_idx, mark):
    """Surround the span that starts at word ``span_idx`` with ``mark``.

    The text is treated as whitespace-separated words. If ``span_str``
    is found immediately after the first ``span_idx`` words it is rewritten
    as ``"<mark> <span_str> <mark>"``; otherwise ``text`` is returned
    unchanged.

    Args:
        text: the sentence to annotate.
        span_str: literal text of the span to mark.
        span_idx: number of whitespace-separated words preceding the span.
        mark: marker string placed on both sides of the span.

    Returns:
        The annotated sentence (or ``text`` itself when nothing matched).
    """
    # The span is data, not a pattern: escape it so characters such as
    # "(", ".", "$" or "\" are matched literally instead of being read as
    # regex syntax.
    pattern = (
        r'^((?:\S+\s){' + str(span_idx) + r'})('
        + re.escape(span_str) + r')'
    )

    def _wrap(match):
        # Build the replacement by hand so that neither the mark nor the
        # matched text is interpreted as a replacement template.
        return '{0}{1} {2} {1}'.format(match.group(1), mark, match.group(2))

    return re.sub(pattern, _wrap, text)
|
||||
# def load_dataset(self, split):
|
||||
# return datasets.load_dataset('super_glue', 'wsc.fixed', split=split, script_version="master")
|
||||
|
||||
def preprocessor(self, example, add_prefix=True):
    """Turn one WSC example into seq2seq form, marking both spans as T5 does.

    The first span (the noun) is wrapped in ``*`` and the second span (the
    pronoun) in ``#``; the label is emitted as its string form.

    Args:
        example: mapping with the keys ``text``, ``span1_text``,
            ``span1_index``, ``span2_text``, ``span2_index`` and ``label``.
        add_prefix: forwarded unchanged to ``self.seq2seq_format``.

    Returns:
        The result of ``self.seq2seq_format``.
    """
    noun_idx = example['span1_index']
    marked = self._mark_span(example['text'], example['span1_text'], noun_idx, '*')

    # Wrapping the first span inserted two extra "words" (the two marks), so
    # a second span that sits after it has moved two positions to the right.
    pronoun_idx = example['span2_index']
    if noun_idx < pronoun_idx:
        pronoun_idx = pronoun_idx + 2
    marked = self._mark_span(marked, example['span2_text'], pronoun_idx, '#')

    return self.seq2seq_format(["text:", marked], [str(example["label"])], add_prefix)
|
||||
# def _mark_span(self, text, span_str, span_idx, mark):
|
||||
# pattern_tmpl = r'^((?:\S+\s){N})(W)'
|
||||
# pattern = re.sub('N', str(span_idx), pattern_tmpl)
|
||||
# pattern = re.sub('W', span_str, pattern)
|
||||
# return re.sub(pattern, r'\1{0} \2 {0}'.format(mark), text)
|
||||
|
||||
# def preprocessor(self, example, add_prefix=True):
|
||||
# # converts text as done in T5.
|
||||
# text = example['text']
|
||||
# text = self._mark_span(text, example['span1_text'], example['span1_index'], '*')
|
||||
# # Compensate for the 2 "words" added in the previous step.
|
||||
# span2_index = example['span2_index'] + 2 * int(example['span1_index'] < example['span2_index'])
|
||||
# text = self._mark_span(text, example['span2_text'], span2_index, '#')
|
||||
# src_texts = ["text:", text]
|
||||
# tgt_texts = [str(example["label"])]
|
||||
# return self.seq2seq_format(src_texts, tgt_texts, add_prefix)
|
||||
|
||||
|
||||
class SuperGLUERecord(AbstractTask):
|
||||
|
@ -526,8 +820,8 @@ class SuperGLUERecord(AbstractTask):
|
|||
"validation": "validation",
|
||||
"test": "validation"}
|
||||
metric = [metrics.squad]
|
||||
metric_names = ["squad"]
|
||||
|
||||
metric_names = ["squad"]
|
||||
|
||||
def load_dataset(self, split):
|
||||
return datasets.load_dataset('super_glue', 'record', split=split, script_version="master")
|
||||
|
||||
|
@ -542,18 +836,18 @@ class SuperGLUERecord(AbstractTask):
|
|||
passage = re.sub(r'\n@highlight\n', '. ', passage)
|
||||
inputs = f"record query: {ex['query']} entities: {', '.join(ex['entities'])} passage: {passage}"
|
||||
if add_prefix:
|
||||
inputs = self.name + " " + inputs
|
||||
inputs = self.name + " " + inputs
|
||||
# duplicates the samples based on number of answers.
|
||||
num_answers = len(ex["answers"])
|
||||
num_duplicates = np.maximum(1, num_answers)
|
||||
new_batch["source"].extend([inputs] * num_duplicates)
|
||||
new_batch["source"].extend([inputs] * num_duplicates)
|
||||
new_batch["target"].extend(ex["answers"] if num_answers > 0 else ["<unk>"])
|
||||
new_batch["task"].extend([self.name] * num_duplicates)
|
||||
new_batch["extra_fields"].extend([{"answers": ex["answers"]}]*num_duplicates)
|
||||
new_batch["extra_fields"].extend([{"answers": ex["answers"]}]*num_duplicates)
|
||||
return new_batch
|
||||
|
||||
|
||||
def map_dataset(self, dataset, add_prefix=True):
    """Apply ``self.preprocessor`` to ``dataset`` in batched mode.

    Args:
        dataset: object exposing ``map`` and ``column_names``.
        add_prefix: bound as a keyword argument of ``self.preprocessor``.

    Returns:
        The result of ``dataset.map``; every column listed in
        ``dataset.column_names`` is passed as ``remove_columns``.
    """
    batch_fn = functools.partial(self.preprocessor, add_prefix=add_prefix)
    columns_to_drop = dataset.column_names
    return dataset.map(batch_fn, batched=True, remove_columns=columns_to_drop)
|
||||
|
||||
|
||||
|
@ -570,21 +864,20 @@ TASK_MAPPING = OrderedDict(
|
|||
('qqp', QQP),
|
||||
('stsb', STSB),
|
||||
('superglue-boolq', SuperGLUEBoolQ),
|
||||
('superglue-rte', SuperGLUERTE),
|
||||
('superglue-cb', SuperGLUECB),
|
||||
('superglue-copa', SuperGLUECOPA),
|
||||
('superglue-multirc', SuperGLUEMultiRC),
|
||||
('superglue-wic', SuperGLUEWIC),
|
||||
('superglue-wsc.fixed', SuperGLUEWSCFixed),
|
||||
# ('superglue-wsc.fixed', SuperGLUEWSCFixed),
|
||||
('superglue-record', SuperGLUERecord)
|
||||
]
|
||||
)
|
||||
|
||||
class AutoTask:
|
||||
@classmethod
|
||||
def get(self, task, config, seed=42):
|
||||
def get(self, task, config, data_args, tokenizer,predict_with_generate, seed=42):
|
||||
if task in TASK_MAPPING:
|
||||
return TASK_MAPPING[task](config, seed)
|
||||
return TASK_MAPPING[task](config, data_args, tokenizer,predict_with_generate, seed)
|
||||
raise ValueError(
|
||||
"Unrecognized task {} for AutoTask Model: {}.\n"
|
||||
"Task name should be one of {}.".format(
|
||||
|
|
|
@ -45,12 +45,51 @@ def spearman_corrcoef(predictions, targets) -> dict:
|
|||
spearman_corrcoef = 0
|
||||
return {"spearmanr": spearman_corrcoef}
|
||||
|
||||
|
||||
|
||||
def spearman_corrcoef(predictions, targets) -> dict:
    """Compute the Spearman correlation coefficient, multiplied by 100.

    Both inputs are converted with ``string_to_float`` before the
    correlation is computed.

    Returns:
        ``{"spearmanr": value}``, where ``value`` is 0 if the correlation
        is NaN.
    """
    # TODO: we need to do postprocessors in a clean way for each dataset.
    from examples_seq2seq.data_processors.postprocessors import string_to_float

    gold = list(map(string_to_float, targets))
    guesses = list(map(string_to_float, predictions))
    score = 100 * scipy.stats.spearmanr(gold, guesses)[0]

    # If all the predictions are the same, the Spearman correlation is NaN;
    # to guard against this, report 0 in that case.
    return {"spearmanr": 0 if math.isnan(score) else score}
|
||||
|
||||
|
||||
def f1_score_with_invalid(predictions, targets) -> dict:
    """Compute F1 (times 100), counting any prediction other than '0'/'1' as wrong.

    Labels are compared as the strings ``'0'`` and ``'1'`` and only cast to
    integers right before scoring.

    Args:
        predictions: sequence of predicted labels; any entry that is not
            the string ``'0'`` or ``'1'`` is treated as invalid.
        targets: sequence of gold labels, compared against the string ``'1'``.

    Returns:
        ``{"f1": value}`` where every invalid prediction has been scored as
        a mistake.
    """
    def binary_reverse(labels):
        return ['0' if label == '1' else '1' for label in labels]

    # np.array always copies, whereas np.asarray hands back the caller's own
    # ndarray when given one. The in-place patching below must not leak
    # into the caller's data.
    targets, predictions = np.array(targets), np.array(predictions)

    # Get indices of invalid predictions.
    invalid_idx_mask = np.logical_and(predictions != '0', predictions != '1')

    # For any prediction != 0 or 1, set the prediction to the opposite of
    # its corresponding target so it is guaranteed to count as an error.
    predictions[invalid_idx_mask] = binary_reverse(targets[invalid_idx_mask])

    targets = targets.astype(np.int32)
    predictions = predictions.astype(np.int32)
    return {"f1": 100 * sklearn.metrics.f1_score(targets, predictions)}
|
||||
|
||||
|
||||
|
||||
def transform_for_generation(predictions, targets):
    """Map generated label strings and gold labels onto integer class ids.

    The id space is built from the distinct values in ``targets``. A
    prediction that is not one of those values is mapped to an id different
    from its target, so it is always scored as a mistake.

    Args:
        predictions: sequence of predicted labels (hashable values).
        targets: sequence of gold labels (hashable values), same length.

    Returns:
        ``(predictions, targets)`` as two NumPy integer arrays.
    """
    # Iterating a set of strings yields a different order from run to run
    # (hash randomisation), which would make the label -> id assignment,
    # and any metric that singles out class 1, non-reproducible.
    try:
        ordered_labels = sorted(set(targets))
    except TypeError:
        # Labels that cannot be compared with each other: fall back to
        # first-occurrence order, which is still deterministic.
        ordered_labels = list(dict.fromkeys(targets))
    mapping = {label: idx for idx, label in enumerate(ordered_labels)}

    targets = np.asarray([mapping[label] for label in targets])

    # With a single known class, (t + 1) % 1 would equal the target itself
    # and score an unknown prediction as correct; use a modulus of at
    # least 2 so the substitute id always differs from the target.
    modulus = max(len(mapping), 2)
    predictions = np.asarray([
        mapping[pred] if pred in mapping else (tgt + 1) % modulus
        for tgt, pred in zip(targets, predictions)
    ])

    return predictions, targets
|
||||
|
||||
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0002561697332863371,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/10940816",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0017750209757755706,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/1107862",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 8.499916262600587e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/15328099",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0006091646696452159,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/15991793",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.020109951371648067,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/19489534",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.005159882530578781,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/2281342",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.006869610954981632,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/26349674",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0002723799659564822,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28219263",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0018605158382269157,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28244173",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0001248231069039661,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28313708",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0009490000624893097,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28844651",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 3.5602209401278214e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/28881946",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.004220683008677483,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/29695566",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.004159184883370181,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/304080",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0009353172054773991,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/33594301",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0037650265946582574,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/37208828",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 6.867655291394631e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/38351436",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0022951686429675895,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/42338278",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0011474682877585407,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/43419391",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.009965694572181888,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/45030088",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
Binary file not shown.
Binary file not shown.
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0020236592832077785,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/50851153",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1 +0,0 @@
|
|||
{"batch_size": 64, "dataset_config_name": ["en"], "delta_type": "bitfit", "do_eval": true, "do_test": true, "do_train": true, "eval_dataset_config_name": ["en"], "eval_dataset_name": "mrpc", "eval_steps": 200, "evaluation_strategy": "steps", "gradient_accumulation_steps": 1, "greater_is_better": true, "learning_rate": 0.0020236592832077785, "load_best_model_at_end": true, "max_source_length": 128, "max_steps": 5000, "metric_for_best_model": "average_metrics", "model_name_or_path": "t5-base", "output_dir": "outputs_search/bitfit.mrpc.t5-base/50851153", "overwrite_output_dir": true, "per_device_eval_batch_size": 64, "per_device_train_batch_size": 64, "predict_with_generate": true, "push_to_hub": false, "save_steps": 200, "save_strategy": "steps", "save_total_limit": 1, "seed": 100, "split_validation_test": true, "task_name": "mrpc", "test_dataset_config_name": ["en"], "test_dataset_name": "mrpc", "tokenizer_name": "t5-base", "warmup_steps": 0}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.011098597581779427,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/57783553",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0005414844782319124,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/6060488",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.016927560240899083,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/61860753",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 1.0141082015912518e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/63232091",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0018137027382556477,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/6329472",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.023938918670661075,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/64753972",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.08212873599011565,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/65221118",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 4.8538530604501934e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/66798551",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0056649657801790786,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/67615376",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.03495857107255486,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/6773136",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.00039059864620439417,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/68027569",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0002642938525995798,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/68314189",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.037536374095955345,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/71501650",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.008866400032296955,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/73962149",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.01086484610816823,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/83260414",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 1.2611496517588744e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/83839551",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0010110776655071255,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/85624941",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0005414844782319124,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/86039549",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0027955533792956614,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/89676181",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0012573200149141731,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/91446644",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.001152480984285531,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/92427532",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.002464124578330328,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/93923515",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.000127337205276883,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/96799644",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.017304287780519442,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/97118516",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.057233123182472576,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/97177600",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.041620230849224296,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/97660529",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0005420479832650441,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/98459622",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0026938134462562973,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/99566760",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.00702408842393251,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/99826259",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "mrpc",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.00702408842393251,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.mrpc.t5-base/99826259",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "mrpc",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "mrpc",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
Binary file not shown.
Before Width: | Height: | Size: 103 KiB |
Binary file not shown.
Before Width: | Height: | Size: 186 KiB |
Binary file not shown.
Before Width: | Height: | Size: 34 KiB |
Binary file not shown.
Before Width: | Height: | Size: 56 KiB |
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 1.1032607780913182e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/1123702",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 9.869021064463024e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/12173417",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.000913136097576348,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/14983360",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 1.1605972169428286e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/17148549",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 2.8707127478048054e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/18069491",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.000194974976225138,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/20719975",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.03781286205477464,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/26158876",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0618810008699179,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/28522034",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 6.262592496186088e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/29099149",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 4.8538530604501934e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/30778533",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0009445961555576889,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/33442523",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.020109951371648067,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/35699804",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 6.0263760479697114e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/41924547",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 7.833953000267327e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/45992418",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0005632466045355159,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/46821674",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.008374542128252581,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/47176009",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 7.170024484707928e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/47432895",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.00043437143218908386,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/47615745",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.00010014550655645348,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/49804166",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 6.867655291394631e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/52735972",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 1.1566052650322366e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/56419593",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 1.760953133010801e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/57722645",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.037536374095955345,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/60994585",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.00020538230336950936,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/63007050",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0002561697332863371,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/63359012",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 32,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.02044226389800505,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/63678776",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 32,
|
||||
"per_device_train_batch_size": 32,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 1.4137672005343143e-05,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/64148506",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 16,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.08212873599011565,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/68062044",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 16,
|
||||
"per_device_train_batch_size": 16,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.00012472437804600788,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/69091480",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 0
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0004055014962899548,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/69407180",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 2,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.0005726557355744265,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/71736541",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
|
@ -1,42 +0,0 @@
|
|||
{
|
||||
"batch_size": 64,
|
||||
"dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"delta_type": "bitfit",
|
||||
"do_eval": true,
|
||||
"do_test": true,
|
||||
"do_train": true,
|
||||
"eval_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"eval_dataset_name": "rte",
|
||||
"eval_steps": 200,
|
||||
"evaluation_strategy": "steps",
|
||||
"gradient_accumulation_steps": 1,
|
||||
"greater_is_better": true,
|
||||
"learning_rate": 0.00012167338692493834,
|
||||
"load_best_model_at_end": true,
|
||||
"max_source_length": 128,
|
||||
"max_steps": 5000,
|
||||
"metric_for_best_model": "average_metrics",
|
||||
"model_name_or_path": "t5-base",
|
||||
"output_dir": "outputs_search/bitfit.rte.t5-base/71901921",
|
||||
"overwrite_output_dir": true,
|
||||
"per_device_eval_batch_size": 64,
|
||||
"per_device_train_batch_size": 64,
|
||||
"predict_with_generate": true,
|
||||
"push_to_hub": false,
|
||||
"save_steps": 200,
|
||||
"save_strategy": "steps",
|
||||
"save_total_limit": 1,
|
||||
"seed": 100,
|
||||
"split_validation_test": true,
|
||||
"task_name": "rte",
|
||||
"test_dataset_config_name": [
|
||||
"en"
|
||||
],
|
||||
"test_dataset_name": "rte",
|
||||
"tokenizer_name": "t5-base",
|
||||
"warmup_steps": 500
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue