{
  "backbone_model": "blenderbot",
  "dataset_config_name": [
    "en"
  ],
  "datasets_load_from_disk": true,
  "datasets_saved_path": "/mnt/sfs_turbo/hsd/huggingface_datasets/saved_to_disk/",
  "delta_type": "compacter",
  "do_eval": true,
  "do_test": true,
  "do_train": true,
  "eval_dataset_config_name": [
    "en"
  ],
  "eval_dataset_name": "sst2",
  "eval_steps": 200,
  "evaluation_strategy": "steps",
  "factorized_phm": true,
  "factorized_phm_rule": false,
  "gradient_clip": false,
  "greater_is_better": true,
  "hypercomplex_adapters": true,
  "hypercomplex_division": 4,
  "hypercomplex_nonlinearity": "glorot-uniform",
  "learn_phm": true,
  "learning_rate": 0.003,
  "load_best_model_at_end": true,
  "max_source_length": 128,
  "metric_for_best_model": "average_metrics",
  "model_name_or_path": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
  "model_path_public": "blenderbot-3b",
  "non_linearity": "gelu_new",
  "normalize_phm_weight": false,
  "num_train_epochs": 3,
  "output_dir": "outputs/compacter/blenderbot-3b/sst2",
  "overwrite_output_dir": true,
  "per_device_eval_batch_size": 32,
  "per_device_train_batch_size": 32,
  "phm_c_init": "normal",
  "phm_clamp": false,
  "phm_init_range": 0.0001,
  "predict_with_generate": true,
  "push_to_dc": true,
  "push_to_hf": false,
  "save_steps": 200,
  "save_strategy": "steps",
  "save_total_limit": 1,
  "seed": 42,
  "shared_phm_rule": false,
  "split_validation_test": true,
  "task_name": "sst2",
  "test_dataset_config_name": [
    "en"
  ],
  "test_dataset_name": "sst2",
  "tokenizer_name": "/mnt/sfs_turbo/hsd/plm_cache/blenderbot-3b",
  "unfrozen_modules": [
    "deltas",
    "layer_norm",
    "final_layer_norm"
  ],
  "use_bias_down_sampler": true,
  "use_bias_up_sampler": true,
  "warmup_steps": 0,
  "modified_modules": [
    "fc2"
  ]
}