diff --git a/.gitignore b/.gitignore
index fd82c1a..11a8cb8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,6 @@ log.txt
 *beans/
 **/examples/*/configs/
 !examples/*/configs/config_gen.py
+**/jupyter_notebook_examples/
+!examples/jupyter_notebook_examples/*.py
+
diff --git a/examples/examples_seq2seq/configs/config_gen.py b/examples/examples_seq2seq/configs/config_gen.py
index 073a112..00d013b 100644
--- a/examples/examples_seq2seq/configs/config_gen.py
+++ b/examples/examples_seq2seq/configs/config_gen.py
@@ -46,6 +46,50 @@ BaseConfigs['t5-base'] = {
                 "save_strategy": "steps"
                 }
 
+
+BaseConfigs['t5-xxl'] = {
+                ("job_name", "task_name", "eval_dataset_name", "test_dataset_name", "num_train_epochs",
+                "max_source_length",
+                "per_device_train_batch_size", "per_device_eval_batch_size", "warmup_steps","save_steps", "eval_steps"): zip(
+                    ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record",
+                    "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
+                    ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
+                    ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
+                    ["superglue-boolq", "superglue-cb", "superglue-copa", "superglue-wic", "superglue-multirc", "superglue-record", "superglue-wsc.fixed", "mrpc", "cola", "sst2", "qnli", "rte", "mnli", "qqp", "stsb"],
+                    [ 20, 20, 40, 20, 3, 3, 20, 20, 20, 3, 3, 20, 3, 3, 20],
+                    [256, 256, 256, 256, 256, 512, 256, 128, 128, 128, 128, 128, 128, 128, 128],
+                    [ 32, 32, 32, 32, 32, 16, 32] + [4] * 8,
+                    [ 32, 32, 32, 32, 32, 16, 32] + [4] * 8,
+                    [0] *7 +[0] *8,
+                    [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
+                    [200, 100, 50, 100, 200, 200, 100, 200, 100, 200, 200, 100, 200, 200, 100],
+                ),
+                "do_train": True,
+                "do_eval": True,
+                "do_test": True,
+
+                "model_name_or_path": "/home/hushengding/plm_cache/t5-xxl-lm-adapt/",
+                "tokenizer_name": "/home/hushengding/plm_cache/t5-xxl-lm-adapt/",
+                "save_total_limit": 1,
+                # For glue datasets.
+                "split_validation_test": True,
+                "seed": 42,
+                "dataset_config_name": ["en"],
+                "eval_dataset_config_name": ["en"],
+                "test_dataset_config_name": ["en"],
+                # other configurations.
+                "predict_with_generate": True,
+                # To evaluate during training.
+                "load_best_model_at_end": True,
+                "metric_for_best_model": "average_metrics",
+                "greater_is_better": True,
+                "evaluation_strategy": "steps",
+                "overwrite_output_dir": True,
+                "push_to_hub": True,
+                "save_strategy": "steps",
+                "model_parallel": True
+                }
+
 AllConfigs['bitfit_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
 AllConfigs['bitfit_t5-base'].update({
                 "delta_type": "bitfit",
@@ -163,6 +207,21 @@ AllConfigs['low_rank_adapter_t5-base'].update({
                 "low_rank_rank": 1,
                 })
 
+AllConfigs['low_rank_adapter_t5-xxl'] = copy.deepcopy(BaseConfigs['t5-xxl'])
+AllConfigs['low_rank_adapter_t5-xxl'].update({
+                "delta_type": "low_rank_adapter",
+                "learning_rate": 3e-4,
+                "unfrozen_modules": [
+                    "deltas",
+                    "layer_norm",
+                    "final_layer_norm"
+                ],
+                "output_dir": "outputs/low_rank_adapter/t5-xxl/",
+                "non_linearity": "gelu_new",
+                "low_rank_w_init": "glorot-uniform",
+                "low_rank_rank": 1,
+                })
+
 AllConfigs['soft_prompt_t5-base'] = copy.deepcopy(BaseConfigs['t5-base'])
 AllConfigs['soft_prompt_t5-base'].update({
diff --git a/opendelta/basemodel.py b/opendelta/basemodel.py
index e6dc2de..6d544a6 100644
--- a/opendelta/basemodel.py
+++ b/opendelta/basemodel.py
@@ -632,7 +632,7 @@ class DeltaBase(nn.Module, SaveLoadMixin):
         self.config = config
 
 
-    def log(self, module=None, delta_ratio=True, trainable_ratio=True, visualization=True):
+    def log(self, module=None, delta_ratio=True, trainable_ratio=True, visualization=True, cuda_memory=True):
         r"""Log and visualize the result of applying delta.
         Possible Options are ``trainable_ratio``,
         ``visualization``, ``delta_ratio``.
@@ -658,6 +658,15 @@ class DeltaBase(nn.Module, SaveLoadMixin):
             n_delta = self.num_delta_parameters(module)
             n_total = self.num_total_parameters(module)
             logger.info("Delta Parameter Ratio: {:2f}%".format(n_delta/n_total*100))
+        if cuda_memory:
+            cudamem = 0
+            maxcudamem = 0
+            for device_id in range(torch.cuda.device_count()):
+                cudamem += torch.cuda.memory_allocated(f"cuda:{device_id}")/1024**3
+                maxcudamem += torch.cuda.max_memory_allocated(f"cuda:{device_id}")/1024**3
+            logger.info("Static Memory {:.2f} GB, Max Memory {:.2f} GB".format(cudamem, maxcudamem))
+
+
     def num_delta_parameters(self, module: Optional[nn.Module]=None):
         r"""[NODOC]
         A small sugar function to get the number of trainable parameter in the backbone model. Often used to
diff --git a/opendelta/utils/logging.py b/opendelta/utils/logging.py
index 727232d..2211c6f 100644
--- a/opendelta/utils/logging.py
+++ b/opendelta/utils/logging.py
@@ -113,18 +113,6 @@ def get_log_levels_dict():
     return log_levels
 
 
-def get_logger(name: Optional[str] = None) -> logging.Logger:
-    """
-    Return a logger with the specified name.
-    This function is not supposed to be directly accessed unless you are writing a custom transformers module.
-    """
-
-    if name is None:
-        name = _get_library_name()
-
-    _configure_library_root_logger()
-    return logging.getLogger(name)
-
 
 def get_verbosity() -> int:
     """
@@ -275,4 +263,17 @@ def warning_advice(self, *args, **kwargs):
 
 
 logging.Logger.warning_advice = warning_advice
-set_verbosity_debug()
\ No newline at end of file
+
+def get_logger(name: Optional[str] = None, verbosity='info') -> logging.Logger:
+    """
+    Return a logger with the specified name.
+    This function is not supposed to be directly accessed unless you are writing a custom transformers module.
+    """
+
+    if name is None:
+        name = _get_library_name()
+
+    _configure_library_root_logger()
+    logger = logging.getLogger(name)
+    logger.setLevel(log_levels[verbosity])
+    return logger
diff --git a/requirements.txt b/requirements.txt
index 9fee7b5..a1e2702 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
-torch>=1.9.0
-transformers==4.10.0
-datasets==1.17.0
-sentencepiece==0.1.96
-tqdm==4.62.2
+torch>=1.8.0
+transformers>=4.10.0
+datasets>=1.17.0
+sentencepiece>=0.1.96
+tqdm>=4.62.2
 loralib
 decorator
 rich
diff --git a/setup.py b/setup.py
index a10b156..5007a31 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@ print(requires)
 with open('README.md', 'r') as f:
     setuptools.setup(
         name = 'opendelta',
-        version = '0.0.1',
+        version = '0.0.2',
         description = "An open source framework for delta learning (parameter efficient learning).",
         long_description=open("README.md", "r", encoding="utf-8").read(),
         long_description_content_type="text/markdown",
@@ -26,7 +26,7 @@ with open('README.md', 'r') as f:
         license="Apache",
         url="https://github.com/thunlp/OpenDelta",
         keywords = ['PLM', 'Parameter-efficient-Learning', 'AI', 'NLP'],
-        python_requires=">=3.8.0",
+        python_requires=">=3.6.0",
         install_requires=requires,
         packages=setuptools.find_packages(),
         classifiers=[
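A minimal usage sketch (not part of the patch) of the two API changes above: the new verbosity argument on get_logger() and the new cuda_memory flag on DeltaBase.log(). The LoraModel delta and the "t5-base" checkpoint are illustrative assumptions; any DeltaBase subclass and backbone should behave the same way.

    # Sketch only, assuming the patched opendelta.utils.logging.get_logger and
    # DeltaBase.log shown above; LoraModel and "t5-base" are illustrative choices.
    from transformers import AutoModelForSeq2SeqLM
    from opendelta import LoraModel
    from opendelta.utils.logging import get_logger

    # New: per-logger verbosity instead of the removed global set_verbosity_debug() call.
    logger = get_logger(__name__, verbosity="info")

    backbone = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
    delta = LoraModel(backbone)
    delta.freeze_module(exclude=["deltas"])

    # New: cuda_memory=True additionally logs allocated and peak CUDA memory in GB,
    # summed over all visible devices, alongside the parameter ratios.
    delta.log(delta_ratio=True, trainable_ratio=True, visualization=False, cuda_memory=True)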