# OpenDeltaMirror/opendelta/delta_configs.py

import os
import re
from typing import Union, Dict, Any, Tuple, Optional
from opendelta import __version__ as opendelta_version
from opendelta.utils import logging
from opendelta.utils.signature import get_arg_names, get_arg_names_inside_func
import transformers
from transformers.file_utils import (
PushToHubMixin,
is_offline_mode,
cached_path,
is_remote_url,
get_list_of_files,
hf_bucket_url,
)
from packaging import version
import json
import copy
CONFIG_NAME = "config.json"
transformers_version = transformers.__version__
checked_package_versions = ["transformers_version", "opendelta_version"]
logger = logging.get_logger(__name__)
FULL_CONFIGURATION_FILE = "config.json"
_re_configuration_file = re.compile(r"config\.(.*)\.json")
class BaseDeltaConfig(PushToHubMixin):
r"""Base class for all configuration classes. Handles a few
parameters common to all delta models' configurations as well as methods for loading/downloading/saving configurations.
Class attributes (overridden by derived classes):
- **delta_type** (:obj:`str`) -- the name of the delta modules, used to create the correct :py:class:`~opendelta.AutoConfig`.
Args:
modified_modules (:obj:`List[str]`, *optional*, defaults to :obj:`None`):
The list of keys that determines which modules you want to modify. OpenDelta will take every module that
**ends with** one of the provided keys as a modification target. When no value is given, i.e.
``modified_modules=None``, the delta module will use its corresponding default modified modules.
Taking DistilBertModel with a classifier on top as an example:
.. note::
**Examples**: When adding delta to DistilBertModel,
1. setting it to ``["0.attention.out_lin"]`` will add delta modules to the attention output of distilbert's
layer 0, i.e., ``distilbert.transformer.layer.0.attention.out_lin``.
2. setting it to ``["attention.out_lin"]`` will add delta modules to every layer's ``attention.out_lin``.
exclude_modules (:obj:`List[str]`, *optional*, defaults to :obj:`None`): Modules whose names start with one of these
strings will be excluded from modification. Note that currently only plain text (no regular expression) is supported.
unfrozen_modules (:obj:`List[str]`, *optional*, defaults to :obj:`["deltas"]`): The modules that are unfrozen
during training, including the ones that are newly introduced as delta modules and the ones that are
originally a part of the model but set to trainable (:obj:`requires_grad=True`) so that they train together with the
delta modules. OpenDelta will treat every module that **ends with** one of the provided keys, together with all
its sub-modules and parameters, as trainable.
.. note::
**Examples**: When adding delta to DistilBertModel,
1. setting this argument to ``["bias"]`` will make all bias terms tunable.
2. setting this argument to ``["attention"]`` will make all parameters in all attention modules tunable.
3. setting this argument to ``["deltas"]`` will make all the parameters in the newly introduced delta
modules tunable.
4. setting this argument to ``["classifier"]`` will make all parameters in the classifier tunable.
5. setting this argument to ``["3.ffn.lin2", "deltas", "classifier"]`` will make all parameters in
layer 3's feed-forward network's second linear layer, the delta modules, and the classifier module
tunable.
common_structure (:obj:`bool`, *optional*, defaults to :obj:`False`): Whether to use the common structure mapping of
the transformer model when designating :obj:`modified_modules` and :obj:`unfrozen_modules`.
backbone_class (:obj:`str`, *optional*, defaults to :obj:`None`): The name of the backbone model's class, e.g.
``RobertaForMaskedLM``. Saving this information lets the user explicitly know on which backbone the
delta model was trained.
backbone_checkpoint_name (:obj:`str`, *optional*, defaults to :obj:`None`): The specific checkpoint of the model.
Ideally, it should be a URL from which the checkpoint can be downloaded. However, we do not force the user to
specify a downloadable URL here.
backbone_hash (:obj:`str`, *optional*, defaults to :obj:`None`): The MD5 hash of the backbone model. It is
calculated from the string representation of the model and the sequential expansion of all the
parameters in the model. When loading a delta checkpoint in strict mode, the hash of the backbone model
will be compared to the hash stored in this config.
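A minimal usage sketch (``LoraConfig`` is used purely as a representative subclass here; any
:obj:`BaseDeltaConfig` subclass accepts the same common arguments):
.. code-block:: python
    from opendelta import LoraConfig  # assumes LoraConfig is exported at the package top level
    delta_config = LoraConfig(
        modified_modules=["attention.out_lin"],     # add deltas to every module ending with this key
        unfrozen_modules=["deltas", "classifier"],  # train the delta parameters plus the classifier head
    )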
"""
delta_type: str = ""
def __init__(self,
modified_modules = None,
exclude_modules = None,
unfrozen_modules = ["deltas"],
common_structure=False,
backbone_class = None,
backbone_checkpoint_name = None,
backbone_hash = None,
):
arg_names = get_arg_names(BaseDeltaConfig.__init__)
for arg_name in arg_names:
setattr(self, arg_name, locals()[arg_name])
@classmethod
def from_finetuned(cls, finetuned_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "BaseDeltaConfig":
r"""
Instantiate a :obj:`BaseDeltaConfig` (or a derived class) from a finetuned delta module configuration.
Args:
finetuned_model_name_or_path (:obj:`str` or :obj:`os.PathLike`): This can be either:
* a string, the *model id* of a finetuned delta model configuration hosted inside a model repo on
deltahub.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
* a path to a *directory* containing a configuration file saved using the :meth:`BaseDeltaConfig.save_finetuned` method, e.g., ``./my_model_directory/``.
* a path or url to a saved configuration JSON *file*, e.g., ``./my_model_directory/configuration.json``.
cache_dir (:obj:`str` or :obj:`os.PathLike`, *optional*):
Path to a directory in which a downloaded pretrained delta model configuration should be cached if the
standard cache should not be used.
.. code-block:: python
delta_config = LoraConfig.from_finetuned("DeltaHub/lora_t5-base_mrpc")
"""
config_dict, kwargs = cls.get_config_dict(finetuned_model_name_or_path, **kwargs)
if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
logger.warning(
f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
)
return cls.from_dict(config_dict, **kwargs)
def save_finetuned(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
"""
Save a configuration object to the directory :obj:`save_directory`, so that it can be re-loaded using the
:meth:`BaseDeltaConfig.from_finetuned` class method.
Args:
save_directory (:obj:`str` or :obj:`os.PathLike`): Directory where the configuration JSON file
will be saved (will be created if it does not exist).
push_to_hub (:obj:`bool`, *optional*, defaults to :obj:`False`): Whether or not to push your model to
the Hugging Face model hub after saving it.
.. warning::
1. Will raise an error if you have not configured a Hugging Face Model Hub account.
2. Using ``push_to_hub=True`` will synchronize the repository you are pushing to with ``save_directory``,
which requires ``save_directory`` to be a local clone of the repo you are pushing to if it's an existing
folder. Pass along ``temp_dir=True`` to use a temporary directory instead.
kwargs:
Additional keyword arguments passed along to the
`PushToHubMixin.push_to_hub <https://huggingface.co/docs/transformers/master/main_classes/model#transformers.file_utils.PushToHubMixin.push_to_hub>`_ method.
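A minimal save / re-load sketch (the directory name is illustrative; ``delta_config`` stands for any
:obj:`BaseDeltaConfig` subclass instance such as a ``LoraConfig``):
.. code-block:: python
    delta_config.save_finetuned("./my_delta_config/")
    delta_config = LoraConfig.from_finetuned("./my_delta_config/")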
"""
if os.path.isfile(save_directory):
raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
if push_to_hub:
commit_message = kwargs.pop("commit_message", None)
repo = self._create_or_get_repo(save_directory, **kwargs)
os.makedirs(save_directory, exist_ok=True)
# If we save using the predefined names, we can load using `from_pretrained`
output_config_file = os.path.join(save_directory, CONFIG_NAME)
self.to_json_file(output_config_file, use_diff=True)
logger.info(f"Configuration saved in {output_config_file}")
if push_to_hub:
url = self._push_to_hub(repo, commit_message=commit_message)
logger.info(f"Configuration pushed to the hub in this commit: {url}")
@classmethod
def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "BaseDeltaConfig":
r"""
Instantiate a :obj:`BaseDeltaConfig` from a python dictionary of parameters.
Args:
config_dict (:obj:`Dict[str, Any]`):
Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
retrieved from a finetuned delta checkpoint by leveraging the :py:meth:`~BaseDeltaConfig.get_config_dict` method.
kwargs (:obj:`Dict[str, Any]`):
Additional parameters from which to initialize the configuration object.
Returns:
:obj:`BaseDeltaConfig`: The configuration object instantiated from those parameters.
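A minimal sketch (the dictionary values are illustrative, and ``LoraConfig`` is a representative subclass):
.. code-block:: python
    delta_config = LoraConfig.from_dict({"modified_modules": ["attention.out_lin"]})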
"""
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
accept_args = get_arg_names(cls.__init__) + get_arg_names(BaseDeltaConfig.__init__)
unused_config_keys = []
for config_key in list(config_dict.keys()):
if config_key not in accept_args:
config_dict.pop(config_key)
unused_config_keys.append(config_key)
logger.warning(f"The following keys are not used by {cls}.__init__ function: {unused_config_keys}")
config = cls(**config_dict)
# Update config with kwargs if needed
to_remove = []
for key, value in kwargs.items():
if hasattr(config, key):
setattr(config, key, value)
if key != "torch_dtype":
to_remove.append(key)
for key in to_remove:
kwargs.pop(key, None)
logger.info(f"Model config {config}")
if return_unused_kwargs:
return config, kwargs
else:
return config
@classmethod
def get_config_dict(
cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
"""[NODOC]
From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters to be used for instantiating a
:obj:`BaseDeltaConfig` using :meth:`from_dict`.
Parameters:
pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
Returns:
:obj:`Tuple[Dict, Dict]`: The dictionary that will be used to instantiate the configuration object, and the remaining keyword arguments.
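For example (the repository id is the one used elsewhere in this file's examples):
.. code-block:: python
    config_dict, remaining_kwargs = BaseDeltaConfig.get_config_dict("DeltaHub/lora_t5-base_mrpc")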
"""
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
resume_download = kwargs.pop("resume_download", False)
proxies = kwargs.pop("proxies", None)
use_auth_token = kwargs.pop("use_auth_token", None)
local_files_only = kwargs.pop("local_files_only", False)
revision = kwargs.pop("revision", None)
# from_pipeline = kwargs.pop("_from_pipeline", None)
from_auto_class = kwargs.pop("_from_auto", False)
user_agent = {"file_type": "config", "from_auto_class": from_auto_class}
# if from_pipeline is not None:
# user_agent["using_pipeline"] = from_pipeline
if is_offline_mode() and not local_files_only:
logger.info("Offline mode: forcing local_files_only=True")
local_files_only = True
pretrained_model_name_or_path = str(pretrained_model_name_or_path)
if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
config_file = pretrained_model_name_or_path
else:
configuration_file = get_configuration_file(
pretrained_model_name_or_path,
revision=revision,
use_auth_token=use_auth_token,
local_files_only=local_files_only,
)
if os.path.isdir(pretrained_model_name_or_path):
config_file = os.path.join(pretrained_model_name_or_path, configuration_file)
else:
config_file = hf_bucket_url(
pretrained_model_name_or_path, filename=configuration_file, revision=revision, mirror=None
)
try:
# Load from URL or cache if already cached
resolved_config_file = cached_path(
config_file,
cache_dir=cache_dir,
force_download=force_download,
proxies=proxies,
resume_download=resume_download,
local_files_only=local_files_only,
use_auth_token=use_auth_token,
user_agent=user_agent,
)
# Load config dict
config_dict = cls._dict_from_json_file(resolved_config_file)
except EnvironmentError as err:
logger.error(err)
msg = (
f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n"
f" (make sure '{pretrained_model_name_or_path}' is not a path to a local directory with something else, in that case)\n\n"
f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n"
)
if revision is not None:
msg += f"- or '{revision}' is a valid git identifier (branch name, a tag name, or a commit id) that exists for this model name as listed on its model page on 'https://huggingface.co/models'\n\n"
raise EnvironmentError(msg)
except (json.JSONDecodeError, UnicodeDecodeError):
msg = (
f"Couldn't reach server at '{config_file}' to download configuration file or "
"configuration file is not a valid JSON file. "
f"Please check network or file content here: {resolved_config_file}."
)
raise EnvironmentError(msg)
if resolved_config_file == config_file:
logger.info(f"loading configuration file {config_file}")
else:
logger.info(f"loading configuration file {config_file} from cache at {resolved_config_file}")
return config_dict, kwargs
@classmethod
def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]):
with open(json_file, "r", encoding="utf-8") as reader:
text = reader.read()
return json.loads(text)
def __repr__(self):
return f"{self.__class__.__name__} {self.to_json_string()}"
def __eq__(self, other):
return self.__dict__ == other.__dict__
def to_json_string(self, use_diff: bool = True) -> str:
"""[NODOC]
Serializes this instance to a JSON string.
Args:
use_diff (:obj:`bool`, *optional*, defaults to :obj:`True`):
If set to :obj:`True`, only the difference between the config instance and the default :obj:`BaseDeltaConfig`
is serialized to a JSON string.
Returns:
:obj:`str`: String containing all the attributes that make up this configuration instance in JSON format.
"""
if use_diff is True:
config_dict = self.to_diff_dict()
else:
config_dict = self.to_dict()
return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
def to_json_file(self, json_file_path: Union[str, os.PathLike], use_diff: bool = True):
"""[NODOC]
Save this instance to a JSON file.
Args:
json_file_path (:obj:`str` or :obj:`os.PathLike`):
Path to the JSON file in which this configuration instance's parameters will be saved.
use_diff (:obj:`bool`, *optional*, defaults to :obj:`True`):
If set to :obj:`True`, only the difference between the config instance and the default :obj:`BaseDeltaConfig`
is serialized to the JSON file.
"""
with open(json_file_path, "w", encoding="utf-8") as writer:
writer.write(self.to_json_string(use_diff=use_diff))
def to_diff_dict(self) -> Dict[str, Any]:
"""[NODOC]
Removes all attributes from config which correspond to the default config attributes for better readability and
serializes to a Python dictionary.
Returns:
:obj:`Dict[str, Any]`: Dictionary of the attributes that make up this configuration instance, keeping only
values that differ from the base configuration defaults (plus the recorded package versions).
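For example (illustrative values; ``LoraConfig`` is a representative subclass):
.. code-block:: python
    cfg = LoraConfig(modified_modules=["attention.out_lin"])
    diff = cfg.to_diff_dict()
    # "modified_modules" is kept because it differs from the BaseDeltaConfig default;
    # attributes still equal to their defaults, e.g. "backbone_class", are omitted.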
"""
config_dict = self.to_dict()
# get the default config dict
default_config_dict = BaseDeltaConfig().to_dict()
# get class specific config dict
class_config_dict = self.__class__().to_dict() #if not self.is_composition else {}
serializable_config_dict = {}
# only serialize values that differ from the default config
for key, value in config_dict.items():
if (
key not in default_config_dict
or key in checked_package_versions
or value != default_config_dict[key]
or (key in class_config_dict and value != class_config_dict[key])
):
serializable_config_dict[key] = value
self.dict_torch_dtype_to_str(serializable_config_dict)
return serializable_config_dict
def update(self, config_dict: Dict[str, Any]):
"""[NODOC]
Updates attributes of this class with attributes from ``config_dict``.
Args:
config_dict (:obj:`Dict[str, Any]`): Dictionary of attributes that should be updated for this class.
"""
for key, value in config_dict.items():
setattr(self, key, value)
def to_dict(self) -> Dict[str, Any]:
"""
Serializes this instance to a Python dictionary.
Returns:
:obj:`dict`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
if hasattr(self.__class__, "model_type"):
output["model_type"] = self.__class__.model_type
# Transformers version when serializing the model
output["transformers_version"] = transformers_version
output["opendelta_version"] = opendelta_version
self.dict_torch_dtype_to_str(output)
return output
def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None:
"""[NODOC]
Checks whether the passed dictionary has a *torch_dtype* key and, if it is not None, converts the torch.dtype to a
string containing just the type name. For example, ``torch.float32`` gets converted into the string *"float32"*, which
can then be stored in JSON format.
"""
if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str):
d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1]
def get_configuration_file(
path_or_repo: Union[str, os.PathLike],
revision: Optional[str] = None,
use_auth_token: Optional[Union[bool, str]] = None,
local_files_only: bool = False,
) -> str:
"""
Get the configuration file to use for this version of transformers.
Args:
path_or_repo (:obj:`str` or :obj:`os.PathLike`):
Can be either the id of a repo on huggingface.co or a path to a *directory*.
revision (:obj:`str`, *optional*, defaults to ``"main"``):
The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
identifier allowed by git.
use_auth_token (:obj:`str` or *bool*, *optional*):
The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token generated
when running ``transformers-cli login`` (stored in ``~/.huggingface``).
local_files_only (:obj:`bool`, *optional*, defaults to :obj:`False`):
Whether or not to only rely on local files and not to attempt to download any files.
Returns:
:obj:`str`: The configuration file to use.
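For example (repository id taken from the examples above):
.. code-block:: python
    config_file = get_configuration_file("DeltaHub/lora_t5-base_mrpc")  # typically "config.json"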
"""
# Inspect all files from the repo/folder.
all_files = get_list_of_files(
path_or_repo, revision=revision, use_auth_token=use_auth_token, local_files_only=local_files_only
)
configuration_files_map = {}
for file_name in all_files:
search = _re_configuration_file.search(file_name)
if search is not None:
v = search.groups()[0]
configuration_files_map[v] = os.path.split(file_name)[-1]
available_versions = sorted(configuration_files_map.keys())
# Defaults to FULL_CONFIGURATION_FILE; with the version check disabled below, the last (newest) available configuration file wins.
configuration_file = FULL_CONFIGURATION_FILE
# transformers_version_ = version.parse(transformers_version)
for v in available_versions:
# if version.parse(v) <= transformers_version_:
configuration_file = configuration_files_map[v]
# else:
# # No point going further since the versions are sorted.
# break
return configuration_file
if __name__ == "__main__":
myconfig = BaseDeltaConfig.from_finetuned("../ckpts/lora/")
myconfig.save_finetuned("../ckpts/lora.1/")
print(myconfig)