From b7ca6c8dc14f689d0df16684a6121cc0ec24f8ba Mon Sep 17 00:00:00 2001 From: hiyouga <467089858@qq.com> Date: Mon, 5 Aug 2024 23:48:19 +0800 Subject: [PATCH] fix #5048 --- README.md | 16 ++-- README_zh.md | 18 ++--- requirements.txt | 10 +-- src/llamafactory/__init__.py | 16 ++-- src/llamafactory/extras/constants.py | 8 +- src/llamafactory/extras/misc.py | 10 +-- src/llamafactory/extras/packages.py | 5 ++ .../model/model_utils/attention.py | 2 +- .../model/model_utils/longlora.py | 75 +++++++++++++------ src/llamafactory/model/model_utils/packing.py | 12 ++- src/llamafactory/train/callbacks.py | 5 +- src/llamafactory/webui/interface.py | 2 +- src/llamafactory/webui/locales.py | 1 - 13 files changed, 111 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index 87b0af7c..386177bb 100644 --- a/README.md +++ b/README.md @@ -300,20 +300,20 @@ huggingface-cli login | Mandatory | Minimum | Recommend | | ------------ | ------- | --------- | | python | 3.8 | 3.11 | -| torch | 1.13.1 | 2.3.0 | -| transformers | 4.41.2 | 4.41.2 | -| datasets | 2.16.0 | 2.19.2 | -| accelerate | 0.30.1 | 0.30.1 | -| peft | 0.11.1 | 0.11.1 | -| trl | 0.8.6 | 0.9.4 | +| torch | 1.13.1 | 2.4.0 | +| transformers | 4.41.2 | 4.43.4 | +| datasets | 2.16.0 | 2.20.0 | +| accelerate | 0.30.1 | 0.32.0 | +| peft | 0.11.1 | 0.12.0 | +| trl | 0.8.6 | 0.9.6 | | Optional | Minimum | Recommend | | ------------ | ------- | --------- | | CUDA | 11.6 | 12.2 | | deepspeed | 0.10.0 | 0.14.0 | | bitsandbytes | 0.39.0 | 0.43.1 | -| vllm | 0.4.3 | 0.4.3 | -| flash-attn | 2.3.0 | 2.5.9 | +| vllm | 0.4.3 | 0.5.0 | +| flash-attn | 2.3.0 | 2.6.3 | ### Hardware Requirement diff --git a/README_zh.md b/README_zh.md index 3a7724d1..812b7b28 100644 --- a/README_zh.md +++ b/README_zh.md @@ -166,8 +166,8 @@ https://github.com/user-attachments/assets/e6ce34b0-52d5-4f3e-a830-592106c4c272 | [Llama 2](https://huggingface.co/meta-llama) | 7B/13B/70B | llama2 | | [Llama 3/Llama 3.1](https://huggingface.co/meta-llama) | 8B/70B | llama3 | | [LLaVA-1.5](https://huggingface.co/llava-hf) | 7B/13B | vicuna | -| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | | [MiniCPM](https://huggingface.co/openbmb) | 1B/2B | cpm | +| [Mistral/Mixtral](https://huggingface.co/mistralai) | 7B/8x7B/8x22B | mistral | | [OLMo](https://huggingface.co/allenai) | 1B/7B | - | | [PaliGemma](https://huggingface.co/google) | 3B | gemma | | [Phi-1.5/Phi-2](https://huggingface.co/microsoft) | 1.3B/2.7B | - | @@ -300,20 +300,20 @@ huggingface-cli login | 必需项 | 至少 | 推荐 | | ------------ | ------- | --------- | | python | 3.8 | 3.11 | -| torch | 1.13.1 | 2.3.0 | -| transformers | 4.41.2 | 4.41.2 | -| datasets | 2.16.0 | 2.19.2 | -| accelerate | 0.30.1 | 0.30.1 | -| peft | 0.11.1 | 0.11.1 | -| trl | 0.8.6 | 0.9.4 | +| torch | 1.13.1 | 2.4.0 | +| transformers | 4.41.2 | 4.43.4 | +| datasets | 2.16.0 | 2.20.0 | +| accelerate | 0.30.1 | 0.32.0 | +| peft | 0.11.1 | 0.12.0 | +| trl | 0.8.6 | 0.9.6 | | 可选项 | 至少 | 推荐 | | ------------ | ------- | --------- | | CUDA | 11.6 | 12.2 | | deepspeed | 0.10.0 | 0.14.0 | | bitsandbytes | 0.39.0 | 0.43.1 | -| vllm | 0.4.3 | 0.4.3 | -| flash-attn | 2.3.0 | 2.5.9 | +| vllm | 0.4.3 | 0.5.0 | +| flash-attn | 2.3.0 | 2.6.3 | ### 硬件依赖 diff --git a/requirements.txt b/requirements.txt index 7380add4..93e83530 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ -transformers>=4.41.2 -datasets>=2.16.0 -accelerate>=0.30.1 -peft>=0.11.1 -trl>=0.8.6 +transformers>=4.41.2,<=4.43.4 +datasets>=2.16.0,<=2.20.0 +accelerate>=0.30.1,<=0.32.0 +peft>=0.11.1,<=0.12.0 +trl>=0.8.6,<=0.9.6 gradio>=4.0.0 pandas>=2.0.0 scipy diff --git a/src/llamafactory/__init__.py b/src/llamafactory/__init__.py index 28f5144a..7b602a92 100644 --- a/src/llamafactory/__init__.py +++ b/src/llamafactory/__init__.py @@ -20,19 +20,17 @@ Level: Dependency graph: main: - transformers>=4.41.2 - datasets>=2.16.0 - accelerate>=0.30.1 - peft>=0.11.1 - trl>=0.8.6 + transformers>=4.41.2,<=4.43.4 + datasets>=2.16.0,<=2.20.0 + accelerate>=0.30.1,<=0.32.0 + peft>=0.11.1,<=0.12.0 + trl>=0.8.6,<=0.9.6 attention: transformers>=4.42.4 (gemma+fa2) longlora: - transformers>=4.41.2,<=4.42.4 + transformers>=4.41.2,<=4.43.4 packing: - transformers>=4.41.2,<=4.42.4 - patcher: - transformers==4.41.2 (chatglm) + transformers>=4.41.2,<=4.43.4 """ from .cli import VERSION diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index c413c51d..4531db4a 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -535,10 +535,6 @@ register_model_group( DownloadSource.DEFAULT: "google/gemma-2-2b", DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-2b", }, - "Gemma-2-2B-Chat": { - DownloadSource.DEFAULT: "google/gemma-2-2b-it", - DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-2b-it", - }, "Gemma-2-9B": { DownloadSource.DEFAULT: "google/gemma-2-9b", DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-9b", @@ -547,6 +543,10 @@ register_model_group( DownloadSource.DEFAULT: "google/gemma-2-27b", DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-27b", }, + "Gemma-2-2B-Chat": { + DownloadSource.DEFAULT: "google/gemma-2-2b-it", + DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-2b-it", + }, "Gemma-2-9B-Chat": { DownloadSource.DEFAULT: "google/gemma-2-9b-it", DownloadSource.MODELSCOPE: "LLM-Research/gemma-2-9b-it", diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py index d7329b06..c1395552 100644 --- a/src/llamafactory/extras/misc.py +++ b/src/llamafactory/extras/misc.py @@ -79,11 +79,11 @@ def check_dependencies() -> None: if os.environ.get("DISABLE_VERSION_CHECK", "0").lower() in ["true", "1"]: logger.warning("Version checking has been disabled, may lead to unexpected behaviors.") else: - require_version("transformers>=4.41.2", "To fix: pip install transformers>=4.41.2") - require_version("datasets>=2.16.0", "To fix: pip install datasets>=2.16.0") - require_version("accelerate>=0.30.1", "To fix: pip install accelerate>=0.30.1") - require_version("peft>=0.11.1", "To fix: pip install peft>=0.11.1") - require_version("trl>=0.8.6", "To fix: pip install trl>=0.8.6") + require_version("transformers>=4.41.2,<=4.43.4", "To fix: pip install transformers>=4.41.2,<=4.43.4") + require_version("datasets>=2.16.0,<=2.20.0", "To fix: pip install datasets>=2.16.0,<=2.20.0") + require_version("accelerate>=0.30.1,<=0.32.0", "To fix: pip install accelerate>=0.30.1,<=0.32.0") + require_version("peft>=0.11.1,<=0.12.0", "To fix: pip install peft>=0.11.1,<=0.12.0") + require_version("trl>=0.8.6,<=0.9.6", "To fix: pip install trl>=0.8.6,<=0.9.6") def count_parameters(model: "torch.nn.Module") -> Tuple[int, int]: diff --git a/src/llamafactory/extras/packages.py b/src/llamafactory/extras/packages.py index a9072103..ae270d1b 100644 --- a/src/llamafactory/extras/packages.py +++ b/src/llamafactory/extras/packages.py @@ -70,6 +70,11 @@ def is_starlette_available(): return _is_package_available("sse_starlette") +@lru_cache +def is_transformers_version_greater_than_4_43(): + return _get_package_version("transformers") >= version.parse("4.43.0") + + def is_uvicorn_available(): return _is_package_available("uvicorn") diff --git a/src/llamafactory/model/model_utils/attention.py b/src/llamafactory/model/model_utils/attention.py index da53baa2..96e2c8a9 100644 --- a/src/llamafactory/model/model_utils/attention.py +++ b/src/llamafactory/model/model_utils/attention.py @@ -36,7 +36,7 @@ def configure_attn_implementation( if model_args.flash_attn == "auto" or model_args.flash_attn == "fa2": if is_flash_attn_2_available(): require_version("transformers>=4.42.4", "To fix: pip install transformers>=4.42.4") - require_version("flash_attn>=2.6.0", "To fix: pip install flash_attn>=2.6.0") + require_version("flash_attn>=2.6.3", "To fix: pip install flash_attn>=2.6.3") logger.warning("Gemma-2 should use flash attention 2, change `flash_attn` to fa2.") model_args.flash_attn = "fa2" else: diff --git a/src/llamafactory/model/model_utils/longlora.py b/src/llamafactory/model/model_utils/longlora.py index 53570a16..e518aefb 100644 --- a/src/llamafactory/model/model_utils/longlora.py +++ b/src/llamafactory/model/model_utils/longlora.py @@ -35,6 +35,7 @@ from transformers.utils.versions import require_version from ...extras.constants import SUPPORTED_CLASS_FOR_S2ATTN from ...extras.logging import get_logger +from ...extras.packages import is_transformers_version_greater_than_4_43 if TYPE_CHECKING: @@ -50,14 +51,15 @@ transformers_logger = logging.get_logger(__name__) # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py def llama_attention_forward( self: "LlamaAttention", - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, + hidden_states: "torch.Tensor", + attention_mask: Optional["torch.Tensor"] = None, + position_ids: Optional["torch.LongTensor"] = None, past_key_value: Optional["Cache"] = None, output_attentions: bool = False, - cache_position: Optional[torch.LongTensor] = None, + cache_position: Optional["torch.LongTensor"] = None, + position_embeddings: Optional[Tuple["torch.Tensor", "torch.Tensor"]] = None, **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: +) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]: bsz, q_len, _ = hidden_states.size() query_states: "torch.Tensor" = self.q_proj(hidden_states) @@ -68,7 +70,11 @@ def llama_attention_forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -130,14 +136,15 @@ def llama_attention_forward( # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py def llama_flash_attention_2_forward( self: "LlamaFlashAttention2", - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, + hidden_states: "torch.Tensor", + attention_mask: Optional["torch.Tensor"] = None, + position_ids: Optional["torch.LongTensor"] = None, past_key_value: Optional["Cache"] = None, output_attentions: bool = False, - cache_position: Optional[torch.LongTensor] = None, + cache_position: Optional["torch.LongTensor"] = None, + position_embeddings: Optional[Tuple["torch.Tensor", "torch.Tensor"]] = None, **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: +) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]: # LlamaFlashAttention2 attention does not support output_attentions output_attentions = False @@ -151,7 +158,11 @@ def llama_flash_attention_2_forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -198,9 +209,24 @@ def llama_flash_attention_2_forward( if attention_mask is not None: attention_mask = attention_mask[:, :groupsz].repeat(num_groups, 1) - attn_output: "torch.Tensor" = self._flash_attention_forward( - query_states, key_states, value_states, attention_mask, query_states.size(1), dropout=dropout_rate - ) + if is_transformers_version_greater_than_4_43(): + from transformers.modeling_flash_attention_utils import _flash_attention_forward + + attn_output: "torch.Tensor" = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + query_states.size(1), + dropout=dropout_rate, + sliding_window=getattr(self, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + ) + else: + attn_output: "torch.Tensor" = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, query_states.size(1), dropout=dropout_rate + ) if getattr(self.config, "group_size_ratio", None) and self.training: # shift back attn_output.reshape(bsz, q_len, self.num_heads, self.head_dim) @@ -225,14 +251,15 @@ def llama_flash_attention_2_forward( # https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llama/modeling_llama.py def llama_sdpa_attention_forward( self: "LlamaSdpaAttention", - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, + hidden_states: "torch.Tensor", + attention_mask: Optional["torch.Tensor"] = None, + position_ids: Optional["torch.LongTensor"] = None, past_key_value: Optional["Cache"] = None, output_attentions: bool = False, - cache_position: Optional[torch.LongTensor] = None, + cache_position: Optional["torch.LongTensor"] = None, + position_embeddings: Optional[Tuple["torch.Tensor", "torch.Tensor"]] = None, **kwargs, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: +) -> Tuple["torch.Tensor", Optional["torch.Tensor"], Optional[Tuple["torch.Tensor"]]]: if output_attentions: transformers_logger.warning_once( "SDPA does not support `output_attentions=True`. Falling back to the vanilla attention" @@ -258,7 +285,11 @@ def llama_sdpa_attention_forward( key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) - cos, sin = self.rotary_emb(value_states, position_ids) + if position_embeddings is None: + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) if past_key_value is not None: @@ -322,7 +353,7 @@ def llama_sdpa_attention_forward( def _apply_llama_patch() -> None: - require_version("transformers>=4.41.2,<=4.42.4", "To fix: pip install transformers>=4.41.2,<=4.42.4") + require_version("transformers>=4.41.2,<=4.43.4", "To fix: pip install transformers>=4.41.2,<=4.43.4") LlamaAttention.forward = llama_attention_forward LlamaFlashAttention2.forward = llama_flash_attention_2_forward LlamaSdpaAttention.forward = llama_sdpa_attention_forward diff --git a/src/llamafactory/model/model_utils/packing.py b/src/llamafactory/model/model_utils/packing.py index 674e0b4a..ded7f295 100644 --- a/src/llamafactory/model/model_utils/packing.py +++ b/src/llamafactory/model/model_utils/packing.py @@ -41,11 +41,11 @@ from typing import TYPE_CHECKING, Tuple import torch import torch.nn.functional as F -import transformers.models from transformers.utils.versions import require_version from ...extras.constants import SUPPORTED_CLASS_FOR_BLOCK_DIAG_ATTN from ...extras.logging import get_logger +from ...extras.packages import is_transformers_version_greater_than_4_43 if TYPE_CHECKING: @@ -114,7 +114,15 @@ def get_unpad_data(attention_mask: "torch.Tensor") -> Tuple["torch.Tensor", "tor def _patch_for_block_diag_attn(model_type: str) -> None: - require_version("transformers>=4.41.2,<=4.42.4", "To fix: pip install transformers>=4.41.2,<=4.42.4") + require_version("transformers>=4.41.2,<=4.43.4", "To fix: pip install transformers>=4.41.2,<=4.43.4") + if is_transformers_version_greater_than_4_43(): + import transformers.modeling_flash_attention_utils + + transformers.modeling_flash_attention_utils._get_unpad_data = get_unpad_data + return + + import transformers.models + if model_type == "cohere": transformers.models.cohere.modeling_cohere._get_unpad_data = get_unpad_data elif model_type == "falcon": diff --git a/src/llamafactory/train/callbacks.py b/src/llamafactory/train/callbacks.py index 657dd8f3..3b05317d 100644 --- a/src/llamafactory/train/callbacks.py +++ b/src/llamafactory/train/callbacks.py @@ -162,11 +162,12 @@ class PissaConvertCallback(TrainerCallback): setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights) model.save_pretrained( pissa_convert_dir, safe_serialization=args.save_safetensors, convert_pissa_to_lora=pissa_init_dir - ) + ) # TODO: use `path_initial_model_for_weight_conversion` (peft>=0.12.0) model.load_adapter(pissa_backup_dir, "default", is_trainable=True) model.set_adapter("default") - if "pissa_init" in model.peft_config.keys(): + if "pissa_init" in model.peft_config.keys(): # backward compatibility (peft<0.12.0) model.delete_adapter("pissa_init") + setattr(model.peft_config["default"], "init_lora_weights", init_lora_weights) diff --git a/src/llamafactory/webui/interface.py b/src/llamafactory/webui/interface.py index 1ca152c4..0ea37787 100644 --- a/src/llamafactory/webui/interface.py +++ b/src/llamafactory/webui/interface.py @@ -71,7 +71,7 @@ def create_web_demo() -> "gr.Blocks": engine = Engine(pure_chat=True) with gr.Blocks(title="Web Demo", css=CSS) as demo: - lang = gr.Dropdown(choices=["en", "zh"]) + lang = gr.Dropdown(choices=["en", "ru", "zh", "ko"], scale=1) engine.manager.add_elems("top", dict(lang=lang)) _, _, chat_elems = create_chat_box(engine, visible=True) diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py index 01d6fe29..0a8ca68a 100644 --- a/src/llamafactory/webui/locales.py +++ b/src/llamafactory/webui/locales.py @@ -362,7 +362,6 @@ LOCALES = { "label": "학습률", "info": "AdamW의 초기 학습률.", }, - }, "num_train_epochs": { "en": {