add tests

hiyouga 2024-06-15 19:51:20 +08:00
parent 572d8bbfdd
commit 1b834f50be
8 changed files with 166 additions and 14 deletions

View File

@@ -11,4 +11,4 @@ style:
	ruff format $(check_dirs)

test:
-	pytest tests/
+	CUDA_VISIBLE_DEVICES= pytest tests/
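
The only change here is prefixing pytest with CUDA_VISIBLE_DEVICES=, which hides all GPUs so that make test runs the suite on CPU. A minimal sketch (not part of the commit) of the same effect from Python; the assertion assumes CUDA has not been initialized earlier in the process:

import os

os.environ["CUDA_VISIBLE_DEVICES"] = ""  # equivalent to the Makefile prefix; must be set before CUDA is initialized

import torch

assert not torch.cuda.is_available()  # no visible devices, so the tests fall back to CPU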

View File

@@ -22,6 +22,7 @@ from transformers import InfNanRemoveLogitsProcessor, LogitsProcessorList, PreTr
from transformers.utils import (
    SAFE_WEIGHTS_NAME,
    WEIGHTS_NAME,
+    is_safetensors_available,
    is_torch_bf16_gpu_available,
    is_torch_cuda_available,
    is_torch_mps_available,

@@ -34,6 +35,11 @@ from .constants import V_HEAD_SAFE_WEIGHTS_NAME, V_HEAD_WEIGHTS_NAME
from .logging import get_logger

+if is_safetensors_available():
+    from safetensors import safe_open
+    from safetensors.torch import save_file

_is_fp16_available = is_torch_npu_available() or is_torch_cuda_available()
try:
    _is_bf16_available = is_torch_bf16_gpu_available()

@@ -128,9 +134,6 @@ def fix_valuehead_checkpoint(
        return

    if safe_serialization:
-        from safetensors import safe_open
-        from safetensors.torch import save_file
-
        path_to_checkpoint = os.path.join(output_dir, SAFE_WEIGHTS_NAME)
        with safe_open(path_to_checkpoint, framework="pt", device="cpu") as f:
            state_dict: Dict[str, torch.Tensor] = {key: f.get_tensor(key) for key in f.keys()}

View File

@@ -41,7 +41,7 @@ TRAIN_ARGS = {
}


-@pytest.mark.parametrize("num_samples", [10])
+@pytest.mark.parametrize("num_samples", [16])
def test_supervised(num_samples: int):
    model_args, data_args, training_args, _, _ = get_train_args(TRAIN_ARGS)
    tokenizer_module = load_tokenizer(model_args)

View File

@@ -0,0 +1,74 @@
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import torch

from llamafactory.extras.misc import get_current_device
from llamafactory.hparams import get_train_args
from llamafactory.model import load_model, load_tokenizer


TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")

TRAIN_ARGS = {
    "model_name_or_path": TINY_LLAMA,
    "stage": "sft",
    "do_train": True,
    "finetuning_type": "lora",
    "lora_target": "all",
    "dataset": "llamafactory/tiny-supervised-dataset",
    "dataset_dir": "ONLINE",
    "template": "llama3",
    "cutoff_len": 1024,
    "overwrite_cache": True,
    "output_dir": "dummy_dir",
    "overwrite_output_dir": True,
    "fp16": True,
}


def test_checkpointing_enable():
    model_args, _, _, finetuning_args, _ = get_train_args({"disable_gradient_checkpointing": False, **TRAIN_ARGS})
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
    for module in filter(lambda m: hasattr(m, "gradient_checkpointing"), model.modules()):
        assert getattr(module, "gradient_checkpointing") is True


def test_checkpointing_disable():
    model_args, _, _, finetuning_args, _ = get_train_args({"disable_gradient_checkpointing": True, **TRAIN_ARGS})
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
    for module in filter(lambda m: hasattr(m, "gradient_checkpointing"), model.modules()):
        assert getattr(module, "gradient_checkpointing") is False


def test_upcast_layernorm():
    model_args, _, _, finetuning_args, _ = get_train_args({"upcast_layernorm": True, **TRAIN_ARGS})
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
    for name, param in model.named_parameters():
        if param.ndim == 1 and "norm" in name:
            assert param.dtype == torch.float32


def test_upcast_lmhead_output():
    model_args, _, _, finetuning_args, _ = get_train_args({"upcast_lmhead_output": True, **TRAIN_ARGS})
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
    inputs = torch.randn((1, 16), dtype=torch.float16, device=get_current_device())
    outputs: "torch.Tensor" = model.lm_head(inputs)
    assert outputs.dtype == torch.float32
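
test_upcast_lmhead_output only checks the observable behavior: an fp16 input to model.lm_head comes back as float32. One plausible way such an upcast can be wired (a hedged sketch, not LLaMA-Factory's actual implementation) is a forward hook on the lm_head that casts its output:

import torch

def upcast_output_hook(module: torch.nn.Module, args, output: torch.Tensor) -> torch.Tensor:
    # returning a value from a forward hook replaces the module's output
    return output.to(torch.float32)

lm_head = torch.nn.Linear(16, 32, dtype=torch.float16)
lm_head.register_forward_hook(upcast_output_hook)

logits = lm_head(torch.randn(1, 16, dtype=torch.float16))
assert logits.dtype == torch.float32  # the same property the test asserts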

View File

@@ -13,16 +13,21 @@
# limitations under the License.

import os
+from typing import Dict

import torch
from transformers import AutoModelForCausalLM
+from trl import AutoModelForCausalLMWithValueHead

+from llamafactory.extras.misc import get_current_device
from llamafactory.hparams import get_infer_args
from llamafactory.model import load_model, load_tokenizer


TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
+TINY_LLAMA_VALUEHEAD = os.environ.get("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")

INFER_ARGS = {
    "model_name_or_path": TINY_LLAMA,
    "template": "llama3",

@@ -38,9 +43,32 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module"):
        assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True


+def post_init(self: "AutoModelForCausalLMWithValueHead", state_dict: Dict[str, "torch.Tensor"]):
+    state_dict = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")}
+    self.v_head.load_state_dict(state_dict, strict=False)
+    del state_dict
+
+
def test_base():
    model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS)
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False)
-    ref_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device)
+    ref_model = AutoModelForCausalLM.from_pretrained(
+        TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device()
+    )
+    compare_model(model, ref_model)
+
+
+def test_valuehead():
+    AutoModelForCausalLMWithValueHead.post_init = post_init  # patch for CPU test
+    model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS)
+    tokenizer_module = load_tokenizer(model_args)
+    model = load_model(
+        tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False, add_valuehead=True
+    )
+    ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(
+        TINY_LLAMA_VALUEHEAD, torch_dtype=torch.float16, device_map=get_current_device()
+    )
    compare_model(model, ref_model)
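
The monkeypatched post_init above replaces trl's version for the CPU-only test run (per the inline comment): it keeps only the keys under the v_head. prefix, strips that prefix (len("v_head.") is 7, hence k[7:]), and loads the result into self.v_head. A small standalone illustration of that key renaming, using hypothetical tensors:

import torch

state_dict = {"v_head.summary.weight": torch.zeros(1, 8), "v_head.summary.bias": torch.zeros(1)}
# same comprehension as the patched post_init above
renamed = {k[7:]: state_dict[k] for k in state_dict.keys() if k.startswith("v_head.")}
assert set(renamed) == {"summary.weight", "summary.bias"}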

View File

@@ -49,6 +49,7 @@ def test_freeze_train_all_modules():
    model_args, _, _, finetuning_args, _ = get_train_args({"freeze_trainable_layers": 1, **TRAIN_ARGS})
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
+
    for name, param in model.named_parameters():
        if name.startswith("model.layers.1."):
            assert param.requires_grad is True

@@ -64,6 +65,7 @@ def test_freeze_train_extra_modules():
    )
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
+
    for name, param in model.named_parameters():
        if name.startswith("model.layers.1.") or any(module in name for module in ["embed_tokens", "lm_head"]):
            assert param.requires_grad is True

@@ -77,6 +79,7 @@ def test_freeze_inference():
    model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS)
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False)
+
    for param in model.parameters():
        assert param.requires_grad is False
        assert param.dtype == torch.float16
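
These hunks only add a blank line after each load_model call; the surrounding assertions spell out the freeze-tuning contract: everything is frozen except the selected trainable layers and any extra modules. A hedged sketch of that pattern in plain PyTorch (not the library's code; freeze_model and its arguments are hypothetical, with defaults borrowed from the test):

import torch

def freeze_model(model: torch.nn.Module, trainable_prefixes=("model.layers.1.",), extra_modules=()):
    # freeze everything, then re-enable gradients for the selected layers/modules
    for name, param in model.named_parameters():
        trainable = name.startswith(tuple(trainable_prefixes)) or any(m in name for m in extra_modules)
        param.requires_grad_(trainable)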

View File

@@ -49,6 +49,7 @@ def test_full_train():
    model_args, _, _, finetuning_args, _ = get_train_args(TRAIN_ARGS)
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
+
    for param in model.parameters():
        assert param.requires_grad is True
        assert param.dtype == torch.float32

@@ -58,6 +59,7 @@ def test_full_inference():
    model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS)
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False)
+
    for param in model.parameters():
        assert param.requires_grad is False
        assert param.dtype == torch.float16
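
The dtype expectations here mirror the convention used throughout these tests: a model loaded for inference stays in float16, while trainable parameters are cast to float32. A one-function sketch of that cast (an illustration of the convention the assertions encode, not the loader's actual code):

import torch

def cast_trainable_params_to_fp32(model: torch.nn.Module) -> None:
    # same idiom the LoRA tests below apply to their reference models
    for param in filter(lambda p: p.requires_grad, model.parameters()):
        param.data = param.data.to(torch.float32)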

View File

@@ -18,7 +18,9 @@ from typing import Sequence
import torch
from peft import LoraModel, PeftModel
from transformers import AutoModelForCausalLM
+from trl import AutoModelForCausalLMWithValueHead

+from llamafactory.extras.misc import get_current_device
from llamafactory.hparams import get_infer_args, get_train_args
from llamafactory.model import load_model, load_tokenizer

@@ -27,6 +29,8 @@ TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
TINY_LLAMA_ADAPTER = os.environ.get("TINY_LLAMA_ADAPTER", "llamafactory/tiny-random-Llama-3-lora")
+TINY_LLAMA_VALUEHEAD = os.environ.get("TINY_LLAMA_VALUEHEAD", "llamafactory/tiny-random-Llama-3-valuehead")

TRAIN_ARGS = {
    "model_name_or_path": TINY_LLAMA,
    "stage": "sft",

@@ -67,10 +71,29 @@ def compare_model(model_a: "torch.nn.Module", model_b: "torch.nn.Module", diff_k
        assert torch.allclose(state_dict_a[name], state_dict_b[name]) is True


+def test_lora_train_qv_modules():
+    model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "q_proj,v_proj", **TRAIN_ARGS})
+    tokenizer_module = load_tokenizer(model_args)
+    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
+
+    linear_modules = set()
+    for name, param in model.named_parameters():
+        if any(module in name for module in ["lora_A", "lora_B"]):
+            linear_modules.add(name.split(".lora_", maxsplit=1)[0].split(".")[-1])
+            assert param.requires_grad is True
+            assert param.dtype == torch.float32
+        else:
+            assert param.requires_grad is False
+            assert param.dtype == torch.float16
+
+    assert linear_modules == {"q_proj", "v_proj"}
+
+
def test_lora_train_all_modules():
    model_args, _, _, finetuning_args, _ = get_train_args({"lora_target": "all", **TRAIN_ARGS})
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
+
    linear_modules = set()
    for name, param in model.named_parameters():
        if any(module in name for module in ["lora_A", "lora_B"]):

@@ -90,6 +113,7 @@ def test_lora_train_extra_modules():
    )
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
+
    extra_modules = set()
    for name, param in model.named_parameters():
        if any(module in name for module in ["lora_A", "lora_B"]):

@@ -113,7 +137,9 @@ def test_lora_train_old_adapters():
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
-    base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device)
+    base_model = AutoModelForCausalLM.from_pretrained(
+        TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device()
+    )
    ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True)
    for param in filter(lambda p: p.requires_grad, ref_model.parameters()):
        param.data = param.data.to(torch.float32)

@@ -128,7 +154,9 @@ def test_lora_train_new_adapters():
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True)
-    base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device)
+    base_model = AutoModelForCausalLM.from_pretrained(
+        TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device()
+    )
    ref_model = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER, is_trainable=True)
    for param in filter(lambda p: p.requires_grad, ref_model.parameters()):
        param.data = param.data.to(torch.float32)

@@ -138,17 +166,31 @@ def test_lora_train_new_adapters():
    )


+def test_lora_train_valuehead():
+    model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS)
+    tokenizer_module = load_tokenizer(model_args)
+    model = load_model(
+        tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=True, add_valuehead=True
+    )
+    ref_model: "AutoModelForCausalLMWithValueHead" = AutoModelForCausalLMWithValueHead.from_pretrained(
+        TINY_LLAMA_VALUEHEAD, torch_dtype=torch.float16, device_map=get_current_device()
+    )
+
+    state_dict = model.state_dict()
+    ref_state_dict = ref_model.state_dict()
+    assert torch.allclose(state_dict["v_head.summary.weight"], ref_state_dict["v_head.summary.weight"])
+    assert torch.allclose(state_dict["v_head.summary.bias"], ref_state_dict["v_head.summary.bias"])
+
+
def test_lora_inference():
    model_args, _, finetuning_args, _ = get_infer_args(INFER_ARGS)
    tokenizer_module = load_tokenizer(model_args)
    model = load_model(tokenizer_module["tokenizer"], model_args, finetuning_args, is_trainable=False)
-    base_model = AutoModelForCausalLM.from_pretrained(TINY_LLAMA, torch_dtype=model.dtype, device_map=model.device)
+    base_model = AutoModelForCausalLM.from_pretrained(
+        TINY_LLAMA, torch_dtype=torch.float16, device_map=get_current_device()
+    )
    ref_model: "LoraModel" = PeftModel.from_pretrained(base_model, TINY_LLAMA_ADAPTER)
    ref_model = ref_model.merge_and_unload()
    compare_model(model, ref_model)
+
+    for name, param in model.named_parameters():
+        assert param.requires_grad is False
+        assert param.dtype == torch.float16
+        assert "lora" not in name
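
test_lora_train_qv_modules recovers the names of the wrapped linear layers from the LoRA parameter names. A small standalone illustration of that parsing (the example name is illustrative of PEFT's naming scheme, not copied from the repo):

# a typical PEFT LoRA parameter name; everything after ".lora_" is dropped,
# then the last dotted component is the wrapped module's name
name = "base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight"
module_name = name.split(".lora_", maxsplit=1)[0].split(".")[-1]
assert module_name == "q_proj"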