diff --git a/README.md b/README.md
index 77d9c709..44897420 100644
--- a/README.md
+++ b/README.md
@@ -430,7 +430,6 @@ docker run --gpus=all \
     -v ./hf_cache:/root/.cache/huggingface/ \
     -v ./data:/app/data \
     -v ./output:/app/output \
-    -e CUDA_VISIBLE_DEVICES=0 \
     -p 7860:7860 \
     --shm-size 16G \
     --name llama_factory \
diff --git a/README_zh.md b/README_zh.md
index 9a52a963..8321d202 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -428,7 +428,6 @@ docker run --gpus=all \
     -v ./hf_cache:/root/.cache/huggingface/ \
     -v ./data:/app/data \
     -v ./output:/app/output \
-    -e CUDA_VISIBLE_DEVICES=0 \
     -p 7860:7860 \
     --shm-size 16G \
     --name llama_factory \
diff --git a/docker-compose.yml b/docker-compose.yml
index 333dc51e..9602a3e3 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -10,8 +10,6 @@ services:
       - ./hf_cache:/root/.cache/huggingface/
       - ./data:/app/data
       - ./output:/app/output
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
     ports:
       - "7860:7860"
     ipc: host
diff --git a/tests/test_toolcall.py b/scripts/test_toolcall.py
similarity index 97%
rename from tests/test_toolcall.py
rename to scripts/test_toolcall.py
index d36e7fec..7e460017 100644
--- a/tests/test_toolcall.py
+++ b/scripts/test_toolcall.py
@@ -20,7 +20,7 @@ def calculate_gpa(grades: Sequence[str], hours: Sequence[int]) -> float:
 
 def main():
     client = OpenAI(
-        api_key="0",
+        api_key="{}".format(os.environ.get("API_KEY", "0")),
         base_url="http://localhost:{}/v1".format(os.environ.get("API_PORT", 8000)),
     )
     tools = [
diff --git a/tests/model/test_attn.py b/tests/model/test_attn.py
new file mode 100644
index 00000000..12d920ef
--- /dev/null
+++ b/tests/model/test_attn.py
@@ -0,0 +1,35 @@
+import os
+
+from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available
+
+from llamafactory.hparams import get_infer_args
+from llamafactory.model import load_model, load_tokenizer
+
+
+TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM")
+
+
+def test_attention():
+    attention_available = ["off"]
+    if is_torch_sdpa_available():
+        attention_available.append("sdpa")
+
+    if is_flash_attn_2_available():
+        attention_available.append("fa2")
+
+    llama_attention_classes = {
+        "off": "LlamaAttention",
+        "sdpa": "LlamaSdpaAttention",
+        "fa2": "LlamaFlashAttention2",
+    }
+    for requested_attention in attention_available:
+        model_args, _, finetuning_args, _ = get_infer_args({
+            "model_name_or_path": TINY_LLAMA,
+            "template": "llama2",
+            "flash_attn": requested_attention,
+        })
+        tokenizer = load_tokenizer(model_args)
+        model = load_model(tokenizer["tokenizer"], model_args, finetuning_args)
+        for module in model.modules():
+            if "Attention" in module.__class__.__name__:
+                assert module.__class__.__name__ == llama_attention_classes[requested_attention]
diff --git a/tests/test_throughput.py b/tests/test_throughput.py
deleted file mode 100644
index e8048910..00000000
--- a/tests/test_throughput.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import os
-import time
-
-from openai import OpenAI
-from transformers.utils.versions import require_version
-
-
-require_version("openai>=1.5.0", "To fix: pip install openai>=1.5.0")
-
-
-def main():
-    client = OpenAI(
-        api_key="0",
-        base_url="http://localhost:{}/v1".format(os.environ.get("API_PORT", 8000)),
-    )
-    messages = [{"role": "user", "content": "Write a long essay about environment protection as long as possible."}]
-    num_tokens = 0
-    start_time = time.time()
-    for _ in range(8):
-        result = client.chat.completions.create(messages=messages, model="test")
-        num_tokens += result.usage.completion_tokens
-
-    elapsed_time = time.time() - start_time
-    print("Throughput: {:.2f} tokens/s".format(num_tokens / elapsed_time))
-    # --infer_backend hf: 27.22 tokens/s (1.0x)
-    # --infer_backend vllm: 73.03 tokens/s (2.7x)
-
-
-if __name__ == "__main__":
-    main()