init unittest
parent 4b55f35662
commit 1c7f0ab519

@@ -430,7 +430,6 @@ docker run --gpus=all \
     -v ./hf_cache:/root/.cache/huggingface/ \
     -v ./data:/app/data \
     -v ./output:/app/output \
-    -e CUDA_VISIBLE_DEVICES=0 \
     -p 7860:7860 \
     --shm-size 16G \
     --name llama_factory \

@@ -428,7 +428,6 @@ docker run --gpus=all \
     -v ./hf_cache:/root/.cache/huggingface/ \
     -v ./data:/app/data \
     -v ./output:/app/output \
-    -e CUDA_VISIBLE_DEVICES=0 \
     -p 7860:7860 \
     --shm-size 16G \
     --name llama_factory \

@@ -10,8 +10,6 @@ services:
       - ./hf_cache:/root/.cache/huggingface/
       - ./data:/app/data
       - ./output:/app/output
-    environment:
-      - CUDA_VISIBLE_DEVICES=0
     ports:
       - "7860:7860"
     ipc: host

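Both `docker run` hunks above and this compose hunk drop the baked-in `CUDA_VISIBLE_DEVICES=0`, leaving GPU selection to whoever starts the container. A minimal sketch of making the same selection at runtime instead; the `setdefault` call and the `torch` check are illustrative assumptions, not part of this commit:

import os

# Choose the visible GPU before anything initializes CUDA
# (e.g. pass -e CUDA_VISIBLE_DEVICES=0 to docker run, or set it here early).
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

import torch  # noqa: E402  (imported after the variable is set)

print("visible CUDA devices:", torch.cuda.device_count())
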
@@ -20,7 +20,7 @@ def calculate_gpa(grades: Sequence[str], hours: Sequence[int]) -> float:
 
 def main():
     client = OpenAI(
-        api_key="0",
+        api_key="{}".format(os.environ.get("API_KEY", "0")),
         base_url="http://localhost:{}/v1".format(os.environ.get("API_PORT", 8000)),
     )
     tools = [

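After this change the example client takes its key from the `API_KEY` environment variable and only falls back to `"0"`, matching how the port is already read from `API_PORT`. A self-contained sketch of the resulting client setup, runnable against any OpenAI-compatible server (the fallback values are just the script's defaults):

import os

from openai import OpenAI

# Both settings come from the environment, with the same fallbacks as the script.
client = OpenAI(
    api_key=os.environ.get("API_KEY", "0"),
    base_url="http://localhost:{}/v1".format(os.environ.get("API_PORT", 8000)),
)

Exporting `API_KEY` and `API_PORT` before launching the script is therefore enough to point it at a secured or non-default endpoint.
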
@@ -0,0 +1,35 @@
+import os
+
+from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available
+
+from llamafactory.hparams import get_infer_args
+from llamafactory.model import load_model, load_tokenizer
+
+
+TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-LlamaForCausalLM")
+
+
+def test_attention():
+    attention_available = ["off"]
+    if is_torch_sdpa_available():
+        attention_available.append("sdpa")
+
+    if is_flash_attn_2_available():
+        attention_available.append("fa2")
+
+    llama_attention_classes = {
+        "off": "LlamaAttention",
+        "sdpa": "LlamaSdpaAttention",
+        "fa2": "LlamaFlashAttention2",
+    }
+    for requested_attention in attention_available:
+        model_args, _, finetuning_args, _ = get_infer_args({
+            "model_name_or_path": TINY_LLAMA,
+            "template": "llama2",
+            "flash_attn": requested_attention,
+        })
+        tokenizer = load_tokenizer(model_args)
+        model = load_model(tokenizer["tokenizer"], model_args, finetuning_args)
+        for module in model.modules():
+            if "Attention" in module.__class__.__name__:
+                assert module.__class__.__name__ == llama_attention_classes[requested_attention]

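The new test only asserts against attention implementations that the local environment supports, so the SDPA and FlashAttention-2 cases are skipped on machines that lack them. A small probe, reusing the same `transformers` helpers the test imports, shows which variants would actually be exercised on a given machine:

# Probe which attention backends test_attention would cover locally.
from transformers.utils import is_flash_attn_2_available, is_torch_sdpa_available

covered = ["off"]
if is_torch_sdpa_available():
    covered.append("sdpa")
if is_flash_attn_2_available():
    covered.append("fa2")
print("test_attention will check:", covered)
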
@@ -1,30 +0,0 @@
-import os
-import time
-
-from openai import OpenAI
-from transformers.utils.versions import require_version
-
-
-require_version("openai>=1.5.0", "To fix: pip install openai>=1.5.0")
-
-
-def main():
-    client = OpenAI(
-        api_key="0",
-        base_url="http://localhost:{}/v1".format(os.environ.get("API_PORT", 8000)),
-    )
-    messages = [{"role": "user", "content": "Write a long essay about environment protection as long as possible."}]
-    num_tokens = 0
-    start_time = time.time()
-    for _ in range(8):
-        result = client.chat.completions.create(messages=messages, model="test")
-        num_tokens += result.usage.completion_tokens
-
-    elapsed_time = time.time() - start_time
-    print("Throughput: {:.2f} tokens/s".format(num_tokens / elapsed_time))
-    # --infer_backend hf: 27.22 tokens/s (1.0x)
-    # --infer_backend vllm: 73.03 tokens/s (2.7x)
-
-
-if __name__ == "__main__":
-    main()