From 10ab83f4c4dc96013e916462f056d1497c6ddf6c Mon Sep 17 00:00:00 2001 From: hiyouga Date: Wed, 8 May 2024 16:37:54 +0800 Subject: [PATCH] add deepseek moe 236B --- README.md | 2 +- README_zh.md | 2 +- requirements.txt | 2 +- setup.py | 2 +- src/llmtuner/extras/constants.py | 8 ++++++++ 5 files changed, 12 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e9d93daf..798b7bd4 100644 --- a/README.md +++ b/README.md @@ -145,7 +145,7 @@ Compared to ChatGLM's [P-Tuning](https://github.com/THUDM/ChatGLM2-6B/tree/main/ | [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - | | [ChatGLM3](https://huggingface.co/THUDM) | 6B | query_key_value | chatglm3 | | [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | q_proj,v_proj | cohere | -| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B | q_proj,v_proj | deepseek | +| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | q_proj,v_proj | deepseek | | [Falcon](https://huggingface.co/tiiuae) | 7B/40B/180B | query_key_value | falcon | | [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | q_proj,v_proj | gemma | | [InternLM2](https://huggingface.co/internlm) | 7B/20B | wqkv | intern2 | diff --git a/README_zh.md b/README_zh.md index 15758ae4..2c5b1aa1 100644 --- a/README_zh.md +++ b/README_zh.md @@ -145,7 +145,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/ec36a9dd-37f4-4f72-81bd | [BLOOMZ](https://huggingface.co/bigscience) | 560M/1.1B/1.7B/3B/7.1B/176B | query_key_value | - | | [ChatGLM3](https://huggingface.co/THUDM) | 6B | query_key_value | chatglm3 | | [Command-R](https://huggingface.co/CohereForAI) | 35B/104B | q_proj,v_proj | cohere | -| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B | q_proj,v_proj | deepseek | +| [DeepSeek (MoE)](https://huggingface.co/deepseek-ai) | 7B/16B/67B/236B | q_proj,v_proj | deepseek | | [Falcon](https://huggingface.co/tiiuae) | 7B/40B/180B | query_key_value | falcon | | [Gemma/CodeGemma](https://huggingface.co/google) | 2B/7B | q_proj,v_proj | gemma | | [InternLM2](https://huggingface.co/internlm) | 7B/20B | wqkv | intern2 | diff --git a/requirements.txt b/requirements.txt index f4818ed2..67bd7033 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ uvicorn pydantic fastapi sse-starlette -matplotlib +matplotlib>=3.7.0 fire packaging pyyaml diff --git a/setup.py b/setup.py index 7b849942..ddc3a594 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ def get_requires(): extra_require = { "metrics": ["nltk", "jieba", "rouge-chinese"], - "deepspeed": ["deepspeed>=0.10.0"], + "deepspeed": ["deepspeed>=0.10.0,<=0.14.0"], "bitsandbytes": ["bitsandbytes>=0.39.0"], "vllm": ["vllm>=0.4.0"], "galore": ["galore-torch"], diff --git a/src/llmtuner/extras/constants.py b/src/llmtuner/extras/constants.py index bf542e69..b620bed4 100644 --- a/src/llmtuner/extras/constants.py +++ b/src/llmtuner/extras/constants.py @@ -324,6 +324,14 @@ register_model_group( DownloadSource.DEFAULT: "deepseek-ai/deepseek-moe-16b-chat", DownloadSource.MODELSCOPE: "deepseek-ai/deepseek-moe-16b-chat", }, + "DeepSeek-MoE-236B": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2", + DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2", + }, + "DeepSeek-MoE-236B-Chat": { + DownloadSource.DEFAULT: "deepseek-ai/DeepSeek-V2-Chat", + DownloadSource.MODELSCOPE: "deepseek-ai/DeepSeek-V2-Chat", + }, }, template="deepseek", )