improve fix tokenizer

2024-02-09 14:53:14 +08:00 · 2024-02-09 14:53:14 +08:00 · 54ea9684ed
parent d0daaa01f9
commit 54ea9684ed
7 changed files with 105 additions and 71 deletions
--- a/README.md
+++ b/README.md
@ -157,8 +157,8 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list

 - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
 - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
- [Self-cognition (zh)](data/self_cognition.json)
+- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Self Cognition (zh)](data/self_cognition.json)
 - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
 - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
@ -176,6 +176,7 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list
 - [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca)
 - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
 - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
+- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa)
 - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
 - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
@ -190,14 +191,14 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list
 - [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
 - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
 - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
- [FreedomIntelligence Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
- [LeoLM/OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
- [FreedomIntelligence/evol-instruct-deutsch (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
- [wiki_qa (de)](https://huggingface.co/datasets/wiki_qa)
- [cognitivecomputations/dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
- [booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
- [jondurbin/airoboros-3.0 (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
- [stingning/ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
+- [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
+- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
+- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
+- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
+- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
+- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
+- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
+
 </details>

 <details><summary>Preference datasets</summary>
@ -206,7 +207,7 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list
 - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
- [Intel/orca_dpo_pairs (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
+- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)

 </details>

--- a/README_zh.md
+++ b/README_zh.md
@ -157,8 +157,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846

 - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
 - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
- [Self-cognition (zh)](data/self_cognition.json)
+- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Self Cognition (zh)](data/self_cognition.json)
 - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
 - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)
@ -176,6 +176,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
 - [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca)
 - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
 - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
+- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa)
 - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
 - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
@ -188,6 +189,15 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
 - [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
 - [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
 - [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
+- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
+- [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
+- [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
+- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
+- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
+- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
+- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
+- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
+- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)

 </details>

@ -197,6 +207,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
 - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
+- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)

 </details>

--- a/assets/wechat.jpg
+++ b/assets/wechat.jpg
--- a/src/llmtuner/chat/chat_model.py
+++ b/src/llmtuner/chat/chat_model.py
@ -28,7 +28,7 @@ class ChatModel:
        )
        self.tokenizer.padding_side = "left" if self.can_generate else "right"
        self.model = dispatch_model(self.model)
-        self.template = get_template_and_fix_tokenizer(data_args.template, self.tokenizer)
+        self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template)

    def _process_args(
        self,
--- a/src/llmtuner/data/loader.py
+++ b/src/llmtuner/data/loader.py
@ -142,7 +142,7 @@ def get_dataset(
    stage: Literal["pt", "sft", "rm", "ppo"],
    # split: Optional[str] = "train", # TODO: add split
 ) -> Union["Dataset", "IterableDataset"]:
-    template = get_template_and_fix_tokenizer(data_args.template, tokenizer)
+    template = get_template_and_fix_tokenizer(tokenizer, data_args.template)
    if data_args.train_on_prompt and template.efficient_eos:
        raise ValueError("Current template does not support `train_on_prompt`.")

--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@ -198,7 +198,7 @@ class Llama2Template(Template):
 templates: Dict[str, Template] = {}


-def register_template(
+def _register_template(
    name: str,
    format_user: Optional["Formatter"] = None,
    format_assistant: Optional["Formatter"] = None,
@ -236,29 +236,45 @@ def register_template(
    )


-def get_template_and_fix_tokenizer(name: str, tokenizer: "PreTrainedTokenizer") -> Template:
-    if tokenizer.eos_token_id is None:
-        tokenizer.eos_token = "<|endoftext|>"
+def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str) -> None:
+    is_added = tokenizer.eos_token_id is None
+    is_oov = eos_token not in tokenizer.get_vocab()
+    tokenizer.add_special_tokens({"eos_token": eos_token})
+
+    if is_added:
        logger.info("Add eos token: {}".format(tokenizer.eos_token))
+    else:
+        logger.info("Replace eos token: {}".format(tokenizer.eos_token))

-    if tokenizer.pad_token_id is None:
-        tokenizer.pad_token = tokenizer.eos_token
-        logger.info("Add pad token: {}".format(tokenizer.pad_token))
+    if is_oov:
+        logger.warning("New token is added, you must enable `resize_vocab` to activate it.")

-    if name is None:  # for pre-training
-        return None

-    template = templates.get(name, None)
-    assert template is not None, "Template {} does not exist.".format(name)
+def get_template_and_fix_tokenizer(
+    tokenizer: "PreTrainedTokenizer",
+    name: Optional[str] = None,
+) -> Template:
+    if name is None:
+        template = templates["vanilla"]  # placeholder
+    else:
+        template = templates.get(name, None)
+        if templates is None:
+            raise ValueError("Template {} does not exist.".format(name))

    stop_words = template.stop_words
    if template.replace_eos:
        if not stop_words:
            raise ValueError("Stop words are required to replace the EOS token.")

-        tokenizer.eos_token = stop_words[0]
+        _add_or_replace_eos_token(tokenizer, eos_token=stop_words[0])
        stop_words = stop_words[1:]
-        logger.info("Replace eos token: {}".format(tokenizer.eos_token))
+
+    if tokenizer.eos_token_id is None:
+        _add_or_replace_eos_token(tokenizer, eos_token="<|endoftext|>")
+
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        logger.info("Add pad token: {}".format(tokenizer.pad_token))

    if stop_words:
        tokenizer.add_special_tokens(
@ -269,7 +285,7 @@ def get_template_and_fix_tokenizer(name: str, tokenizer: "PreTrainedTokenizer")
    return template


-register_template(
+_register_template(
    name="alpaca",
    format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
    format_separator=EmptyFormatter(slots=["\n\n"]),
@ -279,7 +295,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="aquila",
    format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]),
    format_separator=EmptyFormatter(slots=["###"]),
@ -292,21 +308,21 @@ register_template(
 )


-register_template(
+_register_template(
    name="baichuan",
    format_user=StringFormatter(slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]),
    efficient_eos=True,
 )


-register_template(
+_register_template(
    name="baichuan2",
    format_user=StringFormatter(slots=[{"token": "<reserved_106>"}, "{{content}}", {"token": "<reserved_107>"}]),
    efficient_eos=True,
 )


-register_template(
+_register_template(
    name="belle",
    format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]),
    format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
@ -315,13 +331,13 @@ register_template(
 )


-register_template(
+_register_template(
    name="bluelm",
    format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]),
 )


-register_template(
+_register_template(
    name="chatglm2",
    format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问：{{content}}\n\n答："]),
    format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
@ -331,7 +347,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="chatglm3",
    format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
    format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
@ -351,14 +367,25 @@ register_template(
 )


-register_template(
+_register_template(
+    name="chatml_de",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_separator=EmptyFormatter(slots=["\n"]),
+    default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+)
+
+
+_register_template(
    name="codegeex2",
    format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
    force_system=True,
 )


-register_template(
+_register_template(
    name="cpm",
    format_user=StringFormatter(slots=["<用户>{{content}}<AI>"]),
    format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
@ -366,7 +393,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="deepseek",
    format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
    format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
@ -374,7 +401,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="deepseekcoder",
    format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]),
    format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
@ -390,14 +417,14 @@ register_template(
 )


-register_template(
+_register_template(
    name="default",
    format_user=StringFormatter(slots=["Human: {{content}}\nAssistant: "]),
    format_separator=EmptyFormatter(slots=["\n"]),
 )


-register_template(
+_register_template(
    name="falcon",
    format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),
    format_separator=EmptyFormatter(slots=["\n"]),
@ -405,7 +432,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="intern",
    format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": "<eoh>"}, "\n<|Bot|>:"]),
    format_separator=EmptyFormatter(slots=[{"token": "<eoa>"}, "\n"]),
@ -414,7 +441,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="intern2",
    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
    format_system=StringFormatter(slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]),
@ -431,7 +458,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="llama2",
    format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
    format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
@ -448,7 +475,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="llama2_zh",
    format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
    format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
@ -456,7 +483,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="mistral",
    format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]),
    format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
@ -464,7 +491,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="openchat",
    format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
    format_assistant=StringFormatter(slots=["{{content}}"]),
@ -473,7 +500,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="orion",
    format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]),
    format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
@ -481,7 +508,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="qwen",
    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
@ -492,7 +519,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="solar",
    format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]),
    format_system=StringFormatter(slots=["### System:\n{{content}}\n\n"]),
@ -500,7 +527,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="starchat",
    format_user=StringFormatter(
        slots=[{"token": "<|user|>"}, "\n{{content}}", {"token": "<|end|>"}, "\n", {"token": "<|assistant|>"}]
@ -513,20 +540,12 @@ register_template(
 )


-register_template(name="vanilla")
-
-register_template(
-    name="chatml_de",
-    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
-    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
-    format_separator=EmptyFormatter(slots=["\n"]),
-    default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
-    stop_words=["<|im_end|>"],
-    replace_eos=True,
+_register_template(
+    name="vanilla",
 )


-register_template(
+_register_template(
    name="vicuna",
    format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
    default_system=(
@ -536,7 +555,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="xuanyuan",
    format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]),
    default_system=(
@ -547,10 +566,13 @@ register_template(
 )


-register_template(name="xverse", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]))
+_register_template(
+    name="xverse",
+    format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]),
+)


-register_template(
+_register_template(
    name="yayi",
    format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]),
    format_system=StringFormatter(slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]),
@ -570,7 +592,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="yi",
    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
    format_separator=EmptyFormatter(slots=["\n"]),
@ -579,7 +601,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="yuan",
    format_user=StringFormatter(slots=["{{content}}", {"token": "<sep>"}]),
    format_separator=EmptyFormatter(slots=["\n"]),
@ -588,7 +610,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="zephyr",
    format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]),
    format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]),
@ -596,7 +618,7 @@ register_template(
 )


-register_template(
+_register_template(
    name="ziya",
    format_user=StringFormatter(slots=[{"token": "<human>"}, ":{{content}}\n", {"token": "<bot>"}, ":"]),
    format_separator=EmptyFormatter(slots=["\n"]),
--- a/src/llmtuner/eval/evaluator.py
+++ b/src/llmtuner/eval/evaluator.py
@ -24,7 +24,7 @@ class Evaluator:
        self.model, self.tokenizer = load_model_and_tokenizer(self.model_args, finetuning_args)
        self.tokenizer.padding_side = "right"  # avoid overflow issue in batched inference for llama2
        self.model = dispatch_model(self.model)
-        self.template = get_template_and_fix_tokenizer(self.data_args.template, self.tokenizer)
+        self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template)
        self.eval_template = get_eval_template(self.eval_args.lang)
        self.choice_inputs = [
            self.tokenizer.encode(self.eval_template.prefix + ch, add_special_tokens=False)[-1] for ch in CHOICES