improve fix tokenizer

parent d0daaa01f9
commit 54ea9684ed

README.md (23 lines changed)

@@ -157,8 +157,8 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list

 - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
 - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
-- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
-- [Self-cognition (zh)](data/self_cognition.json)
+- [Self Cognition (zh)](data/self_cognition.json)
 - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
 - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)

@@ -176,6 +176,7 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list
 - [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca)
 - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
 - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
+- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa)
 - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
 - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)

@@ -190,14 +191,14 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list
 - [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
 - [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
 - [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
-- [FreedomIntelligence Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
+- [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
-- [LeoLM/OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
+- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
-- [FreedomIntelligence/evol-instruct-deutsch (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
+- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
-- [wiki_qa (de)](https://huggingface.co/datasets/wiki_qa)
-- [cognitivecomputations/dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
-- [booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
-- [jondurbin/airoboros-3.0 (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
-- [stingning/ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)
+- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
+- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
+- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
+- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)

 </details>

 <details><summary>Preference datasets</summary>

@@ -206,7 +207,7 @@ Please refer to [constants.py](src/llmtuner/extras/constants.py) for a full list
 - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
-- [Intel/orca_dpo_pairs (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)
+- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)

 </details>

README_zh.md (15 lines changed)

@@ -157,8 +157,8 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846

 - [Stanford Alpaca (en)](https://github.com/tatsu-lab/stanford_alpaca)
 - [Stanford Alpaca (zh)](https://github.com/ymcui/Chinese-LLaMA-Alpaca)
-- [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
+- [Alpaca GPT4 (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
-- [Self-cognition (zh)](data/self_cognition.json)
+- [Self Cognition (zh)](data/self_cognition.json)
 - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [ShareGPT (zh)](https://huggingface.co/datasets/QingyiSi/Alpaca-CoT/tree/main/Chinese-instruction-collection)
 - [Guanaco Dataset (multilingual)](https://huggingface.co/datasets/JosephusCheung/GuanacoDataset)

@@ -176,6 +176,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
 - [OpenOrca (en)](https://huggingface.co/datasets/Open-Orca/OpenOrca)
 - [MathInstruct (en)](https://huggingface.co/datasets/TIGER-Lab/MathInstruct)
 - [Firefly 1.1M (zh)](https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M)
+- [Wiki QA (en)](https://huggingface.co/datasets/wiki_qa)
 - [Web QA (zh)](https://huggingface.co/datasets/suolyer/webqa)
 - [WebNovel (zh)](https://huggingface.co/datasets/zxbsmk/webnovel_cn)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)

@@ -188,6 +189,15 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
 - [LMSYS Chat 1M (en)](https://huggingface.co/datasets/lmsys/lmsys-chat-1m)
 - [Evol Instruct V2 (en)](https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k)
 - [Glaive Function Calling V2 (en)](https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2)
+- [Open Assistant (de)](https://huggingface.co/datasets/mayflowergmbh/oasst_de)
+- [Dolly 15k (de)](https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de)
+- [Alpaca GPT4 (de)](https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de)
+- [OpenSchnabeltier (de)](https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de)
+- [Evol Instruct (de)](https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de)
+- [Dolphin (de)](https://huggingface.co/datasets/mayflowergmbh/dolphin_de)
+- [Booksum (de)](https://huggingface.co/datasets/mayflowergmbh/booksum_de)
+- [Airoboros (de)](https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de)
+- [Ultrachat (de)](https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de)

 </details>

@@ -197,6 +207,7 @@ https://github.com/hiyouga/LLaMA-Factory/assets/16256802/6ba60acc-e2e2-4bec-b846
 - [Open Assistant (multilingual)](https://huggingface.co/datasets/OpenAssistant/oasst1)
 - [GPT-4 Generated Data (en&zh)](https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM)
 - [Nectar (en)](https://huggingface.co/datasets/berkeley-nest/Nectar)
+- [Orca DPO (de)](https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de)

 </details>

Binary image file changed (not shown). Before: 137 KiB. After: 142 KiB.

src/llmtuner/chat/chat_model.py

@@ -28,7 +28,7 @@ class ChatModel:
         )
         self.tokenizer.padding_side = "left" if self.can_generate else "right"
         self.model = dispatch_model(self.model)
-        self.template = get_template_and_fix_tokenizer(data_args.template, self.tokenizer)
+        self.template = get_template_and_fix_tokenizer(self.tokenizer, data_args.template)

     def _process_args(
         self,
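
The argument order of `get_template_and_fix_tokenizer` is swapped throughout this commit: the tokenizer now comes first and the template name second. A minimal call-site sketch (illustration only, not part of the diff), assuming a loaded `tokenizer` and `data_args`:

    # Old order: get_template_and_fix_tokenizer(data_args.template, tokenizer)
    # New order: tokenizer first, template name second.
    template = get_template_and_fix_tokenizer(tokenizer, data_args.template)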

src/llmtuner/data/loader.py

@@ -142,7 +142,7 @@ def get_dataset(
     stage: Literal["pt", "sft", "rm", "ppo"],
     # split: Optional[str] = "train", # TODO: add split
 ) -> Union["Dataset", "IterableDataset"]:
-    template = get_template_and_fix_tokenizer(data_args.template, tokenizer)
+    template = get_template_and_fix_tokenizer(tokenizer, data_args.template)
     if data_args.train_on_prompt and template.efficient_eos:
         raise ValueError("Current template does not support `train_on_prompt`.")

src/llmtuner/data/template.py

@@ -198,7 +198,7 @@ class Llama2Template(Template):
 templates: Dict[str, Template] = {}


-def register_template(
+def _register_template(
     name: str,
     format_user: Optional["Formatter"] = None,
     format_assistant: Optional["Formatter"] = None,

@@ -236,29 +236,45 @@ def register_template(
 )


-def get_template_and_fix_tokenizer(name: str, tokenizer: "PreTrainedTokenizer") -> Template:
-    if tokenizer.eos_token_id is None:
-        tokenizer.eos_token = "<|endoftext|>"
-        logger.info("Add eos token: {}".format(tokenizer.eos_token))
-
-    if tokenizer.pad_token_id is None:
-        tokenizer.pad_token = tokenizer.eos_token
-        logger.info("Add pad token: {}".format(tokenizer.pad_token))
-
-    if name is None:  # for pre-training
-        return None
-
-    template = templates.get(name, None)
-    assert template is not None, "Template {} does not exist.".format(name)
+def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str) -> None:
+    is_added = tokenizer.eos_token_id is None
+    is_oov = eos_token not in tokenizer.get_vocab()
+    tokenizer.add_special_tokens({"eos_token": eos_token})
+
+    if is_added:
+        logger.info("Add eos token: {}".format(tokenizer.eos_token))
+    else:
+        logger.info("Replace eos token: {}".format(tokenizer.eos_token))
+
+    if is_oov:
+        logger.warning("New token is added, you must enable `resize_vocab` to activate it.")
+
+
+def get_template_and_fix_tokenizer(
+    tokenizer: "PreTrainedTokenizer",
+    name: Optional[str] = None,
+) -> Template:
+    if name is None:
+        template = templates["vanilla"]  # placeholder
+    else:
+        template = templates.get(name, None)
+        if template is None:
+            raise ValueError("Template {} does not exist.".format(name))

     stop_words = template.stop_words
     if template.replace_eos:
         if not stop_words:
             raise ValueError("Stop words are required to replace the EOS token.")

-        tokenizer.eos_token = stop_words[0]
+        _add_or_replace_eos_token(tokenizer, eos_token=stop_words[0])
         stop_words = stop_words[1:]
-        logger.info("Replace eos token: {}".format(tokenizer.eos_token))
+
+    if tokenizer.eos_token_id is None:
+        _add_or_replace_eos_token(tokenizer, eos_token="<|endoftext|>")
+
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        logger.info("Add pad token: {}".format(tokenizer.pad_token))

     if stop_words:
         tokenizer.add_special_tokens(
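
A hedged sketch of what the new helper does (illustration only; `AutoTokenizer` and the model name are assumptions, not part of the diff). `_add_or_replace_eos_token` distinguishes adding a missing EOS token from replacing an existing one, and warns when the token is out of vocabulary, since the embedding matrix must then be resized, which is what the `resize_vocab` training option does:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")      # assumed example model
    new_eos = "<|im_end|>"
    is_added = tokenizer.eos_token_id is None              # add vs. replace
    is_oov = new_eos not in tokenizer.get_vocab()          # True for gpt2: not in vocab
    tokenizer.add_special_tokens({"eos_token": new_eos})   # sets tokenizer.eos_token
    if is_oov:
        # Training must call model.resize_token_embeddings(len(tokenizer)),
        # i.e. enable `resize_vocab`, or the new id has no embedding row.
        print("enable `resize_vocab` to activate the new token")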

@@ -269,7 +285,7 @@ def get_template_and_fix_tokenizer(name: str, tokenizer: "PreTrainedTokenizer")
     return template


-register_template(
+_register_template(
     name="alpaca",
     format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
     format_separator=EmptyFormatter(slots=["\n\n"]),

@@ -279,7 +295,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="aquila",
     format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]),
     format_separator=EmptyFormatter(slots=["###"]),

@@ -292,21 +308,21 @@ register_template(
 )


-register_template(
+_register_template(
     name="baichuan",
     format_user=StringFormatter(slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]),
     efficient_eos=True,
 )


-register_template(
+_register_template(
     name="baichuan2",
     format_user=StringFormatter(slots=[{"token": "<reserved_106>"}, "{{content}}", {"token": "<reserved_107>"}]),
     efficient_eos=True,
 )


-register_template(
+_register_template(
     name="belle",
     format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),

@@ -315,13 +331,13 @@ register_template(
 )


-register_template(
+_register_template(
     name="bluelm",
     format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]),
 )


-register_template(
+_register_template(
     name="chatglm2",
     format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]),
     format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),

@@ -331,7 +347,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="chatglm3",
     format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
     format_assistant=StringFormatter(slots=["\n", "{{content}}"]),

@@ -351,14 +367,25 @@ register_template(
 )


-register_template(
+_register_template(
+    name="chatml_de",
+    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
+    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
+    format_separator=EmptyFormatter(slots=["\n"]),
+    default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
+    stop_words=["<|im_end|>"],
+    replace_eos=True,
+)
+
+
+_register_template(
     name="codegeex2",
     format_system=StringFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}, "{{content}}"]),
     force_system=True,
 )


-register_template(
+_register_template(
     name="cpm",
     format_user=StringFormatter(slots=["<用户>{{content}}<AI>"]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),
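
For reference, the conversation layout the relocated `chatml_de` template encodes, written out as the string its formatters would produce for a single user turn (derived from the slots above as an illustration, not output from the code; "Hallo!" is a hypothetical message):

    prompt = (
        "<|im_start|>system\nDu bist ein freundlicher und hilfsbereiter KI-Assistent.<|im_end|>\n"
        "<|im_start|>user\nHallo!<|im_end|>\n"
        "<|im_start|>assistant\n"
    )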

@@ -366,7 +393,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="deepseek",
     format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),

@@ -374,7 +401,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="deepseekcoder",
     format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]),
     format_assistant=StringFormatter(slots=["\n", "{{content}}"]),

@@ -390,14 +417,14 @@ register_template(
 )


-register_template(
+_register_template(
     name="default",
     format_user=StringFormatter(slots=["Human: {{content}}\nAssistant: "]),
     format_separator=EmptyFormatter(slots=["\n"]),
 )


-register_template(
+_register_template(
     name="falcon",
     format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),
     format_separator=EmptyFormatter(slots=["\n"]),

@@ -405,7 +432,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="intern",
     format_user=StringFormatter(slots=["<|User|>:{{content}}", {"token": "<eoh>"}, "\n<|Bot|>:"]),
     format_separator=EmptyFormatter(slots=[{"token": "<eoa>"}, "\n"]),

@@ -414,7 +441,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="intern2",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_system=StringFormatter(slots=[{"bos_token"}, "<|im_start|>system\n{{content}}<|im_end|>\n"]),

@@ -431,7 +458,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="llama2",
     format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
     format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),

@@ -448,7 +475,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="llama2_zh",
     format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
     format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),

@@ -456,7 +483,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="mistral",
     format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),

@@ -464,7 +491,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="openchat",
     format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
     format_assistant=StringFormatter(slots=["{{content}}"]),

@@ -473,7 +500,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="orion",
     format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]),
     format_system=StringFormatter(slots=[{"bos_token"}, "{{content}}"]),

@@ -481,7 +508,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="qwen",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),

@@ -492,7 +519,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="solar",
     format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]),
     format_system=StringFormatter(slots=["### System:\n{{content}}\n\n"]),

@@ -500,7 +527,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="starchat",
     format_user=StringFormatter(
         slots=[{"token": "<|user|>"}, "\n{{content}}", {"token": "<|end|>"}, "\n", {"token": "<|assistant|>"}]

@@ -513,20 +540,12 @@ register_template(
 )


-register_template(name="vanilla")
-
-
-register_template(
-    name="chatml_de",
-    format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
-    format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
-    format_separator=EmptyFormatter(slots=["\n"]),
-    default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
-    stop_words=["<|im_end|>"],
-    replace_eos=True,
+_register_template(
+    name="vanilla",
 )


-register_template(
+_register_template(
     name="vicuna",
     format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
     default_system=(
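
The `vanilla` registration is rewritten above because `get_template_and_fix_tokenizer` now resolves `name=None` to this template instead of returning `None`. A hedged sketch of the pre-training path (illustration only, not part of the diff):

    # With no template name (e.g. pre-training), callers always get a Template.
    template = get_template_and_fix_tokenizer(tokenizer)  # falls back to templates["vanilla"]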

@@ -536,7 +555,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="xuanyuan",
     format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]),
     default_system=(

@@ -547,10 +566,13 @@ register_template(
 )


-register_template(name="xverse", format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]))
+_register_template(
+    name="xverse",
+    format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]),
+)


-register_template(
+_register_template(
     name="yayi",
     format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]),
     format_system=StringFormatter(slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]),

@@ -570,7 +592,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="yi",
     format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
     format_separator=EmptyFormatter(slots=["\n"]),

@@ -579,7 +601,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="yuan",
     format_user=StringFormatter(slots=["{{content}}", {"token": "<sep>"}]),
     format_separator=EmptyFormatter(slots=["\n"]),

@@ -588,7 +610,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="zephyr",
     format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>"]),
     format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]),

@@ -596,7 +618,7 @@ register_template(
 )


-register_template(
+_register_template(
     name="ziya",
     format_user=StringFormatter(slots=[{"token": "<human>"}, ":{{content}}\n", {"token": "<bot>"}, ":"]),
     format_separator=EmptyFormatter(slots=["\n"]),

src/llmtuner/eval/evaluator.py

@@ -24,7 +24,7 @@ class Evaluator:
         self.model, self.tokenizer = load_model_and_tokenizer(self.model_args, finetuning_args)
         self.tokenizer.padding_side = "right"  # avoid overflow issue in batched inference for llama2
         self.model = dispatch_model(self.model)
-        self.template = get_template_and_fix_tokenizer(self.data_args.template, self.tokenizer)
+        self.template = get_template_and_fix_tokenizer(self.tokenizer, self.data_args.template)
         self.eval_template = get_eval_template(self.eval_args.lang)
         self.choice_inputs = [
             self.tokenizer.encode(self.eval_template.prefix + ch, add_special_tokens=False)[-1] for ch in CHOICES