fix qwen tokenizer #361

This commit is contained in:
hiyouga 2023-08-05 17:06:05 +08:00
parent 1afa51c2fa
commit 7f18d2a335
1 changed file with 6 additions and 2 deletions

View File

@@ -98,12 +98,16 @@ class Template:
r"""
Converts context to token ids.
"""
if hasattr(tokenizer, "tokenizer"): # for tiktoken tokenizer (Qwen)
kwargs = dict(allowed_special="all")
else:
kwargs = dict(add_special_tokens=False)
token_ids = []
for elem in context:
if isinstance(elem, str):
elem = elem.replace("{{query}}", query, 1)
elem = elem.replace("<mask>", "[MASK]")
token_ids = token_ids + tokenizer.encode(elem, add_special_tokens=False)
token_ids = token_ids + tokenizer.encode(elem, **kwargs)
elif isinstance(elem, dict):
token_ids = token_ids + [tokenizer.convert_tokens_to_ids(elem.get("token"))]
else: