fix #763

2023-09-01 23:13:05 +08:00 · 2023-09-01 23:13:05 +08:00 · 370bdb6e43
parent a9d1fb72f7
commit 370bdb6e43
2 changed files with 3 additions and 1 deletion
--- a/src/llmtuner/dsets/preprocess.py
+++ b/src/llmtuner/dsets/preprocess.py
@@ -34,6 +34,8 @@ def preprocess_dataset(
        # build grouped texts with format `X1 X2 X3 ...`
        if isinstance(getattr(tokenizer, "tokenizer", None), tiktoken.Encoding):
            kwargs = dict(allowed_special="all") # for tiktoken tokenizer (Qwen)
+        else:
+            kwargs = dict(add_special_tokens=True)

        if hasattr(tokenizer, "add_bos_token") and hasattr(tokenizer, "add_eos_token"):
            setattr(tokenizer, "add_bos_token", True) # for LLaMA tokenizer
--- a/src/llmtuner/webui/utils.py
+++ b/src/llmtuner/webui/utils.py
@@ -74,7 +74,7 @@ def can_quantize(finetuning_type: str) -> Dict[str, Any]:
 def gen_cmd(args: Dict[str, Any]) -> str:
    if args.get("do_train", None):
        args["plot_loss"] = True
-    cmd_lines = ["CUDA_VISIBLE_DEVICES=0 python src/train_bash.py"]
+    cmd_lines = ["CUDA_VISIBLE_DEVICES=0 python src/train_bash.py "]
    for k, v in args.items():
        if v is not None and v != "":
            cmd_lines.append("    --{} {} ".format(k, str(v)))