diff --git a/README.md b/README.md
index 522c0d82..80bddd00 100644
--- a/README.md
+++ b/README.md
@@ -324,7 +324,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 ```
 
 > [!WARNING]
-> Use `--per_device_train_batch_size=1` for LLaMA-2 models in fp16 training.
+> Use `--per_device_train_batch_size=1` for LLaMA-2 models in fp16 PPO training.
 
 #### DPO Training
 
diff --git a/README_zh.md b/README_zh.md
index d2ec3097..be71d344 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -324,7 +324,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 ```
 
 > [!WARNING]
-> If training LLaMA-2 models in fp16 precision, use `--per_device_eval_batch_size=1`.
+> For PPO training of LLaMA-2 models in fp16 precision, use `--per_device_train_batch_size=1`.
 
 #### DPO Training
 
@@ -499,7 +499,7 @@ CUDA_VISIBLE_DEVICES=0 python src/train_bash.py \
 ```
 
 > [!WARNING]
-> If running inference with LLaMA-2 models in fp16 precision, use `--per_device_eval_batch_size=1`.
+> For prediction with LLaMA-2 models in fp16 precision, use `--per_device_eval_batch_size=1`.
 
 > [!TIP]
 > We recommend using `--per_device_eval_batch_size=1` and `--max_target_length 128` for prediction of quantized models.
diff --git a/src/llmtuner/data/template.py b/src/llmtuner/data/template.py
index b852eb3d..2e1f31c7 100644
--- a/src/llmtuner/data/template.py
+++ b/src/llmtuner/data/template.py
@@ -354,6 +354,35 @@ register_template(
         "\n",
         "{{system}}"
     ],
+    prompt=[
+        {"token": "<|user|>"},
+        "\n",
+        "{{query}}",
+        {"token": "<|assistant|>"},
+        "\n"  # add an extra newline to avoid an error in ChatGLM's process_response method
+    ],
+    system=(
+        "You are ChatGLM3, a large language model trained by Zhipu.AI. "
+        "Follow the user's instructions carefully. Respond using markdown."
+    ),
+    sep=[],
+    stop_words=[
+        "<|user|>",
+        "<|observation|>"
+    ],
+    efficient_eos=True
+)
+
+
+register_template(
+    name="chatglm3_raw",  # the raw template for tool tuning
+    prefix=[
+        {"token": "[gMASK]"},
+        {"token": "sop"},
+        {"token": "<|system|>"},
+        "\n",
+        "{{system}}"
+    ],
     prompt=[
         {"token": "<|user|>"},
         "\n",
diff --git a/src/llmtuner/model/adapter.py b/src/llmtuner/model/adapter.py
index b7fe78a0..9a3fb3f6 100644
--- a/src/llmtuner/model/adapter.py
+++ b/src/llmtuner/model/adapter.py
@@ -65,7 +65,12 @@ def init_adapter(
         checkpoint_to_resume = None
 
         if model_args.checkpoint_dir is not None:
-            if is_trainable and finetuning_args.resume_lora_training:
+            is_mergeable = True
+            if getattr(model, "quantization_method", None) == "gptq":
+                assert len(model_args.checkpoint_dir) == 1, "GPTQ quantized model only accepts a single checkpoint."
+                is_mergeable = False
+
+            if (is_trainable and finetuning_args.resume_lora_training) or (not is_mergeable):
                 checkpoints_to_merge, checkpoint_to_resume = model_args.checkpoint_dir[:-1], model_args.checkpoint_dir[-1]
             else:
                 checkpoints_to_merge = model_args.checkpoint_dir
diff --git a/src/llmtuner/train/tuner.py b/src/llmtuner/train/tuner.py
index 361dafb8..5833ee9e 100644
--- a/src/llmtuner/train/tuner.py
+++ b/src/llmtuner/train/tuner.py
@@ -37,8 +37,13 @@ def run_exp(args: Optional[Dict[str, Any]] = None, callbacks: Optional[List["Tra
 def export_model(args: Optional[Dict[str, Any]] = None, max_shard_size: Optional[str] = "10GB"):
     model_args, _, finetuning_args, _ = get_infer_args(args)
     model, tokenizer = load_model_and_tokenizer(model_args, finetuning_args)
+
+    if getattr(model, "quantization_method", None) == "gptq":
+        raise ValueError("Cannot export a GPTQ quantized model.")
+
     model.config.use_cache = True
     model.save_pretrained(finetuning_args.export_dir, max_shard_size=max_shard_size)
+
     try:
         tokenizer.padding_side = "left"  # restore padding side
         tokenizer.init_kwargs["padding_side"] = "left"
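For readers skimming the template.py hunk: the rough illustration below shows the conversation layout the new `chatglm3` template describes. This is an assumption-laden sketch for readability, not the repository's rendering code; each `{"token": ...}` entry actually maps to a single special token id rather than literal text.

```python
# Sketch of the turn layout encoded by the chatglm3 template above.
# The special tokens are shown as literal strings for readability only.
system = (
    "You are ChatGLM3, a large language model trained by Zhipu.AI. "
    "Follow the user's instructions carefully. Respond using markdown."
)
query = "Hello!"

rendered = (
    "[gMASK]" + "sop" + "<|system|>\n" + system  # prefix
    + "<|user|>\n" + query                       # prompt (user turn)
    + "<|assistant|>\n"                          # extra newline kept for ChatGLM's process_response
)
print(rendered)
```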
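The adapter.py hunk is easiest to read as a checkpoint-splitting rule: GPTQ bases keep packed integer weights, so LoRA deltas cannot be merged into them and the single adapter checkpoint must stay separate. Here is a minimal standalone sketch of that rule under the semantics visible in the diff; `split_checkpoints` and its signature are hypothetical, not repository code.

```python
from typing import List, Optional, Tuple

def split_checkpoints(
    checkpoint_dir: List[str],
    is_trainable: bool,
    resume_lora_training: bool,
    quantization_method: Optional[str] = None,
) -> Tuple[List[str], Optional[str]]:
    """Hypothetical helper mirroring the new logic in init_adapter."""
    is_mergeable = True
    if quantization_method == "gptq":
        # LoRA weights cannot be merged into GPTQ-packed layers, so only a
        # single, unmerged adapter checkpoint is supported.
        assert len(checkpoint_dir) == 1, "GPTQ quantized model only accepts a single checkpoint."
        is_mergeable = False

    if (is_trainable and resume_lora_training) or (not is_mergeable):
        # Merge every checkpoint except the last; the last one is resumed
        # (or, on a GPTQ base, loaded as a standalone adapter).
        return checkpoint_dir[:-1], checkpoint_dir[-1]
    return checkpoint_dir, None

# On a GPTQ base, the sole checkpoint is never merged:
assert split_checkpoints(
    ["ckpt"], is_trainable=False, resume_lora_training=False,
    quantization_method="gptq",
) == ([], "ckpt")
```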
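Likewise, the tuner.py guard simply refuses to export GPTQ bases, since their packed weights cannot be saved as a merged plain-weight model. A hedged sketch of the behavior follows, using a stand-in object instead of a real model; in the actual code path, `quantization_method` is set on the model by the transformers GPTQ integration.

```python
# Stand-in object demonstrating the attribute the guard inspects.
class _FakeGPTQModel:
    quantization_method = "gptq"

def _check_exportable(model) -> None:
    # Same check as the new guard in export_model.
    if getattr(model, "quantization_method", None) == "gptq":
        raise ValueError("Cannot export a GPTQ quantized model.")

try:
    _check_exportable(_FakeGPTQModel())
except ValueError as err:
    print(err)  # -> Cannot export a GPTQ quantized model.
```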