From 1653c2243895cca69c6efecc158881ed6e7c4129 Mon Sep 17 00:00:00 2001
From: hiyouga
Date: Wed, 10 Jan 2024 12:37:45 +0800
Subject: [PATCH] improve web ui

---
 src/llmtuner/extras/misc.py            |  4 ++++
 src/llmtuner/webui/components/train.py |  8 ++++----
 src/llmtuner/webui/locales.py          | 16 ++++++++++------
 src/llmtuner/webui/runner.py           | 12 ++++++++----
 4 files changed, 26 insertions(+), 14 deletions(-)

diff --git a/src/llmtuner/extras/misc.py b/src/llmtuner/extras/misc.py
index 9aafd3ff..3455b5e2 100644
--- a/src/llmtuner/extras/misc.py
+++ b/src/llmtuner/extras/misc.py
@@ -79,6 +79,10 @@ def get_current_device() -> torch.device:
     return torch.device(device)
 
 
+def get_device_count() -> int:
+    return torch.cuda.device_count()
+
+
 def get_logits_processor() -> "LogitsProcessorList":
     r"""
     Gets logits processor that removes NaN and Inf logits.
diff --git a/src/llmtuner/webui/components/train.py b/src/llmtuner/webui/components/train.py
index 45ebffd2..5989c421 100644
--- a/src/llmtuner/webui/components/train.py
+++ b/src/llmtuner/webui/components/train.py
@@ -37,7 +37,7 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
         learning_rate = gr.Textbox(value="5e-5")
         num_train_epochs = gr.Textbox(value="3.0")
         max_samples = gr.Textbox(value="100000")
-        compute_type = gr.Radio(choices=["fp16", "bf16"], value="fp16")
+        compute_type = gr.Radio(choices=["fp16", "bf16", "fp32"], value="fp16")
 
     input_elems.update({cutoff_len, learning_rate, num_train_epochs, max_samples, compute_type})
     elem_dict.update(dict(
@@ -68,13 +68,13 @@ def create_train_tab(engine: "Engine") -> Dict[str, "Component"]:
             neftune_alpha = gr.Slider(value=0, minimum=0, maximum=10, step=0.1)
 
             with gr.Column():
-                train_on_prompt = gr.Checkbox(value=False)
+                sft_packing = gr.Checkbox(value=False)
                 upcast_layernorm = gr.Checkbox(value=False)
 
-    input_elems.update({logging_steps, save_steps, warmup_steps, neftune_alpha, train_on_prompt, upcast_layernorm})
+    input_elems.update({logging_steps, save_steps, warmup_steps, neftune_alpha, sft_packing, upcast_layernorm})
     elem_dict.update(dict(
         extra_tab=extra_tab, logging_steps=logging_steps, save_steps=save_steps, warmup_steps=warmup_steps,
-        neftune_alpha=neftune_alpha, train_on_prompt=train_on_prompt, upcast_layernorm=upcast_layernorm
+        neftune_alpha=neftune_alpha, sft_packing=sft_packing, upcast_layernorm=upcast_layernorm
     ))
 
     with gr.Accordion(label="LoRA config", open=False) as lora_tab:
diff --git a/src/llmtuner/webui/locales.py b/src/llmtuner/webui/locales.py
index 3d01a0f2..d6f9d31f 100644
--- a/src/llmtuner/webui/locales.py
+++ b/src/llmtuner/webui/locales.py
@@ -325,14 +325,14 @@ LOCALES = {
             "info": "嵌入向量所添加的噪声大小。"
         }
     },
-    "train_on_prompt": {
+    "sft_packing": {
         "en": {
-            "label": "Train on prompt",
-            "info": "Compute loss on the prompt tokens in supervised fine-tuning."
+            "label": "Pack sequences",
+            "info": "Pack sequences into samples of fixed length in supervised fine-tuning."
         },
         "zh": {
-            "label": "计算输入损失",
-            "info": "在监督微调时候计算输入序列的损失。"
+            "label": "序列打包",
+            "info": "在有监督微调阶段将序列打包为相同长度的样本。"
         }
     },
     "upcast_layernorm": {
@@ -342,7 +342,7 @@ LOCALES = {
         },
         "zh": {
             "label": "缩放归一化层",
-            "info": "将归一化层权重缩放至 32 位浮点数。"
+            "info": "将归一化层权重缩放至 32 位精度。"
         }
     },
     "lora_tab": {
@@ -665,6 +665,10 @@ ALERTS = {
         "en": "Training is unavailable in demo mode, duplicate the space to a private one first.",
         "zh": "展示模式不支持训练,请先复制到私人空间。"
     },
+    "err_device_count": {
+        "en": "Multiple GPUs are not supported yet.",
+        "zh": "尚不支持多 GPU 训练。"
+    },
     "info_aborting": {
         "en": "Aborted, wait for terminating...",
         "zh": "训练中断,正在等待线程结束……"
diff --git a/src/llmtuner/webui/runner.py b/src/llmtuner/webui/runner.py
index a6863c36..374d72a3 100644
--- a/src/llmtuner/webui/runner.py
+++ b/src/llmtuner/webui/runner.py
@@ -12,7 +12,7 @@ from transformers.trainer import TRAINING_ARGS_NAME
 from llmtuner.extras.callbacks import LogCallback
 from llmtuner.extras.constants import TRAINING_STAGES
 from llmtuner.extras.logging import LoggerHandler
-from llmtuner.extras.misc import torch_gc
+from llmtuner.extras.misc import get_device_count, torch_gc
 from llmtuner.train import run_exp
 from llmtuner.webui.common import get_module, get_save_dir, load_config
 from llmtuner.webui.locales import ALERTS
@@ -67,6 +67,9 @@ class Runner:
         if self.demo_mode and (not from_preview):
             return ALERTS["err_demo"][lang]
 
+        if not from_preview and get_device_count() > 1:
+            return ALERTS["err_device_count"][lang]
+
         self.aborted = False
         self.logger_handler.reset()
         self.trainer_callback = LogCallback(self)
@@ -119,16 +122,17 @@ class Runner:
             save_steps=get("train.save_steps"),
             warmup_steps=get("train.warmup_steps"),
             neftune_noise_alpha=get("train.neftune_alpha") or None,
-            train_on_prompt=get("train.train_on_prompt"),
+            sft_packing=get("train.sft_packing"),
            upcast_layernorm=get("train.upcast_layernorm"),
             lora_rank=get("train.lora_rank"),
             lora_dropout=get("train.lora_dropout"),
             lora_target=get("train.lora_target") or get_module(get("top.model_name")),
             additional_target=get("train.additional_target") or None,
             create_new_adapter=get("train.create_new_adapter"),
-            output_dir=get_save_dir(get("top.model_name"), get("top.finetuning_type"), get("train.output_dir"))
+            output_dir=get_save_dir(get("top.model_name"), get("top.finetuning_type"), get("train.output_dir")),
+            fp16=(get("train.compute_type") == "fp16"),
+            bf16=(get("train.compute_type") == "bf16")
         )
-        args[get("train.compute_type")] = True
         args["disable_tqdm"] = True
 
         if TRAINING_STAGES[get("train.training_stage")] in ["rm", "ppo", "dpo"]: