diff --git a/examples/README.md b/examples/README.md
index 9c6d5fb0..727b27c8 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -110,19 +110,20 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l
 #### Supervised Fine-Tuning with Accelerate on Single Node
 
 ```bash
-bash examples/lora_multi_gpu/single_node.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
 ```
 
 #### Supervised Fine-Tuning with Accelerate on Multiple Nodes
 
 ```bash
-bash examples/lora_multi_gpu/multi_node.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
 ```
 
 #### Supervised Fine-Tuning with DeepSpeed ZeRO-3 (Weight Sharding)
 
 ```bash
-bash examples/lora_multi_gpu/ds_zero3.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
 ```
 
 ### LoRA Fine-Tuning on Multiple NPUs
@@ -130,7 +131,7 @@ bash examples/lora_multi_gpu/ds_zero3.sh
 #### Supervised Fine-Tuning with DeepSpeed ZeRO-0
 
 ```bash
-bash examples/lora_multi_npu/ds_zero0.sh
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml
 ```
 
 ### Full-Parameter Fine-Tuning on Multiple GPUs
@@ -138,19 +139,20 @@ bash examples/lora_multi_npu/ds_zero0.sh
 #### Supervised Fine-Tuning with Accelerate on Single Node
 
 ```bash
-bash examples/full_multi_gpu/single_node.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
 ```
 
 #### Supervised Fine-Tuning with Accelerate on Multiple Nodes
 
 ```bash
-bash examples/full_multi_gpu/multi_node.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
+CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
 ```
 
 #### Batch Predicting and Computing BLEU and ROUGE Scores
 
 ```bash
-bash examples/full_multi_gpu/predict.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_predict.yaml
 ```
 
 ### Merging LoRA Adapters and Quantization
diff --git a/examples/README_zh.md b/examples/README_zh.md
index 0ff33398..6974faa9 100644
--- a/examples/README_zh.md
+++ b/examples/README_zh.md
@@ -110,19 +110,20 @@ CUDA_VISIBLE_DEVICES=0 llamafactory-cli train examples/qlora_single_gpu/llama3_l
 #### 使用 Accelerate 进行单节点训练
 
 ```bash
-bash examples/lora_multi_gpu/single_node.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
 ```
 
 #### 使用 Accelerate 进行多节点训练
 
 ```bash
-bash examples/lora_multi_gpu/multi_node.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
+CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft.yaml
 ```
 
 #### 使用 DeepSpeed ZeRO-3 平均分配显存
 
 ```bash
-bash examples/lora_multi_gpu/ds_zero3.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
 ```
 
 ### 多 NPU LoRA 微调
@@ -130,7 +131,7 @@ bash examples/lora_multi_gpu/ds_zero3.sh
 #### 使用 DeepSpeed ZeRO-0 训练
 
 ```bash
-bash examples/lora_multi_npu/ds_zero0.sh
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/lora_multi_npu/llama3_lora_sft_ds.yaml
 ```
 
 ### 多 GPU 全参数微调
@@ -138,19 +139,20 @@ bash examples/lora_multi_npu/ds_zero0.sh
 #### 使用 DeepSpeed 进行单节点训练
 
 ```bash
-bash examples/full_multi_gpu/single_node.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
 ```
 
 #### 使用 DeepSpeed 进行多节点训练
 
 ```bash
-bash examples/full_multi_gpu/multi_node.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=0 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
+CUDA_VISIBLE_DEVICES=0,1,2,3 NNODES=2 RANK=1 MASTER_ADDR=192.168.0.1 MASTER_PORT=29500 llamafactory-cli train examples/full_multi_gpu/llama3_full_sft.yaml
 ```
 
 #### 批量预测并计算 BLEU 和 ROUGE 分数
 
 ```bash
-bash examples/full_multi_gpu/predict.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 llamafactory-cli train examples/full_multi_gpu/llama3_full_predict.yaml
 ```
 
 ### 合并 LoRA 适配器与模型量化
diff --git a/examples/accelerate/master_config.yaml b/examples/accelerate/master_config.yaml
deleted file mode 100644
index a1018313..00000000
--- a/examples/accelerate/master_config.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-compute_environment: LOCAL_MACHINE
-debug: false
-distributed_type: MULTI_GPU
-downcast_bf16: 'no'
-gpu_ids: all
-machine_rank: 0
-main_process_ip: 192.168.0.1
-main_process_port: 29555
-main_training_function: main
-mixed_precision: fp16
-num_machines: 2 # the number of nodes
-num_processes: 8 # the number of GPUs in all nodes
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
diff --git a/examples/accelerate/single_config.yaml b/examples/accelerate/single_config.yaml
deleted file mode 100644
index 97f8c633..00000000
--- a/examples/accelerate/single_config.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-compute_environment: LOCAL_MACHINE
-debug: false
-distributed_type: MULTI_GPU
-downcast_bf16: 'no'
-gpu_ids: all
-machine_rank: 0
-main_training_function: main
-mixed_precision: fp16
-num_machines: 1 # the number of nodes
-num_processes: 4 # the number of GPUs in all nodes
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
diff --git a/examples/accelerate/slave_config.yaml b/examples/accelerate/slave_config.yaml
deleted file mode 100644
index e610fd0e..00000000
--- a/examples/accelerate/slave_config.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-compute_environment: LOCAL_MACHINE
-debug: false
-distributed_type: MULTI_GPU
-downcast_bf16: 'no'
-gpu_ids: all
-machine_rank: 1
-main_process_ip: 192.168.0.1
-main_process_port: 29555
-main_training_function: main
-mixed_precision: fp16
-num_machines: 2 # the number of nodes
-num_processes: 8 # the number of GPUs in all nodes
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
deleted file mode 100644
index 34c038d4..00000000
--- a/examples/full_multi_gpu/multi_node.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-NPROC_PER_NODE=4
-NNODES=2
-RANK=0
-MASTER_ADDR=192.168.0.1
-MASTER_PORT=29500
-
-CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
-    --nproc_per_node $NPROC_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT \
-    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/full_multi_gpu/predict.sh b/examples/full_multi_gpu/predict.sh
deleted file mode 100644
index 2445f444..00000000
--- a/examples/full_multi_gpu/predict.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
-    --config_file examples/accelerate/single_config.yaml \
-    src/train.py examples/full_multi_gpu/llama3_full_predict.yaml
diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh
deleted file mode 100644
index ac29c097..00000000
--- a/examples/full_multi_gpu/single_node.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-NPROC_PER_NODE=4
-NNODES=1
-RANK=0
-MASTER_ADDR=127.0.0.1
-MASTER_PORT=29500
-
-CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
-    --nproc_per_node $NPROC_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT \
-    src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/lora_multi_gpu/ds_zero3.sh b/examples/lora_multi_gpu/ds_zero3.sh
deleted file mode 100644
index 90ea00dd..00000000
--- a/examples/lora_multi_gpu/ds_zero3.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-NPROC_PER_NODE=4
-NNODES=1
-RANK=0
-MASTER_ADDR=127.0.0.1
-MASTER_PORT=29500
-
-CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
-    --nproc_per_node $NPROC_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT \
-    src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
diff --git a/examples/lora_multi_gpu/multi_node.sh b/examples/lora_multi_gpu/multi_node.sh
deleted file mode 100644
index 401fac5f..00000000
--- a/examples/lora_multi_gpu/multi_node.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-# also launch it on slave machine using slave_config.yaml
-
-CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
-    --config_file examples/accelerate/master_config.yaml \
-    src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml
diff --git a/examples/lora_multi_gpu/single_node.sh b/examples/lora_multi_gpu/single_node.sh
deleted file mode 100644
index 885a0e8c..00000000
--- a/examples/lora_multi_gpu/single_node.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-CUDA_VISIBLE_DEVICES=0,1,2,3 accelerate launch \
-    --config_file examples/accelerate/single_config.yaml \
-    src/train.py examples/lora_multi_gpu/llama3_lora_sft.yaml
diff --git a/examples/lora_multi_npu/ds_zero0.sh b/examples/lora_multi_npu/ds_zero0.sh
deleted file mode 100644
index 4ffaa1b0..00000000
--- a/examples/lora_multi_npu/ds_zero0.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-NPROC_PER_NODE=4
-NNODES=1
-RANK=0
-MASTER_ADDR=127.0.0.1
-MASTER_PORT=29500
-
-ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 torchrun \
-    --nproc_per_node $NPROC_PER_NODE \
-    --nnodes $NNODES \
-    --node_rank $RANK \
-    --master_addr $MASTER_ADDR \
-    --master_port $MASTER_PORT \
-    src/train.py examples/lora_multi_npu/llama3_lora_sft_ds.yaml
diff --git a/src/llamafactory/cli.py b/src/llamafactory/cli.py
index f9b63ded..26975f3c 100644
--- a/src/llamafactory/cli.py
+++ b/src/llamafactory/cli.py
@@ -1,9 +1,16 @@
+import os
+import random
+import subprocess
 import sys
 from enum import Enum, unique
 
+from llamafactory import launcher
+
 from .api.app import run_api
 from .chat.chat_model import run_chat
 from .eval.evaluator import run_eval
+from .extras.logging import get_logger
+from .extras.misc import get_device_count
 from .train.tuner import export_model, run_exp
 from .webui.interface import run_web_demo, run_web_ui
 
@@ -37,6 +44,8 @@
     + "-" * 58
 )
 
+logger = get_logger(__name__)
+
 
 @unique
 class Command(str, Enum):
@@ -62,7 +71,32 @@ def main():
     elif command == Command.EXPORT:
         export_model()
     elif command == Command.TRAIN:
-        run_exp()
+        if get_device_count() > 1:
+            nnodes = os.environ.get("NNODES", "1")
+            node_rank = os.environ.get("RANK", "0")
+            nproc_per_node = os.environ.get("NPROC_PER_NODE", str(get_device_count()))
+            master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
+            master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999)))
+            logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port))
+            subprocess.run(
+                [
+                    "torchrun",
+                    "--nnodes",
+                    nnodes,
+                    "--node_rank",
+                    node_rank,
+                    "--nproc_per_node",
+                    nproc_per_node,
+                    "--master_addr",
+                    master_addr,
+                    "--master_port",
+                    master_port,
+                    launcher.__file__,
+                    *sys.argv[1:],
+                ]
+            )
+        else:
+            run_exp()
     elif command == Command.WEBDEMO:
         run_web_demo()
     elif command == Command.WEBUI:
diff --git a/src/llamafactory/extras/misc.py b/src/llamafactory/extras/misc.py
index 0dc07d28..2c7f170c 100644
--- a/src/llamafactory/extras/misc.py
+++ b/src/llamafactory/extras/misc.py
@@ -165,13 +165,15 @@ def get_current_device() -> torch.device:
 
 def get_device_count() -> int:
     r"""
-    Gets the number of available GPU devices.
+    Gets the number of available GPU or NPU devices.
     """
-    if not torch.cuda.is_available():
+    if is_torch_npu_available():
+        return torch.npu.device_count()
+    elif is_torch_cuda_available():
+        return torch.cuda.device_count()
+    else:
         return 0
 
-    return torch.cuda.device_count()
-
 
 def get_logits_processor() -> "LogitsProcessorList":
     r"""
@@ -194,6 +196,13 @@ def infer_optim_dtype(model_dtype: torch.dtype) -> torch.dtype:
         return torch.float32
 
 
+def is_gpu_or_npu_available() -> bool:
+    r"""
+    Checks if the GPU or NPU is available.
+    """
+    return is_torch_npu_available() or is_torch_cuda_available()
+
+
 def has_tokenized_data(path: os.PathLike) -> bool:
     r"""
     Checks if the path has a tokenized dataset.
diff --git a/src/llamafactory/hparams/parser.py b/src/llamafactory/hparams/parser.py
index 6311297e..c6869e4c 100644
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@@ -10,7 +10,6 @@ from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import is_torch_bf16_gpu_available
 from transformers.utils.versions import require_version
 
-from ..extras.constants import TRAINER_CONFIG
 from ..extras.logging import get_logger
 from ..extras.misc import check_dependencies, get_current_device
 from .data_args import DataArguments
@@ -252,10 +251,6 @@
         and can_resume_from_checkpoint
     ):
         last_checkpoint = get_last_checkpoint(training_args.output_dir)
-        files = os.listdir(training_args.output_dir)
-        if last_checkpoint is None and len(files) > 0 and (len(files) != 1 or files[0] != TRAINER_CONFIG):
-            raise ValueError("Output directory already exists and is not empty. Please set `overwrite_output_dir`.")
-
         if last_checkpoint is not None:
             training_args.resume_from_checkpoint = last_checkpoint
             logger.info(
diff --git a/src/llamafactory/launcher.py b/src/llamafactory/launcher.py
new file mode 100644
index 00000000..de154db9
--- /dev/null
+++ b/src/llamafactory/launcher.py
@@ -0,0 +1,9 @@
+from llamafactory.train.tuner import run_exp
+
+
+def launch():
+    run_exp()
+
+
+if __name__ == "__main__":
+    launch()
diff --git a/src/llamafactory/webui/locales.py b/src/llamafactory/webui/locales.py
index bd4a4205..570a8b42 100644
--- a/src/llamafactory/webui/locales.py
+++ b/src/llamafactory/webui/locales.py
@@ -1469,11 +1469,6 @@
         "ru": "Обучение недоступно в демонстрационном режиме, сначала скопируйте пространство в частное.",
         "zh": "展示模式不支持训练,请先复制到私人空间。",
     },
-    "err_device_count": {
-        "en": "Multiple GPUs are not supported yet.",
-        "ru": "Пока не поддерживается множественные GPU.",
-        "zh": "尚不支持多 GPU 训练。",
-    },
     "err_tool_name": {
         "en": "Tool name not found.",
         "ru": "Имя инструмента не найдено.",
diff --git a/src/llamafactory/webui/runner.py b/src/llamafactory/webui/runner.py
index 57595a08..1310b999 100644
--- a/src/llamafactory/webui/runner.py
+++ b/src/llamafactory/webui/runner.py
@@ -6,10 +6,9 @@ from typing import TYPE_CHECKING, Any, Dict, Generator, Optional
 
 import psutil
 from transformers.trainer import TRAINING_ARGS_NAME
-from transformers.utils import is_torch_cuda_available
 
 from ..extras.constants import TRAINING_STAGES
-from ..extras.misc import get_device_count, torch_gc
+from ..extras.misc import is_gpu_or_npu_available, torch_gc
 from ..extras.packages import is_gradio_available
 from .common import get_module, get_save_dir, load_args, load_config, save_args
 from .locales import ALERTS
@@ -64,16 +63,13 @@ class Runner:
         if not from_preview and self.demo_mode:
             return ALERTS["err_demo"][lang]
 
-        if not from_preview and get_device_count() > 1:
-            return ALERTS["err_device_count"][lang]
-
         if do_train:
             stage = TRAINING_STAGES[get("train.training_stage")]
             reward_model = get("train.reward_model")
             if stage == "ppo" and not reward_model:
                 return ALERTS["err_no_reward_model"][lang]
 
-        if not from_preview and not is_torch_cuda_available():
+        if not from_preview and not is_gpu_or_npu_available():
             gr.Warning(ALERTS["warn_no_cuda"][lang])
 
         return ""
@@ -273,7 +269,6 @@ class Runner:
         self.do_train, self.running_data = do_train, data
         args = self._parse_train_args(data) if do_train else self._parse_eval_args(data)
         env = deepcopy(os.environ)
-        env["CUDA_VISIBLE_DEVICES"] = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
         env["LLAMABOARD_ENABLED"] = "1"
         self.trainer = Popen("llamafactory-cli train {}".format(save_cmd(args)), env=env, shell=True)
         yield from self.monitor()
diff --git a/src/llamafactory/webui/utils.py b/src/llamafactory/webui/utils.py
index 3d34f0d2..ceeb9352 100644
--- a/src/llamafactory/webui/utils.py
+++ b/src/llamafactory/webui/utils.py
@@ -42,8 +42,7 @@ def clean_cmd(args: Dict[str, Any]) -> Dict[str, Any]:
 
 
 def gen_cmd(args: Dict[str, Any]) -> str:
-    current_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "0")
-    cmd_lines = ["CUDA_VISIBLE_DEVICES={} llamafactory-cli train ".format(current_devices)]
+    cmd_lines = ["llamafactory-cli train "]
    for k, v in clean_cmd(args).items():
         cmd_lines.append("    --{} {} ".format(k, str(v)))
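For reference, a minimal sketch of what the reworked `llamafactory-cli train` entry point effectively runs after this change: when `get_device_count()` reports more than one GPU/NPU, `cli.py` re-launches training through `torchrun` against `llamafactory/launcher.py`, reading the rendezvous settings from the environment. This is an illustrative approximation, not the tool's literal output; the fallback values `4` and `29500` and the example YAML path are stand-ins.

```bash
# Approximate torchrun command assembled by cli.py when >1 device is visible.
# NNODES, RANK, NPROC_PER_NODE, MASTER_ADDR and MASTER_PORT are taken from the
# environment; in cli.py, NPROC_PER_NODE actually defaults to get_device_count()
# and MASTER_PORT to a random port in 20001-29999 (stand-in defaults shown here).
torchrun \
    --nnodes "${NNODES:-1}" \
    --node_rank "${RANK:-0}" \
    --nproc_per_node "${NPROC_PER_NODE:-4}" \
    --master_addr "${MASTER_ADDR:-127.0.0.1}" \
    --master_port "${MASTER_PORT:-29500}" \
    "$(python -c 'from llamafactory import launcher; print(launcher.__file__)')" \
    examples/lora_multi_gpu/llama3_lora_sft.yaml
```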