diff --git a/examples/deepspeed/ds_z0_config.json b/examples/deepspeed/ds_z0_config.json
new file mode 100644
index 00000000..b7826b20
--- /dev/null
+++ b/examples/deepspeed/ds_z0_config.json
@@ -0,0 +1,18 @@
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  }
+}
\ No newline at end of file
diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
index 962409a1..34c038d4 100644
--- a/examples/full_multi_gpu/multi_node.sh
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -6,7 +6,7 @@ RANK=0
 MASTER_ADDR=192.168.0.1
 MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
     --nnodes $NNODES \
     --node_rank $RANK \
diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh
index 97f7af64..ac29c097 100644
--- a/examples/full_multi_gpu/single_node.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -1,9 +1,15 @@
 #!/bin/bash
 
 NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
-    --nnodes 1 \
-    --standalone \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
     src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/lora_multi_gpu/ds_zero3.sh b/examples/lora_multi_gpu/ds_zero3.sh
index b8fd2640..90ea00dd 100644
--- a/examples/lora_multi_gpu/ds_zero3.sh
+++ b/examples/lora_multi_gpu/ds_zero3.sh
@@ -1,9 +1,15 @@
 #!/bin/bash
 
 NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
-    --nnodes 1 \
-    --standalone \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
     src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
diff --git a/examples/lora_multi_npu/ds_zero0.sh b/examples/lora_multi_npu/ds_zero0.sh
new file mode 100644
index 00000000..f849c5c9
--- /dev/null
+++ b/examples/lora_multi_npu/ds_zero0.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
+
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 torchrun \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    src/train.py examples/lora_multi_npu/llama3_lora_sft_ds.yaml
diff --git a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
new file mode 100644
index 00000000..2e9c0558
--- /dev/null
+++ b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
@@ -0,0 +1,42 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# ddp
+ddp_timeout: 180000000
+deepspeed: examples/deepspeed/ds_z0_config.json
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index fd99bd3b..b28a23d0 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -1,9 +1,10 @@
+import os
 from types import MethodType
 from typing import TYPE_CHECKING, Any, Dict
 
 import torch
 from peft import PeftModel
-from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from transformers import PreTrainedModel, PreTrainedTokenizerBase, is_torch_npu_available
 from transformers.integrations import is_deepspeed_zero3_enabled
 
 from ..extras.logging import get_logger
@@ -44,6 +45,10 @@ def patch_config(
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
+    if is_torch_npu_available():
+        use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"]
+        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
+
     configure_attn_implementation(config, model_args)
     configure_rope(config, model_args, is_trainable)
     configure_longlora(config, model_args, is_trainable)
@@ -56,7 +61,7 @@ def patch_config(
         logger.info("Using KV cache for faster generation.")
 
     if getattr(config, "model_type", None) == "qwen":
-        setattr(config, "use_flash_attn", model_args.flash_attn)
+        setattr(config, "use_flash_attn", model_args.flash_attn == "fa2")
         for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
             setattr(config, dtype_name, model_args.compute_dtype == dtype)
 
diff --git a/src/llmtuner/model/utils/attention.py b/src/llmtuner/model/utils/attention.py
index f4686489..b52ddc86 100644
--- a/src/llmtuner/model/utils/attention.py
+++ b/src/llmtuner/model/utils/attention.py
@@ -22,7 +22,7 @@ def configure_attn_implementation(config: "PretrainedConfig", model_args: "Model
 
     elif model_args.flash_attn == "sdpa":
         if not is_sdpa_available():
-            logger.warning("Torch>=2.1.1 is required for SDPA attention.")
+            logger.warning("torch>=2.1.1 is required for SDPA attention.")
             return
 
         requested_attn_implementation = "sdpa"
@@ -52,4 +52,4 @@ def print_attn_implementation(config: "PretrainedConfig") -> None:
     elif attn_implementation == "sdpa":
         logger.info("Using torch SDPA for faster training and inference.")
     else:
-        logger.info("Using vanilla Attention implementation.")
+        logger.info("Using vanilla attention implementation.")
diff --git a/src/train.py b/src/train.py
index 4cc21194..6a3212cb 100644
--- a/src/train.py
+++ b/src/train.py
@@ -1,8 +1,3 @@
-import os
-
-import torch
-from transformers import is_torch_npu_available
-
 from llmtuner.train.tuner import run_exp
 
 
@@ -16,7 +11,4 @@ def _mp_fn(index):
 
 
 if __name__ == "__main__":
-    if is_torch_npu_available():
-        use_jit_compile = os.getenv('JIT_COMPILE', 'False').lower() in ['true', '1']
-        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
     main()
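
Usage sketch (not part of the patch): assuming torch-npu and the Ascend CANN toolkit are installed and the command is run from the repository root, the new ZeRO-0 LoRA example added above could be launched as shown below. The JIT_COMPILE value is illustrative; the new check in patch_config() treats anything other than "true"/"1" as disabled.

# Launch LoRA SFT with DeepSpeed ZeRO-0 on 4 Ascend NPUs (script and env var name taken from the diff above).
# JIT_COMPILE=0 keeps NPU JIT compilation disabled, matching the default read in patch_config();
# running from the repo root is assumed, since the script references src/train.py relatively.
JIT_COMPILE=0 bash examples/lora_multi_npu/ds_zero0.sh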