add npu examples
Commit af343034dd (parent ee4752f6d2)
@@ -0,0 +1,18 @@
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  }
+}
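The "auto" entries in this new DeepSpeed config are placeholders that the HuggingFace Trainer's DeepSpeed integration fills in from the training arguments at launch time. Below is a minimal, stdlib-only sketch of how those placeholders resolve; the path and numeric values are illustrative (taken from the recipe and launch scripts later in this commit), not code that this commit runs:

import json

# Values assumed from the YAML recipe and the 4-device launch scripts below.
per_device_batch = 1   # per_device_train_batch_size
grad_accum = 2         # gradient_accumulation_steps
world_size = 4         # NPROC_PER_NODE * NNODES

# Path taken from the "deepspeed:" field in the YAML recipe below.
with open("examples/deepspeed/ds_z0_config.json") as f:
    ds_config = json.load(f)

# The Trainer substitutes the "auto" placeholders roughly like this.
ds_config["train_micro_batch_size_per_gpu"] = per_device_batch
ds_config["gradient_accumulation_steps"] = grad_accum
ds_config["train_batch_size"] = per_device_batch * grad_accum * world_size
ds_config["fp16"]["enabled"] = True   # mirrors "fp16: true" in the recipe
ds_config["bf16"]["enabled"] = False

print(json.dumps(ds_config, indent=2))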
@@ -6,7 +6,7 @@ RANK=0
 MASTER_ADDR=192.168.0.1
 MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
     --nnodes $NNODES \
     --node_rank $RANK \
@@ -1,9 +1,15 @@
 #!/bin/bash
 
 NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
-    --nnodes 1 \
-    --standalone \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
     src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
@@ -1,9 +1,15 @@
 #!/bin/bash
 
 NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
-    --nnodes 1 \
-    --standalone \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
     src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
+
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 torchrun \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
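The new NPU launcher mirrors the GPU scripts: ASCEND_RT_VISIBLE_DEVICES plays the role of CUDA_VISIBLE_DEVICES, and torchrun exports RANK, LOCAL_RANK and WORLD_SIZE to every worker. Here is a minimal sketch of how a worker launched this way could pick its device; it assumes the Ascend torch_npu plugin is installed and is not code from this commit:

import os

import torch
from transformers import is_torch_npu_available

# torchrun sets these environment variables for each worker process.
local_rank = int(os.environ.get("LOCAL_RANK", "0"))
world_size = int(os.environ.get("WORLD_SIZE", "1"))

if is_torch_npu_available():       # Ascend path, requires the torch_npu plugin
    device = torch.device(f"npu:{local_rank}")
elif torch.cuda.is_available():    # CUDA path
    device = torch.device(f"cuda:{local_rank}")
else:
    device = torch.device("cpu")

print(f"worker {local_rank}/{world_size} using {device}")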
@@ -0,0 +1,42 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# ddp
+ddp_timeout: 180000000
+deepspeed: examples/deepspeed/ds_z0_config.json
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
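As a quick sanity check on this recipe, the effective global batch size under the 4-device launch above is per_device_train_batch_size × gradient_accumulation_steps × NPROC_PER_NODE = 1 × 2 × 4 = 8. A short sketch that recomputes it; it assumes PyYAML and reuses the recipe path from the launch script purely for illustration:

import yaml  # PyYAML

NPROC_PER_NODE = 4  # matches the launch scripts above

# Recipe path as referenced by the launch script; adjust if the file lives elsewhere.
with open("examples/lora_multi_gpu/llama3_lora_sft_ds.yaml") as f:
    cfg = yaml.safe_load(f)

effective_batch = (
    cfg["per_device_train_batch_size"]
    * cfg["gradient_accumulation_steps"]
    * NPROC_PER_NODE
)
print(f"effective global batch size: {effective_batch}")  # 1 * 2 * 4 = 8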
@@ -1,9 +1,10 @@
+import os
 from types import MethodType
 from typing import TYPE_CHECKING, Any, Dict
 
 import torch
 from peft import PeftModel
-from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from transformers import PreTrainedModel, PreTrainedTokenizerBase, is_torch_npu_available
 from transformers.integrations import is_deepspeed_zero3_enabled
 
 from ..extras.logging import get_logger
@@ -44,6 +45,10 @@ def patch_config(
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
+    if is_torch_npu_available():
+        use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"]
+        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
+
     configure_attn_implementation(config, model_args)
     configure_rope(config, model_args, is_trainable)
     configure_longlora(config, model_args, is_trainable)
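With this change, NPU JIT compilation is configured once in patch_config: it is off by default and only enabled when the JIT_COMPILE environment variable is "1" or "true" (case-insensitive) on a machine where an NPU is available. A tiny standalone sketch of just the flag parsing, mirroring the logic added above (the helper name is made up for illustration):

import os

def jit_compile_requested() -> bool:
    # Same rule as the patch: default "0", enable on "true" or "1".
    return os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"]

os.environ["JIT_COMPILE"] = "true"
assert jit_compile_requested()

os.environ["JIT_COMPILE"] = "0"
assert not jit_compile_requested()

os.environ.pop("JIT_COMPILE")
assert not jit_compile_requested()  # unset -> disabled by default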
@@ -56,7 +61,7 @@ def patch_config(
         logger.info("Using KV cache for faster generation.")
 
     if getattr(config, "model_type", None) == "qwen":
-        setattr(config, "use_flash_attn", model_args.flash_attn)
+        setattr(config, "use_flash_attn", model_args.flash_attn == "fa2")
         for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
             setattr(config, dtype_name, model_args.compute_dtype == dtype)
 
@@ -22,7 +22,7 @@ def configure_attn_implementation(config: "PretrainedConfig", model_args: "ModelArguments") -> None:
 
     elif model_args.flash_attn == "sdpa":
         if not is_sdpa_available():
-            logger.warning("Torch>=2.1.1 is required for SDPA attention.")
+            logger.warning("torch>=2.1.1 is required for SDPA attention.")
             return
 
         requested_attn_implementation = "sdpa"
@@ -52,4 +52,4 @@ def print_attn_implementation(config: "PretrainedConfig") -> None:
     elif attn_implementation == "sdpa":
         logger.info("Using torch SDPA for faster training and inference.")
     else:
-        logger.info("Using vanilla Attention implementation.")
+        logger.info("Using vanilla attention implementation.")
@@ -1,8 +1,3 @@
-import os
-
-import torch
-from transformers import is_torch_npu_available
-
 from llmtuner.train.tuner import run_exp
 
 
@@ -16,7 +11,4 @@ def _mp_fn(index):
 
 
 if __name__ == "__main__":
-    if is_torch_npu_available():
-        use_jit_compile = os.getenv('JIT_COMPILE', 'False').lower() in ['true', '1']
-        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
     main()