diff --git a/examples/deepspeed/ds_z0_config.json b/examples/deepspeed/ds_z0_config.json
new file mode 100644
index 00000000..b7826b20
--- /dev/null
+++ b/examples/deepspeed/ds_z0_config.json
@@ -0,0 +1,18 @@
+{
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "zero_allow_untested_optimizer": true,
+  "fp16": {
+    "enabled": "auto",
+    "loss_scale": 0,
+    "loss_scale_window": 1000,
+    "initial_scale_power": 16,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "bf16": {
+    "enabled": "auto"
+  }
+}
\ No newline at end of file
diff --git a/examples/full_multi_gpu/multi_node.sh b/examples/full_multi_gpu/multi_node.sh
index 962409a1..34c038d4 100644
--- a/examples/full_multi_gpu/multi_node.sh
+++ b/examples/full_multi_gpu/multi_node.sh
@@ -6,7 +6,7 @@ RANK=0
 MASTER_ADDR=192.168.0.1
 MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
     --nnodes $NNODES \
     --node_rank $RANK \
diff --git a/examples/full_multi_gpu/single_node.sh b/examples/full_multi_gpu/single_node.sh
index 97f7af64..ac29c097 100644
--- a/examples/full_multi_gpu/single_node.sh
+++ b/examples/full_multi_gpu/single_node.sh
@@ -1,9 +1,15 @@
 #!/bin/bash
 
 NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
-    --nnodes 1 \
-    --standalone \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
     src/train.py examples/full_multi_gpu/llama3_full_sft.yaml
diff --git a/examples/lora_multi_gpu/ds_zero3.sh b/examples/lora_multi_gpu/ds_zero3.sh
index b8fd2640..90ea00dd 100644
--- a/examples/lora_multi_gpu/ds_zero3.sh
+++ b/examples/lora_multi_gpu/ds_zero3.sh
@@ -1,9 +1,15 @@
 #!/bin/bash
 
 NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
 
-CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run \
+CUDA_VISIBLE_DEVICES=0,1,2,3 torchrun \
     --nproc_per_node $NPROC_PER_NODE \
-    --nnodes 1 \
-    --standalone \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
     src/train.py examples/lora_multi_gpu/llama3_lora_sft_ds.yaml
diff --git a/examples/lora_multi_npu/ds_zero0.sh b/examples/lora_multi_npu/ds_zero0.sh
new file mode 100644
index 00000000..f849c5c9
--- /dev/null
+++ b/examples/lora_multi_npu/ds_zero0.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+NPROC_PER_NODE=4
+NNODES=1
+RANK=0
+MASTER_ADDR=127.0.0.1
+MASTER_PORT=29500
+
+ASCEND_RT_VISIBLE_DEVICES=0,1,2,3 torchrun \
+    --nproc_per_node $NPROC_PER_NODE \
+    --nnodes $NNODES \
+    --node_rank $RANK \
+    --master_addr $MASTER_ADDR \
+    --master_port $MASTER_PORT \
+    src/train.py examples/lora_multi_npu/llama3_lora_sft_ds.yaml
diff --git a/examples/lora_multi_npu/llama3_lora_sft_ds.yaml b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
new file mode 100644
index 00000000..2e9c0558
--- /dev/null
+++ b/examples/lora_multi_npu/llama3_lora_sft_ds.yaml
@@ -0,0 +1,42 @@
+# model
+model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
+
+# method
+stage: sft
+do_train: true
+finetuning_type: lora
+lora_target: q_proj,v_proj
+
+# ddp
+ddp_timeout: 180000000
+deepspeed: examples/deepspeed/ds_z0_config.json
+
+# dataset
+dataset: identity,alpaca_gpt4_en
+template: llama3
+cutoff_len: 1024
+max_samples: 1000
+overwrite_cache: true
+preprocessing_num_workers: 16
+
+# output
+output_dir: saves/llama3-8b/lora/sft
+logging_steps: 10
+save_steps: 500
+plot_loss: true
+overwrite_output_dir: true
+
+# train
+per_device_train_batch_size: 1
+gradient_accumulation_steps: 2
+learning_rate: 0.0001
+num_train_epochs: 3.0
+lr_scheduler_type: cosine
+warmup_steps: 0.1
+fp16: true
+
+# eval
+val_size: 0.1
+per_device_eval_batch_size: 1
+evaluation_strategy: steps
+eval_steps: 500
diff --git a/src/llmtuner/model/patcher.py b/src/llmtuner/model/patcher.py
index fd99bd3b..b28a23d0 100644
--- a/src/llmtuner/model/patcher.py
+++ b/src/llmtuner/model/patcher.py
@@ -1,9 +1,10 @@
+import os
 from types import MethodType
 from typing import TYPE_CHECKING, Any, Dict
 
 import torch
 from peft import PeftModel
-from transformers import PreTrainedModel, PreTrainedTokenizerBase
+from transformers import PreTrainedModel, PreTrainedTokenizerBase, is_torch_npu_available
 from transformers.integrations import is_deepspeed_zero3_enabled
 
 from ..extras.logging import get_logger
@@ -44,6 +45,10 @@ def patch_config(
     if model_args.compute_dtype is None:  # priority: bf16 > fp16 > fp32
         model_args.compute_dtype = infer_optim_dtype(model_dtype=getattr(config, "torch_dtype", None))
 
+    if is_torch_npu_available():
+        use_jit_compile = os.environ.get("JIT_COMPILE", "0").lower() in ["true", "1"]
+        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
+
     configure_attn_implementation(config, model_args)
     configure_rope(config, model_args, is_trainable)
     configure_longlora(config, model_args, is_trainable)
@@ -56,7 +61,7 @@ def patch_config(
         logger.info("Using KV cache for faster generation.")
 
     if getattr(config, "model_type", None) == "qwen":
-        setattr(config, "use_flash_attn", model_args.flash_attn)
+        setattr(config, "use_flash_attn", model_args.flash_attn == "fa2")
         for dtype_name, dtype in [("fp16", torch.float16), ("bf16", torch.bfloat16), ("fp32", torch.float32)]:
             setattr(config, dtype_name, model_args.compute_dtype == dtype)
 
diff --git a/src/llmtuner/model/utils/attention.py b/src/llmtuner/model/utils/attention.py
index f4686489..b52ddc86 100644
--- a/src/llmtuner/model/utils/attention.py
+++ b/src/llmtuner/model/utils/attention.py
@@ -22,7 +22,7 @@ def configure_attn_implementation(config: "PretrainedConfig", model_args: "Model
 
     elif model_args.flash_attn == "sdpa":
         if not is_sdpa_available():
-            logger.warning("Torch>=2.1.1 is required for SDPA attention.")
+            logger.warning("torch>=2.1.1 is required for SDPA attention.")
             return
 
         requested_attn_implementation = "sdpa"
@@ -52,4 +52,4 @@ def print_attn_implementation(config: "PretrainedConfig") -> None:
     elif attn_implementation == "sdpa":
         logger.info("Using torch SDPA for faster training and inference.")
     else:
-        logger.info("Using vanilla Attention implementation.")
+        logger.info("Using vanilla attention implementation.")
diff --git a/src/train.py b/src/train.py
index 4cc21194..6a3212cb 100644
--- a/src/train.py
+++ b/src/train.py
@@ -1,8 +1,3 @@
-import os
-
-import torch
-from transformers import is_torch_npu_available
-
 from llmtuner.train.tuner import run_exp
 
 
@@ -16,7 +11,4 @@ def _mp_fn(index):
 
 
 if __name__ == "__main__":
-    if is_torch_npu_available():
-        use_jit_compile = os.getenv('JIT_COMPILE', 'False').lower() in ['true', '1']
-        torch.npu.set_compile_mode(jit_compile=use_jit_compile)
     main()
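
Usage sketch (not part of the patch): assuming torch-npu and the Ascend CANN toolkit are installed and the command is run from the repository root, the new ZeRO-0 LoRA example added above could be launched as shown below. The JIT_COMPILE value is illustrative; the new check in patch_config() treats anything other than "true"/"1" as disabled.

# Launch LoRA SFT with DeepSpeed ZeRO-0 on 4 Ascend NPUs (script and env var name taken from the diff above).
# JIT_COMPILE=0 keeps NPU JIT compilation disabled, matching the default read in patch_config();
# running from the repo root is assumed, since the script references src/train.py relatively.
JIT_COMPILE=0 bash examples/lora_multi_npu/ds_zero0.sh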