forked from p04798526/LLaMA-Factory-Mirror
tiny fix about badam
This commit is contained in:
parent efb81b25ec
commit 095fab58d3
@@ -34,7 +34,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install .[torch,dev]
+          python -m pip install ".[torch,dev]"
 
       - name: Check quality
         run: |
@@ -160,8 +160,8 @@ cython_debug/
 .idea/
 
 # custom .gitignore
-user.config
-saves/
 cache/
-wandb
-ds_badam_exp
+config/
+saves/
+output/
+wandb/
Binary file not shown. (Before: 198 KiB, After: 141 KiB)
Binary file not shown. (Before: 194 KiB, After: 149 KiB)
@@ -6,6 +6,7 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_badam: true
+badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
@@ -32,7 +33,6 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-pure_bf16: true
 
 ### eval
 val_size: 0.1
@@ -6,9 +6,11 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_badam: true
+badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
+deepspeed: examples/deepspeed/ds_z3_config.json
 
 ### dataset
 dataset: identity,alpaca_en_demo
@@ -28,7 +30,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 1.0e-6
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
@@ -1,37 +0,0 @@
-#!/bin/bash
-export CUDA_VISIBLE_DEVICES=0
-
-cd ../../..
-
-llamafactory-cli train \
-    --stage sft \
-    --do_train True \
-    --model_name_or_path meta-llama/Llama-2-13b-hf \
-    --preprocessing_num_workers 16 \
-    --finetuning_type full \
-    --template default \
-    --flash_attn auto \
-    --dataset_dir data \
-    --dataset alpaca_en_demo \
-    --cutoff_len 1024 \
-    --learning_rate 1e-6 \
-    --num_train_epochs 3.0 \
-    --max_samples 100000 \
-    --per_device_train_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --max_grad_norm 1.0 \
-    --logging_steps 5 \
-    --save_steps 100 \
-    --warmup_steps 0 \
-    --optim adamw_torch \
-    --packing False \
-    --report_to none \
-    --use_badam True \
-    --output_dir saves/LLaMA2-13B/full/BAdam \
-    --plot_loss True \
-    --ddp_timeout 180000000 \
-    --include_num_input_tokens_seen True \
-    --badam_mode layer \
-    --badam_switch_mode ascending \
-    --badam_switch_interval 50
@@ -1,39 +0,0 @@
-#!/bin/bash
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-
-cd ../../..
-
-llamafactory-cli train \
-    --stage sft \
-    --do_train True \
-    --model_name_or_path meta-llama/Llama-2-13b-hf \
-    --preprocessing_num_workers 16 \
-    --finetuning_type full \
-    --template default \
-    --flash_attn auto \
-    --dataset_dir data \
-    --dataset alpaca_en_demo \
-    --cutoff_len 1024 \
-    --learning_rate 1e-6 \
-    --num_train_epochs 3.0 \
-    --max_samples 100000 \
-    --per_device_train_batch_size 8 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --max_grad_norm 1.0 \
-    --logging_steps 5 \
-    --save_steps 100 \
-    --warmup_steps 0 \
-    --optim adamw_torch \
-    --packing False \
-    --report_to none \
-    --use_badam True \
-    --output_dir saves/LLaMA2-13B/full/BAdam \
-    --fp16 True \
-    --plot_loss True \
-    --ddp_timeout 180000000 \
-    --include_num_input_tokens_seen True \
-    --badam_mode layer \
-    --badam_switch_mode ascending \
-    --badam_switch_interval 50 \
-    --deepspeed cache/ds_z3_config.json
setup.py
@@ -41,7 +41,7 @@ extra_require = {
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "vllm": ["vllm>=0.4.3"],
     "galore": ["galore-torch"],
-    "badam": ["badam"],
+    "badam": ["badam>=1.2.1"],
     "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"],
     "awq": ["autoawq"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
@@ -121,7 +121,7 @@ def _check_extra_dependencies(
         require_version("galore_torch", "To fix: pip install galore_torch")
 
     if finetuning_args.use_badam:
-        require_version("badam", "To fix: pip install badam")
+        require_version("badam>=1.2.1", "To fix: pip install badam>=1.2.1")
 
     if finetuning_args.plot_loss:
         require_version("matplotlib", "To fix: pip install matplotlib")
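As background for the pin above: require_version is the helper from transformers.utils.versions that checks an installed distribution against a pip-style specifier and raises with the given hint otherwise. A minimal sketch of the same guard in isolation (the use_badam flag is an illustrative stand-in for finetuning_args.use_badam):

# Minimal sketch of the version-guard pattern used above. require_version()
# compares the installed package against a pip-style specifier and raises
# with the provided hint if it is missing or too old.
from transformers.utils.versions import require_version

use_badam = True  # illustrative stand-in for finetuning_args.use_badam

if use_badam:
    # Same pin as the diff: BAdam >= 1.2.1 provides the ds_zero3_enabled support.
    require_version("badam>=1.2.1", "To fix: pip install badam>=1.2.1")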
@@ -214,15 +214,15 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
 
     if (
         finetuning_args.use_badam
-        and training_args.parallel_mode.value == "distributed"
+        and training_args.parallel_mode == ParallelMode.DISTRIBUTED
     ):
         if finetuning_args.badam_mode == "ratio":
-            raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer")
-        if finetuning_args.badam_mode == "layer" and (not is_deepspeed_zero3_enabled()):
-            raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage.")
+            raise ValueError("Radio-based BAdam does not yet support distributed training, use layer-wise BAdam.")
+        elif not is_deepspeed_zero3_enabled():
+            raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO-3 training.")
 
-    if (finetuning_args.use_galore) and training_args.deepspeed is not None:
-        raise ValueError("GaLore are incompatible with DeepSpeed yet.")
+    if finetuning_args.use_galore and training_args.deepspeed is not None:
+        raise ValueError("GaLore is incompatible with DeepSpeed yet.")
 
     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
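The rule enforced by the rewritten block can be restated as a small standalone check; the sketch below is illustrative only, with plain booleans and strings standing in for the real finetuning_args/training_args objects:

# Illustrative restatement of the BAdam compatibility check above: under
# distributed training, only layer-wise BAdam is supported, and only when
# DeepSpeed ZeRO-3 is enabled.
def check_badam_compat(use_badam: bool, badam_mode: str, distributed: bool, zero3: bool) -> None:
    if not (use_badam and distributed):
        return  # single-device runs are unrestricted
    if badam_mode == "ratio":
        raise ValueError("Ratio-based BAdam does not yet support distributed training, use layer-wise BAdam.")
    if not zero3:
        raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO-3 training.")


# Passes: layer-wise BAdam under DeepSpeed ZeRO-3; ratio mode under plain DDP would raise.
check_badam_compat(True, "layer", distributed=True, zero3=True)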
@@ -96,7 +96,8 @@ class CustomDPOTrainer(DPOTrainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
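The same wiring recurs in each of the trainer hunks below: re-import in isort order, patch the accelerator's gradient clipping with BAdam's old-version implementation, and register the BAdam callback. A minimal illustrative sketch of that pattern on a generic Trainer subclass (the class name and the use_badam argument are placeholders, not the repository's exact signatures):

# Minimal sketch of the BAdam wiring shared by the trainer subclasses in this
# commit. "BAdamEnabledTrainer" and the "use_badam" argument are illustrative
# placeholders.
from types import MethodType

from transformers import Trainer


class BAdamEnabledTrainer(Trainer):
    def __init__(self, use_badam: bool = False, **kwargs):
        super().__init__(**kwargs)
        if use_badam:
            from badam import BAdamCallback, clip_grad_norm_old_version

            # Route gradient clipping through BAdam's compatible implementation.
            self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
            # Hook BAdam into the HF Trainer loop via its callback.
            self.callback_handler.add_callback(BAdamCallback)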
@@ -91,7 +91,8 @@ class CustomKTOTrainer(KTOTrainer):
             self.ref_model.eval()
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -166,7 +166,8 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
             self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True)
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -48,7 +48,8 @@ class CustomTrainer(Trainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -72,7 +72,8 @@ class PairwiseTrainer(Trainer):
         self.processor = processor
         self.can_return_loss = True  # override property to return eval_loss
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -56,7 +56,8 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -23,6 +23,7 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
 import torch
 from peft import PeftModel
 from transformers import Trainer
+from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.optimization import get_scheduler
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 from transformers.trainer_pt_utils import get_parameter_names
@@ -372,9 +373,6 @@ def _create_badam_optimizer(
         dict(params=decay_params, weight_decay=training_args.weight_decay),
     ]
 
-    from transformers.integrations import is_deepspeed_zero3_enabled
-    ds_zero3_enabled = is_deepspeed_zero3_enabled()
-
     if finetuning_args.badam_mode == "layer":
         from badam import BlockOptimizer
 
@@ -387,7 +385,7 @@
             start_block=finetuning_args.badam_start_block,
             switch_mode=finetuning_args.badam_switch_mode,
             verbose=finetuning_args.badam_verbose,
-            ds_zero3_enabled=ds_zero3_enabled
+            ds_zero3_enabled=is_deepspeed_zero3_enabled(),
         )
         logger.info(
             f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, "
@@ -398,7 +396,6 @@
     elif finetuning_args.badam_mode == "ratio":
         from badam import BlockOptimizerRatio
 
-        assert not ds_zero3_enabled, "BAdam with ratio-based update does not support Deepspeed ZeRO-3 yet, use layer-wise update instead: --badam_mode layer."
         assert finetuning_args.badam_update_ratio > 1e-6
         optimizer = BlockOptimizerRatio(
             param_groups=param_groups,
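For reference, the layer-wise branch above ends up constructing BAdam's BlockOptimizer roughly as sketched below; this is an illustrative reconstruction based on the keyword arguments visible in the hunks and BAdam's documented constructor, with the model, base optimizer, and hyperparameter values as stand-ins rather than the repository's exact code:

# Illustrative sketch of the layer-wise BAdam setup fed by the hunks above.
# The AdamW base optimizer and the hyperparameter values are placeholders;
# ds_zero3_enabled mirrors the change from a cached flag to a direct
# is_deepspeed_zero3_enabled() call.
import torch
from badam import BlockOptimizer
from transformers.integrations import is_deepspeed_zero3_enabled


def build_badam_optimizer(model: torch.nn.Module) -> "BlockOptimizer":
    base_optimizer = torch.optim.AdamW(model.parameters(), lr=1.0e-4)
    return BlockOptimizer(
        base_optimizer=base_optimizer,
        named_parameters_list=list(model.named_parameters()),
        block_prefix_list=None,  # let BAdam infer the transformer blocks
        switch_block_every=50,  # badam_switch_interval
        start_block=None,  # badam_start_block
        switch_mode="ascending",  # badam_switch_mode
        verbose=2,  # badam_verbose
        ds_zero3_enabled=is_deepspeed_zero3_enabled(),  # requires badam>=1.2.1
    )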