forked from p04798526/LLaMA-Factory-Mirror

commit 095fab58d3
parent efb81b25ec

    tiny fix about badam
@@ -34,7 +34,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install .[torch,dev]
+          python -m pip install ".[torch,dev]"
 
       - name: Check quality
         run: |
@@ -160,8 +160,8 @@ cython_debug/
 .idea/
 
 # custom .gitignore
-user.config
-saves/
 cache/
-wandb
-ds_badam_exp
+config/
+saves/
+output/
+wandb/
Binary file not shown. (Size: 198 KiB before, 141 KiB after)
Binary file not shown. (Size: 194 KiB before, 149 KiB after)
@@ -6,6 +6,7 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_badam: true
+badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
@@ -32,7 +33,6 @@ learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
-pure_bf16: true
 
 ### eval
 val_size: 0.1
@@ -6,9 +6,11 @@ stage: sft
 do_train: true
 finetuning_type: full
 use_badam: true
+badam_mode: layer
 badam_switch_mode: ascending
 badam_switch_interval: 50
 badam_verbose: 2
+deepspeed: examples/deepspeed/ds_z3_config.json
 
 ### dataset
 dataset: identity,alpaca_en_demo
@@ -28,7 +30,7 @@ overwrite_output_dir: true
 ### train
 per_device_train_batch_size: 1
 gradient_accumulation_steps: 8
-learning_rate: 1.0e-6
+learning_rate: 1.0e-4
 num_train_epochs: 3.0
 lr_scheduler_type: cosine
 warmup_ratio: 0.1
@@ -1,37 +0,0 @@
-#!/bin/bash
-export CUDA_VISIBLE_DEVICES=0
-
-cd ../../..
-
-llamafactory-cli train \
-    --stage sft \
-    --do_train True \
-    --model_name_or_path meta-llama/Llama-2-13b-hf \
-    --preprocessing_num_workers 16 \
-    --finetuning_type full \
-    --template default \
-    --flash_attn auto \
-    --dataset_dir data \
-    --dataset alpaca_en_demo \
-    --cutoff_len 1024 \
-    --learning_rate 1e-6 \
-    --num_train_epochs 3.0 \
-    --max_samples 100000 \
-    --per_device_train_batch_size 1 \
-    --gradient_accumulation_steps 8 \
-    --lr_scheduler_type cosine \
-    --max_grad_norm 1.0 \
-    --logging_steps 5 \
-    --save_steps 100 \
-    --warmup_steps 0 \
-    --optim adamw_torch \
-    --packing False \
-    --report_to none \
-    --use_badam True \
-    --output_dir saves/LLaMA2-13B/full/BAdam \
-    --plot_loss True \
-    --ddp_timeout 180000000 \
-    --include_num_input_tokens_seen True \
-    --badam_mode layer \
-    --badam_switch_mode ascending \
-    --badam_switch_interval 50
@@ -1,39 +0,0 @@
-#!/bin/bash
-export CUDA_VISIBLE_DEVICES=0,1,2,3
-
-cd ../../..
-
-llamafactory-cli train \
-    --stage sft \
-    --do_train True \
-    --model_name_or_path meta-llama/Llama-2-13b-hf \
-    --preprocessing_num_workers 16 \
-    --finetuning_type full \
-    --template default \
-    --flash_attn auto \
-    --dataset_dir data \
-    --dataset alpaca_en_demo \
-    --cutoff_len 1024 \
-    --learning_rate 1e-6 \
-    --num_train_epochs 3.0 \
-    --max_samples 100000 \
-    --per_device_train_batch_size 8 \
-    --gradient_accumulation_steps 2 \
-    --lr_scheduler_type cosine \
-    --max_grad_norm 1.0 \
-    --logging_steps 5 \
-    --save_steps 100 \
-    --warmup_steps 0 \
-    --optim adamw_torch \
-    --packing False \
-    --report_to none \
-    --use_badam True \
-    --output_dir saves/LLaMA2-13B/full/BAdam \
-    --fp16 True \
-    --plot_loss True \
-    --ddp_timeout 180000000 \
-    --include_num_input_tokens_seen True \
-    --badam_mode layer \
-    --badam_switch_mode ascending \
-    --badam_switch_interval 50 \
-    --deepspeed cache/ds_z3_config.json
setup.py (2 changes)
@@ -41,7 +41,7 @@ extra_require = {
     "bitsandbytes": ["bitsandbytes>=0.39.0"],
     "vllm": ["vllm>=0.4.3"],
     "galore": ["galore-torch"],
-    "badam": ["badam"],
+    "badam": ["badam>=1.2.1"],
     "gptq": ["optimum>=1.16.0", "auto-gptq>=0.5.0"],
     "awq": ["autoawq"],
     "aqlm": ["aqlm[gpu]>=1.1.0"],
@@ -121,7 +121,7 @@ def _check_extra_dependencies(
         require_version("galore_torch", "To fix: pip install galore_torch")
 
     if finetuning_args.use_badam:
-        require_version("badam", "To fix: pip install badam")
+        require_version("badam>=1.2.1", "To fix: pip install badam>=1.2.1")
 
     if finetuning_args.plot_loss:
         require_version("matplotlib", "To fix: pip install matplotlib")
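A short aside on the pinned check above, as a minimal sketch rather than part of the patch. Assuming require_version here is the helper from transformers.utils.versions (the import is outside this hunk), it parses the pip-style requirement and raises an error carrying the hint when the package is missing or older than the pin:

# Sketch only; assumes require_version is transformers.utils.versions.require_version.
from transformers.utils.versions import require_version

try:
    require_version("badam>=1.2.1", "To fix: pip install badam>=1.2.1")
except Exception as err:  # raised when badam is absent or older than 1.2.1
    print(err)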
@@ -214,15 +214,15 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
 
     if (
         finetuning_args.use_badam
-        and training_args.parallel_mode.value == "distributed"
+        and training_args.parallel_mode == ParallelMode.DISTRIBUTED
     ):
         if finetuning_args.badam_mode == "ratio":
-            raise ValueError("Ratio-wise BAdam does not yet support distributed training, use layer-wise BAdam: --badam_mode layer")
-        if finetuning_args.badam_mode == "layer" and (not is_deepspeed_zero3_enabled()):
-            raise ValueError(f"Layer-wise BAdam only supports DeepSpeed ZeRO 3 stage.")
+            raise ValueError("Radio-based BAdam does not yet support distributed training, use layer-wise BAdam.")
+        elif not is_deepspeed_zero3_enabled():
+            raise ValueError("Layer-wise BAdam only supports DeepSpeed ZeRO-3 training.")
 
-    if (finetuning_args.use_galore) and training_args.deepspeed is not None:
-        raise ValueError("GaLore are incompatible with DeepSpeed yet.")
+    if finetuning_args.use_galore and training_args.deepspeed is not None:
+        raise ValueError("GaLore is incompatible with DeepSpeed yet.")
 
     if model_args.infer_backend == "vllm":
         raise ValueError("vLLM backend is only available for API, CLI and Web.")
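A brief illustration of the parallel_mode comparison above (not part of the patch): ParallelMode is the enum exposed by transformers.training_args, so comparing against the enum member is equivalent to the old string comparison but fails loudly on a mistyped member name instead of silently evaluating to False.

# Illustration only; ParallelMode comes from transformers.training_args.
from transformers.training_args import ParallelMode

mode = ParallelMode.DISTRIBUTED
print(mode.value == "distributed")       # True: the old string comparison
print(mode == ParallelMode.DISTRIBUTED)  # True: the new enum comparison
# A typo such as ParallelMode.DISTRIBUTD raises AttributeError rather than comparing unequal.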
@@ -96,7 +96,8 @@ class CustomDPOTrainer(DPOTrainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -91,7 +91,8 @@ class CustomKTOTrainer(KTOTrainer):
                 self.ref_model.eval()
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -166,7 +166,8 @@ class CustomPPOTrainer(PPOTrainer, Trainer):
                 self.reward_model = self.accelerator.prepare_model(self.reward_model, evaluation_mode=True)
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -48,7 +48,8 @@ class CustomTrainer(Trainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -72,7 +72,8 @@ class PairwiseTrainer(Trainer):
         self.processor = processor
         self.can_return_loss = True  # override property to return eval_loss
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
@@ -56,7 +56,8 @@ class CustomSeq2SeqTrainer(Seq2SeqTrainer):
             self.save_model(os.path.join(self.args.output_dir, "pissa_init"))
 
         if finetuning_args.use_badam:
-            from badam import clip_grad_norm_old_version, BAdamCallback
+            from badam import BAdamCallback, clip_grad_norm_old_version
+
             self.accelerator.clip_grad_norm_ = MethodType(clip_grad_norm_old_version, self.accelerator)
             self.callback_handler.add_callback(BAdamCallback)
 
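The six trainer hunks above all rely on the same MethodType trick to swap badam's clip_grad_norm_old_version in for the accelerator's gradient clipping. A minimal, self-contained sketch of that rebinding pattern, using hypothetical stand-in names (Box, patched_describe) rather than the real classes:

# Illustration only; Box and patched_describe are hypothetical stand-ins.
from types import MethodType

class Box:
    def describe(self):
        return "original"

def patched_describe(self):
    return "patched"

box = Box()
box.describe = MethodType(patched_describe, box)  # rebind on this instance only
print(box.describe())  # -> "patched"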
@@ -23,6 +23,7 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union
 import torch
 from peft import PeftModel
 from transformers import Trainer
+from transformers.integrations import is_deepspeed_zero3_enabled
 from transformers.optimization import get_scheduler
 from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
 from transformers.trainer_pt_utils import get_parameter_names
@@ -372,9 +373,6 @@ def _create_badam_optimizer(
         dict(params=decay_params, weight_decay=training_args.weight_decay),
     ]
 
-    from transformers.integrations import is_deepspeed_zero3_enabled
-    ds_zero3_enabled = is_deepspeed_zero3_enabled()
-
     if finetuning_args.badam_mode == "layer":
         from badam import BlockOptimizer
 
@@ -387,7 +385,7 @@ def _create_badam_optimizer(
             start_block=finetuning_args.badam_start_block,
             switch_mode=finetuning_args.badam_switch_mode,
             verbose=finetuning_args.badam_verbose,
-            ds_zero3_enabled=ds_zero3_enabled
+            ds_zero3_enabled=is_deepspeed_zero3_enabled(),
         )
         logger.info(
             f"Using BAdam optimizer with layer-wise update, switch mode is {finetuning_args.badam_switch_mode}, "
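An aside on the ds_zero3_enabled change above, as an illustration rather than part of the patch: the hunks drop the local import and the cached ds_zero3_enabled variable in favour of calling the module-level is_deepspeed_zero3_enabled directly, which keeps the flag tied to whatever DeepSpeed config transformers has active when the optimizer is built. A minimal runnable check:

# Illustration only; with no DeepSpeed config active this prints False.
from transformers.integrations import is_deepspeed_zero3_enabled

print("DeepSpeed ZeRO-3 active:", is_deepspeed_zero3_enabled())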
@@ -398,7 +396,6 @@ def _create_badam_optimizer(
     elif finetuning_args.badam_mode == "ratio":
         from badam import BlockOptimizerRatio
 
-        assert not ds_zero3_enabled, "BAdam with ratio-based update does not support Deepspeed ZeRO-3 yet, use layer-wise update instead: --badam_mode layer."
         assert finetuning_args.badam_update_ratio > 1e-6
         optimizer = BlockOptimizerRatio(
             param_groups=param_groups,