08/16/2024 11:02:54 - INFO - llamafactory.cli - Initializing distributed tasks at: 127.0.0.1:28784

usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH
                   [--adapter_name_or_path ADAPTER_NAME_OR_PATH]
                   [--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR]
                   [--use_fast_tokenizer [USE_FAST_TOKENIZER]]
                   [--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]]
                   [--split_special_tokens [SPLIT_SPECIAL_TOKENS]]
                   [--new_special_tokens NEW_SPECIAL_TOKENS]
                   [--model_revision MODEL_REVISION]
                   [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]]
                   [--no_low_cpu_mem_usage]
                   [--quantization_method {bitsandbytes,hqq,eetq}]
                   [--quantization_bit QUANTIZATION_BIT]
                   [--quantization_type {fp4,nf4}]
                   [--double_quantization [DOUBLE_QUANTIZATION]]
                   [--no_double_quantization]
                   [--quantization_device_map {auto}]
                   [--rope_scaling {linear,dynamic}]
                   [--flash_attn {auto,disabled,sdpa,fa2}]
                   [--shift_attn [SHIFT_ATTN]]
                   [--mixture_of_depths {convert,load}]
                   [--use_unsloth [USE_UNSLOTH]]
                   [--visual_inputs [VISUAL_INPUTS]]
                   [--moe_aux_loss_coef MOE_AUX_LOSS_COEF]
                   [--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]]
                   [--upcast_layernorm [UPCAST_LAYERNORM]]
                   [--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]]
                   [--train_from_scratch [TRAIN_FROM_SCRATCH]]
                   [--infer_backend {huggingface,vllm}]
                   [--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL]
                   [--vllm_enforce_eager [VLLM_ENFORCE_EAGER]]
                   [--vllm_max_lora_rank VLLM_MAX_LORA_RANK]
                   [--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]]
                   [--no_use_cache]
                   [--infer_dtype {auto,float16,bfloat16,float32}]
                   [--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN]
                   [--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE]
                   [--export_device {cpu,auto}]
                   [--export_quantization_bit EXPORT_QUANTIZATION_BIT]
                   [--export_quantization_dataset EXPORT_QUANTIZATION_DATASET]
                   [--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES]
                   [--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN]
                   [--export_legacy_format [EXPORT_LEGACY_FORMAT]]
                   [--export_hub_model_id EXPORT_HUB_MODEL_ID]
                   [--print_param_status [PRINT_PARAM_STATUS]]
                   [--template TEMPLATE] [--dataset DATASET]
                   [--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR]
                   [--cutoff_len CUTOFF_LEN]
                   [--train_on_prompt [TRAIN_ON_PROMPT]]
                   [--mask_history [MASK_HISTORY]] [--streaming [STREAMING]]
                   [--buffer_size BUFFER_SIZE]
                   [--mix_strategy {concat,interleave_under,interleave_over}]
                   [--interleave_probs INTERLEAVE_PROBS]
                   [--overwrite_cache [OVERWRITE_CACHE]]
                   [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS]
                   [--max_samples MAX_SAMPLES]
                   [--eval_num_beams EVAL_NUM_BEAMS]
                   [--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]]
                   [--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE]
                   [--packing PACKING] [--neat_packing [NEAT_PACKING]]
                   [--tool_format TOOL_FORMAT]
                   [--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR
                   [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]]
                   [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]
                   [--do_predict [DO_PREDICT]]
                   [--eval_strategy {no,steps,epoch}]
                   [--prediction_loss_only [PREDICTION_LOSS_ONLY]]
                   [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE]
                   [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]
                   [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE]
                   [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE]
                   [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
                   [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS]
                   [--eval_delay EVAL_DELAY]
                   [--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS]
                   [--learning_rate LEARNING_RATE]
                   [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]
                   [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON]
                   [--max_grad_norm MAX_GRAD_NORM]
                   [--num_train_epochs NUM_TRAIN_EPOCHS]
                   [--max_steps MAX_STEPS]
                   [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}]
                   [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS]
                   [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS]
                   [--log_level {detail,debug,info,warning,error,critical,passive}]
                   [--log_level_replica {detail,debug,info,warning,error,critical,passive}]
                   [--log_on_each_node [LOG_ON_EACH_NODE]]
                   [--no_log_on_each_node] [--logging_dir LOGGING_DIR]
                   [--logging_strategy {no,steps,epoch}]
                   [--logging_first_step [LOGGING_FIRST_STEP]]
                   [--logging_steps LOGGING_STEPS]
                   [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]]
                   [--no_logging_nan_inf_filter]
                   [--save_strategy {no,steps,epoch}]
                   [--save_steps SAVE_STEPS]
                   [--save_total_limit SAVE_TOTAL_LIMIT]
                   [--save_safetensors [SAVE_SAFETENSORS]]
                   [--no_save_safetensors]
                   [--save_on_each_node [SAVE_ON_EACH_NODE]]
                   [--save_only_model [SAVE_ONLY_MODEL]]
                   [--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]]
                   [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]]
                   [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED]
                   [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]
                   [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]]
                   [--fp16_opt_level FP16_OPT_LEVEL]
                   [--half_precision_backend {auto,apex,cpu_amp}]
                   [--bf16_full_eval [BF16_FULL_EVAL]]
                   [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32]
                   [--local_rank LOCAL_RANK]
                   [--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}]
                   [--tpu_num_cores TPU_NUM_CORES]
                   [--tpu_metrics_debug [TPU_METRICS_DEBUG]]
                   [--debug DEBUG [DEBUG ...]]
                   [--dataloader_drop_last [DATALOADER_DROP_LAST]]
                   [--eval_steps EVAL_STEPS]
                   [--dataloader_num_workers DATALOADER_NUM_WORKERS]
                   [--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR]
                   [--past_index PAST_INDEX] [--run_name RUN_NAME]
                   [--disable_tqdm DISABLE_TQDM]
                   [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]]
                   [--no_remove_unused_columns]
                   [--label_names LABEL_NAMES [LABEL_NAMES ...]]
                   [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]]
                   [--metric_for_best_model METRIC_FOR_BEST_MODEL]
                   [--greater_is_better GREATER_IS_BETTER]
                   [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP]
                   [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS]
                   [--fsdp_config FSDP_CONFIG]
                   [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP]
                   [--accelerator_config ACCELERATOR_CONFIG]
                   [--deepspeed DEEPSPEED]
                   [--label_smoothing_factor LABEL_SMOOTHING_FACTOR]
                   [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}]
                   [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]]
                   [--group_by_length [GROUP_BY_LENGTH]]
                   [--length_column_name LENGTH_COLUMN_NAME]
                   [--report_to REPORT_TO]
                   [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS]
                   [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB]
                   [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS]
                   [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]]
                   [--no_dataloader_pin_memory]
                   [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]]
                   [--skip_memory_metrics [SKIP_MEMORY_METRICS]]
                   [--no_skip_memory_metrics]
                   [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]]
                   [--push_to_hub [PUSH_TO_HUB]]
                   [--resume_from_checkpoint RESUME_FROM_CHECKPOINT]
                   [--hub_model_id HUB_MODEL_ID]
                   [--hub_strategy {end,every_save,checkpoint,all_checkpoints}]
                   [--hub_token HUB_TOKEN]
                   [--hub_private_repo [HUB_PRIVATE_REPO]]
                   [--hub_always_push [HUB_ALWAYS_PUSH]]
                   [--gradient_checkpointing [GRADIENT_CHECKPOINTING]]
                   [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS]
                   [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]]
                   [--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]]
                   [--no_eval_do_concat_batches]
                   [--fp16_backend {auto,apex,cpu_amp}]
                   [--evaluation_strategy {no,steps,epoch}]
                   [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]
                   [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION]
                   [--push_to_hub_token PUSH_TO_HUB_TOKEN]
                   [--mp_parameters MP_PARAMETERS]
                   [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]]
                   [--full_determinism [FULL_DETERMINISM]]
                   [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE]
                   [--ddp_timeout DDP_TIMEOUT]
                   [--torch_compile [TORCH_COMPILE]]
                   [--torch_compile_backend TORCH_COMPILE_BACKEND]
                   [--torch_compile_mode TORCH_COMPILE_MODE]
                   [--dispatch_batches DISPATCH_BATCHES]
                   [--split_batches SPLIT_BATCHES]
                   [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]]
                   [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]]
                   [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA]
                   [--optim_target_modules OPTIM_TARGET_MODULES]
                   [--batch_eval_metrics [BATCH_EVAL_METRICS]]
                   [--eval_on_start [EVAL_ON_START]]
                   [--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]]
                   [--sortish_sampler [SORTISH_SAMPLER]]
                   [--predict_with_generate [PREDICT_WITH_GENERATE]]
                   [--generation_max_length GENERATION_MAX_LENGTH]
                   [--generation_num_beams GENERATION_NUM_BEAMS]
                   [--generation_config GENERATION_CONFIG]
                   [--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}]
                   [--badam_start_block BADAM_START_BLOCK]
                   [--badam_switch_mode {ascending,descending,random,fixed}]
                   [--badam_switch_interval BADAM_SWITCH_INTERVAL]
                   [--badam_update_ratio BADAM_UPDATE_RATIO]
                   [--badam_mask_mode {adjacent,scatter}]
                   [--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]]
                   [--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK]
                   [--galore_update_interval GALORE_UPDATE_INTERVAL]
                   [--galore_scale GALORE_SCALE]
                   [--galore_proj_type {std,reverse_std,right,left,full}]
                   [--galore_layerwise [GALORE_LAYERWISE]]
                   [--pref_beta PREF_BETA] [--pref_ftx PREF_FTX]
                   [--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}]
                   [--dpo_label_smoothing DPO_LABEL_SMOOTHING]
                   [--kto_chosen_weight KTO_CHOSEN_WEIGHT]
                   [--kto_rejected_weight KTO_REJECTED_WEIGHT]
                   [--simpo_gamma SIMPO_GAMMA]
                   [--ppo_buffer_size PPO_BUFFER_SIZE]
                   [--ppo_epochs PPO_EPOCHS]
                   [--ppo_score_norm [PPO_SCORE_NORM]]
                   [--ppo_target PPO_TARGET]
                   [--ppo_whiten_rewards [PPO_WHITEN_REWARDS]]
                   [--ref_model REF_MODEL]
                   [--ref_model_adapters REF_MODEL_ADAPTERS]
                   [--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT]
                   [--reward_model REWARD_MODEL]
                   [--reward_model_adapters REWARD_MODEL_ADAPTERS]
                   [--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT]
                   [--reward_model_type {lora,full,api}]
                   [--additional_target ADDITIONAL_TARGET]
                   [--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT]
                   [--lora_rank LORA_RANK] [--lora_target LORA_TARGET]
                   [--loraplus_lr_ratio LORAPLUS_LR_RATIO]
                   [--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING]
                   [--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]]
                   [--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER]
                   [--pissa_convert [PISSA_CONVERT]]
                   [--create_new_adapter [CREATE_NEW_ADAPTER]]
                   [--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS]
                   [--freeze_trainable_modules FREEZE_TRAINABLE_MODULES]
                   [--freeze_extra_modules FREEZE_EXTRA_MODULES]
                   [--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}]
                   [--finetuning_type {lora,freeze,full}]
                   [--use_llama_pro [USE_LLAMA_PRO]]
                   [--use_adam_mini [USE_ADAM_MINI]]
                   [--freeze_vision_tower [FREEZE_VISION_TOWER]]
                   [--no_freeze_vision_tower]
                   [--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]]
                   [--compute_accuracy [COMPUTE_ACCURACY]]
                   [--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]]
                   [--no_do_sample] [--temperature TEMPERATURE]
                   [--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS]
                   [--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS]
                   [--repetition_penalty REPETITION_PENALTY]
                   [--length_penalty LENGTH_PENALTY]
                   [--default_system DEFAULT_SYSTEM]
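
Only --model_name_or_path and --output_dir appear without brackets in the synopsis, so they are the only required arguments. A minimal invocation sketched from these flags alone (the model identifier, dataset, and template below are illustrative placeholders, not values from this log):

    python launcher.py \
        --model_name_or_path <org/model> \
        --stage sft --do_train --finetuning_type lora \
        --dataset <dataset_name> --template <template_name> \
        --output_dir saves/example-run
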
optional arguments:
  -h, --help            show this help message and exit
  --model_name_or_path MODEL_NAME_OR_PATH
                        Path to the model weight or identifier from huggingface.co/models or modelscope.cn/models. (default: None)
  --adapter_name_or_path ADAPTER_NAME_OR_PATH
                        Path to the adapter weight or identifier from huggingface.co/models. Use commas to separate multiple adapters. (default: None)
  --adapter_folder ADAPTER_FOLDER
                        The folder containing the adapter weights to load. (default: None)
  --cache_dir CACHE_DIR
                        Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn. (default: None)
  --use_fast_tokenizer [USE_FAST_TOKENIZER]
                        Whether or not to use one of the fast tokenizers (backed by the tokenizers library). (default: True)
  --no_use_fast_tokenizer
                        Whether or not to use one of the fast tokenizers (backed by the tokenizers library). (default: False)
  --resize_vocab [RESIZE_VOCAB]
                        Whether or not to resize the tokenizer vocab and the embedding layers. (default: False)
  --split_special_tokens [SPLIT_SPECIAL_TOKENS]
                        Whether or not the special tokens should be split during the tokenization process. (default: False)
  --new_special_tokens NEW_SPECIAL_TOKENS
                        Special tokens to be added into the tokenizer. Use commas to separate multiple tokens. (default: None)
  --model_revision MODEL_REVISION
                        The specific model version to use (can be a branch name, tag name or commit id). (default: main)
  --low_cpu_mem_usage [LOW_CPU_MEM_USAGE]
                        Whether or not to use memory-efficient model loading. (default: True)
  --no_low_cpu_mem_usage
                        Whether or not to use memory-efficient model loading. (default: False)
  --quantization_method {bitsandbytes,hqq,eetq}
                        Quantization method to use for on-the-fly quantization. (default: bitsandbytes)
  --quantization_bit QUANTIZATION_BIT
                        The number of bits to quantize the model using bitsandbytes. (default: None)
  --quantization_type {fp4,nf4}
                        Quantization data type to use in int4 training. (default: nf4)
  --double_quantization [DOUBLE_QUANTIZATION]
                        Whether or not to use double quantization in int4 training. (default: True)
  --no_double_quantization
                        Whether or not to use double quantization in int4 training. (default: False)
  --quantization_device_map {auto}
                        Device map used to infer the 4-bit quantized model, needs bitsandbytes>=0.43.0. (default: None)
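
Taken together, the flags above describe on-the-fly quantization for QLoRA-style training. A hedged sketch with the documented defaults written out (the model path is a placeholder):

    python launcher.py --model_name_or_path <org/model> --output_dir saves/qlora \
        --stage sft --do_train --finetuning_type lora \
        --quantization_method bitsandbytes --quantization_bit 4 \
        --quantization_type nf4 --double_quantization
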
  --rope_scaling {linear,dynamic}
                        Which scaling strategy should be adopted for the RoPE embeddings. (default: None)
  --flash_attn {auto,disabled,sdpa,fa2}
                        Enable FlashAttention for faster training and inference. (default: auto)
  --shift_attn [SHIFT_ATTN]
                        Enable shift short attention (S^2-Attn) proposed by LongLoRA. (default: False)
  --mixture_of_depths {convert,load}
                        Convert the model to mixture-of-depths (MoD) or load the MoD model. (default: None)
  --use_unsloth [USE_UNSLOTH]
                        Whether or not to use unsloth's optimization for the LoRA training. (default: False)
  --visual_inputs [VISUAL_INPUTS]
                        Whether or not to use multimodal LLM that accepts visual inputs. (default: False)
  --moe_aux_loss_coef MOE_AUX_LOSS_COEF
                        Coefficient of the auxiliary router loss in mixture-of-experts model. (default: None)
  --disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]
                        Whether or not to disable gradient checkpointing. (default: False)
  --upcast_layernorm [UPCAST_LAYERNORM]
                        Whether or not to upcast the layernorm weights in fp32. (default: False)
  --upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]
                        Whether or not to upcast the output of lm_head in fp32. (default: False)
  --train_from_scratch [TRAIN_FROM_SCRATCH]
                        Whether or not to randomly initialize the model weights. (default: False)
  --infer_backend {huggingface,vllm}
                        Backend engine used at inference. (default: huggingface)
  --vllm_maxlen VLLM_MAXLEN
                        Maximum sequence (prompt + response) length of the vLLM engine. (default: 2048)
  --vllm_gpu_util VLLM_GPU_UTIL
                        The fraction of GPU memory in (0,1) to be used for the vLLM engine. (default: 0.9)
  --vllm_enforce_eager [VLLM_ENFORCE_EAGER]
                        Whether or not to disable CUDA graph in the vLLM engine. (default: False)
  --vllm_max_lora_rank VLLM_MAX_LORA_RANK
                        Maximum rank of all LoRAs in the vLLM engine. (default: 32)
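
As a sketch of how the vLLM flags above compose at inference (the template is a placeholder; the non-default --vllm_maxlen value is illustrative):

    --infer_backend vllm --vllm_maxlen 4096 --vllm_gpu_util 0.9 \
        --vllm_max_lora_rank 32 --template <template_name>
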
  --offload_folder OFFLOAD_FOLDER
                        Path to offload model weights. (default: offload)
  --use_cache [USE_CACHE]
                        Whether or not to use KV cache in generation. (default: True)
  --no_use_cache        Whether or not to use KV cache in generation. (default: False)
  --infer_dtype {auto,float16,bfloat16,float32}
                        Data type for model weights and activations at inference. (default: auto)
  --hf_hub_token HF_HUB_TOKEN
                        Auth token to log in with Hugging Face Hub. (default: None)
  --ms_hub_token MS_HUB_TOKEN
                        Auth token to log in with ModelScope Hub. (default: None)
  --export_dir EXPORT_DIR
                        Path to the directory to save the exported model. (default: None)
  --export_size EXPORT_SIZE
                        The file shard size (in GB) of the exported model. (default: 1)
  --export_device {cpu,auto}
                        The device used in model export, use `auto` to accelerate exporting. (default: cpu)
  --export_quantization_bit EXPORT_QUANTIZATION_BIT
                        The number of bits to quantize the exported model. (default: None)
  --export_quantization_dataset EXPORT_QUANTIZATION_DATASET
                        Path to the dataset or dataset name to use in quantizing the exported model. (default: None)
  --export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES
                        The number of samples used for quantization. (default: 128)
  --export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN
                        The maximum length of the model inputs used for quantization. (default: 1024)
  --export_legacy_format [EXPORT_LEGACY_FORMAT]
                        Whether or not to save the `.bin` files instead of `.safetensors`. (default: False)
  --export_hub_model_id EXPORT_HUB_MODEL_ID
                        The name of the repository if pushing the model to the Hugging Face hub. (default: None)
  --print_param_status [PRINT_PARAM_STATUS]
                        For debugging purposes, print the status of the parameters in the model. (default: False)
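
A hedged sketch of an export with post-training quantization, built only from the flags above (the paths and the calibration dataset are placeholders; whether a given combination is valid depends on the model):

    --export_dir exported/model --export_size 2 --export_device cpu \
        --export_quantization_bit 4 --export_quantization_dataset <calib_dataset>
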
  --template TEMPLATE   Which template to use for constructing prompts in training and inference. (default: None)
  --dataset DATASET     The name of dataset(s) to use for training. Use commas to separate multiple datasets. (default: None)
  --eval_dataset EVAL_DATASET
                        The name of dataset(s) to use for evaluation. Use commas to separate multiple datasets. (default: None)
  --dataset_dir DATASET_DIR
                        Path to the folder containing the datasets. (default: data)
  --cutoff_len CUTOFF_LEN
                        The cutoff length of the tokenized inputs in the dataset. (default: 1024)
  --train_on_prompt [TRAIN_ON_PROMPT]
                        Whether or not to disable the mask on the prompt. (default: False)
  --mask_history [MASK_HISTORY]
                        Whether or not to mask the history and train on the last turn only. (default: False)
  --streaming [STREAMING]
                        Enable dataset streaming. (default: False)
  --buffer_size BUFFER_SIZE
                        Size of the buffer to randomly sample examples from in dataset streaming. (default: 16384)
  --mix_strategy {concat,interleave_under,interleave_over}
                        Strategy to use in dataset mixing (concat/interleave) (undersampling/oversampling). (default: concat)
  --interleave_probs INTERLEAVE_PROBS
                        Probabilities to sample data from datasets. Use commas to separate multiple datasets. (default: None)
  --overwrite_cache [OVERWRITE_CACHE]
                        Overwrite the cached training and evaluation sets. (default: False)
  --preprocessing_num_workers PREPROCESSING_NUM_WORKERS
                        The number of processes to use for the pre-processing. (default: None)
  --max_samples MAX_SAMPLES
                        For debugging purposes, truncate the number of examples for each dataset. (default: None)
  --eval_num_beams EVAL_NUM_BEAMS
                        Number of beams to use for evaluation. This argument will be passed to `model.generate` (default: None)
  --ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]
                        Whether or not to ignore the tokens corresponding to the pad label in loss computation. (default: True)
  --no_ignore_pad_token_for_loss
                        Whether or not to ignore the tokens corresponding to the pad label in loss computation. (default: False)
  --val_size VAL_SIZE   Size of the development set, should be an integer or a float in range `[0,1)`. (default: 0.0)
  --packing PACKING     Enable sequence packing in training. Automatically enabled in pre-training. (default: None)
  --neat_packing [NEAT_PACKING]
                        Enable sequence packing without cross-attention. (default: False)
  --tool_format TOOL_FORMAT
                        Tool format to use for constructing function calling examples. (default: None)
  --tokenized_path TOKENIZED_PATH
                        Path to save or load the tokenized datasets. (default: None)
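
For instance, interleaving two datasets with explicit sampling probabilities combines the flags above roughly like this (the dataset names are placeholders):

    --dataset <dataset_a>,<dataset_b> --mix_strategy interleave_under \
        --interleave_probs 0.7,0.3 --cutoff_len 2048
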
  --output_dir OUTPUT_DIR
                        The output directory where the model predictions and checkpoints will be written. (default: None)
  --overwrite_output_dir [OVERWRITE_OUTPUT_DIR]
                        Overwrite the content of the output directory. Use this to continue training if output_dir points to a checkpoint directory. (default: False)
  --do_train [DO_TRAIN]
                        Whether to run training. (default: False)
  --do_eval [DO_EVAL]   Whether to run eval on the dev set. (default: False)
  --do_predict [DO_PREDICT]
                        Whether to run predictions on the test set. (default: False)
  --eval_strategy {no,steps,epoch}
                        The evaluation strategy to use. (default: no)
  --prediction_loss_only [PREDICTION_LOSS_ONLY]
                        When performing evaluation and predictions, only returns the loss. (default: False)
  --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE
                        Batch size per GPU/TPU/MPS/NPU core/CPU for training. (default: 8)
  --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE
                        Batch size per GPU/TPU/MPS/NPU core/CPU for evaluation. (default: 8)
  --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE
                        Deprecated, the use of `--per_device_train_batch_size` is preferred. Batch size per GPU/TPU core/CPU for training. (default: None)
  --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE
                        Deprecated, the use of `--per_device_eval_batch_size` is preferred. Batch size per GPU/TPU core/CPU for evaluation. (default: None)
  --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS
                        Number of update steps to accumulate before performing a backward/update pass. (default: 1)
  --eval_accumulation_steps EVAL_ACCUMULATION_STEPS
                        Number of prediction steps to accumulate before moving the tensors to the CPU. (default: None)
  --eval_delay EVAL_DELAY
                        Number of epochs or steps to wait for before the first evaluation can be performed, depending on the eval_strategy. (default: 0)
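
The effective batch size per optimizer update is the product of the per-device batch size, the accumulation steps, and the device count. A worked example: --per_device_train_batch_size 4 with --gradient_accumulation_steps 8 on 2 GPUs yields 4 x 8 x 2 = 64 samples per update.
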
  --torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS
                        Number of steps to wait before calling `torch.<device>.empty_cache()`. This can help avoid CUDA out-of-memory errors by lowering peak VRAM usage at a cost of about 10% slower performance (https://github.com/huggingface/transformers/issues/31372). If left unset or set to None, cache will not be emptied. (default: None)
  --learning_rate LEARNING_RATE
                        The initial learning rate for AdamW. (default: 5e-05)
  --weight_decay WEIGHT_DECAY
                        Weight decay for AdamW if we apply some. (default: 0.0)
  --adam_beta1 ADAM_BETA1
                        Beta1 for AdamW optimizer (default: 0.9)
  --adam_beta2 ADAM_BETA2
                        Beta2 for AdamW optimizer (default: 0.999)
  --adam_epsilon ADAM_EPSILON
                        Epsilon for AdamW optimizer. (default: 1e-08)
  --max_grad_norm MAX_GRAD_NORM
                        Max gradient norm. (default: 1.0)
  --num_train_epochs NUM_TRAIN_EPOCHS
                        Total number of training epochs to perform. (default: 3.0)
  --max_steps MAX_STEPS
                        If > 0: set total number of training steps to perform. Overrides num_train_epochs. (default: -1)
  --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}
                        The scheduler type to use. (default: linear)
  --lr_scheduler_kwargs LR_SCHEDULER_KWARGS
                        Extra parameters for the lr_scheduler such as {'num_cycles': 1} for the cosine with hard restarts. (default: {})
  --warmup_ratio WARMUP_RATIO
                        Linear warmup over warmup_ratio fraction of total steps. (default: 0.0)
  --warmup_steps WARMUP_STEPS
                        Linear warmup over warmup_steps. (default: 0)
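
A common combination of the optimizer and scheduler flags above, sketched with illustrative (non-default) values:

    --learning_rate 1e-4 --lr_scheduler_type cosine \
        --warmup_ratio 0.1 --num_train_epochs 3.0 --max_grad_norm 1.0
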
  --log_level {detail,debug,info,warning,error,critical,passive}
                        Logger log level to use on the main node. Possible choices are the log levels as strings: 'debug', 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the application set the level. Defaults to 'passive'. (default: passive)
  --log_level_replica {detail,debug,info,warning,error,critical,passive}
                        Logger log level to use on replica nodes. Same choices and defaults as ``log_level`` (default: warning)
  --log_on_each_node [LOG_ON_EACH_NODE]
                        When doing a multinode distributed training, whether to log once per node or just once on the main node. (default: True)
  --no_log_on_each_node
                        When doing a multinode distributed training, whether to log once per node or just once on the main node. (default: False)
  --logging_dir LOGGING_DIR
                        Tensorboard log dir. (default: None)
  --logging_strategy {no,steps,epoch}
                        The logging strategy to use. (default: steps)
  --logging_first_step [LOGGING_FIRST_STEP]
                        Log the first global_step (default: False)
  --logging_steps LOGGING_STEPS
                        Log every X update steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. (default: 500)
  --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]
                        Filter nan and inf losses for logging. (default: True)
  --no_logging_nan_inf_filter
                        Filter nan and inf losses for logging. (default: False)
  --save_strategy {no,steps,epoch}
                        The checkpoint save strategy to use. (default: steps)
  --save_steps SAVE_STEPS
                        Save checkpoint every X update steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. (default: 500)
  --save_total_limit SAVE_TOTAL_LIMIT
                        If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in `output_dir`. When `load_best_model_at_end` is enabled, the 'best' checkpoint according to `metric_for_best_model` will always be retained in addition to the most recent ones. For example, for `save_total_limit=5` and `load_best_model_at_end=True`, the four last checkpoints will always be retained alongside the best model. When `save_total_limit=1` and `load_best_model_at_end=True`, it is possible that two checkpoints are saved: the last one and the best one (if they are different). Default is unlimited checkpoints (default: None)
  --save_safetensors [SAVE_SAFETENSORS]
                        Use safetensors saving and loading for state dicts instead of default torch.load and torch.save. (default: True)
  --no_save_safetensors
                        Use safetensors saving and loading for state dicts instead of default torch.load and torch.save. (default: False)
  --save_on_each_node [SAVE_ON_EACH_NODE]
                        When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on the main one (default: False)
  --save_only_model [SAVE_ONLY_MODEL]
                        When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state. Note that when this is true, you won't be able to resume training from checkpoint. This enables you to save storage by not storing the optimizer, scheduler & rng state. You can only load the model using from_pretrained with this option set to True. (default: False)
  --restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]
                        Whether to restore the callback states from the checkpoint. If `True`, will override callbacks passed to the `Trainer` if they exist in the checkpoint. (default: False)
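
A sketch of a periodic-checkpoint configuration from the flags above (the values are illustrative):

    --save_strategy steps --save_steps 500 --save_total_limit 3 --save_safetensors
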
  --no_cuda [NO_CUDA]   This argument is deprecated. It will be removed in version 5.0 of 🤗 Transformers. (default: False)
  --use_cpu [USE_CPU]   Whether or not to use cpu. If set to False, we will use cuda/tpu/mps/npu device if available. (default: False)
  --use_mps_device [USE_MPS_DEVICE]
                        This argument is deprecated. `mps` device will be used if available similar to `cuda` device. It will be removed in version 5.0 of 🤗 Transformers (default: False)
  --seed SEED           Random seed that will be set at the beginning of training. (default: 42)
  --data_seed DATA_SEED
                        Random seed to be used with data samplers. (default: None)
  --jit_mode_eval [JIT_MODE_EVAL]
                        Whether or not to use PyTorch jit trace for inference (default: False)
  --use_ipex [USE_IPEX]
                        Use Intel extension for PyTorch when it is available, installation: 'https://github.com/intel/intel-extension-for-pytorch' (default: False)
  --bf16 [BF16]         Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change. (default: False)
  --fp16 [FP16]         Whether to use fp16 (mixed) precision instead of 32-bit (default: False)
  --fp16_opt_level FP16_OPT_LEVEL
                        For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details at https://nvidia.github.io/apex/amp.html (default: O1)
  --half_precision_backend {auto,apex,cpu_amp}
                        The backend to be used for half precision. (default: auto)
  --bf16_full_eval [BF16_FULL_EVAL]
                        Whether to use full bfloat16 evaluation instead of 32-bit. This is an experimental API and it may change. (default: False)
  --fp16_full_eval [FP16_FULL_EVAL]
                        Whether to use full float16 evaluation instead of 32-bit (default: False)
  --tf32 TF32           Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental API and it may change. (default: None)
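
On Ampere-or-newer GPUs the precision flags above are often combined as below; a sketch, not a recommendation for every setup:

    --bf16 --tf32 True
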
  --local_rank LOCAL_RANK
                        For distributed training: local_rank (default: -1)
  --ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}
                        The backend to be used for distributed training (default: None)
  --tpu_num_cores TPU_NUM_CORES
                        TPU: Number of TPU cores (automatically passed by launcher script) (default: None)
  --tpu_metrics_debug [TPU_METRICS_DEBUG]
                        Deprecated, the use of `--debug tpu_metrics_debug` is preferred. TPU: Whether to print debug metrics (default: False)
  --debug DEBUG [DEBUG ...]
                        Whether or not to enable debug mode. Current options: `underflow_overflow` (Detect underflow and overflow in activations and weights), `tpu_metrics_debug` (print debug metrics on TPU). (default: None)
  --dataloader_drop_last [DATALOADER_DROP_LAST]
                        Drop the last incomplete batch if it is not divisible by the batch size. (default: False)
  --eval_steps EVAL_STEPS
                        Run an evaluation every X steps. Should be an integer or a float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. (default: None)
  --dataloader_num_workers DATALOADER_NUM_WORKERS
                        Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the main process. (default: 0)
  --dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR
                        Number of batches loaded in advance by each worker. 2 means there will be a total of 2 * num_workers batches prefetched across all workers. Default is 2 for PyTorch < 2.0.0 and otherwise None. (default: None)
  --past_index PAST_INDEX
                        If >=0, uses the corresponding part of the output as the past state for next step. (default: -1)
  --run_name RUN_NAME   An optional descriptor for the run. Notably used for wandb, mlflow and comet logging. (default: None)
  --disable_tqdm DISABLE_TQDM
                        Whether or not to disable the tqdm progress bars. (default: None)
  --remove_unused_columns [REMOVE_UNUSED_COLUMNS]
                        Remove columns not required by the model when using an nlp.Dataset. (default: True)
  --no_remove_unused_columns
                        Remove columns not required by the model when using an nlp.Dataset. (default: False)
  --label_names LABEL_NAMES [LABEL_NAMES ...]
                        The list of keys in your dictionary of inputs that correspond to the labels. (default: None)
  --load_best_model_at_end [LOAD_BEST_MODEL_AT_END]
                        Whether or not to load the best model found during training at the end of training. When this option is enabled, the best checkpoint will always be saved. See `save_total_limit` for more. (default: False)
  --metric_for_best_model METRIC_FOR_BEST_MODEL
                        The metric to use to compare two different models. (default: None)
  --greater_is_better GREATER_IS_BETTER
                        Whether the `metric_for_best_model` should be maximized or not. (default: None)
  --ignore_data_skip [IGNORE_DATA_SKIP]
                        When resuming training, whether or not to skip the first epochs and batches to get to the same training data. (default: False)
  --fsdp FSDP           Whether or not to use PyTorch Fully Sharded Data Parallel (FSDP) training (in distributed training only). The base option should be `full_shard`, `shard_grad_op` or `no_shard` and you can add CPU-offload to `full_shard` or `shard_grad_op` like this: `full_shard offload` or `shard_grad_op offload`. You can add auto-wrap to `full_shard` or `shard_grad_op` with the same syntax: `full_shard auto_wrap` or `shard_grad_op auto_wrap`. (default: )
  --fsdp_min_num_params FSDP_MIN_NUM_PARAMS
                        This parameter is deprecated. FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `fsdp` field is passed). (default: 0)
  --fsdp_config FSDP_CONFIG
                        Config to be used with FSDP (Pytorch Fully Sharded Data Parallel). The value is either a fsdp json config file (e.g., `fsdp_config.json`) or an already loaded json file as `dict`. (default: None)
  --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP
                        This parameter is deprecated. Transformer layer class name (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`, `T5Block` .... (useful only when `fsdp` flag is passed). (default: None)
  --accelerator_config ACCELERATOR_CONFIG
                        Config to be used with the internal Accelerator object initialization. The value is either an accelerator json config file (e.g., `accelerator_config.json`) or an already loaded json file as `dict`. (default: None)
  --deepspeed DEEPSPEED
                        Enable deepspeed and pass the path to deepspeed json config file (e.g. `ds_config.json`) or an already loaded json file as a dict (default: None)
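
As the help text says, --deepspeed takes a JSON config path or an already-loaded dict; a sketch (the file name is a placeholder for your own config):

    --deepspeed ds_config.json
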
  --label_smoothing_factor LABEL_SMOOTHING_FACTOR
                        The label smoothing epsilon to apply (zero means no label smoothing). (default: 0.0)
  --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}
                        The optimizer to use. (default: adamw_torch)
  --optim_args OPTIM_ARGS
                        Optional arguments to supply to optimizer. (default: None)
  --adafactor [ADAFACTOR]
                        Whether or not to replace AdamW by Adafactor. (default: False)
  --group_by_length [GROUP_BY_LENGTH]
                        Whether or not to group samples of roughly the same length together when batching. (default: False)
  --length_column_name LENGTH_COLUMN_NAME
                        Column name with precomputed lengths to use when grouping by length. (default: length)
  --report_to REPORT_TO
                        The list of integrations to report the results and logs to. (default: None)
  --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS
                        When using distributed training, the value of the flag `find_unused_parameters` passed to `DistributedDataParallel`. (default: None)
  --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB
                        When using distributed training, the value of the flag `bucket_cap_mb` passed to `DistributedDataParallel`. (default: None)
  --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS
                        When using distributed training, the value of the flag `broadcast_buffers` passed to `DistributedDataParallel`. (default: None)
  --dataloader_pin_memory [DATALOADER_PIN_MEMORY]
                        Whether or not to pin memory for DataLoader. (default: True)
  --no_dataloader_pin_memory
                        Whether or not to pin memory for DataLoader. (default: False)
  --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]
                        If True, the data loader will not shut down the worker processes after a dataset has been consumed once. This keeps the workers' Dataset instances alive. Can potentially speed up training, but will increase RAM usage. (default: False)
  --skip_memory_metrics [SKIP_MEMORY_METRICS]
                        Whether or not to skip adding of memory profiler reports to metrics. (default: True)
  --no_skip_memory_metrics
                        Whether or not to skip adding of memory profiler reports to metrics. (default: False)
  --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]
                        Whether or not to use the legacy prediction_loop in the Trainer. (default: False)
  --push_to_hub [PUSH_TO_HUB]
                        Whether or not to upload the trained model to the model hub after training. (default: False)
  --resume_from_checkpoint RESUME_FROM_CHECKPOINT
                        The path to a folder with a valid checkpoint for your model. (default: None)
  --hub_model_id HUB_MODEL_ID
                        The name of the repository to keep in sync with the local `output_dir`. (default: None)
  --hub_strategy {end,every_save,checkpoint,all_checkpoints}
                        The hub strategy to use when `--push_to_hub` is activated. (default: every_save)
  --hub_token HUB_TOKEN
                        The token to use to push to the Model Hub. (default: None)
  --hub_private_repo [HUB_PRIVATE_REPO]
                        Whether the model repository is private or not. (default: False)
  --hub_always_push [HUB_ALWAYS_PUSH]
                        Unless `True`, the Trainer will skip pushes if the previous one wasn't finished yet. (default: False)
  --gradient_checkpointing [GRADIENT_CHECKPOINTING]
                        If True, use gradient checkpointing to save memory at the expense of slower backward pass. (default: False)
  --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS
                        Gradient checkpointing key word arguments such as `use_reentrant`. Will be passed to `torch.utils.checkpoint.checkpoint` through `model.gradient_checkpointing_enable`. (default: None)
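
A memory-saving sketch combining the two gradient-checkpointing flags above (the kwargs value is illustrative and assumes the parser accepts a JSON-style dict):

    --gradient_checkpointing --gradient_checkpointing_kwargs '{"use_reentrant": false}'
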
  --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]
                        Whether or not the inputs will be passed to the `compute_metrics` function. (default: False)
  --eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]
                        Whether to recursively concat inputs/losses/labels/predictions across batches. If `False`, will instead store them as lists, with each batch kept separate. (default: True)
  --no_eval_do_concat_batches
                        Whether to recursively concat inputs/losses/labels/predictions across batches. If `False`, will instead store them as lists, with each batch kept separate. (default: False)
  --fp16_backend {auto,apex,cpu_amp}
                        Deprecated. Use half_precision_backend instead (default: auto)
  --evaluation_strategy {no,steps,epoch}
                        Deprecated. Use `eval_strategy` instead (default: None)
  --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID
                        The name of the repository to which to push the `Trainer`. (default: None)
  --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION
                        The name of the organization to which to push the `Trainer`. (default: None)
  --push_to_hub_token PUSH_TO_HUB_TOKEN
                        The token to use to push to the Model Hub. (default: None)
  --mp_parameters MP_PARAMETERS
                        Used by the SageMaker launcher to send mp-specific args. Ignored in Trainer (default: )
  --auto_find_batch_size [AUTO_FIND_BATCH_SIZE]
                        Whether to automatically decrease the batch size in half and rerun the training loop each time a CUDA out-of-memory error is encountered (default: False)
  --full_determinism [FULL_DETERMINISM]
                        Whether to call enable_full_determinism instead of set_seed for reproducibility in distributed training. Important: this will negatively impact the performance, so only use it for debugging. (default: False)
  --torchdynamo TORCHDYNAMO
                        This argument is deprecated, use `--torch_compile_backend` instead. (default: None)
  --ray_scope RAY_SCOPE
                        The scope to use when doing hyperparameter search with Ray. By default, `"last"` will be used. Ray will then use the last checkpoint of all trials, compare those, and select the best one. However, other options are also available. See the Ray documentation (https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial) for more options. (default: last)
  --ddp_timeout DDP_TIMEOUT
                        Overrides the default timeout for distributed training (value should be given in seconds). (default: 1800)
  --torch_compile [TORCH_COMPILE]
                        If set to `True`, the model will be wrapped in `torch.compile`. (default: False)
  --torch_compile_backend TORCH_COMPILE_BACKEND
                        Which backend to use with `torch.compile`, passing one will trigger a model compilation. (default: None)
  --torch_compile_mode TORCH_COMPILE_MODE
                        Which mode to use with `torch.compile`, passing one will trigger a model compilation. (default: None)
  --dispatch_batches DISPATCH_BATCHES
                        Deprecated. Pass {'dispatch_batches':VALUE} to `accelerator_config`. (default: None)
  --split_batches SPLIT_BATCHES
                        Deprecated. Pass {'split_batches':True} to `accelerator_config`. (default: None)
  --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]
                        If set to `True`, the speed metrics will include `tgs` (tokens per second per device). (default: False)
  --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]
                        If set to `True`, will track the number of input tokens seen throughout training. (May be slower in distributed training) (default: False)
  --neftune_noise_alpha NEFTUNE_NOISE_ALPHA
                        Activates neftune noise embeddings into the model. NEFTune has been proven to drastically improve model performance for instruction fine-tuning. Check out the original paper here: https://arxiv.org/abs/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes. (default: None)
  --optim_target_modules OPTIM_TARGET_MODULES
                        Target modules for the optimizer defined in the `optim` argument. Only used for the GaLore optimizer at the moment. (default: None)
  --batch_eval_metrics [BATCH_EVAL_METRICS]
                        Break eval metrics calculation into batches to save memory. (default: False)
  --eval_on_start [EVAL_ON_START]
                        Whether to run through the entire `evaluation` step at the very beginning of training as a sanity check. (default: False)
  --eval_use_gather_object [EVAL_USE_GATHER_OBJECT]
                        Whether to recursively gather objects in a nested list/tuple/dictionary of objects from all devices. (default: False)
  --sortish_sampler [SORTISH_SAMPLER]
                        Whether to use SortishSampler or not. (default: False)
  --predict_with_generate [PREDICT_WITH_GENERATE]
                        Whether to use generate to calculate generative metrics (ROUGE, BLEU). (default: False)
  --generation_max_length GENERATION_MAX_LENGTH
                        The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default to the `max_length` value of the model configuration. (default: None)
  --generation_num_beams GENERATION_NUM_BEAMS
                        The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default to the `num_beams` value of the model configuration. (default: None)
  --generation_config GENERATION_CONFIG
                        Model id, file path or url pointing to a GenerationConfig json file, to use during prediction. (default: None)
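
A sketch of an evaluation pass that scores with generation, using the flags above together with --do_predict (the values are illustrative):

    --do_predict --predict_with_generate \
        --generation_max_length 512 --generation_num_beams 4
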
  --use_badam [USE_BADAM]
                        Whether or not to use the BAdam optimizer. (default: False)
  --badam_mode {layer,ratio}
                        Whether to use the layer-wise or ratio-wise BAdam optimizer. (default: layer)
  --badam_start_block BADAM_START_BLOCK
                        The starting block index for layer-wise BAdam. (default: None)
  --badam_switch_mode {ascending,descending,random,fixed}
                        The strategy for picking the block to update in layer-wise BAdam. (default: ascending)
  --badam_switch_interval BADAM_SWITCH_INTERVAL
                        Number of steps to update the block for layer-wise BAdam. Use -1 to disable the block update. (default: 50)
  --badam_update_ratio BADAM_UPDATE_RATIO
                        The ratio of the update for ratio-wise BAdam. (default: 0.05)
  --badam_mask_mode {adjacent,scatter}
                        The mode of the mask for the BAdam optimizer. `adjacent` means that the trainable parameters are adjacent to each other; `scatter` means that trainable parameters are randomly chosen from the weight. (default: adjacent)
  --badam_verbose BADAM_VERBOSE
                        The verbosity level of the BAdam optimizer. 0 for no output, 1 to print the block prefix, 2 to print trainable parameters. (default: 0)
  --use_galore [USE_GALORE]
                        Whether or not to use gradient low-rank projection (GaLore). (default: False)
  --galore_target GALORE_TARGET
                        Name(s) of modules to apply GaLore. Use commas to separate multiple modules. Use `all` to specify all the linear modules. (default: all)
  --galore_rank GALORE_RANK
                        The rank of GaLore gradients. (default: 16)
  --galore_update_interval GALORE_UPDATE_INTERVAL
                        Number of steps to update the GaLore projection. (default: 200)
  --galore_scale GALORE_SCALE
                        GaLore scaling coefficient. (default: 0.25)
  --galore_proj_type {std,reverse_std,right,left,full}
                        Type of GaLore projection. (default: std)
  --galore_layerwise [GALORE_LAYERWISE]
                        Whether or not to enable layer-wise update to further save memory. (default: False)
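
A sketch of enabling GaLore with the documented defaults made explicit:

    --use_galore --galore_target all --galore_rank 16 \
        --galore_update_interval 200 --galore_scale 0.25
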
  --pref_beta PREF_BETA
                        The beta parameter in the preference loss. (default: 0.1)
  --pref_ftx PREF_FTX   The supervised fine-tuning loss coefficient in DPO training. (default: 0.0)
  --pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}
                        The type of DPO loss to use. (default: sigmoid)
  --dpo_label_smoothing DPO_LABEL_SMOOTHING
                        The robust DPO label smoothing parameter in cDPO that should be between 0 and 0.5. (default: 0.0)
  --kto_chosen_weight KTO_CHOSEN_WEIGHT
                        The weight factor of the desirable losses in KTO training. (default: 1.0)
  --kto_rejected_weight KTO_REJECTED_WEIGHT
                        The weight factor of the undesirable losses in KTO training. (default: 1.0)
  --simpo_gamma SIMPO_GAMMA
                        The target reward margin term in SimPO loss. (default: 0.5)
  --ppo_buffer_size PPO_BUFFER_SIZE
                        The number of mini-batches to make experience buffer in a PPO optimization step. (default: 1)
  --ppo_epochs PPO_EPOCHS
                        The number of epochs to perform in a PPO optimization step. (default: 4)
  --ppo_score_norm [PPO_SCORE_NORM]
                        Use score normalization in PPO training. (default: False)
  --ppo_target PPO_TARGET
                        Target KL value for adaptive KL control in PPO training. (default: 6.0)
  --ppo_whiten_rewards [PPO_WHITEN_REWARDS]
                        Whiten the rewards before computing advantages in PPO training. (default: False)
  --ref_model REF_MODEL
                        Path to the reference model used for the PPO or DPO training. (default: None)
  --ref_model_adapters REF_MODEL_ADAPTERS
                        Path to the adapters of the reference model. (default: None)
  --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT
                        The number of bits to quantize the reference model. (default: None)
  --reward_model REWARD_MODEL
                        Path to the reward model used for the PPO training. (default: None)
  --reward_model_adapters REWARD_MODEL_ADAPTERS
                        Path to the adapters of the reward model. (default: None)
  --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT
                        The number of bits to quantize the reward model. (default: None)
  --reward_model_type {lora,full,api}
                        The type of the reward model in PPO training. Lora model only supports lora training. (default: lora)
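
A sketch of a DPO configuration assembled from the preference and reference-model flags above (--stage is documented further below; the reference model path is a placeholder, and whether it may be omitted is not stated by this help text):

    --stage dpo --pref_beta 0.1 --pref_loss sigmoid \
        --dpo_label_smoothing 0.0 --ref_model <org/ref_model>
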
  --additional_target ADDITIONAL_TARGET
                        Name(s) of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. Use commas to separate multiple modules. (default: None)
  --lora_alpha LORA_ALPHA
                        The scale factor for LoRA fine-tuning (default: lora_rank * 2). (default: None)
  --lora_dropout LORA_DROPOUT
                        Dropout rate for the LoRA fine-tuning. (default: 0.0)
  --lora_rank LORA_RANK
                        The intrinsic dimension for LoRA fine-tuning. (default: 8)
  --lora_target LORA_TARGET
                        Name(s) of target modules to apply LoRA. Use commas to separate multiple modules. Use `all` to specify all the linear modules. (default: all)
  --loraplus_lr_ratio LORAPLUS_LR_RATIO
                        LoRA plus learning rate ratio (lr_B / lr_A). (default: None)
  --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING
                        LoRA plus learning rate for lora embedding layers. (default: 1e-06)
  --use_rslora [USE_RSLORA]
                        Whether or not to use the rank stabilization scaling factor for LoRA layer. (default: False)
  --use_dora [USE_DORA]
                        Whether or not to use the weight-decomposed lora method (DoRA). (default: False)
  --pissa_init [PISSA_INIT]
                        Whether or not to initialize a PiSSA adapter. (default: False)
  --pissa_iter PISSA_ITER
                        The number of iteration steps performed by FSVD in PiSSA. Use -1 to disable it. (default: 16)
  --pissa_convert [PISSA_CONVERT]
                        Whether or not to convert the PiSSA adapter to a normal LoRA adapter. (default: False)
  --create_new_adapter [CREATE_NEW_ADAPTER]
                        Whether or not to create a new adapter with randomly initialized weight. (default: False)
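
A sketch of typical LoRA hyperparameters using the flags above (the values are common choices, not necessarily the documented defaults):

    --lora_rank 8 --lora_alpha 16 --lora_dropout 0.05 --lora_target all
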
|
||
|
--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS
|
||
|
The number of trainable layers for freeze (partial-
|
||
|
parameter) fine-tuning. Positive numbers mean the last
|
||
|
n layers are set as trainable, negative numbers mean
|
||
|
the first n layers are set as trainable. (default: 2)
|
||
|
--freeze_trainable_modules FREEZE_TRAINABLE_MODULES
|
||
|
Name(s) of trainable modules for freeze (partial-
|
||
|
parameter) fine-tuning. Use commas to separate
|
||
|
multiple modules. Use `all` to specify all the
|
||
|
available modules. (default: all)
|
||
|
--freeze_extra_modules FREEZE_EXTRA_MODULES
|
||
|
Name(s) of modules apart from hidden layers to be set
|
||
|
as trainable for freeze (partial-parameter) fine-
|
||
|
tuning. Use commas to separate multiple modules.
|
||
|
(default: None)
|
||
|
  --pure_bf16 [PURE_BF16]
                        Whether or not to train the model in purely bf16
                        precision (without AMP). (default: False)
  --stage {pt,sft,rm,ppo,dpo,kto}
                        Which stage will be performed in training. (default:
                        sft)
  --finetuning_type {lora,freeze,full}
                        Which fine-tuning method to use. (default: lora)
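
The two choices are orthogonal: --stage picks the training objective
(conventionally pt = pre-training, sft = supervised fine-tuning, rm =
reward modeling, and ppo/dpo/kto the preference-optimization stages), while
--finetuning_type picks which parameters are updated. A hypothetical
full-parameter DPO run (paths are placeholders):

  # Hypothetical sketch: DPO preference training with full-parameter
  # updates in pure bf16.
  python launcher.py \
      --stage dpo \
      --do_train \
      --model_name_or_path <base-model> \
      --finetuning_type full \
      --pure_bf16 \
      --dataset <preference-dataset> \
      --output_dir <dpo-output-dir>
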
  --use_llama_pro [USE_LLAMA_PRO]
                        Whether or not to make only the parameters in the
                        expanded blocks trainable. (default: False)
  --use_adam_mini [USE_ADAM_MINI]
                        Whether or not to use the Adam-mini optimizer.
                        (default: False)
  --freeze_vision_tower [FREEZE_VISION_TOWER]
                        Whether or not to freeze the vision tower in MLLM
                        training. (default: True)
  --no_freeze_vision_tower
                        Whether or not to freeze the vision tower in MLLM
                        training. (default: False)
  --train_mm_proj_only [TRAIN_MM_PROJ_ONLY]
                        Whether or not to train only the multimodal
                        projector in MLLM training. (default: False)
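
A hypothetical multimodal run (not from this log; paths are placeholders)
that keeps the vision tower frozen and updates only the multimodal
projector:

  # Hypothetical sketch: MLLM tuning with a frozen vision tower.
  python launcher.py \
      --stage sft \
      --do_train \
      --model_name_or_path <mllm-base-model> \
      --visual_inputs \
      --freeze_vision_tower \
      --train_mm_proj_only \
      --dataset <multimodal-dataset> \
      --output_dir <mllm-output-dir>
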
  --compute_accuracy [COMPUTE_ACCURACY]
                        Whether or not to compute the token-level accuracy
                        at evaluation. (default: False)
  --plot_loss [PLOT_LOSS]
                        Whether or not to save the training loss curves.
                        (default: False)
  --do_sample [DO_SAMPLE]
                        Whether or not to use sampling; use greedy decoding
                        otherwise. (default: True)
  --no_do_sample        Whether or not to use sampling; use greedy decoding
                        otherwise. (default: False)
  --temperature TEMPERATURE
                        The value used to modulate the next-token
                        probabilities. (default: 0.95)
  --top_p TOP_P         The smallest set of most probable tokens with
                        probabilities that add up to top_p or higher is
                        kept. (default: 0.7)
  --top_k TOP_K         The number of highest-probability vocabulary tokens
                        to keep for top-k filtering. (default: 50)
  --num_beams NUM_BEAMS
                        Number of beams for beam search. 1 means no beam
                        search. (default: 1)
  --max_length MAX_LENGTH
                        The maximum length the generated tokens can have.
                        It can be overridden by max_new_tokens. (default:
                        1024)
  --max_new_tokens MAX_NEW_TOKENS
                        The maximum number of tokens to generate, ignoring
                        the number of tokens in the prompt. (default: 1024)
  --repetition_penalty REPETITION_PENALTY
                        The parameter for repetition penalty. 1.0 means no
                        penalty. (default: 1.0)
  --length_penalty LENGTH_PENALTY
                        Exponential penalty to the length that is used with
                        beam-based generation. (default: 1.0)
  --default_system DEFAULT_SYSTEM
                        Default system message to use in chat completion.
                        (default: None)
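
A hypothetical decoding configuration (not from this log; paths are
placeholders) that exercises the sampling flags above; greedy decoding
would pass --no_do_sample instead:

  # Hypothetical sketch: sampling-based generation for a predict run.
  python launcher.py \
      --do_predict \
      --model_name_or_path <base-model> \
      --do_sample \
      --temperature 0.95 \
      --top_p 0.7 \
      --top_k 50 \
      --max_new_tokens 1024 \
      --dataset <eval-dataset> \
      --output_dir <predict-output-dir>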
|
||
|
usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH
|
||
|
[--adapter_name_or_path ADAPTER_NAME_OR_PATH]
|
||
|
[--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR]
|
||
|
[--use_fast_tokenizer [USE_FAST_TOKENIZER]]
|
||
|
[--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]]
|
||
|
[--split_special_tokens [SPLIT_SPECIAL_TOKENS]]
|
||
|
[--new_special_tokens NEW_SPECIAL_TOKENS]
|
||
|
[--model_revision MODEL_REVISION]
|
||
|
[--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]]
|
||
|
[--no_low_cpu_mem_usage]
|
||
|
[--quantization_method {bitsandbytes,hqq,eetq}]
|
||
|
[--quantization_bit QUANTIZATION_BIT]
|
||
|
[--quantization_type {fp4,nf4}]
|
||
|
[--double_quantization [DOUBLE_QUANTIZATION]]
|
||
|
[--no_double_quantization]
|
||
|
[--quantization_device_map {auto}]
|
||
|
[--rope_scaling {linear,dynamic}]
|
||
|
[--flash_attn {auto,disabled,sdpa,fa2}]
|
||
|
[--shift_attn [SHIFT_ATTN]]
|
||
|
[--mixture_of_depths {convert,load}]
|
||
|
[--use_unsloth [USE_UNSLOTH]]
|
||
|
[--visual_inputs [VISUAL_INPUTS]]
|
||
|
[--moe_aux_loss_coef MOE_AUX_LOSS_COEF]
|
||
|
[--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]]
|
||
|
[--upcast_layernorm [UPCAST_LAYERNORM]]
|
||
|
[--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]]
|
||
|
[--train_from_scratch [TRAIN_FROM_SCRATCH]]
|
||
|
[--infer_backend {huggingface,vllm}]
|
||
|
[--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL]
|
||
|
[--vllm_enforce_eager [VLLM_ENFORCE_EAGER]]
|
||
|
[--vllm_max_lora_rank VLLM_MAX_LORA_RANK]
|
||
|
[--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]]
|
||
|
[--no_use_cache]
|
||
|
[--infer_dtype {auto,float16,bfloat16,float32}]
|
||
|
[--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN]
|
||
|
[--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE]
|
||
|
[--export_device {cpu,auto}]
|
||
|
[--export_quantization_bit EXPORT_QUANTIZATION_BIT]
|
||
|
[--export_quantization_dataset EXPORT_QUANTIZATION_DATASET]
|
||
|
[--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES]
|
||
|
[--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN]
|
||
|
[--export_legacy_format [EXPORT_LEGACY_FORMAT]]
|
||
|
[--export_hub_model_id EXPORT_HUB_MODEL_ID]
|
||
|
[--print_param_status [PRINT_PARAM_STATUS]]
|
||
|
[--template TEMPLATE] [--dataset DATASET]
|
||
|
[--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR]
|
||
|
[--cutoff_len CUTOFF_LEN]
|
||
|
[--train_on_prompt [TRAIN_ON_PROMPT]]
|
||
|
[--mask_history [MASK_HISTORY]] [--streaming [STREAMING]]
|
||
|
[--buffer_size BUFFER_SIZE]
|
||
|
[--mix_strategy {concat,interleave_under,interleave_over}]
|
||
|
[--interleave_probs INTERLEAVE_PROBS]
|
||
|
[--overwrite_cache [OVERWRITE_CACHE]]
|
||
|
[--preprocessing_num_workers PREPROCESSING_NUM_WORKERS]
|
||
|
[--max_samples MAX_SAMPLES]
|
||
|
[--eval_num_beams EVAL_NUM_BEAMS]
|
||
|
[--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]]
|
||
|
[--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE]
|
||
|
[--packing PACKING] [--neat_packing [NEAT_PACKING]]
|
||
|
[--tool_format TOOL_FORMAT]
|
||
|
[--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR
|
||
|
[--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]]
|
||
|
[--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]
|
||
|
[--do_predict [DO_PREDICT]]
|
||
|
[--eval_strategy {no,steps,epoch}]
|
||
|
[--prediction_loss_only [PREDICTION_LOSS_ONLY]]
|
||
|
[--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE]
|
||
|
[--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]
|
||
|
[--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE]
|
||
|
[--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE]
|
||
|
[--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
|
||
|
[--eval_accumulation_steps EVAL_ACCUMULATION_STEPS]
|
||
|
[--eval_delay EVAL_DELAY]
|
||
|
[--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS]
|
||
|
[--learning_rate LEARNING_RATE]
|
||
|
[--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]
|
||
|
[--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON]
|
||
|
[--max_grad_norm MAX_GRAD_NORM]
|
||
|
[--num_train_epochs NUM_TRAIN_EPOCHS]
|
||
|
[--max_steps MAX_STEPS]
|
||
|
[--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}]
|
||
|
[--lr_scheduler_kwargs LR_SCHEDULER_KWARGS]
|
||
|
[--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS]
|
||
|
[--log_level {detail,debug,info,warning,error,critical,passive}]
|
||
|
[--log_level_replica {detail,debug,info,warning,error,critical,passive}]
|
||
|
[--log_on_each_node [LOG_ON_EACH_NODE]]
|
||
|
[--no_log_on_each_node] [--logging_dir LOGGING_DIR]
|
||
|
[--logging_strategy {no,steps,epoch}]
|
||
|
[--logging_first_step [LOGGING_FIRST_STEP]]
|
||
|
[--logging_steps LOGGING_STEPS]
|
||
|
[--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]]
|
||
|
[--no_logging_nan_inf_filter]
|
||
|
[--save_strategy {no,steps,epoch}]
|
||
|
[--save_steps SAVE_STEPS]
|
||
|
[--save_total_limit SAVE_TOTAL_LIMIT]
|
||
|
[--save_safetensors [SAVE_SAFETENSORS]]
|
||
|
[--no_save_safetensors]
|
||
|
[--save_on_each_node [SAVE_ON_EACH_NODE]]
|
||
|
[--save_only_model [SAVE_ONLY_MODEL]]
|
||
|
[--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]]
|
||
|
[--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]]
|
||
|
[--use_mps_device [USE_MPS_DEVICE]] [--seed SEED]
|
||
|
[--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]
|
||
|
[--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]]
|
||
|
[--fp16_opt_level FP16_OPT_LEVEL]
|
||
|
[--half_precision_backend {auto,apex,cpu_amp}]
|
||
|
[--bf16_full_eval [BF16_FULL_EVAL]]
|
||
|
[--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32]
|
||
|
[--local_rank LOCAL_RANK]
|
||
|
[--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}]
|
||
|
[--tpu_num_cores TPU_NUM_CORES]
|
||
|
[--tpu_metrics_debug [TPU_METRICS_DEBUG]]
|
||
|
[--debug DEBUG [DEBUG ...]]
|
||
|
[--dataloader_drop_last [DATALOADER_DROP_LAST]]
|
||
|
[--eval_steps EVAL_STEPS]
|
||
|
[--dataloader_num_workers DATALOADER_NUM_WORKERS]
|
||
|
[--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR]
|
||
|
[--past_index PAST_INDEX] [--run_name RUN_NAME]
|
||
|
[--disable_tqdm DISABLE_TQDM]
|
||
|
[--remove_unused_columns [REMOVE_UNUSED_COLUMNS]]
|
||
|
[--no_remove_unused_columns]
|
||
|
[--label_names LABEL_NAMES [LABEL_NAMES ...]]
|
||
|
[--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]]
|
||
|
[--metric_for_best_model METRIC_FOR_BEST_MODEL]
|
||
|
[--greater_is_better GREATER_IS_BETTER]
|
||
|
[--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP]
|
||
|
[--fsdp_min_num_params FSDP_MIN_NUM_PARAMS]
|
||
|
[--fsdp_config FSDP_CONFIG]
|
||
|
[--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP]
|
||
|
[--accelerator_config ACCELERATOR_CONFIG]
|
||
|
[--deepspeed DEEPSPEED]
|
||
|
[--label_smoothing_factor LABEL_SMOOTHING_FACTOR]
|
||
|
[--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}]
|
||
|
[--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]]
|
||
|
[--group_by_length [GROUP_BY_LENGTH]]
|
||
|
[--length_column_name LENGTH_COLUMN_NAME]
|
||
|
[--report_to REPORT_TO]
|
||
|
[--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS]
|
||
|
[--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB]
|
||
|
[--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS]
|
||
|
[--dataloader_pin_memory [DATALOADER_PIN_MEMORY]]
|
||
|
[--no_dataloader_pin_memory]
|
||
|
[--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]]
|
||
|
[--skip_memory_metrics [SKIP_MEMORY_METRICS]]
|
||
|
[--no_skip_memory_metrics]
|
||
|
[--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]]
|
||
|
[--push_to_hub [PUSH_TO_HUB]]
|
||
|
[--resume_from_checkpoint RESUME_FROM_CHECKPOINT]
|
||
|
[--hub_model_id HUB_MODEL_ID]
|
||
|
[--hub_strategy {end,every_save,checkpoint,all_checkpoints}]
|
||
|
[--hub_token HUB_TOKEN]
|
||
|
[--hub_private_repo [HUB_PRIVATE_REPO]]
|
||
|
[--hub_always_push [HUB_ALWAYS_PUSH]]
|
||
|
[--gradient_checkpointing [GRADIENT_CHECKPOINTING]]
|
||
|
[--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS]
|
||
|
[--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]]
|
||
|
[--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]]
|
||
|
[--no_eval_do_concat_batches]
|
||
|
[--fp16_backend {auto,apex,cpu_amp}]
|
||
|
[--evaluation_strategy {no,steps,epoch}]
|
||
|
[--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]
|
||
|
[--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION]
|
||
|
[--push_to_hub_token PUSH_TO_HUB_TOKEN]
|
||
|
[--mp_parameters MP_PARAMETERS]
|
||
|
[--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]]
|
||
|
[--full_determinism [FULL_DETERMINISM]]
|
||
|
[--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE]
|
||
|
[--ddp_timeout DDP_TIMEOUT]
|
||
|
[--torch_compile [TORCH_COMPILE]]
|
||
|
[--torch_compile_backend TORCH_COMPILE_BACKEND]
|
||
|
[--torch_compile_mode TORCH_COMPILE_MODE]
|
||
|
[--dispatch_batches DISPATCH_BATCHES]
|
||
|
[--split_batches SPLIT_BATCHES]
|
||
|
[--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]]
|
||
|
[--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]]
|
||
|
[--neftune_noise_alpha NEFTUNE_NOISE_ALPHA]
|
||
|
[--optim_target_modules OPTIM_TARGET_MODULES]
|
||
|
[--batch_eval_metrics [BATCH_EVAL_METRICS]]
|
||
|
[--eval_on_start [EVAL_ON_START]]
|
||
|
[--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]]
|
||
|
[--sortish_sampler [SORTISH_SAMPLER]]
|
||
|
[--predict_with_generate [PREDICT_WITH_GENERATE]]
|
||
|
[--generation_max_length GENERATION_MAX_LENGTH]
|
||
|
[--generation_num_beams GENERATION_NUM_BEAMS]
|
||
|
[--generation_config GENERATION_CONFIG]
|
||
|
[--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}]
|
||
|
[--badam_start_block BADAM_START_BLOCK]
|
||
|
[--badam_switch_mode {ascending,descending,random,fixed}]
|
||
|
[--badam_switch_interval BADAM_SWITCH_INTERVAL]
|
||
|
[--badam_update_ratio BADAM_UPDATE_RATIO]
|
||
|
[--badam_mask_mode {adjacent,scatter}]
|
||
|
[--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]]
|
||
|
[--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK]
|
||
|
[--galore_update_interval GALORE_UPDATE_INTERVAL]
|
||
|
[--galore_scale GALORE_SCALE]
|
||
|
[--galore_proj_type {std,reverse_std,right,left,full}]
|
||
|
[--galore_layerwise [GALORE_LAYERWISE]]
|
||
|
[--pref_beta PREF_BETA] [--pref_ftx PREF_FTX]
|
||
|
[--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}]
|
||
|
[--dpo_label_smoothing DPO_LABEL_SMOOTHING]
|
||
|
[--kto_chosen_weight KTO_CHOSEN_WEIGHT]
|
||
|
[--kto_rejected_weight KTO_REJECTED_WEIGHT]
|
||
|
[--simpo_gamma SIMPO_GAMMA]
|
||
|
[--ppo_buffer_size PPO_BUFFER_SIZE]
|
||
|
[--ppo_epochs PPO_EPOCHS]
|
||
|
[--ppo_score_norm [PPO_SCORE_NORM]]
|
||
|
[--ppo_target PPO_TARGET]
|
||
|
[--ppo_whiten_rewards [PPO_WHITEN_REWARDS]]
|
||
|
[--ref_model REF_MODEL]
|
||
|
[--ref_model_adapters REF_MODEL_ADAPTERS]
|
||
|
[--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT]
|
||
|
[--reward_model REWARD_MODEL]
|
||
|
[--reward_model_adapters REWARD_MODEL_ADAPTERS]
|
||
|
[--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT]
|
||
|
[--reward_model_type {lora,full,api}]
|
||
|
[--additional_target ADDITIONAL_TARGET]
|
||
|
[--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT]
|
||
|
[--lora_rank LORA_RANK] [--lora_target LORA_TARGET]
|
||
|
[--loraplus_lr_ratio LORAPLUS_LR_RATIO]
|
||
|
[--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING]
|
||
|
[--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]]
|
||
|
[--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER]
|
||
|
[--pissa_convert [PISSA_CONVERT]]
|
||
|
[--create_new_adapter [CREATE_NEW_ADAPTER]]
|
||
|
[--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS]
|
||
|
[--freeze_trainable_modules FREEZE_TRAINABLE_MODULES]
|
||
|
[--freeze_extra_modules FREEZE_EXTRA_MODULES]
|
||
|
[--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}]
|
||
|
[--finetuning_type {lora,freeze,full}]
|
||
|
[--use_llama_pro [USE_LLAMA_PRO]]
|
||
|
[--use_adam_mini [USE_ADAM_MINI]]
|
||
|
[--freeze_vision_tower [FREEZE_VISION_TOWER]]
|
||
|
[--no_freeze_vision_tower]
|
||
|
[--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]]
|
||
|
[--compute_accuracy [COMPUTE_ACCURACY]]
|
||
|
[--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]]
|
||
|
[--no_do_sample] [--temperature TEMPERATURE]
|
||
|
[--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS]
|
||
|
[--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS]
|
||
|
[--repetition_penalty REPETITION_PENALTY]
|
||
|
[--length_penalty LENGTH_PENALTY]
|
||
|
[--default_system DEFAULT_SYSTEM]
|
||
|
|
||
|
optional arguments:
|
||
|
-h, --help show this help message and exit
|
||
|
--model_name_or_path MODEL_NAME_OR_PATH
|
||
|
Path to the model weight or identifier from
|
||
|
huggingface.co/models or modelscope.cn/models.
|
||
|
(default: None)
|
||
|
--adapter_name_or_path ADAPTER_NAME_OR_PATH
|
||
|
Path to the adapter weight or identifier from
|
||
|
huggingface.co/models. Use commas to separate multiple
|
||
|
adapters. (default: None)
|
||
|
--adapter_folder ADAPTER_FOLDER
|
||
|
The folder containing the adapter weights to load.
|
||
|
(default: None)
|
||
|
--cache_dir CACHE_DIR
|
||
|
Where to store the pre-trained models downloaded from
|
||
|
huggingface.co or modelscope.cn. (default: None)
|
||
|
--use_fast_tokenizer [USE_FAST_TOKENIZER]
|
||
|
Whether or not to use one of the fast tokenizer
|
||
|
(backed by the tokenizers library). (default: True)
|
||
|
--no_use_fast_tokenizer
|
||
|
Whether or not to use one of the fast tokenizer
|
||
|
(backed by the tokenizers library). (default: False)
|
||
|
--resize_vocab [RESIZE_VOCAB]
|
||
|
Whether or not to resize the tokenizer vocab and the
|
||
|
embedding layers. (default: False)
|
||
|
--split_special_tokens [SPLIT_SPECIAL_TOKENS]
|
||
|
Whether or not the special tokens should be split
|
||
|
during the tokenization process. (default: False)
|
||
|
--new_special_tokens NEW_SPECIAL_TOKENS
|
||
|
Special tokens to be added into the tokenizer. Use
|
||
|
commas to separate multiple tokens. (default: None)
|
||
|
--model_revision MODEL_REVISION
|
||
|
The specific model version to use (can be a branch
|
||
|
name, tag name or commit id). (default: main)
|
||
|
--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]
|
||
|
Whether or not to use memory-efficient model loading.
|
||
|
(default: True)
|
||
|
--no_low_cpu_mem_usage
|
||
|
Whether or not to use memory-efficient model loading.
|
||
|
(default: False)
|
||
|
--quantization_method {bitsandbytes,hqq,eetq}
|
||
|
Quantization method to use for on-the-fly
|
||
|
quantization. (default: bitsandbytes)
|
||
|
--quantization_bit QUANTIZATION_BIT
|
||
|
The number of bits to quantize the model using
|
||
|
bitsandbytes. (default: None)
|
||
|
--quantization_type {fp4,nf4}
|
||
|
Quantization data type to use in int4 training.
|
||
|
(default: nf4)
|
||
|
--double_quantization [DOUBLE_QUANTIZATION]
|
||
|
Whether or not to use double quantization in int4
|
||
|
training. (default: True)
|
||
|
--no_double_quantization
|
||
|
Whether or not to use double quantization in int4
|
||
|
training. (default: False)
|
||
|
--quantization_device_map {auto}
|
||
|
Device map used to infer the 4-bit quantized model,
|
||
|
needs bitsandbytes>=0.43.0. (default: None)
|
||
|
--rope_scaling {linear,dynamic}
|
||
|
Which scaling strategy should be adopted for the RoPE
|
||
|
embeddings. (default: None)
|
||
|
--flash_attn {auto,disabled,sdpa,fa2}
|
||
|
Enable FlashAttention for faster training and
|
||
|
inference. (default: auto)
|
||
|
--shift_attn [SHIFT_ATTN]
|
||
|
Enable shift short attention (S^2-Attn) proposed by
|
||
|
LongLoRA. (default: False)
|
||
|
--mixture_of_depths {convert,load}
|
||
|
Convert the model to mixture-of-depths (MoD) or load
|
||
|
the MoD model. (default: None)
|
||
|
--use_unsloth [USE_UNSLOTH]
|
||
|
Whether or not to use unsloth's optimization for the
|
||
|
LoRA training. (default: False)
|
||
|
--visual_inputs [VISUAL_INPUTS]
|
||
|
Whethor or not to use multimodal LLM that accepts
|
||
|
visual inputs. (default: False)
|
||
|
--moe_aux_loss_coef MOE_AUX_LOSS_COEF
|
||
|
Coefficient of the auxiliary router loss in mixture-
|
||
|
of-experts model. (default: None)
|
||
|
--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]
|
||
|
Whether or not to disable gradient checkpointing.
|
||
|
(default: False)
|
||
|
--upcast_layernorm [UPCAST_LAYERNORM]
|
||
|
Whether or not to upcast the layernorm weights in
|
||
|
fp32. (default: False)
|
||
|
--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]
|
||
|
Whether or not to upcast the output of lm_head in
|
||
|
fp32. (default: False)
|
||
|
--train_from_scratch [TRAIN_FROM_SCRATCH]
|
||
|
Whether or not to randomly initialize the model
|
||
|
weights. (default: False)
|
||
|
--infer_backend {huggingface,vllm}
|
||
|
Backend engine used at inference. (default:
|
||
|
huggingface)
|
||
|
--vllm_maxlen VLLM_MAXLEN
|
||
|
Maximum sequence (prompt + response) length of the
|
||
|
vLLM engine. (default: 2048)
|
||
|
--vllm_gpu_util VLLM_GPU_UTIL
|
||
|
The fraction of GPU memory in (0,1) to be used for the
|
||
|
vLLM engine. (default: 0.9)
|
||
|
--vllm_enforce_eager [VLLM_ENFORCE_EAGER]
|
||
|
Whether or not to disable CUDA graph in the vLLM
|
||
|
engine. (default: False)
|
||
|
--vllm_max_lora_rank VLLM_MAX_LORA_RANK
|
||
|
Maximum rank of all LoRAs in the vLLM engine.
|
||
|
(default: 32)
|
||
|
--offload_folder OFFLOAD_FOLDER
|
||
|
Path to offload model weights. (default: offload)
|
||
|
--use_cache [USE_CACHE]
|
||
|
Whether or not to use KV cache in generation.
|
||
|
(default: True)
|
||
|
--no_use_cache Whether or not to use KV cache in generation.
|
||
|
(default: False)
|
||
|
--infer_dtype {auto,float16,bfloat16,float32}
|
||
|
Data type for model weights and activations at
|
||
|
inference. (default: auto)
|
||
|
--hf_hub_token HF_HUB_TOKEN
|
||
|
Auth token to log in with Hugging Face Hub. (default:
|
||
|
None)
|
||
|
--ms_hub_token MS_HUB_TOKEN
|
||
|
Auth token to log in with ModelScope Hub. (default:
|
||
|
None)
|
||
|
--export_dir EXPORT_DIR
|
||
|
Path to the directory to save the exported model.
|
||
|
(default: None)
|
||
|
--export_size EXPORT_SIZE
|
||
|
The file shard size (in GB) of the exported model.
|
||
|
(default: 1)
|
||
|
--export_device {cpu,auto}
|
||
|
The device used in model export, use `auto` to
|
||
|
accelerate exporting. (default: cpu)
|
||
|
--export_quantization_bit EXPORT_QUANTIZATION_BIT
|
||
|
The number of bits to quantize the exported model.
|
||
|
(default: None)
|
||
|
--export_quantization_dataset EXPORT_QUANTIZATION_DATASET
|
||
|
Path to the dataset or dataset name to use in
|
||
|
quantizing the exported model. (default: None)
|
||
|
--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES
|
||
|
The number of samples used for quantization. (default:
|
||
|
128)
|
||
|
--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN
|
||
|
The maximum length of the model inputs used for
|
||
|
quantization. (default: 1024)
|
||
|
--export_legacy_format [EXPORT_LEGACY_FORMAT]
|
||
|
Whether or not to save the `.bin` files instead of
|
||
|
`.safetensors`. (default: False)
|
||
|
--export_hub_model_id EXPORT_HUB_MODEL_ID
|
||
|
The name of the repository if push the model to the
|
||
|
Hugging Face hub. (default: None)
|
||
|
--print_param_status [PRINT_PARAM_STATUS]
|
||
|
For debugging purposes, print the status of the
|
||
|
parameters in the model. (default: False)
|
||
|
--template TEMPLATE Which template to use for constructing prompts in
|
||
|
training and inference. (default: None)
|
||
|
--dataset DATASET The name of dataset(s) to use for training. Use commas
|
||
|
to separate multiple datasets. (default: None)
|
||
|
--eval_dataset EVAL_DATASET
|
||
|
The name of dataset(s) to use for evaluation. Use
|
||
|
commas to separate multiple datasets. (default: None)
|
||
|
--dataset_dir DATASET_DIR
|
||
|
Path to the folder containing the datasets. (default:
|
||
|
data)
|
||
|
--cutoff_len CUTOFF_LEN
|
||
|
The cutoff length of the tokenized inputs in the
|
||
|
dataset. (default: 1024)
|
||
|
--train_on_prompt [TRAIN_ON_PROMPT]
|
||
|
Whether or not to disable the mask on the prompt.
|
||
|
(default: False)
|
||
|
--mask_history [MASK_HISTORY]
|
||
|
Whether or not to mask the history and train on the
|
||
|
last turn only. (default: False)
|
||
|
--streaming [STREAMING]
|
||
|
Enable dataset streaming. (default: False)
|
||
|
--buffer_size BUFFER_SIZE
|
||
|
Size of the buffer to randomly sample examples from in
|
||
|
dataset streaming. (default: 16384)
|
||
|
--mix_strategy {concat,interleave_under,interleave_over}
|
||
|
Strategy to use in dataset mixing (concat/interleave)
|
||
|
(undersampling/oversampling). (default: concat)
|
||
|
--interleave_probs INTERLEAVE_PROBS
|
||
|
Probabilities to sample data from datasets. Use commas
|
||
|
to separate multiple datasets. (default: None)
|
||
|
--overwrite_cache [OVERWRITE_CACHE]
|
||
|
Overwrite the cached training and evaluation sets.
|
||
|
(default: False)
|
||
|
--preprocessing_num_workers PREPROCESSING_NUM_WORKERS
|
||
|
The number of processes to use for the pre-processing.
|
||
|
(default: None)
|
||
|
--max_samples MAX_SAMPLES
|
||
|
For debugging purposes, truncate the number of
|
||
|
examples for each dataset. (default: None)
|
||
|
--eval_num_beams EVAL_NUM_BEAMS
|
||
|
Number of beams to use for evaluation. This argument
|
||
|
will be passed to `model.generate` (default: None)
|
||
|
--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]
|
||
|
Whether or not to ignore the tokens corresponding to
|
||
|
the pad label in loss computation. (default: True)
|
||
|
--no_ignore_pad_token_for_loss
|
||
|
Whether or not to ignore the tokens corresponding to
|
||
|
the pad label in loss computation. (default: False)
|
||
|
--val_size VAL_SIZE Size of the development set, should be an integer or a
|
||
|
float in range `[0,1)`. (default: 0.0)
|
||
|
--packing PACKING Enable sequences packing in training. Will
|
||
|
automatically enable in pre-training. (default: None)
|
||
|
--neat_packing [NEAT_PACKING]
|
||
|
Enable sequence packing without cross-attention.
|
||
|
(default: False)
|
||
|
--tool_format TOOL_FORMAT
|
||
|
Tool format to use for constructing function calling
|
||
|
examples. (default: None)
|
||
|
--tokenized_path TOKENIZED_PATH
|
||
|
Path to save or load the tokenized datasets. (default:
|
||
|
None)
|
||
|
--output_dir OUTPUT_DIR
|
||
|
The output directory where the model predictions and
|
||
|
checkpoints will be written. (default: None)
|
||
|
--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]
|
||
|
Overwrite the content of the output directory. Use
|
||
|
this to continue training if output_dir points to a
|
||
|
checkpoint directory. (default: False)
|
||
|
--do_train [DO_TRAIN]
|
||
|
Whether to run training. (default: False)
|
||
|
--do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False)
|
||
|
--do_predict [DO_PREDICT]
|
||
|
Whether to run predictions on the test set. (default:
|
||
|
False)
|
||
|
--eval_strategy {no,steps,epoch}
|
||
|
The evaluation strategy to use. (default: no)
|
||
|
--prediction_loss_only [PREDICTION_LOSS_ONLY]
|
||
|
When performing evaluation and predictions, only
|
||
|
returns the loss. (default: False)
|
||
|
--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE
|
||
|
Batch size per GPU/TPU/MPS/NPU core/CPU for training.
|
||
|
(default: 8)
|
||
|
--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE
|
||
|
Batch size per GPU/TPU/MPS/NPU core/CPU for
|
||
|
evaluation. (default: 8)
|
||
|
--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE
|
||
|
Deprecated, the use of `--per_device_train_batch_size`
|
||
|
is preferred. Batch size per GPU/TPU core/CPU for
|
||
|
training. (default: None)
|
||
|
--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE
|
||
|
Deprecated, the use of `--per_device_eval_batch_size`
|
||
|
is preferred. Batch size per GPU/TPU core/CPU for
|
||
|
evaluation. (default: None)
|
||
|
--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS
|
||
|
Number of updates steps to accumulate before
|
||
|
performing a backward/update pass. (default: 1)
|
||
|
--eval_accumulation_steps EVAL_ACCUMULATION_STEPS
|
||
|
Number of predictions steps to accumulate before
|
||
|
moving the tensors to the CPU. (default: None)
|
||
|
--eval_delay EVAL_DELAY
|
||
|
Number of epochs or steps to wait for before the first
|
||
|
evaluation can be performed, depending on the
|
||
|
eval_strategy. (default: 0)
|
||
|
--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS
|
||
|
Number of steps to wait before calling
|
||
|
`torch.<device>.empty_cache()`.This can help avoid
|
||
|
CUDA out-of-memory errors by lowering peak VRAM usage
|
||
|
at a cost of about [10{'option_strings': ['--
|
||
|
torch_empty_cache_steps'], 'dest':
|
||
|
'torch_empty_cache_steps', 'nargs': None, 'const':
|
||
|
None, 'default': None, 'type': 'int', 'choices': None,
|
||
|
'required': False, 'help': 'Number of steps to wait
|
||
|
before calling `torch.<device>.empty_cache()`.This can
|
||
|
help avoid CUDA out-of-memory errors by lowering peak
|
||
|
VRAM usage at a cost of about [10% slower performance]
|
||
|
(https://github.com/huggingface/transformers/issues/31
|
||
|
372).If left unset or set to None, cache will not be
|
||
|
emptied.', 'metavar': None, 'container':
|
||
|
<argparse._ArgumentGroup object at 0x7f0f4840dfd0>,
|
||
|
'prog': 'launcher.py'}lower performance](https://githu
|
||
|
b.com/huggingface/transformers/issues/31372).If left
|
||
|
unset or set to None, cache will not be emptied.
|
||
|
(default: None)
|
||
|
--learning_rate LEARNING_RATE
|
||
|
The initial learning rate for AdamW. (default: 5e-05)
|
||
|
--weight_decay WEIGHT_DECAY
|
||
|
Weight decay for AdamW if we apply some. (default:
|
||
|
0.0)
|
||
|
--adam_beta1 ADAM_BETA1
|
||
|
Beta1 for AdamW optimizer (default: 0.9)
|
||
|
--adam_beta2 ADAM_BETA2
|
||
|
Beta2 for AdamW optimizer (default: 0.999)
|
||
|
--adam_epsilon ADAM_EPSILON
|
||
|
Epsilon for AdamW optimizer. (default: 1e-08)
|
||
|
--max_grad_norm MAX_GRAD_NORM
|
||
|
Max gradient norm. (default: 1.0)
|
||
|
--num_train_epochs NUM_TRAIN_EPOCHS
|
||
|
Total number of training epochs to perform. (default:
|
||
|
3.0)
|
||
|
--max_steps MAX_STEPS
|
||
|
If > 0: set total number of training steps to perform.
|
||
|
Override num_train_epochs. (default: -1)
|
||
|
--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}
|
||
|
The scheduler type to use. (default: linear)
|
||
|
--lr_scheduler_kwargs LR_SCHEDULER_KWARGS
|
||
|
Extra parameters for the lr_scheduler such as
|
||
|
{'num_cycles': 1} for the cosine with hard restarts.
|
||
|
(default: {})
|
||
|
--warmup_ratio WARMUP_RATIO
|
||
|
Linear warmup over warmup_ratio fraction of total
|
||
|
steps. (default: 0.0)
|
||
|
--warmup_steps WARMUP_STEPS
|
||
|
Linear warmup over warmup_steps. (default: 0)
|
||
|
--log_level {detail,debug,info,warning,error,critical,passive}
|
||
|
Logger log level to use on the main node. Possible
|
||
|
choices are the log levels as strings: 'debug',
|
||
|
'info', 'warning', 'error' and 'critical', plus a
|
||
|
'passive' level which doesn't set anything and lets
|
||
|
the application set the level. Defaults to 'passive'.
|
||
|
(default: passive)
|
||
|
--log_level_replica {detail,debug,info,warning,error,critical,passive}
|
||
|
Logger log level to use on replica nodes. Same choices
|
||
|
and defaults as ``log_level`` (default: warning)
|
||
|
--log_on_each_node [LOG_ON_EACH_NODE]
|
||
|
When doing a multinode distributed training, whether
|
||
|
to log once per node or just once on the main node.
|
||
|
(default: True)
|
||
|
--no_log_on_each_node
|
||
|
When doing a multinode distributed training, whether
|
||
|
to log once per node or just once on the main node.
|
||
|
(default: False)
|
||
|
--logging_dir LOGGING_DIR
|
||
|
Tensorboard log dir. (default: None)
|
||
|
--logging_strategy {no,steps,epoch}
|
||
|
The logging strategy to use. (default: steps)
|
||
|
--logging_first_step [LOGGING_FIRST_STEP]
|
||
|
Log the first global_step (default: False)
|
||
|
--logging_steps LOGGING_STEPS
|
||
|
Log every X updates steps. Should be an integer or a
|
||
|
float in range `[0,1)`. If smaller than 1, will be
|
||
|
interpreted as ratio of total training steps.
|
||
|
(default: 500)
|
||
|
--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]
|
||
|
Filter nan and inf losses for logging. (default: True)
|
||
|
--no_logging_nan_inf_filter
|
||
|
Filter nan and inf losses for logging. (default:
|
||
|
False)
|
||
|
--save_strategy {no,steps,epoch}
|
||
|
The checkpoint save strategy to use. (default: steps)
|
||
|
--save_steps SAVE_STEPS
|
||
|
Save checkpoint every X updates steps. Should be an
|
||
|
integer or a float in range `[0,1)`. If smaller than
|
||
|
1, will be interpreted as ratio of total training
|
||
|
steps. (default: 500)
|
||
|
--save_total_limit SAVE_TOTAL_LIMIT
|
||
|
If a value is passed, will limit the total amount of
|
||
|
checkpoints. Deletes the older checkpoints in
|
||
|
`output_dir`. When `load_best_model_at_end` is
|
||
|
enabled, the 'best' checkpoint according to
|
||
|
`metric_for_best_model` will always be retained in
|
||
|
addition to the most recent ones. For example, for
|
||
|
`save_total_limit=5` and
|
||
|
`load_best_model_at_end=True`, the four last
|
||
|
checkpoints will always be retained alongside the best
|
||
|
model. When `save_total_limit=1` and
|
||
|
`load_best_model_at_end=True`, it is possible that two
|
||
|
checkpoints are saved: the last one and the best one
|
||
|
(if they are different). Default is unlimited
|
||
|
checkpoints (default: None)
|
||
|
--save_safetensors [SAVE_SAFETENSORS]
|
||
|
Use safetensors saving and loading for state dicts
|
||
|
instead of default torch.load and torch.save.
|
||
|
(default: True)
|
||
|
--no_save_safetensors
|
||
|
Use safetensors saving and loading for state dicts
|
||
|
instead of default torch.load and torch.save.
|
||
|
(default: False)
|
||
|
--save_on_each_node [SAVE_ON_EACH_NODE]
|
||
|
When doing multi-node distributed training, whether to
|
||
|
save models and checkpoints on each node, or only on
|
||
|
the main one (default: False)
|
||
|
--save_only_model [SAVE_ONLY_MODEL]
|
||
|
When checkpointing, whether to only save the model, or
|
||
|
also the optimizer, scheduler & rng state.Note that
|
||
|
when this is true, you won't be able to resume
|
||
|
training from checkpoint.This enables you to save
|
||
|
storage by not storing the optimizer, scheduler & rng
|
||
|
state.You can only load the model using
|
||
|
from_pretrained with this option set to True.
|
||
|
(default: False)
|
||
|
--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]
|
||
|
Whether to restore the callback states from the
|
||
|
checkpoint. If `True`, will override callbacks passed
|
||
|
to the `Trainer` if they exist in the checkpoint.
|
||
|
(default: False)
|
||
|
--no_cuda [NO_CUDA] This argument is deprecated. It will be removed in
|
||
|
version 5.0 of 🤗 Transformers. (default: False)
|
||
|
--use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will
|
||
|
use cuda/tpu/mps/npu device if available. (default:
|
||
|
False)
|
||
|
--use_mps_device [USE_MPS_DEVICE]
|
||
|
This argument is deprecated. `mps` device will be used
|
||
|
if available similar to `cuda` device. It will be
|
||
|
removed in version 5.0 of 🤗 Transformers (default:
|
||
|
False)
|
||
|
--seed SEED Random seed that will be set at the beginning of
|
||
|
training. (default: 42)
|
||
|
--data_seed DATA_SEED
|
||
|
Random seed to be used with data samplers. (default:
|
||
|
None)
|
||
|
--jit_mode_eval [JIT_MODE_EVAL]
|
||
|
Whether or not to use PyTorch jit trace for inference
|
||
|
(default: False)
|
||
|
--use_ipex [USE_IPEX]
|
||
|
Use Intel extension for PyTorch when it is available,
|
||
|
installation: 'https://github.com/intel/intel-
|
||
|
extension-for-pytorch' (default: False)
|
||
|
--bf16 [BF16] Whether to use bf16 (mixed) precision instead of
|
||
|
32-bit. Requires Ampere or higher NVIDIA architecture
|
||
|
or using CPU (use_cpu) or Ascend NPU. This is an
|
||
|
experimental API and it may change. (default: False)
|
||
|
--fp16 [FP16] Whether to use fp16 (mixed) precision instead of
|
||
|
32-bit (default: False)
|
||
|
--fp16_opt_level FP16_OPT_LEVEL
|
||
|
For fp16: Apex AMP optimization level selected in
|
||
|
['O0', 'O1', 'O2', and 'O3']. See details at
|
||
|
https://nvidia.github.io/apex/amp.html (default: O1)
|
||
|
--half_precision_backend {auto,apex,cpu_amp}
|
||
|
The backend to be used for half precision. (default:
|
||
|
auto)
|
||
|
--bf16_full_eval [BF16_FULL_EVAL]
|
||
|
Whether to use full bfloat16 evaluation instead of
|
||
|
32-bit. This is an experimental API and it may change.
|
||
|
(default: False)
|
||
|
--fp16_full_eval [FP16_FULL_EVAL]
|
||
|
Whether to use full float16 evaluation instead of
|
||
|
32-bit (default: False)
|
||
|
--tf32 TF32 Whether to enable tf32 mode, available in Ampere and
|
||
|
newer GPU architectures. This is an experimental API
|
||
|
and it may change. (default: None)
|
||
|
--local_rank LOCAL_RANK
|
||
|
For distributed training: local_rank (default: -1)
|
||
|
--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}
|
||
|
The backend to be used for distributed training
|
||
|
(default: None)
|
||
|
--tpu_num_cores TPU_NUM_CORES
|
||
|
TPU: Number of TPU cores (automatically passed by
|
||
|
launcher script) (default: None)
|
||
|
--tpu_metrics_debug [TPU_METRICS_DEBUG]
|
||
|
Deprecated, the use of `--debug tpu_metrics_debug` is
|
||
|
preferred. TPU: Whether to print debug metrics
|
||
|
(default: False)
|
||
|
--debug DEBUG [DEBUG ...]
|
||
|
Whether or not to enable debug mode. Current options:
|
||
|
`underflow_overflow` (Detect underflow and overflow in
|
||
|
activations and weights), `tpu_metrics_debug` (print
|
||
|
debug metrics on TPU). (default: None)
|
||
|
--dataloader_drop_last [DATALOADER_DROP_LAST]
|
||
|
Drop the last incomplete batch if it is not divisible
|
||
|
by the batch size. (default: False)
|
||
|
--eval_steps EVAL_STEPS
|
||
|
Run an evaluation every X steps. Should be an integer
|
||
|
or a float in range `[0,1)`. If smaller than 1, will
|
||
|
be interpreted as ratio of total training steps.
|
||
|
(default: None)
|
||
|
--dataloader_num_workers DATALOADER_NUM_WORKERS
|
||
|
Number of subprocesses to use for data loading
|
||
|
(PyTorch only). 0 means that the data will be loaded
|
||
|
in the main process. (default: 0)
|
||
|
--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR
|
||
|
Number of batches loaded in advance by each worker. 2
|
||
|
means there will be a total of 2 * num_workers batches
|
||
|
prefetched across all workers. Default is 2 for
|
||
|
PyTorch < 2.0.0 and otherwise None. (default: None)
|
||
|
--past_index PAST_INDEX
|
||
|
If >=0, uses the corresponding part of the output as
|
||
|
the past state for next step. (default: -1)
|
||
|
--run_name RUN_NAME An optional descriptor for the run. Notably used for
|
||
|
wandb, mlflow and comet logging. (default: None)
|
||
|
--disable_tqdm DISABLE_TQDM
|
||
|
Whether or not to disable the tqdm progress bars.
|
||
|
(default: None)
|
||
|
--remove_unused_columns [REMOVE_UNUSED_COLUMNS]
|
||
|
Remove columns not required by the model when using an
|
||
|
nlp.Dataset. (default: True)
|
||
|
--no_remove_unused_columns
|
||
|
Remove columns not required by the model when using an
|
||
|
nlp.Dataset. (default: False)
|
||
|
--label_names LABEL_NAMES [LABEL_NAMES ...]
|
||
|
The list of keys in your dictionary of inputs that
|
||
|
correspond to the labels. (default: None)
|
||
|
--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]
|
||
|
Whether or not to load the best model found during
|
||
|
training at the end of training. When this option is
|
||
|
enabled, the best checkpoint will always be saved. See
|
||
|
`save_total_limit` for more. (default: False)
|
||
|
--metric_for_best_model METRIC_FOR_BEST_MODEL
|
||
|
The metric to use to compare two different models.
|
||
|
(default: None)
|
||
|
--greater_is_better GREATER_IS_BETTER
|
||
|
Whether the `metric_for_best_model` should be
|
||
|
maximized or not. (default: None)
|
||
|
--ignore_data_skip [IGNORE_DATA_SKIP]
|
||
|
When resuming training, whether or not to skip the
|
||
|
first epochs and batches to get to the same training
|
||
|
data. (default: False)
|
||
|
--fsdp FSDP Whether or not to use PyTorch Fully Sharded Data
|
||
|
Parallel (FSDP) training (in distributed training
|
||
|
only). The base option should be `full_shard`,
|
||
|
`shard_grad_op` or `no_shard` and you can add CPU-
|
||
|
offload to `full_shard` or `shard_grad_op` like this:
|
||
|
full_shard offload` or `shard_grad_op offload`. You
|
||
|
can add auto-wrap to `full_shard` or `shard_grad_op`
|
||
|
with the same syntax: full_shard auto_wrap` or
|
||
|
`shard_grad_op auto_wrap`. (default: )
|
||
|
--fsdp_min_num_params FSDP_MIN_NUM_PARAMS
|
||
|
This parameter is deprecated. FSDP's minimum number of
|
||
|
parameters for Default Auto Wrapping. (useful only
|
||
|
when `fsdp` field is passed). (default: 0)
|
||
|
--fsdp_config FSDP_CONFIG
|
||
|
Config to be used with FSDP (Pytorch Fully Sharded
|
||
|
Data Parallel). The value is either a fsdp json config
|
||
|
file (e.g., `fsdp_config.json`) or an already loaded
|
||
|
json file as `dict`. (default: None)
|
||
|
--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP
|
||
|
This parameter is deprecated. Transformer layer class
|
||
|
name (case-sensitive) to wrap, e.g, `BertLayer`,
|
||
|
`GPTJBlock`, `T5Block` .... (useful only when `fsdp`
|
||
|
flag is passed). (default: None)
|
||
|
--accelerator_config ACCELERATOR_CONFIG
|
||
|
Config to be used with the internal Accelerator object
|
||
|
initializtion. The value is either a accelerator json
|
||
|
config file (e.g., `accelerator_config.json`) or an
|
||
|
already loaded json file as `dict`. (default: None)
|
||
|
--deepspeed DEEPSPEED
|
||
|
Enable deepspeed and pass the path to deepspeed json
|
||
|
config file (e.g. `ds_config.json`) or an already
|
||
|
loaded json file as a dict (default: None)
|
||
|
--label_smoothing_factor LABEL_SMOOTHING_FACTOR
|
||
|
The label smoothing epsilon to apply (zero means no
|
||
|
label smoothing). (default: 0.0)
|
||
|
--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}
|
||
|
The optimizer to use. (default: adamw_torch)
|
||
|
--optim_args OPTIM_ARGS
|
||
|
Optional arguments to supply to optimizer. (default:
|
||
|
None)
|
||
|
--adafactor [ADAFACTOR]
|
||
|
Whether or not to replace AdamW by Adafactor.
|
||
|
(default: False)
|
||
|
--group_by_length [GROUP_BY_LENGTH]
|
||
|
Whether or not to group samples of roughly the same
|
||
|
length together when batching. (default: False)
|
||
|
--length_column_name LENGTH_COLUMN_NAME
|
||
|
Column name with precomputed lengths to use when
|
||
|
grouping by length. (default: length)
|
||
|
--report_to REPORT_TO
|
||
|
The list of integrations to report the results and
|
||
|
logs to. (default: None)
|
||
|
--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS
|
||
|
When using distributed training, the value of the flag
|
||
|
`find_unused_parameters` passed to
|
||
|
`DistributedDataParallel`. (default: None)
|
||
|
--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB
|
||
|
When using distributed training, the value of the flag
|
||
|
`bucket_cap_mb` passed to `DistributedDataParallel`.
|
||
|
(default: None)
|
||
|
--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS
|
||
|
When using distributed training, the value of the flag
|
||
|
`broadcast_buffers` passed to
|
||
|
`DistributedDataParallel`. (default: None)
|
||
|
--dataloader_pin_memory [DATALOADER_PIN_MEMORY]
|
||
|
Whether or not to pin memory for DataLoader. (default:
|
||
|
True)
|
||
|
--no_dataloader_pin_memory
|
||
|
Whether or not to pin memory for DataLoader. (default:
|
||
|
False)
|
||
|
--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]
|
||
|
If True, the data loader will not shut down the worker
|
||
|
processes after a dataset has been consumed once. This
|
||
|
allows to maintain the workers Dataset instances
|
||
|
alive. Can potentially speed up training, but will
|
||
|
increase RAM usage. (default: False)
|
||
|
--skip_memory_metrics [SKIP_MEMORY_METRICS]
|
||
|
Whether or not to skip adding of memory profiler
|
||
|
reports to metrics. (default: True)
|
||
|
--no_skip_memory_metrics
|
||
|
Whether or not to skip adding of memory profiler
|
||
|
reports to metrics. (default: False)
|
||
|
--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]
|
||
|
Whether or not to use the legacy prediction_loop in
|
||
|
the Trainer. (default: False)
|
||
|
--push_to_hub [PUSH_TO_HUB]
|
||
|
Whether or not to upload the trained model to the
|
||
|
model hub after training. (default: False)
|
||
|
--resume_from_checkpoint RESUME_FROM_CHECKPOINT
|
||
|
The path to a folder with a valid checkpoint for your
|
||
|
model. (default: None)
|
||
|
--hub_model_id HUB_MODEL_ID
|
||
|
The name of the repository to keep in sync with the
|
||
|
local `output_dir`. (default: None)
|
||
|
--hub_strategy {end,every_save,checkpoint,all_checkpoints}
|
||
|
The hub strategy to use when `--push_to_hub` is
|
||
|
activated. (default: every_save)
|
||
|
--hub_token HUB_TOKEN
|
||
|
The token to use to push to the Model Hub. (default:
|
||
|
None)
|
||
|
--hub_private_repo [HUB_PRIVATE_REPO]
|
||
|
Whether the model repository is private or not.
|
||
|
(default: False)
|
||
|
--hub_always_push [HUB_ALWAYS_PUSH]
|
||
|
Unless `True`, the Trainer will skip pushes if the
|
||
|
previous one wasn't finished yet. (default: False)
|
||
|
--gradient_checkpointing [GRADIENT_CHECKPOINTING]
|
||
|
If True, use gradient checkpointing to save memory at
|
||
|
the expense of slower backward pass. (default: False)
|
||
|
--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS
|
||
|
Gradient checkpointing key word arguments such as
|
||
|
`use_reentrant`. Will be passed to
|
||
|
`torch.utils.checkpoint.checkpoint` through
|
||
|
`model.gradient_checkpointing_enable`. (default: None)
|
||
|
--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]
|
||
|
Whether or not the inputs will be passed to the
|
||
|
`compute_metrics` function. (default: False)
|
||
|
--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]
|
||
|
Whether to recursively concat
|
||
|
inputs/losses/labels/predictions across batches. If
|
||
|
`False`, will instead store them as lists, with each
|
||
|
batch kept separate. (default: True)
|
||
|
--no_eval_do_concat_batches
|
||
|
Whether to recursively concat
|
||
|
inputs/losses/labels/predictions across batches. If
|
||
|
`False`, will instead store them as lists, with each
|
||
|
batch kept separate. (default: False)
|
||
|
--fp16_backend {auto,apex,cpu_amp}
|
||
|
Deprecated. Use half_precision_backend instead
|
||
|
(default: auto)
|
||
|
--evaluation_strategy {no,steps,epoch}
|
||
|
Deprecated. Use `eval_strategy` instead (default:
|
||
|
None)
|
||
|
--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID
|
||
|
The name of the repository to which push the
|
||
|
`Trainer`. (default: None)
|
||
|
--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION
|
||
|
The name of the organization in with to which push the
|
||
|
`Trainer`. (default: None)
|
||
|
--push_to_hub_token PUSH_TO_HUB_TOKEN
|
||
|
The token to use to push to the Model Hub. (default:
|
||
|
None)
|
||
|
--mp_parameters MP_PARAMETERS
|
||
|
Used by the SageMaker launcher to send mp-specific
|
||
|
args. Ignored in Trainer (default: )
|
||
|
--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]
|
||
|
Whether to automatically decrease the batch size in
|
||
|
half and rerun the training loop again each time a
|
||
|
CUDA Out-of-Memory was reached (default: False)
|
||
|
--full_determinism [FULL_DETERMINISM]
|
||
|
Whether to call enable_full_determinism instead of
|
||
|
set_seed for reproducibility in distributed training.
|
||
|
Important: this will negatively impact the
|
||
|
performance, so only use it for debugging. (default:
|
||
|
False)
|
||
|
--torchdynamo TORCHDYNAMO
|
||
|
This argument is deprecated, use
|
||
|
`--torch_compile_backend` instead. (default: None)
|
||
|
--ray_scope RAY_SCOPE
|
||
|
The scope to use when doing hyperparameter search with
|
||
|
Ray. By default, `"last"` will be used. Ray will then
|
||
|
use the last checkpoint of all trials, compare those,
|
||
|
and select the best one. However, other options are
|
||
|
also available. See the Ray documentation (https://doc
|
||
|
s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun
|
||
|
e.ExperimentAnalysis.get_best_trial) for more options.
|
||
|
(default: last)
|
||
|
--ddp_timeout DDP_TIMEOUT
|
||
|
Overrides the default timeout for distributed training
|
||
|
(value should be given in seconds). (default: 1800)
|
||
|
--torch_compile [TORCH_COMPILE]
|
||
|
If set to `True`, the model will be wrapped in
|
||
|
`torch.compile`. (default: False)
|
||
|
--torch_compile_backend TORCH_COMPILE_BACKEND
|
||
|
Which backend to use with `torch.compile`, passing one
|
||
|
will trigger a model compilation. (default: None)
|
||
|
--torch_compile_mode TORCH_COMPILE_MODE
|
||
|
Which mode to use with `torch.compile`, passing one
|
||
|
will trigger a model compilation. (default: None)
|
||
|
--dispatch_batches DISPATCH_BATCHES
|
||
|
Deprecated. Pass {'dispatch_batches':VALUE} to
|
||
|
`accelerator_config`. (default: None)
|
||
|
--split_batches SPLIT_BATCHES
|
||
|
Deprecated. Pass {'split_batches':True} to
|
||
|
`accelerator_config`. (default: None)
|
||
|
--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]
|
||
|
If set to `True`, the speed metrics will include `tgs`
|
||
|
(tokens per second per device). (default: False)
|
||
|
--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]
|
||
|
If set to `True`, will track the number of input
|
||
|
tokens seen throughout training. (May be slower in
|
||
|
distributed training) (default: False)
|
||
|
--neftune_noise_alpha NEFTUNE_NOISE_ALPHA
|
||
|
Activates neftune noise embeddings into the model.
|
||
|
NEFTune has been proven to drastically improve model
|
||
|
performances for instrcution fine-tuning. Check out
|
||
|
the original paper here:
|
||
|
https://arxiv.org/abs/2310.05914 and the original code
|
||
|
here: https://github.com/neelsjain/NEFTune. Only
|
||
|
supported for `PreTrainedModel` and `PeftModel`
|
||
|
classes. (default: None)
|
||
|
--optim_target_modules OPTIM_TARGET_MODULES
|
||
|
Target modules for the optimizer defined in the
|
||
|
`optim` argument. Only used for the GaLore optimizer
|
||
|
at the moment. (default: None)
|
||
|
--batch_eval_metrics [BATCH_EVAL_METRICS]
|
||
|
Break eval metrics calculation into batches to save
|
||
|
memory. (default: False)
|
||
|
--eval_on_start [EVAL_ON_START]
|
||
|
Whether to run through the entire `evaluation` step at
|
||
|
the very beginning of training as a sanity check.
|
||
|
(default: False)
|
||
|
--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]
|
||
|
Whether to run recursively gather object in a nested
|
||
|
list/tuple/dictionary of objects from all devices.
|
||
|
(default: False)
|
||
|
--sortish_sampler [SORTISH_SAMPLER]
|
||
|
Whether to use SortishSampler or not. (default: False)
|
||
|
--predict_with_generate [PREDICT_WITH_GENERATE]
|
||
|
Whether to use generate to calculate generative
|
||
|
metrics (ROUGE, BLEU). (default: False)
|
||
|
--generation_max_length GENERATION_MAX_LENGTH
|
||
|
The `max_length` to use on each evaluation loop when
|
||
|
`predict_with_generate=True`. Will default to the
|
||
|
`max_length` value of the model configuration.
|
||
|
(default: None)
|
||
|
--generation_num_beams GENERATION_NUM_BEAMS
|
||
|
The `num_beams` to use on each evaluation loop when
|
||
|
`predict_with_generate=True`. Will default to the
|
||
|
`num_beams` value of the model configuration.
|
||
|
(default: None)
|
||
|
--generation_config GENERATION_CONFIG
|
||
|
Model id, file path or url pointing to a
|
||
|
GenerationConfig json file, to use during prediction.
|
||
|
(default: None)
|
||
|
--use_badam [USE_BADAM]
|
||
|
Whether or not to use the BAdam optimizer. (default:
|
||
|
False)
|
||
|
--badam_mode {layer,ratio}
|
||
|
Whether to use layer-wise or ratio-wise BAdam
|
||
|
optimizer. (default: layer)
|
||
|
--badam_start_block BADAM_START_BLOCK
|
||
|
The starting block index for layer-wise BAdam.
|
||
|
(default: None)
|
||
|
--badam_switch_mode {ascending,descending,random,fixed}
|
||
|
the strategy of picking block to update for layer-wise
|
||
|
BAdam. (default: ascending)
|
||
|
--badam_switch_interval BADAM_SWITCH_INTERVAL
|
||
|
Number of steps to update the block for layer-wise
|
||
|
BAdam. Use -1 to disable the block update. (default:
|
||
|
50)
|
||
|
--badam_update_ratio BADAM_UPDATE_RATIO
|
||
|
The ratio of the update for ratio-wise BAdam.
|
||
|
(default: 0.05)
|
||
|
--badam_mask_mode {adjacent,scatter}
|
||
|
The mode of the mask for BAdam optimizer. `adjacent`
|
||
|
means that the trainable parameters are adjacent to
|
||
|
each other, `scatter` means that trainable parameters
|
||
|
are randomly choosed from the weight. (default:
|
||
|
adjacent)
|
||
|
--badam_verbose BADAM_VERBOSE
|
||
|
The verbosity level of BAdam optimizer. 0 for no
|
||
|
print, 1 for print the block prefix, 2 for print
|
||
|
trainable parameters. (default: 0)
|
||
|
--use_galore [USE_GALORE]
|
||
|
Whether or not to use the gradient low-Rank projection
|
||
|
(GaLore). (default: False)
|
||
|
--galore_target GALORE_TARGET
|
||
|
Name(s) of modules to apply GaLore. Use commas to
|
||
|
separate multiple modules. Use `all` to specify all
|
||
|
the linear modules. (default: all)
|
||
|
--galore_rank GALORE_RANK
|
||
|
The rank of GaLore gradients. (default: 16)
|
||
|
--galore_update_interval GALORE_UPDATE_INTERVAL
|
||
|
Number of steps to update the GaLore projection.
|
||
|
(default: 200)
|
||
|
--galore_scale GALORE_SCALE
|
||
|
GaLore scaling coefficient. (default: 0.25)
|
||
|
--galore_proj_type {std,reverse_std,right,left,full}
|
||
|
Type of GaLore projection. (default: std)
|
||
|
--galore_layerwise [GALORE_LAYERWISE]
|
||
|
Whether or not to enable layer-wise update to further
|
||
|
save memory. (default: False)
|
||
|
--pref_beta PREF_BETA
|
||
|
The beta parameter in the preference loss. (default:
|
||
|
0.1)
|
||
|
--pref_ftx PREF_FTX The supervised fine-tuning loss coefficient in DPO
|
||
|
training. (default: 0.0)
|
||
|
--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}
|
||
|
The type of DPO loss to use. (default: sigmoid)
|
||
|
--dpo_label_smoothing DPO_LABEL_SMOOTHING
|
||
|
The robust DPO label smoothing parameter in cDPO that
|
||
|
should be between 0 and 0.5. (default: 0.0)
|
||
|
--kto_chosen_weight KTO_CHOSEN_WEIGHT
|
||
|
The weight factor of the desirable losses in KTO
|
||
|
training. (default: 1.0)
|
||
|
--kto_rejected_weight KTO_REJECTED_WEIGHT
|
||
|
The weight factor of the undesirable losses in KTO
|
||
|
training. (default: 1.0)
|
||
|
--simpo_gamma SIMPO_GAMMA
|
||
|
The target reward margin term in SimPO loss. (default:
|
||
|
0.5)
|
||
|
--ppo_buffer_size PPO_BUFFER_SIZE
|
||
|
The number of mini-batches to make experience buffer
|
||
|
in a PPO optimization step. (default: 1)
|
||
|
--ppo_epochs PPO_EPOCHS
|
||
|
The number of epochs to perform in a PPO optimization
|
||
|
step. (default: 4)
|
||
|
--ppo_score_norm [PPO_SCORE_NORM]
|
||
|
Use score normalization in PPO training. (default:
|
||
|
False)
|
||
|
--ppo_target PPO_TARGET
|
||
|
Target KL value for adaptive KL control in PPO
|
||
|
training. (default: 6.0)
|
||
|
--ppo_whiten_rewards [PPO_WHITEN_REWARDS]
|
||
|
Whiten the rewards before compute advantages in PPO
|
||
|
training. (default: False)
|
||
|
--ref_model REF_MODEL
|
||
|
Path to the reference model used for the PPO or DPO
|
||
|
training. (default: None)
|
||
|
--ref_model_adapters REF_MODEL_ADAPTERS
|
||
|
Path to the adapters of the reference model. (default:
|
||
|
None)
|
||
|
--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT
|
||
|
The number of bits to quantize the reference model.
|
||
|
(default: None)
|
||
|
--reward_model REWARD_MODEL
|
||
|
Path to the reward model used for the PPO training.
|
||
|
(default: None)
|
||
|
--reward_model_adapters REWARD_MODEL_ADAPTERS
|
||
|
Path to the adapters of the reward model. (default:
|
||
|
None)
|
||
|
--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT
|
||
|
The number of bits to quantize the reward model.
|
||
|
(default: None)
|
||
|
--reward_model_type {lora,full,api}
|
||
|
The type of the reward model in PPO training. Lora
|
||
|
model only supports lora training. (default: lora)
|
||
|
  --additional_target ADDITIONAL_TARGET
                        Name(s) of modules apart from LoRA layers to be set
                        as trainable and saved in the final checkpoint. Use
                        commas to separate multiple modules. (default: None)
  --lora_alpha LORA_ALPHA
                        The scale factor for LoRA fine-tuning (default:
                        lora_rank * 2). (default: None)
  --lora_dropout LORA_DROPOUT
                        Dropout rate for the LoRA fine-tuning. (default: 0.0)
  --lora_rank LORA_RANK
                        The intrinsic dimension for LoRA fine-tuning.
                        (default: 8)
  --lora_target LORA_TARGET
                        Name(s) of target modules to apply LoRA. Use commas
                        to separate multiple modules. Use `all` to specify
                        all the linear modules. (default: all)
  --loraplus_lr_ratio LORAPLUS_LR_RATIO
                        LoRA plus learning rate ratio (lr_B / lr_A).
                        (default: None)
  --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING
                        LoRA plus learning rate for LoRA embedding layers.
                        (default: 1e-06)
  --use_rslora [USE_RSLORA]
                        Whether or not to use the rank-stabilization scaling
                        factor for LoRA layers. (default: False)
  --use_dora [USE_DORA]
                        Whether or not to use the weight-decomposed LoRA
                        method (DoRA). (default: False)
  --pissa_init [PISSA_INIT]
                        Whether or not to initialize a PiSSA adapter.
                        (default: False)
  --pissa_iter PISSA_ITER
                        The number of iteration steps performed by FSVD in
                        PiSSA. Use -1 to disable it. (default: 16)
  --pissa_convert [PISSA_CONVERT]
                        Whether or not to convert the PiSSA adapter to a
                        normal LoRA adapter. (default: False)
  --create_new_adapter [CREATE_NEW_ADAPTER]
                        Whether or not to create a new adapter with randomly
                        initialized weights. (default: False)
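
A minimal LoRA SFT sketch using the adapter flags above (placeholders in
angle brackets; lora_alpha is written out at its implied default of
lora_rank * 2):

    python launcher.py \
        --stage sft --do_train \
        --model_name_or_path <base-model> \
        --dataset <sft-dataset> --template <template> \
        --finetuning_type lora \
        --lora_rank 8 --lora_alpha 16 --lora_dropout 0.0 \
        --lora_target all \
        --output_dir saves/lora-sft
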
  --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS
                        The number of trainable layers for freeze (partial-
                        parameter) fine-tuning. Positive numbers mean the
                        last n layers are set as trainable, negative numbers
                        mean the first n layers are set as trainable.
                        (default: 2)
  --freeze_trainable_modules FREEZE_TRAINABLE_MODULES
                        Name(s) of trainable modules for freeze (partial-
                        parameter) fine-tuning. Use commas to separate
                        multiple modules. Use `all` to specify all the
                        available modules. (default: all)
  --freeze_extra_modules FREEZE_EXTRA_MODULES
                        Name(s) of modules apart from hidden layers to be
                        set as trainable for freeze (partial-parameter)
                        fine-tuning. Use commas to separate multiple
                        modules. (default: None)
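
A minimal freeze (partial-parameter) sketch that trains only the last two
layers (placeholders in angle brackets; both freeze values restate the
defaults):

    python launcher.py \
        --stage sft --do_train \
        --model_name_or_path <base-model> \
        --dataset <sft-dataset> --template <template> \
        --finetuning_type freeze \
        --freeze_trainable_layers 2 --freeze_trainable_modules all \
        --output_dir saves/freeze-sft
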
  --pure_bf16 [PURE_BF16]
                        Whether or not to train the model in pure bf16
                        precision (without AMP). (default: False)
  --stage {pt,sft,rm,ppo,dpo,kto}
                        Which stage will be performed in training. (default:
                        sft)
  --finetuning_type {lora,freeze,full}
                        Which fine-tuning method to use. (default: lora)
  --use_llama_pro [USE_LLAMA_PRO]
                        Whether or not to make only the parameters in the
                        expanded blocks trainable. (default: False)
  --use_adam_mini [USE_ADAM_MINI]
                        Whether or not to use the Adam-mini optimizer.
                        (default: False)
  --freeze_vision_tower [FREEZE_VISION_TOWER]
                        Whether or not to freeze the vision tower in MLLM
                        training. (default: True)
  --no_freeze_vision_tower
                        Whether or not to freeze the vision tower in MLLM
                        training. (default: False)
  --train_mm_proj_only [TRAIN_MM_PROJ_ONLY]
                        Whether or not to train only the multimodal projector
                        for MLLM. (default: False)
  --compute_accuracy [COMPUTE_ACCURACY]
                        Whether or not to compute the token-level accuracy at
                        evaluation. (default: False)
  --plot_loss [PLOT_LOSS]
                        Whether or not to save the training loss curves.
                        (default: False)
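
A multimodal sketch built from the MLLM flags above (paths are
placeholders; --visual_inputs comes from the model options earlier in this
listing, and full fine-tuning is assumed here since the projector-only mode
targets full-parameter training):

    python launcher.py \
        --stage sft --do_train \
        --model_name_or_path <mllm-model> --visual_inputs \
        --dataset <mllm-dataset> --template <template> \
        --finetuning_type full \
        --freeze_vision_tower --train_mm_proj_only \
        --output_dir saves/mllm-sft
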
  --do_sample [DO_SAMPLE]
                        Whether or not to use sampling; greedy decoding is
                        used otherwise. (default: True)
  --no_do_sample        Whether or not to use sampling; greedy decoding is
                        used otherwise. (default: False)
  --temperature TEMPERATURE
                        The value used to modulate the next-token
                        probabilities. (default: 0.95)
  --top_p TOP_P         Only the smallest set of most probable tokens whose
                        probabilities add up to top_p or higher is kept.
                        (default: 0.7)
  --top_k TOP_K         The number of highest-probability vocabulary tokens
                        to keep for top-k filtering. (default: 50)
  --num_beams NUM_BEAMS
                        Number of beams for beam search. 1 means no beam
                        search. (default: 1)
  --max_length MAX_LENGTH
                        The maximum length the generated tokens can have. It
                        can be overridden by max_new_tokens. (default: 1024)
  --max_new_tokens MAX_NEW_TOKENS
                        The maximum number of tokens to generate, ignoring
                        the number of tokens in the prompt. (default: 1024)
  --repetition_penalty REPETITION_PENALTY
                        The parameter for repetition penalty. 1.0 means no
                        penalty. (default: 1.0)
  --length_penalty LENGTH_PENALTY
                        Exponential penalty to the length, used with beam-
                        based generation. (default: 1.0)
  --default_system DEFAULT_SYSTEM
                        Default system message to use in chat completion.
                        (default: None)
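
A sketch of the decoding flags above in a batch-prediction run (paths and
dataset are placeholders; the sampling values restate the defaults in this
listing, and --predict_with_generate appears among the trainer options):

    python launcher.py \
        --stage sft --do_predict \
        --model_name_or_path <model> --adapter_name_or_path <lora-adapter> \
        --eval_dataset <eval-dataset> --template <template> \
        --predict_with_generate \
        --do_sample --temperature 0.95 --top_p 0.7 --top_k 50 \
        --max_new_tokens 1024 \
        --output_dir saves/predict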
|
||
|
usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH
|
||
|
[--adapter_name_or_path ADAPTER_NAME_OR_PATH]
|
||
|
[--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR]
|
||
|
[--use_fast_tokenizer [USE_FAST_TOKENIZER]]
|
||
|
[--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]]
|
||
|
[--split_special_tokens [SPLIT_SPECIAL_TOKENS]]
|
||
|
[--new_special_tokens NEW_SPECIAL_TOKENS]
|
||
|
[--model_revision MODEL_REVISION]
|
||
|
[--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]]
|
||
|
[--no_low_cpu_mem_usage]
|
||
|
[--quantization_method {bitsandbytes,hqq,eetq}]
|
||
|
[--quantization_bit QUANTIZATION_BIT]
|
||
|
[--quantization_type {fp4,nf4}]
|
||
|
[--double_quantization [DOUBLE_QUANTIZATION]]
|
||
|
[--no_double_quantization]
|
||
|
[--quantization_device_map {auto}]
|
||
|
[--rope_scaling {linear,dynamic}]
|
||
|
[--flash_attn {auto,disabled,sdpa,fa2}]
|
||
|
[--shift_attn [SHIFT_ATTN]]
|
||
|
[--mixture_of_depths {convert,load}]
|
||
|
[--use_unsloth [USE_UNSLOTH]]
|
||
|
[--visual_inputs [VISUAL_INPUTS]]
|
||
|
[--moe_aux_loss_coef MOE_AUX_LOSS_COEF]
|
||
|
[--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]]
|
||
|
[--upcast_layernorm [UPCAST_LAYERNORM]]
|
||
|
[--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]]
|
||
|
[--train_from_scratch [TRAIN_FROM_SCRATCH]]
|
||
|
[--infer_backend {huggingface,vllm}]
|
||
|
[--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL]
|
||
|
[--vllm_enforce_eager [VLLM_ENFORCE_EAGER]]
|
||
|
[--vllm_max_lora_rank VLLM_MAX_LORA_RANK]
|
||
|
[--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]]
|
||
|
[--no_use_cache]
|
||
|
[--infer_dtype {auto,float16,bfloat16,float32}]
|
||
|
[--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN]
|
||
|
[--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE]
|
||
|
[--export_device {cpu,auto}]
|
||
|
[--export_quantization_bit EXPORT_QUANTIZATION_BIT]
|
||
|
[--export_quantization_dataset EXPORT_QUANTIZATION_DATASET]
|
||
|
[--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES]
|
||
|
[--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN]
|
||
|
[--export_legacy_format [EXPORT_LEGACY_FORMAT]]
|
||
|
[--export_hub_model_id EXPORT_HUB_MODEL_ID]
|
||
|
[--print_param_status [PRINT_PARAM_STATUS]]
|
||
|
[--template TEMPLATE] [--dataset DATASET]
|
||
|
[--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR]
|
||
|
[--cutoff_len CUTOFF_LEN]
|
||
|
[--train_on_prompt [TRAIN_ON_PROMPT]]
|
||
|
[--mask_history [MASK_HISTORY]] [--streaming [STREAMING]]
|
||
|
[--buffer_size BUFFER_SIZE]
|
||
|
[--mix_strategy {concat,interleave_under,interleave_over}]
|
||
|
[--interleave_probs INTERLEAVE_PROBS]
|
||
|
[--overwrite_cache [OVERWRITE_CACHE]]
|
||
|
[--preprocessing_num_workers PREPROCESSING_NUM_WORKERS]
|
||
|
[--max_samples MAX_SAMPLES]
|
||
|
[--eval_num_beams EVAL_NUM_BEAMS]
|
||
|
[--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]]
|
||
|
[--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE]
|
||
|
[--packing PACKING] [--neat_packing [NEAT_PACKING]]
|
||
|
[--tool_format TOOL_FORMAT]
|
||
|
[--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR
|
||
|
[--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]]
|
||
|
[--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]
|
||
|
[--do_predict [DO_PREDICT]]
|
||
|
[--eval_strategy {no,steps,epoch}]
|
||
|
[--prediction_loss_only [PREDICTION_LOSS_ONLY]]
|
||
|
[--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE]
|
||
|
[--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]
|
||
|
[--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE]
|
||
|
[--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE]
|
||
|
[--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
|
||
|
[--eval_accumulation_steps EVAL_ACCUMULATION_STEPS]
|
||
|
[--eval_delay EVAL_DELAY]
|
||
|
[--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS]
|
||
|
[--learning_rate LEARNING_RATE]
|
||
|
[--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]
|
||
|
[--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON]
|
||
|
[--max_grad_norm MAX_GRAD_NORM]
|
||
|
[--num_train_epochs NUM_TRAIN_EPOCHS]
|
||
|
[--max_steps MAX_STEPS]
|
||
|
[--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}]
|
||
|
[--lr_scheduler_kwargs LR_SCHEDULER_KWARGS]
|
||
|
[--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS]
|
||
|
[--log_level {detail,debug,info,warning,error,critical,passive}]
|
||
|
[--log_level_replica {detail,debug,info,warning,error,critical,passive}]
|
||
|
[--log_on_each_node [LOG_ON_EACH_NODE]]
|
||
|
[--no_log_on_each_node] [--logging_dir LOGGING_DIR]
|
||
|
[--logging_strategy {no,steps,epoch}]
|
||
|
[--logging_first_step [LOGGING_FIRST_STEP]]
|
||
|
[--logging_steps LOGGING_STEPS]
|
||
|
[--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]]
|
||
|
[--no_logging_nan_inf_filter]
|
||
|
[--save_strategy {no,steps,epoch}]
|
||
|
[--save_steps SAVE_STEPS]
|
||
|
[--save_total_limit SAVE_TOTAL_LIMIT]
|
||
|
[--save_safetensors [SAVE_SAFETENSORS]]
|
||
|
[--no_save_safetensors]
|
||
|
[--save_on_each_node [SAVE_ON_EACH_NODE]]
|
||
|
[--save_only_model [SAVE_ONLY_MODEL]]
|
||
|
[--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]]
|
||
|
[--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]]
|
||
|
[--use_mps_device [USE_MPS_DEVICE]] [--seed SEED]
|
||
|
[--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]
|
||
|
[--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]]
|
||
|
[--fp16_opt_level FP16_OPT_LEVEL]
|
||
|
[--half_precision_backend {auto,apex,cpu_amp}]
|
||
|
[--bf16_full_eval [BF16_FULL_EVAL]]
|
||
|
[--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32]
|
||
|
[--local_rank LOCAL_RANK]
|
||
|
[--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}]
|
||
|
[--tpu_num_cores TPU_NUM_CORES]
|
||
|
[--tpu_metrics_debug [TPU_METRICS_DEBUG]]
|
||
|
[--debug DEBUG [DEBUG ...]]
|
||
|
[--dataloader_drop_last [DATALOADER_DROP_LAST]]
|
||
|
[--eval_steps EVAL_STEPS]
|
||
|
[--dataloader_num_workers DATALOADER_NUM_WORKERS]
|
||
|
[--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR]
|
||
|
[--past_index PAST_INDEX] [--run_name RUN_NAME]
|
||
|
[--disable_tqdm DISABLE_TQDM]
|
||
|
[--remove_unused_columns [REMOVE_UNUSED_COLUMNS]]
|
||
|
[--no_remove_unused_columns]
|
||
|
[--label_names LABEL_NAMES [LABEL_NAMES ...]]
|
||
|
[--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]]
|
||
|
[--metric_for_best_model METRIC_FOR_BEST_MODEL]
|
||
|
[--greater_is_better GREATER_IS_BETTER]
|
||
|
[--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP]
|
||
|
[--fsdp_min_num_params FSDP_MIN_NUM_PARAMS]
|
||
|
[--fsdp_config FSDP_CONFIG]
|
||
|
[--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP]
|
||
|
[--accelerator_config ACCELERATOR_CONFIG]
|
||
|
[--deepspeed DEEPSPEED]
|
||
|
[--label_smoothing_factor LABEL_SMOOTHING_FACTOR]
|
||
|
[--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}]
|
||
|
[--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]]
|
||
|
[--group_by_length [GROUP_BY_LENGTH]]
|
||
|
[--length_column_name LENGTH_COLUMN_NAME]
|
||
|
[--report_to REPORT_TO]
|
||
|
[--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS]
|
||
|
[--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB]
|
||
|
[--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS]
|
||
|
[--dataloader_pin_memory [DATALOADER_PIN_MEMORY]]
|
||
|
[--no_dataloader_pin_memory]
|
||
|
[--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]]
|
||
|
[--skip_memory_metrics [SKIP_MEMORY_METRICS]]
|
||
|
[--no_skip_memory_metrics]
|
||
|
[--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]]
|
||
|
[--push_to_hub [PUSH_TO_HUB]]
|
||
|
[--resume_from_checkpoint RESUME_FROM_CHECKPOINT]
|
||
|
[--hub_model_id HUB_MODEL_ID]
|
||
|
[--hub_strategy {end,every_save,checkpoint,all_checkpoints}]
|
||
|
[--hub_token HUB_TOKEN]
|
||
|
[--hub_private_repo [HUB_PRIVATE_REPO]]
|
||
|
[--hub_always_push [HUB_ALWAYS_PUSH]]
|
||
|
[--gradient_checkpointing [GRADIENT_CHECKPOINTING]]
|
||
|
[--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS]
|
||
|
[--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]]
|
||
|
[--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]]
|
||
|
[--no_eval_do_concat_batches]
|
||
|
[--fp16_backend {auto,apex,cpu_amp}]
|
||
|
[--evaluation_strategy {no,steps,epoch}]
|
||
|
[--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]
|
||
|
[--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION]
|
||
|
[--push_to_hub_token PUSH_TO_HUB_TOKEN]
|
||
|
[--mp_parameters MP_PARAMETERS]
|
||
|
[--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]]
|
||
|
[--full_determinism [FULL_DETERMINISM]]
|
||
|
[--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE]
|
||
|
[--ddp_timeout DDP_TIMEOUT]
|
||
|
[--torch_compile [TORCH_COMPILE]]
|
||
|
[--torch_compile_backend TORCH_COMPILE_BACKEND]
|
||
|
[--torch_compile_mode TORCH_COMPILE_MODE]
|
||
|
[--dispatch_batches DISPATCH_BATCHES]
|
||
|
[--split_batches SPLIT_BATCHES]
|
||
|
[--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]]
|
||
|
[--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]]
|
||
|
[--neftune_noise_alpha NEFTUNE_NOISE_ALPHA]
|
||
|
[--optim_target_modules OPTIM_TARGET_MODULES]
|
||
|
[--batch_eval_metrics [BATCH_EVAL_METRICS]]
|
||
|
[--eval_on_start [EVAL_ON_START]]
|
||
|
[--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]]
|
||
|
[--sortish_sampler [SORTISH_SAMPLER]]
|
||
|
[--predict_with_generate [PREDICT_WITH_GENERATE]]
|
||
|
[--generation_max_length GENERATION_MAX_LENGTH]
|
||
|
[--generation_num_beams GENERATION_NUM_BEAMS]
|
||
|
[--generation_config GENERATION_CONFIG]
|
||
|
[--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}]
|
||
|
[--badam_start_block BADAM_START_BLOCK]
|
||
|
[--badam_switch_mode {ascending,descending,random,fixed}]
|
||
|
[--badam_switch_interval BADAM_SWITCH_INTERVAL]
|
||
|
[--badam_update_ratio BADAM_UPDATE_RATIO]
|
||
|
[--badam_mask_mode {adjacent,scatter}]
|
||
|
[--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]]
|
||
|
[--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK]
|
||
|
[--galore_update_interval GALORE_UPDATE_INTERVAL]
|
||
|
[--galore_scale GALORE_SCALE]
|
||
|
[--galore_proj_type {std,reverse_std,right,left,full}]
|
||
|
[--galore_layerwise [GALORE_LAYERWISE]]
|
||
|
[--pref_beta PREF_BETA] [--pref_ftx PREF_FTX]
|
||
|
[--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}]
|
||
|
[--dpo_label_smoothing DPO_LABEL_SMOOTHING]
|
||
|
[--kto_chosen_weight KTO_CHOSEN_WEIGHT]
|
||
|
[--kto_rejected_weight KTO_REJECTED_WEIGHT]
|
||
|
[--simpo_gamma SIMPO_GAMMA]
|
||
|
[--ppo_buffer_size PPO_BUFFER_SIZE]
|
||
|
[--ppo_epochs PPO_EPOCHS]
|
||
|
[--ppo_score_norm [PPO_SCORE_NORM]]
|
||
|
[--ppo_target PPO_TARGET]
|
||
|
[--ppo_whiten_rewards [PPO_WHITEN_REWARDS]]
|
||
|
[--ref_model REF_MODEL]
|
||
|
[--ref_model_adapters REF_MODEL_ADAPTERS]
|
||
|
[--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT]
|
||
|
[--reward_model REWARD_MODEL]
|
||
|
[--reward_model_adapters REWARD_MODEL_ADAPTERS]
|
||
|
[--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT]
|
||
|
[--reward_model_type {lora,full,api}]
|
||
|
[--additional_target ADDITIONAL_TARGET]
|
||
|
[--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT]
|
||
|
[--lora_rank LORA_RANK] [--lora_target LORA_TARGET]
|
||
|
[--loraplus_lr_ratio LORAPLUS_LR_RATIO]
|
||
|
[--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING]
|
||
|
[--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]]
|
||
|
[--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER]
|
||
|
[--pissa_convert [PISSA_CONVERT]]
|
||
|
[--create_new_adapter [CREATE_NEW_ADAPTER]]
|
||
|
[--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS]
|
||
|
[--freeze_trainable_modules FREEZE_TRAINABLE_MODULES]
|
||
|
[--freeze_extra_modules FREEZE_EXTRA_MODULES]
|
||
|
[--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}]
|
||
|
[--finetuning_type {lora,freeze,full}]
|
||
|
[--use_llama_pro [USE_LLAMA_PRO]]
|
||
|
[--use_adam_mini [USE_ADAM_MINI]]
|
||
|
[--freeze_vision_tower [FREEZE_VISION_TOWER]]
|
||
|
[--no_freeze_vision_tower]
|
||
|
[--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]]
|
||
|
[--compute_accuracy [COMPUTE_ACCURACY]]
|
||
|
[--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]]
|
||
|
[--no_do_sample] [--temperature TEMPERATURE]
|
||
|
[--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS]
|
||
|
[--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS]
|
||
|
[--repetition_penalty REPETITION_PENALTY]
|
||
|
[--length_penalty LENGTH_PENALTY]
|
||
|
[--default_system DEFAULT_SYSTEM]
|
||
|
|
||
|
optional arguments:
|
||
|
-h, --help show this help message and exit
|
||
|
--model_name_or_path MODEL_NAME_OR_PATH
|
||
|
Path to the model weight or identifier from
|
||
|
huggingface.co/models or modelscope.cn/models.
|
||
|
(default: None)
|
||
|
--adapter_name_or_path ADAPTER_NAME_OR_PATH
|
||
|
Path to the adapter weight or identifier from
|
||
|
huggingface.co/models. Use commas to separate multiple
|
||
|
adapters. (default: None)
|
||
|
--adapter_folder ADAPTER_FOLDER
|
||
|
The folder containing the adapter weights to load.
|
||
|
(default: None)
|
||
|
--cache_dir CACHE_DIR
|
||
|
Where to store the pre-trained models downloaded from
|
||
|
huggingface.co or modelscope.cn. (default: None)
|
||
|
--use_fast_tokenizer [USE_FAST_TOKENIZER]
|
||
|
Whether or not to use one of the fast tokenizer
|
||
|
(backed by the tokenizers library). (default: True)
|
||
|
--no_use_fast_tokenizer
|
||
|
Whether or not to use one of the fast tokenizer
|
||
|
(backed by the tokenizers library). (default: False)
|
||
|
--resize_vocab [RESIZE_VOCAB]
|
||
|
Whether or not to resize the tokenizer vocab and the
|
||
|
embedding layers. (default: False)
|
||
|
--split_special_tokens [SPLIT_SPECIAL_TOKENS]
|
||
|
Whether or not the special tokens should be split
|
||
|
during the tokenization process. (default: False)
|
||
|
--new_special_tokens NEW_SPECIAL_TOKENS
|
||
|
Special tokens to be added into the tokenizer. Use
|
||
|
commas to separate multiple tokens. (default: None)
|
||
|
--model_revision MODEL_REVISION
|
||
|
The specific model version to use (can be a branch
|
||
|
name, tag name or commit id). (default: main)
|
||
|
--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]
|
||
|
Whether or not to use memory-efficient model loading.
|
||
|
(default: True)
|
||
|
--no_low_cpu_mem_usage
|
||
|
Whether or not to use memory-efficient model loading.
|
||
|
(default: False)
|
||
|
--quantization_method {bitsandbytes,hqq,eetq}
|
||
|
Quantization method to use for on-the-fly
|
||
|
quantization. (default: bitsandbytes)
|
||
|
--quantization_bit QUANTIZATION_BIT
|
||
|
The number of bits to quantize the model using
|
||
|
bitsandbytes. (default: None)
|
||
|
--quantization_type {fp4,nf4}
|
||
|
Quantization data type to use in int4 training.
|
||
|
(default: nf4)
|
||
|
--double_quantization [DOUBLE_QUANTIZATION]
|
||
|
Whether or not to use double quantization in int4
|
||
|
training. (default: True)
|
||
|
--no_double_quantization
|
||
|
Whether or not to use double quantization in int4
|
||
|
training. (default: False)
|
||
|
--quantization_device_map {auto}
|
||
|
Device map used to infer the 4-bit quantized model,
|
||
|
needs bitsandbytes>=0.43.0. (default: None)
|
||
|
--rope_scaling {linear,dynamic}
|
||
|
Which scaling strategy should be adopted for the RoPE
|
||
|
embeddings. (default: None)
|
||
|
--flash_attn {auto,disabled,sdpa,fa2}
|
||
|
Enable FlashAttention for faster training and
|
||
|
inference. (default: auto)
|
||
|
--shift_attn [SHIFT_ATTN]
|
||
|
Enable shift short attention (S^2-Attn) proposed by
|
||
|
LongLoRA. (default: False)
|
||
|
--mixture_of_depths {convert,load}
|
||
|
Convert the model to mixture-of-depths (MoD) or load
|
||
|
the MoD model. (default: None)
|
||
|
--use_unsloth [USE_UNSLOTH]
|
||
|
Whether or not to use unsloth's optimization for the
|
||
|
LoRA training. (default: False)
|
||
|
--visual_inputs [VISUAL_INPUTS]
|
||
|
Whethor or not to use multimodal LLM that accepts
|
||
|
visual inputs. (default: False)
|
||
|
--moe_aux_loss_coef MOE_AUX_LOSS_COEF
|
||
|
Coefficient of the auxiliary router loss in mixture-
|
||
|
of-experts model. (default: None)
|
||
|
--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]
|
||
|
Whether or not to disable gradient checkpointing.
|
||
|
(default: False)
|
||
|
--upcast_layernorm [UPCAST_LAYERNORM]
|
||
|
Whether or not to upcast the layernorm weights in
|
||
|
fp32. (default: False)
|
||
|
--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]
|
||
|
Whether or not to upcast the output of lm_head in
|
||
|
fp32. (default: False)
|
||
|
--train_from_scratch [TRAIN_FROM_SCRATCH]
|
||
|
Whether or not to randomly initialize the model
|
||
|
weights. (default: False)
|
||
|
--infer_backend {huggingface,vllm}
|
||
|
Backend engine used at inference. (default:
|
||
|
huggingface)
|
||
|
--vllm_maxlen VLLM_MAXLEN
|
||
|
Maximum sequence (prompt + response) length of the
|
||
|
vLLM engine. (default: 2048)
|
||
|
--vllm_gpu_util VLLM_GPU_UTIL
|
||
|
The fraction of GPU memory in (0,1) to be used for the
|
||
|
vLLM engine. (default: 0.9)
|
||
|
--vllm_enforce_eager [VLLM_ENFORCE_EAGER]
|
||
|
Whether or not to disable CUDA graph in the vLLM
|
||
|
engine. (default: False)
|
||
|
--vllm_max_lora_rank VLLM_MAX_LORA_RANK
|
||
|
Maximum rank of all LoRAs in the vLLM engine.
|
||
|
(default: 32)
|
||
|
--offload_folder OFFLOAD_FOLDER
|
||
|
Path to offload model weights. (default: offload)
|
||
|
--use_cache [USE_CACHE]
|
||
|
Whether or not to use KV cache in generation.
|
||
|
(default: True)
|
||
|
--no_use_cache Whether or not to use KV cache in generation.
|
||
|
(default: False)
|
||
|
--infer_dtype {auto,float16,bfloat16,float32}
|
||
|
Data type for model weights and activations at
|
||
|
inference. (default: auto)
|
||
|
--hf_hub_token HF_HUB_TOKEN
|
||
|
Auth token to log in with Hugging Face Hub. (default:
|
||
|
None)
|
||
|
--ms_hub_token MS_HUB_TOKEN
|
||
|
Auth token to log in with ModelScope Hub. (default:
|
||
|
None)
|
||
|
--export_dir EXPORT_DIR
|
||
|
Path to the directory to save the exported model.
|
||
|
(default: None)
|
||
|
--export_size EXPORT_SIZE
|
||
|
The file shard size (in GB) of the exported model.
|
||
|
(default: 1)
|
||
|
--export_device {cpu,auto}
|
||
|
The device used in model export, use `auto` to
|
||
|
accelerate exporting. (default: cpu)
|
||
|
--export_quantization_bit EXPORT_QUANTIZATION_BIT
|
||
|
The number of bits to quantize the exported model.
|
||
|
(default: None)
|
||
|
--export_quantization_dataset EXPORT_QUANTIZATION_DATASET
|
||
|
Path to the dataset or dataset name to use in
|
||
|
quantizing the exported model. (default: None)
|
||
|
--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES
|
||
|
The number of samples used for quantization. (default:
|
||
|
128)
|
||
|
--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN
|
||
|
The maximum length of the model inputs used for
|
||
|
quantization. (default: 1024)
|
||
|
--export_legacy_format [EXPORT_LEGACY_FORMAT]
|
||
|
Whether or not to save the `.bin` files instead of
|
||
|
`.safetensors`. (default: False)
|
||
|
--export_hub_model_id EXPORT_HUB_MODEL_ID
|
||
|
The name of the repository if push the model to the
|
||
|
Hugging Face hub. (default: None)
|
||
|
--print_param_status [PRINT_PARAM_STATUS]
|
||
|
For debugging purposes, print the status of the
|
||
|
parameters in the model. (default: False)
|
||
|
--template TEMPLATE Which template to use for constructing prompts in
|
||
|
training and inference. (default: None)
|
||
|
--dataset DATASET The name of dataset(s) to use for training. Use commas
|
||
|
to separate multiple datasets. (default: None)
|
||
|
--eval_dataset EVAL_DATASET
|
||
|
The name of dataset(s) to use for evaluation. Use
|
||
|
commas to separate multiple datasets. (default: None)
|
||
|
--dataset_dir DATASET_DIR
|
||
|
Path to the folder containing the datasets. (default:
|
||
|
data)
|
||
|
--cutoff_len CUTOFF_LEN
|
||
|
The cutoff length of the tokenized inputs in the
|
||
|
dataset. (default: 1024)
|
||
|
--train_on_prompt [TRAIN_ON_PROMPT]
|
||
|
Whether or not to disable the mask on the prompt.
|
||
|
(default: False)
|
||
|
--mask_history [MASK_HISTORY]
|
||
|
Whether or not to mask the history and train on the
|
||
|
last turn only. (default: False)
|
||
|
--streaming [STREAMING]
|
||
|
Enable dataset streaming. (default: False)
|
||
|
--buffer_size BUFFER_SIZE
|
||
|
Size of the buffer to randomly sample examples from in
|
||
|
dataset streaming. (default: 16384)
|
||
|
--mix_strategy {concat,interleave_under,interleave_over}
|
||
|
Strategy to use in dataset mixing (concat/interleave)
|
||
|
(undersampling/oversampling). (default: concat)
|
||
|
--interleave_probs INTERLEAVE_PROBS
|
||
|
Probabilities to sample data from datasets. Use commas
|
||
|
to separate multiple datasets. (default: None)
|
||
|
--overwrite_cache [OVERWRITE_CACHE]
|
||
|
Overwrite the cached training and evaluation sets.
|
||
|
(default: False)
|
||
|
--preprocessing_num_workers PREPROCESSING_NUM_WORKERS
|
||
|
The number of processes to use for the pre-processing.
|
||
|
(default: None)
|
||
|
--max_samples MAX_SAMPLES
|
||
|
For debugging purposes, truncate the number of
|
||
|
examples for each dataset. (default: None)
|
||
|
--eval_num_beams EVAL_NUM_BEAMS
|
||
|
Number of beams to use for evaluation. This argument
|
||
|
will be passed to `model.generate` (default: None)
|
||
|
--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]
|
||
|
Whether or not to ignore the tokens corresponding to
|
||
|
the pad label in loss computation. (default: True)
|
||
|
--no_ignore_pad_token_for_loss
|
||
|
Whether or not to ignore the tokens corresponding to
|
||
|
the pad label in loss computation. (default: False)
|
||
|
--val_size VAL_SIZE Size of the development set, should be an integer or a
|
||
|
float in range `[0,1)`. (default: 0.0)
|
||
|
--packing PACKING Enable sequences packing in training. Will
|
||
|
automatically enable in pre-training. (default: None)
|
||
|
--neat_packing [NEAT_PACKING]
|
||
|
Enable sequence packing without cross-attention.
|
||
|
(default: False)
|
||
|
--tool_format TOOL_FORMAT
|
||
|
Tool format to use for constructing function calling
|
||
|
examples. (default: None)
|
||
|
--tokenized_path TOKENIZED_PATH
|
||
|
Path to save or load the tokenized datasets. (default:
|
||
|
None)
|
||
|
--output_dir OUTPUT_DIR
|
||
|
The output directory where the model predictions and
|
||
|
checkpoints will be written. (default: None)
|
||
|
--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]
|
||
|
Overwrite the content of the output directory. Use
|
||
|
this to continue training if output_dir points to a
|
||
|
checkpoint directory. (default: False)
|
||
|
--do_train [DO_TRAIN]
|
||
|
Whether to run training. (default: False)
|
||
|
--do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False)
|
||
|
--do_predict [DO_PREDICT]
|
||
|
Whether to run predictions on the test set. (default:
|
||
|
False)
|
||
|
--eval_strategy {no,steps,epoch}
|
||
|
The evaluation strategy to use. (default: no)
|
||
|
--prediction_loss_only [PREDICTION_LOSS_ONLY]
|
||
|
When performing evaluation and predictions, only
|
||
|
returns the loss. (default: False)
|
||
|
--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE
|
||
|
Batch size per GPU/TPU/MPS/NPU core/CPU for training.
|
||
|
(default: 8)
|
||
|
--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE
|
||
|
Batch size per GPU/TPU/MPS/NPU core/CPU for
|
||
|
evaluation. (default: 8)
|
||
|
--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE
|
||
|
Deprecated, the use of `--per_device_train_batch_size`
|
||
|
is preferred. Batch size per GPU/TPU core/CPU for
|
||
|
training. (default: None)
|
||
|
--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE
|
||
|
Deprecated, the use of `--per_device_eval_batch_size`
|
||
|
is preferred. Batch size per GPU/TPU core/CPU for
|
||
|
evaluation. (default: None)
|
||
|
--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS
|
||
|
Number of updates steps to accumulate before
|
||
|
performing a backward/update pass. (default: 1)
|
||
|
--eval_accumulation_steps EVAL_ACCUMULATION_STEPS
|
||
|
Number of predictions steps to accumulate before
|
||
|
moving the tensors to the CPU. (default: None)
|
||
|
--eval_delay EVAL_DELAY
|
||
|
Number of epochs or steps to wait for before the first
|
||
|
evaluation can be performed, depending on the
|
||
|
eval_strategy. (default: 0)
|
||
|
--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS
|
||
|
Number of steps to wait before calling
|
||
|
`torch.<device>.empty_cache()`.This can help avoid
|
||
|
CUDA out-of-memory errors by lowering peak VRAM usage
|
||
|
at a cost of about [10{'option_strings': ['--
|
||
|
torch_empty_cache_steps'], 'dest':
|
||
|
'torch_empty_cache_steps', 'nargs': None, 'const':
|
||
|
None, 'default': None, 'type': 'int', 'choices': None,
|
||
|
'required': False, 'help': 'Number of steps to wait
|
||
|
before calling `torch.<device>.empty_cache()`.This can
|
||
|
help avoid CUDA out-of-memory errors by lowering peak
|
||
|
VRAM usage at a cost of about [10% slower performance]
|
||
|
(https://github.com/huggingface/transformers/issues/31
|
||
|
372).If left unset or set to None, cache will not be
|
||
|
emptied.', 'metavar': None, 'container':
|
||
|
<argparse._ArgumentGroup object at 0x7fe22dc1efd0>,
|
||
|
'prog': 'launcher.py'}lower performance](https://githu
|
||
|
b.com/huggingface/transformers/issues/31372).If left
|
||
|
unset or set to None, cache will not be emptied.
|
||
|
(default: None)
|
||
|
--learning_rate LEARNING_RATE
|
||
|
The initial learning rate for AdamW. (default: 5e-05)
|
||
|
--weight_decay WEIGHT_DECAY
|
||
|
Weight decay for AdamW if we apply some. (default:
|
||
|
0.0)
|
||
|
--adam_beta1 ADAM_BETA1
|
||
|
Beta1 for AdamW optimizer (default: 0.9)
|
||
|
--adam_beta2 ADAM_BETA2
|
||
|
Beta2 for AdamW optimizer (default: 0.999)
|
||
|
--adam_epsilon ADAM_EPSILON
|
||
|
Epsilon for AdamW optimizer. (default: 1e-08)
|
||
|
--max_grad_norm MAX_GRAD_NORM
|
||
|
Max gradient norm. (default: 1.0)
|
||
|
--num_train_epochs NUM_TRAIN_EPOCHS
|
||
|
Total number of training epochs to perform. (default:
|
||
|
3.0)
|
||
|
--max_steps MAX_STEPS
|
||
|
If > 0: set total number of training steps to perform.
|
||
|
Override num_train_epochs. (default: -1)
|
||
|
--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}
|
||
|
The scheduler type to use. (default: linear)
|
||
|
--lr_scheduler_kwargs LR_SCHEDULER_KWARGS
|
||
|
Extra parameters for the lr_scheduler such as
|
||
|
{'num_cycles': 1} for the cosine with hard restarts.
|
||
|
(default: {})
|
||
|
--warmup_ratio WARMUP_RATIO
|
||
|
Linear warmup over warmup_ratio fraction of total
|
||
|
steps. (default: 0.0)
|
||
|
--warmup_steps WARMUP_STEPS
|
||
|
Linear warmup over warmup_steps. (default: 0)
|
||
|
--log_level {detail,debug,info,warning,error,critical,passive}
|
||
|
Logger log level to use on the main node. Possible
|
||
|
choices are the log levels as strings: 'debug',
|
||
|
'info', 'warning', 'error' and 'critical', plus a
|
||
|
'passive' level which doesn't set anything and lets
|
||
|
the application set the level. Defaults to 'passive'.
|
||
|
(default: passive)
|
||
|
--log_level_replica {detail,debug,info,warning,error,critical,passive}
|
||
|
Logger log level to use on replica nodes. Same choices
|
||
|
and defaults as ``log_level`` (default: warning)
|
||
|
--log_on_each_node [LOG_ON_EACH_NODE]
|
||
|
When doing a multinode distributed training, whether
|
||
|
to log once per node or just once on the main node.
|
||
|
(default: True)
|
||
|
--no_log_on_each_node
|
||
|
When doing a multinode distributed training, whether
|
||
|
to log once per node or just once on the main node.
|
||
|
(default: False)
|
||
|
--logging_dir LOGGING_DIR
|
||
|
Tensorboard log dir. (default: None)
|
||
|
--logging_strategy {no,steps,epoch}
|
||
|
The logging strategy to use. (default: steps)
|
||
|
--logging_first_step [LOGGING_FIRST_STEP]
|
||
|
Log the first global_step (default: False)
|
||
|
--logging_steps LOGGING_STEPS
|
||
|
Log every X updates steps. Should be an integer or a
|
||
|
float in range `[0,1)`. If smaller than 1, will be
|
||
|
interpreted as ratio of total training steps.
|
||
|
(default: 500)
|
||
|
--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]
|
||
|
Filter nan and inf losses for logging. (default: True)
|
||
|
--no_logging_nan_inf_filter
|
||
|
Filter nan and inf losses for logging. (default:
|
||
|
False)
|
||
|
--save_strategy {no,steps,epoch}
|
||
|
The checkpoint save strategy to use. (default: steps)
|
||
|
--save_steps SAVE_STEPS
|
||
|
Save checkpoint every X updates steps. Should be an
|
||
|
integer or a float in range `[0,1)`. If smaller than
|
||
|
1, will be interpreted as ratio of total training
|
||
|
steps. (default: 500)
|
||
|
--save_total_limit SAVE_TOTAL_LIMIT
|
||
|
If a value is passed, will limit the total amount of
|
||
|
checkpoints. Deletes the older checkpoints in
|
||
|
`output_dir`. When `load_best_model_at_end` is
|
||
|
enabled, the 'best' checkpoint according to
|
||
|
`metric_for_best_model` will always be retained in
|
||
|
addition to the most recent ones. For example, for
|
||
|
`save_total_limit=5` and
|
||
|
`load_best_model_at_end=True`, the four last
|
||
|
checkpoints will always be retained alongside the best
|
||
|
model. When `save_total_limit=1` and
|
||
|
`load_best_model_at_end=True`, it is possible that two
|
||
|
checkpoints are saved: the last one and the best one
|
||
|
(if they are different). Default is unlimited
|
||
|
checkpoints (default: None)
|
||
|
--save_safetensors [SAVE_SAFETENSORS]
|
||
|
Use safetensors saving and loading for state dicts
|
||
|
instead of default torch.load and torch.save.
|
||
|
(default: True)
|
||
|
--no_save_safetensors
|
||
|
Use safetensors saving and loading for state dicts
|
||
|
instead of default torch.load and torch.save.
|
||
|
(default: False)
|
||
|
--save_on_each_node [SAVE_ON_EACH_NODE]
|
||
|
When doing multi-node distributed training, whether to
|
||
|
save models and checkpoints on each node, or only on
|
||
|
the main one (default: False)
|
||
|
--save_only_model [SAVE_ONLY_MODEL]
|
||
|
When checkpointing, whether to only save the model, or
|
||
|
also the optimizer, scheduler & rng state.Note that
|
||
|
when this is true, you won't be able to resume
|
||
|
training from checkpoint.This enables you to save
|
||
|
storage by not storing the optimizer, scheduler & rng
|
||
|
state.You can only load the model using
|
||
|
from_pretrained with this option set to True.
|
||
|
(default: False)
|
||
|
--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]
|
||
|
Whether to restore the callback states from the
|
||
|
checkpoint. If `True`, will override callbacks passed
|
||
|
to the `Trainer` if they exist in the checkpoint.
|
||
|
(default: False)
|
||
|
--no_cuda [NO_CUDA] This argument is deprecated. It will be removed in
|
||
|
version 5.0 of 🤗 Transformers. (default: False)
|
||
|
--use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will
|
||
|
use cuda/tpu/mps/npu device if available. (default:
|
||
|
False)
|
||
|
--use_mps_device [USE_MPS_DEVICE]
|
||
|
This argument is deprecated. `mps` device will be used
|
||
|
if available similar to `cuda` device. It will be
|
||
|
removed in version 5.0 of 🤗 Transformers (default:
|
||
|
False)
|
||
|
--seed SEED Random seed that will be set at the beginning of
|
||
|
training. (default: 42)
|
||
|
--data_seed DATA_SEED
|
||
|
Random seed to be used with data samplers. (default:
|
||
|
None)
|
||
|
--jit_mode_eval [JIT_MODE_EVAL]
|
||
|
Whether or not to use PyTorch jit trace for inference
|
||
|
(default: False)
|
||
|
--use_ipex [USE_IPEX]
|
||
|
Use Intel extension for PyTorch when it is available,
|
||
|
installation: 'https://github.com/intel/intel-
|
||
|
extension-for-pytorch' (default: False)
|
||
|
--bf16 [BF16] Whether to use bf16 (mixed) precision instead of
|
||
|
32-bit. Requires Ampere or higher NVIDIA architecture
|
||
|
or using CPU (use_cpu) or Ascend NPU. This is an
|
||
|
experimental API and it may change. (default: False)
|
||
|
--fp16 [FP16] Whether to use fp16 (mixed) precision instead of
|
||
|
32-bit (default: False)
|
||
|
--fp16_opt_level FP16_OPT_LEVEL
|
||
|
For fp16: Apex AMP optimization level selected in
|
||
|
['O0', 'O1', 'O2', and 'O3']. See details at
|
||
|
https://nvidia.github.io/apex/amp.html (default: O1)
|
||
|
--half_precision_backend {auto,apex,cpu_amp}
|
||
|
The backend to be used for half precision. (default:
|
||
|
auto)
|
||
|
--bf16_full_eval [BF16_FULL_EVAL]
|
||
|
Whether to use full bfloat16 evaluation instead of
|
||
|
32-bit. This is an experimental API and it may change.
|
||
|
(default: False)
|
||
|
--fp16_full_eval [FP16_FULL_EVAL]
|
||
|
Whether to use full float16 evaluation instead of
|
||
|
32-bit (default: False)
|
||
|
--tf32 TF32 Whether to enable tf32 mode, available in Ampere and
|
||
|
newer GPU architectures. This is an experimental API
|
||
|
and it may change. (default: None)
|
||
|
--local_rank LOCAL_RANK
|
||
|
For distributed training: local_rank (default: -1)
|
||
|
--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}
|
||
|
The backend to be used for distributed training
|
||
|
(default: None)
|
||
|
--tpu_num_cores TPU_NUM_CORES
|
||
|
TPU: Number of TPU cores (automatically passed by
|
||
|
launcher script) (default: None)
|
||
|
--tpu_metrics_debug [TPU_METRICS_DEBUG]
|
||
|
Deprecated, the use of `--debug tpu_metrics_debug` is
|
||
|
preferred. TPU: Whether to print debug metrics
|
||
|
(default: False)
|
||
|
--debug DEBUG [DEBUG ...]
|
||
|
Whether or not to enable debug mode. Current options:
|
||
|
`underflow_overflow` (Detect underflow and overflow in
|
||
|
activations and weights), `tpu_metrics_debug` (print
|
||
|
debug metrics on TPU). (default: None)
|
||
|
--dataloader_drop_last [DATALOADER_DROP_LAST]
|
||
|
Drop the last incomplete batch if it is not divisible
|
||
|
by the batch size. (default: False)
|
||
|
--eval_steps EVAL_STEPS
|
||
|
Run an evaluation every X steps. Should be an integer
|
||
|
or a float in range `[0,1)`. If smaller than 1, will
|
||
|
be interpreted as ratio of total training steps.
|
||
|
(default: None)
|
||
|
--dataloader_num_workers DATALOADER_NUM_WORKERS
|
||
|
Number of subprocesses to use for data loading
|
||
|
(PyTorch only). 0 means that the data will be loaded
|
||
|
in the main process. (default: 0)
|
||
|
--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR
|
||
|
Number of batches loaded in advance by each worker. 2
|
||
|
means there will be a total of 2 * num_workers batches
|
||
|
prefetched across all workers. Default is 2 for
|
||
|
PyTorch < 2.0.0 and otherwise None. (default: None)
|
||
|
--past_index PAST_INDEX
|
||
|
If >=0, uses the corresponding part of the output as
|
||
|
the past state for next step. (default: -1)
|
||
|
--run_name RUN_NAME An optional descriptor for the run. Notably used for
|
||
|
wandb, mlflow and comet logging. (default: None)
|
||
|
--disable_tqdm DISABLE_TQDM
|
||
|
Whether or not to disable the tqdm progress bars.
|
||
|
(default: None)
|
||
|
--remove_unused_columns [REMOVE_UNUSED_COLUMNS]
|
||
|
Remove columns not required by the model when using an
|
||
|
nlp.Dataset. (default: True)
|
||
|
--no_remove_unused_columns
|
||
|
Remove columns not required by the model when using an
|
||
|
nlp.Dataset. (default: False)
|
||
|
--label_names LABEL_NAMES [LABEL_NAMES ...]
|
||
|
The list of keys in your dictionary of inputs that
|
||
|
correspond to the labels. (default: None)
|
||
|
--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]
|
||
|
Whether or not to load the best model found during
|
||
|
training at the end of training. When this option is
|
||
|
enabled, the best checkpoint will always be saved. See
|
||
|
`save_total_limit` for more. (default: False)
|
||
|
--metric_for_best_model METRIC_FOR_BEST_MODEL
|
||
|
The metric to use to compare two different models.
|
||
|
(default: None)
|
||
|
--greater_is_better GREATER_IS_BETTER
|
||
|
Whether the `metric_for_best_model` should be
|
||
|
maximized or not. (default: None)
|
||
|
--ignore_data_skip [IGNORE_DATA_SKIP]
|
||
|
When resuming training, whether or not to skip the
|
||
|
first epochs and batches to get to the same training
|
||
|
data. (default: False)
|
||
|
--fsdp FSDP Whether or not to use PyTorch Fully Sharded Data
|
||
|
Parallel (FSDP) training (in distributed training
|
||
|
only). The base option should be `full_shard`,
|
||
|
`shard_grad_op` or `no_shard` and you can add CPU-
|
||
|
offload to `full_shard` or `shard_grad_op` like this:
|
||
|
full_shard offload` or `shard_grad_op offload`. You
|
||
|
can add auto-wrap to `full_shard` or `shard_grad_op`
|
||
|
with the same syntax: full_shard auto_wrap` or
|
||
|
`shard_grad_op auto_wrap`. (default: )
|
||
|
--fsdp_min_num_params FSDP_MIN_NUM_PARAMS
|
||
|
This parameter is deprecated. FSDP's minimum number of
|
||
|
parameters for Default Auto Wrapping. (useful only
|
||
|
when `fsdp` field is passed). (default: 0)
|
||
|
--fsdp_config FSDP_CONFIG
|
||
|
Config to be used with FSDP (Pytorch Fully Sharded
|
||
|
Data Parallel). The value is either a fsdp json config
|
||
|
file (e.g., `fsdp_config.json`) or an already loaded
|
||
|
json file as `dict`. (default: None)
|
||
|
--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP
|
||
|
This parameter is deprecated. Transformer layer class
|
||
|
name (case-sensitive) to wrap, e.g, `BertLayer`,
|
||
|
`GPTJBlock`, `T5Block` .... (useful only when `fsdp`
|
||
|
flag is passed). (default: None)
|
||
|
--accelerator_config ACCELERATOR_CONFIG
|
||
|
Config to be used with the internal Accelerator object
|
||
|
initializtion. The value is either a accelerator json
|
||
|
config file (e.g., `accelerator_config.json`) or an
|
||
|
already loaded json file as `dict`. (default: None)
|
||
|
--deepspeed DEEPSPEED
|
||
|
Enable deepspeed and pass the path to deepspeed json
|
||
|
config file (e.g. `ds_config.json`) or an already
|
||
|
loaded json file as a dict (default: None)
|
||
|
--label_smoothing_factor LABEL_SMOOTHING_FACTOR
|
||
|
The label smoothing epsilon to apply (zero means no
|
||
|
label smoothing). (default: 0.0)
|
||
|
--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}
|
||
|
The optimizer to use. (default: adamw_torch)
|
||
|
--optim_args OPTIM_ARGS
|
||
|
Optional arguments to supply to optimizer. (default:
|
||
|
None)
|
||
|
--adafactor [ADAFACTOR]
|
||
|
Whether or not to replace AdamW by Adafactor.
|
||
|
(default: False)
|
||
|
--group_by_length [GROUP_BY_LENGTH]
|
||
|
Whether or not to group samples of roughly the same
|
||
|
length together when batching. (default: False)
|
||
|
--length_column_name LENGTH_COLUMN_NAME
|
||
|
Column name with precomputed lengths to use when
|
||
|
grouping by length. (default: length)
|
||
|
--report_to REPORT_TO
|
||
|
The list of integrations to report the results and
|
||
|
logs to. (default: None)
|
||
|
--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS
|
||
|
When using distributed training, the value of the flag
|
||
|
`find_unused_parameters` passed to
|
||
|
`DistributedDataParallel`. (default: None)
|
||
|
--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB
|
||
|
When using distributed training, the value of the flag
|
||
|
`bucket_cap_mb` passed to `DistributedDataParallel`.
|
||
|
(default: None)
|
||
|
--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS
|
||
|
When using distributed training, the value of the flag
|
||
|
`broadcast_buffers` passed to
|
||
|
`DistributedDataParallel`. (default: None)
|
||
|
--dataloader_pin_memory [DATALOADER_PIN_MEMORY]
|
||
|
Whether or not to pin memory for DataLoader. (default:
|
||
|
True)
|
||
|
--no_dataloader_pin_memory
|
||
|
Whether or not to pin memory for DataLoader. (default:
|
||
|
False)
|
||
|
--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]
|
||
|
If True, the data loader will not shut down the worker
|
||
|
processes after a dataset has been consumed once. This
|
||
|
allows to maintain the workers Dataset instances
|
||
|
alive. Can potentially speed up training, but will
|
||
|
increase RAM usage. (default: False)
|
||
|
--skip_memory_metrics [SKIP_MEMORY_METRICS]
|
||
|
Whether or not to skip adding of memory profiler
|
||
|
reports to metrics. (default: True)
|
||
|
--no_skip_memory_metrics
|
||
|
Whether or not to skip adding of memory profiler
|
||
|
reports to metrics. (default: False)
|
||
|
--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]
|
||
|
Whether or not to use the legacy prediction_loop in
|
||
|
the Trainer. (default: False)
|
||
|
--push_to_hub [PUSH_TO_HUB]
|
||
|
Whether or not to upload the trained model to the
|
||
|
model hub after training. (default: False)
|
||
|
--resume_from_checkpoint RESUME_FROM_CHECKPOINT
|
||
|
The path to a folder with a valid checkpoint for your
|
||
|
model. (default: None)
|
||
|
--hub_model_id HUB_MODEL_ID
|
||
|
The name of the repository to keep in sync with the
|
||
|
local `output_dir`. (default: None)
|
||
|
--hub_strategy {end,every_save,checkpoint,all_checkpoints}
|
||
|
The hub strategy to use when `--push_to_hub` is
|
||
|
activated. (default: every_save)
|
||
|
--hub_token HUB_TOKEN
|
||
|
The token to use to push to the Model Hub. (default:
|
||
|
None)
|
||
|
--hub_private_repo [HUB_PRIVATE_REPO]
|
||
|
Whether the model repository is private or not.
|
||
|
(default: False)
|
||
|
--hub_always_push [HUB_ALWAYS_PUSH]
|
||
|
Unless `True`, the Trainer will skip pushes if the
|
||
|
previous one wasn't finished yet. (default: False)
|
||
|
--gradient_checkpointing [GRADIENT_CHECKPOINTING]
|
||
|
If True, use gradient checkpointing to save memory at
|
||
|
the expense of slower backward pass. (default: False)
|
||
|
--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS
|
||
|
Gradient checkpointing key word arguments such as
|
||
|
`use_reentrant`. Will be passed to
|
||
|
`torch.utils.checkpoint.checkpoint` through
|
||
|
`model.gradient_checkpointing_enable`. (default: None)
|
||
|
--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]
|
||
|
Whether or not the inputs will be passed to the
|
||
|
`compute_metrics` function. (default: False)
|
||
|
--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]
|
||
|
Whether to recursively concat
|
||
|
inputs/losses/labels/predictions across batches. If
|
||
|
`False`, will instead store them as lists, with each
|
||
|
batch kept separate. (default: True)
|
||
|
--no_eval_do_concat_batches
|
||
|
Whether to recursively concat
|
||
|
inputs/losses/labels/predictions across batches. If
|
||
|
`False`, will instead store them as lists, with each
|
||
|
batch kept separate. (default: False)
|
||
|
--fp16_backend {auto,apex,cpu_amp}
|
||
|
Deprecated. Use half_precision_backend instead
|
||
|
(default: auto)
|
||
|
--evaluation_strategy {no,steps,epoch}
|
||
|
Deprecated. Use `eval_strategy` instead (default:
|
||
|
None)
|
||
|
--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID
|
||
|
The name of the repository to which push the
|
||
|
`Trainer`. (default: None)
|
||
|
--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION
|
||
|
The name of the organization in with to which push the
|
||
|
`Trainer`. (default: None)
|
||
|
--push_to_hub_token PUSH_TO_HUB_TOKEN
|
||
|
The token to use to push to the Model Hub. (default:
|
||
|
None)
|
||
|
--mp_parameters MP_PARAMETERS
|
||
|
Used by the SageMaker launcher to send mp-specific
|
||
|
args. Ignored in Trainer (default: )
|
||
|
--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]
|
||
|
Whether to automatically decrease the batch size in
|
||
|
half and rerun the training loop again each time a
|
||
|
CUDA Out-of-Memory was reached (default: False)
|
||
|
--full_determinism [FULL_DETERMINISM]
|
||
|
Whether to call enable_full_determinism instead of
|
||
|
set_seed for reproducibility in distributed training.
|
||
|
Important: this will negatively impact the
|
||
|
performance, so only use it for debugging. (default:
|
||
|
False)
|
||
|
--torchdynamo TORCHDYNAMO
|
||
|
This argument is deprecated, use
|
||
|
`--torch_compile_backend` instead. (default: None)
|
||
|
--ray_scope RAY_SCOPE
|
||
|
The scope to use when doing hyperparameter search with
|
||
|
Ray. By default, `"last"` will be used. Ray will then
|
||
|
use the last checkpoint of all trials, compare those,
|
||
|
and select the best one. However, other options are
|
||
|
also available. See the Ray documentation (https://doc
|
||
|
s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun
|
||
|
e.ExperimentAnalysis.get_best_trial) for more options.
|
||
|
(default: last)
|
||
|
--ddp_timeout DDP_TIMEOUT
|
||
|
Overrides the default timeout for distributed training
|
||
|
(value should be given in seconds). (default: 1800)
|
||
|
--torch_compile [TORCH_COMPILE]
|
||
|
If set to `True`, the model will be wrapped in
|
||
|
`torch.compile`. (default: False)
|
||
|
--torch_compile_backend TORCH_COMPILE_BACKEND
|
||
|
Which backend to use with `torch.compile`, passing one
|
||
|
will trigger a model compilation. (default: None)
|
||
|
--torch_compile_mode TORCH_COMPILE_MODE
|
||
|
Which mode to use with `torch.compile`, passing one
|
||
|
will trigger a model compilation. (default: None)
|
||
|
--dispatch_batches DISPATCH_BATCHES
|
||
|
Deprecated. Pass {'dispatch_batches':VALUE} to
|
||
|
`accelerator_config`. (default: None)
|
||
|
--split_batches SPLIT_BATCHES
|
||
|
Deprecated. Pass {'split_batches':True} to
|
||
|
`accelerator_config`. (default: None)
|
||
|
--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]
|
||
|
If set to `True`, the speed metrics will include `tgs`
|
||
|
(tokens per second per device). (default: False)
|
||
|
--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]
|
||
|
If set to `True`, will track the number of input
|
||
|
tokens seen throughout training. (May be slower in
|
||
|
distributed training) (default: False)
|
||
|
--neftune_noise_alpha NEFTUNE_NOISE_ALPHA
|
||
|
Activates neftune noise embeddings into the model.
|
||
|
NEFTune has been proven to drastically improve model
|
||
|
performances for instrcution fine-tuning. Check out
|
||
|
the original paper here:
|
||
|
https://arxiv.org/abs/2310.05914 and the original code
|
||
|
here: https://github.com/neelsjain/NEFTune. Only
|
||
|
supported for `PreTrainedModel` and `PeftModel`
|
||
|
classes. (default: None)
|
||
|
--optim_target_modules OPTIM_TARGET_MODULES
|
||
|
Target modules for the optimizer defined in the
|
||
|
`optim` argument. Only used for the GaLore optimizer
|
||
|
at the moment. (default: None)
|
||
|
--batch_eval_metrics [BATCH_EVAL_METRICS]
|
||
|
Break eval metrics calculation into batches to save
|
||
|
memory. (default: False)
|
||
|
--eval_on_start [EVAL_ON_START]
|
||
|
Whether to run through the entire `evaluation` step at
|
||
|
the very beginning of training as a sanity check.
|
||
|
(default: False)
|
||
|
--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]
|
||
|
Whether to run recursively gather object in a nested
|
||
|
list/tuple/dictionary of objects from all devices.
|
||
|
(default: False)
|
||
|
--sortish_sampler [SORTISH_SAMPLER]
|
||
|
Whether to use SortishSampler or not. (default: False)
|
||
|
--predict_with_generate [PREDICT_WITH_GENERATE]
|
||
|
Whether to use generate to calculate generative
|
||
|
metrics (ROUGE, BLEU). (default: False)
|
||
|
--generation_max_length GENERATION_MAX_LENGTH
|
||
|
The `max_length` to use on each evaluation loop when
|
||
|
`predict_with_generate=True`. Will default to the
|
||
|
`max_length` value of the model configuration.
|
||
|
(default: None)
|
||
|
--generation_num_beams GENERATION_NUM_BEAMS
|
||
|
The `num_beams` to use on each evaluation loop when
|
||
|
`predict_with_generate=True`. Will default to the
|
||
|
`num_beams` value of the model configuration.
|
||
|
(default: None)
|
||
|
--generation_config GENERATION_CONFIG
|
||
|
Model id, file path or url pointing to a
|
||
|
GenerationConfig json file, to use during prediction.
|
||
|
(default: None)
|
||
|
--use_badam [USE_BADAM]
|
||
|
Whether or not to use the BAdam optimizer. (default:
|
||
|
False)
|
||
|
--badam_mode {layer,ratio}
|
||
|
Whether to use layer-wise or ratio-wise BAdam
|
||
|
optimizer. (default: layer)
|
||
|
--badam_start_block BADAM_START_BLOCK
|
||
|
The starting block index for layer-wise BAdam.
|
||
|
(default: None)
|
||
|
--badam_switch_mode {ascending,descending,random,fixed}
|
||
|
the strategy of picking block to update for layer-wise
|
||
|
BAdam. (default: ascending)
|
||
|
--badam_switch_interval BADAM_SWITCH_INTERVAL
|
||
|
Number of steps to update the block for layer-wise
|
||
|
BAdam. Use -1 to disable the block update. (default:
|
||
|
50)
|
||
|
--badam_update_ratio BADAM_UPDATE_RATIO
|
||
|
The ratio of the update for ratio-wise BAdam.
|
||
|
(default: 0.05)
|
||
|
--badam_mask_mode {adjacent,scatter}
|
||
|
The mode of the mask for BAdam optimizer. `adjacent`
|
||
|
means that the trainable parameters are adjacent to
|
||
|
each other, `scatter` means that trainable parameters
|
||
|
are randomly choosed from the weight. (default:
|
||
|
adjacent)
|
||
|
--badam_verbose BADAM_VERBOSE
|
||
|
The verbosity level of BAdam optimizer. 0 for no
|
||
|
print, 1 for print the block prefix, 2 for print
|
||
|
trainable parameters. (default: 0)
|
||
|
--use_galore [USE_GALORE]
|
||
|
Whether or not to use the gradient low-Rank projection
|
||
|
(GaLore). (default: False)
|
||
|
--galore_target GALORE_TARGET
|
||
|
Name(s) of modules to apply GaLore. Use commas to
|
||
|
separate multiple modules. Use `all` to specify all
|
||
|
the linear modules. (default: all)
|
||
|
--galore_rank GALORE_RANK
|
||
|
The rank of GaLore gradients. (default: 16)
|
||
|
--galore_update_interval GALORE_UPDATE_INTERVAL
|
||
|
Number of steps to update the GaLore projection.
|
||
|
(default: 200)
|
||
|
--galore_scale GALORE_SCALE
|
||
|
GaLore scaling coefficient. (default: 0.25)
|
||
|
--galore_proj_type {std,reverse_std,right,left,full}
|
||
|
Type of GaLore projection. (default: std)
|
||
|
--galore_layerwise [GALORE_LAYERWISE]
|
||
|
Whether or not to enable layer-wise update to further
|
||
|
save memory. (default: False)
|
||
|
--pref_beta PREF_BETA
|
||
|
The beta parameter in the preference loss. (default:
|
||
|
0.1)
|
||
|
--pref_ftx PREF_FTX The supervised fine-tuning loss coefficient in DPO
|
||
|
training. (default: 0.0)
|
||
|
--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}
|
||
|
The type of DPO loss to use. (default: sigmoid)
|
||
|
--dpo_label_smoothing DPO_LABEL_SMOOTHING
|
||
|
The robust DPO label smoothing parameter in cDPO that
|
||
|
should be between 0 and 0.5. (default: 0.0)
|
||
|
--kto_chosen_weight KTO_CHOSEN_WEIGHT
|
||
|
The weight factor of the desirable losses in KTO
|
||
|
training. (default: 1.0)
|
||
|
--kto_rejected_weight KTO_REJECTED_WEIGHT
|
||
|
The weight factor of the undesirable losses in KTO
|
||
|
training. (default: 1.0)
|
||
|
--simpo_gamma SIMPO_GAMMA
|
||
|
The target reward margin term in SimPO loss. (default:
|
||
|
0.5)
|
||
|
  --ppo_buffer_size PPO_BUFFER_SIZE
                        The number of mini-batches that make up the
                        experience buffer in a PPO optimization step.
                        (default: 1)
  --ppo_epochs PPO_EPOCHS
                        The number of epochs to perform in a PPO
                        optimization step. (default: 4)
  --ppo_score_norm [PPO_SCORE_NORM]
                        Use score normalization in PPO training. (default:
                        False)
  --ppo_target PPO_TARGET
                        Target KL value for adaptive KL control in PPO
                        training. (default: 6.0)
  --ppo_whiten_rewards [PPO_WHITEN_REWARDS]
                        Whiten the rewards before computing advantages in
                        PPO training. (default: False)
  --ref_model REF_MODEL
                        Path to the reference model used for PPO or DPO
                        training. (default: None)
  --ref_model_adapters REF_MODEL_ADAPTERS
                        Path to the adapters of the reference model.
                        (default: None)
  --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT
                        The number of bits to quantize the reference model.
                        (default: None)
  --reward_model REWARD_MODEL
                        Path to the reward model used for PPO training.
                        (default: None)
  --reward_model_adapters REWARD_MODEL_ADAPTERS
                        Path to the adapters of the reward model. (default:
                        None)
  --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT
                        The number of bits to quantize the reward model.
                        (default: None)
  --reward_model_type {lora,full,api}
                        The type of the reward model in PPO training. The
                        `lora` type only supports LoRA training. (default:
                        lora)
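
A PPO run wires a reward model into the loop; a sketch under the defaults
above (paths are placeholders; that the reward model is a LoRA adapter
produced by an earlier rm-stage run is an assumption):

    python launcher.py \
        --model_name_or_path path/to/model --template default \
        --dataset your_dataset --output_dir saves/ppo \
        --do_train --stage ppo --finetuning_type lora \
        --reward_model path/to/reward_adapter --reward_model_type lora \
        --ppo_epochs 4 --ppo_target 6.0
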
  --additional_target ADDITIONAL_TARGET
                        Name(s) of modules, apart from LoRA layers, to be
                        set as trainable and saved in the final checkpoint.
                        Use commas to separate multiple modules. (default:
                        None)
  --lora_alpha LORA_ALPHA
                        The scale factor for LoRA fine-tuning (defaults to
                        lora_rank * 2). (default: None)
  --lora_dropout LORA_DROPOUT
                        Dropout rate for LoRA fine-tuning. (default: 0.0)
  --lora_rank LORA_RANK
                        The intrinsic dimension for LoRA fine-tuning.
                        (default: 8)
  --lora_target LORA_TARGET
                        Name(s) of target modules to apply LoRA to. Use
                        commas to separate multiple modules. Use `all` to
                        specify all the linear modules. (default: all)
  --loraplus_lr_ratio LORAPLUS_LR_RATIO
                        LoRA+ learning rate ratio (lr_B / lr_A). (default:
                        None)
  --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING
                        LoRA+ learning rate for the LoRA embedding layers.
                        (default: 1e-06)
  --use_rslora [USE_RSLORA]
                        Whether or not to use the rank stabilization scaling
                        factor for the LoRA layers. (default: False)
  --use_dora [USE_DORA]
                        Whether or not to use the weight-decomposed LoRA
                        method (DoRA). (default: False)
  --pissa_init [PISSA_INIT]
                        Whether or not to initialize a PiSSA adapter.
                        (default: False)
  --pissa_iter PISSA_ITER
                        The number of iteration steps performed by FSVD in
                        PiSSA. Use -1 to disable it. (default: 16)
  --pissa_convert [PISSA_CONVERT]
                        Whether or not to convert the PiSSA adapter to a
                        normal LoRA adapter. (default: False)
  --create_new_adapter [CREATE_NEW_ADAPTER]
                        Whether or not to create a new adapter with randomly
                        initialized weights. (default: False)
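
Taken together, a typical LoRA configuration reads like this sketch (paths
are placeholders; note that leaving --lora_alpha unset falls back to
lora_rank * 2, i.e. 16 for the default rank of 8):

    python launcher.py \
        --model_name_or_path path/to/model --template default \
        --dataset your_dataset --output_dir saves/lora_sft \
        --do_train --stage sft --finetuning_type lora \
        --lora_rank 8 --lora_target all --lora_dropout 0.0
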
  --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS
                        The number of trainable layers for freeze (partial-
                        parameter) fine-tuning. Positive numbers mean the
                        last n layers are set as trainable, negative numbers
                        mean the first n layers are set as trainable.
                        (default: 2)
  --freeze_trainable_modules FREEZE_TRAINABLE_MODULES
                        Name(s) of trainable modules for freeze (partial-
                        parameter) fine-tuning. Use commas to separate
                        multiple modules. Use `all` to specify all the
                        available modules. (default: all)
  --freeze_extra_modules FREEZE_EXTRA_MODULES
                        Name(s) of modules apart from hidden layers to be
                        set as trainable for freeze (partial-parameter)
                        fine-tuning. Use commas to separate multiple
                        modules. (default: None)
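
A freeze (partial-parameter) sketch that trains only the last two hidden
layers, per the defaults above (paths are placeholders):

    python launcher.py \
        --model_name_or_path path/to/model --template default \
        --dataset your_dataset --output_dir saves/freeze_sft \
        --do_train --stage sft --finetuning_type freeze \
        --freeze_trainable_layers 2 --freeze_trainable_modules all
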
  --pure_bf16 [PURE_BF16]
                        Whether or not to train the model in pure bf16
                        precision (without AMP). (default: False)
  --stage {pt,sft,rm,ppo,dpo,kto}
                        Which stage will be performed in training. (default:
                        sft)
  --finetuning_type {lora,freeze,full}
                        Which fine-tuning method to use. (default: lora)
  --use_llama_pro [USE_LLAMA_PRO]
                        Whether or not to make only the parameters in the
                        expanded blocks trainable. (default: False)
  --use_adam_mini [USE_ADAM_MINI]
                        Whether or not to use the Adam-mini optimizer.
                        (default: False)
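
The stage and fine-tuning method compose with the precision flag above;
for instance, a pure-bf16 full-parameter pre-training sketch (paths are
placeholders; bf16-capable hardware is assumed):

    python launcher.py \
        --model_name_or_path path/to/model \
        --dataset your_corpus --output_dir saves/pt_bf16 \
        --do_train --stage pt --finetuning_type full --pure_bf16
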
  --freeze_vision_tower [FREEZE_VISION_TOWER]
                        Whether or not to freeze the vision tower in MLLM
                        training. (default: True)
  --no_freeze_vision_tower
                        Whether or not to freeze the vision tower in MLLM
                        training. (default: False)
  --train_mm_proj_only [TRAIN_MM_PROJ_ONLY]
                        Whether or not to train only the multimodal
                        projector in MLLM training. (default: False)
  --compute_accuracy [COMPUTE_ACCURACY]
                        Whether or not to compute the token-level accuracy
                        at evaluation. (default: False)
  --plot_loss [PLOT_LOSS]
                        Whether or not to save the training loss curves.
                        (default: False)
  --do_sample [DO_SAMPLE]
                        Whether or not to use sampling; uses greedy decoding
                        otherwise. (default: True)
  --no_do_sample        Whether or not to use sampling; uses greedy decoding
                        otherwise. (default: False)
  --temperature TEMPERATURE
                        The value used to modulate the next token
                        probabilities. (default: 0.95)
  --top_p TOP_P         Only the smallest set of most probable tokens with
                        probabilities that add up to top_p or higher is
                        kept. (default: 0.7)
  --top_k TOP_K         The number of highest probability vocabulary tokens
                        to keep for top-k filtering. (default: 50)
  --num_beams NUM_BEAMS
                        Number of beams for beam search. 1 means no beam
                        search. (default: 1)
  --max_length MAX_LENGTH
                        The maximum length the generated tokens can have. It
                        can be overridden by max_new_tokens. (default: 1024)
  --max_new_tokens MAX_NEW_TOKENS
                        The maximum number of tokens to generate, ignoring
                        the number of tokens in the prompt. (default: 1024)
  --repetition_penalty REPETITION_PENALTY
                        The parameter for repetition penalty. 1.0 means no
                        penalty. (default: 1.0)
  --length_penalty LENGTH_PENALTY
                        Exponential penalty to the length that is used with
                        beam-based generation. (default: 1.0)
  --default_system DEFAULT_SYSTEM
                        Default system message to use in chat completion.
                        (default: None)
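
In a generation-based prediction run, these decoding flags shape the
sampled output; a sketch with sampling left at its documented defaults
(paths and the dataset name are placeholders):

    python launcher.py \
        --model_name_or_path path/to/model --template default \
        --eval_dataset your_eval_set --output_dir saves/predict \
        --do_predict --predict_with_generate \
        --do_sample --temperature 0.95 --top_p 0.7 --top_k 50 \
        --default_system "You are a helpful assistant."
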
usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH
|
||
|
[--adapter_name_or_path ADAPTER_NAME_OR_PATH]
|
||
|
[--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR]
|
||
|
[--use_fast_tokenizer [USE_FAST_TOKENIZER]]
|
||
|
[--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]]
|
||
|
[--split_special_tokens [SPLIT_SPECIAL_TOKENS]]
|
||
|
[--new_special_tokens NEW_SPECIAL_TOKENS]
|
||
|
[--model_revision MODEL_REVISION]
|
||
|
[--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]]
|
||
|
[--no_low_cpu_mem_usage]
|
||
|
[--quantization_method {bitsandbytes,hqq,eetq}]
|
||
|
[--quantization_bit QUANTIZATION_BIT]
|
||
|
[--quantization_type {fp4,nf4}]
|
||
|
[--double_quantization [DOUBLE_QUANTIZATION]]
|
||
|
[--no_double_quantization]
|
||
|
[--quantization_device_map {auto}]
|
||
|
[--rope_scaling {linear,dynamic}]
|
||
|
[--flash_attn {auto,disabled,sdpa,fa2}]
|
||
|
[--shift_attn [SHIFT_ATTN]]
|
||
|
[--mixture_of_depths {convert,load}]
|
||
|
[--use_unsloth [USE_UNSLOTH]]
|
||
|
[--visual_inputs [VISUAL_INPUTS]]
|
||
|
[--moe_aux_loss_coef MOE_AUX_LOSS_COEF]
|
||
|
[--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]]
|
||
|
[--upcast_layernorm [UPCAST_LAYERNORM]]
|
||
|
[--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]]
|
||
|
[--train_from_scratch [TRAIN_FROM_SCRATCH]]
|
||
|
[--infer_backend {huggingface,vllm}]
|
||
|
[--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL]
|
||
|
[--vllm_enforce_eager [VLLM_ENFORCE_EAGER]]
|
||
|
[--vllm_max_lora_rank VLLM_MAX_LORA_RANK]
|
||
|
[--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]]
|
||
|
[--no_use_cache]
|
||
|
[--infer_dtype {auto,float16,bfloat16,float32}]
|
||
|
[--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN]
|
||
|
[--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE]
|
||
|
[--export_device {cpu,auto}]
|
||
|
[--export_quantization_bit EXPORT_QUANTIZATION_BIT]
|
||
|
[--export_quantization_dataset EXPORT_QUANTIZATION_DATASET]
|
||
|
[--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES]
|
||
|
[--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN]
|
||
|
[--export_legacy_format [EXPORT_LEGACY_FORMAT]]
|
||
|
[--export_hub_model_id EXPORT_HUB_MODEL_ID]
|
||
|
[--print_param_status [PRINT_PARAM_STATUS]]
|
||
|
[--template TEMPLATE] [--dataset DATASET]
|
||
|
[--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR]
|
||
|
[--cutoff_len CUTOFF_LEN]
|
||
|
[--train_on_prompt [TRAIN_ON_PROMPT]]
|
||
|
[--mask_history [MASK_HISTORY]] [--streaming [STREAMING]]
|
||
|
[--buffer_size BUFFER_SIZE]
|
||
|
[--mix_strategy {concat,interleave_under,interleave_over}]
|
||
|
[--interleave_probs INTERLEAVE_PROBS]
|
||
|
[--overwrite_cache [OVERWRITE_CACHE]]
|
||
|
[--preprocessing_num_workers PREPROCESSING_NUM_WORKERS]
|
||
|
[--max_samples MAX_SAMPLES]
|
||
|
[--eval_num_beams EVAL_NUM_BEAMS]
|
||
|
[--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]]
|
||
|
[--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE]
|
||
|
[--packing PACKING] [--neat_packing [NEAT_PACKING]]
|
||
|
[--tool_format TOOL_FORMAT]
|
||
|
[--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR
|
||
|
[--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]]
|
||
|
[--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]
|
||
|
[--do_predict [DO_PREDICT]]
|
||
|
[--eval_strategy {no,steps,epoch}]
|
||
|
[--prediction_loss_only [PREDICTION_LOSS_ONLY]]
|
||
|
[--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE]
|
||
|
[--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]
|
||
|
[--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE]
|
||
|
[--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE]
|
||
|
[--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
|
||
|
[--eval_accumulation_steps EVAL_ACCUMULATION_STEPS]
|
||
|
[--eval_delay EVAL_DELAY]
|
||
|
[--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS]
|
||
|
[--learning_rate LEARNING_RATE]
|
||
|
[--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]
|
||
|
[--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON]
|
||
|
[--max_grad_norm MAX_GRAD_NORM]
|
||
|
[--num_train_epochs NUM_TRAIN_EPOCHS]
|
||
|
[--max_steps MAX_STEPS]
|
||
|
[--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}]
|
||
|
[--lr_scheduler_kwargs LR_SCHEDULER_KWARGS]
|
||
|
[--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS]
|
||
|
[--log_level {detail,debug,info,warning,error,critical,passive}]
|
||
|
[--log_level_replica {detail,debug,info,warning,error,critical,passive}]
|
||
|
[--log_on_each_node [LOG_ON_EACH_NODE]]
|
||
|
[--no_log_on_each_node] [--logging_dir LOGGING_DIR]
|
||
|
[--logging_strategy {no,steps,epoch}]
|
||
|
[--logging_first_step [LOGGING_FIRST_STEP]]
|
||
|
[--logging_steps LOGGING_STEPS]
|
||
|
[--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]]
|
||
|
[--no_logging_nan_inf_filter]
|
||
|
[--save_strategy {no,steps,epoch}]
|
||
|
[--save_steps SAVE_STEPS]
|
||
|
[--save_total_limit SAVE_TOTAL_LIMIT]
|
||
|
[--save_safetensors [SAVE_SAFETENSORS]]
|
||
|
[--no_save_safetensors]
|
||
|
[--save_on_each_node [SAVE_ON_EACH_NODE]]
|
||
|
[--save_only_model [SAVE_ONLY_MODEL]]
|
||
|
[--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]]
|
||
|
[--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]]
|
||
|
[--use_mps_device [USE_MPS_DEVICE]] [--seed SEED]
|
||
|
[--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]
|
||
|
[--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]]
|
||
|
[--fp16_opt_level FP16_OPT_LEVEL]
|
||
|
[--half_precision_backend {auto,apex,cpu_amp}]
|
||
|
[--bf16_full_eval [BF16_FULL_EVAL]]
|
||
|
[--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32]
|
||
|
[--local_rank LOCAL_RANK]
|
||
|
[--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}]
|
||
|
[--tpu_num_cores TPU_NUM_CORES]
|
||
|
[--tpu_metrics_debug [TPU_METRICS_DEBUG]]
|
||
|
[--debug DEBUG [DEBUG ...]]
|
||
|
[--dataloader_drop_last [DATALOADER_DROP_LAST]]
|
||
|
[--eval_steps EVAL_STEPS]
|
||
|
[--dataloader_num_workers DATALOADER_NUM_WORKERS]
|
||
|
[--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR]
|
||
|
[--past_index PAST_INDEX] [--run_name RUN_NAME]
|
||
|
[--disable_tqdm DISABLE_TQDM]
|
||
|
[--remove_unused_columns [REMOVE_UNUSED_COLUMNS]]
|
||
|
[--no_remove_unused_columns]
|
||
|
[--label_names LABEL_NAMES [LABEL_NAMES ...]]
|
||
|
[--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]]
|
||
|
[--metric_for_best_model METRIC_FOR_BEST_MODEL]
|
||
|
[--greater_is_better GREATER_IS_BETTER]
|
||
|
[--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP]
|
||
|
[--fsdp_min_num_params FSDP_MIN_NUM_PARAMS]
|
||
|
[--fsdp_config FSDP_CONFIG]
|
||
|
[--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP]
|
||
|
[--accelerator_config ACCELERATOR_CONFIG]
|
||
|
[--deepspeed DEEPSPEED]
|
||
|
[--label_smoothing_factor LABEL_SMOOTHING_FACTOR]
|
||
|
[--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}]
|
||
|
[--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]]
|
||
|
[--group_by_length [GROUP_BY_LENGTH]]
|
||
|
[--length_column_name LENGTH_COLUMN_NAME]
|
||
|
[--report_to REPORT_TO]
|
||
|
[--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS]
|
||
|
[--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB]
|
||
|
[--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS]
|
||
|
[--dataloader_pin_memory [DATALOADER_PIN_MEMORY]]
|
||
|
[--no_dataloader_pin_memory]
|
||
|
[--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]]
|
||
|
[--skip_memory_metrics [SKIP_MEMORY_METRICS]]
|
||
|
[--no_skip_memory_metrics]
|
||
|
[--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]]
|
||
|
[--push_to_hub [PUSH_TO_HUB]]
|
||
|
[--resume_from_checkpoint RESUME_FROM_CHECKPOINT]
|
||
|
[--hub_model_id HUB_MODEL_ID]
|
||
|
[--hub_strategy {end,every_save,checkpoint,all_checkpoints}]
|
||
|
[--hub_token HUB_TOKEN]
|
||
|
[--hub_private_repo [HUB_PRIVATE_REPO]]
|
||
|
[--hub_always_push [HUB_ALWAYS_PUSH]]
|
||
|
[--gradient_checkpointing [GRADIENT_CHECKPOINTING]]
|
||
|
[--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS]
|
||
|
[--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]]
|
||
|
[--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]]
|
||
|
[--no_eval_do_concat_batches]
|
||
|
[--fp16_backend {auto,apex,cpu_amp}]
|
||
|
[--evaluation_strategy {no,steps,epoch}]
|
||
|
[--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]
|
||
|
[--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION]
|
||
|
[--push_to_hub_token PUSH_TO_HUB_TOKEN]
|
||
|
[--mp_parameters MP_PARAMETERS]
|
||
|
[--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]]
|
||
|
[--full_determinism [FULL_DETERMINISM]]
|
||
|
[--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE]
|
||
|
[--ddp_timeout DDP_TIMEOUT]
|
||
|
[--torch_compile [TORCH_COMPILE]]
|
||
|
[--torch_compile_backend TORCH_COMPILE_BACKEND]
|
||
|
[--torch_compile_mode TORCH_COMPILE_MODE]
|
||
|
[--dispatch_batches DISPATCH_BATCHES]
|
||
|
[--split_batches SPLIT_BATCHES]
|
||
|
[--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]]
|
||
|
[--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]]
|
||
|
[--neftune_noise_alpha NEFTUNE_NOISE_ALPHA]
|
||
|
[--optim_target_modules OPTIM_TARGET_MODULES]
|
||
|
[--batch_eval_metrics [BATCH_EVAL_METRICS]]
|
||
|
[--eval_on_start [EVAL_ON_START]]
|
||
|
[--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]]
|
||
|
[--sortish_sampler [SORTISH_SAMPLER]]
|
||
|
[--predict_with_generate [PREDICT_WITH_GENERATE]]
|
||
|
[--generation_max_length GENERATION_MAX_LENGTH]
|
||
|
[--generation_num_beams GENERATION_NUM_BEAMS]
|
||
|
[--generation_config GENERATION_CONFIG]
|
||
|
[--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}]
|
||
|
[--badam_start_block BADAM_START_BLOCK]
|
||
|
[--badam_switch_mode {ascending,descending,random,fixed}]
|
||
|
[--badam_switch_interval BADAM_SWITCH_INTERVAL]
|
||
|
[--badam_update_ratio BADAM_UPDATE_RATIO]
|
||
|
[--badam_mask_mode {adjacent,scatter}]
|
||
|
[--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]]
|
||
|
[--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK]
|
||
|
[--galore_update_interval GALORE_UPDATE_INTERVAL]
|
||
|
[--galore_scale GALORE_SCALE]
|
||
|
[--galore_proj_type {std,reverse_std,right,left,full}]
|
||
|
[--galore_layerwise [GALORE_LAYERWISE]]
|
||
|
[--pref_beta PREF_BETA] [--pref_ftx PREF_FTX]
|
||
|
[--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}]
|
||
|
[--dpo_label_smoothing DPO_LABEL_SMOOTHING]
|
||
|
[--kto_chosen_weight KTO_CHOSEN_WEIGHT]
|
||
|
[--kto_rejected_weight KTO_REJECTED_WEIGHT]
|
||
|
[--simpo_gamma SIMPO_GAMMA]
|
||
|
[--ppo_buffer_size PPO_BUFFER_SIZE]
|
||
|
[--ppo_epochs PPO_EPOCHS]
|
||
|
[--ppo_score_norm [PPO_SCORE_NORM]]
|
||
|
[--ppo_target PPO_TARGET]
|
||
|
[--ppo_whiten_rewards [PPO_WHITEN_REWARDS]]
|
||
|
[--ref_model REF_MODEL]
|
||
|
[--ref_model_adapters REF_MODEL_ADAPTERS]
|
||
|
[--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT]
|
||
|
[--reward_model REWARD_MODEL]
|
||
|
[--reward_model_adapters REWARD_MODEL_ADAPTERS]
|
||
|
[--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT]
|
||
|
[--reward_model_type {lora,full,api}]
|
||
|
[--additional_target ADDITIONAL_TARGET]
|
||
|
[--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT]
|
||
|
[--lora_rank LORA_RANK] [--lora_target LORA_TARGET]
|
||
|
[--loraplus_lr_ratio LORAPLUS_LR_RATIO]
|
||
|
[--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING]
|
||
|
[--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]]
|
||
|
[--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER]
|
||
|
[--pissa_convert [PISSA_CONVERT]]
|
||
|
[--create_new_adapter [CREATE_NEW_ADAPTER]]
|
||
|
[--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS]
|
||
|
[--freeze_trainable_modules FREEZE_TRAINABLE_MODULES]
|
||
|
[--freeze_extra_modules FREEZE_EXTRA_MODULES]
|
||
|
[--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}]
|
||
|
[--finetuning_type {lora,freeze,full}]
|
||
|
[--use_llama_pro [USE_LLAMA_PRO]]
|
||
|
[--use_adam_mini [USE_ADAM_MINI]]
|
||
|
[--freeze_vision_tower [FREEZE_VISION_TOWER]]
|
||
|
[--no_freeze_vision_tower]
|
||
|
[--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]]
|
||
|
[--compute_accuracy [COMPUTE_ACCURACY]]
|
||
|
[--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]]
|
||
|
[--no_do_sample] [--temperature TEMPERATURE]
|
||
|
[--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS]
|
||
|
[--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS]
|
||
|
[--repetition_penalty REPETITION_PENALTY]
|
||
|
[--length_penalty LENGTH_PENALTY]
|
||
|
[--default_system DEFAULT_SYSTEM]
|
||
|
|
||
|
optional arguments:
|
||
|
-h, --help show this help message and exit
|
||
|
--model_name_or_path MODEL_NAME_OR_PATH
|
||
|
Path to the model weight or identifier from
|
||
|
huggingface.co/models or modelscope.cn/models.
|
||
|
(default: None)
|
||
|
--adapter_name_or_path ADAPTER_NAME_OR_PATH
|
||
|
Path to the adapter weight or identifier from
|
||
|
huggingface.co/models. Use commas to separate multiple
|
||
|
adapters. (default: None)
|
||
|
--adapter_folder ADAPTER_FOLDER
|
||
|
The folder containing the adapter weights to load.
|
||
|
(default: None)
|
||
|
--cache_dir CACHE_DIR
|
||
|
Where to store the pre-trained models downloaded from
|
||
|
huggingface.co or modelscope.cn. (default: None)
|
||
|
--use_fast_tokenizer [USE_FAST_TOKENIZER]
|
||
|
Whether or not to use one of the fast tokenizer
|
||
|
(backed by the tokenizers library). (default: True)
|
||
|
--no_use_fast_tokenizer
|
||
|
Whether or not to use one of the fast tokenizer
|
||
|
(backed by the tokenizers library). (default: False)
|
||
|
--resize_vocab [RESIZE_VOCAB]
|
||
|
Whether or not to resize the tokenizer vocab and the
|
||
|
embedding layers. (default: False)
|
||
|
--split_special_tokens [SPLIT_SPECIAL_TOKENS]
|
||
|
Whether or not the special tokens should be split
|
||
|
during the tokenization process. (default: False)
|
||
|
--new_special_tokens NEW_SPECIAL_TOKENS
|
||
|
Special tokens to be added into the tokenizer. Use
|
||
|
commas to separate multiple tokens. (default: None)
|
||
|
--model_revision MODEL_REVISION
|
||
|
The specific model version to use (can be a branch
|
||
|
name, tag name or commit id). (default: main)
|
||
|
--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]
|
||
|
Whether or not to use memory-efficient model loading.
|
||
|
(default: True)
|
||
|
--no_low_cpu_mem_usage
|
||
|
Whether or not to use memory-efficient model loading.
|
||
|
(default: False)
|
||
|
--quantization_method {bitsandbytes,hqq,eetq}
|
||
|
Quantization method to use for on-the-fly
|
||
|
quantization. (default: bitsandbytes)
|
||
|
--quantization_bit QUANTIZATION_BIT
|
||
|
The number of bits to quantize the model using
|
||
|
bitsandbytes. (default: None)
|
||
|
--quantization_type {fp4,nf4}
|
||
|
Quantization data type to use in int4 training.
|
||
|
(default: nf4)
|
||
|
--double_quantization [DOUBLE_QUANTIZATION]
|
||
|
Whether or not to use double quantization in int4
|
||
|
training. (default: True)
|
||
|
--no_double_quantization
|
||
|
Whether or not to use double quantization in int4
|
||
|
training. (default: False)
|
||
|
--quantization_device_map {auto}
|
||
|
Device map used to infer the 4-bit quantized model,
|
||
|
needs bitsandbytes>=0.43.0. (default: None)
|
||
|
--rope_scaling {linear,dynamic}
|
||
|
Which scaling strategy should be adopted for the RoPE
|
||
|
embeddings. (default: None)
|
||
|
--flash_attn {auto,disabled,sdpa,fa2}
|
||
|
Enable FlashAttention for faster training and
|
||
|
inference. (default: auto)
|
||
|
--shift_attn [SHIFT_ATTN]
|
||
|
Enable shift short attention (S^2-Attn) proposed by
|
||
|
LongLoRA. (default: False)
|
||
|
--mixture_of_depths {convert,load}
|
||
|
Convert the model to mixture-of-depths (MoD) or load
|
||
|
the MoD model. (default: None)
|
||
|
--use_unsloth [USE_UNSLOTH]
|
||
|
Whether or not to use unsloth's optimization for the
|
||
|
LoRA training. (default: False)
|
||
|
--visual_inputs [VISUAL_INPUTS]
|
||
|
Whethor or not to use multimodal LLM that accepts
|
||
|
visual inputs. (default: False)
|
||
|
--moe_aux_loss_coef MOE_AUX_LOSS_COEF
|
||
|
Coefficient of the auxiliary router loss in mixture-
|
||
|
of-experts model. (default: None)
|
||
|
--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]
|
||
|
Whether or not to disable gradient checkpointing.
|
||
|
(default: False)
|
||
|
--upcast_layernorm [UPCAST_LAYERNORM]
|
||
|
Whether or not to upcast the layernorm weights in
|
||
|
fp32. (default: False)
|
||
|
--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]
|
||
|
Whether or not to upcast the output of lm_head in
|
||
|
fp32. (default: False)
|
||
|
--train_from_scratch [TRAIN_FROM_SCRATCH]
|
||
|
Whether or not to randomly initialize the model
|
||
|
weights. (default: False)
|
||
|
--infer_backend {huggingface,vllm}
|
||
|
Backend engine used at inference. (default:
|
||
|
huggingface)
|
||
|
--vllm_maxlen VLLM_MAXLEN
|
||
|
Maximum sequence (prompt + response) length of the
|
||
|
vLLM engine. (default: 2048)
|
||
|
--vllm_gpu_util VLLM_GPU_UTIL
|
||
|
The fraction of GPU memory in (0,1) to be used for the
|
||
|
vLLM engine. (default: 0.9)
|
||
|
--vllm_enforce_eager [VLLM_ENFORCE_EAGER]
|
||
|
Whether or not to disable CUDA graph in the vLLM
|
||
|
engine. (default: False)
|
||
|
--vllm_max_lora_rank VLLM_MAX_LORA_RANK
|
||
|
Maximum rank of all LoRAs in the vLLM engine.
|
||
|
(default: 32)
|
||
|
--offload_folder OFFLOAD_FOLDER
|
||
|
Path to offload model weights. (default: offload)
|
||
|
--use_cache [USE_CACHE]
|
||
|
Whether or not to use KV cache in generation.
|
||
|
(default: True)
|
||
|
--no_use_cache Whether or not to use KV cache in generation.
|
||
|
(default: False)
|
||
|
--infer_dtype {auto,float16,bfloat16,float32}
|
||
|
Data type for model weights and activations at
|
||
|
inference. (default: auto)
|
||
|
--hf_hub_token HF_HUB_TOKEN
|
||
|
Auth token to log in with Hugging Face Hub. (default:
|
||
|
None)
|
||
|
--ms_hub_token MS_HUB_TOKEN
|
||
|
Auth token to log in with ModelScope Hub. (default:
|
||
|
None)
|
||
|
--export_dir EXPORT_DIR
|
||
|
Path to the directory to save the exported model.
|
||
|
(default: None)
|
||
|
--export_size EXPORT_SIZE
|
||
|
The file shard size (in GB) of the exported model.
|
||
|
(default: 1)
|
||
|
--export_device {cpu,auto}
|
||
|
The device used in model export, use `auto` to
|
||
|
accelerate exporting. (default: cpu)
|
||
|
--export_quantization_bit EXPORT_QUANTIZATION_BIT
|
||
|
The number of bits to quantize the exported model.
|
||
|
(default: None)
|
||
|
--export_quantization_dataset EXPORT_QUANTIZATION_DATASET
|
||
|
Path to the dataset or dataset name to use in
|
||
|
quantizing the exported model. (default: None)
|
||
|
--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES
|
||
|
The number of samples used for quantization. (default:
|
||
|
128)
|
||
|
--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN
|
||
|
The maximum length of the model inputs used for
|
||
|
quantization. (default: 1024)
|
||
|
--export_legacy_format [EXPORT_LEGACY_FORMAT]
|
||
|
Whether or not to save the `.bin` files instead of
|
||
|
`.safetensors`. (default: False)
|
||
|
--export_hub_model_id EXPORT_HUB_MODEL_ID
|
||
|
The name of the repository if push the model to the
|
||
|
Hugging Face hub. (default: None)
|
||
|
--print_param_status [PRINT_PARAM_STATUS]
|
||
|
For debugging purposes, print the status of the
|
||
|
parameters in the model. (default: False)
|
||
|
--template TEMPLATE Which template to use for constructing prompts in
|
||
|
training and inference. (default: None)
|
||
|
--dataset DATASET The name of dataset(s) to use for training. Use commas
|
||
|
to separate multiple datasets. (default: None)
|
||
|
--eval_dataset EVAL_DATASET
|
||
|
The name of dataset(s) to use for evaluation. Use
|
||
|
commas to separate multiple datasets. (default: None)
|
||
|
--dataset_dir DATASET_DIR
|
||
|
Path to the folder containing the datasets. (default:
|
||
|
data)
|
||
|
--cutoff_len CUTOFF_LEN
|
||
|
The cutoff length of the tokenized inputs in the
|
||
|
dataset. (default: 1024)
|
||
|
--train_on_prompt [TRAIN_ON_PROMPT]
|
||
|
Whether or not to disable the mask on the prompt.
|
||
|
(default: False)
|
||
|
--mask_history [MASK_HISTORY]
|
||
|
Whether or not to mask the history and train on the
|
||
|
last turn only. (default: False)
|
||
|
--streaming [STREAMING]
|
||
|
Enable dataset streaming. (default: False)
|
||
|
--buffer_size BUFFER_SIZE
|
||
|
Size of the buffer to randomly sample examples from in
|
||
|
dataset streaming. (default: 16384)
|
||
|
--mix_strategy {concat,interleave_under,interleave_over}
|
||
|
Strategy to use in dataset mixing (concat/interleave)
|
||
|
(undersampling/oversampling). (default: concat)
|
||
|
--interleave_probs INTERLEAVE_PROBS
|
||
|
Probabilities to sample data from datasets. Use commas
|
||
|
to separate multiple datasets. (default: None)
|
||
|
--overwrite_cache [OVERWRITE_CACHE]
|
||
|
Overwrite the cached training and evaluation sets.
|
||
|
(default: False)
|
||
|
--preprocessing_num_workers PREPROCESSING_NUM_WORKERS
|
||
|
The number of processes to use for the pre-processing.
|
||
|
(default: None)
|
||
|
--max_samples MAX_SAMPLES
|
||
|
For debugging purposes, truncate the number of
|
||
|
examples for each dataset. (default: None)
|
||
|
--eval_num_beams EVAL_NUM_BEAMS
|
||
|
Number of beams to use for evaluation. This argument
|
||
|
will be passed to `model.generate` (default: None)
|
||
|
--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]
|
||
|
Whether or not to ignore the tokens corresponding to
|
||
|
the pad label in loss computation. (default: True)
|
||
|
--no_ignore_pad_token_for_loss
|
||
|
Whether or not to ignore the tokens corresponding to
|
||
|
the pad label in loss computation. (default: False)
|
||
|
--val_size VAL_SIZE Size of the development set, should be an integer or a
|
||
|
float in range `[0,1)`. (default: 0.0)
|
||
|
--packing PACKING Enable sequences packing in training. Will
|
||
|
automatically enable in pre-training. (default: None)
|
||
|
--neat_packing [NEAT_PACKING]
|
||
|
Enable sequence packing without cross-attention.
|
||
|
(default: False)
|
||
|
--tool_format TOOL_FORMAT
|
||
|
Tool format to use for constructing function calling
|
||
|
examples. (default: None)
|
||
|
--tokenized_path TOKENIZED_PATH
|
||
|
Path to save or load the tokenized datasets. (default:
|
||
|
None)
|
||
|
--output_dir OUTPUT_DIR
|
||
|
The output directory where the model predictions and
|
||
|
checkpoints will be written. (default: None)
|
||
|
--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]
|
||
|
Overwrite the content of the output directory. Use
|
||
|
this to continue training if output_dir points to a
|
||
|
checkpoint directory. (default: False)
|
||
|
--do_train [DO_TRAIN]
|
||
|
Whether to run training. (default: False)
|
||
|
--do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False)
|
||
|
--do_predict [DO_PREDICT]
|
||
|
Whether to run predictions on the test set. (default:
|
||
|
False)
|
||
|
--eval_strategy {no,steps,epoch}
|
||
|
The evaluation strategy to use. (default: no)
|
||
|
--prediction_loss_only [PREDICTION_LOSS_ONLY]
|
||
|
When performing evaluation and predictions, only
|
||
|
returns the loss. (default: False)
|
||
|
--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE
|
||
|
Batch size per GPU/TPU/MPS/NPU core/CPU for training.
|
||
|
(default: 8)
|
||
|
--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE
|
||
|
Batch size per GPU/TPU/MPS/NPU core/CPU for
|
||
|
evaluation. (default: 8)
|
||
|
--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE
|
||
|
Deprecated, the use of `--per_device_train_batch_size`
|
||
|
is preferred. Batch size per GPU/TPU core/CPU for
|
||
|
training. (default: None)
|
||
|
--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE
|
||
|
Deprecated, the use of `--per_device_eval_batch_size`
|
||
|
is preferred. Batch size per GPU/TPU core/CPU for
|
||
|
evaluation. (default: None)
|
||
|
--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS
|
||
|
Number of updates steps to accumulate before
|
||
|
performing a backward/update pass. (default: 1)
|
||
|
--eval_accumulation_steps EVAL_ACCUMULATION_STEPS
|
||
|
Number of predictions steps to accumulate before
|
||
|
moving the tensors to the CPU. (default: None)
|
||
|
--eval_delay EVAL_DELAY
|
||
|
Number of epochs or steps to wait for before the first
|
||
|
evaluation can be performed, depending on the
|
||
|
eval_strategy. (default: 0)
|
||
|
--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS
|
||
|
Number of steps to wait before calling
|
||
|
`torch.<device>.empty_cache()`.This can help avoid
|
||
|
CUDA out-of-memory errors by lowering peak VRAM usage
|
||
|
at a cost of about [10{'option_strings': ['--
|
||
|
torch_empty_cache_steps'], 'dest':
|
||
|
'torch_empty_cache_steps', 'nargs': None, 'const':
|
||
|
None, 'default': None, 'type': 'int', 'choices': None,
|
||
|
'required': False, 'help': 'Number of steps to wait
|
||
|
before calling `torch.<device>.empty_cache()`.This can
|
||
|
help avoid CUDA out-of-memory errors by lowering peak
|
||
|
VRAM usage at a cost of about [10% slower performance]
|
||
|
(https://github.com/huggingface/transformers/issues/31
|
||
|
372).If left unset or set to None, cache will not be
|
||
|
emptied.', 'metavar': None, 'container':
|
||
|
<argparse._ArgumentGroup object at 0x7ff0cf26fee0>,
|
||
|
'prog': 'launcher.py'}lower performance](https://githu
|
||
|
b.com/huggingface/transformers/issues/31372).If left
|
||
|
unset or set to None, cache will not be emptied.
|
||
|
(default: None)
|
||
|
--learning_rate LEARNING_RATE
|
||
|
The initial learning rate for AdamW. (default: 5e-05)
|
||
|
--weight_decay WEIGHT_DECAY
|
||
|
Weight decay for AdamW if we apply some. (default:
|
||
|
0.0)
|
||
|
--adam_beta1 ADAM_BETA1
|
||
|
Beta1 for AdamW optimizer (default: 0.9)
|
||
|
--adam_beta2 ADAM_BETA2
|
||
|
Beta2 for AdamW optimizer (default: 0.999)
|
||
|
--adam_epsilon ADAM_EPSILON
|
||
|
Epsilon for AdamW optimizer. (default: 1e-08)
|
||
|
--max_grad_norm MAX_GRAD_NORM
|
||
|
Max gradient norm. (default: 1.0)
|
||
|
--num_train_epochs NUM_TRAIN_EPOCHS
|
||
|
Total number of training epochs to perform. (default:
|
||
|
3.0)
|
||
|
--max_steps MAX_STEPS
|
||
|
If > 0: set total number of training steps to perform.
|
||
|
Override num_train_epochs. (default: -1)
|
||
|
--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}
|
||
|
The scheduler type to use. (default: linear)
|
||
|
--lr_scheduler_kwargs LR_SCHEDULER_KWARGS
|
||
|
Extra parameters for the lr_scheduler such as
|
||
|
{'num_cycles': 1} for the cosine with hard restarts.
|
||
|
(default: {})
|
||
|
--warmup_ratio WARMUP_RATIO
|
||
|
Linear warmup over warmup_ratio fraction of total
|
||
|
steps. (default: 0.0)
|
||
|
--warmup_steps WARMUP_STEPS
|
||
|
Linear warmup over warmup_steps. (default: 0)
|
||
|
--log_level {detail,debug,info,warning,error,critical,passive}
|
||
|
Logger log level to use on the main node. Possible
|
||
|
choices are the log levels as strings: 'debug',
|
||
|
'info', 'warning', 'error' and 'critical', plus a
|
||
|
'passive' level which doesn't set anything and lets
|
||
|
the application set the level. Defaults to 'passive'.
|
||
|
(default: passive)
|
||
|
--log_level_replica {detail,debug,info,warning,error,critical,passive}
|
||
|
Logger log level to use on replica nodes. Same choices
|
||
|
and defaults as ``log_level`` (default: warning)
|
||
|
--log_on_each_node [LOG_ON_EACH_NODE]
|
||
|
When doing a multinode distributed training, whether
|
||
|
to log once per node or just once on the main node.
|
||
|
(default: True)
|
||
|
--no_log_on_each_node
|
||
|
When doing a multinode distributed training, whether
|
||
|
to log once per node or just once on the main node.
|
||
|
(default: False)
|
||
|
--logging_dir LOGGING_DIR
|
||
|
Tensorboard log dir. (default: None)
|
||
|
--logging_strategy {no,steps,epoch}
|
||
|
The logging strategy to use. (default: steps)
|
||
|
--logging_first_step [LOGGING_FIRST_STEP]
|
||
|
Log the first global_step (default: False)
|
||
|
--logging_steps LOGGING_STEPS
|
||
|
Log every X updates steps. Should be an integer or a
|
||
|
float in range `[0,1)`. If smaller than 1, will be
|
||
|
interpreted as ratio of total training steps.
|
||
|
(default: 500)
|
||
|
--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]
|
||
|
Filter nan and inf losses for logging. (default: True)
|
||
|
--no_logging_nan_inf_filter
|
||
|
Filter nan and inf losses for logging. (default:
|
||
|
False)
|
||
|
--save_strategy {no,steps,epoch}
|
||
|
The checkpoint save strategy to use. (default: steps)
|
||
|
--save_steps SAVE_STEPS
|
||
|
Save checkpoint every X updates steps. Should be an
|
||
|
integer or a float in range `[0,1)`. If smaller than
|
||
|
1, will be interpreted as ratio of total training
|
||
|
steps. (default: 500)
|
||
|
--save_total_limit SAVE_TOTAL_LIMIT
|
||
|
If a value is passed, will limit the total amount of
|
||
|
checkpoints. Deletes the older checkpoints in
|
||
|
`output_dir`. When `load_best_model_at_end` is
|
||
|
enabled, the 'best' checkpoint according to
|
||
|
`metric_for_best_model` will always be retained in
|
||
|
addition to the most recent ones. For example, for
|
||
|
`save_total_limit=5` and
|
||
|
`load_best_model_at_end=True`, the four last
|
||
|
checkpoints will always be retained alongside the best
|
||
|
model. When `save_total_limit=1` and
|
||
|
`load_best_model_at_end=True`, it is possible that two
|
||
|
checkpoints are saved: the last one and the best one
|
||
|
(if they are different). Default is unlimited
|
||
|
checkpoints (default: None)
|
||
|
--save_safetensors [SAVE_SAFETENSORS]
|
||
|
Use safetensors saving and loading for state dicts
|
||
|
instead of default torch.load and torch.save.
|
||
|
(default: True)
|
||
|
--no_save_safetensors
|
||
|
Use safetensors saving and loading for state dicts
|
||
|
instead of default torch.load and torch.save.
|
||
|
(default: False)
|
||
|
--save_on_each_node [SAVE_ON_EACH_NODE]
|
||
|
When doing multi-node distributed training, whether to
|
||
|
save models and checkpoints on each node, or only on
|
||
|
the main one (default: False)
|
||
|
--save_only_model [SAVE_ONLY_MODEL]
|
||
|
When checkpointing, whether to only save the model, or
|
||
|
also the optimizer, scheduler & rng state.Note that
|
||
|
when this is true, you won't be able to resume
|
||
|
training from checkpoint.This enables you to save
|
||
|
storage by not storing the optimizer, scheduler & rng
|
||
|
state.You can only load the model using
|
||
|
from_pretrained with this option set to True.
|
||
|
(default: False)
|
||
|
--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]
|
||
|
Whether to restore the callback states from the
|
||
|
checkpoint. If `True`, will override callbacks passed
|
||
|
to the `Trainer` if they exist in the checkpoint.
|
||
|
(default: False)
|
||
|
--no_cuda [NO_CUDA] This argument is deprecated. It will be removed in
|
||
|
version 5.0 of 🤗 Transformers. (default: False)
|
||
|
--use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will
|
||
|
use cuda/tpu/mps/npu device if available. (default:
|
||
|
False)
|
||
|
--use_mps_device [USE_MPS_DEVICE]
|
||
|
This argument is deprecated. `mps` device will be used
|
||
|
if available similar to `cuda` device. It will be
|
||
|
removed in version 5.0 of 🤗 Transformers (default:
|
||
|
False)
|
||
|
--seed SEED Random seed that will be set at the beginning of
|
||
|
training. (default: 42)
|
||
|
--data_seed DATA_SEED
|
||
|
Random seed to be used with data samplers. (default:
|
||
|
None)
|
||
|
--jit_mode_eval [JIT_MODE_EVAL]
|
||
|
Whether or not to use PyTorch jit trace for inference
|
||
|
(default: False)
|
||
|
--use_ipex [USE_IPEX]
|
||
|
Use Intel extension for PyTorch when it is available,
|
||
|
installation: 'https://github.com/intel/intel-
|
||
|
extension-for-pytorch' (default: False)
|
||
|
--bf16 [BF16] Whether to use bf16 (mixed) precision instead of
|
||
|
32-bit. Requires Ampere or higher NVIDIA architecture
|
||
|
or using CPU (use_cpu) or Ascend NPU. This is an
|
||
|
experimental API and it may change. (default: False)
|
||
|
--fp16 [FP16] Whether to use fp16 (mixed) precision instead of
|
||
|
32-bit (default: False)
|
||
|
--fp16_opt_level FP16_OPT_LEVEL
|
||
|
For fp16: Apex AMP optimization level selected in
|
||
|
['O0', 'O1', 'O2', and 'O3']. See details at
|
||
|
https://nvidia.github.io/apex/amp.html (default: O1)
|
||
|
--half_precision_backend {auto,apex,cpu_amp}
|
||
|
The backend to be used for half precision. (default:
|
||
|
auto)
|
||
|
--bf16_full_eval [BF16_FULL_EVAL]
|
||
|
Whether to use full bfloat16 evaluation instead of
|
||
|
32-bit. This is an experimental API and it may change.
|
||
|
(default: False)
|
||
|
--fp16_full_eval [FP16_FULL_EVAL]
|
||
|
Whether to use full float16 evaluation instead of
|
||
|
32-bit (default: False)
|
||
|
--tf32 TF32 Whether to enable tf32 mode, available in Ampere and
|
||
|
newer GPU architectures. This is an experimental API
|
||
|
and it may change. (default: None)
|
||
|
--local_rank LOCAL_RANK
|
||
|
For distributed training: local_rank (default: -1)
|
||
|
--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}
|
||
|
The backend to be used for distributed training
|
||
|
(default: None)
|
||
|
--tpu_num_cores TPU_NUM_CORES
|
||
|
TPU: Number of TPU cores (automatically passed by
|
||
|
launcher script) (default: None)
|
||
|
--tpu_metrics_debug [TPU_METRICS_DEBUG]
|
||
|
Deprecated, the use of `--debug tpu_metrics_debug` is
|
||
|
preferred. TPU: Whether to print debug metrics
|
||
|
(default: False)
|
||
|
--debug DEBUG [DEBUG ...]
|
||
|
Whether or not to enable debug mode. Current options:
|
||
|
`underflow_overflow` (Detect underflow and overflow in
|
||
|
activations and weights), `tpu_metrics_debug` (print
|
||
|
debug metrics on TPU). (default: None)
|
||
|
--dataloader_drop_last [DATALOADER_DROP_LAST]
|
||
|
Drop the last incomplete batch if it is not divisible
|
||
|
by the batch size. (default: False)
|
||
|
--eval_steps EVAL_STEPS
|
||
|
Run an evaluation every X steps. Should be an integer
|
||
|
or a float in range `[0,1)`. If smaller than 1, will
|
||
|
be interpreted as ratio of total training steps.
|
||
|
(default: None)
|
||
|
--dataloader_num_workers DATALOADER_NUM_WORKERS
|
||
|
Number of subprocesses to use for data loading
|
||
|
(PyTorch only). 0 means that the data will be loaded
|
||
|
in the main process. (default: 0)
|
||
|
--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR
|
||
|
Number of batches loaded in advance by each worker. 2
|
||
|
means there will be a total of 2 * num_workers batches
|
||
|
prefetched across all workers. Default is 2 for
|
||
|
PyTorch < 2.0.0 and otherwise None. (default: None)
|
||
|
--past_index PAST_INDEX
|
||
|
If >=0, uses the corresponding part of the output as
|
||
|
the past state for next step. (default: -1)
|
||
|
--run_name RUN_NAME An optional descriptor for the run. Notably used for
|
||
|
wandb, mlflow and comet logging. (default: None)
|
||
|
--disable_tqdm DISABLE_TQDM
|
||
|
Whether or not to disable the tqdm progress bars.
|
||
|
(default: None)
|
||
|
--remove_unused_columns [REMOVE_UNUSED_COLUMNS]
|
||
|
Remove columns not required by the model when using an
|
||
|
nlp.Dataset. (default: True)
|
||
|
--no_remove_unused_columns
|
||
|
Remove columns not required by the model when using an
|
||
|
nlp.Dataset. (default: False)
|
||
|
--label_names LABEL_NAMES [LABEL_NAMES ...]
|
||
|
The list of keys in your dictionary of inputs that
|
||
|
correspond to the labels. (default: None)
|
||
|
--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]
|
||
|
Whether or not to load the best model found during
|
||
|
training at the end of training. When this option is
|
||
|
enabled, the best checkpoint will always be saved. See
|
||
|
`save_total_limit` for more. (default: False)
|
||
|
--metric_for_best_model METRIC_FOR_BEST_MODEL
|
||
|
The metric to use to compare two different models.
|
||
|
(default: None)
|
||
|
--greater_is_better GREATER_IS_BETTER
|
||
|
Whether the `metric_for_best_model` should be
|
||
|
maximized or not. (default: None)
|
||
|
--ignore_data_skip [IGNORE_DATA_SKIP]
|
||
|
When resuming training, whether or not to skip the
|
||
|
first epochs and batches to get to the same training
|
||
|
data. (default: False)
|
||
|
--fsdp FSDP Whether or not to use PyTorch Fully Sharded Data
|
||
|
Parallel (FSDP) training (in distributed training
|
||
|
only). The base option should be `full_shard`,
|
||
|
`shard_grad_op` or `no_shard` and you can add CPU-
|
||
|
offload to `full_shard` or `shard_grad_op` like this:
|
||
|
full_shard offload` or `shard_grad_op offload`. You
|
||
|
can add auto-wrap to `full_shard` or `shard_grad_op`
|
||
|
with the same syntax: full_shard auto_wrap` or
|
||
|
`shard_grad_op auto_wrap`. (default: )
|
||
|
--fsdp_min_num_params FSDP_MIN_NUM_PARAMS
|
||
|
This parameter is deprecated. FSDP's minimum number of
|
||
|
parameters for Default Auto Wrapping. (useful only
|
||
|
when `fsdp` field is passed). (default: 0)
|
||
|
--fsdp_config FSDP_CONFIG
|
||
|
Config to be used with FSDP (Pytorch Fully Sharded
|
||
|
Data Parallel). The value is either a fsdp json config
|
||
|
file (e.g., `fsdp_config.json`) or an already loaded
|
||
|
json file as `dict`. (default: None)
|
||
|
--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP
|
||
|
This parameter is deprecated. Transformer layer class
|
||
|
name (case-sensitive) to wrap, e.g, `BertLayer`,
|
||
|
`GPTJBlock`, `T5Block` .... (useful only when `fsdp`
|
||
|
flag is passed). (default: None)
|
||
|
--accelerator_config ACCELERATOR_CONFIG
|
||
|
Config to be used with the internal Accelerator object
|
||
|
initializtion. The value is either a accelerator json
|
||
|
config file (e.g., `accelerator_config.json`) or an
|
||
|
already loaded json file as `dict`. (default: None)
|
||
|
--deepspeed DEEPSPEED
|
||
|
Enable deepspeed and pass the path to deepspeed json
|
||
|
config file (e.g. `ds_config.json`) or an already
|
||
|
loaded json file as a dict (default: None)
|
||
|
--label_smoothing_factor LABEL_SMOOTHING_FACTOR
|
||
|
The label smoothing epsilon to apply (zero means no
|
||
|
label smoothing). (default: 0.0)
|
||
|
--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}
|
||
|
The optimizer to use. (default: adamw_torch)
|
||
|
--optim_args OPTIM_ARGS
|
||
|
Optional arguments to supply to optimizer. (default:
|
||
|
None)
|
||
|
--adafactor [ADAFACTOR]
|
||
|
Whether or not to replace AdamW by Adafactor.
|
||
|
(default: False)
|
||
|
--group_by_length [GROUP_BY_LENGTH]
|
||
|
Whether or not to group samples of roughly the same
|
||
|
length together when batching. (default: False)
|
||
|
--length_column_name LENGTH_COLUMN_NAME
|
||
|
Column name with precomputed lengths to use when
|
||
|
grouping by length. (default: length)
|
||
|
--report_to REPORT_TO
|
||
|
The list of integrations to report the results and
|
||
|
logs to. (default: None)
|
||
|
--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS
|
||
|
When using distributed training, the value of the flag
|
||
|
`find_unused_parameters` passed to
|
||
|
`DistributedDataParallel`. (default: None)
|
||
|
--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB
|
||
|
When using distributed training, the value of the flag
|
||
|
`bucket_cap_mb` passed to `DistributedDataParallel`.
|
||
|
(default: None)
|
||
|
--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS
|
||
|
When using distributed training, the value of the flag
|
||
|
`broadcast_buffers` passed to
|
||
|
`DistributedDataParallel`. (default: None)
|
||
|
--dataloader_pin_memory [DATALOADER_PIN_MEMORY]
|
||
|
Whether or not to pin memory for DataLoader. (default:
|
||
|
True)
|
||
|
--no_dataloader_pin_memory
|
||
|
Whether or not to pin memory for DataLoader. (default:
|
||
|
False)
|
||
|
--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]
|
||
|
If True, the data loader will not shut down the worker
|
||
|
processes after a dataset has been consumed once. This
|
||
|
allows to maintain the workers Dataset instances
|
||
|
alive. Can potentially speed up training, but will
|
||
|
increase RAM usage. (default: False)
|
||
|
--skip_memory_metrics [SKIP_MEMORY_METRICS]
|
||
|
Whether or not to skip adding of memory profiler
|
||
|
reports to metrics. (default: True)
|
||
|
--no_skip_memory_metrics
|
||
|
Whether or not to skip adding of memory profiler
|
||
|
reports to metrics. (default: False)
|
||
|
--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]
|
||
|
Whether or not to use the legacy prediction_loop in
|
||
|
the Trainer. (default: False)
|
||
|
--push_to_hub [PUSH_TO_HUB]
|
||
|
Whether or not to upload the trained model to the
|
||
|
model hub after training. (default: False)
|
||
|
--resume_from_checkpoint RESUME_FROM_CHECKPOINT
|
||
|
The path to a folder with a valid checkpoint for your
|
||
|
model. (default: None)
|
||
|
--hub_model_id HUB_MODEL_ID
|
||
|
The name of the repository to keep in sync with the
|
||
|
local `output_dir`. (default: None)
|
||
|
--hub_strategy {end,every_save,checkpoint,all_checkpoints}
|
||
|
The hub strategy to use when `--push_to_hub` is
|
||
|
activated. (default: every_save)
|
||
|
--hub_token HUB_TOKEN
|
||
|
The token to use to push to the Model Hub. (default:
|
||
|
None)
|
||
|
--hub_private_repo [HUB_PRIVATE_REPO]
|
||
|
Whether the model repository is private or not.
|
||
|
(default: False)
|
||
|
--hub_always_push [HUB_ALWAYS_PUSH]
|
||
|
Unless `True`, the Trainer will skip pushes if the
|
||
|
previous one wasn't finished yet. (default: False)
|
||
|
--gradient_checkpointing [GRADIENT_CHECKPOINTING]
|
||
|
If True, use gradient checkpointing to save memory at
|
||
|
the expense of slower backward pass. (default: False)
|
||
|
--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS
|
||
|
Gradient checkpointing key word arguments such as
|
||
|
`use_reentrant`. Will be passed to
|
||
|
`torch.utils.checkpoint.checkpoint` through
|
||
|
`model.gradient_checkpointing_enable`. (default: None)
|
||
|
--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]
|
||
|
Whether or not the inputs will be passed to the
|
||
|
`compute_metrics` function. (default: False)
|
||
|
--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]
|
||
|
Whether to recursively concat
|
||
|
inputs/losses/labels/predictions across batches. If
|
||
|
`False`, will instead store them as lists, with each
|
||
|
batch kept separate. (default: True)
|
||
|
--no_eval_do_concat_batches
|
||
|
Whether to recursively concat
|
||
|
inputs/losses/labels/predictions across batches. If
|
||
|
`False`, will instead store them as lists, with each
|
||
|
batch kept separate. (default: False)
|
||
|
--fp16_backend {auto,apex,cpu_amp}
|
||
|
Deprecated. Use half_precision_backend instead
|
||
|
(default: auto)
|
||
|
--evaluation_strategy {no,steps,epoch}
|
||
|
Deprecated. Use `eval_strategy` instead (default:
|
||
|
None)
|
||
|
--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID
|
||
|
The name of the repository to which push the
|
||
|
`Trainer`. (default: None)
|
||
|
--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION
|
||
|
The name of the organization in with to which push the
|
||
|
`Trainer`. (default: None)
|
||
|
--push_to_hub_token PUSH_TO_HUB_TOKEN
|
||
|
The token to use to push to the Model Hub. (default:
|
||
|
None)
|
||
|
--mp_parameters MP_PARAMETERS
|
||
|
Used by the SageMaker launcher to send mp-specific
|
||
|
args. Ignored in Trainer (default: )
|
||
|
--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]
|
||
|
Whether to automatically decrease the batch size in
|
||
|
half and rerun the training loop again each time a
|
||
|
CUDA Out-of-Memory was reached (default: False)
|
||
|
--full_determinism [FULL_DETERMINISM]
|
||
|
Whether to call enable_full_determinism instead of
|
||
|
set_seed for reproducibility in distributed training.
|
||
|
Important: this will negatively impact the
|
||
|
performance, so only use it for debugging. (default:
|
||
|
False)
|
||
|
--torchdynamo TORCHDYNAMO
|
||
|
This argument is deprecated, use
|
||
|
`--torch_compile_backend` instead. (default: None)
|
||
|
--ray_scope RAY_SCOPE
|
||
|
The scope to use when doing hyperparameter search with
|
||
|
Ray. By default, `"last"` will be used. Ray will then
|
||
|
use the last checkpoint of all trials, compare those,
|
||
|
and select the best one. However, other options are
|
||
|
also available. See the Ray documentation (https://doc
|
||
|
s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun
|
||
|
e.ExperimentAnalysis.get_best_trial) for more options.
|
||
|
(default: last)
|
||
|
--ddp_timeout DDP_TIMEOUT
|
||
|
Overrides the default timeout for distributed training
|
||
|
(value should be given in seconds). (default: 1800)
|
||
|
--torch_compile [TORCH_COMPILE]
|
||
|
If set to `True`, the model will be wrapped in
|
||
|
`torch.compile`. (default: False)
|
||
|
--torch_compile_backend TORCH_COMPILE_BACKEND
|
||
|
Which backend to use with `torch.compile`, passing one
|
||
|
will trigger a model compilation. (default: None)
|
||
|
--torch_compile_mode TORCH_COMPILE_MODE
|
||
|
Which mode to use with `torch.compile`, passing one
|
||
|
will trigger a model compilation. (default: None)
|
||
|
--dispatch_batches DISPATCH_BATCHES
|
||
|
Deprecated. Pass {'dispatch_batches':VALUE} to
|
||
|
`accelerator_config`. (default: None)
|
||
|
--split_batches SPLIT_BATCHES
|
||
|
Deprecated. Pass {'split_batches':True} to
|
||
|
`accelerator_config`. (default: None)
|
||
|
--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]
|
||
|
If set to `True`, the speed metrics will include `tgs`
|
||
|
(tokens per second per device). (default: False)
|
||
|
--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]
|
||
|
If set to `True`, will track the number of input
|
||
|
tokens seen throughout training. (May be slower in
|
||
|
distributed training) (default: False)
|
||
|
--neftune_noise_alpha NEFTUNE_NOISE_ALPHA
|
||
|
Activates neftune noise embeddings into the model.
|
||
|
NEFTune has been proven to drastically improve model
|
||
|
performances for instrcution fine-tuning. Check out
|
||
|
the original paper here:
|
||
|
https://arxiv.org/abs/2310.05914 and the original code
|
||
|
here: https://github.com/neelsjain/NEFTune. Only
|
||
|
supported for `PreTrainedModel` and `PeftModel`
|
||
|
classes. (default: None)
|
||
|
--optim_target_modules OPTIM_TARGET_MODULES
|
||
|
Target modules for the optimizer defined in the
|
||
|
`optim` argument. Only used for the GaLore optimizer
|
||
|
at the moment. (default: None)
|
||
|
--batch_eval_metrics [BATCH_EVAL_METRICS]
|
||
|
Break eval metrics calculation into batches to save
|
||
|
memory. (default: False)
|
||
|
--eval_on_start [EVAL_ON_START]
|
||
|
Whether to run through the entire `evaluation` step at
|
||
|
the very beginning of training as a sanity check.
|
||
|
(default: False)
|
||
|
--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]
|
||
|
Whether to run recursively gather object in a nested
|
||
|
list/tuple/dictionary of objects from all devices.
|
||
|
(default: False)
|
||
|
--sortish_sampler [SORTISH_SAMPLER]
|
||
|
Whether to use SortishSampler or not. (default: False)
|
||
|
--predict_with_generate [PREDICT_WITH_GENERATE]
|
||
|
Whether to use generate to calculate generative
|
||
|
metrics (ROUGE, BLEU). (default: False)
|
||
|
--generation_max_length GENERATION_MAX_LENGTH
|
||
|
The `max_length` to use on each evaluation loop when
|
||
|
`predict_with_generate=True`. Will default to the
|
||
|
`max_length` value of the model configuration.
|
||
|
(default: None)
|
||
|
--generation_num_beams GENERATION_NUM_BEAMS
|
||
|
The `num_beams` to use on each evaluation loop when
|
||
|
`predict_with_generate=True`. Will default to the
|
||
|
`num_beams` value of the model configuration.
|
||
|
(default: None)
|
||
|
--generation_config GENERATION_CONFIG
|
||
|
Model id, file path or url pointing to a
|
||
|
GenerationConfig json file, to use during prediction.
|
||
|
(default: None)
|
||
|
  --use_badam [USE_BADAM]
                        Whether or not to use the BAdam optimizer. (default:
                        False)
  --badam_mode {layer,ratio}
                        Whether to use the layer-wise or the ratio-wise BAdam
                        optimizer. (default: layer)
  --badam_start_block BADAM_START_BLOCK
                        The starting block index for layer-wise BAdam.
                        (default: None)
  --badam_switch_mode {ascending,descending,random,fixed}
                        The strategy for picking the block to update in
                        layer-wise BAdam. (default: ascending)
  --badam_switch_interval BADAM_SWITCH_INTERVAL
                        Number of steps between block updates in layer-wise
                        BAdam. Use -1 to disable the block update. (default:
                        50)
  --badam_update_ratio BADAM_UPDATE_RATIO
                        The ratio of the update for ratio-wise BAdam.
                        (default: 0.05)
  --badam_mask_mode {adjacent,scatter}
                        The mode of the mask for the BAdam optimizer.
                        `adjacent` means that the trainable parameters are
                        adjacent to each other; `scatter` means that trainable
                        parameters are randomly chosen from the weight.
                        (default: adjacent)
  --badam_verbose BADAM_VERBOSE
                        The verbosity level of the BAdam optimizer: 0 prints
                        nothing, 1 prints the block prefix, 2 prints the
                        trainable parameters. (default: 0)
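
A hypothetical layer-wise BAdam invocation built only from the flags above
(paths are placeholders; full-parameter mode is shown because BAdam updates
one block of the full weights at a time):

  python launcher.py --model_name_or_path <MODEL> --stage sft --do_train \
      --finetuning_type full --use_badam --badam_mode layer \
      --badam_switch_mode ascending --badam_switch_interval 50 \
      --dataset <DATASET> --output_dir <OUTPUT_DIR>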
  --use_galore [USE_GALORE]
                        Whether or not to use gradient low-rank projection
                        (GaLore). (default: False)
  --galore_target GALORE_TARGET
                        Name(s) of modules to apply GaLore to. Use commas to
                        separate multiple modules. Use `all` to specify all
                        the linear modules. (default: all)
  --galore_rank GALORE_RANK
                        The rank of the GaLore gradients. (default: 16)
  --galore_update_interval GALORE_UPDATE_INTERVAL
                        Number of steps between updates of the GaLore
                        projection. (default: 200)
  --galore_scale GALORE_SCALE
                        GaLore scaling coefficient. (default: 0.25)
  --galore_proj_type {std,reverse_std,right,left,full}
                        Type of GaLore projection. (default: std)
  --galore_layerwise [GALORE_LAYERWISE]
                        Whether or not to enable layer-wise updates to further
                        save memory. (default: False)
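
Likewise, a sketch (placeholder paths again) that combines GaLore with
full-parameter training, projecting gradients to rank 16 on all linear
modules:

  python launcher.py --model_name_or_path <MODEL> --stage sft --do_train \
      --finetuning_type full --use_galore --galore_target all \
      --galore_rank 16 --galore_update_interval 200 \
      --dataset <DATASET> --output_dir <OUTPUT_DIR>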
  --pref_beta PREF_BETA
                        The beta parameter in the preference loss. (default:
                        0.1)
  --pref_ftx PREF_FTX   The supervised fine-tuning loss coefficient in DPO
                        training. (default: 0.0)
  --pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}
                        The type of DPO loss to use. (default: sigmoid)
  --dpo_label_smoothing DPO_LABEL_SMOOTHING
                        The robust DPO label smoothing parameter in cDPO that
                        should be between 0 and 0.5. (default: 0.0)
  --kto_chosen_weight KTO_CHOSEN_WEIGHT
                        The weight factor of the desirable losses in KTO
                        training. (default: 1.0)
  --kto_rejected_weight KTO_REJECTED_WEIGHT
                        The weight factor of the undesirable losses in KTO
                        training. (default: 1.0)
  --simpo_gamma SIMPO_GAMMA
                        The target reward margin term in the SimPO loss.
                        (default: 0.5)
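
A minimal DPO sketch wiring the preference options together (the dataset is
a placeholder and is assumed to be a pairwise preference dataset):

  python launcher.py --model_name_or_path <MODEL> --stage dpo --do_train \
      --finetuning_type lora --pref_beta 0.1 --pref_loss sigmoid \
      --dataset <PREFERENCE_DATASET> --output_dir <OUTPUT_DIR>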
  --ppo_buffer_size PPO_BUFFER_SIZE
                        The number of mini-batches used to build the
                        experience buffer in a PPO optimization step.
                        (default: 1)
  --ppo_epochs PPO_EPOCHS
                        The number of epochs to perform in a PPO optimization
                        step. (default: 4)
  --ppo_score_norm [PPO_SCORE_NORM]
                        Use score normalization in PPO training. (default:
                        False)
  --ppo_target PPO_TARGET
                        Target KL value for adaptive KL control in PPO
                        training. (default: 6.0)
  --ppo_whiten_rewards [PPO_WHITEN_REWARDS]
                        Whiten the rewards before computing advantages in PPO
                        training. (default: False)
  --ref_model REF_MODEL
                        Path to the reference model used for PPO or DPO
                        training. (default: None)
  --ref_model_adapters REF_MODEL_ADAPTERS
                        Path to the adapters of the reference model. (default:
                        None)
  --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT
                        The number of bits to quantize the reference model.
                        (default: None)
  --reward_model REWARD_MODEL
                        Path to the reward model used for PPO training.
                        (default: None)
  --reward_model_adapters REWARD_MODEL_ADAPTERS
                        Path to the adapters of the reward model. (default:
                        None)
  --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT
                        The number of bits to quantize the reward model.
                        (default: None)
  --reward_model_type {lora,full,api}
                        The type of the reward model in PPO training. The
                        `lora` type only supports LoRA training. (default:
                        lora)
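
And a hypothetical PPO sketch tying the reward-model options to the PPO
options above (the reward model path is a placeholder, assumed to point at a
LoRA reward adapter to match the default `--reward_model_type lora`):

  python launcher.py --model_name_or_path <MODEL> --stage ppo --do_train \
      --finetuning_type lora --reward_model <REWARD_MODEL_PATH> \
      --ppo_epochs 4 --ppo_target 6.0 \
      --dataset <DATASET> --output_dir <OUTPUT_DIR>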
  --additional_target ADDITIONAL_TARGET
                        Name(s) of modules, apart from LoRA layers, to be set
                        as trainable and saved in the final checkpoint. Use
                        commas to separate multiple modules. (default: None)
  --lora_alpha LORA_ALPHA
                        The scale factor for LoRA fine-tuning (default:
                        lora_rank * 2). (default: None)
  --lora_dropout LORA_DROPOUT
                        Dropout rate for LoRA fine-tuning. (default: 0.0)
  --lora_rank LORA_RANK
                        The intrinsic dimension for LoRA fine-tuning.
                        (default: 8)
  --lora_target LORA_TARGET
                        Name(s) of target modules to apply LoRA to. Use commas
                        to separate multiple modules. Use `all` to specify all
                        the linear modules. (default: all)
  --loraplus_lr_ratio LORAPLUS_LR_RATIO
                        LoRA plus learning rate ratio (lr_B / lr_A). (default:
                        None)
  --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING
                        LoRA plus learning rate for LoRA embedding layers.
                        (default: 1e-06)
  --use_rslora [USE_RSLORA]
                        Whether or not to use the rank-stabilization scaling
                        factor for the LoRA layers. (default: False)
  --use_dora [USE_DORA]
                        Whether or not to use the weight-decomposed LoRA
                        method (DoRA). (default: False)
  --pissa_init [PISSA_INIT]
                        Whether or not to initialize a PiSSA adapter.
                        (default: False)
  --pissa_iter PISSA_ITER
                        The number of iteration steps performed by FSVD in
                        PiSSA. Use -1 to disable it. (default: 16)
  --pissa_convert [PISSA_CONVERT]
                        Whether or not to convert the PiSSA adapter to a
                        normal LoRA adapter. (default: False)
  --create_new_adapter [CREATE_NEW_ADAPTER]
                        Whether or not to create a new adapter with randomly
                        initialized weights. (default: False)
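
For example, a LoRA run with rank-8 adapters on all linear modules and DoRA
switched on (paths are placeholders; `--use_dora` is optional here):

  python launcher.py --model_name_or_path <MODEL> --stage sft --do_train \
      --finetuning_type lora --lora_rank 8 --lora_target all \
      --lora_dropout 0.1 --use_dora \
      --dataset <DATASET> --output_dir <OUTPUT_DIR>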
  --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS
                        The number of trainable layers for freeze (partial-
                        parameter) fine-tuning. Positive numbers mean the last
                        n layers are set as trainable; negative numbers mean
                        the first n layers are set as trainable. (default: 2)
  --freeze_trainable_modules FREEZE_TRAINABLE_MODULES
                        Name(s) of trainable modules for freeze (partial-
                        parameter) fine-tuning. Use commas to separate
                        multiple modules. Use `all` to specify all the
                        available modules. (default: all)
  --freeze_extra_modules FREEZE_EXTRA_MODULES
                        Name(s) of modules apart from hidden layers to be set
                        as trainable for freeze (partial-parameter) fine-
                        tuning. Use commas to separate multiple modules.
                        (default: None)
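
A freeze (partial-parameter) sketch that trains only the last two hidden
layers (placeholders as above):

  python launcher.py --model_name_or_path <MODEL> --stage sft --do_train \
      --finetuning_type freeze --freeze_trainable_layers 2 \
      --freeze_trainable_modules all \
      --dataset <DATASET> --output_dir <OUTPUT_DIR>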
  --pure_bf16 [PURE_BF16]
                        Whether or not to train the model in pure bf16
                        precision (without AMP). (default: False)
  --stage {pt,sft,rm,ppo,dpo,kto}
                        Which stage will be performed in training. (default:
                        sft)
  --finetuning_type {lora,freeze,full}
                        Which fine-tuning method to use. (default: lora)
  --use_llama_pro [USE_LLAMA_PRO]
                        Whether or not to make only the parameters in the
                        expanded blocks trainable. (default: False)
  --use_adam_mini [USE_ADAM_MINI]
                        Whether or not to use the Adam-mini optimizer.
                        (default: False)
  --freeze_vision_tower [FREEZE_VISION_TOWER]
                        Whether or not to freeze the vision tower in MLLM
                        training. (default: True)
  --no_freeze_vision_tower
                        Whether or not to freeze the vision tower in MLLM
                        training. (default: False)
  --train_mm_proj_only [TRAIN_MM_PROJ_ONLY]
                        Whether or not to train the multimodal projector for
                        the MLLM only. (default: False)
  --compute_accuracy [COMPUTE_ACCURACY]
                        Whether or not to compute the token-level accuracy at
                        evaluation. (default: False)
  --plot_loss [PLOT_LOSS]
                        Whether or not to save the training loss curves.
                        (default: False)
  --do_sample [DO_SAMPLE]
                        Whether or not to use sampling; uses greedy decoding
                        otherwise. (default: True)
  --no_do_sample        Whether or not to use sampling; uses greedy decoding
                        otherwise. (default: False)
  --temperature TEMPERATURE
                        The value used to modulate the next-token
                        probabilities. (default: 0.95)
  --top_p TOP_P         The smallest set of most probable tokens with
                        probabilities that add up to top_p or higher is kept.
                        (default: 0.7)
  --top_k TOP_K         The number of highest-probability vocabulary tokens to
                        keep for top-k filtering. (default: 50)
  --num_beams NUM_BEAMS
                        Number of beams for beam search. 1 means no beam
                        search. (default: 1)
  --max_length MAX_LENGTH
                        The maximum length the generated tokens can have. It
                        can be overridden by max_new_tokens. (default: 1024)
  --max_new_tokens MAX_NEW_TOKENS
                        The maximum number of tokens to generate, ignoring
                        the number of tokens in the prompt. (default: 1024)
  --repetition_penalty REPETITION_PENALTY
                        The parameter for repetition penalty. 1.0 means no
                        penalty. (default: 1.0)
  --length_penalty LENGTH_PENALTY
                        Exponential penalty applied to the length, used with
                        beam-based generation. (default: 1.0)
  --default_system DEFAULT_SYSTEM
                        Default system message to use in chat completion.
                        (default: None)
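
Finally, a decoding sketch exercising the generation options above together
with `--predict_with_generate` (paths are placeholders and the sampling
values are illustrative):

  python launcher.py --model_name_or_path <MODEL> --stage sft --do_predict \
      --eval_dataset <DATASET> --predict_with_generate --do_sample \
      --temperature 0.95 --top_p 0.7 --max_new_tokens 512 \
      --output_dir <OUTPUT_DIR>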
|
||
|
0.1)
|
||
|
--pref_ftx PREF_FTX The supervised fine-tuning loss coefficient in DPO
|
||
|
training. (default: 0.0)
|
||
|
--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}
|
||
|
The type of DPO loss to use. (default: sigmoid)
|
||
|
--dpo_label_smoothing DPO_LABEL_SMOOTHING
|
||
|
The robust DPO label smoothing parameter in cDPO that
|
||
|
should be between 0 and 0.5. (default: 0.0)
|
||
|
--kto_chosen_weight KTO_CHOSEN_WEIGHT
|
||
|
The weight factor of the desirable losses in KTO
|
||
|
training. (default: 1.0)
|
||
|
--kto_rejected_weight KTO_REJECTED_WEIGHT
|
||
|
The weight factor of the undesirable losses in KTO
|
||
|
training. (default: 1.0)
|
||
|
--simpo_gamma SIMPO_GAMMA
|
||
|
The target reward margin term in SimPO loss. (default:
|
||
|
0.5)
|
||
|
--ppo_buffer_size PPO_BUFFER_SIZE
|
||
|
The number of mini-batches to make experience buffer
|
||
|
in a PPO optimization step. (default: 1)
|
||
|
--ppo_epochs PPO_EPOCHS
|
||
|
The number of epochs to perform in a PPO optimization
|
||
|
step. (default: 4)
|
||
|
--ppo_score_norm [PPO_SCORE_NORM]
|
||
|
Use score normalization in PPO training. (default:
|
||
|
False)
|
||
|
--ppo_target PPO_TARGET
|
||
|
Target KL value for adaptive KL control in PPO
|
||
|
training. (default: 6.0)
|
||
|
--ppo_whiten_rewards [PPO_WHITEN_REWARDS]
|
||
|
Whiten the rewards before compute advantages in PPO
|
||
|
training. (default: False)
|
||
|
--ref_model REF_MODEL
|
||
|
Path to the reference model used for the PPO or DPO
|
||
|
training. (default: None)
|
||
|
--ref_model_adapters REF_MODEL_ADAPTERS
|
||
|
Path to the adapters of the reference model. (default:
|
||
|
None)
|
||
|
--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT
|
||
|
The number of bits to quantize the reference model.
|
||
|
(default: None)
|
||
|
--reward_model REWARD_MODEL
|
||
|
Path to the reward model used for the PPO training.
|
||
|
(default: None)
|
||
|
--reward_model_adapters REWARD_MODEL_ADAPTERS
|
||
|
Path to the adapters of the reward model. (default:
|
||
|
None)
|
||
|
--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT
|
||
|
The number of bits to quantize the reward model.
|
||
|
(default: None)
|
||
|
--reward_model_type {lora,full,api}
|
||
|
The type of the reward model in PPO training. Lora
|
||
|
model only supports lora training. (default: lora)
|
||
|
--additional_target ADDITIONAL_TARGET
|
||
|
Name(s) of modules apart from LoRA layers to be set as
|
||
|
trainable and saved in the final checkpoint. Use
|
||
|
commas to separate multiple modules. (default: None)
|
||
|
--lora_alpha LORA_ALPHA
|
||
|
The scale factor for LoRA fine-tuning (default:
|
||
|
lora_rank * 2). (default: None)
|
||
|
--lora_dropout LORA_DROPOUT
|
||
|
Dropout rate for the LoRA fine-tuning. (default: 0.0)
|
||
|
--lora_rank LORA_RANK
|
||
|
The intrinsic dimension for LoRA fine-tuning.
|
||
|
(default: 8)
|
||
|
--lora_target LORA_TARGET
|
||
|
Name(s) of target modules to apply LoRA. Use commas to
|
||
|
separate multiple modules. Use `all` to specify all
|
||
|
the linear modules. (default: all)
|
||
|
--loraplus_lr_ratio LORAPLUS_LR_RATIO
|
||
|
LoRA plus learning rate ratio (lr_B / lr_A). (default:
|
||
|
None)
|
||
|
--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING
|
||
|
LoRA plus learning rate for lora embedding layers.
|
||
|
(default: 1e-06)
|
||
|
--use_rslora [USE_RSLORA]
|
||
|
Whether or not to use the rank stabilization scaling
|
||
|
factor for LoRA layer. (default: False)
|
||
|
--use_dora [USE_DORA]
|
||
|
Whether or not to use the weight-decomposed lora
|
||
|
method (DoRA). (default: False)
|
||
|
--pissa_init [PISSA_INIT]
|
||
|
Whether or not to initialize a PiSSA adapter.
|
||
|
(default: False)
|
||
|
--pissa_iter PISSA_ITER
|
||
|
The number of iteration steps performed by FSVD in
|
||
|
PiSSA. Use -1 to disable it. (default: 16)
|
||
|
--pissa_convert [PISSA_CONVERT]
|
||
|
Whether or not to convert the PiSSA adapter to a
|
||
|
normal LoRA adapter. (default: False)
|
||
|
--create_new_adapter [CREATE_NEW_ADAPTER]
|
||
|
Whether or not to create a new adapter with randomly
|
||
|
initialized weight. (default: False)
|
||
|
--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS
|
||
|
The number of trainable layers for freeze (partial-
|
||
|
parameter) fine-tuning. Positive numbers mean the last
|
||
|
n layers are set as trainable, negative numbers mean
|
||
|
the first n layers are set as trainable. (default: 2)
|
||
|
--freeze_trainable_modules FREEZE_TRAINABLE_MODULES
|
||
|
Name(s) of trainable modules for freeze (partial-
|
||
|
parameter) fine-tuning. Use commas to separate
|
||
|
multiple modules. Use `all` to specify all the
|
||
|
available modules. (default: all)
|
||
|
--freeze_extra_modules FREEZE_EXTRA_MODULES
|
||
|
Name(s) of modules apart from hidden layers to be set
|
||
|
as trainable for freeze (partial-parameter) fine-
|
||
|
tuning. Use commas to separate multiple modules.
|
||
|
(default: None)
|
||
|
--pure_bf16 [PURE_BF16]
|
||
|
Whether or not to train model in purely bf16 precision
|
||
|
(without AMP). (default: False)
|
||
|
--stage {pt,sft,rm,ppo,dpo,kto}
|
||
|
Which stage will be performed in training. (default:
|
||
|
sft)
|
||
|
--finetuning_type {lora,freeze,full}
|
||
|
Which fine-tuning method to use. (default: lora)
|
||
|
--use_llama_pro [USE_LLAMA_PRO]
|
||
|
Whether or not to make only the parameters in the
|
||
|
expanded blocks trainable. (default: False)
|
||
|
--use_adam_mini [USE_ADAM_MINI]
|
||
|
Whether or not to use the Adam-mini optimizer.
|
||
|
(default: False)
|
||
|
--freeze_vision_tower [FREEZE_VISION_TOWER]
|
||
|
Whether ot not to freeze vision tower in MLLM
|
||
|
training. (default: True)
|
||
|
--no_freeze_vision_tower
|
||
|
Whether ot not to freeze vision tower in MLLM
|
||
|
training. (default: False)
|
||
|
--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]
|
||
|
Whether or not to train the multimodal projector for
|
||
|
MLLM only. (default: False)
|
||
|
--compute_accuracy [COMPUTE_ACCURACY]
|
||
|
Whether or not to compute the token-level accuracy at
|
||
|
evaluation. (default: False)
|
||
|
--plot_loss [PLOT_LOSS]
|
||
|
Whether or not to save the training loss curves.
|
||
|
(default: False)
|
||
|
--do_sample [DO_SAMPLE]
|
||
|
Whether or not to use sampling, use greedy decoding
|
||
|
otherwise. (default: True)
|
||
|
--no_do_sample Whether or not to use sampling, use greedy decoding
|
||
|
otherwise. (default: False)
|
||
|
--temperature TEMPERATURE
|
||
|
The value used to modulate the next token
|
||
|
probabilities. (default: 0.95)
|
||
|
--top_p TOP_P The smallest set of most probable tokens with
|
||
|
probabilities that add up to top_p or higher are kept.
|
||
|
(default: 0.7)
|
||
|
--top_k TOP_K The number of highest probability vocabulary tokens to
|
||
|
keep for top-k filtering. (default: 50)
|
||
|
--num_beams NUM_BEAMS
|
||
|
Number of beams for beam search. 1 means no beam
|
||
|
search. (default: 1)
|
||
|
--max_length MAX_LENGTH
|
||
|
The maximum length the generated tokens can have. It
|
||
|
can be overridden by max_new_tokens. (default: 1024)
|
||
|
--max_new_tokens MAX_NEW_TOKENS
|
||
|
The maximum numbers of tokens to generate, ignoring
|
||
|
the number of tokens in the prompt. (default: 1024)
|
||
|
--repetition_penalty REPETITION_PENALTY
|
||
|
The parameter for repetition penalty. 1.0 means no
|
||
|
penalty. (default: 1.0)
|
||
|
--length_penalty LENGTH_PENALTY
|
||
|
Exponential penalty to the length that is used with
|
||
|
beam-based generation. (default: 1.0)
|
||
|
--default_system DEFAULT_SYSTEM
|
||
|
Default system message to use in chat completion.
|
||
|
(default: None)
|
||
|
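For orientation, below is a minimal sketch of a launch command that exercises a handful of the arguments documented above. It is illustrative only, not a command recovered from this log: the torchrun invocation is an assumption based on the distributed initialization line at the top of the log (127.0.0.1:28784), and the model identifier, template, dataset name, and output directory are hypothetical placeholders.

  # Hypothetical 4-GPU LoRA SFT run; every value below is a placeholder to adapt.
  torchrun --nproc_per_node 4 --master_addr 127.0.0.1 --master_port 28784 launcher.py \
    --stage sft \
    --do_train \
    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
    --template llama3 \
    --dataset alpaca_en_demo \
    --finetuning_type lora \
    --lora_rank 8 \
    --lora_target all \
    --cutoff_len 1024 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 8 \
    --learning_rate 5e-5 \
    --num_train_epochs 3.0 \
    --lr_scheduler_type cosine \
    --logging_steps 10 \
    --save_steps 500 \
    --bf16 true \
    --output_dir saves/llama3-8b-lora-sft

Note that flags whose metavar appears in brackets in the help (e.g. `--bf16 [BF16]`) are optional-value booleans: passing the bare flag enables the option, while an explicit value such as `--bf16 false` disables it.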
optional arguments:
|
||
|
-h, --help show this help message and exit
|
||
|
--model_name_or_path MODEL_NAME_OR_PATH
|
||
|
Path to the model weight or identifier from
|
||
|
huggingface.co/models or modelscope.cn/models.
|
||
|
(default: None)
|
||
|
--adapter_name_or_path ADAPTER_NAME_OR_PATH
|
||
|
Path to the adapter weight or identifier from
|
||
|
huggingface.co/models. Use commas to separate multiple
|
||
|
adapters. (default: None)
|
||
|
--adapter_folder ADAPTER_FOLDER
|
||
|
The folder containing the adapter weights to load.
|
||
|
(default: None)
|
||
|
--cache_dir CACHE_DIR
|
||
|
Where to store the pre-trained models downloaded from
|
||
|
huggingface.co or modelscope.cn. (default: None)
|
||
|
--use_fast_tokenizer [USE_FAST_TOKENIZER]
|
||
|
Whether or not to use one of the fast tokenizer
|
||
|
(backed by the tokenizers library). (default: True)
|
||
|
--no_use_fast_tokenizer
|
||
|
Whether or not to use one of the fast tokenizer
|
||
|
(backed by the tokenizers library). (default: False)
|
||
|
--resize_vocab [RESIZE_VOCAB]
|
||
|
Whether or not to resize the tokenizer vocab and the
|
||
|
embedding layers. (default: False)
|
||
|
--split_special_tokens [SPLIT_SPECIAL_TOKENS]
|
||
|
Whether or not the special tokens should be split
|
||
|
during the tokenization process. (default: False)
|
||
|
--new_special_tokens NEW_SPECIAL_TOKENS
|
||
|
Special tokens to be added into the tokenizer. Use
|
||
|
commas to separate multiple tokens. (default: None)
|
||
|
--model_revision MODEL_REVISION
|
||
|
The specific model version to use (can be a branch
|
||
|
name, tag name or commit id). (default: main)
|
||
|
--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]
|
||
|
Whether or not to use memory-efficient model loading.
|
||
|
(default: True)
|
||
|
--no_low_cpu_mem_usage
|
||
|
Whether or not to use memory-efficient model loading.
|
||
|
(default: False)
|
||
|
--quantization_method {bitsandbytes,hqq,eetq}
|
||
|
Quantization method to use for on-the-fly
|
||
|
quantization. (default: bitsandbytes)
|
||
|
--quantization_bit QUANTIZATION_BIT
|
||
|
The number of bits to quantize the model using
|
||
|
bitsandbytes. (default: None)
|
||
|
--quantization_type {fp4,nf4}
|
||
|
Quantization data type to use in int4 training.
|
||
|
(default: nf4)
|
||
|
--double_quantization [DOUBLE_QUANTIZATION]
|
||
|
Whether or not to use double quantization in int4
|
||
|
training. (default: True)
|
||
|
--no_double_quantization
|
||
|
Whether or not to use double quantization in int4
|
||
|
training. (default: False)
|
||
|
--quantization_device_map {auto}
|
||
|
Device map used to infer the 4-bit quantized model,
|
||
|
needs bitsandbytes>=0.43.0. (default: None)
|
||
|
--rope_scaling {linear,dynamic}
|
||
|
Which scaling strategy should be adopted for the RoPE
|
||
|
embeddings. (default: None)
|
||
|
--flash_attn {auto,disabled,sdpa,fa2}
|
||
|
Enable FlashAttention for faster training and
|
||
|
inference. (default: auto)
|
||
|
--shift_attn [SHIFT_ATTN]
|
||
|
Enable shift short attention (S^2-Attn) proposed by
|
||
|
LongLoRA. (default: False)
|
||
|
--mixture_of_depths {convert,load}
|
||
|
Convert the model to mixture-of-depths (MoD) or load
|
||
|
the MoD model. (default: None)
|
||
|
--use_unsloth [USE_UNSLOTH]
|
||
|
Whether or not to use unsloth's optimization for the
|
||
|
LoRA training. (default: False)
|
||
|
--visual_inputs [VISUAL_INPUTS]
|
||
|
Whethor or not to use multimodal LLM that accepts
|
||
|
visual inputs. (default: False)
|
||
|
--moe_aux_loss_coef MOE_AUX_LOSS_COEF
|
||
|
Coefficient of the auxiliary router loss in mixture-
|
||
|
of-experts model. (default: None)
|
||
|
--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]
|
||
|
Whether or not to disable gradient checkpointing.
|
||
|
(default: False)
|
||
|
--upcast_layernorm [UPCAST_LAYERNORM]
|
||
|
Whether or not to upcast the layernorm weights in
|
||
|
fp32. (default: False)
|
||
|
--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]
|
||
|
Whether or not to upcast the output of lm_head in
|
||
|
fp32. (default: False)
|
||
|
--train_from_scratch [TRAIN_FROM_SCRATCH]
|
||
|
Whether or not to randomly initialize the model
|
||
|
weights. (default: False)
|
||
|
--infer_backend {huggingface,vllm}
|
||
|
Backend engine used at inference. (default:
|
||
|
huggingface)
|
||
|
--vllm_maxlen VLLM_MAXLEN
|
||
|
Maximum sequence (prompt + response) length of the
|
||
|
vLLM engine. (default: 2048)
|
||
|
--vllm_gpu_util VLLM_GPU_UTIL
|
||
|
The fraction of GPU memory in (0,1) to be used for the
|
||
|
vLLM engine. (default: 0.9)
|
||
|
--vllm_enforce_eager [VLLM_ENFORCE_EAGER]
|
||
|
Whether or not to disable CUDA graph in the vLLM
|
||
|
engine. (default: False)
|
||
|
--vllm_max_lora_rank VLLM_MAX_LORA_RANK
|
||
|
Maximum rank of all LoRAs in the vLLM engine.
|
||
|
(default: 32)
|
||
|
--offload_folder OFFLOAD_FOLDER
|
||
|
Path to offload model weights. (default: offload)
|
||
|
--use_cache [USE_CACHE]
|
||
|
Whether or not to use KV cache in generation.
|
||
|
(default: True)
|
||
|
--no_use_cache Whether or not to use KV cache in generation.
|
||
|
(default: False)
|
||
|
--infer_dtype {auto,float16,bfloat16,float32}
|
||
|
Data type for model weights and activations at
|
||
|
inference. (default: auto)
|
||
|
--hf_hub_token HF_HUB_TOKEN
|
||
|
Auth token to log in with Hugging Face Hub. (default:
|
||
|
None)
|
||
|
--ms_hub_token MS_HUB_TOKEN
|
||
|
Auth token to log in with ModelScope Hub. (default:
|
||
|
None)
|
||
|
--export_dir EXPORT_DIR
|
||
|
Path to the directory to save the exported model.
|
||
|
(default: None)
|
||
|
--export_size EXPORT_SIZE
|
||
|
The file shard size (in GB) of the exported model.
|
||
|
(default: 1)
|
||
|
--export_device {cpu,auto}
|
||
|
The device used in model export, use `auto` to
|
||
|
accelerate exporting. (default: cpu)
|
||
|
--export_quantization_bit EXPORT_QUANTIZATION_BIT
|
||
|
The number of bits to quantize the exported model.
|
||
|
(default: None)
|
||
|
--export_quantization_dataset EXPORT_QUANTIZATION_DATASET
|
||
|
Path to the dataset or dataset name to use in
|
||
|
quantizing the exported model. (default: None)
|
||
|
--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES
|
||
|
The number of samples used for quantization. (default:
|
||
|
128)
|
||
|
--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN
|
||
|
The maximum length of the model inputs used for
|
||
|
quantization. (default: 1024)
|
||
|
--export_legacy_format [EXPORT_LEGACY_FORMAT]
|
||
|
Whether or not to save the `.bin` files instead of
|
||
|
`.safetensors`. (default: False)
|
||
|
--export_hub_model_id EXPORT_HUB_MODEL_ID
|
||
|
The name of the repository if push the model to the
|
||
|
Hugging Face hub. (default: None)
|
||
|
--print_param_status [PRINT_PARAM_STATUS]
|
||
|
For debugging purposes, print the status of the
|
||
|
parameters in the model. (default: False)
|
||
|
--template TEMPLATE Which template to use for constructing prompts in
|
||
|
training and inference. (default: None)
|
||
|
--dataset DATASET The name of dataset(s) to use for training. Use commas
|
||
|
to separate multiple datasets. (default: None)
|
||
|
--eval_dataset EVAL_DATASET
|
||
|
The name of dataset(s) to use for evaluation. Use
|
||
|
commas to separate multiple datasets. (default: None)
|
||
|
--dataset_dir DATASET_DIR
|
||
|
Path to the folder containing the datasets. (default:
|
||
|
data)
|
||
|
--cutoff_len CUTOFF_LEN
|
||
|
The cutoff length of the tokenized inputs in the
|
||
|
dataset. (default: 1024)
|
||
|
--train_on_prompt [TRAIN_ON_PROMPT]
|
||
|
Whether or not to disable the mask on the prompt.
|
||
|
(default: False)
|
||
|
--mask_history [MASK_HISTORY]
|
||
|
Whether or not to mask the history and train on the
|
||
|
last turn only. (default: False)
|
||
|
--streaming [STREAMING]
|
||
|
Enable dataset streaming. (default: False)
|
||
|
--buffer_size BUFFER_SIZE
|
||
|
Size of the buffer to randomly sample examples from in
|
||
|
dataset streaming. (default: 16384)
|
||
|
--mix_strategy {concat,interleave_under,interleave_over}
|
||
|
Strategy to use in dataset mixing (concat/interleave)
|
||
|
(undersampling/oversampling). (default: concat)
|
||
|
--interleave_probs INTERLEAVE_PROBS
|
||
|
Probabilities to sample data from datasets. Use commas
|
||
|
to separate multiple datasets. (default: None)
|
||
|
--overwrite_cache [OVERWRITE_CACHE]
|
||
|
Overwrite the cached training and evaluation sets.
|
||
|
(default: False)
|
||
|
--preprocessing_num_workers PREPROCESSING_NUM_WORKERS
|
||
|
The number of processes to use for the pre-processing.
|
||
|
(default: None)
|
||
|
--max_samples MAX_SAMPLES
|
||
|
For debugging purposes, truncate the number of
|
||
|
examples for each dataset. (default: None)
|
||
|
--eval_num_beams EVAL_NUM_BEAMS
|
||
|
Number of beams to use for evaluation. This argument
|
||
|
will be passed to `model.generate` (default: None)
|
||
|
--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]
|
||
|
Whether or not to ignore the tokens corresponding to
|
||
|
the pad label in loss computation. (default: True)
|
||
|
--no_ignore_pad_token_for_loss
|
||
|
Whether or not to ignore the tokens corresponding to
|
||
|
the pad label in loss computation. (default: False)
|
||
|
--val_size VAL_SIZE Size of the development set, should be an integer or a
|
||
|
float in range `[0,1)`. (default: 0.0)
|
||
|
--packing PACKING Enable sequences packing in training. Will
|
||
|
automatically enable in pre-training. (default: None)
|
||
|
--neat_packing [NEAT_PACKING]
|
||
|
Enable sequence packing without cross-attention.
|
||
|
(default: False)
|
||
|
--tool_format TOOL_FORMAT
|
||
|
Tool format to use for constructing function calling
|
||
|
examples. (default: None)
|
||
|
--tokenized_path TOKENIZED_PATH
|
||
|
Path to save or load the tokenized datasets. (default:
|
||
|
None)
|
||
|
--output_dir OUTPUT_DIR
|
||
|
The output directory where the model predictions and
|
||
|
checkpoints will be written. (default: None)
|
||
|
--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]
|
||
|
Overwrite the content of the output directory. Use
|
||
|
this to continue training if output_dir points to a
|
||
|
checkpoint directory. (default: False)
|
||
|
--do_train [DO_TRAIN]
|
||
|
Whether to run training. (default: False)
|
||
|
--do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False)
|
||
|
--do_predict [DO_PREDICT]
|
||
|
Whether to run predictions on the test set. (default:
|
||
|
False)
|
||
|
--eval_strategy {no,steps,epoch}
|
||
|
The evaluation strategy to use. (default: no)
|
||
|
--prediction_loss_only [PREDICTION_LOSS_ONLY]
|
||
|
When performing evaluation and predictions, only
|
||
|
returns the loss. (default: False)
|
||
|
--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE
|
||
|
Batch size per GPU/TPU/MPS/NPU core/CPU for training.
|
||
|
(default: 8)
|
||
|
--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE
|
||
|
Batch size per GPU/TPU/MPS/NPU core/CPU for
|
||
|
evaluation. (default: 8)
|
||
|
--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE
|
||
|
Deprecated, the use of `--per_device_train_batch_size`
|
||
|
is preferred. Batch size per GPU/TPU core/CPU for
|
||
|
training. (default: None)
|
||
|
--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE
|
||
|
Deprecated, the use of `--per_device_eval_batch_size`
|
||
|
is preferred. Batch size per GPU/TPU core/CPU for
|
||
|
evaluation. (default: None)
|
||
|
--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS
|
||
|
Number of updates steps to accumulate before
|
||
|
performing a backward/update pass. (default: 1)
|
||
|
--eval_accumulation_steps EVAL_ACCUMULATION_STEPS
|
||
|
Number of predictions steps to accumulate before
|
||
|
moving the tensors to the CPU. (default: None)
|
||
|
--eval_delay EVAL_DELAY
|
||
|
Number of epochs or steps to wait for before the first
|
||
|
evaluation can be performed, depending on the
|
||
|
eval_strategy. (default: 0)
|
||
|
--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS
|
||
|
Number of steps to wait before calling
|
||
|
`torch.<device>.empty_cache()`.This can help avoid
|
||
|
CUDA out-of-memory errors by lowering peak VRAM usage
|
||
|
at a cost of about [10{'option_strings': ['--
|
||
|
torch_empty_cache_steps'], 'dest':
|
||
|
'torch_empty_cache_steps', 'nargs': None, 'const':
|
||
|
None, 'default': None, 'type': 'int', 'choices': None,
|
||
|
'required': False, 'help': 'Number of steps to wait
|
||
|
before calling `torch.<device>.empty_cache()`.This can
|
||
|
help avoid CUDA out-of-memory errors by lowering peak
|
||
|
VRAM usage at a cost of about [10% slower performance]
|
||
|
(https://github.com/huggingface/transformers/issues/31
|
||
|
372).If left unset or set to None, cache will not be
|
||
|
emptied.', 'metavar': None, 'container':
|
||
|
<argparse._ArgumentGroup object at 0x7f253d26fee0>,
|
||
|
'prog': 'launcher.py'}lower performance](https://githu
|
||
|
b.com/huggingface/transformers/issues/31372).If left
|
||
|
unset or set to None, cache will not be emptied.
|
||
|
(default: None)
|
||
|
--learning_rate LEARNING_RATE
|
||
|
The initial learning rate for AdamW. (default: 5e-05)
|
||
|
--weight_decay WEIGHT_DECAY
|
||
|
Weight decay for AdamW if we apply some. (default:
|
||
|
0.0)
|
||
|
--adam_beta1 ADAM_BETA1
|
||
|
Beta1 for AdamW optimizer (default: 0.9)
|
||
|
--adam_beta2 ADAM_BETA2
|
||
|
Beta2 for AdamW optimizer (default: 0.999)
|
||
|
--adam_epsilon ADAM_EPSILON
|
||
|
Epsilon for AdamW optimizer. (default: 1e-08)
|
||
|
--max_grad_norm MAX_GRAD_NORM
|
||
|
Max gradient norm. (default: 1.0)
|
||
|
--num_train_epochs NUM_TRAIN_EPOCHS
|
||
|
Total number of training epochs to perform. (default:
|
||
|
3.0)
|
||
|
--max_steps MAX_STEPS
|
||
|
If > 0: set total number of training steps to perform.
|
||
|
Override num_train_epochs. (default: -1)
|
||
|
--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}
|
||
|
The scheduler type to use. (default: linear)
|
||
|
--lr_scheduler_kwargs LR_SCHEDULER_KWARGS
|
||
|
Extra parameters for the lr_scheduler such as
|
||
|
{'num_cycles': 1} for the cosine with hard restarts.
|
||
|
(default: {})
|
||
|
--warmup_ratio WARMUP_RATIO
|
||
|
Linear warmup over warmup_ratio fraction of total
|
||
|
steps. (default: 0.0)
|
||
|
--warmup_steps WARMUP_STEPS
|
||
|
Linear warmup over warmup_steps. (default: 0)
|
||
|
--log_level {detail,debug,info,warning,error,critical,passive}
|
||
|
Logger log level to use on the main node. Possible
|
||
|
choices are the log levels as strings: 'debug',
|
||
|
'info', 'warning', 'error' and 'critical', plus a
|
||
|
'passive' level which doesn't set anything and lets
|
||
|
the application set the level. Defaults to 'passive'.
|
||
|
(default: passive)
|
||
|
--log_level_replica {detail,debug,info,warning,error,critical,passive}
|
||
|
Logger log level to use on replica nodes. Same choices
|
||
|
and defaults as ``log_level`` (default: warning)
|
||
|
--log_on_each_node [LOG_ON_EACH_NODE]
|
||
|
When doing a multinode distributed training, whether
|
||
|
to log once per node or just once on the main node.
|
||
|
(default: True)
|
||
|
--no_log_on_each_node
|
||
|
When doing a multinode distributed training, whether
|
||
|
to log once per node or just once on the main node.
|
||
|
(default: False)
|
||
|
--logging_dir LOGGING_DIR
|
||
|
Tensorboard log dir. (default: None)
|
||
|
--logging_strategy {no,steps,epoch}
|
||
|
The logging strategy to use. (default: steps)
|
||
|
--logging_first_step [LOGGING_FIRST_STEP]
|
||
|
Log the first global_step (default: False)
|
||
|
--logging_steps LOGGING_STEPS
|
||
|
Log every X updates steps. Should be an integer or a
|
||
|
float in range `[0,1)`. If smaller than 1, will be
|
||
|
interpreted as ratio of total training steps.
|
||
|
(default: 500)
|
||
|
--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]
|
||
|
Filter nan and inf losses for logging. (default: True)
|
||
|
--no_logging_nan_inf_filter
|
||
|
Filter nan and inf losses for logging. (default:
|
||
|
False)
|
||
|
--save_strategy {no,steps,epoch}
|
||
|
The checkpoint save strategy to use. (default: steps)
|
||
|
--save_steps SAVE_STEPS
|
||
|
Save checkpoint every X updates steps. Should be an
|
||
|
integer or a float in range `[0,1)`. If smaller than
|
||
|
1, will be interpreted as ratio of total training
|
||
|
steps. (default: 500)
|
||
|
--save_total_limit SAVE_TOTAL_LIMIT
|
||
|
If a value is passed, will limit the total amount of
|
||
|
checkpoints. Deletes the older checkpoints in
|
||
|
`output_dir`. When `load_best_model_at_end` is
|
||
|
enabled, the 'best' checkpoint according to
|
||
|
`metric_for_best_model` will always be retained in
|
||
|
addition to the most recent ones. For example, for
|
||
|
`save_total_limit=5` and
|
||
|
`load_best_model_at_end=True`, the four last
|
||
|
checkpoints will always be retained alongside the best
|
||
|
model. When `save_total_limit=1` and
|
||
|
`load_best_model_at_end=True`, it is possible that two
|
||
|
checkpoints are saved: the last one and the best one
|
||
|
(if they are different). Default is unlimited
|
||
|
checkpoints (default: None)
|
||
|
--save_safetensors [SAVE_SAFETENSORS]
|
||
|
Use safetensors saving and loading for state dicts
|
||
|
instead of default torch.load and torch.save.
|
||
|
(default: True)
|
||
|
--no_save_safetensors
|
||
|
Use safetensors saving and loading for state dicts
|
||
|
instead of default torch.load and torch.save.
|
||
|
(default: False)
|
||
|
--save_on_each_node [SAVE_ON_EACH_NODE]
|
||
|
When doing multi-node distributed training, whether to
|
||
|
save models and checkpoints on each node, or only on
|
||
|
the main one (default: False)
|
||
|
--save_only_model [SAVE_ONLY_MODEL]
|
||
|
When checkpointing, whether to only save the model, or
|
||
|
also the optimizer, scheduler & rng state.Note that
|
||
|
when this is true, you won't be able to resume
|
||
|
training from checkpoint.This enables you to save
|
||
|
storage by not storing the optimizer, scheduler & rng
|
||
|
state.You can only load the model using
|
||
|
from_pretrained with this option set to True.
|
||
|
(default: False)
|
||
|
--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]
|
||
|
Whether to restore the callback states from the
|
||
|
checkpoint. If `True`, will override callbacks passed
|
||
|
to the `Trainer` if they exist in the checkpoint.
|
||
|
(default: False)
|
||
|
--no_cuda [NO_CUDA] This argument is deprecated. It will be removed in
|
||
|
version 5.0 of 🤗 Transformers. (default: False)
|
||
|
--use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will
|
||
|
use cuda/tpu/mps/npu device if available. (default:
|
||
|
False)
|
||
|
--use_mps_device [USE_MPS_DEVICE]
|
||
|
This argument is deprecated. `mps` device will be used
|
||
|
if available similar to `cuda` device. It will be
|
||
|
removed in version 5.0 of 🤗 Transformers (default:
|
||
|
False)
|
||
|
--seed SEED Random seed that will be set at the beginning of
|
||
|
training. (default: 42)
|
||
|
--data_seed DATA_SEED
|
||
|
Random seed to be used with data samplers. (default:
|
||
|
None)
|
||
|
--jit_mode_eval [JIT_MODE_EVAL]
|
||
|
Whether or not to use PyTorch jit trace for inference
|
||
|
(default: False)
|
||
|
--use_ipex [USE_IPEX]
|
||
|
Use Intel extension for PyTorch when it is available,
|
||
|
installation: 'https://github.com/intel/intel-
|
||
|
extension-for-pytorch' (default: False)
|
||
|
--bf16 [BF16] Whether to use bf16 (mixed) precision instead of
|
||
|
32-bit. Requires Ampere or higher NVIDIA architecture
|
||
|
or using CPU (use_cpu) or Ascend NPU. This is an
|
||
|
experimental API and it may change. (default: False)
|
||
|
--fp16 [FP16] Whether to use fp16 (mixed) precision instead of
|
||
|
32-bit (default: False)
|
||
|
--fp16_opt_level FP16_OPT_LEVEL
|
||
|
For fp16: Apex AMP optimization level selected in
|
||
|
['O0', 'O1', 'O2', and 'O3']. See details at
|
||
|
https://nvidia.github.io/apex/amp.html (default: O1)
|
||
|
--half_precision_backend {auto,apex,cpu_amp}
|
||
|
The backend to be used for half precision. (default:
|
||
|
auto)
|
||
|
--bf16_full_eval [BF16_FULL_EVAL]
|
||
|
Whether to use full bfloat16 evaluation instead of
|
||
|
32-bit. This is an experimental API and it may change.
|
||
|
(default: False)
|
||
|
--fp16_full_eval [FP16_FULL_EVAL]
|
||
|
Whether to use full float16 evaluation instead of
|
||
|
32-bit (default: False)
|
||
|
--tf32 TF32 Whether to enable tf32 mode, available in Ampere and
|
||
|
newer GPU architectures. This is an experimental API
|
||
|
and it may change. (default: None)
|
||
|
--local_rank LOCAL_RANK
|
||
|
For distributed training: local_rank (default: -1)
|
||
|
--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}
|
||
|
The backend to be used for distributed training
|
||
|
(default: None)
|
||
|
--tpu_num_cores TPU_NUM_CORES
|
||
|
TPU: Number of TPU cores (automatically passed by
|
||
|
launcher script) (default: None)
|
||
|
--tpu_metrics_debug [TPU_METRICS_DEBUG]
|
||
|
Deprecated, the use of `--debug tpu_metrics_debug` is
|
||
|
preferred. TPU: Whether to print debug metrics
|
||
|
(default: False)
|
||
|
--debug DEBUG [DEBUG ...]
|
||
|
Whether or not to enable debug mode. Current options:
|
||
|
`underflow_overflow` (Detect underflow and overflow in
|
||
|
activations and weights), `tpu_metrics_debug` (print
|
||
|
debug metrics on TPU). (default: None)
|
||
|
--dataloader_drop_last [DATALOADER_DROP_LAST]
|
||
|
Drop the last incomplete batch if it is not divisible
|
||
|
by the batch size. (default: False)
|
||
|
--eval_steps EVAL_STEPS
|
||
|
Run an evaluation every X steps. Should be an integer
|
||
|
or a float in range `[0,1)`. If smaller than 1, will
|
||
|
be interpreted as ratio of total training steps.
|
||
|
(default: None)
|
||
|
--dataloader_num_workers DATALOADER_NUM_WORKERS
|
||
|
Number of subprocesses to use for data loading
|
||
|
(PyTorch only). 0 means that the data will be loaded
|
||
|
in the main process. (default: 0)
|
||
|
--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR
|
||
|
Number of batches loaded in advance by each worker. 2
|
||
|
means there will be a total of 2 * num_workers batches
|
||
|
prefetched across all workers. Default is 2 for
|
||
|
PyTorch < 2.0.0 and otherwise None. (default: None)
|
||
|
--past_index PAST_INDEX
|
||
|
If >=0, uses the corresponding part of the output as
|
||
|
the past state for next step. (default: -1)
|
||
|
--run_name RUN_NAME An optional descriptor for the run. Notably used for
|
||
|
wandb, mlflow and comet logging. (default: None)
|
||
|
--disable_tqdm DISABLE_TQDM
|
||
|
Whether or not to disable the tqdm progress bars.
|
||
|
(default: None)
|
||
|
--remove_unused_columns [REMOVE_UNUSED_COLUMNS]
|
||
|
Remove columns not required by the model when using an
|
||
|
nlp.Dataset. (default: True)
|
||
|
--no_remove_unused_columns
|
||
|
Remove columns not required by the model when using an
|
||
|
nlp.Dataset. (default: False)
|
||
|
--label_names LABEL_NAMES [LABEL_NAMES ...]
|
||
|
The list of keys in your dictionary of inputs that
|
||
|
correspond to the labels. (default: None)
|
||
|
--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]
|
||
|
Whether or not to load the best model found during
|
||
|
training at the end of training. When this option is
|
||
|
enabled, the best checkpoint will always be saved. See
|
||
|
`save_total_limit` for more. (default: False)
|
||
|
--metric_for_best_model METRIC_FOR_BEST_MODEL
|
||
|
The metric to use to compare two different models.
|
||
|
(default: None)
|
||
|
--greater_is_better GREATER_IS_BETTER
|
||
|
Whether the `metric_for_best_model` should be
|
||
|
maximized or not. (default: None)
|
||
|
--ignore_data_skip [IGNORE_DATA_SKIP]
|
||
|
When resuming training, whether or not to skip the
|
||
|
first epochs and batches to get to the same training
|
||
|
data. (default: False)
|
||
|
--fsdp FSDP Whether or not to use PyTorch Fully Sharded Data
|
||
|
Parallel (FSDP) training (in distributed training
|
||
|
only). The base option should be `full_shard`,
|
||
|
`shard_grad_op` or `no_shard` and you can add CPU-
|
||
|
offload to `full_shard` or `shard_grad_op` like this:
|
||
|
full_shard offload` or `shard_grad_op offload`. You
|
||
|
can add auto-wrap to `full_shard` or `shard_grad_op`
|
||
|
with the same syntax: full_shard auto_wrap` or
|
||
|
`shard_grad_op auto_wrap`. (default: )
|
||
|
--fsdp_min_num_params FSDP_MIN_NUM_PARAMS
|
||
|
This parameter is deprecated. FSDP's minimum number of
|
||
|
parameters for Default Auto Wrapping. (useful only
|
||
|
when `fsdp` field is passed). (default: 0)
|
||
|
--fsdp_config FSDP_CONFIG
|
||
|
Config to be used with FSDP (Pytorch Fully Sharded
|
||
|
Data Parallel). The value is either a fsdp json config
|
||
|
file (e.g., `fsdp_config.json`) or an already loaded
|
||
|
json file as `dict`. (default: None)
|
||
|
--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP
|
||
|
This parameter is deprecated. Transformer layer class
|
||
|
name (case-sensitive) to wrap, e.g, `BertLayer`,
|
||
|
`GPTJBlock`, `T5Block` .... (useful only when `fsdp`
|
||
|
flag is passed). (default: None)
|
||
|
--accelerator_config ACCELERATOR_CONFIG
|
||
|
Config to be used with the internal Accelerator object
|
||
|
initializtion. The value is either a accelerator json
|
||
|
config file (e.g., `accelerator_config.json`) or an
|
||
|
already loaded json file as `dict`. (default: None)
|
||
|
--deepspeed DEEPSPEED
|
||
|
Enable deepspeed and pass the path to deepspeed json
|
||
|
config file (e.g. `ds_config.json`) or an already
|
||
|
loaded json file as a dict (default: None)
|
||
|
--label_smoothing_factor LABEL_SMOOTHING_FACTOR
|
||
|
The label smoothing epsilon to apply (zero means no
|
||
|
label smoothing). (default: 0.0)
|
||
|
--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}
|
||
|
The optimizer to use. (default: adamw_torch)
|
||
|
--optim_args OPTIM_ARGS
|
||
|
Optional arguments to supply to optimizer. (default:
|
||
|
None)
|
||
|
--adafactor [ADAFACTOR]
|
||
|
Whether or not to replace AdamW by Adafactor.
|
||
|
(default: False)
|
||
|
--group_by_length [GROUP_BY_LENGTH]
|
||
|
Whether or not to group samples of roughly the same
|
||
|
length together when batching. (default: False)
|
||
|
--length_column_name LENGTH_COLUMN_NAME
|
||
|
Column name with precomputed lengths to use when
|
||
|
grouping by length. (default: length)
|
||
|
--report_to REPORT_TO
|
||
|
The list of integrations to report the results and
|
||
|
logs to. (default: None)
|
||
|
--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS
|
||
|
When using distributed training, the value of the flag
|
||
|
`find_unused_parameters` passed to
|
||
|
`DistributedDataParallel`. (default: None)
|
||
|
--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB
|
||
|
When using distributed training, the value of the flag
|
||
|
`bucket_cap_mb` passed to `DistributedDataParallel`.
|
||
|
(default: None)
|
||
|
--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS
|
||
|
When using distributed training, the value of the flag
|
||
|
`broadcast_buffers` passed to
|
||
|
`DistributedDataParallel`. (default: None)
|
||
|
--dataloader_pin_memory [DATALOADER_PIN_MEMORY]
|
||
|
Whether or not to pin memory for DataLoader. (default:
|
||
|
True)
|
||
|
--no_dataloader_pin_memory
|
||
|
Whether or not to pin memory for DataLoader. (default:
|
||
|
False)
|
||
|
--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]
|
||
|
If True, the data loader will not shut down the worker
|
||
|
processes after a dataset has been consumed once. This
|
||
|
allows to maintain the workers Dataset instances
|
||
|
alive. Can potentially speed up training, but will
|
||
|
increase RAM usage. (default: False)
|
||
|
--skip_memory_metrics [SKIP_MEMORY_METRICS]
|
||
|
Whether or not to skip adding of memory profiler
|
||
|
reports to metrics. (default: True)
|
||
|
--no_skip_memory_metrics
|
||
|
Whether or not to skip adding of memory profiler
|
||
|
reports to metrics. (default: False)
|
||
|
--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]
|
||
|
Whether or not to use the legacy prediction_loop in
|
||
|
the Trainer. (default: False)
|
||
|
--push_to_hub [PUSH_TO_HUB]
|
||
|
Whether or not to upload the trained model to the
|
||
|
model hub after training. (default: False)
|
||
|
--resume_from_checkpoint RESUME_FROM_CHECKPOINT
|
||
|
The path to a folder with a valid checkpoint for your
|
||
|
model. (default: None)
|
||
|
--hub_model_id HUB_MODEL_ID
|
||
|
The name of the repository to keep in sync with the
|
||
|
local `output_dir`. (default: None)
|
||
|
--hub_strategy {end,every_save,checkpoint,all_checkpoints}
|
||
|
The hub strategy to use when `--push_to_hub` is
|
||
|
activated. (default: every_save)
|
||
|
--hub_token HUB_TOKEN
|
||
|
The token to use to push to the Model Hub. (default:
|
||
|
None)
|
||
|
--hub_private_repo [HUB_PRIVATE_REPO]
|
||
|
Whether the model repository is private or not.
|
||
|
(default: False)
|
||
|
--hub_always_push [HUB_ALWAYS_PUSH]
|
||
|
Unless `True`, the Trainer will skip pushes if the
|
||
|
previous one wasn't finished yet. (default: False)
|
||
|
--gradient_checkpointing [GRADIENT_CHECKPOINTING]
|
||
|
If True, use gradient checkpointing to save memory at
|
||
|
the expense of slower backward pass. (default: False)
|
||
|
--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS
|
||
|
Gradient checkpointing key word arguments such as
|
||
|
`use_reentrant`. Will be passed to
|
||
|
`torch.utils.checkpoint.checkpoint` through
|
||
|
`model.gradient_checkpointing_enable`. (default: None)
|
||
|
--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]
|
||
|
Whether or not the inputs will be passed to the
|
||
|
`compute_metrics` function. (default: False)
|
||
|
--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]
|
||
|
Whether to recursively concat
|
||
|
inputs/losses/labels/predictions across batches. If
|
||
|
`False`, will instead store them as lists, with each
|
||
|
batch kept separate. (default: True)
|
||
|
--no_eval_do_concat_batches
|
||
|
Whether to recursively concat
|
||
|
inputs/losses/labels/predictions across batches. If
|
||
|
`False`, will instead store them as lists, with each
|
||
|
batch kept separate. (default: False)
|
||
|
--fp16_backend {auto,apex,cpu_amp}
|
||
|
Deprecated. Use half_precision_backend instead
|
||
|
(default: auto)
|
||
|
--evaluation_strategy {no,steps,epoch}
|
||
|
Deprecated. Use `eval_strategy` instead (default:
|
||
|
None)
|
||
|
--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID
|
||
|
The name of the repository to which push the
|
||
|
`Trainer`. (default: None)
|
||
|
--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION
|
||
|
The name of the organization in with to which push the
|
||
|
`Trainer`. (default: None)
|
||
|
--push_to_hub_token PUSH_TO_HUB_TOKEN
|
||
|
The token to use to push to the Model Hub. (default:
|
||
|
None)
|
||
|
--mp_parameters MP_PARAMETERS
|
||
|
Used by the SageMaker launcher to send mp-specific
|
||
|
args. Ignored in Trainer (default: )
|
||
|
--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]
|
||
|
Whether to automatically decrease the batch size in
|
||
|
half and rerun the training loop again each time a
|
||
|
CUDA Out-of-Memory was reached (default: False)
|
||
|
--full_determinism [FULL_DETERMINISM]
|
||
|
Whether to call enable_full_determinism instead of
|
||
|
set_seed for reproducibility in distributed training.
|
||
|
Important: this will negatively impact the
|
||
|
performance, so only use it for debugging. (default:
|
||
|
False)
|
||
|
  --torchdynamo TORCHDYNAMO
                        This argument is deprecated, use `--torch_compile_backend` instead. (default: None)
  --ray_scope RAY_SCOPE
                        The scope to use when doing hyperparameter search with Ray. By default, `"last"` will be used. Ray will then use the last checkpoint of all trials, compare those, and select the best one. However, other options are also available. See the Ray documentation (https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial) for more options. (default: last)
  --ddp_timeout DDP_TIMEOUT
                        Overrides the default timeout for distributed training (value should be given in seconds). (default: 1800)
  --torch_compile [TORCH_COMPILE]
                        If set to `True`, the model will be wrapped in `torch.compile`. (default: False)
  --torch_compile_backend TORCH_COMPILE_BACKEND
                        Which backend to use with `torch.compile`; passing one will trigger a model compilation. (default: None)
  --torch_compile_mode TORCH_COMPILE_MODE
                        Which mode to use with `torch.compile`; passing one will trigger a model compilation. (default: None)
  --dispatch_batches DISPATCH_BATCHES
                        Deprecated. Pass {'dispatch_batches':VALUE} to `accelerator_config`. (default: None)
  --split_batches SPLIT_BATCHES
                        Deprecated. Pass {'split_batches':True} to `accelerator_config`. (default: None)
  --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]
                        If set to `True`, the speed metrics will include `tgs` (tokens per second per device). (default: False)
  --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]
                        If set to `True`, will track the number of input tokens seen throughout training. (May be slower in distributed training.) (default: False)
  --neftune_noise_alpha NEFTUNE_NOISE_ALPHA
                        Activates NEFTune noise embeddings for the model. NEFTune has been proven to drastically improve model performance for instruction fine-tuning. Check out the original paper (https://arxiv.org/abs/2310.05914) and the original code (https://github.com/neelsjain/NEFTune). Only supported for `PreTrainedModel` and `PeftModel` classes. (default: None)
  --optim_target_modules OPTIM_TARGET_MODULES
                        Target modules for the optimizer defined in the `optim` argument. Only used for the GaLore optimizer at the moment. (default: None)
  --batch_eval_metrics [BATCH_EVAL_METRICS]
                        Break eval metrics calculation into batches to save memory. (default: False)
  --eval_on_start [EVAL_ON_START]
                        Whether to run through the entire `evaluation` step at the very beginning of training as a sanity check. (default: False)
  --eval_use_gather_object [EVAL_USE_GATHER_OBJECT]
                        Whether to recursively gather objects in a nested list/tuple/dictionary of objects from all devices. (default: False)
  --sortish_sampler [SORTISH_SAMPLER]
                        Whether to use SortishSampler or not. (default: False)
  --predict_with_generate [PREDICT_WITH_GENERATE]
                        Whether to use generate to calculate generative metrics (ROUGE, BLEU). (default: False)
  --generation_max_length GENERATION_MAX_LENGTH
                        The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default to the `max_length` value of the model configuration. (default: None)
  --generation_num_beams GENERATION_NUM_BEAMS
                        The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default to the `num_beams` value of the model configuration. (default: None)
  --generation_config GENERATION_CONFIG
                        Model id, file path or url pointing to a GenerationConfig json file, to use during prediction. (default: None)
  --use_badam [USE_BADAM]
                        Whether or not to use the BAdam optimizer. (default: False)
  --badam_mode {layer,ratio}
                        Whether to use the layer-wise or the ratio-wise BAdam optimizer. (default: layer)
  --badam_start_block BADAM_START_BLOCK
                        The starting block index for layer-wise BAdam. (default: None)
  --badam_switch_mode {ascending,descending,random,fixed}
                        The strategy for picking the block to update in layer-wise BAdam. (default: ascending)
  --badam_switch_interval BADAM_SWITCH_INTERVAL
                        Number of steps between block updates for layer-wise BAdam. Use -1 to disable the block update. (default: 50)
  --badam_update_ratio BADAM_UPDATE_RATIO
                        The ratio of the update for ratio-wise BAdam. (default: 0.05)
  --badam_mask_mode {adjacent,scatter}
                        The mode of the mask for the BAdam optimizer. `adjacent` means that the trainable parameters are adjacent to each other; `scatter` means that trainable parameters are randomly chosen from the weight. (default: adjacent)
  --badam_verbose BADAM_VERBOSE
                        The verbosity level of the BAdam optimizer. 0 for no printing, 1 to print the block prefix, 2 to print the trainable parameters. (default: 0)
  --use_galore [USE_GALORE]
                        Whether or not to use gradient low-rank projection (GaLore). (default: False)
  --galore_target GALORE_TARGET
                        Name(s) of modules to apply GaLore to. Use commas to separate multiple modules. Use `all` to specify all the linear modules. (default: all)
  --galore_rank GALORE_RANK
                        The rank of GaLore gradients. (default: 16)
  --galore_update_interval GALORE_UPDATE_INTERVAL
                        Number of steps between updates of the GaLore projection. (default: 200)
  --galore_scale GALORE_SCALE
                        GaLore scaling coefficient. (default: 0.25)
  --galore_proj_type {std,reverse_std,right,left,full}
                        Type of GaLore projection. (default: std)
  --galore_layerwise [GALORE_LAYERWISE]
                        Whether or not to enable layer-wise updates to further save memory. (default: False)
  --pref_beta PREF_BETA
                        The beta parameter in the preference loss. (default: 0.1)
  --pref_ftx PREF_FTX   The supervised fine-tuning loss coefficient in DPO training. (default: 0.0)
  --pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}
                        The type of DPO loss to use. (default: sigmoid)
  --dpo_label_smoothing DPO_LABEL_SMOOTHING
                        The robust DPO label smoothing parameter in cDPO, which should be between 0 and 0.5. (default: 0.0)
  --kto_chosen_weight KTO_CHOSEN_WEIGHT
                        The weight factor of the desirable losses in KTO training. (default: 1.0)
  --kto_rejected_weight KTO_REJECTED_WEIGHT
                        The weight factor of the undesirable losses in KTO training. (default: 1.0)
  --simpo_gamma SIMPO_GAMMA
                        The target reward margin term in the SimPO loss. (default: 0.5)
  --ppo_buffer_size PPO_BUFFER_SIZE
                        The number of mini-batches to make an experience buffer from in a PPO optimization step. (default: 1)
  --ppo_epochs PPO_EPOCHS
                        The number of epochs to perform in a PPO optimization step. (default: 4)
  --ppo_score_norm [PPO_SCORE_NORM]
                        Use score normalization in PPO training. (default: False)
  --ppo_target PPO_TARGET
                        Target KL value for adaptive KL control in PPO training. (default: 6.0)
  --ppo_whiten_rewards [PPO_WHITEN_REWARDS]
                        Whiten the rewards before computing advantages in PPO training. (default: False)
  --ref_model REF_MODEL
                        Path to the reference model used for PPO or DPO training. (default: None)
  --ref_model_adapters REF_MODEL_ADAPTERS
                        Path to the adapters of the reference model. (default: None)
  --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT
                        The number of bits to quantize the reference model to. (default: None)
  --reward_model REWARD_MODEL
                        Path to the reward model used for PPO training. (default: None)
  --reward_model_adapters REWARD_MODEL_ADAPTERS
                        Path to the adapters of the reward model. (default: None)
  --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT
                        The number of bits to quantize the reward model to. (default: None)
  --reward_model_type {lora,full,api}
                        The type of the reward model in PPO training. Lora model only supports lora training. (default: lora)
  --additional_target ADDITIONAL_TARGET
                        Name(s) of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. Use commas to separate multiple modules. (default: None)
  --lora_alpha LORA_ALPHA
                        The scale factor for LoRA fine-tuning (default: lora_rank * 2). (default: None)
  --lora_dropout LORA_DROPOUT
                        Dropout rate for the LoRA fine-tuning. (default: 0.0)
  --lora_rank LORA_RANK
                        The intrinsic dimension for LoRA fine-tuning. (default: 8)
  --lora_target LORA_TARGET
                        Name(s) of target modules to apply LoRA. Use commas to separate multiple modules. Use `all` to specify all the linear modules. (default: all)
  --loraplus_lr_ratio LORAPLUS_LR_RATIO
                        LoRA plus learning rate ratio (lr_B / lr_A). (default: None)
  --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING
                        LoRA plus learning rate for lora embedding layers. (default: 1e-06)
  --use_rslora [USE_RSLORA]
                        Whether or not to use the rank stabilization scaling factor for LoRA layer. (default: False)
  --use_dora [USE_DORA]
                        Whether or not to use the weight-decomposed lora method (DoRA). (default: False)
  --pissa_init [PISSA_INIT]
                        Whether or not to initialize a PiSSA adapter. (default: False)
  --pissa_iter PISSA_ITER
                        The number of iteration steps performed by FSVD in PiSSA. Use -1 to disable it. (default: 16)
  --pissa_convert [PISSA_CONVERT]
                        Whether or not to convert the PiSSA adapter to a normal LoRA adapter. (default: False)
  --create_new_adapter [CREATE_NEW_ADAPTER]
                        Whether or not to create a new adapter with randomly initialized weights. (default: False)
  --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS
                        The number of trainable layers for freeze (partial-parameter) fine-tuning. Positive numbers mean the last n layers are set as trainable, negative numbers mean the first n layers are set as trainable. (default: 2)
  --freeze_trainable_modules FREEZE_TRAINABLE_MODULES
                        Name(s) of trainable modules for freeze (partial-parameter) fine-tuning. Use commas to separate multiple modules. Use `all` to specify all the available modules. (default: all)
  --freeze_extra_modules FREEZE_EXTRA_MODULES
                        Name(s) of modules apart from hidden layers to be set as trainable for freeze (partial-parameter) fine-tuning. Use commas to separate multiple modules. (default: None)
  --pure_bf16 [PURE_BF16]
                        Whether or not to train the model in pure bf16 precision (without AMP). (default: False)
  --stage {pt,sft,rm,ppo,dpo,kto}
                        Which stage will be performed in training. (default: sft)
  --finetuning_type {lora,freeze,full}
                        Which fine-tuning method to use. (default: lora)
  --use_llama_pro [USE_LLAMA_PRO]
                        Whether or not to make only the parameters in the expanded blocks trainable. (default: False)
  --use_adam_mini [USE_ADAM_MINI]
                        Whether or not to use the Adam-mini optimizer. (default: False)
  --freeze_vision_tower [FREEZE_VISION_TOWER]
                        Whether or not to freeze the vision tower in MLLM training. (default: True)
  --no_freeze_vision_tower
                        Whether or not to freeze the vision tower in MLLM training. (default: False)
  --train_mm_proj_only [TRAIN_MM_PROJ_ONLY]
                        Whether or not to train the multimodal projector for the MLLM only. (default: False)
  --compute_accuracy [COMPUTE_ACCURACY]
                        Whether or not to compute the token-level accuracy at evaluation. (default: False)
  --plot_loss [PLOT_LOSS]
                        Whether or not to save the training loss curves. (default: False)
  --do_sample [DO_SAMPLE]
                        Whether or not to use sampling; use greedy decoding otherwise. (default: True)
  --no_do_sample        Whether or not to use sampling; use greedy decoding otherwise. (default: False)
  --temperature TEMPERATURE
                        The value used to modulate the next token probabilities. (default: 0.95)
  --top_p TOP_P         Only the smallest set of most probable tokens with probabilities that add up to top_p or higher is kept. (default: 0.7)
  --top_k TOP_K         The number of highest probability vocabulary tokens to keep for top-k filtering. (default: 50)
  --num_beams NUM_BEAMS
                        Number of beams for beam search. 1 means no beam search. (default: 1)
  --max_length MAX_LENGTH
                        The maximum length the generated tokens can have. It can be overridden by max_new_tokens. (default: 1024)
  --max_new_tokens MAX_NEW_TOKENS
                        The maximum number of tokens to generate, ignoring the number of tokens in the prompt. (default: 1024)
  --repetition_penalty REPETITION_PENALTY
                        The parameter for repetition penalty. 1.0 means no penalty. (default: 1.0)
  --length_penalty LENGTH_PENALTY
                        Exponential penalty to the length that is used with beam-based generation. (default: 1.0)
  --default_system DEFAULT_SYSTEM
                        Default system message to use in chat completion. (default: None)
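
For quick reference, here is a minimal sketch of an invocation that combines several of the flags documented above. It assumes the standard `llamafactory-cli train` entry point (which dispatches to launcher.py); the model path, dataset name, template, and output directory are hypothetical placeholder values, not values taken from this log:

    llamafactory-cli train \
        --model_name_or_path meta-llama/Llama-2-7b-hf \
        --stage sft \
        --do_train \
        --finetuning_type lora \
        --lora_rank 8 \
        --lora_target all \
        --dataset alpaca_en \
        --template llama2 \
        --cutoff_len 1024 \
        --output_dir saves/llama2-7b/lora/sft \
        --per_device_train_batch_size 2 \
        --gradient_accumulation_steps 4 \
        --learning_rate 5e-5 \
        --num_train_epochs 3.0 \
        --bf16 \
        --plot_loss

Every flag above appears in the preceding help text; the values are merely illustrative choices for a LoRA SFT run.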
|
||
|
usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH
|
||
|
[--adapter_name_or_path ADAPTER_NAME_OR_PATH]
|
||
|
[--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR]
|
||
|
[--use_fast_tokenizer [USE_FAST_TOKENIZER]]
|
||
|
[--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]]
|
||
|
[--split_special_tokens [SPLIT_SPECIAL_TOKENS]]
|
||
|
[--new_special_tokens NEW_SPECIAL_TOKENS]
|
||
|
[--model_revision MODEL_REVISION]
|
||
|
[--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]]
|
||
|
[--no_low_cpu_mem_usage]
|
||
|
[--quantization_method {bitsandbytes,hqq,eetq}]
|
||
|
[--quantization_bit QUANTIZATION_BIT]
|
||
|
[--quantization_type {fp4,nf4}]
|
||
|
[--double_quantization [DOUBLE_QUANTIZATION]]
|
||
|
[--no_double_quantization]
|
||
|
[--quantization_device_map {auto}]
|
||
|
[--rope_scaling {linear,dynamic}]
|
||
|
[--flash_attn {auto,disabled,sdpa,fa2}]
|
||
|
[--shift_attn [SHIFT_ATTN]]
|
||
|
[--mixture_of_depths {convert,load}]
|
||
|
[--use_unsloth [USE_UNSLOTH]]
|
||
|
[--visual_inputs [VISUAL_INPUTS]]
|
||
|
[--moe_aux_loss_coef MOE_AUX_LOSS_COEF]
|
||
|
[--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]]
|
||
|
[--upcast_layernorm [UPCAST_LAYERNORM]]
|
||
|
[--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]]
|
||
|
[--train_from_scratch [TRAIN_FROM_SCRATCH]]
|
||
|
[--infer_backend {huggingface,vllm}]
|
||
|
[--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL]
|
||
|
[--vllm_enforce_eager [VLLM_ENFORCE_EAGER]]
|
||
|
[--vllm_max_lora_rank VLLM_MAX_LORA_RANK]
|
||
|
[--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]]
|
||
|
[--no_use_cache]
|
||
|
[--infer_dtype {auto,float16,bfloat16,float32}]
|
||
|
[--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN]
|
||
|
[--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE]
|
||
|
[--export_device {cpu,auto}]
|
||
|
[--export_quantization_bit EXPORT_QUANTIZATION_BIT]
|
||
|
[--export_quantization_dataset EXPORT_QUANTIZATION_DATASET]
|
||
|
[--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES]
|
||
|
[--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN]
|
||
|
[--export_legacy_format [EXPORT_LEGACY_FORMAT]]
|
||
|
[--export_hub_model_id EXPORT_HUB_MODEL_ID]
|
||
|
[--print_param_status [PRINT_PARAM_STATUS]]
|
||
|
[--template TEMPLATE] [--dataset DATASET]
|
||
|
[--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR]
|
||
|
[--cutoff_len CUTOFF_LEN]
|
||
|
[--train_on_prompt [TRAIN_ON_PROMPT]]
|
||
|
[--mask_history [MASK_HISTORY]] [--streaming [STREAMING]]
|
||
|
[--buffer_size BUFFER_SIZE]
|
||
|
[--mix_strategy {concat,interleave_under,interleave_over}]
|
||
|
[--interleave_probs INTERLEAVE_PROBS]
|
||
|
[--overwrite_cache [OVERWRITE_CACHE]]
|
||
|
[--preprocessing_num_workers PREPROCESSING_NUM_WORKERS]
|
||
|
[--max_samples MAX_SAMPLES]
|
||
|
[--eval_num_beams EVAL_NUM_BEAMS]
|
||
|
[--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]]
|
||
|
[--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE]
|
||
|
[--packing PACKING] [--neat_packing [NEAT_PACKING]]
|
||
|
[--tool_format TOOL_FORMAT]
|
||
|
[--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR
|
||
|
[--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]]
|
||
|
[--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]]
|
||
|
[--do_predict [DO_PREDICT]]
|
||
|
[--eval_strategy {no,steps,epoch}]
|
||
|
[--prediction_loss_only [PREDICTION_LOSS_ONLY]]
|
||
|
[--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE]
|
||
|
[--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE]
|
||
|
[--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE]
|
||
|
[--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE]
|
||
|
[--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
|
||
|
[--eval_accumulation_steps EVAL_ACCUMULATION_STEPS]
|
||
|
[--eval_delay EVAL_DELAY]
|
||
|
[--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS]
|
||
|
[--learning_rate LEARNING_RATE]
|
||
|
[--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1]
|
||
|
[--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON]
|
||
|
[--max_grad_norm MAX_GRAD_NORM]
|
||
|
[--num_train_epochs NUM_TRAIN_EPOCHS]
|
||
|
[--max_steps MAX_STEPS]
|
||
|
[--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}]
|
||
|
[--lr_scheduler_kwargs LR_SCHEDULER_KWARGS]
|
||
|
[--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS]
|
||
|
[--log_level {detail,debug,info,warning,error,critical,passive}]
|
||
|
[--log_level_replica {detail,debug,info,warning,error,critical,passive}]
|
||
|
[--log_on_each_node [LOG_ON_EACH_NODE]]
|
||
|
[--no_log_on_each_node] [--logging_dir LOGGING_DIR]
|
||
|
[--logging_strategy {no,steps,epoch}]
|
||
|
[--logging_first_step [LOGGING_FIRST_STEP]]
|
||
|
[--logging_steps LOGGING_STEPS]
|
||
|
[--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]]
|
||
|
[--no_logging_nan_inf_filter]
|
||
|
[--save_strategy {no,steps,epoch}]
|
||
|
[--save_steps SAVE_STEPS]
|
||
|
[--save_total_limit SAVE_TOTAL_LIMIT]
|
||
|
[--save_safetensors [SAVE_SAFETENSORS]]
|
||
|
[--no_save_safetensors]
|
||
|
[--save_on_each_node [SAVE_ON_EACH_NODE]]
|
||
|
[--save_only_model [SAVE_ONLY_MODEL]]
|
||
|
[--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]]
|
||
|
[--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]]
|
||
|
[--use_mps_device [USE_MPS_DEVICE]] [--seed SEED]
|
||
|
[--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]]
|
||
|
[--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]]
|
||
|
[--fp16_opt_level FP16_OPT_LEVEL]
|
||
|
[--half_precision_backend {auto,apex,cpu_amp}]
|
||
|
[--bf16_full_eval [BF16_FULL_EVAL]]
|
||
|
[--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32]
|
||
|
[--local_rank LOCAL_RANK]
|
||
|
[--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}]
|
||
|
[--tpu_num_cores TPU_NUM_CORES]
|
||
|
[--tpu_metrics_debug [TPU_METRICS_DEBUG]]
|
||
|
[--debug DEBUG [DEBUG ...]]
|
||
|
[--dataloader_drop_last [DATALOADER_DROP_LAST]]
|
||
|
[--eval_steps EVAL_STEPS]
|
||
|
[--dataloader_num_workers DATALOADER_NUM_WORKERS]
|
||
|
[--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR]
|
||
|
[--past_index PAST_INDEX] [--run_name RUN_NAME]
|
||
|
[--disable_tqdm DISABLE_TQDM]
|
||
|
[--remove_unused_columns [REMOVE_UNUSED_COLUMNS]]
|
||
|
[--no_remove_unused_columns]
|
||
|
[--label_names LABEL_NAMES [LABEL_NAMES ...]]
|
||
|
[--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]]
|
||
|
[--metric_for_best_model METRIC_FOR_BEST_MODEL]
|
||
|
[--greater_is_better GREATER_IS_BETTER]
|
||
|
[--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP]
|
||
|
[--fsdp_min_num_params FSDP_MIN_NUM_PARAMS]
|
||
|
[--fsdp_config FSDP_CONFIG]
|
||
|
[--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP]
|
||
|
[--accelerator_config ACCELERATOR_CONFIG]
|
||
|
[--deepspeed DEEPSPEED]
|
||
|
[--label_smoothing_factor LABEL_SMOOTHING_FACTOR]
|
||
|
[--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}]
|
||
|
[--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]]
|
||
|
[--group_by_length [GROUP_BY_LENGTH]]
|
||
|
[--length_column_name LENGTH_COLUMN_NAME]
|
||
|
[--report_to REPORT_TO]
|
||
|
[--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS]
|
||
|
[--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB]
|
||
|
[--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS]
|
||
|
[--dataloader_pin_memory [DATALOADER_PIN_MEMORY]]
|
||
|
[--no_dataloader_pin_memory]
|
||
|
[--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]]
|
||
|
[--skip_memory_metrics [SKIP_MEMORY_METRICS]]
|
||
|
[--no_skip_memory_metrics]
|
||
|
[--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]]
|
||
|
[--push_to_hub [PUSH_TO_HUB]]
|
||
|
[--resume_from_checkpoint RESUME_FROM_CHECKPOINT]
|
||
|
[--hub_model_id HUB_MODEL_ID]
|
||
|
[--hub_strategy {end,every_save,checkpoint,all_checkpoints}]
|
||
|
[--hub_token HUB_TOKEN]
|
||
|
[--hub_private_repo [HUB_PRIVATE_REPO]]
|
||
|
[--hub_always_push [HUB_ALWAYS_PUSH]]
|
||
|
[--gradient_checkpointing [GRADIENT_CHECKPOINTING]]
|
||
|
[--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS]
|
||
|
[--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]]
|
||
|
[--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]]
|
||
|
[--no_eval_do_concat_batches]
|
||
|
[--fp16_backend {auto,apex,cpu_amp}]
|
||
|
[--evaluation_strategy {no,steps,epoch}]
|
||
|
[--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID]
|
||
|
[--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION]
|
||
|
[--push_to_hub_token PUSH_TO_HUB_TOKEN]
|
||
|
[--mp_parameters MP_PARAMETERS]
|
||
|
[--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]]
|
||
|
[--full_determinism [FULL_DETERMINISM]]
|
||
|
[--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE]
|
||
|
[--ddp_timeout DDP_TIMEOUT]
|
||
|
[--torch_compile [TORCH_COMPILE]]
|
||
|
[--torch_compile_backend TORCH_COMPILE_BACKEND]
|
||
|
[--torch_compile_mode TORCH_COMPILE_MODE]
|
||
|
[--dispatch_batches DISPATCH_BATCHES]
|
||
|
[--split_batches SPLIT_BATCHES]
|
||
|
[--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]]
|
||
|
[--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]]
|
||
|
[--neftune_noise_alpha NEFTUNE_NOISE_ALPHA]
|
||
|
[--optim_target_modules OPTIM_TARGET_MODULES]
|
||
|
[--batch_eval_metrics [BATCH_EVAL_METRICS]]
|
||
|
[--eval_on_start [EVAL_ON_START]]
|
||
|
[--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]]
|
||
|
[--sortish_sampler [SORTISH_SAMPLER]]
|
||
|
[--predict_with_generate [PREDICT_WITH_GENERATE]]
|
||
|
[--generation_max_length GENERATION_MAX_LENGTH]
|
||
|
[--generation_num_beams GENERATION_NUM_BEAMS]
|
||
|
[--generation_config GENERATION_CONFIG]
|
||
|
[--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}]
|
||
|
[--badam_start_block BADAM_START_BLOCK]
|
||
|
[--badam_switch_mode {ascending,descending,random,fixed}]
|
||
|
[--badam_switch_interval BADAM_SWITCH_INTERVAL]
|
||
|
[--badam_update_ratio BADAM_UPDATE_RATIO]
|
||
|
[--badam_mask_mode {adjacent,scatter}]
|
||
|
[--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]]
|
||
|
[--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK]
|
||
|
[--galore_update_interval GALORE_UPDATE_INTERVAL]
|
||
|
[--galore_scale GALORE_SCALE]
|
||
|
[--galore_proj_type {std,reverse_std,right,left,full}]
|
||
|
[--galore_layerwise [GALORE_LAYERWISE]]
|
||
|
[--pref_beta PREF_BETA] [--pref_ftx PREF_FTX]
|
||
|
[--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}]
|
||
|
[--dpo_label_smoothing DPO_LABEL_SMOOTHING]
|
||
|
[--kto_chosen_weight KTO_CHOSEN_WEIGHT]
|
||
|
[--kto_rejected_weight KTO_REJECTED_WEIGHT]
|
||
|
[--simpo_gamma SIMPO_GAMMA]
|
||
|
[--ppo_buffer_size PPO_BUFFER_SIZE]
|
||
|
[--ppo_epochs PPO_EPOCHS]
|
||
|
[--ppo_score_norm [PPO_SCORE_NORM]]
|
||
|
[--ppo_target PPO_TARGET]
|
||
|
[--ppo_whiten_rewards [PPO_WHITEN_REWARDS]]
|
||
|
[--ref_model REF_MODEL]
|
||
|
[--ref_model_adapters REF_MODEL_ADAPTERS]
|
||
|
[--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT]
|
||
|
[--reward_model REWARD_MODEL]
|
||
|
[--reward_model_adapters REWARD_MODEL_ADAPTERS]
|
||
|
[--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT]
|
||
|
[--reward_model_type {lora,full,api}]
|
||
|
[--additional_target ADDITIONAL_TARGET]
|
||
|
[--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT]
|
||
|
[--lora_rank LORA_RANK] [--lora_target LORA_TARGET]
|
||
|
[--loraplus_lr_ratio LORAPLUS_LR_RATIO]
|
||
|
[--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING]
|
||
|
[--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]]
|
||
|
[--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER]
|
||
|
[--pissa_convert [PISSA_CONVERT]]
|
||
|
[--create_new_adapter [CREATE_NEW_ADAPTER]]
|
||
|
[--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS]
|
||
|
[--freeze_trainable_modules FREEZE_TRAINABLE_MODULES]
|
||
|
[--freeze_extra_modules FREEZE_EXTRA_MODULES]
|
||
|
[--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}]
|
||
|
[--finetuning_type {lora,freeze,full}]
|
||
|
[--use_llama_pro [USE_LLAMA_PRO]]
|
||
|
[--use_adam_mini [USE_ADAM_MINI]]
|
||
|
[--freeze_vision_tower [FREEZE_VISION_TOWER]]
|
||
|
[--no_freeze_vision_tower]
|
||
|
[--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]]
|
||
|
[--compute_accuracy [COMPUTE_ACCURACY]]
|
||
|
[--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]]
|
||
|
[--no_do_sample] [--temperature TEMPERATURE]
|
||
|
[--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS]
|
||
|
[--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS]
|
||
|
[--repetition_penalty REPETITION_PENALTY]
|
||
|
[--length_penalty LENGTH_PENALTY]
|
||
|
[--default_system DEFAULT_SYSTEM]
|
||
|
|
||
|
optional arguments:
|
||
|
-h, --help show this help message and exit
|
||
|
--model_name_or_path MODEL_NAME_OR_PATH
|
||
|
Path to the model weight or identifier from
|
||
|
huggingface.co/models or modelscope.cn/models.
|
||
|
(default: None)
|
||
|
--adapter_name_or_path ADAPTER_NAME_OR_PATH
|
||
|
Path to the adapter weight or identifier from
|
||
|
huggingface.co/models. Use commas to separate multiple
|
||
|
adapters. (default: None)
|
||
|
--adapter_folder ADAPTER_FOLDER
|
||
|
The folder containing the adapter weights to load.
|
||
|
(default: None)
|
||
|
--cache_dir CACHE_DIR
|
||
|
Where to store the pre-trained models downloaded from
|
||
|
huggingface.co or modelscope.cn. (default: None)
|
||
|
--use_fast_tokenizer [USE_FAST_TOKENIZER]
|
||
|
Whether or not to use one of the fast tokenizer
|
||
|
(backed by the tokenizers library). (default: True)
|
||
|
--no_use_fast_tokenizer
|
||
|
Whether or not to use one of the fast tokenizer
|
||
|
(backed by the tokenizers library). (default: False)
|
||
|
--resize_vocab [RESIZE_VOCAB]
|
||
|
Whether or not to resize the tokenizer vocab and the
|
||
|
embedding layers. (default: False)
|
||
|
--split_special_tokens [SPLIT_SPECIAL_TOKENS]
|
||
|
Whether or not the special tokens should be split
|
||
|
during the tokenization process. (default: False)
|
||
|
--new_special_tokens NEW_SPECIAL_TOKENS
|
||
|
Special tokens to be added into the tokenizer. Use
|
||
|
commas to separate multiple tokens. (default: None)
|
||
|
--model_revision MODEL_REVISION
|
||
|
The specific model version to use (can be a branch
|
||
|
name, tag name or commit id). (default: main)
|
||
|
--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]
|
||
|
Whether or not to use memory-efficient model loading.
|
||
|
(default: True)
|
||
|
--no_low_cpu_mem_usage
|
||
|
Whether or not to use memory-efficient model loading.
|
||
|
(default: False)
|
||
|
--quantization_method {bitsandbytes,hqq,eetq}
|
||
|
Quantization method to use for on-the-fly
|
||
|
quantization. (default: bitsandbytes)
|
||
|
--quantization_bit QUANTIZATION_BIT
|
||
|
The number of bits to quantize the model using
|
||
|
bitsandbytes. (default: None)
|
||
|
--quantization_type {fp4,nf4}
|
||
|
Quantization data type to use in int4 training.
|
||
|
(default: nf4)
|
||
|
--double_quantization [DOUBLE_QUANTIZATION]
|
||
|
Whether or not to use double quantization in int4
|
||
|
training. (default: True)
|
||
|
--no_double_quantization
|
||
|
Whether or not to use double quantization in int4
|
||
|
training. (default: False)
|
||
|
--quantization_device_map {auto}
|
||
|
Device map used to infer the 4-bit quantized model,
|
||
|
needs bitsandbytes>=0.43.0. (default: None)
|
||
|
--rope_scaling {linear,dynamic}
|
||
|
Which scaling strategy should be adopted for the RoPE
|
||
|
embeddings. (default: None)
|
||
|
--flash_attn {auto,disabled,sdpa,fa2}
|
||
|
Enable FlashAttention for faster training and
|
||
|
inference. (default: auto)
|
||
|
--shift_attn [SHIFT_ATTN]
|
||
|
Enable shift short attention (S^2-Attn) proposed by
|
||
|
LongLoRA. (default: False)
|
||
|
--mixture_of_depths {convert,load}
|
||
|
Convert the model to mixture-of-depths (MoD) or load
|
||
|
the MoD model. (default: None)
|
||
|
--use_unsloth [USE_UNSLOTH]
|
||
|
Whether or not to use unsloth's optimization for the
|
||
|
LoRA training. (default: False)
|
||
|
--visual_inputs [VISUAL_INPUTS]
|
||
|
Whethor or not to use multimodal LLM that accepts
|
||
|
visual inputs. (default: False)
|
||
|
--moe_aux_loss_coef MOE_AUX_LOSS_COEF
|
||
|
Coefficient of the auxiliary router loss in mixture-
|
||
|
of-experts model. (default: None)
|
||
|
--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]
|
||
|
Whether or not to disable gradient checkpointing.
|
||
|
(default: False)
|
||
|
--upcast_layernorm [UPCAST_LAYERNORM]
|
||
|
Whether or not to upcast the layernorm weights in
|
||
|
fp32. (default: False)
|
||
|
--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]
|
||
|
Whether or not to upcast the output of lm_head in
|
||
|
fp32. (default: False)
|
||
|
--train_from_scratch [TRAIN_FROM_SCRATCH]
|
||
|
Whether or not to randomly initialize the model
|
||
|
weights. (default: False)
|
||
|
--infer_backend {huggingface,vllm}
|
||
|
Backend engine used at inference. (default:
|
||
|
huggingface)
|
||
|
--vllm_maxlen VLLM_MAXLEN
|
||
|
Maximum sequence (prompt + response) length of the
|
||
|
vLLM engine. (default: 2048)
|
||
|
--vllm_gpu_util VLLM_GPU_UTIL
|
||
|
The fraction of GPU memory in (0,1) to be used for the
|
||
|
vLLM engine. (default: 0.9)
|
||
|
--vllm_enforce_eager [VLLM_ENFORCE_EAGER]
|
||
|
Whether or not to disable CUDA graph in the vLLM
|
||
|
engine. (default: False)
|
||
|
--vllm_max_lora_rank VLLM_MAX_LORA_RANK
|
||
|
Maximum rank of all LoRAs in the vLLM engine.
|
||
|
(default: 32)
|
||
|
--offload_folder OFFLOAD_FOLDER
|
||
|
Path to offload model weights. (default: offload)
|
||
|
--use_cache [USE_CACHE]
|
||
|
Whether or not to use KV cache in generation.
|
||
|
(default: True)
|
||
|
--no_use_cache Whether or not to use KV cache in generation.
|
||
|
(default: False)
|
||
|
--infer_dtype {auto,float16,bfloat16,float32}
|
||
|
Data type for model weights and activations at
|
||
|
inference. (default: auto)
|
||
|
--hf_hub_token HF_HUB_TOKEN
|
||
|
Auth token to log in with Hugging Face Hub. (default:
|
||
|
None)
|
||
|
--ms_hub_token MS_HUB_TOKEN
|
||
|
Auth token to log in with ModelScope Hub. (default:
|
||
|
None)
|
||
|
--export_dir EXPORT_DIR
|
||
|
Path to the directory to save the exported model.
|
||
|
(default: None)
|
||
|
--export_size EXPORT_SIZE
|
||
|
The file shard size (in GB) of the exported model.
|
||
|
(default: 1)
|
||
|
--export_device {cpu,auto}
|
||
|
The device used in model export, use `auto` to
|
||
|
accelerate exporting. (default: cpu)
|
||
|
--export_quantization_bit EXPORT_QUANTIZATION_BIT
|
||
|
The number of bits to quantize the exported model.
|
||
|
(default: None)
|
||
|
--export_quantization_dataset EXPORT_QUANTIZATION_DATASET
|
||
|
Path to the dataset or dataset name to use in
|
||
|
quantizing the exported model. (default: None)
|
||
|
--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES
|
||
|
The number of samples used for quantization. (default:
|
||
|
128)
|
||
|
--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN
|
||
|
The maximum length of the model inputs used for
|
||
|
quantization. (default: 1024)
|
||
|
--export_legacy_format [EXPORT_LEGACY_FORMAT]
|
||
|
Whether or not to save the `.bin` files instead of
|
||
|
`.safetensors`. (default: False)
|
||
|
--export_hub_model_id EXPORT_HUB_MODEL_ID
|
||
|
The name of the repository if push the model to the
|
||
|
Hugging Face hub. (default: None)
|
||
|
--print_param_status [PRINT_PARAM_STATUS]
|
||
|
For debugging purposes, print the status of the
|
||
|
parameters in the model. (default: False)
|
||
|
--template TEMPLATE Which template to use for constructing prompts in
|
||
|
training and inference. (default: None)
|
||
|
--dataset DATASET The name of dataset(s) to use for training. Use commas
|
||
|
to separate multiple datasets. (default: None)
|
||
|
--eval_dataset EVAL_DATASET
|
||
|
The name of dataset(s) to use for evaluation. Use
|
||
|
commas to separate multiple datasets. (default: None)
|
||
|
--dataset_dir DATASET_DIR
|
||
|
Path to the folder containing the datasets. (default:
|
||
|
data)
|
||
|
--cutoff_len CUTOFF_LEN
|
||
|
The cutoff length of the tokenized inputs in the
|
||
|
dataset. (default: 1024)
|
||
|
--train_on_prompt [TRAIN_ON_PROMPT]
|
||
|
Whether or not to disable the mask on the prompt.
|
||
|
(default: False)
|
||
|
--mask_history [MASK_HISTORY]
|
||
|
Whether or not to mask the history and train on the
|
||
|
last turn only. (default: False)
|
||
|
--streaming [STREAMING]
|
||
|
Enable dataset streaming. (default: False)
|
||
|
--buffer_size BUFFER_SIZE
|
||
|
Size of the buffer to randomly sample examples from in
|
||
|
dataset streaming. (default: 16384)
|
||
|
--mix_strategy {concat,interleave_under,interleave_over}
|
||
|
Strategy to use in dataset mixing (concat/interleave)
|
||
|
(undersampling/oversampling). (default: concat)
|
||
|
--interleave_probs INTERLEAVE_PROBS
|
||
|
Probabilities to sample data from datasets. Use commas
|
||
|
to separate multiple datasets. (default: None)
|
||
|
--overwrite_cache [OVERWRITE_CACHE]
|
||
|
Overwrite the cached training and evaluation sets.
|
||
|
(default: False)
|
||
|
--preprocessing_num_workers PREPROCESSING_NUM_WORKERS
|
||
|
The number of processes to use for the pre-processing.
|
||
|
(default: None)
|
||
|
--max_samples MAX_SAMPLES
|
||
|
For debugging purposes, truncate the number of
|
||
|
examples for each dataset. (default: None)
|
||
|
--eval_num_beams EVAL_NUM_BEAMS
|
||
|
Number of beams to use for evaluation. This argument
|
||
|
will be passed to `model.generate` (default: None)
|
||
|
--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]
|
||
|
Whether or not to ignore the tokens corresponding to
|
||
|
the pad label in loss computation. (default: True)
|
||
|
--no_ignore_pad_token_for_loss
|
||
|
Whether or not to ignore the tokens corresponding to
|
||
|
the pad label in loss computation. (default: False)
|
||
|
--val_size VAL_SIZE Size of the development set, should be an integer or a
|
||
|
float in range `[0,1)`. (default: 0.0)
|
||
|
--packing PACKING Enable sequences packing in training. Will
|
||
|
automatically enable in pre-training. (default: None)
|
||
|
--neat_packing [NEAT_PACKING]
|
||
|
Enable sequence packing without cross-attention.
|
||
|
(default: False)
|
||
|
--tool_format TOOL_FORMAT
|
||
|
Tool format to use for constructing function calling
|
||
|
examples. (default: None)
|
||
|
--tokenized_path TOKENIZED_PATH
|
||
|
Path to save or load the tokenized datasets. (default:
|
||
|
None)
|
||
|
--output_dir OUTPUT_DIR
|
||
|
The output directory where the model predictions and
|
||
|
checkpoints will be written. (default: None)
|
||
|
--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]
|
||
|
Overwrite the content of the output directory. Use
|
||
|
this to continue training if output_dir points to a
|
||
|
checkpoint directory. (default: False)
|
||
|
--do_train [DO_TRAIN]
|
||
|
Whether to run training. (default: False)
|
||
|
--do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False)
|
||
|
--do_predict [DO_PREDICT]
|
||
|
Whether to run predictions on the test set. (default:
|
||
|
False)
|
||
|
--eval_strategy {no,steps,epoch}
|
||
|
The evaluation strategy to use. (default: no)
|
||
|
--prediction_loss_only [PREDICTION_LOSS_ONLY]
|
||
|
When performing evaluation and predictions, only
|
||
|
returns the loss. (default: False)
|
||
|
--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE
|
||
|
Batch size per GPU/TPU/MPS/NPU core/CPU for training.
|
||
|
(default: 8)
|
||
|
--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE
|
||
|
Batch size per GPU/TPU/MPS/NPU core/CPU for
|
||
|
evaluation. (default: 8)
|
||
|
--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE
|
||
|
Deprecated, the use of `--per_device_train_batch_size`
|
||
|
is preferred. Batch size per GPU/TPU core/CPU for
|
||
|
training. (default: None)
|
||
|
--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE
|
||
|
Deprecated, the use of `--per_device_eval_batch_size`
|
||
|
is preferred. Batch size per GPU/TPU core/CPU for
|
||
|
evaluation. (default: None)
|
||
|
--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS
|
||
|
Number of updates steps to accumulate before
|
||
|
performing a backward/update pass. (default: 1)
|
||
|
--eval_accumulation_steps EVAL_ACCUMULATION_STEPS
|
||
|
Number of predictions steps to accumulate before
|
||
|
moving the tensors to the CPU. (default: None)
|
||
|
--eval_delay EVAL_DELAY
|
||
|
Number of epochs or steps to wait for before the first
|
||
|
evaluation can be performed, depending on the
|
||
|
eval_strategy. (default: 0)
|
||
|
--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS
|
||
|
Number of steps to wait before calling
|
||
|
`torch.<device>.empty_cache()`.This can help avoid
|
||
|
CUDA out-of-memory errors by lowering peak VRAM usage
|
||
|
at a cost of about [10{'option_strings': ['--
|
||
|
torch_empty_cache_steps'], 'dest':
|
||
|
'torch_empty_cache_steps', 'nargs': None, 'const':
|
||
|
None, 'default': None, 'type': 'int', 'choices': None,
|
||
|
'required': False, 'help': 'Number of steps to wait
|
||
|
before calling `torch.<device>.empty_cache()`.This can
|
||
|
help avoid CUDA out-of-memory errors by lowering peak
|
||
|
VRAM usage at a cost of about [10% slower performance]
|
||
|
(https://github.com/huggingface/transformers/issues/31
|
||
|
372).If left unset or set to None, cache will not be
|
||
|
emptied.', 'metavar': None, 'container':
|
||
|
<argparse._ArgumentGroup object at 0x7f051d170ee0>,
|
||
|
'prog': 'launcher.py'}lower performance](https://githu
|
||
|
b.com/huggingface/transformers/issues/31372).If left
|
||
|
unset or set to None, cache will not be emptied.
|
||
|
(default: None)
|
||
|
--learning_rate LEARNING_RATE
|
||
|
The initial learning rate for AdamW. (default: 5e-05)
|
||
|
--weight_decay WEIGHT_DECAY
|
||
|
Weight decay for AdamW if we apply some. (default:
|
||
|
0.0)
|
||
|
--adam_beta1 ADAM_BETA1
|
||
|
Beta1 for AdamW optimizer (default: 0.9)
|
||
|
--adam_beta2 ADAM_BETA2
|
||
|
Beta2 for AdamW optimizer (default: 0.999)
|
||
|
--adam_epsilon ADAM_EPSILON
|
||
|
Epsilon for AdamW optimizer. (default: 1e-08)
|
||
|
--max_grad_norm MAX_GRAD_NORM
|
||
|
Max gradient norm. (default: 1.0)
|
||
|
--num_train_epochs NUM_TRAIN_EPOCHS
|
||
|
Total number of training epochs to perform. (default:
|
||
|
3.0)
|
||
|
--max_steps MAX_STEPS
|
||
|
If > 0: set total number of training steps to perform.
|
||
|
Override num_train_epochs. (default: -1)
|
||
|
--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}
|
||
|
The scheduler type to use. (default: linear)
|
||
|
--lr_scheduler_kwargs LR_SCHEDULER_KWARGS
|
||
|
Extra parameters for the lr_scheduler such as
|
||
|
{'num_cycles': 1} for the cosine with hard restarts.
|
||
|
(default: {})
|
||
|
--warmup_ratio WARMUP_RATIO
|
||
|
Linear warmup over warmup_ratio fraction of total
|
||
|
steps. (default: 0.0)
|
||
|
--warmup_steps WARMUP_STEPS
|
||
|
Linear warmup over warmup_steps. (default: 0)
|
||
|
--log_level {detail,debug,info,warning,error,critical,passive}
|
||
|
Logger log level to use on the main node. Possible
|
||
|
choices are the log levels as strings: 'debug',
|
||
|
'info', 'warning', 'error' and 'critical', plus a
|
||
|
'passive' level which doesn't set anything and lets
|
||
|
the application set the level. Defaults to 'passive'.
|
||
|
(default: passive)
|
||
|
--log_level_replica {detail,debug,info,warning,error,critical,passive}
|
||
|
Logger log level to use on replica nodes. Same choices
|
||
|
and defaults as ``log_level`` (default: warning)
|
||
|
--log_on_each_node [LOG_ON_EACH_NODE]
|
||
|
When doing a multinode distributed training, whether
|
||
|
to log once per node or just once on the main node.
|
||
|
(default: True)
|
||
|
--no_log_on_each_node
|
||
|
When doing a multinode distributed training, whether
|
||
|
to log once per node or just once on the main node.
|
||
|
(default: False)
|
||
|
--logging_dir LOGGING_DIR
|
||
|
Tensorboard log dir. (default: None)
|
||
|
--logging_strategy {no,steps,epoch}
|
||
|
The logging strategy to use. (default: steps)
|
||
|
--logging_first_step [LOGGING_FIRST_STEP]
|
||
|
Log the first global_step (default: False)
|
||
|
--logging_steps LOGGING_STEPS
|
||
|
Log every X updates steps. Should be an integer or a
|
||
|
float in range `[0,1)`. If smaller than 1, will be
|
||
|
interpreted as ratio of total training steps.
|
||
|
(default: 500)
|
||
|
--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]
|
||
|
Filter nan and inf losses for logging. (default: True)
|
||
|
--no_logging_nan_inf_filter
|
||
|
Filter nan and inf losses for logging. (default:
|
||
|
False)
|
||
|
--save_strategy {no,steps,epoch}
|
||
|
The checkpoint save strategy to use. (default: steps)
|
||
|
--save_steps SAVE_STEPS
|
||
|
Save checkpoint every X updates steps. Should be an
|
||
|
integer or a float in range `[0,1)`. If smaller than
|
||
|
1, will be interpreted as ratio of total training
|
||
|
steps. (default: 500)
|
||
|
--save_total_limit SAVE_TOTAL_LIMIT
|
||
|
If a value is passed, will limit the total amount of
|
||
|
checkpoints. Deletes the older checkpoints in
|
||
|
`output_dir`. When `load_best_model_at_end` is
|
||
|
enabled, the 'best' checkpoint according to
|
||
|
`metric_for_best_model` will always be retained in
|
||
|
addition to the most recent ones. For example, for
|
||
|
`save_total_limit=5` and
|
||
|
`load_best_model_at_end=True`, the four last
|
||
|
checkpoints will always be retained alongside the best
|
||
|
model. When `save_total_limit=1` and
|
||
|
`load_best_model_at_end=True`, it is possible that two
|
||
|
checkpoints are saved: the last one and the best one
|
||
|
(if they are different). Default is unlimited
|
||
|
checkpoints (default: None)
|
||
|
--save_safetensors [SAVE_SAFETENSORS]
|
||
|
Use safetensors saving and loading for state dicts
|
||
|
instead of default torch.load and torch.save.
|
||
|
(default: True)
|
||
|
--no_save_safetensors
|
||
|
Use safetensors saving and loading for state dicts
|
||
|
instead of default torch.load and torch.save.
|
||
|
(default: False)
|
||
|
--save_on_each_node [SAVE_ON_EACH_NODE]
|
||
|
When doing multi-node distributed training, whether to
|
||
|
save models and checkpoints on each node, or only on
|
||
|
the main one (default: False)
|
||
|
--save_only_model [SAVE_ONLY_MODEL]
|
||
|
When checkpointing, whether to only save the model, or
|
||
|
also the optimizer, scheduler & rng state.Note that
|
||
|
when this is true, you won't be able to resume
|
||
|
training from checkpoint.This enables you to save
|
||
|
storage by not storing the optimizer, scheduler & rng
|
||
|
state.You can only load the model using
|
||
|
from_pretrained with this option set to True.
|
||
|
(default: False)
|
||
|
--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]
|
||
|
Whether to restore the callback states from the
|
||
|
checkpoint. If `True`, will override callbacks passed
|
||
|
to the `Trainer` if they exist in the checkpoint.
|
||
|
(default: False)
|
||
|
--no_cuda [NO_CUDA] This argument is deprecated. It will be removed in
|
||
|
version 5.0 of 🤗 Transformers. (default: False)
|
||
|
--use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will
|
||
|
use cuda/tpu/mps/npu device if available. (default:
|
||
|
False)
|
||
|
--use_mps_device [USE_MPS_DEVICE]
|
||
|
This argument is deprecated. `mps` device will be used
|
||
|
if available similar to `cuda` device. It will be
|
||
|
removed in version 5.0 of 🤗 Transformers (default:
|
||
|
False)
|
||
|
--seed SEED Random seed that will be set at the beginning of
|
||
|
training. (default: 42)
|
||
|
--data_seed DATA_SEED
|
||
|
Random seed to be used with data samplers. (default:
|
||
|
None)
|
||
|
--jit_mode_eval [JIT_MODE_EVAL]
|
||
|
Whether or not to use PyTorch jit trace for inference
|
||
|
(default: False)
|
||
|
--use_ipex [USE_IPEX]
|
||
|
Use Intel extension for PyTorch when it is available,
|
||
|
installation: 'https://github.com/intel/intel-
|
||
|
extension-for-pytorch' (default: False)
|
||
|
--bf16 [BF16] Whether to use bf16 (mixed) precision instead of
|
||
|
32-bit. Requires Ampere or higher NVIDIA architecture
|
||
|
or using CPU (use_cpu) or Ascend NPU. This is an
|
||
|
experimental API and it may change. (default: False)
|
||
|
--fp16 [FP16] Whether to use fp16 (mixed) precision instead of
|
||
|
32-bit (default: False)
|
||
|
--fp16_opt_level FP16_OPT_LEVEL
|
||
|
For fp16: Apex AMP optimization level selected in
|
||
|
['O0', 'O1', 'O2', and 'O3']. See details at
|
||
|
https://nvidia.github.io/apex/amp.html (default: O1)
|
||
|
--half_precision_backend {auto,apex,cpu_amp}
|
||
|
The backend to be used for half precision. (default:
|
||
|
auto)
|
||
|
--bf16_full_eval [BF16_FULL_EVAL]
|
||
|
Whether to use full bfloat16 evaluation instead of
|
||
|
32-bit. This is an experimental API and it may change.
|
||
|
(default: False)
|
||
|
--fp16_full_eval [FP16_FULL_EVAL]
|
||
|
Whether to use full float16 evaluation instead of
|
||
|
32-bit (default: False)
|
||
|
--tf32 TF32 Whether to enable tf32 mode, available in Ampere and
|
||
|
newer GPU architectures. This is an experimental API
|
||
|
and it may change. (default: None)
|
||
|
--local_rank LOCAL_RANK
|
||
|
For distributed training: local_rank (default: -1)
|
||
|
--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}
|
||
|
The backend to be used for distributed training
|
||
|
(default: None)
|
||
|
--tpu_num_cores TPU_NUM_CORES
|
||
|
TPU: Number of TPU cores (automatically passed by
|
||
|
launcher script) (default: None)
|
||
|
--tpu_metrics_debug [TPU_METRICS_DEBUG]
|
||
|
Deprecated, the use of `--debug tpu_metrics_debug` is
|
||
|
preferred. TPU: Whether to print debug metrics
|
||
|
(default: False)
|
||
|
--debug DEBUG [DEBUG ...]
|
||
|
Whether or not to enable debug mode. Current options:
|
||
|
`underflow_overflow` (Detect underflow and overflow in
|
||
|
activations and weights), `tpu_metrics_debug` (print
|
||
|
debug metrics on TPU). (default: None)
|
||
|
--dataloader_drop_last [DATALOADER_DROP_LAST]
|
||
|
Drop the last incomplete batch if it is not divisible
|
||
|
by the batch size. (default: False)
|
||
|
--eval_steps EVAL_STEPS
|
||
|
Run an evaluation every X steps. Should be an integer
|
||
|
or a float in range `[0,1)`. If smaller than 1, will
|
||
|
be interpreted as ratio of total training steps.
|
||
|
(default: None)
|
||
|
--dataloader_num_workers DATALOADER_NUM_WORKERS
|
||
|
Number of subprocesses to use for data loading
|
||
|
(PyTorch only). 0 means that the data will be loaded
|
||
|
in the main process. (default: 0)
|
||
|
--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR
|
||
|
Number of batches loaded in advance by each worker. 2
|
||
|
means there will be a total of 2 * num_workers batches
|
||
|
prefetched across all workers. Default is 2 for
|
||
|
PyTorch < 2.0.0 and otherwise None. (default: None)
|
||
|
--past_index PAST_INDEX
|
||
|
If >=0, uses the corresponding part of the output as
|
||
|
the past state for next step. (default: -1)
|
||
|
--run_name RUN_NAME An optional descriptor for the run. Notably used for
|
||
|
wandb, mlflow and comet logging. (default: None)
|
||
|
--disable_tqdm DISABLE_TQDM
|
||
|
Whether or not to disable the tqdm progress bars.
|
||
|
(default: None)
|
||
|
--remove_unused_columns [REMOVE_UNUSED_COLUMNS]
|
||
|
Remove columns not required by the model when using an
|
||
|
nlp.Dataset. (default: True)
|
||
|
--no_remove_unused_columns
|
||
|
Remove columns not required by the model when using an
|
||
|
nlp.Dataset. (default: False)
|
||
|
--label_names LABEL_NAMES [LABEL_NAMES ...]
|
||
|
The list of keys in your dictionary of inputs that
|
||
|
correspond to the labels. (default: None)
|
||
|
--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]
|
||
|
Whether or not to load the best model found during
|
||
|
training at the end of training. When this option is
|
||
|
enabled, the best checkpoint will always be saved. See
|
||
|
`save_total_limit` for more. (default: False)
|
||
|
--metric_for_best_model METRIC_FOR_BEST_MODEL
|
||
|
The metric to use to compare two different models.
|
||
|
(default: None)
|
||
|
--greater_is_better GREATER_IS_BETTER
|
||
|
Whether the `metric_for_best_model` should be
|
||
|
maximized or not. (default: None)
|
||
|
--ignore_data_skip [IGNORE_DATA_SKIP]
|
||
|
When resuming training, whether or not to skip the
|
||
|
first epochs and batches to get to the same training
|
||
|
data. (default: False)
|
||
|
--fsdp FSDP Whether or not to use PyTorch Fully Sharded Data
|
||
|
Parallel (FSDP) training (in distributed training
|
||
|
only). The base option should be `full_shard`,
|
||
|
`shard_grad_op` or `no_shard` and you can add CPU-
|
||
|
offload to `full_shard` or `shard_grad_op` like this:
|
||
|
full_shard offload` or `shard_grad_op offload`. You
|
||
|
can add auto-wrap to `full_shard` or `shard_grad_op`
|
||
|
with the same syntax: full_shard auto_wrap` or
|
||
|
`shard_grad_op auto_wrap`. (default: )
|
||
|
--fsdp_min_num_params FSDP_MIN_NUM_PARAMS
|
||
|
This parameter is deprecated. FSDP's minimum number of
|
||
|
parameters for Default Auto Wrapping. (useful only
|
||
|
when `fsdp` field is passed). (default: 0)
|
||
|
--fsdp_config FSDP_CONFIG
|
||
|
Config to be used with FSDP (Pytorch Fully Sharded
|
||
|
Data Parallel). The value is either a fsdp json config
|
||
|
file (e.g., `fsdp_config.json`) or an already loaded
|
||
|
json file as `dict`. (default: None)
|
||
|
--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP
|
||
|
This parameter is deprecated. Transformer layer class
|
||
|
name (case-sensitive) to wrap, e.g, `BertLayer`,
|
||
|
`GPTJBlock`, `T5Block` .... (useful only when `fsdp`
|
||
|
flag is passed). (default: None)
|
||
|
--accelerator_config ACCELERATOR_CONFIG
|
||
|
Config to be used with the internal Accelerator object
|
||
|
initializtion. The value is either a accelerator json
|
||
|
config file (e.g., `accelerator_config.json`) or an
|
||
|
already loaded json file as `dict`. (default: None)
|
||
|
--deepspeed DEEPSPEED
|
||
|
Enable deepspeed and pass the path to deepspeed json
|
||
|
config file (e.g. `ds_config.json`) or an already
|
||
|
loaded json file as a dict (default: None)
|
||
|
--label_smoothing_factor LABEL_SMOOTHING_FACTOR
|
||
|
The label smoothing epsilon to apply (zero means no
|
||
|
label smoothing). (default: 0.0)
|
||
|
--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}
|
||
|
The optimizer to use. (default: adamw_torch)
|
||
|
--optim_args OPTIM_ARGS
|
||
|
Optional arguments to supply to optimizer. (default:
|
||
|
None)
|
||
|
--adafactor [ADAFACTOR]
|
||
|
Whether or not to replace AdamW by Adafactor.
|
||
|
(default: False)
|
||
|
--group_by_length [GROUP_BY_LENGTH]
|
||
|
Whether or not to group samples of roughly the same
|
||
|
length together when batching. (default: False)
|
||
|
--length_column_name LENGTH_COLUMN_NAME
|
||
|
Column name with precomputed lengths to use when
|
||
|
grouping by length. (default: length)
|
||
|
--report_to REPORT_TO
|
||
|
The list of integrations to report the results and
|
||
|
logs to. (default: None)
|
||
|
--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS
|
||
|
When using distributed training, the value of the flag
|
||
|
`find_unused_parameters` passed to
|
||
|
`DistributedDataParallel`. (default: None)
|
||
|
--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB
|
||
|
When using distributed training, the value of the flag
|
||
|
`bucket_cap_mb` passed to `DistributedDataParallel`.
|
||
|
(default: None)
|
||
|
--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS
|
||
|
When using distributed training, the value of the flag
|
||
|
`broadcast_buffers` passed to
|
||
|
`DistributedDataParallel`. (default: None)
|
||
|
--dataloader_pin_memory [DATALOADER_PIN_MEMORY]
|
||
|
Whether or not to pin memory for DataLoader. (default:
|
||
|
True)
|
||
|
--no_dataloader_pin_memory
|
||
|
Whether or not to pin memory for DataLoader. (default:
|
||
|
False)
|
||
|
--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]
|
||
|
If True, the data loader will not shut down the worker
|
||
|
processes after a dataset has been consumed once. This
|
||
|
allows to maintain the workers Dataset instances
|
||
|
alive. Can potentially speed up training, but will
|
||
|
increase RAM usage. (default: False)
|
||
|
--skip_memory_metrics [SKIP_MEMORY_METRICS]
|
||
|
Whether or not to skip adding of memory profiler
|
||
|
reports to metrics. (default: True)
|
||
|
--no_skip_memory_metrics
|
||
|
Whether or not to skip adding of memory profiler
|
||
|
reports to metrics. (default: False)
|
||
|
--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]
|
||
|
Whether or not to use the legacy prediction_loop in
|
||
|
the Trainer. (default: False)
|
||
|
--push_to_hub [PUSH_TO_HUB]
|
||
|
Whether or not to upload the trained model to the
|
||
|
model hub after training. (default: False)
|
||
|
--resume_from_checkpoint RESUME_FROM_CHECKPOINT
|
||
|
The path to a folder with a valid checkpoint for your
|
||
|
model. (default: None)
|
||
|
--hub_model_id HUB_MODEL_ID
|
||
|
The name of the repository to keep in sync with the
|
||
|
local `output_dir`. (default: None)
|
||
|
--hub_strategy {end,every_save,checkpoint,all_checkpoints}
|
||
|
The hub strategy to use when `--push_to_hub` is
|
||
|
activated. (default: every_save)
|
||
|
--hub_token HUB_TOKEN
|
||
|
The token to use to push to the Model Hub. (default:
|
||
|
None)
|
||
|
--hub_private_repo [HUB_PRIVATE_REPO]
|
||
|
Whether the model repository is private or not.
|
||
|
(default: False)
|
||
|
--hub_always_push [HUB_ALWAYS_PUSH]
|
||
|
Unless `True`, the Trainer will skip pushes if the
|
||
|
previous one wasn't finished yet. (default: False)
|
||
|
--gradient_checkpointing [GRADIENT_CHECKPOINTING]
|
||
|
If True, use gradient checkpointing to save memory at
|
||
|
the expense of slower backward pass. (default: False)
|
||
|
--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS
|
||
|
Gradient checkpointing key word arguments such as
|
||
|
`use_reentrant`. Will be passed to
|
||
|
`torch.utils.checkpoint.checkpoint` through
|
||
|
`model.gradient_checkpointing_enable`. (default: None)
  --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]
                        Whether or not the inputs will be passed to the
                        `compute_metrics` function. (default: False)
  --eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]
                        Whether to recursively concat
                        inputs/losses/labels/predictions across batches. If
                        `False`, will instead store them as lists, with each
                        batch kept separate. (default: True)
  --no_eval_do_concat_batches
                        Whether to recursively concat
                        inputs/losses/labels/predictions across batches. If
                        `False`, will instead store them as lists, with each
                        batch kept separate. (default: False)
  --fp16_backend {auto,apex,cpu_amp}
                        Deprecated. Use `half_precision_backend` instead.
                        (default: auto)
  --evaluation_strategy {no,steps,epoch}
                        Deprecated. Use `eval_strategy` instead. (default:
                        None)
  --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID
                        The name of the repository to which to push the
                        `Trainer`. (default: None)
  --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION
                        The name of the organization to which to push the
                        `Trainer`. (default: None)
  --push_to_hub_token PUSH_TO_HUB_TOKEN
                        The token to use to push to the Model Hub. (default:
                        None)
  --mp_parameters MP_PARAMETERS
                        Used by the SageMaker launcher to send mp-specific
                        args. Ignored in Trainer. (default: )
  --auto_find_batch_size [AUTO_FIND_BATCH_SIZE]
                        Whether to automatically halve the batch size and
                        rerun the training loop each time a CUDA out-of-memory
                        error is reached. (default: False)
  --full_determinism [FULL_DETERMINISM]
                        Whether to call enable_full_determinism instead of
                        set_seed for reproducibility in distributed training.
                        Important: this will negatively impact performance, so
                        only use it for debugging. (default: False)
  --torchdynamo TORCHDYNAMO
                        This argument is deprecated; use
                        `--torch_compile_backend` instead. (default: None)
  --ray_scope RAY_SCOPE
                        The scope to use when doing hyperparameter search with
                        Ray. By default, `"last"` will be used: Ray will then
                        use the last checkpoint of all trials, compare those,
                        and select the best one. However, other options are
                        also available. See the Ray documentation at
                        https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial
                        for more options. (default: last)
  --ddp_timeout DDP_TIMEOUT
                        Overrides the default timeout for distributed training
                        (value should be given in seconds). (default: 1800)
  --torch_compile [TORCH_COMPILE]
                        If set to `True`, the model will be wrapped in
                        `torch.compile`. (default: False)
  --torch_compile_backend TORCH_COMPILE_BACKEND
                        Which backend to use with `torch.compile`; passing one
                        will trigger a model compilation. (default: None)
  --torch_compile_mode TORCH_COMPILE_MODE
                        Which mode to use with `torch.compile`; passing one
                        will trigger a model compilation. (default: None)
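As a sketch, compilation can be switched on explicitly; note the backend and
mode names below come from PyTorch's torch.compile, not from this listing:

    python launcher.py \
        --model_name_or_path PATH_TO_MODEL \
        --stage sft --do_train \
        --dataset DATASET_NAME \
        --output_dir saves/compile_run \
        --torch_compile True \
        --torch_compile_backend inductor \
        --torch_compile_mode default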
  --dispatch_batches DISPATCH_BATCHES
                        Deprecated. Pass {'dispatch_batches':VALUE} to
                        `accelerator_config`. (default: None)
  --split_batches SPLIT_BATCHES
                        Deprecated. Pass {'split_batches':True} to
                        `accelerator_config`. (default: None)
  --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]
                        If set to `True`, the speed metrics will include `tgs`
                        (tokens per second per device). (default: False)
  --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]
                        If set to `True`, will track the number of input
                        tokens seen throughout training. (May be slower in
                        distributed training.) (default: False)
  --neftune_noise_alpha NEFTUNE_NOISE_ALPHA
                        Activates NEFTune noise embeddings in the model.
                        NEFTune has been shown to markedly improve model
                        performance for instruction fine-tuning. See the
                        original paper (https://arxiv.org/abs/2310.05914) and
                        the original code
                        (https://github.com/neelsjain/NEFTune). Only supported
                        for `PreTrainedModel` and `PeftModel` classes.
                        (default: None)
  --optim_target_modules OPTIM_TARGET_MODULES
                        Target modules for the optimizer defined in the
                        `optim` argument. Only used for the GaLore optimizer
                        at the moment. (default: None)
  --batch_eval_metrics [BATCH_EVAL_METRICS]
                        Break eval metrics calculation into batches to save
                        memory. (default: False)
  --eval_on_start [EVAL_ON_START]
                        Whether to run through the entire `evaluation` step at
                        the very beginning of training as a sanity check.
                        (default: False)
  --eval_use_gather_object [EVAL_USE_GATHER_OBJECT]
                        Whether to recursively gather objects in a nested
                        list/tuple/dictionary of objects from all devices.
                        (default: False)
  --sortish_sampler [SORTISH_SAMPLER]
                        Whether to use SortishSampler or not. (default: False)
  --predict_with_generate [PREDICT_WITH_GENERATE]
                        Whether to use generate to calculate generative
                        metrics (ROUGE, BLEU). (default: False)
  --generation_max_length GENERATION_MAX_LENGTH
                        The `max_length` to use on each evaluation loop when
                        `predict_with_generate=True`. Will default to the
                        `max_length` value of the model configuration.
                        (default: None)
  --generation_num_beams GENERATION_NUM_BEAMS
                        The `num_beams` to use on each evaluation loop when
                        `predict_with_generate=True`. Will default to the
                        `num_beams` value of the model configuration.
                        (default: None)
  --generation_config GENERATION_CONFIG
                        Model id, file path or URL pointing to a
                        GenerationConfig JSON file, to use during prediction.
                        (default: None)
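A sketch of an evaluation pass that scores with generation, combining the
flags above (model, dataset, and template values are placeholders):

    python launcher.py \
        --model_name_or_path PATH_TO_MODEL \
        --stage sft --do_predict \
        --dataset DATASET_NAME \
        --template default \
        --output_dir saves/predict_run \
        --predict_with_generate True \
        --generation_max_length 512 \
        --generation_num_beams 4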
  --use_badam [USE_BADAM]
                        Whether or not to use the BAdam optimizer. (default:
                        False)
  --badam_mode {layer,ratio}
                        Whether to use the layer-wise or the ratio-wise BAdam
                        optimizer. (default: layer)
  --badam_start_block BADAM_START_BLOCK
                        The starting block index for layer-wise BAdam.
                        (default: None)
  --badam_switch_mode {ascending,descending,random,fixed}
                        The strategy for picking the block to update in
                        layer-wise BAdam. (default: ascending)
  --badam_switch_interval BADAM_SWITCH_INTERVAL
                        Number of steps between block updates for layer-wise
                        BAdam. Use -1 to disable the block update. (default:
                        50)
  --badam_update_ratio BADAM_UPDATE_RATIO
                        The ratio of the update for ratio-wise BAdam.
                        (default: 0.05)
  --badam_mask_mode {adjacent,scatter}
                        The mode of the mask for the BAdam optimizer.
                        `adjacent` means the trainable parameters are adjacent
                        to each other; `scatter` means the trainable
                        parameters are randomly chosen from the weights.
                        (default: adjacent)
  --badam_verbose BADAM_VERBOSE
                        The verbosity level of the BAdam optimizer: 0 prints
                        nothing, 1 prints the block prefix, 2 prints the
                        trainable parameters. (default: 0)
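For instance, layer-wise BAdam that sweeps upward through the blocks every
50 steps could be requested as below (a sketch; pairing BAdam with
`--finetuning_type full` is an assumption, since BAdam updates whole blocks):

    python launcher.py \
        --model_name_or_path PATH_TO_MODEL \
        --stage sft --do_train \
        --dataset DATASET_NAME \
        --output_dir saves/badam_run \
        --finetuning_type full \
        --use_badam True \
        --badam_mode layer \
        --badam_switch_mode ascending \
        --badam_switch_interval 50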
  --use_galore [USE_GALORE]
                        Whether or not to use Gradient Low-Rank Projection
                        (GaLore). (default: False)
  --galore_target GALORE_TARGET
                        Name(s) of modules to apply GaLore. Use commas to
                        separate multiple modules. Use `all` to specify all
                        the linear modules. (default: all)
  --galore_rank GALORE_RANK
                        The rank of GaLore gradients. (default: 16)
  --galore_update_interval GALORE_UPDATE_INTERVAL
                        Number of steps to update the GaLore projection.
                        (default: 200)
  --galore_scale GALORE_SCALE
                        GaLore scaling coefficient. (default: 0.25)
  --galore_proj_type {std,reverse_std,right,left,full}
                        Type of GaLore projection. (default: std)
  --galore_layerwise [GALORE_LAYERWISE]
                        Whether or not to enable layer-wise update to further
                        save memory. (default: False)
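A sketch of a memory-frugal full fine-tuning run with GaLore, spelling out
the defaults listed above (paths are placeholders):

    python launcher.py \
        --model_name_or_path PATH_TO_MODEL \
        --stage sft --do_train \
        --dataset DATASET_NAME \
        --output_dir saves/galore_run \
        --finetuning_type full \
        --use_galore True \
        --galore_target all \
        --galore_rank 16 \
        --galore_update_interval 200 \
        --galore_scale 0.25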
  --pref_beta PREF_BETA
                        The beta parameter in the preference loss. (default:
                        0.1)
  --pref_ftx PREF_FTX   The supervised fine-tuning loss coefficient in DPO
                        training. (default: 0.0)
  --pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}
                        The type of DPO loss to use. (default: sigmoid)
  --dpo_label_smoothing DPO_LABEL_SMOOTHING
                        The robust DPO label smoothing parameter in cDPO that
                        should be between 0 and 0.5. (default: 0.0)
  --kto_chosen_weight KTO_CHOSEN_WEIGHT
                        The weight factor of the desirable losses in KTO
                        training. (default: 1.0)
  --kto_rejected_weight KTO_REJECTED_WEIGHT
                        The weight factor of the undesirable losses in KTO
                        training. (default: 1.0)
  --simpo_gamma SIMPO_GAMMA
                        The target reward margin term in SimPO loss. (default:
                        0.5)
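Taken together, a DPO run with the sigmoid loss and a small SFT regularizer
might look like this sketch (DATASET_NAME is assumed to contain paired
preference data):

    python launcher.py \
        --model_name_or_path PATH_TO_MODEL \
        --stage dpo --do_train \
        --dataset DATASET_NAME \
        --output_dir saves/dpo_run \
        --pref_loss sigmoid \
        --pref_beta 0.1 \
        --pref_ftx 0.1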
  --ppo_buffer_size PPO_BUFFER_SIZE
                        The number of mini-batches to make up the experience
                        buffer in a PPO optimization step. (default: 1)
  --ppo_epochs PPO_EPOCHS
                        The number of epochs to perform in a PPO optimization
                        step. (default: 4)
  --ppo_score_norm [PPO_SCORE_NORM]
                        Use score normalization in PPO training. (default:
                        False)
  --ppo_target PPO_TARGET
                        Target KL value for adaptive KL control in PPO
                        training. (default: 6.0)
  --ppo_whiten_rewards [PPO_WHITEN_REWARDS]
                        Whiten the rewards before computing advantages in PPO
                        training. (default: False)
  --ref_model REF_MODEL
                        Path to the reference model used for PPO or DPO
                        training. (default: None)
  --ref_model_adapters REF_MODEL_ADAPTERS
                        Path to the adapters of the reference model. (default:
                        None)
  --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT
                        The number of bits to quantize the reference model.
                        (default: None)
  --reward_model REWARD_MODEL
                        Path to the reward model used for PPO training.
                        (default: None)
  --reward_model_adapters REWARD_MODEL_ADAPTERS
                        Path to the adapters of the reward model. (default:
                        None)
  --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT
                        The number of bits to quantize the reward model.
                        (default: None)
  --reward_model_type {lora,full,api}
                        The type of the reward model in PPO training. A LoRA
                        reward model only supports LoRA training. (default:
                        lora)
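A PPO stage that reuses a previously trained LoRA reward model could be
sketched as follows (both paths are placeholders):

    python launcher.py \
        --model_name_or_path PATH_TO_MODEL \
        --stage ppo --do_train \
        --dataset DATASET_NAME \
        --output_dir saves/ppo_run \
        --reward_model PATH_TO_REWARD_MODEL \
        --reward_model_type lora \
        --ppo_epochs 4 \
        --ppo_target 6.0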
  --additional_target ADDITIONAL_TARGET
                        Name(s) of modules apart from LoRA layers to be set as
                        trainable and saved in the final checkpoint. Use
                        commas to separate multiple modules. (default: None)
  --lora_alpha LORA_ALPHA
                        The scale factor for LoRA fine-tuning (default:
                        lora_rank * 2). (default: None)
  --lora_dropout LORA_DROPOUT
                        Dropout rate for the LoRA fine-tuning. (default: 0.0)
  --lora_rank LORA_RANK
                        The intrinsic dimension for LoRA fine-tuning.
                        (default: 8)
  --lora_target LORA_TARGET
                        Name(s) of target modules to apply LoRA. Use commas to
                        separate multiple modules. Use `all` to specify all
                        the linear modules. (default: all)
  --loraplus_lr_ratio LORAPLUS_LR_RATIO
                        LoRA plus learning rate ratio (lr_B / lr_A). (default:
                        None)
  --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING
                        LoRA plus learning rate for LoRA embedding layers.
                        (default: 1e-06)
  --use_rslora [USE_RSLORA]
                        Whether or not to use the rank stabilization scaling
                        factor for the LoRA layers. (default: False)
  --use_dora [USE_DORA]
                        Whether or not to use the weight-decomposed LoRA
                        method (DoRA). (default: False)
  --pissa_init [PISSA_INIT]
                        Whether or not to initialize a PiSSA adapter.
                        (default: False)
  --pissa_iter PISSA_ITER
                        The number of iteration steps performed by FSVD in
                        PiSSA. Use -1 to disable it. (default: 16)
  --pissa_convert [PISSA_CONVERT]
                        Whether or not to convert the PiSSA adapter to a
                        normal LoRA adapter. (default: False)
  --create_new_adapter [CREATE_NEW_ADAPTER]
                        Whether or not to create a new adapter with randomly
                        initialized weights. (default: False)
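As a sketch, a plain LoRA recipe with the defaults above made explicit (the
alpha of 16 simply matches the documented `lora_rank * 2` default):

    python launcher.py \
        --model_name_or_path PATH_TO_MODEL \
        --stage sft --do_train \
        --dataset DATASET_NAME \
        --output_dir saves/lora_run \
        --finetuning_type lora \
        --lora_rank 8 \
        --lora_alpha 16 \
        --lora_target all \
        --use_rslora True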
  --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS
                        The number of trainable layers for freeze (partial-
                        parameter) fine-tuning. Positive numbers mean the last
                        n layers are set as trainable; negative numbers mean
                        the first n layers are set as trainable. (default: 2)
  --freeze_trainable_modules FREEZE_TRAINABLE_MODULES
                        Name(s) of trainable modules for freeze (partial-
                        parameter) fine-tuning. Use commas to separate
                        multiple modules. Use `all` to specify all the
                        available modules. (default: all)
  --freeze_extra_modules FREEZE_EXTRA_MODULES
                        Name(s) of modules apart from hidden layers to be set
                        as trainable for freeze (partial-parameter) fine-
                        tuning. Use commas to separate multiple modules.
                        (default: None)
  --pure_bf16 [PURE_BF16]
                        Whether or not to train the model purely in bf16
                        precision (without AMP). (default: False)
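For example, freeze tuning that updates only the last two layers could be
sketched as:

    python launcher.py \
        --model_name_or_path PATH_TO_MODEL \
        --stage sft --do_train \
        --dataset DATASET_NAME \
        --output_dir saves/freeze_run \
        --finetuning_type freeze \
        --freeze_trainable_layers 2 \
        --freeze_trainable_modules all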
  --stage {pt,sft,rm,ppo,dpo,kto}
                        Which stage will be performed in training. (default:
                        sft)
  --finetuning_type {lora,freeze,full}
                        Which fine-tuning method to use. (default: lora)
  --use_llama_pro [USE_LLAMA_PRO]
                        Whether or not to make only the parameters in the
                        expanded blocks trainable. (default: False)
  --use_adam_mini [USE_ADAM_MINI]
                        Whether or not to use the Adam-mini optimizer.
                        (default: False)
  --freeze_vision_tower [FREEZE_VISION_TOWER]
                        Whether or not to freeze the vision tower in MLLM
                        training. (default: True)
  --no_freeze_vision_tower
                        Whether or not to freeze the vision tower in MLLM
                        training. (default: False)
  --train_mm_proj_only [TRAIN_MM_PROJ_ONLY]
                        Whether or not to train only the multimodal projector
                        for MLLM. (default: False)
  --compute_accuracy [COMPUTE_ACCURACY]
                        Whether or not to compute the token-level accuracy at
                        evaluation. (default: False)
  --plot_loss [PLOT_LOSS]
                        Whether or not to save the training loss curves.
                        (default: False)
  --do_sample [DO_SAMPLE]
                        Whether or not to use sampling; use greedy decoding
                        otherwise. (default: True)
  --no_do_sample        Whether or not to use sampling; use greedy decoding
                        otherwise. (default: False)
  --temperature TEMPERATURE
                        The value used to modulate the next token
                        probabilities. (default: 0.95)
  --top_p TOP_P         Only the smallest set of most probable tokens with
                        probabilities that add up to top_p or higher is kept.
                        (default: 0.7)
  --top_k TOP_K         The number of highest probability vocabulary tokens to
                        keep for top-k filtering. (default: 50)
  --num_beams NUM_BEAMS
                        Number of beams for beam search. 1 means no beam
                        search. (default: 1)
  --max_length MAX_LENGTH
                        The maximum length the generated tokens can have. It
                        can be overridden by max_new_tokens. (default: 1024)
  --max_new_tokens MAX_NEW_TOKENS
                        The maximum number of tokens to generate, ignoring the
                        number of tokens in the prompt. (default: 1024)
  --repetition_penalty REPETITION_PENALTY
                        The parameter for repetition penalty. 1.0 means no
                        penalty. (default: 1.0)
  --length_penalty LENGTH_PENALTY
                        Exponential penalty to the length, used with beam-
                        based generation. (default: 1.0)
  --default_system DEFAULT_SYSTEM
                        Default system message to use in chat completion.
                        (default: None)
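As a closing sketch, a prediction run that overrides the decoding defaults
above (placeholders as in the earlier examples):

    python launcher.py \
        --model_name_or_path PATH_TO_MODEL \
        --stage sft --do_predict \
        --dataset DATASET_NAME \
        --template default \
        --output_dir saves/sample_run \
        --predict_with_generate True \
        --do_sample True \
        --temperature 0.7 \
        --top_p 0.9 \
        --max_new_tokens 512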