diff --git a/help.txt b/help.txt new file mode 100644 index 00000000..cc9f0f9a --- /dev/null +++ b/help.txt @@ -0,0 +1,8359 @@ +08/16/2024 11:02:54 - INFO - llamafactory.cli - Initializing distributed tasks at: 127.0.0.1:28784 +usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH + [--adapter_name_or_path ADAPTER_NAME_OR_PATH] + [--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR] + [--use_fast_tokenizer [USE_FAST_TOKENIZER]] + [--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]] + [--split_special_tokens [SPLIT_SPECIAL_TOKENS]] + [--new_special_tokens NEW_SPECIAL_TOKENS] + [--model_revision MODEL_REVISION] + [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]] + [--no_low_cpu_mem_usage] + [--quantization_method {bitsandbytes,hqq,eetq}] + [--quantization_bit QUANTIZATION_BIT] + [--quantization_type {fp4,nf4}] + [--double_quantization [DOUBLE_QUANTIZATION]] + [--no_double_quantization] + [--quantization_device_map {auto}] + [--rope_scaling {linear,dynamic}] + [--flash_attn {auto,disabled,sdpa,fa2}] + [--shift_attn [SHIFT_ATTN]] + [--mixture_of_depths {convert,load}] + [--use_unsloth [USE_UNSLOTH]] + [--visual_inputs [VISUAL_INPUTS]] + [--moe_aux_loss_coef MOE_AUX_LOSS_COEF] + [--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]] + [--upcast_layernorm [UPCAST_LAYERNORM]] + [--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]] + [--train_from_scratch [TRAIN_FROM_SCRATCH]] + [--infer_backend {huggingface,vllm}] + [--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL] + [--vllm_enforce_eager [VLLM_ENFORCE_EAGER]] + [--vllm_max_lora_rank VLLM_MAX_LORA_RANK] + [--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]] + [--no_use_cache] + [--infer_dtype {auto,float16,bfloat16,float32}] + [--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN] + [--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE] + [--export_device {cpu,auto}] + [--export_quantization_bit EXPORT_QUANTIZATION_BIT] + [--export_quantization_dataset EXPORT_QUANTIZATION_DATASET] + [--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES] + [--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN] + [--export_legacy_format [EXPORT_LEGACY_FORMAT]] + [--export_hub_model_id EXPORT_HUB_MODEL_ID] + [--print_param_status [PRINT_PARAM_STATUS]] + [--template TEMPLATE] [--dataset DATASET] + [--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR] + [--cutoff_len CUTOFF_LEN] + [--train_on_prompt [TRAIN_ON_PROMPT]] + [--mask_history [MASK_HISTORY]] [--streaming [STREAMING]] + [--buffer_size BUFFER_SIZE] + [--mix_strategy {concat,interleave_under,interleave_over}] + [--interleave_probs INTERLEAVE_PROBS] + [--overwrite_cache [OVERWRITE_CACHE]] + [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS] + [--max_samples MAX_SAMPLES] + [--eval_num_beams EVAL_NUM_BEAMS] + [--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]] + [--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE] + [--packing PACKING] [--neat_packing [NEAT_PACKING]] + [--tool_format TOOL_FORMAT] + [--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR + [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] + [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] + [--do_predict [DO_PREDICT]] + [--eval_strategy {no,steps,epoch}] + [--prediction_loss_only [PREDICTION_LOSS_ONLY]] + [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] + [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE] + [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] + [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] + [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] + [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] + [--eval_delay EVAL_DELAY] + [--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS] + [--learning_rate LEARNING_RATE] + [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] + [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] + [--max_grad_norm MAX_GRAD_NORM] + [--num_train_epochs NUM_TRAIN_EPOCHS] + [--max_steps MAX_STEPS] + [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}] + [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] + [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS] + [--log_level {detail,debug,info,warning,error,critical,passive}] + [--log_level_replica {detail,debug,info,warning,error,critical,passive}] + [--log_on_each_node [LOG_ON_EACH_NODE]] + [--no_log_on_each_node] [--logging_dir LOGGING_DIR] + [--logging_strategy {no,steps,epoch}] + [--logging_first_step [LOGGING_FIRST_STEP]] + [--logging_steps LOGGING_STEPS] + [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] + [--no_logging_nan_inf_filter] + [--save_strategy {no,steps,epoch}] + [--save_steps SAVE_STEPS] + [--save_total_limit SAVE_TOTAL_LIMIT] + [--save_safetensors [SAVE_SAFETENSORS]] + [--no_save_safetensors] + [--save_on_each_node [SAVE_ON_EACH_NODE]] + [--save_only_model [SAVE_ONLY_MODEL]] + [--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]] + [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]] + [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] + [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] + [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] + [--fp16_opt_level FP16_OPT_LEVEL] + [--half_precision_backend {auto,apex,cpu_amp}] + [--bf16_full_eval [BF16_FULL_EVAL]] + [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] + [--local_rank LOCAL_RANK] + [--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}] + [--tpu_num_cores TPU_NUM_CORES] + [--tpu_metrics_debug [TPU_METRICS_DEBUG]] + [--debug DEBUG [DEBUG ...]] + [--dataloader_drop_last [DATALOADER_DROP_LAST]] + [--eval_steps EVAL_STEPS] + [--dataloader_num_workers DATALOADER_NUM_WORKERS] + [--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR] + [--past_index PAST_INDEX] [--run_name RUN_NAME] + [--disable_tqdm DISABLE_TQDM] + [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] + [--no_remove_unused_columns] + [--label_names LABEL_NAMES [LABEL_NAMES ...]] + [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] + [--metric_for_best_model METRIC_FOR_BEST_MODEL] + [--greater_is_better GREATER_IS_BETTER] + [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP] + [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] + [--fsdp_config FSDP_CONFIG] + [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] + [--accelerator_config ACCELERATOR_CONFIG] + [--deepspeed DEEPSPEED] + [--label_smoothing_factor LABEL_SMOOTHING_FACTOR] + [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}] + [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] + [--group_by_length [GROUP_BY_LENGTH]] + [--length_column_name LENGTH_COLUMN_NAME] + [--report_to REPORT_TO] + [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] + [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] + [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] + [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] + [--no_dataloader_pin_memory] + [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] + [--skip_memory_metrics [SKIP_MEMORY_METRICS]] + [--no_skip_memory_metrics] + [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] + [--push_to_hub [PUSH_TO_HUB]] + [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] + [--hub_model_id HUB_MODEL_ID] + [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] + [--hub_token HUB_TOKEN] + [--hub_private_repo [HUB_PRIVATE_REPO]] + [--hub_always_push [HUB_ALWAYS_PUSH]] + [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] + [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS] + [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] + [--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]] + [--no_eval_do_concat_batches] + [--fp16_backend {auto,apex,cpu_amp}] + [--evaluation_strategy {no,steps,epoch}] + [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID] + [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] + [--push_to_hub_token PUSH_TO_HUB_TOKEN] + [--mp_parameters MP_PARAMETERS] + [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] + [--full_determinism [FULL_DETERMINISM]] + [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] + [--ddp_timeout DDP_TIMEOUT] + [--torch_compile [TORCH_COMPILE]] + [--torch_compile_backend TORCH_COMPILE_BACKEND] + [--torch_compile_mode TORCH_COMPILE_MODE] + [--dispatch_batches DISPATCH_BATCHES] + [--split_batches SPLIT_BATCHES] + [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] + [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]] + [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] + [--optim_target_modules OPTIM_TARGET_MODULES] + [--batch_eval_metrics [BATCH_EVAL_METRICS]] + [--eval_on_start [EVAL_ON_START]] + [--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]] + [--sortish_sampler [SORTISH_SAMPLER]] + [--predict_with_generate [PREDICT_WITH_GENERATE]] + [--generation_max_length GENERATION_MAX_LENGTH] + [--generation_num_beams GENERATION_NUM_BEAMS] + [--generation_config GENERATION_CONFIG] + [--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}] + [--badam_start_block BADAM_START_BLOCK] + [--badam_switch_mode {ascending,descending,random,fixed}] + [--badam_switch_interval BADAM_SWITCH_INTERVAL] + [--badam_update_ratio BADAM_UPDATE_RATIO] + [--badam_mask_mode {adjacent,scatter}] + [--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]] + [--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK] + [--galore_update_interval GALORE_UPDATE_INTERVAL] + [--galore_scale GALORE_SCALE] + [--galore_proj_type {std,reverse_std,right,left,full}] + [--galore_layerwise [GALORE_LAYERWISE]] + [--pref_beta PREF_BETA] [--pref_ftx PREF_FTX] + [--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}] + [--dpo_label_smoothing DPO_LABEL_SMOOTHING] + [--kto_chosen_weight KTO_CHOSEN_WEIGHT] + [--kto_rejected_weight KTO_REJECTED_WEIGHT] + [--simpo_gamma SIMPO_GAMMA] + [--ppo_buffer_size PPO_BUFFER_SIZE] + [--ppo_epochs PPO_EPOCHS] + [--ppo_score_norm [PPO_SCORE_NORM]] + [--ppo_target PPO_TARGET] + [--ppo_whiten_rewards [PPO_WHITEN_REWARDS]] + [--ref_model REF_MODEL] + [--ref_model_adapters REF_MODEL_ADAPTERS] + [--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT] + [--reward_model REWARD_MODEL] + [--reward_model_adapters REWARD_MODEL_ADAPTERS] + [--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT] + [--reward_model_type {lora,full,api}] + [--additional_target ADDITIONAL_TARGET] + [--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT] + [--lora_rank LORA_RANK] [--lora_target LORA_TARGET] + [--loraplus_lr_ratio LORAPLUS_LR_RATIO] + [--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING] + [--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]] + [--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER] + [--pissa_convert [PISSA_CONVERT]] + [--create_new_adapter [CREATE_NEW_ADAPTER]] + [--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS] + [--freeze_trainable_modules FREEZE_TRAINABLE_MODULES] + [--freeze_extra_modules FREEZE_EXTRA_MODULES] + [--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}] + [--finetuning_type {lora,freeze,full}] + [--use_llama_pro [USE_LLAMA_PRO]] + [--use_adam_mini [USE_ADAM_MINI]] + [--freeze_vision_tower [FREEZE_VISION_TOWER]] + [--no_freeze_vision_tower] + [--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]] + [--compute_accuracy [COMPUTE_ACCURACY]] + [--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]] + [--no_do_sample] [--temperature TEMPERATURE] + [--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS] + [--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS] + [--repetition_penalty REPETITION_PENALTY] + [--length_penalty LENGTH_PENALTY] + [--default_system DEFAULT_SYSTEM] + +optional arguments: + -h, --help show this help message and exit + --model_name_or_path MODEL_NAME_OR_PATH + Path to the model weight or identifier from + huggingface.co/models or modelscope.cn/models. + (default: None) + --adapter_name_or_path ADAPTER_NAME_OR_PATH + Path to the adapter weight or identifier from + huggingface.co/models. Use commas to separate multiple + adapters. (default: None) + --adapter_folder ADAPTER_FOLDER + The folder containing the adapter weights to load. + (default: None) + --cache_dir CACHE_DIR + Where to store the pre-trained models downloaded from + huggingface.co or modelscope.cn. (default: None) + --use_fast_tokenizer [USE_FAST_TOKENIZER] + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: True) + --no_use_fast_tokenizer + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: False) + --resize_vocab [RESIZE_VOCAB] + Whether or not to resize the tokenizer vocab and the + embedding layers. (default: False) + --split_special_tokens [SPLIT_SPECIAL_TOKENS] + Whether or not the special tokens should be split + during the tokenization process. (default: False) + --new_special_tokens NEW_SPECIAL_TOKENS + Special tokens to be added into the tokenizer. Use + commas to separate multiple tokens. (default: None) + --model_revision MODEL_REVISION + The specific model version to use (can be a branch + name, tag name or commit id). (default: main) + --low_cpu_mem_usage [LOW_CPU_MEM_USAGE] + Whether or not to use memory-efficient model loading. + (default: True) + --no_low_cpu_mem_usage + Whether or not to use memory-efficient model loading. + (default: False) + --quantization_method {bitsandbytes,hqq,eetq} + Quantization method to use for on-the-fly + quantization. (default: bitsandbytes) + --quantization_bit QUANTIZATION_BIT + The number of bits to quantize the model using + bitsandbytes. (default: None) + --quantization_type {fp4,nf4} + Quantization data type to use in int4 training. + (default: nf4) + --double_quantization [DOUBLE_QUANTIZATION] + Whether or not to use double quantization in int4 + training. (default: True) + --no_double_quantization + Whether or not to use double quantization in int4 + training. (default: False) + --quantization_device_map {auto} + Device map used to infer the 4-bit quantized model, + needs bitsandbytes>=0.43.0. (default: None) + --rope_scaling {linear,dynamic} + Which scaling strategy should be adopted for the RoPE + embeddings. (default: None) + --flash_attn {auto,disabled,sdpa,fa2} + Enable FlashAttention for faster training and + inference. (default: auto) + --shift_attn [SHIFT_ATTN] + Enable shift short attention (S^2-Attn) proposed by + LongLoRA. (default: False) + --mixture_of_depths {convert,load} + Convert the model to mixture-of-depths (MoD) or load + the MoD model. (default: None) + --use_unsloth [USE_UNSLOTH] + Whether or not to use unsloth's optimization for the + LoRA training. (default: False) + --visual_inputs [VISUAL_INPUTS] + Whethor or not to use multimodal LLM that accepts + visual inputs. (default: False) + --moe_aux_loss_coef MOE_AUX_LOSS_COEF + Coefficient of the auxiliary router loss in mixture- + of-experts model. (default: None) + --disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING] + Whether or not to disable gradient checkpointing. + (default: False) + --upcast_layernorm [UPCAST_LAYERNORM] + Whether or not to upcast the layernorm weights in + fp32. (default: False) + --upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT] + Whether or not to upcast the output of lm_head in + fp32. (default: False) + --train_from_scratch [TRAIN_FROM_SCRATCH] + Whether or not to randomly initialize the model + weights. (default: False) + --infer_backend {huggingface,vllm} + Backend engine used at inference. (default: + huggingface) + --vllm_maxlen VLLM_MAXLEN + Maximum sequence (prompt + response) length of the + vLLM engine. (default: 2048) + --vllm_gpu_util VLLM_GPU_UTIL + The fraction of GPU memory in (0,1) to be used for the + vLLM engine. (default: 0.9) + --vllm_enforce_eager [VLLM_ENFORCE_EAGER] + Whether or not to disable CUDA graph in the vLLM + engine. (default: False) + --vllm_max_lora_rank VLLM_MAX_LORA_RANK + Maximum rank of all LoRAs in the vLLM engine. + (default: 32) + --offload_folder OFFLOAD_FOLDER + Path to offload model weights. (default: offload) + --use_cache [USE_CACHE] + Whether or not to use KV cache in generation. + (default: True) + --no_use_cache Whether or not to use KV cache in generation. + (default: False) + --infer_dtype {auto,float16,bfloat16,float32} + Data type for model weights and activations at + inference. (default: auto) + --hf_hub_token HF_HUB_TOKEN + Auth token to log in with Hugging Face Hub. (default: + None) + --ms_hub_token MS_HUB_TOKEN + Auth token to log in with ModelScope Hub. (default: + None) + --export_dir EXPORT_DIR + Path to the directory to save the exported model. + (default: None) + --export_size EXPORT_SIZE + The file shard size (in GB) of the exported model. + (default: 1) + --export_device {cpu,auto} + The device used in model export, use `auto` to + accelerate exporting. (default: cpu) + --export_quantization_bit EXPORT_QUANTIZATION_BIT + The number of bits to quantize the exported model. + (default: None) + --export_quantization_dataset EXPORT_QUANTIZATION_DATASET + Path to the dataset or dataset name to use in + quantizing the exported model. (default: None) + --export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES + The number of samples used for quantization. (default: + 128) + --export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN + The maximum length of the model inputs used for + quantization. (default: 1024) + --export_legacy_format [EXPORT_LEGACY_FORMAT] + Whether or not to save the `.bin` files instead of + `.safetensors`. (default: False) + --export_hub_model_id EXPORT_HUB_MODEL_ID + The name of the repository if push the model to the + Hugging Face hub. (default: None) + --print_param_status [PRINT_PARAM_STATUS] + For debugging purposes, print the status of the + parameters in the model. (default: False) + --template TEMPLATE Which template to use for constructing prompts in + training and inference. (default: None) + --dataset DATASET The name of dataset(s) to use for training. Use commas + to separate multiple datasets. (default: None) + --eval_dataset EVAL_DATASET + The name of dataset(s) to use for evaluation. Use + commas to separate multiple datasets. (default: None) + --dataset_dir DATASET_DIR + Path to the folder containing the datasets. (default: + data) + --cutoff_len CUTOFF_LEN + The cutoff length of the tokenized inputs in the + dataset. (default: 1024) + --train_on_prompt [TRAIN_ON_PROMPT] + Whether or not to disable the mask on the prompt. + (default: False) + --mask_history [MASK_HISTORY] + Whether or not to mask the history and train on the + last turn only. (default: False) + --streaming [STREAMING] + Enable dataset streaming. (default: False) + --buffer_size BUFFER_SIZE + Size of the buffer to randomly sample examples from in + dataset streaming. (default: 16384) + --mix_strategy {concat,interleave_under,interleave_over} + Strategy to use in dataset mixing (concat/interleave) + (undersampling/oversampling). (default: concat) + --interleave_probs INTERLEAVE_PROBS + Probabilities to sample data from datasets. Use commas + to separate multiple datasets. (default: None) + --overwrite_cache [OVERWRITE_CACHE] + Overwrite the cached training and evaluation sets. + (default: False) + --preprocessing_num_workers PREPROCESSING_NUM_WORKERS + The number of processes to use for the pre-processing. + (default: None) + --max_samples MAX_SAMPLES + For debugging purposes, truncate the number of + examples for each dataset. (default: None) + --eval_num_beams EVAL_NUM_BEAMS + Number of beams to use for evaluation. This argument + will be passed to `model.generate` (default: None) + --ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS] + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: True) + --no_ignore_pad_token_for_loss + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: False) + --val_size VAL_SIZE Size of the development set, should be an integer or a + float in range `[0,1)`. (default: 0.0) + --packing PACKING Enable sequences packing in training. Will + automatically enable in pre-training. (default: None) + --neat_packing [NEAT_PACKING] + Enable sequence packing without cross-attention. + (default: False) + --tool_format TOOL_FORMAT + Tool format to use for constructing function calling + examples. (default: None) + --tokenized_path TOKENIZED_PATH + Path to save or load the tokenized datasets. (default: + None) + --output_dir OUTPUT_DIR + The output directory where the model predictions and + checkpoints will be written. (default: None) + --overwrite_output_dir [OVERWRITE_OUTPUT_DIR] + Overwrite the content of the output directory. Use + this to continue training if output_dir points to a + checkpoint directory. (default: False) + --do_train [DO_TRAIN] + Whether to run training. (default: False) + --do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False) + --do_predict [DO_PREDICT] + Whether to run predictions on the test set. (default: + False) + --eval_strategy {no,steps,epoch} + The evaluation strategy to use. (default: no) + --prediction_loss_only [PREDICTION_LOSS_ONLY] + When performing evaluation and predictions, only + returns the loss. (default: False) + --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for training. + (default: 8) + --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for + evaluation. (default: 8) + --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE + Deprecated, the use of `--per_device_train_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + training. (default: None) + --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE + Deprecated, the use of `--per_device_eval_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + evaluation. (default: None) + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + Number of updates steps to accumulate before + performing a backward/update pass. (default: 1) + --eval_accumulation_steps EVAL_ACCUMULATION_STEPS + Number of predictions steps to accumulate before + moving the tensors to the CPU. (default: None) + --eval_delay EVAL_DELAY + Number of epochs or steps to wait for before the first + evaluation can be performed, depending on the + eval_strategy. (default: 0) + --torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS + Number of steps to wait before calling + `torch..empty_cache()`.This can help avoid + CUDA out-of-memory errors by lowering peak VRAM usage + at a cost of about [10{'option_strings': ['-- + torch_empty_cache_steps'], 'dest': + 'torch_empty_cache_steps', 'nargs': None, 'const': + None, 'default': None, 'type': 'int', 'choices': None, + 'required': False, 'help': 'Number of steps to wait + before calling `torch..empty_cache()`.This can + help avoid CUDA out-of-memory errors by lowering peak + VRAM usage at a cost of about [10% slower performance] + (https://github.com/huggingface/transformers/issues/31 + 372).If left unset or set to None, cache will not be + emptied.', 'metavar': None, 'container': + , + 'prog': 'launcher.py'}lower performance](https://githu + b.com/huggingface/transformers/issues/31372).If left + unset or set to None, cache will not be emptied. + (default: None) + --learning_rate LEARNING_RATE + The initial learning rate for AdamW. (default: 5e-05) + --weight_decay WEIGHT_DECAY + Weight decay for AdamW if we apply some. (default: + 0.0) + --adam_beta1 ADAM_BETA1 + Beta1 for AdamW optimizer (default: 0.9) + --adam_beta2 ADAM_BETA2 + Beta2 for AdamW optimizer (default: 0.999) + --adam_epsilon ADAM_EPSILON + Epsilon for AdamW optimizer. (default: 1e-08) + --max_grad_norm MAX_GRAD_NORM + Max gradient norm. (default: 1.0) + --num_train_epochs NUM_TRAIN_EPOCHS + Total number of training epochs to perform. (default: + 3.0) + --max_steps MAX_STEPS + If > 0: set total number of training steps to perform. + Override num_train_epochs. (default: -1) + --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay} + The scheduler type to use. (default: linear) + --lr_scheduler_kwargs LR_SCHEDULER_KWARGS + Extra parameters for the lr_scheduler such as + {'num_cycles': 1} for the cosine with hard restarts. + (default: {}) + --warmup_ratio WARMUP_RATIO + Linear warmup over warmup_ratio fraction of total + steps. (default: 0.0) + --warmup_steps WARMUP_STEPS + Linear warmup over warmup_steps. (default: 0) + --log_level {detail,debug,info,warning,error,critical,passive} + Logger log level to use on the main node. Possible + choices are the log levels as strings: 'debug', + 'info', 'warning', 'error' and 'critical', plus a + 'passive' level which doesn't set anything and lets + the application set the level. Defaults to 'passive'. + (default: passive) + --log_level_replica {detail,debug,info,warning,error,critical,passive} + Logger log level to use on replica nodes. Same choices + and defaults as ``log_level`` (default: warning) + --log_on_each_node [LOG_ON_EACH_NODE] + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: True) + --no_log_on_each_node + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: False) + --logging_dir LOGGING_DIR + Tensorboard log dir. (default: None) + --logging_strategy {no,steps,epoch} + The logging strategy to use. (default: steps) + --logging_first_step [LOGGING_FIRST_STEP] + Log the first global_step (default: False) + --logging_steps LOGGING_STEPS + Log every X updates steps. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be + interpreted as ratio of total training steps. + (default: 500) + --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER] + Filter nan and inf losses for logging. (default: True) + --no_logging_nan_inf_filter + Filter nan and inf losses for logging. (default: + False) + --save_strategy {no,steps,epoch} + The checkpoint save strategy to use. (default: steps) + --save_steps SAVE_STEPS + Save checkpoint every X updates steps. Should be an + integer or a float in range `[0,1)`. If smaller than + 1, will be interpreted as ratio of total training + steps. (default: 500) + --save_total_limit SAVE_TOTAL_LIMIT + If a value is passed, will limit the total amount of + checkpoints. Deletes the older checkpoints in + `output_dir`. When `load_best_model_at_end` is + enabled, the 'best' checkpoint according to + `metric_for_best_model` will always be retained in + addition to the most recent ones. For example, for + `save_total_limit=5` and + `load_best_model_at_end=True`, the four last + checkpoints will always be retained alongside the best + model. When `save_total_limit=1` and + `load_best_model_at_end=True`, it is possible that two + checkpoints are saved: the last one and the best one + (if they are different). Default is unlimited + checkpoints (default: None) + --save_safetensors [SAVE_SAFETENSORS] + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: True) + --no_save_safetensors + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: False) + --save_on_each_node [SAVE_ON_EACH_NODE] + When doing multi-node distributed training, whether to + save models and checkpoints on each node, or only on + the main one (default: False) + --save_only_model [SAVE_ONLY_MODEL] + When checkpointing, whether to only save the model, or + also the optimizer, scheduler & rng state.Note that + when this is true, you won't be able to resume + training from checkpoint.This enables you to save + storage by not storing the optimizer, scheduler & rng + state.You can only load the model using + from_pretrained with this option set to True. + (default: False) + --restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT] + Whether to restore the callback states from the + checkpoint. If `True`, will override callbacks passed + to the `Trainer` if they exist in the checkpoint. + (default: False) + --no_cuda [NO_CUDA] This argument is deprecated. It will be removed in + version 5.0 of 🤗 Transformers. (default: False) + --use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will + use cuda/tpu/mps/npu device if available. (default: + False) + --use_mps_device [USE_MPS_DEVICE] + This argument is deprecated. `mps` device will be used + if available similar to `cuda` device. It will be + removed in version 5.0 of 🤗 Transformers (default: + False) + --seed SEED Random seed that will be set at the beginning of + training. (default: 42) + --data_seed DATA_SEED + Random seed to be used with data samplers. (default: + None) + --jit_mode_eval [JIT_MODE_EVAL] + Whether or not to use PyTorch jit trace for inference + (default: False) + --use_ipex [USE_IPEX] + Use Intel extension for PyTorch when it is available, + installation: 'https://github.com/intel/intel- + extension-for-pytorch' (default: False) + --bf16 [BF16] Whether to use bf16 (mixed) precision instead of + 32-bit. Requires Ampere or higher NVIDIA architecture + or using CPU (use_cpu) or Ascend NPU. This is an + experimental API and it may change. (default: False) + --fp16 [FP16] Whether to use fp16 (mixed) precision instead of + 32-bit (default: False) + --fp16_opt_level FP16_OPT_LEVEL + For fp16: Apex AMP optimization level selected in + ['O0', 'O1', 'O2', and 'O3']. See details at + https://nvidia.github.io/apex/amp.html (default: O1) + --half_precision_backend {auto,apex,cpu_amp} + The backend to be used for half precision. (default: + auto) + --bf16_full_eval [BF16_FULL_EVAL] + Whether to use full bfloat16 evaluation instead of + 32-bit. This is an experimental API and it may change. + (default: False) + --fp16_full_eval [FP16_FULL_EVAL] + Whether to use full float16 evaluation instead of + 32-bit (default: False) + --tf32 TF32 Whether to enable tf32 mode, available in Ampere and + newer GPU architectures. This is an experimental API + and it may change. (default: None) + --local_rank LOCAL_RANK + For distributed training: local_rank (default: -1) + --ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl} + The backend to be used for distributed training + (default: None) + --tpu_num_cores TPU_NUM_CORES + TPU: Number of TPU cores (automatically passed by + launcher script) (default: None) + --tpu_metrics_debug [TPU_METRICS_DEBUG] + Deprecated, the use of `--debug tpu_metrics_debug` is + preferred. TPU: Whether to print debug metrics + (default: False) + --debug DEBUG [DEBUG ...] + Whether or not to enable debug mode. Current options: + `underflow_overflow` (Detect underflow and overflow in + activations and weights), `tpu_metrics_debug` (print + debug metrics on TPU). (default: None) + --dataloader_drop_last [DATALOADER_DROP_LAST] + Drop the last incomplete batch if it is not divisible + by the batch size. (default: False) + --eval_steps EVAL_STEPS + Run an evaluation every X steps. Should be an integer + or a float in range `[0,1)`. If smaller than 1, will + be interpreted as ratio of total training steps. + (default: None) + --dataloader_num_workers DATALOADER_NUM_WORKERS + Number of subprocesses to use for data loading + (PyTorch only). 0 means that the data will be loaded + in the main process. (default: 0) + --dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR + Number of batches loaded in advance by each worker. 2 + means there will be a total of 2 * num_workers batches + prefetched across all workers. Default is 2 for + PyTorch < 2.0.0 and otherwise None. (default: None) + --past_index PAST_INDEX + If >=0, uses the corresponding part of the output as + the past state for next step. (default: -1) + --run_name RUN_NAME An optional descriptor for the run. Notably used for + wandb, mlflow and comet logging. (default: None) + --disable_tqdm DISABLE_TQDM + Whether or not to disable the tqdm progress bars. + (default: None) + --remove_unused_columns [REMOVE_UNUSED_COLUMNS] + Remove columns not required by the model when using an + nlp.Dataset. (default: True) + --no_remove_unused_columns + Remove columns not required by the model when using an + nlp.Dataset. (default: False) + --label_names LABEL_NAMES [LABEL_NAMES ...] + The list of keys in your dictionary of inputs that + correspond to the labels. (default: None) + --load_best_model_at_end [LOAD_BEST_MODEL_AT_END] + Whether or not to load the best model found during + training at the end of training. When this option is + enabled, the best checkpoint will always be saved. See + `save_total_limit` for more. (default: False) + --metric_for_best_model METRIC_FOR_BEST_MODEL + The metric to use to compare two different models. + (default: None) + --greater_is_better GREATER_IS_BETTER + Whether the `metric_for_best_model` should be + maximized or not. (default: None) + --ignore_data_skip [IGNORE_DATA_SKIP] + When resuming training, whether or not to skip the + first epochs and batches to get to the same training + data. (default: False) + --fsdp FSDP Whether or not to use PyTorch Fully Sharded Data + Parallel (FSDP) training (in distributed training + only). The base option should be `full_shard`, + `shard_grad_op` or `no_shard` and you can add CPU- + offload to `full_shard` or `shard_grad_op` like this: + full_shard offload` or `shard_grad_op offload`. You + can add auto-wrap to `full_shard` or `shard_grad_op` + with the same syntax: full_shard auto_wrap` or + `shard_grad_op auto_wrap`. (default: ) + --fsdp_min_num_params FSDP_MIN_NUM_PARAMS + This parameter is deprecated. FSDP's minimum number of + parameters for Default Auto Wrapping. (useful only + when `fsdp` field is passed). (default: 0) + --fsdp_config FSDP_CONFIG + Config to be used with FSDP (Pytorch Fully Sharded + Data Parallel). The value is either a fsdp json config + file (e.g., `fsdp_config.json`) or an already loaded + json file as `dict`. (default: None) + --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP + This parameter is deprecated. Transformer layer class + name (case-sensitive) to wrap, e.g, `BertLayer`, + `GPTJBlock`, `T5Block` .... (useful only when `fsdp` + flag is passed). (default: None) + --accelerator_config ACCELERATOR_CONFIG + Config to be used with the internal Accelerator object + initializtion. The value is either a accelerator json + config file (e.g., `accelerator_config.json`) or an + already loaded json file as `dict`. (default: None) + --deepspeed DEEPSPEED + Enable deepspeed and pass the path to deepspeed json + config file (e.g. `ds_config.json`) or an already + loaded json file as a dict (default: None) + --label_smoothing_factor LABEL_SMOOTHING_FACTOR + The label smoothing epsilon to apply (zero means no + label smoothing). (default: 0.0) + --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo} + The optimizer to use. (default: adamw_torch) + --optim_args OPTIM_ARGS + Optional arguments to supply to optimizer. (default: + None) + --adafactor [ADAFACTOR] + Whether or not to replace AdamW by Adafactor. + (default: False) + --group_by_length [GROUP_BY_LENGTH] + Whether or not to group samples of roughly the same + length together when batching. (default: False) + --length_column_name LENGTH_COLUMN_NAME + Column name with precomputed lengths to use when + grouping by length. (default: length) + --report_to REPORT_TO + The list of integrations to report the results and + logs to. (default: None) + --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS + When using distributed training, the value of the flag + `find_unused_parameters` passed to + `DistributedDataParallel`. (default: None) + --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB + When using distributed training, the value of the flag + `bucket_cap_mb` passed to `DistributedDataParallel`. + (default: None) + --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS + When using distributed training, the value of the flag + `broadcast_buffers` passed to + `DistributedDataParallel`. (default: None) + --dataloader_pin_memory [DATALOADER_PIN_MEMORY] + Whether or not to pin memory for DataLoader. (default: + True) + --no_dataloader_pin_memory + Whether or not to pin memory for DataLoader. (default: + False) + --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS] + If True, the data loader will not shut down the worker + processes after a dataset has been consumed once. This + allows to maintain the workers Dataset instances + alive. Can potentially speed up training, but will + increase RAM usage. (default: False) + --skip_memory_metrics [SKIP_MEMORY_METRICS] + Whether or not to skip adding of memory profiler + reports to metrics. (default: True) + --no_skip_memory_metrics + Whether or not to skip adding of memory profiler + reports to metrics. (default: False) + --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP] + Whether or not to use the legacy prediction_loop in + the Trainer. (default: False) + --push_to_hub [PUSH_TO_HUB] + Whether or not to upload the trained model to the + model hub after training. (default: False) + --resume_from_checkpoint RESUME_FROM_CHECKPOINT + The path to a folder with a valid checkpoint for your + model. (default: None) + --hub_model_id HUB_MODEL_ID + The name of the repository to keep in sync with the + local `output_dir`. (default: None) + --hub_strategy {end,every_save,checkpoint,all_checkpoints} + The hub strategy to use when `--push_to_hub` is + activated. (default: every_save) + --hub_token HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --hub_private_repo [HUB_PRIVATE_REPO] + Whether the model repository is private or not. + (default: False) + --hub_always_push [HUB_ALWAYS_PUSH] + Unless `True`, the Trainer will skip pushes if the + previous one wasn't finished yet. (default: False) + --gradient_checkpointing [GRADIENT_CHECKPOINTING] + If True, use gradient checkpointing to save memory at + the expense of slower backward pass. (default: False) + --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS + Gradient checkpointing key word arguments such as + `use_reentrant`. Will be passed to + `torch.utils.checkpoint.checkpoint` through + `model.gradient_checkpointing_enable`. (default: None) + --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS] + Whether or not the inputs will be passed to the + `compute_metrics` function. (default: False) + --eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES] + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: True) + --no_eval_do_concat_batches + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: False) + --fp16_backend {auto,apex,cpu_amp} + Deprecated. Use half_precision_backend instead + (default: auto) + --evaluation_strategy {no,steps,epoch} + Deprecated. Use `eval_strategy` instead (default: + None) + --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID + The name of the repository to which push the + `Trainer`. (default: None) + --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION + The name of the organization in with to which push the + `Trainer`. (default: None) + --push_to_hub_token PUSH_TO_HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --mp_parameters MP_PARAMETERS + Used by the SageMaker launcher to send mp-specific + args. Ignored in Trainer (default: ) + --auto_find_batch_size [AUTO_FIND_BATCH_SIZE] + Whether to automatically decrease the batch size in + half and rerun the training loop again each time a + CUDA Out-of-Memory was reached (default: False) + --full_determinism [FULL_DETERMINISM] + Whether to call enable_full_determinism instead of + set_seed for reproducibility in distributed training. + Important: this will negatively impact the + performance, so only use it for debugging. (default: + False) + --torchdynamo TORCHDYNAMO + This argument is deprecated, use + `--torch_compile_backend` instead. (default: None) + --ray_scope RAY_SCOPE + The scope to use when doing hyperparameter search with + Ray. By default, `"last"` will be used. Ray will then + use the last checkpoint of all trials, compare those, + and select the best one. However, other options are + also available. See the Ray documentation (https://doc + s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun + e.ExperimentAnalysis.get_best_trial) for more options. + (default: last) + --ddp_timeout DDP_TIMEOUT + Overrides the default timeout for distributed training + (value should be given in seconds). (default: 1800) + --torch_compile [TORCH_COMPILE] + If set to `True`, the model will be wrapped in + `torch.compile`. (default: False) + --torch_compile_backend TORCH_COMPILE_BACKEND + Which backend to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --torch_compile_mode TORCH_COMPILE_MODE + Which mode to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --dispatch_batches DISPATCH_BATCHES + Deprecated. Pass {'dispatch_batches':VALUE} to + `accelerator_config`. (default: None) + --split_batches SPLIT_BATCHES + Deprecated. Pass {'split_batches':True} to + `accelerator_config`. (default: None) + --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND] + If set to `True`, the speed metrics will include `tgs` + (tokens per second per device). (default: False) + --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN] + If set to `True`, will track the number of input + tokens seen throughout training. (May be slower in + distributed training) (default: False) + --neftune_noise_alpha NEFTUNE_NOISE_ALPHA + Activates neftune noise embeddings into the model. + NEFTune has been proven to drastically improve model + performances for instrcution fine-tuning. Check out + the original paper here: + https://arxiv.org/abs/2310.05914 and the original code + here: https://github.com/neelsjain/NEFTune. Only + supported for `PreTrainedModel` and `PeftModel` + classes. (default: None) + --optim_target_modules OPTIM_TARGET_MODULES + Target modules for the optimizer defined in the + `optim` argument. Only used for the GaLore optimizer + at the moment. (default: None) + --batch_eval_metrics [BATCH_EVAL_METRICS] + Break eval metrics calculation into batches to save + memory. (default: False) + --eval_on_start [EVAL_ON_START] + Whether to run through the entire `evaluation` step at + the very beginning of training as a sanity check. + (default: False) + --eval_use_gather_object [EVAL_USE_GATHER_OBJECT] + Whether to run recursively gather object in a nested + list/tuple/dictionary of objects from all devices. + (default: False) + --sortish_sampler [SORTISH_SAMPLER] + Whether to use SortishSampler or not. (default: False) + --predict_with_generate [PREDICT_WITH_GENERATE] + Whether to use generate to calculate generative + metrics (ROUGE, BLEU). (default: False) + --generation_max_length GENERATION_MAX_LENGTH + The `max_length` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `max_length` value of the model configuration. + (default: None) + --generation_num_beams GENERATION_NUM_BEAMS + The `num_beams` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `num_beams` value of the model configuration. + (default: None) + --generation_config GENERATION_CONFIG + Model id, file path or url pointing to a + GenerationConfig json file, to use during prediction. + (default: None) + --use_badam [USE_BADAM] + Whether or not to use the BAdam optimizer. (default: + False) + --badam_mode {layer,ratio} + Whether to use layer-wise or ratio-wise BAdam + optimizer. (default: layer) + --badam_start_block BADAM_START_BLOCK + The starting block index for layer-wise BAdam. + (default: None) + --badam_switch_mode {ascending,descending,random,fixed} + the strategy of picking block to update for layer-wise + BAdam. (default: ascending) + --badam_switch_interval BADAM_SWITCH_INTERVAL + Number of steps to update the block for layer-wise + BAdam. Use -1 to disable the block update. (default: + 50) + --badam_update_ratio BADAM_UPDATE_RATIO + The ratio of the update for ratio-wise BAdam. + (default: 0.05) + --badam_mask_mode {adjacent,scatter} + The mode of the mask for BAdam optimizer. `adjacent` + means that the trainable parameters are adjacent to + each other, `scatter` means that trainable parameters + are randomly choosed from the weight. (default: + adjacent) + --badam_verbose BADAM_VERBOSE + The verbosity level of BAdam optimizer. 0 for no + print, 1 for print the block prefix, 2 for print + trainable parameters. (default: 0) + --use_galore [USE_GALORE] + Whether or not to use the gradient low-Rank projection + (GaLore). (default: False) + --galore_target GALORE_TARGET + Name(s) of modules to apply GaLore. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --galore_rank GALORE_RANK + The rank of GaLore gradients. (default: 16) + --galore_update_interval GALORE_UPDATE_INTERVAL + Number of steps to update the GaLore projection. + (default: 200) + --galore_scale GALORE_SCALE + GaLore scaling coefficient. (default: 0.25) + --galore_proj_type {std,reverse_std,right,left,full} + Type of GaLore projection. (default: std) + --galore_layerwise [GALORE_LAYERWISE] + Whether or not to enable layer-wise update to further + save memory. (default: False) + --pref_beta PREF_BETA + The beta parameter in the preference loss. (default: + 0.1) + --pref_ftx PREF_FTX The supervised fine-tuning loss coefficient in DPO + training. (default: 0.0) + --pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo} + The type of DPO loss to use. (default: sigmoid) + --dpo_label_smoothing DPO_LABEL_SMOOTHING + The robust DPO label smoothing parameter in cDPO that + should be between 0 and 0.5. (default: 0.0) + --kto_chosen_weight KTO_CHOSEN_WEIGHT + The weight factor of the desirable losses in KTO + training. (default: 1.0) + --kto_rejected_weight KTO_REJECTED_WEIGHT + The weight factor of the undesirable losses in KTO + training. (default: 1.0) + --simpo_gamma SIMPO_GAMMA + The target reward margin term in SimPO loss. (default: + 0.5) + --ppo_buffer_size PPO_BUFFER_SIZE + The number of mini-batches to make experience buffer + in a PPO optimization step. (default: 1) + --ppo_epochs PPO_EPOCHS + The number of epochs to perform in a PPO optimization + step. (default: 4) + --ppo_score_norm [PPO_SCORE_NORM] + Use score normalization in PPO training. (default: + False) + --ppo_target PPO_TARGET + Target KL value for adaptive KL control in PPO + training. (default: 6.0) + --ppo_whiten_rewards [PPO_WHITEN_REWARDS] + Whiten the rewards before compute advantages in PPO + training. (default: False) + --ref_model REF_MODEL + Path to the reference model used for the PPO or DPO + training. (default: None) + --ref_model_adapters REF_MODEL_ADAPTERS + Path to the adapters of the reference model. (default: + None) + --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reference model. + (default: None) + --reward_model REWARD_MODEL + Path to the reward model used for the PPO training. + (default: None) + --reward_model_adapters REWARD_MODEL_ADAPTERS + Path to the adapters of the reward model. (default: + None) + --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reward model. + (default: None) + --reward_model_type {lora,full,api} + The type of the reward model in PPO training. Lora + model only supports lora training. (default: lora) + --additional_target ADDITIONAL_TARGET + Name(s) of modules apart from LoRA layers to be set as + trainable and saved in the final checkpoint. Use + commas to separate multiple modules. (default: None) + --lora_alpha LORA_ALPHA + The scale factor for LoRA fine-tuning (default: + lora_rank * 2). (default: None) + --lora_dropout LORA_DROPOUT + Dropout rate for the LoRA fine-tuning. (default: 0.0) + --lora_rank LORA_RANK + The intrinsic dimension for LoRA fine-tuning. + (default: 8) + --lora_target LORA_TARGET + Name(s) of target modules to apply LoRA. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --loraplus_lr_ratio LORAPLUS_LR_RATIO + LoRA plus learning rate ratio (lr_B / lr_A). (default: + None) + --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING + LoRA plus learning rate for lora embedding layers. + (default: 1e-06) + --use_rslora [USE_RSLORA] + Whether or not to use the rank stabilization scaling + factor for LoRA layer. (default: False) + --use_dora [USE_DORA] + Whether or not to use the weight-decomposed lora + method (DoRA). (default: False) + --pissa_init [PISSA_INIT] + Whether or not to initialize a PiSSA adapter. + (default: False) + --pissa_iter PISSA_ITER + The number of iteration steps performed by FSVD in + PiSSA. Use -1 to disable it. (default: 16) + --pissa_convert [PISSA_CONVERT] + Whether or not to convert the PiSSA adapter to a + normal LoRA adapter. (default: False) + --create_new_adapter [CREATE_NEW_ADAPTER] + Whether or not to create a new adapter with randomly + initialized weight. (default: False) + --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS + The number of trainable layers for freeze (partial- + parameter) fine-tuning. Positive numbers mean the last + n layers are set as trainable, negative numbers mean + the first n layers are set as trainable. (default: 2) + --freeze_trainable_modules FREEZE_TRAINABLE_MODULES + Name(s) of trainable modules for freeze (partial- + parameter) fine-tuning. Use commas to separate + multiple modules. Use `all` to specify all the + available modules. (default: all) + --freeze_extra_modules FREEZE_EXTRA_MODULES + Name(s) of modules apart from hidden layers to be set + as trainable for freeze (partial-parameter) fine- + tuning. Use commas to separate multiple modules. + (default: None) + --pure_bf16 [PURE_BF16] + Whether or not to train model in purely bf16 precision + (without AMP). (default: False) + --stage {pt,sft,rm,ppo,dpo,kto} + Which stage will be performed in training. (default: + sft) + --finetuning_type {lora,freeze,full} + Which fine-tuning method to use. (default: lora) + --use_llama_pro [USE_LLAMA_PRO] + Whether or not to make only the parameters in the + expanded blocks trainable. (default: False) + --use_adam_mini [USE_ADAM_MINI] + Whether or not to use the Adam-mini optimizer. + (default: False) + --freeze_vision_tower [FREEZE_VISION_TOWER] + Whether ot not to freeze vision tower in MLLM + training. (default: True) + --no_freeze_vision_tower + Whether ot not to freeze vision tower in MLLM + training. (default: False) + --train_mm_proj_only [TRAIN_MM_PROJ_ONLY] + Whether or not to train the multimodal projector for + MLLM only. (default: False) + --compute_accuracy [COMPUTE_ACCURACY] + Whether or not to compute the token-level accuracy at + evaluation. (default: False) + --plot_loss [PLOT_LOSS] + Whether or not to save the training loss curves. + (default: False) + --do_sample [DO_SAMPLE] + Whether or not to use sampling, use greedy decoding + otherwise. (default: True) + --no_do_sample Whether or not to use sampling, use greedy decoding + otherwise. (default: False) + --temperature TEMPERATURE + The value used to modulate the next token + probabilities. (default: 0.95) + --top_p TOP_P The smallest set of most probable tokens with + probabilities that add up to top_p or higher are kept. + (default: 0.7) + --top_k TOP_K The number of highest probability vocabulary tokens to + keep for top-k filtering. (default: 50) + --num_beams NUM_BEAMS + Number of beams for beam search. 1 means no beam + search. (default: 1) + --max_length MAX_LENGTH + The maximum length the generated tokens can have. It + can be overridden by max_new_tokens. (default: 1024) + --max_new_tokens MAX_NEW_TOKENS + The maximum numbers of tokens to generate, ignoring + the number of tokens in the prompt. (default: 1024) + --repetition_penalty REPETITION_PENALTY + The parameter for repetition penalty. 1.0 means no + penalty. (default: 1.0) + --length_penalty LENGTH_PENALTY + Exponential penalty to the length that is used with + beam-based generation. (default: 1.0) + --default_system DEFAULT_SYSTEM + Default system message to use in chat completion. + (default: None) +usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH + [--adapter_name_or_path ADAPTER_NAME_OR_PATH] + [--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR] + [--use_fast_tokenizer [USE_FAST_TOKENIZER]] + [--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]] + [--split_special_tokens [SPLIT_SPECIAL_TOKENS]] + [--new_special_tokens NEW_SPECIAL_TOKENS] + [--model_revision MODEL_REVISION] + [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]] + [--no_low_cpu_mem_usage] + [--quantization_method {bitsandbytes,hqq,eetq}] + [--quantization_bit QUANTIZATION_BIT] + [--quantization_type {fp4,nf4}] + [--double_quantization [DOUBLE_QUANTIZATION]] + [--no_double_quantization] + [--quantization_device_map {auto}] + [--rope_scaling {linear,dynamic}] + [--flash_attn {auto,disabled,sdpa,fa2}] + [--shift_attn [SHIFT_ATTN]] + [--mixture_of_depths {convert,load}] + [--use_unsloth [USE_UNSLOTH]] + [--visual_inputs [VISUAL_INPUTS]] + [--moe_aux_loss_coef MOE_AUX_LOSS_COEF] + [--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]] + [--upcast_layernorm [UPCAST_LAYERNORM]] + [--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]] + [--train_from_scratch [TRAIN_FROM_SCRATCH]] + [--infer_backend {huggingface,vllm}] + [--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL] + [--vllm_enforce_eager [VLLM_ENFORCE_EAGER]] + [--vllm_max_lora_rank VLLM_MAX_LORA_RANK] + [--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]] + [--no_use_cache] + [--infer_dtype {auto,float16,bfloat16,float32}] + [--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN] + [--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE] + [--export_device {cpu,auto}] + [--export_quantization_bit EXPORT_QUANTIZATION_BIT] + [--export_quantization_dataset EXPORT_QUANTIZATION_DATASET] + [--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES] + [--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN] + [--export_legacy_format [EXPORT_LEGACY_FORMAT]] + [--export_hub_model_id EXPORT_HUB_MODEL_ID] + [--print_param_status [PRINT_PARAM_STATUS]] + [--template TEMPLATE] [--dataset DATASET] + [--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR] + [--cutoff_len CUTOFF_LEN] + [--train_on_prompt [TRAIN_ON_PROMPT]] + [--mask_history [MASK_HISTORY]] [--streaming [STREAMING]] + [--buffer_size BUFFER_SIZE] + [--mix_strategy {concat,interleave_under,interleave_over}] + [--interleave_probs INTERLEAVE_PROBS] + [--overwrite_cache [OVERWRITE_CACHE]] + [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS] + [--max_samples MAX_SAMPLES] + [--eval_num_beams EVAL_NUM_BEAMS] + [--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]] + [--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE] + [--packing PACKING] [--neat_packing [NEAT_PACKING]] + [--tool_format TOOL_FORMAT] + [--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR + [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] + [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] + [--do_predict [DO_PREDICT]] + [--eval_strategy {no,steps,epoch}] + [--prediction_loss_only [PREDICTION_LOSS_ONLY]] + [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] + [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE] + [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] + [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] + [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] + [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] + [--eval_delay EVAL_DELAY] + [--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS] + [--learning_rate LEARNING_RATE] + [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] + [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] + [--max_grad_norm MAX_GRAD_NORM] + [--num_train_epochs NUM_TRAIN_EPOCHS] + [--max_steps MAX_STEPS] + [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}] + [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] + [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS] + [--log_level {detail,debug,info,warning,error,critical,passive}] + [--log_level_replica {detail,debug,info,warning,error,critical,passive}] + [--log_on_each_node [LOG_ON_EACH_NODE]] + [--no_log_on_each_node] [--logging_dir LOGGING_DIR] + [--logging_strategy {no,steps,epoch}] + [--logging_first_step [LOGGING_FIRST_STEP]] + [--logging_steps LOGGING_STEPS] + [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] + [--no_logging_nan_inf_filter] + [--save_strategy {no,steps,epoch}] + [--save_steps SAVE_STEPS] + [--save_total_limit SAVE_TOTAL_LIMIT] + [--save_safetensors [SAVE_SAFETENSORS]] + [--no_save_safetensors] + [--save_on_each_node [SAVE_ON_EACH_NODE]] + [--save_only_model [SAVE_ONLY_MODEL]] + [--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]] + [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]] + [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] + [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] + [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] + [--fp16_opt_level FP16_OPT_LEVEL] + [--half_precision_backend {auto,apex,cpu_amp}] + [--bf16_full_eval [BF16_FULL_EVAL]] + [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] + [--local_rank LOCAL_RANK] + [--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}] + [--tpu_num_cores TPU_NUM_CORES] + [--tpu_metrics_debug [TPU_METRICS_DEBUG]] + [--debug DEBUG [DEBUG ...]] + [--dataloader_drop_last [DATALOADER_DROP_LAST]] + [--eval_steps EVAL_STEPS] + [--dataloader_num_workers DATALOADER_NUM_WORKERS] + [--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR] + [--past_index PAST_INDEX] [--run_name RUN_NAME] + [--disable_tqdm DISABLE_TQDM] + [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] + [--no_remove_unused_columns] + [--label_names LABEL_NAMES [LABEL_NAMES ...]] + [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] + [--metric_for_best_model METRIC_FOR_BEST_MODEL] + [--greater_is_better GREATER_IS_BETTER] + [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP] + [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] + [--fsdp_config FSDP_CONFIG] + [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] + [--accelerator_config ACCELERATOR_CONFIG] + [--deepspeed DEEPSPEED] + [--label_smoothing_factor LABEL_SMOOTHING_FACTOR] + [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}] + [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] + [--group_by_length [GROUP_BY_LENGTH]] + [--length_column_name LENGTH_COLUMN_NAME] + [--report_to REPORT_TO] + [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] + [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] + [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] + [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] + [--no_dataloader_pin_memory] + [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] + [--skip_memory_metrics [SKIP_MEMORY_METRICS]] + [--no_skip_memory_metrics] + [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] + [--push_to_hub [PUSH_TO_HUB]] + [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] + [--hub_model_id HUB_MODEL_ID] + [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] + [--hub_token HUB_TOKEN] + [--hub_private_repo [HUB_PRIVATE_REPO]] + [--hub_always_push [HUB_ALWAYS_PUSH]] + [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] + [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS] + [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] + [--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]] + [--no_eval_do_concat_batches] + [--fp16_backend {auto,apex,cpu_amp}] + [--evaluation_strategy {no,steps,epoch}] + [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID] + [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] + [--push_to_hub_token PUSH_TO_HUB_TOKEN] + [--mp_parameters MP_PARAMETERS] + [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] + [--full_determinism [FULL_DETERMINISM]] + [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] + [--ddp_timeout DDP_TIMEOUT] + [--torch_compile [TORCH_COMPILE]] + [--torch_compile_backend TORCH_COMPILE_BACKEND] + [--torch_compile_mode TORCH_COMPILE_MODE] + [--dispatch_batches DISPATCH_BATCHES] + [--split_batches SPLIT_BATCHES] + [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] + [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]] + [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] + [--optim_target_modules OPTIM_TARGET_MODULES] + [--batch_eval_metrics [BATCH_EVAL_METRICS]] + [--eval_on_start [EVAL_ON_START]] + [--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]] + [--sortish_sampler [SORTISH_SAMPLER]] + [--predict_with_generate [PREDICT_WITH_GENERATE]] + [--generation_max_length GENERATION_MAX_LENGTH] + [--generation_num_beams GENERATION_NUM_BEAMS] + [--generation_config GENERATION_CONFIG] + [--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}] + [--badam_start_block BADAM_START_BLOCK] + [--badam_switch_mode {ascending,descending,random,fixed}] + [--badam_switch_interval BADAM_SWITCH_INTERVAL] + [--badam_update_ratio BADAM_UPDATE_RATIO] + [--badam_mask_mode {adjacent,scatter}] + [--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]] + [--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK] + [--galore_update_interval GALORE_UPDATE_INTERVAL] + [--galore_scale GALORE_SCALE] + [--galore_proj_type {std,reverse_std,right,left,full}] + [--galore_layerwise [GALORE_LAYERWISE]] + [--pref_beta PREF_BETA] [--pref_ftx PREF_FTX] + [--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}] + [--dpo_label_smoothing DPO_LABEL_SMOOTHING] + [--kto_chosen_weight KTO_CHOSEN_WEIGHT] + [--kto_rejected_weight KTO_REJECTED_WEIGHT] + [--simpo_gamma SIMPO_GAMMA] + [--ppo_buffer_size PPO_BUFFER_SIZE] + [--ppo_epochs PPO_EPOCHS] + [--ppo_score_norm [PPO_SCORE_NORM]] + [--ppo_target PPO_TARGET] + [--ppo_whiten_rewards [PPO_WHITEN_REWARDS]] + [--ref_model REF_MODEL] + [--ref_model_adapters REF_MODEL_ADAPTERS] + [--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT] + [--reward_model REWARD_MODEL] + [--reward_model_adapters REWARD_MODEL_ADAPTERS] + [--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT] + [--reward_model_type {lora,full,api}] + [--additional_target ADDITIONAL_TARGET] + [--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT] + [--lora_rank LORA_RANK] [--lora_target LORA_TARGET] + [--loraplus_lr_ratio LORAPLUS_LR_RATIO] + [--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING] + [--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]] + [--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER] + [--pissa_convert [PISSA_CONVERT]] + [--create_new_adapter [CREATE_NEW_ADAPTER]] + [--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS] + [--freeze_trainable_modules FREEZE_TRAINABLE_MODULES] + [--freeze_extra_modules FREEZE_EXTRA_MODULES] + [--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}] + [--finetuning_type {lora,freeze,full}] + [--use_llama_pro [USE_LLAMA_PRO]] + [--use_adam_mini [USE_ADAM_MINI]] + [--freeze_vision_tower [FREEZE_VISION_TOWER]] + [--no_freeze_vision_tower] + [--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]] + [--compute_accuracy [COMPUTE_ACCURACY]] + [--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]] + [--no_do_sample] [--temperature TEMPERATURE] + [--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS] + [--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS] + [--repetition_penalty REPETITION_PENALTY] + [--length_penalty LENGTH_PENALTY] + [--default_system DEFAULT_SYSTEM] + +optional arguments: + -h, --help show this help message and exit + --model_name_or_path MODEL_NAME_OR_PATH + Path to the model weight or identifier from + huggingface.co/models or modelscope.cn/models. + (default: None) + --adapter_name_or_path ADAPTER_NAME_OR_PATH + Path to the adapter weight or identifier from + huggingface.co/models. Use commas to separate multiple + adapters. (default: None) + --adapter_folder ADAPTER_FOLDER + The folder containing the adapter weights to load. + (default: None) + --cache_dir CACHE_DIR + Where to store the pre-trained models downloaded from + huggingface.co or modelscope.cn. (default: None) + --use_fast_tokenizer [USE_FAST_TOKENIZER] + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: True) + --no_use_fast_tokenizer + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: False) + --resize_vocab [RESIZE_VOCAB] + Whether or not to resize the tokenizer vocab and the + embedding layers. (default: False) + --split_special_tokens [SPLIT_SPECIAL_TOKENS] + Whether or not the special tokens should be split + during the tokenization process. (default: False) + --new_special_tokens NEW_SPECIAL_TOKENS + Special tokens to be added into the tokenizer. Use + commas to separate multiple tokens. (default: None) + --model_revision MODEL_REVISION + The specific model version to use (can be a branch + name, tag name or commit id). (default: main) + --low_cpu_mem_usage [LOW_CPU_MEM_USAGE] + Whether or not to use memory-efficient model loading. + (default: True) + --no_low_cpu_mem_usage + Whether or not to use memory-efficient model loading. + (default: False) + --quantization_method {bitsandbytes,hqq,eetq} + Quantization method to use for on-the-fly + quantization. (default: bitsandbytes) + --quantization_bit QUANTIZATION_BIT + The number of bits to quantize the model using + bitsandbytes. (default: None) + --quantization_type {fp4,nf4} + Quantization data type to use in int4 training. + (default: nf4) + --double_quantization [DOUBLE_QUANTIZATION] + Whether or not to use double quantization in int4 + training. (default: True) + --no_double_quantization + Whether or not to use double quantization in int4 + training. (default: False) + --quantization_device_map {auto} + Device map used to infer the 4-bit quantized model, + needs bitsandbytes>=0.43.0. (default: None) + --rope_scaling {linear,dynamic} + Which scaling strategy should be adopted for the RoPE + embeddings. (default: None) + --flash_attn {auto,disabled,sdpa,fa2} + Enable FlashAttention for faster training and + inference. (default: auto) + --shift_attn [SHIFT_ATTN] + Enable shift short attention (S^2-Attn) proposed by + LongLoRA. (default: False) + --mixture_of_depths {convert,load} + Convert the model to mixture-of-depths (MoD) or load + the MoD model. (default: None) + --use_unsloth [USE_UNSLOTH] + Whether or not to use unsloth's optimization for the + LoRA training. (default: False) + --visual_inputs [VISUAL_INPUTS] + Whethor or not to use multimodal LLM that accepts + visual inputs. (default: False) + --moe_aux_loss_coef MOE_AUX_LOSS_COEF + Coefficient of the auxiliary router loss in mixture- + of-experts model. (default: None) + --disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING] + Whether or not to disable gradient checkpointing. + (default: False) + --upcast_layernorm [UPCAST_LAYERNORM] + Whether or not to upcast the layernorm weights in + fp32. (default: False) + --upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT] + Whether or not to upcast the output of lm_head in + fp32. (default: False) + --train_from_scratch [TRAIN_FROM_SCRATCH] + Whether or not to randomly initialize the model + weights. (default: False) + --infer_backend {huggingface,vllm} + Backend engine used at inference. (default: + huggingface) + --vllm_maxlen VLLM_MAXLEN + Maximum sequence (prompt + response) length of the + vLLM engine. (default: 2048) + --vllm_gpu_util VLLM_GPU_UTIL + The fraction of GPU memory in (0,1) to be used for the + vLLM engine. (default: 0.9) + --vllm_enforce_eager [VLLM_ENFORCE_EAGER] + Whether or not to disable CUDA graph in the vLLM + engine. (default: False) + --vllm_max_lora_rank VLLM_MAX_LORA_RANK + Maximum rank of all LoRAs in the vLLM engine. + (default: 32) + --offload_folder OFFLOAD_FOLDER + Path to offload model weights. (default: offload) + --use_cache [USE_CACHE] + Whether or not to use KV cache in generation. + (default: True) + --no_use_cache Whether or not to use KV cache in generation. + (default: False) + --infer_dtype {auto,float16,bfloat16,float32} + Data type for model weights and activations at + inference. (default: auto) + --hf_hub_token HF_HUB_TOKEN + Auth token to log in with Hugging Face Hub. (default: + None) + --ms_hub_token MS_HUB_TOKEN + Auth token to log in with ModelScope Hub. (default: + None) + --export_dir EXPORT_DIR + Path to the directory to save the exported model. + (default: None) + --export_size EXPORT_SIZE + The file shard size (in GB) of the exported model. + (default: 1) + --export_device {cpu,auto} + The device used in model export, use `auto` to + accelerate exporting. (default: cpu) + --export_quantization_bit EXPORT_QUANTIZATION_BIT + The number of bits to quantize the exported model. + (default: None) + --export_quantization_dataset EXPORT_QUANTIZATION_DATASET + Path to the dataset or dataset name to use in + quantizing the exported model. (default: None) + --export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES + The number of samples used for quantization. (default: + 128) + --export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN + The maximum length of the model inputs used for + quantization. (default: 1024) + --export_legacy_format [EXPORT_LEGACY_FORMAT] + Whether or not to save the `.bin` files instead of + `.safetensors`. (default: False) + --export_hub_model_id EXPORT_HUB_MODEL_ID + The name of the repository if push the model to the + Hugging Face hub. (default: None) + --print_param_status [PRINT_PARAM_STATUS] + For debugging purposes, print the status of the + parameters in the model. (default: False) + --template TEMPLATE Which template to use for constructing prompts in + training and inference. (default: None) + --dataset DATASET The name of dataset(s) to use for training. Use commas + to separate multiple datasets. (default: None) + --eval_dataset EVAL_DATASET + The name of dataset(s) to use for evaluation. Use + commas to separate multiple datasets. (default: None) + --dataset_dir DATASET_DIR + Path to the folder containing the datasets. (default: + data) + --cutoff_len CUTOFF_LEN + The cutoff length of the tokenized inputs in the + dataset. (default: 1024) + --train_on_prompt [TRAIN_ON_PROMPT] + Whether or not to disable the mask on the prompt. + (default: False) + --mask_history [MASK_HISTORY] + Whether or not to mask the history and train on the + last turn only. (default: False) + --streaming [STREAMING] + Enable dataset streaming. (default: False) + --buffer_size BUFFER_SIZE + Size of the buffer to randomly sample examples from in + dataset streaming. (default: 16384) + --mix_strategy {concat,interleave_under,interleave_over} + Strategy to use in dataset mixing (concat/interleave) + (undersampling/oversampling). (default: concat) + --interleave_probs INTERLEAVE_PROBS + Probabilities to sample data from datasets. Use commas + to separate multiple datasets. (default: None) + --overwrite_cache [OVERWRITE_CACHE] + Overwrite the cached training and evaluation sets. + (default: False) + --preprocessing_num_workers PREPROCESSING_NUM_WORKERS + The number of processes to use for the pre-processing. + (default: None) + --max_samples MAX_SAMPLES + For debugging purposes, truncate the number of + examples for each dataset. (default: None) + --eval_num_beams EVAL_NUM_BEAMS + Number of beams to use for evaluation. This argument + will be passed to `model.generate` (default: None) + --ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS] + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: True) + --no_ignore_pad_token_for_loss + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: False) + --val_size VAL_SIZE Size of the development set, should be an integer or a + float in range `[0,1)`. (default: 0.0) + --packing PACKING Enable sequences packing in training. Will + automatically enable in pre-training. (default: None) + --neat_packing [NEAT_PACKING] + Enable sequence packing without cross-attention. + (default: False) + --tool_format TOOL_FORMAT + Tool format to use for constructing function calling + examples. (default: None) + --tokenized_path TOKENIZED_PATH + Path to save or load the tokenized datasets. (default: + None) + --output_dir OUTPUT_DIR + The output directory where the model predictions and + checkpoints will be written. (default: None) + --overwrite_output_dir [OVERWRITE_OUTPUT_DIR] + Overwrite the content of the output directory. Use + this to continue training if output_dir points to a + checkpoint directory. (default: False) + --do_train [DO_TRAIN] + Whether to run training. (default: False) + --do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False) + --do_predict [DO_PREDICT] + Whether to run predictions on the test set. (default: + False) + --eval_strategy {no,steps,epoch} + The evaluation strategy to use. (default: no) + --prediction_loss_only [PREDICTION_LOSS_ONLY] + When performing evaluation and predictions, only + returns the loss. (default: False) + --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for training. + (default: 8) + --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for + evaluation. (default: 8) + --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE + Deprecated, the use of `--per_device_train_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + training. (default: None) + --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE + Deprecated, the use of `--per_device_eval_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + evaluation. (default: None) + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + Number of updates steps to accumulate before + performing a backward/update pass. (default: 1) + --eval_accumulation_steps EVAL_ACCUMULATION_STEPS + Number of predictions steps to accumulate before + moving the tensors to the CPU. (default: None) + --eval_delay EVAL_DELAY + Number of epochs or steps to wait for before the first + evaluation can be performed, depending on the + eval_strategy. (default: 0) + --torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS + Number of steps to wait before calling + `torch..empty_cache()`.This can help avoid + CUDA out-of-memory errors by lowering peak VRAM usage + at a cost of about [10{'option_strings': ['-- + torch_empty_cache_steps'], 'dest': + 'torch_empty_cache_steps', 'nargs': None, 'const': + None, 'default': None, 'type': 'int', 'choices': None, + 'required': False, 'help': 'Number of steps to wait + before calling `torch..empty_cache()`.This can + help avoid CUDA out-of-memory errors by lowering peak + VRAM usage at a cost of about [10% slower performance] + (https://github.com/huggingface/transformers/issues/31 + 372).If left unset or set to None, cache will not be + emptied.', 'metavar': None, 'container': + , + 'prog': 'launcher.py'}lower performance](https://githu + b.com/huggingface/transformers/issues/31372).If left + unset or set to None, cache will not be emptied. + (default: None) + --learning_rate LEARNING_RATE + The initial learning rate for AdamW. (default: 5e-05) + --weight_decay WEIGHT_DECAY + Weight decay for AdamW if we apply some. (default: + 0.0) + --adam_beta1 ADAM_BETA1 + Beta1 for AdamW optimizer (default: 0.9) + --adam_beta2 ADAM_BETA2 + Beta2 for AdamW optimizer (default: 0.999) + --adam_epsilon ADAM_EPSILON + Epsilon for AdamW optimizer. (default: 1e-08) + --max_grad_norm MAX_GRAD_NORM + Max gradient norm. (default: 1.0) + --num_train_epochs NUM_TRAIN_EPOCHS + Total number of training epochs to perform. (default: + 3.0) + --max_steps MAX_STEPS + If > 0: set total number of training steps to perform. + Override num_train_epochs. (default: -1) + --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay} + The scheduler type to use. (default: linear) + --lr_scheduler_kwargs LR_SCHEDULER_KWARGS + Extra parameters for the lr_scheduler such as + {'num_cycles': 1} for the cosine with hard restarts. + (default: {}) + --warmup_ratio WARMUP_RATIO + Linear warmup over warmup_ratio fraction of total + steps. (default: 0.0) + --warmup_steps WARMUP_STEPS + Linear warmup over warmup_steps. (default: 0) + --log_level {detail,debug,info,warning,error,critical,passive} + Logger log level to use on the main node. Possible + choices are the log levels as strings: 'debug', + 'info', 'warning', 'error' and 'critical', plus a + 'passive' level which doesn't set anything and lets + the application set the level. Defaults to 'passive'. + (default: passive) + --log_level_replica {detail,debug,info,warning,error,critical,passive} + Logger log level to use on replica nodes. Same choices + and defaults as ``log_level`` (default: warning) + --log_on_each_node [LOG_ON_EACH_NODE] + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: True) + --no_log_on_each_node + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: False) + --logging_dir LOGGING_DIR + Tensorboard log dir. (default: None) + --logging_strategy {no,steps,epoch} + The logging strategy to use. (default: steps) + --logging_first_step [LOGGING_FIRST_STEP] + Log the first global_step (default: False) + --logging_steps LOGGING_STEPS + Log every X updates steps. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be + interpreted as ratio of total training steps. + (default: 500) + --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER] + Filter nan and inf losses for logging. (default: True) + --no_logging_nan_inf_filter + Filter nan and inf losses for logging. (default: + False) + --save_strategy {no,steps,epoch} + The checkpoint save strategy to use. (default: steps) + --save_steps SAVE_STEPS + Save checkpoint every X updates steps. Should be an + integer or a float in range `[0,1)`. If smaller than + 1, will be interpreted as ratio of total training + steps. (default: 500) + --save_total_limit SAVE_TOTAL_LIMIT + If a value is passed, will limit the total amount of + checkpoints. Deletes the older checkpoints in + `output_dir`. When `load_best_model_at_end` is + enabled, the 'best' checkpoint according to + `metric_for_best_model` will always be retained in + addition to the most recent ones. For example, for + `save_total_limit=5` and + `load_best_model_at_end=True`, the four last + checkpoints will always be retained alongside the best + model. When `save_total_limit=1` and + `load_best_model_at_end=True`, it is possible that two + checkpoints are saved: the last one and the best one + (if they are different). Default is unlimited + checkpoints (default: None) + --save_safetensors [SAVE_SAFETENSORS] + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: True) + --no_save_safetensors + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: False) + --save_on_each_node [SAVE_ON_EACH_NODE] + When doing multi-node distributed training, whether to + save models and checkpoints on each node, or only on + the main one (default: False) + --save_only_model [SAVE_ONLY_MODEL] + When checkpointing, whether to only save the model, or + also the optimizer, scheduler & rng state.Note that + when this is true, you won't be able to resume + training from checkpoint.This enables you to save + storage by not storing the optimizer, scheduler & rng + state.You can only load the model using + from_pretrained with this option set to True. + (default: False) + --restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT] + Whether to restore the callback states from the + checkpoint. If `True`, will override callbacks passed + to the `Trainer` if they exist in the checkpoint. + (default: False) + --no_cuda [NO_CUDA] This argument is deprecated. It will be removed in + version 5.0 of 🤗 Transformers. (default: False) + --use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will + use cuda/tpu/mps/npu device if available. (default: + False) + --use_mps_device [USE_MPS_DEVICE] + This argument is deprecated. `mps` device will be used + if available similar to `cuda` device. It will be + removed in version 5.0 of 🤗 Transformers (default: + False) + --seed SEED Random seed that will be set at the beginning of + training. (default: 42) + --data_seed DATA_SEED + Random seed to be used with data samplers. (default: + None) + --jit_mode_eval [JIT_MODE_EVAL] + Whether or not to use PyTorch jit trace for inference + (default: False) + --use_ipex [USE_IPEX] + Use Intel extension for PyTorch when it is available, + installation: 'https://github.com/intel/intel- + extension-for-pytorch' (default: False) + --bf16 [BF16] Whether to use bf16 (mixed) precision instead of + 32-bit. Requires Ampere or higher NVIDIA architecture + or using CPU (use_cpu) or Ascend NPU. This is an + experimental API and it may change. (default: False) + --fp16 [FP16] Whether to use fp16 (mixed) precision instead of + 32-bit (default: False) + --fp16_opt_level FP16_OPT_LEVEL + For fp16: Apex AMP optimization level selected in + ['O0', 'O1', 'O2', and 'O3']. See details at + https://nvidia.github.io/apex/amp.html (default: O1) + --half_precision_backend {auto,apex,cpu_amp} + The backend to be used for half precision. (default: + auto) + --bf16_full_eval [BF16_FULL_EVAL] + Whether to use full bfloat16 evaluation instead of + 32-bit. This is an experimental API and it may change. + (default: False) + --fp16_full_eval [FP16_FULL_EVAL] + Whether to use full float16 evaluation instead of + 32-bit (default: False) + --tf32 TF32 Whether to enable tf32 mode, available in Ampere and + newer GPU architectures. This is an experimental API + and it may change. (default: None) + --local_rank LOCAL_RANK + For distributed training: local_rank (default: -1) + --ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl} + The backend to be used for distributed training + (default: None) + --tpu_num_cores TPU_NUM_CORES + TPU: Number of TPU cores (automatically passed by + launcher script) (default: None) + --tpu_metrics_debug [TPU_METRICS_DEBUG] + Deprecated, the use of `--debug tpu_metrics_debug` is + preferred. TPU: Whether to print debug metrics + (default: False) + --debug DEBUG [DEBUG ...] + Whether or not to enable debug mode. Current options: + `underflow_overflow` (Detect underflow and overflow in + activations and weights), `tpu_metrics_debug` (print + debug metrics on TPU). (default: None) + --dataloader_drop_last [DATALOADER_DROP_LAST] + Drop the last incomplete batch if it is not divisible + by the batch size. (default: False) + --eval_steps EVAL_STEPS + Run an evaluation every X steps. Should be an integer + or a float in range `[0,1)`. If smaller than 1, will + be interpreted as ratio of total training steps. + (default: None) + --dataloader_num_workers DATALOADER_NUM_WORKERS + Number of subprocesses to use for data loading + (PyTorch only). 0 means that the data will be loaded + in the main process. (default: 0) + --dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR + Number of batches loaded in advance by each worker. 2 + means there will be a total of 2 * num_workers batches + prefetched across all workers. Default is 2 for + PyTorch < 2.0.0 and otherwise None. (default: None) + --past_index PAST_INDEX + If >=0, uses the corresponding part of the output as + the past state for next step. (default: -1) + --run_name RUN_NAME An optional descriptor for the run. Notably used for + wandb, mlflow and comet logging. (default: None) + --disable_tqdm DISABLE_TQDM + Whether or not to disable the tqdm progress bars. + (default: None) + --remove_unused_columns [REMOVE_UNUSED_COLUMNS] + Remove columns not required by the model when using an + nlp.Dataset. (default: True) + --no_remove_unused_columns + Remove columns not required by the model when using an + nlp.Dataset. (default: False) + --label_names LABEL_NAMES [LABEL_NAMES ...] + The list of keys in your dictionary of inputs that + correspond to the labels. (default: None) + --load_best_model_at_end [LOAD_BEST_MODEL_AT_END] + Whether or not to load the best model found during + training at the end of training. When this option is + enabled, the best checkpoint will always be saved. See + `save_total_limit` for more. (default: False) + --metric_for_best_model METRIC_FOR_BEST_MODEL + The metric to use to compare two different models. + (default: None) + --greater_is_better GREATER_IS_BETTER + Whether the `metric_for_best_model` should be + maximized or not. (default: None) + --ignore_data_skip [IGNORE_DATA_SKIP] + When resuming training, whether or not to skip the + first epochs and batches to get to the same training + data. (default: False) + --fsdp FSDP Whether or not to use PyTorch Fully Sharded Data + Parallel (FSDP) training (in distributed training + only). The base option should be `full_shard`, + `shard_grad_op` or `no_shard` and you can add CPU- + offload to `full_shard` or `shard_grad_op` like this: + full_shard offload` or `shard_grad_op offload`. You + can add auto-wrap to `full_shard` or `shard_grad_op` + with the same syntax: full_shard auto_wrap` or + `shard_grad_op auto_wrap`. (default: ) + --fsdp_min_num_params FSDP_MIN_NUM_PARAMS + This parameter is deprecated. FSDP's minimum number of + parameters for Default Auto Wrapping. (useful only + when `fsdp` field is passed). (default: 0) + --fsdp_config FSDP_CONFIG + Config to be used with FSDP (Pytorch Fully Sharded + Data Parallel). The value is either a fsdp json config + file (e.g., `fsdp_config.json`) or an already loaded + json file as `dict`. (default: None) + --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP + This parameter is deprecated. Transformer layer class + name (case-sensitive) to wrap, e.g, `BertLayer`, + `GPTJBlock`, `T5Block` .... (useful only when `fsdp` + flag is passed). (default: None) + --accelerator_config ACCELERATOR_CONFIG + Config to be used with the internal Accelerator object + initializtion. The value is either a accelerator json + config file (e.g., `accelerator_config.json`) or an + already loaded json file as `dict`. (default: None) + --deepspeed DEEPSPEED + Enable deepspeed and pass the path to deepspeed json + config file (e.g. `ds_config.json`) or an already + loaded json file as a dict (default: None) + --label_smoothing_factor LABEL_SMOOTHING_FACTOR + The label smoothing epsilon to apply (zero means no + label smoothing). (default: 0.0) + --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo} + The optimizer to use. (default: adamw_torch) + --optim_args OPTIM_ARGS + Optional arguments to supply to optimizer. (default: + None) + --adafactor [ADAFACTOR] + Whether or not to replace AdamW by Adafactor. + (default: False) + --group_by_length [GROUP_BY_LENGTH] + Whether or not to group samples of roughly the same + length together when batching. (default: False) + --length_column_name LENGTH_COLUMN_NAME + Column name with precomputed lengths to use when + grouping by length. (default: length) + --report_to REPORT_TO + The list of integrations to report the results and + logs to. (default: None) + --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS + When using distributed training, the value of the flag + `find_unused_parameters` passed to + `DistributedDataParallel`. (default: None) + --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB + When using distributed training, the value of the flag + `bucket_cap_mb` passed to `DistributedDataParallel`. + (default: None) + --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS + When using distributed training, the value of the flag + `broadcast_buffers` passed to + `DistributedDataParallel`. (default: None) + --dataloader_pin_memory [DATALOADER_PIN_MEMORY] + Whether or not to pin memory for DataLoader. (default: + True) + --no_dataloader_pin_memory + Whether or not to pin memory for DataLoader. (default: + False) + --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS] + If True, the data loader will not shut down the worker + processes after a dataset has been consumed once. This + allows to maintain the workers Dataset instances + alive. Can potentially speed up training, but will + increase RAM usage. (default: False) + --skip_memory_metrics [SKIP_MEMORY_METRICS] + Whether or not to skip adding of memory profiler + reports to metrics. (default: True) + --no_skip_memory_metrics + Whether or not to skip adding of memory profiler + reports to metrics. (default: False) + --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP] + Whether or not to use the legacy prediction_loop in + the Trainer. (default: False) + --push_to_hub [PUSH_TO_HUB] + Whether or not to upload the trained model to the + model hub after training. (default: False) + --resume_from_checkpoint RESUME_FROM_CHECKPOINT + The path to a folder with a valid checkpoint for your + model. (default: None) + --hub_model_id HUB_MODEL_ID + The name of the repository to keep in sync with the + local `output_dir`. (default: None) + --hub_strategy {end,every_save,checkpoint,all_checkpoints} + The hub strategy to use when `--push_to_hub` is + activated. (default: every_save) + --hub_token HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --hub_private_repo [HUB_PRIVATE_REPO] + Whether the model repository is private or not. + (default: False) + --hub_always_push [HUB_ALWAYS_PUSH] + Unless `True`, the Trainer will skip pushes if the + previous one wasn't finished yet. (default: False) + --gradient_checkpointing [GRADIENT_CHECKPOINTING] + If True, use gradient checkpointing to save memory at + the expense of slower backward pass. (default: False) + --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS + Gradient checkpointing key word arguments such as + `use_reentrant`. Will be passed to + `torch.utils.checkpoint.checkpoint` through + `model.gradient_checkpointing_enable`. (default: None) + --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS] + Whether or not the inputs will be passed to the + `compute_metrics` function. (default: False) + --eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES] + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: True) + --no_eval_do_concat_batches + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: False) + --fp16_backend {auto,apex,cpu_amp} + Deprecated. Use half_precision_backend instead + (default: auto) + --evaluation_strategy {no,steps,epoch} + Deprecated. Use `eval_strategy` instead (default: + None) + --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID + The name of the repository to which push the + `Trainer`. (default: None) + --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION + The name of the organization in with to which push the + `Trainer`. (default: None) + --push_to_hub_token PUSH_TO_HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --mp_parameters MP_PARAMETERS + Used by the SageMaker launcher to send mp-specific + args. Ignored in Trainer (default: ) + --auto_find_batch_size [AUTO_FIND_BATCH_SIZE] + Whether to automatically decrease the batch size in + half and rerun the training loop again each time a + CUDA Out-of-Memory was reached (default: False) + --full_determinism [FULL_DETERMINISM] + Whether to call enable_full_determinism instead of + set_seed for reproducibility in distributed training. + Important: this will negatively impact the + performance, so only use it for debugging. (default: + False) + --torchdynamo TORCHDYNAMO + This argument is deprecated, use + `--torch_compile_backend` instead. (default: None) + --ray_scope RAY_SCOPE + The scope to use when doing hyperparameter search with + Ray. By default, `"last"` will be used. Ray will then + use the last checkpoint of all trials, compare those, + and select the best one. However, other options are + also available. See the Ray documentation (https://doc + s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun + e.ExperimentAnalysis.get_best_trial) for more options. + (default: last) + --ddp_timeout DDP_TIMEOUT + Overrides the default timeout for distributed training + (value should be given in seconds). (default: 1800) + --torch_compile [TORCH_COMPILE] + If set to `True`, the model will be wrapped in + `torch.compile`. (default: False) + --torch_compile_backend TORCH_COMPILE_BACKEND + Which backend to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --torch_compile_mode TORCH_COMPILE_MODE + Which mode to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --dispatch_batches DISPATCH_BATCHES + Deprecated. Pass {'dispatch_batches':VALUE} to + `accelerator_config`. (default: None) + --split_batches SPLIT_BATCHES + Deprecated. Pass {'split_batches':True} to + `accelerator_config`. (default: None) + --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND] + If set to `True`, the speed metrics will include `tgs` + (tokens per second per device). (default: False) + --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN] + If set to `True`, will track the number of input + tokens seen throughout training. (May be slower in + distributed training) (default: False) + --neftune_noise_alpha NEFTUNE_NOISE_ALPHA + Activates neftune noise embeddings into the model. + NEFTune has been proven to drastically improve model + performances for instrcution fine-tuning. Check out + the original paper here: + https://arxiv.org/abs/2310.05914 and the original code + here: https://github.com/neelsjain/NEFTune. Only + supported for `PreTrainedModel` and `PeftModel` + classes. (default: None) + --optim_target_modules OPTIM_TARGET_MODULES + Target modules for the optimizer defined in the + `optim` argument. Only used for the GaLore optimizer + at the moment. (default: None) + --batch_eval_metrics [BATCH_EVAL_METRICS] + Break eval metrics calculation into batches to save + memory. (default: False) + --eval_on_start [EVAL_ON_START] + Whether to run through the entire `evaluation` step at + the very beginning of training as a sanity check. + (default: False) + --eval_use_gather_object [EVAL_USE_GATHER_OBJECT] + Whether to run recursively gather object in a nested + list/tuple/dictionary of objects from all devices. + (default: False) + --sortish_sampler [SORTISH_SAMPLER] + Whether to use SortishSampler or not. (default: False) + --predict_with_generate [PREDICT_WITH_GENERATE] + Whether to use generate to calculate generative + metrics (ROUGE, BLEU). (default: False) + --generation_max_length GENERATION_MAX_LENGTH + The `max_length` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `max_length` value of the model configuration. + (default: None) + --generation_num_beams GENERATION_NUM_BEAMS + The `num_beams` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `num_beams` value of the model configuration. + (default: None) + --generation_config GENERATION_CONFIG + Model id, file path or url pointing to a + GenerationConfig json file, to use during prediction. + (default: None) + --use_badam [USE_BADAM] + Whether or not to use the BAdam optimizer. (default: + False) + --badam_mode {layer,ratio} + Whether to use layer-wise or ratio-wise BAdam + optimizer. (default: layer) + --badam_start_block BADAM_START_BLOCK + The starting block index for layer-wise BAdam. + (default: None) + --badam_switch_mode {ascending,descending,random,fixed} + the strategy of picking block to update for layer-wise + BAdam. (default: ascending) + --badam_switch_interval BADAM_SWITCH_INTERVAL + Number of steps to update the block for layer-wise + BAdam. Use -1 to disable the block update. (default: + 50) + --badam_update_ratio BADAM_UPDATE_RATIO + The ratio of the update for ratio-wise BAdam. + (default: 0.05) + --badam_mask_mode {adjacent,scatter} + The mode of the mask for BAdam optimizer. `adjacent` + means that the trainable parameters are adjacent to + each other, `scatter` means that trainable parameters + are randomly choosed from the weight. (default: + adjacent) + --badam_verbose BADAM_VERBOSE + The verbosity level of BAdam optimizer. 0 for no + print, 1 for print the block prefix, 2 for print + trainable parameters. (default: 0) + --use_galore [USE_GALORE] + Whether or not to use the gradient low-Rank projection + (GaLore). (default: False) + --galore_target GALORE_TARGET + Name(s) of modules to apply GaLore. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --galore_rank GALORE_RANK + The rank of GaLore gradients. (default: 16) + --galore_update_interval GALORE_UPDATE_INTERVAL + Number of steps to update the GaLore projection. + (default: 200) + --galore_scale GALORE_SCALE + GaLore scaling coefficient. (default: 0.25) + --galore_proj_type {std,reverse_std,right,left,full} + Type of GaLore projection. (default: std) + --galore_layerwise [GALORE_LAYERWISE] + Whether or not to enable layer-wise update to further + save memory. (default: False) + --pref_beta PREF_BETA + The beta parameter in the preference loss. (default: + 0.1) + --pref_ftx PREF_FTX The supervised fine-tuning loss coefficient in DPO + training. (default: 0.0) + --pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo} + The type of DPO loss to use. (default: sigmoid) + --dpo_label_smoothing DPO_LABEL_SMOOTHING + The robust DPO label smoothing parameter in cDPO that + should be between 0 and 0.5. (default: 0.0) + --kto_chosen_weight KTO_CHOSEN_WEIGHT + The weight factor of the desirable losses in KTO + training. (default: 1.0) + --kto_rejected_weight KTO_REJECTED_WEIGHT + The weight factor of the undesirable losses in KTO + training. (default: 1.0) + --simpo_gamma SIMPO_GAMMA + The target reward margin term in SimPO loss. (default: + 0.5) + --ppo_buffer_size PPO_BUFFER_SIZE + The number of mini-batches to make experience buffer + in a PPO optimization step. (default: 1) + --ppo_epochs PPO_EPOCHS + The number of epochs to perform in a PPO optimization + step. (default: 4) + --ppo_score_norm [PPO_SCORE_NORM] + Use score normalization in PPO training. (default: + False) + --ppo_target PPO_TARGET + Target KL value for adaptive KL control in PPO + training. (default: 6.0) + --ppo_whiten_rewards [PPO_WHITEN_REWARDS] + Whiten the rewards before compute advantages in PPO + training. (default: False) + --ref_model REF_MODEL + Path to the reference model used for the PPO or DPO + training. (default: None) + --ref_model_adapters REF_MODEL_ADAPTERS + Path to the adapters of the reference model. (default: + None) + --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reference model. + (default: None) + --reward_model REWARD_MODEL + Path to the reward model used for the PPO training. + (default: None) + --reward_model_adapters REWARD_MODEL_ADAPTERS + Path to the adapters of the reward model. (default: + None) + --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reward model. + (default: None) + --reward_model_type {lora,full,api} + The type of the reward model in PPO training. Lora + model only supports lora training. (default: lora) + --additional_target ADDITIONAL_TARGET + Name(s) of modules apart from LoRA layers to be set as + trainable and saved in the final checkpoint. Use + commas to separate multiple modules. (default: None) + --lora_alpha LORA_ALPHA + The scale factor for LoRA fine-tuning (default: + lora_rank * 2). (default: None) + --lora_dropout LORA_DROPOUT + Dropout rate for the LoRA fine-tuning. (default: 0.0) + --lora_rank LORA_RANK + The intrinsic dimension for LoRA fine-tuning. + (default: 8) + --lora_target LORA_TARGET + Name(s) of target modules to apply LoRA. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --loraplus_lr_ratio LORAPLUS_LR_RATIO + LoRA plus learning rate ratio (lr_B / lr_A). (default: + None) + --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING + LoRA plus learning rate for lora embedding layers. + (default: 1e-06) + --use_rslora [USE_RSLORA] + Whether or not to use the rank stabilization scaling + factor for LoRA layer. (default: False) + --use_dora [USE_DORA] + Whether or not to use the weight-decomposed lora + method (DoRA). (default: False) + --pissa_init [PISSA_INIT] + Whether or not to initialize a PiSSA adapter. + (default: False) + --pissa_iter PISSA_ITER + The number of iteration steps performed by FSVD in + PiSSA. Use -1 to disable it. (default: 16) + --pissa_convert [PISSA_CONVERT] + Whether or not to convert the PiSSA adapter to a + normal LoRA adapter. (default: False) + --create_new_adapter [CREATE_NEW_ADAPTER] + Whether or not to create a new adapter with randomly + initialized weight. (default: False) + --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS + The number of trainable layers for freeze (partial- + parameter) fine-tuning. Positive numbers mean the last + n layers are set as trainable, negative numbers mean + the first n layers are set as trainable. (default: 2) + --freeze_trainable_modules FREEZE_TRAINABLE_MODULES + Name(s) of trainable modules for freeze (partial- + parameter) fine-tuning. Use commas to separate + multiple modules. Use `all` to specify all the + available modules. (default: all) + --freeze_extra_modules FREEZE_EXTRA_MODULES + Name(s) of modules apart from hidden layers to be set + as trainable for freeze (partial-parameter) fine- + tuning. Use commas to separate multiple modules. + (default: None) + --pure_bf16 [PURE_BF16] + Whether or not to train model in purely bf16 precision + (without AMP). (default: False) + --stage {pt,sft,rm,ppo,dpo,kto} + Which stage will be performed in training. (default: + sft) + --finetuning_type {lora,freeze,full} + Which fine-tuning method to use. (default: lora) + --use_llama_pro [USE_LLAMA_PRO] + Whether or not to make only the parameters in the + expanded blocks trainable. (default: False) + --use_adam_mini [USE_ADAM_MINI] + Whether or not to use the Adam-mini optimizer. + (default: False) + --freeze_vision_tower [FREEZE_VISION_TOWER] + Whether ot not to freeze vision tower in MLLM + training. (default: True) + --no_freeze_vision_tower + Whether ot not to freeze vision tower in MLLM + training. (default: False) + --train_mm_proj_only [TRAIN_MM_PROJ_ONLY] + Whether or not to train the multimodal projector for + MLLM only. (default: False) + --compute_accuracy [COMPUTE_ACCURACY] + Whether or not to compute the token-level accuracy at + evaluation. (default: False) + --plot_loss [PLOT_LOSS] + Whether or not to save the training loss curves. + (default: False) + --do_sample [DO_SAMPLE] + Whether or not to use sampling, use greedy decoding + otherwise. (default: True) + --no_do_sample Whether or not to use sampling, use greedy decoding + otherwise. (default: False) + --temperature TEMPERATURE + The value used to modulate the next token + probabilities. (default: 0.95) + --top_p TOP_P The smallest set of most probable tokens with + probabilities that add up to top_p or higher are kept. + (default: 0.7) + --top_k TOP_K The number of highest probability vocabulary tokens to + keep for top-k filtering. (default: 50) + --num_beams NUM_BEAMS + Number of beams for beam search. 1 means no beam + search. (default: 1) + --max_length MAX_LENGTH + The maximum length the generated tokens can have. It + can be overridden by max_new_tokens. (default: 1024) + --max_new_tokens MAX_NEW_TOKENS + The maximum numbers of tokens to generate, ignoring + the number of tokens in the prompt. (default: 1024) + --repetition_penalty REPETITION_PENALTY + The parameter for repetition penalty. 1.0 means no + penalty. (default: 1.0) + --length_penalty LENGTH_PENALTY + Exponential penalty to the length that is used with + beam-based generation. (default: 1.0) + --default_system DEFAULT_SYSTEM + Default system message to use in chat completion. + (default: None) +usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH + [--adapter_name_or_path ADAPTER_NAME_OR_PATH] + [--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR] + [--use_fast_tokenizer [USE_FAST_TOKENIZER]] + [--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]] + [--split_special_tokens [SPLIT_SPECIAL_TOKENS]] + [--new_special_tokens NEW_SPECIAL_TOKENS] + [--model_revision MODEL_REVISION] + [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]] + [--no_low_cpu_mem_usage] + [--quantization_method {bitsandbytes,hqq,eetq}] + [--quantization_bit QUANTIZATION_BIT] + [--quantization_type {fp4,nf4}] + [--double_quantization [DOUBLE_QUANTIZATION]] + [--no_double_quantization] + [--quantization_device_map {auto}] + [--rope_scaling {linear,dynamic}] + [--flash_attn {auto,disabled,sdpa,fa2}] + [--shift_attn [SHIFT_ATTN]] + [--mixture_of_depths {convert,load}] + [--use_unsloth [USE_UNSLOTH]] + [--visual_inputs [VISUAL_INPUTS]] + [--moe_aux_loss_coef MOE_AUX_LOSS_COEF] + [--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]] + [--upcast_layernorm [UPCAST_LAYERNORM]] + [--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]] + [--train_from_scratch [TRAIN_FROM_SCRATCH]] + [--infer_backend {huggingface,vllm}] + [--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL] + [--vllm_enforce_eager [VLLM_ENFORCE_EAGER]] + [--vllm_max_lora_rank VLLM_MAX_LORA_RANK] + [--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]] + [--no_use_cache] + [--infer_dtype {auto,float16,bfloat16,float32}] + [--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN] + [--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE] + [--export_device {cpu,auto}] + [--export_quantization_bit EXPORT_QUANTIZATION_BIT] + [--export_quantization_dataset EXPORT_QUANTIZATION_DATASET] + [--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES] + [--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN] + [--export_legacy_format [EXPORT_LEGACY_FORMAT]] + [--export_hub_model_id EXPORT_HUB_MODEL_ID] + [--print_param_status [PRINT_PARAM_STATUS]] + [--template TEMPLATE] [--dataset DATASET] + [--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR] + [--cutoff_len CUTOFF_LEN] + [--train_on_prompt [TRAIN_ON_PROMPT]] + [--mask_history [MASK_HISTORY]] [--streaming [STREAMING]] + [--buffer_size BUFFER_SIZE] + [--mix_strategy {concat,interleave_under,interleave_over}] + [--interleave_probs INTERLEAVE_PROBS] + [--overwrite_cache [OVERWRITE_CACHE]] + [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS] + [--max_samples MAX_SAMPLES] + [--eval_num_beams EVAL_NUM_BEAMS] + [--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]] + [--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE] + [--packing PACKING] [--neat_packing [NEAT_PACKING]] + [--tool_format TOOL_FORMAT] + [--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR + [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] + [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] + [--do_predict [DO_PREDICT]] + [--eval_strategy {no,steps,epoch}] + [--prediction_loss_only [PREDICTION_LOSS_ONLY]] + [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] + [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE] + [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] + [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] + [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] + [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] + [--eval_delay EVAL_DELAY] + [--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS] + [--learning_rate LEARNING_RATE] + [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] + [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] + [--max_grad_norm MAX_GRAD_NORM] + [--num_train_epochs NUM_TRAIN_EPOCHS] + [--max_steps MAX_STEPS] + [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}] + [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] + [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS] + [--log_level {detail,debug,info,warning,error,critical,passive}] + [--log_level_replica {detail,debug,info,warning,error,critical,passive}] + [--log_on_each_node [LOG_ON_EACH_NODE]] + [--no_log_on_each_node] [--logging_dir LOGGING_DIR] + [--logging_strategy {no,steps,epoch}] + [--logging_first_step [LOGGING_FIRST_STEP]] + [--logging_steps LOGGING_STEPS] + [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] + [--no_logging_nan_inf_filter] + [--save_strategy {no,steps,epoch}] + [--save_steps SAVE_STEPS] + [--save_total_limit SAVE_TOTAL_LIMIT] + [--save_safetensors [SAVE_SAFETENSORS]] + [--no_save_safetensors] + [--save_on_each_node [SAVE_ON_EACH_NODE]] + [--save_only_model [SAVE_ONLY_MODEL]] + [--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]] + [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]] + [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] + [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] + [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] + [--fp16_opt_level FP16_OPT_LEVEL] + [--half_precision_backend {auto,apex,cpu_amp}] + [--bf16_full_eval [BF16_FULL_EVAL]] + [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] + [--local_rank LOCAL_RANK] + [--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}] + [--tpu_num_cores TPU_NUM_CORES] + [--tpu_metrics_debug [TPU_METRICS_DEBUG]] + [--debug DEBUG [DEBUG ...]] + [--dataloader_drop_last [DATALOADER_DROP_LAST]] + [--eval_steps EVAL_STEPS] + [--dataloader_num_workers DATALOADER_NUM_WORKERS] + [--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR] + [--past_index PAST_INDEX] [--run_name RUN_NAME] + [--disable_tqdm DISABLE_TQDM] + [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] + [--no_remove_unused_columns] + [--label_names LABEL_NAMES [LABEL_NAMES ...]] + [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] + [--metric_for_best_model METRIC_FOR_BEST_MODEL] + [--greater_is_better GREATER_IS_BETTER] + [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP] + [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] + [--fsdp_config FSDP_CONFIG] + [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] + [--accelerator_config ACCELERATOR_CONFIG] + [--deepspeed DEEPSPEED] + [--label_smoothing_factor LABEL_SMOOTHING_FACTOR] + [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}] + [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] + [--group_by_length [GROUP_BY_LENGTH]] + [--length_column_name LENGTH_COLUMN_NAME] + [--report_to REPORT_TO] + [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] + [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] + [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] + [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] + [--no_dataloader_pin_memory] + [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] + [--skip_memory_metrics [SKIP_MEMORY_METRICS]] + [--no_skip_memory_metrics] + [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] + [--push_to_hub [PUSH_TO_HUB]] + [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] + [--hub_model_id HUB_MODEL_ID] + [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] + [--hub_token HUB_TOKEN] + [--hub_private_repo [HUB_PRIVATE_REPO]] + [--hub_always_push [HUB_ALWAYS_PUSH]] + [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] + [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS] + [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] + [--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]] + [--no_eval_do_concat_batches] + [--fp16_backend {auto,apex,cpu_amp}] + [--evaluation_strategy {no,steps,epoch}] + [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID] + [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] + [--push_to_hub_token PUSH_TO_HUB_TOKEN] + [--mp_parameters MP_PARAMETERS] + [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] + [--full_determinism [FULL_DETERMINISM]] + [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] + [--ddp_timeout DDP_TIMEOUT] + [--torch_compile [TORCH_COMPILE]] + [--torch_compile_backend TORCH_COMPILE_BACKEND] + [--torch_compile_mode TORCH_COMPILE_MODE] + [--dispatch_batches DISPATCH_BATCHES] + [--split_batches SPLIT_BATCHES] + [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] + [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]] + [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] + [--optim_target_modules OPTIM_TARGET_MODULES] + [--batch_eval_metrics [BATCH_EVAL_METRICS]] + [--eval_on_start [EVAL_ON_START]] + [--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]] + [--sortish_sampler [SORTISH_SAMPLER]] + [--predict_with_generate [PREDICT_WITH_GENERATE]] + [--generation_max_length GENERATION_MAX_LENGTH] + [--generation_num_beams GENERATION_NUM_BEAMS] + [--generation_config GENERATION_CONFIG] + [--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}] + [--badam_start_block BADAM_START_BLOCK] + [--badam_switch_mode {ascending,descending,random,fixed}] + [--badam_switch_interval BADAM_SWITCH_INTERVAL] + [--badam_update_ratio BADAM_UPDATE_RATIO] + [--badam_mask_mode {adjacent,scatter}] + [--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]] + [--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK] + [--galore_update_interval GALORE_UPDATE_INTERVAL] + [--galore_scale GALORE_SCALE] + [--galore_proj_type {std,reverse_std,right,left,full}] + [--galore_layerwise [GALORE_LAYERWISE]] + [--pref_beta PREF_BETA] [--pref_ftx PREF_FTX] + [--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}] + [--dpo_label_smoothing DPO_LABEL_SMOOTHING] + [--kto_chosen_weight KTO_CHOSEN_WEIGHT] + [--kto_rejected_weight KTO_REJECTED_WEIGHT] + [--simpo_gamma SIMPO_GAMMA] + [--ppo_buffer_size PPO_BUFFER_SIZE] + [--ppo_epochs PPO_EPOCHS] + [--ppo_score_norm [PPO_SCORE_NORM]] + [--ppo_target PPO_TARGET] + [--ppo_whiten_rewards [PPO_WHITEN_REWARDS]] + [--ref_model REF_MODEL] + [--ref_model_adapters REF_MODEL_ADAPTERS] + [--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT] + [--reward_model REWARD_MODEL] + [--reward_model_adapters REWARD_MODEL_ADAPTERS] + [--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT] + [--reward_model_type {lora,full,api}] + [--additional_target ADDITIONAL_TARGET] + [--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT] + [--lora_rank LORA_RANK] [--lora_target LORA_TARGET] + [--loraplus_lr_ratio LORAPLUS_LR_RATIO] + [--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING] + [--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]] + [--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER] + [--pissa_convert [PISSA_CONVERT]] + [--create_new_adapter [CREATE_NEW_ADAPTER]] + [--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS] + [--freeze_trainable_modules FREEZE_TRAINABLE_MODULES] + [--freeze_extra_modules FREEZE_EXTRA_MODULES] + [--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}] + [--finetuning_type {lora,freeze,full}] + [--use_llama_pro [USE_LLAMA_PRO]] + [--use_adam_mini [USE_ADAM_MINI]] + [--freeze_vision_tower [FREEZE_VISION_TOWER]] + [--no_freeze_vision_tower] + [--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]] + [--compute_accuracy [COMPUTE_ACCURACY]] + [--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]] + [--no_do_sample] [--temperature TEMPERATURE] + [--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS] + [--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS] + [--repetition_penalty REPETITION_PENALTY] + [--length_penalty LENGTH_PENALTY] + [--default_system DEFAULT_SYSTEM] + +optional arguments: + -h, --help show this help message and exit + --model_name_or_path MODEL_NAME_OR_PATH + Path to the model weight or identifier from + huggingface.co/models or modelscope.cn/models. + (default: None) + --adapter_name_or_path ADAPTER_NAME_OR_PATH + Path to the adapter weight or identifier from + huggingface.co/models. Use commas to separate multiple + adapters. (default: None) + --adapter_folder ADAPTER_FOLDER + The folder containing the adapter weights to load. + (default: None) + --cache_dir CACHE_DIR + Where to store the pre-trained models downloaded from + huggingface.co or modelscope.cn. (default: None) + --use_fast_tokenizer [USE_FAST_TOKENIZER] + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: True) + --no_use_fast_tokenizer + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: False) + --resize_vocab [RESIZE_VOCAB] + Whether or not to resize the tokenizer vocab and the + embedding layers. (default: False) + --split_special_tokens [SPLIT_SPECIAL_TOKENS] + Whether or not the special tokens should be split + during the tokenization process. (default: False) + --new_special_tokens NEW_SPECIAL_TOKENS + Special tokens to be added into the tokenizer. Use + commas to separate multiple tokens. (default: None) + --model_revision MODEL_REVISION + The specific model version to use (can be a branch + name, tag name or commit id). (default: main) + --low_cpu_mem_usage [LOW_CPU_MEM_USAGE] + Whether or not to use memory-efficient model loading. + (default: True) + --no_low_cpu_mem_usage + Whether or not to use memory-efficient model loading. + (default: False) + --quantization_method {bitsandbytes,hqq,eetq} + Quantization method to use for on-the-fly + quantization. (default: bitsandbytes) + --quantization_bit QUANTIZATION_BIT + The number of bits to quantize the model using + bitsandbytes. (default: None) + --quantization_type {fp4,nf4} + Quantization data type to use in int4 training. + (default: nf4) + --double_quantization [DOUBLE_QUANTIZATION] + Whether or not to use double quantization in int4 + training. (default: True) + --no_double_quantization + Whether or not to use double quantization in int4 + training. (default: False) + --quantization_device_map {auto} + Device map used to infer the 4-bit quantized model, + needs bitsandbytes>=0.43.0. (default: None) + --rope_scaling {linear,dynamic} + Which scaling strategy should be adopted for the RoPE + embeddings. (default: None) + --flash_attn {auto,disabled,sdpa,fa2} + Enable FlashAttention for faster training and + inference. (default: auto) + --shift_attn [SHIFT_ATTN] + Enable shift short attention (S^2-Attn) proposed by + LongLoRA. (default: False) + --mixture_of_depths {convert,load} + Convert the model to mixture-of-depths (MoD) or load + the MoD model. (default: None) + --use_unsloth [USE_UNSLOTH] + Whether or not to use unsloth's optimization for the + LoRA training. (default: False) + --visual_inputs [VISUAL_INPUTS] + Whethor or not to use multimodal LLM that accepts + visual inputs. (default: False) + --moe_aux_loss_coef MOE_AUX_LOSS_COEF + Coefficient of the auxiliary router loss in mixture- + of-experts model. (default: None) + --disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING] + Whether or not to disable gradient checkpointing. + (default: False) + --upcast_layernorm [UPCAST_LAYERNORM] + Whether or not to upcast the layernorm weights in + fp32. (default: False) + --upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT] + Whether or not to upcast the output of lm_head in + fp32. (default: False) + --train_from_scratch [TRAIN_FROM_SCRATCH] + Whether or not to randomly initialize the model + weights. (default: False) + --infer_backend {huggingface,vllm} + Backend engine used at inference. (default: + huggingface) + --vllm_maxlen VLLM_MAXLEN + Maximum sequence (prompt + response) length of the + vLLM engine. (default: 2048) + --vllm_gpu_util VLLM_GPU_UTIL + The fraction of GPU memory in (0,1) to be used for the + vLLM engine. (default: 0.9) + --vllm_enforce_eager [VLLM_ENFORCE_EAGER] + Whether or not to disable CUDA graph in the vLLM + engine. (default: False) + --vllm_max_lora_rank VLLM_MAX_LORA_RANK + Maximum rank of all LoRAs in the vLLM engine. + (default: 32) + --offload_folder OFFLOAD_FOLDER + Path to offload model weights. (default: offload) + --use_cache [USE_CACHE] + Whether or not to use KV cache in generation. + (default: True) + --no_use_cache Whether or not to use KV cache in generation. + (default: False) + --infer_dtype {auto,float16,bfloat16,float32} + Data type for model weights and activations at + inference. (default: auto) + --hf_hub_token HF_HUB_TOKEN + Auth token to log in with Hugging Face Hub. (default: + None) + --ms_hub_token MS_HUB_TOKEN + Auth token to log in with ModelScope Hub. (default: + None) + --export_dir EXPORT_DIR + Path to the directory to save the exported model. + (default: None) + --export_size EXPORT_SIZE + The file shard size (in GB) of the exported model. + (default: 1) + --export_device {cpu,auto} + The device used in model export, use `auto` to + accelerate exporting. (default: cpu) + --export_quantization_bit EXPORT_QUANTIZATION_BIT + The number of bits to quantize the exported model. + (default: None) + --export_quantization_dataset EXPORT_QUANTIZATION_DATASET + Path to the dataset or dataset name to use in + quantizing the exported model. (default: None) + --export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES + The number of samples used for quantization. (default: + 128) + --export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN + The maximum length of the model inputs used for + quantization. (default: 1024) + --export_legacy_format [EXPORT_LEGACY_FORMAT] + Whether or not to save the `.bin` files instead of + `.safetensors`. (default: False) + --export_hub_model_id EXPORT_HUB_MODEL_ID + The name of the repository if push the model to the + Hugging Face hub. (default: None) + --print_param_status [PRINT_PARAM_STATUS] + For debugging purposes, print the status of the + parameters in the model. (default: False) + --template TEMPLATE Which template to use for constructing prompts in + training and inference. (default: None) + --dataset DATASET The name of dataset(s) to use for training. Use commas + to separate multiple datasets. (default: None) + --eval_dataset EVAL_DATASET + The name of dataset(s) to use for evaluation. Use + commas to separate multiple datasets. (default: None) + --dataset_dir DATASET_DIR + Path to the folder containing the datasets. (default: + data) + --cutoff_len CUTOFF_LEN + The cutoff length of the tokenized inputs in the + dataset. (default: 1024) + --train_on_prompt [TRAIN_ON_PROMPT] + Whether or not to disable the mask on the prompt. + (default: False) + --mask_history [MASK_HISTORY] + Whether or not to mask the history and train on the + last turn only. (default: False) + --streaming [STREAMING] + Enable dataset streaming. (default: False) + --buffer_size BUFFER_SIZE + Size of the buffer to randomly sample examples from in + dataset streaming. (default: 16384) + --mix_strategy {concat,interleave_under,interleave_over} + Strategy to use in dataset mixing (concat/interleave) + (undersampling/oversampling). (default: concat) + --interleave_probs INTERLEAVE_PROBS + Probabilities to sample data from datasets. Use commas + to separate multiple datasets. (default: None) + --overwrite_cache [OVERWRITE_CACHE] + Overwrite the cached training and evaluation sets. + (default: False) + --preprocessing_num_workers PREPROCESSING_NUM_WORKERS + The number of processes to use for the pre-processing. + (default: None) + --max_samples MAX_SAMPLES + For debugging purposes, truncate the number of + examples for each dataset. (default: None) + --eval_num_beams EVAL_NUM_BEAMS + Number of beams to use for evaluation. This argument + will be passed to `model.generate` (default: None) + --ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS] + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: True) + --no_ignore_pad_token_for_loss + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: False) + --val_size VAL_SIZE Size of the development set, should be an integer or a + float in range `[0,1)`. (default: 0.0) + --packing PACKING Enable sequences packing in training. Will + automatically enable in pre-training. (default: None) + --neat_packing [NEAT_PACKING] + Enable sequence packing without cross-attention. + (default: False) + --tool_format TOOL_FORMAT + Tool format to use for constructing function calling + examples. (default: None) + --tokenized_path TOKENIZED_PATH + Path to save or load the tokenized datasets. (default: + None) + --output_dir OUTPUT_DIR + The output directory where the model predictions and + checkpoints will be written. (default: None) + --overwrite_output_dir [OVERWRITE_OUTPUT_DIR] + Overwrite the content of the output directory. Use + this to continue training if output_dir points to a + checkpoint directory. (default: False) + --do_train [DO_TRAIN] + Whether to run training. (default: False) + --do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False) + --do_predict [DO_PREDICT] + Whether to run predictions on the test set. (default: + False) + --eval_strategy {no,steps,epoch} + The evaluation strategy to use. (default: no) + --prediction_loss_only [PREDICTION_LOSS_ONLY] + When performing evaluation and predictions, only + returns the loss. (default: False) + --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for training. + (default: 8) + --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for + evaluation. (default: 8) + --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE + Deprecated, the use of `--per_device_train_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + training. (default: None) + --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE + Deprecated, the use of `--per_device_eval_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + evaluation. (default: None) + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + Number of updates steps to accumulate before + performing a backward/update pass. (default: 1) + --eval_accumulation_steps EVAL_ACCUMULATION_STEPS + Number of predictions steps to accumulate before + moving the tensors to the CPU. (default: None) + --eval_delay EVAL_DELAY + Number of epochs or steps to wait for before the first + evaluation can be performed, depending on the + eval_strategy. (default: 0) + --torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS + Number of steps to wait before calling + `torch..empty_cache()`.This can help avoid + CUDA out-of-memory errors by lowering peak VRAM usage + at a cost of about [10{'option_strings': ['-- + torch_empty_cache_steps'], 'dest': + 'torch_empty_cache_steps', 'nargs': None, 'const': + None, 'default': None, 'type': 'int', 'choices': None, + 'required': False, 'help': 'Number of steps to wait + before calling `torch..empty_cache()`.This can + help avoid CUDA out-of-memory errors by lowering peak + VRAM usage at a cost of about [10% slower performance] + (https://github.com/huggingface/transformers/issues/31 + 372).If left unset or set to None, cache will not be + emptied.', 'metavar': None, 'container': + , + 'prog': 'launcher.py'}lower performance](https://githu + b.com/huggingface/transformers/issues/31372).If left + unset or set to None, cache will not be emptied. + (default: None) + --learning_rate LEARNING_RATE + The initial learning rate for AdamW. (default: 5e-05) + --weight_decay WEIGHT_DECAY + Weight decay for AdamW if we apply some. (default: + 0.0) + --adam_beta1 ADAM_BETA1 + Beta1 for AdamW optimizer (default: 0.9) + --adam_beta2 ADAM_BETA2 + Beta2 for AdamW optimizer (default: 0.999) + --adam_epsilon ADAM_EPSILON + Epsilon for AdamW optimizer. (default: 1e-08) + --max_grad_norm MAX_GRAD_NORM + Max gradient norm. (default: 1.0) + --num_train_epochs NUM_TRAIN_EPOCHS + Total number of training epochs to perform. (default: + 3.0) + --max_steps MAX_STEPS + If > 0: set total number of training steps to perform. + Override num_train_epochs. (default: -1) + --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay} + The scheduler type to use. (default: linear) + --lr_scheduler_kwargs LR_SCHEDULER_KWARGS + Extra parameters for the lr_scheduler such as + {'num_cycles': 1} for the cosine with hard restarts. + (default: {}) + --warmup_ratio WARMUP_RATIO + Linear warmup over warmup_ratio fraction of total + steps. (default: 0.0) + --warmup_steps WARMUP_STEPS + Linear warmup over warmup_steps. (default: 0) + --log_level {detail,debug,info,warning,error,critical,passive} + Logger log level to use on the main node. Possible + choices are the log levels as strings: 'debug', + 'info', 'warning', 'error' and 'critical', plus a + 'passive' level which doesn't set anything and lets + the application set the level. Defaults to 'passive'. + (default: passive) + --log_level_replica {detail,debug,info,warning,error,critical,passive} + Logger log level to use on replica nodes. Same choices + and defaults as ``log_level`` (default: warning) + --log_on_each_node [LOG_ON_EACH_NODE] + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: True) + --no_log_on_each_node + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: False) + --logging_dir LOGGING_DIR + Tensorboard log dir. (default: None) + --logging_strategy {no,steps,epoch} + The logging strategy to use. (default: steps) + --logging_first_step [LOGGING_FIRST_STEP] + Log the first global_step (default: False) + --logging_steps LOGGING_STEPS + Log every X updates steps. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be + interpreted as ratio of total training steps. + (default: 500) + --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER] + Filter nan and inf losses for logging. (default: True) + --no_logging_nan_inf_filter + Filter nan and inf losses for logging. (default: + False) + --save_strategy {no,steps,epoch} + The checkpoint save strategy to use. (default: steps) + --save_steps SAVE_STEPS + Save checkpoint every X updates steps. Should be an + integer or a float in range `[0,1)`. If smaller than + 1, will be interpreted as ratio of total training + steps. (default: 500) + --save_total_limit SAVE_TOTAL_LIMIT + If a value is passed, will limit the total amount of + checkpoints. Deletes the older checkpoints in + `output_dir`. When `load_best_model_at_end` is + enabled, the 'best' checkpoint according to + `metric_for_best_model` will always be retained in + addition to the most recent ones. For example, for + `save_total_limit=5` and + `load_best_model_at_end=True`, the four last + checkpoints will always be retained alongside the best + model. When `save_total_limit=1` and + `load_best_model_at_end=True`, it is possible that two + checkpoints are saved: the last one and the best one + (if they are different). Default is unlimited + checkpoints (default: None) + --save_safetensors [SAVE_SAFETENSORS] + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: True) + --no_save_safetensors + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: False) + --save_on_each_node [SAVE_ON_EACH_NODE] + When doing multi-node distributed training, whether to + save models and checkpoints on each node, or only on + the main one (default: False) + --save_only_model [SAVE_ONLY_MODEL] + When checkpointing, whether to only save the model, or + also the optimizer, scheduler & rng state.Note that + when this is true, you won't be able to resume + training from checkpoint.This enables you to save + storage by not storing the optimizer, scheduler & rng + state.You can only load the model using + from_pretrained with this option set to True. + (default: False) + --restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT] + Whether to restore the callback states from the + checkpoint. If `True`, will override callbacks passed + to the `Trainer` if they exist in the checkpoint. + (default: False) + --no_cuda [NO_CUDA] This argument is deprecated. It will be removed in + version 5.0 of 🤗 Transformers. (default: False) + --use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will + use cuda/tpu/mps/npu device if available. (default: + False) + --use_mps_device [USE_MPS_DEVICE] + This argument is deprecated. `mps` device will be used + if available similar to `cuda` device. It will be + removed in version 5.0 of 🤗 Transformers (default: + False) + --seed SEED Random seed that will be set at the beginning of + training. (default: 42) + --data_seed DATA_SEED + Random seed to be used with data samplers. (default: + None) + --jit_mode_eval [JIT_MODE_EVAL] + Whether or not to use PyTorch jit trace for inference + (default: False) + --use_ipex [USE_IPEX] + Use Intel extension for PyTorch when it is available, + installation: 'https://github.com/intel/intel- + extension-for-pytorch' (default: False) + --bf16 [BF16] Whether to use bf16 (mixed) precision instead of + 32-bit. Requires Ampere or higher NVIDIA architecture + or using CPU (use_cpu) or Ascend NPU. This is an + experimental API and it may change. (default: False) + --fp16 [FP16] Whether to use fp16 (mixed) precision instead of + 32-bit (default: False) + --fp16_opt_level FP16_OPT_LEVEL + For fp16: Apex AMP optimization level selected in + ['O0', 'O1', 'O2', and 'O3']. See details at + https://nvidia.github.io/apex/amp.html (default: O1) + --half_precision_backend {auto,apex,cpu_amp} + The backend to be used for half precision. (default: + auto) + --bf16_full_eval [BF16_FULL_EVAL] + Whether to use full bfloat16 evaluation instead of + 32-bit. This is an experimental API and it may change. + (default: False) + --fp16_full_eval [FP16_FULL_EVAL] + Whether to use full float16 evaluation instead of + 32-bit (default: False) + --tf32 TF32 Whether to enable tf32 mode, available in Ampere and + newer GPU architectures. This is an experimental API + and it may change. (default: None) + --local_rank LOCAL_RANK + For distributed training: local_rank (default: -1) + --ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl} + The backend to be used for distributed training + (default: None) + --tpu_num_cores TPU_NUM_CORES + TPU: Number of TPU cores (automatically passed by + launcher script) (default: None) + --tpu_metrics_debug [TPU_METRICS_DEBUG] + Deprecated, the use of `--debug tpu_metrics_debug` is + preferred. TPU: Whether to print debug metrics + (default: False) + --debug DEBUG [DEBUG ...] + Whether or not to enable debug mode. Current options: + `underflow_overflow` (Detect underflow and overflow in + activations and weights), `tpu_metrics_debug` (print + debug metrics on TPU). (default: None) + --dataloader_drop_last [DATALOADER_DROP_LAST] + Drop the last incomplete batch if it is not divisible + by the batch size. (default: False) + --eval_steps EVAL_STEPS + Run an evaluation every X steps. Should be an integer + or a float in range `[0,1)`. If smaller than 1, will + be interpreted as ratio of total training steps. + (default: None) + --dataloader_num_workers DATALOADER_NUM_WORKERS + Number of subprocesses to use for data loading + (PyTorch only). 0 means that the data will be loaded + in the main process. (default: 0) + --dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR + Number of batches loaded in advance by each worker. 2 + means there will be a total of 2 * num_workers batches + prefetched across all workers. Default is 2 for + PyTorch < 2.0.0 and otherwise None. (default: None) + --past_index PAST_INDEX + If >=0, uses the corresponding part of the output as + the past state for next step. (default: -1) + --run_name RUN_NAME An optional descriptor for the run. Notably used for + wandb, mlflow and comet logging. (default: None) + --disable_tqdm DISABLE_TQDM + Whether or not to disable the tqdm progress bars. + (default: None) + --remove_unused_columns [REMOVE_UNUSED_COLUMNS] + Remove columns not required by the model when using an + nlp.Dataset. (default: True) + --no_remove_unused_columns + Remove columns not required by the model when using an + nlp.Dataset. (default: False) + --label_names LABEL_NAMES [LABEL_NAMES ...] + The list of keys in your dictionary of inputs that + correspond to the labels. (default: None) + --load_best_model_at_end [LOAD_BEST_MODEL_AT_END] + Whether or not to load the best model found during + training at the end of training. When this option is + enabled, the best checkpoint will always be saved. See + `save_total_limit` for more. (default: False) + --metric_for_best_model METRIC_FOR_BEST_MODEL + The metric to use to compare two different models. + (default: None) + --greater_is_better GREATER_IS_BETTER + Whether the `metric_for_best_model` should be + maximized or not. (default: None) + --ignore_data_skip [IGNORE_DATA_SKIP] + When resuming training, whether or not to skip the + first epochs and batches to get to the same training + data. (default: False) + --fsdp FSDP Whether or not to use PyTorch Fully Sharded Data + Parallel (FSDP) training (in distributed training + only). The base option should be `full_shard`, + `shard_grad_op` or `no_shard` and you can add CPU- + offload to `full_shard` or `shard_grad_op` like this: + full_shard offload` or `shard_grad_op offload`. You + can add auto-wrap to `full_shard` or `shard_grad_op` + with the same syntax: full_shard auto_wrap` or + `shard_grad_op auto_wrap`. (default: ) + --fsdp_min_num_params FSDP_MIN_NUM_PARAMS + This parameter is deprecated. FSDP's minimum number of + parameters for Default Auto Wrapping. (useful only + when `fsdp` field is passed). (default: 0) + --fsdp_config FSDP_CONFIG + Config to be used with FSDP (Pytorch Fully Sharded + Data Parallel). The value is either a fsdp json config + file (e.g., `fsdp_config.json`) or an already loaded + json file as `dict`. (default: None) + --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP + This parameter is deprecated. Transformer layer class + name (case-sensitive) to wrap, e.g, `BertLayer`, + `GPTJBlock`, `T5Block` .... (useful only when `fsdp` + flag is passed). (default: None) + --accelerator_config ACCELERATOR_CONFIG + Config to be used with the internal Accelerator object + initializtion. The value is either a accelerator json + config file (e.g., `accelerator_config.json`) or an + already loaded json file as `dict`. (default: None) + --deepspeed DEEPSPEED + Enable deepspeed and pass the path to deepspeed json + config file (e.g. `ds_config.json`) or an already + loaded json file as a dict (default: None) + --label_smoothing_factor LABEL_SMOOTHING_FACTOR + The label smoothing epsilon to apply (zero means no + label smoothing). (default: 0.0) + --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo} + The optimizer to use. (default: adamw_torch) + --optim_args OPTIM_ARGS + Optional arguments to supply to optimizer. (default: + None) + --adafactor [ADAFACTOR] + Whether or not to replace AdamW by Adafactor. + (default: False) + --group_by_length [GROUP_BY_LENGTH] + Whether or not to group samples of roughly the same + length together when batching. (default: False) + --length_column_name LENGTH_COLUMN_NAME + Column name with precomputed lengths to use when + grouping by length. (default: length) + --report_to REPORT_TO + The list of integrations to report the results and + logs to. (default: None) + --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS + When using distributed training, the value of the flag + `find_unused_parameters` passed to + `DistributedDataParallel`. (default: None) + --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB + When using distributed training, the value of the flag + `bucket_cap_mb` passed to `DistributedDataParallel`. + (default: None) + --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS + When using distributed training, the value of the flag + `broadcast_buffers` passed to + `DistributedDataParallel`. (default: None) + --dataloader_pin_memory [DATALOADER_PIN_MEMORY] + Whether or not to pin memory for DataLoader. (default: + True) + --no_dataloader_pin_memory + Whether or not to pin memory for DataLoader. (default: + False) + --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS] + If True, the data loader will not shut down the worker + processes after a dataset has been consumed once. This + allows to maintain the workers Dataset instances + alive. Can potentially speed up training, but will + increase RAM usage. (default: False) + --skip_memory_metrics [SKIP_MEMORY_METRICS] + Whether or not to skip adding of memory profiler + reports to metrics. (default: True) + --no_skip_memory_metrics + Whether or not to skip adding of memory profiler + reports to metrics. (default: False) + --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP] + Whether or not to use the legacy prediction_loop in + the Trainer. (default: False) + --push_to_hub [PUSH_TO_HUB] + Whether or not to upload the trained model to the + model hub after training. (default: False) + --resume_from_checkpoint RESUME_FROM_CHECKPOINT + The path to a folder with a valid checkpoint for your + model. (default: None) + --hub_model_id HUB_MODEL_ID + The name of the repository to keep in sync with the + local `output_dir`. (default: None) + --hub_strategy {end,every_save,checkpoint,all_checkpoints} + The hub strategy to use when `--push_to_hub` is + activated. (default: every_save) + --hub_token HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --hub_private_repo [HUB_PRIVATE_REPO] + Whether the model repository is private or not. + (default: False) + --hub_always_push [HUB_ALWAYS_PUSH] + Unless `True`, the Trainer will skip pushes if the + previous one wasn't finished yet. (default: False) + --gradient_checkpointing [GRADIENT_CHECKPOINTING] + If True, use gradient checkpointing to save memory at + the expense of slower backward pass. (default: False) + --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS + Gradient checkpointing key word arguments such as + `use_reentrant`. Will be passed to + `torch.utils.checkpoint.checkpoint` through + `model.gradient_checkpointing_enable`. (default: None) + --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS] + Whether or not the inputs will be passed to the + `compute_metrics` function. (default: False) + --eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES] + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: True) + --no_eval_do_concat_batches + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: False) + --fp16_backend {auto,apex,cpu_amp} + Deprecated. Use half_precision_backend instead + (default: auto) + --evaluation_strategy {no,steps,epoch} + Deprecated. Use `eval_strategy` instead (default: + None) + --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID + The name of the repository to which push the + `Trainer`. (default: None) + --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION + The name of the organization in with to which push the + `Trainer`. (default: None) + --push_to_hub_token PUSH_TO_HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --mp_parameters MP_PARAMETERS + Used by the SageMaker launcher to send mp-specific + args. Ignored in Trainer (default: ) + --auto_find_batch_size [AUTO_FIND_BATCH_SIZE] + Whether to automatically decrease the batch size in + half and rerun the training loop again each time a + CUDA Out-of-Memory was reached (default: False) + --full_determinism [FULL_DETERMINISM] + Whether to call enable_full_determinism instead of + set_seed for reproducibility in distributed training. + Important: this will negatively impact the + performance, so only use it for debugging. (default: + False) + --torchdynamo TORCHDYNAMO + This argument is deprecated, use + `--torch_compile_backend` instead. (default: None) + --ray_scope RAY_SCOPE + The scope to use when doing hyperparameter search with + Ray. By default, `"last"` will be used. Ray will then + use the last checkpoint of all trials, compare those, + and select the best one. However, other options are + also available. See the Ray documentation (https://doc + s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun + e.ExperimentAnalysis.get_best_trial) for more options. + (default: last) + --ddp_timeout DDP_TIMEOUT + Overrides the default timeout for distributed training + (value should be given in seconds). (default: 1800) + --torch_compile [TORCH_COMPILE] + If set to `True`, the model will be wrapped in + `torch.compile`. (default: False) + --torch_compile_backend TORCH_COMPILE_BACKEND + Which backend to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --torch_compile_mode TORCH_COMPILE_MODE + Which mode to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --dispatch_batches DISPATCH_BATCHES + Deprecated. Pass {'dispatch_batches':VALUE} to + `accelerator_config`. (default: None) + --split_batches SPLIT_BATCHES + Deprecated. Pass {'split_batches':True} to + `accelerator_config`. (default: None) + --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND] + If set to `True`, the speed metrics will include `tgs` + (tokens per second per device). (default: False) + --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN] + If set to `True`, will track the number of input + tokens seen throughout training. (May be slower in + distributed training) (default: False) + --neftune_noise_alpha NEFTUNE_NOISE_ALPHA + Activates neftune noise embeddings into the model. + NEFTune has been proven to drastically improve model + performances for instrcution fine-tuning. Check out + the original paper here: + https://arxiv.org/abs/2310.05914 and the original code + here: https://github.com/neelsjain/NEFTune. Only + supported for `PreTrainedModel` and `PeftModel` + classes. (default: None) + --optim_target_modules OPTIM_TARGET_MODULES + Target modules for the optimizer defined in the + `optim` argument. Only used for the GaLore optimizer + at the moment. (default: None) + --batch_eval_metrics [BATCH_EVAL_METRICS] + Break eval metrics calculation into batches to save + memory. (default: False) + --eval_on_start [EVAL_ON_START] + Whether to run through the entire `evaluation` step at + the very beginning of training as a sanity check. + (default: False) + --eval_use_gather_object [EVAL_USE_GATHER_OBJECT] + Whether to run recursively gather object in a nested + list/tuple/dictionary of objects from all devices. + (default: False) + --sortish_sampler [SORTISH_SAMPLER] + Whether to use SortishSampler or not. (default: False) + --predict_with_generate [PREDICT_WITH_GENERATE] + Whether to use generate to calculate generative + metrics (ROUGE, BLEU). (default: False) + --generation_max_length GENERATION_MAX_LENGTH + The `max_length` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `max_length` value of the model configuration. + (default: None) + --generation_num_beams GENERATION_NUM_BEAMS + The `num_beams` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `num_beams` value of the model configuration. + (default: None) + --generation_config GENERATION_CONFIG + Model id, file path or url pointing to a + GenerationConfig json file, to use during prediction. + (default: None) + --use_badam [USE_BADAM] + Whether or not to use the BAdam optimizer. (default: + False) + --badam_mode {layer,ratio} + Whether to use layer-wise or ratio-wise BAdam + optimizer. (default: layer) + --badam_start_block BADAM_START_BLOCK + The starting block index for layer-wise BAdam. + (default: None) + --badam_switch_mode {ascending,descending,random,fixed} + the strategy of picking block to update for layer-wise + BAdam. (default: ascending) + --badam_switch_interval BADAM_SWITCH_INTERVAL + Number of steps to update the block for layer-wise + BAdam. Use -1 to disable the block update. (default: + 50) + --badam_update_ratio BADAM_UPDATE_RATIO + The ratio of the update for ratio-wise BAdam. + (default: 0.05) + --badam_mask_mode {adjacent,scatter} + The mode of the mask for BAdam optimizer. `adjacent` + means that the trainable parameters are adjacent to + each other, `scatter` means that trainable parameters + are randomly choosed from the weight. (default: + adjacent) + --badam_verbose BADAM_VERBOSE + The verbosity level of BAdam optimizer. 0 for no + print, 1 for print the block prefix, 2 for print + trainable parameters. (default: 0) + --use_galore [USE_GALORE] + Whether or not to use the gradient low-Rank projection + (GaLore). (default: False) + --galore_target GALORE_TARGET + Name(s) of modules to apply GaLore. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --galore_rank GALORE_RANK + The rank of GaLore gradients. (default: 16) + --galore_update_interval GALORE_UPDATE_INTERVAL + Number of steps to update the GaLore projection. + (default: 200) + --galore_scale GALORE_SCALE + GaLore scaling coefficient. (default: 0.25) + --galore_proj_type {std,reverse_std,right,left,full} + Type of GaLore projection. (default: std) + --galore_layerwise [GALORE_LAYERWISE] + Whether or not to enable layer-wise update to further + save memory. (default: False) + --pref_beta PREF_BETA + The beta parameter in the preference loss. (default: + 0.1) + --pref_ftx PREF_FTX The supervised fine-tuning loss coefficient in DPO + training. (default: 0.0) + --pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo} + The type of DPO loss to use. (default: sigmoid) + --dpo_label_smoothing DPO_LABEL_SMOOTHING + The robust DPO label smoothing parameter in cDPO that + should be between 0 and 0.5. (default: 0.0) + --kto_chosen_weight KTO_CHOSEN_WEIGHT + The weight factor of the desirable losses in KTO + training. (default: 1.0) + --kto_rejected_weight KTO_REJECTED_WEIGHT + The weight factor of the undesirable losses in KTO + training. (default: 1.0) + --simpo_gamma SIMPO_GAMMA + The target reward margin term in SimPO loss. (default: + 0.5) + --ppo_buffer_size PPO_BUFFER_SIZE + The number of mini-batches to make experience buffer + in a PPO optimization step. (default: 1) + --ppo_epochs PPO_EPOCHS + The number of epochs to perform in a PPO optimization + step. (default: 4) + --ppo_score_norm [PPO_SCORE_NORM] + Use score normalization in PPO training. (default: + False) + --ppo_target PPO_TARGET + Target KL value for adaptive KL control in PPO + training. (default: 6.0) + --ppo_whiten_rewards [PPO_WHITEN_REWARDS] + Whiten the rewards before compute advantages in PPO + training. (default: False) + --ref_model REF_MODEL + Path to the reference model used for the PPO or DPO + training. (default: None) + --ref_model_adapters REF_MODEL_ADAPTERS + Path to the adapters of the reference model. (default: + None) + --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reference model. + (default: None) + --reward_model REWARD_MODEL + Path to the reward model used for the PPO training. + (default: None) + --reward_model_adapters REWARD_MODEL_ADAPTERS + Path to the adapters of the reward model. (default: + None) + --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reward model. + (default: None) + --reward_model_type {lora,full,api} + The type of the reward model in PPO training. Lora + model only supports lora training. (default: lora) + --additional_target ADDITIONAL_TARGET + Name(s) of modules apart from LoRA layers to be set as + trainable and saved in the final checkpoint. Use + commas to separate multiple modules. (default: None) + --lora_alpha LORA_ALPHA + The scale factor for LoRA fine-tuning (default: + lora_rank * 2). (default: None) + --lora_dropout LORA_DROPOUT + Dropout rate for the LoRA fine-tuning. (default: 0.0) + --lora_rank LORA_RANK + The intrinsic dimension for LoRA fine-tuning. + (default: 8) + --lora_target LORA_TARGET + Name(s) of target modules to apply LoRA. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --loraplus_lr_ratio LORAPLUS_LR_RATIO + LoRA plus learning rate ratio (lr_B / lr_A). (default: + None) + --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING + LoRA plus learning rate for lora embedding layers. + (default: 1e-06) + --use_rslora [USE_RSLORA] + Whether or not to use the rank stabilization scaling + factor for LoRA layer. (default: False) + --use_dora [USE_DORA] + Whether or not to use the weight-decomposed lora + method (DoRA). (default: False) + --pissa_init [PISSA_INIT] + Whether or not to initialize a PiSSA adapter. + (default: False) + --pissa_iter PISSA_ITER + The number of iteration steps performed by FSVD in + PiSSA. Use -1 to disable it. (default: 16) + --pissa_convert [PISSA_CONVERT] + Whether or not to convert the PiSSA adapter to a + normal LoRA adapter. (default: False) + --create_new_adapter [CREATE_NEW_ADAPTER] + Whether or not to create a new adapter with randomly + initialized weight. (default: False) + --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS + The number of trainable layers for freeze (partial- + parameter) fine-tuning. Positive numbers mean the last + n layers are set as trainable, negative numbers mean + the first n layers are set as trainable. (default: 2) + --freeze_trainable_modules FREEZE_TRAINABLE_MODULES + Name(s) of trainable modules for freeze (partial- + parameter) fine-tuning. Use commas to separate + multiple modules. Use `all` to specify all the + available modules. (default: all) + --freeze_extra_modules FREEZE_EXTRA_MODULES + Name(s) of modules apart from hidden layers to be set + as trainable for freeze (partial-parameter) fine- + tuning. Use commas to separate multiple modules. + (default: None) + --pure_bf16 [PURE_BF16] + Whether or not to train model in purely bf16 precision + (without AMP). (default: False) + --stage {pt,sft,rm,ppo,dpo,kto} + Which stage will be performed in training. (default: + sft) + --finetuning_type {lora,freeze,full} + Which fine-tuning method to use. (default: lora) + --use_llama_pro [USE_LLAMA_PRO] + Whether or not to make only the parameters in the + expanded blocks trainable. (default: False) + --use_adam_mini [USE_ADAM_MINI] + Whether or not to use the Adam-mini optimizer. + (default: False) + --freeze_vision_tower [FREEZE_VISION_TOWER] + Whether ot not to freeze vision tower in MLLM + training. (default: True) + --no_freeze_vision_tower + Whether ot not to freeze vision tower in MLLM + training. (default: False) + --train_mm_proj_only [TRAIN_MM_PROJ_ONLY] + Whether or not to train the multimodal projector for + MLLM only. (default: False) + --compute_accuracy [COMPUTE_ACCURACY] + Whether or not to compute the token-level accuracy at + evaluation. (default: False) + --plot_loss [PLOT_LOSS] + Whether or not to save the training loss curves. + (default: False) + --do_sample [DO_SAMPLE] + Whether or not to use sampling, use greedy decoding + otherwise. (default: True) + --no_do_sample Whether or not to use sampling, use greedy decoding + otherwise. (default: False) + --temperature TEMPERATURE + The value used to modulate the next token + probabilities. (default: 0.95) + --top_p TOP_P The smallest set of most probable tokens with + probabilities that add up to top_p or higher are kept. + (default: 0.7) + --top_k TOP_K The number of highest probability vocabulary tokens to + keep for top-k filtering. (default: 50) + --num_beams NUM_BEAMS + Number of beams for beam search. 1 means no beam + search. (default: 1) + --max_length MAX_LENGTH + The maximum length the generated tokens can have. It + can be overridden by max_new_tokens. (default: 1024) + --max_new_tokens MAX_NEW_TOKENS + The maximum numbers of tokens to generate, ignoring + the number of tokens in the prompt. (default: 1024) + --repetition_penalty REPETITION_PENALTY + The parameter for repetition penalty. 1.0 means no + penalty. (default: 1.0) + --length_penalty LENGTH_PENALTY + Exponential penalty to the length that is used with + beam-based generation. (default: 1.0) + --default_system DEFAULT_SYSTEM + Default system message to use in chat completion. + (default: None) +usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH + [--adapter_name_or_path ADAPTER_NAME_OR_PATH] + [--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR] + [--use_fast_tokenizer [USE_FAST_TOKENIZER]] + [--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]] + [--split_special_tokens [SPLIT_SPECIAL_TOKENS]] + [--new_special_tokens NEW_SPECIAL_TOKENS] + [--model_revision MODEL_REVISION] + [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]] + [--no_low_cpu_mem_usage] + [--quantization_method {bitsandbytes,hqq,eetq}] + [--quantization_bit QUANTIZATION_BIT] + [--quantization_type {fp4,nf4}] + [--double_quantization [DOUBLE_QUANTIZATION]] + [--no_double_quantization] + [--quantization_device_map {auto}] + [--rope_scaling {linear,dynamic}] + [--flash_attn {auto,disabled,sdpa,fa2}] + [--shift_attn [SHIFT_ATTN]] + [--mixture_of_depths {convert,load}] + [--use_unsloth [USE_UNSLOTH]] + [--visual_inputs [VISUAL_INPUTS]] + [--moe_aux_loss_coef MOE_AUX_LOSS_COEF] + [--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]] + [--upcast_layernorm [UPCAST_LAYERNORM]] + [--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]] + [--train_from_scratch [TRAIN_FROM_SCRATCH]] + [--infer_backend {huggingface,vllm}] + [--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL] + [--vllm_enforce_eager [VLLM_ENFORCE_EAGER]] + [--vllm_max_lora_rank VLLM_MAX_LORA_RANK] + [--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]] + [--no_use_cache] + [--infer_dtype {auto,float16,bfloat16,float32}] + [--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN] + [--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE] + [--export_device {cpu,auto}] + [--export_quantization_bit EXPORT_QUANTIZATION_BIT] + [--export_quantization_dataset EXPORT_QUANTIZATION_DATASET] + [--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES] + [--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN] + [--export_legacy_format [EXPORT_LEGACY_FORMAT]] + [--export_hub_model_id EXPORT_HUB_MODEL_ID] + [--print_param_status [PRINT_PARAM_STATUS]] + [--template TEMPLATE] [--dataset DATASET] + [--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR] + [--cutoff_len CUTOFF_LEN] + [--train_on_prompt [TRAIN_ON_PROMPT]] + [--mask_history [MASK_HISTORY]] [--streaming [STREAMING]] + [--buffer_size BUFFER_SIZE] + [--mix_strategy {concat,interleave_under,interleave_over}] + [--interleave_probs INTERLEAVE_PROBS] + [--overwrite_cache [OVERWRITE_CACHE]] + [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS] + [--max_samples MAX_SAMPLES] + [--eval_num_beams EVAL_NUM_BEAMS] + [--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]] + [--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE] + [--packing PACKING] [--neat_packing [NEAT_PACKING]] + [--tool_format TOOL_FORMAT] + [--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR + [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] + [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] + [--do_predict [DO_PREDICT]] + [--eval_strategy {no,steps,epoch}] + [--prediction_loss_only [PREDICTION_LOSS_ONLY]] + [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] + [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE] + [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] + [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] + [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] + [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] + [--eval_delay EVAL_DELAY] + [--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS] + [--learning_rate LEARNING_RATE] + [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] + [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] + [--max_grad_norm MAX_GRAD_NORM] + [--num_train_epochs NUM_TRAIN_EPOCHS] + [--max_steps MAX_STEPS] + [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}] + [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] + [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS] + [--log_level {detail,debug,info,warning,error,critical,passive}] + [--log_level_replica {detail,debug,info,warning,error,critical,passive}] + [--log_on_each_node [LOG_ON_EACH_NODE]] + [--no_log_on_each_node] [--logging_dir LOGGING_DIR] + [--logging_strategy {no,steps,epoch}] + [--logging_first_step [LOGGING_FIRST_STEP]] + [--logging_steps LOGGING_STEPS] + [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] + [--no_logging_nan_inf_filter] + [--save_strategy {no,steps,epoch}] + [--save_steps SAVE_STEPS] + [--save_total_limit SAVE_TOTAL_LIMIT] + [--save_safetensors [SAVE_SAFETENSORS]] + [--no_save_safetensors] + [--save_on_each_node [SAVE_ON_EACH_NODE]] + [--save_only_model [SAVE_ONLY_MODEL]] + [--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]] + [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]] + [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] + [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] + [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] + [--fp16_opt_level FP16_OPT_LEVEL] + [--half_precision_backend {auto,apex,cpu_amp}] + [--bf16_full_eval [BF16_FULL_EVAL]] + [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] + [--local_rank LOCAL_RANK] + [--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}] + [--tpu_num_cores TPU_NUM_CORES] + [--tpu_metrics_debug [TPU_METRICS_DEBUG]] + [--debug DEBUG [DEBUG ...]] + [--dataloader_drop_last [DATALOADER_DROP_LAST]] + [--eval_steps EVAL_STEPS] + [--dataloader_num_workers DATALOADER_NUM_WORKERS] + [--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR] + [--past_index PAST_INDEX] [--run_name RUN_NAME] + [--disable_tqdm DISABLE_TQDM] + [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] + [--no_remove_unused_columns] + [--label_names LABEL_NAMES [LABEL_NAMES ...]] + [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] + [--metric_for_best_model METRIC_FOR_BEST_MODEL] + [--greater_is_better GREATER_IS_BETTER] + [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP] + [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] + [--fsdp_config FSDP_CONFIG] + [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] + [--accelerator_config ACCELERATOR_CONFIG] + [--deepspeed DEEPSPEED] + [--label_smoothing_factor LABEL_SMOOTHING_FACTOR] + [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}] + [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] + [--group_by_length [GROUP_BY_LENGTH]] + [--length_column_name LENGTH_COLUMN_NAME] + [--report_to REPORT_TO] + [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] + [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] + [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] + [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] + [--no_dataloader_pin_memory] + [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] + [--skip_memory_metrics [SKIP_MEMORY_METRICS]] + [--no_skip_memory_metrics] + [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] + [--push_to_hub [PUSH_TO_HUB]] + [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] + [--hub_model_id HUB_MODEL_ID] + [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] + [--hub_token HUB_TOKEN] + [--hub_private_repo [HUB_PRIVATE_REPO]] + [--hub_always_push [HUB_ALWAYS_PUSH]] + [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] + [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS] + [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] + [--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]] + [--no_eval_do_concat_batches] + [--fp16_backend {auto,apex,cpu_amp}] + [--evaluation_strategy {no,steps,epoch}] + [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID] + [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] + [--push_to_hub_token PUSH_TO_HUB_TOKEN] + [--mp_parameters MP_PARAMETERS] + [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] + [--full_determinism [FULL_DETERMINISM]] + [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] + [--ddp_timeout DDP_TIMEOUT] + [--torch_compile [TORCH_COMPILE]] + [--torch_compile_backend TORCH_COMPILE_BACKEND] + [--torch_compile_mode TORCH_COMPILE_MODE] + [--dispatch_batches DISPATCH_BATCHES] + [--split_batches SPLIT_BATCHES] + [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] + [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]] + [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] + [--optim_target_modules OPTIM_TARGET_MODULES] + [--batch_eval_metrics [BATCH_EVAL_METRICS]] + [--eval_on_start [EVAL_ON_START]] + [--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]] + [--sortish_sampler [SORTISH_SAMPLER]] + [--predict_with_generate [PREDICT_WITH_GENERATE]] + [--generation_max_length GENERATION_MAX_LENGTH] + [--generation_num_beams GENERATION_NUM_BEAMS] + [--generation_config GENERATION_CONFIG] + [--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}] + [--badam_start_block BADAM_START_BLOCK] + [--badam_switch_mode {ascending,descending,random,fixed}] + [--badam_switch_interval BADAM_SWITCH_INTERVAL] + [--badam_update_ratio BADAM_UPDATE_RATIO] + [--badam_mask_mode {adjacent,scatter}] + [--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]] + [--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK] + [--galore_update_interval GALORE_UPDATE_INTERVAL] + [--galore_scale GALORE_SCALE] + [--galore_proj_type {std,reverse_std,right,left,full}] + [--galore_layerwise [GALORE_LAYERWISE]] + [--pref_beta PREF_BETA] [--pref_ftx PREF_FTX] + [--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}] + [--dpo_label_smoothing DPO_LABEL_SMOOTHING] + [--kto_chosen_weight KTO_CHOSEN_WEIGHT] + [--kto_rejected_weight KTO_REJECTED_WEIGHT] + [--simpo_gamma SIMPO_GAMMA] + [--ppo_buffer_size PPO_BUFFER_SIZE] + [--ppo_epochs PPO_EPOCHS] + [--ppo_score_norm [PPO_SCORE_NORM]] + [--ppo_target PPO_TARGET] + [--ppo_whiten_rewards [PPO_WHITEN_REWARDS]] + [--ref_model REF_MODEL] + [--ref_model_adapters REF_MODEL_ADAPTERS] + [--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT] + [--reward_model REWARD_MODEL] + [--reward_model_adapters REWARD_MODEL_ADAPTERS] + [--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT] + [--reward_model_type {lora,full,api}] + [--additional_target ADDITIONAL_TARGET] + [--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT] + [--lora_rank LORA_RANK] [--lora_target LORA_TARGET] + [--loraplus_lr_ratio LORAPLUS_LR_RATIO] + [--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING] + [--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]] + [--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER] + [--pissa_convert [PISSA_CONVERT]] + [--create_new_adapter [CREATE_NEW_ADAPTER]] + [--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS] + [--freeze_trainable_modules FREEZE_TRAINABLE_MODULES] + [--freeze_extra_modules FREEZE_EXTRA_MODULES] + [--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}] + [--finetuning_type {lora,freeze,full}] + [--use_llama_pro [USE_LLAMA_PRO]] + [--use_adam_mini [USE_ADAM_MINI]] + [--freeze_vision_tower [FREEZE_VISION_TOWER]] + [--no_freeze_vision_tower] + [--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]] + [--compute_accuracy [COMPUTE_ACCURACY]] + [--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]] + [--no_do_sample] [--temperature TEMPERATURE] + [--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS] + [--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS] + [--repetition_penalty REPETITION_PENALTY] + [--length_penalty LENGTH_PENALTY] + [--default_system DEFAULT_SYSTEM] + +optional arguments: + -h, --help show this help message and exit + --model_name_or_path MODEL_NAME_OR_PATH + Path to the model weight or identifier from + huggingface.co/models or modelscope.cn/models. + (default: None) + --adapter_name_or_path ADAPTER_NAME_OR_PATH + Path to the adapter weight or identifier from + huggingface.co/models. Use commas to separate multiple + adapters. (default: None) + --adapter_folder ADAPTER_FOLDER + The folder containing the adapter weights to load. + (default: None) + --cache_dir CACHE_DIR + Where to store the pre-trained models downloaded from + huggingface.co or modelscope.cn. (default: None) + --use_fast_tokenizer [USE_FAST_TOKENIZER] + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: True) + --no_use_fast_tokenizer + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: False) + --resize_vocab [RESIZE_VOCAB] + Whether or not to resize the tokenizer vocab and the + embedding layers. (default: False) + --split_special_tokens [SPLIT_SPECIAL_TOKENS] + Whether or not the special tokens should be split + during the tokenization process. (default: False) + --new_special_tokens NEW_SPECIAL_TOKENS + Special tokens to be added into the tokenizer. Use + commas to separate multiple tokens. (default: None) + --model_revision MODEL_REVISION + The specific model version to use (can be a branch + name, tag name or commit id). (default: main) + --low_cpu_mem_usage [LOW_CPU_MEM_USAGE] + Whether or not to use memory-efficient model loading. + (default: True) + --no_low_cpu_mem_usage + Whether or not to use memory-efficient model loading. + (default: False) + --quantization_method {bitsandbytes,hqq,eetq} + Quantization method to use for on-the-fly + quantization. (default: bitsandbytes) + --quantization_bit QUANTIZATION_BIT + The number of bits to quantize the model using + bitsandbytes. (default: None) + --quantization_type {fp4,nf4} + Quantization data type to use in int4 training. + (default: nf4) + --double_quantization [DOUBLE_QUANTIZATION] + Whether or not to use double quantization in int4 + training. (default: True) + --no_double_quantization + Whether or not to use double quantization in int4 + training. (default: False) + --quantization_device_map {auto} + Device map used to infer the 4-bit quantized model, + needs bitsandbytes>=0.43.0. (default: None) + --rope_scaling {linear,dynamic} + Which scaling strategy should be adopted for the RoPE + embeddings. (default: None) + --flash_attn {auto,disabled,sdpa,fa2} + Enable FlashAttention for faster training and + inference. (default: auto) + --shift_attn [SHIFT_ATTN] + Enable shift short attention (S^2-Attn) proposed by + LongLoRA. (default: False) + --mixture_of_depths {convert,load} + Convert the model to mixture-of-depths (MoD) or load + the MoD model. (default: None) + --use_unsloth [USE_UNSLOTH] + Whether or not to use unsloth's optimization for the + LoRA training. (default: False) + --visual_inputs [VISUAL_INPUTS] + Whethor or not to use multimodal LLM that accepts + visual inputs. (default: False) + --moe_aux_loss_coef MOE_AUX_LOSS_COEF + Coefficient of the auxiliary router loss in mixture- + of-experts model. (default: None) + --disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING] + Whether or not to disable gradient checkpointing. + (default: False) + --upcast_layernorm [UPCAST_LAYERNORM] + Whether or not to upcast the layernorm weights in + fp32. (default: False) + --upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT] + Whether or not to upcast the output of lm_head in + fp32. (default: False) + --train_from_scratch [TRAIN_FROM_SCRATCH] + Whether or not to randomly initialize the model + weights. (default: False) + --infer_backend {huggingface,vllm} + Backend engine used at inference. (default: + huggingface) + --vllm_maxlen VLLM_MAXLEN + Maximum sequence (prompt + response) length of the + vLLM engine. (default: 2048) + --vllm_gpu_util VLLM_GPU_UTIL + The fraction of GPU memory in (0,1) to be used for the + vLLM engine. (default: 0.9) + --vllm_enforce_eager [VLLM_ENFORCE_EAGER] + Whether or not to disable CUDA graph in the vLLM + engine. (default: False) + --vllm_max_lora_rank VLLM_MAX_LORA_RANK + Maximum rank of all LoRAs in the vLLM engine. + (default: 32) + --offload_folder OFFLOAD_FOLDER + Path to offload model weights. (default: offload) + --use_cache [USE_CACHE] + Whether or not to use KV cache in generation. + (default: True) + --no_use_cache Whether or not to use KV cache in generation. + (default: False) + --infer_dtype {auto,float16,bfloat16,float32} + Data type for model weights and activations at + inference. (default: auto) + --hf_hub_token HF_HUB_TOKEN + Auth token to log in with Hugging Face Hub. (default: + None) + --ms_hub_token MS_HUB_TOKEN + Auth token to log in with ModelScope Hub. (default: + None) + --export_dir EXPORT_DIR + Path to the directory to save the exported model. + (default: None) + --export_size EXPORT_SIZE + The file shard size (in GB) of the exported model. + (default: 1) + --export_device {cpu,auto} + The device used in model export, use `auto` to + accelerate exporting. (default: cpu) + --export_quantization_bit EXPORT_QUANTIZATION_BIT + The number of bits to quantize the exported model. + (default: None) + --export_quantization_dataset EXPORT_QUANTIZATION_DATASET + Path to the dataset or dataset name to use in + quantizing the exported model. (default: None) + --export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES + The number of samples used for quantization. (default: + 128) + --export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN + The maximum length of the model inputs used for + quantization. (default: 1024) + --export_legacy_format [EXPORT_LEGACY_FORMAT] + Whether or not to save the `.bin` files instead of + `.safetensors`. (default: False) + --export_hub_model_id EXPORT_HUB_MODEL_ID + The name of the repository if push the model to the + Hugging Face hub. (default: None) + --print_param_status [PRINT_PARAM_STATUS] + For debugging purposes, print the status of the + parameters in the model. (default: False) + --template TEMPLATE Which template to use for constructing prompts in + training and inference. (default: None) + --dataset DATASET The name of dataset(s) to use for training. Use commas + to separate multiple datasets. (default: None) + --eval_dataset EVAL_DATASET + The name of dataset(s) to use for evaluation. Use + commas to separate multiple datasets. (default: None) + --dataset_dir DATASET_DIR + Path to the folder containing the datasets. (default: + data) + --cutoff_len CUTOFF_LEN + The cutoff length of the tokenized inputs in the + dataset. (default: 1024) + --train_on_prompt [TRAIN_ON_PROMPT] + Whether or not to disable the mask on the prompt. + (default: False) + --mask_history [MASK_HISTORY] + Whether or not to mask the history and train on the + last turn only. (default: False) + --streaming [STREAMING] + Enable dataset streaming. (default: False) + --buffer_size BUFFER_SIZE + Size of the buffer to randomly sample examples from in + dataset streaming. (default: 16384) + --mix_strategy {concat,interleave_under,interleave_over} + Strategy to use in dataset mixing (concat/interleave) + (undersampling/oversampling). (default: concat) + --interleave_probs INTERLEAVE_PROBS + Probabilities to sample data from datasets. Use commas + to separate multiple datasets. (default: None) + --overwrite_cache [OVERWRITE_CACHE] + Overwrite the cached training and evaluation sets. + (default: False) + --preprocessing_num_workers PREPROCESSING_NUM_WORKERS + The number of processes to use for the pre-processing. + (default: None) + --max_samples MAX_SAMPLES + For debugging purposes, truncate the number of + examples for each dataset. (default: None) + --eval_num_beams EVAL_NUM_BEAMS + Number of beams to use for evaluation. This argument + will be passed to `model.generate` (default: None) + --ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS] + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: True) + --no_ignore_pad_token_for_loss + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: False) + --val_size VAL_SIZE Size of the development set, should be an integer or a + float in range `[0,1)`. (default: 0.0) + --packing PACKING Enable sequences packing in training. Will + automatically enable in pre-training. (default: None) + --neat_packing [NEAT_PACKING] + Enable sequence packing without cross-attention. + (default: False) + --tool_format TOOL_FORMAT + Tool format to use for constructing function calling + examples. (default: None) + --tokenized_path TOKENIZED_PATH + Path to save or load the tokenized datasets. (default: + None) + --output_dir OUTPUT_DIR + The output directory where the model predictions and + checkpoints will be written. (default: None) + --overwrite_output_dir [OVERWRITE_OUTPUT_DIR] + Overwrite the content of the output directory. Use + this to continue training if output_dir points to a + checkpoint directory. (default: False) + --do_train [DO_TRAIN] + Whether to run training. (default: False) + --do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False) + --do_predict [DO_PREDICT] + Whether to run predictions on the test set. (default: + False) + --eval_strategy {no,steps,epoch} + The evaluation strategy to use. (default: no) + --prediction_loss_only [PREDICTION_LOSS_ONLY] + When performing evaluation and predictions, only + returns the loss. (default: False) + --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for training. + (default: 8) + --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for + evaluation. (default: 8) + --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE + Deprecated, the use of `--per_device_train_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + training. (default: None) + --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE + Deprecated, the use of `--per_device_eval_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + evaluation. (default: None) + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + Number of updates steps to accumulate before + performing a backward/update pass. (default: 1) + --eval_accumulation_steps EVAL_ACCUMULATION_STEPS + Number of predictions steps to accumulate before + moving the tensors to the CPU. (default: None) + --eval_delay EVAL_DELAY + Number of epochs or steps to wait for before the first + evaluation can be performed, depending on the + eval_strategy. (default: 0) + --torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS + Number of steps to wait before calling + `torch..empty_cache()`.This can help avoid + CUDA out-of-memory errors by lowering peak VRAM usage + at a cost of about [10{'option_strings': ['-- + torch_empty_cache_steps'], 'dest': + 'torch_empty_cache_steps', 'nargs': None, 'const': + None, 'default': None, 'type': 'int', 'choices': None, + 'required': False, 'help': 'Number of steps to wait + before calling `torch..empty_cache()`.This can + help avoid CUDA out-of-memory errors by lowering peak + VRAM usage at a cost of about [10% slower performance] + (https://github.com/huggingface/transformers/issues/31 + 372).If left unset or set to None, cache will not be + emptied.', 'metavar': None, 'container': + , + 'prog': 'launcher.py'}lower performance](https://githu + b.com/huggingface/transformers/issues/31372).If left + unset or set to None, cache will not be emptied. + (default: None) + --learning_rate LEARNING_RATE + The initial learning rate for AdamW. (default: 5e-05) + --weight_decay WEIGHT_DECAY + Weight decay for AdamW if we apply some. (default: + 0.0) + --adam_beta1 ADAM_BETA1 + Beta1 for AdamW optimizer (default: 0.9) + --adam_beta2 ADAM_BETA2 + Beta2 for AdamW optimizer (default: 0.999) + --adam_epsilon ADAM_EPSILON + Epsilon for AdamW optimizer. (default: 1e-08) + --max_grad_norm MAX_GRAD_NORM + Max gradient norm. (default: 1.0) + --num_train_epochs NUM_TRAIN_EPOCHS + Total number of training epochs to perform. (default: + 3.0) + --max_steps MAX_STEPS + If > 0: set total number of training steps to perform. + Override num_train_epochs. (default: -1) + --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay} + The scheduler type to use. (default: linear) + --lr_scheduler_kwargs LR_SCHEDULER_KWARGS + Extra parameters for the lr_scheduler such as + {'num_cycles': 1} for the cosine with hard restarts. + (default: {}) + --warmup_ratio WARMUP_RATIO + Linear warmup over warmup_ratio fraction of total + steps. (default: 0.0) + --warmup_steps WARMUP_STEPS + Linear warmup over warmup_steps. (default: 0) + --log_level {detail,debug,info,warning,error,critical,passive} + Logger log level to use on the main node. Possible + choices are the log levels as strings: 'debug', + 'info', 'warning', 'error' and 'critical', plus a + 'passive' level which doesn't set anything and lets + the application set the level. Defaults to 'passive'. + (default: passive) + --log_level_replica {detail,debug,info,warning,error,critical,passive} + Logger log level to use on replica nodes. Same choices + and defaults as ``log_level`` (default: warning) + --log_on_each_node [LOG_ON_EACH_NODE] + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: True) + --no_log_on_each_node + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: False) + --logging_dir LOGGING_DIR + Tensorboard log dir. (default: None) + --logging_strategy {no,steps,epoch} + The logging strategy to use. (default: steps) + --logging_first_step [LOGGING_FIRST_STEP] + Log the first global_step (default: False) + --logging_steps LOGGING_STEPS + Log every X updates steps. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be + interpreted as ratio of total training steps. + (default: 500) + --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER] + Filter nan and inf losses for logging. (default: True) + --no_logging_nan_inf_filter + Filter nan and inf losses for logging. (default: + False) + --save_strategy {no,steps,epoch} + The checkpoint save strategy to use. (default: steps) + --save_steps SAVE_STEPS + Save checkpoint every X updates steps. Should be an + integer or a float in range `[0,1)`. If smaller than + 1, will be interpreted as ratio of total training + steps. (default: 500) + --save_total_limit SAVE_TOTAL_LIMIT + If a value is passed, will limit the total amount of + checkpoints. Deletes the older checkpoints in + `output_dir`. When `load_best_model_at_end` is + enabled, the 'best' checkpoint according to + `metric_for_best_model` will always be retained in + addition to the most recent ones. For example, for + `save_total_limit=5` and + `load_best_model_at_end=True`, the four last + checkpoints will always be retained alongside the best + model. When `save_total_limit=1` and + `load_best_model_at_end=True`, it is possible that two + checkpoints are saved: the last one and the best one + (if they are different). Default is unlimited + checkpoints (default: None) + --save_safetensors [SAVE_SAFETENSORS] + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: True) + --no_save_safetensors + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: False) + --save_on_each_node [SAVE_ON_EACH_NODE] + When doing multi-node distributed training, whether to + save models and checkpoints on each node, or only on + the main one (default: False) + --save_only_model [SAVE_ONLY_MODEL] + When checkpointing, whether to only save the model, or + also the optimizer, scheduler & rng state.Note that + when this is true, you won't be able to resume + training from checkpoint.This enables you to save + storage by not storing the optimizer, scheduler & rng + state.You can only load the model using + from_pretrained with this option set to True. + (default: False) + --restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT] + Whether to restore the callback states from the + checkpoint. If `True`, will override callbacks passed + to the `Trainer` if they exist in the checkpoint. + (default: False) + --no_cuda [NO_CUDA] This argument is deprecated. It will be removed in + version 5.0 of 🤗 Transformers. (default: False) + --use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will + use cuda/tpu/mps/npu device if available. (default: + False) + --use_mps_device [USE_MPS_DEVICE] + This argument is deprecated. `mps` device will be used + if available similar to `cuda` device. It will be + removed in version 5.0 of 🤗 Transformers (default: + False) + --seed SEED Random seed that will be set at the beginning of + training. (default: 42) + --data_seed DATA_SEED + Random seed to be used with data samplers. (default: + None) + --jit_mode_eval [JIT_MODE_EVAL] + Whether or not to use PyTorch jit trace for inference + (default: False) + --use_ipex [USE_IPEX] + Use Intel extension for PyTorch when it is available, + installation: 'https://github.com/intel/intel- + extension-for-pytorch' (default: False) + --bf16 [BF16] Whether to use bf16 (mixed) precision instead of + 32-bit. Requires Ampere or higher NVIDIA architecture + or using CPU (use_cpu) or Ascend NPU. This is an + experimental API and it may change. (default: False) + --fp16 [FP16] Whether to use fp16 (mixed) precision instead of + 32-bit (default: False) + --fp16_opt_level FP16_OPT_LEVEL + For fp16: Apex AMP optimization level selected in + ['O0', 'O1', 'O2', and 'O3']. See details at + https://nvidia.github.io/apex/amp.html (default: O1) + --half_precision_backend {auto,apex,cpu_amp} + The backend to be used for half precision. (default: + auto) + --bf16_full_eval [BF16_FULL_EVAL] + Whether to use full bfloat16 evaluation instead of + 32-bit. This is an experimental API and it may change. + (default: False) + --fp16_full_eval [FP16_FULL_EVAL] + Whether to use full float16 evaluation instead of + 32-bit (default: False) + --tf32 TF32 Whether to enable tf32 mode, available in Ampere and + newer GPU architectures. This is an experimental API + and it may change. (default: None) + --local_rank LOCAL_RANK + For distributed training: local_rank (default: -1) + --ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl} + The backend to be used for distributed training + (default: None) + --tpu_num_cores TPU_NUM_CORES + TPU: Number of TPU cores (automatically passed by + launcher script) (default: None) + --tpu_metrics_debug [TPU_METRICS_DEBUG] + Deprecated, the use of `--debug tpu_metrics_debug` is + preferred. TPU: Whether to print debug metrics + (default: False) + --debug DEBUG [DEBUG ...] + Whether or not to enable debug mode. Current options: + `underflow_overflow` (Detect underflow and overflow in + activations and weights), `tpu_metrics_debug` (print + debug metrics on TPU). (default: None) + --dataloader_drop_last [DATALOADER_DROP_LAST] + Drop the last incomplete batch if it is not divisible + by the batch size. (default: False) + --eval_steps EVAL_STEPS + Run an evaluation every X steps. Should be an integer + or a float in range `[0,1)`. If smaller than 1, will + be interpreted as ratio of total training steps. + (default: None) + --dataloader_num_workers DATALOADER_NUM_WORKERS + Number of subprocesses to use for data loading + (PyTorch only). 0 means that the data will be loaded + in the main process. (default: 0) + --dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR + Number of batches loaded in advance by each worker. 2 + means there will be a total of 2 * num_workers batches + prefetched across all workers. Default is 2 for + PyTorch < 2.0.0 and otherwise None. (default: None) + --past_index PAST_INDEX + If >=0, uses the corresponding part of the output as + the past state for next step. (default: -1) + --run_name RUN_NAME An optional descriptor for the run. Notably used for + wandb, mlflow and comet logging. (default: None) + --disable_tqdm DISABLE_TQDM + Whether or not to disable the tqdm progress bars. + (default: None) + --remove_unused_columns [REMOVE_UNUSED_COLUMNS] + Remove columns not required by the model when using an + nlp.Dataset. (default: True) + --no_remove_unused_columns + Remove columns not required by the model when using an + nlp.Dataset. (default: False) + --label_names LABEL_NAMES [LABEL_NAMES ...] + The list of keys in your dictionary of inputs that + correspond to the labels. (default: None) + --load_best_model_at_end [LOAD_BEST_MODEL_AT_END] + Whether or not to load the best model found during + training at the end of training. When this option is + enabled, the best checkpoint will always be saved. See + `save_total_limit` for more. (default: False) + --metric_for_best_model METRIC_FOR_BEST_MODEL + The metric to use to compare two different models. + (default: None) + --greater_is_better GREATER_IS_BETTER + Whether the `metric_for_best_model` should be + maximized or not. (default: None) + --ignore_data_skip [IGNORE_DATA_SKIP] + When resuming training, whether or not to skip the + first epochs and batches to get to the same training + data. (default: False) + --fsdp FSDP Whether or not to use PyTorch Fully Sharded Data + Parallel (FSDP) training (in distributed training + only). The base option should be `full_shard`, + `shard_grad_op` or `no_shard` and you can add CPU- + offload to `full_shard` or `shard_grad_op` like this: + full_shard offload` or `shard_grad_op offload`. You + can add auto-wrap to `full_shard` or `shard_grad_op` + with the same syntax: full_shard auto_wrap` or + `shard_grad_op auto_wrap`. (default: ) + --fsdp_min_num_params FSDP_MIN_NUM_PARAMS + This parameter is deprecated. FSDP's minimum number of + parameters for Default Auto Wrapping. (useful only + when `fsdp` field is passed). (default: 0) + --fsdp_config FSDP_CONFIG + Config to be used with FSDP (Pytorch Fully Sharded + Data Parallel). The value is either a fsdp json config + file (e.g., `fsdp_config.json`) or an already loaded + json file as `dict`. (default: None) + --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP + This parameter is deprecated. Transformer layer class + name (case-sensitive) to wrap, e.g, `BertLayer`, + `GPTJBlock`, `T5Block` .... (useful only when `fsdp` + flag is passed). (default: None) + --accelerator_config ACCELERATOR_CONFIG + Config to be used with the internal Accelerator object + initializtion. The value is either a accelerator json + config file (e.g., `accelerator_config.json`) or an + already loaded json file as `dict`. (default: None) + --deepspeed DEEPSPEED + Enable deepspeed and pass the path to deepspeed json + config file (e.g. `ds_config.json`) or an already + loaded json file as a dict (default: None) + --label_smoothing_factor LABEL_SMOOTHING_FACTOR + The label smoothing epsilon to apply (zero means no + label smoothing). (default: 0.0) + --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo} + The optimizer to use. (default: adamw_torch) + --optim_args OPTIM_ARGS + Optional arguments to supply to optimizer. (default: + None) + --adafactor [ADAFACTOR] + Whether or not to replace AdamW by Adafactor. + (default: False) + --group_by_length [GROUP_BY_LENGTH] + Whether or not to group samples of roughly the same + length together when batching. (default: False) + --length_column_name LENGTH_COLUMN_NAME + Column name with precomputed lengths to use when + grouping by length. (default: length) + --report_to REPORT_TO + The list of integrations to report the results and + logs to. (default: None) + --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS + When using distributed training, the value of the flag + `find_unused_parameters` passed to + `DistributedDataParallel`. (default: None) + --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB + When using distributed training, the value of the flag + `bucket_cap_mb` passed to `DistributedDataParallel`. + (default: None) + --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS + When using distributed training, the value of the flag + `broadcast_buffers` passed to + `DistributedDataParallel`. (default: None) + --dataloader_pin_memory [DATALOADER_PIN_MEMORY] + Whether or not to pin memory for DataLoader. (default: + True) + --no_dataloader_pin_memory + Whether or not to pin memory for DataLoader. (default: + False) + --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS] + If True, the data loader will not shut down the worker + processes after a dataset has been consumed once. This + allows to maintain the workers Dataset instances + alive. Can potentially speed up training, but will + increase RAM usage. (default: False) + --skip_memory_metrics [SKIP_MEMORY_METRICS] + Whether or not to skip adding of memory profiler + reports to metrics. (default: True) + --no_skip_memory_metrics + Whether or not to skip adding of memory profiler + reports to metrics. (default: False) + --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP] + Whether or not to use the legacy prediction_loop in + the Trainer. (default: False) + --push_to_hub [PUSH_TO_HUB] + Whether or not to upload the trained model to the + model hub after training. (default: False) + --resume_from_checkpoint RESUME_FROM_CHECKPOINT + The path to a folder with a valid checkpoint for your + model. (default: None) + --hub_model_id HUB_MODEL_ID + The name of the repository to keep in sync with the + local `output_dir`. (default: None) + --hub_strategy {end,every_save,checkpoint,all_checkpoints} + The hub strategy to use when `--push_to_hub` is + activated. (default: every_save) + --hub_token HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --hub_private_repo [HUB_PRIVATE_REPO] + Whether the model repository is private or not. + (default: False) + --hub_always_push [HUB_ALWAYS_PUSH] + Unless `True`, the Trainer will skip pushes if the + previous one wasn't finished yet. (default: False) + --gradient_checkpointing [GRADIENT_CHECKPOINTING] + If True, use gradient checkpointing to save memory at + the expense of slower backward pass. (default: False) + --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS + Gradient checkpointing key word arguments such as + `use_reentrant`. Will be passed to + `torch.utils.checkpoint.checkpoint` through + `model.gradient_checkpointing_enable`. (default: None) + --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS] + Whether or not the inputs will be passed to the + `compute_metrics` function. (default: False) + --eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES] + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: True) + --no_eval_do_concat_batches + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: False) + --fp16_backend {auto,apex,cpu_amp} + Deprecated. Use half_precision_backend instead + (default: auto) + --evaluation_strategy {no,steps,epoch} + Deprecated. Use `eval_strategy` instead (default: + None) + --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID + The name of the repository to which push the + `Trainer`. (default: None) + --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION + The name of the organization in with to which push the + `Trainer`. (default: None) + --push_to_hub_token PUSH_TO_HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --mp_parameters MP_PARAMETERS + Used by the SageMaker launcher to send mp-specific + args. Ignored in Trainer (default: ) + --auto_find_batch_size [AUTO_FIND_BATCH_SIZE] + Whether to automatically decrease the batch size in + half and rerun the training loop again each time a + CUDA Out-of-Memory was reached (default: False) + --full_determinism [FULL_DETERMINISM] + Whether to call enable_full_determinism instead of + set_seed for reproducibility in distributed training. + Important: this will negatively impact the + performance, so only use it for debugging. (default: + False) + --torchdynamo TORCHDYNAMO + This argument is deprecated, use + `--torch_compile_backend` instead. (default: None) + --ray_scope RAY_SCOPE + The scope to use when doing hyperparameter search with + Ray. By default, `"last"` will be used. Ray will then + use the last checkpoint of all trials, compare those, + and select the best one. However, other options are + also available. See the Ray documentation (https://doc + s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun + e.ExperimentAnalysis.get_best_trial) for more options. + (default: last) + --ddp_timeout DDP_TIMEOUT + Overrides the default timeout for distributed training + (value should be given in seconds). (default: 1800) + --torch_compile [TORCH_COMPILE] + If set to `True`, the model will be wrapped in + `torch.compile`. (default: False) + --torch_compile_backend TORCH_COMPILE_BACKEND + Which backend to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --torch_compile_mode TORCH_COMPILE_MODE + Which mode to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --dispatch_batches DISPATCH_BATCHES + Deprecated. Pass {'dispatch_batches':VALUE} to + `accelerator_config`. (default: None) + --split_batches SPLIT_BATCHES + Deprecated. Pass {'split_batches':True} to + `accelerator_config`. (default: None) + --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND] + If set to `True`, the speed metrics will include `tgs` + (tokens per second per device). (default: False) + --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN] + If set to `True`, will track the number of input + tokens seen throughout training. (May be slower in + distributed training) (default: False) + --neftune_noise_alpha NEFTUNE_NOISE_ALPHA + Activates neftune noise embeddings into the model. + NEFTune has been proven to drastically improve model + performances for instrcution fine-tuning. Check out + the original paper here: + https://arxiv.org/abs/2310.05914 and the original code + here: https://github.com/neelsjain/NEFTune. Only + supported for `PreTrainedModel` and `PeftModel` + classes. (default: None) + --optim_target_modules OPTIM_TARGET_MODULES + Target modules for the optimizer defined in the + `optim` argument. Only used for the GaLore optimizer + at the moment. (default: None) + --batch_eval_metrics [BATCH_EVAL_METRICS] + Break eval metrics calculation into batches to save + memory. (default: False) + --eval_on_start [EVAL_ON_START] + Whether to run through the entire `evaluation` step at + the very beginning of training as a sanity check. + (default: False) + --eval_use_gather_object [EVAL_USE_GATHER_OBJECT] + Whether to run recursively gather object in a nested + list/tuple/dictionary of objects from all devices. + (default: False) + --sortish_sampler [SORTISH_SAMPLER] + Whether to use SortishSampler or not. (default: False) + --predict_with_generate [PREDICT_WITH_GENERATE] + Whether to use generate to calculate generative + metrics (ROUGE, BLEU). (default: False) + --generation_max_length GENERATION_MAX_LENGTH + The `max_length` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `max_length` value of the model configuration. + (default: None) + --generation_num_beams GENERATION_NUM_BEAMS + The `num_beams` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `num_beams` value of the model configuration. + (default: None) + --generation_config GENERATION_CONFIG + Model id, file path or url pointing to a + GenerationConfig json file, to use during prediction. + (default: None) + --use_badam [USE_BADAM] + Whether or not to use the BAdam optimizer. (default: + False) + --badam_mode {layer,ratio} + Whether to use layer-wise or ratio-wise BAdam + optimizer. (default: layer) + --badam_start_block BADAM_START_BLOCK + The starting block index for layer-wise BAdam. + (default: None) + --badam_switch_mode {ascending,descending,random,fixed} + the strategy of picking block to update for layer-wise + BAdam. (default: ascending) + --badam_switch_interval BADAM_SWITCH_INTERVAL + Number of steps to update the block for layer-wise + BAdam. Use -1 to disable the block update. (default: + 50) + --badam_update_ratio BADAM_UPDATE_RATIO + The ratio of the update for ratio-wise BAdam. + (default: 0.05) + --badam_mask_mode {adjacent,scatter} + The mode of the mask for BAdam optimizer. `adjacent` + means that the trainable parameters are adjacent to + each other, `scatter` means that trainable parameters + are randomly choosed from the weight. (default: + adjacent) + --badam_verbose BADAM_VERBOSE + The verbosity level of BAdam optimizer. 0 for no + print, 1 for print the block prefix, 2 for print + trainable parameters. (default: 0) + --use_galore [USE_GALORE] + Whether or not to use the gradient low-Rank projection + (GaLore). (default: False) + --galore_target GALORE_TARGET + Name(s) of modules to apply GaLore. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --galore_rank GALORE_RANK + The rank of GaLore gradients. (default: 16) + --galore_update_interval GALORE_UPDATE_INTERVAL + Number of steps to update the GaLore projection. + (default: 200) + --galore_scale GALORE_SCALE + GaLore scaling coefficient. (default: 0.25) + --galore_proj_type {std,reverse_std,right,left,full} + Type of GaLore projection. (default: std) + --galore_layerwise [GALORE_LAYERWISE] + Whether or not to enable layer-wise update to further + save memory. (default: False) + --pref_beta PREF_BETA + The beta parameter in the preference loss. (default: + 0.1) + --pref_ftx PREF_FTX The supervised fine-tuning loss coefficient in DPO + training. (default: 0.0) + --pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo} + The type of DPO loss to use. (default: sigmoid) + --dpo_label_smoothing DPO_LABEL_SMOOTHING + The robust DPO label smoothing parameter in cDPO that + should be between 0 and 0.5. (default: 0.0) + --kto_chosen_weight KTO_CHOSEN_WEIGHT + The weight factor of the desirable losses in KTO + training. (default: 1.0) + --kto_rejected_weight KTO_REJECTED_WEIGHT + The weight factor of the undesirable losses in KTO + training. (default: 1.0) + --simpo_gamma SIMPO_GAMMA + The target reward margin term in SimPO loss. (default: + 0.5) + --ppo_buffer_size PPO_BUFFER_SIZE + The number of mini-batches to make experience buffer + in a PPO optimization step. (default: 1) + --ppo_epochs PPO_EPOCHS + The number of epochs to perform in a PPO optimization + step. (default: 4) + --ppo_score_norm [PPO_SCORE_NORM] + Use score normalization in PPO training. (default: + False) + --ppo_target PPO_TARGET + Target KL value for adaptive KL control in PPO + training. (default: 6.0) + --ppo_whiten_rewards [PPO_WHITEN_REWARDS] + Whiten the rewards before compute advantages in PPO + training. (default: False) + --ref_model REF_MODEL + Path to the reference model used for the PPO or DPO + training. (default: None) + --ref_model_adapters REF_MODEL_ADAPTERS + Path to the adapters of the reference model. (default: + None) + --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reference model. + (default: None) + --reward_model REWARD_MODEL + Path to the reward model used for the PPO training. + (default: None) + --reward_model_adapters REWARD_MODEL_ADAPTERS + Path to the adapters of the reward model. (default: + None) + --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reward model. + (default: None) + --reward_model_type {lora,full,api} + The type of the reward model in PPO training. Lora + model only supports lora training. (default: lora) + --additional_target ADDITIONAL_TARGET + Name(s) of modules apart from LoRA layers to be set as + trainable and saved in the final checkpoint. Use + commas to separate multiple modules. (default: None) + --lora_alpha LORA_ALPHA + The scale factor for LoRA fine-tuning (default: + lora_rank * 2). (default: None) + --lora_dropout LORA_DROPOUT + Dropout rate for the LoRA fine-tuning. (default: 0.0) + --lora_rank LORA_RANK + The intrinsic dimension for LoRA fine-tuning. + (default: 8) + --lora_target LORA_TARGET + Name(s) of target modules to apply LoRA. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --loraplus_lr_ratio LORAPLUS_LR_RATIO + LoRA plus learning rate ratio (lr_B / lr_A). (default: + None) + --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING + LoRA plus learning rate for lora embedding layers. + (default: 1e-06) + --use_rslora [USE_RSLORA] + Whether or not to use the rank stabilization scaling + factor for LoRA layer. (default: False) + --use_dora [USE_DORA] + Whether or not to use the weight-decomposed lora + method (DoRA). (default: False) + --pissa_init [PISSA_INIT] + Whether or not to initialize a PiSSA adapter. + (default: False) + --pissa_iter PISSA_ITER + The number of iteration steps performed by FSVD in + PiSSA. Use -1 to disable it. (default: 16) + --pissa_convert [PISSA_CONVERT] + Whether or not to convert the PiSSA adapter to a + normal LoRA adapter. (default: False) + --create_new_adapter [CREATE_NEW_ADAPTER] + Whether or not to create a new adapter with randomly + initialized weight. (default: False) + --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS + The number of trainable layers for freeze (partial- + parameter) fine-tuning. Positive numbers mean the last + n layers are set as trainable, negative numbers mean + the first n layers are set as trainable. (default: 2) + --freeze_trainable_modules FREEZE_TRAINABLE_MODULES + Name(s) of trainable modules for freeze (partial- + parameter) fine-tuning. Use commas to separate + multiple modules. Use `all` to specify all the + available modules. (default: all) + --freeze_extra_modules FREEZE_EXTRA_MODULES + Name(s) of modules apart from hidden layers to be set + as trainable for freeze (partial-parameter) fine- + tuning. Use commas to separate multiple modules. + (default: None) + --pure_bf16 [PURE_BF16] + Whether or not to train model in purely bf16 precision + (without AMP). (default: False) + --stage {pt,sft,rm,ppo,dpo,kto} + Which stage will be performed in training. (default: + sft) + --finetuning_type {lora,freeze,full} + Which fine-tuning method to use. (default: lora) + --use_llama_pro [USE_LLAMA_PRO] + Whether or not to make only the parameters in the + expanded blocks trainable. (default: False) + --use_adam_mini [USE_ADAM_MINI] + Whether or not to use the Adam-mini optimizer. + (default: False) + --freeze_vision_tower [FREEZE_VISION_TOWER] + Whether ot not to freeze vision tower in MLLM + training. (default: True) + --no_freeze_vision_tower + Whether ot not to freeze vision tower in MLLM + training. (default: False) + --train_mm_proj_only [TRAIN_MM_PROJ_ONLY] + Whether or not to train the multimodal projector for + MLLM only. (default: False) + --compute_accuracy [COMPUTE_ACCURACY] + Whether or not to compute the token-level accuracy at + evaluation. (default: False) + --plot_loss [PLOT_LOSS] + Whether or not to save the training loss curves. + (default: False) + --do_sample [DO_SAMPLE] + Whether or not to use sampling, use greedy decoding + otherwise. (default: True) + --no_do_sample Whether or not to use sampling, use greedy decoding + otherwise. (default: False) + --temperature TEMPERATURE + The value used to modulate the next token + probabilities. (default: 0.95) + --top_p TOP_P The smallest set of most probable tokens with + probabilities that add up to top_p or higher are kept. + (default: 0.7) + --top_k TOP_K The number of highest probability vocabulary tokens to + keep for top-k filtering. (default: 50) + --num_beams NUM_BEAMS + Number of beams for beam search. 1 means no beam + search. (default: 1) + --max_length MAX_LENGTH + The maximum length the generated tokens can have. It + can be overridden by max_new_tokens. (default: 1024) + --max_new_tokens MAX_NEW_TOKENS + The maximum numbers of tokens to generate, ignoring + the number of tokens in the prompt. (default: 1024) + --repetition_penalty REPETITION_PENALTY + The parameter for repetition penalty. 1.0 means no + penalty. (default: 1.0) + --length_penalty LENGTH_PENALTY + Exponential penalty to the length that is used with + beam-based generation. (default: 1.0) + --default_system DEFAULT_SYSTEM + Default system message to use in chat completion. + (default: None) +usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH + [--adapter_name_or_path ADAPTER_NAME_OR_PATH] + [--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR] + [--use_fast_tokenizer [USE_FAST_TOKENIZER]] + [--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]] + [--split_special_tokens [SPLIT_SPECIAL_TOKENS]] + [--new_special_tokens NEW_SPECIAL_TOKENS] + [--model_revision MODEL_REVISION] + [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]] + [--no_low_cpu_mem_usage] + [--quantization_method {bitsandbytes,hqq,eetq}] + [--quantization_bit QUANTIZATION_BIT] + [--quantization_type {fp4,nf4}] + [--double_quantization [DOUBLE_QUANTIZATION]] + [--no_double_quantization] + [--quantization_device_map {auto}] + [--rope_scaling {linear,dynamic}] + [--flash_attn {auto,disabled,sdpa,fa2}] + [--shift_attn [SHIFT_ATTN]] + [--mixture_of_depths {convert,load}] + [--use_unsloth [USE_UNSLOTH]] + [--visual_inputs [VISUAL_INPUTS]] + [--moe_aux_loss_coef MOE_AUX_LOSS_COEF] + [--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]] + [--upcast_layernorm [UPCAST_LAYERNORM]] + [--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]] + [--train_from_scratch [TRAIN_FROM_SCRATCH]] + [--infer_backend {huggingface,vllm}] + [--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL] + [--vllm_enforce_eager [VLLM_ENFORCE_EAGER]] + [--vllm_max_lora_rank VLLM_MAX_LORA_RANK] + [--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]] + [--no_use_cache] + [--infer_dtype {auto,float16,bfloat16,float32}] + [--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN] + [--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE] + [--export_device {cpu,auto}] + [--export_quantization_bit EXPORT_QUANTIZATION_BIT] + [--export_quantization_dataset EXPORT_QUANTIZATION_DATASET] + [--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES] + [--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN] + [--export_legacy_format [EXPORT_LEGACY_FORMAT]] + [--export_hub_model_id EXPORT_HUB_MODEL_ID] + [--print_param_status [PRINT_PARAM_STATUS]] + [--template TEMPLATE] [--dataset DATASET] + [--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR] + [--cutoff_len CUTOFF_LEN] + [--train_on_prompt [TRAIN_ON_PROMPT]] + [--mask_history [MASK_HISTORY]] [--streaming [STREAMING]] + [--buffer_size BUFFER_SIZE] + [--mix_strategy {concat,interleave_under,interleave_over}] + [--interleave_probs INTERLEAVE_PROBS] + [--overwrite_cache [OVERWRITE_CACHE]] + [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS] + [--max_samples MAX_SAMPLES] + [--eval_num_beams EVAL_NUM_BEAMS] + [--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]] + [--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE] + [--packing PACKING] [--neat_packing [NEAT_PACKING]] + [--tool_format TOOL_FORMAT] + [--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR + [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] + [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] + [--do_predict [DO_PREDICT]] + [--eval_strategy {no,steps,epoch}] + [--prediction_loss_only [PREDICTION_LOSS_ONLY]] + [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] + [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE] + [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] + [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] + [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] + [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] + [--eval_delay EVAL_DELAY] + [--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS] + [--learning_rate LEARNING_RATE] + [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] + [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] + [--max_grad_norm MAX_GRAD_NORM] + [--num_train_epochs NUM_TRAIN_EPOCHS] + [--max_steps MAX_STEPS] + [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}] + [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] + [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS] + [--log_level {detail,debug,info,warning,error,critical,passive}] + [--log_level_replica {detail,debug,info,warning,error,critical,passive}] + [--log_on_each_node [LOG_ON_EACH_NODE]] + [--no_log_on_each_node] [--logging_dir LOGGING_DIR] + [--logging_strategy {no,steps,epoch}] + [--logging_first_step [LOGGING_FIRST_STEP]] + [--logging_steps LOGGING_STEPS] + [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] + [--no_logging_nan_inf_filter] + [--save_strategy {no,steps,epoch}] + [--save_steps SAVE_STEPS] + [--save_total_limit SAVE_TOTAL_LIMIT] + [--save_safetensors [SAVE_SAFETENSORS]] + [--no_save_safetensors] + [--save_on_each_node [SAVE_ON_EACH_NODE]] + [--save_only_model [SAVE_ONLY_MODEL]] + [--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]] + [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]] + [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] + [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] + [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] + [--fp16_opt_level FP16_OPT_LEVEL] + [--half_precision_backend {auto,apex,cpu_amp}] + [--bf16_full_eval [BF16_FULL_EVAL]] + [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] + [--local_rank LOCAL_RANK] + [--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}] + [--tpu_num_cores TPU_NUM_CORES] + [--tpu_metrics_debug [TPU_METRICS_DEBUG]] + [--debug DEBUG [DEBUG ...]] + [--dataloader_drop_last [DATALOADER_DROP_LAST]] + [--eval_steps EVAL_STEPS] + [--dataloader_num_workers DATALOADER_NUM_WORKERS] + [--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR] + [--past_index PAST_INDEX] [--run_name RUN_NAME] + [--disable_tqdm DISABLE_TQDM] + [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] + [--no_remove_unused_columns] + [--label_names LABEL_NAMES [LABEL_NAMES ...]] + [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] + [--metric_for_best_model METRIC_FOR_BEST_MODEL] + [--greater_is_better GREATER_IS_BETTER] + [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP] + [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] + [--fsdp_config FSDP_CONFIG] + [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] + [--accelerator_config ACCELERATOR_CONFIG] + [--deepspeed DEEPSPEED] + [--label_smoothing_factor LABEL_SMOOTHING_FACTOR] + [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}] + [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] + [--group_by_length [GROUP_BY_LENGTH]] + [--length_column_name LENGTH_COLUMN_NAME] + [--report_to REPORT_TO] + [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] + [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] + [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] + [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] + [--no_dataloader_pin_memory] + [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] + [--skip_memory_metrics [SKIP_MEMORY_METRICS]] + [--no_skip_memory_metrics] + [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] + [--push_to_hub [PUSH_TO_HUB]] + [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] + [--hub_model_id HUB_MODEL_ID] + [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] + [--hub_token HUB_TOKEN] + [--hub_private_repo [HUB_PRIVATE_REPO]] + [--hub_always_push [HUB_ALWAYS_PUSH]] + [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] + [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS] + [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] + [--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]] + [--no_eval_do_concat_batches] + [--fp16_backend {auto,apex,cpu_amp}] + [--evaluation_strategy {no,steps,epoch}] + [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID] + [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] + [--push_to_hub_token PUSH_TO_HUB_TOKEN] + [--mp_parameters MP_PARAMETERS] + [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] + [--full_determinism [FULL_DETERMINISM]] + [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] + [--ddp_timeout DDP_TIMEOUT] + [--torch_compile [TORCH_COMPILE]] + [--torch_compile_backend TORCH_COMPILE_BACKEND] + [--torch_compile_mode TORCH_COMPILE_MODE] + [--dispatch_batches DISPATCH_BATCHES] + [--split_batches SPLIT_BATCHES] + [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] + [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]] + [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] + [--optim_target_modules OPTIM_TARGET_MODULES] + [--batch_eval_metrics [BATCH_EVAL_METRICS]] + [--eval_on_start [EVAL_ON_START]] + [--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]] + [--sortish_sampler [SORTISH_SAMPLER]] + [--predict_with_generate [PREDICT_WITH_GENERATE]] + [--generation_max_length GENERATION_MAX_LENGTH] + [--generation_num_beams GENERATION_NUM_BEAMS] + [--generation_config GENERATION_CONFIG] + [--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}] + [--badam_start_block BADAM_START_BLOCK] + [--badam_switch_mode {ascending,descending,random,fixed}] + [--badam_switch_interval BADAM_SWITCH_INTERVAL] + [--badam_update_ratio BADAM_UPDATE_RATIO] + [--badam_mask_mode {adjacent,scatter}] + [--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]] + [--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK] + [--galore_update_interval GALORE_UPDATE_INTERVAL] + [--galore_scale GALORE_SCALE] + [--galore_proj_type {std,reverse_std,right,left,full}] + [--galore_layerwise [GALORE_LAYERWISE]] + [--pref_beta PREF_BETA] [--pref_ftx PREF_FTX] + [--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}] + [--dpo_label_smoothing DPO_LABEL_SMOOTHING] + [--kto_chosen_weight KTO_CHOSEN_WEIGHT] + [--kto_rejected_weight KTO_REJECTED_WEIGHT] + [--simpo_gamma SIMPO_GAMMA] + [--ppo_buffer_size PPO_BUFFER_SIZE] + [--ppo_epochs PPO_EPOCHS] + [--ppo_score_norm [PPO_SCORE_NORM]] + [--ppo_target PPO_TARGET] + [--ppo_whiten_rewards [PPO_WHITEN_REWARDS]] + [--ref_model REF_MODEL] + [--ref_model_adapters REF_MODEL_ADAPTERS] + [--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT] + [--reward_model REWARD_MODEL] + [--reward_model_adapters REWARD_MODEL_ADAPTERS] + [--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT] + [--reward_model_type {lora,full,api}] + [--additional_target ADDITIONAL_TARGET] + [--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT] + [--lora_rank LORA_RANK] [--lora_target LORA_TARGET] + [--loraplus_lr_ratio LORAPLUS_LR_RATIO] + [--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING] + [--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]] + [--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER] + [--pissa_convert [PISSA_CONVERT]] + [--create_new_adapter [CREATE_NEW_ADAPTER]] + [--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS] + [--freeze_trainable_modules FREEZE_TRAINABLE_MODULES] + [--freeze_extra_modules FREEZE_EXTRA_MODULES] + [--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}] + [--finetuning_type {lora,freeze,full}] + [--use_llama_pro [USE_LLAMA_PRO]] + [--use_adam_mini [USE_ADAM_MINI]] + [--freeze_vision_tower [FREEZE_VISION_TOWER]] + [--no_freeze_vision_tower] + [--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]] + [--compute_accuracy [COMPUTE_ACCURACY]] + [--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]] + [--no_do_sample] [--temperature TEMPERATURE] + [--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS] + [--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS] + [--repetition_penalty REPETITION_PENALTY] + [--length_penalty LENGTH_PENALTY] + [--default_system DEFAULT_SYSTEM] + +optional arguments: + -h, --help show this help message and exit + --model_name_or_path MODEL_NAME_OR_PATH + Path to the model weight or identifier from + huggingface.co/models or modelscope.cn/models. + (default: None) + --adapter_name_or_path ADAPTER_NAME_OR_PATH + Path to the adapter weight or identifier from + huggingface.co/models. Use commas to separate multiple + adapters. (default: None) + --adapter_folder ADAPTER_FOLDER + The folder containing the adapter weights to load. + (default: None) + --cache_dir CACHE_DIR + Where to store the pre-trained models downloaded from + huggingface.co or modelscope.cn. (default: None) + --use_fast_tokenizer [USE_FAST_TOKENIZER] + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: True) + --no_use_fast_tokenizer + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: False) + --resize_vocab [RESIZE_VOCAB] + Whether or not to resize the tokenizer vocab and the + embedding layers. (default: False) + --split_special_tokens [SPLIT_SPECIAL_TOKENS] + Whether or not the special tokens should be split + during the tokenization process. (default: False) + --new_special_tokens NEW_SPECIAL_TOKENS + Special tokens to be added into the tokenizer. Use + commas to separate multiple tokens. (default: None) + --model_revision MODEL_REVISION + The specific model version to use (can be a branch + name, tag name or commit id). (default: main) + --low_cpu_mem_usage [LOW_CPU_MEM_USAGE] + Whether or not to use memory-efficient model loading. + (default: True) + --no_low_cpu_mem_usage + Whether or not to use memory-efficient model loading. + (default: False) + --quantization_method {bitsandbytes,hqq,eetq} + Quantization method to use for on-the-fly + quantization. (default: bitsandbytes) + --quantization_bit QUANTIZATION_BIT + The number of bits to quantize the model using + bitsandbytes. (default: None) + --quantization_type {fp4,nf4} + Quantization data type to use in int4 training. + (default: nf4) + --double_quantization [DOUBLE_QUANTIZATION] + Whether or not to use double quantization in int4 + training. (default: True) + --no_double_quantization + Whether or not to use double quantization in int4 + training. (default: False) + --quantization_device_map {auto} + Device map used to infer the 4-bit quantized model, + needs bitsandbytes>=0.43.0. (default: None) + --rope_scaling {linear,dynamic} + Which scaling strategy should be adopted for the RoPE + embeddings. (default: None) + --flash_attn {auto,disabled,sdpa,fa2} + Enable FlashAttention for faster training and + inference. (default: auto) + --shift_attn [SHIFT_ATTN] + Enable shift short attention (S^2-Attn) proposed by + LongLoRA. (default: False) + --mixture_of_depths {convert,load} + Convert the model to mixture-of-depths (MoD) or load + the MoD model. (default: None) + --use_unsloth [USE_UNSLOTH] + Whether or not to use unsloth's optimization for the + LoRA training. (default: False) + --visual_inputs [VISUAL_INPUTS] + Whethor or not to use multimodal LLM that accepts + visual inputs. (default: False) + --moe_aux_loss_coef MOE_AUX_LOSS_COEF + Coefficient of the auxiliary router loss in mixture- + of-experts model. (default: None) + --disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING] + Whether or not to disable gradient checkpointing. + (default: False) + --upcast_layernorm [UPCAST_LAYERNORM] + Whether or not to upcast the layernorm weights in + fp32. (default: False) + --upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT] + Whether or not to upcast the output of lm_head in + fp32. (default: False) + --train_from_scratch [TRAIN_FROM_SCRATCH] + Whether or not to randomly initialize the model + weights. (default: False) + --infer_backend {huggingface,vllm} + Backend engine used at inference. (default: + huggingface) + --vllm_maxlen VLLM_MAXLEN + Maximum sequence (prompt + response) length of the + vLLM engine. (default: 2048) + --vllm_gpu_util VLLM_GPU_UTIL + The fraction of GPU memory in (0,1) to be used for the + vLLM engine. (default: 0.9) + --vllm_enforce_eager [VLLM_ENFORCE_EAGER] + Whether or not to disable CUDA graph in the vLLM + engine. (default: False) + --vllm_max_lora_rank VLLM_MAX_LORA_RANK + Maximum rank of all LoRAs in the vLLM engine. + (default: 32) + --offload_folder OFFLOAD_FOLDER + Path to offload model weights. (default: offload) + --use_cache [USE_CACHE] + Whether or not to use KV cache in generation. + (default: True) + --no_use_cache Whether or not to use KV cache in generation. + (default: False) + --infer_dtype {auto,float16,bfloat16,float32} + Data type for model weights and activations at + inference. (default: auto) + --hf_hub_token HF_HUB_TOKEN + Auth token to log in with Hugging Face Hub. (default: + None) + --ms_hub_token MS_HUB_TOKEN + Auth token to log in with ModelScope Hub. (default: + None) + --export_dir EXPORT_DIR + Path to the directory to save the exported model. + (default: None) + --export_size EXPORT_SIZE + The file shard size (in GB) of the exported model. + (default: 1) + --export_device {cpu,auto} + The device used in model export, use `auto` to + accelerate exporting. (default: cpu) + --export_quantization_bit EXPORT_QUANTIZATION_BIT + The number of bits to quantize the exported model. + (default: None) + --export_quantization_dataset EXPORT_QUANTIZATION_DATASET + Path to the dataset or dataset name to use in + quantizing the exported model. (default: None) + --export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES + The number of samples used for quantization. (default: + 128) + --export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN + The maximum length of the model inputs used for + quantization. (default: 1024) + --export_legacy_format [EXPORT_LEGACY_FORMAT] + Whether or not to save the `.bin` files instead of + `.safetensors`. (default: False) + --export_hub_model_id EXPORT_HUB_MODEL_ID + The name of the repository if push the model to the + Hugging Face hub. (default: None) + --print_param_status [PRINT_PARAM_STATUS] + For debugging purposes, print the status of the + parameters in the model. (default: False) + --template TEMPLATE Which template to use for constructing prompts in + training and inference. (default: None) + --dataset DATASET The name of dataset(s) to use for training. Use commas + to separate multiple datasets. (default: None) + --eval_dataset EVAL_DATASET + The name of dataset(s) to use for evaluation. Use + commas to separate multiple datasets. (default: None) + --dataset_dir DATASET_DIR + Path to the folder containing the datasets. (default: + data) + --cutoff_len CUTOFF_LEN + The cutoff length of the tokenized inputs in the + dataset. (default: 1024) + --train_on_prompt [TRAIN_ON_PROMPT] + Whether or not to disable the mask on the prompt. + (default: False) + --mask_history [MASK_HISTORY] + Whether or not to mask the history and train on the + last turn only. (default: False) + --streaming [STREAMING] + Enable dataset streaming. (default: False) + --buffer_size BUFFER_SIZE + Size of the buffer to randomly sample examples from in + dataset streaming. (default: 16384) + --mix_strategy {concat,interleave_under,interleave_over} + Strategy to use in dataset mixing (concat/interleave) + (undersampling/oversampling). (default: concat) + --interleave_probs INTERLEAVE_PROBS + Probabilities to sample data from datasets. Use commas + to separate multiple datasets. (default: None) + --overwrite_cache [OVERWRITE_CACHE] + Overwrite the cached training and evaluation sets. + (default: False) + --preprocessing_num_workers PREPROCESSING_NUM_WORKERS + The number of processes to use for the pre-processing. + (default: None) + --max_samples MAX_SAMPLES + For debugging purposes, truncate the number of + examples for each dataset. (default: None) + --eval_num_beams EVAL_NUM_BEAMS + Number of beams to use for evaluation. This argument + will be passed to `model.generate` (default: None) + --ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS] + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: True) + --no_ignore_pad_token_for_loss + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: False) + --val_size VAL_SIZE Size of the development set, should be an integer or a + float in range `[0,1)`. (default: 0.0) + --packing PACKING Enable sequences packing in training. Will + automatically enable in pre-training. (default: None) + --neat_packing [NEAT_PACKING] + Enable sequence packing without cross-attention. + (default: False) + --tool_format TOOL_FORMAT + Tool format to use for constructing function calling + examples. (default: None) + --tokenized_path TOKENIZED_PATH + Path to save or load the tokenized datasets. (default: + None) + --output_dir OUTPUT_DIR + The output directory where the model predictions and + checkpoints will be written. (default: None) + --overwrite_output_dir [OVERWRITE_OUTPUT_DIR] + Overwrite the content of the output directory. Use + this to continue training if output_dir points to a + checkpoint directory. (default: False) + --do_train [DO_TRAIN] + Whether to run training. (default: False) + --do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False) + --do_predict [DO_PREDICT] + Whether to run predictions on the test set. (default: + False) + --eval_strategy {no,steps,epoch} + The evaluation strategy to use. (default: no) + --prediction_loss_only [PREDICTION_LOSS_ONLY] + When performing evaluation and predictions, only + returns the loss. (default: False) + --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for training. + (default: 8) + --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for + evaluation. (default: 8) + --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE + Deprecated, the use of `--per_device_train_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + training. (default: None) + --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE + Deprecated, the use of `--per_device_eval_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + evaluation. (default: None) + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + Number of updates steps to accumulate before + performing a backward/update pass. (default: 1) + --eval_accumulation_steps EVAL_ACCUMULATION_STEPS + Number of predictions steps to accumulate before + moving the tensors to the CPU. (default: None) + --eval_delay EVAL_DELAY + Number of epochs or steps to wait for before the first + evaluation can be performed, depending on the + eval_strategy. (default: 0) + --torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS + Number of steps to wait before calling + `torch..empty_cache()`.This can help avoid + CUDA out-of-memory errors by lowering peak VRAM usage + at a cost of about [10{'option_strings': ['-- + torch_empty_cache_steps'], 'dest': + 'torch_empty_cache_steps', 'nargs': None, 'const': + None, 'default': None, 'type': 'int', 'choices': None, + 'required': False, 'help': 'Number of steps to wait + before calling `torch..empty_cache()`.This can + help avoid CUDA out-of-memory errors by lowering peak + VRAM usage at a cost of about [10% slower performance] + (https://github.com/huggingface/transformers/issues/31 + 372).If left unset or set to None, cache will not be + emptied.', 'metavar': None, 'container': + , + 'prog': 'launcher.py'}lower performance](https://githu + b.com/huggingface/transformers/issues/31372).If left + unset or set to None, cache will not be emptied. + (default: None) + --learning_rate LEARNING_RATE + The initial learning rate for AdamW. (default: 5e-05) + --weight_decay WEIGHT_DECAY + Weight decay for AdamW if we apply some. (default: + 0.0) + --adam_beta1 ADAM_BETA1 + Beta1 for AdamW optimizer (default: 0.9) + --adam_beta2 ADAM_BETA2 + Beta2 for AdamW optimizer (default: 0.999) + --adam_epsilon ADAM_EPSILON + Epsilon for AdamW optimizer. (default: 1e-08) + --max_grad_norm MAX_GRAD_NORM + Max gradient norm. (default: 1.0) + --num_train_epochs NUM_TRAIN_EPOCHS + Total number of training epochs to perform. (default: + 3.0) + --max_steps MAX_STEPS + If > 0: set total number of training steps to perform. + Override num_train_epochs. (default: -1) + --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay} + The scheduler type to use. (default: linear) + --lr_scheduler_kwargs LR_SCHEDULER_KWARGS + Extra parameters for the lr_scheduler such as + {'num_cycles': 1} for the cosine with hard restarts. + (default: {}) + --warmup_ratio WARMUP_RATIO + Linear warmup over warmup_ratio fraction of total + steps. (default: 0.0) + --warmup_steps WARMUP_STEPS + Linear warmup over warmup_steps. (default: 0) + --log_level {detail,debug,info,warning,error,critical,passive} + Logger log level to use on the main node. Possible + choices are the log levels as strings: 'debug', + 'info', 'warning', 'error' and 'critical', plus a + 'passive' level which doesn't set anything and lets + the application set the level. Defaults to 'passive'. + (default: passive) + --log_level_replica {detail,debug,info,warning,error,critical,passive} + Logger log level to use on replica nodes. Same choices + and defaults as ``log_level`` (default: warning) + --log_on_each_node [LOG_ON_EACH_NODE] + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: True) + --no_log_on_each_node + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: False) + --logging_dir LOGGING_DIR + Tensorboard log dir. (default: None) + --logging_strategy {no,steps,epoch} + The logging strategy to use. (default: steps) + --logging_first_step [LOGGING_FIRST_STEP] + Log the first global_step (default: False) + --logging_steps LOGGING_STEPS + Log every X updates steps. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be + interpreted as ratio of total training steps. + (default: 500) + --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER] + Filter nan and inf losses for logging. (default: True) + --no_logging_nan_inf_filter + Filter nan and inf losses for logging. (default: + False) + --save_strategy {no,steps,epoch} + The checkpoint save strategy to use. (default: steps) + --save_steps SAVE_STEPS + Save checkpoint every X updates steps. Should be an + integer or a float in range `[0,1)`. If smaller than + 1, will be interpreted as ratio of total training + steps. (default: 500) + --save_total_limit SAVE_TOTAL_LIMIT + If a value is passed, will limit the total amount of + checkpoints. Deletes the older checkpoints in + `output_dir`. When `load_best_model_at_end` is + enabled, the 'best' checkpoint according to + `metric_for_best_model` will always be retained in + addition to the most recent ones. For example, for + `save_total_limit=5` and + `load_best_model_at_end=True`, the four last + checkpoints will always be retained alongside the best + model. When `save_total_limit=1` and + `load_best_model_at_end=True`, it is possible that two + checkpoints are saved: the last one and the best one + (if they are different). Default is unlimited + checkpoints (default: None) + --save_safetensors [SAVE_SAFETENSORS] + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: True) + --no_save_safetensors + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: False) + --save_on_each_node [SAVE_ON_EACH_NODE] + When doing multi-node distributed training, whether to + save models and checkpoints on each node, or only on + the main one (default: False) + --save_only_model [SAVE_ONLY_MODEL] + When checkpointing, whether to only save the model, or + also the optimizer, scheduler & rng state.Note that + when this is true, you won't be able to resume + training from checkpoint.This enables you to save + storage by not storing the optimizer, scheduler & rng + state.You can only load the model using + from_pretrained with this option set to True. + (default: False) + --restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT] + Whether to restore the callback states from the + checkpoint. If `True`, will override callbacks passed + to the `Trainer` if they exist in the checkpoint. + (default: False) + --no_cuda [NO_CUDA] This argument is deprecated. It will be removed in + version 5.0 of 🤗 Transformers. (default: False) + --use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will + use cuda/tpu/mps/npu device if available. (default: + False) + --use_mps_device [USE_MPS_DEVICE] + This argument is deprecated. `mps` device will be used + if available similar to `cuda` device. It will be + removed in version 5.0 of 🤗 Transformers (default: + False) + --seed SEED Random seed that will be set at the beginning of + training. (default: 42) + --data_seed DATA_SEED + Random seed to be used with data samplers. (default: + None) + --jit_mode_eval [JIT_MODE_EVAL] + Whether or not to use PyTorch jit trace for inference + (default: False) + --use_ipex [USE_IPEX] + Use Intel extension for PyTorch when it is available, + installation: 'https://github.com/intel/intel- + extension-for-pytorch' (default: False) + --bf16 [BF16] Whether to use bf16 (mixed) precision instead of + 32-bit. Requires Ampere or higher NVIDIA architecture + or using CPU (use_cpu) or Ascend NPU. This is an + experimental API and it may change. (default: False) + --fp16 [FP16] Whether to use fp16 (mixed) precision instead of + 32-bit (default: False) + --fp16_opt_level FP16_OPT_LEVEL + For fp16: Apex AMP optimization level selected in + ['O0', 'O1', 'O2', and 'O3']. See details at + https://nvidia.github.io/apex/amp.html (default: O1) + --half_precision_backend {auto,apex,cpu_amp} + The backend to be used for half precision. (default: + auto) + --bf16_full_eval [BF16_FULL_EVAL] + Whether to use full bfloat16 evaluation instead of + 32-bit. This is an experimental API and it may change. + (default: False) + --fp16_full_eval [FP16_FULL_EVAL] + Whether to use full float16 evaluation instead of + 32-bit (default: False) + --tf32 TF32 Whether to enable tf32 mode, available in Ampere and + newer GPU architectures. This is an experimental API + and it may change. (default: None) + --local_rank LOCAL_RANK + For distributed training: local_rank (default: -1) + --ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl} + The backend to be used for distributed training + (default: None) + --tpu_num_cores TPU_NUM_CORES + TPU: Number of TPU cores (automatically passed by + launcher script) (default: None) + --tpu_metrics_debug [TPU_METRICS_DEBUG] + Deprecated, the use of `--debug tpu_metrics_debug` is + preferred. TPU: Whether to print debug metrics + (default: False) + --debug DEBUG [DEBUG ...] + Whether or not to enable debug mode. Current options: + `underflow_overflow` (Detect underflow and overflow in + activations and weights), `tpu_metrics_debug` (print + debug metrics on TPU). (default: None) + --dataloader_drop_last [DATALOADER_DROP_LAST] + Drop the last incomplete batch if it is not divisible + by the batch size. (default: False) + --eval_steps EVAL_STEPS + Run an evaluation every X steps. Should be an integer + or a float in range `[0,1)`. If smaller than 1, will + be interpreted as ratio of total training steps. + (default: None) + --dataloader_num_workers DATALOADER_NUM_WORKERS + Number of subprocesses to use for data loading + (PyTorch only). 0 means that the data will be loaded + in the main process. (default: 0) + --dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR + Number of batches loaded in advance by each worker. 2 + means there will be a total of 2 * num_workers batches + prefetched across all workers. Default is 2 for + PyTorch < 2.0.0 and otherwise None. (default: None) + --past_index PAST_INDEX + If >=0, uses the corresponding part of the output as + the past state for next step. (default: -1) + --run_name RUN_NAME An optional descriptor for the run. Notably used for + wandb, mlflow and comet logging. (default: None) + --disable_tqdm DISABLE_TQDM + Whether or not to disable the tqdm progress bars. + (default: None) + --remove_unused_columns [REMOVE_UNUSED_COLUMNS] + Remove columns not required by the model when using an + nlp.Dataset. (default: True) + --no_remove_unused_columns + Remove columns not required by the model when using an + nlp.Dataset. (default: False) + --label_names LABEL_NAMES [LABEL_NAMES ...] + The list of keys in your dictionary of inputs that + correspond to the labels. (default: None) + --load_best_model_at_end [LOAD_BEST_MODEL_AT_END] + Whether or not to load the best model found during + training at the end of training. When this option is + enabled, the best checkpoint will always be saved. See + `save_total_limit` for more. (default: False) + --metric_for_best_model METRIC_FOR_BEST_MODEL + The metric to use to compare two different models. + (default: None) + --greater_is_better GREATER_IS_BETTER + Whether the `metric_for_best_model` should be + maximized or not. (default: None) + --ignore_data_skip [IGNORE_DATA_SKIP] + When resuming training, whether or not to skip the + first epochs and batches to get to the same training + data. (default: False) + --fsdp FSDP Whether or not to use PyTorch Fully Sharded Data + Parallel (FSDP) training (in distributed training + only). The base option should be `full_shard`, + `shard_grad_op` or `no_shard` and you can add CPU- + offload to `full_shard` or `shard_grad_op` like this: + full_shard offload` or `shard_grad_op offload`. You + can add auto-wrap to `full_shard` or `shard_grad_op` + with the same syntax: full_shard auto_wrap` or + `shard_grad_op auto_wrap`. (default: ) + --fsdp_min_num_params FSDP_MIN_NUM_PARAMS + This parameter is deprecated. FSDP's minimum number of + parameters for Default Auto Wrapping. (useful only + when `fsdp` field is passed). (default: 0) + --fsdp_config FSDP_CONFIG + Config to be used with FSDP (Pytorch Fully Sharded + Data Parallel). The value is either a fsdp json config + file (e.g., `fsdp_config.json`) or an already loaded + json file as `dict`. (default: None) + --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP + This parameter is deprecated. Transformer layer class + name (case-sensitive) to wrap, e.g, `BertLayer`, + `GPTJBlock`, `T5Block` .... (useful only when `fsdp` + flag is passed). (default: None) + --accelerator_config ACCELERATOR_CONFIG + Config to be used with the internal Accelerator object + initializtion. The value is either a accelerator json + config file (e.g., `accelerator_config.json`) or an + already loaded json file as `dict`. (default: None) + --deepspeed DEEPSPEED + Enable deepspeed and pass the path to deepspeed json + config file (e.g. `ds_config.json`) or an already + loaded json file as a dict (default: None) + --label_smoothing_factor LABEL_SMOOTHING_FACTOR + The label smoothing epsilon to apply (zero means no + label smoothing). (default: 0.0) + --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo} + The optimizer to use. (default: adamw_torch) + --optim_args OPTIM_ARGS + Optional arguments to supply to optimizer. (default: + None) + --adafactor [ADAFACTOR] + Whether or not to replace AdamW by Adafactor. + (default: False) + --group_by_length [GROUP_BY_LENGTH] + Whether or not to group samples of roughly the same + length together when batching. (default: False) + --length_column_name LENGTH_COLUMN_NAME + Column name with precomputed lengths to use when + grouping by length. (default: length) + --report_to REPORT_TO + The list of integrations to report the results and + logs to. (default: None) + --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS + When using distributed training, the value of the flag + `find_unused_parameters` passed to + `DistributedDataParallel`. (default: None) + --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB + When using distributed training, the value of the flag + `bucket_cap_mb` passed to `DistributedDataParallel`. + (default: None) + --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS + When using distributed training, the value of the flag + `broadcast_buffers` passed to + `DistributedDataParallel`. (default: None) + --dataloader_pin_memory [DATALOADER_PIN_MEMORY] + Whether or not to pin memory for DataLoader. (default: + True) + --no_dataloader_pin_memory + Whether or not to pin memory for DataLoader. (default: + False) + --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS] + If True, the data loader will not shut down the worker + processes after a dataset has been consumed once. This + allows to maintain the workers Dataset instances + alive. Can potentially speed up training, but will + increase RAM usage. (default: False) + --skip_memory_metrics [SKIP_MEMORY_METRICS] + Whether or not to skip adding of memory profiler + reports to metrics. (default: True) + --no_skip_memory_metrics + Whether or not to skip adding of memory profiler + reports to metrics. (default: False) + --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP] + Whether or not to use the legacy prediction_loop in + the Trainer. (default: False) + --push_to_hub [PUSH_TO_HUB] + Whether or not to upload the trained model to the + model hub after training. (default: False) + --resume_from_checkpoint RESUME_FROM_CHECKPOINT + The path to a folder with a valid checkpoint for your + model. (default: None) + --hub_model_id HUB_MODEL_ID + The name of the repository to keep in sync with the + local `output_dir`. (default: None) + --hub_strategy {end,every_save,checkpoint,all_checkpoints} + The hub strategy to use when `--push_to_hub` is + activated. (default: every_save) + --hub_token HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --hub_private_repo [HUB_PRIVATE_REPO] + Whether the model repository is private or not. + (default: False) + --hub_always_push [HUB_ALWAYS_PUSH] + Unless `True`, the Trainer will skip pushes if the + previous one wasn't finished yet. (default: False) + --gradient_checkpointing [GRADIENT_CHECKPOINTING] + If True, use gradient checkpointing to save memory at + the expense of slower backward pass. (default: False) + --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS + Gradient checkpointing key word arguments such as + `use_reentrant`. Will be passed to + `torch.utils.checkpoint.checkpoint` through + `model.gradient_checkpointing_enable`. (default: None) + --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS] + Whether or not the inputs will be passed to the + `compute_metrics` function. (default: False) + --eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES] + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: True) + --no_eval_do_concat_batches + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: False) + --fp16_backend {auto,apex,cpu_amp} + Deprecated. Use half_precision_backend instead + (default: auto) + --evaluation_strategy {no,steps,epoch} + Deprecated. Use `eval_strategy` instead (default: + None) + --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID + The name of the repository to which push the + `Trainer`. (default: None) + --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION + The name of the organization in with to which push the + `Trainer`. (default: None) + --push_to_hub_token PUSH_TO_HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --mp_parameters MP_PARAMETERS + Used by the SageMaker launcher to send mp-specific + args. Ignored in Trainer (default: ) + --auto_find_batch_size [AUTO_FIND_BATCH_SIZE] + Whether to automatically decrease the batch size in + half and rerun the training loop again each time a + CUDA Out-of-Memory was reached (default: False) + --full_determinism [FULL_DETERMINISM] + Whether to call enable_full_determinism instead of + set_seed for reproducibility in distributed training. + Important: this will negatively impact the + performance, so only use it for debugging. (default: + False) + --torchdynamo TORCHDYNAMO + This argument is deprecated, use + `--torch_compile_backend` instead. (default: None) + --ray_scope RAY_SCOPE + The scope to use when doing hyperparameter search with + Ray. By default, `"last"` will be used. Ray will then + use the last checkpoint of all trials, compare those, + and select the best one. However, other options are + also available. See the Ray documentation (https://doc + s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun + e.ExperimentAnalysis.get_best_trial) for more options. + (default: last) + --ddp_timeout DDP_TIMEOUT + Overrides the default timeout for distributed training + (value should be given in seconds). (default: 1800) + --torch_compile [TORCH_COMPILE] + If set to `True`, the model will be wrapped in + `torch.compile`. (default: False) + --torch_compile_backend TORCH_COMPILE_BACKEND + Which backend to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --torch_compile_mode TORCH_COMPILE_MODE + Which mode to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --dispatch_batches DISPATCH_BATCHES + Deprecated. Pass {'dispatch_batches':VALUE} to + `accelerator_config`. (default: None) + --split_batches SPLIT_BATCHES + Deprecated. Pass {'split_batches':True} to + `accelerator_config`. (default: None) + --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND] + If set to `True`, the speed metrics will include `tgs` + (tokens per second per device). (default: False) + --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN] + If set to `True`, will track the number of input + tokens seen throughout training. (May be slower in + distributed training) (default: False) + --neftune_noise_alpha NEFTUNE_NOISE_ALPHA + Activates neftune noise embeddings into the model. + NEFTune has been proven to drastically improve model + performances for instrcution fine-tuning. Check out + the original paper here: + https://arxiv.org/abs/2310.05914 and the original code + here: https://github.com/neelsjain/NEFTune. Only + supported for `PreTrainedModel` and `PeftModel` + classes. (default: None) + --optim_target_modules OPTIM_TARGET_MODULES + Target modules for the optimizer defined in the + `optim` argument. Only used for the GaLore optimizer + at the moment. (default: None) + --batch_eval_metrics [BATCH_EVAL_METRICS] + Break eval metrics calculation into batches to save + memory. (default: False) + --eval_on_start [EVAL_ON_START] + Whether to run through the entire `evaluation` step at + the very beginning of training as a sanity check. + (default: False) + --eval_use_gather_object [EVAL_USE_GATHER_OBJECT] + Whether to run recursively gather object in a nested + list/tuple/dictionary of objects from all devices. + (default: False) + --sortish_sampler [SORTISH_SAMPLER] + Whether to use SortishSampler or not. (default: False) + --predict_with_generate [PREDICT_WITH_GENERATE] + Whether to use generate to calculate generative + metrics (ROUGE, BLEU). (default: False) + --generation_max_length GENERATION_MAX_LENGTH + The `max_length` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `max_length` value of the model configuration. + (default: None) + --generation_num_beams GENERATION_NUM_BEAMS + The `num_beams` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `num_beams` value of the model configuration. + (default: None) + --generation_config GENERATION_CONFIG + Model id, file path or url pointing to a + GenerationConfig json file, to use during prediction. + (default: None) + --use_badam [USE_BADAM] + Whether or not to use the BAdam optimizer. (default: + False) + --badam_mode {layer,ratio} + Whether to use layer-wise or ratio-wise BAdam + optimizer. (default: layer) + --badam_start_block BADAM_START_BLOCK + The starting block index for layer-wise BAdam. + (default: None) + --badam_switch_mode {ascending,descending,random,fixed} + the strategy of picking block to update for layer-wise + BAdam. (default: ascending) + --badam_switch_interval BADAM_SWITCH_INTERVAL + Number of steps to update the block for layer-wise + BAdam. Use -1 to disable the block update. (default: + 50) + --badam_update_ratio BADAM_UPDATE_RATIO + The ratio of the update for ratio-wise BAdam. + (default: 0.05) + --badam_mask_mode {adjacent,scatter} + The mode of the mask for BAdam optimizer. `adjacent` + means that the trainable parameters are adjacent to + each other, `scatter` means that trainable parameters + are randomly choosed from the weight. (default: + adjacent) + --badam_verbose BADAM_VERBOSE + The verbosity level of BAdam optimizer. 0 for no + print, 1 for print the block prefix, 2 for print + trainable parameters. (default: 0) + --use_galore [USE_GALORE] + Whether or not to use the gradient low-Rank projection + (GaLore). (default: False) + --galore_target GALORE_TARGET + Name(s) of modules to apply GaLore. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --galore_rank GALORE_RANK + The rank of GaLore gradients. (default: 16) + --galore_update_interval GALORE_UPDATE_INTERVAL + Number of steps to update the GaLore projection. + (default: 200) + --galore_scale GALORE_SCALE + GaLore scaling coefficient. (default: 0.25) + --galore_proj_type {std,reverse_std,right,left,full} + Type of GaLore projection. (default: std) + --galore_layerwise [GALORE_LAYERWISE] + Whether or not to enable layer-wise update to further + save memory. (default: False) + --pref_beta PREF_BETA + The beta parameter in the preference loss. (default: + 0.1) + --pref_ftx PREF_FTX The supervised fine-tuning loss coefficient in DPO + training. (default: 0.0) + --pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo} + The type of DPO loss to use. (default: sigmoid) + --dpo_label_smoothing DPO_LABEL_SMOOTHING + The robust DPO label smoothing parameter in cDPO that + should be between 0 and 0.5. (default: 0.0) + --kto_chosen_weight KTO_CHOSEN_WEIGHT + The weight factor of the desirable losses in KTO + training. (default: 1.0) + --kto_rejected_weight KTO_REJECTED_WEIGHT + The weight factor of the undesirable losses in KTO + training. (default: 1.0) + --simpo_gamma SIMPO_GAMMA + The target reward margin term in SimPO loss. (default: + 0.5) + --ppo_buffer_size PPO_BUFFER_SIZE + The number of mini-batches to make experience buffer + in a PPO optimization step. (default: 1) + --ppo_epochs PPO_EPOCHS + The number of epochs to perform in a PPO optimization + step. (default: 4) + --ppo_score_norm [PPO_SCORE_NORM] + Use score normalization in PPO training. (default: + False) + --ppo_target PPO_TARGET + Target KL value for adaptive KL control in PPO + training. (default: 6.0) + --ppo_whiten_rewards [PPO_WHITEN_REWARDS] + Whiten the rewards before compute advantages in PPO + training. (default: False) + --ref_model REF_MODEL + Path to the reference model used for the PPO or DPO + training. (default: None) + --ref_model_adapters REF_MODEL_ADAPTERS + Path to the adapters of the reference model. (default: + None) + --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reference model. + (default: None) + --reward_model REWARD_MODEL + Path to the reward model used for the PPO training. + (default: None) + --reward_model_adapters REWARD_MODEL_ADAPTERS + Path to the adapters of the reward model. (default: + None) + --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reward model. + (default: None) + --reward_model_type {lora,full,api} + The type of the reward model in PPO training. Lora + model only supports lora training. (default: lora) + --additional_target ADDITIONAL_TARGET + Name(s) of modules apart from LoRA layers to be set as + trainable and saved in the final checkpoint. Use + commas to separate multiple modules. (default: None) + --lora_alpha LORA_ALPHA + The scale factor for LoRA fine-tuning (default: + lora_rank * 2). (default: None) + --lora_dropout LORA_DROPOUT + Dropout rate for the LoRA fine-tuning. (default: 0.0) + --lora_rank LORA_RANK + The intrinsic dimension for LoRA fine-tuning. + (default: 8) + --lora_target LORA_TARGET + Name(s) of target modules to apply LoRA. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --loraplus_lr_ratio LORAPLUS_LR_RATIO + LoRA plus learning rate ratio (lr_B / lr_A). (default: + None) + --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING + LoRA plus learning rate for lora embedding layers. + (default: 1e-06) + --use_rslora [USE_RSLORA] + Whether or not to use the rank stabilization scaling + factor for LoRA layer. (default: False) + --use_dora [USE_DORA] + Whether or not to use the weight-decomposed lora + method (DoRA). (default: False) + --pissa_init [PISSA_INIT] + Whether or not to initialize a PiSSA adapter. + (default: False) + --pissa_iter PISSA_ITER + The number of iteration steps performed by FSVD in + PiSSA. Use -1 to disable it. (default: 16) + --pissa_convert [PISSA_CONVERT] + Whether or not to convert the PiSSA adapter to a + normal LoRA adapter. (default: False) + --create_new_adapter [CREATE_NEW_ADAPTER] + Whether or not to create a new adapter with randomly + initialized weight. (default: False) + --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS + The number of trainable layers for freeze (partial- + parameter) fine-tuning. Positive numbers mean the last + n layers are set as trainable, negative numbers mean + the first n layers are set as trainable. (default: 2) + --freeze_trainable_modules FREEZE_TRAINABLE_MODULES + Name(s) of trainable modules for freeze (partial- + parameter) fine-tuning. Use commas to separate + multiple modules. Use `all` to specify all the + available modules. (default: all) + --freeze_extra_modules FREEZE_EXTRA_MODULES + Name(s) of modules apart from hidden layers to be set + as trainable for freeze (partial-parameter) fine- + tuning. Use commas to separate multiple modules. + (default: None) + --pure_bf16 [PURE_BF16] + Whether or not to train model in purely bf16 precision + (without AMP). (default: False) + --stage {pt,sft,rm,ppo,dpo,kto} + Which stage will be performed in training. (default: + sft) + --finetuning_type {lora,freeze,full} + Which fine-tuning method to use. (default: lora) + --use_llama_pro [USE_LLAMA_PRO] + Whether or not to make only the parameters in the + expanded blocks trainable. (default: False) + --use_adam_mini [USE_ADAM_MINI] + Whether or not to use the Adam-mini optimizer. + (default: False) + --freeze_vision_tower [FREEZE_VISION_TOWER] + Whether ot not to freeze vision tower in MLLM + training. (default: True) + --no_freeze_vision_tower + Whether ot not to freeze vision tower in MLLM + training. (default: False) + --train_mm_proj_only [TRAIN_MM_PROJ_ONLY] + Whether or not to train the multimodal projector for + MLLM only. (default: False) + --compute_accuracy [COMPUTE_ACCURACY] + Whether or not to compute the token-level accuracy at + evaluation. (default: False) + --plot_loss [PLOT_LOSS] + Whether or not to save the training loss curves. + (default: False) + --do_sample [DO_SAMPLE] + Whether or not to use sampling, use greedy decoding + otherwise. (default: True) + --no_do_sample Whether or not to use sampling, use greedy decoding + otherwise. (default: False) + --temperature TEMPERATURE + The value used to modulate the next token + probabilities. (default: 0.95) + --top_p TOP_P The smallest set of most probable tokens with + probabilities that add up to top_p or higher are kept. + (default: 0.7) + --top_k TOP_K The number of highest probability vocabulary tokens to + keep for top-k filtering. (default: 50) + --num_beams NUM_BEAMS + Number of beams for beam search. 1 means no beam + search. (default: 1) + --max_length MAX_LENGTH + The maximum length the generated tokens can have. It + can be overridden by max_new_tokens. (default: 1024) + --max_new_tokens MAX_NEW_TOKENS + The maximum numbers of tokens to generate, ignoring + the number of tokens in the prompt. (default: 1024) + --repetition_penalty REPETITION_PENALTY + The parameter for repetition penalty. 1.0 means no + penalty. (default: 1.0) + --length_penalty LENGTH_PENALTY + Exponential penalty to the length that is used with + beam-based generation. (default: 1.0) + --default_system DEFAULT_SYSTEM + Default system message to use in chat completion. + (default: None) +usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH + [--adapter_name_or_path ADAPTER_NAME_OR_PATH] + [--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR] + [--use_fast_tokenizer [USE_FAST_TOKENIZER]] + [--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]] + [--split_special_tokens [SPLIT_SPECIAL_TOKENS]] + [--new_special_tokens NEW_SPECIAL_TOKENS] + [--model_revision MODEL_REVISION] + [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]] + [--no_low_cpu_mem_usage] + [--quantization_method {bitsandbytes,hqq,eetq}] + [--quantization_bit QUANTIZATION_BIT] + [--quantization_type {fp4,nf4}] + [--double_quantization [DOUBLE_QUANTIZATION]] + [--no_double_quantization] + [--quantization_device_map {auto}] + [--rope_scaling {linear,dynamic}] + [--flash_attn {auto,disabled,sdpa,fa2}] + [--shift_attn [SHIFT_ATTN]] + [--mixture_of_depths {convert,load}] + [--use_unsloth [USE_UNSLOTH]] + [--visual_inputs [VISUAL_INPUTS]] + [--moe_aux_loss_coef MOE_AUX_LOSS_COEF] + [--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]] + [--upcast_layernorm [UPCAST_LAYERNORM]] + [--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]] + [--train_from_scratch [TRAIN_FROM_SCRATCH]] + [--infer_backend {huggingface,vllm}] + [--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL] + [--vllm_enforce_eager [VLLM_ENFORCE_EAGER]] + [--vllm_max_lora_rank VLLM_MAX_LORA_RANK] + [--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]] + [--no_use_cache] + [--infer_dtype {auto,float16,bfloat16,float32}] + [--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN] + [--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE] + [--export_device {cpu,auto}] + [--export_quantization_bit EXPORT_QUANTIZATION_BIT] + [--export_quantization_dataset EXPORT_QUANTIZATION_DATASET] + [--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES] + [--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN] + [--export_legacy_format [EXPORT_LEGACY_FORMAT]] + [--export_hub_model_id EXPORT_HUB_MODEL_ID] + [--print_param_status [PRINT_PARAM_STATUS]] + [--template TEMPLATE] [--dataset DATASET] + [--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR] + [--cutoff_len CUTOFF_LEN] + [--train_on_prompt [TRAIN_ON_PROMPT]] + [--mask_history [MASK_HISTORY]] [--streaming [STREAMING]] + [--buffer_size BUFFER_SIZE] + [--mix_strategy {concat,interleave_under,interleave_over}] + [--interleave_probs INTERLEAVE_PROBS] + [--overwrite_cache [OVERWRITE_CACHE]] + [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS] + [--max_samples MAX_SAMPLES] + [--eval_num_beams EVAL_NUM_BEAMS] + [--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]] + [--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE] + [--packing PACKING] [--neat_packing [NEAT_PACKING]] + [--tool_format TOOL_FORMAT] + [--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR + [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] + [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] + [--do_predict [DO_PREDICT]] + [--eval_strategy {no,steps,epoch}] + [--prediction_loss_only [PREDICTION_LOSS_ONLY]] + [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] + [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE] + [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] + [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] + [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] + [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] + [--eval_delay EVAL_DELAY] + [--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS] + [--learning_rate LEARNING_RATE] + [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] + [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] + [--max_grad_norm MAX_GRAD_NORM] + [--num_train_epochs NUM_TRAIN_EPOCHS] + [--max_steps MAX_STEPS] + [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}] + [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] + [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS] + [--log_level {detail,debug,info,warning,error,critical,passive}] + [--log_level_replica {detail,debug,info,warning,error,critical,passive}] + [--log_on_each_node [LOG_ON_EACH_NODE]] + [--no_log_on_each_node] [--logging_dir LOGGING_DIR] + [--logging_strategy {no,steps,epoch}] + [--logging_first_step [LOGGING_FIRST_STEP]] + [--logging_steps LOGGING_STEPS] + [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] + [--no_logging_nan_inf_filter] + [--save_strategy {no,steps,epoch}] + [--save_steps SAVE_STEPS] + [--save_total_limit SAVE_TOTAL_LIMIT] + [--save_safetensors [SAVE_SAFETENSORS]] + [--no_save_safetensors] + [--save_on_each_node [SAVE_ON_EACH_NODE]] + [--save_only_model [SAVE_ONLY_MODEL]] + [--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]] + [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]] + [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] + [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] + [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] + [--fp16_opt_level FP16_OPT_LEVEL] + [--half_precision_backend {auto,apex,cpu_amp}] + [--bf16_full_eval [BF16_FULL_EVAL]] + [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] + [--local_rank LOCAL_RANK] + [--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}] + [--tpu_num_cores TPU_NUM_CORES] + [--tpu_metrics_debug [TPU_METRICS_DEBUG]] + [--debug DEBUG [DEBUG ...]] + [--dataloader_drop_last [DATALOADER_DROP_LAST]] + [--eval_steps EVAL_STEPS] + [--dataloader_num_workers DATALOADER_NUM_WORKERS] + [--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR] + [--past_index PAST_INDEX] [--run_name RUN_NAME] + [--disable_tqdm DISABLE_TQDM] + [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] + [--no_remove_unused_columns] + [--label_names LABEL_NAMES [LABEL_NAMES ...]] + [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] + [--metric_for_best_model METRIC_FOR_BEST_MODEL] + [--greater_is_better GREATER_IS_BETTER] + [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP] + [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] + [--fsdp_config FSDP_CONFIG] + [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] + [--accelerator_config ACCELERATOR_CONFIG] + [--deepspeed DEEPSPEED] + [--label_smoothing_factor LABEL_SMOOTHING_FACTOR] + [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}] + [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] + [--group_by_length [GROUP_BY_LENGTH]] + [--length_column_name LENGTH_COLUMN_NAME] + [--report_to REPORT_TO] + [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] + [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] + [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] + [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] + [--no_dataloader_pin_memory] + [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] + [--skip_memory_metrics [SKIP_MEMORY_METRICS]] + [--no_skip_memory_metrics] + [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] + [--push_to_hub [PUSH_TO_HUB]] + [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] + [--hub_model_id HUB_MODEL_ID] + [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] + [--hub_token HUB_TOKEN] + [--hub_private_repo [HUB_PRIVATE_REPO]] + [--hub_always_push [HUB_ALWAYS_PUSH]] + [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] + [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS] + [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] + [--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]] + [--no_eval_do_concat_batches] + [--fp16_backend {auto,apex,cpu_amp}] + [--evaluation_strategy {no,steps,epoch}] + [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID] + [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] + [--push_to_hub_token PUSH_TO_HUB_TOKEN] + [--mp_parameters MP_PARAMETERS] + [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] + [--full_determinism [FULL_DETERMINISM]] + [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] + [--ddp_timeout DDP_TIMEOUT] + [--torch_compile [TORCH_COMPILE]] + [--torch_compile_backend TORCH_COMPILE_BACKEND] + [--torch_compile_mode TORCH_COMPILE_MODE] + [--dispatch_batches DISPATCH_BATCHES] + [--split_batches SPLIT_BATCHES] + [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] + [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]] + [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] + [--optim_target_modules OPTIM_TARGET_MODULES] + [--batch_eval_metrics [BATCH_EVAL_METRICS]] + [--eval_on_start [EVAL_ON_START]] + [--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]] + [--sortish_sampler [SORTISH_SAMPLER]] + [--predict_with_generate [PREDICT_WITH_GENERATE]] + [--generation_max_length GENERATION_MAX_LENGTH] + [--generation_num_beams GENERATION_NUM_BEAMS] + [--generation_config GENERATION_CONFIG] + [--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}] + [--badam_start_block BADAM_START_BLOCK] + [--badam_switch_mode {ascending,descending,random,fixed}] + [--badam_switch_interval BADAM_SWITCH_INTERVAL] + [--badam_update_ratio BADAM_UPDATE_RATIO] + [--badam_mask_mode {adjacent,scatter}] + [--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]] + [--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK] + [--galore_update_interval GALORE_UPDATE_INTERVAL] + [--galore_scale GALORE_SCALE] + [--galore_proj_type {std,reverse_std,right,left,full}] + [--galore_layerwise [GALORE_LAYERWISE]] + [--pref_beta PREF_BETA] [--pref_ftx PREF_FTX] + [--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}] + [--dpo_label_smoothing DPO_LABEL_SMOOTHING] + [--kto_chosen_weight KTO_CHOSEN_WEIGHT] + [--kto_rejected_weight KTO_REJECTED_WEIGHT] + [--simpo_gamma SIMPO_GAMMA] + [--ppo_buffer_size PPO_BUFFER_SIZE] + [--ppo_epochs PPO_EPOCHS] + [--ppo_score_norm [PPO_SCORE_NORM]] + [--ppo_target PPO_TARGET] + [--ppo_whiten_rewards [PPO_WHITEN_REWARDS]] + [--ref_model REF_MODEL] + [--ref_model_adapters REF_MODEL_ADAPTERS] + [--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT] + [--reward_model REWARD_MODEL] + [--reward_model_adapters REWARD_MODEL_ADAPTERS] + [--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT] + [--reward_model_type {lora,full,api}] + [--additional_target ADDITIONAL_TARGET] + [--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT] + [--lora_rank LORA_RANK] [--lora_target LORA_TARGET] + [--loraplus_lr_ratio LORAPLUS_LR_RATIO] + [--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING] + [--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]] + [--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER] + [--pissa_convert [PISSA_CONVERT]] + [--create_new_adapter [CREATE_NEW_ADAPTER]] + [--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS] + [--freeze_trainable_modules FREEZE_TRAINABLE_MODULES] + [--freeze_extra_modules FREEZE_EXTRA_MODULES] + [--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}] + [--finetuning_type {lora,freeze,full}] + [--use_llama_pro [USE_LLAMA_PRO]] + [--use_adam_mini [USE_ADAM_MINI]] + [--freeze_vision_tower [FREEZE_VISION_TOWER]] + [--no_freeze_vision_tower] + [--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]] + [--compute_accuracy [COMPUTE_ACCURACY]] + [--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]] + [--no_do_sample] [--temperature TEMPERATURE] + [--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS] + [--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS] + [--repetition_penalty REPETITION_PENALTY] + [--length_penalty LENGTH_PENALTY] + [--default_system DEFAULT_SYSTEM] + +optional arguments: + -h, --help show this help message and exit + --model_name_or_path MODEL_NAME_OR_PATH + Path to the model weight or identifier from + huggingface.co/models or modelscope.cn/models. + (default: None) + --adapter_name_or_path ADAPTER_NAME_OR_PATH + Path to the adapter weight or identifier from + huggingface.co/models. Use commas to separate multiple + adapters. (default: None) + --adapter_folder ADAPTER_FOLDER + The folder containing the adapter weights to load. + (default: None) + --cache_dir CACHE_DIR + Where to store the pre-trained models downloaded from + huggingface.co or modelscope.cn. (default: None) + --use_fast_tokenizer [USE_FAST_TOKENIZER] + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: True) + --no_use_fast_tokenizer + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: False) + --resize_vocab [RESIZE_VOCAB] + Whether or not to resize the tokenizer vocab and the + embedding layers. (default: False) + --split_special_tokens [SPLIT_SPECIAL_TOKENS] + Whether or not the special tokens should be split + during the tokenization process. (default: False) + --new_special_tokens NEW_SPECIAL_TOKENS + Special tokens to be added into the tokenizer. Use + commas to separate multiple tokens. (default: None) + --model_revision MODEL_REVISION + The specific model version to use (can be a branch + name, tag name or commit id). (default: main) + --low_cpu_mem_usage [LOW_CPU_MEM_USAGE] + Whether or not to use memory-efficient model loading. + (default: True) + --no_low_cpu_mem_usage + Whether or not to use memory-efficient model loading. + (default: False) + --quantization_method {bitsandbytes,hqq,eetq} + Quantization method to use for on-the-fly + quantization. (default: bitsandbytes) + --quantization_bit QUANTIZATION_BIT + The number of bits to quantize the model using + bitsandbytes. (default: None) + --quantization_type {fp4,nf4} + Quantization data type to use in int4 training. + (default: nf4) + --double_quantization [DOUBLE_QUANTIZATION] + Whether or not to use double quantization in int4 + training. (default: True) + --no_double_quantization + Whether or not to use double quantization in int4 + training. (default: False) + --quantization_device_map {auto} + Device map used to infer the 4-bit quantized model, + needs bitsandbytes>=0.43.0. (default: None) + --rope_scaling {linear,dynamic} + Which scaling strategy should be adopted for the RoPE + embeddings. (default: None) + --flash_attn {auto,disabled,sdpa,fa2} + Enable FlashAttention for faster training and + inference. (default: auto) + --shift_attn [SHIFT_ATTN] + Enable shift short attention (S^2-Attn) proposed by + LongLoRA. (default: False) + --mixture_of_depths {convert,load} + Convert the model to mixture-of-depths (MoD) or load + the MoD model. (default: None) + --use_unsloth [USE_UNSLOTH] + Whether or not to use unsloth's optimization for the + LoRA training. (default: False) + --visual_inputs [VISUAL_INPUTS] + Whethor or not to use multimodal LLM that accepts + visual inputs. (default: False) + --moe_aux_loss_coef MOE_AUX_LOSS_COEF + Coefficient of the auxiliary router loss in mixture- + of-experts model. (default: None) + --disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING] + Whether or not to disable gradient checkpointing. + (default: False) + --upcast_layernorm [UPCAST_LAYERNORM] + Whether or not to upcast the layernorm weights in + fp32. (default: False) + --upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT] + Whether or not to upcast the output of lm_head in + fp32. (default: False) + --train_from_scratch [TRAIN_FROM_SCRATCH] + Whether or not to randomly initialize the model + weights. (default: False) + --infer_backend {huggingface,vllm} + Backend engine used at inference. (default: + huggingface) + --vllm_maxlen VLLM_MAXLEN + Maximum sequence (prompt + response) length of the + vLLM engine. (default: 2048) + --vllm_gpu_util VLLM_GPU_UTIL + The fraction of GPU memory in (0,1) to be used for the + vLLM engine. (default: 0.9) + --vllm_enforce_eager [VLLM_ENFORCE_EAGER] + Whether or not to disable CUDA graph in the vLLM + engine. (default: False) + --vllm_max_lora_rank VLLM_MAX_LORA_RANK + Maximum rank of all LoRAs in the vLLM engine. + (default: 32) + --offload_folder OFFLOAD_FOLDER + Path to offload model weights. (default: offload) + --use_cache [USE_CACHE] + Whether or not to use KV cache in generation. + (default: True) + --no_use_cache Whether or not to use KV cache in generation. + (default: False) + --infer_dtype {auto,float16,bfloat16,float32} + Data type for model weights and activations at + inference. (default: auto) + --hf_hub_token HF_HUB_TOKEN + Auth token to log in with Hugging Face Hub. (default: + None) + --ms_hub_token MS_HUB_TOKEN + Auth token to log in with ModelScope Hub. (default: + None) + --export_dir EXPORT_DIR + Path to the directory to save the exported model. + (default: None) + --export_size EXPORT_SIZE + The file shard size (in GB) of the exported model. + (default: 1) + --export_device {cpu,auto} + The device used in model export, use `auto` to + accelerate exporting. (default: cpu) + --export_quantization_bit EXPORT_QUANTIZATION_BIT + The number of bits to quantize the exported model. + (default: None) + --export_quantization_dataset EXPORT_QUANTIZATION_DATASET + Path to the dataset or dataset name to use in + quantizing the exported model. (default: None) + --export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES + The number of samples used for quantization. (default: + 128) + --export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN + The maximum length of the model inputs used for + quantization. (default: 1024) + --export_legacy_format [EXPORT_LEGACY_FORMAT] + Whether or not to save the `.bin` files instead of + `.safetensors`. (default: False) + --export_hub_model_id EXPORT_HUB_MODEL_ID + The name of the repository if push the model to the + Hugging Face hub. (default: None) + --print_param_status [PRINT_PARAM_STATUS] + For debugging purposes, print the status of the + parameters in the model. (default: False) + --template TEMPLATE Which template to use for constructing prompts in + training and inference. (default: None) + --dataset DATASET The name of dataset(s) to use for training. Use commas + to separate multiple datasets. (default: None) + --eval_dataset EVAL_DATASET + The name of dataset(s) to use for evaluation. Use + commas to separate multiple datasets. (default: None) + --dataset_dir DATASET_DIR + Path to the folder containing the datasets. (default: + data) + --cutoff_len CUTOFF_LEN + The cutoff length of the tokenized inputs in the + dataset. (default: 1024) + --train_on_prompt [TRAIN_ON_PROMPT] + Whether or not to disable the mask on the prompt. + (default: False) + --mask_history [MASK_HISTORY] + Whether or not to mask the history and train on the + last turn only. (default: False) + --streaming [STREAMING] + Enable dataset streaming. (default: False) + --buffer_size BUFFER_SIZE + Size of the buffer to randomly sample examples from in + dataset streaming. (default: 16384) + --mix_strategy {concat,interleave_under,interleave_over} + Strategy to use in dataset mixing (concat/interleave) + (undersampling/oversampling). (default: concat) + --interleave_probs INTERLEAVE_PROBS + Probabilities to sample data from datasets. Use commas + to separate multiple datasets. (default: None) + --overwrite_cache [OVERWRITE_CACHE] + Overwrite the cached training and evaluation sets. + (default: False) + --preprocessing_num_workers PREPROCESSING_NUM_WORKERS + The number of processes to use for the pre-processing. + (default: None) + --max_samples MAX_SAMPLES + For debugging purposes, truncate the number of + examples for each dataset. (default: None) + --eval_num_beams EVAL_NUM_BEAMS + Number of beams to use for evaluation. This argument + will be passed to `model.generate` (default: None) + --ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS] + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: True) + --no_ignore_pad_token_for_loss + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: False) + --val_size VAL_SIZE Size of the development set, should be an integer or a + float in range `[0,1)`. (default: 0.0) + --packing PACKING Enable sequences packing in training. Will + automatically enable in pre-training. (default: None) + --neat_packing [NEAT_PACKING] + Enable sequence packing without cross-attention. + (default: False) + --tool_format TOOL_FORMAT + Tool format to use for constructing function calling + examples. (default: None) + --tokenized_path TOKENIZED_PATH + Path to save or load the tokenized datasets. (default: + None) + --output_dir OUTPUT_DIR + The output directory where the model predictions and + checkpoints will be written. (default: None) + --overwrite_output_dir [OVERWRITE_OUTPUT_DIR] + Overwrite the content of the output directory. Use + this to continue training if output_dir points to a + checkpoint directory. (default: False) + --do_train [DO_TRAIN] + Whether to run training. (default: False) + --do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False) + --do_predict [DO_PREDICT] + Whether to run predictions on the test set. (default: + False) + --eval_strategy {no,steps,epoch} + The evaluation strategy to use. (default: no) + --prediction_loss_only [PREDICTION_LOSS_ONLY] + When performing evaluation and predictions, only + returns the loss. (default: False) + --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for training. + (default: 8) + --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for + evaluation. (default: 8) + --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE + Deprecated, the use of `--per_device_train_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + training. (default: None) + --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE + Deprecated, the use of `--per_device_eval_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + evaluation. (default: None) + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + Number of updates steps to accumulate before + performing a backward/update pass. (default: 1) + --eval_accumulation_steps EVAL_ACCUMULATION_STEPS + Number of predictions steps to accumulate before + moving the tensors to the CPU. (default: None) + --eval_delay EVAL_DELAY + Number of epochs or steps to wait for before the first + evaluation can be performed, depending on the + eval_strategy. (default: 0) + --torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS + Number of steps to wait before calling + `torch..empty_cache()`.This can help avoid + CUDA out-of-memory errors by lowering peak VRAM usage + at a cost of about [10{'option_strings': ['-- + torch_empty_cache_steps'], 'dest': + 'torch_empty_cache_steps', 'nargs': None, 'const': + None, 'default': None, 'type': 'int', 'choices': None, + 'required': False, 'help': 'Number of steps to wait + before calling `torch..empty_cache()`.This can + help avoid CUDA out-of-memory errors by lowering peak + VRAM usage at a cost of about [10% slower performance] + (https://github.com/huggingface/transformers/issues/31 + 372).If left unset or set to None, cache will not be + emptied.', 'metavar': None, 'container': + , + 'prog': 'launcher.py'}lower performance](https://githu + b.com/huggingface/transformers/issues/31372).If left + unset or set to None, cache will not be emptied. + (default: None) + --learning_rate LEARNING_RATE + The initial learning rate for AdamW. (default: 5e-05) + --weight_decay WEIGHT_DECAY + Weight decay for AdamW if we apply some. (default: + 0.0) + --adam_beta1 ADAM_BETA1 + Beta1 for AdamW optimizer (default: 0.9) + --adam_beta2 ADAM_BETA2 + Beta2 for AdamW optimizer (default: 0.999) + --adam_epsilon ADAM_EPSILON + Epsilon for AdamW optimizer. (default: 1e-08) + --max_grad_norm MAX_GRAD_NORM + Max gradient norm. (default: 1.0) + --num_train_epochs NUM_TRAIN_EPOCHS + Total number of training epochs to perform. (default: + 3.0) + --max_steps MAX_STEPS + If > 0: set total number of training steps to perform. + Override num_train_epochs. (default: -1) + --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay} + The scheduler type to use. (default: linear) + --lr_scheduler_kwargs LR_SCHEDULER_KWARGS + Extra parameters for the lr_scheduler such as + {'num_cycles': 1} for the cosine with hard restarts. + (default: {}) + --warmup_ratio WARMUP_RATIO + Linear warmup over warmup_ratio fraction of total + steps. (default: 0.0) + --warmup_steps WARMUP_STEPS + Linear warmup over warmup_steps. (default: 0) + --log_level {detail,debug,info,warning,error,critical,passive} + Logger log level to use on the main node. Possible + choices are the log levels as strings: 'debug', + 'info', 'warning', 'error' and 'critical', plus a + 'passive' level which doesn't set anything and lets + the application set the level. Defaults to 'passive'. + (default: passive) + --log_level_replica {detail,debug,info,warning,error,critical,passive} + Logger log level to use on replica nodes. Same choices + and defaults as ``log_level`` (default: warning) + --log_on_each_node [LOG_ON_EACH_NODE] + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: True) + --no_log_on_each_node + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: False) + --logging_dir LOGGING_DIR + Tensorboard log dir. (default: None) + --logging_strategy {no,steps,epoch} + The logging strategy to use. (default: steps) + --logging_first_step [LOGGING_FIRST_STEP] + Log the first global_step (default: False) + --logging_steps LOGGING_STEPS + Log every X updates steps. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be + interpreted as ratio of total training steps. + (default: 500) + --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER] + Filter nan and inf losses for logging. (default: True) + --no_logging_nan_inf_filter + Filter nan and inf losses for logging. (default: + False) + --save_strategy {no,steps,epoch} + The checkpoint save strategy to use. (default: steps) + --save_steps SAVE_STEPS + Save checkpoint every X updates steps. Should be an + integer or a float in range `[0,1)`. If smaller than + 1, will be interpreted as ratio of total training + steps. (default: 500) + --save_total_limit SAVE_TOTAL_LIMIT + If a value is passed, will limit the total amount of + checkpoints. Deletes the older checkpoints in + `output_dir`. When `load_best_model_at_end` is + enabled, the 'best' checkpoint according to + `metric_for_best_model` will always be retained in + addition to the most recent ones. For example, for + `save_total_limit=5` and + `load_best_model_at_end=True`, the four last + checkpoints will always be retained alongside the best + model. When `save_total_limit=1` and + `load_best_model_at_end=True`, it is possible that two + checkpoints are saved: the last one and the best one + (if they are different). Default is unlimited + checkpoints (default: None) + --save_safetensors [SAVE_SAFETENSORS] + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: True) + --no_save_safetensors + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: False) + --save_on_each_node [SAVE_ON_EACH_NODE] + When doing multi-node distributed training, whether to + save models and checkpoints on each node, or only on + the main one (default: False) + --save_only_model [SAVE_ONLY_MODEL] + When checkpointing, whether to only save the model, or + also the optimizer, scheduler & rng state.Note that + when this is true, you won't be able to resume + training from checkpoint.This enables you to save + storage by not storing the optimizer, scheduler & rng + state.You can only load the model using + from_pretrained with this option set to True. + (default: False) + --restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT] + Whether to restore the callback states from the + checkpoint. If `True`, will override callbacks passed + to the `Trainer` if they exist in the checkpoint. + (default: False) + --no_cuda [NO_CUDA] This argument is deprecated. It will be removed in + version 5.0 of 🤗 Transformers. (default: False) + --use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will + use cuda/tpu/mps/npu device if available. (default: + False) + --use_mps_device [USE_MPS_DEVICE] + This argument is deprecated. `mps` device will be used + if available similar to `cuda` device. It will be + removed in version 5.0 of 🤗 Transformers (default: + False) + --seed SEED Random seed that will be set at the beginning of + training. (default: 42) + --data_seed DATA_SEED + Random seed to be used with data samplers. (default: + None) + --jit_mode_eval [JIT_MODE_EVAL] + Whether or not to use PyTorch jit trace for inference + (default: False) + --use_ipex [USE_IPEX] + Use Intel extension for PyTorch when it is available, + installation: 'https://github.com/intel/intel- + extension-for-pytorch' (default: False) + --bf16 [BF16] Whether to use bf16 (mixed) precision instead of + 32-bit. Requires Ampere or higher NVIDIA architecture + or using CPU (use_cpu) or Ascend NPU. This is an + experimental API and it may change. (default: False) + --fp16 [FP16] Whether to use fp16 (mixed) precision instead of + 32-bit (default: False) + --fp16_opt_level FP16_OPT_LEVEL + For fp16: Apex AMP optimization level selected in + ['O0', 'O1', 'O2', and 'O3']. See details at + https://nvidia.github.io/apex/amp.html (default: O1) + --half_precision_backend {auto,apex,cpu_amp} + The backend to be used for half precision. (default: + auto) + --bf16_full_eval [BF16_FULL_EVAL] + Whether to use full bfloat16 evaluation instead of + 32-bit. This is an experimental API and it may change. + (default: False) + --fp16_full_eval [FP16_FULL_EVAL] + Whether to use full float16 evaluation instead of + 32-bit (default: False) + --tf32 TF32 Whether to enable tf32 mode, available in Ampere and + newer GPU architectures. This is an experimental API + and it may change. (default: None) + --local_rank LOCAL_RANK + For distributed training: local_rank (default: -1) + --ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl} + The backend to be used for distributed training + (default: None) + --tpu_num_cores TPU_NUM_CORES + TPU: Number of TPU cores (automatically passed by + launcher script) (default: None) + --tpu_metrics_debug [TPU_METRICS_DEBUG] + Deprecated, the use of `--debug tpu_metrics_debug` is + preferred. TPU: Whether to print debug metrics + (default: False) + --debug DEBUG [DEBUG ...] + Whether or not to enable debug mode. Current options: + `underflow_overflow` (Detect underflow and overflow in + activations and weights), `tpu_metrics_debug` (print + debug metrics on TPU). (default: None) + --dataloader_drop_last [DATALOADER_DROP_LAST] + Drop the last incomplete batch if it is not divisible + by the batch size. (default: False) + --eval_steps EVAL_STEPS + Run an evaluation every X steps. Should be an integer + or a float in range `[0,1)`. If smaller than 1, will + be interpreted as ratio of total training steps. + (default: None) + --dataloader_num_workers DATALOADER_NUM_WORKERS + Number of subprocesses to use for data loading + (PyTorch only). 0 means that the data will be loaded + in the main process. (default: 0) + --dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR + Number of batches loaded in advance by each worker. 2 + means there will be a total of 2 * num_workers batches + prefetched across all workers. Default is 2 for + PyTorch < 2.0.0 and otherwise None. (default: None) + --past_index PAST_INDEX + If >=0, uses the corresponding part of the output as + the past state for next step. (default: -1) + --run_name RUN_NAME An optional descriptor for the run. Notably used for + wandb, mlflow and comet logging. (default: None) + --disable_tqdm DISABLE_TQDM + Whether or not to disable the tqdm progress bars. + (default: None) + --remove_unused_columns [REMOVE_UNUSED_COLUMNS] + Remove columns not required by the model when using an + nlp.Dataset. (default: True) + --no_remove_unused_columns + Remove columns not required by the model when using an + nlp.Dataset. (default: False) + --label_names LABEL_NAMES [LABEL_NAMES ...] + The list of keys in your dictionary of inputs that + correspond to the labels. (default: None) + --load_best_model_at_end [LOAD_BEST_MODEL_AT_END] + Whether or not to load the best model found during + training at the end of training. When this option is + enabled, the best checkpoint will always be saved. See + `save_total_limit` for more. (default: False) + --metric_for_best_model METRIC_FOR_BEST_MODEL + The metric to use to compare two different models. + (default: None) + --greater_is_better GREATER_IS_BETTER + Whether the `metric_for_best_model` should be + maximized or not. (default: None) + --ignore_data_skip [IGNORE_DATA_SKIP] + When resuming training, whether or not to skip the + first epochs and batches to get to the same training + data. (default: False) + --fsdp FSDP Whether or not to use PyTorch Fully Sharded Data + Parallel (FSDP) training (in distributed training + only). The base option should be `full_shard`, + `shard_grad_op` or `no_shard` and you can add CPU- + offload to `full_shard` or `shard_grad_op` like this: + full_shard offload` or `shard_grad_op offload`. You + can add auto-wrap to `full_shard` or `shard_grad_op` + with the same syntax: full_shard auto_wrap` or + `shard_grad_op auto_wrap`. (default: ) + --fsdp_min_num_params FSDP_MIN_NUM_PARAMS + This parameter is deprecated. FSDP's minimum number of + parameters for Default Auto Wrapping. (useful only + when `fsdp` field is passed). (default: 0) + --fsdp_config FSDP_CONFIG + Config to be used with FSDP (Pytorch Fully Sharded + Data Parallel). The value is either a fsdp json config + file (e.g., `fsdp_config.json`) or an already loaded + json file as `dict`. (default: None) + --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP + This parameter is deprecated. Transformer layer class + name (case-sensitive) to wrap, e.g, `BertLayer`, + `GPTJBlock`, `T5Block` .... (useful only when `fsdp` + flag is passed). (default: None) + --accelerator_config ACCELERATOR_CONFIG + Config to be used with the internal Accelerator object + initializtion. The value is either a accelerator json + config file (e.g., `accelerator_config.json`) or an + already loaded json file as `dict`. (default: None) + --deepspeed DEEPSPEED + Enable deepspeed and pass the path to deepspeed json + config file (e.g. `ds_config.json`) or an already + loaded json file as a dict (default: None) + --label_smoothing_factor LABEL_SMOOTHING_FACTOR + The label smoothing epsilon to apply (zero means no + label smoothing). (default: 0.0) + --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo} + The optimizer to use. (default: adamw_torch) + --optim_args OPTIM_ARGS + Optional arguments to supply to optimizer. (default: + None) + --adafactor [ADAFACTOR] + Whether or not to replace AdamW by Adafactor. + (default: False) + --group_by_length [GROUP_BY_LENGTH] + Whether or not to group samples of roughly the same + length together when batching. (default: False) + --length_column_name LENGTH_COLUMN_NAME + Column name with precomputed lengths to use when + grouping by length. (default: length) + --report_to REPORT_TO + The list of integrations to report the results and + logs to. (default: None) + --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS + When using distributed training, the value of the flag + `find_unused_parameters` passed to + `DistributedDataParallel`. (default: None) + --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB + When using distributed training, the value of the flag + `bucket_cap_mb` passed to `DistributedDataParallel`. + (default: None) + --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS + When using distributed training, the value of the flag + `broadcast_buffers` passed to + `DistributedDataParallel`. (default: None) + --dataloader_pin_memory [DATALOADER_PIN_MEMORY] + Whether or not to pin memory for DataLoader. (default: + True) + --no_dataloader_pin_memory + Whether or not to pin memory for DataLoader. (default: + False) + --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS] + If True, the data loader will not shut down the worker + processes after a dataset has been consumed once. This + allows to maintain the workers Dataset instances + alive. Can potentially speed up training, but will + increase RAM usage. (default: False) + --skip_memory_metrics [SKIP_MEMORY_METRICS] + Whether or not to skip adding of memory profiler + reports to metrics. (default: True) + --no_skip_memory_metrics + Whether or not to skip adding of memory profiler + reports to metrics. (default: False) + --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP] + Whether or not to use the legacy prediction_loop in + the Trainer. (default: False) + --push_to_hub [PUSH_TO_HUB] + Whether or not to upload the trained model to the + model hub after training. (default: False) + --resume_from_checkpoint RESUME_FROM_CHECKPOINT + The path to a folder with a valid checkpoint for your + model. (default: None) + --hub_model_id HUB_MODEL_ID + The name of the repository to keep in sync with the + local `output_dir`. (default: None) + --hub_strategy {end,every_save,checkpoint,all_checkpoints} + The hub strategy to use when `--push_to_hub` is + activated. (default: every_save) + --hub_token HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --hub_private_repo [HUB_PRIVATE_REPO] + Whether the model repository is private or not. + (default: False) + --hub_always_push [HUB_ALWAYS_PUSH] + Unless `True`, the Trainer will skip pushes if the + previous one wasn't finished yet. (default: False) + --gradient_checkpointing [GRADIENT_CHECKPOINTING] + If True, use gradient checkpointing to save memory at + the expense of slower backward pass. (default: False) + --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS + Gradient checkpointing key word arguments such as + `use_reentrant`. Will be passed to + `torch.utils.checkpoint.checkpoint` through + `model.gradient_checkpointing_enable`. (default: None) + --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS] + Whether or not the inputs will be passed to the + `compute_metrics` function. (default: False) + --eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES] + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: True) + --no_eval_do_concat_batches + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: False) + --fp16_backend {auto,apex,cpu_amp} + Deprecated. Use half_precision_backend instead + (default: auto) + --evaluation_strategy {no,steps,epoch} + Deprecated. Use `eval_strategy` instead (default: + None) + --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID + The name of the repository to which push the + `Trainer`. (default: None) + --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION + The name of the organization in with to which push the + `Trainer`. (default: None) + --push_to_hub_token PUSH_TO_HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --mp_parameters MP_PARAMETERS + Used by the SageMaker launcher to send mp-specific + args. Ignored in Trainer (default: ) + --auto_find_batch_size [AUTO_FIND_BATCH_SIZE] + Whether to automatically decrease the batch size in + half and rerun the training loop again each time a + CUDA Out-of-Memory was reached (default: False) + --full_determinism [FULL_DETERMINISM] + Whether to call enable_full_determinism instead of + set_seed for reproducibility in distributed training. + Important: this will negatively impact the + performance, so only use it for debugging. (default: + False) + --torchdynamo TORCHDYNAMO + This argument is deprecated, use + `--torch_compile_backend` instead. (default: None) + --ray_scope RAY_SCOPE + The scope to use when doing hyperparameter search with + Ray. By default, `"last"` will be used. Ray will then + use the last checkpoint of all trials, compare those, + and select the best one. However, other options are + also available. See the Ray documentation (https://doc + s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun + e.ExperimentAnalysis.get_best_trial) for more options. + (default: last) + --ddp_timeout DDP_TIMEOUT + Overrides the default timeout for distributed training + (value should be given in seconds). (default: 1800) + --torch_compile [TORCH_COMPILE] + If set to `True`, the model will be wrapped in + `torch.compile`. (default: False) + --torch_compile_backend TORCH_COMPILE_BACKEND + Which backend to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --torch_compile_mode TORCH_COMPILE_MODE + Which mode to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --dispatch_batches DISPATCH_BATCHES + Deprecated. Pass {'dispatch_batches':VALUE} to + `accelerator_config`. (default: None) + --split_batches SPLIT_BATCHES + Deprecated. Pass {'split_batches':True} to + `accelerator_config`. (default: None) + --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND] + If set to `True`, the speed metrics will include `tgs` + (tokens per second per device). (default: False) + --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN] + If set to `True`, will track the number of input + tokens seen throughout training. (May be slower in + distributed training) (default: False) + --neftune_noise_alpha NEFTUNE_NOISE_ALPHA + Activates neftune noise embeddings into the model. + NEFTune has been proven to drastically improve model + performances for instrcution fine-tuning. Check out + the original paper here: + https://arxiv.org/abs/2310.05914 and the original code + here: https://github.com/neelsjain/NEFTune. Only + supported for `PreTrainedModel` and `PeftModel` + classes. (default: None) + --optim_target_modules OPTIM_TARGET_MODULES + Target modules for the optimizer defined in the + `optim` argument. Only used for the GaLore optimizer + at the moment. (default: None) + --batch_eval_metrics [BATCH_EVAL_METRICS] + Break eval metrics calculation into batches to save + memory. (default: False) + --eval_on_start [EVAL_ON_START] + Whether to run through the entire `evaluation` step at + the very beginning of training as a sanity check. + (default: False) + --eval_use_gather_object [EVAL_USE_GATHER_OBJECT] + Whether to run recursively gather object in a nested + list/tuple/dictionary of objects from all devices. + (default: False) + --sortish_sampler [SORTISH_SAMPLER] + Whether to use SortishSampler or not. (default: False) + --predict_with_generate [PREDICT_WITH_GENERATE] + Whether to use generate to calculate generative + metrics (ROUGE, BLEU). (default: False) + --generation_max_length GENERATION_MAX_LENGTH + The `max_length` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `max_length` value of the model configuration. + (default: None) + --generation_num_beams GENERATION_NUM_BEAMS + The `num_beams` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `num_beams` value of the model configuration. + (default: None) + --generation_config GENERATION_CONFIG + Model id, file path or url pointing to a + GenerationConfig json file, to use during prediction. + (default: None) + --use_badam [USE_BADAM] + Whether or not to use the BAdam optimizer. (default: + False) + --badam_mode {layer,ratio} + Whether to use layer-wise or ratio-wise BAdam + optimizer. (default: layer) + --badam_start_block BADAM_START_BLOCK + The starting block index for layer-wise BAdam. + (default: None) + --badam_switch_mode {ascending,descending,random,fixed} + the strategy of picking block to update for layer-wise + BAdam. (default: ascending) + --badam_switch_interval BADAM_SWITCH_INTERVAL + Number of steps to update the block for layer-wise + BAdam. Use -1 to disable the block update. (default: + 50) + --badam_update_ratio BADAM_UPDATE_RATIO + The ratio of the update for ratio-wise BAdam. + (default: 0.05) + --badam_mask_mode {adjacent,scatter} + The mode of the mask for BAdam optimizer. `adjacent` + means that the trainable parameters are adjacent to + each other, `scatter` means that trainable parameters + are randomly choosed from the weight. (default: + adjacent) + --badam_verbose BADAM_VERBOSE + The verbosity level of BAdam optimizer. 0 for no + print, 1 for print the block prefix, 2 for print + trainable parameters. (default: 0) + --use_galore [USE_GALORE] + Whether or not to use the gradient low-Rank projection + (GaLore). (default: False) + --galore_target GALORE_TARGET + Name(s) of modules to apply GaLore. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --galore_rank GALORE_RANK + The rank of GaLore gradients. (default: 16) + --galore_update_interval GALORE_UPDATE_INTERVAL + Number of steps to update the GaLore projection. + (default: 200) + --galore_scale GALORE_SCALE + GaLore scaling coefficient. (default: 0.25) + --galore_proj_type {std,reverse_std,right,left,full} + Type of GaLore projection. (default: std) + --galore_layerwise [GALORE_LAYERWISE] + Whether or not to enable layer-wise update to further + save memory. (default: False) + --pref_beta PREF_BETA + The beta parameter in the preference loss. (default: + 0.1) + --pref_ftx PREF_FTX The supervised fine-tuning loss coefficient in DPO + training. (default: 0.0) + --pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo} + The type of DPO loss to use. (default: sigmoid) + --dpo_label_smoothing DPO_LABEL_SMOOTHING + The robust DPO label smoothing parameter in cDPO that + should be between 0 and 0.5. (default: 0.0) + --kto_chosen_weight KTO_CHOSEN_WEIGHT + The weight factor of the desirable losses in KTO + training. (default: 1.0) + --kto_rejected_weight KTO_REJECTED_WEIGHT + The weight factor of the undesirable losses in KTO + training. (default: 1.0) + --simpo_gamma SIMPO_GAMMA + The target reward margin term in SimPO loss. (default: + 0.5) + --ppo_buffer_size PPO_BUFFER_SIZE + The number of mini-batches to make experience buffer + in a PPO optimization step. (default: 1) + --ppo_epochs PPO_EPOCHS + The number of epochs to perform in a PPO optimization + step. (default: 4) + --ppo_score_norm [PPO_SCORE_NORM] + Use score normalization in PPO training. (default: + False) + --ppo_target PPO_TARGET + Target KL value for adaptive KL control in PPO + training. (default: 6.0) + --ppo_whiten_rewards [PPO_WHITEN_REWARDS] + Whiten the rewards before compute advantages in PPO + training. (default: False) + --ref_model REF_MODEL + Path to the reference model used for the PPO or DPO + training. (default: None) + --ref_model_adapters REF_MODEL_ADAPTERS + Path to the adapters of the reference model. (default: + None) + --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reference model. + (default: None) + --reward_model REWARD_MODEL + Path to the reward model used for the PPO training. + (default: None) + --reward_model_adapters REWARD_MODEL_ADAPTERS + Path to the adapters of the reward model. (default: + None) + --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reward model. + (default: None) + --reward_model_type {lora,full,api} + The type of the reward model in PPO training. Lora + model only supports lora training. (default: lora) + --additional_target ADDITIONAL_TARGET + Name(s) of modules apart from LoRA layers to be set as + trainable and saved in the final checkpoint. Use + commas to separate multiple modules. (default: None) + --lora_alpha LORA_ALPHA + The scale factor for LoRA fine-tuning (default: + lora_rank * 2). (default: None) + --lora_dropout LORA_DROPOUT + Dropout rate for the LoRA fine-tuning. (default: 0.0) + --lora_rank LORA_RANK + The intrinsic dimension for LoRA fine-tuning. + (default: 8) + --lora_target LORA_TARGET + Name(s) of target modules to apply LoRA. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --loraplus_lr_ratio LORAPLUS_LR_RATIO + LoRA plus learning rate ratio (lr_B / lr_A). (default: + None) + --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING + LoRA plus learning rate for lora embedding layers. + (default: 1e-06) + --use_rslora [USE_RSLORA] + Whether or not to use the rank stabilization scaling + factor for LoRA layer. (default: False) + --use_dora [USE_DORA] + Whether or not to use the weight-decomposed lora + method (DoRA). (default: False) + --pissa_init [PISSA_INIT] + Whether or not to initialize a PiSSA adapter. + (default: False) + --pissa_iter PISSA_ITER + The number of iteration steps performed by FSVD in + PiSSA. Use -1 to disable it. (default: 16) + --pissa_convert [PISSA_CONVERT] + Whether or not to convert the PiSSA adapter to a + normal LoRA adapter. (default: False) + --create_new_adapter [CREATE_NEW_ADAPTER] + Whether or not to create a new adapter with randomly + initialized weight. (default: False) + --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS + The number of trainable layers for freeze (partial- + parameter) fine-tuning. Positive numbers mean the last + n layers are set as trainable, negative numbers mean + the first n layers are set as trainable. (default: 2) + --freeze_trainable_modules FREEZE_TRAINABLE_MODULES + Name(s) of trainable modules for freeze (partial- + parameter) fine-tuning. Use commas to separate + multiple modules. Use `all` to specify all the + available modules. (default: all) + --freeze_extra_modules FREEZE_EXTRA_MODULES + Name(s) of modules apart from hidden layers to be set + as trainable for freeze (partial-parameter) fine- + tuning. Use commas to separate multiple modules. + (default: None) + --pure_bf16 [PURE_BF16] + Whether or not to train model in purely bf16 precision + (without AMP). (default: False) + --stage {pt,sft,rm,ppo,dpo,kto} + Which stage will be performed in training. (default: + sft) + --finetuning_type {lora,freeze,full} + Which fine-tuning method to use. (default: lora) + --use_llama_pro [USE_LLAMA_PRO] + Whether or not to make only the parameters in the + expanded blocks trainable. (default: False) + --use_adam_mini [USE_ADAM_MINI] + Whether or not to use the Adam-mini optimizer. + (default: False) + --freeze_vision_tower [FREEZE_VISION_TOWER] + Whether ot not to freeze vision tower in MLLM + training. (default: True) + --no_freeze_vision_tower + Whether ot not to freeze vision tower in MLLM + training. (default: False) + --train_mm_proj_only [TRAIN_MM_PROJ_ONLY] + Whether or not to train the multimodal projector for + MLLM only. (default: False) + --compute_accuracy [COMPUTE_ACCURACY] + Whether or not to compute the token-level accuracy at + evaluation. (default: False) + --plot_loss [PLOT_LOSS] + Whether or not to save the training loss curves. + (default: False) + --do_sample [DO_SAMPLE] + Whether or not to use sampling, use greedy decoding + otherwise. (default: True) + --no_do_sample Whether or not to use sampling, use greedy decoding + otherwise. (default: False) + --temperature TEMPERATURE + The value used to modulate the next token + probabilities. (default: 0.95) + --top_p TOP_P The smallest set of most probable tokens with + probabilities that add up to top_p or higher are kept. + (default: 0.7) + --top_k TOP_K The number of highest probability vocabulary tokens to + keep for top-k filtering. (default: 50) + --num_beams NUM_BEAMS + Number of beams for beam search. 1 means no beam + search. (default: 1) + --max_length MAX_LENGTH + The maximum length the generated tokens can have. It + can be overridden by max_new_tokens. (default: 1024) + --max_new_tokens MAX_NEW_TOKENS + The maximum numbers of tokens to generate, ignoring + the number of tokens in the prompt. (default: 1024) + --repetition_penalty REPETITION_PENALTY + The parameter for repetition penalty. 1.0 means no + penalty. (default: 1.0) + --length_penalty LENGTH_PENALTY + Exponential penalty to the length that is used with + beam-based generation. (default: 1.0) + --default_system DEFAULT_SYSTEM + Default system message to use in chat completion. + (default: None) +usage: launcher.py [-h] --model_name_or_path MODEL_NAME_OR_PATH + [--adapter_name_or_path ADAPTER_NAME_OR_PATH] + [--adapter_folder ADAPTER_FOLDER] [--cache_dir CACHE_DIR] + [--use_fast_tokenizer [USE_FAST_TOKENIZER]] + [--no_use_fast_tokenizer] [--resize_vocab [RESIZE_VOCAB]] + [--split_special_tokens [SPLIT_SPECIAL_TOKENS]] + [--new_special_tokens NEW_SPECIAL_TOKENS] + [--model_revision MODEL_REVISION] + [--low_cpu_mem_usage [LOW_CPU_MEM_USAGE]] + [--no_low_cpu_mem_usage] + [--quantization_method {bitsandbytes,hqq,eetq}] + [--quantization_bit QUANTIZATION_BIT] + [--quantization_type {fp4,nf4}] + [--double_quantization [DOUBLE_QUANTIZATION]] + [--no_double_quantization] + [--quantization_device_map {auto}] + [--rope_scaling {linear,dynamic}] + [--flash_attn {auto,disabled,sdpa,fa2}] + [--shift_attn [SHIFT_ATTN]] + [--mixture_of_depths {convert,load}] + [--use_unsloth [USE_UNSLOTH]] + [--visual_inputs [VISUAL_INPUTS]] + [--moe_aux_loss_coef MOE_AUX_LOSS_COEF] + [--disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING]] + [--upcast_layernorm [UPCAST_LAYERNORM]] + [--upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT]] + [--train_from_scratch [TRAIN_FROM_SCRATCH]] + [--infer_backend {huggingface,vllm}] + [--vllm_maxlen VLLM_MAXLEN] [--vllm_gpu_util VLLM_GPU_UTIL] + [--vllm_enforce_eager [VLLM_ENFORCE_EAGER]] + [--vllm_max_lora_rank VLLM_MAX_LORA_RANK] + [--offload_folder OFFLOAD_FOLDER] [--use_cache [USE_CACHE]] + [--no_use_cache] + [--infer_dtype {auto,float16,bfloat16,float32}] + [--hf_hub_token HF_HUB_TOKEN] [--ms_hub_token MS_HUB_TOKEN] + [--export_dir EXPORT_DIR] [--export_size EXPORT_SIZE] + [--export_device {cpu,auto}] + [--export_quantization_bit EXPORT_QUANTIZATION_BIT] + [--export_quantization_dataset EXPORT_QUANTIZATION_DATASET] + [--export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES] + [--export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN] + [--export_legacy_format [EXPORT_LEGACY_FORMAT]] + [--export_hub_model_id EXPORT_HUB_MODEL_ID] + [--print_param_status [PRINT_PARAM_STATUS]] + [--template TEMPLATE] [--dataset DATASET] + [--eval_dataset EVAL_DATASET] [--dataset_dir DATASET_DIR] + [--cutoff_len CUTOFF_LEN] + [--train_on_prompt [TRAIN_ON_PROMPT]] + [--mask_history [MASK_HISTORY]] [--streaming [STREAMING]] + [--buffer_size BUFFER_SIZE] + [--mix_strategy {concat,interleave_under,interleave_over}] + [--interleave_probs INTERLEAVE_PROBS] + [--overwrite_cache [OVERWRITE_CACHE]] + [--preprocessing_num_workers PREPROCESSING_NUM_WORKERS] + [--max_samples MAX_SAMPLES] + [--eval_num_beams EVAL_NUM_BEAMS] + [--ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS]] + [--no_ignore_pad_token_for_loss] [--val_size VAL_SIZE] + [--packing PACKING] [--neat_packing [NEAT_PACKING]] + [--tool_format TOOL_FORMAT] + [--tokenized_path TOKENIZED_PATH] --output_dir OUTPUT_DIR + [--overwrite_output_dir [OVERWRITE_OUTPUT_DIR]] + [--do_train [DO_TRAIN]] [--do_eval [DO_EVAL]] + [--do_predict [DO_PREDICT]] + [--eval_strategy {no,steps,epoch}] + [--prediction_loss_only [PREDICTION_LOSS_ONLY]] + [--per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE] + [--per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE] + [--per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE] + [--per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE] + [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] + [--eval_accumulation_steps EVAL_ACCUMULATION_STEPS] + [--eval_delay EVAL_DELAY] + [--torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS] + [--learning_rate LEARNING_RATE] + [--weight_decay WEIGHT_DECAY] [--adam_beta1 ADAM_BETA1] + [--adam_beta2 ADAM_BETA2] [--adam_epsilon ADAM_EPSILON] + [--max_grad_norm MAX_GRAD_NORM] + [--num_train_epochs NUM_TRAIN_EPOCHS] + [--max_steps MAX_STEPS] + [--lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay}] + [--lr_scheduler_kwargs LR_SCHEDULER_KWARGS] + [--warmup_ratio WARMUP_RATIO] [--warmup_steps WARMUP_STEPS] + [--log_level {detail,debug,info,warning,error,critical,passive}] + [--log_level_replica {detail,debug,info,warning,error,critical,passive}] + [--log_on_each_node [LOG_ON_EACH_NODE]] + [--no_log_on_each_node] [--logging_dir LOGGING_DIR] + [--logging_strategy {no,steps,epoch}] + [--logging_first_step [LOGGING_FIRST_STEP]] + [--logging_steps LOGGING_STEPS] + [--logging_nan_inf_filter [LOGGING_NAN_INF_FILTER]] + [--no_logging_nan_inf_filter] + [--save_strategy {no,steps,epoch}] + [--save_steps SAVE_STEPS] + [--save_total_limit SAVE_TOTAL_LIMIT] + [--save_safetensors [SAVE_SAFETENSORS]] + [--no_save_safetensors] + [--save_on_each_node [SAVE_ON_EACH_NODE]] + [--save_only_model [SAVE_ONLY_MODEL]] + [--restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT]] + [--no_cuda [NO_CUDA]] [--use_cpu [USE_CPU]] + [--use_mps_device [USE_MPS_DEVICE]] [--seed SEED] + [--data_seed DATA_SEED] [--jit_mode_eval [JIT_MODE_EVAL]] + [--use_ipex [USE_IPEX]] [--bf16 [BF16]] [--fp16 [FP16]] + [--fp16_opt_level FP16_OPT_LEVEL] + [--half_precision_backend {auto,apex,cpu_amp}] + [--bf16_full_eval [BF16_FULL_EVAL]] + [--fp16_full_eval [FP16_FULL_EVAL]] [--tf32 TF32] + [--local_rank LOCAL_RANK] + [--ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl}] + [--tpu_num_cores TPU_NUM_CORES] + [--tpu_metrics_debug [TPU_METRICS_DEBUG]] + [--debug DEBUG [DEBUG ...]] + [--dataloader_drop_last [DATALOADER_DROP_LAST]] + [--eval_steps EVAL_STEPS] + [--dataloader_num_workers DATALOADER_NUM_WORKERS] + [--dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR] + [--past_index PAST_INDEX] [--run_name RUN_NAME] + [--disable_tqdm DISABLE_TQDM] + [--remove_unused_columns [REMOVE_UNUSED_COLUMNS]] + [--no_remove_unused_columns] + [--label_names LABEL_NAMES [LABEL_NAMES ...]] + [--load_best_model_at_end [LOAD_BEST_MODEL_AT_END]] + [--metric_for_best_model METRIC_FOR_BEST_MODEL] + [--greater_is_better GREATER_IS_BETTER] + [--ignore_data_skip [IGNORE_DATA_SKIP]] [--fsdp FSDP] + [--fsdp_min_num_params FSDP_MIN_NUM_PARAMS] + [--fsdp_config FSDP_CONFIG] + [--fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP] + [--accelerator_config ACCELERATOR_CONFIG] + [--deepspeed DEEPSPEED] + [--label_smoothing_factor LABEL_SMOOTHING_FACTOR] + [--optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo}] + [--optim_args OPTIM_ARGS] [--adafactor [ADAFACTOR]] + [--group_by_length [GROUP_BY_LENGTH]] + [--length_column_name LENGTH_COLUMN_NAME] + [--report_to REPORT_TO] + [--ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS] + [--ddp_bucket_cap_mb DDP_BUCKET_CAP_MB] + [--ddp_broadcast_buffers DDP_BROADCAST_BUFFERS] + [--dataloader_pin_memory [DATALOADER_PIN_MEMORY]] + [--no_dataloader_pin_memory] + [--dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS]] + [--skip_memory_metrics [SKIP_MEMORY_METRICS]] + [--no_skip_memory_metrics] + [--use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP]] + [--push_to_hub [PUSH_TO_HUB]] + [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] + [--hub_model_id HUB_MODEL_ID] + [--hub_strategy {end,every_save,checkpoint,all_checkpoints}] + [--hub_token HUB_TOKEN] + [--hub_private_repo [HUB_PRIVATE_REPO]] + [--hub_always_push [HUB_ALWAYS_PUSH]] + [--gradient_checkpointing [GRADIENT_CHECKPOINTING]] + [--gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS] + [--include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS]] + [--eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES]] + [--no_eval_do_concat_batches] + [--fp16_backend {auto,apex,cpu_amp}] + [--evaluation_strategy {no,steps,epoch}] + [--push_to_hub_model_id PUSH_TO_HUB_MODEL_ID] + [--push_to_hub_organization PUSH_TO_HUB_ORGANIZATION] + [--push_to_hub_token PUSH_TO_HUB_TOKEN] + [--mp_parameters MP_PARAMETERS] + [--auto_find_batch_size [AUTO_FIND_BATCH_SIZE]] + [--full_determinism [FULL_DETERMINISM]] + [--torchdynamo TORCHDYNAMO] [--ray_scope RAY_SCOPE] + [--ddp_timeout DDP_TIMEOUT] + [--torch_compile [TORCH_COMPILE]] + [--torch_compile_backend TORCH_COMPILE_BACKEND] + [--torch_compile_mode TORCH_COMPILE_MODE] + [--dispatch_batches DISPATCH_BATCHES] + [--split_batches SPLIT_BATCHES] + [--include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND]] + [--include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN]] + [--neftune_noise_alpha NEFTUNE_NOISE_ALPHA] + [--optim_target_modules OPTIM_TARGET_MODULES] + [--batch_eval_metrics [BATCH_EVAL_METRICS]] + [--eval_on_start [EVAL_ON_START]] + [--eval_use_gather_object [EVAL_USE_GATHER_OBJECT]] + [--sortish_sampler [SORTISH_SAMPLER]] + [--predict_with_generate [PREDICT_WITH_GENERATE]] + [--generation_max_length GENERATION_MAX_LENGTH] + [--generation_num_beams GENERATION_NUM_BEAMS] + [--generation_config GENERATION_CONFIG] + [--use_badam [USE_BADAM]] [--badam_mode {layer,ratio}] + [--badam_start_block BADAM_START_BLOCK] + [--badam_switch_mode {ascending,descending,random,fixed}] + [--badam_switch_interval BADAM_SWITCH_INTERVAL] + [--badam_update_ratio BADAM_UPDATE_RATIO] + [--badam_mask_mode {adjacent,scatter}] + [--badam_verbose BADAM_VERBOSE] [--use_galore [USE_GALORE]] + [--galore_target GALORE_TARGET] [--galore_rank GALORE_RANK] + [--galore_update_interval GALORE_UPDATE_INTERVAL] + [--galore_scale GALORE_SCALE] + [--galore_proj_type {std,reverse_std,right,left,full}] + [--galore_layerwise [GALORE_LAYERWISE]] + [--pref_beta PREF_BETA] [--pref_ftx PREF_FTX] + [--pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo}] + [--dpo_label_smoothing DPO_LABEL_SMOOTHING] + [--kto_chosen_weight KTO_CHOSEN_WEIGHT] + [--kto_rejected_weight KTO_REJECTED_WEIGHT] + [--simpo_gamma SIMPO_GAMMA] + [--ppo_buffer_size PPO_BUFFER_SIZE] + [--ppo_epochs PPO_EPOCHS] + [--ppo_score_norm [PPO_SCORE_NORM]] + [--ppo_target PPO_TARGET] + [--ppo_whiten_rewards [PPO_WHITEN_REWARDS]] + [--ref_model REF_MODEL] + [--ref_model_adapters REF_MODEL_ADAPTERS] + [--ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT] + [--reward_model REWARD_MODEL] + [--reward_model_adapters REWARD_MODEL_ADAPTERS] + [--reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT] + [--reward_model_type {lora,full,api}] + [--additional_target ADDITIONAL_TARGET] + [--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT] + [--lora_rank LORA_RANK] [--lora_target LORA_TARGET] + [--loraplus_lr_ratio LORAPLUS_LR_RATIO] + [--loraplus_lr_embedding LORAPLUS_LR_EMBEDDING] + [--use_rslora [USE_RSLORA]] [--use_dora [USE_DORA]] + [--pissa_init [PISSA_INIT]] [--pissa_iter PISSA_ITER] + [--pissa_convert [PISSA_CONVERT]] + [--create_new_adapter [CREATE_NEW_ADAPTER]] + [--freeze_trainable_layers FREEZE_TRAINABLE_LAYERS] + [--freeze_trainable_modules FREEZE_TRAINABLE_MODULES] + [--freeze_extra_modules FREEZE_EXTRA_MODULES] + [--pure_bf16 [PURE_BF16]] [--stage {pt,sft,rm,ppo,dpo,kto}] + [--finetuning_type {lora,freeze,full}] + [--use_llama_pro [USE_LLAMA_PRO]] + [--use_adam_mini [USE_ADAM_MINI]] + [--freeze_vision_tower [FREEZE_VISION_TOWER]] + [--no_freeze_vision_tower] + [--train_mm_proj_only [TRAIN_MM_PROJ_ONLY]] + [--compute_accuracy [COMPUTE_ACCURACY]] + [--plot_loss [PLOT_LOSS]] [--do_sample [DO_SAMPLE]] + [--no_do_sample] [--temperature TEMPERATURE] + [--top_p TOP_P] [--top_k TOP_K] [--num_beams NUM_BEAMS] + [--max_length MAX_LENGTH] [--max_new_tokens MAX_NEW_TOKENS] + [--repetition_penalty REPETITION_PENALTY] + [--length_penalty LENGTH_PENALTY] + [--default_system DEFAULT_SYSTEM] + +optional arguments: + -h, --help show this help message and exit + --model_name_or_path MODEL_NAME_OR_PATH + Path to the model weight or identifier from + huggingface.co/models or modelscope.cn/models. + (default: None) + --adapter_name_or_path ADAPTER_NAME_OR_PATH + Path to the adapter weight or identifier from + huggingface.co/models. Use commas to separate multiple + adapters. (default: None) + --adapter_folder ADAPTER_FOLDER + The folder containing the adapter weights to load. + (default: None) + --cache_dir CACHE_DIR + Where to store the pre-trained models downloaded from + huggingface.co or modelscope.cn. (default: None) + --use_fast_tokenizer [USE_FAST_TOKENIZER] + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: True) + --no_use_fast_tokenizer + Whether or not to use one of the fast tokenizer + (backed by the tokenizers library). (default: False) + --resize_vocab [RESIZE_VOCAB] + Whether or not to resize the tokenizer vocab and the + embedding layers. (default: False) + --split_special_tokens [SPLIT_SPECIAL_TOKENS] + Whether or not the special tokens should be split + during the tokenization process. (default: False) + --new_special_tokens NEW_SPECIAL_TOKENS + Special tokens to be added into the tokenizer. Use + commas to separate multiple tokens. (default: None) + --model_revision MODEL_REVISION + The specific model version to use (can be a branch + name, tag name or commit id). (default: main) + --low_cpu_mem_usage [LOW_CPU_MEM_USAGE] + Whether or not to use memory-efficient model loading. + (default: True) + --no_low_cpu_mem_usage + Whether or not to use memory-efficient model loading. + (default: False) + --quantization_method {bitsandbytes,hqq,eetq} + Quantization method to use for on-the-fly + quantization. (default: bitsandbytes) + --quantization_bit QUANTIZATION_BIT + The number of bits to quantize the model using + bitsandbytes. (default: None) + --quantization_type {fp4,nf4} + Quantization data type to use in int4 training. + (default: nf4) + --double_quantization [DOUBLE_QUANTIZATION] + Whether or not to use double quantization in int4 + training. (default: True) + --no_double_quantization + Whether or not to use double quantization in int4 + training. (default: False) + --quantization_device_map {auto} + Device map used to infer the 4-bit quantized model, + needs bitsandbytes>=0.43.0. (default: None) + --rope_scaling {linear,dynamic} + Which scaling strategy should be adopted for the RoPE + embeddings. (default: None) + --flash_attn {auto,disabled,sdpa,fa2} + Enable FlashAttention for faster training and + inference. (default: auto) + --shift_attn [SHIFT_ATTN] + Enable shift short attention (S^2-Attn) proposed by + LongLoRA. (default: False) + --mixture_of_depths {convert,load} + Convert the model to mixture-of-depths (MoD) or load + the MoD model. (default: None) + --use_unsloth [USE_UNSLOTH] + Whether or not to use unsloth's optimization for the + LoRA training. (default: False) + --visual_inputs [VISUAL_INPUTS] + Whethor or not to use multimodal LLM that accepts + visual inputs. (default: False) + --moe_aux_loss_coef MOE_AUX_LOSS_COEF + Coefficient of the auxiliary router loss in mixture- + of-experts model. (default: None) + --disable_gradient_checkpointing [DISABLE_GRADIENT_CHECKPOINTING] + Whether or not to disable gradient checkpointing. + (default: False) + --upcast_layernorm [UPCAST_LAYERNORM] + Whether or not to upcast the layernorm weights in + fp32. (default: False) + --upcast_lmhead_output [UPCAST_LMHEAD_OUTPUT] + Whether or not to upcast the output of lm_head in + fp32. (default: False) + --train_from_scratch [TRAIN_FROM_SCRATCH] + Whether or not to randomly initialize the model + weights. (default: False) + --infer_backend {huggingface,vllm} + Backend engine used at inference. (default: + huggingface) + --vllm_maxlen VLLM_MAXLEN + Maximum sequence (prompt + response) length of the + vLLM engine. (default: 2048) + --vllm_gpu_util VLLM_GPU_UTIL + The fraction of GPU memory in (0,1) to be used for the + vLLM engine. (default: 0.9) + --vllm_enforce_eager [VLLM_ENFORCE_EAGER] + Whether or not to disable CUDA graph in the vLLM + engine. (default: False) + --vllm_max_lora_rank VLLM_MAX_LORA_RANK + Maximum rank of all LoRAs in the vLLM engine. + (default: 32) + --offload_folder OFFLOAD_FOLDER + Path to offload model weights. (default: offload) + --use_cache [USE_CACHE] + Whether or not to use KV cache in generation. + (default: True) + --no_use_cache Whether or not to use KV cache in generation. + (default: False) + --infer_dtype {auto,float16,bfloat16,float32} + Data type for model weights and activations at + inference. (default: auto) + --hf_hub_token HF_HUB_TOKEN + Auth token to log in with Hugging Face Hub. (default: + None) + --ms_hub_token MS_HUB_TOKEN + Auth token to log in with ModelScope Hub. (default: + None) + --export_dir EXPORT_DIR + Path to the directory to save the exported model. + (default: None) + --export_size EXPORT_SIZE + The file shard size (in GB) of the exported model. + (default: 1) + --export_device {cpu,auto} + The device used in model export, use `auto` to + accelerate exporting. (default: cpu) + --export_quantization_bit EXPORT_QUANTIZATION_BIT + The number of bits to quantize the exported model. + (default: None) + --export_quantization_dataset EXPORT_QUANTIZATION_DATASET + Path to the dataset or dataset name to use in + quantizing the exported model. (default: None) + --export_quantization_nsamples EXPORT_QUANTIZATION_NSAMPLES + The number of samples used for quantization. (default: + 128) + --export_quantization_maxlen EXPORT_QUANTIZATION_MAXLEN + The maximum length of the model inputs used for + quantization. (default: 1024) + --export_legacy_format [EXPORT_LEGACY_FORMAT] + Whether or not to save the `.bin` files instead of + `.safetensors`. (default: False) + --export_hub_model_id EXPORT_HUB_MODEL_ID + The name of the repository if push the model to the + Hugging Face hub. (default: None) + --print_param_status [PRINT_PARAM_STATUS] + For debugging purposes, print the status of the + parameters in the model. (default: False) + --template TEMPLATE Which template to use for constructing prompts in + training and inference. (default: None) + --dataset DATASET The name of dataset(s) to use for training. Use commas + to separate multiple datasets. (default: None) + --eval_dataset EVAL_DATASET + The name of dataset(s) to use for evaluation. Use + commas to separate multiple datasets. (default: None) + --dataset_dir DATASET_DIR + Path to the folder containing the datasets. (default: + data) + --cutoff_len CUTOFF_LEN + The cutoff length of the tokenized inputs in the + dataset. (default: 1024) + --train_on_prompt [TRAIN_ON_PROMPT] + Whether or not to disable the mask on the prompt. + (default: False) + --mask_history [MASK_HISTORY] + Whether or not to mask the history and train on the + last turn only. (default: False) + --streaming [STREAMING] + Enable dataset streaming. (default: False) + --buffer_size BUFFER_SIZE + Size of the buffer to randomly sample examples from in + dataset streaming. (default: 16384) + --mix_strategy {concat,interleave_under,interleave_over} + Strategy to use in dataset mixing (concat/interleave) + (undersampling/oversampling). (default: concat) + --interleave_probs INTERLEAVE_PROBS + Probabilities to sample data from datasets. Use commas + to separate multiple datasets. (default: None) + --overwrite_cache [OVERWRITE_CACHE] + Overwrite the cached training and evaluation sets. + (default: False) + --preprocessing_num_workers PREPROCESSING_NUM_WORKERS + The number of processes to use for the pre-processing. + (default: None) + --max_samples MAX_SAMPLES + For debugging purposes, truncate the number of + examples for each dataset. (default: None) + --eval_num_beams EVAL_NUM_BEAMS + Number of beams to use for evaluation. This argument + will be passed to `model.generate` (default: None) + --ignore_pad_token_for_loss [IGNORE_PAD_TOKEN_FOR_LOSS] + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: True) + --no_ignore_pad_token_for_loss + Whether or not to ignore the tokens corresponding to + the pad label in loss computation. (default: False) + --val_size VAL_SIZE Size of the development set, should be an integer or a + float in range `[0,1)`. (default: 0.0) + --packing PACKING Enable sequences packing in training. Will + automatically enable in pre-training. (default: None) + --neat_packing [NEAT_PACKING] + Enable sequence packing without cross-attention. + (default: False) + --tool_format TOOL_FORMAT + Tool format to use for constructing function calling + examples. (default: None) + --tokenized_path TOKENIZED_PATH + Path to save or load the tokenized datasets. (default: + None) + --output_dir OUTPUT_DIR + The output directory where the model predictions and + checkpoints will be written. (default: None) + --overwrite_output_dir [OVERWRITE_OUTPUT_DIR] + Overwrite the content of the output directory. Use + this to continue training if output_dir points to a + checkpoint directory. (default: False) + --do_train [DO_TRAIN] + Whether to run training. (default: False) + --do_eval [DO_EVAL] Whether to run eval on the dev set. (default: False) + --do_predict [DO_PREDICT] + Whether to run predictions on the test set. (default: + False) + --eval_strategy {no,steps,epoch} + The evaluation strategy to use. (default: no) + --prediction_loss_only [PREDICTION_LOSS_ONLY] + When performing evaluation and predictions, only + returns the loss. (default: False) + --per_device_train_batch_size PER_DEVICE_TRAIN_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for training. + (default: 8) + --per_device_eval_batch_size PER_DEVICE_EVAL_BATCH_SIZE + Batch size per GPU/TPU/MPS/NPU core/CPU for + evaluation. (default: 8) + --per_gpu_train_batch_size PER_GPU_TRAIN_BATCH_SIZE + Deprecated, the use of `--per_device_train_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + training. (default: None) + --per_gpu_eval_batch_size PER_GPU_EVAL_BATCH_SIZE + Deprecated, the use of `--per_device_eval_batch_size` + is preferred. Batch size per GPU/TPU core/CPU for + evaluation. (default: None) + --gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS + Number of updates steps to accumulate before + performing a backward/update pass. (default: 1) + --eval_accumulation_steps EVAL_ACCUMULATION_STEPS + Number of predictions steps to accumulate before + moving the tensors to the CPU. (default: None) + --eval_delay EVAL_DELAY + Number of epochs or steps to wait for before the first + evaluation can be performed, depending on the + eval_strategy. (default: 0) + --torch_empty_cache_steps TORCH_EMPTY_CACHE_STEPS + Number of steps to wait before calling + `torch..empty_cache()`.This can help avoid + CUDA out-of-memory errors by lowering peak VRAM usage + at a cost of about [10{'option_strings': ['-- + torch_empty_cache_steps'], 'dest': + 'torch_empty_cache_steps', 'nargs': None, 'const': + None, 'default': None, 'type': 'int', 'choices': None, + 'required': False, 'help': 'Number of steps to wait + before calling `torch..empty_cache()`.This can + help avoid CUDA out-of-memory errors by lowering peak + VRAM usage at a cost of about [10% slower performance] + (https://github.com/huggingface/transformers/issues/31 + 372).If left unset or set to None, cache will not be + emptied.', 'metavar': None, 'container': + , + 'prog': 'launcher.py'}lower performance](https://githu + b.com/huggingface/transformers/issues/31372).If left + unset or set to None, cache will not be emptied. + (default: None) + --learning_rate LEARNING_RATE + The initial learning rate for AdamW. (default: 5e-05) + --weight_decay WEIGHT_DECAY + Weight decay for AdamW if we apply some. (default: + 0.0) + --adam_beta1 ADAM_BETA1 + Beta1 for AdamW optimizer (default: 0.9) + --adam_beta2 ADAM_BETA2 + Beta2 for AdamW optimizer (default: 0.999) + --adam_epsilon ADAM_EPSILON + Epsilon for AdamW optimizer. (default: 1e-08) + --max_grad_norm MAX_GRAD_NORM + Max gradient norm. (default: 1.0) + --num_train_epochs NUM_TRAIN_EPOCHS + Total number of training epochs to perform. (default: + 3.0) + --max_steps MAX_STEPS + If > 0: set total number of training steps to perform. + Override num_train_epochs. (default: -1) + --lr_scheduler_type {linear,cosine,cosine_with_restarts,polynomial,constant,constant_with_warmup,inverse_sqrt,reduce_lr_on_plateau,cosine_with_min_lr,warmup_stable_decay} + The scheduler type to use. (default: linear) + --lr_scheduler_kwargs LR_SCHEDULER_KWARGS + Extra parameters for the lr_scheduler such as + {'num_cycles': 1} for the cosine with hard restarts. + (default: {}) + --warmup_ratio WARMUP_RATIO + Linear warmup over warmup_ratio fraction of total + steps. (default: 0.0) + --warmup_steps WARMUP_STEPS + Linear warmup over warmup_steps. (default: 0) + --log_level {detail,debug,info,warning,error,critical,passive} + Logger log level to use on the main node. Possible + choices are the log levels as strings: 'debug', + 'info', 'warning', 'error' and 'critical', plus a + 'passive' level which doesn't set anything and lets + the application set the level. Defaults to 'passive'. + (default: passive) + --log_level_replica {detail,debug,info,warning,error,critical,passive} + Logger log level to use on replica nodes. Same choices + and defaults as ``log_level`` (default: warning) + --log_on_each_node [LOG_ON_EACH_NODE] + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: True) + --no_log_on_each_node + When doing a multinode distributed training, whether + to log once per node or just once on the main node. + (default: False) + --logging_dir LOGGING_DIR + Tensorboard log dir. (default: None) + --logging_strategy {no,steps,epoch} + The logging strategy to use. (default: steps) + --logging_first_step [LOGGING_FIRST_STEP] + Log the first global_step (default: False) + --logging_steps LOGGING_STEPS + Log every X updates steps. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be + interpreted as ratio of total training steps. + (default: 500) + --logging_nan_inf_filter [LOGGING_NAN_INF_FILTER] + Filter nan and inf losses for logging. (default: True) + --no_logging_nan_inf_filter + Filter nan and inf losses for logging. (default: + False) + --save_strategy {no,steps,epoch} + The checkpoint save strategy to use. (default: steps) + --save_steps SAVE_STEPS + Save checkpoint every X updates steps. Should be an + integer or a float in range `[0,1)`. If smaller than + 1, will be interpreted as ratio of total training + steps. (default: 500) + --save_total_limit SAVE_TOTAL_LIMIT + If a value is passed, will limit the total amount of + checkpoints. Deletes the older checkpoints in + `output_dir`. When `load_best_model_at_end` is + enabled, the 'best' checkpoint according to + `metric_for_best_model` will always be retained in + addition to the most recent ones. For example, for + `save_total_limit=5` and + `load_best_model_at_end=True`, the four last + checkpoints will always be retained alongside the best + model. When `save_total_limit=1` and + `load_best_model_at_end=True`, it is possible that two + checkpoints are saved: the last one and the best one + (if they are different). Default is unlimited + checkpoints (default: None) + --save_safetensors [SAVE_SAFETENSORS] + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: True) + --no_save_safetensors + Use safetensors saving and loading for state dicts + instead of default torch.load and torch.save. + (default: False) + --save_on_each_node [SAVE_ON_EACH_NODE] + When doing multi-node distributed training, whether to + save models and checkpoints on each node, or only on + the main one (default: False) + --save_only_model [SAVE_ONLY_MODEL] + When checkpointing, whether to only save the model, or + also the optimizer, scheduler & rng state.Note that + when this is true, you won't be able to resume + training from checkpoint.This enables you to save + storage by not storing the optimizer, scheduler & rng + state.You can only load the model using + from_pretrained with this option set to True. + (default: False) + --restore_callback_states_from_checkpoint [RESTORE_CALLBACK_STATES_FROM_CHECKPOINT] + Whether to restore the callback states from the + checkpoint. If `True`, will override callbacks passed + to the `Trainer` if they exist in the checkpoint. + (default: False) + --no_cuda [NO_CUDA] This argument is deprecated. It will be removed in + version 5.0 of 🤗 Transformers. (default: False) + --use_cpu [USE_CPU] Whether or not to use cpu. If set to False, we will + use cuda/tpu/mps/npu device if available. (default: + False) + --use_mps_device [USE_MPS_DEVICE] + This argument is deprecated. `mps` device will be used + if available similar to `cuda` device. It will be + removed in version 5.0 of 🤗 Transformers (default: + False) + --seed SEED Random seed that will be set at the beginning of + training. (default: 42) + --data_seed DATA_SEED + Random seed to be used with data samplers. (default: + None) + --jit_mode_eval [JIT_MODE_EVAL] + Whether or not to use PyTorch jit trace for inference + (default: False) + --use_ipex [USE_IPEX] + Use Intel extension for PyTorch when it is available, + installation: 'https://github.com/intel/intel- + extension-for-pytorch' (default: False) + --bf16 [BF16] Whether to use bf16 (mixed) precision instead of + 32-bit. Requires Ampere or higher NVIDIA architecture + or using CPU (use_cpu) or Ascend NPU. This is an + experimental API and it may change. (default: False) + --fp16 [FP16] Whether to use fp16 (mixed) precision instead of + 32-bit (default: False) + --fp16_opt_level FP16_OPT_LEVEL + For fp16: Apex AMP optimization level selected in + ['O0', 'O1', 'O2', and 'O3']. See details at + https://nvidia.github.io/apex/amp.html (default: O1) + --half_precision_backend {auto,apex,cpu_amp} + The backend to be used for half precision. (default: + auto) + --bf16_full_eval [BF16_FULL_EVAL] + Whether to use full bfloat16 evaluation instead of + 32-bit. This is an experimental API and it may change. + (default: False) + --fp16_full_eval [FP16_FULL_EVAL] + Whether to use full float16 evaluation instead of + 32-bit (default: False) + --tf32 TF32 Whether to enable tf32 mode, available in Ampere and + newer GPU architectures. This is an experimental API + and it may change. (default: None) + --local_rank LOCAL_RANK + For distributed training: local_rank (default: -1) + --ddp_backend {nccl,gloo,mpi,ccl,hccl,cncl} + The backend to be used for distributed training + (default: None) + --tpu_num_cores TPU_NUM_CORES + TPU: Number of TPU cores (automatically passed by + launcher script) (default: None) + --tpu_metrics_debug [TPU_METRICS_DEBUG] + Deprecated, the use of `--debug tpu_metrics_debug` is + preferred. TPU: Whether to print debug metrics + (default: False) + --debug DEBUG [DEBUG ...] + Whether or not to enable debug mode. Current options: + `underflow_overflow` (Detect underflow and overflow in + activations and weights), `tpu_metrics_debug` (print + debug metrics on TPU). (default: None) + --dataloader_drop_last [DATALOADER_DROP_LAST] + Drop the last incomplete batch if it is not divisible + by the batch size. (default: False) + --eval_steps EVAL_STEPS + Run an evaluation every X steps. Should be an integer + or a float in range `[0,1)`. If smaller than 1, will + be interpreted as ratio of total training steps. + (default: None) + --dataloader_num_workers DATALOADER_NUM_WORKERS + Number of subprocesses to use for data loading + (PyTorch only). 0 means that the data will be loaded + in the main process. (default: 0) + --dataloader_prefetch_factor DATALOADER_PREFETCH_FACTOR + Number of batches loaded in advance by each worker. 2 + means there will be a total of 2 * num_workers batches + prefetched across all workers. Default is 2 for + PyTorch < 2.0.0 and otherwise None. (default: None) + --past_index PAST_INDEX + If >=0, uses the corresponding part of the output as + the past state for next step. (default: -1) + --run_name RUN_NAME An optional descriptor for the run. Notably used for + wandb, mlflow and comet logging. (default: None) + --disable_tqdm DISABLE_TQDM + Whether or not to disable the tqdm progress bars. + (default: None) + --remove_unused_columns [REMOVE_UNUSED_COLUMNS] + Remove columns not required by the model when using an + nlp.Dataset. (default: True) + --no_remove_unused_columns + Remove columns not required by the model when using an + nlp.Dataset. (default: False) + --label_names LABEL_NAMES [LABEL_NAMES ...] + The list of keys in your dictionary of inputs that + correspond to the labels. (default: None) + --load_best_model_at_end [LOAD_BEST_MODEL_AT_END] + Whether or not to load the best model found during + training at the end of training. When this option is + enabled, the best checkpoint will always be saved. See + `save_total_limit` for more. (default: False) + --metric_for_best_model METRIC_FOR_BEST_MODEL + The metric to use to compare two different models. + (default: None) + --greater_is_better GREATER_IS_BETTER + Whether the `metric_for_best_model` should be + maximized or not. (default: None) + --ignore_data_skip [IGNORE_DATA_SKIP] + When resuming training, whether or not to skip the + first epochs and batches to get to the same training + data. (default: False) + --fsdp FSDP Whether or not to use PyTorch Fully Sharded Data + Parallel (FSDP) training (in distributed training + only). The base option should be `full_shard`, + `shard_grad_op` or `no_shard` and you can add CPU- + offload to `full_shard` or `shard_grad_op` like this: + full_shard offload` or `shard_grad_op offload`. You + can add auto-wrap to `full_shard` or `shard_grad_op` + with the same syntax: full_shard auto_wrap` or + `shard_grad_op auto_wrap`. (default: ) + --fsdp_min_num_params FSDP_MIN_NUM_PARAMS + This parameter is deprecated. FSDP's minimum number of + parameters for Default Auto Wrapping. (useful only + when `fsdp` field is passed). (default: 0) + --fsdp_config FSDP_CONFIG + Config to be used with FSDP (Pytorch Fully Sharded + Data Parallel). The value is either a fsdp json config + file (e.g., `fsdp_config.json`) or an already loaded + json file as `dict`. (default: None) + --fsdp_transformer_layer_cls_to_wrap FSDP_TRANSFORMER_LAYER_CLS_TO_WRAP + This parameter is deprecated. Transformer layer class + name (case-sensitive) to wrap, e.g, `BertLayer`, + `GPTJBlock`, `T5Block` .... (useful only when `fsdp` + flag is passed). (default: None) + --accelerator_config ACCELERATOR_CONFIG + Config to be used with the internal Accelerator object + initializtion. The value is either a accelerator json + config file (e.g., `accelerator_config.json`) or an + already loaded json file as `dict`. (default: None) + --deepspeed DEEPSPEED + Enable deepspeed and pass the path to deepspeed json + config file (e.g. `ds_config.json`) or an already + loaded json file as a dict (default: None) + --label_smoothing_factor LABEL_SMOOTHING_FACTOR + The label smoothing epsilon to apply (zero means no + label smoothing). (default: 0.0) + --optim {adamw_hf,adamw_torch,adamw_torch_fused,adamw_torch_xla,adamw_torch_npu_fused,adamw_apex_fused,adafactor,adamw_anyprecision,sgd,adagrad,adamw_bnb_8bit,adamw_8bit,lion_8bit,lion_32bit,paged_adamw_32bit,paged_adamw_8bit,paged_lion_32bit,paged_lion_8bit,rmsprop,rmsprop_bnb,rmsprop_bnb_8bit,rmsprop_bnb_32bit,galore_adamw,galore_adamw_8bit,galore_adafactor,galore_adamw_layerwise,galore_adamw_8bit_layerwise,galore_adafactor_layerwise,lomo,adalomo} + The optimizer to use. (default: adamw_torch) + --optim_args OPTIM_ARGS + Optional arguments to supply to optimizer. (default: + None) + --adafactor [ADAFACTOR] + Whether or not to replace AdamW by Adafactor. + (default: False) + --group_by_length [GROUP_BY_LENGTH] + Whether or not to group samples of roughly the same + length together when batching. (default: False) + --length_column_name LENGTH_COLUMN_NAME + Column name with precomputed lengths to use when + grouping by length. (default: length) + --report_to REPORT_TO + The list of integrations to report the results and + logs to. (default: None) + --ddp_find_unused_parameters DDP_FIND_UNUSED_PARAMETERS + When using distributed training, the value of the flag + `find_unused_parameters` passed to + `DistributedDataParallel`. (default: None) + --ddp_bucket_cap_mb DDP_BUCKET_CAP_MB + When using distributed training, the value of the flag + `bucket_cap_mb` passed to `DistributedDataParallel`. + (default: None) + --ddp_broadcast_buffers DDP_BROADCAST_BUFFERS + When using distributed training, the value of the flag + `broadcast_buffers` passed to + `DistributedDataParallel`. (default: None) + --dataloader_pin_memory [DATALOADER_PIN_MEMORY] + Whether or not to pin memory for DataLoader. (default: + True) + --no_dataloader_pin_memory + Whether or not to pin memory for DataLoader. (default: + False) + --dataloader_persistent_workers [DATALOADER_PERSISTENT_WORKERS] + If True, the data loader will not shut down the worker + processes after a dataset has been consumed once. This + allows to maintain the workers Dataset instances + alive. Can potentially speed up training, but will + increase RAM usage. (default: False) + --skip_memory_metrics [SKIP_MEMORY_METRICS] + Whether or not to skip adding of memory profiler + reports to metrics. (default: True) + --no_skip_memory_metrics + Whether or not to skip adding of memory profiler + reports to metrics. (default: False) + --use_legacy_prediction_loop [USE_LEGACY_PREDICTION_LOOP] + Whether or not to use the legacy prediction_loop in + the Trainer. (default: False) + --push_to_hub [PUSH_TO_HUB] + Whether or not to upload the trained model to the + model hub after training. (default: False) + --resume_from_checkpoint RESUME_FROM_CHECKPOINT + The path to a folder with a valid checkpoint for your + model. (default: None) + --hub_model_id HUB_MODEL_ID + The name of the repository to keep in sync with the + local `output_dir`. (default: None) + --hub_strategy {end,every_save,checkpoint,all_checkpoints} + The hub strategy to use when `--push_to_hub` is + activated. (default: every_save) + --hub_token HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --hub_private_repo [HUB_PRIVATE_REPO] + Whether the model repository is private or not. + (default: False) + --hub_always_push [HUB_ALWAYS_PUSH] + Unless `True`, the Trainer will skip pushes if the + previous one wasn't finished yet. (default: False) + --gradient_checkpointing [GRADIENT_CHECKPOINTING] + If True, use gradient checkpointing to save memory at + the expense of slower backward pass. (default: False) + --gradient_checkpointing_kwargs GRADIENT_CHECKPOINTING_KWARGS + Gradient checkpointing key word arguments such as + `use_reentrant`. Will be passed to + `torch.utils.checkpoint.checkpoint` through + `model.gradient_checkpointing_enable`. (default: None) + --include_inputs_for_metrics [INCLUDE_INPUTS_FOR_METRICS] + Whether or not the inputs will be passed to the + `compute_metrics` function. (default: False) + --eval_do_concat_batches [EVAL_DO_CONCAT_BATCHES] + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: True) + --no_eval_do_concat_batches + Whether to recursively concat + inputs/losses/labels/predictions across batches. If + `False`, will instead store them as lists, with each + batch kept separate. (default: False) + --fp16_backend {auto,apex,cpu_amp} + Deprecated. Use half_precision_backend instead + (default: auto) + --evaluation_strategy {no,steps,epoch} + Deprecated. Use `eval_strategy` instead (default: + None) + --push_to_hub_model_id PUSH_TO_HUB_MODEL_ID + The name of the repository to which push the + `Trainer`. (default: None) + --push_to_hub_organization PUSH_TO_HUB_ORGANIZATION + The name of the organization in with to which push the + `Trainer`. (default: None) + --push_to_hub_token PUSH_TO_HUB_TOKEN + The token to use to push to the Model Hub. (default: + None) + --mp_parameters MP_PARAMETERS + Used by the SageMaker launcher to send mp-specific + args. Ignored in Trainer (default: ) + --auto_find_batch_size [AUTO_FIND_BATCH_SIZE] + Whether to automatically decrease the batch size in + half and rerun the training loop again each time a + CUDA Out-of-Memory was reached (default: False) + --full_determinism [FULL_DETERMINISM] + Whether to call enable_full_determinism instead of + set_seed for reproducibility in distributed training. + Important: this will negatively impact the + performance, so only use it for debugging. (default: + False) + --torchdynamo TORCHDYNAMO + This argument is deprecated, use + `--torch_compile_backend` instead. (default: None) + --ray_scope RAY_SCOPE + The scope to use when doing hyperparameter search with + Ray. By default, `"last"` will be used. Ray will then + use the last checkpoint of all trials, compare those, + and select the best one. However, other options are + also available. See the Ray documentation (https://doc + s.ray.io/en/latest/tune/api_docs/analysis.html#ray.tun + e.ExperimentAnalysis.get_best_trial) for more options. + (default: last) + --ddp_timeout DDP_TIMEOUT + Overrides the default timeout for distributed training + (value should be given in seconds). (default: 1800) + --torch_compile [TORCH_COMPILE] + If set to `True`, the model will be wrapped in + `torch.compile`. (default: False) + --torch_compile_backend TORCH_COMPILE_BACKEND + Which backend to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --torch_compile_mode TORCH_COMPILE_MODE + Which mode to use with `torch.compile`, passing one + will trigger a model compilation. (default: None) + --dispatch_batches DISPATCH_BATCHES + Deprecated. Pass {'dispatch_batches':VALUE} to + `accelerator_config`. (default: None) + --split_batches SPLIT_BATCHES + Deprecated. Pass {'split_batches':True} to + `accelerator_config`. (default: None) + --include_tokens_per_second [INCLUDE_TOKENS_PER_SECOND] + If set to `True`, the speed metrics will include `tgs` + (tokens per second per device). (default: False) + --include_num_input_tokens_seen [INCLUDE_NUM_INPUT_TOKENS_SEEN] + If set to `True`, will track the number of input + tokens seen throughout training. (May be slower in + distributed training) (default: False) + --neftune_noise_alpha NEFTUNE_NOISE_ALPHA + Activates neftune noise embeddings into the model. + NEFTune has been proven to drastically improve model + performances for instrcution fine-tuning. Check out + the original paper here: + https://arxiv.org/abs/2310.05914 and the original code + here: https://github.com/neelsjain/NEFTune. Only + supported for `PreTrainedModel` and `PeftModel` + classes. (default: None) + --optim_target_modules OPTIM_TARGET_MODULES + Target modules for the optimizer defined in the + `optim` argument. Only used for the GaLore optimizer + at the moment. (default: None) + --batch_eval_metrics [BATCH_EVAL_METRICS] + Break eval metrics calculation into batches to save + memory. (default: False) + --eval_on_start [EVAL_ON_START] + Whether to run through the entire `evaluation` step at + the very beginning of training as a sanity check. + (default: False) + --eval_use_gather_object [EVAL_USE_GATHER_OBJECT] + Whether to run recursively gather object in a nested + list/tuple/dictionary of objects from all devices. + (default: False) + --sortish_sampler [SORTISH_SAMPLER] + Whether to use SortishSampler or not. (default: False) + --predict_with_generate [PREDICT_WITH_GENERATE] + Whether to use generate to calculate generative + metrics (ROUGE, BLEU). (default: False) + --generation_max_length GENERATION_MAX_LENGTH + The `max_length` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `max_length` value of the model configuration. + (default: None) + --generation_num_beams GENERATION_NUM_BEAMS + The `num_beams` to use on each evaluation loop when + `predict_with_generate=True`. Will default to the + `num_beams` value of the model configuration. + (default: None) + --generation_config GENERATION_CONFIG + Model id, file path or url pointing to a + GenerationConfig json file, to use during prediction. + (default: None) + --use_badam [USE_BADAM] + Whether or not to use the BAdam optimizer. (default: + False) + --badam_mode {layer,ratio} + Whether to use layer-wise or ratio-wise BAdam + optimizer. (default: layer) + --badam_start_block BADAM_START_BLOCK + The starting block index for layer-wise BAdam. + (default: None) + --badam_switch_mode {ascending,descending,random,fixed} + the strategy of picking block to update for layer-wise + BAdam. (default: ascending) + --badam_switch_interval BADAM_SWITCH_INTERVAL + Number of steps to update the block for layer-wise + BAdam. Use -1 to disable the block update. (default: + 50) + --badam_update_ratio BADAM_UPDATE_RATIO + The ratio of the update for ratio-wise BAdam. + (default: 0.05) + --badam_mask_mode {adjacent,scatter} + The mode of the mask for BAdam optimizer. `adjacent` + means that the trainable parameters are adjacent to + each other, `scatter` means that trainable parameters + are randomly choosed from the weight. (default: + adjacent) + --badam_verbose BADAM_VERBOSE + The verbosity level of BAdam optimizer. 0 for no + print, 1 for print the block prefix, 2 for print + trainable parameters. (default: 0) + --use_galore [USE_GALORE] + Whether or not to use the gradient low-Rank projection + (GaLore). (default: False) + --galore_target GALORE_TARGET + Name(s) of modules to apply GaLore. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --galore_rank GALORE_RANK + The rank of GaLore gradients. (default: 16) + --galore_update_interval GALORE_UPDATE_INTERVAL + Number of steps to update the GaLore projection. + (default: 200) + --galore_scale GALORE_SCALE + GaLore scaling coefficient. (default: 0.25) + --galore_proj_type {std,reverse_std,right,left,full} + Type of GaLore projection. (default: std) + --galore_layerwise [GALORE_LAYERWISE] + Whether or not to enable layer-wise update to further + save memory. (default: False) + --pref_beta PREF_BETA + The beta parameter in the preference loss. (default: + 0.1) + --pref_ftx PREF_FTX The supervised fine-tuning loss coefficient in DPO + training. (default: 0.0) + --pref_loss {sigmoid,hinge,ipo,kto_pair,orpo,simpo} + The type of DPO loss to use. (default: sigmoid) + --dpo_label_smoothing DPO_LABEL_SMOOTHING + The robust DPO label smoothing parameter in cDPO that + should be between 0 and 0.5. (default: 0.0) + --kto_chosen_weight KTO_CHOSEN_WEIGHT + The weight factor of the desirable losses in KTO + training. (default: 1.0) + --kto_rejected_weight KTO_REJECTED_WEIGHT + The weight factor of the undesirable losses in KTO + training. (default: 1.0) + --simpo_gamma SIMPO_GAMMA + The target reward margin term in SimPO loss. (default: + 0.5) + --ppo_buffer_size PPO_BUFFER_SIZE + The number of mini-batches to make experience buffer + in a PPO optimization step. (default: 1) + --ppo_epochs PPO_EPOCHS + The number of epochs to perform in a PPO optimization + step. (default: 4) + --ppo_score_norm [PPO_SCORE_NORM] + Use score normalization in PPO training. (default: + False) + --ppo_target PPO_TARGET + Target KL value for adaptive KL control in PPO + training. (default: 6.0) + --ppo_whiten_rewards [PPO_WHITEN_REWARDS] + Whiten the rewards before compute advantages in PPO + training. (default: False) + --ref_model REF_MODEL + Path to the reference model used for the PPO or DPO + training. (default: None) + --ref_model_adapters REF_MODEL_ADAPTERS + Path to the adapters of the reference model. (default: + None) + --ref_model_quantization_bit REF_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reference model. + (default: None) + --reward_model REWARD_MODEL + Path to the reward model used for the PPO training. + (default: None) + --reward_model_adapters REWARD_MODEL_ADAPTERS + Path to the adapters of the reward model. (default: + None) + --reward_model_quantization_bit REWARD_MODEL_QUANTIZATION_BIT + The number of bits to quantize the reward model. + (default: None) + --reward_model_type {lora,full,api} + The type of the reward model in PPO training. Lora + model only supports lora training. (default: lora) + --additional_target ADDITIONAL_TARGET + Name(s) of modules apart from LoRA layers to be set as + trainable and saved in the final checkpoint. Use + commas to separate multiple modules. (default: None) + --lora_alpha LORA_ALPHA + The scale factor for LoRA fine-tuning (default: + lora_rank * 2). (default: None) + --lora_dropout LORA_DROPOUT + Dropout rate for the LoRA fine-tuning. (default: 0.0) + --lora_rank LORA_RANK + The intrinsic dimension for LoRA fine-tuning. + (default: 8) + --lora_target LORA_TARGET + Name(s) of target modules to apply LoRA. Use commas to + separate multiple modules. Use `all` to specify all + the linear modules. (default: all) + --loraplus_lr_ratio LORAPLUS_LR_RATIO + LoRA plus learning rate ratio (lr_B / lr_A). (default: + None) + --loraplus_lr_embedding LORAPLUS_LR_EMBEDDING + LoRA plus learning rate for lora embedding layers. + (default: 1e-06) + --use_rslora [USE_RSLORA] + Whether or not to use the rank stabilization scaling + factor for LoRA layer. (default: False) + --use_dora [USE_DORA] + Whether or not to use the weight-decomposed lora + method (DoRA). (default: False) + --pissa_init [PISSA_INIT] + Whether or not to initialize a PiSSA adapter. + (default: False) + --pissa_iter PISSA_ITER + The number of iteration steps performed by FSVD in + PiSSA. Use -1 to disable it. (default: 16) + --pissa_convert [PISSA_CONVERT] + Whether or not to convert the PiSSA adapter to a + normal LoRA adapter. (default: False) + --create_new_adapter [CREATE_NEW_ADAPTER] + Whether or not to create a new adapter with randomly + initialized weight. (default: False) + --freeze_trainable_layers FREEZE_TRAINABLE_LAYERS + The number of trainable layers for freeze (partial- + parameter) fine-tuning. Positive numbers mean the last + n layers are set as trainable, negative numbers mean + the first n layers are set as trainable. (default: 2) + --freeze_trainable_modules FREEZE_TRAINABLE_MODULES + Name(s) of trainable modules for freeze (partial- + parameter) fine-tuning. Use commas to separate + multiple modules. Use `all` to specify all the + available modules. (default: all) + --freeze_extra_modules FREEZE_EXTRA_MODULES + Name(s) of modules apart from hidden layers to be set + as trainable for freeze (partial-parameter) fine- + tuning. Use commas to separate multiple modules. + (default: None) + --pure_bf16 [PURE_BF16] + Whether or not to train model in purely bf16 precision + (without AMP). (default: False) + --stage {pt,sft,rm,ppo,dpo,kto} + Which stage will be performed in training. (default: + sft) + --finetuning_type {lora,freeze,full} + Which fine-tuning method to use. (default: lora) + --use_llama_pro [USE_LLAMA_PRO] + Whether or not to make only the parameters in the + expanded blocks trainable. (default: False) + --use_adam_mini [USE_ADAM_MINI] + Whether or not to use the Adam-mini optimizer. + (default: False) + --freeze_vision_tower [FREEZE_VISION_TOWER] + Whether ot not to freeze vision tower in MLLM + training. (default: True) + --no_freeze_vision_tower + Whether ot not to freeze vision tower in MLLM + training. (default: False) + --train_mm_proj_only [TRAIN_MM_PROJ_ONLY] + Whether or not to train the multimodal projector for + MLLM only. (default: False) + --compute_accuracy [COMPUTE_ACCURACY] + Whether or not to compute the token-level accuracy at + evaluation. (default: False) + --plot_loss [PLOT_LOSS] + Whether or not to save the training loss curves. + (default: False) + --do_sample [DO_SAMPLE] + Whether or not to use sampling, use greedy decoding + otherwise. (default: True) + --no_do_sample Whether or not to use sampling, use greedy decoding + otherwise. (default: False) + --temperature TEMPERATURE + The value used to modulate the next token + probabilities. (default: 0.95) + --top_p TOP_P The smallest set of most probable tokens with + probabilities that add up to top_p or higher are kept. + (default: 0.7) + --top_k TOP_K The number of highest probability vocabulary tokens to + keep for top-k filtering. (default: 50) + --num_beams NUM_BEAMS + Number of beams for beam search. 1 means no beam + search. (default: 1) + --max_length MAX_LENGTH + The maximum length the generated tokens can have. It + can be overridden by max_new_tokens. (default: 1024) + --max_new_tokens MAX_NEW_TOKENS + The maximum numbers of tokens to generate, ignoring + the number of tokens in the prompt. (default: 1024) + --repetition_penalty REPETITION_PENALTY + The parameter for repetition penalty. 1.0 means no + penalty. (default: 1.0) + --length_penalty LENGTH_PENALTY + Exponential penalty to the length that is used with + beam-based generation. (default: 1.0) + --default_system DEFAULT_SYSTEM + Default system message to use in chat completion. + (default: None)