diff --git a/README.md b/README.md
index 386177bb..beff2fb5 100644
--- a/README.md
+++ b/README.md
@@ -537,9 +537,10 @@ docker exec -it llamafactory bash
 Details about volume
-- hf_cache: Utilize Hugging Face cache on the host machine. Reassignable if a cache already exists in a different directory.
-- data: Place datasets on this dir of the host machine so that they can be selected on LLaMA Board GUI.
-- output: Set export dir to this location so that the merged result can be accessed directly on the host machine.
+- `hf_cache`: Utilize the Hugging Face cache on the host machine. Reassignable if a cache already exists in a different directory.
+- `ms_cache`: Similar to the Hugging Face cache, but for ModelScope users.
+- `data`: Place datasets in this directory on the host machine so that they can be selected in the LLaMA Board GUI.
+- `output`: Set the export directory to this location so that the merged result can be accessed directly on the host machine.
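Editor's note: these four volumes correspond to bind mounts declared in the project's `docker-compose.yml`. As a minimal sketch of an equivalent `docker run` invocation (the host paths, container-side paths, and image/container names below are assumptions based on common Hugging Face and ModelScope cache defaults, not taken from this diff):

```bash
# Hypothetical docker run mirroring the four volumes above; verify every
# path and name against docker-compose.yml before relying on it.
docker run -it --gpus all \
  -v ./hf_cache:/root/.cache/huggingface \
  -v ./ms_cache:/root/.cache/modelscope \
  -v ./data:/app/data \
  -v ./output:/app/output \
  --name llamafactory llamafactory:latest bash
```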
diff --git a/README_zh.md b/README_zh.md
index 812b7b28..41e4acfe 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -537,9 +537,10 @@ docker exec -it llamafactory bash
 数据卷详情
-- hf_cache:使用宿主机的 Hugging Face 缓存文件夹,允许更改为新的目录。
-- data:宿主机中存放数据集的文件夹路径。
-- output:将导出目录设置为该路径后,即可在宿主机中访问导出后的模型。
+- `hf_cache`:使用宿主机的 Hugging Face 缓存文件夹,允许更改为新的目录。
+- `ms_cache`:类似 Hugging Face 缓存文件夹,为 ModelScope 用户提供。
+- `data`:宿主机中存放数据集的文件夹路径。
+- `output`:将导出目录设置为该路径后,即可在宿主机中访问导出后的模型。
diff --git a/examples/extras/pissa/init.sh b/examples/extras/pissa/init.sh
new file mode 100644
index 00000000..11e1e357
--- /dev/null
+++ b/examples/extras/pissa/init.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+python scripts/pissa_init.py \
+    --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
+    --output_dir models/llama3-8b-pissa
diff --git a/scripts/cal_lr.py b/scripts/cal_lr.py
index bc629a70..a9b27b37 100644
--- a/scripts/cal_lr.py
+++ b/scripts/cal_lr.py
@@ -43,7 +43,7 @@ def calculate_lr(
     dataset_dir: str = "data",
     template: str = "default",
     cutoff_len: int = 1024,  # i.e. maximum input length during training
-    is_mistral: bool = False,  # mistral model uses a smaller learning rate,
+    is_mistral_or_gemma: bool = False,  # mistral and gemma models use a smaller learning rate
     packing: bool = False,
 ):
     r"""
@@ -84,7 +84,7 @@ def calculate_lr(
     valid_ratio = valid_tokens / total_tokens
     batch_valid_len = batch_max_len * valid_ratio
     lr = BASE_LR * math.sqrt(batch_valid_len / BASE_BS)  # lr ~ sqrt(batch_size)
-    lr = lr / 6.0 if is_mistral else lr
+    lr = lr / 6.0 if is_mistral_or_gemma else lr
     print(
         "Optimal learning rate is {:.2e} for valid ratio% {:.2f} and effective batch size {:.2f}".format(
             lr, valid_ratio * 100, batch_valid_len
diff --git a/scripts/llama_pro.py b/scripts/llama_pro.py
index 17bf6fc2..bd05feb2 100644
--- a/scripts/llama_pro.py
+++ b/scripts/llama_pro.py
@@ -19,7 +19,7 @@
 import json
 import os
 from collections import OrderedDict
-from typing import TYPE_CHECKING, Optional
+from typing import TYPE_CHECKING
 
 import fire
 import torch
@@ -47,8 +47,8 @@ def block_expansion(
     model_name_or_path: str,
     output_dir: str,
     num_expand: int,
-    shard_size: Optional[str] = "2GB",
-    save_safetensors: Optional[bool] = False,
+    shard_size: str = "2GB",
+    save_safetensors: bool = True,
 ):
     r"""
     Performs block expansion for LLaMA, Mistral, Qwen1.5 or Yi models.
diff --git a/scripts/llamafy_baichuan2.py b/scripts/llamafy_baichuan2.py
index 19284f5f..95673859 100644
--- a/scripts/llamafy_baichuan2.py
+++ b/scripts/llamafy_baichuan2.py
@@ -16,7 +16,7 @@
 import json
 import os
 from collections import OrderedDict
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 
 import fire
 import torch
@@ -86,7 +86,10 @@ def save_config(input_dir: str, output_dir: str):
 
 
 def llamafy_baichuan2(
-    input_dir: str, output_dir: str, shard_size: Optional[str] = "2GB", save_safetensors: Optional[bool] = False
+    input_dir: str,
+    output_dir: str,
+    shard_size: str = "2GB",
+    save_safetensors: bool = True,
 ):
     r"""
     Converts the Baichuan2-7B model in the same format as LLaMA2-7B.
diff --git a/scripts/llamafy_qwen.py b/scripts/llamafy_qwen.py
index e5b59483..785a2129 100644
--- a/scripts/llamafy_qwen.py
+++ b/scripts/llamafy_qwen.py
@@ -16,7 +16,7 @@
 import json
 import os
 from collections import OrderedDict
-from typing import Any, Dict, Optional
+from typing import Any, Dict
 
 import fire
 import torch
@@ -139,7 +139,10 @@ def save_config(input_dir: str, output_dir: str, torch_dtype: str):
 
 
 def llamafy_qwen(
-    input_dir: str, output_dir: str, shard_size: Optional[str] = "2GB", save_safetensors: Optional[bool] = False
+    input_dir: str,
+    output_dir: str,
+    shard_size: str = "2GB",
+    save_safetensors: bool = False,
 ):
     r"""
     Converts the Qwen models in the same format as LLaMA2.
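Editor's note on the `cal_lr.py` rename: the script estimates a learning rate that scales with the square root of the effective batch size, `lr = BASE_LR * math.sqrt(batch_valid_len / BASE_BS)`, and the renamed `is_mistral_or_gemma` flag divides that estimate by 6 for both model families instead of Mistral alone. A hedged usage sketch follows; `--model_name_or_path` and `--batch_size` are assumed to be among the script's fire-exposed arguments that these hunks do not show, and the `llama_pro.py` values are placeholders:

```bash
# Estimate a learning rate for a Gemma model; the renamed flag now covers
# both Mistral and Gemma (the estimate is divided by 6).
python scripts/cal_lr.py \
  --model_name_or_path google/gemma-7b \
  --batch_size 512 \
  --cutoff_len 1024 \
  --is_mistral_or_gemma true

# block_expansion now writes safetensors by default (save_safetensors=True);
# pass --save_safetensors false to keep PyTorch .bin shards instead.
# num_expand=8 is an arbitrary placeholder.
python scripts/llama_pro.py \
  --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
  --output_dir models/llama3-8b-pro \
  --num_expand 8
```

`llamafy_baichuan2.py` likewise flips `save_safetensors` to `True` by default, while `llamafy_qwen.py` keeps `False`.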
diff --git a/scripts/pissa_init.py b/scripts/pissa_init.py
index 78b3fde0..f285e326 100644
--- a/scripts/pissa_init.py
+++ b/scripts/pissa_init.py
@@ -31,7 +31,7 @@ if TYPE_CHECKING:
 def quantize_pissa(
     model_name_or_path: str,
     output_dir: str,
-    pissa_iter: int = 4,
+    pissa_iter: int = 16,
     lora_alpha: int = None,
     lora_rank: int = 16,
     lora_dropout: float = 0,
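Editor's note: the `pissa_iter` bump from 4 to 16 raises the number of iterations used by PiSSA's initialization (presumably the iteration count of its fast-SVD step; the hunk itself only shows the default). To pin the behavior regardless of future default changes, pass the flag explicitly. A sketch built from the `quantize_pissa` signature above and the paths in the new `init.sh`:

```bash
# Run PiSSA initialization with the iteration count pinned explicitly;
# all flags appear in quantize_pissa's signature or in init.sh above.
python scripts/pissa_init.py \
  --model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
  --output_dir models/llama3-8b-pissa \
  --pissa_iter 16 \
  --lora_rank 16 \
  --lora_dropout 0
```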