tiny fix
parent 33ef6f4ec2
commit e44a4f07f0
@@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \
     --build-arg INSTALL_BNB=false \
     --build-arg INSTALL_VLLM=false \
     --build-arg INSTALL_DEEPSPEED=false \
-    --build-arg INSTALL_FLASH_ATTN=false \
+    --build-arg INSTALL_FLASHATTN=false \
     --build-arg PIP_INDEX=https://pypi.org/simple \
     -t llamafactory:latest .

@@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \
     --build-arg INSTALL_BNB=false \
     --build-arg INSTALL_VLLM=false \
     --build-arg INSTALL_DEEPSPEED=false \
-    --build-arg INSTALL_FLASH_ATTN=false \
+    --build-arg INSTALL_FLASHATTN=false \
     --build-arg PIP_INDEX=https://pypi.org/simple \
     -t llamafactory:latest .

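The two README hunks above rename the documented build argument from INSTALL_FLASH_ATTN to INSTALL_FLASHATTN. As a usage sketch (the true values below are illustrative, not part of the commit), the renamed flag is passed like any other build argument:

    docker build -f ./docker/docker-cuda/Dockerfile \
        --build-arg INSTALL_FLASHATTN=true \
        --build-arg INSTALL_DEEPSPEED=true \
        -t llamafactory:latest .
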
@@ -2,11 +2,14 @@
 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html
 FROM nvcr.io/nvidia/pytorch:24.02-py3
 
+# Define environments
+ENV MAX_JOBS=4
+
 # Define installation arguments
 ARG INSTALL_BNB=false
 ARG INSTALL_VLLM=false
 ARG INSTALL_DEEPSPEED=false
-ARG INSTALL_FLASH_ATTN=false
+ARG INSTALL_FLASHATTN=false
 ARG PIP_INDEX=https://pypi.org/simple
 
 # Set the working directory

@@ -14,34 +17,33 @@ WORKDIR /app
 
 # Install the requirements
 COPY requirements.txt /app
-RUN pip config set global.index-url $PIP_INDEX
-RUN pip config set global.extra-index-url $PIP_INDEX
-RUN python -m pip install --upgrade pip
-RUN python -m pip install -r requirements.txt
+RUN pip config set global.index-url "$PIP_INDEX" && \
+    pip config set global.extra-index-url "$PIP_INDEX" && \
+    python -m pip install --upgrade pip && \
+    python -m pip install -r requirements.txt
 
+# Rebuild flash attention
+RUN pip uninstall -y transformer-engine flash-attn && \
+    if [ "$INSTALL_FLASHATTN" == "true" ]; then \
+        pip uninstall -y ninja && pip install ninja && \
+        pip install --no-cache-dir flash-attn --no-build-isolation \
+    fi;
+
 # Copy the rest of the application into the image
 COPY . /app
 
 # Install the LLaMA Factory
 RUN EXTRA_PACKAGES="metrics"; \
-    if [ "$INSTALL_BNB" = "true" ]; then \
+    if [ "$INSTALL_BNB" == "true" ]; then \
         EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \
     fi; \
-    if [ "$INSTALL_VLLM" = "true" ]; then \
+    if [ "$INSTALL_VLLM" == "true" ]; then \
         EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \
     fi; \
-    if [ "$INSTALL_DEEPSPEED" = "true" ]; then \
+    if [ "$INSTALL_DEEPSPEED" == "true" ]; then \
         EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
     fi; \
-    pip install -e .[$EXTRA_PACKAGES] && \
-    pip uninstall -y transformer-engine flash-attn
-
-# Rebuild flash-attn
-RUN if [ "$INSTALL_FLASH_ATTN" = "true" ]; then \
-        ninja --version || \
-        (pip uninstall -y ninja && pip install ninja) && \
-        MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation \
-    fi;
+    pip install -e ".[$EXTRA_PACKAGES]"
 
 # Set up volumes
 VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]

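A note on the quoting change in the hunk above: `pip install -e .[$EXTRA_PACKAGES]` becomes `pip install -e ".[$EXTRA_PACKAGES]"`. Unquoted, `.[...]` is a shell glob (a dot followed by a character class), so the shell can expand it before pip ever sees it. A minimal bash demonstration, with `.m` as a made-up file name:

    touch .m                      # create a file the glob can match
    echo .[metrics]               # prints ".m" -- the glob matched the file
    echo ".[metrics]"             # prints ".[metrics]" -- quoting keeps it literal
    pip install -e ".[metrics]"   # the quoted form always reaches pip intact
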
@@ -7,7 +7,7 @@ services:
         INSTALL_BNB: false
         INSTALL_VLLM: false
         INSTALL_DEEPSPEED: false
-        INSTALL_FLASH_ATTN: false
+        INSTALL_FLASHATTN: false
         PIP_INDEX: https://pypi.org/simple
     container_name: llamafactory
     volumes:

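If the renamed argument needs to be flipped without editing the compose file, an override at build time should work; treat the exact invocation as a sketch rather than documented usage:

    docker compose build --build-arg INSTALL_FLASHATTN=true
    docker compose up -d
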
@@ -2,6 +2,7 @@
 # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags
 FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04
 
+# Define environments
 ENV DEBIAN_FRONTEND=noninteractive
 
 # Define installation arguments

@@ -27,8 +28,7 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \
     if [ "$INSTALL_DEEPSPEED" == "true" ]; then \
         EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
     fi; \
-    pip install -e ".[$EXTRA_PACKAGES]" && \
-    pip uninstall -y transformer-engine flash-attn
+    pip install -e ".[$EXTRA_PACKAGES]"
 
 # Set up volumes
 VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]

@@ -91,7 +91,7 @@ def main():
             master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
             master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999)))
             logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port))
-            subproc = subprocess.run(
+            process = subprocess.run(
                 (
                     "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} "
                     "--master_addr {master_addr} --master_port {master_port} {file_name} {args}"

@@ -106,7 +106,7 @@ def main():
                 ),
                 shell=True,
             )
-            sys.exit(subproc.returncode)
+            sys.exit(process.returncode)
         else:
             run_exp()
     elif command == Command.WEBDEMO:

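For reference, the format string in the hunk above is expanded and handed to the shell as one torchrun invocation. An illustrative expansion with made-up values (node counts, port, script name, and trailing arguments are all filled in by the launcher at runtime):

    torchrun --nnodes 1 --node_rank 0 --nproc_per_node 8 \
        --master_addr 127.0.0.1 --master_port 25000 \
        train.py --stage sft
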
@@ -199,8 +199,8 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
         if not is_torch_bf16_gpu_available():
             raise ValueError("This device does not support `pure_bf16`.")
 
-        if training_args.deepspeed:
-            raise ValueError("`pure_bf16` is incompatible with DeepSpeed.")
+        if is_deepspeed_zero3_enabled():
+            raise ValueError("`pure_bf16` is incompatible with DeepSpeed ZeRO-3.")
 
         if training_args.fp16 or training_args.bf16:
             raise ValueError("Turn off mixed precision training when using `pure_bf16`.")
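The replacement check is deliberately narrower: is_deepspeed_zero3_enabled() (in recent transformers versions, importable from transformers.integrations) reports true only when the active DeepSpeed config requests ZeRO stage 3, so `pure_bf16` with a stage-2 config no longer raises. The stage lives in standard DeepSpeed JSON; the file names below are illustrative:

    cat > ds_z2.json <<'EOF'
    { "zero_optimization": { "stage": 2 } }
    EOF

    cat > ds_z3.json <<'EOF'
    { "zero_optimization": { "stage": 3 } }
    EOF
    # ds_z2.json now passes the pure_bf16 check; ds_z3.json still raises.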