tiny fix

2024-06-27 20:14:48 +08:00 · 2024-06-27 20:14:48 +08:00 · e44a4f07f0
parent 33ef6f4ec2
commit e44a4f07f0
7 changed files with 28 additions and 26 deletions
--- a/README.md
+++ b/README.md
@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \
    --build-arg INSTALL_BNB=false \
    --build-arg INSTALL_VLLM=false \
    --build-arg INSTALL_DEEPSPEED=false \
-    --build-arg INSTALL_FLASH_ATTN=false \
+    --build-arg INSTALL_FLASHATTN=false \
    --build-arg PIP_INDEX=https://pypi.org/simple \
    -t llamafactory:latest .

--- a/README_zh.md
+++ b/README_zh.md
@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \
    --build-arg INSTALL_BNB=false \
    --build-arg INSTALL_VLLM=false \
    --build-arg INSTALL_DEEPSPEED=false \
-    --build-arg INSTALL_FLASH_ATTN=false \
+    --build-arg INSTALL_FLASHATTN=false \
    --build-arg PIP_INDEX=https://pypi.org/simple \
    -t llamafactory:latest .

--- a/docker/docker-cuda/Dockerfile
+++ b/docker/docker-cuda/Dockerfile
@ -2,11 +2,14 @@
 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html
 FROM nvcr.io/nvidia/pytorch:24.02-py3

+# Define environments
+ENV MAX_JOBS=4
+
 # Define installation arguments
 ARG INSTALL_BNB=false
 ARG INSTALL_VLLM=false
 ARG INSTALL_DEEPSPEED=false
-ARG INSTALL_FLASH_ATTN=false
+ARG INSTALL_FLASHATTN=false
 ARG PIP_INDEX=https://pypi.org/simple

 # Set the working directory
@ -14,34 +17,33 @@ WORKDIR /app

 # Install the requirements
 COPY requirements.txt /app
-RUN pip config set global.index-url $PIP_INDEX
-RUN pip config set global.extra-index-url $PIP_INDEX
-RUN python -m pip install --upgrade pip
-RUN python -m pip install -r requirements.txt
+RUN pip config set global.index-url "$PIP_INDEX" && \
+    pip config set global.extra-index-url "$PIP_INDEX" && \
+    python -m pip install --upgrade pip && \
+    python -m pip install -r requirements.txt
+
+# Uninstall transformer-engine and flash-attn; reinstall flash-attn (with a fresh ninja) only when INSTALL_FLASHATTN=true
+RUN pip uninstall -y transformer-engine flash-attn && \
+    if [ "$INSTALL_FLASHATTN" = "true" ]; then \
+        pip uninstall -y ninja && pip install ninja && \
+        pip install --no-cache-dir flash-attn --no-build-isolation; \
+    fi

 # Copy the rest of the application into the image
 COPY . /app

 # Install the LLaMA Factory
 RUN EXTRA_PACKAGES="metrics"; \
-    if [ "$INSTALL_BNB" = "true" ]; then \
+    if [ "$INSTALL_BNB" = "true" ]; then \
        EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \
    fi; \
-    if [ "$INSTALL_VLLM" = "true" ]; then \
+    if [ "$INSTALL_VLLM" = "true" ]; then \
        EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \
    fi; \
-    if [ "$INSTALL_DEEPSPEED" = "true" ]; then \
+    if [ "$INSTALL_DEEPSPEED" = "true" ]; then \
        EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
    fi; \
-    pip install -e .[$EXTRA_PACKAGES] && \
-    pip uninstall -y transformer-engine flash-attn
-
-# Rebuild flash-attn
-RUN if [ "$INSTALL_FLASH_ATTN" = "true" ]; then \
-        ninja --version || \
-        (pip uninstall -y ninja && pip install ninja) && \
-        MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation \
-    fi;
+    pip install -e ".[$EXTRA_PACKAGES]"

 # Set up volumes
 VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]
--- a/docker/docker-cuda/docker-compose.yml
+++ b/docker/docker-cuda/docker-compose.yml
@ -7,7 +7,7 @@ services:
        INSTALL_BNB: false
        INSTALL_VLLM: false
        INSTALL_DEEPSPEED: false
-        INSTALL_FLASH_ATTN: false
+        INSTALL_FLASHATTN: false
        PIP_INDEX: https://pypi.org/simple
    container_name: llamafactory
    volumes:
--- a/docker/docker-npu/Dockerfile
+++ b/docker/docker-npu/Dockerfile
@ -2,6 +2,7 @@
 # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags
 FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04

+# Define environments
 ENV DEBIAN_FRONTEND=noninteractive

 # Define installation arguments
@ -27,8 +28,7 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \
    if [ "$INSTALL_DEEPSPEED" == "true" ]; then \
        EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
    fi; \
-    pip install -e ".[$EXTRA_PACKAGES]" && \
-    pip uninstall -y transformer-engine flash-attn
+    pip install -e ".[$EXTRA_PACKAGES]"

 # Set up volumes
 VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]
--- a/src/llamafactory/cli.py
+++ b/src/llamafactory/cli.py
@ -91,7 +91,7 @@ def main():
            master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
            master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999)))
            logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port))
-            subproc = subprocess.run(
+            process = subprocess.run(
                (
                    "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} "
                    "--master_addr {master_addr} --master_port {master_port} {file_name} {args}"
@ -106,7 +106,7 @@ def main():
                ),
                shell=True,
            )
-            sys.exit(subproc.returncode)
+            sys.exit(process.returncode)
        else:
            run_exp()
    elif command == Command.WEBDEMO:
--- a/src/llamafactory/hparams/parser.py
+++ b/src/llamafactory/hparams/parser.py
@ -199,8 +199,8 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
        if not is_torch_bf16_gpu_available():
            raise ValueError("This device does not support `pure_bf16`.")

-        if training_args.deepspeed:
-            raise ValueError("`pure_bf16` is incompatible with DeepSpeed.")
+        if is_deepspeed_zero3_enabled():
+            raise ValueError("`pure_bf16` is incompatible with DeepSpeed ZeRO-3.")

        if training_args.fp16 or training_args.bf16:
            raise ValueError("Turn off mixed precision training when using `pure_bf16`.")