This commit is contained in:
hiyouga 2024-06-27 20:14:48 +08:00
parent 33ef6f4ec2
commit e44a4f07f0
7 changed files with 28 additions and 26 deletions

View File

@@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \
--build-arg INSTALL_BNB=false \ --build-arg INSTALL_BNB=false \
--build-arg INSTALL_VLLM=false \ --build-arg INSTALL_VLLM=false \
--build-arg INSTALL_DEEPSPEED=false \ --build-arg INSTALL_DEEPSPEED=false \
--build-arg INSTALL_FLASH_ATTN=false \ --build-arg INSTALL_FLASHATTN=false \
--build-arg PIP_INDEX=https://pypi.org/simple \ --build-arg PIP_INDEX=https://pypi.org/simple \
-t llamafactory:latest . -t llamafactory:latest .

View File

@@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \
--build-arg INSTALL_BNB=false \ --build-arg INSTALL_BNB=false \
--build-arg INSTALL_VLLM=false \ --build-arg INSTALL_VLLM=false \
--build-arg INSTALL_DEEPSPEED=false \ --build-arg INSTALL_DEEPSPEED=false \
--build-arg INSTALL_FLASH_ATTN=false \ --build-arg INSTALL_FLASHATTN=false \
--build-arg PIP_INDEX=https://pypi.org/simple \ --build-arg PIP_INDEX=https://pypi.org/simple \
-t llamafactory:latest . -t llamafactory:latest .

View File

@@ -2,11 +2,14 @@
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html
FROM nvcr.io/nvidia/pytorch:24.02-py3 FROM nvcr.io/nvidia/pytorch:24.02-py3
# Define environments
ENV MAX_JOBS=4
# Define installation arguments # Define installation arguments
ARG INSTALL_BNB=false ARG INSTALL_BNB=false
ARG INSTALL_VLLM=false ARG INSTALL_VLLM=false
ARG INSTALL_DEEPSPEED=false ARG INSTALL_DEEPSPEED=false
ARG INSTALL_FLASH_ATTN=false ARG INSTALL_FLASHATTN=false
ARG PIP_INDEX=https://pypi.org/simple ARG PIP_INDEX=https://pypi.org/simple
# Set the working directory # Set the working directory
@@ -14,34 +17,33 @@ WORKDIR /app
# Install the requirements # Install the requirements
COPY requirements.txt /app COPY requirements.txt /app
RUN pip config set global.index-url $PIP_INDEX RUN pip config set global.index-url "$PIP_INDEX" && \
RUN pip config set global.extra-index-url $PIP_INDEX pip config set global.extra-index-url "$PIP_INDEX" && \
RUN python -m pip install --upgrade pip python -m pip install --upgrade pip && \
RUN python -m pip install -r requirements.txt python -m pip install -r requirements.txt
# Rebuild flash attention
RUN pip uninstall -y transformer-engine flash-attn && \
if [ "$INSTALL_FLASHATTN" == "true" ]; then \
pip uninstall -y ninja && pip install ninja && \
pip install --no-cache-dir flash-attn --no-build-isolation; \
fi;
# Copy the rest of the application into the image # Copy the rest of the application into the image
COPY . /app COPY . /app
# Install the LLaMA Factory # Install the LLaMA Factory
RUN EXTRA_PACKAGES="metrics"; \ RUN EXTRA_PACKAGES="metrics"; \
if [ "$INSTALL_BNB" = "true" ]; then \ if [ "$INSTALL_BNB" == "true" ]; then \
EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \ EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \
fi; \ fi; \
if [ "$INSTALL_VLLM" = "true" ]; then \ if [ "$INSTALL_VLLM" == "true" ]; then \
EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \ EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \
fi; \ fi; \
if [ "$INSTALL_DEEPSPEED" = "true" ]; then \ if [ "$INSTALL_DEEPSPEED" == "true" ]; then \
EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
fi; \ fi; \
pip install -e .[$EXTRA_PACKAGES] && \ pip install -e ".[$EXTRA_PACKAGES]"
pip uninstall -y transformer-engine flash-attn
# Rebuild flash-attn
RUN if [ "$INSTALL_FLASH_ATTN" = "true" ]; then \
ninja --version || \
(pip uninstall -y ninja && pip install ninja) && \
MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation \
fi;
# Set up volumes # Set up volumes
VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]

View File

@@ -7,7 +7,7 @@ services:
INSTALL_BNB: false INSTALL_BNB: false
INSTALL_VLLM: false INSTALL_VLLM: false
INSTALL_DEEPSPEED: false INSTALL_DEEPSPEED: false
INSTALL_FLASH_ATTN: false INSTALL_FLASHATTN: false
PIP_INDEX: https://pypi.org/simple PIP_INDEX: https://pypi.org/simple
container_name: llamafactory container_name: llamafactory
volumes: volumes:

View File

@@ -2,6 +2,7 @@
# More versions can be found at https://hub.docker.com/r/cosdt/cann/tags # More versions can be found at https://hub.docker.com/r/cosdt/cann/tags
FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04 FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04
# Define environments
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
# Define installation arguments # Define installation arguments
@@ -27,8 +28,7 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \
if [ "$INSTALL_DEEPSPEED" == "true" ]; then \ if [ "$INSTALL_DEEPSPEED" == "true" ]; then \
EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \ EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
fi; \ fi; \
pip install -e ".[$EXTRA_PACKAGES]" && \ pip install -e ".[$EXTRA_PACKAGES]"
pip uninstall -y transformer-engine flash-attn
# Set up volumes # Set up volumes
VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ] VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]

View File

@@ -91,7 +91,7 @@ def main():
master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1") master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999))) master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999)))
logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port)) logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port))
subproc = subprocess.run( process = subprocess.run(
( (
"torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} " "torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} "
"--master_addr {master_addr} --master_port {master_port} {file_name} {args}" "--master_addr {master_addr} --master_port {master_port} {file_name} {args}"
@@ -106,7 +106,7 @@ def main():
), ),
shell=True, shell=True,
) )
sys.exit(subproc.returncode) sys.exit(process.returncode)
else: else:
run_exp() run_exp()
elif command == Command.WEBDEMO: elif command == Command.WEBDEMO:

View File

@@ -199,8 +199,8 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
if not is_torch_bf16_gpu_available(): if not is_torch_bf16_gpu_available():
raise ValueError("This device does not support `pure_bf16`.") raise ValueError("This device does not support `pure_bf16`.")
if training_args.deepspeed: if is_deepspeed_zero3_enabled():
raise ValueError("`pure_bf16` is incompatible with DeepSpeed.") raise ValueError("`pure_bf16` is incompatible with DeepSpeed ZeRO-3.")
if training_args.fp16 or training_args.bf16: if training_args.fp16 or training_args.bf16:
raise ValueError("Turn off mixed precision training when using `pure_bf16`.") raise ValueError("Turn off mixed precision training when using `pure_bf16`.")