tiny fix
This commit is contained in:
parent
33ef6f4ec2
commit
e44a4f07f0
|
@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \
|
|||
--build-arg INSTALL_BNB=false \
|
||||
--build-arg INSTALL_VLLM=false \
|
||||
--build-arg INSTALL_DEEPSPEED=false \
|
||||
--build-arg INSTALL_FLASH_ATTN=false \
|
||||
--build-arg INSTALL_FLASHATTN=false \
|
||||
--build-arg PIP_INDEX=https://pypi.org/simple \
|
||||
-t llamafactory:latest .
|
||||
|
||||
|
|
|
@ -444,7 +444,7 @@ docker build -f ./docker/docker-cuda/Dockerfile \
|
|||
--build-arg INSTALL_BNB=false \
|
||||
--build-arg INSTALL_VLLM=false \
|
||||
--build-arg INSTALL_DEEPSPEED=false \
|
||||
--build-arg INSTALL_FLASH_ATTN=false \
|
||||
--build-arg INSTALL_FLASHATTN=false \
|
||||
--build-arg PIP_INDEX=https://pypi.org/simple \
|
||||
-t llamafactory:latest .
|
||||
|
||||
|
|
|
@ -2,11 +2,14 @@
|
|||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-02.html
|
||||
FROM nvcr.io/nvidia/pytorch:24.02-py3
|
||||
|
||||
# Define environments
|
||||
ENV MAX_JOBS=4
|
||||
|
||||
# Define installation arguments
|
||||
ARG INSTALL_BNB=false
|
||||
ARG INSTALL_VLLM=false
|
||||
ARG INSTALL_DEEPSPEED=false
|
||||
ARG INSTALL_FLASH_ATTN=false
|
||||
ARG INSTALL_FLASHATTN=false
|
||||
ARG PIP_INDEX=https://pypi.org/simple
|
||||
|
||||
# Set the working directory
|
||||
|
@ -14,34 +17,33 @@ WORKDIR /app
|
|||
|
||||
# Install the requirements
|
||||
COPY requirements.txt /app
|
||||
RUN pip config set global.index-url $PIP_INDEX
|
||||
RUN pip config set global.extra-index-url $PIP_INDEX
|
||||
RUN python -m pip install --upgrade pip
|
||||
RUN python -m pip install -r requirements.txt
|
||||
RUN pip config set global.index-url "$PIP_INDEX" && \
|
||||
pip config set global.extra-index-url "$PIP_INDEX" && \
|
||||
python -m pip install --upgrade pip && \
|
||||
python -m pip install -r requirements.txt
|
||||
|
||||
# Rebuild flash attention
|
||||
# Uninstall transformer-engine and flash-attn, then rebuild flash-attn from
# source only when the INSTALL_FLASHATTN build argument is "true".
# NOTE: the last command inside the `if` body must be terminated with `;`
# before `fi`. Docker joins the continued lines into a single shell command,
# so without the terminator `fi` would be passed to `pip install` as a package
# name and the `if` would never be closed, failing the build unconditionally.
RUN pip uninstall -y transformer-engine flash-attn && \
    if [ "$INSTALL_FLASHATTN" == "true" ]; then \
        pip uninstall -y ninja && pip install ninja && \
        pip install --no-cache-dir flash-attn --no-build-isolation; \
    fi
|
||||
|
||||
# Copy the rest of the application into the image
|
||||
COPY . /app
|
||||
|
||||
# Install the LLaMA Factory
|
||||
RUN EXTRA_PACKAGES="metrics"; \
|
||||
if [ "$INSTALL_BNB" = "true" ]; then \
|
||||
if [ "$INSTALL_BNB" == "true" ]; then \
|
||||
EXTRA_PACKAGES="${EXTRA_PACKAGES},bitsandbytes"; \
|
||||
fi; \
|
||||
if [ "$INSTALL_VLLM" = "true" ]; then \
|
||||
if [ "$INSTALL_VLLM" == "true" ]; then \
|
||||
EXTRA_PACKAGES="${EXTRA_PACKAGES},vllm"; \
|
||||
fi; \
|
||||
if [ "$INSTALL_DEEPSPEED" = "true" ]; then \
|
||||
if [ "$INSTALL_DEEPSPEED" == "true" ]; then \
|
||||
EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
|
||||
fi; \
|
||||
pip install -e .[$EXTRA_PACKAGES] && \
|
||||
pip uninstall -y transformer-engine flash-attn
|
||||
|
||||
# Rebuild flash-attn
|
||||
RUN if [ "$INSTALL_FLASH_ATTN" = "true" ]; then \
|
||||
ninja --version || \
|
||||
(pip uninstall -y ninja && pip install ninja) && \
|
||||
MAX_JOBS=4 pip install --no-cache-dir flash-attn --no-build-isolation \
|
||||
fi;
|
||||
pip install -e ".[$EXTRA_PACKAGES]"
|
||||
|
||||
# Set up volumes
|
||||
VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]
|
||||
|
|
|
@ -7,7 +7,7 @@ services:
|
|||
INSTALL_BNB: false
|
||||
INSTALL_VLLM: false
|
||||
INSTALL_DEEPSPEED: false
|
||||
INSTALL_FLASH_ATTN: false
|
||||
INSTALL_FLASHATTN: false
|
||||
PIP_INDEX: https://pypi.org/simple
|
||||
container_name: llamafactory
|
||||
volumes:
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
# More versions can be found at https://hub.docker.com/r/cosdt/cann/tags
|
||||
FROM cosdt/cann:8.0.rc1-910b-ubuntu22.04
|
||||
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Define installation arguments
|
||||
|
@ -27,8 +28,7 @@ RUN EXTRA_PACKAGES="torch-npu,metrics"; \
|
|||
if [ "$INSTALL_DEEPSPEED" == "true" ]; then \
|
||||
EXTRA_PACKAGES="${EXTRA_PACKAGES},deepspeed"; \
|
||||
fi; \
|
||||
pip install -e ".[$EXTRA_PACKAGES]" && \
|
||||
pip uninstall -y transformer-engine flash-attn
|
||||
pip install -e ".[$EXTRA_PACKAGES]"
|
||||
|
||||
# Set up volumes
|
||||
VOLUME [ "/root/.cache/huggingface", "/root/.cache/modelscope", "/app/data", "/app/output" ]
|
||||
|
|
|
@ -91,7 +91,7 @@ def main():
|
|||
master_addr = os.environ.get("MASTER_ADDR", "127.0.0.1")
|
||||
master_port = os.environ.get("MASTER_PORT", str(random.randint(20001, 29999)))
|
||||
logger.info("Initializing distributed tasks at: {}:{}".format(master_addr, master_port))
|
||||
subproc = subprocess.run(
|
||||
process = subprocess.run(
|
||||
(
|
||||
"torchrun --nnodes {nnodes} --node_rank {node_rank} --nproc_per_node {nproc_per_node} "
|
||||
"--master_addr {master_addr} --master_port {master_port} {file_name} {args}"
|
||||
|
@ -106,7 +106,7 @@ def main():
|
|||
),
|
||||
shell=True,
|
||||
)
|
||||
sys.exit(subproc.returncode)
|
||||
sys.exit(process.returncode)
|
||||
else:
|
||||
run_exp()
|
||||
elif command == Command.WEBDEMO:
|
||||
|
|
|
@ -199,8 +199,8 @@ def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS:
|
|||
if not is_torch_bf16_gpu_available():
|
||||
raise ValueError("This device does not support `pure_bf16`.")
|
||||
|
||||
if training_args.deepspeed:
|
||||
raise ValueError("`pure_bf16` is incompatible with DeepSpeed.")
|
||||
if is_deepspeed_zero3_enabled():
|
||||
raise ValueError("`pure_bf16` is incompatible with DeepSpeed ZeRO-3.")
|
||||
|
||||
if training_args.fp16 or training_args.bf16:
|
||||
raise ValueError("Turn off mixed precision training when using `pure_bf16`.")
|
||||
|
|
Loading…
Reference in New Issue