Skip to content

Commit

Permalink
Initial refactoring into multistage build with separate stage for each component
Browse files Browse the repository at this point in the history
  • Loading branch information
mawong-amd committed May 22, 2024
1 parent d4db2f9 commit 34174f8
Showing 1 changed file with 147 additions and 133 deletions.
280 changes: 147 additions & 133 deletions Dockerfile.rocm
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
# default base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu20.04_py3.9_pytorch_2.1.2"

FROM $BASE_IMAGE
ARG COMMON_WORKDIR=/app
ARG BUILD_HIPBLASLT="1"
ARG BUILD_RCCL="1"
ARG BUILD_FA="1"
ARG BUILD_CUPY="0"
ARG BUILD_TRITON="1"

# -----------------------
# vLLM base image
FROM $BASE_IMAGE AS base
USER root

# Import BASE_IMAGE arg from pre-FROM
ARG BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE"

ARG COMMON_WORKDIR
# Used as ARCHes for all components
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
RUN echo "PYTORCH_ROCM_ARCH is $PYTORCH_ROCM_ARCH"
Expand All @@ -17,167 +26,172 @@ RUN apt-get update && apt-get install python3 python3-pip -
RUN apt-get update && apt-get install -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/include:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/include/torch/csrc/api/include/:/opt/rocm/include/:

WORKDIR ${COMMON_WORKDIR}

ARG BUILD_HIPBLASLT="1"
# -----------------------
# hipBLASLt build stages
FROM base AS build_hipblaslt
ARG HIPBLASLT_BRANCH="ee51a9d1"

RUN if [ "$BUILD_HIPBLASLT" = "1" ]; then \
echo "HIPBLASLT_BRANCH is $HIPBLASLT_BRANCH"; \
fi
# Build HipblasLt
RUN if [ "$BUILD_HIPBLASLT" = "1" ] ; then \
apt-get purge -y hipblaslt \
&& mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/hipBLASLt \
RUN git clone https://github.com/ROCm/hipBLASLt \
&& cd hipBLASLt \
&& git checkout ${HIPBLASLT_BRANCH} \
&& SCCACHE_IDLE_TIMEOUT=1800 ./install.sh -i --architecture ${PYTORCH_ROCM_ARCH} \
&& cd .. && rm -rf hipBLASLt \
&& sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
&& sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status \
&& cd ..; \
fi


RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*


ARG BUILD_RCCL="1"
&& SCCACHE_IDLE_TIMEOUT=1800 ./install.sh --architecture ${PYTORCH_ROCM_ARCH} \
&& cd build/release \
&& make package
FROM scratch AS export_hipblaslt_1
ARG COMMON_WORKDIR
COPY --from=build_hipblaslt ${COMMON_WORKDIR}/hipBLASLt/build/release/*.deb /
FROM scratch AS export_hipblaslt_0

# -----------------------
# RCCL build stages
FROM base AS build_rccl
ARG RCCL_BRANCH="eeea3b6"

RUN if [ "$BUILD_RCCL" = "1" ]; then \
echo "RCCL_BRANCH is $RCCL_BRANCH"; \
fi
# Install RCCL
RUN if [ "$BUILD_RCCL" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/rccl \
RUN git clone https://github.com/ROCm/rccl \
&& cd rccl \
&& git checkout ${RCCL_BRANCH} \
&& ./install.sh -i --amdgpu_targets ${PYTORCH_ROCM_ARCH} \
&& cd .. \
&& rm -r rccl \
&& cd ..; \
fi


ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/include:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/include/torch/csrc/api/include/:/opt/rocm/include/:


# whether to build flash-attention
# if 0, will not build flash attention
# this is useful for gfx target where flash-attention is not supported
# In that case, we need to use the python reference attention implementation in vllm
ARG BUILD_FA="1"
&& ./install.sh --amdgpu_targets ${PYTORCH_ROCM_ARCH} \
&& cd build/release \
&& make package
FROM scratch AS export_rccl_1
ARG COMMON_WORKDIR
COPY --from=build_rccl ${COMMON_WORKDIR}/rccl/build/release/*.deb /
FROM scratch AS export_rccl_0

# -----------------------
# flash attn build stages
FROM base AS build_flash_attn
ARG FA_BRANCH="ae7928c"

RUN if [ "$BUILD_FA" = "1" ]; then \
echo "FA_BRANCH is $FA_BRANCH"; \
fi
# Install ROCm flash-attention
RUN if [ "$BUILD_FA" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone https://github.com/ROCm/flash-attention.git \
RUN git clone https://github.com/ROCm/flash-attention.git \
&& cd flash-attention \
&& git checkout ${FA_BRANCH} \
&& git submodule update --init \
&& GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py install \
&& cd .. \
&& rm -rf flash-attention \
&& cd ..; \
fi
&& GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_flash_attn_1
ARG COMMON_WORKDIR
COPY --from=build_flash_attn ${COMMON_WORKDIR}/flash-attention/dist/*.whl /
FROM scratch AS export_flash_attn_0

# -----------------------
# CuPy build stages
FROM base AS build_cupy
ARG CUPY_BRANCH="hipgraph_enablement"
# Build a CuPy wheel from the ROCm fork so the final image can install the
# artifact without carrying any of the build toolchain.
RUN git clone https://github.com/ROCm/cupy.git \
&& cd cupy \
&& git checkout $CUPY_BRANCH \
&& git submodule update --init --recursive \
&& pip install mpi4py-mpich scipy==1.9.3 cython==0.29.* \
&& CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \
&& CUPY_INSTALL_USE_HIP=1 ROCM_HOME=/opt/rocm HCC_AMDGPU_TARGET=${PYTORCH_ROCM_ARCH} \
python3 setup.py bdist_wheel --dist-dir=dist
# Export stage selected when BUILD_CUPY=1: use `scratch` (not `build_cupy`)
# so that, like every other export_*_1 stage, it contains only the built
# wheel and the final stage's bind mount sees just the artifact rather than
# the entire build image.
FROM scratch AS export_cupy_1
ARG COMMON_WORKDIR
COPY --from=build_cupy ${COMMON_WORKDIR}/cupy/dist/*.whl /
# Empty export stage selected when BUILD_CUPY=0.
FROM scratch AS export_cupy_0

# -----------------------
# Triton build stages
# Clone the requested Triton branch and build a wheel here, so the final
# image installs a prebuilt artifact instead of compiling Triton itself.
# NOTE(review): TRITON_BRANCH defaults to "main", which is not a pinned
# revision — builds are not reproducible unless a commit/tag is passed.
FROM base AS build_triton
ARG TRITON_BRANCH="main"
RUN git clone https://github.com/OpenAI/triton.git \
&& cd triton \
&& git checkout ${TRITON_BRANCH} \
&& cd python \
&& python3 setup.py bdist_wheel --dist-dir=dist
# Export stage selected when BUILD_TRITON=1: holds only the built wheel.
FROM scratch AS export_triton_1
ARG COMMON_WORKDIR
COPY --from=build_triton ${COMMON_WORKDIR}/triton/python/dist/*.whl /
# Empty export stage selected when BUILD_TRITON=0.
FROM scratch AS export_triton_0

# -----------------------
# vLLM (and gradlib) build stages
FROM base AS build_vllm
ARG COMMON_WORKDIR
# To consider: Obtain vLLM via git clone
COPY ./ ${COMMON_WORKDIR}/vllm
# Build vLLM
RUN cd vllm \
&& python3 setup.py clean --all && python3 setup.py bdist_wheel --dist-dir=dist
# Build gradlib
RUN cd vllm/gradlib \
&& python3 setup.py clean --all && python3 setup.py bdist_wheel --dist-dir=dist
# Export stage: carries the vLLM and gradlib wheels plus the requirements
# files, ROCm patches, and xformers patch script that the final stage
# needs at install time — nothing else from the build context.
FROM scratch AS export_vllm
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/gradlib/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/rocm_patch /rocm_patch
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/patch_xformers.rocm.sh /

# -----------------------
# Aliases to ensure we only use enabled components
# Each BUILD_* arg ("0" or "1") is substituted into the stage name, so a
# component's alias resolves to either its populated export_*_1 stage or
# the empty export_*_0 stage; disabled components therefore contribute no
# artifacts when the final stage bind-mounts the alias.
FROM export_hipblaslt_${BUILD_HIPBLASLT} AS export_hipblaslt
FROM export_rccl_${BUILD_RCCL} AS export_rccl
FROM export_flash_attn_${BUILD_FA} AS export_flash_attn
FROM export_cupy_${BUILD_CUPY} AS export_cupy
FROM export_triton_${BUILD_TRITON} AS export_triton

# -----------------------
# Final vLLM image
FROM base AS final
ARG BASE_IMAGE
ARG BUILD_FA

RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.1_ubuntu20.04_py3.9_pytorch_2.1.2" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi


# Whether to build CuPy. 0.3.3 <= vLLM < 0.4.0 might need it for HIPgraph.
ARG BUILD_CUPY="0"
ARG CUPY_BRANCH="hipgraph_enablement"

RUN if [ "$BUILD_CUPY" = "1" ]; then \
echo "CUPY_BRANCH is $CUPY_BRANCH"; \
RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \
if ls /install/*.deb; then \
apt-get purge -y hipblaslt \
&& dpkg -i /install/*.deb \
&& sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
&& sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \
fi
# Build cupy
RUN if [ "$BUILD_CUPY" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& git clone $CUPY_BRANCH --recursive https://github.com/ROCm/cupy.git \
&& cd cupy \
&& pip install mpi4py-mpich scipy==1.9.3 cython==0.29.* \
&& CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \
&& CUPY_INSTALL_USE_HIP=1 ROCM_HOME=/opt/rocm HCC_AMDGPU_TARGET=${PYTORCH_ROCM_ARCH} pip install . \
&& cd .. \
&& rm -rf cupy \
&& cd ..; \
fi


# whether to build triton on rocm
ARG BUILD_TRITON="1"
ARG TRITON_BRANCH="main"

RUN if [ "$BUILD_TRITON" = "1" ]; then \
echo "TRITON_BRANCH is $TRITON_BRANCH"; \
RUN --mount=type=bind,from=export_rccl,src=/,target=/install \
if ls /install/*.deb; then \
dpkg -i /install/*.deb \
&& sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
&& sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status; \
fi
# build triton
RUN if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
&& pip uninstall -y triton \
&& git clone https://github.com/OpenAI/triton.git \
&& cd triton \
&& git checkout ${TRITON_BRANCH} \
&& cd python \
&& pip install . \
&& cd ../.. \
&& rm -rf triton \
&& cd ..; \

RUN --mount=type=bind,from=export_flash_attn,src=/,target=/install \
if ls /install/*.whl; then \
pip install /install/*.whl; \
fi

RUN --mount=type=bind,from=export_cupy,src=/,target=/install \
if ls /install/*.whl; then \
pip install /install/*.whl; \
fi

COPY ./ /app/vllm
# Fix HIP runtime on ROCm 6.1
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.1_ubuntu20.04_py3.9_pytorch_2.1.2" ]; then \
cp /app/vllm/rocm_patch/libamdhip64.so.6 /opt/rocm-6.1.0/lib/libamdhip64.so.6; fi
RUN --mount=type=bind,from=export_triton,src=/,target=/install \
if ls /install/*.whl; then \
pip install /install/*.whl; \
fi

RUN python3 -m pip install --upgrade pip numba
RUN python3 -m pip install --upgrade numba
RUN python3 -m pip install xformers==0.0.23 --no-deps

# Install vLLM
ARG VLLM_BUILD_MODE="install"
# developer might choose to use "develop" mode. But for end-users, we should do an install mode.
# the current "develop" mode has issues with ImportError: cannot import name '_custom_C' from 'vllm' (/app/vllm/vllm/__init__.py)
RUN cd /app \
&& cd vllm \
# Install vLLM (and gradlib)
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
cd /install \
&& pip install -U -r requirements-rocm.txt \
&& if [ "$BUILD_FA" = "1" ]; then \
bash patch_xformers.rocm.sh; fi \
bash patch_xformers.rocm.sh; fi \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch; fi \
&& python3 setup.py clean --all && python3 setup.py $VLLM_BUILD_MODE \
&& cd ..


# Install gradlib
RUN cd /app/vllm/gradlib \
&& pip install . \
&& cd ../..

patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch; fi \
&& if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.1_ubuntu20.04_py3.9_pytorch_2.1.2" ]; then \
cp rocm_patch/libamdhip64.so.6 /opt/rocm-6.1.0/lib/libamdhip64.so.6; fi \
&& pip install *.whl

# Update Ray to latest version + set environment variable to ensure it works on TP > 1
RUN python3 -m pip install --no-cache-dir 'ray[all]>=2.10.0'
Expand Down

0 comments on commit 34174f8

Please sign in to comment.