From 34174f8fda5f95abc677d5d9ee8e6f2ddf23b76c Mon Sep 17 00:00:00 2001
From: Matthew Wong
Date: Wed, 22 May 2024 18:45:44 +0000
Subject: [PATCH] Initial refactoring into multistage build with separate
 stage for each component

---
 Dockerfile.rocm | 280 +++++++++++++++++++++++++-----------------------
 1 file changed, 147 insertions(+), 133 deletions(-)

diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 875355d9c2bbc..4247b3f4a284d 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -1,13 +1,22 @@
 # default base image
 ARG BASE_IMAGE="rocm/pytorch:rocm6.1_ubuntu20.04_py3.9_pytorch_2.1.2"
-FROM $BASE_IMAGE
+ARG COMMON_WORKDIR=/app
+ARG BUILD_HIPBLASLT="1"
+ARG BUILD_RCCL="1"
+ARG BUILD_FA="1"
+ARG BUILD_CUPY="0"
+ARG BUILD_TRITON="1"
+
+# -----------------------
+# vLLM base image
+FROM $BASE_IMAGE AS base
 
 USER root
 
 # Import BASE_IMAGE arg from pre-FROM
 ARG BASE_IMAGE
 RUN echo "Base image is $BASE_IMAGE"
-
+ARG COMMON_WORKDIR
 # Used as ARCHes for all components
 ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
 RUN echo "PYTORCH_ROCM_ARCH is $PYTORCH_ROCM_ARCH"
@@ -17,167 +26,172 @@ RUN apt-get update && apt-get install python3 python3-pip -y
 RUN apt-get update && apt-get install -y \
     sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev
 
-### Mount Point ###
-# When launching the container, mount the code directory to /app
-ARG APP_MOUNT=/app
-VOLUME [ ${APP_MOUNT} ]
-WORKDIR ${APP_MOUNT}
+ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
+ENV PATH=$PATH:/opt/rocm/bin:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/bin:
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/lib:
+ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/include:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/include/torch/csrc/api/include/:/opt/rocm/include/:
+WORKDIR ${COMMON_WORKDIR}
 
-ARG BUILD_HIPBLASLT="1"
+# -----------------------
+# hipBLASLt build stages
+FROM base AS build_hipblaslt
 ARG HIPBLASLT_BRANCH="ee51a9d1"
-
-RUN if [ "$BUILD_HIPBLASLT" = "1" ]; then \
-    echo "HIPBLASLT_BRANCH is $HIPBLASLT_BRANCH"; \
-    fi
-# Build HipblasLt
-RUN if [ "$BUILD_HIPBLASLT" = "1" ] ; then \
-    apt-get purge -y hipblaslt \
-    && mkdir -p libs \
-    && cd libs \
-    && git clone https://github.com/ROCm/hipBLASLt \
+RUN git clone https://github.com/ROCm/hipBLASLt \
     && cd hipBLASLt \
    && git checkout ${HIPBLASLT_BRANCH} \
-    && SCCACHE_IDLE_TIMEOUT=1800 ./install.sh -i --architecture ${PYTORCH_ROCM_ARCH} \
-    && cd .. \
-    && rm -rf hipBLASLt \
-    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
-    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status \
-    && cd ..; \
-    fi
-
-
-RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
-
-
-ARG BUILD_RCCL="1"
+    && SCCACHE_IDLE_TIMEOUT=1800 ./install.sh --architecture ${PYTORCH_ROCM_ARCH} \
+    && cd build/release \
+    && make package
+FROM scratch AS export_hipblaslt_1
+ARG COMMON_WORKDIR
+COPY --from=build_hipblaslt ${COMMON_WORKDIR}/hipBLASLt/build/release/*.deb /
+FROM scratch AS export_hipblaslt_0
+
+# -----------------------
+# RCCL build stages
+FROM base AS build_rccl
 ARG RCCL_BRANCH="eeea3b6"
-
-RUN if [ "$BUILD_RCCL" = "1" ]; then \
-    echo "RCCL_BRANCH is $RCCL_BRANCH"; \
-    fi
-# Install RCCL
-RUN if [ "$BUILD_RCCL" = "1" ]; then \
-    mkdir -p libs \
-    && cd libs \
-    && git clone https://github.com/ROCm/rccl \
+RUN git clone https://github.com/ROCm/rccl \
     && cd rccl \
     && git checkout ${RCCL_BRANCH} \
-    && ./install.sh -i --amdgpu_targets ${PYTORCH_ROCM_ARCH} \
-    && cd .. \
-    && rm -r rccl \
-    && cd ..; \
-    fi
-
-
-ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
-ENV PATH=$PATH:/opt/rocm/bin:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/bin:
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/lib:
-ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/include:/opt/conda/envs/py_3.9/lib/python3.9/site-packages/torch/include/torch/csrc/api/include/:/opt/rocm/include/:
-
-
-# whether to build flash-attention
-# if 0, will not build flash attention
-# this is useful for gfx target where flash-attention is not supported
-# In that case, we need to use the python reference attention implementation in vllm
-ARG BUILD_FA="1"
+    && ./install.sh --amdgpu_targets ${PYTORCH_ROCM_ARCH} \
+    && cd build/release \
+    && make package
+FROM scratch AS export_rccl_1
+ARG COMMON_WORKDIR
+COPY --from=build_rccl ${COMMON_WORKDIR}/rccl/build/release/*.deb /
+FROM scratch AS export_rccl_0
+
+# -----------------------
+# flash attn build stages
+FROM base AS build_flash_attn
 ARG FA_BRANCH="ae7928c"
-
-RUN if [ "$BUILD_FA" = "1" ]; then \
-    echo "FA_BRANCH is $FA_BRANCH"; \
-    fi
-# Install ROCm flash-attention
-RUN if [ "$BUILD_FA" = "1" ]; then \
-    mkdir -p libs \
-    && cd libs \
-    && git clone https://github.com/ROCm/flash-attention.git \
+RUN git clone https://github.com/ROCm/flash-attention.git \
     && cd flash-attention \
     && git checkout ${FA_BRANCH} \
     && git submodule update --init \
-    && GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py install \
-    && cd .. \
-    && rm -rf flash-attention \
-    && cd ..; \
-    fi
+    && GPU_ARCHS=${PYTORCH_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist
+FROM scratch AS export_flash_attn_1
+ARG COMMON_WORKDIR
+COPY --from=build_flash_attn ${COMMON_WORKDIR}/flash-attention/dist/*.whl /
+FROM scratch AS export_flash_attn_0
+
+# -----------------------
+# CuPy build stages
+FROM base AS build_cupy
+ARG CUPY_BRANCH="hipgraph_enablement"
+RUN git clone https://github.com/ROCm/cupy.git \
+    && cd cupy \
+    && git checkout $CUPY_BRANCH \
+    && git submodule update --init --recursive \
+    && pip install mpi4py-mpich scipy==1.9.3 cython==0.29.* \
+    && CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \
+    && CUPY_INSTALL_USE_HIP=1 ROCM_HOME=/opt/rocm HCC_AMDGPU_TARGET=${PYTORCH_ROCM_ARCH} \
+        python3 setup.py bdist_wheel --dist-dir=dist
+FROM scratch AS export_cupy_1
+ARG COMMON_WORKDIR
+COPY --from=build_cupy ${COMMON_WORKDIR}/cupy/dist/*.whl /
+FROM scratch AS export_cupy_0
+
+# -----------------------
+# Triton build stages
+FROM base AS build_triton
+ARG TRITON_BRANCH="main"
+RUN git clone https://github.com/OpenAI/triton.git \
+    && cd triton \
+    && git checkout ${TRITON_BRANCH} \
+    && cd python \
+    && python3 setup.py bdist_wheel --dist-dir=dist
+FROM scratch AS export_triton_1
+ARG COMMON_WORKDIR
+COPY --from=build_triton ${COMMON_WORKDIR}/triton/python/dist/*.whl /
+FROM scratch AS export_triton_0
+
+# -----------------------
+# vLLM (and gradlib) build stages
+FROM base AS build_vllm
+ARG COMMON_WORKDIR
+# To consider: Obtain vLLM via git clone
+COPY ./ ${COMMON_WORKDIR}/vllm
+# Build vLLM
+RUN cd vllm \
+    && python3 setup.py clean --all && python3 setup.py bdist_wheel --dist-dir=dist
+# Build gradlib
+RUN cd vllm/gradlib \
+    && python3 setup.py clean --all && python3 setup.py bdist_wheel --dist-dir=dist
+FROM scratch AS export_vllm
+ARG COMMON_WORKDIR
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/gradlib/dist/*.whl /
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/rocm_patch /rocm_patch
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/patch_xformers.rocm.sh /
+
+# -----------------------
+# Aliases to ensure we only use enabled components
+FROM export_hipblaslt_${BUILD_HIPBLASLT} AS export_hipblaslt
+FROM export_rccl_${BUILD_RCCL} AS export_rccl
+FROM export_flash_attn_${BUILD_FA} AS export_flash_attn
+FROM export_cupy_${BUILD_CUPY} AS export_cupy
+FROM export_triton_${BUILD_TRITON} AS export_triton
+
+# -----------------------
+# Final vLLM image
+FROM base AS final
+ARG BASE_IMAGE
+ARG BUILD_FA
+RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
 # Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
 # Manually removed it so that later steps of numpy upgrade can continue
 RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.1_ubuntu20.04_py3.9_pytorch_2.1.2" ]; then \
     rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
-
-# Whether to build CuPy. 0.3.3 <= vLLM < 0.4.0 might need it for HIPgraph.
-ARG BUILD_CUPY="0"
-ARG CUPY_BRANCH="hipgraph_enablement"
-
-RUN if [ "$BUILD_CUPY" = "1" ]; then \
-    echo "CUPY_BRANCH is $CUPY_BRANCH"; \
+RUN --mount=type=bind,from=export_hipblaslt,src=/,target=/install \
+    if ls /install/*.deb; then \
+    apt-get purge -y hipblaslt \
+    && dpkg -i /install/*.deb \
+    && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \
+    && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status; \
     fi
-# Build cupy
-RUN if [ "$BUILD_CUPY" = "1" ]; then \
-    mkdir -p libs \
-    && cd libs \
-    && git clone $CUPY_BRANCH --recursive https://github.com/ROCm/cupy.git \
-    && cd cupy \
-    && pip install mpi4py-mpich scipy==1.9.3 cython==0.29.* \
-    && CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \
-    && CUPY_INSTALL_USE_HIP=1 ROCM_HOME=/opt/rocm HCC_AMDGPU_TARGET=${PYTORCH_ROCM_ARCH} pip install . \
-    && cd .. \
-    && rm -rf cupy \
-    && cd ..; \
-    fi
-
-
-# whether to build triton on rocm
-ARG BUILD_TRITON="1"
-ARG TRITON_BRANCH="main"
-RUN if [ "$BUILD_TRITON" = "1" ]; then \
-    echo "TRITON_BRANCH is $TRITON_BRANCH"; \
+RUN --mount=type=bind,from=export_rccl,src=/,target=/install \
+    if ls /install/*.deb; then \
+    dpkg -i /install/*.deb \
+    && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \
+    && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status; \
     fi
-# build triton
-RUN if [ "$BUILD_TRITON" = "1" ]; then \
-    mkdir -p libs \
-    && cd libs \
-    && pip uninstall -y triton \
-    && git clone https://github.com/OpenAI/triton.git \
-    && cd triton \
-    && git checkout ${TRITON_BRANCH} \
-    && cd python \
-    && pip install . \
-    && cd ../.. \
-    && rm -rf triton \
-    && cd ..; \
+
+RUN --mount=type=bind,from=export_flash_attn,src=/,target=/install \
+    if ls /install/*.whl; then \
+    pip install /install/*.whl; \
     fi
+RUN --mount=type=bind,from=export_cupy,src=/,target=/install \
+    if ls /install/*.whl; then \
+    pip install /install/*.whl; \
+    fi
 
-COPY ./ /app/vllm
-# Fix HIP runtime on ROCm 6.1
-RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.1_ubuntu20.04_py3.9_pytorch_2.1.2" ]; then \
-    cp /app/vllm/rocm_patch/libamdhip64.so.6 /opt/rocm-6.1.0/lib/libamdhip64.so.6; fi
+RUN --mount=type=bind,from=export_triton,src=/,target=/install \
+    if ls /install/*.whl; then \
+    pip install /install/*.whl; \
+    fi
 
-RUN python3 -m pip install --upgrade pip numba
+RUN python3 -m pip install --upgrade numba
 
 RUN python3 -m pip install xformers==0.0.23 --no-deps
 
-# Install vLLM
-ARG VLLM_BUILD_MODE="install"
-# developer might choose to use "develop" mode. But for end-users, we should do an install mode.
-# the current "develop" mode has issues with ImportError: cannot import name '_custom_C' from 'vllm' (/app/vllm/vllm/__init__.py)
-RUN cd /app \
-    && cd vllm \
+# Install vLLM (and gradlib)
+RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
+    cd /install \
     && pip install -U -r requirements-rocm.txt \
     && if [ "$BUILD_FA" = "1" ]; then \
-    bash patch_xformers.rocm.sh; fi \
+       bash patch_xformers.rocm.sh; fi \
     && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
-    patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch; fi \
-    && python3 setup.py clean --all && python3 setup.py $VLLM_BUILD_MODE \
-    && cd ..
-
-
-# Install gradlib
-RUN cd /app/vllm/gradlib \
-    && pip install . \
-    && cd ../..
-
+       patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h rocm_patch/rocm_bf16.patch; fi \
+    && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.1_ubuntu20.04_py3.9_pytorch_2.1.2" ]; then \
+       cp rocm_patch/libamdhip64.so.6 /opt/rocm-6.1.0/lib/libamdhip64.so.6; fi \
+    && pip install *.whl
 
 # Update Ray to latest version + set environment variable to ensure it works on TP > 1
 RUN python3 -m pip install --no-cache-dir 'ray[all]>=2.10.0'
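
Usage sketch (not part of the diff): BuildKit only schedules the stages reachable
from the requested target, so setting a BUILD_* arg to "0" points the matching
"FROM export_X_${BUILD_X} AS export_X" alias at an empty scratch stage, the
component's build stage is never run, and the final stage's "if ls /install/*.deb"
and "if ls /install/*.whl" guards see an empty bind mount and install nothing.
Illustrative invocations follow; the image tag and output directory are
assumptions, and BuildKit must be enabled:

    # Build the final image without flash-attention; export_flash_attn
    # then resolves to the empty export_flash_attn_0 stage.
    DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm \
        --build-arg BUILD_FA=0 -t vllm-rocm .

    # Build only the vLLM and gradlib wheels and copy them to the host
    # instead of producing an image, via the scratch-based export_vllm stage.
    DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm \
        --target export_vllm --output type=local,dest=./dist .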