From 78d16ebffcb71de55136055f0bad0f41c7e789b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 22 Oct 2024 15:06:12 +0200
Subject: [PATCH 01/22] [CI/Build] improve dev setup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 setup.py | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b936589869e76..8e8aa18af66e1 100644
--- a/setup.py
+++ b/setup.py
@@ -403,6 +403,8 @@ def get_vllm_version() -> str:
             # skip this for source tarball, required for pypi
             if "sdist" not in sys.argv:
                 version += f"{sep}cu{cuda_version_str}"
+        if envs.VLLM_USE_PRECOMPILED:
+            version += ".precompiled"
     elif _is_hip():
         # Get the HIP version
         hipcc_version = get_hipcc_rocm_version()
@@ -514,9 +516,41 @@ def _read_requirements(filename: str) -> List[str]:
 package_data = {
     "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
 }
+
 if envs.VLLM_USE_PRECOMPILED:
+    assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+
+    import zipfile
+
     ext_modules = []
-    package_data["vllm"].append("*.so")
+
+    wheel_location = os.getenv(
+        "VLLM_PRECOMPILED_WHEEL_LOCATION",
+        "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+    )
+
+    if os.path.exists(wheel_filename := os.path.basename(wheel_location)):
+        print(f"Using existing wheel={wheel_filename}")
+    else:
+        try:
+            subprocess.check_call(
+                f"pip download --no-deps {wheel_location}".split(" "))
+        except subprocess.CalledProcessError as exc:
+            from setuptools.errors import SetupError
+
+            raise SetupError(
+                f"Failed to get vLLM wheel from {wheel_location}") from exc
+
+    with zipfile.ZipFile(wheel_filename) as wheel:
+        for lib in filter(lambda file: file.filename.endswith(".so"),
+                          wheel.filelist):
+            package_name = os.path.dirname(lib.filename).replace("/", ".")
+            if package_name not in package_data:
+                package_data[package_name] = []
+
+            wheel.extract(lib)
+            package_data[package_name].append(lib.filename)
+            print(f"Added {lib.filename} to package_data[\"{package_name}\"]")
 
 if _no_device():
     ext_modules = []

From 548433ba52ac9fb33ade6d191c626beebf7bf40d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 23 Oct 2024 17:35:09 +0200
Subject: [PATCH 02/22] remove python_only_dev.py script, update docs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 docs/source/getting_started/installation.rst | 35 ++------
 python_only_dev.py                           | 92 --------------------
 2 files changed, 7 insertions(+), 120 deletions(-)
 delete mode 100644 python_only_dev.py

diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index e3dbbc9affe66..8aa312e9cfc70 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -89,45 +89,24 @@ Build from source
 Python-only build (without compilation)
 ---------------------------------------
 
-If you only need to change Python code, you can simply build vLLM without compilation.
-
-The first step is to install the latest vLLM wheel:
-
-.. code-block:: console
-
-    pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-
-You can find more information about vLLM's wheels `above <#install-the-latest-code>`_.
-
-After verifying that the installation is successful, you can use `the following script <https://github.com/vllm-project/vllm/blob/main/python_only_dev.py>`_:
+If you only need to change Python code, you can build and install vLLM without compilation.
 
 .. code-block:: console
 
     $ git clone https://github.com/vllm-project/vllm.git
     $ cd vllm
-    $ python python_only_dev.py
-
-The script will:
-
-* Find the installed vLLM package in the current environment.
-* Copy built files to the current directory.
-* Rename the installed vLLM package.
-* Symbolically link the current directory to the installed vLLM package.
+    $ VLLM_USE_PRE_COMPILED=1 pip install --editable .
 
-Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM.
-
-Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script <https://github.com/vllm-project/vllm/blob/main/python_only_dev.py>`_ with the ``--quit-dev`` (or ``-q`` for short) flag:
+This will download and the latest available nightly wheel and include the compiled libraries from there. The wheel that is used to retrieve the prebuilt libraries can be set using the ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable, for example, to use the `PyPi wheel <https://pypi.org/project/vllm/#files>`:
 
 .. code-block:: console
 
-    $ python python_only_dev.py --quit-dev
-
-The ``--quit-dev`` flag will:
+   $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
+   $ VLLM_USE_PRE_COMPILED=1 pip install --editable .
 
-* Remove the symbolic link from the current directory to the vLLM package.
-* Restore the original vLLM package from the backup.
+You can find more information about vLLM's wheels `above <#install-the-latest-code>`_.
 
-If you update the vLLM wheel and rebuild from the source to make further edits, you will need to repeat the `Python-only build <#python-only-build>`_ steps again.
+Python code changes, will reflected when you run vLLM thanks to pip's ``--editable`` flag.
 
 .. note::
 
diff --git a/python_only_dev.py b/python_only_dev.py
deleted file mode 100644
index 1ca0f5c30b741..0000000000000
--- a/python_only_dev.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# enable python only development
-# copy compiled files to the current directory directly
-
-import argparse
-import os
-import shutil
-import subprocess
-import sys
-import warnings
-
-parser = argparse.ArgumentParser(
-    description="Development mode for python-only code")
-parser.add_argument('-q',
-                    '--quit-dev',
-                    action='store_true',
-                    help='Set the flag to quit development mode')
-args = parser.parse_args()
-
-# cannot directly `import vllm` , because it will try to
-# import from the current directory
-output = subprocess.run([sys.executable, "-m", "pip", "show", "vllm"],
-                        capture_output=True)
-
-assert output.returncode == 0, "vllm is not installed"
-
-text = output.stdout.decode("utf-8")
-
-package_path = None
-for line in text.split("\n"):
-    if line.startswith("Location: "):
-        package_path = line.split(": ")[1]
-        break
-
-assert package_path is not None, "could not find package path"
-
-cwd = os.getcwd()
-
-assert cwd != package_path, "should not import from the current directory"
-
-files_to_copy = [
-    "vllm/_C.abi3.so",
-    "vllm/_moe_C.abi3.so",
-    "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
-    "vllm/vllm_flash_attn/flash_attn_interface.py",
-    "vllm/vllm_flash_attn/__init__.py",
-    # "vllm/_version.py", # not available in nightly wheels yet
-]
-
-# Try to create _version.py to avoid version related warning
-# Refer to https://github.com/vllm-project/vllm/pull/8771
-try:
-    from setuptools_scm import get_version
-    get_version(write_to="vllm/_version.py")
-except ImportError:
-    warnings.warn(
-        "To avoid warnings related to vllm._version, "
-        "you should install setuptools-scm by `pip install setuptools-scm`",
-        stacklevel=2)
-
-if not args.quit_dev:
-    for file in files_to_copy:
-        src = os.path.join(package_path, file)
-        dst = file
-        print(f"Copying {src} to {dst}")
-        shutil.copyfile(src, dst)
-
-    pre_built_vllm_path = os.path.join(package_path, "vllm")
-    tmp_path = os.path.join(package_path, "vllm_pre_built")
-    current_vllm_path = os.path.join(cwd, "vllm")
-
-    print(f"Renaming {pre_built_vllm_path} to {tmp_path} for backup")
-    shutil.copytree(pre_built_vllm_path, tmp_path)
-    shutil.rmtree(pre_built_vllm_path)
-
-    print(f"Linking {current_vllm_path} to {pre_built_vllm_path}")
-    os.symlink(current_vllm_path, pre_built_vllm_path)
-else:
-    vllm_symlink_path = os.path.join(package_path, "vllm")
-    vllm_backup_path = os.path.join(package_path, "vllm_pre_built")
-    current_vllm_path = os.path.join(cwd, "vllm")
-
-    print(f"Unlinking {current_vllm_path} to {vllm_symlink_path}")
-    assert os.path.islink(
-        vllm_symlink_path
-    ), f"not in dev mode: {vllm_symlink_path} is not a symbolic link"
-    assert current_vllm_path == os.readlink(
-        vllm_symlink_path
-    ), "current directory is not the source code of package"
-    os.unlink(vllm_symlink_path)
-
-    print(f"Recovering backup from {vllm_backup_path} to {vllm_symlink_path}")
-    os.rename(vllm_backup_path, vllm_symlink_path)

From 75f2a2a8a6ec15cc6d3f09e54ce8681e680c4ebc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 23 Oct 2024 17:35:22 +0200
Subject: [PATCH 03/22] bump python version in installation guide
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 docs/source/getting_started/installation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index 8aa312e9cfc70..d667f34ebb5ed 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -21,7 +21,7 @@ You can install vLLM using pip:
 .. code-block:: console
 
     $ # (Recommended) Create a new conda environment.
-    $ conda create -n myenv python=3.10 -y
+    $ conda create -n myenv python=3.12 -y
     $ conda activate myenv
 
     $ # Install vLLM with CUDA 12.1.

From bb14531647782214a3c807d63b4cc7a846b53250 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 23 Oct 2024 17:51:34 +0200
Subject: [PATCH 04/22] docs: add sccache section
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 docs/source/getting_started/installation.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index d667f34ebb5ed..d22038af23945 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -127,9 +127,13 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T
 .. tip::
 
     Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
+
     For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
     As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
 
+    `sccache <https://github.com/mozilla/sccache>` works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments.
+    The following env vars can be set for to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``
+
 
 Use an existing PyTorch installation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

From 66754357424b7492f3b5f4b20429f62ecca836d7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Thu, 24 Oct 2024 11:25:56 +0200
Subject: [PATCH 05/22] docs: fix VLLM_USE_PRECOMPILED env var usage, fix
 typos/rewording
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 docs/source/getting_started/installation.rst | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index d22038af23945..1fb7c8bce5565 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -89,25 +89,25 @@ Build from source
 Python-only build (without compilation)
 ---------------------------------------
 
-If you only need to change Python code, you can build and install vLLM without compilation.
+If you only need to change Python code, you can build and install vLLM without compilation. Using `pip's ``--editable`` flag <https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs>`_, changes you make to the code will be reflected when you run vLLM:
 
 .. code-block:: console
 
     $ git clone https://github.com/vllm-project/vllm.git
     $ cd vllm
-    $ VLLM_USE_PRE_COMPILED=1 pip install --editable .
+    $ VLLM_USE_PRECOMPILED=1 pip install --editable .
 
-This will download and the latest available nightly wheel and include the compiled libraries from there. The wheel that is used to retrieve the prebuilt libraries can be set using the ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable, for example, to use the `PyPi wheel <https://pypi.org/project/vllm/#files>`:
+This will download the latest nightly wheel and use the compiled libraries from there in the install.
+
+The wheel that is used to retrieve the prebuilt libraries (e.g. the `PyPi wheel <https://pypi.org/project/vllm/#files>`_) can be set using the ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable:
 
 .. code-block:: console
 
    $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
-   $ VLLM_USE_PRE_COMPILED=1 pip install --editable .
+   $ VLLM_USE_PRECOMPILED=1 pip install --editable .
 
 You can find more information about vLLM's wheels `above <#install-the-latest-code>`_.
 
-Python code changes, will reflected when you run vLLM thanks to pip's ``--editable`` flag.
-
 .. note::
 
     There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
@@ -132,7 +132,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T
     As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
 
     `sccache <https://github.com/mozilla/sccache>` works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments.
-    The following env vars can be set for to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``
+    The following environment variables can be set to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``.
 
 
 Use an existing PyTorch installation

From 8076513452b2234b5c6e2dd623aecf6b7d443a8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Mon, 28 Oct 2024 16:48:53 +0100
Subject: [PATCH 06/22] fix inclusion of vllm_flash_attn python/compiled files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 setup.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 8e8aa18af66e1..7da3f20683f00 100644
--- a/setup.py
+++ b/setup.py
@@ -542,15 +542,23 @@ def _read_requirements(filename: str) -> List[str]:
                 f"Failed to get vLLM wheel from {wheel_location}") from exc
 
     with zipfile.ZipFile(wheel_filename) as wheel:
-        for lib in filter(lambda file: file.filename.endswith(".so"),
-                          wheel.filelist):
+        for lib in filter(
+                lambda file: file.filename.endswith(".so") or file.filename.
+                startswith("vllm/vllm_flash_attn"), wheel.filelist):
+            print(
+                "Extracting and including {lib.filename} from existing wheel")
             package_name = os.path.dirname(lib.filename).replace("/", ".")
+            file_name = os.path.basename(lib.filename)
+
             if package_name not in package_data:
                 package_data[package_name] = []
 
             wheel.extract(lib)
-            package_data[package_name].append(lib.filename)
-            print(f"Added {lib.filename} to package_data[\"{package_name}\"]")
+            if file_name.endswith(".py"):
+                # python files shouldn't be added to package_data
+                continue
+
+            package_data[package_name].append(file_name)
 
 if _no_device():
     ext_modules = []

From f83ade07f53aaf8bcb54b140acbc19fc8aa6bf3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Mon, 28 Oct 2024 17:14:46 +0100
Subject: [PATCH 07/22] fix build isolation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 setup.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index 7da3f20683f00..4df46b1441a51 100644
--- a/setup.py
+++ b/setup.py
@@ -532,9 +532,15 @@ def _read_requirements(filename: str) -> List[str]:
     if os.path.exists(wheel_filename := os.path.basename(wheel_location)):
         print(f"Using existing wheel={wheel_filename}")
     else:
+        # pip will not be available in PEP-517 style builds with build isolation (pip install <url/path>)
         try:
-            subprocess.check_call(
-                f"pip download --no-deps {wheel_location}".split(" "))
+            if which("pip"):
+                subprocess.check_call(
+                    f"pip download --no-deps {wheel_location}".split(" "))
+            else:
+                from urllib.request import urlretrieve
+
+                urlretrieve(wheel_location, filename=wheel_filename)
         except subprocess.CalledProcessError as exc:
             from setuptools.errors import SetupError
 

From ff364ae8df4fb63437098706c9d7536aa930cef4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Mon, 28 Oct 2024 17:31:41 +0100
Subject: [PATCH 08/22] fixup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 setup.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/setup.py b/setup.py
index 4df46b1441a51..dfde82f401b5e 100644
--- a/setup.py
+++ b/setup.py
@@ -534,18 +534,20 @@ def _read_requirements(filename: str) -> List[str]:
     else:
         # pip will not be available in PEP-517 style builds with build isolation (pip install <url/path>)
         try:
-            if which("pip"):
-                subprocess.check_call(
-                    f"pip download --no-deps {wheel_location}".split(" "))
-            else:
+            subprocess.check_call(
+                f"pip download --no-deps {wheel_location}".split(" "))
+        except subprocess.CalledProcessError:
+            print("#" * 30)
+            print("Failed to download using pip, retrying using urlretrieve")
+            try:
                 from urllib.request import urlretrieve
 
-                urlretrieve(wheel_location, filename=wheel_filename)
-        except subprocess.CalledProcessError as exc:
-            from setuptools.errors import SetupError
+                result = urlretrieve(wheel_location, filename=wheel_filename)
+            except Exception as e:
+                from setuptools.errors import SetupError
 
-            raise SetupError(
-                f"Failed to get vLLM wheel from {wheel_location}") from exc
+                raise SetupError(
+                    f"Failed to get vLLM wheel from {wheel_location}") from e
 
     with zipfile.ZipFile(wheel_filename) as wheel:
         for lib in filter(

From 9c41aba2701baa0004a1eb2cff07b1699c582e3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Mon, 28 Oct 2024 17:58:33 +0100
Subject: [PATCH 09/22] extract pre-compiled wheel logic into repackage_wheel()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 setup.py | 71 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 38 insertions(+), 33 deletions(-)

diff --git a/setup.py b/setup.py
index dfde82f401b5e..43c68239318eb 100644
--- a/setup.py
+++ b/setup.py
@@ -498,51 +498,28 @@ def _read_requirements(filename: str) -> List[str]:
     return requirements
 
 
-ext_modules = []
-
-if _is_cuda() or _is_hip():
-    ext_modules.append(CMakeExtension(name="vllm._moe_C"))
-
-if _is_hip():
-    ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
-
-if _is_cuda():
-    ext_modules.append(
-        CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
-
-if _build_custom_ops():
-    ext_modules.append(CMakeExtension(name="vllm._C"))
-
-package_data = {
-    "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
-}
-
-if envs.VLLM_USE_PRECOMPILED:
+def repackage_wheel(package_data: Dict[str, List[str]],
+                    wheel_location) -> None:
+    """Extracts libraries and other files from an existing wheel."""
     assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
 
     import zipfile
 
-    ext_modules = []
-
-    wheel_location = os.getenv(
-        "VLLM_PRECOMPILED_WHEEL_LOCATION",
-        "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-    )
-
     if os.path.exists(wheel_filename := os.path.basename(wheel_location)):
         print(f"Using existing wheel={wheel_filename}")
     else:
-        # pip will not be available in PEP-517 style builds with build isolation (pip install <url/path>)
         try:
             subprocess.check_call(
                 f"pip download --no-deps {wheel_location}".split(" "))
         except subprocess.CalledProcessError:
-            print("#" * 30)
-            print("Failed to download using pip, retrying using urlretrieve")
-            try:
-                from urllib.request import urlretrieve
+            # pip will not be available in PEP-517 style builds with
+            # build isolation, such as when running
+            #    `pip install <path|git+https://<url>`.
+            from urllib.request import urlretrieve
 
-                result = urlretrieve(wheel_location, filename=wheel_filename)
+            print("Failed to download using pip, retrying using urlretrieve.")
+            try:
+                urlretrieve(wheel_location, filename=wheel_filename)
             except Exception as e:
                 from setuptools.errors import SetupError
 
@@ -568,6 +545,34 @@ def _read_requirements(filename: str) -> List[str]:
 
             package_data[package_name].append(file_name)
 
+
+ext_modules = []
+
+if _is_cuda() or _is_hip():
+    ext_modules.append(CMakeExtension(name="vllm._moe_C"))
+
+if _is_hip():
+    ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
+
+if _is_cuda():
+    ext_modules.append(
+        CMakeExtension(name="vllm.vllm_flash_attn.vllm_flash_attn_c"))
+
+if _build_custom_ops():
+    ext_modules.append(CMakeExtension(name="vllm._C"))
+
+package_data = {
+    "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
+}
+if envs.VLLM_USE_PRECOMPILED:
+    ext_modules = []
+    wheel_location = os.getenv(
+        "VLLM_PRECOMPILED_WHEEL_LOCATION",
+        "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+    )
+
+    repackage_wheel(package_data, wheel_location)
+
 if _no_device():
     ext_modules = []
 

From 94a2facec27dfbebf02aa0d9727781681fc4a6d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 3 Dec 2024 15:17:14 +0100
Subject: [PATCH 10/22] use files_to_copy
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 setup.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index 43c68239318eb..720a4bedec610 100644
--- a/setup.py
+++ b/setup.py
@@ -527,18 +527,21 @@ def repackage_wheel(package_data: Dict[str, List[str]],
                     f"Failed to get vLLM wheel from {wheel_location}") from e
 
     with zipfile.ZipFile(wheel_filename) as wheel:
-        for lib in filter(
-                lambda file: file.filename.endswith(".so") or file.filename.
-                startswith("vllm/vllm_flash_attn"), wheel.filelist):
+        files_to_copy = filter(
+            lambda file: file.filename.endswith(".so") or file.filename.
+            startswith("vllm/vllm_flash_attn"), wheel.filelist)
+
+        for file in files_to_copy:
             print(
-                "Extracting and including {lib.filename} from existing wheel")
-            package_name = os.path.dirname(lib.filename).replace("/", ".")
-            file_name = os.path.basename(lib.filename)
+                f"Extracting and including {file.filename} from existing wheel"
+            )
+            package_name = os.path.dirname(file.filename).replace("/", ".")
+            file_name = os.path.basename(file.filename)
 
             if package_name not in package_data:
                 package_data[package_name] = []
 
-            wheel.extract(lib)
+            wheel.extract(file)
             if file_name.endswith(".py"):
                 # python files shouldn't be added to package_data
                 continue

From 0acb1b29548f521a7f9365f3dde107f7fb24ed98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 3 Dec 2024 15:17:34 +0100
Subject: [PATCH 11/22] allow to set VLLM_PRECOMPILED_WHEEL_LOCATION for custom
 wheel location
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 docs/source/getting_started/installation.rst | 4 ++--
 vllm/envs.py                                 | 3 ++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index 1fb7c8bce5565..cfa2c6d109861 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -99,12 +99,12 @@ If you only need to change Python code, you can build and install vLLM without c
 
 This will download the latest nightly wheel and use the compiled libraries from there in the install.
 
-The wheel that is used to retrieve the prebuilt libraries (e.g. the `PyPi wheel <https://pypi.org/project/vllm/#files>`_) can be set using the ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable:
+The``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.1.post1 PyPi wheel <https://pypi.org/project/vllm/#files>`_:
 
 .. code-block:: console
 
    $ export VLLM_PRECOMPILED_WHEEL_LOCATION=https://files.pythonhosted.org/packages/4a/4c/ee65ba33467a4c0de350ce29fbae39b9d0e7fcd887cc756fa993654d1228/vllm-0.6.3.post1-cp38-abi3-manylinux1_x86_64.whl
-   $ VLLM_USE_PRECOMPILED=1 pip install --editable .
+   $ pip install --editable .
 
 You can find more information about vLLM's wheels `above <#install-the-latest-code>`_.
 
diff --git a/vllm/envs.py b/vllm/envs.py
index c896770e5f6bc..28797ac1e4af2 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -113,7 +113,8 @@ def get_default_config_root():
 
     # If set, vllm will use precompiled binaries (*.so)
     "VLLM_USE_PRECOMPILED":
-    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")),
+    lambda: bool(os.environ.get("VLLM_USE_PRECOMPILED")) or bool(
+        os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION")),
 
     # CMake build type
     # If not set, defaults to "Debug" or "RelWithDebInfo"

From b17f15b88fe8968626a2636f793fd7dc8bac2daa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 3 Dec 2024 15:34:39 +0100
Subject: [PATCH 12/22] setup.py: use wheel location instead of wheel
 filename
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 720a4bedec610..d382c1b64e680 100644
--- a/setup.py
+++ b/setup.py
@@ -505,7 +505,8 @@ def repackage_wheel(package_data: Dict[str, List[str]],
 
     import zipfile
 
-    if os.path.exists(wheel_filename := os.path.basename(wheel_location)):
+    if os.path.isfile(wheel_location):
+        wheel_filename = os.path.realpath(wheel_location)
         print(f"Using existing wheel={wheel_filename}")
     else:
         try:

From 42c9a4568ce65f9f1441cb56e863fe2b74e56bae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Tue, 3 Dec 2024 15:46:30 +0100
Subject: [PATCH 13/22] fix docs linting complaints
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 docs/source/getting_started/installation.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation.rst
index cfa2c6d109861..af745a7a635c8 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@@ -99,7 +99,7 @@ If you only need to change Python code, you can build and install vLLM without c
 
 This will download the latest nightly wheel and use the compiled libraries from there in the install.
 
-The``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.1.post1 PyPi wheel <https://pypi.org/project/vllm/#files>`_:
+The ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable can be used instead of ``VLLM_USE_PRECOMPILED`` to specify a custom path or URL to the wheel file. For example, to use the `0.6.1.post1 PyPi wheel <https://pypi.org/project/vllm/#files>`_:
 
 .. code-block:: console
 
@@ -131,7 +131,7 @@ If you want to modify C++ or CUDA code, you'll need to build vLLM from source. T
     For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
     As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
 
-    `sccache <https://github.com/mozilla/sccache>` works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments.
+    `sccache <https://github.com/mozilla/sccache>`_ works similarly to ``ccache``, but has the capability to utilize caching in remote storage environments.
     The following environment variables can be set to configure the vLLM ``sccache`` remote: ``SCCACHE_BUCKET=vllm-build-sccache SCCACHE_REGION=us-west-2 SCCACHE_S3_NO_CREDENTIALS=1``. We also recommend setting ``SCCACHE_IDLE_TIMEOUT=0``.
 
 

From 83dcfb6a99671a31157c3b4e727b69600422d7a7 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 3 Dec 2024 21:52:38 -0800
Subject: [PATCH 14/22] list explicit files to copy from the wheel

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 setup.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index d382c1b64e680..ae12b97dafdd2 100644
--- a/setup.py
+++ b/setup.py
@@ -528,9 +528,14 @@ def repackage_wheel(package_data: Dict[str, List[str]],
                     f"Failed to get vLLM wheel from {wheel_location}") from e
 
     with zipfile.ZipFile(wheel_filename) as wheel:
-        files_to_copy = filter(
-            lambda file: file.filename.endswith(".so") or file.filename.
-            startswith("vllm/vllm_flash_attn"), wheel.filelist)
+        files_to_copy = [
+            "vllm/_C.abi3.so",
+            "vllm/_moe_C.abi3.so",
+            "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
+            "vllm/vllm_flash_attn/flash_attn_interface.py",
+            "vllm/vllm_flash_attn/__init__.py",
+            # "vllm/_version.py", # not available in nightly wheels yet
+        ]
 
         for file in files_to_copy:
             print(

From 400d1e27b9c592b26681a4d64d41d2d1610a9743 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 3 Dec 2024 22:08:34 -0800
Subject: [PATCH 15/22] use wheel_path

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 setup.py | 42 +++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 19 deletions(-)

diff --git a/setup.py b/setup.py
index ae12b97dafdd2..6f9f4a0167fc7 100644
--- a/setup.py
+++ b/setup.py
@@ -499,35 +499,39 @@ def _read_requirements(filename: str) -> List[str]:
 
 
 def repackage_wheel(package_data: Dict[str, List[str]],
-                    wheel_location) -> None:
+                    wheel_location: str) -> None:
     """Extracts libraries and other files from an existing wheel."""
     assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
 
     import zipfile
 
     if os.path.isfile(wheel_location):
-        wheel_filename = os.path.realpath(wheel_location)
-        print(f"Using existing wheel={wheel_filename}")
+        wheel_path = wheel_location
+        print(f"Using existing wheel={wheel_path}")
     else:
+        # Download the wheel from a given URL, assume
+        # the filename is the last part of the URL
+        wheel_filename = wheel_location.split("/")[-1]
+
+        import tempfile
+
+        # create a temporary directory to store the wheel
+        temp_dir = tempfile.mkdtemp()
+        wheel_path = os.path.join(temp_dir, wheel_filename)
+
+        print(f"Downloading wheel from {wheel_location} to {wheel_path}")
+
+        from urllib.request import urlretrieve
+
         try:
-            subprocess.check_call(
-                f"pip download --no-deps {wheel_location}".split(" "))
-        except subprocess.CalledProcessError:
-            # pip will not be available in PEP-517 style builds with
-            # build isolation, such as when running
-            #    `pip install <path|git+https://<url>`.
-            from urllib.request import urlretrieve
-
-            print("Failed to download using pip, retrying using urlretrieve.")
-            try:
-                urlretrieve(wheel_location, filename=wheel_filename)
-            except Exception as e:
-                from setuptools.errors import SetupError
+            urlretrieve(wheel_location, filename=wheel_path)
+        except Exception as e:
+            from setuptools.errors import SetupError
 
-                raise SetupError(
-                    f"Failed to get vLLM wheel from {wheel_location}") from e
+            raise SetupError(
+                f"Failed to get vLLM wheel from {wheel_location}") from e
 
-    with zipfile.ZipFile(wheel_filename) as wheel:
+    with zipfile.ZipFile(wheel_path) as wheel:
         files_to_copy = [
             "vllm/_C.abi3.so",
             "vllm/_moe_C.abi3.so",

From 60532e4e38c6406f232948a8eccd8b510bfa0369 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 3 Dec 2024 22:13:22 -0800
Subject: [PATCH 16/22] iterate over wheel members instead of filenames

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6f9f4a0167fc7..510be6c7c8cec 100644
--- a/setup.py
+++ b/setup.py
@@ -540,8 +540,9 @@ def repackage_wheel(package_data: Dict[str, List[str]],
             "vllm/vllm_flash_attn/__init__.py",
             # "vllm/_version.py", # not available in nightly wheels yet
         ]
+        file_members = filter(lambda x: x.filename in files_to_copy, wheel.filelist)
 
-        for file in files_to_copy:
+        for file in file_members:
             print(
                 f"Extracting and including {file.filename} from existing wheel"
             )

From 4c0c89b7a42be7bdeb32fad05bddcaa141d6a333 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 3 Dec 2024 22:18:26 -0800
Subject: [PATCH 17/22] fix format

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 510be6c7c8cec..5a25194b0519f 100644
--- a/setup.py
+++ b/setup.py
@@ -540,7 +540,8 @@ def repackage_wheel(package_data: Dict[str, List[str]],
             "vllm/vllm_flash_attn/__init__.py",
             # "vllm/_version.py", # not available in nightly wheels yet
         ]
-        file_members = filter(lambda x: x.filename in files_to_copy, wheel.filelist)
+        file_members = filter(lambda x: x.filename in files_to_copy,
+                              wheel.filelist)
 
         for file in file_members:
             print(

From c129f9ccf10a024431817e2d04ec54dae8252a8a Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 3 Dec 2024 22:40:16 -0800
Subject: [PATCH 18/22] add python_only_dev.py deprecation notice

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 python_only_dev.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 python_only_dev.py

diff --git a/python_only_dev.py b/python_only_dev.py
new file mode 100644
index 0000000000000..9389850e1b311
--- /dev/null
+++ b/python_only_dev.py
@@ -0,0 +1,14 @@
+msg = """Old style python only build (without compilation) is deprecated, please check https://docs.vllm.ai/en/latest/getting_started/installation.html#python-only-build-without-compilation for the new way to do python only build (without compilation).
+
+TL;DR:
+
+VLLM_USE_PRECOMPILED=1 pip install -vvv -e .
+
+or
+
+export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
+export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+pip install -vvv -e .
+""" # noqa
+
+print(msg)

From 69e6c4e6135630ebe22721da392c12ea21389166 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 4 Dec 2024 12:50:23 +0100
Subject: [PATCH 19/22] setup.py: refactor repackage_wheel into custom
 build_ext class
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 setup.py | 150 +++++++++++++++++++++++++++++--------------------------
 1 file changed, 78 insertions(+), 72 deletions(-)

diff --git a/setup.py b/setup.py
index 5a25194b0519f..b8abbd0a6b8cd 100644
--- a/setup.py
+++ b/setup.py
@@ -249,6 +249,75 @@ def run(self):
             self.copy_file(file, dst_file)
 
 
+class repackage_wheel(build_ext):
+    """Extracts libraries and other files from an existing wheel."""
+    default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+
+    def run(self) -> None:
+        super().run()
+        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION",
+                                   self.default_wheel)
+
+        assert _is_cuda(
+        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
+
+        import zipfile
+
+        if os.path.isfile(wheel_location):
+            wheel_path = wheel_location
+            print(f"Using existing wheel={wheel_path}")
+        else:
+            # Download the wheel from a given URL, assume
+            # the filename is the last part of the URL
+            wheel_filename = wheel_location.split("/")[-1]
+
+            import tempfile
+
+            # create a temporary directory to store the wheel
+            temp_dir = tempfile.mkdtemp()
+            wheel_path = os.path.join(temp_dir, wheel_filename)
+
+            print(f"Downloading wheel from {wheel_location} to {wheel_path}")
+
+            from urllib.request import urlretrieve
+
+            try:
+                urlretrieve(wheel_location, filename=wheel_path)
+            except Exception as e:
+                from setuptools.errors import SetupError
+
+                raise SetupError(
+                    f"Failed to get vLLM wheel from {wheel_location}") from e
+
+        with zipfile.ZipFile(wheel_path) as wheel:
+            files_to_copy = [
+                "vllm/_C.abi3.so",
+                "vllm/_moe_C.abi3.so",
+                "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
+                "vllm/vllm_flash_attn/flash_attn_interface.py",
+                "vllm/vllm_flash_attn/__init__.py",
+                # "vllm/_version.py", # not available in nightly wheels yet
+            ]
+            file_members = filter(lambda x: x.filename in files_to_copy,
+                                  wheel.filelist)
+
+            for file in file_members:
+                print(f"Extracting and including {file.filename} "
+                      "from existing wheel")
+                package_name = os.path.dirname(file.filename).replace("/", ".")
+                file_name = os.path.basename(file.filename)
+
+                if package_name not in package_data:
+                    package_data[package_name] = []
+
+                wheel.extract(file)
+                if file_name.endswith(".py"):
+                    # python files shouldn't be added to package_data
+                    continue
+
+                package_data[package_name].append(file_name)
+
+
 def _is_hpu() -> bool:
     is_hpu_available = True
     try:
@@ -498,69 +567,6 @@ def _read_requirements(filename: str) -> List[str]:
     return requirements
 
 
-def repackage_wheel(package_data: Dict[str, List[str]],
-                    wheel_location: str) -> None:
-    """Extracts libraries and other files from an existing wheel."""
-    assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
-
-    import zipfile
-
-    if os.path.isfile(wheel_location):
-        wheel_path = wheel_location
-        print(f"Using existing wheel={wheel_path}")
-    else:
-        # Download the wheel from a given URL, assume
-        # the filename is the last part of the URL
-        wheel_filename = wheel_location.split("/")[-1]
-
-        import tempfile
-
-        # create a temporary directory to store the wheel
-        temp_dir = tempfile.mkdtemp()
-        wheel_path = os.path.join(temp_dir, wheel_filename)
-
-        print(f"Downloading wheel from {wheel_location} to {wheel_path}")
-
-        from urllib.request import urlretrieve
-
-        try:
-            urlretrieve(wheel_location, filename=wheel_path)
-        except Exception as e:
-            from setuptools.errors import SetupError
-
-            raise SetupError(
-                f"Failed to get vLLM wheel from {wheel_location}") from e
-
-    with zipfile.ZipFile(wheel_path) as wheel:
-        files_to_copy = [
-            "vllm/_C.abi3.so",
-            "vllm/_moe_C.abi3.so",
-            "vllm/vllm_flash_attn/vllm_flash_attn_c.abi3.so",
-            "vllm/vllm_flash_attn/flash_attn_interface.py",
-            "vllm/vllm_flash_attn/__init__.py",
-            # "vllm/_version.py", # not available in nightly wheels yet
-        ]
-        file_members = filter(lambda x: x.filename in files_to_copy,
-                              wheel.filelist)
-
-        for file in file_members:
-            print(
-                f"Extracting and including {file.filename} from existing wheel"
-            )
-            package_name = os.path.dirname(file.filename).replace("/", ".")
-            file_name = os.path.basename(file.filename)
-
-            if package_name not in package_data:
-                package_data[package_name] = []
-
-            wheel.extract(file)
-            if file_name.endswith(".py"):
-                # python files shouldn't be added to package_data
-                continue
-
-            package_data[package_name].append(file_name)
-
-
 ext_modules = []
 
 if _is_cuda() or _is_hip():
@@ -579,18 +585,18 @@ def repackage_wheel(package_data: Dict[str, List[str]],
 package_data = {
     "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
 }
-if envs.VLLM_USE_PRECOMPILED:
-    ext_modules = []
-    wheel_location = os.getenv(
-        "VLLM_PRECOMPILED_WHEEL_LOCATION",
-        "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-    )
-
-    repackage_wheel(package_data, wheel_location)
 
 if _no_device():
     ext_modules = []
 
+if not ext_modules:
+    cmdclass = {}
+else:
+    cmdclass = {
+        "build_ext":
+        repackage_wheel if envs.VLLM_USE_PRECOMPILED else cmake_build_ext
+    }
+
 setup(
     name="vllm",
     version=get_vllm_version(),
@@ -627,7 +633,7 @@ def repackage_wheel(package_data: Dict[str, List[str]],
         "audio": ["librosa", "soundfile"],  # Required for audio processing
         "video": ["decord"]  # Required for video processing
     },
-    cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
+    cmdclass=cmdclass,
     package_data=package_data,
     entry_points={
         "console_scripts": [

From 73c07fc034495bf52d306ff1af8b67118d4fe208 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniele=20Trifir=C3=B2?= <dtrifiro@redhat.com>
Date: Wed, 4 Dec 2024 12:52:53 +0100
Subject: [PATCH 20/22] setup.py: use vllm-wheels prefix for nightly wheels
 download dir
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b8abbd0a6b8cd..558c7252fc679 100644
--- a/setup.py
+++ b/setup.py
@@ -274,7 +274,7 @@ def run(self) -> None:
             import tempfile
 
             # create a temporary directory to store the wheel
-            temp_dir = tempfile.mkdtemp()
+            temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
             wheel_path = os.path.join(temp_dir, wheel_filename)
 
             print(f"Downloading wheel from {wheel_location} to {wheel_path}")

From cef112adcd4cd31d3b3b6119c1ddfd1a1b015ed1 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 4 Dec 2024 09:23:23 -0800
Subject: [PATCH 21/22] remove verbose flag

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 python_only_dev.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python_only_dev.py b/python_only_dev.py
index 9389850e1b311..f70b4984025b3 100644
--- a/python_only_dev.py
+++ b/python_only_dev.py
@@ -2,13 +2,13 @@
 
 TL;DR:
 
-VLLM_USE_PRECOMPILED=1 pip install -vvv -e .
+VLLM_USE_PRECOMPILED=1 pip install -e .
 
 or
 
 export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
 export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
-pip install -vvv -e .
+pip install -e .
 """ # noqa
 
 print(msg)

From 57ee0c1715abefefa2bfc51db2f9f4ea96334b2c Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Wed, 4 Dec 2024 09:35:11 -0800
Subject: [PATCH 22/22] remove super().run() from repackage_wheel.run

Signed-off-by: youkaichao <youkaichao@gmail.com>
---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index 558c7252fc679..182dabe449674 100644
--- a/setup.py
+++ b/setup.py
@@ -254,7 +254,6 @@ class repackage_wheel(build_ext):
     default_wheel = "https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
 
     def run(self) -> None:
-        super().run()
         wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION",
                                    self.default_wheel)