From caa368e06b654a2ef653c0511c329b8536e1b6fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Cumplido?= Date: Thu, 5 Sep 2024 09:56:19 +0200 Subject: [PATCH] GH-43299: [Release][Packaging] Only include pyarrow folder when finding packages on setuptools (#43325) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Rationale for this change Currently we include everything when building wheels, so directories such as `benchmarks/`, `cmake_modules/`, `examples/` and `scripts/` end up in `site-packages` next to `pyarrow/`, see: ``` $ pip install pyarrow Collecting pyarrow Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 39.9/39.9 MB 33.8 MB/s eta 0:00:00 Collecting numpy>=1.16.6 Using cached numpy-2.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.3 MB) Installing collected packages: numpy, pyarrow Successfully installed numpy-2.0.0 pyarrow-17.0.0 (test-env) $ ls test-env/lib/python3.10/site-packages/ benchmarks/ distutils-precedence.pth numpy-2.0.0.dist-info/ pip-22.0.2.dist-info/ pyarrow-17.0.0.dist-info/ setuptools-59.6.0.dist-info/ cmake_modules/ examples/ numpy.libs/ pkg_resources/ scripts/ _distutils_hack/ numpy/ pip/ pyarrow/ setuptools/ ``` ### What changes are included in this PR? Use the `include` option of `[tool.setuptools.packages.find]` in `python/pyproject.toml` so that only the `pyarrow` package is discovered, as seen here: https://setuptools.pypa.io/en/latest/userguide/package_discovery.html#finding-simple-packages ### Are these changes tested? Yes: the wheel test scripts on CI now run `ci/scripts/python_wheel_validate_contents.py`, which checks the contents of the built wheel. ### Are there any user-facing changes? 
No and yes :) We will remove unnecessary files * GitHub Issue: #43299 Lead-authored-by: Raúl Cumplido Co-authored-by: Antoine Pitrou Signed-off-by: Joris Van den Bossche --- ci/docker/python-wheel-manylinux.dockerfile | 3 ++ ci/scripts/python_wheel_macos_build.sh | 1 - ci/scripts/python_wheel_manylinux_build.sh | 3 +- ci/scripts/python_wheel_unix_test.sh | 6 +++ ci/scripts/python_wheel_validate_contents.py | 48 ++++++++++++++++++++ ci/scripts/python_wheel_windows_build.bat | 1 - ci/scripts/python_wheel_windows_test.bat | 3 ++ docker-compose.yml | 2 + docs/source/developers/python.rst | 3 -- python/pyproject.toml | 3 +- python/setup.py | 16 +------ 11 files changed, 66 insertions(+), 23 deletions(-) create mode 100644 ci/scripts/python_wheel_validate_contents.py diff --git a/ci/docker/python-wheel-manylinux.dockerfile b/ci/docker/python-wheel-manylinux.dockerfile index 42f088fd8a22a..5cc1711608c03 100644 --- a/ci/docker/python-wheel-manylinux.dockerfile +++ b/ci/docker/python-wheel-manylinux.dockerfile @@ -100,6 +100,9 @@ RUN vcpkg install \ --x-feature=parquet \ --x-feature=s3 +# Make sure auditwheel is up-to-date +RUN pipx upgrade auditwheel + # Configure Python for applications running in the bash shell of this Dockerfile ARG python=3.8 ENV PYTHON_VERSION=${python} diff --git a/ci/scripts/python_wheel_macos_build.sh b/ci/scripts/python_wheel_macos_build.sh index 92b962f1740bd..d2c392e6b9db3 100755 --- a/ci/scripts/python_wheel_macos_build.sh +++ b/ci/scripts/python_wheel_macos_build.sh @@ -150,7 +150,6 @@ echo "=== (${PYTHON_VERSION}) Building wheel ===" export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE} export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} -export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} diff --git a/ci/scripts/python_wheel_manylinux_build.sh b/ci/scripts/python_wheel_manylinux_build.sh index 
aa86494a9d47d..885019ff3049f 100755 --- a/ci/scripts/python_wheel_manylinux_build.sh +++ b/ci/scripts/python_wheel_manylinux_build.sh @@ -140,7 +140,6 @@ echo "=== (${PYTHON_VERSION}) Building wheel ===" export PYARROW_BUILD_TYPE=${CMAKE_BUILD_TYPE} export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_CMAKE_GENERATOR=${CMAKE_GENERATOR} -export PYARROW_INSTALL_TESTS=1 export PYARROW_WITH_ACERO=${ARROW_ACERO} export PYARROW_WITH_AZURE=${ARROW_AZURE} export PYARROW_WITH_DATASET=${ARROW_DATASET} @@ -181,5 +180,5 @@ popd rm -rf dist/temp-fix-wheel echo "=== (${PYTHON_VERSION}) Tag the wheel with manylinux${MANYLINUX_VERSION} ===" -auditwheel repair -L . dist/pyarrow-*.whl -w repaired_wheels +auditwheel repair dist/pyarrow-*.whl -w repaired_wheels popd diff --git a/ci/scripts/python_wheel_unix_test.sh b/ci/scripts/python_wheel_unix_test.sh index cf87a17056783..6bdc3d3621e14 100755 --- a/ci/scripts/python_wheel_unix_test.sh +++ b/ci/scripts/python_wheel_unix_test.sh @@ -34,6 +34,7 @@ source_dir=${1} : ${ARROW_S3:=ON} : ${ARROW_SUBSTRAIT:=ON} : ${CHECK_IMPORTS:=ON} +: ${CHECK_WHEEL_CONTENT:=ON} : ${CHECK_UNITTESTS:=ON} : ${INSTALL_PYARROW:=ON} @@ -87,6 +88,11 @@ import pyarrow.parquet fi fi +if [ "${CHECK_WHEEL_CONTENT}" == "ON" ]; then + python ${source_dir}/ci/scripts/python_wheel_validate_contents.py \ + --path ${source_dir}/python/repaired_wheels +fi + if [ "${CHECK_UNITTESTS}" == "ON" ]; then # Install testing dependencies pip install -U -r ${source_dir}/python/requirements-wheel-test.txt diff --git a/ci/scripts/python_wheel_validate_contents.py b/ci/scripts/python_wheel_validate_contents.py new file mode 100644 index 0000000000000..22b3a890f036b --- /dev/null +++ b/ci/scripts/python_wheel_validate_contents.py @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import argparse +from pathlib import Path +import re +import zipfile + + +def validate_wheel(path): + p = Path(path) + wheels = list(p.glob('*.whl')) + error_msg = f"{len(wheels)} wheels found but only 1 expected ({wheels})" + assert len(wheels) == 1, error_msg + f = zipfile.ZipFile(wheels[0]) + outliers = [ + info.filename for info in f.filelist if not re.match( + r'(pyarrow/|pyarrow-[-.\w\d]+\.dist-info/)', info.filename + ) + ] + assert not outliers, f"Unexpected contents in wheel: {sorted(outliers)}" + print(f"The wheel: {wheels[0]} seems valid.") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--path", type=str, required=True, + help="Directory where wheel is located") + args = parser.parse_args() + validate_wheel(args.path) + + +if __name__ == '__main__': + main() diff --git a/ci/scripts/python_wheel_windows_build.bat b/ci/scripts/python_wheel_windows_build.bat index 54f02ec6f6ed0..1f1d5dca721d9 100644 --- a/ci/scripts/python_wheel_windows_build.bat +++ b/ci/scripts/python_wheel_windows_build.bat @@ -106,7 +106,6 @@ echo "=== (%PYTHON_VERSION%) Building wheel ===" set PYARROW_BUILD_TYPE=%CMAKE_BUILD_TYPE% set PYARROW_BUNDLE_ARROW_CPP=ON set PYARROW_CMAKE_GENERATOR=%CMAKE_GENERATOR% -set PYARROW_INSTALL_TESTS=ON set PYARROW_WITH_ACERO=%ARROW_ACERO% set PYARROW_WITH_DATASET=%ARROW_DATASET% set PYARROW_WITH_FLIGHT=%ARROW_FLIGHT% diff --git 
a/ci/scripts/python_wheel_windows_test.bat b/ci/scripts/python_wheel_windows_test.bat index cac3f18434b6c..de5a2c2e965cb 100755 --- a/ci/scripts/python_wheel_windows_test.bat +++ b/ci/scripts/python_wheel_windows_test.bat @@ -64,6 +64,9 @@ set PYTHON_CMD=py -%PYTHON% %PYTHON_CMD% -c "import pyarrow.parquet" || exit /B 1 %PYTHON_CMD% -c "import pyarrow.substrait" || exit /B 1 +@REM Validate wheel contents +%PYTHON_CMD% C:\arrow\ci\scripts\python_wheel_validate_contents.py --path C:\arrow\python\dist || exit /B 1 + @rem Download IANA Timezone Database for ORC C++ curl https://cygwin.osuosl.org/noarch/release/tzdata/tzdata-2024a-1.tar.xz --output tzdata.tar.xz || exit /B mkdir %USERPROFILE%\Downloads\test\tzdata diff --git a/docker-compose.yml b/docker-compose.yml index 19a9dd0de3932..36cf150f25f39 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1144,6 +1144,7 @@ services: <<: *common CHECK_IMPORTS: "ON" CHECK_UNITTESTS: "OFF" + CHECK_WHEEL_CONTENT: "ON" command: /arrow/ci/scripts/python_wheel_unix_test.sh /arrow python-wheel-manylinux-test-unittests: @@ -1164,6 +1165,7 @@ services: <<: *common CHECK_IMPORTS: "OFF" CHECK_UNITTESTS: "ON" + CHECK_WHEEL_CONTENT: "OFF" command: /arrow/ci/scripts/python_wheel_unix_test.sh /arrow python-wheel-windows-vs2019: diff --git a/docs/source/developers/python.rst b/docs/source/developers/python.rst index 6beea55e66b86..2ba4b534caeff 100644 --- a/docs/source/developers/python.rst +++ b/docs/source/developers/python.rst @@ -632,9 +632,6 @@ PyArrow are: * - ``PYARROW_BUNDLE_CYTHON_CPP`` - Bundle the C++ files generated by Cython - ``0`` (``OFF``) - * - ``PYARROW_INSTALL_TESTS`` - - Add the test to the python package - - ``1`` (``ON``) * - ``PYARROW_BUILD_VERBOSE`` - Enable verbose output from Makefile builds - ``0`` (``OFF``) diff --git a/python/pyproject.toml b/python/pyproject.toml index 8ece65dd467bb..7c3fcae5cb306 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -74,7 +74,8 @@ zip-safe=false 
include-package-data=true [tool.setuptools.packages.find] -where = ["."] +include = ["pyarrow"] +namespaces = false [tool.setuptools.package-data] pyarrow = ["*.pxd", "*.pyx", "includes/*.pxd"] diff --git a/python/setup.py b/python/setup.py index d3ef3a091467c..60b9a696d9785 100755 --- a/python/setup.py +++ b/python/setup.py @@ -32,7 +32,7 @@ # Get correct EXT_SUFFIX on Windows (https://bugs.python.org/issue39825) from distutils import sysconfig -from setuptools import setup, Extension, Distribution, find_namespace_packages +from setuptools import setup, Extension, Distribution from Cython.Distutils import build_ext as _build_ext import Cython @@ -396,21 +396,7 @@ def has_ext_modules(foo): return True -if strtobool(os.environ.get('PYARROW_INSTALL_TESTS', '1')): - packages = find_namespace_packages(include=['pyarrow*']) - exclude_package_data = {} -else: - packages = find_namespace_packages(include=['pyarrow*'], - exclude=["pyarrow.tests*"]) - # setuptools adds back importable packages even when excluded. - # https://github.com/pypa/setuptools/issues/3260 - # https://github.com/pypa/setuptools/issues/3340#issuecomment-1219383976 - exclude_package_data = {"pyarrow": ["tests*"]} - - setup( - packages=packages, - exclude_package_data=exclude_package_data, distclass=BinaryDistribution, # Dummy extension to trigger build_ext ext_modules=[Extension('__dummy__', sources=[])],