From 7d9a24557cca983f2e2767854f9cc88e59c7b7f6 Mon Sep 17 00:00:00 2001 From: Richard Stotz Date: Wed, 13 Mar 2024 11:51:51 -0700 Subject: [PATCH] Prepare release of TF-DF 1.9.0 and update installation instructions PiperOrigin-RevId: 615495475 --- CHANGELOG.md | 9 +- README.md | 2 - WORKSPACE | 6 +- configure/setup.py | 17 +- documentation/installation.md | 157 ++++++++++++------ documentation/known_issues.md | 26 ++- tensorflow_decision_forests/__init__.py | 4 +- .../keras/wrappers_pre_generated.py | 126 ++++++++++++-- .../yggdrasil_decision_forests/workspace.bzl | 7 +- tools/test_bazel.sh | 8 +- 10 files changed, 263 insertions(+), 99 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48008c6..5c7200b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## 1.9.0rc0 - 2024-02-26 +## 1.9.0 - 2024-03-12 ### Fix @@ -10,8 +10,15 @@ ### Features - Compatibility with TensorFlow 2.16.0rc0. +- Expose new parameter sparse_oblique_max_num_projections. - Using tf_keras instead tf.keras in examples, documentation. - Support NAConditions for fast engine. +- Faster model loading for models with many features and dense oblique + conditions. + +### Documentation + +- Clarified documentation of parameters for oblique splits. ## 1.8.1 - 2023-11-17 diff --git a/README.md b/README.md index 39d7eb7..356022b 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,6 @@ The following resources are available: - [Issue tracker](https://github.com/tensorflow/decision-forests/issues) - [Known issues](documentation/known_issues.md) - [Changelog](CHANGELOG.md) -- [TensorFlow Forum](https://discuss.tensorflow.org) (on - discuss.tensorflow.org) - [More examples](documentation/more_examples.md) ## Installation diff --git a/WORKSPACE b/WORKSPACE index 9980289..30e5291 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -11,9 +11,9 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") # absl used by tensorflow. http_archive( name = "org_tensorflow", - strip_prefix = "tensorflow-2.15.0", - sha256 = "9cec5acb0ecf2d47b16891f8bc5bc6fbfdffe1700bdadc0d9ebe27ea34f0c220", - urls = ["https://github.com/tensorflow/tensorflow/archive/v2.15.0.zip"], + strip_prefix = "tensorflow-2.16.1", + sha256 = "c729e56efc945c6df08efe5c9f5b8b89329c7c91b8f40ad2bb3e13900bd4876d", + urls = ["https://github.com/tensorflow/tensorflow/archive/v2.16.1.tar.gz"], # Starting with TF 2.14, disable hermetic Python builds. patch_args = ["-p1"], patches = ["//third_party/tensorflow:tf.patch"], diff --git a/configure/setup.py b/configure/setup.py index 441a6b5..19ea78d 100644 --- a/configure/setup.py +++ b/configure/setup.py @@ -21,7 +21,7 @@ from setuptools.command.install import install from setuptools.dist import Distribution -_VERSION = "1.9.0rc0" +_VERSION = "1.9.0" with open("README.md", "r", encoding="utf-8") as fh: long_description = fh.read() @@ -29,12 +29,12 @@ REQUIRED_PACKAGES = [ "numpy", "pandas", - "tensorflow~=2.16.0rc0", + "tensorflow~=2.16.1", "six", "absl_py", "wheel", "wurlitzer", - "tf_keras~=2.16.0rc2", + "tf_keras~=2.16", ] @@ -84,8 +84,10 @@ def get_tag(self): name="tensorflow_decision_forests", version=_VERSION, author="Google Inc.", - author_email="packages@tensorflow.org", - description="Collection of training and inference decision forest algorithms.", + author_email="decision-forests-contact@google.com", + description=( + "Collection of training and inference decision forest algorithms." 
+  ),
   long_description=long_description,
   long_description_content_type="text/markdown",
   url="https://github.com/tensorflow/decision-forests",
@@ -113,7 +115,10 @@ def get_tag(self):
   packages=setuptools.find_packages(),
   python_requires=">=3.9",
   license="Apache 2.0",
-  keywords="tensorflow tensor machine learning decision forests random forest gradient boosted decision trees",
+  keywords=(
+      "tensorflow tensor machine learning decision forests random forest"
+      " gradient boosted decision trees"
+  ),
   install_requires=REQUIRED_PACKAGES,
   include_package_data=True,
   zip_safe=False,
diff --git a/documentation/installation.md b/documentation/installation.md
index ab37825..355a921 100644
--- a/documentation/installation.md
+++ b/documentation/installation.md
@@ -10,15 +10,15 @@
 
 *   [Table of Contents](#table-of-contents)
 *   [Installation with Pip](#installation-with-pip)
 *   [Build from source](#build-from-source)
+    *   [Technical details](#technical-details)
     *   [Linux](#linux)
-        *   [Setup](#setup)
-        *   [Compilation](#compilation)
+        *   [Docker build](#docker-build)
+        *   [Manual build](#manual-build)
     *   [MacOS](#macos)
-        *   [Setup](#setup-1)
-        *   [Building / Packaging (Apple CPU)](#building---packaging-apple-cpu)
+        *   [Setup](#setup)
+        *   [Arm64 CPU](#arm64-cpu)
     *   [Cross-compiling for Intel CPUs](#cross-compiling-for-intel-cpus)
-    *   [Final note](#final-note)
-    *   [Troubleshooting](#troubleshooting)
+    *   [Windows](#windows)
@@ -44,24 +44,74 @@ python3 -c "import tensorflow_decision_forests as tfdf; print('Found TF-DF v' +
 
 ## Build from source
 
+### Technical details
+
+TensorFlow Decision Forests (TF-DF) implements custom ops for TensorFlow and
+therefore depends on TensorFlow's ABI. Since the ABI can change between
+versions, any TF-DF version is only compatible with one specific TensorFlow
+version.
+
+To avoid compiling and shipping all of TensorFlow with TF-DF, TF-DF links
+against the libtensorflow shared library that is distributed with TensorFlow's
+Pip package. Only a small part of TensorFlow is compiled, and compilation only
+takes ~10 minutes on a strong workstation (instead of multiple hours when
+compiling all of TensorFlow). To ensure this works, the version of TensorFlow
+that is compiled and the version of the libtensorflow shared library must
+match exactly.
+
+The `tools/test_bazel.sh` script configures the TF-DF build to ensure that the
+versions of the packages used match. For details on this process, see the
+source code of this script. Since TensorFlow compilation changes often, the
+script only supports building with the most recent TensorFlow versions and
+nightly.
+
+**Note**: When distributing builds, you may set the `__git_version__` string in
+`tensorflow_decision_forests/__init__.py` to identify the commit you built from.
+
 ### Linux
 
-#### Setup
+#### Docker build
+
+The easiest way to build TF-DF on Linux is to use TensorFlow's
+[build docker](https://github.com/tensorflow/build). Run the following steps to
+build:
+
+```shell
+./tools/start_compile_docker.sh # Start the docker, might require root
+export RUN_TESTS=1              # Whether to run tests after build
+export PY_VERSION=3.9           # Python version to use for build
+# TensorFlow version to compile against. This must match exactly the version
+# of TensorFlow used at runtime, otherwise TF-DF may crash unexpectedly.
+export TF_VERSION=2.16.1        # Set to "nightly" for building with tf-nightly
+./tools/test_bazel.sh
+```
+
+This places the compiled C++ code in the `bazel-bin` directory. Note that this
+is a symbolic link that is not exposed outside the container (i.e. the build is
+gone after leaving the container).
+
+To build the wheels, run:
+
+```shell
+tools/build_pip_package.sh ALL_VERSIONS INSTALL_PYENV
+```
+
+This will install [Pyenv](https://github.com/pyenv/pyenv) and
+[Pyenv-virtualenv](https://github.com/pyenv/pyenv-virtualenv) inside the docker
+and use them to install Python in all supported versions for building. The
+wheels are placed in the `dist/` subdirectory.
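A quick way to smoke-test one of the freshly built wheels (a minimal sketch,
not part of the repository; it assumes the wheel was installed with
`pip install dist/tensorflow_decision_forests-*.whl` in a clean virtualenv, and
the toy dataset is illustrative only):

```python
import pandas as pd
import tensorflow_decision_forests as tfdf

# Tiny toy dataset: one numerical feature and a binary label.
dataset = pd.DataFrame({"x": [1.0, 2.0, 3.0, 4.0], "label": [0, 0, 1, 1]})
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(dataset, label="label")

# If the custom ops load and a model trains, the wheel is functional.
model = tfdf.keras.RandomForestModel(num_trees=10)
model.fit(train_ds)
model.summary()
```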
+
+#### Manual build
+
+Building TF-DF without the docker might be harder, and the team is probably not
+able to help with this.
 
 **Requirements**
 
--   Bazel >= 3.7.2
+-   Bazel >= 6.3.0
 -   Python >= 3
 -   Git
--   Python packages: numpy tensorflow pandas
-
-Instead of installing the dependencies by hands, you can use the
-[TensorFlow Build docker](https://github.com/tensorflow/build). If you choose
-this options, install Docker:
+-   Pyenv, Pyenv-virtualenv (only if packaging for many Python versions)
 
--   [Docker](https://docs.docker.com/get-docker/).
-
-#### Compilation
+**Building**
 
 Download TensorFlow Decision Forests as follows:
 
 ```shell
 # Clone the project.
@@ -71,31 +121,22 @@
 git clone https://github.com/tensorflow/decision-forests.git
 cd decision-forests
 ```
 
-**Optional:** TensorFlow Decision Forests depends on
+*Optional:* TensorFlow Decision Forests depends on
 [Yggdrasil Decision Forests](https://github.com/google/yggdrasil-decision-forests)
 . If you want to edit the Yggdrasil code, you can clone the Yggdrasil repository
 and change the path accordingly in
 `third_party/yggdrasil_decision_forests/workspace.bzl`.
 
-**Optional:** If you want to use the docker option, run the
-`start_compile_docker.sh` script and continue to the next step. If you don't
-want to use the docker option, continue to the next step directly.
-
-```shell
-# Optional: Install and start the build docker.
-./tools/start_compile_docker.sh
-```
-
 Compile and run the unit tests of TF-DF with the following command. Note that
-`test_bazel.sh` is configured for `python3.8` and the default compiler on your
-machine. Edit the file directly to change this configuration.
+`test_bazel.sh` is configured for the default compiler on your machine. Edit
+the file directly to change this configuration.
 
 ```shell
 # Build and test TF-DF.
-./tools/test_bazel.sh
+RUN_TESTS=1 PY_VERSION=3.9 TF_VERSION=2.16.1 ./tools/test_bazel.sh
 ```
 
-Create and test a pip package with the following command. Replace python3.8 by
+Create and test a pip package with the following command. Replace python3.9 with
 the version of python you want to use. Note that you don't have to use the same
 version of Python as in the `test_bazel.sh` script.
@@ -154,25 +195,28 @@ For MacOS systems with ARM64 CPU, follow these steps:
 
 1.  Prepare your environment
 
-    ```
+    ```shell
     git clone https://github.com/tensorflow/decision-forests.git
     python3 -m venv venv
-    source venv/source/activate
+    source venv/bin/activate
     ```
 
 1.  Decide which Python version and TensorFlow version you want to use and run
 
-    ```
+    ```shell
     cd decision-forests
-    export TF_VERSION=2.15.0  # Change to the TensorFlow Version you need.
-    export PY_VERSION=3.9     # Change to the Python you need.
-    export RUN_TESTS=1        # Change to 0 if you want to skip tests.
-    ./tools/test_bazel.sh     # Takes ~15 minutes on a modern Mac.
+    bazel clean --expunge  # Remove old builds (esp. cross-compiled).
+    export RUN_TESTS=1     # Whether to run tests after build.
+    export PY_VERSION=3.9  # Python version to use for build.
+    # TensorFlow version to compile against. This must match exactly the version
+    # of TensorFlow used at runtime, otherwise TF-DF may crash unexpectedly.
+    export TF_VERSION=2.16.1
+    ./tools/test_bazel.sh  # Takes ~15 minutes on a modern Mac.
     ```
 
-1.  Package the code.
+1.  Package the build.
 
-    ```
+    ```shell
     # Building the packages uses different virtualenvs through Pyenv.
     deactivate
     # Build the packages.
@@ -188,7 +232,7 @@ machines with Intel CPUs as follows.
 
 1.  Prepare your environment
 
-    ```
+    ```shell
     git clone https://github.com/tensorflow/decision-forests.git
     python3 -m venv venv
     source venv/source/activate
@@ -196,28 +240,35 @@ machines with Intel CPUs as follows.
 
 1.  Decide which Python version you want to use and run
 
-    ```
+    ```shell
     cd decision-forests
-    export TF_VERSION=2.15.0  # Change to the TensorFlow Version you need.
-    export PY_VERSION=3.9     # Change to the Python you need.
-    export RUN_TESTS=0        # Cross-compiled packages cannot be tested.
-    export MAC_INTEL_CROSSCOMPILE=1
-    ./tools/test_bazel.sh     # Takes ~15 minutes on a modern Mac.
+    bazel clean --expunge  # Remove old builds (esp. cross-compiled).
+    export RUN_TESTS=0     # Cross-compiled builds can't run tests.
+    export PY_VERSION=3.9  # Python version to use for build.
+    # TensorFlow version to compile against. This must match exactly the version
+    # of TensorFlow used at runtime, otherwise TF-DF may crash unexpectedly.
+    export TF_VERSION=2.16.1
+    export MAC_INTEL_CROSSCOMPILE=1  # Enable cross-compilation.
+    ./tools/test_bazel.sh  # Takes ~15 minutes on a modern Mac.
     ```
 
-1.  Package the code.
+1.  Package the build.
 
-    ```
+    ```shell
     # Building the packages uses different virtualenvs through Pyenv.
     deactivate
     # Build the packages.
     ./tools/build_pip_package.sh ALL_VERSIONS_MAC_INTEL_CROSSCOMPILE
     ```
 
-1.  The packages can be found in `decision-forests/dist/`.
+1.  The packages can be found in `decision-forests/dist/`. Note that they have
+    not been tested and it would be prudent to test them before distribution.
+
+### Windows
 
-## Final note
+A Windows build has been successfully produced in the past, but is not
+maintained at this point. See `tools/test_bazel.bat` and `tools/test_bazel.sh`
+for (possibly outdated) pointers for compiling on Windows.
 
-Compiling TF-DF relies on the TensorFlow Pip package *and* the TensorFlow Bazel
-dependency. Only a small part of TensorFlow will be compiled.
-Compiling TF-DF on a single powerful workstation takes ~10 minutes.
+For Windows users, [YDF](https://ydf.readthedocs.io) offers official Windows
+builds and most of the functionality (and more!) of TF-DF.
diff --git a/documentation/known_issues.md b/documentation/known_issues.md
index fd4e0ef..e181ae1 100644
--- a/documentation/known_issues.md
+++ b/documentation/known_issues.md
@@ -17,16 +17,26 @@ TensorFlow Decision Forests is not yet available as a Windows Pip package.
 [Windows Subsystem for Linux (WSL)](https://en.wikipedia.org/wiki/Windows_Subsystem_for_Linux)
 on your Windows machine and follow the Linux instructions.
 
+## Incompatibility with Keras 3
+
+Compatibility with Keras 3 is not yet implemented. Use tf_keras or a TensorFlow
+version before 2.16.
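With TensorFlow 2.16+, `tf.keras` points to Keras 3, so Keras-side utilities
around a TF-DF model should come from the `tf_keras` package instead (a minimal
sketch for illustration; the callback and log directory are hypothetical):

```python
import tf_keras  # Keras 2, shipped as a separate package since TensorFlow 2.16.
import tensorflow_decision_forests as tfdf

model = tfdf.keras.GradientBoostedTreesModel()
# Use tf_keras (not tf.keras) for surrounding Keras utilities, e.g. callbacks.
callbacks = [tf_keras.callbacks.TensorBoard(log_dir="/tmp/tfdf_logs")]
# model.fit(train_ds, callbacks=callbacks)  # train_ds: your training dataset.
```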
+
+## Untested for conda
+
+While TF-DF might work with Conda, this is not tested and we currently do not
+maintain packages on conda-forge.
+
 ## Incompatibility with old or nightly versions of TensorFlow
 
-TensorFlow [ABI](https://en.wikipedia.org/wiki/Application_binary_interface) is
-not compatible in between releases. Because TF-DF relies on custom TensorFlow
+TensorFlow's [ABI](https://en.wikipedia.org/wiki/Application_binary_interface)
+is not compatible between releases. Because TF-DF relies on custom TensorFlow
 C++ ops, each version of TF-DF is tied to a specific version of TensorFlow. The
 last released version of TF-DF is always tied to the last released version of
 TensorFlow.
 
-For reasons, the current version of TF-DF might not be compatible with older
-versions or with the nightly build of TensorFlow.
+For these reasons, the current version of TF-DF might not be compatible with
+older versions or with the nightly build of TensorFlow.
 
 If using incompatible versions of TF and TF-DF, you will see cryptic errors
 such as:
 
@@ -37,8 +47,6 @@
 tensorflow_decision_forests/tensorflow/ops/training/training.so: undefined symbo
 
 -   Use the version of TF-DF that is compatible with your version of TensorFlow.
 
-Note that TF-DF is not compatible with Keras 3 at this time.
-
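To see at a glance which pairing is installed, a diagnostic sketch (if the
TF-DF import itself already fails with the error above, compare versions with
`pip show tensorflow tensorflow-decision-forests` instead):

```python
import tensorflow as tf
import tensorflow_decision_forests as tfdf

print("TensorFlow:", tf.__version__)
print("TF-DF:", tfdf.__version__)

# TF-DF records the TensorFlow versions it was built against (the
# `compatible_tf_versions` list in tensorflow_decision_forests/__init__.py).
if tf.__version__ not in tfdf.compatible_tf_versions:
    print("Incompatible pair: install matching versions (see table below).")
```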
 ### Compatibility table
 
 The following table shows the compatibility between
@@ -46,7 +54,9 @@ The following table shows the compatibility between
 tensorflow_decision_forests | tensorflow
 --------------------------- | ---------------
-1.6.0                       | 2.14.0
+1.9.0                       | 2.16.1
+1.8.0 - 1.8.1               | 2.15.0
+1.6.0 - 1.7.0               | 2.14.0
 1.5.0                       | 2.13.0
 1.3.0 - 1.4.0               | 2.12.0
 1.1.0 - 1.2.0               | 2.11.0
@@ -72,7 +82,7 @@ does.
 
 **Workarounds:**
 
-- Use a model that support distribution strategies (e.g.
+- Use a model that supports distribution strategies (e.g.
   `DistributedGradientBoostedTreesModel`), or downsample your dataset so that
   it fits on a single machine.
diff --git a/tensorflow_decision_forests/__init__.py b/tensorflow_decision_forests/__init__.py
index 2f11695..e99378d 100644
--- a/tensorflow_decision_forests/__init__.py
+++ b/tensorflow_decision_forests/__init__.py
@@ -51,10 +51,10 @@
 """
 
-__version__ = "1.9.0rc0"
+__version__ = "1.9.0"
 __author__ = "Mathieu Guillame-Bert"
 
-compatible_tf_versions = ["2.16.0-rc0"]
+compatible_tf_versions = ["2.16.1"]
 __git_version__ = "HEAD"  # Modify for release build.
 
 from tensorflow_decision_forests.tensorflow import check_version
diff --git a/tensorflow_decision_forests/keras/wrappers_pre_generated.py b/tensorflow_decision_forests/keras/wrappers_pre_generated.py
index 13fd58d..b7684f0 100644
--- a/tensorflow_decision_forests/keras/wrappers_pre_generated.py
+++ b/tensorflow_decision_forests/keras/wrappers_pre_generated.py
@@ -297,6 +297,16 @@ class CartModel(core.CoreModel):
       IN_NODE. - IN_NODE: The features are sorted just before being used in the
       node. This solution is slow but consumes little amount of memory. .
       Default: "PRESORT".
+    sparse_oblique_max_num_projections: For sparse oblique splits i.e.
+      `split_axis=SPARSE_OBLIQUE`. Maximum number of projections (applied after
+      the num_projections_exponent). Oblique splits try out
+      max(p^num_projections_exponent, max_num_projections) random projections
+      for choosing a split, where p is the number of numerical features.
+      Increasing "max_num_projections" increases the training time but not the
+      inference time. In late stage model development, if every bit of accuracy
+      is important, increase this value. The paper "Sparse Projection Oblique
+      Random Forests" (Tomita et al, 2020) does not define this hyperparameter.
+      Default: None.
     sparse_oblique_normalization: For sparse oblique splits i.e.
       `split_axis=SPARSE_OBLIQUE`. Normalization applied on the features,
       before applying the sparse oblique projections. - `NONE`: No
       normalization. -
@@ -306,12 +316,28 @@ class CartModel(core.CoreModel):
       max-min) estimated on the entire train dataset. Default: None.
     sparse_oblique_num_projections_exponent: For sparse oblique splits i.e.
       `split_axis=SPARSE_OBLIQUE`. Controls of the number of random projections
-      to test at each node as `num_features^num_projections_exponent`. Default:
-      None.
-    sparse_oblique_projection_density_factor: For sparse oblique splits i.e.
-      `split_axis=SPARSE_OBLIQUE`. Controls of the number of random projections
-      to test at each node as `num_features^num_projections_exponent`. Default:
-      None.
+      to test at each node. Increasing this value very likely improves the
+      quality of the model, drastically increases the training time, and does
+      not impact the inference time. Oblique splits try out
+      max(p^num_projections_exponent, max_num_projections) random projections
+      for choosing a split, where p is the number of numerical features.
+      Therefore, increasing this `num_projections_exponent` and possibly
+      `max_num_projections` may improve model quality, but will also
+      significantly increase training time. Note that the complexity of
+      (classic) Random Forests is roughly proportional to
+      `num_projections_exponent=0.5`, since it considers sqrt(num_features) for
+      a split. The complexity of (classic) GBDT is roughly proportional to
+      `num_projections_exponent=1`, since it considers all features for a
+      split. The paper "Sparse Projection Oblique Random Forests" (Tomita et
+      al, 2020) recommends values in [1/4, 2]. Default: None.
+    sparse_oblique_projection_density_factor: Density of the projections as an
+      exponent of the number of features. Independently for each projection,
+      each feature has a probability "projection_density_factor / num_features"
+      to be considered in the projection. The paper "Sparse Projection Oblique
+      Random Forests" (Tomita et al, 2020) calls this parameter `lambda` and
+      recommends values in [1, 5]. Increasing this value increases training and
+      inference time (on average). This value is best tuned for each dataset.
+      Default: None.
     sparse_oblique_weights: For sparse oblique splits i.e.
       `split_axis=SPARSE_OBLIQUE`. Possible values: - `BINARY`: The oblique
       weights are sampled in {-1,1} (default). - `CONTINUOUS`: The oblique
@@ -383,6 +409,7 @@ def __init__(
       pure_serving_model: Optional[bool] = False,
       random_seed: Optional[int] = 123456,
       sorting_strategy: Optional[str] = "PRESORT",
+      sparse_oblique_max_num_projections: Optional[int] = None,
       sparse_oblique_normalization: Optional[str] = None,
       sparse_oblique_num_projections_exponent: Optional[float] = None,
       sparse_oblique_projection_density_factor: Optional[float] = None,
@@ -425,6 +452,9 @@ def __init__(
         "pure_serving_model": pure_serving_model,
         "random_seed": random_seed,
         "sorting_strategy": sorting_strategy,
+        "sparse_oblique_max_num_projections": (
+            sparse_oblique_max_num_projections
+        ),
         "sparse_oblique_normalization": sparse_oblique_normalization,
         "sparse_oblique_num_projections_exponent": (
             sparse_oblique_num_projections_exponent
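The `sparse_oblique_*` parameters documented in the hunks above are plain
constructor arguments; a minimal sketch (values are illustrative, and the
arguments only take effect together with `split_axis="SPARSE_OBLIQUE"`):

```python
import tensorflow_decision_forests as tfdf

# Sketch: enable sparse oblique splits on a CART model.
model = tfdf.keras.CartModel(
    split_axis="SPARSE_OBLIQUE",
    sparse_oblique_normalization="STANDARD_DEVIATION",
    sparse_oblique_max_num_projections=50,
    sparse_oblique_weights="BINARY",
)
```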
@@ -1154,6 +1184,16 @@ class GradientBoostedTreesModel(core.CoreModel):
       IN_NODE. - IN_NODE: The features are sorted just before being used in the
       node. This solution is slow but consumes little amount of memory. .
       Default: "PRESORT".
+    sparse_oblique_max_num_projections: For sparse oblique splits i.e.
+      `split_axis=SPARSE_OBLIQUE`. Maximum number of projections (applied after
+      the num_projections_exponent). Oblique splits try out
+      max(p^num_projections_exponent, max_num_projections) random projections
+      for choosing a split, where p is the number of numerical features.
+      Increasing "max_num_projections" increases the training time but not the
+      inference time. In late stage model development, if every bit of accuracy
+      is important, increase this value. The paper "Sparse Projection Oblique
+      Random Forests" (Tomita et al, 2020) does not define this hyperparameter.
+      Default: None.
     sparse_oblique_normalization: For sparse oblique splits i.e.
       `split_axis=SPARSE_OBLIQUE`. Normalization applied on the features,
       before applying the sparse oblique projections. - `NONE`: No
       normalization. -
@@ -1163,12 +1203,28 @@ class GradientBoostedTreesModel(core.CoreModel):
       max-min) estimated on the entire train dataset. Default: None.
     sparse_oblique_num_projections_exponent: For sparse oblique splits i.e.
       `split_axis=SPARSE_OBLIQUE`. Controls of the number of random projections
-      to test at each node as `num_features^num_projections_exponent`. Default:
-      None.
-    sparse_oblique_projection_density_factor: For sparse oblique splits i.e.
-      `split_axis=SPARSE_OBLIQUE`. Controls of the number of random projections
-      to test at each node as `num_features^num_projections_exponent`. Default:
-      None.
+      to test at each node. Increasing this value very likely improves the
+      quality of the model, drastically increases the training time, and does
+      not impact the inference time. Oblique splits try out
+      max(p^num_projections_exponent, max_num_projections) random projections
+      for choosing a split, where p is the number of numerical features.
+      Therefore, increasing this `num_projections_exponent` and possibly
+      `max_num_projections` may improve model quality, but will also
+      significantly increase training time. Note that the complexity of
+      (classic) Random Forests is roughly proportional to
+      `num_projections_exponent=0.5`, since it considers sqrt(num_features) for
+      a split. The complexity of (classic) GBDT is roughly proportional to
+      `num_projections_exponent=1`, since it considers all features for a
+      split. The paper "Sparse Projection Oblique Random Forests" (Tomita et
+      al, 2020) recommends values in [1/4, 2]. Default: None.
+    sparse_oblique_projection_density_factor: Density of the projections as an
+      exponent of the number of features. Independently for each projection,
+      each feature has a probability "projection_density_factor / num_features"
+      to be considered in the projection. The paper "Sparse Projection Oblique
+      Random Forests" (Tomita et al, 2020) calls this parameter `lambda` and
+      recommends values in [1, 5]. Increasing this value increases training and
+      inference time (on average). This value is best tuned for each dataset.
+      Default: None.
     sparse_oblique_weights: For sparse oblique splits i.e.
       `split_axis=SPARSE_OBLIQUE`. Possible values: - `BINARY`: The oblique
       weights are sampled in {-1,1} (default). - `CONTINUOUS`: The oblique
@@ -1279,6 +1335,7 @@ def __init__(
       selective_gradient_boosting_ratio: Optional[float] = 0.01,
       shrinkage: Optional[float] = 0.1,
       sorting_strategy: Optional[str] = "PRESORT",
+      sparse_oblique_max_num_projections: Optional[int] = None,
       sparse_oblique_normalization: Optional[str] = None,
       sparse_oblique_num_projections_exponent: Optional[float] = None,
       sparse_oblique_projection_density_factor: Optional[float] = None,
@@ -1351,6 +1408,9 @@ def __init__(
         "selective_gradient_boosting_ratio": selective_gradient_boosting_ratio,
         "shrinkage": shrinkage,
         "sorting_strategy": sorting_strategy,
+        "sparse_oblique_max_num_projections": (
+            sparse_oblique_max_num_projections
+        ),
         "sparse_oblique_normalization": sparse_oblique_normalization,
         "sparse_oblique_num_projections_exponent": (
             sparse_oblique_num_projections_exponent
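The same parameters exist on `GradientBoostedTreesModel` (hunks above) and
`RandomForestModel` (hunks below). A tuning-oriented sketch, using the ranges
the docstrings quote from Tomita et al. (2020); the exact values are
illustrative and best tuned per dataset:

```python
import tensorflow_decision_forests as tfdf

model = tfdf.keras.GradientBoostedTreesModel(
    split_axis="SPARSE_OBLIQUE",
    # Docstring-recommended range for the exponent: [1/4, 2].
    sparse_oblique_num_projections_exponent=1.0,
    # Docstring-recommended range for the density ("lambda"): [1, 5].
    sparse_oblique_projection_density_factor=2.0,
    # Bounds the projections tried per node; raise it late in development if
    # every bit of accuracy matters (increases training time, not inference).
    sparse_oblique_max_num_projections=100,
)
```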
@@ -2207,6 +2267,16 @@ class RandomForestModel(core.CoreModel):
       IN_NODE. - IN_NODE: The features are sorted just before being used in the
       node. This solution is slow but consumes little amount of memory. .
       Default: "PRESORT".
+    sparse_oblique_max_num_projections: For sparse oblique splits i.e.
+      `split_axis=SPARSE_OBLIQUE`. Maximum number of projections (applied after
+      the num_projections_exponent). Oblique splits try out
+      max(p^num_projections_exponent, max_num_projections) random projections
+      for choosing a split, where p is the number of numerical features.
+      Increasing "max_num_projections" increases the training time but not the
+      inference time. In late stage model development, if every bit of accuracy
+      is important, increase this value. The paper "Sparse Projection Oblique
+      Random Forests" (Tomita et al, 2020) does not define this hyperparameter.
+      Default: None.
     sparse_oblique_normalization: For sparse oblique splits i.e.
       `split_axis=SPARSE_OBLIQUE`. Normalization applied on the features,
       before applying the sparse oblique projections. - `NONE`: No
       normalization. -
@@ -2216,12 +2286,28 @@ class RandomForestModel(core.CoreModel):
       max-min) estimated on the entire train dataset. Default: None.
     sparse_oblique_num_projections_exponent: For sparse oblique splits i.e.
       `split_axis=SPARSE_OBLIQUE`. Controls of the number of random projections
-      to test at each node as `num_features^num_projections_exponent`. Default:
-      None.
-    sparse_oblique_projection_density_factor: For sparse oblique splits i.e.
-      `split_axis=SPARSE_OBLIQUE`. Controls of the number of random projections
-      to test at each node as `num_features^num_projections_exponent`. Default:
-      None.
+      to test at each node. Increasing this value very likely improves the
+      quality of the model, drastically increases the training time, and does
+      not impact the inference time. Oblique splits try out
+      max(p^num_projections_exponent, max_num_projections) random projections
+      for choosing a split, where p is the number of numerical features.
+      Therefore, increasing this `num_projections_exponent` and possibly
+      `max_num_projections` may improve model quality, but will also
+      significantly increase training time. Note that the complexity of
+      (classic) Random Forests is roughly proportional to
+      `num_projections_exponent=0.5`, since it considers sqrt(num_features) for
+      a split. The complexity of (classic) GBDT is roughly proportional to
+      `num_projections_exponent=1`, since it considers all features for a
+      split. The paper "Sparse Projection Oblique Random Forests" (Tomita et
+      al, 2020) recommends values in [1/4, 2]. Default: None.
+    sparse_oblique_projection_density_factor: Density of the projections as an
+      exponent of the number of features. Independently for each projection,
+      each feature has a probability "projection_density_factor / num_features"
+      to be considered in the projection. The paper "Sparse Projection Oblique
+      Random Forests" (Tomita et al, 2020) calls this parameter `lambda` and
+      recommends values in [1, 5]. Increasing this value increases training and
+      inference time (on average). This value is best tuned for each dataset.
+      Default: None.
     sparse_oblique_weights: For sparse oblique splits i.e.
       `split_axis=SPARSE_OBLIQUE`. Possible values: - `BINARY`: The oblique
       weights are sampled in {-1,1} (default). - `CONTINUOUS`: The oblique
@@ -2304,6 +2390,7 @@ def __init__(
       random_seed: Optional[int] = 123456,
       sampling_with_replacement: Optional[bool] = True,
       sorting_strategy: Optional[str] = "PRESORT",
+      sparse_oblique_max_num_projections: Optional[int] = None,
       sparse_oblique_normalization: Optional[str] = None,
       sparse_oblique_num_projections_exponent: Optional[float] = None,
       sparse_oblique_projection_density_factor: Optional[float] = None,
@@ -2358,6 +2445,9 @@ def __init__(
         "random_seed": random_seed,
         "sampling_with_replacement": sampling_with_replacement,
         "sorting_strategy": sorting_strategy,
+        "sparse_oblique_max_num_projections": (
+            sparse_oblique_max_num_projections
+        ),
         "sparse_oblique_normalization": sparse_oblique_normalization,
         "sparse_oblique_num_projections_exponent": (
             sparse_oblique_num_projections_exponent
diff --git a/third_party/yggdrasil_decision_forests/workspace.bzl b/third_party/yggdrasil_decision_forests/workspace.bzl
index 32b8dc3..a7b0dab 100644
--- a/third_party/yggdrasil_decision_forests/workspace.bzl
+++ b/third_party/yggdrasil_decision_forests/workspace.bzl
@@ -4,10 +4,13 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
 
 def deps(from_git_repo = True):
     if from_git_repo:
+        YDF_VERSION = "1.9.0"
+        YDF_SHA = "4b102dc3a08989aa069e1a830f58331e0b17d0a73df98bc3351e3378f2cfebc2"
         http_archive(
             name = "ydf",
-            urls = ["https://github.com/google/yggdrasil-decision-forests/archive/refs/heads/main.zip"],
-            strip_prefix = "yggdrasil-decision-forests-main",
+            urls = ["https://github.com/google/yggdrasil-decision-forests/archive/refs/tags/v{version}.tar.gz".format(version = YDF_VERSION)],
+            strip_prefix = "yggdrasil-decision-forests-{version}".format(version = YDF_VERSION),
+            sha256 = YDF_SHA,
             # patch_args = ["-p1"],
             # patches = ["@ydf//yggdrasil_decision_forests:ydf.patch"],
         )
diff --git a/tools/test_bazel.sh b/tools/test_bazel.sh
index 938ddd6..c578918 100755
--- a/tools/test_bazel.sh
+++ b/tools/test_bazel.sh
@@ -19,14 +19,14 @@
 # Options
 #  RUN_TESTS: Run the unit tests e.g. 0 or 1.
 #  PY_VERSION: Version of Python to be used, must be at least 3.9
-#  STARTUP_FLAGS: Any flags given to baze on startup
+#  STARTUP_FLAGS: Any flags given to bazel on startup
 #  TF_VERSION: Tensorflow version to use or "nightly".
 #  MAC_INTEL_CROSSCOMPILE: Cross-compile for Intel Macs
 #  FULL_COMPILATION: If 1, compile all parts of TF-DF. This may take a long time.
# # Usage example # -# RUN_TESTS=1 PY_VERSION=3.9 TF_VERSION=2.15.0 ./tools/test_bazel.sh +# RUN_TESTS=1 PY_VERSION=3.9 TF_VERSION=2.16.1 ./tools/test_bazel.sh set -vex @@ -108,8 +108,8 @@ short_commit_sha=$(echo $short_commit_sha | grep -oP '(?<=-g)[0-9a-f]*$') echo "Found tensorflow commit sha: $short_commit_sha" commit_slug=$(curl -s "https://api.github.com/repos/tensorflow/tensorflow/commits/$short_commit_sha" | grep "sha" | head -n 1 | cut -d '"' -f 4) # Update TF dependency to the chosen version -sed -E -i "s/strip_prefix = \"tensorflow-2\.[0-9]+\.[0-9]+(-rc[0-9]+)?\",/strip_prefix = \"tensorflow-${commit_slug}\",/" WORKSPACE -sed -E -i "s|\"https://github.com/tensorflow/tensorflow/archive/v.+\.zip\"|\"https://github.com/tensorflow/tensorflow/archive/${commit_slug}.zip\"|" WORKSPACE +sed -E -i "s/strip_prefix = \"tensorflow-2\.[0-9]+(\.[0-9]+)*(-rc[0-9]+)?\",/strip_prefix = \"tensorflow-${commit_slug}\",/" WORKSPACE +sed -E -i "s|\"https://github.com/tensorflow/tensorflow/archive/v.+\.tar.gz\"|\"https://github.com/tensorflow/tensorflow/archive/${commit_slug}.tar.gz\"|" WORKSPACE prev_shasum=$(grep -A 1 -e "strip_prefix.*tensorflow-" WORKSPACE | tail -1 | awk -F '"' '{print $2}') sed -i "s/sha256 = \"${prev_shasum}\",//" WORKSPACE
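
For readers deciphering the `sed` expressions above: they rewrite the
`http_archive` pin in `WORKSPACE` to an exact TensorFlow commit (the script
then also blanks the now-stale `sha256`). An equivalent Python sketch of the
first two substitutions (a hypothetical helper, not part of the repository):

```python
import re

def pin_tensorflow(workspace_text: str, commit_slug: str) -> str:
    """Rewrites the WORKSPACE http_archive to point at one TF commit."""
    # Swap the release prefix (e.g. "tensorflow-2.16.1") for the commit slug.
    workspace_text = re.sub(
        r'strip_prefix = "tensorflow-2\.[0-9]+(\.[0-9]+)*(-rc[0-9]+)?",',
        'strip_prefix = "tensorflow-%s",' % commit_slug,
        workspace_text,
    )
    # Point the archive URL at the same commit.
    workspace_text = re.sub(
        r'"https://github\.com/tensorflow/tensorflow/archive/v.+\.tar\.gz"',
        '"https://github.com/tensorflow/tensorflow/archive/%s.tar.gz"'
        % commit_slug,
        workspace_text,
    )
    return workspace_text
```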