diff --git a/VERSION.md b/VERSION.md index 8e3b76e9d..26ca59460 100644 --- a/VERSION.md +++ b/VERSION.md @@ -1 +1 @@ -1.5.1-rc.2 +1.5.1 diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst index 32ed34bc2..a27c83e50 100644 --- a/docs/modules/functions.rst +++ b/docs/modules/functions.rst @@ -876,7 +876,7 @@ Tile Primitives :returns: A tile with ``shape=(m,n)`` and dtype the same as the source array -.. py:function:: tile_store(a: Array[Any], i: int32, t: Any) -> None +.. py:function:: tile_store(a: Array[Any], i: int32, t: Tile) -> None Stores a 1D tile to a global memory array. @@ -887,7 +887,7 @@ Tile Primitives :param t: The source tile to store data from, must have the same dtype as the destination array -.. py:function:: tile_store(a: Array[Any], i: int32, j: int32, t: Any) -> None +.. py:function:: tile_store(a: Array[Any], i: int32, j: int32, t: Tile) -> None :noindex: :nocontentsentry: @@ -901,7 +901,7 @@ Tile Primitives :param t: The source tile to store data from, must have the same dtype as the destination array -.. py:function:: tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Any) -> Tile +.. py:function:: tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Tile) -> Tile Atomically add a tile to the array `a`, each element will be updated atomically. @@ -967,7 +967,7 @@ Tile Primitives -.. py:function:: untile(a: Any) -> Scalar +.. py:function:: untile(a: Tile) -> Scalar Convert a Tile back to per-thread values. @@ -991,7 +991,7 @@ Tile Primitives t = wp.tile(i)*2 # convert back to per-thread values - s = wp.untile() + s = wp.untile(t) print(s) @@ -1038,7 +1038,7 @@ Tile Primitives Broadcast a tile. - This method will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules. + This function will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules. :param a: Tile to broadcast :returns: Tile with broadcast ``shape=(m, n)`` @@ -1061,9 +1061,9 @@ Tile Primitives t = wp.tile_ones(dtype=float, m=16, n=16) s = wp.tile_sum(t) - print(t) + print(s) - wp.launch(compute, dim=[64], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64) Prints: @@ -1088,18 +1088,19 @@ Tile Primitives @wp.kernel def compute(): - t = wp.tile_arange(start=--10, stop=10, dtype=float) + t = wp.tile_arange(64, 128) s = wp.tile_min(t) - print(t) + print(s) - wp.launch(compute, dim=[64], inputs=[]) + + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64) Prints: .. code-block:: text - tile(m=1, n=1, storage=register) = [[-10]] + tile(m=1, n=1, storage=register) = [[64 ]] @@ -1118,23 +1119,23 @@ Tile Primitives @wp.kernel def compute(): - t = wp.tile_arange(start=--10, stop=10, dtype=float) - s = wp.tile_min(t) + t = wp.tile_arange(64, 128) + s = wp.tile_max(t) - print(t) + print(s) - wp.launch(compute, dim=[64], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64) Prints: .. code-block:: text - tile(m=1, n=1, storage=register) = [[10]] + tile(m=1, n=1, storage=register) = [[127 ]] -.. py:function:: tile_reduce(op: Callable, a: Any) -> Tile +.. py:function:: tile_reduce(op: Callable, a: Tile) -> Tile Apply a custom reduction operator across the tile. @@ -1156,7 +1157,7 @@ Tile Primitives print(s) - wp.launch(factorial, dim=[16], inputs=[], block_dim=16) + wp.launch_tiled(factorial, dim=[1], inputs=[], block_dim=16) Prints: @@ -1166,7 +1167,7 @@ Tile Primitives -.. py:function:: tile_map(op: Callable, a: Any) -> Tile +.. py:function:: tile_map(op: Callable, a: Tile) -> Tile Apply a unary function onto the tile. @@ -1188,7 +1189,7 @@ Tile Primitives print(s) - wp.launch(compute, dim=[16], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16) Prints: @@ -1198,7 +1199,7 @@ Tile Primitives -.. py:function:: tile_map(op: Callable, a: Any, b: Any) -> Tile +.. py:function:: tile_map(op: Callable, a: Tile, b: Tile) -> Tile :noindex: :nocontentsentry: @@ -1226,7 +1227,7 @@ Tile Primitives print(s) - wp.launch(compute, dim=[16], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16) Prints: diff --git a/docs/modules/tiles.rst b/docs/modules/tiles.rst index d8d6f12f9..e0adcbf53 100644 --- a/docs/modules/tiles.rst +++ b/docs/modules/tiles.rst @@ -22,25 +22,42 @@ Inside kernels, tile operations are executed cooperatively across each block of In the following example, we launch a grid of threads where each block is responsible for loading a row of data from a 2D array and computing its sum: -.. code:: python +.. testcode:: TILE_SIZE = wp.constant(256) TILE_THREADS = 64 @wp.kernel - def compute(a: array2d(dtype=float)) - + def compute(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)): + # obtain our block index i = wp.tid() # load a row from global memory - t = wp.tile_load(array[i], i, TILE_SIZE) + t = wp.tile_load(a[i], 0, TILE_SIZE) + + # cooperatively compute the sum of the tile elements; s is a 1x1 tile s = wp.tile_sum(t) - ... - wp.launch_tiled(compute, dim=[a.shape[0]], inputs=[a], block_dim=TILE_THREADS) + # store s in global memory + wp.tile_store(b[0], i, s) + + N = 10 + + a_np = np.arange(N).reshape(-1, 1) * np.ones((1, 256), dtype=float) + a = wp.array(a_np, dtype=float) + b = wp.zeros((1,N), dtype=float) + + wp.launch_tiled(compute, dim=[a.shape[0]], inputs=[a, b], block_dim=TILE_THREADS) + + print(f"b = {b}") + +.. testoutput:: + + b = [[ 0. 256. 512. 768. 1024. 1280. 1536. 1792. 2048. 2304.]] Here, we have used the new :func:`warp.launch_tiled` function which assigns ``TILE_THREADS`` threads to each of the elements in the launch grid. Each block of ``TILE_THREADS`` threads then loads an entire row of 256 values from the global memory array and computes its sum (cooperatively). +Note that we loaded the row by writing ``t = wp.tile_load(a[i], 0, TILE_SIZE)`` but we could have used the equivalent statement ``t = wp.tile_load(a[0], i, TILE_SIZE)`` instead. Tile Properties @@ -55,7 +72,7 @@ In Warp, tile objects are 2D arrays of data where the tile elements may be scala TILE_THREADS = 64 @wp.kernel - def compute(a: array2d(dtype=float)) + def compute(a: array2d(dtype=float)): # obtain our 2d block index i, j = wp.tid() @@ -213,7 +230,7 @@ Traditionally, Warp kernels are primarily written in the SIMT programming model, TILE_THREADS = 64 @wp.kernel - def compute() + def compute(): i = wp.tid() # perform some per-thread computation diff --git a/exts/omni.warp.core/config/extension.toml b/exts/omni.warp.core/config/extension.toml index 2ae29633b..0a55904a5 100644 --- a/exts/omni.warp.core/config/extension.toml +++ b/exts/omni.warp.core/config/extension.toml @@ -1,6 +1,6 @@ [package] # Semantic Versioning is used: https://semver.org/ -version = "1.5.1-rc.2" +version = "1.5.1" authors = ["NVIDIA"] title = "Warp Core" description="The core Warp Python module" diff --git a/exts/omni.warp.core/docs/CHANGELOG.md b/exts/omni.warp.core/docs/CHANGELOG.md index 43c563612..d42c8a3ed 100644 --- a/exts/omni.warp.core/docs/CHANGELOG.md +++ b/exts/omni.warp.core/docs/CHANGELOG.md @@ -1,6 +1,6 @@ # CHANGELOG -## [1.5.1-rc.2] - 2025-01-02 +## [1.5.1] - 2025-01-02 ### Added diff --git a/exts/omni.warp/config/extension.toml b/exts/omni.warp/config/extension.toml index d63163bea..8a5558a83 100644 --- a/exts/omni.warp/config/extension.toml +++ b/exts/omni.warp/config/extension.toml @@ -1,6 +1,6 @@ [package] # Semantic Versioning is used: https://semver.org/ -version = "1.5.1-rc.2" +version = "1.5.1" authors = ["NVIDIA"] title = "Warp" description="Warp OmniGraph Nodes and Sample Scenes" @@ -35,7 +35,7 @@ exclude = ["Ogn*Database.py", "*/ogn*"] "omni.timeline" = {} "omni.ui" = {optional = true} "omni.usd" = {} -"omni.warp.core" = {version = "1.5.1-rc.2", exact = true} +"omni.warp.core" = {version = "1.5.1", exact = true} [[python.module]] name = "omni.warp._extension" diff --git a/exts/omni.warp/docs/CHANGELOG.md b/exts/omni.warp/docs/CHANGELOG.md index 43c563612..d42c8a3ed 100644 --- a/exts/omni.warp/docs/CHANGELOG.md +++ b/exts/omni.warp/docs/CHANGELOG.md @@ -1,6 +1,6 @@ # CHANGELOG -## [1.5.1-rc.2] - 2025-01-02 +## [1.5.1] - 2025-01-02 ### Added diff --git a/warp/builtins.py b/warp/builtins.py index d3a89dbe2..73a9261a9 100644 --- a/warp/builtins.py +++ b/warp/builtins.py @@ -1852,6 +1852,7 @@ def tile_arange_value_func(arg_types: Mapping[str, type], arg_values: Mapping[st step = args[2] if start is None or stop is None or step is None: + print(args) raise RuntimeError("wp.tile_arange() arguments must be compile time constants") if "dtype" in arg_values: @@ -2083,7 +2084,7 @@ def tile_store_1d_value_func(arg_types, arg_values): add_builtin( "tile_store", - input_types={"a": array(dtype=Any), "i": int, "t": Any}, + input_types={"a": array(dtype=Any), "i": int, "t": Tile(dtype=Any, M=Any, N=Any)}, value_func=tile_store_1d_value_func, variadic=False, skip_replay=True, @@ -2132,7 +2133,7 @@ def tile_store_2d_value_func(arg_types, arg_values): add_builtin( "tile_store", - input_types={"a": array(dtype=Any), "i": int, "j": int, "t": Any}, + input_types={"a": array(dtype=Any), "i": int, "j": int, "t": Tile(dtype=Any, M=Any, N=Any)}, value_func=tile_store_2d_value_func, variadic=False, skip_replay=True, @@ -2177,7 +2178,7 @@ def tile_atomic_add_value_func(arg_types, arg_values): add_builtin( "tile_atomic_add", - input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Any}, + input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Tile(dtype=Any, M=Any, N=Any)}, value_func=tile_atomic_add_value_func, variadic=True, skip_replay=True, @@ -2365,7 +2366,7 @@ def untile_value_func(arg_types, arg_values): add_builtin( "untile", - input_types={"a": Any}, + input_types={"a": Tile(dtype=Any, M=Any, N=Any)}, value_func=untile_value_func, variadic=True, doc="""Convert a Tile back to per-thread values. @@ -2390,7 +2391,7 @@ def compute(): t = wp.tile(i)*2 # convert back to per-thread values - s = wp.untile() + s = wp.untile(t) print(s) @@ -2562,7 +2563,7 @@ def tile_broadcast_dispatch_func(arg_types: Mapping[str, type], return_type: Any variadic=True, doc="""Broadcast a tile. - This method will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules. + This function will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules. :param a: Tile to broadcast :returns: Tile with broadcast ``shape=(m, n)``""", @@ -2654,9 +2655,9 @@ def compute(): t = wp.tile_ones(dtype=float, m=16, n=16) s = wp.tile_sum(t) - print(t) + print(s) - wp.launch(compute, dim=[64], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64) Prints: @@ -2703,18 +2704,19 @@ def tile_min_value_func(arg_types, arg_values): @wp.kernel def compute(): - t = wp.tile_arange(start=--10, stop=10, dtype=float) + t = wp.tile_arange(64, 128) s = wp.tile_min(t) - print(t) + print(s) + - wp.launch(compute, dim=[64], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64) Prints: .. code-block:: text - tile(m=1, n=1, storage=register) = [[-10]] + tile(m=1, n=1, storage=register) = [[64 ]] """, group="Tile Primitives", @@ -2755,18 +2757,18 @@ def tile_max_value_func(arg_types, arg_values): @wp.kernel def compute(): - t = wp.tile_arange(start=--10, stop=10, dtype=float) - s = wp.tile_min(t) + t = wp.tile_arange(64, 128) + s = wp.tile_max(t) - print(t) + print(s) - wp.launch(compute, dim=[64], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64) Prints: .. code-block:: text - tile(m=1, n=1, storage=register) = [[10]] + tile(m=1, n=1, storage=register) = [[127 ]] """, group="Tile Primitives", @@ -2796,7 +2798,7 @@ def tile_reduce_dispatch_func(input_types: Mapping[str, type], return_type: Any, add_builtin( "tile_reduce", - input_types={"op": Callable, "a": Any}, + input_types={"op": Callable, "a": Tile(dtype=Any, M=Any, N=Any)}, value_func=tile_reduce_value_func, native_func="tile_reduce", doc="""Apply a custom reduction operator across the tile. @@ -2819,7 +2821,7 @@ def factorial(): print(s) - wp.launch(factorial, dim=[16], inputs=[], block_dim=16) + wp.launch_tiled(factorial, dim=[1], inputs=[], block_dim=16) Prints: @@ -2856,7 +2858,7 @@ def tile_unary_map_value_func(arg_types, arg_values): add_builtin( "tile_map", - input_types={"op": Callable, "a": Any}, + input_types={"op": Callable, "a": Tile(dtype=Any, M=Any, N=Any)}, value_func=tile_unary_map_value_func, # dispatch_func=tile_map_dispatch_func, # variadic=True, @@ -2881,7 +2883,7 @@ def compute(): print(s) - wp.launch(compute, dim=[16], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16) Prints: @@ -2923,7 +2925,7 @@ def tile_binary_map_value_func(arg_types, arg_values): add_builtin( "tile_map", - input_types={"op": Callable, "a": Any, "b": Any}, + input_types={"op": Callable, "a": Tile(dtype=Any, M=Any, N=Any), "b": Tile(dtype=Any, M=Any, N=Any)}, value_func=tile_binary_map_value_func, # dispatch_func=tile_map_dispatch_func, # variadic=True, @@ -2952,7 +2954,7 @@ def compute(): print(s) - wp.launch(compute, dim=[16], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16) Prints: diff --git a/warp/config.py b/warp/config.py index 195b4ce46..f684198a9 100644 --- a/warp/config.py +++ b/warp/config.py @@ -7,7 +7,7 @@ from typing import Optional -version: str = "1.5.1-rc.2" +version: str = "1.5.1" """Warp version string""" verify_fp: bool = False diff --git a/warp/native/tile.h b/warp/native/tile.h index edbc8d665..185f04c28 100644 --- a/warp/native/tile.h +++ b/warp/native/tile.h @@ -1125,8 +1125,6 @@ inline CUDA_CALLABLE auto untile(Tile& tile) } } - - template inline CUDA_CALLABLE void adj_untile(Tile& tile, Tile& adj_tile, Value& adj_ret) { @@ -1156,7 +1154,7 @@ inline CUDA_CALLABLE auto tile_zeros() return T(0); } -// zero initialized tile +// one-initialized tile template inline CUDA_CALLABLE auto tile_ones() { @@ -1164,7 +1162,7 @@ inline CUDA_CALLABLE auto tile_ones() return T(1); } -// zero initialized tile +// tile with evenly spaced values template inline CUDA_CALLABLE auto tile_arange(T start, T stop, T step) { @@ -1220,7 +1218,6 @@ inline CUDA_CALLABLE void tile_store(array_t& dest, int x, int y, Tile& src) src.copy_to_global(dest, x, y); } -// entry point for store template inline CUDA_CALLABLE auto tile_atomic_add(array_t& dest, int x, int y, Tile& src) { diff --git a/warp/stubs.py b/warp/stubs.py index a88eb4444..21ac7a871 100644 --- a/warp/stubs.py +++ b/warp/stubs.py @@ -975,7 +975,7 @@ def tile_load(a: Array[Any], i: int32, j: int32, m: int32, n: int32, storage: st @over -def tile_store(a: Array[Any], i: int32, t: Any): +def tile_store(a: Array[Any], i: int32, t: Tile): """Stores a 1D tile to a global memory array. This method will cooperatively store a tile to global memory using all threads in the block. @@ -988,7 +988,7 @@ def tile_store(a: Array[Any], i: int32, t: Any): @over -def tile_store(a: Array[Any], i: int32, j: int32, t: Any): +def tile_store(a: Array[Any], i: int32, j: int32, t: Tile): """Stores a tile to a global memory array. This method will cooperatively store a tile to global memory using all threads in the block. @@ -1002,7 +1002,7 @@ def tile_store(a: Array[Any], i: int32, j: int32, t: Any): @over -def tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Any) -> Tile: +def tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Tile) -> Tile: """Atomically add a tile to the array `a`, each element will be updated atomically. :param a: Array in global memory, should have the same ``dtype`` as the input tile @@ -1077,7 +1077,7 @@ def compute(): @over -def untile(a: Any) -> Scalar: +def untile(a: Tile) -> Scalar: """Convert a Tile back to per-thread values. This function converts a block-wide tile back to per-thread values. @@ -1100,7 +1100,7 @@ def compute(): t = wp.tile(i) * 2 # convert back to per-thread values - s = wp.untile() + s = wp.untile(t) print(s) @@ -1154,7 +1154,7 @@ def tile_transpose(a: Tile) -> Tile: def tile_broadcast(a: Tile, m: int32, n: int32) -> Tile: """Broadcast a tile. - This method will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules. + This function will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules. :param a: Tile to broadcast :returns: Tile with broadcast ``shape=(m, n)`` @@ -1178,10 +1178,10 @@ def compute(): t = wp.tile_ones(dtype=float, m=16, n=16) s = wp.tile_sum(t) - print(t) + print(s) - wp.launch(compute, dim=[64], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64) Prints: @@ -1207,19 +1207,19 @@ def tile_min(a: Tile) -> Tile: @wp.kernel def compute(): - t = wp.tile_arange(start=--10, stop=10, dtype=float) + t = wp.tile_arange(64, 128) s = wp.tile_min(t) - print(t) + print(s) - wp.launch(compute, dim=[64], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64) Prints: .. code-block:: text - tile(m=1, n=1, storage=register) = [[-10]] + tile(m=1, n=1, storage=register) = [[64 ]] """ @@ -1239,19 +1239,19 @@ def tile_max(a: Tile) -> Tile: @wp.kernel def compute(): - t = wp.tile_arange(start=--10, stop=10, dtype=float) - s = wp.tile_min(t) + t = wp.tile_arange(64, 128) + s = wp.tile_max(t) - print(t) + print(s) - wp.launch(compute, dim=[64], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64) Prints: .. code-block:: text - tile(m=1, n=1, storage=register) = [[10]] + tile(m=1, n=1, storage=register) = [[127 ]] """ @@ -1259,7 +1259,7 @@ def compute(): @over -def tile_reduce(op: Callable, a: Any) -> Tile: +def tile_reduce(op: Callable, a: Tile) -> Tile: """Apply a custom reduction operator across the tile. This function cooperatively performs a reduction using the provided operator across the tile. @@ -1280,7 +1280,7 @@ def factorial(): print(s) - wp.launch(factorial, dim=[16], inputs=[], block_dim=16) + wp.launch_tiled(factorial, dim=[1], inputs=[], block_dim=16) Prints: @@ -1293,7 +1293,7 @@ def factorial(): @over -def tile_map(op: Callable, a: Any) -> Tile: +def tile_map(op: Callable, a: Tile) -> Tile: """Apply a unary function onto the tile. This function cooperatively applies a unary function to each element of the tile using all threads in the block. @@ -1314,7 +1314,7 @@ def compute(): print(s) - wp.launch(compute, dim=[16], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16) Prints: @@ -1327,7 +1327,7 @@ def compute(): @over -def tile_map(op: Callable, a: Any, b: Any) -> Tile: +def tile_map(op: Callable, a: Tile, b: Tile) -> Tile: """Apply a binary function onto the tile. This function cooperatively applies a binary function to each element of the tiles using all threads in the block. @@ -1352,7 +1352,7 @@ def compute(): print(s) - wp.launch(compute, dim=[16], inputs=[]) + wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16) Prints: