diff --git a/VERSION.md b/VERSION.md
index 8e3b76e9d..26ca59460 100644
--- a/VERSION.md
+++ b/VERSION.md
@@ -1 +1 @@
-1.5.1-rc.2
+1.5.1
diff --git a/docs/modules/functions.rst b/docs/modules/functions.rst
index 32ed34bc2..a27c83e50 100644
--- a/docs/modules/functions.rst
+++ b/docs/modules/functions.rst
@@ -876,7 +876,7 @@ Tile Primitives
     :returns: A tile with ``shape=(m,n)`` and dtype the same as the source array
 
 
-.. py:function:: tile_store(a: Array[Any], i: int32, t: Any) -> None
+.. py:function:: tile_store(a: Array[Any], i: int32, t: Tile) -> None
 
     Stores a 1D tile to a global memory array.
 
@@ -887,7 +887,7 @@ Tile Primitives
     :param t: The source tile to store data from, must have the same dtype as the destination array
 
 
-.. py:function:: tile_store(a: Array[Any], i: int32, j: int32, t: Any) -> None
+.. py:function:: tile_store(a: Array[Any], i: int32, j: int32, t: Tile) -> None
     :noindex:
     :nocontentsentry:
 
@@ -901,7 +901,7 @@ Tile Primitives
     :param t: The source tile to store data from, must have the same dtype as the destination array
 
 
-.. py:function:: tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Any) -> Tile
+.. py:function:: tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Tile) -> Tile
 
     Atomically add a tile to the array `a`, each element will be updated atomically.
 
@@ -967,7 +967,7 @@ Tile Primitives
     
 
 
-.. py:function:: untile(a: Any) -> Scalar
+.. py:function:: untile(a: Tile) -> Scalar
 
     Convert a Tile back to per-thread values.
 
@@ -991,7 +991,7 @@ Tile Primitives
             t = wp.tile(i)*2
 
             # convert back to per-thread values
-            s = wp.untile()
+            s = wp.untile(t)
 
             print(s)
 
@@ -1038,7 +1038,7 @@ Tile Primitives
 
     Broadcast a tile.
 
-    This method will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules.
+    This function will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules.
 
     :param a: Tile to broadcast
     :returns: Tile with broadcast ``shape=(m, n)``
@@ -1061,9 +1061,9 @@ Tile Primitives
             t = wp.tile_ones(dtype=float, m=16, n=16)
             s = wp.tile_sum(t)
 
-            print(t)
+            print(s)
 
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
@@ -1088,18 +1088,19 @@ Tile Primitives
         @wp.kernel
         def compute():
 
-            t = wp.tile_arange(start=--10, stop=10, dtype=float)
+            t = wp.tile_arange(64, 128)
             s = wp.tile_min(t)
 
-            print(t)
+            print(s)
 
-        wp.launch(compute, dim=[64], inputs=[])
+
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
     .. code-block:: text
 
-        tile(m=1, n=1, storage=register) = [[-10]]
+        tile(m=1, n=1, storage=register) = [[64 ]]
 
     
 
@@ -1118,23 +1119,23 @@ Tile Primitives
         @wp.kernel
         def compute():
 
-            t = wp.tile_arange(start=--10, stop=10, dtype=float)
-            s = wp.tile_min(t)
+            t = wp.tile_arange(64, 128)
+            s = wp.tile_max(t)
 
-            print(t)
+            print(s)
 
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
     .. code-block:: text
 
-        tile(m=1, n=1, storage=register) = [[10]]
+        tile(m=1, n=1, storage=register) = [[127 ]]
 
     
 
 
-.. py:function:: tile_reduce(op: Callable, a: Any) -> Tile
+.. py:function:: tile_reduce(op: Callable, a: Tile) -> Tile
 
     Apply a custom reduction operator across the tile.
 
@@ -1156,7 +1157,7 @@ Tile Primitives
 
             print(s)
 
-        wp.launch(factorial, dim=[16], inputs=[], block_dim=16)
+        wp.launch_tiled(factorial, dim=[1], inputs=[], block_dim=16)
 
     Prints:
 
@@ -1166,7 +1167,7 @@ Tile Primitives
     
 
 
-.. py:function:: tile_map(op: Callable, a: Any) -> Tile
+.. py:function:: tile_map(op: Callable, a: Tile) -> Tile
 
     Apply a unary function onto the tile.
 
@@ -1188,7 +1189,7 @@ Tile Primitives
 
             print(s)
 
-        wp.launch(compute, dim=[16], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16)
 
     Prints:
 
@@ -1198,7 +1199,7 @@ Tile Primitives
     
 
 
-.. py:function:: tile_map(op: Callable, a: Any, b: Any) -> Tile
+.. py:function:: tile_map(op: Callable, a: Tile, b: Tile) -> Tile
     :noindex:
     :nocontentsentry:
 
@@ -1226,7 +1227,7 @@ Tile Primitives
 
             print(s)
 
-        wp.launch(compute, dim=[16], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16)
 
     Prints:
 
diff --git a/docs/modules/tiles.rst b/docs/modules/tiles.rst
index d8d6f12f9..e0adcbf53 100644
--- a/docs/modules/tiles.rst
+++ b/docs/modules/tiles.rst
@@ -22,25 +22,42 @@ Inside kernels, tile operations are executed cooperatively across each block of
 
 In the following example, we launch a grid of threads where each block is responsible for loading a row of data from a 2D array and computing its sum:
 
-.. code:: python
+.. testcode::
     
     TILE_SIZE = wp.constant(256)
     TILE_THREADS = 64
 
     @wp.kernel
-    def compute(a: array2d(dtype=float))
-        
+    def compute(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
+
         # obtain our block index
         i = wp.tid()
 
         # load a row from global memory
-        t = wp.tile_load(array[i], i, TILE_SIZE)
+        t = wp.tile_load(a[i], 0, TILE_SIZE)
+
+        # cooperatively compute the sum of the tile elements; s is a 1x1 tile
         s = wp.tile_sum(t)
-        ...
 
-    wp.launch_tiled(compute, dim=[a.shape[0]], inputs=[a], block_dim=TILE_THREADS)
+        # store s in global memory
+        wp.tile_store(b[0], i, s)
+
+    N = 10
+
+    a_np = np.arange(N).reshape(-1, 1) * np.ones((1, 256), dtype=float)
+    a = wp.array(a_np, dtype=float)
+    b = wp.zeros((1,N), dtype=float)
+
+    wp.launch_tiled(compute, dim=[a.shape[0]], inputs=[a, b], block_dim=TILE_THREADS)
+
+    print(f"b = {b}")
+
+.. testoutput::
+
+    b = [[   0.  256.  512.  768. 1024. 1280. 1536. 1792. 2048. 2304.]]
     
 Here, we have used the new :func:`warp.launch_tiled` function which assigns ``TILE_THREADS`` threads to each of the elements in the launch grid. Each block of ``TILE_THREADS`` threads then loads an entire row of 256 values from the global memory array and computes its sum (cooperatively).
+Note that we loaded the row by writing ``t = wp.tile_load(a[i], 0, TILE_SIZE)`` but we could have used the equivalent statement ``t = wp.tile_load(a[0], i, TILE_SIZE)`` instead.
 
 
 Tile Properties
@@ -55,7 +72,7 @@ In Warp, tile objects are 2D arrays of data where the tile elements may be scala
     TILE_THREADS = 64
 
     @wp.kernel
-    def compute(a: array2d(dtype=float))
+    def compute(a: wp.array2d(dtype=float)):
         
         # obtain our 2d block index
         i, j = wp.tid()
@@ -213,7 +230,7 @@ Traditionally, Warp kernels are primarily written in the SIMT programming model,
     TILE_THREADS = 64
 
     @wp.kernel
-    def compute()
+    def compute():
         i = wp.tid()
 
         # perform some per-thread computation
diff --git a/exts/omni.warp.core/config/extension.toml b/exts/omni.warp.core/config/extension.toml
index 2ae29633b..0a55904a5 100644
--- a/exts/omni.warp.core/config/extension.toml
+++ b/exts/omni.warp.core/config/extension.toml
@@ -1,6 +1,6 @@
 [package]
 # Semantic Versioning is used: https://semver.org/
-version = "1.5.1-rc.2"
+version = "1.5.1"
 authors = ["NVIDIA"]
 title = "Warp Core"
 description="The core Warp Python module"
diff --git a/exts/omni.warp.core/docs/CHANGELOG.md b/exts/omni.warp.core/docs/CHANGELOG.md
index 43c563612..d42c8a3ed 100644
--- a/exts/omni.warp.core/docs/CHANGELOG.md
+++ b/exts/omni.warp.core/docs/CHANGELOG.md
@@ -1,6 +1,6 @@
 # CHANGELOG
 
-## [1.5.1-rc.2] - 2025-01-02
+## [1.5.1] - 2025-01-02
 
 ### Added
 
diff --git a/exts/omni.warp/config/extension.toml b/exts/omni.warp/config/extension.toml
index d63163bea..8a5558a83 100644
--- a/exts/omni.warp/config/extension.toml
+++ b/exts/omni.warp/config/extension.toml
@@ -1,6 +1,6 @@
 [package]
 # Semantic Versioning is used: https://semver.org/
-version = "1.5.1-rc.2"
+version = "1.5.1"
 authors = ["NVIDIA"]
 title = "Warp"
 description="Warp OmniGraph Nodes and Sample Scenes"
@@ -35,7 +35,7 @@ exclude = ["Ogn*Database.py", "*/ogn*"]
 "omni.timeline" = {}
 "omni.ui" = {optional = true}
 "omni.usd" = {}
-"omni.warp.core" = {version = "1.5.1-rc.2", exact = true}
+"omni.warp.core" = {version = "1.5.1", exact = true}
 
 [[python.module]]
 name = "omni.warp._extension"
diff --git a/exts/omni.warp/docs/CHANGELOG.md b/exts/omni.warp/docs/CHANGELOG.md
index 43c563612..d42c8a3ed 100644
--- a/exts/omni.warp/docs/CHANGELOG.md
+++ b/exts/omni.warp/docs/CHANGELOG.md
@@ -1,6 +1,6 @@
 # CHANGELOG
 
-## [1.5.1-rc.2] - 2025-01-02
+## [1.5.1] - 2025-01-02
 
 ### Added
 
diff --git a/warp/builtins.py b/warp/builtins.py
index d3a89dbe2..73a9261a9 100644
--- a/warp/builtins.py
+++ b/warp/builtins.py
@@ -1852,6 +1852,7 @@ def tile_arange_value_func(arg_types: Mapping[str, type], arg_values: Mapping[st
         step = args[2]
 
     if start is None or stop is None or step is None:
+        print(args)
         raise RuntimeError("wp.tile_arange() arguments must be compile time constants")
 
     if "dtype" in arg_values:
@@ -2083,7 +2084,7 @@ def tile_store_1d_value_func(arg_types, arg_values):
 
 add_builtin(
     "tile_store",
-    input_types={"a": array(dtype=Any), "i": int, "t": Any},
+    input_types={"a": array(dtype=Any), "i": int, "t": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_store_1d_value_func,
     variadic=False,
     skip_replay=True,
@@ -2132,7 +2133,7 @@ def tile_store_2d_value_func(arg_types, arg_values):
 
 add_builtin(
     "tile_store",
-    input_types={"a": array(dtype=Any), "i": int, "j": int, "t": Any},
+    input_types={"a": array(dtype=Any), "i": int, "j": int, "t": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_store_2d_value_func,
     variadic=False,
     skip_replay=True,
@@ -2177,7 +2178,7 @@ def tile_atomic_add_value_func(arg_types, arg_values):
 
 add_builtin(
     "tile_atomic_add",
-    input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Any},
+    input_types={"a": array(dtype=Any), "x": int, "y": int, "t": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_atomic_add_value_func,
     variadic=True,
     skip_replay=True,
@@ -2365,7 +2366,7 @@ def untile_value_func(arg_types, arg_values):
 
 add_builtin(
     "untile",
-    input_types={"a": Any},
+    input_types={"a": Tile(dtype=Any, M=Any, N=Any)},
     value_func=untile_value_func,
     variadic=True,
     doc="""Convert a Tile back to per-thread values.
@@ -2390,7 +2391,7 @@ def compute():
             t = wp.tile(i)*2
 
             # convert back to per-thread values
-            s = wp.untile()
+            s = wp.untile(t)
 
             print(s)
 
@@ -2562,7 +2563,7 @@ def tile_broadcast_dispatch_func(arg_types: Mapping[str, type], return_type: Any
     variadic=True,
     doc="""Broadcast a tile.
 
-    This method will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules.
+    This function will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules.
 
     :param a: Tile to broadcast
     :returns: Tile with broadcast ``shape=(m, n)``""",
@@ -2654,9 +2655,9 @@ def compute():
             t = wp.tile_ones(dtype=float, m=16, n=16)
             s = wp.tile_sum(t)
 
-            print(t)
+            print(s)
 
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
@@ -2703,18 +2704,19 @@ def tile_min_value_func(arg_types, arg_values):
         @wp.kernel
         def compute():
 
-            t = wp.tile_arange(start=--10, stop=10, dtype=float)
+            t = wp.tile_arange(64, 128)
             s = wp.tile_min(t)
 
-            print(t)
+            print(s)
+
 
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
     .. code-block:: text
 
-        tile(m=1, n=1, storage=register) = [[-10]]
+        tile(m=1, n=1, storage=register) = [[64 ]]
 
     """,
     group="Tile Primitives",
@@ -2755,18 +2757,18 @@ def tile_max_value_func(arg_types, arg_values):
         @wp.kernel
         def compute():
 
-            t = wp.tile_arange(start=--10, stop=10, dtype=float)
-            s = wp.tile_min(t)
+            t = wp.tile_arange(64, 128)
+            s = wp.tile_max(t)
 
-            print(t)
+            print(s)
 
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
     .. code-block:: text
 
-        tile(m=1, n=1, storage=register) = [[10]]
+        tile(m=1, n=1, storage=register) = [[127 ]]
 
     """,
     group="Tile Primitives",
@@ -2796,7 +2798,7 @@ def tile_reduce_dispatch_func(input_types: Mapping[str, type], return_type: Any,
 
 add_builtin(
     "tile_reduce",
-    input_types={"op": Callable, "a": Any},
+    input_types={"op": Callable, "a": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_reduce_value_func,
     native_func="tile_reduce",
     doc="""Apply a custom reduction operator across the tile.
@@ -2819,7 +2821,7 @@ def factorial():
 
             print(s)
 
-        wp.launch(factorial, dim=[16], inputs=[], block_dim=16)
+        wp.launch_tiled(factorial, dim=[1], inputs=[], block_dim=16)
 
     Prints:
 
@@ -2856,7 +2858,7 @@ def tile_unary_map_value_func(arg_types, arg_values):
 
 add_builtin(
     "tile_map",
-    input_types={"op": Callable, "a": Any},
+    input_types={"op": Callable, "a": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_unary_map_value_func,
     # dispatch_func=tile_map_dispatch_func,
     # variadic=True,
@@ -2881,7 +2883,7 @@ def compute():
 
             print(s)
 
-        wp.launch(compute, dim=[16], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16)
 
     Prints:
 
@@ -2923,7 +2925,7 @@ def tile_binary_map_value_func(arg_types, arg_values):
 
 add_builtin(
     "tile_map",
-    input_types={"op": Callable, "a": Any, "b": Any},
+    input_types={"op": Callable, "a": Tile(dtype=Any, M=Any, N=Any), "b": Tile(dtype=Any, M=Any, N=Any)},
     value_func=tile_binary_map_value_func,
     # dispatch_func=tile_map_dispatch_func,
     # variadic=True,
@@ -2952,7 +2954,7 @@ def compute():
 
             print(s)
 
-        wp.launch(compute, dim=[16], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16)
 
     Prints:
 
diff --git a/warp/config.py b/warp/config.py
index 195b4ce46..f684198a9 100644
--- a/warp/config.py
+++ b/warp/config.py
@@ -7,7 +7,7 @@
 
 from typing import Optional
 
-version: str = "1.5.1-rc.2"
+version: str = "1.5.1"
 """Warp version string"""
 
 verify_fp: bool = False
diff --git a/warp/native/tile.h b/warp/native/tile.h
index edbc8d665..185f04c28 100644
--- a/warp/native/tile.h
+++ b/warp/native/tile.h
@@ -1125,8 +1125,6 @@ inline CUDA_CALLABLE auto untile(Tile& tile)
     }
 }
 
-
-
 template <typename Tile, typename Value>
 inline CUDA_CALLABLE void adj_untile(Tile& tile, Tile& adj_tile, Value& adj_ret)
 {    
@@ -1156,7 +1154,7 @@ inline CUDA_CALLABLE auto tile_zeros()
     return T(0);
 }
 
-// zero initialized tile
+// one-initialized tile
 template <typename T, int M, int N>
 inline CUDA_CALLABLE auto tile_ones()
 {
@@ -1164,7 +1162,7 @@ inline CUDA_CALLABLE auto tile_ones()
     return T(1);
 }
 
-// zero initialized tile
+// tile with evenly spaced values
 template <typename T, int M, int N>
 inline CUDA_CALLABLE auto tile_arange(T start, T stop, T step)
 {
@@ -1220,7 +1218,6 @@ inline CUDA_CALLABLE void tile_store(array_t<T>& dest, int x, int y, Tile& src)
     src.copy_to_global(dest, x, y);
 }
 
-// entry point for store
 template <typename T, typename Tile>
 inline CUDA_CALLABLE auto tile_atomic_add(array_t<T>& dest, int x, int y, Tile& src)
 {
diff --git a/warp/stubs.py b/warp/stubs.py
index a88eb4444..21ac7a871 100644
--- a/warp/stubs.py
+++ b/warp/stubs.py
@@ -975,7 +975,7 @@ def tile_load(a: Array[Any], i: int32, j: int32, m: int32, n: int32, storage: st
 
 
 @over
-def tile_store(a: Array[Any], i: int32, t: Any):
+def tile_store(a: Array[Any], i: int32, t: Tile):
     """Stores a 1D tile to a global memory array.
 
     This method will cooperatively store a tile to global memory using all threads in the block.
@@ -988,7 +988,7 @@ def tile_store(a: Array[Any], i: int32, t: Any):
 
 
 @over
-def tile_store(a: Array[Any], i: int32, j: int32, t: Any):
+def tile_store(a: Array[Any], i: int32, j: int32, t: Tile):
     """Stores a tile to a global memory array.
 
     This method will cooperatively store a tile to global memory using all threads in the block.
@@ -1002,7 +1002,7 @@ def tile_store(a: Array[Any], i: int32, j: int32, t: Any):
 
 
 @over
-def tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Any) -> Tile:
+def tile_atomic_add(a: Array[Any], x: int32, y: int32, t: Tile) -> Tile:
     """Atomically add a tile to the array `a`, each element will be updated atomically.
 
     :param a: Array in global memory, should have the same ``dtype`` as the input tile
@@ -1077,7 +1077,7 @@ def compute():
 
 
 @over
-def untile(a: Any) -> Scalar:
+def untile(a: Tile) -> Scalar:
     """Convert a Tile back to per-thread values.
 
     This function converts a block-wide tile back to per-thread values.
@@ -1100,7 +1100,7 @@ def compute():
             t = wp.tile(i) * 2
 
             # convert back to per-thread values
-            s = wp.untile()
+            s = wp.untile(t)
 
             print(s)
 
@@ -1154,7 +1154,7 @@ def tile_transpose(a: Tile) -> Tile:
 def tile_broadcast(a: Tile, m: int32, n: int32) -> Tile:
     """Broadcast a tile.
 
-    This method will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules.
+    This function will attempt to broadcast the input tile ``a`` to the destination shape (m, n), broadcasting follows NumPy broadcast rules.
 
     :param a: Tile to broadcast
     :returns: Tile with broadcast ``shape=(m, n)``
@@ -1178,10 +1178,10 @@ def compute():
             t = wp.tile_ones(dtype=float, m=16, n=16)
             s = wp.tile_sum(t)
 
-            print(t)
+            print(s)
 
 
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
@@ -1207,19 +1207,19 @@ def tile_min(a: Tile) -> Tile:
 
         @wp.kernel
         def compute():
-            t = wp.tile_arange(start=--10, stop=10, dtype=float)
+            t = wp.tile_arange(64, 128)
             s = wp.tile_min(t)
 
-            print(t)
+            print(s)
 
 
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
     .. code-block:: text
 
-        tile(m=1, n=1, storage=register) = [[-10]]
+        tile(m=1, n=1, storage=register) = [[64 ]]
 
 
     """
@@ -1239,19 +1239,19 @@ def tile_max(a: Tile) -> Tile:
 
         @wp.kernel
         def compute():
-            t = wp.tile_arange(start=--10, stop=10, dtype=float)
-            s = wp.tile_min(t)
+            t = wp.tile_arange(64, 128)
+            s = wp.tile_max(t)
 
-            print(t)
+            print(s)
 
 
-        wp.launch(compute, dim=[64], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=64)
 
     Prints:
 
     .. code-block:: text
 
-        tile(m=1, n=1, storage=register) = [[10]]
+        tile(m=1, n=1, storage=register) = [[127 ]]
 
 
     """
@@ -1259,7 +1259,7 @@ def compute():
 
 
 @over
-def tile_reduce(op: Callable, a: Any) -> Tile:
+def tile_reduce(op: Callable, a: Tile) -> Tile:
     """Apply a custom reduction operator across the tile.
 
     This function cooperatively performs a reduction using the provided operator across the tile.
@@ -1280,7 +1280,7 @@ def factorial():
             print(s)
 
 
-        wp.launch(factorial, dim=[16], inputs=[], block_dim=16)
+        wp.launch_tiled(factorial, dim=[1], inputs=[], block_dim=16)
 
     Prints:
 
@@ -1293,7 +1293,7 @@ def factorial():
 
 
 @over
-def tile_map(op: Callable, a: Any) -> Tile:
+def tile_map(op: Callable, a: Tile) -> Tile:
     """Apply a unary function onto the tile.
 
     This function cooperatively applies a unary function to each element of the tile using all threads in the block.
@@ -1314,7 +1314,7 @@ def compute():
             print(s)
 
 
-        wp.launch(compute, dim=[16], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16)
 
     Prints:
 
@@ -1327,7 +1327,7 @@ def compute():
 
 
 @over
-def tile_map(op: Callable, a: Any, b: Any) -> Tile:
+def tile_map(op: Callable, a: Tile, b: Tile) -> Tile:
     """Apply a binary function onto the tile.
 
     This function cooperatively applies a binary function to each element of the tiles using all threads in the block.
@@ -1352,7 +1352,7 @@ def compute():
             print(s)
 
 
-        wp.launch(compute, dim=[16], inputs=[])
+        wp.launch_tiled(compute, dim=[1], inputs=[], block_dim=16)
 
     Prints: