From 8226b8e9c87881a41aba36abbf2964aefda56ebe Mon Sep 17 00:00:00 2001 From: Arturo Vargas Date: Wed, 10 Jul 2024 09:36:17 -0700 Subject: [PATCH] overview of gpu threads --- .../01-GPU-Threads/01-GPU-Threads.cpp | 174 ++++++++++++++---- 1 file changed, 140 insertions(+), 34 deletions(-) diff --git a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp index 5790add..06a53b8 100644 --- a/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp +++ b/Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp @@ -1,21 +1,68 @@ +#include "RAJA/RAJA.hpp" + #include -int main(int argc, char *argv[]) +int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv)) { - - auto& rm = umpire::ResourceManager::getInstance(); - unsigned char *cnt{nullptr}; - auto allocator = rm.getAllocator("UM"); - auto pool = rm.makeAllocator("qpool", allocator); - cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); - - using device_launch = RAJA::cuda_launch_t; - using launch_policy = RAJA::LaunchPolicy; - - //Example 1. Global Indexing: - //GPU programming models such as CUDA and HIP follow a thread/block(team) programming model - //in which a predefined compute grid + // GPU programming models such as CUDA and HIP perform computation + // on a predefined grid composed of threads and blocks. + // RAJA provides policies enabling users to utilize different + // thread enumeration strategies. The two main strategies + // are Block/Thread local enumeration and Global enumeration. + // + // + // Under a block and thread enumeration, + // threads have block local coordinate enumeration, + // while blocks are enumerated with respect + // to their location within the compute grid. + // The illustration below shows a 2 x 2 compute grid + // wherein each block has 3 x 2 threads. Current + // programming models support up to three-dimensional + // block and thread configurations. 
+ // + // + // Block (0,0) Block (1,0) + // [0,0][0,1][0,2] [0,0][0,1][0,2] + // [1,0][1,1][1,2] [1,0][1,1][1,2] + // + // Block (0,1) Block (1,1) + // [0,0][0,1][0,2] [0,0][0,1][0,2] + // [1,0][1,1][1,2] [1,0][1,1][1,2] + // + // Under the global enumeration each thread + // is assigned a unique thread id based on + // the dimension (2D illustrated here). + // The utility here comes when the iteration + // space is amenable to tiles in which blocks + // can be assigned to a tile and threads are + // assigned to work within a tile. + // + // [0,0][0,1][0,2] [0,3][0,4][0,5] + // [1,0][1,1][1,2] [1,3][1,4][1,5] + // + // [2,0][2,1][2,2] [2,3][2,4][2,5] + // [3,0][3,1][3,2] [3,3][3,4][3,5] + // + + // Short note on RAJA nomenclature: + // As RAJA serves as an abstraction layer, + // the RAJA::launch API uses the terms + // teams and threads. Teams are analogous + // to blocks in CUDA/HIP nomenclature + // and workgroups in the SYCL programming model. + // Threads are analogous to threads within CUDA/HIP + // and work-items within the SYCL programming model. + +#if defined(RAJA_ENABLE_CUDA) + + // The examples below showcase commonly used GPU policies. + // For the HIP and SYCL programming models, we offer analogous policies. + + using launch_policy = RAJA::LaunchPolicy>; + + // Example 1. Global Indexing + // Main use case: Perfectly nested loops with large iteration spaces. 
{ const int N_x = 10000; const int N_y = 20000; @@ -23,57 +70,116 @@ int main(int argc, char *argv[]) const int n_blocks_x = (N_x + block_sz) / block_sz + 1; const int n_blocks_y = (N_y + block_sz) / block_sz + 1; - using loop_pol_x = RAJA::LoopPolicy; + using global_pol_y = RAJA::LoopPolicy; + using global_pol_x = RAJA::LoopPolicy; - RAJA::launch + RAJA::launch (RAJA::LaunchParams(RAJA::Teams(n_blocks_x, n_blocks_y), RAJA::Threads(block_sz)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) { + + //Do something - //populate - - }); }); - }); } + // Example 2. Block and thread direct policies + // Ideal for when the iteration space can be broken up into tiles + // Teams can be assigned to a tile and threads can perform + // computations within the tile + + // The example below employs the direct version for block + // and thread policies; the underlying assumption is that + // the loops are within the range of the grid and block sizes. + // In CUDA the direct loops are expressed as: + // + // const int i = threadIdx.x; + // if(i < N) { //kernel } + // + { + const int n_blocks = 50000; + const int block_sz = 64; + using outer_pol = RAJA::LoopPolicy; + using inner_pol = RAJA::LoopPolicy; + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + RAJA::loop(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) { - //Iteration Space: - { - const int n_blocks = 50000; - const int block_sz = 64; + RAJA::loop(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) { - RAJA::launch - ( RAJA::LaunchParams(RAJA::Teams(n_blocks), - RAJA::Threads(block_sz)), - [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + //Do something + }); + }); - RAJA::loop(ctx, RAJA::RangeSegment(0, 
width), [&] (int col) { - }); + }); - }); } - + // Example 3. Block and thread loop policies + // Similar to the example above but using a thread loop + // policy. The utility of the thread loop policy rises when + // we consider multiple thread loops with varying iteration sizes. + + // If a RAJA loop iteration space is beyond the configured number + // of threads in a team, the thread loop policies will perform a team + // stride loop to span the whole range. + // In CUDA the block stride loop is expressed as + // + // for(int i=threadIdx.x; i; + using inner_pol = RAJA::LoopPolicy; + + RAJA::launch + (RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)), + [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { + + RAJA::loop(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) { + + //Iteration space is same as number of threads per block + //We could also use a direct policy here + RAJA::loop(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) { + //Do something here + }); //inner loop + + + //Iteration space is *more* than number of threads per block + RAJA::loop(ctx, RAJA::RangeSegment(0, 2*block_sz), [&] (int tx) { + //Do something here + }); //inner loop + + }); //outer loop + + }); + + } +#else + std::cout<<"Please compile with CUDA"<