Skip to content

Commit

Permalink
Merge pull request #24 from LLNL/artv3/intermediate-tutorial
Browse files Browse the repository at this point in the history
Minor fixes for intermediate tutorial
  • Loading branch information
artv3 authored Aug 2, 2024
2 parents 053c213 + da374e1 commit d4c1e89
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 50 deletions.
8 changes: 5 additions & 3 deletions Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"
#include "umpire/strategy/QuickPool.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
#define xMax 0.74973
#define yMin 0.11321
#define yMax 0.11899

#define THREADS 512
#define THREADS 256

//TODO: uncomment this in order to build!
// #define COMPILE
Expand All @@ -41,7 +43,7 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
//TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
Expand Down Expand Up @@ -85,7 +87,7 @@ int main(int argc, char *argv[])
x = x2 - y2 - cx;
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});
gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"
#include "umpire/strategy/QuickPool.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
Expand All @@ -17,7 +19,7 @@
#define yMax 0.11899

/* TODO: create a variable called "THREADS" to be used when calling the kernel */
#define THREADS 512
#define THREADS 256

int main(int argc, char *argv[])
{
Expand All @@ -37,27 +39,27 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
//TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("PINNED");
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
*/
using KERNEL_POLICY = RAJA::KernelPolicy<
RAJA::statement::CudaKernel<
RAJA::statement::For<1, RAJA::cuda_block_x_loop,
RAJA::statement::For<0, RAJA::cuda_thread_x_loop,
RAJA::statement::CudaKernelFixed<THREADS,
RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>,
RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>,
RAJA::statement::Lambda<0>
>
>
>
>
>;

/* compute fractal */
gettimeofday(&start, NULL);
/* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above.
Expand Down Expand Up @@ -85,7 +87,7 @@ int main(int argc, char *argv[])
x = x2 - y2 - cx;
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});
gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

Expand Down
10 changes: 6 additions & 4 deletions Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"
#include "umpire/strategy/QuickPool.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
#define xMax 0.74973
#define yMin 0.11321
#define yMax 0.11899

#define THREADS 512
#define THREADS 256

//TODO: uncomment this in order to build!
// #define COMPILE
Expand All @@ -41,11 +43,11 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
//TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("PINNED");
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

Expand Down Expand Up @@ -85,7 +87,7 @@ int main(int argc, char *argv[])
x = x2 - y2 - cx;
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});
gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"
#include "umpire/strategy/QuickPool.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
#define xMax 0.74973
#define yMin 0.11321
#define yMax 0.11899

/* TODO: create a variable called "THREADS" to be used when calling the kernel */
#define THREADS 512
#define THREADS 256

int main(int argc, char *argv[])
{
Expand All @@ -37,21 +38,21 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
//TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("PINNED");
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a hip kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
*/
using KERNEL_POLICY = RAJA::KernelPolicy<
RAJA::statement::HipKernel<
RAJA::statement::For<1, RAJA::hip_block_x_loop,
RAJA::statement::For<0, RAJA::hip_thread_x_loop,
using KERNEL_POLICY = RAJA::KernelPolicyFixed<
RAJA::statement::HipKernel<THREADS,
RAJA::statement::For<1, RAJA::hip_global_size_y_direct<16>,
RAJA::statement::For<0, RAJA::hip_global_size_x_direct<16>,
RAJA::statement::Lambda<0>
>
>
Expand Down Expand Up @@ -84,15 +85,12 @@ int main(int argc, char *argv[])
x = x2 - y2 - cx;
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});
gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

/* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
hipMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), hipMemcpyDeviceToHost);

/* verify result by writing it to a file */
if (width <= 2048) {
wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
Expand Down
12 changes: 7 additions & 5 deletions Intermediate_Tutorial/04-LAUNCH/fractal-ex4-RAJA-launch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,18 @@ int main(int argc, char *argv[])

auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("PINNED");
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

constexpr int team_dim = 16;
using host_launch = RAJA::seq_launch_t;

//TODO: Create a RAJA launch policy for the host and device
using launch_policy = RAJA::LaunchPolicy</* host launch policy */, /* device launch policies */>;


//TODO: create RAJA loop policies for the host and device
//TODO: create RAJA global thread loop policies for the host and device
using col_loop = RAJA::LoopPolicy</*host policy */, /*device policy*/>;

using row_loop = RAJA::LoopPolicy</*host policy */, /*device policy*/>;
Expand All @@ -66,13 +69,12 @@ int main(int argc, char *argv[])
gettimeofday(&start, NULL);

//Calculate number of blocks
constexpr int team_sz = 16;
int n_teams = (width + team_sz - 1) / team_sz + 1;
int n_teams = (width + team_dim - 1) / team_dim + 1;

//Teams are akin to CUDA/HIP blocks
RAJA::launch<launch_policy>
(select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_teams, n_teams),
RAJA::Threads(team_sz, team_sz)),
RAJA::Threads(team_dim, team_dim)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::loop<col_loop>(ctx, RAJA::RangeSegment(0, width), [&] (int col) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,45 +42,47 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the
// pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("PINNED");
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

//TODO: Create a RAJA Kernel Policy which uses the loop_exec policy. We want to start
//with a normal serial nested loop first before continuing onward.

constexpr int team_dim = 16;
using host_launch = RAJA::seq_launch_t;

//TODO: create RAJA global thread loop policies for the host and device
#if defined(RAJA_ENABLE_CUDA)
using device_launch = RAJA::cuda_launch_t<false>;
#elif defined(RAJA_ENABLE_HIP)
using device_launch = RAJA::hip_launch_t<false>;
#endif

using launch_policy = RAJA::LaunchPolicy<
host_launch
#if defined(RAJA_GPU_ACTIVE)
#if defined(RAJA_ENABLE_CUDA)
,device_launch
#endif
>;

using col_loop = RAJA::LoopPolicy<RAJA::loop_exec, RAJA::cuda_global_thread_x>;
using col_loop = RAJA::LoopPolicy<RAJA::seq_exec
#if defined(RAJA_ENABLE_CUDA)
,RAJA::cuda_global_size_y_direct<team_dim>
#endif
>;

using row_loop = RAJA::LoopPolicy<RAJA::loop_exec, RAJA::cuda_global_thread_y>;
using row_loop = RAJA::LoopPolicy<RAJA::seq_exec
#if defined(RAJA_ENABLE_CUDA)
,RAJA::cuda_global_size_x_direct<team_dim>
#endif
>;

/* start time */
gettimeofday(&start, NULL);

constexpr int block_sz = 16;
int n_blocks = (width + block_sz-1) / block_sz + 1;
int n_teams = (width + team_dim-1) / team_dim + 1;

RAJA::launch<launch_policy>
(select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_blocks, n_blocks),
RAJA::Threads(block_sz, block_sz)),
(select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_teams, n_teams),
RAJA::Threads(team_dim, team_dim)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::loop<col_loop>(ctx, RAJA::RangeSegment(0, width), [&] (int col) {
Expand Down

0 comments on commit d4c1e89

Please sign in to comment.