From 4f3203749123b33c8fe99f551961c728d1454c70 Mon Sep 17 00:00:00 2001
From: Arturo Vargas
Date: Tue, 30 Jul 2024 14:59:56 -0700
Subject: [PATCH 1/7] minor fixes

---
 .../fractal-ex2-RAJA-CUDA-solution.cpp        | 20 ++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp b/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp
index dc4a057..1f6ef06 100644
--- a/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp
+++ b/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp
@@ -9,6 +9,8 @@
 #include
 #include "RAJA/RAJA.hpp"
+#include "umpire/Umpire.hpp"
+#include "umpire/strategy/QuickPool.hpp"
 #include "../../tpl/writeBMP.hpp"

 #define xMin 0.74395
@@ -17,7 +19,7 @@
 #define yMax 0.11899

 /* TODO: create a variable called "THREADS" to be used when calling the kernel*/
-#define THREADS 512
+#define THREADS 512

 int main(int argc, char *argv[])
 {
@@ -41,23 +43,23 @@ int main(int argc, char *argv[])
   //pixels of the fractal image.
   auto& rm = umpire::ResourceManager::getInstance();
   unsigned char *cnt{nullptr};
-  auto allocator = rm.getAllocator("PINNED");
+  auto allocator = rm.getAllocator("UVM");
   auto pool = rm.makeAllocator("qpool", allocator);
   cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char)));

-  /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
-   * and one inner loop. Only the inner for loop will be calculating pixels.
+  /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
+   * and one inner loop. Only the inner for loop will be calculating pixels.
   */
   using KERNEL_POLICY = RAJA::KernelPolicy<
     RAJA::statement::CudaKernel<
-      RAJA::statement::For<1, RAJA::cuda_block_x_loop,
-        RAJA::statement::For<0, RAJA::cuda_thread_x_loop,
+      RAJA::statement::For<1, RAJA::cuda_global_thread_y,
+        RAJA::statement::For<0, RAJA::cuda_global_thread_x,
           RAJA::statement::Lambda<0>
         >
-      >
+      >
     >
   >;
-
+
   /* compute fractal */
   gettimeofday(&start, NULL);
   /* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above.
@@ -85,7 +87,7 @@ int main(int argc, char *argv[])
         x = x2 - y2 - cx;
         depth--;
       } while ((depth > 0) && ((x2 + y2) <= 5.0));
-      d_cnt[row * width + col] = depth & 255;  //Remember to index the image like normal
+      cnt[row * width + col] = depth & 255;  //Remember to index the image like normal
     });
   gettimeofday(&end, NULL);  //By the time we exit the RAJA::Kernel, host and device are synchronized for us.
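For readers following the series without the repository checked out, the allocation-plus-kernel structure that PATCH 1 moves the CUDA solution toward is sketched below. This is an illustrative reduction rather than the repository file: the fractal iteration body is elided, `width` is hard-coded, and the templated `rm.makeAllocator<umpire::strategy::QuickPool>(...)` overload is an assumption, since the exact call is not visible in the patch text.

    // Sketch only: Umpire QuickPool on a unified-memory allocator feeding a
    // RAJA::kernel launch; the real solution computes a fractal depth per pixel.
    #include "RAJA/RAJA.hpp"
    #include "umpire/Umpire.hpp"
    #include "umpire/strategy/QuickPool.hpp"

    int main()
    {
      const int width = 1024;  // placeholder; the tutorial reads this from argv

      auto& rm = umpire::ResourceManager::getInstance();
      auto allocator = rm.getAllocator("UM");  // unified memory, visible to host and device
      auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
      unsigned char* cnt = static_cast<unsigned char*>(
          pool.allocate(width * width * sizeof(unsigned char)));

      // One CUDA kernel: index 1 (rows) on the outer loop, index 0 (columns) inside.
      using KERNEL_POLICY = RAJA::KernelPolicy<
        RAJA::statement::CudaKernel<
          RAJA::statement::For<1, RAJA::cuda_global_thread_y,
            RAJA::statement::For<0, RAJA::cuda_global_thread_x,
              RAJA::statement::Lambda<0>
            >
          >
        >
      >;

      RAJA::kernel<KERNEL_POLICY>(
          RAJA::make_tuple(RAJA::RangeSegment(0, width),   // index 0: col
                           RAJA::RangeSegment(0, width)),  // index 1: row
          [=] RAJA_DEVICE (int col, int row) {
            int depth = 0;  // the fractal recurrence that sets depth is elided here
            cnt[row * width + col] = depth & 255;  // unified memory: no copy back needed
          });

      pool.deallocate(cnt);
      return 0;
    }

Because `cnt` lives in unified memory, the host can write the BMP from the same pointer the device wrote to, which is why the later patches also drop the separate `d_cnt` device buffer and the explicit `hipMemcpyAsync` copy.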
From fad972a88a1dbc3c5522dadd1cdbb55f819d0aaa Mon Sep 17 00:00:00 2001
From: Arturo Vargas
Date: Tue, 30 Jul 2024 15:48:27 -0700
Subject: [PATCH 2/7] minor bug fixes

---
 .../fractal-ex2-RAJA-CUDA-solution.cpp        | 10 ++++-----
 .../03-HIP/fractal-ex3-RAJA-HIP.cpp           |  2 ++
 .../fractal-ex3-RAJA-HIP-solution.cpp         | 21 +++++++++----------
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp b/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp
index 1f6ef06..ea21672 100644
--- a/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp
+++ b/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp
@@ -19,7 +19,7 @@
 #define yMax 0.11899

 /* TODO: create a variable called "THREADS" to be used when calling the kernel*/
-#define THREADS 512
+#define THREADS 256

 int main(int argc, char *argv[])
 {
@@ -43,7 +43,7 @@ int main(int argc, char *argv[])
   //pixels of the fractal image.
   auto& rm = umpire::ResourceManager::getInstance();
   unsigned char *cnt{nullptr};
-  auto allocator = rm.getAllocator("UVM");
+  auto allocator = rm.getAllocator("UM");
   auto pool = rm.makeAllocator("qpool", allocator);
   cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char)));

@@ -51,9 +51,9 @@ int main(int argc, char *argv[])
    * and one inner loop. Only the inner for loop will be calculating pixels.
   */
   using KERNEL_POLICY = RAJA::KernelPolicy<
-    RAJA::statement::CudaKernel<
-      RAJA::statement::For<1, RAJA::cuda_global_thread_y,
-        RAJA::statement::For<0, RAJA::cuda_global_thread_x,
+    RAJA::statement::CudaKernelFixed,
+        RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>,
           RAJA::statement::Lambda<0>
         >
       >
diff --git a/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp b/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp
index 478b643..a780ab5 100644
--- a/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp
+++ b/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp
@@ -9,6 +9,8 @@
 #include
 #include "RAJA/RAJA.hpp"
+#include "umpire/Umpire.hpp"
+#include "umpire/strategy/QuickPool.hpp"
 #include "../../tpl/writeBMP.hpp"

 #define xMin 0.74395
diff --git a/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp b/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp
index 17fd01f..983705a 100644
--- a/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp
+++ b/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp
@@ -9,6 +9,8 @@
 #include
 #include "RAJA/RAJA.hpp"
+#include "umpire/Umpire.hpp"
+#include "umpire/strategy/QuickPool.hpp"
 #include "../../tpl/writeBMP.hpp"

 #define xMin 0.74395
@@ -17,7 +19,7 @@
 #define yMax 0.11899

 /* TODO: create a variable called "THREADS" to be used when calling the kernel*/
-#define THREADS 512
+#define THREADS 256

 int main(int argc, char *argv[])
 {
@@ -37,21 +39,21 @@ int main(int argc, char *argv[])
   printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

-  //TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
+  //TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
   //pixels of the fractal image.
   auto& rm = umpire::ResourceManager::getInstance();
   unsigned char *cnt{nullptr};
-  auto allocator = rm.getAllocator("PINNED");
+  auto allocator = rm.getAllocator("UM");
   auto pool = rm.makeAllocator("qpool", allocator);
   cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char)));

   /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a hip kernel with one outer loop
    * and one inner loop. Only the inner for loop will be calculating pixels.
   */
-  using KERNEL_POLICY = RAJA::KernelPolicy<
-    RAJA::statement::HipKernel<
-      RAJA::statement::For<1, RAJA::hip_block_x_loop,
-        RAJA::statement::For<0, RAJA::hip_thread_x_loop,
+  using KERNEL_POLICY = RAJA::KernelPolicyFixed<
+    RAJA::statement::HipKernel,
+        RAJA::statement::For<0, RAJA::hip_global_size_x_direct<16>,
           RAJA::statement::Lambda<0>
         >
       >
@@ -84,15 +86,12 @@ int main(int argc, char *argv[])
         x = x2 - y2 - cx;
         depth--;
       } while ((depth > 0) && ((x2 + y2) <= 5.0));
-      d_cnt[row * width + col] = depth & 255;  //Remember to index the image like normal
+      cnt[row * width + col] = depth & 255;  //Remember to index the image like normal
     });
   gettimeofday(&end, NULL);  //By the time we exit the RAJA::Kernel, host and device are synchronized for us.
   printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

-  /* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
-  hipMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), hipMemcpyDeviceToHost);
-
   /* verify result by writing it to a file */
   if (width <= 2048) {
     wbmp.WriteBMP(width, width, cnt, "fractal.bmp");

From 5738783ebb9e90e3f92e7792b540972abcef25a6 Mon Sep 17 00:00:00 2001
From: Arturo Vargas
Date: Tue, 30 Jul 2024 15:51:03 -0700
Subject: [PATCH 3/7] clean up hip code

---
 Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp     | 8 ++++----
 .../03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp    | 1 -
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp b/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp
index a780ab5..ba5b9ed 100644
--- a/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp
+++ b/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp
@@ -18,7 +18,7 @@
 #define yMin 0.11321
 #define yMax 0.11899

-#define THREADS 512
+#define THREADS 256

 //TODO: uncomment this out in order to build!
 // #define COMPILE
@@ -43,11 +43,11 @@ int main(int argc, char *argv[])
   printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

-  //TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
+  //TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
   //pixels of the fractal image.
   auto& rm = umpire::ResourceManager::getInstance();
   unsigned char *cnt{nullptr};
-  auto allocator = rm.getAllocator("PINNED");
+  auto allocator = rm.getAllocator("UM");
   auto pool = rm.makeAllocator("qpool", allocator);
   cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char)));

@@ -87,7 +87,7 @@ int main(int argc, char *argv[])
         x = x2 - y2 - cx;
         depth--;
       } while ((depth > 0) && ((x2 + y2) <= 5.0));
-      d_cnt[row * width + col] = depth & 255;  //Remember to index the image like normal
+      cnt[row * width + col] = depth & 255;  //Remember to index the image like normal
     });
   gettimeofday(&end, NULL);  //By the time we exit the RAJA::Kernel, host and device are synchronized for us.
diff --git a/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp b/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp
index 983705a..de0408c 100644
--- a/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp
+++ b/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp
@@ -18,7 +18,6 @@
 #define yMin 0.11321
 #define yMax 0.11899

-/* TODO: create a variable called "THREADS" to be used when calling the kernel*/
 #define THREADS 256

 int main(int argc, char *argv[])

From 2814e06da3c453d91fddfd547b6bd09695fa454d Mon Sep 17 00:00:00 2001
From: Arturo Vargas
Date: Tue, 30 Jul 2024 15:53:23 -0700
Subject: [PATCH 4/7] clean up cuda version

---
 Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp    | 8 +++++---
 .../02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp   | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp b/Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp
index 8e35f91..ef38846 100644
--- a/Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp
+++ b/Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp
@@ -9,6 +9,8 @@
 #include
 #include "RAJA/RAJA.hpp"
+#include "umpire/Umpire.hpp"
+#include "umpire/strategy/QuickPool.hpp"
 #include "../../tpl/writeBMP.hpp"

 #define xMin 0.74395
@@ -16,7 +18,7 @@
 #define yMin 0.11321
 #define yMax 0.11899

-#define THREADS 512
+#define THREADS 256

 //TODO: uncomment this out in order to build!
 // #define COMPILE
@@ -41,7 +43,7 @@ int main(int argc, char *argv[])
   printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

-  //TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
+  //TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
   //pixels of the fractal image.
   auto& rm = umpire::ResourceManager::getInstance();
   unsigned char *cnt{nullptr};
@@ -85,7 +87,7 @@ int main(int argc, char *argv[])
         x = x2 - y2 - cx;
         depth--;
       } while ((depth > 0) && ((x2 + y2) <= 5.0));
-      d_cnt[row * width + col] = depth & 255;  //Remember to index the image like normal
+      cnt[row * width + col] = depth & 255;  //Remember to index the image like normal
     });
   gettimeofday(&end, NULL);  //By the time we exit the RAJA::Kernel, host and device are synchronized for us.
diff --git a/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp b/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp
index ea21672..a844c23 100644
--- a/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp
+++ b/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp
@@ -39,7 +39,7 @@ int main(int argc, char *argv[])
   printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

-  //TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
+  //TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
   //pixels of the fractal image.
   auto& rm = umpire::ResourceManager::getInstance();
   unsigned char *cnt{nullptr};

From e09269912efbe9bcbf45328a7086543f31283b74 Mon Sep 17 00:00:00 2001
From: Arturo Vargas
Date: Tue, 30 Jul 2024 16:34:41 -0700
Subject: [PATCH 5/7] clean up pass

---
 .../fractal-ex4-RAJA-launch-solution.cpp      | 26 ++++++++++++-------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp b/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp
index 96aba17..d4f693d 100644
--- a/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp
+++ b/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp
@@ -42,45 +42,51 @@ int main(int argc, char *argv[])
   printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

-  //TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the
+  //TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
   // pixels of the fractal image.
   auto& rm = umpire::ResourceManager::getInstance();
   unsigned char *cnt{nullptr};
-  auto allocator = rm.getAllocator("PINNED");
+  auto allocator = rm.getAllocator("UM");
   auto pool = rm.makeAllocator("qpool", allocator);
   cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char)));

   //TODO: Create a RAJA Kernel Policy which uses the loop_exec policy. We want to start
   //with a normal serial nested loop first before continuing onward.
+  constexpr int block_dim = 16;
   using host_launch = RAJA::seq_launch_t;

 #if defined(RAJA_ENABLE_CUDA)
   using device_launch = RAJA::cuda_launch_t;
-#elif defined(RAJA_ENABLE_HIP)
-  using device_launch = RAJA::hip_launch_t;
 #endif

   using launch_policy = RAJA::LaunchPolicy<
     host_launch
-#if defined(RAJA_GPU_ACTIVE)
+#if defined(RAJA_ENABLE_CUDA)
     ,device_launch
 #endif
   >;

-  using col_loop = RAJA::LoopPolicy;
+  using col_loop = RAJA::LoopPolicy
+#endif
   >;

-  using row_loop = RAJA::LoopPolicy;
+  using row_loop = RAJA::LoopPolicy
+#endif
   >;

   /* start time */
   gettimeofday(&start, NULL);

-  constexpr int block_sz = 16;
-  int n_blocks = (width + block_sz-1) / block_sz + 1;
+  int n_blocks = (width + block_dim-1) / block_dim + 1;

   RAJA::launch
     (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_blocks, n_blocks),
-                                           RAJA::Threads(block_sz, block_sz)),
+                                           RAJA::Threads(block_dim, block_dim)),
     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
       RAJA::loop(ctx, RAJA::RangeSegment(0, width), [&] (int col) {

From 31f6ae6c135b096d68bbaa00afdb047739521d8b Mon Sep 17 00:00:00 2001
From: Arturo Vargas
Date: Tue, 30 Jul 2024 16:40:53 -0700
Subject: [PATCH 6/7] clean up pass

---
 .../04-LAUNCH/fractal-ex4-RAJA-launch.cpp     | 12 +++++++-----
 .../fractal-ex4-RAJA-launch-solution.cpp      | 18 +++++++-----------
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/Intermediate_Tutorial/04-LAUNCH/fractal-ex4-RAJA-launch.cpp b/Intermediate_Tutorial/04-LAUNCH/fractal-ex4-RAJA-launch.cpp
index 7cd1c18..37db388 100644
--- a/Intermediate_Tutorial/04-LAUNCH/fractal-ex4-RAJA-launch.cpp
+++ b/Intermediate_Tutorial/04-LAUNCH/fractal-ex4-RAJA-launch.cpp
@@ -49,15 +49,18 @@ int main(int argc, char *argv[])
   auto& rm = umpire::ResourceManager::getInstance();
   unsigned char *cnt{nullptr};
-  auto allocator = rm.getAllocator("PINNED");
+  auto allocator = rm.getAllocator("UM");
   auto pool = rm.makeAllocator("qpool", allocator);
   cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char)));

+  constexpr int team_dim = 16;
+  using host_launch = RAJA::seq_launch_t;
+  //TODO: Create a RAJA launch policy for the host and device
   using launch_policy = RAJA::LaunchPolicy;

-  //TODO: create RAJA loop policies for the host and device
+  //TODO: create RAJA global thread loop policies for the host and device
   using col_loop = RAJA::LoopPolicy;

   using row_loop = RAJA::LoopPolicy;
@@ -66,13 +69,12 @@ int main(int argc, char *argv[])
   /* start time */
   gettimeofday(&start, NULL);

   //Calculate number of blocks
-  constexpr int team_sz = 16;
-  int n_teams = (width + team_sz - 1) / team_sz + 1;
+  int n_teams = (width + team_dim - 1) / team_dim + 1;

   //Teams are akin to to CUDA/HIP blocks
   RAJA::launch
     (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_teams, n_teams),
-                                           RAJA::Threads(team_sz, team_sz)),
+                                           RAJA::Threads(team_dim, team_dim)),
     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
       RAJA::loop(ctx, RAJA::RangeSegment(0, width), [&] (int col) {
diff --git a/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp b/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp
index d4f693d..d315d5e 100644
--- a/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp
+++ b/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp
@@ -42,20 +42,16 @@ int main(int argc, char *argv[])
   printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

-  //TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
-  // pixels of the fractal image.
   auto& rm = umpire::ResourceManager::getInstance();
   unsigned char *cnt{nullptr};
   auto allocator = rm.getAllocator("UM");
   auto pool = rm.makeAllocator("qpool", allocator);
   cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char)));

-  //TODO: Create a RAJA Kernel Policy which uses the loop_exec policy. We want to start
-  //with a normal serial nested loop first before continuing onward.
-  constexpr int block_dim = 16;
+  constexpr int team_dim = 16;
   using host_launch = RAJA::seq_launch_t;
+  //TODO: create RAJA global thread loop policies for the host and device

 #if defined(RAJA_ENABLE_CUDA)
   using device_launch = RAJA::cuda_launch_t;
 #endif
@@ -69,24 +65,24 @@ int main(int argc, char *argv[])
   using col_loop = RAJA::LoopPolicy
+    ,RAJA::cuda_global_size_y_direct
 #endif
   >;

   using row_loop = RAJA::LoopPolicy
+    ,RAJA::cuda_global_size_x_direct
 #endif
   >;

   /* start time */
   gettimeofday(&start, NULL);

-  int n_blocks = (width + block_dim-1) / block_dim + 1;
+  int n_teams = (width + team_dim-1) / team_dim + 1;

   RAJA::launch
-    (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_blocks, n_blocks),
-                                           RAJA::Threads(block_dim, block_dim)),
+    (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_teams, n_teams),
+                                           RAJA::Threads(team_dim, team_dim)),
     [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
       RAJA::loop(ctx, RAJA::RangeSegment(0, width), [&] (int col) {

From da374e16b68a96abd94d0cde52ed979e94059db5 Mon Sep 17 00:00:00 2001
From: Arturo Vargas
Date: Wed, 31 Jul 2024 10:29:48 -0700
Subject: [PATCH 7/7] loop -> seq

---
 .../04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp b/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp
index d315d5e..c991070 100644
--- a/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp
+++ b/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp
@@ -51,7 +51,7 @@ int main(int argc, char *argv[])
   constexpr int team_dim = 16;
   using host_launch = RAJA::seq_launch_t;

-  //TODO: create RAJA global thread loop policies for the host and device
+  //TODO: create RAJA global thread loop policies for the host and device
 #if defined(RAJA_ENABLE_CUDA)
   using device_launch = RAJA::cuda_launch_t;
 #endif
@@ -63,13 +63,13 @@ int main(int argc, char *argv[])
   >;

-  using col_loop = RAJA::LoopPolicy
 #endif
   >;

-  using row_loop = RAJA::LoopPolicy
 #endif
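Taken together, PATCHes 5 through 7 converge on a host/device-portable RAJA::launch structure; a reduced sketch of that shape is below. It is not the repository file: only the CUDA branch is shown, the fractal body is elided, the execution place is hard-coded, and the `cuda_launch_t<false>` and `cuda_global_size_*_direct<team_dim>` template arguments are filled in as plausible values because they are not visible in the patch text above.

    // Sketch only: one code path that runs sequentially on the host or as a
    // CUDA kernel, selected at run time through RAJA::ExecPlace.
    #include "RAJA/RAJA.hpp"

    int main()
    {
      const int width        = 1024;   // placeholder
      constexpr int team_dim = 16;

      using host_launch = RAJA::seq_launch_t;
    #if defined(RAJA_ENABLE_CUDA)
      using device_launch = RAJA::cuda_launch_t<false>;  // assumed synchronous launch
    #endif

      using launch_policy = RAJA::LaunchPolicy<host_launch
    #if defined(RAJA_ENABLE_CUDA)
                                               , device_launch
    #endif
                                               >;

      // Rows map to the y dimension of the global thread grid, columns to x.
      using row_loop = RAJA::LoopPolicy<RAJA::seq_exec
    #if defined(RAJA_ENABLE_CUDA)
                                        , RAJA::cuda_global_size_y_direct<team_dim>
    #endif
                                        >;
      using col_loop = RAJA::LoopPolicy<RAJA::seq_exec
    #if defined(RAJA_ENABLE_CUDA)
                                        , RAJA::cuda_global_size_x_direct<team_dim>
    #endif
                                        >;

      RAJA::ExecPlace select_cpu_or_gpu = RAJA::ExecPlace::HOST;  // or DEVICE

      const int n_teams = (width + team_dim - 1) / team_dim;  // teams ~ CUDA/HIP blocks

      RAJA::launch<launch_policy>(select_cpu_or_gpu,
          RAJA::LaunchParams(RAJA::Teams(n_teams, n_teams),
                             RAJA::Threads(team_dim, team_dim)),
          [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {
            RAJA::loop<row_loop>(ctx, RAJA::RangeSegment(0, width), [&] (int row) {
              RAJA::loop<col_loop>(ctx, RAJA::RangeSegment(0, width), [&] (int col) {
                (void)row; (void)col;  // per-pixel fractal computation elided
              });
            });
          });

      return 0;
    }

The same lambda runs in both places: with `ExecPlace::HOST` the loop policies fall back to `RAJA::seq_exec`, which matches the `loop -> seq` rename that PATCH 7 applies to the solution file.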