Merge pull request #17 from LLNL/artv3/Intermediate_Tutorial_edits

Update readmes for intermediate tutorial
LLNL · Aug 15, 2023 · 06da65b · 06da65b
2 parents f82a6f6 + 88551dc
commit 06da65b
Show file tree

Hide file tree

Showing 24 changed files with 316 additions and 238 deletions.
diff --git a/...Tutorial/02-CUDA/solutions/CMakeLists.txt → Intermediate_Tutorial/00-BASE/CMakeLists.txt b/...Tutorial/02-CUDA/solutions/CMakeLists.txt → Intermediate_Tutorial/00-BASE/CMakeLists.txt
@@ -5,7 +5,7 @@
 # SPDX-License-Identifier: (BSD-3-Clause)
 ###############################################################################
 
-raja_add_executable(
-  NAME fractal-solution-RAJA-CUDA
-  SOURCES fractal-solution-RAJA-CUDA.cpp
-  DEPENDS_ON cuda writeBMP umpire RAJA)
+blt_add_executable(
+  NAME fractal-ex0-c-loop
+  SOURCES fractal-ex0-c-loop.cpp
+  DEPENDS_ON writeBMP)
diff --git a/Intermediate_Tutorial/00-BASE/README.md b/Intermediate_Tutorial/00-BASE/README.md
@@ -0,0 +1,24 @@
+===============================================
+Fractal Tutorial - Base (Sequential) Execution
+===============================================
+
+This file has no exercises but it
+is used as a base reference implementation.
+
+Use the code to compare implementations
+between the RAJA kernel and launch abstractions.
+
+Compile and run the code:
+
+```
+$ make fractal-ex0-c-loop
+$ ./bin/fractal-ex0-c-loop
+```
+
+Before starting, be sure to study the seq-exec implementation of the fractal. 
+It is important to note:
+ * Read-only, write-only, and read-write variables used in the main computation
+ * The main data structure that holds the values of the fractal pixels
+ * Any data dependencies, if any, throughout the computation of the pixels
+
+Also note that this is a sequential implementation. Timing information will be output to the screen. As we add RAJA and Umpire, it will be interesting to see how performance improves.
diff --git a/Intermediate_Tutorial/00-BASE/fractal-ex0-c-loop.cpp b/Intermediate_Tutorial/00-BASE/fractal-ex0-c-loop.cpp
@@ -0,0 +1,72 @@
+#include <malloc.h>
+#include <sys/time.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "../../tpl/writeBMP.hpp"
+
+#define xMin 0.74395
+#define xMax 0.74973
+#define yMin 0.11321
+#define yMax 0.11899
+
+int main(int argc, char *argv[])
+{
+
+  double dx, dy;
+  int width;
+  const int maxdepth = 256;
+  struct timeval start, end;
+  writebmp wbmp;
+
+  /* check command line */
+  if(argc != 2) {fprintf(stderr, "usage: exe <width> \n"); exit(-1);}
+  width = atoi(argv[1]);
+  if (width < 10) {fprintf(stderr, "edge_length must be at least 10\n"); exit(-1);}
+
+  dx = (xMax - xMin) / width;
+  dy = (yMax - yMin) / width;
+
+  printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);
+
+  unsigned char *cnt = (unsigned char*)malloc(width * width * sizeof(unsigned char));
+
+  /* start time */
+  gettimeofday(&start, NULL);
+
+  for(int row = 0; row < width; ++row) {
+    for(int col = 0; col < width; ++col) {
+
+      double x2, y2, x, y, cx, cy;
+      int depth;
+
+      cy = yMin + row * dy; //compute row #
+      cx = xMin + col * dx; //compute column #
+      x = -cx;
+      y = -cy;
+      depth = maxdepth;
+      do {
+        x2 = x * x;
+        y2 = y * y;
+        y = 2 * x * y - cy;
+        x = x2 - y2 - cx;
+        depth--;
+      } while ((depth > 0) && ((x2 + y2) <= 5.0));
+      cnt[row * width + col] = depth & 255;
+
+    }
+  }
+
+  /* end time */
+  gettimeofday(&end, NULL);
+  printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);
+
+  /* verify result by writing it to a file */
+  if (width <= 2048) {
+    wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
+  }
+
+  free(cnt);
+
+  return 0;
+}
diff --git a/Intermediate_Tutorial/01-SEQ/CMakeLists.txt b/Intermediate_Tutorial/01-SEQ/CMakeLists.txt
@@ -8,4 +8,4 @@
 blt_add_executable(
   NAME fractal-ex1-RAJA-seq
   SOURCES fractal-ex1-RAJA-seq.cpp
-  DEPENDS_ON cuda RAJA umpire writeBMP)
+  DEPENDS_ON RAJA umpire writeBMP)
diff --git a/...ediate_Tutorial/01-SEQ/tutorial-guide.rst → Intermediate_Tutorial/01-SEQ/README.md b/...ediate_Tutorial/01-SEQ/tutorial-guide.rst → Intermediate_Tutorial/01-SEQ/README.md
@@ -1,9 +1,9 @@
 =================================
-Fractal Tutorial - LOOP Execution
+Fractal Tutorial - SEQ Execution
 =================================
 
-Before starting, be sure to study the loop-exec implementation of the fractal 
-before continuing. It is important to note:
+Before starting, be sure to study the base implementation in the 00-BASE directory.
+Remember, it is important to note:
  * Read-only, write-only, and read-write variables used in the main computation
  * The main data structure that holds the values of the fractal pixels
  * Any data dependencies, if any, throughout the computation of the pixels
@@ -14,6 +14,18 @@ lesson 12 from the Introduction Tutorial) for the fractal and
 complete the appropriate `RAJA::Kernel` statement using the `RAJA::seq_exec` execution
 policy.
 
+A complete description of the different policies is available in the online RAJA
+documentation:
+https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/policies.html#raja-loop-kernel-execution-policies
+
 The `seq_exec` policy is a good first step because it allows us to get a sense of the
-performance using serial nested loops. From here, we have a good baseline to compare against
-when transitioning to CUDA, HIP, etc. 
+performance using sequential, nested loops with RAJA. 
+From here, we have a good baseline to compare against when transitioning to 
+CUDA, HIP, etc. 
+
+To run the code compile and run via:
+
+```
+$ make fractal-ex1-RAJA-seq
+$ ./bin/fractal-ex1-RAJA-seq
+```
diff --git a/Intermediate_Tutorial/01-SEQ/fractal-ex1-RAJA-seq.cpp b/Intermediate_Tutorial/01-SEQ/fractal-ex1-RAJA-seq.cpp
@@ -11,6 +11,7 @@
 #define yMin 0.11321
 #define yMax 0.11899
 
+//TODO: uncomment this out in order to build!
 // #define COMPILE
 
 int main(int argc, char *argv[])
@@ -33,7 +34,7 @@ int main(int argc, char *argv[])
 
   printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);
 
-  //TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the
+  //TODO: Create an Umpire QuickPool allocator with host memory that will hold the
   //pixels of the fractal image.
   auto& rm = umpire::ResourceManager::getInstance();
   unsigned char *cnt{nullptr};

diff --git a/...01-SEQ/solutions/fractal-ex1-RAJA-seq.cpp → ...lutions/fractal-ex1-RAJA-seq-solution.cpp b/...01-SEQ/solutions/fractal-ex1-RAJA-seq.cpp → ...lutions/fractal-ex1-RAJA-seq-solution.cpp
@@ -29,11 +29,11 @@ int main(int argc, char *argv[])
 
   printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);
 
-  //TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the
+  //TODO: Create an Umpire QuickPool allocator with host memory that will hold the
   //pixels of the fractal image.
   auto& rm = umpire::ResourceManager::getInstance();
   unsigned char *cnt{nullptr};
-  auto allocator = rm.getAllocator("UM");
+  auto allocator = rm.getAllocator("HOST");
   auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
   cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));
 

diff --git a/Intermediate_Tutorial/02-CUDA/CMakeLists.txt b/Intermediate_Tutorial/02-CUDA/CMakeLists.txt
@@ -11,5 +11,3 @@ if (ENABLE_CUDA)
     SOURCES fractal-ex2-RAJA-CUDA.cpp
     DEPENDS_ON cuda writeBMP umpire RAJA)
 endif()
-
-add_subdirectory(solutions)
diff --git a/Intermediate_Tutorial/02-CUDA/README.md b/Intermediate_Tutorial/02-CUDA/README.md
@@ -0,0 +1,17 @@
+=================================
+Fractal Tutorial - CUDA Execution
+=================================
+
+Look for the `TODO` comments in the source code. Here you will have to choose 
+two RAJA CUDA policies for the kernel API.
+
+A complete description of the different policies is available in the online RAJA
+documentation:
+https://raja.readthedocs.io/en/develop/sphinx/user_guide/tutorial/kernel_exec_pols.html#
+
+Once you are ready, uncomment the COMPILE define on on top of the file and do
+
+```
+$ make fractal-ex2-RAJA-CUDA
+$ ./bin/fractal-ex2-RAJA-CUDA
+```
diff --git a/Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp b/Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp
@@ -16,9 +16,10 @@
 #define yMin 0.11321
 #define yMax 0.11899
 
-/* TODO: create a variable called "THREADS" to be used when calling the kernel*/
+#define THREADS 512
 
-//#define COMPILE
+//TODO: uncomment this out in order to build!
+// #define COMPILE
 
 int main(int argc, char *argv[])
 {
@@ -40,25 +41,34 @@ int main(int argc, char *argv[])
 
   printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);
 
-  /* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */
+  //TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
+  //pixels of the fractal image.
+  auto& rm = umpire::ResourceManager::getInstance();
+  unsigned char *cnt{nullptr};
+  auto allocator = rm.getAllocator("???");
+  auto pool = ???
+  cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));
 
-
-  /* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */
-
-
-  /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop 
-   * and one inner loop. Only the inner for loop will be calculating pixels. 
+  /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
+   * and one inner loop. Only the inner for loop will be calculating pixels.
    */
+  using KERNEL_POLICY = RAJA::KernelPolicy<
+    RAJA::statement::CudaKernel<
+      RAJA::statement::For<1, /* CUDA policy */
+        RAJA::statement::For<0, /* CUDA policy */
+          RAJA::statement::Lambda<0>
+        >
+      >
+    >
+  >;
 
-
   /* compute fractal */
   gettimeofday(&start, NULL);
-  /* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above.
-   * It should take range segments that go the same range as our for-loops from before.
-   * The iterators inside the kernel body will describe the row and col of the image.
-   */
 
-    //Remember, RAJA takes care of finding the global thread ID, so just index into the image like normal
+  RAJA::kernel<KERNEL_POLICY>(
+        RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, width),
+                         RAJA::TypedRangeSegment<int>(0, width)),
+        [=] RAJA_DEVICE (int row, int col) {
     double x2, y2, x, y, cx, cy;
     int depth;
 
@@ -76,22 +86,19 @@ int main(int argc, char *argv[])
         depth--;
       } while ((depth > 0) && ((x2 + y2) <= 5.0));
       d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
-
+  });
   gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.
 
   printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);
 
-  /* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
-
-
   /* verify result by writing it to a file */
   if (width <= 2048) {
     wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
   }
 
-  /* TODO: Free the memory we allocated. */
-
-
+  //TODO: Use the Umpire pooled allocator to deallocate the memory.
+  pool.deallocate(cnt);
 #endif
+
   return 0;
 }
diff --git a/...-CUDA/solutions/fractal-ex2-RAJA-CUDA.cpp → ...utions/fractal-ex2-RAJA-CUDA-solution.cpp b/...-CUDA/solutions/fractal-ex2-RAJA-CUDA.cpp → ...utions/fractal-ex2-RAJA-CUDA-solution.cpp
@@ -37,13 +37,13 @@ int main(int argc, char *argv[])
 
   printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);
 
-  /* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */
-  unsigned char *cnt;
-  cudaHostAlloc((void**)&cnt, (width * width * sizeof(unsigned char)), cudaHostAllocDefault);
-
-  /* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */
-  unsigned char *d_cnt;
-  cudaMalloc((void**)&d_cnt, width * width * sizeof(unsigned char));
+  //TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
+  //pixels of the fractal image.
+  auto& rm = umpire::ResourceManager::getInstance();
+  unsigned char *cnt{nullptr};
+  auto allocator = rm.getAllocator("PINNED");
+  auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
+  cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));
 
   /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop 
    * and one inner loop. Only the inner for loop will be calculating pixels. 
@@ -91,16 +91,11 @@ int main(int argc, char *argv[])
 
   printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);
 
-  /* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
-  cudaMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), cudaMemcpyDeviceToHost);
-
   /* verify result by writing it to a file */
   if (width <= 2048) {
     wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
   }
 
-  /* TODO: Free the memory we allocated. */
-  cudaFreeHost(cnt);
-  cudaFree(d_cnt);
+  pool.deallocate(cnt);
   return 0;
 }