diff --git a/Intermediate_Tutorial/02-CUDA/solutions/CMakeLists.txt b/Intermediate_Tutorial/00-BASE/CMakeLists.txt similarity index 71% rename from Intermediate_Tutorial/02-CUDA/solutions/CMakeLists.txt rename to Intermediate_Tutorial/00-BASE/CMakeLists.txt index 028683d..677db12 100644 --- a/Intermediate_Tutorial/02-CUDA/solutions/CMakeLists.txt +++ b/Intermediate_Tutorial/00-BASE/CMakeLists.txt @@ -5,7 +5,7 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### -raja_add_executable( - NAME fractal-solution-RAJA-CUDA - SOURCES fractal-solution-RAJA-CUDA.cpp - DEPENDS_ON cuda writeBMP umpire RAJA) +blt_add_executable( + NAME fractal-ex0-c-loop + SOURCES fractal-ex0-c-loop.cpp + DEPENDS_ON writeBMP) diff --git a/Intermediate_Tutorial/00-BASE/README.md b/Intermediate_Tutorial/00-BASE/README.md new file mode 100644 index 0000000..d7c9dff --- /dev/null +++ b/Intermediate_Tutorial/00-BASE/README.md @@ -0,0 +1,24 @@ +=============================================== +Fractal Tutorial - Base (Sequential) Execution +=============================================== + +This file has no exercises but it +is used as a base reference implementation. + +Use the code to compare implementations +between the RAJA kernel and launch abstractions. + +Compile and run the code: + +``` +$ make fractal-ex0-c-loop +$ ./bin/fractal-ex0-c-loop +``` + +Before starting, be sure to study the seq-exec implementation of the fractal. +It is important to note: + * Read-only, write-only, and read-write variables used in the main computation + * The main data structure that holds the values of the fractal pixels + * Any data dependencies, if any, throughout the computation of the pixels + +Also note that this is a sequential implementation. Timing information will be output to the screen. As we add RAJA and Umpire, it will be interesting to see how performance improves. \ No newline at end of file diff --git a/Intermediate_Tutorial/00-BASE/fractal-ex0-c-loop.cpp b/Intermediate_Tutorial/00-BASE/fractal-ex0-c-loop.cpp new file mode 100644 index 0000000..01b51ad --- /dev/null +++ b/Intermediate_Tutorial/00-BASE/fractal-ex0-c-loop.cpp @@ -0,0 +1,72 @@ +#include +#include +#include +#include + +#include "../../tpl/writeBMP.hpp" + +#define xMin 0.74395 +#define xMax 0.74973 +#define yMin 0.11321 +#define yMax 0.11899 + +int main(int argc, char *argv[]) +{ + + double dx, dy; + int width; + const int maxdepth = 256; + struct timeval start, end; + writebmp wbmp; + + /* check command line */ + if(argc != 2) {fprintf(stderr, "usage: exe \n"); exit(-1);} + width = atoi(argv[1]); + if (width < 10) {fprintf(stderr, "edge_length must be at least 10\n"); exit(-1);} + + dx = (xMax - xMin) / width; + dy = (yMax - yMin) / width; + + printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth); + + unsigned char *cnt = (unsigned char*)malloc(width * width * sizeof(unsigned char)); + + /* start time */ + gettimeofday(&start, NULL); + + for(int row = 0; row < width; ++row) { + for(int col = 0; col < width; ++col) { + + double x2, y2, x, y, cx, cy; + int depth; + + cy = yMin + row * dy; //compute row # + cx = xMin + col * dx; //compute column # + x = -cx; + y = -cy; + depth = maxdepth; + do { + x2 = x * x; + y2 = y * y; + y = 2 * x * y - cy; + x = x2 - y2 - cx; + depth--; + } while ((depth > 0) && ((x2 + y2) <= 5.0)); + cnt[row * width + col] = depth & 255; + + } + } + + /* end time */ + gettimeofday(&end, NULL); + printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0); + + /* verify result by writing it to a file */ + if (width <= 2048) { + wbmp.WriteBMP(width, width, cnt, "fractal.bmp"); + } + + free(cnt); + + return 0; +} diff --git a/Intermediate_Tutorial/01-SEQ/CMakeLists.txt b/Intermediate_Tutorial/01-SEQ/CMakeLists.txt index 011faf3..57ad06d 100644 --- a/Intermediate_Tutorial/01-SEQ/CMakeLists.txt +++ b/Intermediate_Tutorial/01-SEQ/CMakeLists.txt @@ -8,4 +8,4 @@ blt_add_executable( NAME fractal-ex1-RAJA-seq SOURCES fractal-ex1-RAJA-seq.cpp - DEPENDS_ON cuda RAJA umpire writeBMP) + DEPENDS_ON RAJA umpire writeBMP) diff --git a/Intermediate_Tutorial/01-SEQ/tutorial-guide.rst b/Intermediate_Tutorial/01-SEQ/README.md similarity index 53% rename from Intermediate_Tutorial/01-SEQ/tutorial-guide.rst rename to Intermediate_Tutorial/01-SEQ/README.md index c97d6c2..638b8e8 100644 --- a/Intermediate_Tutorial/01-SEQ/tutorial-guide.rst +++ b/Intermediate_Tutorial/01-SEQ/README.md @@ -1,9 +1,9 @@ ================================= -Fractal Tutorial - LOOP Execution +Fractal Tutorial - SEQ Execution ================================= -Before starting, be sure to study the loop-exec implementation of the fractal -before continuing. It is important to note: +Before starting, be sure to study the base implementation in the 00-BASE directory. +Remember, it is important to note: * Read-only, write-only, and read-write variables used in the main computation * The main data structure that holds the values of the fractal pixels * Any data dependencies, if any, throughout the computation of the pixels @@ -14,6 +14,18 @@ lesson 12 from the Introduction Tutorial) for the fractal and complete the appropriate `RAJA::Kernel` statement using the `RAJA::seq_exec` execution policy. +A complete description of the different policies is available in the online RAJA +documentation: +https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/policies.html#raja-loop-kernel-execution-policies + The `seq_exec` policy is a good first step because it allows us to get a sense of the -performance using serial nested loops. From here, we have a good baseline to compare against -when transitioning to CUDA, HIP, etc. +performance using sequential, nested loops with RAJA. +From here, we have a good baseline to compare against when transitioning to +CUDA, HIP, etc. + +To run the code compile and run via: + +``` +$ make fractal-ex1-RAJA-seq +$ ./bin/fractal-ex1-RAJA-seq +``` diff --git a/Intermediate_Tutorial/01-SEQ/fractal-ex1-RAJA-seq.cpp b/Intermediate_Tutorial/01-SEQ/fractal-ex1-RAJA-seq.cpp index 5757bad..1a2a363 100644 --- a/Intermediate_Tutorial/01-SEQ/fractal-ex1-RAJA-seq.cpp +++ b/Intermediate_Tutorial/01-SEQ/fractal-ex1-RAJA-seq.cpp @@ -11,6 +11,7 @@ #define yMin 0.11321 #define yMax 0.11899 +//TODO: uncomment this out in order to build! // #define COMPILE int main(int argc, char *argv[]) @@ -33,7 +34,7 @@ int main(int argc, char *argv[]) printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth); - //TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the + //TODO: Create an Umpire QuickPool allocator with host memory that will hold the //pixels of the fractal image. auto& rm = umpire::ResourceManager::getInstance(); unsigned char *cnt{nullptr}; diff --git a/Intermediate_Tutorial/01-SEQ/solutions/fractal-ex1-RAJA-seq.cpp b/Intermediate_Tutorial/01-SEQ/solutions/fractal-ex1-RAJA-seq-solution.cpp similarity index 95% rename from Intermediate_Tutorial/01-SEQ/solutions/fractal-ex1-RAJA-seq.cpp rename to Intermediate_Tutorial/01-SEQ/solutions/fractal-ex1-RAJA-seq-solution.cpp index 7ba898b..5dcab97 100644 --- a/Intermediate_Tutorial/01-SEQ/solutions/fractal-ex1-RAJA-seq.cpp +++ b/Intermediate_Tutorial/01-SEQ/solutions/fractal-ex1-RAJA-seq-solution.cpp @@ -29,11 +29,11 @@ int main(int argc, char *argv[]) printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth); - //TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the + //TODO: Create an Umpire QuickPool allocator with host memory that will hold the //pixels of the fractal image. auto& rm = umpire::ResourceManager::getInstance(); unsigned char *cnt{nullptr}; - auto allocator = rm.getAllocator("UM"); + auto allocator = rm.getAllocator("HOST"); auto pool = rm.makeAllocator("qpool", allocator); cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); diff --git a/Intermediate_Tutorial/02-CUDA/CMakeLists.txt b/Intermediate_Tutorial/02-CUDA/CMakeLists.txt index 458bdcc..28f456d 100644 --- a/Intermediate_Tutorial/02-CUDA/CMakeLists.txt +++ b/Intermediate_Tutorial/02-CUDA/CMakeLists.txt @@ -11,5 +11,3 @@ if (ENABLE_CUDA) SOURCES fractal-ex2-RAJA-CUDA.cpp DEPENDS_ON cuda writeBMP umpire RAJA) endif() - -add_subdirectory(solutions) diff --git a/Intermediate_Tutorial/02-CUDA/README.md b/Intermediate_Tutorial/02-CUDA/README.md new file mode 100644 index 0000000..a5dd1f2 --- /dev/null +++ b/Intermediate_Tutorial/02-CUDA/README.md @@ -0,0 +1,17 @@ +================================= +Fractal Tutorial - CUDA Execution +================================= + +Look for the `TODO` comments in the source code. Here you will have to choose +two RAJA CUDA policies for the kernel API. + +A complete description of the different policies is available in the online RAJA +documentation: +https://raja.readthedocs.io/en/develop/sphinx/user_guide/tutorial/kernel_exec_pols.html# + +Once you are ready, uncomment the COMPILE define on on top of the file and do + +``` +$ make fractal-ex2-RAJA-CUDA +$ ./bin/fractal-ex2-RAJA-CUDA +``` \ No newline at end of file diff --git a/Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp b/Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp index 43c5b26..8e35f91 100644 --- a/Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp +++ b/Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp @@ -16,9 +16,10 @@ #define yMin 0.11321 #define yMax 0.11899 -/* TODO: create a variable called "THREADS" to be used when calling the kernel*/ +#define THREADS 512 -//#define COMPILE +//TODO: uncomment this out in order to build! +// #define COMPILE int main(int argc, char *argv[]) { @@ -40,25 +41,34 @@ int main(int argc, char *argv[]) printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth); - /* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */ + //TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the + //pixels of the fractal image. + auto& rm = umpire::ResourceManager::getInstance(); + unsigned char *cnt{nullptr}; + auto allocator = rm.getAllocator("???"); + auto pool = ??? + cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); - - /* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */ - - - /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop - * and one inner loop. Only the inner for loop will be calculating pixels. + /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop + * and one inner loop. Only the inner for loop will be calculating pixels. */ + using KERNEL_POLICY = RAJA::KernelPolicy< + RAJA::statement::CudaKernel< + RAJA::statement::For<1, /* CUDA policy */ + RAJA::statement::For<0, /* CUDA policy */ + RAJA::statement::Lambda<0> + > + > + > + >; - /* compute fractal */ gettimeofday(&start, NULL); - /* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above. - * It should take range segments that go the same range as our for-loops from before. - * The iterators inside the kernel body will describe the row and col of the image. - */ - //Remember, RAJA takes care of finding the global thread ID, so just index into the image like normal + RAJA::kernel( + RAJA::make_tuple(RAJA::TypedRangeSegment(0, width), + RAJA::TypedRangeSegment(0, width)), + [=] RAJA_DEVICE (int row, int col) { double x2, y2, x, y, cx, cy; int depth; @@ -76,22 +86,19 @@ int main(int argc, char *argv[]) depth--; } while ((depth > 0) && ((x2 + y2) <= 5.0)); d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal - + }); gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us. printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0); - /* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */ - - /* verify result by writing it to a file */ if (width <= 2048) { wbmp.WriteBMP(width, width, cnt, "fractal.bmp"); } - /* TODO: Free the memory we allocated. */ - - + //TODO: Use the Umpire pooled allocator to deallocate the memory. + pool.deallocate(cnt); #endif + return 0; } diff --git a/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA.cpp b/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp similarity index 81% rename from Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA.cpp rename to Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp index 33d11d2..dc4a057 100644 --- a/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA.cpp +++ b/Intermediate_Tutorial/02-CUDA/solutions/fractal-ex2-RAJA-CUDA-solution.cpp @@ -37,13 +37,13 @@ int main(int argc, char *argv[]) printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth); - /* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */ - unsigned char *cnt; - cudaHostAlloc((void**)&cnt, (width * width * sizeof(unsigned char)), cudaHostAllocDefault); - - /* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */ - unsigned char *d_cnt; - cudaMalloc((void**)&d_cnt, width * width * sizeof(unsigned char)); + //TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the + //pixels of the fractal image. + auto& rm = umpire::ResourceManager::getInstance(); + unsigned char *cnt{nullptr}; + auto allocator = rm.getAllocator("PINNED"); + auto pool = rm.makeAllocator("qpool", allocator); + cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop * and one inner loop. Only the inner for loop will be calculating pixels. @@ -91,16 +91,11 @@ int main(int argc, char *argv[]) printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0); - /* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */ - cudaMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), cudaMemcpyDeviceToHost); - /* verify result by writing it to a file */ if (width <= 2048) { wbmp.WriteBMP(width, width, cnt, "fractal.bmp"); } - /* TODO: Free the memory we allocated. */ - cudaFreeHost(cnt); - cudaFree(d_cnt); + pool.deallocate(cnt); return 0; } diff --git a/Intermediate_Tutorial/02-CUDA/solutions/fractal-solution-RAJA-CUDA.cpp b/Intermediate_Tutorial/02-CUDA/solutions/fractal-solution-RAJA-CUDA.cpp deleted file mode 100644 index f1f4983..0000000 --- a/Intermediate_Tutorial/02-CUDA/solutions/fractal-solution-RAJA-CUDA.cpp +++ /dev/null @@ -1,95 +0,0 @@ -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// -// Copyright (c) 2016-22, Lawrence Livermore National Security, LLC -// and RAJA project contributors. See the RAJA/LICENSE file for details. -// -// SPDX-License-Identifier: (BSD-3-Clause) -//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// - -#include -#include - -#include "RAJA/RAJA.hpp" -#include "../../../tpl/writeBMP.hpp" - -#define xMin 0.74395 -#define xMax 0.74973 -#define yMin 0.11321 -#define yMax 0.11899 - -#define THREADS 512 - -int main(int argc, char *argv[]) -{ - double dx, dy; - int width; - int maxdepth = 256; - unsigned char *cnt; - struct timeval start, end; - writebmp wbmp; - - - /* check command line */ - if(argc != 2) {fprintf(stderr, "usage: exe \n"); exit(-1);} - width = atoi(argv[1]); - if (width < 10) {fprintf(stderr, "edge_length must be at least 10\n"); exit(-1);} - - unsigned char *d_cnt; - - dx = (xMax - xMin) / width; - dy = (yMax - yMin) / width; - - printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth); - - cudaHostAlloc((void**)&cnt, (width * width * sizeof(unsigned char)), cudaHostAllocDefault); - - /* allocate space on GPU */ - cudaMalloc((void**)&d_cnt, width * width * sizeof(unsigned char)); - - using KERNEL_POLICY = RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_block_x_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, - RAJA::statement::Lambda<0> - > - > - > - >; - - gettimeofday(&start, NULL); - RAJA::kernel( - RAJA::make_tuple(RAJA::TypedRangeSegment(0, width), - RAJA::TypedRangeSegment(0, width)), - [=] RAJA_DEVICE (int row, int col) { - double x2, y2, x, y, cx, cy; - int depth; - - /* compute fractal */ - cy = yMin + row * dy; //compute row # - cx = xMin + col * dx; //compute column # - x = -cx; - y = -cy; - depth = maxdepth; - do { - x2 = x * x; - y2 = y * y; - y = 2 * x * y - cy; - x = x2 - y2 - cx; - depth--; - } while ((depth > 0) && ((x2 + y2) <= 5.0)); - d_cnt[row * width + col] = depth & 255; - }); - gettimeofday(&end, NULL); - - printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0); - - cudaMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), cudaMemcpyDeviceToHost); - - /* verify result by writing it to a file */ - if (width <= 2048) { - wbmp.WriteBMP(width, width, cnt, "fractal.bmp"); - } - - cudaFreeHost(cnt); - cudaFree(d_cnt); - return 0; -} diff --git a/Intermediate_Tutorial/03-HIP/CMakeLists.txt b/Intermediate_Tutorial/03-HIP/CMakeLists.txt index fd170e1..362cf29 100644 --- a/Intermediate_Tutorial/03-HIP/CMakeLists.txt +++ b/Intermediate_Tutorial/03-HIP/CMakeLists.txt @@ -11,5 +11,3 @@ if (ENABLE_HIP) SOURCES fractal-ex3-RAJA-HIP.cpp DEPENDS_ON blt::hip RAJA umpire writeBMP) endif() - -add_subdirectory(solutions) diff --git a/Intermediate_Tutorial/03-HIP/README.md b/Intermediate_Tutorial/03-HIP/README.md new file mode 100644 index 0000000..9fd5809 --- /dev/null +++ b/Intermediate_Tutorial/03-HIP/README.md @@ -0,0 +1,21 @@ +================================= +Fractal Tutorial - HIP Execution +================================= + +The main purpose of this lesson is to demonstrate the performance portability of RAJA and Umpire. Up until now, we have been using the cuda_exec policy which is specific to NVIDIA GPUs and the CUDA API. Now, we will have to prepare our program for use on AMD GPUs with the HIP API. + +Note: Running this part of the code can be tricky because we will now have to run this on a different machine that is equipped with AMD GPUs. At LLNL, this will mean just ssh'ing into a different machine with the right backend hardware. However, if you don't have access to these types of machines, you can try porting this lesson to the openmp offload or SYCL execution policies (but that is beyond the scope of this tutorial). + +Look for the `TODO` comments in the source code. Here you will have to choose +two RAJA HIP policies for the kernel API. + +A complete description of the different policies is available in the online RAJA +documentation: +https://raja.readthedocs.io/en/develop/sphinx/user_guide/tutorial/kernel_exec_pols.html# + +Once you are ready, uncomment the COMPILE define on on top of the file and do + +``` +$ make fractal-ex3-RAJA-HIP +$ ./bin/fractal-ex3-RAJA-HIP +``` \ No newline at end of file diff --git a/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp b/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp index 7e8a23a..478b643 100644 --- a/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp +++ b/Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp @@ -16,11 +16,15 @@ #define yMin 0.11321 #define yMax 0.11899 -/* TODO: create a variable called "THREADS" to be used when calling the kernel*/ #define THREADS 512 +//TODO: uncomment this out in order to build! +// #define COMPILE + int main(int argc, char *argv[]) { +#if defined(COMPILE) + double dx, dy; int width; const int maxdepth = 256; @@ -37,21 +41,21 @@ int main(int argc, char *argv[]) printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth); - /* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */ - unsigned char *cnt; - cudaHostMalloc((void**)&cnt, (width * width * sizeof(unsigned char)), cudaHostRegisterDefault); + //TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the + //pixels of the fractal image. + auto& rm = umpire::ResourceManager::getInstance(); + unsigned char *cnt{nullptr}; + auto allocator = rm.getAllocator("PINNED"); + auto pool = rm.makeAllocator("qpool", allocator); + cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); - /* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */ - unsigned char *d_cnt; - cudaMalloc((void**)&d_cnt, width * width * sizeof(unsigned char)); - - /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop + /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a hip kernel with one outer loop * and one inner loop. Only the inner for loop will be calculating pixels. */ using KERNEL_POLICY = RAJA::KernelPolicy< - RAJA::statement::CudaKernel< - RAJA::statement::For<1, RAJA::cuda_block_x_loop, - RAJA::statement::For<0, RAJA::cuda_thread_x_loop, + RAJA::statement::HipKernel< + RAJA::statement::For<1, /* HIP policy */ + RAJA::statement::For<0, /* HIP policy */ RAJA::statement::Lambda<0> > > @@ -60,15 +64,11 @@ int main(int argc, char *argv[]) /* compute fractal */ gettimeofday(&start, NULL); - /* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above. - * It should take range segments that go the same range as our for-loops from before. - * The iterators inside the kernel body will describe the row and col of the image. - */ + RAJA::kernel( RAJA::make_tuple(RAJA::TypedRangeSegment(0, width), RAJA::TypedRangeSegment(0, width)), [=] RAJA_DEVICE (int row, int col) { - //Remember, RAJA takes care of finding the global thread ID, so just index into the image like normal double x2, y2, x, y, cx, cy; int depth; @@ -91,16 +91,14 @@ int main(int argc, char *argv[]) printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0); - /* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */ - cudaMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), cudaMemcpyDeviceToHost); - /* verify result by writing it to a file */ if (width <= 2048) { wbmp.WriteBMP(width, width, cnt, "fractal.bmp"); } - /* TODO: Free the memory we allocated. */ - cudaHostFree(cnt); - cudaFree(d_cnt); + pool.deallocate(cnt); + +#endif + return 0; } diff --git a/Intermediate_Tutorial/03-HIP/solutions/CMakeLists.txt b/Intermediate_Tutorial/03-HIP/solutions/CMakeLists.txt deleted file mode 100644 index d9bdc71..0000000 --- a/Intermediate_Tutorial/03-HIP/solutions/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -############################################################################### -# Copyright (c) 2016-23, Lawrence Livermore National Security, LLC -# and RAJA project contributors. See the RAJA/LICENSE file for details. -# -# SPDX-License-Identifier: (BSD-3-Clause) -############################################################################### - -if(RAJA_ENABLE_HIP) -raja_add_executable( - NAME fractal-RAJA-HIP - SOURCES fractal-solution-RAJA-HIP.cpp - DEPENDS_ON blt::hip RAJA umpire writeBMP) -endif() - diff --git a/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP.cpp b/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp similarity index 84% rename from Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP.cpp rename to Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp index 65c6840..17fd01f 100644 --- a/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP.cpp +++ b/Intermediate_Tutorial/03-HIP/solutions/fractal-ex3-RAJA-HIP-solution.cpp @@ -37,13 +37,13 @@ int main(int argc, char *argv[]) printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth); - /* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */ - unsigned char *cnt; - hipHostMalloc((void**)&cnt, (width * width * sizeof(unsigned char)), hipHostRegisterDefault); - - /* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */ - unsigned char *d_cnt; - hipMalloc((void**)&d_cnt, width * width * sizeof(unsigned char)); + //TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the + //pixels of the fractal image. + auto& rm = umpire::ResourceManager::getInstance(); + unsigned char *cnt{nullptr}; + auto allocator = rm.getAllocator("PINNED"); + auto pool = rm.makeAllocator("qpool", allocator); + cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); /* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a hip kernel with one outer loop * and one inner loop. Only the inner for loop will be calculating pixels. @@ -68,7 +68,6 @@ int main(int argc, char *argv[]) RAJA::make_tuple(RAJA::TypedRangeSegment(0, width), RAJA::TypedRangeSegment(0, width)), [=] RAJA_DEVICE (int row, int col) { - //Remember, RAJA takes care of finding the global thread ID, so just index into the image like normal double x2, y2, x, y, cx, cy; int depth; @@ -100,7 +99,6 @@ int main(int argc, char *argv[]) } /* TODO: Free the memory we allocated. */ - hipHostFree(cnt); - hipFree(d_cnt); + pool.deallocate(cnt); return 0; } diff --git a/Intermediate_Tutorial/04-LAUNCH/README.md b/Intermediate_Tutorial/04-LAUNCH/README.md new file mode 100644 index 0000000..eaa89c6 --- /dev/null +++ b/Intermediate_Tutorial/04-LAUNCH/README.md @@ -0,0 +1,24 @@ +================================= +Fractal Tutorial - LAUNCH Execution +================================= + +The RAJA launch API introduces the concept of an execution space enabling +developers to express algorithms in terms of nested RAJA::loops. As the kernel +execution space is exposed to developers, static shared memory is avaible when +using the CUDA/HIP backends. The launch abstraction also takes a more explicit +approach in configuring device compute grid parameters. Finally, RAJA launch +can take both a host and device execution policy enabling run-time dispatch selection. + +Look for the `TODO` comments in the source code. The main task here is to select +a host and device policy for the launch configuration and loop function. + +A complete description of the different policies is available in the online RAJA +documentation: +https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/policies.html#raja-loop-kernel-execution-policies + +Once you are ready, uncomment the COMPILE define on on top of the file and do + +``` +$ make fractal-ex4-RAJA-HIP +$ ./bin/fractal-ex4-RAJA-HIP +``` \ No newline at end of file diff --git a/Intermediate_Tutorial/04-LAUNCH/fractal-ex4-RAJA-launch.cpp b/Intermediate_Tutorial/04-LAUNCH/fractal-ex4-RAJA-launch.cpp index 70ec9c4..7cd1c18 100644 --- a/Intermediate_Tutorial/04-LAUNCH/fractal-ex4-RAJA-launch.cpp +++ b/Intermediate_Tutorial/04-LAUNCH/fractal-ex4-RAJA-launch.cpp @@ -11,7 +11,8 @@ #define yMin 0.11321 #define yMax 0.11899 -//#define COMPILE +//TODO: uncomment this out in order to build! +// #define COMPILE int main(int argc, char *argv[]) { @@ -46,46 +47,32 @@ int main(int argc, char *argv[]) printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth); - //TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the - // pixels of the fractal image. auto& rm = umpire::ResourceManager::getInstance(); unsigned char *cnt{nullptr}; - auto allocator = rm.getAllocator("UM"); + auto allocator = rm.getAllocator("PINNED"); auto pool = rm.makeAllocator("qpool", allocator); cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); //TODO: Create a RAJA launch policy for the host and device + using launch_policy = RAJA::LaunchPolicy; - using host_launch = -#if defined(RAJA_ENABLE_CUDA) - using device_launch = -#elif defined(RAJA_ENABLE_HIP) - using device_launch = -#endif - - using launch_policy = RAJA::LaunchPolicy< - host_launch -#if defined(RAJA_GPU_ACTIVE) - ,device_launch -#endif - >; + //TODO: create RAJA loop policies for the host and device + using col_loop = RAJA::LoopPolicy; - //RAJA loop policies take a pair of policies enabling run time selection of - - using col_loop = RAJA::LoopPolicy; - - using row_loop = RAJA::LoopPolicy; + using row_loop = RAJA::LoopPolicy; /* start time */ gettimeofday(&start, NULL); - constexpr int block_sz = 16; - int n_blocks = (width + block_sz - 1) / block_sz + 1; + //Calculate number of blocks + constexpr int team_sz = 16; + int n_teams = (width + team_sz - 1) / team_sz + 1; + //Teams are akin to to CUDA/HIP blocks RAJA::launch - (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_blocks, n_blocks), - RAJA::Threads(block_sz, block_sz)), + (select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_teams, n_teams), + RAJA::Threads(team_sz, team_sz)), [=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, width), [&] (int col) { @@ -122,8 +109,7 @@ int main(int argc, char *argv[]) wbmp.WriteBMP(width, width, cnt, "fractal.bmp"); } - //TODO: Use the Umpire pooled allocator to deallocate the memory. - + pool.deallocate(cnt); #endif return 0; } diff --git a/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch.cpp b/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp similarity index 98% rename from Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch.cpp rename to Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp index e5487cf..96aba17 100644 --- a/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch.cpp +++ b/Intermediate_Tutorial/04-LAUNCH/solution/fractal-ex4-RAJA-launch-solution.cpp @@ -46,7 +46,7 @@ int main(int argc, char *argv[]) // pixels of the fractal image. auto& rm = umpire::ResourceManager::getInstance(); unsigned char *cnt{nullptr}; - auto allocator = rm.getAllocator("UM"); + auto allocator = rm.getAllocator("PINNED"); auto pool = rm.makeAllocator("qpool", allocator); cnt = static_cast(pool.allocate(width * width * sizeof(unsigned char))); diff --git a/Intermediate_Tutorial/CMakeLists.txt b/Intermediate_Tutorial/CMakeLists.txt index 0af58e1..3190c0c 100644 --- a/Intermediate_Tutorial/CMakeLists.txt +++ b/Intermediate_Tutorial/CMakeLists.txt @@ -5,6 +5,8 @@ # SPDX-License-Identifier: (BSD-3-Clause) ############################################################################### +add_subdirectory(00-BASE) + add_subdirectory(01-SEQ) if(RAJA_ENABLE_CUDA) diff --git a/Intermediate_Tutorial/README.md b/Intermediate_Tutorial/README.md index 19de4b8..5a68217 100644 --- a/Intermediate_Tutorial/README.md +++ b/Intermediate_Tutorial/README.md @@ -1,9 +1,47 @@ -In the following lessons you will compare implementations of a fractal generating kernel -using the kernel and launch frameworks. +================ +Fractal Tutorial +================ -The first three examples the RAJA kernel method is used while in the forth example -RAJA launch is used. RAJA kernel requires recompilation when changing backend dispatch. -RAJA launch supports run time selection between a host and device backend. +This tutorial includes several implementations of a Mandelbrot set Fractal code. +The code originated from Dr. Martin Burtscher of the Efficient Computing Lab at +Texas State University. You can find more here: https://userweb.cs.txstate.edu/~burtscher/research.html -As before the exercises have COMPILE macro guards, to compile the code uncomment the -COMPILE define on top of the file. +In the "extras" directories, there are a few other RAJA implementations such +as OpenMP and even a native CUDA implementation just for comparison. You can reference +these implementations to study the differences in implementation and runtime comparison. +However, anything beyond that is outside the scope of this tutorial. + +In the following lessons, you will compare RAJA implementations of the fractal generating code. +We will start with a sequential implementation of the fractal and gradually build our +way up to a more complex RAJA launch implementation. +You will notice that these lessons will employ the RAJA kernel and launch abstractions. +Additionally, as the lessons progress, we will be exploring the performance portability +of RAJA by looking at how we can change the targeted backend from CUDA to HIP. +(Refer to lessons 02-CUDA and 03-HIP). + +As before, the exercises have COMPILE macro guards. To compile the code, uncomment the +COMPILE define at the top of the file. + +If you are doing this tutorial outside of the RADIUSS tutorial series, be sure to build +the tutorial within a newly created, empty `build` directory located +in the `raja-suite-tutorial` repo. If you're on a LC machine, you can run these commands: +``` +module load cuda/11.2.0 +module load cmake/3.20.2 +module load gcc/8.3.1 +cmake -DENABLE_CUDA=On -DENABLE_OPENMP=Off -DCMAKE_CUDA_ARCHITECTURES=70 -DCMAKE_CUDA_COMPILER=/usr/tce/packages/cuda/cuda-11.2.0/bin/nvcc -DCUDA_TOOLKIT_ROOT_DIR=/usr/tce/packages/cuda/cuda-11.2.0 -DBLT_CXX_STD=c++14 -DCMAKE_BUILD_TYPE=Release -DRAJA_ENABLE_EXERCISES=On -DRAJA_ENABLE_OPENMP=Off -DCMAKE_CUDA_FLAGS=--extended-lambda -DCUDA_ARCH=sm_70 ../ +``` + +I am building this code on LC's lassen machine. If these build instructions don't work for you, you can refer to the build documentation from RAJA's ReadTheDocs or use one of the provided build scripts. + +Now, we can build the RAJA loop-exec implementation with `./bin/fractal 1024`. The +first argument is the width of the fractal (1024). It may be interesting to see how +the fractal changes with different width values. + +To verify your results in each lesson, you can look at the resulting .bmp file output. If you +have completed everything correctly, you will see a complete image of the fractal. +Currently, there is an `if` statement that makes sure the `writeBMP` function +is only called for smaller fractal runs (of width <= 2048). You can edit this `if` statement, but be careful because trying +to write a .bmp file that is too large will take a very long time. + +Continue on to the first lesson located in the `00-BASE` directory. diff --git a/Intermediate_Tutorial/tutorial-guide.rst b/Intermediate_Tutorial/old_guide.rst similarity index 86% rename from Intermediate_Tutorial/tutorial-guide.rst rename to Intermediate_Tutorial/old_guide.rst index 009b4f7..6bef200 100644 --- a/Intermediate_Tutorial/tutorial-guide.rst +++ b/Intermediate_Tutorial/old_guide.rst @@ -6,14 +6,10 @@ This tutorial includes several implementations of a Mandelbrot set Fractal code. The code originated from Dr. Martin Burtscher of the Efficient Computing Lab at Texas State University. You can find more here: https://userweb.cs.txstate.edu/~burtscher/research.html -The tutorial first starts with a RAJA loop-exec policy implementation of the fractal code. -From there, we learn how to use RAJA-CUDA and RAJA-HIP execution policies. -The final lessons include a more complex fractal implementation that includes -RAJA-TEAMS. - In the "extras" directories, there are a few other RAJA implementations such as OpenMP and even a native CUDA implementation just for comparison. You can reference these implementation to study the differences in implementation and runtime comparison. +However, anything beyond that is outside the scope of this tutorial. To start, let's build the tutorial within a newly created, empty `build` directory located in the `raja-suite-tutorial` repo:: @@ -37,4 +33,4 @@ Currently, there is an `if` statement that makes sure the `writeBMP` function is only called for smaller fractal runs (of width <= 2048). You can edit this `if` statement, but be careful because trying to write a .bmp file that is too large will take a very long time. -Continue on to the first lesson located in the `LOOP` directory. +Continue on to the first lesson located in the `00-BASE` directory. diff --git a/README.md b/README.md index f92955b..231c788 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,8 @@ advanced (i.e. Intermediate) tutorial meant for those who would like more hands-on instruction for RAJA. We would suggest starting with the Intro tutorial and then moving on to the Intermediate tutorial. -If you would like to build locally, be sure to do the following -to build and run the tutorials: +If you are running on an LC machine and would like to build locally, +be sure to do the following to build and run the tutorials: ``` module load cmake/3.20.2 module load gcc/8.3.1