Skip to content

Commit

Permalink
Merge pull request #17 from LLNL/artv3/Intermediate_Tutorial_edits
Browse files Browse the repository at this point in the history
Update readmes for intermediate tutorial
  • Loading branch information
kab163 authored Aug 15, 2023
2 parents f82a6f6 + 88551dc commit 06da65b
Show file tree
Hide file tree
Showing 24 changed files with 316 additions and 238 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# SPDX-License-Identifier: (BSD-3-Clause)
###############################################################################

raja_add_executable(
NAME fractal-solution-RAJA-CUDA
SOURCES fractal-solution-RAJA-CUDA.cpp
DEPENDS_ON cuda writeBMP umpire RAJA)
blt_add_executable(
NAME fractal-ex0-c-loop
SOURCES fractal-ex0-c-loop.cpp
DEPENDS_ON writeBMP)
24 changes: 24 additions & 0 deletions Intermediate_Tutorial/00-BASE/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
===============================================
Fractal Tutorial - Base (Sequential) Execution
===============================================

This file has no exercises but it
is used as a base reference implementation.

Use the code to compare implementations
between the RAJA kernel and launch abstractions.

Compile and run the code:

```
$ make fractal-ex0-c-loop
$ ./bin/fractal-ex0-c-loop
```

Before starting, be sure to study the seq-exec implementation of the fractal.
It is important to note:
* Read-only, write-only, and read-write variables used in the main computation
* The main data structure that holds the values of the fractal pixels
* Any data dependencies, if any, throughout the computation of the pixels

Also note that this is a sequential implementation. Timing information will be output to the screen. As we add RAJA and Umpire, it will be interesting to see how performance improves.
72 changes: 72 additions & 0 deletions Intermediate_Tutorial/00-BASE/fractal-ex0-c-loop.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#include <malloc.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>

#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
#define xMax 0.74973
#define yMin 0.11321
#define yMax 0.11899

int main(int argc, char *argv[])
{

double dx, dy;
int width;
const int maxdepth = 256;
struct timeval start, end;
writebmp wbmp;

/* check command line */
if(argc != 2) {fprintf(stderr, "usage: exe <width> \n"); exit(-1);}
width = atoi(argv[1]);
if (width < 10) {fprintf(stderr, "edge_length must be at least 10\n"); exit(-1);}

dx = (xMax - xMin) / width;
dy = (yMax - yMin) / width;

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

unsigned char *cnt = (unsigned char*)malloc(width * width * sizeof(unsigned char));

/* start time */
gettimeofday(&start, NULL);

for(int row = 0; row < width; ++row) {
for(int col = 0; col < width; ++col) {

double x2, y2, x, y, cx, cy;
int depth;

cy = yMin + row * dy; //compute row #
cx = xMin + col * dx; //compute column #
x = -cx;
y = -cy;
depth = maxdepth;
do {
x2 = x * x;
y2 = y * y;
y = 2 * x * y - cy;
x = x2 - y2 - cx;
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
cnt[row * width + col] = depth & 255;

}
}

/* end time */
gettimeofday(&end, NULL);
printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

/* verify result by writing it to a file */
if (width <= 2048) {
wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
}

free(cnt);

return 0;
}
2 changes: 1 addition & 1 deletion Intermediate_Tutorial/01-SEQ/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@
blt_add_executable(
NAME fractal-ex1-RAJA-seq
SOURCES fractal-ex1-RAJA-seq.cpp
DEPENDS_ON cuda RAJA umpire writeBMP)
DEPENDS_ON RAJA umpire writeBMP)
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
=================================
Fractal Tutorial - LOOP Execution
Fractal Tutorial - SEQ Execution
=================================

Before starting, be sure to study the loop-exec implementation of the fractal
before continuing. It is important to note:
Before starting, be sure to study the base implementation in the 00-BASE directory.
Remember, it is important to note:
* Read-only, write-only, and read-write variables used in the main computation
* The main data structure that holds the values of the fractal pixels
* Any data dependencies, if any, throughout the computation of the pixels
Expand All @@ -14,6 +14,18 @@ lesson 12 from the Introduction Tutorial) for the fractal and
complete the appropriate `RAJA::Kernel` statement using the `RAJA::seq_exec` execution
policy.

A complete description of the different policies is available in the online RAJA
documentation:
https://raja.readthedocs.io/en/develop/sphinx/user_guide/feature/policies.html#raja-loop-kernel-execution-policies

The `seq_exec` policy is a good first step because it allows us to get a sense of the
performance using serial nested loops. From here, we have a good baseline to compare against
when transitioning to CUDA, HIP, etc.
performance using sequential, nested loops with RAJA.
From here, we have a good baseline to compare against when transitioning to
CUDA, HIP, etc.

To run the code compile and run via:

```
$ make fractal-ex1-RAJA-seq
$ ./bin/fractal-ex1-RAJA-seq
```
3 changes: 2 additions & 1 deletion Intermediate_Tutorial/01-SEQ/fractal-ex1-RAJA-seq.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#define yMin 0.11321
#define yMax 0.11899

//TODO: uncomment this out in order to build!
// #define COMPILE

int main(int argc, char *argv[])
Expand All @@ -33,7 +34,7 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the
//TODO: Create an Umpire QuickPool allocator with host memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the
//TODO: Create an Umpire QuickPool allocator with host memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("UM");
auto allocator = rm.getAllocator("HOST");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

Expand Down
2 changes: 0 additions & 2 deletions Intermediate_Tutorial/02-CUDA/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,3 @@ if (ENABLE_CUDA)
SOURCES fractal-ex2-RAJA-CUDA.cpp
DEPENDS_ON cuda writeBMP umpire RAJA)
endif()

add_subdirectory(solutions)
17 changes: 17 additions & 0 deletions Intermediate_Tutorial/02-CUDA/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
=================================
Fractal Tutorial - CUDA Execution
=================================

Look for the `TODO` comments in the source code. Here you will have to choose
two RAJA CUDA policies for the kernel API.

A complete description of the different policies is available in the online RAJA
documentation:
https://raja.readthedocs.io/en/develop/sphinx/user_guide/tutorial/kernel_exec_pols.html#

Once you are ready, uncomment the COMPILE define on on top of the file and do

```
$ make fractal-ex2-RAJA-CUDA
$ ./bin/fractal-ex2-RAJA-CUDA
```
51 changes: 29 additions & 22 deletions Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
#define yMin 0.11321
#define yMax 0.11899

/* TODO: create a variable called "THREADS" to be used when calling the kernel*/
#define THREADS 512

//#define COMPILE
//TODO: uncomment this out in order to build!
// #define COMPILE

int main(int argc, char *argv[])
{
Expand All @@ -40,25 +41,34 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

/* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */
//TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("???");
auto pool = ???
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));


/* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */


/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
*/
using KERNEL_POLICY = RAJA::KernelPolicy<
RAJA::statement::CudaKernel<
RAJA::statement::For<1, /* CUDA policy */
RAJA::statement::For<0, /* CUDA policy */
RAJA::statement::Lambda<0>
>
>
>
>;


/* compute fractal */
gettimeofday(&start, NULL);
/* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above.
* It should take range segments that go the same range as our for-loops from before.
* The iterators inside the kernel body will describe the row and col of the image.
*/

//Remember, RAJA takes care of finding the global thread ID, so just index into the image like normal
RAJA::kernel<KERNEL_POLICY>(
RAJA::make_tuple(RAJA::TypedRangeSegment<int>(0, width),
RAJA::TypedRangeSegment<int>(0, width)),
[=] RAJA_DEVICE (int row, int col) {
double x2, y2, x, y, cx, cy;
int depth;

Expand All @@ -76,22 +86,19 @@ int main(int argc, char *argv[])
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal

});
gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

/* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */


/* verify result by writing it to a file */
if (width <= 2048) {
wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
}

/* TODO: Free the memory we allocated. */


//TODO: Use the Umpire pooled allocator to deallocate the memory.
pool.deallocate(cnt);
#endif

return 0;
}
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,13 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

/* TODO: Create the "cnt" array to store the pixels and allocate space for it on CPU using pinned memory */
unsigned char *cnt;
cudaHostAlloc((void**)&cnt, (width * width * sizeof(unsigned char)), cudaHostAllocDefault);

/* TODO: Create the "d_cnt" array to store pixels on the GPU and allocate space for it on the GPU */
unsigned char *d_cnt;
cudaMalloc((void**)&d_cnt, width * width * sizeof(unsigned char));
//TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("PINNED");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
Expand Down Expand Up @@ -91,16 +91,11 @@ int main(int argc, char *argv[])

printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

/* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
cudaMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), cudaMemcpyDeviceToHost);

/* verify result by writing it to a file */
if (width <= 2048) {
wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
}

/* TODO: Free the memory we allocated. */
cudaFreeHost(cnt);
cudaFree(d_cnt);
pool.deallocate(cnt);
return 0;
}
Loading

0 comments on commit 06da65b

Please sign in to comment.