Skip to content

Commit

Permalink
Merge pull request #24 from LLNL/artv3/intermediate-tutorial
Browse files Browse the repository at this point in the history
Minor fixes for intermediate tutorial
  • Loading branch information
artv3 authored Aug 2, 2024
2 parents 053c213 + da374e1 commit d4c1e89
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 50 deletions.
8 changes: 5 additions & 3 deletions Intermediate_Tutorial/02-CUDA/fractal-ex2-RAJA-CUDA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"
#include "umpire/strategy/QuickPool.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
#define xMax 0.74973
#define yMin 0.11321
#define yMax 0.11899

#define THREADS 512
#define THREADS 256

//TODO: uncomment this in order to build!
// #define COMPILE
Expand All @@ -41,7 +43,7 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
//TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
Expand Down Expand Up @@ -85,7 +87,7 @@ int main(int argc, char *argv[])
x = x2 - y2 - cx;
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});
gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"
#include "umpire/strategy/QuickPool.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
Expand All @@ -17,7 +19,7 @@
#define yMax 0.11899

/* TODO: create a variable called "THREADS" to be used when calling the kernel */
#define THREADS 512
#define THREADS 256

int main(int argc, char *argv[])
{
Expand All @@ -37,27 +39,27 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
//TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("PINNED");
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a cuda kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
*/
using KERNEL_POLICY = RAJA::KernelPolicy<
RAJA::statement::CudaKernel<
RAJA::statement::For<1, RAJA::cuda_block_x_loop,
RAJA::statement::For<0, RAJA::cuda_thread_x_loop,
RAJA::statement::CudaKernelFixed<THREADS,
RAJA::statement::For<1, RAJA::cuda_global_size_y_direct<16>,
RAJA::statement::For<0, RAJA::cuda_global_size_x_direct<16>,
RAJA::statement::Lambda<0>
>
>
>
>
>;

/* compute fractal */
gettimeofday(&start, NULL);
/* TODO: Add a RAJA::Kernel which takes the KERNEL_POLICY you just created above.
Expand Down Expand Up @@ -85,7 +87,7 @@ int main(int argc, char *argv[])
x = x2 - y2 - cx;
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});
gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

Expand Down
10 changes: 6 additions & 4 deletions Intermediate_Tutorial/03-HIP/fractal-ex3-RAJA-HIP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"
#include "umpire/strategy/QuickPool.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
#define xMax 0.74973
#define yMin 0.11321
#define yMax 0.11899

#define THREADS 512
#define THREADS 256

//TODO: uncomment this in order to build!
// #define COMPILE
Expand All @@ -41,11 +43,11 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
//TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("PINNED");
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

Expand Down Expand Up @@ -85,7 +87,7 @@ int main(int argc, char *argv[])
x = x2 - y2 - cx;
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});
gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@
#include <sys/time.h>

#include "RAJA/RAJA.hpp"
#include "umpire/Umpire.hpp"
#include "umpire/strategy/QuickPool.hpp"
#include "../../tpl/writeBMP.hpp"

#define xMin 0.74395
#define xMax 0.74973
#define yMin 0.11321
#define yMax 0.11899

/* TODO: create a variable called "THREADS" to be used when calling the kernel */
#define THREADS 512
#define THREADS 256

int main(int argc, char *argv[])
{
Expand All @@ -37,21 +38,21 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with pinned memory that will hold the
//TODO: Create an Umpire QuickPool allocator with unified memory that will hold the
//pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("PINNED");
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

/* TODO: Set up a RAJA::KernelPolicy. The Policy should describe a hip kernel with one outer loop
* and one inner loop. Only the inner for loop will be calculating pixels.
*/
using KERNEL_POLICY = RAJA::KernelPolicy<
RAJA::statement::HipKernel<
RAJA::statement::For<1, RAJA::hip_block_x_loop,
RAJA::statement::For<0, RAJA::hip_thread_x_loop,
using KERNEL_POLICY = RAJA::KernelPolicyFixed<
RAJA::statement::HipKernel<THREADS,
RAJA::statement::For<1, RAJA::hip_global_size_y_direct<16>,
RAJA::statement::For<0, RAJA::hip_global_size_x_direct<16>,
RAJA::statement::Lambda<0>
>
>
Expand Down Expand Up @@ -84,15 +85,12 @@ int main(int argc, char *argv[])
x = x2 - y2 - cx;
depth--;
} while ((depth > 0) && ((x2 + y2) <= 5.0));
d_cnt[row * width + col] = depth & 255; //Remember to index the image like normal
cnt[row * width + col] = depth & 255; //Remember to index the image like normal
});
gettimeofday(&end, NULL); //By the time we exit the RAJA::Kernel, host and device are synchronized for us.

printf("compute time: %.8f s\n", end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0);

/* TODO: In order to create a bmp image, we need to copy the completed fractal to the Host memory space */
hipMemcpyAsync(cnt, d_cnt, width * width * sizeof(unsigned char), hipMemcpyDeviceToHost);

/* verify result by writing it to a file */
if (width <= 2048) {
wbmp.WriteBMP(width, width, cnt, "fractal.bmp");
Expand Down
12 changes: 7 additions & 5 deletions Intermediate_Tutorial/04-LAUNCH/fractal-ex4-RAJA-launch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,18 @@ int main(int argc, char *argv[])

auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("PINNED");
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

constexpr int team_dim = 16;
using host_launch = RAJA::seq_launch_t;

//TODO: Create a RAJA launch policy for the host and device
using launch_policy = RAJA::LaunchPolicy</* host launch policy */, /* device launch policies */>;


//TODO: create RAJA loop policies for the host and device
//TODO: create RAJA global thread loop policies for the host and device
using col_loop = RAJA::LoopPolicy</*host policy */, /*device policy*/>;

using row_loop = RAJA::LoopPolicy</*host policy */, /*device policy*/>;
Expand All @@ -66,13 +69,12 @@ int main(int argc, char *argv[])
gettimeofday(&start, NULL);

//Calculate number of blocks
constexpr int team_sz = 16;
int n_teams = (width + team_sz - 1) / team_sz + 1;
int n_teams = (width + team_dim - 1) / team_dim + 1;

//Teams are akin to CUDA/HIP blocks
RAJA::launch<launch_policy>
(select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_teams, n_teams),
RAJA::Threads(team_sz, team_sz)),
RAJA::Threads(team_dim, team_dim)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::loop<col_loop>(ctx, RAJA::RangeSegment(0, width), [&] (int col) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,45 +42,47 @@ int main(int argc, char *argv[])

printf("computing %d by %d fractal with a maximum depth of %d\n", width, width, maxdepth);

//TODO: Create an Umpire QuickPool allocator with Unified Memory that will hold the
// pixels of the fractal image.
auto& rm = umpire::ResourceManager::getInstance();
unsigned char *cnt{nullptr};
auto allocator = rm.getAllocator("PINNED");
auto allocator = rm.getAllocator("UM");
auto pool = rm.makeAllocator<umpire::strategy::QuickPool>("qpool", allocator);
cnt = static_cast<unsigned char*>(pool.allocate(width * width * sizeof(unsigned char)));

//TODO: Create a RAJA Kernel Policy which uses the loop_exec policy. We want to start
//with a normal serial nested loop first before continuing onward.

constexpr int team_dim = 16;
using host_launch = RAJA::seq_launch_t;

//TODO: create RAJA global thread loop policies for the host and device
#if defined(RAJA_ENABLE_CUDA)
using device_launch = RAJA::cuda_launch_t<false>;
#elif defined(RAJA_ENABLE_HIP)
using device_launch = RAJA::hip_launch_t<false>;
#endif

using launch_policy = RAJA::LaunchPolicy<
host_launch
#if defined(RAJA_GPU_ACTIVE)
#if defined(RAJA_ENABLE_CUDA)
,device_launch
#endif
>;

using col_loop = RAJA::LoopPolicy<RAJA::loop_exec, RAJA::cuda_global_thread_x>;
using col_loop = RAJA::LoopPolicy<RAJA::seq_exec
#if defined(RAJA_ENABLE_CUDA)
,RAJA::cuda_global_size_y_direct<team_dim>
#endif
>;

using row_loop = RAJA::LoopPolicy<RAJA::loop_exec, RAJA::cuda_global_thread_y>;
using row_loop = RAJA::LoopPolicy<RAJA::seq_exec
#if defined(RAJA_ENABLE_CUDA)
,RAJA::cuda_global_size_x_direct<team_dim>
#endif
>;

/* start time */
gettimeofday(&start, NULL);

constexpr int block_sz = 16;
int n_blocks = (width + block_sz-1) / block_sz + 1;
int n_teams = (width + team_dim-1) / team_dim + 1;

RAJA::launch<launch_policy>
(select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_blocks, n_blocks),
RAJA::Threads(block_sz, block_sz)),
(select_cpu_or_gpu, RAJA::LaunchParams(RAJA::Teams(n_teams, n_teams),
RAJA::Threads(team_dim, team_dim)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::loop<col_loop>(ctx, RAJA::RangeSegment(0, width), [&] (int col) {
Expand Down

0 comments on commit d4c1e89

Please sign in to comment.