overview of gpu threads
artv3 committed Jul 10, 2024
1 parent b58bc24 commit 8226b8e
Showing 1 changed file with 140 additions and 34 deletions.
174 changes: 140 additions & 34 deletions Advanced_Tutorial/01-GPU-Threads/01-GPU-Threads.cpp
@@ -1,79 +1,185 @@
#include "RAJA/RAJA.hpp"

#include <iostream>

int main(int RAJA_UNUSED_ARG(argc), char **RAJA_UNUSED_ARG(argv))
{


// GPU programming models such as CUDA and HIP perform computation
// on a predefined grid composed of threads and blocks.
// RAJA provides policies enabling users to utilize different
// thread enumeration strategies. The two main strategies
// are block/thread local enumeration and global enumeration.
//
// Under block and thread enumeration, threads are enumerated
// with block-local coordinates, while blocks are enumerated
// by their location within the compute grid.
// The illustration below shows a 2 x 2 compute grid
// wherein each block has 3 x 2 threads. Current
// programming models support up to three-dimensional
// block and thread configurations.
//
//
// Block (0,0) Block (1,0)
// [0,0][0,1][0,2] [0,0][0,1][0,2]
// [1,0][1,1][1,2] [1,0][1,1][1,2]
//
// Block (0,1) Block (1,1)
// [0,0][0,1][0,2] [0,0][0,1][0,2]
// [1,0][1,1][1,2] [1,0][1,1][1,2]
//
// Under global enumeration, each thread is assigned
// a unique thread id in each dimension (2D illustrated here).
// This is useful when the iteration space is amenable
// to tiling: blocks can be assigned to a tile and
// threads are assigned to work within the tile.
//
// [0,0][0,1][0,2] [0,3][0,4][0,5]
// [1,0][1,1][1,2] [1,3][1,4][1,5]
//
// [2,0][2,1][2,2] [2,3][2,4][2,5]
// [3,0][3,1][3,2] [3,3][3,4][3,5]
//
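// As a minimal sketch of how the two enumerations relate
// in CUDA/HIP terms (RAJA's policies generate the equivalent
// index mapping for you):
//
// int local_x  = threadIdx.x;                           // block-local id
// int global_x = blockIdx.x * blockDim.x + threadIdx.x; // global id
//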

// Short note on RAJA nomenclature:
// As RAJA serves as an abstraction layer,
// the RAJA::launch API uses the terms
// teams and threads. Teams are analogous
// to blocks in CUDA/HIP nomenclature
// and to workgroups in the SYCL programming model.
// Threads are analogous to threads in CUDA/HIP
// and to work-items in the SYCL programming model.

#if defined(RAJA_ENABLE_CUDA)

// The examples below showcase commonly used GPU policies.
// For the HIP and SYCL programming models, we offer analogous policies.
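// For instance, a HIP build would typically swap in the analogous
// launch policy (a sketch, assuming RAJA is configured with HIP support):
//
// using launch_policy = RAJA::LaunchPolicy<RAJA::hip_launch_t<false>>;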

using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>;

// Example 1. Global Indexing
// Main use case: Perfectly nested loops with large iteration spaces.
{
const int N_x = 10000;
const int N_y = 20000;
// Use a 16 x 16 (256-thread) team so the 2D compute grid covers the N_x x N_y range
const int block_sz = 16;
const int n_blocks_x = (N_x + block_sz - 1) / block_sz;
const int n_blocks_y = (N_y + block_sz - 1) / block_sz;

using global_pol_y = RAJA::LoopPolicy<RAJA::cuda_global_thread_y>;
using global_pol_x = RAJA::LoopPolicy<RAJA::cuda_global_thread_x>;

RAJA::launch<launch_policy>
(RAJA::LaunchParams(RAJA::Teams(n_blocks_x, n_blocks_y), RAJA::Threads(block_sz, block_sz)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::loop<global_pol_y>(ctx, RAJA::RangeSegment(0, N_y), [&] (int gy) {
RAJA::loop<global_pol_x>(ctx, RAJA::RangeSegment(0, N_x), [&] (int gx) {

//Do something



});
});


});

}
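// As a concrete (hypothetical) use of the global indices above, a
// device-accessible buffer `A` of size N_x * N_y could be filled in
// the inner loop body, e.g.:
//
// A[gy * N_x + gx] = gx + gy; // `A` is a hypothetical array, not allocated here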


// Example 2. Block and thread direct policies
// Ideal when the iteration space can be broken up into tiles.
// Teams are assigned to tiles and threads perform
// computations within each tile.

// The example below employs the direct versions of the block
// and thread policies; the underlying assumption is that
// the loop ranges fit within the configured grid and block sizes.

// In CUDA the direct loops are expressed as:
//
// const int i = threadIdx.x;
// if(i < N) { //kernel }
//

{
const int n_blocks = 50000;
const int block_sz = 64;

using outer_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using inner_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_direct>;

RAJA::launch<launch_policy>
(RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::loop<outer_pol>(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) {

RAJA::loop<inner_pol>(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) {

//Do something

}); //inner loop
}); //outer loop

});
}
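// In CUDA terms, the direct policies above map bx to blockIdx.x and
// tx to threadIdx.x; a global index into a tiled 1D range could then
// be formed as (illustrative sketch):
//
// int i = bx * block_sz + tx;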



// Example 3. Block and thread loop policies
// Similar to the example above, but using a thread loop
// policy. The utility of the thread loop policy arises when
// we consider multiple thread loops with varying iteration sizes.

// If a RAJA loop iteration space exceeds the configured number
// of threads in a team, the thread loop policy performs a
// team-stride loop to span the whole range.

// In CUDA the block stride loop is expressed as
//
// for(int i=threadIdx.x; i<N; i+=blockDim.x)
//
{
const int n_blocks = 50000;
const int block_sz = 64;

using outer_pol = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;
using inner_pol = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;

RAJA::launch<launch_policy>
(RAJA::LaunchParams(RAJA::Teams(n_blocks), RAJA::Threads(block_sz)),
[=] RAJA_HOST_DEVICE (RAJA::LaunchContext ctx) {

RAJA::loop<outer_pol>(ctx, RAJA::RangeSegment(0, n_blocks), [&] (int bx) {

//Iteration space matches the number of threads per team
//We could also use a direct policy here
RAJA::loop<inner_pol>(ctx, RAJA::RangeSegment(0, block_sz), [&] (int tx) {
//Do something here
}); //inner loop


//Iteration space is *larger* than the number of threads per team
RAJA::loop<inner_pol>(ctx, RAJA::RangeSegment(0, 2*block_sz), [&] (int tx) {
//Do something here
}); //inner loop

}); //outer loop

});

}
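// For the 2*block_sz range above, each thread of the 64-thread team
// executes two iterations of the team-stride loop. In CUDA terms (sketch):
//
// for (int tx = threadIdx.x; tx < 2*block_sz; tx += blockDim.x) { /* body */ }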

#else

std::cout<<"Please compile with CUDA"<<std::endl;
#endif

return 0;
}
