Skip to content

Commit

Permalink
Move host-side allocation to benchmarks and reuse device with UVM
Browse files Browse the repository at this point in the history
This commit puts benchmarks in control of allocating the host
memory used for verifying the results.

This enables benchmarks that use Unified Memory for the device
allocations, to avoid the host-side allocation and just pass
pointers to the device allocation to the benchmark driver.

Closes #128 .
  • Loading branch information
gonzalobg committed Jun 4, 2024
1 parent 321ba62 commit 2b9129e
Show file tree
Hide file tree
Showing 45 changed files with 609 additions and 533 deletions.
7 changes: 6 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,14 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG))
message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`")
endif ()

option(BUILD_NATIVE "Builds for the current systems CPU and GPU architecture." ON)

# setup some defaults flags for everything
set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer)
set(DEFAULT_RELEASE_FLAGS -O3 -march=native)
set(DEFAULT_RELEASE_FLAGS -O3)
if (BUILD_NATIVE)
set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native)
endif()

macro(hint_flag FLAG DESCRIPTION)
if (NOT DEFINED ${FLAG})
Expand Down
13 changes: 4 additions & 9 deletions src/Stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,10 @@

#pragma once

#include <array>
#include <vector>
#include <string>

// Array values
#define startA (0.1)
#define startB (0.2)
#define startC (0.0)
#define startScalar (0.4)
#include "benchmark.h"

template <class T>
class Stream
Expand All @@ -31,9 +27,8 @@ class Stream
virtual void nstream() = 0;
virtual T dot() = 0;

// Copy memory between host and device
virtual void init_arrays(T initA, T initB, T initC) = 0;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) = 0;
// Set pointers to read from arrays
virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
};

// Implementation specific device functions
Expand Down
34 changes: 17 additions & 17 deletions src/StreamModels.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,67 +35,67 @@
#include "FutharkStream.h"
#endif

template <typename T>
std::unique_ptr<Stream<T>> make_stream(intptr_t array_size, int deviceIndex) {
template <typename T, typename...Args>
std::unique_ptr<Stream<T>> make_stream(Args... args) {
#if defined(CUDA)
// Use the CUDA implementation
return std::make_unique<CUDAStream<T>>(array_size, deviceIndex);
return std::make_unique<CUDAStream<T>>(args...);

#elif defined(HIP)
// Use the HIP implementation
return std::make_unique<HIPStream<T>>(array_size, deviceIndex);
return std::make_unique<HIPStream<T>>(args...);

#elif defined(HC)
// Use the HC implementation
return std::make_unique<HCStream<T>>(array_size, deviceIndex);
return std::make_unique<HCStream<T>>(args...);

#elif defined(OCL)
// Use the OpenCL implementation
return std::make_unique<OCLStream<T>>(array_size, deviceIndex);
return std::make_unique<OCLStream<T>>(args...);

#elif defined(USE_RAJA)
// Use the RAJA implementation
return std::make_unique<RAJAStream<T>>(array_size, deviceIndex);
return std::make_unique<RAJAStream<T>>(args...);

#elif defined(KOKKOS)
// Use the Kokkos implementation
return std::make_unique<KokkosStream<T>>(array_size, deviceIndex);
return std::make_unique<KokkosStream<T>>(args...);

#elif defined(STD_DATA)
// Use the C++ STD data-oriented implementation
return std::make_unique<STDDataStream<T>>(array_size, deviceIndex);
return std::make_unique<STDDataStream<T>>(args...);

#elif defined(STD_INDICES)
// Use the C++ STD index-oriented implementation
return std::make_unique<STDIndicesStream<T>>(array_size, deviceIndex);
return std::make_unique<STDIndicesStream<T>>(args...);

#elif defined(STD_RANGES)
// Use the C++ STD ranges implementation
return std::make_unique<STDRangesStream<T>>(array_size, deviceIndex);
return std::make_unique<STDRangesStream<T>>(args...);

#elif defined(TBB)
// Use the C++20 implementation
return std::make_unique<TBBStream<T>>(array_size, deviceIndex);
return std::make_unique<TBBStream<T>>(args...);

#elif defined(THRUST)
// Use the Thrust implementation
return std::make_unique<ThrustStream<T>>(array_size, deviceIndex);
return std::make_unique<ThrustStream<T>>(args...);

#elif defined(ACC)
// Use the OpenACC implementation
return std::make_unique<ACCStream<T>>(array_size, deviceIndex);
return std::make_unique<ACCStream<T>>(args...);

#elif defined(SYCL) || defined(SYCL2020)
// Use the SYCL implementation
return std::make_unique<SYCLStream<T>>(array_size, deviceIndex);
return std::make_unique<SYCLStream<T>>(args...);

#elif defined(OMP)
// Use the OpenMP implementation
return std::make_unique<OMPStream<T>>(array_size, deviceIndex);
return std::make_unique<OMPStream<T>>(args...);

#elif defined(FUTHARK)
// Use the Futhark implementation
return std::make_unique<FutharkStream<T>>(array_size, deviceIndex);
return std::make_unique<FutharkStream<T>>(args...);

#else

Expand Down
20 changes: 10 additions & 10 deletions src/acc/ACCStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@
#include "ACCStream.h"

template <class T>
ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)
: array_size{ARRAY_SIZE}
ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
T initA, T initB, T initC)
: array_size{array_size}
{
acc_device_t device_type = acc_get_device_type();
acc_set_device_num(device, device_type);
acc_set_device_num(device_id, device_type);

// Set up data region on device
this->a = new T[array_size];
Expand All @@ -25,6 +26,8 @@ ACCStream<T>::ACCStream(const intptr_t ARRAY_SIZE, int device)

#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
{}

init_arrays(initA, initB, initC);
}

template <class T>
Expand Down Expand Up @@ -62,20 +65,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
}

template <class T>
void ACCStream<T>::read_arrays(std::vector<T>& h_a, std::vector<T>& h_b, std::vector<T>& h_c)
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
{
T *a = this->a;
T *b = this->b;
T *c = this->c;
#pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size])
{}

for (intptr_t i = 0; i < array_size; i++)
{
h_a[i] = a[i];
h_b[i] = b[i];
h_c[i] = c[i];
}
h_a = a;
h_b = b;
h_c = c;
}

template <class T>
Expand Down
33 changes: 13 additions & 20 deletions src/acc/ACCStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,32 +19,25 @@
template <class T>
class ACCStream : public Stream<T>
{
struct A{
T *a;
T *b;
T *c;
};

protected:
// Size of arrays
intptr_t array_size;
A aa;
// Device side pointers
T *a;
T *b;
T *c;
T* restrict a;
T* restrict b;
T* restrict c;

public:
ACCStream(const intptr_t, int);
ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
T initA, T initB, T initC);
~ACCStream();

virtual void copy() override;
virtual void add() override;
virtual void mul() override;
virtual void triad() override;
virtual void nstream() override;
virtual T dot() override;
void copy() override;
void add() override;
void mul() override;
void triad() override;
void nstream() override;
T dot() override;

virtual void init_arrays(T initA, T initB, T initC) override;
virtual void read_arrays(std::vector<T>& a, std::vector<T>& b, std::vector<T>& c) override;
void get_arrays(T const*& a, T const*& b, T const*& c) override;
void init_arrays(T initA, T initB, T initC);
};
66 changes: 66 additions & 0 deletions src/benchmark.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#pragma once

#include <algorithm>
#include <array>
#include <initializer_list>
#include <iostream>

// Array values
#define startA (0.1)
#define startB (0.2)
#define startC (0.0)
#define startScalar (0.4)

// Benchmark Identifier: identifies individual & groups of benchmarks:
// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
// - All: all kernels.
// - Individual kernels only.
enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};

struct Benchmark {
BenchId id;
char const* label;
// Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW:
// bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur
size_t weight;
// Is it one of: Copy, Mul, Add, Triad, Dot?
bool classic = false;
};

// Benchmarks in the order in which - if present - should be run for validation purposes:
constexpr size_t num_benchmarks = 6;
constexpr std::array<Benchmark, num_benchmarks> bench = {
Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true },
Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true },
Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
};

// Which buffers are needed by each benchmark
inline bool needs_buffer(BenchId id, char n) {
auto in = [n](std::initializer_list<char> values) {
return std::find(values.begin(), values.end(), n) != values.end();
};
switch(id) {
case BenchId::All: return in({'a','b','c'});
case BenchId::Classic: return in({'a','b','c'});
case BenchId::Copy: return in({'a','c'});
case BenchId::Mul: return in({'b','c'});
case BenchId::Add: return in({'a','b','c'});
case BenchId::Triad: return in({'a','b','c'});
case BenchId::Dot: return in({'a','b'});
case BenchId::Nstream: return in({'a','b','c'});
default:
std::cerr << "Unknown benchmark" << std::endl;
abort();
}
}

// Returns true if the benchmark needs to be run:
inline bool run_benchmark(BenchId selection, Benchmark const& b) {
if (selection == BenchId::All) return true;
if (selection == BenchId::Classic && b.classic) return true;
return selection == b.id;
}
37 changes: 19 additions & 18 deletions src/ci-prepare-bionic.sh
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,9 @@ setup_aocc() {

setup_nvhpc() {
echo "Preparing Nvidia HPC SDK"
local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A
local nvhpc_release="2023_231"
local cuda_ver="12.0"
local nvhpc_ver="24.5"
local nvhpc_release="2024_245"
local cuda_ver="12.4"

local tarball="nvhpc_$nvhpc_ver.tar.gz"

Expand Down Expand Up @@ -237,7 +237,10 @@ setup_tbb() {

setup_clang_gcc() {

sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6
sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 libc6
wget https://apt.llvm.org/llvm.sh
chmod +x llvm.sh
apt install -y clang-18 libclang-18-dev clang-tools-18 libomp-18-dev llvm-18-dev lld-18

export_var GCC_CXX "$(which g++-12)"
verify_bin_exists "$GCC_CXX"
Expand Down Expand Up @@ -289,20 +292,18 @@ setup_dpcpp() {
check_size
}

setup_hipsycl() {
setup_adaptivecpp() {

sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev
local hipsycl_ver="0.9.1"
local tarball="v$hipsycl_ver.tar.gz"
local install_dir="$PWD/hipsycl_dist_$hipsycl_ver"

local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz"
# local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz"
local adaptivecpp_ver="24.02.0"
local tarball="AdaptiveCpp-${adaptivecpp_ver}.tar.gz"
local install_dir="$PWD/adaptivecpp_dist_v${adaptivecpp_ver}"
local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/refs/tags/v${adaptivecpp_ver}.tar.gz"

get_and_untar "$tarball" "$url"

if [ "$SETUP" = true ]; then
local src="$PWD/AdaptiveCpp-$hipsycl_ver"
local src="$PWD/AdaptiveCpp-${adaptivecpp_ver}"
rm -rf "$src/build"
rm -rf "$install_dir"
cmake "-B$src/build" "-H$src" \
Expand All @@ -315,10 +316,10 @@ setup_hipsycl() {
cmake --build "$src/build" --target install -j "$(nproc)"
fi

export_var HIPSYCL_DIR "$install_dir"
verify_dir_exists "$HIPSYCL_DIR"
# note: this will forward --version to the default compiler so it won't say anything about hipsycl
"$HIPSYCL_DIR/bin/syclcc-clang" --version
export_var AdaptiveCpp_DIR "$install_dir"
verify_dir_exists "$AdaptiveCpp_DIR"
# note: this will forward --version to the default compiler so it won't say anything about adaptivecpp
"$AdaptiveCpp_DIR/bin/syclcc-clang" --version
check_size
}

Expand Down Expand Up @@ -391,7 +392,7 @@ setup_cmake() {
}

if [ "$PARALLEL" = true ]; then
(setup_clang_gcc && setup_rocm && setup_hipsycl) & # these need apt so run sequentially
(setup_clang_gcc && setup_rocm && setup_adaptivecpp) & # these need apt so run sequentially
setup_cmake &
setup_oclcpu &
setup_aocc &
Expand All @@ -406,7 +407,7 @@ else
# these need apt
setup_clang_gcc
setup_rocm
setup_hipsycl
setup_adaptivecpp
setup_cmake
setup_aocc
setup_oclcpu
Expand Down
Loading

0 comments on commit 2b9129e

Please sign in to comment.