diff --git a/CMakeLists.txt b/CMakeLists.txt index c98cab48..0df3836f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,9 +44,14 @@ if ((NOT BUILD_TYPE STREQUAL RELEASE) AND (NOT BUILD_TYPE STREQUAL DEBUG)) message(FATAL_ERROR "Only Release or Debug is supported, got `${CMAKE_BUILD_TYPE}`") endif () +option(BUILD_NATIVE "Builds for the current systems CPU and GPU architecture." ON) + # setup some defaults flags for everything set(DEFAULT_DEBUG_FLAGS -O2 -fno-omit-frame-pointer) -set(DEFAULT_RELEASE_FLAGS -O3 -march=native) +set(DEFAULT_RELEASE_FLAGS -O3) +if (BUILD_NATIVE) + set(DEFAULT_RELEASE_FLAGS ${DEFAULT_RELEASE_FLAGS} -march=native) +endif() macro(hint_flag FLAG DESCRIPTION) if (NOT DEFINED ${FLAG}) diff --git a/src/Stream.h b/src/Stream.h index 45c144c3..c8c6af1c 100644 --- a/src/Stream.h +++ b/src/Stream.h @@ -7,14 +7,10 @@ #pragma once +#include #include #include - -// Array values -#define startA (0.1) -#define startB (0.2) -#define startC (0.0) -#define startScalar (0.4) +#include "benchmark.h" template class Stream @@ -31,9 +27,8 @@ class Stream virtual void nstream() = 0; virtual T dot() = 0; - // Copy memory between host and device - virtual void init_arrays(T initA, T initB, T initC) = 0; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) = 0; + // Set pointers to read from arrays + virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0; }; // Implementation specific device functions diff --git a/src/StreamModels.h b/src/StreamModels.h index 556beb4d..6a0836f3 100644 --- a/src/StreamModels.h +++ b/src/StreamModels.h @@ -35,67 +35,67 @@ #include "FutharkStream.h" #endif -template -std::unique_ptr> make_stream(intptr_t array_size, int deviceIndex) { +template +std::unique_ptr> make_stream(Args... args) { #if defined(CUDA) // Use the CUDA implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(HIP) // Use the HIP implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(HC) // Use the HC implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(OCL) // Use the OpenCL implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(USE_RAJA) // Use the RAJA implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(KOKKOS) // Use the Kokkos implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(STD_DATA) // Use the C++ STD data-oriented implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(STD_INDICES) // Use the C++ STD index-oriented implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(STD_RANGES) // Use the C++ STD ranges implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(TBB) // Use the C++20 implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(THRUST) // Use the Thrust implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(ACC) // Use the OpenACC implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(SYCL) || defined(SYCL2020) // Use the SYCL implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(OMP) // Use the OpenMP implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #elif defined(FUTHARK) // Use the Futhark implementation - return std::make_unique>(array_size, deviceIndex); + return std::make_unique>(args...); #else diff --git a/src/acc/ACCStream.cpp b/src/acc/ACCStream.cpp index a346a39c..034336a4 100644 --- a/src/acc/ACCStream.cpp +++ b/src/acc/ACCStream.cpp @@ -8,11 +8,12 @@ #include "ACCStream.h" template -ACCStream::ACCStream(const intptr_t ARRAY_SIZE, int device) - : array_size{ARRAY_SIZE} +ACCStream::ACCStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + : array_size{array_size} { acc_device_t device_type = acc_get_device_type(); - acc_set_device_num(device, device_type); + acc_set_device_num(device_id, device_type); // Set up data region on device this->a = new T[array_size]; @@ -25,6 +26,8 @@ ACCStream::ACCStream(const intptr_t ARRAY_SIZE, int device) #pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size]) {} + + init_arrays(initA, initB, initC); } template @@ -62,7 +65,7 @@ void ACCStream::init_arrays(T initA, T initB, T initC) } template -void ACCStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void ACCStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { T *a = this->a; T *b = this->b; @@ -70,12 +73,9 @@ void ACCStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve #pragma acc update host(a[0:array_size], b[0:array_size], c[0:array_size]) {} - for (intptr_t i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/acc/ACCStream.h b/src/acc/ACCStream.h index 1b053cb4..8345b785 100644 --- a/src/acc/ACCStream.h +++ b/src/acc/ACCStream.h @@ -19,32 +19,25 @@ template class ACCStream : public Stream { - struct A{ - T *a; - T *b; - T *c; - }; - - protected: // Size of arrays intptr_t array_size; - A aa; // Device side pointers - T *a; - T *b; - T *c; + T* restrict a; + T* restrict b; + T* restrict c; public: - ACCStream(const intptr_t, int); + ACCStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~ACCStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/benchmark.h b/src/benchmark.h new file mode 100644 index 00000000..95d675f7 --- /dev/null +++ b/src/benchmark.h @@ -0,0 +1,66 @@ +#pragma once + +#include +#include +#include +#include + +// Array values +#define startA (0.1) +#define startB (0.2) +#define startC (0.0) +#define startScalar (0.4) + +// Benchmark Identifier: identifies individual & groups of benchmarks: +// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot. +// - All: all kernels. +// - Individual kernels only. +enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All}; + +struct Benchmark { + BenchId id; + char const* label; + // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW: + // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur + size_t weight; + // Is it one of: Copy, Mul, Add, Triad, Dot? + bool classic = false; +}; + +// Benchmarks in the order in which - if present - should be run for validation purposes: +constexpr size_t num_benchmarks = 6; +constexpr std::array bench = { + Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true }, + Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true }, + Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true }, + Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false } +}; + +// Which buffers are needed by each benchmark +inline bool needs_buffer(BenchId id, char n) { + auto in = [n](std::initializer_list values) { + return std::find(values.begin(), values.end(), n) != values.end(); + }; + switch(id) { + case BenchId::All: return in({'a','b','c'}); + case BenchId::Classic: return in({'a','b','c'}); + case BenchId::Copy: return in({'a','c'}); + case BenchId::Mul: return in({'b','c'}); + case BenchId::Add: return in({'a','b','c'}); + case BenchId::Triad: return in({'a','b','c'}); + case BenchId::Dot: return in({'a','b'}); + case BenchId::Nstream: return in({'a','b','c'}); + default: + std::cerr << "Unknown benchmark" << std::endl; + abort(); + } +} + +// Returns true if the benchmark needs to be run: +inline bool run_benchmark(BenchId selection, Benchmark const& b) { + if (selection == BenchId::All) return true; + if (selection == BenchId::Classic && b.classic) return true; + return selection == b.id; +} diff --git a/src/ci-prepare-bionic.sh b/src/ci-prepare-bionic.sh index f5c1a704..5b2041b9 100755 --- a/src/ci-prepare-bionic.sh +++ b/src/ci-prepare-bionic.sh @@ -138,9 +138,9 @@ setup_aocc() { setup_nvhpc() { echo "Preparing Nvidia HPC SDK" - local nvhpc_ver="23.1" # TODO FIXME > 23.1 has a bug with -A - local nvhpc_release="2023_231" - local cuda_ver="12.0" + local nvhpc_ver="24.5" + local nvhpc_release="2024_245" + local cuda_ver="12.4" local tarball="nvhpc_$nvhpc_ver.tar.gz" @@ -237,7 +237,10 @@ setup_tbb() { setup_clang_gcc() { - sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 clang libomp-dev libc6 + sudo apt-get install -y -qq gcc-12-offload-nvptx gcc-12-offload-amdgcn libtbb2 libtbb-dev g++-12 libc6 + wget https://apt.llvm.org/llvm.sh + chmod +x llvm.sh + apt install -y clang-18 libclang-18-dev clang-tools-18 libomp-18-dev llvm-18-dev lld-18 export_var GCC_CXX "$(which g++-12)" verify_bin_exists "$GCC_CXX" @@ -289,20 +292,18 @@ setup_dpcpp() { check_size } -setup_hipsycl() { +setup_adaptivecpp() { sudo apt-get install -y -qq libboost-fiber-dev libboost-context-dev - local hipsycl_ver="0.9.1" - local tarball="v$hipsycl_ver.tar.gz" - local install_dir="$PWD/hipsycl_dist_$hipsycl_ver" - - local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/v$hipsycl_ver.tar.gz" - # local url="http://localhost:8000/AdaptiveCpp-$hipsycl_ver.tar.gz" + local adaptivecpp_ver="24.02.0" + local tarball="AdaptiveCpp-${adaptivecpp_ver}.tar.gz" + local install_dir="$PWD/adaptivecpp_dist_v${adaptivecpp_ver}" + local url="https://github.com/AdaptiveCpp/AdaptiveCpp/archive/refs/tags/v${adaptivecpp_ver}.tar.gz" get_and_untar "$tarball" "$url" if [ "$SETUP" = true ]; then - local src="$PWD/AdaptiveCpp-$hipsycl_ver" + local src="$PWD/AdaptiveCpp-${adaptivecpp_ver}" rm -rf "$src/build" rm -rf "$install_dir" cmake "-B$src/build" "-H$src" \ @@ -315,10 +316,10 @@ setup_hipsycl() { cmake --build "$src/build" --target install -j "$(nproc)" fi - export_var HIPSYCL_DIR "$install_dir" - verify_dir_exists "$HIPSYCL_DIR" - # note: this will forward --version to the default compiler so it won't say anything about hipsycl - "$HIPSYCL_DIR/bin/syclcc-clang" --version + export_var AdaptiveCpp_DIR "$install_dir" + verify_dir_exists "$AdaptiveCpp_DIR" + # note: this will forward --version to the default compiler so it won't say anything about adaptivecpp + "$AdaptiveCpp_DIR/bin/syclcc-clang" --version check_size } @@ -391,7 +392,7 @@ setup_cmake() { } if [ "$PARALLEL" = true ]; then - (setup_clang_gcc && setup_rocm && setup_hipsycl) & # these need apt so run sequentially + (setup_clang_gcc && setup_rocm && setup_adaptivecpp) & # these need apt so run sequentially setup_cmake & setup_oclcpu & setup_aocc & @@ -406,7 +407,7 @@ else # these need apt setup_clang_gcc setup_rocm - setup_hipsycl + setup_adaptivecpp setup_cmake setup_aocc setup_oclcpu diff --git a/src/ci-test-compile.sh b/src/ci-test-compile.sh index 249bab4f..a34795e9 100755 --- a/src/ci-test-compile.sh +++ b/src/ci-test-compile.sh @@ -335,10 +335,10 @@ build_dpcpp() { # run_build intel_build "dpcpp" sycl "-DCMAKE_CXX_COMPILER=${GCC_CXX:?} -DSYCL_COMPILER=ONEAPI-DPCPP" } -build_hipsycl() { - run_build hipsycl_build "syclcc" sycl " - -DSYCL_COMPILER=HIPSYCL \ - -DSYCL_COMPILER_DIR=${HIPSYCL_DIR:?}" +build_adaptivecpp() { + run_build adaptivecpp_build "syclcc" sycl " + -DSYCL_COMPILER=AdaptiveCpp \ + -DSYCL_COMPILER_DIR=${AdaptiveCpp_DIR:?}" } echo "Test compiling with ${COMPILER} CXX for ${MODEL} model" @@ -352,7 +352,7 @@ aocc) build_aocc ;; aomp) build_aomp ;; hip) build_hip ;; dpcpp) build_dpcpp ;; -hipsycl) build_hipsycl ;; +adaptivecpp) build_adaptivecpp ;; # XXX below are local only; licence or very large download required, candidate for local runner icpx) build_icpx ;; @@ -366,7 +366,7 @@ all) build_aomp build_hip build_dpcpp - build_hipsycl + build_adaptivecpp build_icpx build_icpc diff --git a/src/cuda/CUDAStream.cu b/src/cuda/CUDAStream.cu index 9d63ff3f..4f5599a7 100644 --- a/src/cuda/CUDAStream.cu +++ b/src/cuda/CUDAStream.cu @@ -77,7 +77,8 @@ void free_host(T* p) { } template -CUDAStream::CUDAStream(const intptr_t array_size, const int device_index) +CUDAStream::CUDAStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) : array_size(array_size) { // Set device @@ -131,14 +132,20 @@ CUDAStream::CUDAStream(const intptr_t array_size, const int device_index) std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl; // Check buffers fit on the device - if (dprop.totalGlobalMem < total_bytes) + if (dprop.totalGlobalMem < total_bytes) { + std::cerr << "Requested array size of " << total_bytes * 1e-9 + << " GB exceeds memory capacity of " << dprop.totalGlobalMem * 1e-9 << " GB !" << std::endl; throw std::runtime_error("Device does not have enough memory for all buffers"); + } // Allocate buffers: d_a = alloc_device(array_size); d_b = alloc_device(array_size); d_c = alloc_device(array_size); sums = alloc_host(dot_num_blocks); + + // Initialize buffers: + init_arrays(initA, initB, initC); } template @@ -204,21 +211,26 @@ void CUDAStream::init_arrays(T initA, T initB, T initC) } template -void CUDAStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void CUDAStream::get_arrays(T const*& a, T const*& b, T const*& c) { - // Copy device memory to host -#if defined(PAGEFAULT) || defined(MANAGED) CU(cudaStreamSynchronize(stream)); - for (intptr_t i = 0; i < array_size; ++i) - { - a[i] = d_a[i]; - b[i] = d_b[i]; - c[i] = d_c[i]; - } +#if defined(PAGEFAULT) || defined(MANAGED) + // Unified memory: return pointers to device memory + a = d_a; + b = d_b; + c = d_c; #else - CU(cudaMemcpy(a.data(), d_a, a.size()*sizeof(T), cudaMemcpyDeviceToHost)); - CU(cudaMemcpy(b.data(), d_b, b.size()*sizeof(T), cudaMemcpyDeviceToHost)); - CU(cudaMemcpy(c.data(), d_c, c.size()*sizeof(T), cudaMemcpyDeviceToHost)); + // No Unified memory: copy data to the host + size_t nbytes = array_size * sizeof(T); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost)); + CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost)); + CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost)); #endif } diff --git a/src/cuda/CUDAStream.h b/src/cuda/CUDAStream.h index 5b739569..50e099dc 100644 --- a/src/cuda/CUDAStream.h +++ b/src/cuda/CUDAStream.h @@ -26,27 +26,31 @@ class CUDAStream : public Stream intptr_t array_size; // Host array for partial sums for dot kernel - T *sums; + T* sums; // Device side pointers to arrays - T *d_a; - T *d_b; - T *d_c; + T* d_a; + T* d_b; + T* d_c; + + // If UVM is disabled, host arrays for verification purposes + std::vector h_a, h_b, h_c; // Number of blocks for dot kernel intptr_t dot_num_blocks; public: - CUDAStream(const intptr_t, const int); + CUDAStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~CUDAStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/futhark/FutharkStream.cpp b/src/futhark/FutharkStream.cpp index ebd3633b..392ff898 100644 --- a/src/futhark/FutharkStream.cpp +++ b/src/futhark/FutharkStream.cpp @@ -11,9 +11,10 @@ #include "FutharkStream.h" template -FutharkStream::FutharkStream(const int ARRAY_SIZE, int device) +FutharkStream::FutharkStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) + : array_size(array_size) { - this->array_size = ARRAY_SIZE; this->cfg = futhark_context_config_new(); this->device = "#" + std::to_string(device); #if defined(FUTHARK_BACKEND_cuda) || defined(FUTHARK_BACKEND_opencl) @@ -23,6 +24,7 @@ FutharkStream::FutharkStream(const int ARRAY_SIZE, int device) this->a = NULL; this->b = NULL; this->c = NULL; + init_arrays(initA, initB, initC); } template <> @@ -98,19 +100,31 @@ void FutharkStream::init_arrays(double initA, double initB, double initC } template <> -void FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { +void FutharkStream::get_arrays(float const*& a_, float const*& b_, float const*& c_) { + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->a, h_a.data()); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->b, h_b.data()); futhark_values_f32_1d(this->ctx, (futhark_f32_1d*)this->c, h_c.data()); futhark_context_sync(this->ctx); + a_ = h_a.data(); + b_ = h_b.data(); + c_ = h_c.data(); } template <> -void FutharkStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) { +void FutharkStream::get_arrays(double const*& a_, double const*& b_, double const*& c_) { + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->a, h_a.data()); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->b, h_b.data()); futhark_values_f64_1d(this->ctx, (futhark_f64_1d*)this->c, h_c.data()); futhark_context_sync(this->ctx); + a_ = h_a.data(); + b_ = h_b.data(); + c_ = h_c.data(); } template <> diff --git a/src/futhark/FutharkStream.h b/src/futhark/FutharkStream.h index 6290e79a..eabdabbe 100644 --- a/src/futhark/FutharkStream.h +++ b/src/futhark/FutharkStream.h @@ -44,17 +44,21 @@ class FutharkStream : public Stream void* b; void* c; + // Host side arrays for verification + std::vector h_a, h_b, h_c; + public: - FutharkStream(const int, int); + FutharkStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~FutharkStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/futhark/model.cmake b/src/futhark/model.cmake index edd21fa6..d7b08795 100644 --- a/src/futhark/model.cmake +++ b/src/futhark/model.cmake @@ -44,6 +44,7 @@ macro(setup) elseif (${FUTHARK_BACKEND} STREQUAL "cuda") find_package(CUDA REQUIRED) register_link_library("nvrtc" "cuda" "cudart") + set(CMAKE_C_COMPILER "nvcc") else () message(FATAL_ERROR "Unsupported Futhark backend: ${FUTHARK_BACKEND}") endif() diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index ec02425a..e3878afd 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -25,7 +25,9 @@ void check_error(void) __host__ __device__ constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1)/b; } template -HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) +HIPStream::HIPStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { // Set device int count; @@ -47,13 +49,12 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) std::cout << "Memory: DEFAULT" << std::endl; #endif - array_size = ARRAY_SIZE; // Round dot_num_blocks up to next multiple of (TBSIZE * dot_elements_per_lane) dot_num_blocks = (array_size + (TBSIZE * dot_elements_per_lane - 1)) / (TBSIZE * dot_elements_per_lane); size_t array_bytes = sizeof(T); - array_bytes *= ARRAY_SIZE; - size_t total_bytes = array_bytes * 3; + array_bytes *= array_size; + size_t total_bytes = array_bytes * std::size_t{3}; // Allocate the host array for partial sums for dot kernels using hipHostMalloc. // This creates an array on the host which is visible to the device. However, it requires @@ -65,7 +66,7 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) // Check buffers fit on the device hipDeviceProp_t props; hipGetDeviceProperties(&props, 0); - if (props.totalGlobalMem < std::size_t{3}*ARRAY_SIZE*sizeof(T)) + if (props.totalGlobalMem < total_bytes) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create device buffers @@ -88,6 +89,8 @@ HIPStream::HIPStream(const intptr_t ARRAY_SIZE, const int device_index) hipMalloc(&d_c, array_bytes); check_error(); #endif + + init_arrays(initA, initB, initC); } @@ -127,24 +130,28 @@ void HIPStream::init_arrays(T initA, T initB, T initC) } template -void HIPStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void HIPStream::get_arrays(T const*& a, T const*& b, T const*& c) { - - // Copy device memory to host + hipDeviceSynchronize(); #if defined(PAGEFAULT) || defined(MANAGED) - hipDeviceSynchronize(); - for (intptr_t i = 0; i < array_size; i++) - { - a[i] = d_a[i]; - b[i] = d_b[i]; - c[i] = d_c[i]; - } + // Unified memory: return pointers to device memory + a = d_a; + b = d_b; + c = d_c; #else - hipMemcpy(a.data(), d_a, a.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(b.data(), d_b, b.size()*sizeof(T), hipMemcpyDeviceToHost); - check_error(); - hipMemcpy(c.data(), d_c, c.size()*sizeof(T), hipMemcpyDeviceToHost); + // No Unified memory: copy data to the host + size_t nbytes = array_size * sizeof(T); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + hipMemcpy(h_a.data(), d_a, nbytes, hipMemcpyDeviceToHost); + check_error(); + hipMemcpy(h_b.data(), d_b, nbytes, hipMemcpyDeviceToHost); + check_error(); + hipMemcpy(h_c.data(), d_c, nbytes, hipMemcpyDeviceToHost); check_error(); #endif } diff --git a/src/hip/HIPStream.h b/src/hip/HIPStream.h index 76ef7df4..a1c45802 100644 --- a/src/hip/HIPStream.h +++ b/src/hip/HIPStream.h @@ -48,20 +48,21 @@ class HIPStream : public Stream T *d_b; T *d_c; + // If UVM is disabled, host arrays for verification purposes + std::vector h_a, h_b, h_c; public: - - HIPStream(const intptr_t, const int); + HIPStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~HIPStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/kokkos/KokkosStream.cpp b/src/kokkos/KokkosStream.cpp index e49d5bcc..fcbdb7a7 100644 --- a/src/kokkos/KokkosStream.cpp +++ b/src/kokkos/KokkosStream.cpp @@ -8,21 +8,23 @@ #include "KokkosStream.hpp" template -KokkosStream::KokkosStream( - const intptr_t ARRAY_SIZE, const int device_index) - : array_size(ARRAY_SIZE) +KokkosStream::KokkosStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { Kokkos::initialize(Kokkos::InitializationSettings().set_device_id(device_index)); - d_a = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), ARRAY_SIZE); - d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), ARRAY_SIZE); - d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), ARRAY_SIZE); + d_a = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_a"), array_size); + d_b = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_b"), array_size); + d_c = new Kokkos::View(Kokkos::ViewAllocateWithoutInitializing("d_c"), array_size); hm_a = new typename Kokkos::View::HostMirror(); hm_b = new typename Kokkos::View::HostMirror(); hm_c = new typename Kokkos::View::HostMirror(); *hm_a = create_mirror_view(*d_a); *hm_b = create_mirror_view(*d_b); *hm_c = create_mirror_view(*d_c); + + init_arrays(initA, initB, initC); } template @@ -47,18 +49,14 @@ void KokkosStream::init_arrays(T initA, T initB, T initC) } template -void KokkosStream::read_arrays( - std::vector& a, std::vector& b, std::vector& c) +void KokkosStream::get_arrays(T const*& a, T const*& b, T const*& c) { deep_copy(*hm_a, *d_a); deep_copy(*hm_b, *d_b); deep_copy(*hm_c, *d_c); - for(intptr_t ii = 0; ii < array_size; ++ii) - { - a[ii] = (*hm_a)(ii); - b[ii] = (*hm_b)(ii); - c[ii] = (*hm_c)(ii); - } + a = hm_a->data(); + b = hm_b->data(); + c = hm_c->data(); } template diff --git a/src/kokkos/KokkosStream.hpp b/src/kokkos/KokkosStream.hpp index 8e40119c..bc3ac3ee 100644 --- a/src/kokkos/KokkosStream.hpp +++ b/src/kokkos/KokkosStream.hpp @@ -22,27 +22,27 @@ class KokkosStream : public Stream intptr_t array_size; // Device side pointers to arrays - typename Kokkos::View* d_a; - typename Kokkos::View* d_b; - typename Kokkos::View* d_c; - typename Kokkos::View::HostMirror* hm_a; - typename Kokkos::View::HostMirror* hm_b; - typename Kokkos::View::HostMirror* hm_c; + typename Kokkos::View* d_a; + typename Kokkos::View* d_b; + typename Kokkos::View* d_c; + typename Kokkos::View::HostMirror* hm_a; + typename Kokkos::View::HostMirror* hm_b; + typename Kokkos::View::HostMirror* hm_c; public: - KokkosStream(const intptr_t, const int); + KokkosStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~KokkosStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays( - std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/kokkos/model.cmake b/src/kokkos/model.cmake index 7457eebd..2223c753 100644 --- a/src/kokkos/model.cmake +++ b/src/kokkos/model.cmake @@ -1,5 +1,5 @@ register_flag_optional(CMAKE_CXX_COMPILER - "Any CXX compiler that is supported by CMake detection and RAJA. + "Any CXX compiler that is supported by CMake detection and Kokkos. See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" "c++") @@ -21,7 +21,7 @@ macro(setup) set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17 cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md - + message("KOKKOS_IN_PACKAGE=${KOKKOS_IN_PACKAGE}") if (EXISTS "${KOKKOS_IN_TREE}") message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) diff --git a/src/main.cpp b/src/main.cpp index c677f048..55c3a4e7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -23,7 +23,7 @@ #include "Unit.h" // Default size of 2^25 -intptr_t ARRAY_SIZE = 33554432; +intptr_t array_size = 33554432; size_t num_times = 100; size_t deviceIndex = 0; bool use_float = false; @@ -33,42 +33,11 @@ Unit unit{Unit::Kind::MegaByte}; bool silence_errors = false; std::string csv_separator = ","; -// Benchmark Identifier: identifies individual & groups of benchmarks: -// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot. -// - All: all kernels. -// - Individual kernels only. -enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All}; - -struct Benchmark { - BenchId id; - char const* label; - // Weight counts data elements of original arrays moved each loop iteration - used to calculate achieved BW: - // bytes = weight * sizeof(T) * ARRAY_SIZE -> bw = bytes / dur - size_t weight; - // Is it one of: Copy, Mul, Add, Triad, Dot? - bool classic = false; -}; - -// Benchmarks in the order in which - if present - should be run for validation purposes: -constexpr size_t num_benchmarks = 6; -std::array bench = { - Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true }, - Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true }, - Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true }, - Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true }, - Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true }, - Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false } -}; - // Selected benchmarks to run: default is all 5 classic benchmarks. BenchId selection = BenchId::Classic; // Returns true if the benchmark needs to be run: -bool run_benchmark(Benchmark const& b) { - if (selection == BenchId::All) return true; - if (selection == BenchId::Classic && b.classic) return true; - return selection == b.id; -} +bool run_benchmark(Benchmark const& b) { return run_benchmark(selection, b); } // Benchmark run order // - Classic: runs each bench once in the order above, and repeats n times. @@ -174,8 +143,7 @@ std::vector> run_all(std::unique_ptr>& stream, T& } template -void check_solution(const size_t ntimes, std::vector& a, std::vector& b, std::vector& c, - T& sum); +void check_solution(const size_t ntimes, T const* a, T const* b, T const* c, T sum); // Generic run routine // Runs the kernel(s) and prints output. @@ -186,7 +154,7 @@ void run() // Formatting utilities: auto fmt_bw = [&](size_t weight, double dt) { - return unit.fmt((weight * sizeof(T) * ARRAY_SIZE)/dt); + return unit.fmt((weight * sizeof(T) * array_size)/dt); }; auto fmt_csv_header = [] { std::cout @@ -251,46 +219,37 @@ void run() default: std::cerr << "Error: Unknown order" << std::endl; abort(); }; std::cout << " order " << std::endl; - std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; + std::cout << "Number of elements: " << array_size << std::endl; std::cout << "Precision: " << (sizeof(T) == sizeof(float)? "float" : "double") << std::endl; - size_t nbytes = ARRAY_SIZE * sizeof(T); + size_t nbytes = array_size * sizeof(T); std::cout << std::setprecision(1) << std::fixed << "Array size: " << unit.fmt(nbytes) << " " << unit.str() << std::endl; std::cout << "Total size: " << unit.fmt(3.0*nbytes) << " " << unit.str() << std::endl; std::cout.precision(ss); } - std::unique_ptr> stream = make_stream(ARRAY_SIZE, deviceIndex); - auto initElapsedS = time([&] { stream->init_arrays(startA, startB, startC); }); + std::unique_ptr> stream + = make_stream(selection, array_size, deviceIndex, startA, startB, startC); // Result of the Dot kernel, if used. T sum{}; std::vector> timings = run_all(stream, sum); // Create & read host vectors: - std::vector a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE); - auto readElapsedS = time([&] { stream->read_arrays(a, b, c); }); + T const* a; + T const* b; + T const* c; + stream->get_arrays(a, b, c); check_solution(num_times, a, b, c, sum); - auto initBWps = fmt_bw(3, initElapsedS); - auto readBWps = fmt_bw(3, readElapsedS); if (output_as_csv) { fmt_csv_header(); - fmt_csv("Init", 1, ARRAY_SIZE, sizeof(T), initBWps, initElapsedS, initElapsedS, initElapsedS); - fmt_csv("Read", 1, ARRAY_SIZE, sizeof(T), readBWps, readElapsedS, readElapsedS, readElapsedS); } else { - std::cout << "Init: " - << std::setw(7) - << initElapsedS << " s (=" << initBWps << " " << unit.str() << "/s" << ")" << std::endl; - std::cout << "Read: " - << std::setw(7) - << readElapsedS << " s (=" << readBWps << " " << unit.str() << "/s" << ")" << std::endl; - std::cout << std::left << std::setw(12) << "Function" << std::left << std::setw(12) << (std::string(unit.str()) + "/s") @@ -313,15 +272,13 @@ void run() / (double)(num_times - 1); // Display results - fmt_result(bench[i].label, num_times, ARRAY_SIZE, sizeof(T), + fmt_result(bench[i].label, num_times, array_size, sizeof(T), fmt_bw(bench[i].weight, *minmax.first), *minmax.first, *minmax.second, average); } } template -void check_solution(const size_t num_times, - std::vector& a, std::vector& b, std::vector& c, T& sum) -{ +void check_solution(const size_t num_times, T const* a, T const* b, T const* c, T sum) { // Generate correct solution T goldA = startA; T goldB = startB; @@ -338,7 +295,7 @@ void check_solution(const size_t num_times, case BenchId::Add: goldC = goldA + goldB; break; case BenchId::Triad: goldA = goldB + scalar * goldC; break; case BenchId::Nstream: goldA += goldB + scalar * goldC; break; - case BenchId::Dot: goldS = goldA * goldB * T(ARRAY_SIZE); break; // This calculates the answer exactly + case BenchId::Dot: goldS = goldA * goldB * T(array_size); break; // This calculates the answer exactly default: std::cerr << "Unimplemented Check: " << bench[b].label << std::endl; abort(); @@ -372,38 +329,38 @@ void check_solution(const size_t num_times, // Error relative tolerance check size_t failed = 0; - T eps = std::numeric_limits::epsilon(); - T epsi = eps * T(100000.0); - auto check = [&](const char* name, T is, T should, T e, size_t i = size_t(-1)) { - if (e > epsi || std::isnan(e) || std::isnan(is)) { + T max_rel = std::numeric_limits::epsilon() * T(1000000.0); + auto check = [&](const char* name, T is, T should, size_t i = size_t(-1)) { + // Relative difference: + T diff = std::abs(is - should); + T abs_is = std::abs(is); + T abs_sh = std::abs(should); + T largest = std::max(abs_is, abs_sh); + T same = diff <= largest * max_rel; + if (!same || std::isnan(is)) { ++failed; if (failed > 10) return; std::cerr << "FAILED validation of " << name; if (i != size_t(-1)) std::cerr << "[" << i << "]"; - std::cerr << ": " << is << " != " << should - << ", relative error=" << e << " > " << epsi << std::endl; + std::cerr << ": " << is << " (is) != " << should + << " (should)" << ", diff=" << diff << " > " + << largest * max_rel << std::endl; } }; // Sum - T eS = std::fabs(sum - goldS) / std::fabs(goldS + eps); for (size_t i = 0; i < num_benchmarks; ++i) { if (bench[i].id != BenchId::Dot) continue; if (run_benchmark(bench[i])) - check("sum", sum, goldS, eS); + check("sum", sum, goldS); break; } // Calculate the L^infty-norm relative error - for (size_t i = 0; i < a.size(); ++i) { - T vA = a[i], vB = b[i], vC = c[i]; - T eA = std::fabs(vA - goldA) / std::fabs(goldA + eps); - T eB = std::fabs(vB - goldB) / std::fabs(goldB + eps); - T eC = std::fabs(vC - goldC) / std::fabs(goldC + eps); - - check("a", a[i], goldA, eA, i); - check("b", b[i], goldB, eB, i); - check("c", c[i], goldC, eC, i); + for (size_t i = 0; i < array_size; ++i) { + check("a", a[i], goldA, i); + check("b", b[i], goldB, i); + check("c", c[i], goldC, i); } if (failed > 0 && !silence_errors) @@ -449,13 +406,11 @@ void parseArguments(int argc, char *argv[]) else if (!std::string("--arraysize").compare(argv[i]) || !std::string("-s").compare(argv[i])) { - intptr_t array_size; if (++i >= argc || !parseInt(argv[i], &array_size) || array_size <= 0) { std::cerr << "Invalid array size." << std::endl; std::exit(EXIT_FAILURE); } - ARRAY_SIZE = array_size; } else if (!std::string("--numtimes").compare(argv[i]) || !std::string("-n").compare(argv[i])) diff --git a/src/ocl/OCLStream.cpp b/src/ocl/OCLStream.cpp index c70a701d..fc1ae30c 100644 --- a/src/ocl/OCLStream.cpp +++ b/src/ocl/OCLStream.cpp @@ -100,8 +100,9 @@ std::string kernels{R"CLC( template -OCLStream::OCLStream(const intptr_t ARRAY_SIZE, const int device_index) - : array_size{ARRAY_SIZE} +OCLStream::OCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size{array_size} { if (!cached) getDeviceList(); @@ -172,18 +173,20 @@ OCLStream::OCLStream(const intptr_t ARRAY_SIZE, const int device_index) // Check buffers fit on the device cl_ulong totalmem = device.getInfo(); cl_ulong maxbuffer = device.getInfo(); - if (maxbuffer < sizeof(T)*ARRAY_SIZE) + if (maxbuffer < sizeof(T)*array_size) throw std::runtime_error("Device cannot allocate a buffer big enough"); - if (totalmem < 3*sizeof(T)*ARRAY_SIZE) + if (totalmem < 3*sizeof(T)*array_size) throw std::runtime_error("Device does not have enough memory for all 3 buffers"); // Create buffers - d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); - d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * ARRAY_SIZE); + d_a = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); + d_b = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); + d_c = cl::Buffer(context, CL_MEM_READ_WRITE, sizeof(T) * array_size); d_sum = cl::Buffer(context, CL_MEM_WRITE_ONLY, sizeof(T) * dot_num_groups); sums = std::vector(dot_num_groups); + + init_arrays(initA, initB, initC); } template @@ -277,11 +280,17 @@ void OCLStream::init_arrays(T initA, T initB, T initC) } template -void OCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void OCLStream::get_arrays(T const*& a, T const*& b, T const*& c) { - cl::copy(queue, d_a, a.begin(), a.end()); - cl::copy(queue, d_b, b.begin(), b.end()); - cl::copy(queue, d_c, c.begin(), c.end()); + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); + a = h_a.data(); + b = h_b.data(); + c = h_c.data(); + cl::copy(queue, d_a, h_a.begin(), h_a.end()); + cl::copy(queue, d_b, h_b.begin(), h_b.end()); + cl::copy(queue, d_c, h_c.begin(), h_c.end()); } void getDeviceList(void) diff --git a/src/ocl/OCLStream.h b/src/ocl/OCLStream.h index e2366dad..e5405dde 100644 --- a/src/ocl/OCLStream.h +++ b/src/ocl/OCLStream.h @@ -42,6 +42,9 @@ class OCLStream : public Stream cl::Buffer d_c; cl::Buffer d_sum; + // Host-side arrays for verification + std::vector h_a, h_b, h_c; + cl::KernelFunctor *init_kernel; cl::KernelFunctor *copy_kernel; cl::KernelFunctor * mul_kernel; @@ -56,19 +59,19 @@ class OCLStream : public Stream public: - OCLStream(const intptr_t, const int); + OCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~OCLStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; // Populate the devices list diff --git a/src/omp/OMPStream.cpp b/src/omp/OMPStream.cpp index 09b749fd..f0389373 100644 --- a/src/omp/OMPStream.cpp +++ b/src/omp/OMPStream.cpp @@ -13,10 +13,10 @@ #endif template -OMPStream::OMPStream(const intptr_t ARRAY_SIZE, int device) +OMPStream::OMPStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) + : array_size(array_size) { - array_size = ARRAY_SIZE; - // Allocate on the host this->a = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); this->b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); @@ -32,6 +32,7 @@ OMPStream::OMPStream(const intptr_t ARRAY_SIZE, int device) {} #endif + init_arrays(initA, initB, initC); } template @@ -77,7 +78,7 @@ void OMPStream::init_arrays(T initA, T initB, T initC) } template -void OMPStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void OMPStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { #ifdef OMP_TARGET_GPU @@ -87,15 +88,9 @@ void OMPStream::read_arrays(std::vector& h_a, std::vector& h_b, std::ve #pragma omp target update from(a[0:array_size], b[0:array_size], c[0:array_size]) {} #endif - - #pragma omp parallel for - for (intptr_t i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } - + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/omp/OMPStream.h b/src/omp/OMPStream.h index 40770005..fca4906c 100644 --- a/src/omp/OMPStream.h +++ b/src/omp/OMPStream.h @@ -29,16 +29,17 @@ class OMPStream : public Stream T *c; public: - OMPStream(const intptr_t, int); + OMPStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~OMPStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/raja/RAJAStream.cpp b/src/raja/RAJAStream.cpp index 6d6e8342..35fe6e8d 100644 --- a/src/raja/RAJAStream.cpp +++ b/src/raja/RAJAStream.cpp @@ -16,8 +16,9 @@ using RAJA::forall; #endif template -RAJAStream::RAJAStream(const intptr_t ARRAY_SIZE, const int device_index) - : array_size(ARRAY_SIZE), range(0, ARRAY_SIZE) +RAJAStream::RAJAStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size), range(0, array_size) { #ifdef RAJA_TARGET_CPU @@ -25,11 +26,13 @@ RAJAStream::RAJAStream(const intptr_t ARRAY_SIZE, const int device_index) d_b = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); d_c = (T*)aligned_alloc(ALIGNMENT, sizeof(T)*array_size); #else - cudaMallocManaged((void**)&d_a, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); - cudaMallocManaged((void**)&d_b, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); - cudaMallocManaged((void**)&d_c, sizeof(T)*ARRAY_SIZE, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_a, sizeof(T)*array_size, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_b, sizeof(T)*array_size, cudaMemAttachGlobal); + cudaMallocManaged((void**)&d_c, sizeof(T)*array_size, cudaMemAttachGlobal); cudaDeviceSynchronize(); #endif + + init_arrays(initA, initB, initC); } template @@ -61,12 +64,11 @@ void RAJAStream::init_arrays(T initA, T initB, T initC) } template -void RAJAStream::read_arrays( - std::vector& a, std::vector& b, std::vector& c) +void RAJAStream::get_arrays(T const*& a, T const*& b, T const*& c) { - std::copy(d_a, d_a + array_size, a.data()); - std::copy(d_b, d_b + array_size, b.data()); - std::copy(d_c, d_c + array_size, c.data()); + a = d_a; + b = d_b; + c = d_c; } template diff --git a/src/raja/RAJAStream.hpp b/src/raja/RAJAStream.hpp index e98b0778..a2565ccc 100644 --- a/src/raja/RAJAStream.hpp +++ b/src/raja/RAJAStream.hpp @@ -50,19 +50,18 @@ class RAJAStream : public Stream T* d_c; public: - - RAJAStream(const intptr_t, const int); + RAJAStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~RAJAStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays( - std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/std-data/STDDataStream.cpp b/src/std-data/STDDataStream.cpp index 3efeb1b3..8c280f8a 100644 --- a/src/std-data/STDDataStream.cpp +++ b/src/std-data/STDDataStream.cpp @@ -7,9 +7,10 @@ #include "STDDataStream.h" template -STDDataStream::STDDataStream(const intptr_t ARRAY_SIZE, int device) - noexcept : array_size{ARRAY_SIZE}, - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) +STDDataStream::STDDataStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + noexcept : array_size{array_size}, + a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -25,6 +26,7 @@ STDDataStream::STDDataStream(const intptr_t ARRAY_SIZE, int device) #endif std::cout << std::endl; #endif + init_arrays(initA, initB, initC); } template @@ -43,11 +45,11 @@ void STDDataStream::init_arrays(T initA, T initB, T initC) } template -void STDDataStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void STDDataStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/std-data/STDDataStream.h b/src/std-data/STDDataStream.h index d92864be..6db998b2 100644 --- a/src/std-data/STDDataStream.h +++ b/src/std-data/STDDataStream.h @@ -25,17 +25,18 @@ class STDDataStream : public Stream T *a, *b, *c; public: - STDDataStream(const intptr_t, int) noexcept; + STDDataStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) noexcept; ~STDDataStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/std-indices/STDIndicesStream.cpp b/src/std-indices/STDIndicesStream.cpp index 473d93d0..4f8efe20 100644 --- a/src/std-indices/STDIndicesStream.cpp +++ b/src/std-indices/STDIndicesStream.cpp @@ -11,9 +11,10 @@ #endif template -STDIndicesStream::STDIndicesStream(const intptr_t ARRAY_SIZE, int device) -noexcept : array_size{ARRAY_SIZE}, range(0, array_size), - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) +STDIndicesStream::STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) +noexcept : array_size{array_size}, range(0, array_size), + a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -29,6 +30,7 @@ noexcept : array_size{ARRAY_SIZE}, range(0, array_size), #endif std::cout << std::endl; #endif + init_arrays(initA, initB, initC); } template @@ -47,11 +49,11 @@ void STDIndicesStream::init_arrays(T initA, T initB, T initC) } template -void STDIndicesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void STDIndicesStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/std-indices/STDIndicesStream.h b/src/std-indices/STDIndicesStream.h index 8a8f5de8..7a43b1ec 100644 --- a/src/std-indices/STDIndicesStream.h +++ b/src/std-indices/STDIndicesStream.h @@ -80,17 +80,18 @@ class STDIndicesStream : public Stream T *a, *b, *c; public: - STDIndicesStream(const intptr_t, int) noexcept; + STDIndicesStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) noexcept; ~STDIndicesStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/std-ranges/STDRangesStream.cpp b/src/std-ranges/STDRangesStream.cpp index 8b7ada4b..02bd56b2 100644 --- a/src/std-ranges/STDRangesStream.cpp +++ b/src/std-ranges/STDRangesStream.cpp @@ -12,9 +12,10 @@ #endif template -STDRangesStream::STDRangesStream(const intptr_t ARRAY_SIZE, int device) -noexcept : array_size{ARRAY_SIZE}, - a(alloc_raw(ARRAY_SIZE)), b(alloc_raw(ARRAY_SIZE)), c(alloc_raw(ARRAY_SIZE)) +STDRangesStream::STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) + noexcept : array_size{array_size}, + a(alloc_raw(array_size)), b(alloc_raw(array_size)), c(alloc_raw(array_size)) { std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; #ifdef USE_ONEDPL @@ -30,6 +31,7 @@ noexcept : array_size{ARRAY_SIZE}, #endif std::cout << std::endl; #endif + init_arrays(initA, initB, initC); } template @@ -54,12 +56,11 @@ void STDRangesStream::init_arrays(T initA, T initB, T initC) } template -void STDRangesStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void STDRangesStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - // Element-wise copy. - std::copy(a, a + array_size, h_a.begin()); - std::copy(b, b + array_size, h_b.begin()); - std::copy(c, c + array_size, h_c.begin()); + h_a = a; + h_b = b; + h_c = c; } template diff --git a/src/std-ranges/STDRangesStream.hpp b/src/std-ranges/STDRangesStream.hpp index 51680c62..da04f1f4 100644 --- a/src/std-ranges/STDRangesStream.hpp +++ b/src/std-ranges/STDRangesStream.hpp @@ -24,18 +24,18 @@ class STDRangesStream : public Stream T *a, *b, *c; public: - STDRangesStream(const intptr_t, int) noexcept; + STDRangesStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC) noexcept; ~STDRangesStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/sycl/SYCLStream.cpp b/src/sycl/SYCLStream.cpp index e99454e6..5c00211e 100644 --- a/src/sycl/SYCLStream.cpp +++ b/src/sycl/SYCLStream.cpp @@ -17,13 +17,13 @@ std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) +SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { if (!cached) getDeviceList(); - array_size = ARRAY_SIZE; - if (device_index >= devices.size()) throw std::runtime_error("Invalid device index"); device dev = devices[device_index]; @@ -79,6 +79,8 @@ SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) d_b = new buffer(array_size); d_c = new buffer(array_size); d_sum = new buffer(dot_num_groups); + + init_arrays(initA, initB, initC); } template @@ -238,17 +240,14 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) } template -void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void SYCLStream::get_arrays(T const*& a, T const*& b, T const*& c) { auto _a = d_a->template get_access(); auto _b = d_b->template get_access(); auto _c = d_c->template get_access(); - for (int i = 0; i < array_size; i++) - { - a[i] = _a[i]; - b[i] = _b[i]; - c[i] = _c[i]; - } + a = &_a[0]; + b = &_b[0]; + c = &_c[0]; } void getDeviceList(void) diff --git a/src/sycl/SYCLStream.h b/src/sycl/SYCLStream.h index 1a40242d..94c3c4e9 100644 --- a/src/sycl/SYCLStream.h +++ b/src/sycl/SYCLStream.h @@ -54,19 +54,19 @@ class SYCLStream : public Stream public: - SYCLStream(const intptr_t, const int); + SYCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~SYCLStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; // Populate the devices list diff --git a/src/sycl/model.cmake b/src/sycl/model.cmake index 3826c3c7..8f45186d 100644 --- a/src/sycl/model.cmake +++ b/src/sycl/model.cmake @@ -9,36 +9,32 @@ register_flag_required(SYCL_COMPILER ONEAPI-ICPX - icpx as a standalone compiler ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)") + AdaptiveCpp - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. - HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + AdaptiveCpp|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") macro(setup) set(CMAKE_CXX_STANDARD 17) + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + set(adaptivecpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/adaptivecpp) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - - - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) - - if (NOT EXISTS "${hipSYCL_DIR}") - message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure") + set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake) endif () - if (NOT EXISTS "${hipSYCL_DIR}") - message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp") endif () # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) - find_package(hipSYCL CONFIG REQUIRED) + find_package(AdaptiveCpp CONFIG REQUIRED) message(STATUS "ok") - elseif (${SYCL_COMPILER} STREQUAL "DPCPP") set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) include_directories(${SYCL_COMPILER_DIR}/include/sycl) @@ -62,8 +58,8 @@ endmacro() macro(setup_target NAME) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - # so hipSYCL has this weird (and bad) CMake usage where they append their + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + # so AdaptiveCpp has this weird (and bad) CMake usage where they append their # own custom integration header flags AFTER the target has been specified # hence this macro here add_sycl_to_target( diff --git a/src/sycl2020-acc/SYCLStream2020.cpp b/src/sycl2020-acc/SYCLStream2020.cpp index 742be95b..d0f97e68 100644 --- a/src/sycl2020-acc/SYCLStream2020.cpp +++ b/src/sycl2020-acc/SYCLStream2020.cpp @@ -15,11 +15,12 @@ std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) -: array_size {ARRAY_SIZE}, - d_a {ARRAY_SIZE}, - d_b {ARRAY_SIZE}, - d_c {ARRAY_SIZE}, +SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size), + d_a {array_size}, + d_b {array_size}, + d_c {array_size}, d_sum {1} { if (!cached) @@ -68,7 +69,7 @@ SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) devices.clear(); cached = true; - + init_arrays(initA, initB, initC); } @@ -164,18 +165,17 @@ T SYCLStream::dot() sycl::accessor kb {d_b, cgh, sycl::read_only}; cgh.parallel_for(sycl::range<1>{array_size}, - // Reduction object, to perform summation - initialises the result to zero - // hipSYCL doesn't sypport the initialize_to_identity property yet -#if defined(__HIPSYCL__) || defined(__OPENSYCL__) + // Reduction object, to perform summation - initialises the result to zero + // AdaptiveCpp doesn't sypport the initialize_to_identity property yet +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) || defined(__ADAPTIVECPP__) sycl::reduction(d_sum. template get_access(cgh), sycl::plus()), #else - sycl::reduction(d_sum, cgh, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), -#endif + sycl::reduction(sum, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), +#endif [=](sycl::id<1> idx, auto& sum) { sum += ka[idx] * kb[idx]; }); - }); // Get access on the host, and return a copy of the data (single number) @@ -206,17 +206,14 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) } template -void SYCLStream::read_arrays(std::vector& a, std::vector& b, std::vector& c) +void SYCLStream::get_arrays(T const*& a, T const*& b, T const*& c) { sycl::host_accessor _a {d_a, sycl::read_only}; sycl::host_accessor _b {d_b, sycl::read_only}; sycl::host_accessor _c {d_c, sycl::read_only}; - for (int i = 0; i < array_size; i++) - { - a[i] = _a[i]; - b[i] = _b[i]; - c[i] = _c[i]; - } + a = &_a[0]; + b = &_b[0]; + c = &_c[0]; } void getDeviceList(void) diff --git a/src/sycl2020-acc/SYCLStream2020.h b/src/sycl2020-acc/SYCLStream2020.h index cd515f87..c0caae2e 100644 --- a/src/sycl2020-acc/SYCLStream2020.h +++ b/src/sycl2020-acc/SYCLStream2020.h @@ -35,19 +35,19 @@ class SYCLStream : public Stream public: - SYCLStream(const intptr_t, const int); + SYCLStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~SYCLStream() = default; - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; // Populate the devices list diff --git a/src/sycl2020-acc/model.cmake b/src/sycl2020-acc/model.cmake index 3826c3c7..c34051ea 100644 --- a/src/sycl2020-acc/model.cmake +++ b/src/sycl2020-acc/model.cmake @@ -9,34 +9,34 @@ register_flag_required(SYCL_COMPILER ONEAPI-ICPX - icpx as a standalone compiler ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)") + AdaptiveCpp - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. - HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + AdaptiveCpp|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") macro(setup) set(CMAKE_CXX_STANDARD 17) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) + set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/AdaptiveCpp) - if (NOT EXISTS "${hipSYCL_DIR}") - message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure") + set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake) endif () - if (NOT EXISTS "${hipSYCL_DIR}") - message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp") endif () # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) - find_package(hipSYCL CONFIG REQUIRED) + find_package(AdaptiveCpp CONFIG REQUIRED) message(STATUS "ok") elseif (${SYCL_COMPILER} STREQUAL "DPCPP") @@ -62,8 +62,8 @@ endmacro() macro(setup_target NAME) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - # so hipSYCL has this weird (and bad) CMake usage where they append their + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + # so AdaptiveCpp has this weird (and bad) CMake usage where they append their # own custom integration header flags AFTER the target has been specified # hence this macro here add_sycl_to_target( diff --git a/src/sycl2020-usm/SYCLStream2020.cpp b/src/sycl2020-usm/SYCLStream2020.cpp index e4c6ec27..c8b863ad 100644 --- a/src/sycl2020-usm/SYCLStream2020.cpp +++ b/src/sycl2020-usm/SYCLStream2020.cpp @@ -15,8 +15,9 @@ std::vector devices; void getDeviceList(void); template -SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) -: array_size {ARRAY_SIZE} +SYCLStream::SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC) + : array_size(array_size) { if (!cached) getDeviceList(); @@ -69,7 +70,7 @@ SYCLStream::SYCLStream(const intptr_t ARRAY_SIZE, const int device_index) devices.clear(); cached = true; - + init_arrays(initA, initB, initC); } template @@ -156,8 +157,8 @@ T SYCLStream::dot() { cgh.parallel_for(sycl::range<1>{array_size}, // Reduction object, to perform summation - initialises the result to zero - // hipSYCL doesn't sypport the initialize_to_identity property yet -#if defined(__HIPSYCL__) || defined(__OPENSYCL__) + // AdaptiveCpp doesn't sypport the initialize_to_identity property yet +#if defined(__HIPSYCL__) || defined(__OPENSYCL__) || defined(__ADAPTIVECPP__) sycl::reduction(sum, sycl::plus()), #else sycl::reduction(sum, sycl::plus(), sycl::property::reduction::initialize_to_identity{}), @@ -166,7 +167,6 @@ T SYCLStream::dot() { sum += a[idx] * b[idx]; }); - }); queue->wait(); return *sum; @@ -189,14 +189,11 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) } template -void SYCLStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void SYCLStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - for (int i = 0; i < array_size; i++) - { - h_a[i] = a[i]; - h_b[i] = b[i]; - h_c[i] = c[i]; - } + h_a = a; + h_b = b; + h_c = c; } void getDeviceList(void) diff --git a/src/sycl2020-usm/SYCLStream2020.h b/src/sycl2020-usm/SYCLStream2020.h index 811c26ef..c88c87a3 100644 --- a/src/sycl2020-usm/SYCLStream2020.h +++ b/src/sycl2020-usm/SYCLStream2020.h @@ -35,19 +35,19 @@ class SYCLStream : public Stream public: - SYCLStream(const intptr_t, const int); + SYCLStream(BenchId bs, const intptr_t array_size, const int device_index, + T initA, T initB, T initC); ~SYCLStream(); - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; // Populate the devices list diff --git a/src/sycl2020-usm/model.cmake b/src/sycl2020-usm/model.cmake index 950daefd..8db608ba 100644 --- a/src/sycl2020-usm/model.cmake +++ b/src/sycl2020-usm/model.cmake @@ -9,13 +9,13 @@ register_flag_required(SYCL_COMPILER ONEAPI-ICPX - icpx as a standalone compiler ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) - HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL)") + AdaptiveCpp - AdaptiveCpp compiler (https://github.com/adaptivecpp/adaptivecpp)") register_flag_optional(SYCL_COMPILER_DIR "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. - HIPSYCL|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" + AdaptiveCpp|DPCPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" "") @@ -23,21 +23,21 @@ macro(setup) set(CMAKE_CXX_STANDARD 17) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) + set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake/AdaptiveCpp) - if (NOT EXISTS "${hipSYCL_DIR}") - message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") - set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(WARNING "Falling back to AdaptiveCpp < 0.9.0 CMake structure") + set(AdaptiveCpp_DIR ${SYCL_COMPILER_DIR}/lib/cmake) endif () - if (NOT EXISTS "${hipSYCL_DIR}") - message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") + if (NOT EXISTS "${AdaptiveCpp_DIR}") + message(FATAL_ERROR "Can't find the appropriate CMake definitions for AdaptiveCpp") endif () # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) - find_package(hipSYCL CONFIG REQUIRED) + find_package(AdaptiveCpp CONFIG REQUIRED) message(STATUS "ok") elseif (${SYCL_COMPILER} STREQUAL "DPCPP") @@ -63,8 +63,8 @@ endmacro() macro(setup_target NAME) - if (${SYCL_COMPILER} STREQUAL "HIPSYCL") - # so hipSYCL has this weird (and bad) CMake usage where they append their + if (${SYCL_COMPILER} STREQUAL "AdaptiveCpp") + # so AdaptiveCpp has this weird (and bad) CMake usage where they append their # own custom integration header flags AFTER the target has been specified # hence this macro here add_sycl_to_target( diff --git a/src/tbb/TBBStream.cpp b/src/tbb/TBBStream.cpp index 75af6141..01508022 100644 --- a/src/tbb/TBBStream.cpp +++ b/src/tbb/TBBStream.cpp @@ -20,15 +20,16 @@ #endif template -TBBStream::TBBStream(const intptr_t ARRAY_SIZE, int device) - : partitioner(), range(0, (size_t)ARRAY_SIZE), +TBBStream::TBBStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) + : partitioner(), range(0, (size_t)array_size), #ifdef USE_VECTOR - a(ARRAY_SIZE), b(ARRAY_SIZE), c(ARRAY_SIZE) + a(array_size), b(array_size), c(array_size) #else - array_size(ARRAY_SIZE), - a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)), - c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * ARRAY_SIZE)) + array_size(array_size), + a((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)), + b((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)), + c((T *) aligned_alloc(ALIGNMENT, sizeof(T) * array_size)) #endif { if(device != 0){ @@ -36,6 +37,8 @@ TBBStream::TBBStream(const intptr_t ARRAY_SIZE, int device) } std::cout << "Using TBB partitioner: " PARTITIONER_NAME << std::endl; std::cout << "Backing storage typeid: " << typeid(a).name() << std::endl; + + init_arrays(initA, initB, initC); } @@ -54,12 +57,17 @@ void TBBStream::init_arrays(T initA, T initB, T initC) } template -void TBBStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) +void TBBStream::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c) { - // Element-wise copy. - std::copy(BEGIN(a), END(a), h_a.begin()); - std::copy(BEGIN(b), END(b), h_b.begin()); - std::copy(BEGIN(c), END(c), h_c.begin()); +#ifdef USE_VECTOR + h_a = a.data(); + h_b = b.data(); + h_c = c.data(); +#else + h_a = a; + h_b = b; + h_c = c; +#endif } template diff --git a/src/tbb/TBBStream.hpp b/src/tbb/TBBStream.hpp index 80f11c17..0a73e892 100644 --- a/src/tbb/TBBStream.hpp +++ b/src/tbb/TBBStream.hpp @@ -31,7 +31,6 @@ using tbb_partitioner = tbb::auto_partitioner; #define PARTITIONER_NAME "auto_partitioner" #endif - template class TBBStream : public Stream { @@ -48,17 +47,17 @@ class TBBStream : public Stream #endif public: - TBBStream(const intptr_t, int); + TBBStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~TBBStream() = default; - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; - diff --git a/src/thrust/ThrustStream.cu b/src/thrust/ThrustStream.cu index 84b27b8e..321470b8 100644 --- a/src/thrust/ThrustStream.cu +++ b/src/thrust/ThrustStream.cu @@ -19,7 +19,8 @@ static inline void synchronise() } template -ThrustStream::ThrustStream(const intptr_t array_size, int device) +ThrustStream::ThrustStream(BenchId bs, const intptr_t array_size, const int device, + T initA, T initB, T initC) : array_size{array_size}, a(array_size), b(array_size), c(array_size) { std::cout << "Using CUDA device: " << getDeviceName(device) << std::endl; std::cout << "Driver: " << getDeviceDriver(device) << std::endl; @@ -36,8 +37,6 @@ ThrustStream::ThrustStream(const intptr_t array_size, int device) std::cout << "Thrust backend: TBB" << std::endl; #elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CPP std::cout << "Thrust backend: CPP" << std::endl; -#elif THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_TBB - std::cout << "Thrust backend: TBB" << std::endl; #else #if defined(THRUST_DEVICE_SYSTEM_HIP) && THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_HIP @@ -48,6 +47,7 @@ ThrustStream::ThrustStream(const intptr_t array_size, int device) #endif + init_arrays(initA, initB, initC); } template @@ -60,11 +60,23 @@ void ThrustStream::init_arrays(T initA, T initB, T initC) } template -void ThrustStream::read_arrays(std::vector& h_a, std::vector& h_b, std::vector& h_c) -{ +void ThrustStream::get_arrays(T const*& a_, T const*& b_, T const*& c_) +{ + #if defined(MANAGED) + a_ = &*a.data(); + b_ = &*b.data(); + c_ = &*c.data(); + #else + h_a.resize(array_size); + h_b.resize(array_size); + h_c.resize(array_size); thrust::copy(a.begin(), a.end(), h_a.begin()); thrust::copy(b.begin(), b.end(), h_b.begin()); thrust::copy(c.begin(), c.end(), h_c.begin()); + a_ = h_a.data(); + b_ = h_b.data(); + c_ = h_c.data(); + #endif } template diff --git a/src/thrust/ThrustStream.h b/src/thrust/ThrustStream.h index b0acd80f..676ecaeb 100644 --- a/src/thrust/ThrustStream.h +++ b/src/thrust/ThrustStream.h @@ -26,28 +26,25 @@ class ThrustStream : public Stream intptr_t array_size; #if defined(MANAGED) - thrust::universtal_vector a; - thrust::universtal_vector b; - thrust::universtal_vector c; + thrust::universal_vector a, b, c; #else - thrust::device_vector a; - thrust::device_vector b; - thrust::device_vector c; + thrust::device_vector a, b, c; + std::vector h_a, h_b, h_c; #endif public: - ThrustStream(const intptr_t, int); + ThrustStream(BenchId bs, const intptr_t array_size, const int device_id, + T initA, T initB, T initC); ~ThrustStream() = default; - virtual void copy() override; - virtual void add() override; - virtual void mul() override; - virtual void triad() override; - virtual void nstream() override; - virtual T dot() override; - - virtual void init_arrays(T initA, T initB, T initC) override; - virtual void read_arrays(std::vector& a, std::vector& b, std::vector& c) override; + void copy() override; + void add() override; + void mul() override; + void triad() override; + void nstream() override; + T dot() override; + void get_arrays(T const*& a, T const*& b, T const*& c) override; + void init_arrays(T initA, T initB, T initC); }; diff --git a/src/thrust/model.cmake b/src/thrust/model.cmake index 6b82ef59..23627c11 100644 --- a/src/thrust/model.cmake +++ b/src/thrust/model.cmake @@ -18,8 +18,7 @@ register_flag_optional(BACKEND " "CUDA") - register_flag_optional(MANAGED "Enabled managed memory mode." - "OFF") +register_flag_optional(MANAGED "Enabled managed memory mode." "OFF") register_flag_optional(CMAKE_CUDA_COMPILER "[THRUST_IMPL==CUDA] Path to the CUDA nvcc compiler"