diff --git a/CMakeLists.txt b/CMakeLists.txt index 616896bc..8e599e7a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,7 @@ project(BabelStream VERSION 4.0 LANGUAGES CXX) # uncomment for debugging build issues: # set(CMAKE_VERBOSE_MAKEFILE ON) +option(WITH_MPI "Enable MPI" OFF) find_package(MPI) # some nicer defaults for standard C++ @@ -164,11 +165,16 @@ target_compile_options(${EXE_NAME} PUBLIC "$<$:${ACTUAL_DEBUG_FLAG target_link_options(${EXE_NAME} PUBLIC LINKER:${CXX_EXTRA_LINKER_FLAGS}) target_link_options(${EXE_NAME} PUBLIC ${LINK_FLAGS} ${CXX_EXTRA_LINK_FLAGS}) -# If MPI is available, link relevant libraries -if(MPI_FOUND) - message(STATUS "MPI Detected. Linking relevant MPI libraries") - target_link_libraries(${EXE_NAME} PUBLIC MPI::MPI_CXX) -endif() +# If MPI is requested, link relevant libraries +if (WITH_MPI) + if (MPI_FOUND) + message(STATUS "MPI requested and found. Linking relevant MPI libraries.") + target_link_libraries(${EXE_NAME} PUBLIC MPI::MPI_CXX) + add_definitions(-DUSE_MPI) + else () + message(FATAL_ERROR "MPI requested, but no implementation has been detected.") + endif () +endif () # some models require the target to be already specified so they can finish their setup here # this only happens if the model.cmake definition contains the `setup_target` macro diff --git a/src/hip/HIPStream.cpp b/src/hip/HIPStream.cpp index 6aed1ee1..4c636fd5 100644 --- a/src/hip/HIPStream.cpp +++ b/src/hip/HIPStream.cpp @@ -43,7 +43,8 @@ HIPStream::HIPStream(const int ARRAY_SIZE, const int device_index) check_error(); // Print out device information - std::cout << "Using HIP device " << getDeviceName(device_index) << std::endl; + std::cout << "Using HIP device " << getDeviceName(device_index) + << " with index: " << device_index << std::endl; std::cout << "Driver: " << getDeviceDriver(device_index) << std::endl; array_size = ARRAY_SIZE; diff --git a/src/main.cpp b/src/main.cpp index c9d76942..a67d9068 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -51,6 +51,12 @@ #include "OMPStream.h" #endif +#if USE_MPI +#include +// MPI parameters +int rank, procs; +#endif + // Default size of 2^25 int ARRAY_SIZE = 33554432; unsigned int num_times = 100; @@ -79,15 +85,41 @@ void parseArguments(int argc, char *argv[]); int main(int argc, char *argv[]) { +#if USE_MPI + int provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); + if (provided < MPI_THREAD_FUNNELED) { + MPI_Abort(MPI_COMM_WORLD, provided); + } + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &procs); + + // Each local rank on a given node will own a single device/GCD + MPI_Comm shmcomm; + MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, + MPI_INFO_NULL, &shmcomm); + int localRank; + MPI_Comm_rank(shmcomm, &localRank); + // Set device index to be the local MPI rank + deviceIndex = localRank; +#endif parseArguments(argc, argv); if (!output_as_csv) { - std::cout - << "BabelStream" << std::endl - << "Version: " << VERSION_STRING << std::endl - << "Implementation: " << IMPLEMENTATION_STRING << std::endl; +#if USE_MPI + if (rank == 0) { +#endif + std::cout + << "BabelStream" << std::endl + << "Version: " << VERSION_STRING << std::endl + << "Implementation: " << IMPLEMENTATION_STRING << std::endl; +#if USE_MPI + std::cout << "Number of MPI ranks: " << procs << std::endl; + } +#endif } if (use_float) @@ -95,6 +127,10 @@ int main(int argc, char *argv[]) else run(); +#if USE_MPI + MPI_Finalize(); +#endif + } @@ -112,33 +148,52 @@ std::vector> run_all(Stream *stream, T& sum) // Main loop for (unsigned int k = 0; k < num_times; k++) { +#if USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + // Execute Copy t1 = std::chrono::high_resolution_clock::now(); stream->copy(); +#if USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif t2 = std::chrono::high_resolution_clock::now(); timings[0].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Mul t1 = std::chrono::high_resolution_clock::now(); stream->mul(); +#if USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif t2 = std::chrono::high_resolution_clock::now(); timings[1].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Add t1 = std::chrono::high_resolution_clock::now(); stream->add(); +#if USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif t2 = std::chrono::high_resolution_clock::now(); timings[2].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Triad t1 = std::chrono::high_resolution_clock::now(); stream->triad(); +#if USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif t2 = std::chrono::high_resolution_clock::now(); timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Dot t1 = std::chrono::high_resolution_clock::now(); sum = stream->dot(); +#if USE_MPI + MPI_Allreduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); +#endif t2 = std::chrono::high_resolution_clock::now(); timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); @@ -162,6 +217,9 @@ std::vector> run_triad(Stream *stream) t1 = std::chrono::high_resolution_clock::now(); for (unsigned int k = 0; k < num_times; k++) { +#if USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif stream->triad(); } t2 = std::chrono::high_resolution_clock::now(); @@ -183,8 +241,14 @@ std::vector> run_nstream(Stream *stream) // Run nstream in loop for (int k = 0; k < num_times; k++) { +#if USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif t1 = std::chrono::high_resolution_clock::now(); stream->nstream(); +#if USE_MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif t2 = std::chrono::high_resolution_clock::now(); timings[0].push_back(std::chrono::duration_cast >(t2 - t1).count()); } @@ -201,43 +265,67 @@ void run() { std::streamsize ss = std::cout.precision(); - if (!output_as_csv) +#if USE_MPI + if (rank == 0) +#endif { - if (selection == Benchmark::All) - std::cout << "Running kernels " << num_times << " times" << std::endl; - else if (selection == Benchmark::Triad) + if (!output_as_csv) { - std::cout << "Running triad " << num_times << " times" << std::endl; - std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; - } - - - if (sizeof(T) == sizeof(float)) - std::cout << "Precision: float" << std::endl; - else - std::cout << "Precision: double" << std::endl; + if (selection == Benchmark::All) + std::cout << "Running kernels " << num_times << " times" << std::endl; + else if (selection == Benchmark::Triad) + { + std::cout << "Running triad " << num_times << " times" << std::endl; + std::cout << "Number of elements: " << ARRAY_SIZE << std::endl; + } + if (sizeof(T) == sizeof(float)) + std::cout << "Precision: float" << std::endl; + else + std::cout << "Precision: double" << std::endl; - if (mibibytes) - { - // MiB = 2^20 - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" - << " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; - } - else - { - // MB = 10^6 - std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" - << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" - << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + if (mibibytes) + { + // MiB = 2^20 + std::cout << std::setprecision(1) << std::fixed +#if USE_MPI + << "Array size (per rank): " +#else + << "Array size: " +#endif + << ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" + << " (=" << ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; + std::cout << +#if USE_MPI + "Total size (per rank): " +#else + "Total size: " +#endif + << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -20.0) << " MiB" + << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*pow(2.0, -30.0) << " GiB)" << std::endl; + } + else + { + // MB = 10^6 + std::cout << std::setprecision(1) << std::fixed +#if USE_MPI + << "Array size (per rank): " +#else + << "Array size: " +#endif + << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" + << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + std::cout << +#if USE_MPI + "Total size (per rank): " +#else + "Total size: " +#endif + << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" + << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; + } + std::cout.precision(ss); } - std::cout.precision(ss); - } Stream *stream; @@ -328,34 +416,42 @@ void run() stream->read_arrays(a, b, c); +#if USE_MPI + // Only check solutions on rank 0 in case verificaiton fails + if (rank == 0) +#endif check_solution(num_times, a, b, c, sum); // Display timing results - if (output_as_csv) - { - std::cout - << "function" << csv_separator - << "num_times" << csv_separator - << "n_elements" << csv_separator - << "sizeof" << csv_separator - << ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator - << "min_runtime" << csv_separator - << "max_runtime" << csv_separator - << "avg_runtime" << std::endl; - } - else +#if USE_MPI + if (rank == 0) +#endif { - std::cout - << std::left << std::setw(12) << "Function" - << std::left << std::setw(12) << ((mibibytes) ? "MiBytes/sec" : "MBytes/sec") - << std::left << std::setw(12) << "Min (sec)" - << std::left << std::setw(12) << "Max" - << std::left << std::setw(12) << "Average" - << std::endl - << std::fixed; + if (output_as_csv) + { + std::cout + << "function" << csv_separator + << "num_times" << csv_separator + << "n_elements" << csv_separator + << "sizeof" << csv_separator + << ((mibibytes) ? "max_mibytes_per_sec" : "max_mbytes_per_sec") << csv_separator + << "min_runtime" << csv_separator + << "max_runtime" << csv_separator + << "avg_runtime" << std::endl; + } + else + { + std::cout + << std::left << std::setw(12) << "Function" + << std::left << std::setw(12) << ((mibibytes) ? "MiBytes/sec" : "MBytes/sec") + << std::left << std::setw(12) << "Min (sec)" + << std::left << std::setw(12) << "Max" + << std::left << std::setw(12) << "Average" + << std::endl + << std::fixed; + } } - if (selection == Benchmark::All || selection == Benchmark::Nstream) { @@ -385,30 +481,53 @@ void run() // Calculate average; ignore the first result double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1); - // Display results - if (output_as_csv) + double min = *minmax.first; + double max = *minmax.second; + +#if USE_MPI + // Collate timings + if (rank == 0) { - std::cout - << labels[i] << csv_separator - << num_times << csv_separator - << ARRAY_SIZE << csv_separator - << sizeof(T) << csv_separator - << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) << csv_separator - << *minmax.first << csv_separator - << *minmax.second << csv_separator - << average - << std::endl; + MPI_Reduce(MPI_IN_PLACE, &min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + MPI_Reduce(MPI_IN_PLACE, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); } else { - std::cout - << std::left << std::setw(12) << labels[i] - << std::left << std::setw(12) << std::setprecision(3) << - ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (*minmax.first) - << std::left << std::setw(12) << std::setprecision(5) << *minmax.first - << std::left << std::setw(12) << std::setprecision(5) << *minmax.second - << std::left << std::setw(12) << std::setprecision(5) << average - << std::endl; + MPI_Reduce(&min, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + MPI_Reduce(&max, NULL, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + } + sizes[i] *= procs; +#endif + + // Display results +#if USE_MPI + if (rank == 0) +#endif + { + if (output_as_csv) + { + std::cout + << labels[i] << csv_separator + << num_times << csv_separator + << ARRAY_SIZE << csv_separator + << sizeof(T) << csv_separator + << ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (min) << csv_separator + << min << csv_separator + << max << csv_separator + << average + << std::endl; + } + else + { + std::cout + << std::left << std::setw(12) << labels[i] + << std::left << std::setw(12) << std::setprecision(3) << + ((mibibytes) ? pow(2.0, -20.0) : 1.0E-6) * sizes[i] / (min) + << std::left << std::setw(12) << std::setprecision(5) << min + << std::left << std::setw(12) << std::setprecision(5) << max + << std::left << std::setw(12) << std::setprecision(5) << average + << std::endl; + } } } } else if (selection == Benchmark::Triad) @@ -417,35 +536,40 @@ void run() double total_bytes = 3 * sizeof(T) * ARRAY_SIZE * num_times; double bandwidth = ((mibibytes) ? pow(2.0, -30.0) : 1.0E-9) * (total_bytes / timings[0][0]); - if (output_as_csv) - { - std::cout - << "function" << csv_separator - << "num_times" << csv_separator - << "n_elements" << csv_separator - << "sizeof" << csv_separator - << ((mibibytes) ? "gibytes_per_sec" : "gbytes_per_sec") << csv_separator - << "runtime" - << std::endl; - std::cout - << "Triad" << csv_separator - << num_times << csv_separator - << ARRAY_SIZE << csv_separator - << sizeof(T) << csv_separator - << bandwidth << csv_separator - << timings[0][0] - << std::endl; - } - else +#if USE_MPI + if (rank == 0) +#endif { - std::cout - << "--------------------------------" - << std::endl << std::fixed - << "Runtime (seconds): " << std::left << std::setprecision(5) - << timings[0][0] << std::endl - << "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "): " - << std::left << std::setprecision(3) - << bandwidth << std::endl; + if (output_as_csv) + { + std::cout + << "function" << csv_separator + << "num_times" << csv_separator + << "n_elements" << csv_separator + << "sizeof" << csv_separator + << ((mibibytes) ? "gibytes_per_sec" : "gbytes_per_sec") << csv_separator + << "runtime" + << std::endl; + std::cout + << "Triad" << csv_separator + << num_times << csv_separator + << ARRAY_SIZE << csv_separator + << sizeof(T) << csv_separator + << bandwidth << csv_separator + << timings[0][0] + << std::endl; + } + else + { + std::cout + << "--------------------------------" + << std::endl << std::fixed + << "Runtime (seconds): " << std::left << std::setprecision(5) + << timings[0][0] << std::endl + << "Bandwidth (" << ((mibibytes) ? "GiB/s" : "GB/s") << "): " + << std::left << std::setprecision(3) + << bandwidth << std::endl; + } } } @@ -486,6 +610,10 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector // Do the reduction goldSum = goldA * goldB * ARRAY_SIZE; +#if USE_MPI + goldSum *= (T)procs; +#endif + // Calculate the average error long double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); errA /= a.size();