From fd200664b0d977aa83db3f1c6d8a74cf5e5730c4 Mon Sep 17 00:00:00 2001 From: eagiem Date: Tue, 8 Oct 2024 10:45:05 -0700 Subject: [PATCH] MiniMD error injection update, merge --- src/resilience/openMP/OpenMPResParallel.hpp | 2 +- src/resilience/openMP/OpenMPResSubscriber.hpp | 102 ++++++++++++++---- tests/TestMiniMD.cpp | 60 ++++++++++- tests/TestOpenMPResilientExecution.cpp | 28 +++-- 4 files changed, 161 insertions(+), 31 deletions(-) diff --git a/src/resilience/openMP/OpenMPResParallel.hpp b/src/resilience/openMP/OpenMPResParallel.hpp index 16d7aa0..a9a333c 100644 --- a/src/resilience/openMP/OpenMPResParallel.hpp +++ b/src/resilience/openMP/OpenMPResParallel.hpp @@ -194,7 +194,7 @@ class ParallelFor< FunctorType const auto start{std::chrono::steady_clock::now()}; - //KokkosResilience::inject_error_duplicates(); + KokkosResilience::inject_error_duplicates(); const auto stop{std::chrono::steady_clock::now()}; KokkosResilience::ETimer::elapsed_seconds = KokkosResilience::ETimer::elapsed_seconds + (std::chrono::duration_cast(stop - start)); KokkosResilience::ETimer::total_error_time = KokkosResilience::ETimer::total_error_time + KokkosResilience::ETimer::elapsed_seconds; diff --git a/src/resilience/openMP/OpenMPResSubscriber.hpp b/src/resilience/openMP/OpenMPResSubscriber.hpp index d43f5da..13c4050 100644 --- a/src/resilience/openMP/OpenMPResSubscriber.hpp +++ b/src/resilience/openMP/OpenMPResSubscriber.hpp @@ -87,6 +87,7 @@ explicit Error(double rate) : error_rate(rate), geometric(rate){} double error_rate; std::geometric_distribution<> geometric{error_rate}; + }; inline std::optional global_error_settings; @@ -284,7 +285,13 @@ struct CombineDuplicates: public CombineDuplicatesBase } size_t next_inject = ErrorInject::global_next_inject; +#if 0 + double temp = global_error_settings->error_rate; + std::cout << "Error::error_rate is " << temp << std::endl; + std::cout << "ErrorInject::error_counter is " << ErrorInject::error_counter << std::endl; + std::cout << "next_inject is " << next_inject << std::endl; +#endif for (int j=0; j<=2; j++){ while (next_inject < original.size()) { @@ -305,50 +312,103 @@ struct CombineDuplicates: public CombineDuplicatesBase } } - void MultiDimTMRInject(){ -#if 0 - size_t total_extent = 1; - for (int i = 0; i <= original.rank(); i++){ - total_extent = total_extent * original.extent(i); - } - - //Ranked on if sum of extents match global_next_inject - if ((total_extent != 1) && (ErrorInject::global_next_inject > total_extent)) + void TwoDimTMRInject(){ +//#if 0 + //This error injection works with 2D views, subtracts total extent from global_next_inject + size_t total_extent = original.extent(0) * original.extent(1); + if (total_extent !=1 && (ErrorInject::global_next_inject > total_extent)) { ErrorInject::global_next_inject = ErrorInject::global_next_inject - total_extent; } size_t next_inject = ErrorInject::global_next_inject; +#if 0 + double temp = global_error_settings->error_rate; + std::cout << "Error::error_rate is " << temp << std::endl; + + std::cout << "ErrorInject::error_counter is " << ErrorInject::error_counter << std::endl; + std::cout << "original.extent(0) is " << original.extent(0) << std::endl; + std::cout << "original.extent(1) is " << original.extent(1) << std::endl; + std::cout << "total_extent is " << total_extent << std::endl; + std::cout << "next_inject is " << next_inject << std::endl; +#endif +#if 0 + //Completely closed off print loop. DELETE! + for (int j=0; j<=2; j++){ + while (next_inject < total_extent) + { + std::cout << "The value at next_inject translates to array(" << floor(next_inject/original.extent(0)) << "," + << next_inject - (original.extent(0) * floor(next_inject/original.extent(0))) << ") = " + << static_cast(original((int)floor(next_inject/original.extent(0)),next_inject - (original.extent(0) * (int)floor(next_inject/original.extent(0))))) + << "." << std::endl; + + ErrorInject::error_counter++; + next_inject = global_error_settings->geometric(ErrorInject::random_gen)+next_inject+1; + std::cout << "next_inject is " << next_inject << std::endl; + } + if(total_extent != 1){ + next_inject = next_inject - total_extent; + } + } +#endif + +//#if 0 for (int j = 0; j<=2; j++){ while (next_inject < total_extent) { +#if 0 + std::cout << "The value at next_inject translates to array(" << floor(next_inject/original.extent(0)) << "," + << next_inject - (original.extent(0) * floor(next_inject/original.extent(0))) << ") = " + << static_cast(original((int)floor(next_inject/original.extent(0)),next_inject - (original.extent(0) * (int)floor(next_inject/original.extent(0))))) + << "." << std::endl; +#endif + next_inject = global_error_settings->geometric(ErrorInject::random_gen)+next_inject+1; + //std::cout << "next_inject is " << next_inject << std::endl; + + if (j==0){//Inject in the original if j is 0 - original(next_inject) = static_cast( 2 * original(next_inject) + 2 * ErrorInject::random_gen());//generate using () + original((int)floor(next_inject/original.extent(0)), next_inject % original.extent(0)) + = static_cast( + 2 * original((int)floor(next_inject/original.extent(0)), next_inject % original.extent(0)) + + 2 * ErrorInject::random_gen());//generate using () ErrorInject::error_counter++; } else{//Else inject in one of the other two copies, copy[0] or copy[1] - copy[j-1](next_inject) = static_cast( 2 * copy[j-1](next_inject) + 2 * ErrorInject::random_gen()); + copy[j-1]((int)floor(next_inject/original.extent(0)), next_inject % original.extent(0)) + = static_cast( + 2 * copy[j-1]((int)floor(next_inject/original.extent(0)), next_inject % original.extent(0)) + + 2 * ErrorInject::random_gen()); ErrorInject::error_counter++; } next_inject = global_error_settings->geometric(ErrorInject::random_gen)+next_inject+1; - + //std::cout << "next_inject is " << next_inject << std::endl; } - if(original.size() != 1){ - next_inject = (next_inject) - (original.size()); + if(total_extent != 1){ + next_inject = next_inject - total_extent; } } -#endif - return; +//#endif } void inject_error() override { - if constexpr(rank > 1){ - - //Worry about injecting errors here later (in extent(0) ) - MultiDimTMRInject(); - + + //std::cout << "We got into the error injector and rank is: " << rank << "\n"; + if constexpr(rank == 2){ +#ifdef KR_ENABLE_DMR + //Implies dmr_failover_to_tmr + if(duplicate_count == 2) { + //goto main tmr inject + TwoDimTMRInject(); + } + else{//Actual DMR error injection with only 1 copy + }//End DMR error injection +#else + //Working not perfect + //std::cout << "We got into the 2d error injector section\n"; + TwoDimTMRInject(); +#endif }else{ #ifdef KR_ENABLE_DMR diff --git a/tests/TestMiniMD.cpp b/tests/TestMiniMD.cpp index 6ccdd29..32810a6 100644 --- a/tests/TestMiniMD.cpp +++ b/tests/TestMiniMD.cpp @@ -350,7 +350,7 @@ TEST(TestResOpenMP, TestMiniMDKernel) Kokkos::View z( "z", N ); size_t rank = x.rank(); - std::cout << "The rank of View x is rank: " << rank << "\n"; + //std::cout << "The rank of View x is rank: " << rank << "\n"; Kokkos::Timer timer; @@ -382,6 +382,64 @@ TEST(TestResOpenMP, TestMiniMDKernel) } +//Test MiniMD Exact Kernel Behavior with Resilience +TEST(TestResOpenMP, TestMiniMDKernelResilient) +{ + std::cout << "\n\n"; + KokkosResilience::ErrorInject::error_counter = 0; + std::cout << "ErrorInject::error_counter is " << KokkosResilience::ErrorInject::error_counter << "\n"; + std::cout << "This is the test of minMD 2D Resilient Error Injection \n\n\n"; + KokkosResilience::global_error_settings = KokkosResilience::Error(0.005); + + // Allocate 2D y, x vectors. + Kokkos::View> x( "x", N ); + Kokkos::View> y( "y", N ); + Kokkos::View> z( "z", N ); + + size_t rank = x.rank(); + //std::cout << "The rank of View x is rank: " << rank << "\n"; + + Kokkos::Timer timer; + + //Initialize x vector RESIIENT kernel WITH ERRORS + Kokkos::parallel_for( range_policy (0, N), KOKKOS_LAMBDA ( const int i) { + x ( i,0 ) = 1; + }); + + int j = 0; + + while (j<5){ + //Test MiniMD Kernel Behavior with RESILIENT kernel, RESILEINT views WITH ERRORS (cont prev count) + Kokkos::parallel_for( range_policy (0, N), KOKKOS_LAMBDA ( const int i) { + y ( i, 0 ) += test_const * x ( i, 0 ); + z ( i, 0 ) += test_const * y ( i, 0 ); + }); + j++; + } + + std::cout << "Test values y(1,0) and z(1,0) are " << y(1,0) << " and " << z(1,0) << " respectively." << std::endl; + + for ( int i = 0; i < N; i++) { + ASSERT_EQ(y(i,0), 5*test_const ); + ASSERT_EQ(z(i,0), 15*test_const*test_const ); + } + + KokkosResilience::print_total_error_time(); + KokkosResilience::clear_duplicates_cache(); + KokkosResilience::ErrorInject::error_counter=0; + KokkosResilience::global_error_settings.reset(); + + std::cout << std::endl <