Skip to content

Commit

Permalink
MiniMD error injection update, merge
Browse files Browse the repository at this point in the history
  • Loading branch information
ElisabethGiem committed Oct 8, 2024
1 parent 173790f commit fd20066
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 31 deletions.
2 changes: 1 addition & 1 deletion src/resilience/openMP/OpenMPResParallel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ class ParallelFor< FunctorType


const auto start{std::chrono::steady_clock::now()};
//KokkosResilience::inject_error_duplicates();
KokkosResilience::inject_error_duplicates();
const auto stop{std::chrono::steady_clock::now()};
KokkosResilience::ETimer::elapsed_seconds = KokkosResilience::ETimer::elapsed_seconds + (std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start));
KokkosResilience::ETimer::total_error_time = KokkosResilience::ETimer::total_error_time + KokkosResilience::ETimer::elapsed_seconds;
Expand Down
102 changes: 81 additions & 21 deletions src/resilience/openMP/OpenMPResSubscriber.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ explicit Error(double rate) : error_rate(rate), geometric(rate){}

double error_rate;
std::geometric_distribution<> geometric{error_rate};

};

inline std::optional<Error> global_error_settings;
Expand Down Expand Up @@ -284,7 +285,13 @@ struct CombineDuplicates: public CombineDuplicatesBase
}

size_t next_inject = ErrorInject::global_next_inject;
#if 0
double temp = global_error_settings->error_rate;
std::cout << "Error::error_rate is " << temp << std::endl;

std::cout << "ErrorInject::error_counter is " << ErrorInject::error_counter << std::endl;
std::cout << "next_inject is " << next_inject << std::endl;
#endif
for (int j=0; j<=2; j++){
while (next_inject < original.size())
{
Expand All @@ -305,50 +312,103 @@ struct CombineDuplicates: public CombineDuplicatesBase
}
}

void MultiDimTMRInject(){
#if 0
size_t total_extent = 1;
for (int i = 0; i <= original.rank(); i++){
total_extent = total_extent * original.extent(i);
}

//Ranked on if sum of extents match global_next_inject
if ((total_extent != 1) && (ErrorInject::global_next_inject > total_extent))
void TwoDimTMRInject(){
//#if 0
//This error injection works with 2D views, subtracts total extent from global_next_inject
size_t total_extent = original.extent(0) * original.extent(1);
if (total_extent !=1 && (ErrorInject::global_next_inject > total_extent))
{
ErrorInject::global_next_inject = ErrorInject::global_next_inject - total_extent;
}

size_t next_inject = ErrorInject::global_next_inject;
#if 0
double temp = global_error_settings->error_rate;
std::cout << "Error::error_rate is " << temp << std::endl;

std::cout << "ErrorInject::error_counter is " << ErrorInject::error_counter << std::endl;
std::cout << "original.extent(0) is " << original.extent(0) << std::endl;
std::cout << "original.extent(1) is " << original.extent(1) << std::endl;
std::cout << "total_extent is " << total_extent << std::endl;
std::cout << "next_inject is " << next_inject << std::endl;
#endif
#if 0
//Completely closed off print loop. DELETE!
for (int j=0; j<=2; j++){
while (next_inject < total_extent)
{
std::cout << "The value at next_inject translates to array(" << floor(next_inject/original.extent(0)) << ","
<< next_inject - (original.extent(0) * floor(next_inject/original.extent(0))) << ") = "
<< static_cast<typename View::value_type>(original((int)floor(next_inject/original.extent(0)),next_inject - (original.extent(0) * (int)floor(next_inject/original.extent(0)))))
<< "." << std::endl;

ErrorInject::error_counter++;
next_inject = global_error_settings->geometric(ErrorInject::random_gen)+next_inject+1;
std::cout << "next_inject is " << next_inject << std::endl;
}
if(total_extent != 1){
next_inject = next_inject - total_extent;
}
}
#endif


//#if 0
for (int j = 0; j<=2; j++){
while (next_inject < total_extent)
{
#if 0
std::cout << "The value at next_inject translates to array(" << floor(next_inject/original.extent(0)) << ","
<< next_inject - (original.extent(0) * floor(next_inject/original.extent(0))) << ") = "
<< static_cast<typename View::value_type>(original((int)floor(next_inject/original.extent(0)),next_inject - (original.extent(0) * (int)floor(next_inject/original.extent(0)))))
<< "." << std::endl;
#endif
next_inject = global_error_settings->geometric(ErrorInject::random_gen)+next_inject+1;
//std::cout << "next_inject is " << next_inject << std::endl;


if (j==0){//Inject in the original if j is 0
original(next_inject) = static_cast<typename View::value_type>( 2 * original(next_inject) + 2 * ErrorInject::random_gen());//generate using ()
original((int)floor(next_inject/original.extent(0)), next_inject % original.extent(0))
= static_cast<typename View::value_type>(
2 * original((int)floor(next_inject/original.extent(0)), next_inject % original.extent(0))
+ 2 * ErrorInject::random_gen());//generate using ()
ErrorInject::error_counter++;
}
else{//Else inject in one of the other two copies, copy[0] or copy[1]
copy[j-1](next_inject) = static_cast<typename View::value_type>( 2 * copy[j-1](next_inject) + 2 * ErrorInject::random_gen());
copy[j-1]((int)floor(next_inject/original.extent(0)), next_inject % original.extent(0))
= static_cast<typename View::value_type>(
2 * copy[j-1]((int)floor(next_inject/original.extent(0)), next_inject % original.extent(0))
+ 2 * ErrorInject::random_gen());
ErrorInject::error_counter++;
}
next_inject = global_error_settings->geometric(ErrorInject::random_gen)+next_inject+1;

//std::cout << "next_inject is " << next_inject << std::endl;
}
if(original.size() != 1){
next_inject = (next_inject) - (original.size());
if(total_extent != 1){
next_inject = next_inject - total_extent;
}
}
#endif
return;
//#endif
}

void inject_error() override
{
if constexpr(rank > 1){

//Worry about injecting errors here later (in extent(0) )
MultiDimTMRInject();


//std::cout << "We got into the error injector and rank is: " << rank << "\n";
if constexpr(rank == 2){
#ifdef KR_ENABLE_DMR
//Implies dmr_failover_to_tmr
if(duplicate_count == 2) {
//goto main tmr inject
TwoDimTMRInject();
}
else{//Actual DMR error injection with only 1 copy
}//End DMR error injection
#else
//Working not perfect
//std::cout << "We got into the 2d error injector section\n";
TwoDimTMRInject();
#endif
}else{

#ifdef KR_ENABLE_DMR
Expand Down
60 changes: 59 additions & 1 deletion tests/TestMiniMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ TEST(TestResOpenMP, TestMiniMDKernel)
Kokkos::View<double*[2],Kokkos::LayoutRight, Kokkos::HostSpace > z( "z", N );

size_t rank = x.rank();
std::cout << "The rank of View x is rank: " << rank << "\n";
//std::cout << "The rank of View x is rank: " << rank << "\n";

Kokkos::Timer timer;

Expand Down Expand Up @@ -382,6 +382,64 @@ TEST(TestResOpenMP, TestMiniMDKernel)

}

// Test MiniMD exact kernel behavior with resilient views while errors are
// being injected.
//
// x(i,0) is set to 1, then five passes accumulate
//   y(i,0) += test_const * x(i,0)
//   z(i,0) += test_const * y(i,0)
// so every element must end at y == 5*test_const and
// z == (1+2+3+4+5)*test_const*test_const == 15*test_const*test_const.
TEST(TestResOpenMP, TestMiniMDKernelResilient)
{
  std::cout << "\n\n";
  KokkosResilience::ErrorInject::error_counter = 0;
  std::cout << "ErrorInject::error_counter is " << KokkosResilience::ErrorInject::error_counter << "\n";
  std::cout << "This is the test of minMD 2D Resilient Error Injection \n\n\n";
  KokkosResilience::global_error_settings = KokkosResilience::Error(0.005);

  // All three vectors share the same resilient (duplicate-subscribed) 2D view type.
  using ResilientView2D =
      Kokkos::View<double*[2], Kokkos::LayoutRight, MemSpace,
                   Kokkos::Experimental::SubscribableViewHooks<
                       KokkosResilience::ResilientDuplicatesSubscriber >>;

  // Allocate 2D x, y, z vectors.
  ResilientView2D x( "x", N );
  ResilientView2D y( "y", N );
  ResilientView2D z( "z", N );

  // ASSERT_* returns from the test body on the first failure. Without this
  // guard the global error-injection state set above would stay active and
  // leak into every test that runs afterwards. The guard is declared after
  // the views so it is destroyed before them, i.e. the cleanup still happens
  // while x, y and z are alive, as in the straight-line version.
  struct InjectionStateGuard {
    ~InjectionStateGuard() {
      KokkosResilience::clear_duplicates_cache();
      KokkosResilience::ErrorInject::error_counter = 0;
      KokkosResilience::global_error_settings.reset();
    }
  } const restore_injection_state;

  // Initialize x vector: RESILIENT kernel WITH ERRORS
  Kokkos::parallel_for( range_policy (0, N), KOKKOS_LAMBDA ( const int i) {
    x ( i,0 ) = 1;
  });

  // Test MiniMD kernel behavior with RESILIENT kernel, RESILIENT views WITH ERRORS
  // (the error counter continues from the initialization kernel above).
  for (int pass = 0; pass < 5; ++pass) {
    Kokkos::parallel_for( range_policy (0, N), KOKKOS_LAMBDA ( const int i) {
      y ( i, 0 ) += test_const * x ( i, 0 );
      z ( i, 0 ) += test_const * y ( i, 0 );
    });
  }

  std::cout << "Test values y(1,0) and z(1,0) are " << y(1,0) << " and " << z(1,0) << " respectively." << std::endl;

  for ( int i = 0; i < N; i++) {
    ASSERT_EQ(y(i,0), 5*test_const );
    ASSERT_EQ(z(i,0), 15*test_const*test_const );
  }

  KokkosResilience::print_total_error_time();

  std::cout << std::endl <<std::endl;

  // restore_injection_state's destructor clears the duplicates cache and
  // resets the error counter and global error settings here.
}


//Test RandomAccess
TEST(TestResOpenMP, TestRandomAccess)
{
Expand Down
28 changes: 20 additions & 8 deletions tests/TestOpenMPResilientExecution.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ TEST(TestResOpenMP, TestKokkosFor)
// Expect counter to count iterations.
TEST(TestResOpenMP, TestResilientForDouble)
{

KokkosResilience::global_error_settings = KokkosResilience::Error(0.001);

// Allocate y, x vectors.
Expand Down Expand Up @@ -139,6 +140,7 @@ TEST(TestResOpenMP, TestResilientForDouble)
ASSERT_EQ(counter(0), N);

//reset global error settings
KokkosResilience::ErrorInject::error_counter=0;
KokkosResilience::global_error_settings.reset();

}
Expand Down Expand Up @@ -286,7 +288,6 @@ TEST(TestResOpenMP, TestKokkos2D)
Kokkos::atomic_increment(&counter(0));
}
});

// std::cout << "Check 3: The error was after parallel_for." << std::endl;

Kokkos::deep_copy(x,y);
Expand All @@ -309,24 +310,32 @@ TEST(TestResOpenMP, TestKokkos2D)
// Expect counter to count accesses to each vector element.
TEST(TestResOpenMP, TestResilient2D)
{
// Allocate y, x vectors.

KokkosResilience::ErrorInject::error_counter = 0;
std::cout << "ErrorInject::error_counter is " << KokkosResilience::ErrorInject::error_counter << "\n";
std::cout << "\n\n\n\n\nThis is the test of 2D Resilient Error Injection \n\n\n";
KokkosResilience::global_error_settings = KokkosResilience::Error(0.00001);

// Allocate y, x vectors.
ViewVectorDoubleSubscriber2D y( "y", N, N );
ViewVectorDoubleSubscriber2D x( "x", N, N );

//Integer vector 1 long to count data accesses, because scalar view bugs
ViewVectorIntSubscriber counter( "DataAccesses", 1);
//ViewVectorIntSubscriber counter( "DataAccesses", 1);

Kokkos::Timer timer;
counter(0) = 0;
//Kokkos::Timer timer;
//counter(0) = 0;

std::cout << "Rank of x is " << x.rank() << std::endl;

//Initialize y vector on host using parallel_for, increment a counter for data accesses.
Kokkos::parallel_for( range_policy (0, N), KOKKOS_LAMBDA ( const int i) {
for (int j = 0; j < N; j++){
y ( i,j ) = i+j;
Kokkos::atomic_increment(&counter(0));
//Kokkos::atomic_increment(&counter(0));
}
});

KokkosResilience::print_total_error_time();
Kokkos::deep_copy(x, y);
KokkosResilience::clear_duplicates_cache();

Expand All @@ -335,7 +344,10 @@ TEST(TestResOpenMP, TestResilient2D)
ASSERT_EQ(x(i,j), i+j);
}
}
ASSERT_EQ(counter(0), N*N);
//ASSERT_EQ(counter(0), N*N);
KokkosResilience::ErrorInject::error_counter=0;
KokkosResilience::global_error_settings.reset();

}

/**********************************
Expand Down

0 comments on commit fd20066

Please sign in to comment.