From e54152b8a303e102395868fbdff8b88b0e447200 Mon Sep 17 00:00:00 2001
From: Hidayat Khan
Date: Tue, 1 Sep 2020 07:04:33 -0400
Subject: [PATCH] sse4.2: added the implementation for mm_cmpestra

---
Reviewer note: the added lines of the simde/x86/sse4.2.h hunk were revised
in review; the post-image blob id on its index line was not regenerated.

 simde/x86/sse4.2.h | 145 +++++++++++++++++++++++++++++++++++++++++++++
 test/x86/sse4.2.c  |  35 ++++++++++++
 2 files changed, 180 insertions(+)

diff --git a/simde/x86/sse4.2.h b/simde/x86/sse4.2.h
index 19905b2ac..99217d57c 100644
--- a/simde/x86/sse4.2.h
+++ b/simde/x86/sse4.2.h
@@ -93,6 +93,151 @@ SIMDE_BEGIN_DECLS_
   #define _SIDD_UNIT_MASK SIMDE_SIDD_UNIT_MASK
 #endif
 
+/* Converts an explicit string length into a count of valid elements:
+ * the absolute value of len, saturated to count. */
+SIMDE_FUNCTION_ATTRIBUTES
+int
+simde_x_mm_cmpestr_length_(int len, const int count) {
+  if (len < 0) {
+    /* Compare before negating so that INT_MIN cannot overflow. */
+    len = (len < -count) ? count : -len;
+  }
+  return (len > count) ? count : len;
+}
+
+/* Shared emulation of _mm_cmpestra for both element widths.
+ *
+ * a_elems and b_elems hold the count elements of each operand, already
+ * sign- or zero-extended by the caller, so the comparisons below are
+ * plain integer comparisons. la and lb are the explicit lengths as
+ * passed to the intrinsic. Only the aggregation (imm8 & 0x0c) and
+ * polarity (imm8 & 0x30) fields of imm8 are read here.
+ *
+ * Returns 1 when the post-polarity result is zero and every element of
+ * b is valid, otherwise 0. */
+SIMDE_FUNCTION_ATTRIBUTES
+int
+simde_x_mm_cmpestra_impl_(const int32_t a_elems[], int la, const int32_t b_elems[], int lb, const int count, const int imm8) {
+  const int cmp_op = imm8 & 0x0c;
+  const int polarity = imm8 & 0x30;
+  const int a_len = simde_x_mm_cmpestr_length_(la, count);
+  const int b_len = simde_x_mm_cmpestr_length_(lb, count);
+  /* Bit i is set for each element i of b that lies inside the string. */
+  const uint32_t b_valid_mask = (UINT32_C(1) << b_len) - 1;
+  const uint32_t all_mask = (UINT32_C(1) << count) - 1;
+  uint32_t int_res_1 = 0;
+  uint32_t int_res_2;
+
+  /* Bit i of int_res_1 is the aggregated result for element i of b. */
+  for (int i = 0 ; i < count ; i++) {
+    const int b_valid = (i < b_len);
+    int hit = 0;
+
+    switch (cmp_op) {
+      case SIMDE_SIDD_CMP_EQUAL_ANY:
+        /* b[i] matches any valid element of a. */
+        for (int j = 0 ; b_valid && !hit && (j < a_len) ; j++) {
+          hit = (a_elems[j] == b_elems[i]);
+        }
+        break;
+      case SIMDE_SIDD_CMP_RANGES:
+        /* a holds (low, high) pairs; a pair is used only if both ends are valid. */
+        for (int j = 0 ; b_valid && !hit && ((j + 1) < a_len) ; j += 2) {
+          hit = (a_elems[j] <= b_elems[i]) && (b_elems[i] <= a_elems[j + 1]);
+        }
+        break;
+      case SIMDE_SIDD_CMP_EQUAL_EACH:
+        /* Element-wise equality: two invalid elements compare equal,
+         * a valid and an invalid element do not. */
+        if (b_valid && (i < a_len)) {
+          hit = (a_elems[i] == b_elems[i]);
+        } else {
+          hit = !b_valid && (i >= a_len);
+        }
+        break;
+      case SIMDE_SIDD_CMP_EQUAL_ORDERED:
+        /* Substring search: a must match b starting at element i.
+         * Elements past the end of a are forced true, so only j < a_len
+         * is examined; a valid a[j] against an invalid b[i + j] is
+         * forced false. */
+        hit = 1;
+        for (int j = 0 ; hit && (j < a_len) && (j < (count - i)) ; j++) {
+          hit = ((i + j) < b_len) && (a_elems[j] == b_elems[i + j]);
+        }
+        break;
+    }
+
+    if (hit) {
+      int_res_1 |= (UINT32_C(1) << i);
+    }
+  }
+
+  /* Plain negation flips every element; masked negation flips only the
+   * elements of b that are valid. */
+  int_res_2 = int_res_1;
+  if (polarity & SIMDE_SIDD_NEGATIVE_POLARITY) {
+    int_res_2 ^= (polarity & SIMDE_SIDD_MASKED_POSITIVE_POLARITY) ? b_valid_mask : all_mask;
+  }
+
+  return (int_res_2 == 0) && (b_len == count);
+}
+
+/* Byte variant of simde_mm_cmpestra: widens the 16 lanes of a and b and
+ * defers to simde_x_mm_cmpestra_impl_. */
+SIMDE_FUNCTION_ATTRIBUTES
+int
+simde_mm_cmpestra_8_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
+    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
+  const simde__m128i_private
+    a_ = simde__m128i_to_private(a),
+    b_ = simde__m128i_to_private(b);
+  /* imm8 bit 1 selects signed elements; otherwise lanes are zero-extended. */
+  const int32_t lane_mask = (imm8 & 0x02) ? -1 : 0xff;
+  int32_t a_elems[128 / 8];
+  int32_t b_elems[128 / 8];
+
+  for (int i = 0 ; i < (128 / 8) ; i++) {
+    a_elems[i] = a_.i8[i] & lane_mask;
+    b_elems[i] = b_.i8[i] & lane_mask;
+  }
+
+  return simde_x_mm_cmpestra_impl_(a_elems, la, b_elems, lb, 128 / 8, imm8);
+}
+
+/* Word variant of simde_mm_cmpestra: widens the 8 lanes of a and b and
+ * defers to simde_x_mm_cmpestra_impl_. */
+SIMDE_FUNCTION_ATTRIBUTES
+int
+simde_mm_cmpestra_16_(simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
+    SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
+  const simde__m128i_private
+    a_ = simde__m128i_to_private(a),
+    b_ = simde__m128i_to_private(b);
+  /* imm8 bit 1 selects signed elements; otherwise lanes are zero-extended. */
+  const int32_t lane_mask = (imm8 & 0x02) ? -1 : 0xffff;
+  int32_t a_elems[128 / 16];
+  int32_t b_elems[128 / 16];
+
+  for (int i = 0 ; i < (128 / 16) ; i++) {
+    a_elems[i] = a_.i16[i] & lane_mask;
+    b_elems[i] = b_.i16[i] & lane_mask;
+  }
+
+  return simde_x_mm_cmpestra_impl_(a_elems, la, b_elems, lb, 128 / 16, imm8);
+}
+
+#if defined(SIMDE_X86_SSE4_2_NATIVE)
+  #define simde_mm_cmpestra(a, la, b, lb, imm8) _mm_cmpestra(a, la, b, lb, imm8)
+#else
+  #define simde_mm_cmpestra(a, la, b, lb, imm8) \
+     (((imm8) & SIMDE_SIDD_UWORD_OPS) \
+       ? simde_mm_cmpestra_16_((a), (la), (b), (lb), (imm8)) \
+       : simde_mm_cmpestra_8_((a), (la), (b), (lb), (imm8)))
+#endif
+#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES)
+  #define _mm_cmpestra(a, la, b, lb, imm8) simde_mm_cmpestra(a, la, b, lb, imm8)
+#endif
+
 SIMDE_FUNCTION_ATTRIBUTES
 int simde_mm_cmpestrs (simde__m128i a, int la, simde__m128i b, int lb, const int imm8)
     SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) {
diff --git a/test/x86/sse4.2.c b/test/x86/sse4.2.c
index 874cb2092..24e096df0 100644
--- a/test/x86/sse4.2.c
+++ b/test/x86/sse4.2.c
@@ -25,6 +25,40 @@
 #include
 #include
 
+static MunitResult
+test_simde_mm_cmpestra_ranges_8(const MunitParameter params[], void* data) {
+  (void) params;
+  (void) data;
+
+  const struct {
+    simde__m128i a;
+    int la;
+    simde__m128i b;
+    int lb;
+    int r;
+  } test_vec[] = {
+    { simde_mm_set_epi8(INT8_C( 45), INT8_C( -94), INT8_C( 38), INT8_C( -11),
+                        INT8_C( 84), INT8_C(-123), INT8_C( -43), INT8_C( -49),
+                        INT8_C( 25), INT8_C( -55), INT8_C(-121), INT8_C( -6),
+                        INT8_C( 57), INT8_C( 108), INT8_C( -55), INT8_C( 69)),
+      23 ,
+      simde_mm_set_epi8(INT8_C( -26), INT8_C( -61), INT8_C( -21), INT8_C( -96),
+                        INT8_C( 48), INT8_C(-112), INT8_C( 95), INT8_C( -56),
+                        INT8_C( 29), INT8_C( -55), INT8_C(-121), INT8_C( -6),
+                        INT8_C( 57), INT8_C( 108), INT8_C( -55), INT8_C( 69)),
+      28 ,
+      0 }
+  };
+
+  for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])); i++) {
+    int r;
+    r = simde_mm_cmpestra(test_vec[i].a, test_vec[i].la, test_vec[i].b, test_vec[i].lb, 36);
+    munit_assert_int(r, ==, test_vec[i].r);
+  }
+
+  return MUNIT_OK;
+}
+
 static int
 test_simde_mm_cmpestrs_8(SIMDE_MUNIT_TEST_ARGS) {
   const struct {
@@ -1064,6 +1098,7 @@ test_simde_mm_crc32_u64 (SIMDE_MUNIT_TEST_ARGS) {
 }
 
 SIMDE_TEST_FUNC_LIST_BEGIN
+  SIMDE_TEST_FUNC_LIST_ENTRY(mm_cmpestra_ranges_8)
   SIMDE_TEST_FUNC_LIST_ENTRY(mm_cmpestrs_8)
   SIMDE_TEST_FUNC_LIST_ENTRY(mm_cmpestrs_16)
   SIMDE_TEST_FUNC_LIST_ENTRY(mm_cmpestrz_8)