Skip to content

Commit

Permalink
Remove incorrect qrdmulh SSE code.
Browse files Browse the repository at this point in the history
The only reason the SSE code for qrdmulh passed is because the edge
cases were not included in the tests unless SSE was disabled.  The
INT16_MIN * INT16_MIN case ought result in INT16_MAX - and it does,
in the fallback code.  It does not, in the SSE code, which is
what will typically be used on x86 hardware.  Saturating code not
handling edge cases is simply wrong.
  • Loading branch information
Syonyk committed Jan 3, 2025
1 parent 8067442 commit 1d4014d
Show file tree
Hide file tree
Showing 2 changed files with 0 additions and 24 deletions.
21 changes: 0 additions & 21 deletions simde/arm/neon/qrdmulh.h
Original file line number Diff line number Diff line change
Expand Up @@ -128,29 +128,8 @@ simde_vqrdmulhq_s16(simde_int16x8_t a, simde_int16x8_t b) {
a_ = simde_int16x8_to_private(a),
b_ = simde_int16x8_to_private(b);

/* https://github.com/WebAssembly/simd/pull/365 */
#if defined(SIMDE_ARM_NEON_A32V7_NATIVE)
r_.neon_i16 = vqrdmulhq_s16(a_.neon_i16, b_.neon_i16);
#elif defined(SIMDE_X86_SSSE3_NATIVE)
__m128i y = _mm_mulhrs_epi16(a_.m128i, b_.m128i);
__m128i tmp = _mm_cmpeq_epi16(y, _mm_set1_epi16(INT16_MAX));
r_.m128i = _mm_xor_si128(y, tmp);
#elif defined(SIMDE_X86_SSE2_NATIVE)
const __m128i prod_lo = _mm_mullo_epi16(a_.m128i, b_.m128i);
const __m128i prod_hi = _mm_mulhi_epi16(a_.m128i, b_.m128i);
const __m128i tmp =
_mm_add_epi16(
_mm_avg_epu16(
_mm_srli_epi16(prod_lo, 14),
_mm_setzero_si128()
),
_mm_add_epi16(prod_hi, prod_hi)
);
r_.m128i =
_mm_xor_si128(
tmp,
_mm_cmpeq_epi16(_mm_set1_epi16(INT16_MAX), tmp)
);
#else
SIMDE_VECTORIZE
for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
Expand Down
3 changes: 0 additions & 3 deletions test/arm/neon/qrdmulh.c
Original file line number Diff line number Diff line change
Expand Up @@ -159,12 +159,9 @@ test_simde_vqrdmulhq_s16 (SIMDE_MUNIT_TEST_ARGS) {
{ { INT16_C( 28579), INT16_C( 26571), INT16_C( 23618), INT16_C( 3470), INT16_C( 10594), INT16_C( 31318), -INT16_C( 24794), INT16_C( 1860) },
{ -INT16_C( 22526), -INT16_C( 12632), INT16_C( 21464), INT16_C( 8577), INT16_C( 28627), INT16_C( 27596), -INT16_C( 26895), -INT16_C( 27290) },
{ -INT16_C( 19646), -INT16_C( 10243), INT16_C( 15470), INT16_C( 908), INT16_C( 9255), INT16_C( 26375), INT16_C( 20350), -INT16_C( 1549) } },
#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_MMX_NATIVE)
{ { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN },
{ INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN },
{ INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX } },
#endif

};

for (size_t i = 0 ; i < (sizeof(test_vec) / sizeof(test_vec[0])) ; i++) {
Expand Down

0 comments on commit 1d4014d

Please sign in to comment.