diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
index abcd0ae69..280abaa0f 100644
--- a/include/xsimd/arch/xsimd_sse2.hpp
+++ b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1640,6 +1640,31 @@ namespace xsimd
             return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
         }
 
+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
+        {
+            // permute within each lane
+            constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3);
+            constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7);
+            __m128i lo = _mm_shufflelo_epi16(self, mask_lo);
+            __m128i hi = _mm_shufflehi_epi16(self, mask_hi);
+
+            __m128i lo_lo = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lo), _mm_castsi128_pd(lo), _MM_SHUFFLE2(0, 0)));
+            __m128i hi_hi = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(hi), _mm_castsi128_pd(hi), _MM_SHUFFLE2(1, 1)));
+
+            // mask to choose the right lane
+            batch_bool_constant<uint16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> blend_mask;
+
+            // blend the two permutes
+            return select(blend_mask, batch<uint16_t, A>(lo_lo), batch<uint16_t, A>(hi_hi));
+        }
+
+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
+        {
+            return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, sse2 {}));
+        }
+
         // transpose
         template <class A>
         XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept
diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp
index bcfc4aeb5..6a6e02e89 100644
--- a/test/test_shuffle.cpp
+++ b/test/test_shuffle.cpp
@@ -732,4 +732,15 @@ TEST_CASE_TEMPLATE("[small integer transpose]", B, xsimd::batch<uint16_t>, xsimd
     }
 }
 
+#if XSIMD_WITH_NEON64 || (XSIMD_WITH_SSE2 && !XSIMD_WITH_AVX)
+TEST_CASE_TEMPLATE("[small integer swizzle]", B, xsimd::batch<uint16_t>, xsimd::batch<int16_t>)
+{
+    shuffle_test<B> Test;
+    SUBCASE("swizzle")
+    {
+        Test.swizzle();
+    }
+}
+#endif
+
 #endif
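
For reference, a minimal usage sketch of the kernel this patch adds, not part of the diff itself. Pinning the `xsimd::sse2` arch tag and the concrete lane values are assumptions for illustration; normally the default architecture would dispatch here on a plain SSE2 target. The index pattern reverses the eight 16-bit lanes, so every output lane comes from the opposite 64-bit half, exercising the shufflelo/shufflehi-plus-blend path:

```cpp
#include <cstdint>
#include <iostream>
#include <xsimd/xsimd.hpp>

int main()
{
    using batch = xsimd::batch<uint16_t, xsimd::sse2>;

    // Indices 7..0 reverse the register: each output lane is sourced from
    // the opposite 64-bit half, the case the blend mask resolves.
    batch input { 0, 1, 2, 3, 4, 5, 6, 7 };
    auto reversed = xsimd::swizzle(
        input,
        xsimd::batch_constant<uint16_t, xsimd::sse2, 7, 6, 5, 4, 3, 2, 1, 0> {});

    alignas(16) uint16_t out[8];
    reversed.store_aligned(out);
    for (auto v : out)
        std::cout << v << ' '; // prints: 7 6 5 4 3 2 1 0
}
```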