xtensor-stack · serge-sans-paille · Jan 8, 2025 · Jan 8, 2025 · Jan 9, 2025 · Jan 9, 2025
diff --git a/.github/workflows/cross.yml b/.github/workflows/cross.yml
@@ -15,7 +15,7 @@ jobs:
           - { platform: 'aarch64', arch: 'armv8-a', dir: 'aarch64-linux-gnu', flags: '', full: 'ON' }
         sys:
           - { compiler: 'gcc',   version: '9' }
-          - { compiler: 'clang', version: 'latest' }
+          - { compiler: 'clang', version: '16' }
     steps:
     - name: Setup compiler
       if: ${{ matrix.sys.compiler == 'clang' }}

diff --git a/include/xsimd/arch/xsimd_sse2.hpp b/include/xsimd/arch/xsimd_sse2.hpp
@@ -1640,6 +1640,31 @@ namespace xsimd
             return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
         }
 
+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        XSIMD_INLINE batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<sse2>) noexcept
+        {
+            // Builds result[i] = self[Vi]. The 16-bit shuffles only move elements inside one 64-bit half, so each
+            // half of self is first duplicated into both halves, then both copies get the same two shuffles.
+            // Assumes detail::mod_shuffle packs (Vi % 4), the index within a half, as 2-bit selectors.
+            constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3); // selectors for result[0..3]
+            constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7); // selectors for result[4..7]
+            __m128i lo_lo = _mm_unpacklo_epi64(self, self); // self[0..3] | self[0..3]
+            __m128i hi_hi = _mm_unpackhi_epi64(self, self); // self[4..7] | self[4..7]
+            // from_lo[i] = self[Vi % 4] and from_hi[i] = self[4 + Vi % 4], for every i
+            __m128i from_lo = _mm_shufflehi_epi16(_mm_shufflelo_epi16(lo_lo, mask_lo), mask_hi);
+            __m128i from_hi = _mm_shufflehi_epi16(_mm_shufflelo_epi16(hi_hi, mask_lo), mask_hi);
+
+            // per element, keep the candidate taken from the half that Vi addresses
+            batch_bool_constant<uint16_t, A, (V0 < 4), (V1 < 4), (V2 < 4), (V3 < 4), (V4 < 4), (V5 < 4), (V6 < 4), (V7 < 4)> blend_mask;
+            return select(blend_mask, batch<uint16_t, A>(from_lo), batch<uint16_t, A>(from_hi));
+        }
+
+        template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+        XSIMD_INLINE batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<sse2>) noexcept
+        {
+            return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, sse2 {})); // cast to uint16_t, reuse the uint16_t swizzle, cast back
+        }
+
         // transpose
         template <class A>
         XSIMD_INLINE void transpose(batch<float, A>* matrix_begin, batch<float, A>* matrix_end, requires_arch<sse2>) noexcept

diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp
@@ -732,4 +732,15 @@ TEST_CASE_TEMPLATE("[small integer transpose]", B, xsimd::batch<uint16_t>, xsimd
     }
 }
 
+#if (XSIMD_WITH_SSE2 && !XSIMD_WITH_AVX)
+// Runs shuffle_test<B>::swizzle() on 16-bit integer batches; compiled only when SSE2 is enabled and AVX is not.
+// NOTE(review): confirm swizzle() covers index patterns that mix both 64-bit halves within one output half.
+TEST_CASE_TEMPLATE("[small integer swizzle]", B, xsimd::batch<uint16_t>, xsimd::batch<int16_t>)
+{
+    shuffle_test<B> checker;
+
+    SUBCASE("swizzle") { checker.swizzle(); }
+}
+#endif
+
 #endif