From 678f4b82a82471d2e41928fa5c92ba86ad52e409 Mon Sep 17 00:00:00 2001
From: serge-sans-paille
Date: Tue, 17 Oct 2023 22:38:25 +0200
Subject: [PATCH] Use __builtin_shufflevector for the generic implementation
 of shuffle

Let the compiler do the heavy lifting when possible :-)

Fix #926
---
 .../xsimd/arch/generic/xsimd_generic_memory.hpp | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/include/xsimd/arch/generic/xsimd_generic_memory.hpp
index df00982ee..2c31407ee 100644
--- a/include/xsimd/arch/generic/xsimd_generic_memory.hpp
+++ b/include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -419,12 +419,29 @@ namespace xsimd
                 return select(batch_bool_constant<batch<T, A>, (Indices < bsize)...>(), x, y);
             }
 
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_shufflevector)
+#define builtin_shuffle __builtin_shufflevector
+#endif
+#endif
+
+#if defined(builtin_shuffle)
+            return builtin_shuffle(x.data, y.data, Indices...);
+
+// FIXME: my experiments show that GCC only correctly optimizes this builtin
+// starting at GCC 13, where it already has __builtin_shufflevector
+//
+//#elif __has_builtin(__builtin_shuffle) || GCC >= 6
+//            typedef ITy integer_vector_type __attribute__((vector_size(sizeof(batch<T, A>))));
+//            return __builtin_shuffle(x.data, y.data, integer_vector_type{Indices...});
+#else
             // Use a generic_pattern. It is suboptimal but clang optimizes this
             // pretty well.
             batch<T, A> x_lane = swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
             batch<T, A> y_lane = swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
             batch_bool_constant<batch<T, A>, (Indices < bsize)...> select_x_lane;
             return select(select_x_lane, x_lane, y_lane);
+#endif
         }
 
         // store
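
Note (illustration only, not part of the patch): the snippet below sketches how
__builtin_shufflevector combines two vectors under a compile-time index list,
which is the behaviour the generic fallback reproduces with two swizzles plus a
select: indices in [0, N) pick lanes from the first operand, indices in [N, 2N)
pick lanes from the second. The typedef name v4f and the HAVE_SHUFFLEVECTOR
macro are made up for the example; the builtin itself is available in clang and
in GCC 12 and later.

#include <cstdio>

#if defined(__has_builtin)
#if __has_builtin(__builtin_shufflevector)
#define HAVE_SHUFFLEVECTOR 1
#endif
#endif

#if defined(HAVE_SHUFFLEVECTOR)
// GNU-style vector of 4 floats (16 bytes).
typedef float v4f __attribute__((vector_size(16)));

int main()
{
    v4f x = { 0.f, 1.f, 2.f, 3.f };
    v4f y = { 10.f, 11.f, 12.f, 13.f };
    // Indices 0..3 pick lanes of x, 4..7 pick lanes of y, so this
    // interleaves the low halves: {x[0], y[0], x[1], y[1]}.
    v4f r = __builtin_shufflevector(x, y, 0, 4, 1, 5);
    for (int i = 0; i < 4; ++i)
        std::printf("%g ", r[i]);
    std::printf("\n"); // prints: 0 10 1 11
    return 0;
}
#else
int main() { return 0; } // compiler without __builtin_shufflevector
#endif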