diff --git a/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/include/xsimd/arch/generic/xsimd_generic_memory.hpp
index df00982ee..9fbdd9cba 100644
--- a/include/xsimd/arch/generic/xsimd_generic_memory.hpp
+++ b/include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -419,12 +419,36 @@ namespace xsimd
                 return select(batch_bool_constant, (Indices < bsize)...>(), x, y);
             }
 
+            // __has_builtin must be tested in a nested #if: on compilers that lack
+            // it, "defined(__has_builtin) && __has_builtin(...)" is still parsed in
+            // full and fails to preprocess.
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_shufflevector)
+#define XSIMD_HAS_BUILTIN_SHUFFLEVECTOR
+#endif
+#endif
+
+#if defined(XSIMD_HAS_BUILTIN_SHUFFLEVECTOR)
+            // Indices address the concatenation [x, y], so one builtin call covers
+            // both operands.
+            // NOTE(review): assumes x.data / y.data are compiler vector types with
+            // one element per index -- confirm for integer batches.
+            return __builtin_shufflevector(x.data, y.data, Indices...);
+// FIXME: my experiments show that GCC only correctly optimizes this builtin
+// starting at GCC 13, where it already has __builtin_shufflevector
+//
+//#elif (defined(__has_builtin) && __has_builtin(__builtin_shuffle)) || GCC >= 6
+//            typedef ITy integer_vector_type __attribute__((vector_size(sizeof(batch))));
+//            return __builtin_shuffle(x.data, y.data, integer_vector_type{Indices...});
+#else
             // Use a generic_pattern. It is suboptimal but clang optimizes this
             // pretty well.
             batch x_lane = swizzle(x, batch_constant, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
             batch y_lane = swizzle(y, batch_constant, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
             batch_bool_constant, (Indices < bsize)...> select_x_lane;
             return select(select_x_lane, x_lane, y_lane);
+#endif
+#undef XSIMD_HAS_BUILTIN_SHUFFLEVECTOR
         }
 
         // store