Skip to content

Commit

Permalink
Use __builtin_shufflevector for the generic implementation of shuffle
Browse files Browse the repository at this point in the history
Let the compiler do the heavy lifting when possible :-)
Fix #926
  • Loading branch information
serge-sans-paille committed Oct 17, 2023
1 parent 0f51f66 commit 95a8fee
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions include/xsimd/arch/generic/xsimd_generic_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -419,12 +419,22 @@ namespace xsimd
return select(batch_bool_constant<batch<T, A>, (Indices < bsize)...>(), x, y);
}

#if defined(__has_builtin) && __has_builtin(__builtin_shuffle_vector)
return __builtin_shufflevector(x.data, y.data, Indices...);
// FIXME: my experiments show that GCC only correctly optimizes this builtin
// starting at GCC 13, where it already has __builtin_shufflevector
//
//#elif (defined(__has_builtin) && __has_builtin(__builtin_shuffle)) || GCC >= 6
// typedef ITy integer_vector_type __attribute__((vector_size(sizeof(batch<ITy, A>))));
// return __builtin_shuffle(x.data, y.data, integer_vector_type{Indices...});
#else
// Use a generic_pattern. It is suboptimal but clang optimizes this
// pretty well.
batch<T, A> x_lane = swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
batch<T, A> y_lane = swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
batch_bool_constant<batch<T, A>, (Indices < bsize)...> select_x_lane;
return select(select_x_lane, x_lane, y_lane);
#endif
}

// store
Expand Down

0 comments on commit 95a8fee

Please sign in to comment.