From 1e9692c5512002bb296475d7f68412e0bb6f5749 Mon Sep 17 00:00:00 2001
From: Steven Munroe
Date: Thu, 15 Sep 2022 19:18:02 -0500
Subject: [PATCH] Resolve FTBFS [Bug 2113609] pveclib in rawhide/f37

The switch to GCC-12 exposed some latent issues in pveclib-1.0.4-4:

1) The (accidental) typedef of __float128 over __ieee128 and the cast
of (vector unsigned int *) constants to (vector unsigned __int128 *)
seem to violate strict-aliasing. This is not reported unless
-Wstrict-aliasing=2 is used, so the result was silent generation of
incorrect code. GCC PR 106755.

2) GCC-12 corrected a long-latent problem with vec_cpsgn (swapped
operands). PVECLIB followed the implementation from previous GCC
versions (GCC <= 11). This broke the vector float/double unit tests.
GCC PR 101984.

3) The implementation of IEEE Float128 is evolving and the various
types (__float128, __ieee128, _Float128, ...) are highly dependent on
the compiler version/options used. The implementation (once again)
tries to support these variations. The API uses __binary128 and
avoids _Float128.

* src/pveclib/vec_f128_ppc.h (__binary128): Define/typedef to the
quad-precision float type that the compiler supports.
* src/pveclib/vec_f32_ppc.h (vec_copysignf32): Unless
PVECLIB_CPSGN_FIXED is defined, avoid vec_cpsgn.
* src/pveclib/vec_f64_ppc.h (vec_copysignf64): Unless
PVECLIB_CPSGN_FIXED is defined, avoid vec_cpsgn.
* src/testsuite/arith128_test_f32.c (test_float_cpsgn): Swap
operands/results to match GCC-12.
* src/testsuite/arith128_test_f64.c (test_double_cpsgn): Swap
operands/results to match GCC-12.
* src/testsuite/arith128_test_i128.c (test_muludq): Change local
variable 'l' type to match vec_muludq parameter.
(test_madduq): Change local variable 'l' type to match vec_madduq
parameter.
* src/testsuite/vec_f128_dummy.c: Replace all usage of type
__Float128 with __binary128.

Signed-off-by: Steven Munroe
---
 src/pveclib/vec_f128_ppc.h         | 23 +++++++++++++--
 src/pveclib/vec_f32_ppc.h          | 37 ++++++++++++++++++------
 src/pveclib/vec_f64_ppc.h          | 46 ++++++++++++++++++++++--------
 src/testsuite/arith128_test_f32.c  | 14 ++++-----
 src/testsuite/arith128_test_f64.c  | 26 ++++++++---------
 src/testsuite/arith128_test_i128.c |  6 ++--
 src/testsuite/vec_f128_dummy.c     | 24 ++++++++--------
 7 files changed, 120 insertions(+), 56 deletions(-)
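The strict-aliasing hazard from (1) is easy to reproduce in isolation.
The following is a minimal sketch, not code from this patch: the
typedefs mirror pveclib's, the constant is arbitrary, and the function
names are hypothetical. Building bad_load with -Wstrict-aliasing=2
should draw the warning; without the option GCC-12 may silently
generate incorrect code (GCC PR 106755). It assumes a compiler/target
that supports vector __int128 (POWER8/VSX or later).

#include <altivec.h>

typedef __vector unsigned int vui32_t;
typedef __vector unsigned __int128 vui128_t;

/* An arbitrary 128-bit constant expressed as four 32-bit words.  */
static const vui32_t k128 = { 0x00000001, 0x00000002,
                              0x00000003, 0x00000004 };

vui128_t
bad_load (void)
{
  /* Dereferencing a (vui128_t *) that really points at a vui32_t
     object accesses it through an incompatible lvalue type.  */
  return *((vui128_t *) &k128);
}

vui128_t
safe_load (void)
{
  /* A vector value cast is a bit-exact reinterpretation and does
     not violate the aliasing rules.  */
  return (vui128_t) k128;
}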
diff --git a/src/pveclib/vec_f128_ppc.h b/src/pveclib/vec_f128_ppc.h
index b5398db..7bb96a3 100644
--- a/src/pveclib/vec_f128_ppc.h
+++ b/src/pveclib/vec_f128_ppc.h
@@ -195,12 +195,31 @@ test_cosf128 (__binary128 value)
    -mcu=power9 and -mfloat128.
    So far clang does not support/define the __ibm128 type. */
 #ifdef __FLOAT128__
-typedef __float128 __Float128;
+#ifndef __clang__
+// For now assume that not __clang__ implies GCC
+// Can't just #ifdef __GNUC__ as Clang defines it
+#ifdef __float128
+// Can assume GCC 7 or later so ...
+// That version defines __ieee128 internally and
+// #defines __float128 to __ieee128, so both are defined
+// Define __binary128 so both GCC and Clang can use a single type
+#define __binary128 __ieee128
+#else
+// Assume GCC 6 or earlier
+// So the compiler defines __float128 only
 typedef __float128 __binary128;
 typedef __float128 __ieee128;
-#ifndef __clang__
+#endif
+#if (__GNUC__ < 7)
+typedef __float128 _Float128;
+#endif
 typedef __ibm128 __IBM128;
 #else
+/* Clang started defining __FLOAT128__ and does not allow redefining
+   __float128 or __ieee128. Worse, it will give errors if you try to
+   use either type. So define __binary128 as if __FLOAT128__ is not
+   defined. */
+typedef vui128_t __binary128;
 /* Clang does not define __ibm128 over IBM long double.
    So defined it here. */
 typedef long double __IBM128;
diff --git a/src/pveclib/vec_f32_ppc.h b/src/pveclib/vec_f32_ppc.h
index 8a146c5..2cd2c89 100644
--- a/src/pveclib/vec_f32_ppc.h
+++ b/src/pveclib/vec_f32_ppc.h
@@ -774,31 +774,52 @@ vec_any_iszerof32 (vf32_t vf32)
 #endif
 }
 
-/** \brief Copy the sign bit from vf32y merged with magnitude from
- * vf32x and return the resulting vector float values.
+/** \brief Copy the sign bit from vf32x merged with magnitude from
+ * vf32y and return the resulting vector float values.
+ *
+ * \note This operation was patterned after the intrinsic vec_cpsgn
+ * (altivec.h) introduced for POWER7 and VSX. It turns out the
+ * original (GCC 4.9) compiler implementation reversed the operands
+ * and does not match the PowerISA or the Vector Intrinsic
+ * Programming Reference manuals. Subsequent compilers and PVECLIB
+ * implementations replicated this (operand order) error.
+ * This has now been reported as a bug against the compilers, which
+ * are in the process of applying fixes and distributing updates.
+ * This version of PVECLIB is updated to match the Vector Intrinsic
+ * Programming Reference. This implementation is independent of the
+ * compiler's update status.
  *
  * |processor|Latency|Throughput|
  * |--------:|:-----:|:---------|
  * |power8   | 6-7   | 2/cycle  |
  * |power9   | 2     | 2/cycle  |
  *
- * @param vf32x vector float values containing the magnitudes.
- * @param vf32y vector float values containing the sign bits.
- * @return vector float values with magnitude from vf32x and the
- * sign of vf32y.
+ * @param vf32x vector float values containing the sign bits.
+ * @param vf32y vector float values containing the magnitudes.
+ * @return vector float values with magnitude from vf32y and the
+ * sign of vf32x.
  */
 static inline vf32_t
 vec_copysignf32 (vf32_t vf32x, vf32_t vf32y)
 {
 #if _ARCH_PWR7
-  /* P9 has a 2 cycle xvcpsgnsp and eliminates a const load. */
+#ifdef PVECLIB_CPSGN_FIXED
   return (vec_cpsgn (vf32x, vf32y));
+#else
+  vf32_t result;
+  __asm__(
+      "xvcpsgnsp %x0,%x1,%x2;\n"
+      : "=wa" (result)
+      : "wa" (vf32x), "wa" (vf32y)
+      :);
+  return (result);
+#endif
 #else
   const vui32_t signmask = CONST_VINT128_W(0x80000000, 0x80000000,
 					   0x80000000, 0x80000000);
   vf32_t result;
 
-  result = (vf32_t)vec_sel ((vui32_t)vf32x, (vui32_t)vf32y, signmask);
+  result = (vf32_t)vec_sel ((vui32_t)vf32y, (vui32_t)vf32x, signmask);
   return (result);
 #endif
 }
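With the corrected operand order above, vec_copysignf32 (x, y) takes
the sign from x and the magnitude from y, so each result lane should
equal the scalar copysignf (y[i], x[i]). A hedged usage sketch
follows; it is not part of the patch, and the include path assumes an
installed pveclib built with VSX enabled (e.g. -mcpu=power8):

#include <math.h>
#include <stdio.h>
#include <pveclib/vec_f32_ppc.h>

int
main (void)
{
  vf32_t sign = { -0.0f, 0.0f, -1.0f, 2.0f };
  vf32_t mag  = {  1.0f, 2.0f,  3.0f, 4.0f };
  vf32_t r = vec_copysignf32 (sign, mag);

  /* Lane-by-lane the result should match scalar copysignf with
     the magnitude first and the sign second.  */
  for (int i = 0; i < 4; i++)
    if (r[i] != copysignf (mag[i], sign[i]))
      printf ("lane %d mismatch: %f\n", i, (double) r[i]);
  return 0;
}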
diff --git a/src/pveclib/vec_f64_ppc.h b/src/pveclib/vec_f64_ppc.h
index 69fbd05..b599e39 100644
--- a/src/pveclib/vec_f64_ppc.h
+++ b/src/pveclib/vec_f64_ppc.h
@@ -773,31 +773,53 @@ vec_any_iszerof64 (vf64_t vf64)
 #endif
 }
 
-/** \brief Copy the sign bit from vf64y merged with magnitude from
- * vf64x and return the resulting vector double values.
+/** \brief Copy the sign bit from vf64x merged with magnitude from
+ * vf64y and return the resulting vector double values.
+ *
+ * \note This operation was patterned after the intrinsic vec_cpsgn
+ * (altivec.h) introduced for POWER7 and VSX. It turns out the
+ * original (GCC 4.9) compiler implementation reversed the operands
+ * and does not match the PowerISA or the Vector Intrinsic
+ * Programming Reference manuals. Subsequent compilers and PVECLIB
+ * implementations replicated this (operand order) error.
+ * This has now been reported as a bug against the compilers, which
+ * are in the process of applying fixes and distributing updates.
+ * This version of PVECLIB is updated to match the Vector Intrinsic
+ * Programming Reference. This implementation is independent of the
+ * compiler's update status.
  *
  * |processor|Latency|Throughput|
  * |--------:|:-----:|:---------|
  * |power8   | 6-7   | 2/cycle  |
  * |power9   | 2     | 2/cycle  |
  *
- * @param vf64x vector double values containing the magnitudes.
- * @param vf64y vector double values containing the sign bits.
- * @return vector double values with magnitude from vf64x and the
- * sign of vf64y.
+ * @param vf64x vector double values containing the sign bits.
+ * @param vf64y vector double values containing the magnitudes.
+ * @return vector double values with magnitude from vf64y and the
+ * sign of vf64x.
  */
 static inline vf64_t
-vec_copysignf64 (vf64_t vf64x , vf64_t vf64y)
+vec_copysignf64 (vf64_t vf64x, vf64_t vf64y)
 {
 #if _ARCH_PWR7
   /* P9 has a 2 cycle xvcpsgndp and eliminates a const load. */
-  return (vec_cpsgn (vf64x, vf64y));
+#ifdef PVECLIB_CPSGN_FIXED
+  return (vec_cpsgn (vf64x, vf64y));
+#else
+  vf64_t result;
+  __asm__(
+      "xvcpsgndp %x0,%x1,%x2;\n"
+      : "=wa" (result)
+      : "wa" (vf64x), "wa" (vf64y)
+      :);
+  return (result);
+#endif
 #else
-  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
-  vf64_t result;
+  const vui32_t signmask = CONST_VINT128_W(0x80000000, 0, 0x80000000, 0);
+  vf64_t result;
 
-  result = (vf64_t)vec_sel ((vui32_t)vf64x, (vui32_t)vf64y, signmask);
-  return (result);
+  result = (vf64_t) vec_sel ((vui32_t) vf64y, (vui32_t) vf64x, signmask);
+  return (result);
 #endif
 }
diff --git a/src/testsuite/arith128_test_f32.c b/src/testsuite/arith128_test_f32.c
index dd65f90..016bd54 100644
--- a/src/testsuite/arith128_test_f32.c
+++ b/src/testsuite/arith128_test_f32.c
@@ -1186,7 +1186,7 @@ test_float_cpsgn (void)
 
   i = (vf32_t) { 0.0, -0.0, 0.0, -0.0 };
   j = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
-  e = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
+  e = (vf32_t) { 0.0, -0.0, 0.0, -0.0 };
   k = vec_copysignf32 (i, j);
 
 #ifdef __DEBUG_PRINT__
 #endif
   rc += check_v4f32x ("vec_copysignf32 1:", k, e);
 
-  i = (vf32_t) { __FLT_MAX__, __FLT_MIN__, __FLT_EPSILON__,
+  i = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
+  j = (vf32_t) { __FLT_MAX__, __FLT_MIN__, __FLT_EPSILON__,
 		  __FLT_DENORM_MIN__ };
-  j = (vf32_t) {-0.0, 0.0, -0.0, 0.0 };
   e = (vf32_t) { -(__FLT_MAX__), __FLT_MIN__, -(__FLT_EPSILON__),
 		  __FLT_DENORM_MIN__ };
   k = vec_copysignf32 (i, j);
 
 #ifdef __DEBUG_PRINT__
 #endif
   rc += check_v4f32x ("vec_copysignf32 2:", k, e);
 
-  i = (vf32_t) CONST_VINT128_W(__FLOAT_INF, __FLOAT_NINF, __FLOAT_INF,
+  i = (vf32_t) CONST_VINT32_W(0.0, -0.0, 0.0, -0.0);
+  j = (vf32_t) CONST_VINT128_W(__FLOAT_INF, __FLOAT_NINF, __FLOAT_INF,
 			       __FLOAT_NINF);
-  j = (vf32_t) CONST_VINT32_W(0.0, -0.0, 0.0, -0.0);
   e = (vf32_t) CONST_VINT128_W(__FLOAT_INF, __FLOAT_NINF, __FLOAT_INF,
 			       __FLOAT_NINF);
   k = vec_copysignf32 (i, j);
 
 #ifdef __DEBUG_PRINT__
 #endif
   rc += check_v4f32x ("vec_copysignf32 3:", k, e);
 
-  i = (vf32_t) CONST_VINT128_W(__FLOAT_NAN, __FLOAT_NNAN, __FLOAT_NSNAN,
+  i = (vf32_t) {-0.0, 0.0, 0.0, -0.0 };
+  j = (vf32_t) CONST_VINT128_W(__FLOAT_NAN, __FLOAT_NNAN, __FLOAT_NSNAN,
 			       __FLOAT_SNAN);
-  j = (vf32_t) {-0.0, 0.0, 0.0, -0.0 };
   e = (vf32_t) CONST_VINT128_W(__FLOAT_NNAN, __FLOAT_NAN, __FLOAT_SNAN,
 			       __FLOAT_NSNAN);
   k = vec_copysignf32 (i, j);
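The pre-VSX fallback in both headers merges sign and magnitude with
vec_sel and a mask that has only the sign bits set. The bit selection
is easy to model in scalar code; the sketch below is illustrative
only, and sel32 is a hypothetical helper, not a pveclib operation:

#include <stdint.h>
#include <stdio.h>

/* Same bitwise semantics as vec_sel: where a mask bit is clear the
   result bit comes from a, where it is set the bit comes from b.  */
static inline uint32_t
sel32 (uint32_t a, uint32_t b, uint32_t mask)
{
  return (a & ~mask) | (b & mask);
}

int
main (void)
{
  const uint32_t signmask = 0x80000000u;
  uint32_t mag  = 0x40490fdbu;  /* bit pattern of +3.14159274f */
  uint32_t sign = 0x80000000u;  /* bit pattern of -0.0f */

  /* Expect 0xc0490fdb: the magnitude of pi with the sign of -0.0f,
     matching result = vec_sel (magnitude, sign-source, signmask).  */
  printf ("0x%08x\n", (unsigned int) sel32 (mag, sign, signmask));
  return 0;
}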
diff --git a/src/testsuite/arith128_test_f64.c b/src/testsuite/arith128_test_f64.c
index eb77700..1bec02e 100644
--- a/src/testsuite/arith128_test_f64.c
+++ b/src/testsuite/arith128_test_f64.c
@@ -1596,7 +1596,7 @@ test_double_cpsgn (void)
 
   i = (vf64_t) { 0.0, -0.0 };
   j = (vf64_t) { -0.0, 0.0 };
-  e = (vf64_t) { -0.0, 0.0 };
+  e = (vf64_t) { 0.0, -0.0 };
   k = vec_copysignf64 (i, j);
 
 #ifdef __DEBUG_PRINT__
 #endif
   rc += check_v2f64x ("vec_copysignf64 1:", k, e);
 
-  i = (vf64_t) { __DBL_MAX__, __DBL_MIN__ };
-  j = (vf64_t) { -0.0, 0.0 };
+  i = (vf64_t) { -0.0, 0.0 };
+  j = (vf64_t) { __DBL_MAX__, __DBL_MIN__ };
   e = (vf64_t) { -(__DBL_MAX__), __DBL_MIN__ };
   k = vec_copysignf64 (i, j);
 
 #ifdef __DEBUG_PRINT__
 #endif
   rc += check_v2f64x ("vec_copysignf64 2:", k, e);
 
-  i = (vf64_t) { __DBL_EPSILON__, __DBL_DENORM_MIN__ };
-  j = (vf64_t) { -0.0, 0.0 };
+  i = (vf64_t) { -0.0, 0.0 };
+  j = (vf64_t) { __DBL_EPSILON__, __DBL_DENORM_MIN__ };
   e = (vf64_t) { -(__DBL_EPSILON__), __DBL_DENORM_MIN__ };
   k = vec_copysignf64 (i, j);
 
 #ifdef __DEBUG_PRINT__
 #endif
   rc += check_v2f64x ("vec_copysignf64 3:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
-  j = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  i = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   k = vec_copysignf64 (i, j);
 
 #ifdef __DEBUG_PRINT__
 #endif
   rc += check_v2f64x ("vec_copysignf64 4:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
-  j = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  i = (vf64_t) CONST_VINT64_DW(0.0, -0.0);
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_INF, __DOUBLE_NINF);
   k = vec_copysignf64 (i, j);
 
 #ifdef __DEBUG_PRINT__
 #endif
   rc += check_v2f64x ("vec_copysignf64 5:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_NAN, __DOUBLE_NNAN);
-  j = (vf64_t) CONST_VINT64_DW( -0.0, 0.0 );
+  i = (vf64_t) CONST_VINT64_DW( -0.0, 0.0 );
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_NAN, __DOUBLE_NNAN);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_NNAN, __DOUBLE_NAN);
   k = vec_copysignf64 (i, j);
 
 #ifdef __DEBUG_PRINT__
 #endif
   rc += check_v2f64x ("vec_copysignf64 6:", k, e);
 
-  i = (vf64_t) CONST_VINT128_DW(__DOUBLE_NSNAN, __DOUBLE_SNAN);
-  j = (vf64_t) CONST_VINT64_DW ( 0.0, -0.0 );
+  i = (vf64_t) CONST_VINT64_DW ( 0.0, -0.0 );
+  j = (vf64_t) CONST_VINT128_DW(__DOUBLE_NSNAN, __DOUBLE_SNAN);
   e = (vf64_t) CONST_VINT128_DW(__DOUBLE_SNAN, __DOUBLE_NSNAN);
   k = vec_copysignf64 (i, j);
diff --git a/src/testsuite/arith128_test_i128.c b/src/testsuite/arith128_test_i128.c
index b0a5c0b..a3de901 100644
--- a/src/testsuite/arith128_test_i128.c
+++ b/src/testsuite/arith128_test_i128.c
@@ -2308,7 +2308,8 @@ test_msumudm (void)
 int
 test_muludq (void)
 {
-  vui32_t i, j, k, l /*, m*/;
+  vui32_t i, j, k/*, l , m*/;
+  vui128_t l;
   vui32_t e, ec;
   int rc = 0;
 
@@ -2383,7 +2384,8 @@
 int
 test_madduq (void)
 {
-  vui32_t i, j, k, l, m, n;
+  vui32_t i, j, k, m, n;
+  vui128_t l;
   vui32_t e, ec;
   int rc = 0;
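The type change for 'l' above follows from the vec_muludq/vec_madduq
interfaces, which produce a 256-bit product through a vui128_t return
value plus a vui128_t * parameter. Under the old lax aliasing the
vui32_t local happened to work; with GCC-12 the types must match. A
hedged sketch follows, assuming the vec_muludq prototype in
vec_int128_ppc.h returns the low 128 bits and stores the high 128
bits through the pointer (check the header for the exact split):

#include <pveclib/vec_int128_ppc.h>

void
mul256 (vui128_t a, vui128_t b, vui128_t *hi, vui128_t *lo)
{
  /* 'l' must be vui128_t to match vec_muludq's pointer parameter.  */
  vui128_t l;

  /* Assumed split: return value = low half, *(&l) = high half.  */
  *lo = vec_muludq (&l, a, b);
  *hi = l;
}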
diff --git a/src/testsuite/vec_f128_dummy.c b/src/testsuite/vec_f128_dummy.c
index adbb004..29fee5a 100644
--- a/src/testsuite/vec_f128_dummy.c
+++ b/src/testsuite/vec_f128_dummy.c
@@ -224,37 +224,37 @@ test_cosf128 (__binary128 value)
 #endif
 
 vb128_t
-_test_f128_isinff128 (__Float128 value)
+_test_f128_isinff128 (__binary128 value)
 {
   return (vec_isinff128 (value));
 }
 
 int
-_test_f128_isinf_sign (__Float128 value)
+_test_f128_isinf_sign (__binary128 value)
 {
   return (vec_isinf_signf128 (value));
 }
 
 vb128_t
-_test_f128_isnan (__Float128 value)
+_test_f128_isnan (__binary128 value)
 {
   return (vec_isnanf128 (value));
 }
 
 vb128_t
-_test_pred_f128_finite (__Float128 value)
+_test_pred_f128_finite (__binary128 value)
 {
   return (vec_isfinitef128 (value));
 }
 
 vb128_t
-_test_pred_f128_normal (__Float128 value)
+_test_pred_f128_normal (__binary128 value)
 {
   return (vec_isnormalf128 (value));
 }
 
 vb128_t
-_test_pred_f128_subnormal (__Float128 value)
+_test_pred_f128_subnormal (__binary128 value)
 {
   return (vec_issubnormalf128 (value));
 }
@@ -274,31 +274,31 @@ _test_xfer_bin128_2_vui16t (__binary128 f128)
 // __clang__ has a bug whenever -mfloat128 is enabled, maybe clang 10.0.1
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 int
-test_gcc_f128_signbit (__Float128 value)
+test_gcc_f128_signbit (__binary128 value)
 {
   return (signbit(value));
 }
 
 int
-test_gcc_f128_isinf (__Float128 value)
+test_gcc_f128_isinf (__binary128 value)
 {
   return (isinf(value));
 }
 
 int
-test_gcc_float128_isnan (__Float128 value)
+test_gcc_float128_isnan (__binary128 value)
 {
   return (isnan(value));
 }
 
-__Float128
-test_gcc_f128_copysign (__Float128 valx, __Float128 valy)
+__binary128
+test_gcc_f128_copysign (__binary128 valx, __binary128 valy)
 {
   return (__builtin_copysignf128(valx, valy));
 }
 
 int
-test_glibc_f128_classify (__Float128 value)
+test_glibc_f128_classify (__binary128 value)
 {
   if (isfinite(value))
     return 1;