diff --git a/src/lib/OpenEXRCore/internal_dwa_encoder.h b/src/lib/OpenEXRCore/internal_dwa_encoder.h index cb05c955a..2ad6c5ed7 100644 --- a/src/lib/OpenEXRCore/internal_dwa_encoder.h +++ b/src/lib/OpenEXRCore/internal_dwa_encoder.h @@ -28,7 +28,7 @@ #endif #ifndef USE_CLZ -# ifdef _WIN32 +# ifdef _MSC_VER static int __inline __builtin_clz(uint32_t v) { #ifdef __BMI1__ diff --git a/src/test/OpenEXRCoreTest/compressionTables.cpp b/src/test/OpenEXRCoreTest/compressionTables.cpp index 9237cbb45..5cd5b0ec5 100644 --- a/src/test/OpenEXRCoreTest/compressionTables.cpp +++ b/src/test/OpenEXRCoreTest/compressionTables.cpp @@ -41,6 +41,7 @@ #if defined(_WIN32) #include +#include #endif #if defined(OPENEXR_ENABLE_API_VISIBILITY) @@ -509,11 +510,32 @@ testDWATable (const std::string&) # if __has_builtin(__builtin_popcount) # define USE_POPCOUNT 1 # endif +# if __has_builtin(__builtin_clz) +# define USE_CLZ 1 +# endif #endif #ifndef USE_POPCOUNT # define USE_POPCOUNT 0 #endif +#ifndef USE_CLZ +# ifdef _WIN32 +static int __inline __builtin_clz(uint32_t v) +{ +#ifdef __BMI1__ + return __lzcnt(v); +#else + unsigned long r; + _BitScanReverse(&r, v); + return 31 - r; +#endif +} +# define USE_CLZ 1 +# else +# define USE_CLZ 0 +# endif +#endif + #if USE_POPCOUNT static inline int countSetBits (uint16_t src) @@ -521,28 +543,34 @@ countSetBits (uint16_t src) return __builtin_popcount (src); } #else -static inline uint8_t -countSetBits8 (uint8_t src) +// courtesy hacker's delight +static inline int countSetBits(uint32_t x) { - static const int8_t numBitsSet[256] = { - 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, - 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, - 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, - 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, - 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; - return numBitsSet[src]; + uint64_t y; + y = x * 0x0002000400080010ULL; + y = y & 0x1111111111111111ULL; + y = y * 0x1111111111111111ULL; + y = y >> 60; + return y; } +#endif -static inline uint8_t -countSetBits (uint16_t src) +#if USE_CLZ +static inline int +countLeadingZeros(uint16_t src) +{ + return __builtin_clz (src); +} +#else +// courtesy hacker's delight +static int inline countLeadingZeros( uint32_t x ) { - return countSetBits8 (src & 0xff) + countSetBits8 (src >> 8); + x |= (x >> 1); + x |= (x >> 2); + x |= (x >> 4); + x |= (x >> 8); + x |= (x >> 16); + return 32 - countSetBits(x); } #endif @@ -597,7 +625,7 @@ countSetBits (uint16_t src) static uint16_t handleQuantizeDenormTol ( uint16_t abssrc, uint16_t tolSig, float errTol, float srcFloat, int doprint) { - const uint16_t tsigshift = (32 - __builtin_clz (tolSig)); + const uint16_t tsigshift = (32 - countLeadingZeros (tolSig)); const uint16_t npow2 = (1 << tsigshift); const uint16_t lowermask = npow2 - 1; const uint16_t mask = ~lowermask; @@ -642,7 +670,7 @@ static uint16_t handleQuantizeGeneric ( // classic would do clz(significand - 1) but here we are trying to // construct a mask, so want to ensure for an power of 2, we // actually get the next (i.e. 2 returns 4) - const uint16_t tsigshift = (32 - __builtin_clz (tolSig)); + const uint16_t tsigshift = (32 - countLeadingZeros (tolSig)); const uint16_t npow2 = (1 << tsigshift); const uint16_t lowermask = npow2 - 1; const uint16_t mask = ~lowermask; @@ -1083,7 +1111,7 @@ static uint16_t handleQuantizeDefault ( // classic would do clz(significand - 1) but here we are trying to // construct a mask, so want to ensure for an power of 2, we // actually get the next (i.e. 2 returns 4) - const uint16_t tsigshift = (32 - __builtin_clz (tolSig)); + const uint16_t tsigshift = (32 - countLeadingZeros (tolSig)); const uint16_t npow2 = (1 << tsigshift); const uint16_t lowermask = npow2 - 1; const uint16_t mask = ~lowermask;