From c3be276a76897bcef7d8d403caff7ccaac739a53 Mon Sep 17 00:00:00 2001
From: Raghuveer Devulapalli
Date: Tue, 31 Jan 2023 11:28:14 -0800
Subject: [PATCH] Force inline on cygwin only

---
 src/avx512-16bit-qsort.hpp | 26 +++++++++++++-------------
 src/avx512-32bit-qsort.hpp | 28 ++++++++++++++--------------
 src/avx512-64bit-qsort.hpp | 32 ++++++++++++++++----------------
 src/avx512-common-qsort.h  | 12 +++++++++++-
 4 files changed, 54 insertions(+), 44 deletions(-)

diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp
index 86974552..b7130e2f 100644
--- a/src/avx512-16bit-qsort.hpp
+++ b/src/avx512-16bit-qsort.hpp
@@ -374,7 +374,7 @@ struct zmm_vector {
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
  */
 template 
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_16bit(zmm_t zmm)
 {
     // Level 1
     zmm = cmp_merge(
@@ -434,7 +434,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm)
 
 // Assumes zmm is bitonic and performs a recursive half cleaner
 template 
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)
 {
     // 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc ..
     zmm = cmp_merge(
@@ -460,7 +460,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)
 
 // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
 template 
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
 {
     // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
     zmm2 = vtype::permutexvar(vtype::get_network(4), zmm2);
@@ -474,7 +474,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
 // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
 // half cleaner
 template 
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
 {
     zmm_t zmm2r = vtype::permutexvar(vtype::get_network(4), zmm[2]);
     zmm_t zmm3r = vtype::permutexvar(vtype::get_network(4), zmm[3]);
@@ -495,7 +495,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_32_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_16bit(type_t *arr, int32_t N)
 {
     typename vtype::opmask_t load_mask = ((0x1ull << N) - 0x1ull) & 0xFFFFFFFF;
     typename vtype::zmm_t zmm
@@ -504,7 +504,7 @@ X86_SIMD_SORT_FINLINE void sort_32_16bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_64_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_16bit(type_t *arr, int32_t N)
 {
     if (N <= 32) {
         sort_32_16bit(arr, N);
@@ -523,7 +523,7 @@ X86_SIMD_SORT_FINLINE void sort_64_16bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_128_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_16bit(type_t *arr, int32_t N)
 {
     if (N <= 64) {
         sort_64_16bit(arr, N);
@@ -556,9 +556,9 @@ X86_SIMD_SORT_FINLINE void sort_128_16bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE type_t get_pivot_16bit(type_t *arr,
-                                             const int64_t left,
-                                             const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_16bit(type_t *arr,
+                                            const int64_t left,
+                                            const int64_t right)
 {
     // median of 32
     int64_t size = (right - left) / 32;
@@ -657,8 +657,8 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
         qsort_16bit_(arr, pivot_index, right, max_iters - 1);
 }
 
-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(uint16_t *arr,
-                                                   int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(uint16_t *arr,
+                                                  int64_t arrsize)
 {
     int64_t nan_count = 0;
     __mmask16 loadmask = 0xFFFF;
@@ -676,7 +676,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(uint16_t *arr,
     return nan_count;
 }
 
-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
 replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count)
 {
     for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
diff --git a/src/avx512-32bit-qsort.hpp b/src/avx512-32bit-qsort.hpp
index f21910ce..1cbba00b 100644
--- a/src/avx512-32bit-qsort.hpp
+++ b/src/avx512-32bit-qsort.hpp
@@ -336,7 +336,7 @@ struct zmm_vector {
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
  */
 template 
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_32bit(zmm_t zmm)
 {
     zmm = cmp_merge(
             zmm,
@@ -383,7 +383,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm)
 
 // Assumes zmm is bitonic and performs a recursive half cleaner
 template 
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)
 {
     // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
     zmm = cmp_merge(
@@ -410,7 +410,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)
 
 // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
 template 
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
 {
     // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
     *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2);
@@ -424,7 +424,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
 // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
 // half cleaner
 template 
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
 {
     zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]);
     zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]);
@@ -445,7 +445,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
 {
     zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]);
     zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]);
@@ -482,7 +482,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_16_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_16_32bit(type_t *arr, int32_t N)
 {
     typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001;
     typename vtype::zmm_t zmm
@@ -491,7 +491,7 @@ X86_SIMD_SORT_FINLINE void sort_16_32bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_32_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_32bit(type_t *arr, int32_t N)
 {
     if (N <= 16) {
         sort_16_32bit(arr, N);
@@ -509,7 +509,7 @@ X86_SIMD_SORT_FINLINE void sort_32_32bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_64_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_32bit(type_t *arr, int32_t N)
 {
     if (N <= 32) {
         sort_32_32bit(arr, N);
@@ -540,7 +540,7 @@ X86_SIMD_SORT_FINLINE void sort_64_32bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_128_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_32bit(type_t *arr, int32_t N)
 {
     if (N <= 64) {
         sort_64_32bit(arr, N);
@@ -592,9 +592,9 @@ X86_SIMD_SORT_FINLINE void sort_128_32bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE type_t get_pivot_32bit(type_t *arr,
-                                             const int64_t left,
-                                             const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr,
+                                            const int64_t left,
+                                            const int64_t right)
 {
     // median of 16
     int64_t size = (right - left) / 16;
@@ -656,7 +656,7 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
         qsort_32bit_(arr, pivot_index, right, max_iters - 1);
 }
 
-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
 {
     int64_t nan_count = 0;
     __mmask16 loadmask = 0xFFFF;
@@ -672,7 +672,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
     return nan_count;
 }
 
-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
 replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count)
 {
     for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
diff --git a/src/avx512-64bit-qsort.hpp b/src/avx512-64bit-qsort.hpp
index 0c26afdb..7e8db546 100644
--- a/src/avx512-64bit-qsort.hpp
+++ b/src/avx512-64bit-qsort.hpp
@@ -330,7 +330,7 @@ struct zmm_vector {
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
  */
 template 
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     zmm = cmp_merge(
@@ -353,7 +353,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm)
 
 // Assumes zmm is bitonic and performs a recursive half cleaner
 template 
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
 {
     // 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7
@@ -374,7 +374,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
 
 // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
 template 
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
@@ -389,7 +389,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
 // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
 // half cleaner
 template 
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     // 1) First step of a merging network
@@ -411,7 +411,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]);
@@ -445,7 +445,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]);
@@ -519,7 +519,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_8_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_8_64bit(type_t *arr, int32_t N)
 {
     typename vtype::opmask_t load_mask = (0x01 << N) - 0x01;
     typename vtype::zmm_t zmm
@@ -528,7 +528,7 @@ X86_SIMD_SORT_FINLINE void sort_8_64bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_16_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_16_64bit(type_t *arr, int32_t N)
 {
     if (N <= 8) {
         sort_8_64bit(arr, N);
@@ -546,7 +546,7 @@ X86_SIMD_SORT_FINLINE void sort_16_64bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_32_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_64bit(type_t *arr, int32_t N)
 {
     if (N <= 16) {
         sort_16_64bit(arr, N);
@@ -577,7 +577,7 @@ X86_SIMD_SORT_FINLINE void sort_32_64bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_64_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_64bit(type_t *arr, int32_t N)
 {
     if (N <= 32) {
         sort_32_64bit(arr, N);
@@ -628,7 +628,7 @@ X86_SIMD_SORT_FINLINE void sort_64_64bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE void sort_128_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_64bit(type_t *arr, int32_t N)
 {
     if (N <= 64) {
         sort_64_64bit(arr, N);
@@ -718,9 +718,9 @@ X86_SIMD_SORT_FINLINE void sort_128_64bit(type_t *arr, int32_t N)
 }
 
 template 
-X86_SIMD_SORT_FINLINE type_t get_pivot_64bit(type_t *arr,
-                                             const int64_t left,
-                                             const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr,
+                                            const int64_t left,
+                                            const int64_t right)
 {
     // median of 8
     int64_t size = (right - left) / 8;
@@ -769,7 +769,7 @@ qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
         qsort_64bit_(arr, pivot_index, right, max_iters - 1);
 }
 
-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
 {
     int64_t nan_count = 0;
     __mmask8 loadmask = 0xFF;
@@ -785,7 +785,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
     return nan_count;
 }
 
-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
 replace_inf_with_nan(double *arr, int64_t arrsize, int64_t nan_count)
 {
     for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
diff --git a/src/avx512-common-qsort.h b/src/avx512-common-qsort.h
index 6d34c691..d1f6cbb4 100644
--- a/src/avx512-common-qsort.h
+++ b/src/avx512-common-qsort.h
@@ -64,10 +64,20 @@
 #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
 
 #ifdef _MSC_VER
+#define X86_SIMD_SORT_INLINE static inline
 #define X86_SIMD_SORT_FINLINE static __forceinline
+#elif defined(__CYGWIN__)
+/*
+ * Force inline in cygwin to work around a compiler bug. See
+ * https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584
+ */
+#define X86_SIMD_SORT_INLINE static __attribute__((always_inline))
+#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
 #elif defined(__GNUC__)
-#define X86_SIMD_SORT_FINLINE static inline //__attribute__((always_inline))
+#define X86_SIMD_SORT_INLINE static inline
+#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
 #else
+#define X86_SIMD_SORT_INLINE static
 #define X86_SIMD_SORT_FINLINE static
 #endif
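
Reviewer note (not part of the patch): the short sketch below, assuming a GCC-compatible compiler, illustrates the distinction the macro change relies on: a plain static inline helper that the optimizer is free to leave as an out-of-line call, versus an always_inline helper that the compiler must inline. On Cygwin the patch maps both X86_SIMD_SORT_INLINE and X86_SIMD_SORT_FINLINE to the forced form to work around the compiler bug linked in the comment above; on other GCC-like compilers only the FINLINE variant forces inlining. The helpers hinted_add and forced_add are hypothetical names used only for this illustration.

    #include <stdio.h>

    /* Ordinary inlining hint: the optimizer may or may not inline this. */
    static inline int hinted_add(int a, int b)
    {
        return a + b;
    }

    /* Forced inlining: GCC/Clang inline this even at -O0; this is the form the
     * Cygwin branch of the patch selects for both macros. */
    static inline __attribute__((always_inline)) int forced_add(int a, int b)
    {
        return a + b;
    }

    int main(void)
    {
        printf("%d %d\n", hinted_add(1, 2), forced_add(3, 4));
        return 0;
    }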