
Merge pull request #4 from r-devulap/cygwin-bug
Force inline on cygwin only
r-devulap committed Jan 31, 2023
2 parents 0f1023b + c3be276 commit 7d7591c
Showing 4 changed files with 54 additions and 44 deletions.
26 changes: 13 additions & 13 deletions src/avx512-16bit-qsort.hpp
@@ -374,7 +374,7 @@ struct zmm_vector<uint16_t> {
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
*/
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_16bit(zmm_t zmm)
{
// Level 1
zmm = cmp_merge<vtype>(
@@ -434,7 +434,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm)

// Assumes zmm is bitonic and performs a recursive half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)
{
// 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc ..
zmm = cmp_merge<vtype>(
@@ -460,7 +460,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)

// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
{
// 1) First step of a merging network: coex of zmm1 and zmm2 reversed
zmm2 = vtype::permutexvar(vtype::get_network(4), zmm2);
@@ -474,7 +474,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
// half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
{
zmm_t zmm2r = vtype::permutexvar(vtype::get_network(4), zmm[2]);
zmm_t zmm3r = vtype::permutexvar(vtype::get_network(4), zmm[3]);
@@ -495,7 +495,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_32_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_16bit(type_t *arr, int32_t N)
{
typename vtype::opmask_t load_mask = ((0x1ull << N) - 0x1ull) & 0xFFFFFFFF;
typename vtype::zmm_t zmm
@@ -504,7 +504,7 @@ X86_SIMD_SORT_FINLINE void sort_32_16bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_64_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_16bit(type_t *arr, int32_t N)
{
if (N <= 32) {
sort_32_16bit<vtype>(arr, N);
@@ -523,7 +523,7 @@ X86_SIMD_SORT_FINLINE void sort_64_16bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_128_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_16bit(type_t *arr, int32_t N)
{
if (N <= 64) {
sort_64_16bit<vtype>(arr, N);
@@ -556,9 +556,9 @@ X86_SIMD_SORT_FINLINE void sort_128_16bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE type_t get_pivot_16bit(type_t *arr,
-const int64_t left,
-const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_16bit(type_t *arr,
+const int64_t left,
+const int64_t right)
{
// median of 32
int64_t size = (right - left) / 32;
@@ -657,8 +657,8 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
qsort_16bit_<vtype>(arr, pivot_index, right, max_iters - 1);
}

-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(uint16_t *arr,
-int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(uint16_t *arr,
+int64_t arrsize)
{
int64_t nan_count = 0;
__mmask16 loadmask = 0xFFFF;
@@ -676,7 +676,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(uint16_t *arr,
return nan_count;
}

-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count)
{
for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
28 changes: 14 additions & 14 deletions src/avx512-32bit-qsort.hpp
@@ -336,7 +336,7 @@ struct zmm_vector<float> {
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
*/
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_32bit(zmm_t zmm)
{
zmm = cmp_merge<vtype>(
zmm,
@@ -383,7 +383,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm)

// Assumes zmm is bitonic and performs a recursive half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)
{
// 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
zmm = cmp_merge<vtype>(
@@ -410,7 +410,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)

// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
{
// 1) First step of a merging network: coex of zmm1 and zmm2 reversed
*zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2);
@@ -424,7 +424,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
// half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
{
zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]);
zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]);
@@ -445,7 +445,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
}

template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
{
zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]);
zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]);
@@ -482,7 +482,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_16_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_16_32bit(type_t *arr, int32_t N)
{
typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001;
typename vtype::zmm_t zmm
@@ -491,7 +491,7 @@ X86_SIMD_SORT_FINLINE void sort_16_32bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_32_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_32bit(type_t *arr, int32_t N)
{
if (N <= 16) {
sort_16_32bit<vtype>(arr, N);
@@ -509,7 +509,7 @@ X86_SIMD_SORT_FINLINE void sort_32_32bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_64_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_32bit(type_t *arr, int32_t N)
{
if (N <= 32) {
sort_32_32bit<vtype>(arr, N);
@@ -540,7 +540,7 @@ X86_SIMD_SORT_FINLINE void sort_64_32bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_128_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_32bit(type_t *arr, int32_t N)
{
if (N <= 64) {
sort_64_32bit<vtype>(arr, N);
@@ -592,9 +592,9 @@ X86_SIMD_SORT_FINLINE void sort_128_32bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE type_t get_pivot_32bit(type_t *arr,
-const int64_t left,
-const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr,
+const int64_t left,
+const int64_t right)
{
// median of 16
int64_t size = (right - left) / 16;
@@ -656,7 +656,7 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
qsort_32bit_<vtype>(arr, pivot_index, right, max_iters - 1);
}

-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
{
int64_t nan_count = 0;
__mmask16 loadmask = 0xFFFF;
@@ -672,7 +672,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
return nan_count;
}

-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count)
{
for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
32 changes: 16 additions & 16 deletions src/avx512-64bit-qsort.hpp
@@ -330,7 +330,7 @@ struct zmm_vector<double> {
* https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
*/
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm)
{
const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
zmm = cmp_merge<vtype>(
@@ -353,7 +353,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm)

// Assumes zmm is bitonic and performs a recursive half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
{

// 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7
@@ -374,7 +374,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)

// Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
{
const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
// 1) First step of a merging network: coex of zmm1 and zmm2 reversed
@@ -389,7 +389,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
// Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
// half cleaner
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
{
const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
// 1) First step of a merging network
@@ -411,7 +411,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
}

template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
{
const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]);
@@ -445,7 +445,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
}

template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
{
const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]);
@@ -519,7 +519,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_8_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_8_64bit(type_t *arr, int32_t N)
{
typename vtype::opmask_t load_mask = (0x01 << N) - 0x01;
typename vtype::zmm_t zmm
@@ -528,7 +528,7 @@ X86_SIMD_SORT_FINLINE void sort_8_64bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_16_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_16_64bit(type_t *arr, int32_t N)
{
if (N <= 8) {
sort_8_64bit<vtype>(arr, N);
@@ -546,7 +546,7 @@ X86_SIMD_SORT_FINLINE void sort_16_64bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_32_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_64bit(type_t *arr, int32_t N)
{
if (N <= 16) {
sort_16_64bit<vtype>(arr, N);
@@ -577,7 +577,7 @@ X86_SIMD_SORT_FINLINE void sort_32_64bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_64_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_64bit(type_t *arr, int32_t N)
{
if (N <= 32) {
sort_32_64bit<vtype>(arr, N);
@@ -628,7 +628,7 @@ X86_SIMD_SORT_FINLINE void sort_64_64bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_128_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_64bit(type_t *arr, int32_t N)
{
if (N <= 64) {
sort_64_64bit<vtype>(arr, N);
@@ -718,9 +718,9 @@ X86_SIMD_SORT_FINLINE void sort_128_64bit(type_t *arr, int32_t N)
}

template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE type_t get_pivot_64bit(type_t *arr,
-const int64_t left,
-const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr,
+const int64_t left,
+const int64_t right)
{
// median of 8
int64_t size = (right - left) / 8;
@@ -769,7 +769,7 @@ qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
qsort_64bit_<vtype>(arr, pivot_index, right, max_iters - 1);
}

-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
{
int64_t nan_count = 0;
__mmask8 loadmask = 0xFF;
@@ -785,7 +785,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
return nan_count;
}

-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
replace_inf_with_nan(double *arr, int64_t arrsize, int64_t nan_count)
{
for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
12 changes: 11 additions & 1 deletion src/avx512-common-qsort.h
@@ -64,10 +64,20 @@
#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d

#ifdef _MSC_VER
+#define X86_SIMD_SORT_INLINE static inline
#define X86_SIMD_SORT_FINLINE static __forceinline
+#elif defined(__CYGWIN__)
+/*
+ * Force inline in cygwin to work around a compiler bug. See
+ * https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584
+ */
+#define X86_SIMD_SORT_INLINE static __attribute__((always_inline))
+#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
#elif defined(__GNUC__)
-#define X86_SIMD_SORT_FINLINE static inline //__attribute__((always_inline))
+#define X86_SIMD_SORT_INLINE static inline
+#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
#else
+#define X86_SIMD_SORT_INLINE static
#define X86_SIMD_SORT_FINLINE static
#endif

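For reference, here is a minimal sketch (not part of the commit) of how a declaration written with these macros, such as sort_zmm_16bit from avx512-16bit-qsort.hpp, resolves under each branch of the block above; the expansions are worked out by hand from the #define lines:

// Illustrative sketch only; the expansions below are derived from the macros above.
template <typename vtype, typename zmm_t = typename vtype::zmm_t>
X86_SIMD_SORT_INLINE zmm_t sort_zmm_16bit(zmm_t zmm);

// MSVC:             static inline zmm_t sort_zmm_16bit(zmm_t zmm);
// Cygwin:           static __attribute__((always_inline)) zmm_t sort_zmm_16bit(zmm_t zmm);
// GCC (non-Cygwin): static inline zmm_t sort_zmm_16bit(zmm_t zmm);
// other compilers:  static zmm_t sort_zmm_16bit(zmm_t zmm);

Only the __CYGWIN__ branch forces inlining of the X86_SIMD_SORT_INLINE helpers, which is what the commit title "Force inline on cygwin only" refers to; on non-Cygwin GCC they remain plain static inline, while X86_SIMD_SORT_FINLINE now uses __attribute__((always_inline)) in place of the previously commented-out attribute.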
