From ff7d24647839d4fa956836a8ef4a58682988a51c Mon Sep 17 00:00:00 2001
From: wangzhen38 <41941775+wangzhen38@users.noreply.github.com>
Date: Tue, 21 Jun 2022 19:03:02 +0800
Subject: [PATCH] cpplint fix 3 (#43679)

* cpplint fix 3

* cpplint fix 3

* cpplint fix 3

* cpplint fix 3
---
 paddle/phi/kernels/funcs/detail/avx_mathfun.h | 285 ++++++++++--------
 1 file changed, 160 insertions(+), 125 deletions(-)

diff --git a/paddle/phi/kernels/funcs/detail/avx_mathfun.h b/paddle/phi/kernels/funcs/detail/avx_mathfun.h
index 75e4922648c20..90017f3c760f4 100644
--- a/paddle/phi/kernels/funcs/detail/avx_mathfun.h
+++ b/paddle/phi/kernels/funcs/detail/avx_mathfun.h
@@ -41,7 +41,7 @@
   (this is the zlib license)
 */
 
-
+#pragma once
 #include "paddle/fluid/platform/cpu_info.h"
 
 /* __m128 is ugly to write */
@@ -134,7 +134,7 @@ typedef union imm_xmm_union {
     return (ret); \
   }
 
-//#warning "Using SSE2 to perform AVX2 bitshift ops"
+// #warning "Using SSE2 to perform AVX2 bitshift ops"
 AVX2_BITOP_USING_SSE2(slli_epi32)
 AVX2_BITOP_USING_SSE2(srli_epi32)
 
@@ -152,7 +152,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32)
     return (ret); \
   }
 
-//#warning "Using SSE2 to perform AVX2 integer ops"
+// #warning "Using SSE2 to perform AVX2 integer ops"
 AVX2_INTOP_USING_SSE2(and_si128)
 AVX2_INTOP_USING_SSE2(andnot_si128)
 AVX2_INTOP_USING_SSE2(cmpeq_epi32)
@@ -175,23 +175,24 @@ AVX2_INTOP_USING_SSE2(add_epi32)
 */
 v8sf log256_ps(v8sf x) {
   v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
 
   // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
   v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
 
-  x = _mm256_max_ps(
-      x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
+  /* cut off denormalized stuff */
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_min_norm_pos));
 
   // can be done with AVX2
   imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
 
   /* keep only the fractional part */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
-  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_mant_mask));
+  x = _mm256_or_ps(x, *reinterpret_cast<const v8sf *>(_ps256_0p5));
 
   // this is again another AVX2 instruction
-  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_sub_epi32(imm0,
+                              *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
   v8sf e = _mm256_cvtepi32_ps(imm0);
 
   e = _mm256_add_ps(e, one);
@@ -203,7 +204,8 @@ v8sf log256_ps(v8sf x) {
   } else { x = x - 1.0; }
   */
   // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
-  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  v8sf mask = _mm256_cmp_ps(
+      x, *reinterpret_cast<const v8sf *>(_ps256_cephes_SQRTHF), _CMP_LT_OS);
   v8sf tmp = _mm256_and_ps(x, mask);
   x = _mm256_sub_ps(x, one);
   e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
@@ -211,34 +213,34 @@ v8sf log256_ps(v8sf x) {
 
   v8sf z = _mm256_mul_ps(x, x);
 
-  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p0);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p1));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p2));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p3));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p4));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p5));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p6));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p7));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p8));
   y = _mm256_mul_ps(y, x);
 
   y = _mm256_mul_ps(y, z);
 
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q1));
   y = _mm256_add_ps(y, tmp);
 
-  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
 
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q2));
   x = _mm256_add_ps(x, y);
   x = _mm256_add_ps(x, tmp);
   x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
@@ -262,14 +264,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
 v8sf exp256_ps(v8sf x) {
   v8sf tmp = _mm256_setzero_ps(), fx;
   v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
 
-  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
+  x = _mm256_min_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_hi));
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_lo));
 
   /* express exp(x) as exp(g + n*log(2)) */
-  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
-  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
+  fx = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_LOG2EF));
+  fx = _mm256_add_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_0p5));
 
   /* how to perform a floorf with SSE: just below */
   // imm0 = _mm256_cvttps_epi32(fx);
@@ -283,24 +285,26 @@ v8sf exp256_ps(v8sf x) {
   mask = _mm256_and_ps(mask, one);
   fx = _mm256_sub_ps(tmp, mask);
 
-  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
-  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
+  tmp =
+      _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C1));
+  v8sf z =
+      _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C2));
   x = _mm256_sub_ps(x, tmp);
   x = _mm256_sub_ps(x, z);
 
   z = _mm256_mul_ps(x, x);
 
-  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p0);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p1));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p2));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p3));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p4));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p5));
   y = _mm256_mul_ps(y, z);
   y = _mm256_add_ps(y, x);
   y = _mm256_add_ps(y, one);
@@ -308,7 +312,8 @@ v8sf exp256_ps(v8sf x) {
   /* build 2^n */
   imm0 = _mm256_cvttps_epi32(fx);
   // another two AVX2 instructions
-  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_add_epi32(imm0,
+                              *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
   imm0 = avx2_mm256_slli_epi32(imm0, 23);
   v8sf pow2n = _mm256_castsi256_ps(imm0);
   y = _mm256_mul_ps(y, pow2n);
@@ -349,12 +354,13 @@ v8sf sin256_ps(v8sf x) {  // any x
 
   sign_bit = x;
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* extract the sign bit (upper one) */
-  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+  sign_bit = _mm256_and_ps(sign_bit,
+                           *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
 
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 
   /*
     Here we start a series of integer operations, which are in the
@@ -367,12 +373,15 @@ v8sf sin256_ps(v8sf x) {  // any x
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   // another two AVX2 instruction
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 =
+      avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2,
+                              *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
 
   /* get the swap sign flag */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
@@ -380,31 +389,35 @@ v8sf sin256_ps(v8sf x) {  // any x
 
     Both branches will be computed.
   */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2,
+                                *reinterpret_cast<const v8si *>(_pi32_256_0));
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
 
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 =
+      _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 =
+      _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
 
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
 
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
 
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
 
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
 
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -418,9 +431,9 @@ v8sf sin256_ps(v8sf x) {  // any x
 
   /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
@@ -429,26 +442,26 @@ v8sf sin256_ps(v8sf x) {  // any x
   x = _mm256_add_ps(x, xmm3);
 
   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   v8sf z = _mm256_mul_ps(x, x);
 
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
 
   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
@@ -475,53 +488,63 @@ v8sf cos256_ps(v8sf x) {  // any x
 #endif
 
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
 
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 
 #ifdef __AVX2__
   /* store the integer part of y in mm0 */
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 =
+      avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2,
+                              *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
-  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
+  imm2 =
+      avx2_mm256_sub_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
 
   /* get the swap sign flag */
-  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_andnot_si256(imm2,
+                                 *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   /* get the polynom selection mask */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2,
+                                *reinterpret_cast<const v8si *>(_pi32_256_0));
 #else
 
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
 
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 =
+      _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 =
+      _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
 
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
 
-  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_sub_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_sub_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
 
-  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 =
+      _mm_andnot_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 =
+      _mm_andnot_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
 
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
 
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
 
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -534,9 +557,9 @@ v8sf cos256_ps(v8sf x) {  // any x
 
   /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
@@ -545,26 +568,26 @@ v8sf cos256_ps(v8sf x) {  // any x
   x = _mm256_add_ps(x, xmm3);
 
   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   v8sf z = _mm256_mul_ps(x, x);
 
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
 
   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
@@ -595,42 +618,50 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
 
   sign_bit_sin = x;
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* extract the sign bit (upper one) */
-  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
+  sign_bit_sin = _mm256_and_ps(
+      sign_bit_sin, *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
 
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 
#ifdef __AVX2__
   /* store the integer part of y in imm2 */
   imm2 = _mm256_cvttps_epi32(y);
 
   /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 =
+      avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2,
+                              *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
 
   imm4 = imm2;
 
   /* get the swap sign flag for the sine */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
 
   /* get the polynom selection mask for the sine*/
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2,
+                                *reinterpret_cast<const v8si *>(_pi32_256_0));
   // v8sf poly_mask = _mm256_castsi256_ps(imm2);
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
 
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 =
+      _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 =
+      _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
 
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
@@ -638,16 +669,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   imm4_1 = imm2_1;
   imm4_2 = imm2_2;
 
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
 
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
 
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
 
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -659,9 +690,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
 
   /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
@@ -670,15 +701,19 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   x = _mm256_add_ps(x, xmm3);
 
 #ifdef __AVX2__
-  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
-  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
+  imm4 =
+      avx2_mm256_sub_epi32(imm4, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm4 = avx2_mm256_andnot_si256(imm4,
+                                 *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm4 = avx2_mm256_slli_epi32(imm4, 29);
 #else
-  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
-  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
+  imm4_1 = _mm_sub_epi32(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm4_2 = _mm_sub_epi32(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
 
-  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
-  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
+  imm4_1 =
+      _mm_andnot_si128(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm4_2 =
+      _mm_andnot_si128(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
 
   imm4_1 = _mm_slli_epi32(imm4_1, 29);
   imm4_2 = _mm_slli_epi32(imm4_2, 29);
 
@@ -692,25 +727,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
 
   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
   v8sf z = _mm256_mul_ps(x, x);
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
 
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
 
   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);