From ff7d24647839d4fa956836a8ef4a58682988a51c Mon Sep 17 00:00:00 2001
From: wangzhen38 <41941775+wangzhen38@users.noreply.github.com>
Date: Tue, 21 Jun 2022 19:03:02 +0800
Subject: [PATCH] cpplint fix 3 (#43679)

* cpplint fix 3

* cpplint fix 3

* cpplint fix 3

* cpplint fix 3
---
 paddle/phi/kernels/funcs/detail/avx_mathfun.h | 285 ++++++++++--------
 1 file changed, 160 insertions(+), 125 deletions(-)

diff --git a/paddle/phi/kernels/funcs/detail/avx_mathfun.h b/paddle/phi/kernels/funcs/detail/avx_mathfun.h
index 75e4922648c20..90017f3c760f4 100644
--- a/paddle/phi/kernels/funcs/detail/avx_mathfun.h
+++ b/paddle/phi/kernels/funcs/detail/avx_mathfun.h
@@ -41,7 +41,7 @@
   (this is the zlib license)
 */
 
-
+#pragma once
 #include "paddle/fluid/platform/cpu_info.h"
 
 /* __m128 is ugly to write */
@@ -134,7 +134,7 @@ typedef union imm_xmm_union {
     return (ret); \
   }
 
-//#warning "Using SSE2 to perform AVX2 bitshift ops"
+// #warning "Using SSE2 to perform AVX2 bitshift ops"
 AVX2_BITOP_USING_SSE2(slli_epi32)
 AVX2_BITOP_USING_SSE2(srli_epi32)
 
@@ -152,7 +152,7 @@ AVX2_BITOP_USING_SSE2(srli_epi32)
     return (ret); \
   }
 
-//#warning "Using SSE2 to perform AVX2 integer ops"
+// #warning "Using SSE2 to perform AVX2 integer ops"
 AVX2_INTOP_USING_SSE2(and_si128)
 AVX2_INTOP_USING_SSE2(andnot_si128)
 AVX2_INTOP_USING_SSE2(cmpeq_epi32)
@@ -175,23 +175,24 @@ AVX2_INTOP_USING_SSE2(add_epi32)
 */
 v8sf log256_ps(v8sf x) {
   v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
 
   // v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
   v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);
 
-  x = _mm256_max_ps(
-      x, *(v8sf *)_ps256_min_norm_pos); /* cut off denormalized stuff */
+  /* cut off denormalized stuff */
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_min_norm_pos));
 
   // can be done with AVX2
   imm0 = avx2_mm256_srli_epi32(_mm256_castps_si256(x), 23);
 
   /* keep only the fractional part */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_mant_mask);
-  x = _mm256_or_ps(x, *(v8sf *)_ps256_0p5);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_mant_mask));
+  x = _mm256_or_ps(x, *reinterpret_cast<const v8sf *>(_ps256_0p5));
 
   // this is again another AVX2 instruction
-  imm0 = avx2_mm256_sub_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_sub_epi32(imm0,
+                              *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
   v8sf e = _mm256_cvtepi32_ps(imm0);
 
   e = _mm256_add_ps(e, one);
@@ -203,7 +204,8 @@ v8sf log256_ps(v8sf x) {
   } else { x = x - 1.0; }
   */
   // v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
-  v8sf mask = _mm256_cmp_ps(x, *(v8sf *)_ps256_cephes_SQRTHF, _CMP_LT_OS);
+  v8sf mask = _mm256_cmp_ps(
+      x, *reinterpret_cast<const v8sf *>(_ps256_cephes_SQRTHF), _CMP_LT_OS);
   v8sf tmp = _mm256_and_ps(x, mask);
   x = _mm256_sub_ps(x, one);
   e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
@@ -211,34 +213,34 @@ v8sf log256_ps(v8sf x) {
 
   v8sf z = _mm256_mul_ps(x, x);
 
-  v8sf y = *(v8sf *)_ps256_cephes_log_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p0);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p1));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p2));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p3));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p4));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p5));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p6);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p6));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p7);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p7));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_log_p8);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_p8));
   y = _mm256_mul_ps(y, x);
 
   y = _mm256_mul_ps(y, z);
 
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q1);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q1));
   y = _mm256_add_ps(y, tmp);
 
-  tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
 
-  tmp = _mm256_mul_ps(e, *(v8sf *)_ps256_cephes_log_q2);
+  tmp = _mm256_mul_ps(e, *reinterpret_cast<const v8sf *>(_ps256_cephes_log_q2));
   x = _mm256_add_ps(x, y);
   x = _mm256_add_ps(x, tmp);
   x = _mm256_or_ps(x, invalid_mask);  // negative arg will be NAN
@@ -262,14 +264,14 @@ _PS256_CONST(cephes_exp_p5, 5.0000001201E-1);
 v8sf exp256_ps(v8sf x) {
   v8sf tmp = _mm256_setzero_ps(), fx;
   v8si imm0;
-  v8sf one = *(v8sf *)_ps256_1;
+  v8sf one = *reinterpret_cast<const v8sf *>(_ps256_1);
 
-  x = _mm256_min_ps(x, *(v8sf *)_ps256_exp_hi);
-  x = _mm256_max_ps(x, *(v8sf *)_ps256_exp_lo);
+  x = _mm256_min_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_hi));
+  x = _mm256_max_ps(x, *reinterpret_cast<const v8sf *>(_ps256_exp_lo));
 
   /* express exp(x) as exp(g + n*log(2)) */
-  fx = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_LOG2EF);
-  fx = _mm256_add_ps(fx, *(v8sf *)_ps256_0p5);
+  fx = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_LOG2EF));
+  fx = _mm256_add_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_0p5));
 
   /* how to perform a floorf with SSE: just below */
   // imm0 = _mm256_cvttps_epi32(fx);
@@ -283,24 +285,26 @@ v8sf exp256_ps(v8sf x) {
   mask = _mm256_and_ps(mask, one);
   fx = _mm256_sub_ps(tmp, mask);
 
-  tmp = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C1);
-  v8sf z = _mm256_mul_ps(fx, *(v8sf *)_ps256_cephes_exp_C2);
+  tmp =
+      _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C1));
+  v8sf z =
+      _mm256_mul_ps(fx, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_C2));
   x = _mm256_sub_ps(x, tmp);
   x = _mm256_sub_ps(x, z);
 
   z = _mm256_mul_ps(x, x);
 
-  v8sf y = *(v8sf *)_ps256_cephes_exp_p0;
+  v8sf y = *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p0);
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p1));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p2));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p3);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p3));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p4);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p4));
   y = _mm256_mul_ps(y, x);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_cephes_exp_p5);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_cephes_exp_p5));
   y = _mm256_mul_ps(y, z);
   y = _mm256_add_ps(y, x);
   y = _mm256_add_ps(y, one);
@@ -308,7 +312,8 @@ v8sf exp256_ps(v8sf x) {
   /* build 2^n */
   imm0 = _mm256_cvttps_epi32(fx);
   // another two AVX2 instructions
-  imm0 = avx2_mm256_add_epi32(imm0, *(v8si *)_pi32_256_0x7f);
+  imm0 = avx2_mm256_add_epi32(imm0,
+                              *reinterpret_cast<const v8si *>(_pi32_256_0x7f));
   imm0 = avx2_mm256_slli_epi32(imm0, 23);
   v8sf pow2n = _mm256_castsi256_ps(imm0);
   y = _mm256_mul_ps(y, pow2n);
@@ -349,12 +354,13 @@ v8sf sin256_ps(v8sf x) {  // any x
 
   sign_bit = x;
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* extract the sign bit (upper one) */
-  sign_bit = _mm256_and_ps(sign_bit, *(v8sf *)_ps256_sign_mask);
+  sign_bit = _mm256_and_ps(sign_bit,
+                           *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
 
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 
   /*
     Here we start a series of integer operations, which are in the
@@ -367,12 +373,15 @@ v8sf sin256_ps(v8sf x) {  // any x
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   // another two AVX2 instruction
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 =
+      avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2,
+                              *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
 
   /* get the swap sign flag */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   /* get the polynom selection mask
     there is one polynom for 0 <= x <= Pi/4
@@ -380,31 +389,35 @@ v8sf sin256_ps(v8sf x) {  // any x
 
     Both branches will be computed.
   */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2,
+                                *reinterpret_cast<const v8si *>(_pi32_256_0));
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
 
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 =
+      _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 =
+      _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
 
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
 
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
 
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
 
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
 
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -418,9 +431,9 @@ v8sf sin256_ps(v8sf x) {  // any x
 
   /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
@@ -429,26 +442,26 @@ v8sf sin256_ps(v8sf x) {  // any x
   x = _mm256_add_ps(x, xmm3);
 
   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   v8sf z = _mm256_mul_ps(x, x);
 
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
 
   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
@@ -475,53 +488,63 @@ v8sf cos256_ps(v8sf x) {  // any x
 #endif
 
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
 
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 
 #ifdef __AVX2__
   /* store the integer part of y in mm0 */
   imm2 = _mm256_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 =
+      avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2,
+                              *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
-  imm2 = avx2_mm256_sub_epi32(imm2, *(v8si *)_pi32_256_2);
+  imm2 =
+      avx2_mm256_sub_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
 
   /* get the swap sign flag */
-  imm0 = avx2_mm256_andnot_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 = avx2_mm256_andnot_si256(imm2,
+                                 *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   /* get the polynom selection mask */
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2,
+                                *reinterpret_cast<const v8si *>(_pi32_256_0));
 #else
 
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
 
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 =
+      _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 =
+      _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
 
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
 
-  imm2_1 = _mm_sub_epi32(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_sub_epi32(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_sub_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_sub_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
 
-  imm0_1 = _mm_andnot_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_andnot_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 =
+      _mm_andnot_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 =
+      _mm_andnot_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
 
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
 
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
 
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -534,9 +557,9 @@ v8sf cos256_ps(v8sf x) {  // any x
 
   /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
@@ -545,26 +568,26 @@ v8sf cos256_ps(v8sf x) {  // any x
   x = _mm256_add_ps(x, xmm3);
 
   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
   v8sf z = _mm256_mul_ps(x, x);
 
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
 
   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);
@@ -595,42 +618,50 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
 
   sign_bit_sin = x;
   /* take the absolute value */
-  x = _mm256_and_ps(x, *(v8sf *)_ps256_inv_sign_mask);
+  x = _mm256_and_ps(x, *reinterpret_cast<const v8sf *>(_ps256_inv_sign_mask));
   /* extract the sign bit (upper one) */
-  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf *)_ps256_sign_mask);
+  sign_bit_sin = _mm256_and_ps(
+      sign_bit_sin, *reinterpret_cast<const v8sf *>(_ps256_sign_mask));
 
   /* scale by 4/Pi */
-  y = _mm256_mul_ps(x, *(v8sf *)_ps256_cephes_FOPI);
+  y = _mm256_mul_ps(x, *reinterpret_cast<const v8sf *>(_ps256_cephes_FOPI));
 
#ifdef __AVX2__
   /* store the integer part of y in imm2 */
   imm2 = _mm256_cvttps_epi32(y);
 
   /* j=(j+1) & (~1) (see the cephes sources) */
-  imm2 = avx2_mm256_add_epi32(imm2, *(v8si *)_pi32_256_1);
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_inv1);
+  imm2 =
+      avx2_mm256_add_epi32(imm2, *reinterpret_cast<const v8si *>(_pi32_256_1));
+  imm2 = avx2_mm256_and_si256(imm2,
+                              *reinterpret_cast<const v8si *>(_pi32_256_inv1));
   y = _mm256_cvtepi32_ps(imm2);
 
   imm4 = imm2;
 
   /* get the swap sign flag for the sine */
-  imm0 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_4);
+  imm0 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm0 = avx2_mm256_slli_epi32(imm0, 29);
   // v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
 
   /* get the polynom selection mask for the sine*/
-  imm2 = avx2_mm256_and_si256(imm2, *(v8si *)_pi32_256_2);
-  imm2 = avx2_mm256_cmpeq_epi32(imm2, *(v8si *)_pi32_256_0);
+  imm2 =
+      avx2_mm256_and_si256(imm2, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm2 = avx2_mm256_cmpeq_epi32(imm2,
+                                *reinterpret_cast<const v8si *>(_pi32_256_0));
   // v8sf poly_mask = _mm256_castsi256_ps(imm2);
 #else
   /* we use SSE2 routines to perform the integer ops */
   COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);
 
-  imm2_1 = _mm_add_epi32(imm2_1, *(v4si *)_pi32avx_1);
-  imm2_2 = _mm_add_epi32(imm2_2, *(v4si *)_pi32avx_1);
+  imm2_1 = _mm_add_epi32(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_1));
+  imm2_2 = _mm_add_epi32(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_1));
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_inv1);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_inv1);
+  imm2_1 =
+      _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
+  imm2_2 =
+      _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_inv1));
 
   COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
   y = _mm256_cvtepi32_ps(imm2);
@@ -638,16 +669,16 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   imm4_1 = imm2_1;
   imm4_2 = imm2_2;
 
-  imm0_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_4);
-  imm0_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_4);
+  imm0_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm0_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
 
   imm0_1 = _mm_slli_epi32(imm0_1, 29);
   imm0_2 = _mm_slli_epi32(imm0_2, 29);
 
   COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);
 
-  imm2_1 = _mm_and_si128(imm2_1, *(v4si *)_pi32avx_2);
-  imm2_2 = _mm_and_si128(imm2_2, *(v4si *)_pi32avx_2);
+  imm2_1 = _mm_and_si128(imm2_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm2_2 = _mm_and_si128(imm2_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
 
   imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
   imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());
@@ -659,9 +690,9 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
 
   /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = *(v8sf *)_ps256_minus_cephes_DP1;
-  xmm2 = *(v8sf *)_ps256_minus_cephes_DP2;
-  xmm3 = *(v8sf *)_ps256_minus_cephes_DP3;
+  xmm1 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP1);
+  xmm2 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP2);
+  xmm3 = *reinterpret_cast<const v8sf *>(_ps256_minus_cephes_DP3);
   xmm1 = _mm256_mul_ps(y, xmm1);
   xmm2 = _mm256_mul_ps(y, xmm2);
   xmm3 = _mm256_mul_ps(y, xmm3);
@@ -670,15 +701,19 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
   x = _mm256_add_ps(x, xmm3);
 
 #ifdef __AVX2__
-  imm4 = avx2_mm256_sub_epi32(imm4, *(v8si *)_pi32_256_2);
-  imm4 = avx2_mm256_andnot_si256(imm4, *(v8si *)_pi32_256_4);
+  imm4 =
+      avx2_mm256_sub_epi32(imm4, *reinterpret_cast<const v8si *>(_pi32_256_2));
+  imm4 = avx2_mm256_andnot_si256(imm4,
+                                 *reinterpret_cast<const v8si *>(_pi32_256_4));
   imm4 = avx2_mm256_slli_epi32(imm4, 29);
 #else
-  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si *)_pi32avx_2);
-  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si *)_pi32avx_2);
+  imm4_1 = _mm_sub_epi32(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_2));
+  imm4_2 = _mm_sub_epi32(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_2));
 
-  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si *)_pi32avx_4);
-  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si *)_pi32avx_4);
+  imm4_1 =
+      _mm_andnot_si128(imm4_1, *reinterpret_cast<const v4si *>(_pi32avx_4));
+  imm4_2 =
+      _mm_andnot_si128(imm4_2, *reinterpret_cast<const v4si *>(_pi32avx_4));
 
   imm4_1 = _mm_slli_epi32(imm4_1, 29);
   imm4_2 = _mm_slli_epi32(imm4_2, 29);
 
@@ -692,25 +727,25 @@ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
 
   /* Evaluate the first polynom  (0 <= x <= Pi/4) */
   v8sf z = _mm256_mul_ps(x, x);
-  y = *(v8sf *)_ps256_coscof_p0;
+  y = *reinterpret_cast<const v8sf *>(_ps256_coscof_p0);
 
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p1));
   y = _mm256_mul_ps(y, z);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_coscof_p2);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_coscof_p2));
   y = _mm256_mul_ps(y, z);
   y = _mm256_mul_ps(y, z);
-  v8sf tmp = _mm256_mul_ps(z, *(v8sf *)_ps256_0p5);
+  v8sf tmp = _mm256_mul_ps(z, *reinterpret_cast<const v8sf *>(_ps256_0p5));
   y = _mm256_sub_ps(y, tmp);
-  y = _mm256_add_ps(y, *(v8sf *)_ps256_1);
+  y = _mm256_add_ps(y, *reinterpret_cast<const v8sf *>(_ps256_1));
 
   /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
 
-  v8sf y2 = *(v8sf *)_ps256_sincof_p0;
+  v8sf y2 = *reinterpret_cast<const v8sf *>(_ps256_sincof_p0);
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p1);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p1));
   y2 = _mm256_mul_ps(y2, z);
-  y2 = _mm256_add_ps(y2, *(v8sf *)_ps256_sincof_p2);
+  y2 = _mm256_add_ps(y2, *reinterpret_cast<const v8sf *>(_ps256_sincof_p2));
   y2 = _mm256_mul_ps(y2, z);
   y2 = _mm256_mul_ps(y2, x);
   y2 = _mm256_add_ps(y2, x);