From 4281ab99e3e989de0a58dabc418caa48040fad05 Mon Sep 17 00:00:00 2001
From: gnzlbg
Date: Wed, 11 Apr 2018 15:20:53 +0200
Subject: [PATCH] Workarounds for poor LLVM6 code-gen of all/any mask reductions on x86

This commit adds workarounds for the mask reductions: `all` and `any`.

Note:

* not all LLVM bugs have been properly filed yet
* workarounds for 256-bit vector masks on armv7 and aarch64 are not provided in this PR

Before this PR (`x86_64`):

```asm
all_8x8:
 push rbp
 mov rbp, rsp
 movzx eax, byte, ptr, [rdi, +, 7]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 6]
 movd xmm1, eax
 punpcklwd xmm1, xmm0
 movzx eax, byte, ptr, [rdi, +, 5]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 4]
 movd xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx eax, byte, ptr, [rdi, +, 3]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 2]
 movd xmm1, eax
 punpcklwd xmm1, xmm0
 movzx eax, byte, ptr, [rdi, +, 1]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi]
 movd xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 pand xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd xmm0, xmm3, 78
 pand xmm0, xmm3
 pshufd xmm1, xmm0, 229
 pand xmm1, xmm0
 movdqa xmm0, xmm1
 psrld xmm0, 16
 pand xmm0, xmm1
 movd eax, xmm0
 and al, 1
 pop rbp
 ret

any_8x8:
 push rbp
 mov rbp, rsp
 movzx eax, byte, ptr, [rdi, +, 7]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 6]
 movd xmm1, eax
 punpcklwd xmm1, xmm0
 movzx eax, byte, ptr, [rdi, +, 5]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 4]
 movd xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx eax, byte, ptr, [rdi, +, 3]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi, +, 2]
 movd xmm1, eax
 punpcklwd xmm1, xmm0
 movzx eax, byte, ptr, [rdi, +, 1]
 movd xmm0, eax
 movzx eax, byte, ptr, [rdi]
 movd xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 pand xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd xmm0, xmm3, 78
 por xmm0, xmm3
 pshufd xmm1, xmm0, 229
 por xmm1, xmm0
 movdqa xmm0, xmm1
 psrld xmm0, 16
 por xmm0, xmm1
 movd eax, xmm0
 and al, 1
 pop rbp
 ret
```

After this PR for `m8x8`, `m16x4`, `m32x2`:

```asm
all_8x8:
 push rbp
 mov rbp, rsp
 movq mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 cmp eax, 255
 sete al
 pop rbp
 ret

any_8x8:
 push rbp
 mov rbp, rsp
 movq mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 test eax, eax
 setne al
 pop rbp
 ret
```

Before this PR (`i686`):

```asm
all_8x8:
 call L9$pb
L9$pb:
 pop eax
 mov ecx, dword, ptr, [esp, +, 4]
 movzx edx, byte, ptr, [ecx, +, 7]
 movd xmm0, edx
 movzx edx, byte, ptr, [ecx, +, 6]
 movd xmm1, edx
 punpcklwd xmm1, xmm0
 movzx edx, byte, ptr, [ecx, +, 5]
 movd xmm0, edx
 movzx edx, byte, ptr, [ecx, +, 4]
 movd xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx edx, byte, ptr, [ecx, +, 3]
 movd xmm0, edx
 movzx edx, byte, ptr, [ecx, +, 2]
 movd xmm1, edx
 punpcklwd xmm1, xmm0
 movzx edx, byte, ptr, [ecx, +, 1]
 movd xmm0, edx
 movzx ecx, byte, ptr, [ecx]
 movd xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa xmm0, xmmword, ptr, [eax, +, LCPI9_0-L9$pb]
 pand xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd xmm0, xmm3, 78
 pand xmm0, xmm3
 pshufd xmm1, xmm0, 229
 pand xmm1, xmm0
 movdqa xmm0, xmm1
 psrld xmm0, 16
 pand xmm0, xmm1
 movd eax, xmm0
 and al, 1
 ret

any_8x8:
 call L8$pb
L8$pb:
 pop eax
 mov ecx, dword, ptr, [esp, +, 4]
 movzx edx, byte, ptr, [ecx, +, 7]
 movd xmm0, edx
 movzx edx, byte, ptr, [ecx, +, 6]
 movd xmm1, edx
 punpcklwd xmm1, xmm0
 movzx edx, byte, ptr, [ecx, +, 5]
 movd xmm0, edx
 movzx edx, byte, ptr, [ecx, +, 4]
 movd xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx edx, byte, ptr, [ecx, +, 3]
movd xmm0, edx movzx edx, byte, ptr, [ecx, +, 2] movd xmm1, edx punpcklwd xmm1, xmm0 movzx edx, byte, ptr, [ecx, +, 1] movd xmm0, edx movzx ecx, byte, ptr, [ecx] movd xmm3, ecx punpcklwd xmm3, xmm0 punpckldq xmm3, xmm1 punpcklqdq xmm3, xmm2 movdqa xmm0, xmmword, ptr, [eax, +, LCPI8_0-L8$pb] pand xmm3, xmm0 pcmpeqw xmm3, xmm0 pshufd xmm0, xmm3, 78 por xmm0, xmm3 pshufd xmm1, xmm0, 229 por xmm1, xmm0 movdqa xmm0, xmm1 psrld xmm0, 16 por xmm0, xmm1 movd eax, xmm0 and al, 1 ret ``` After this PR: ```asm all_8x8: mov eax, dword, ptr, [esp, +, 4] movq mm0, qword, ptr, [eax] pmovmskb eax, mm0 cmp eax, 255 sete al ret any_8x8: mov eax, dword, ptr, [esp, +, 4] movq mm0, qword, ptr, [eax] pmovmskb eax, mm0 test eax, eax setne al ret ``` Before this PR: ```asm all_8x8: ldr d0, [x0] umov w8, v0.b[0] umov w9, v0.b[1] tst w8, #0xff umov w10, v0.b[2] cset w8, ne tst w9, #0xff cset w9, ne tst w10, #0xff umov w10, v0.b[3] and w8, w8, w9 cset w9, ne tst w10, #0xff umov w10, v0.b[4] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[5] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[6] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[7] and w8, w9, w8 cset w9, ne tst w10, #0xff and w8, w9, w8 cset w9, ne and w0, w9, w8 ret any_8x8: ldr d0, [x0] umov w8, v0.b[0] umov w9, v0.b[1] orr w8, w8, w9 umov w9, v0.b[2] orr w8, w8, w9 umov w9, v0.b[3] orr w8, w8, w9 umov w9, v0.b[4] orr w8, w8, w9 umov w9, v0.b[5] orr w8, w8, w9 umov w9, v0.b[6] orr w8, w8, w9 umov w9, v0.b[7] orr w8, w8, w9 tst w8, #0xff cset w0, ne ret ``` After this PR: ```asm all_8x8: ldr d0, [x0] mov v0.d[1], v0.d[0] uminv b0, v0.16b fmov w8, s0 tst w8, #0xff cset w0, ne ret any_8x8: ldr d0, [x0] mov v0.d[1], v0.d[0] umaxv b0, v0.16b fmov w8, s0 tst w8, #0xff cset w0, ne ret ``` Before this PR: ```asm all_8x8: vmov.i8 d0, #0x1 vldr d1, [r0] vtst.8 d0, d1, d0 vext.8 d1, d0, d0, #4 vand d0, d0, d1 vext.8 d1, d0, d0, #2 vand d0, d0, d1 vdup.8 d1, d0[1] vand d0, d0, d1 vmov.u8 r0, d0[0] and r0, r0, #1 bx lr any_8x8: vmov.i8 d0, #0x1 vldr d1, [r0] vtst.8 d0, d1, d0 vext.8 d1, d0, d0, #4 vorr d0, d0, d1 vext.8 d1, d0, d0, #2 vorr d0, d0, d1 vdup.8 d1, d0[1] vorr d0, d0, d1 vmov.u8 r0, d0[0] and r0, r0, #1 bx lr ``` After this PR: ```asm all_8x8: vldr d0, [r0] b ::all ::all: vpmin.u8 d16, d0, d16 vpmin.u8 d16, d16, d16 vpmin.u8 d0, d16, d16 b m8x8::extract any_8x8: vldr d0, [r0] b ::any ::any: vpmax.u8 d16, d0, d16 vpmax.u8 d16, d16, d16 vpmax.u8 d0, d16, d16 b m8x8::extract ``` (note: inlining does not work properly on ARMv7) Before this PR: ```asm all_8x16: push rbp mov rbp, rsp movdqa xmm0, xmmword, ptr, [rip, +, LCPI9_0] movdqa xmm1, xmmword, ptr, [rdi] pand xmm1, xmm0 pcmpeqb xmm1, xmm0 pmovmskb eax, xmm1 xor ecx, ecx cmp eax, 65535 mov eax, -1 cmovne eax, ecx and al, 1 pop rbp ret any_8x16: push rbp mov rbp, rsp movdqa xmm0, xmmword, ptr, [rip, +, LCPI8_0] movdqa xmm1, xmmword, ptr, [rdi] pand xmm1, xmm0 pcmpeqb xmm1, xmm0 pmovmskb eax, xmm1 neg eax sbb eax, eax and al, 1 pop rbp ret ``` After this PR: ```asm all_8x16: push rbp mov rbp, rsp movdqa xmm0, xmmword, ptr, [rdi] pmovmskb eax, xmm0 cmp eax, 65535 sete al pop rbp ret any_8x16: push rbp mov rbp, rsp movdqa xmm0, xmmword, ptr, [rdi] pmovmskb eax, xmm0 test eax, eax setne al pop rbp ret ``` Before this PR: ```asm all_8x16: ldr q0, [x0] umov w8, v0.b[0] umov w9, v0.b[1] tst w8, #0xff umov w10, v0.b[2] cset w8, ne tst w9, #0xff cset w9, ne tst w10, #0xff umov w10, v0.b[3] and w8, w8, w9 cset w9, ne tst w10, #0xff umov w10, v0.b[4] and w8, w9, w8 cset w9, ne tst w10, #0xff 
umov w10, v0.b[5] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[6] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[7] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[8] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[9] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[10] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[11] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[12] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[13] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[14] and w8, w9, w8 cset w9, ne tst w10, #0xff umov w10, v0.b[15] and w8, w9, w8 cset w9, ne tst w10, #0xff and w8, w9, w8 cset w9, ne and w0, w9, w8 ret any_8x16: ldr q0, [x0] umov w8, v0.b[0] umov w9, v0.b[1] orr w8, w8, w9 umov w9, v0.b[2] orr w8, w8, w9 umov w9, v0.b[3] orr w8, w8, w9 umov w9, v0.b[4] orr w8, w8, w9 umov w9, v0.b[5] orr w8, w8, w9 umov w9, v0.b[6] orr w8, w8, w9 umov w9, v0.b[7] orr w8, w8, w9 umov w9, v0.b[8] orr w8, w8, w9 umov w9, v0.b[9] orr w8, w8, w9 umov w9, v0.b[10] orr w8, w8, w9 umov w9, v0.b[11] orr w8, w8, w9 umov w9, v0.b[12] orr w8, w8, w9 umov w9, v0.b[13] orr w8, w8, w9 umov w9, v0.b[14] orr w8, w8, w9 umov w9, v0.b[15] orr w8, w8, w9 tst w8, #0xff cset w0, ne ret ``` After this PR: ```asm all_8x16: ldr q0, [x0] uminv b0, v0.16b fmov w8, s0 tst w8, #0xff cset w0, ne ret any_8x16: ldr q0, [x0] umaxv b0, v0.16b fmov w8, s0 tst w8, #0xff cset w0, ne ret ``` Before this PR: ```asm all_8x16: vmov.i8 q0, #0x1 vld1.64 {d2, d3}, [r0] vtst.8 q0, q1, q0 vext.8 q1, q0, q0, #8 vand q0, q0, q1 vext.8 q1, q0, q0, #4 vand q0, q0, q1 vext.8 q1, q0, q0, #2 vand q0, q0, q1 vdup.8 q1, d0[1] vand q0, q0, q1 vmov.u8 r0, d0[0] and r0, r0, #1 bx lr any_8x16: vmov.i8 q0, #0x1 vld1.64 {d2, d3}, [r0] vtst.8 q0, q1, q0 vext.8 q1, q0, q0, #8 vorr q0, q0, q1 vext.8 q1, q0, q0, #4 vorr q0, q0, q1 vext.8 q1, q0, q0, #2 vorr q0, q0, q1 vdup.8 q1, d0[1] vorr q0, q0, q1 vmov.u8 r0, d0[0] and r0, r0, #1 bx lr ``` After this PR: ```asm all_8x16: vld1.64 {d0, d1}, [r0] b ::all ::all: vpmin.u8 d0, d0, d b ::all any_8x16: vld1.64 {d0, d1}, [r0] b ::any ::any: vpmax.u8 d0, d0, d1 b ::any ``` The inlining problems are pretty bad on ARMv7 + NEON. 
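Before moving on to the 256-bit masks: on x86 these workarounds ultimately reduce to a single `movemask` plus a scalar compare, which is what the `pmovmskb` / `cmp eax, 65535` / `sete` sequences in the "After this PR" listings above correspond to. As a rough illustration only — this is not the code added by this PR (which goes through the `All`/`Any` helper traits and the portable mask types), and the free-function names here are made up — the idea for a 128-bit byte mask looks like this:

```rust
#[cfg(target_arch = "x86_64")]
mod movemask_sketch {
    use core::arch::x86_64::{__m128i, _mm_movemask_epi8};

    /// `all`: true iff every byte lane of `mask` has its most significant
    /// bit set (mask lanes are either all-ones or all-zeros).
    #[target_feature(enable = "sse2")]
    pub unsafe fn all_8x16(mask: __m128i) -> bool {
        // `_mm_movemask_epi8` packs the MSB of each of the 16 byte lanes
        // into the low 16 bits of an `i32`.
        _mm_movemask_epi8(mask) == 0xffff
    }

    /// `any`: true iff at least one byte lane of `mask` has its MSB set.
    #[target_feature(enable = "sse2")]
    pub unsafe fn any_8x16(mask: __m128i) -> bool {
        _mm_movemask_epi8(mask) != 0
    }
}
```

The 64-bit masks use the analogous MMX `pmovmskb` on an `mm` register, while AArch64 and ARMv7+NEON use horizontal (`uminv`/`umaxv`) or pairwise (`vpmin`/`vpmax`) unsigned min/max reductions, as shown in the listings above and implemented in the diff below.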
Before this PR: ```asm all_8x32: push rbp mov rbp, rsp movdqa xmm0, xmmword, ptr, [rip, +, LCPI17_0] movdqa xmm1, xmmword, ptr, [rdi] pand xmm1, xmm0 movdqa xmm2, xmmword, ptr, [rdi, +, 16] pand xmm2, xmm0 pcmpeqb xmm2, xmm0 pcmpeqb xmm1, xmm0 pand xmm1, xmm2 pmovmskb eax, xmm1 xor ecx, ecx cmp eax, 65535 mov eax, -1 cmovne eax, ecx and al, 1 pop rbp ret any_8x32: push rbp mov rbp, rsp movdqa xmm0, xmmword, ptr, [rdi] por xmm0, xmmword, ptr, [rdi, +, 16] movdqa xmm1, xmmword, ptr, [rip, +, LCPI16_0] pand xmm0, xmm1 pcmpeqb xmm0, xmm1 pmovmskb eax, xmm0 neg eax sbb eax, eax and al, 1 pop rbp ret ``` After this PR: ```asm all_8x32: push rbp mov rbp, rsp movdqa xmm0, xmmword, ptr, [rdi] pmovmskb eax, xmm0 cmp eax, 65535 jne LBB17_1 movdqa xmm0, xmmword, ptr, [rdi, +, 16] pmovmskb ecx, xmm0 mov al, 1 cmp ecx, 65535 je LBB17_3 LBB17_1: xor eax, eax LBB17_3: pop rbp ret any_8x32: push rbp mov rbp, rsp movdqa xmm0, xmmword, ptr, [rdi] pmovmskb ecx, xmm0 mov al, 1 test ecx, ecx je LBB16_1 pop rbp ret LBB16_1: movdqa xmm0, xmmword, ptr, [rdi, +, 16] pmovmskb eax, xmm0 test eax, eax setne al pop rbp ret ``` Before this PR: ```asm all_8x32: push rbp mov rbp, rsp vmovaps ymm0, ymmword, ptr, [rdi] vandps ymm0, ymm0, ymmword, ptr, [rip, +, LCPI25_0] vextractf128 xmm1, ymm0, 1 vpxor xmm2, xmm2, xmm2 vpcmpeqb xmm1, xmm1, xmm2 vpcmpeqd xmm3, xmm3, xmm3 vpxor xmm1, xmm1, xmm3 vpcmpeqb xmm0, xmm0, xmm2 vpxor xmm0, xmm0, xmm3 vinsertf128 ymm0, ymm0, xmm1, 1 vandps ymm0, ymm0, ymm1 vpermilps xmm1, xmm0, 78 vandps ymm0, ymm0, ymm1 vpermilps xmm1, xmm0, 229 vandps ymm0, ymm0, ymm1 vpsrld xmm1, xmm0, 16 vandps ymm0, ymm0, ymm1 vpsrlw xmm1, xmm0, 8 vandps ymm0, ymm0, ymm1 vpextrb eax, xmm0, 0 and al, 1 pop rbp vzeroupper ret any_8x32: push rbp mov rbp, rsp vmovaps ymm0, ymmword, ptr, [rdi] vandps ymm0, ymm0, ymmword, ptr, [rip, +, LCPI24_0] vextractf128 xmm1, ymm0, 1 vpxor xmm2, xmm2, xmm2 vpcmpeqb xmm1, xmm1, xmm2 vpcmpeqd xmm3, xmm3, xmm3 vpxor xmm1, xmm1, xmm3 vpcmpeqb xmm0, xmm0, xmm2 vpxor xmm0, xmm0, xmm3 vinsertf128 ymm0, ymm0, xmm1, 1 vorps ymm0, ymm0, ymm1 vpermilps xmm1, xmm0, 78 vorps ymm0, ymm0, ymm1 vpermilps xmm1, xmm0, 229 vorps ymm0, ymm0, ymm1 vpsrld xmm1, xmm0, 16 vorps ymm0, ymm0, ymm1 vpsrlw xmm1, xmm0, 8 vorps ymm0, ymm0, ymm1 vpextrb eax, xmm0, 0 and al, 1 pop rbp vzeroupper ret ``` After this PR: ```asm all_8x32: push rbp mov rbp, rsp vmovdqa ymm0, ymmword, ptr, [rdi] vxorps xmm1, xmm1, xmm1 vcmptrueps ymm1, ymm1, ymm1 vptest ymm0, ymm1 setb al pop rbp vzeroupper ret any_8x32: push rbp mov rbp, rsp vmovdqa ymm0, ymmword, ptr, [rdi] vptest ymm0, ymm0 setne al pop rbp vzeroupper ret ``` --- Closes #362 . --- coresimd/ppsv/api/masks_reductions.rs | 26 +- coresimd/ppsv/codegen/masks_reductions.rs | 603 ++++++++++++++++++++++ coresimd/ppsv/codegen/mod.rs | 6 + coresimd/ppsv/codegen/wrapping.rs | 42 ++ coresimd/ppsv/mod.rs | 50 +- crates/coresimd/Cargo.toml | 3 + crates/coresimd/src/lib.rs | 2 + 7 files changed, 660 insertions(+), 72 deletions(-) create mode 100644 coresimd/ppsv/codegen/masks_reductions.rs create mode 100644 coresimd/ppsv/codegen/mod.rs create mode 100644 coresimd/ppsv/codegen/wrapping.rs diff --git a/coresimd/ppsv/api/masks_reductions.rs b/coresimd/ppsv/api/masks_reductions.rs index e348a42d7b..bc7ac36d34 100644 --- a/coresimd/ppsv/api/masks_reductions.rs +++ b/coresimd/ppsv/api/masks_reductions.rs @@ -5,37 +5,15 @@ macro_rules! impl_mask_reductions { ($id:ident) => { impl $id { /// Are `all` vector lanes `true`? 
- #[cfg(not(target_arch = "aarch64"))] #[inline] pub fn all(self) -> bool { - use coresimd::simd_llvm::simd_reduce_all; - unsafe { simd_reduce_all(self) } + unsafe { super::codegen::masks_reductions::All::all(self) } } - /// Are `all` vector lanes `true`? - #[cfg(target_arch = "aarch64")] - #[inline] - pub fn all(self) -> bool { - // FIXME: Broken on AArch64 - // https://bugs.llvm.org/show_bug.cgi?id=36796 - self.and() - } - /// Is `any` vector lanes `true`? - #[cfg(not(target_arch = "aarch64"))] #[inline] pub fn any(self) -> bool { - use coresimd::simd_llvm::simd_reduce_any; - unsafe { simd_reduce_any(self) } + unsafe { super::codegen::masks_reductions::Any::any(self) } } - /// Is `any` vector lanes `true`? - #[cfg(target_arch = "aarch64")] - #[inline] - pub fn any(self) -> bool { - // FIXME: Broken on AArch64 - // https://bugs.llvm.org/show_bug.cgi?id=36796 - self.or() - } - /// Are `all` vector lanes `false`? #[inline] pub fn none(self) -> bool { diff --git a/coresimd/ppsv/codegen/masks_reductions.rs b/coresimd/ppsv/codegen/masks_reductions.rs new file mode 100644 index 0000000000..16c7cb8d52 --- /dev/null +++ b/coresimd/ppsv/codegen/masks_reductions.rs @@ -0,0 +1,603 @@ +//! LLVM6 currently generates sub-optimal code for the `all` mask reductions. +//! +//! See https://github.com/rust-lang-nursery/stdsimd/issues/362#issuecomment-372774371 +//! and the associated LLVM bug: +//! https://bugs.llvm.org/show_bug.cgi?id=36702 + +#![allow(unused)] + +use coresimd::simd::*; + +pub trait All: ::marker::Sized { + unsafe fn all(self) -> bool; +} + +pub trait Any: ::marker::Sized { + unsafe fn any(self) -> bool; +} + +// By default we use the simd_reduce_{all,any} intrinsics, which produces +// sub-optimal code, except on aarch64 where that intrinsic is broken +// due to https://bugs.llvm.org/show_bug.cgi?id=36796 so we just use +// full-blown bitwise and/or reduction there. +macro_rules! default_impl { + ($id:ident) => { + impl All for $id { + #[inline] + unsafe fn all(self) -> bool { + #[cfg(not(target_arch = "aarch64"))] { + use coresimd::simd_llvm::simd_reduce_all; + simd_reduce_all(self) + } + #[cfg(target_arch = "aarch64")] { + // FIXME: Broken on AArch64 + // https://bugs.llvm.org/show_bug.cgi?id=36796 + self.and() + } + } + } + + impl Any for $id { + #[inline] + unsafe fn any(self) -> bool { + #[cfg(not(target_arch = "aarch64"))] { + use coresimd::simd_llvm::simd_reduce_any; + simd_reduce_any(self) + } + #[cfg(target_arch = "aarch64")] { + // FIXME: Broken on AArch64 + // https://bugs.llvm.org/show_bug.cgi?id=36796 + self.or() + } + } + } + }; +} + +// On x86 both SSE2 and AVX2 provide movemask instructions that can be used +// here. The AVX2 instructions aren't necessarily better than the AVX +// instructions below, so they aren't implemented here. +// +// FIXME: for mask generated from f32x4 LLVM6 emits pmovmskb but should emit +// movmskps. Since the masks don't track whether they were produced by integer +// or floating point vectors, we can't currently work around this yet. The +// performance impact for this shouldn't be large, but this is filled as: +// https://bugs.llvm.org/show_bug.cgi?id=37087 +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))] +macro_rules! 
x86_128_sse2_movemask_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use ::coresimd::arch::x86::_mm_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use ::coresimd::arch::x86_64::_mm_movemask_epi8; + // _mm_movemask_epi8(a) creates a 16bit mask containing the most + // significant bit of each byte of `a`. If all bits are set, + // then all 16 lanes of the mask are true. + _mm_movemask_epi8(::mem::transmute(self)) == u16::max_value() as i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use ::coresimd::arch::x86::_mm_movemask_epi8; + #[cfg(target_arch = "x86_64")] + use ::coresimd::arch::x86_64::_mm_movemask_epi8; + + _mm_movemask_epi8(::mem::transmute(self)) != 0 + } + } + } +} + +// On x86 with AVX we use _mm256_testc_si256 and _mm256_testz_si256. +// +// FIXME: for masks generated from floating point vectors one should use +// x86_mm256_testc_ps, x86_mm256_testz_ps, x86_mm256_testc_pd, +// x86_mm256_testz_pd.Since the masks don't track whether they were produced by +// integer or floating point vectors, we can't currently work around this yet. +// +// TODO: investigate perf impact and fill LLVM bugs as necessary. +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "avx"))] +macro_rules! x86_256_avx_test_impl { + ($id:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "avx")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use ::coresimd::arch::x86::_mm256_testc_si256; + #[cfg(target_arch = "x86_64")] + use ::coresimd::arch::x86_64::_mm256_testc_si256; + _mm256_testc_si256(::mem::transmute(self), + ::mem::transmute($id::splat(true))) != 0 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "avx")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use ::coresimd::arch::x86::_mm256_testz_si256; + #[cfg(target_arch = "x86_64")] + use ::coresimd::arch::x86_64::_mm256_testz_si256; + _mm256_testz_si256(::mem::transmute(self), + ::mem::transmute(self)) == 0 + } + } + } +} + +// On x86 with SSE2 all/any for 256-bit wide vectors is implemented by executing +// the algorithm for 128-bit on the higher and lower elements of the vector +// independently. +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "sse2"))] +macro_rules! x86_256_sse2_impl { + ($id:ident, $v128:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn all(self) -> bool { + unsafe { + union U { + halves: ($v128, $v128), + vec: $id + } + let halves = U {vec: self}.halves; + halves.0.all() && halves.1.all() + } + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "sse2")] + unsafe fn any(self) -> bool { + unsafe { + union U { + halves: ($v128, $v128), + vec: $id + } + let halves = U {vec: self}.halves; + halves.0.any() || halves.1.any() + } + } + } + } +} + +// Implementation for 64-bit wide masks on x86. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +macro_rules! 
x86_64_mmx_movemask_impl { + ($id:ident, $vec128:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "mmx")] + unsafe fn all(self) -> bool { + #[cfg(target_arch = "x86")] + use ::coresimd::arch::x86::_mm_movemask_pi8; + #[cfg(target_arch = "x86_64")] + use ::coresimd::arch::x86_64::_mm_movemask_pi8; + // _mm_movemask_pi8(a) creates an 8bit mask containing the most + // significant bit of each byte of `a`. If all bits are set, + // then all 8 lanes of the mask are true. + _mm_movemask_pi8(::mem::transmute(self)) == u8::max_value() as i32 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "mmx")] + unsafe fn any(self) -> bool { + #[cfg(target_arch = "x86")] + use ::coresimd::arch::x86::_mm_movemask_pi8; + #[cfg(target_arch = "x86_64")] + use ::coresimd::arch::x86_64::_mm_movemask_pi8; + + _mm_movemask_pi8(::mem::transmute(self)) != 0 + } + } + } +} + +// Implementation for 128-bit wide masks on x86 +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +macro_rules! x86_128_impl { + ($id:ident) => { + cfg_if! { + if #[cfg(target_feature = "sse2")] { + x86_128_sse2_movemask_impl!($id); + } else { + default_impl!($id); + } + } + } +} + +// Implementation for 256-bit wide masks on x86 +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +macro_rules! x86_256_impl { + ($id:ident, $half_id:ident) => { + cfg_if! { + if #[cfg(target_feature = "avx")] { + x86_256_avx_test_impl!($id); + } else if #[cfg(target_feature = "sse2")] { + x86_256_sse2_impl!($id, $half_id); + } else { + default_impl!($id); + } + } + } +} + +// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding +// minimum/maximum of adjacent pairs) for 64-bit wide two-element vectors. +#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] +macro_rules! arm_64_x2_v7_neon_impl { + ($id:ident, $vpmin:ident, $vpmax:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn all(self) -> bool { + use ::coresimd::arch::arm::$vpmin; + use ::mem::transmute; + // pmin((a, b), (-,-)) => (b, -).0 => b + let tmp: $id = transmute($vpmin(transmute(self), ::mem::uninitialized())); + tmp.extract(0) + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn any(self) -> bool { + use ::coresimd::arch::arm::$vpmax; + use ::mem::transmute; + // pmax((a, b), (-,-)) => (b, -).0 => b + let tmp: $id = transmute($vpmax(transmute(self), ::mem::uninitialized())); + tmp.extract(0) + } + } + } +} + +// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding +// minimum/maximum of adjacent pairs) for 64-bit wide four-element vectors. +#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] +macro_rules! 
arm_64_x4_v7_neon_impl { + ($id:ident, $vpmin:ident, $vpmax:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn all(self) -> bool { + use ::coresimd::arch::arm::$vpmin; + use ::mem::transmute; + // tmp = pmin((a, b, c, d), (-,-,-,-)) => (a, c, -, -) + let tmp = $vpmin(transmute(self), ::mem::uninitialized()); + // tmp = pmin((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c + let tmp: $id = transmute($vpmin(tmp, ::mem::uninitialized())); + tmp.extract(0) + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn any(self) -> bool { + use ::coresimd::arch::arm::$vpmax; + use ::mem::transmute; + // tmp = pmax((a, b, c, d), (-,-,-,-)) => (a, c, -, -) + let tmp = $vpmax(transmute(self), ::mem::uninitialized()); + // tmp = pmax((a, b, -, -), (-,-,-,-)) => (c, -, -, -).0 => c + let tmp: $id = transmute($vpmax(tmp, ::mem::uninitialized())); + tmp.extract(0) + } + } + } +} + +// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding +// minimum/maximum of adjacent pairs) for 64-bit wide eight-element vectors. +#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] +macro_rules! arm_64_x8_v7_neon_impl { + ($id:ident, $vpmin:ident, $vpmax:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn all(self) -> bool { + use ::coresimd::arch::arm::$vpmin; + use ::mem::transmute; + // tmp = pmin( + // (a, b, c, d, e, f, g, h), + // (-, -, -, -, -, -, -, -) + // ) => (a, c, e, g, -, -, -, -) + let tmp = $vpmin(transmute(self), ::mem::uninitialized()); + // tmp = pmin( + // (a, c, e, g, -, -, -, -), + // (-, -, -, -, -, -, -, -) + // ) => (c, g, -, -, -, -, -, -) + let tmp = $vpmin(tmp, ::mem::uninitialized()); + // tmp = pmin( + // (c, g, -, -, -, -, -, -), + // (-, -, -, -, -, -, -, -) + // ) => (g, -, -, -, -, -, -, -).0 => g + let tmp: $id = transmute($vpmin(tmp, ::mem::uninitialized())); + tmp.extract(0) + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn any(self) -> bool { + use ::coresimd::arch::arm::$vpmax; + use ::mem::transmute; + // tmp = pmax( + // (a, b, c, d, e, f, g, h), + // (-, -, -, -, -, -, -, -) + // ) => (a, c, e, g, -, -, -, -) + let tmp = $vpmax(transmute(self), ::mem::uninitialized()); + // tmp = pmax( + // (a, c, e, g, -, -, -, -), + // (-, -, -, -, -, -, -, -) + // ) => (c, g, -, -, -, -, -, -) + let tmp = $vpmax(tmp, ::mem::uninitialized()); + // tmp = pmax( + // (c, g, -, -, -, -, -, -), + // (-, -, -, -, -, -, -, -) + // ) => (g, -, -, -, -, -, -, -).0 => g + let tmp: $id = transmute($vpmax(tmp, ::mem::uninitialized())); + tmp.extract(0) + } + } + } +} + + +// Implementation for ARM + v7 + NEON using vpmin and vpmax (folding +// minimum/maximum of adjacent pairs) for 64-bit or 128-bit wide vectors with +// more than two elements. +#[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] +macro_rules! 
arm_128_v7_neon_impl { + ($id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn all(self) -> bool { + use ::coresimd::arch::arm::$vpmin; + use ::mem::transmute; + union U { + halves: ($half, $half), + vec: $id + } + let halves = U { vec: self }.halves; + let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1))); + h.all() + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "v7,neon")] + unsafe fn any(self) -> bool { + use ::coresimd::arch::arm::$vpmax; + use ::mem::transmute; + union U { + halves: ($half, $half), + vec: $id + } + let halves = U { vec: self }.halves; + let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1))); + h.any() + } + } + } +} + +// Implementation for AArch64 + NEON using vmin and vmax (horizontal vector +// min/max) for 128-bit wide vectors. +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +macro_rules! aarch64_128_neon_impl { + ($id:ident, $vmin:ident, $vmax:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn all(self) -> bool { + use ::coresimd::arch::aarch64::$vmin; + $vmin(::mem::transmute(self)) != 0 + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn any(self) -> bool { + use ::coresimd::arch::aarch64::$vmax; + $vmax(::mem::transmute(self)) != 0 + } + } + } +} + +// Implementation for AArch64 + NEON using vmin and vmax (horizontal vector +// min/max) for 64-bit wide vectors. +// +// This impl duplicates the 64-bit vector into a 128-bit one and calls +// all/any on that. +#[cfg(all(target_arch = "aarch64", target_feature = "neon"))] +macro_rules! aarch64_64_neon_impl { + ($id:ident, $vec128:ident) => { + impl All for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn all(self) -> bool { + union U { + halves: ($id, $id), + vec: $vec128 + } + U { halves: (self, self) }.vec.all() + } + } + impl Any for $id { + #[inline] + #[target_feature(enable = "neon")] + unsafe fn any(self) -> bool { + union U { + halves: ($id, $id), + vec: $vec128 + } + U { halves: (self, self) }.vec.any() + } + } + } +} + +macro_rules! impl_mask_all_any { + // 64-bit wide masks + (m8x8) => { + cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + x86_64_mmx_movemask_impl!(m8x8, m8x16); + } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] { + arm_64_x8_v7_neon_impl!(m8x8, vpmin_u8, vpmax_u8); + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + aarch64_64_neon_impl!(m8x8, m8x16); + } else { + default_impl!(m8x8); + } + } + }; + (m16x4) => { + cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + x86_64_mmx_movemask_impl!(m16x4, m16x8); + } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] { + arm_64_x4_v7_neon_impl!(m16x4, vpmin_u16, vpmax_u16); + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + aarch64_64_neon_impl!(m16x4, m16x8); + } else { + default_impl!(m16x4); + } + } + }; + (m32x2) => { + cfg_if! 
{ + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + x86_64_mmx_movemask_impl!(m32x2, m32x4); + } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] { + arm_64_x2_v7_neon_impl!(m32x2, vpmin_u32, vpmax_u32); + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + aarch64_64_neon_impl!(m32x2, m32x4); + } else { + default_impl!(m32x2); + } + } + }; + // 128-bit wide masks + (m8x16) => { + cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + x86_128_impl!(m8x16); + } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] { + arm_128_v7_neon_impl!(m8x16, m8x8, vpmin_u8, vpmax_u8); + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + aarch64_128_neon_impl!(m8x16, vminvq_u8, vmaxvq_u8); + } else { + default_impl!(m8x16); + } + } + }; + (m16x8) => { + cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + x86_128_impl!(m16x8); + } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] { + arm_128_v7_neon_impl!(m16x8, m16x4, vpmin_u16, vpmax_u16); + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + aarch64_128_neon_impl!(m16x8, vminvq_u16, vmaxvq_u16); + } else { + default_impl!(m16x8); + } + } + }; + (m32x4) => { + cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + x86_128_impl!(m32x4); + } else if #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] { + arm_128_v7_neon_impl!(m32x4, m32x2, vpmin_u32, vpmax_u32); + } else if #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] { + aarch64_128_neon_impl!(m32x4, vminvq_u32, vmaxvq_u32); + } else { + default_impl!(m32x4); + } + } + }; + (m64x2) => { + cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + x86_128_impl!(m64x2); + } else { + default_impl!(m64x2); + } + } + }; + // 256-bit wide masks: + (m8x32) => { + cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + x86_256_impl!(m8x32, m8x16); + } else { + default_impl!(m8x32); + } + } + }; + (m16x16) => { + cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + x86_256_impl!(m16x16, m16x8); + } else { + default_impl!(m16x16); + } + } + }; + (m32x8) => { + cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + x86_256_impl!(m32x8, m32x4); + } else { + default_impl!(m32x8); + } + } + }; + (m64x4) => { + cfg_if! { + if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { + x86_256_impl!(m64x4, m64x2); + } else { + default_impl!(m64x4); + } + } + }; + // Fallback to LLVM's default code-generation: + ($id:ident) => { default_impl!($id); }; +} + +vector_impl!( + [impl_mask_all_any, m1x8], + [impl_mask_all_any, m1x16], + [impl_mask_all_any, m1x32], + [impl_mask_all_any, m1x64], + [impl_mask_all_any, m8x2], + [impl_mask_all_any, m8x4], + [impl_mask_all_any, m8x8], + [impl_mask_all_any, m8x16], + [impl_mask_all_any, m8x32], + [impl_mask_all_any, m16x2], + [impl_mask_all_any, m16x4], + [impl_mask_all_any, m16x8], + [impl_mask_all_any, m16x16], + [impl_mask_all_any, m32x2], + [impl_mask_all_any, m32x4], + [impl_mask_all_any, m32x8], + [impl_mask_all_any, m64x2], + [impl_mask_all_any, m64x4] +); diff --git a/coresimd/ppsv/codegen/mod.rs b/coresimd/ppsv/codegen/mod.rs new file mode 100644 index 0000000000..448587b795 --- /dev/null +++ b/coresimd/ppsv/codegen/mod.rs @@ -0,0 +1,6 @@ +//! 
Work arounds for code generation issues + +#[cfg(target_arch = "aarch64")] +pub mod wrapping; + +pub mod masks_reductions; diff --git a/coresimd/ppsv/codegen/wrapping.rs b/coresimd/ppsv/codegen/wrapping.rs new file mode 100644 index 0000000000..0e2f306eb0 --- /dev/null +++ b/coresimd/ppsv/codegen/wrapping.rs @@ -0,0 +1,42 @@ +//! Used by the wrapping_sum and wrapping_product algorithms for AArch64. + +pub(crate) trait Wrapping { + fn add(self, other: Self) -> Self; + fn mul(self, other: Self) -> Self; +} + +macro_rules! int_impl { + ($id:ident) => { + impl Wrapping for $id { + fn add(self, other: Self) -> Self { + self.wrapping_add(other) + } + fn mul(self, other: Self) -> Self { + self.wrapping_mul(other) + } + } + }; +} +int_impl!(i8); +int_impl!(i16); +int_impl!(i32); +int_impl!(i64); +int_impl!(u8); +int_impl!(u16); +int_impl!(u32); +int_impl!(u64); + +macro_rules! float_impl { + ($id:ident) => { + impl Wrapping for $id { + fn add(self, other: Self) -> Self { + self + other + } + fn mul(self, other: Self) -> Self { + self * other + } + } + }; +} +float_impl!(f32); +float_impl!(f64); diff --git a/coresimd/ppsv/mod.rs b/coresimd/ppsv/mod.rs index 08b7ce80d6..0d48509e3a 100644 --- a/coresimd/ppsv/mod.rs +++ b/coresimd/ppsv/mod.rs @@ -80,51 +80,5 @@ impl FromBits for T { } } -/// Workarounds code generation issues. -#[cfg(target_arch = "aarch64")] -mod codegen { - #[cfg(target_arch = "aarch64")] - pub mod wrapping { - pub trait Wrapping { - fn add(self, other: Self) -> Self; - fn mul(self, other: Self) -> Self; - } - - macro_rules! int_impl { - ($id:ident) => { - impl Wrapping for $id { - fn add(self, other: Self) -> Self { - self.wrapping_add(other) - } - fn mul(self, other: Self) -> Self { - self.wrapping_mul(other) - } - } - }; - } - int_impl!(i8); - int_impl!(i16); - int_impl!(i32); - int_impl!(i64); - int_impl!(u8); - int_impl!(u16); - int_impl!(u32); - int_impl!(u64); - - macro_rules! float_impl { - ($id:ident) => { - impl Wrapping for $id { - fn add(self, other: Self) -> Self { - self + other - } - fn mul(self, other: Self) -> Self { - self * other - } - } - }; - } - float_impl!(f32); - float_impl!(f64); - } - -} +/// Work arounds code generation issues. +mod codegen; diff --git a/crates/coresimd/Cargo.toml b/crates/coresimd/Cargo.toml index 5bc2e5d7ef..3fb757c544 100644 --- a/crates/coresimd/Cargo.toml +++ b/crates/coresimd/Cargo.toml @@ -18,6 +18,9 @@ is-it-maintained-issue-resolution = { repository = "rust-lang-nursery/stdsimd" } is-it-maintained-open-issues = { repository = "rust-lang-nursery/stdsimd" } maintenance = { status = "experimental" } +[dependencies] +cfg-if = "0.1" + [dev-dependencies] stdsimd-test = { version = "0.*", path = "../stdsimd-test" } stdsimd = { version = "0.0.3", path = "../stdsimd" } diff --git a/crates/coresimd/src/lib.rs b/crates/coresimd/src/lib.rs index f4b3f1ff5d..81c49623f6 100644 --- a/crates/coresimd/src/lib.rs +++ b/crates/coresimd/src/lib.rs @@ -43,6 +43,8 @@ extern crate stdsimd; extern crate stdsimd_test; #[cfg(test)] extern crate test; +#[macro_use] +extern crate cfg_if; macro_rules! test_v16 { ($item:item) => {};