Workarounds for all/any mask reductions on x86, armv7, and aarch64 #425

Merged · 15 commits merged into rust-lang:master on May 4, 2018

Conversation

@gnzlbg (Contributor) commented Apr 12, 2018

This commit adds workarounds for the mask reductions: all and any.

Note:

  • not all LLVM bugs have been properly reported yet
  • workarounds for 256-bit wide vector masks on armv7 and aarch64 are
    not provided in this PR
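
For reference, the two reductions have very simple semantics; the point of the PR is getting the backends to emit the short instruction sequences shown below instead of lane-by-lane code. A scalar sketch (illustrative only, not the PR's code):

```rust
// Scalar reference semantics of the mask reductions: `all` is a lane-wise
// conjunction, `any` a lane-wise disjunction.
fn all(lanes: &[bool]) -> bool {
    lanes.iter().all(|&lane| lane)
}

fn any(lanes: &[bool]) -> bool {
    lanes.iter().any(|&lane| lane)
}
```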

64-bit wide mask types (m8x8, m16x4, m32x2)

x86_64 with MMX enabled

Before this PR:

all_8x8:
 push    rbp
 mov     rbp, rsp
 movzx   eax, byte, ptr, [rdi, +, 7]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 6]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 5]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 4]
 movd    xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   eax, byte, ptr, [rdi, +, 3]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 2]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 1]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi]
 movd    xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 pand    xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 pand    xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 pand    xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 pop     rbp
 ret
any_8x8:
 push    rbp
 mov     rbp, rsp
 movzx   eax, byte, ptr, [rdi, +, 7]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 6]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 5]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 4]
 movd    xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   eax, byte, ptr, [rdi, +, 3]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 2]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 1]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi]
 movd    xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 por     xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 por     xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 por     xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 pop     rbp
 ret

After this PR for m8x8, m16x4, m32x2:

all_8x8:
 push    rbp
 mov     rbp, rsp
 movq    mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 cmp     eax, 255
 sete    al
 pop     rbp
 ret
any_8x8:
 push    rbp
 mov     rbp, rsp
 movq    mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 test    eax, eax
 setne   al
 pop     rbp
 ret

x86 with MMX enabled

Before this PR:

all_8x8:
 call    L9$pb
L9$pb:
 pop     eax
 mov     ecx, dword, ptr, [esp, +, 4]
 movzx   edx, byte, ptr, [ecx, +, 7]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 6]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 5]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 4]
 movd    xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   edx, byte, ptr, [ecx, +, 3]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 2]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 1]
 movd    xmm0, edx
 movzx   ecx, byte, ptr, [ecx]
 movd    xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [eax, +, LCPI9_0-L9$pb]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 pand    xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 pand    xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 pand    xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 ret
any_8x8:
 call    L8$pb
L8$pb:
 pop     eax
 mov     ecx, dword, ptr, [esp, +, 4]
 movzx   edx, byte, ptr, [ecx, +, 7]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 6]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 5]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 4]
 movd    xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   edx, byte, ptr, [ecx, +, 3]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 2]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 1]
 movd    xmm0, edx
 movzx   ecx, byte, ptr, [ecx]
 movd    xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [eax, +, LCPI8_0-L8$pb]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 por     xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 por     xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 por     xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 ret

After this PR:

all_8x8:
 mov     eax, dword, ptr, [esp, +, 4]
 movq    mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 cmp     eax, 255
 sete    al
 ret
any_8x8:
 mov     eax, dword, ptr, [esp, +, 4]
 movq    mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 test    eax, eax
 setne   al
 ret 

aarch64

Before this PR:

all_8x8:
 ldr     d0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 tst     w8, #0xff
 umov    w10, v0.b[2]
 cset    w8, ne
 tst     w9, #0xff
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[3]
 and     w8, w8, w9
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[4]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[5]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[6]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[7]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 and     w8, w9, w8
 cset    w9, ne
 and     w0, w9, w8
 ret
any_8x8:
 ldr     d0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 orr     w8, w8, w9
 umov    w9, v0.b[2]
 orr     w8, w8, w9
 umov    w9, v0.b[3]
 orr     w8, w8, w9
 umov    w9, v0.b[4]
 orr     w8, w8, w9
 umov    w9, v0.b[5]
 orr     w8, w8, w9
 umov    w9, v0.b[6]
 orr     w8, w8, w9
 umov    w9, v0.b[7]
 orr     w8, w8, w9
 tst     w8, #0xff
 cset    w0, ne
 ret

After this PR:

all_8x8:
 ldr     d0, [x0]
 mov     v0.d[1], v0.d[0]
 uminv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
any_8x8:
 ldr     d0, [x0]
 mov     v0.d[1], v0.d[0]
 umaxv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret

ARMv7 + neon

Before this PR:

all_8x8:
 vmov.i8 d0, #0x1
 vldr    d1, [r0]
 vtst.8  d0, d1, d0
 vext.8  d1, d0, d0, #4
 vand    d0, d0, d1
 vext.8  d1, d0, d0, #2
 vand    d0, d0, d1
 vdup.8  d1, d0[1]
 vand    d0, d0, d1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
any_8x8:
 vmov.i8 d0, #0x1
 vldr    d1, [r0]
 vtst.8  d0, d1, d0
 vext.8  d1, d0, d0, #4
 vorr    d0, d0, d1
 vext.8  d1, d0, d0, #2
 vorr    d0, d0, d1
 vdup.8  d1, d0[1]
 vorr    d0, d0, d1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr

After this PR:

all_8x8:
 vldr    d0, [r0]
 b       <m8x8 as All>::all

<m8x8 as All>::all:
 vpmin.u8 d16, d0, d16
 vpmin.u8 d16, d16, d16
 vpmin.u8 d0, d16, d16
 b       m8x8::extract

any_8x8:
 vldr    d0, [r0]
 b       <m8x8 as Any>::any
 
<m8x8 as Any>::any:
 vpmax.u8 d16, d0, d16
 vpmax.u8 d16, d16, d16
 vpmax.u8 d0, d16, d16
 b       m8x8::extract

(note: inlining does not work properly on ARMv7)
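
The `vpmin`/`vpmax` sequences above fold the eight lanes with pairwise reductions. A rough sketch of that technique using the unstable `core::arch::arm` NEON intrinsics (nightly-only; the function name and the transmute are illustrative assumptions, not the PR's literal code):

```rust
#[cfg(all(target_arch = "arm", target_feature = "neon"))]
unsafe fn all_m8x8(mask: core::arch::arm::uint8x8_t) -> bool {
    use core::arch::arm::vpmin_u8;
    // Three pairwise mins fold the 8 lanes together; a set mask lane is
    // all-ones, so a non-zero minimum means every lane was set.
    let m = vpmin_u8(mask, mask);
    let m = vpmin_u8(m, m);
    let m = vpmin_u8(m, m);
    let bytes: [u8; 8] = core::mem::transmute(m);
    bytes[0] != 0
}
```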

128-bit wide mask types (m8x16, m16x8, m32x4, m64x2)

x86_64 with SSE2 enabled

Before this PR:

all_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 xor     ecx, ecx
 cmp     eax, 65535
 mov     eax, -1
 cmovne  eax, ecx
 and     al, 1
 pop     rbp
 ret
any_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 neg     eax
 sbb     eax, eax
 and     al, 1
 pop     rbp
 ret

After this PR:

all_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp     eax, 65535
 sete    al
 pop     rbp
 ret
any_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
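
The `pmovmskb`-based sequences above are a movemask reduction; a minimal sketch with the `core::arch` SSE2 intrinsics (the function names and taking the mask as a raw `__m128i` are illustrative assumptions, not stdsimd's actual code):

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn all_m8x16(mask: core::arch::x86_64::__m128i) -> bool {
    use core::arch::x86_64::_mm_movemask_epi8;
    // Each mask lane is all-ones or all-zeros, so the byte-wise movemask
    // collapses the 16 lanes into a 16-bit integer.
    _mm_movemask_epi8(mask) == 0xFFFF
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn any_m8x16(mask: core::arch::x86_64::__m128i) -> bool {
    use core::arch::x86_64::_mm_movemask_epi8;
    _mm_movemask_epi8(mask) != 0
}
```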

aarch64

Before this PR:

all_8x16:
 ldr     q0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 tst     w8, #0xff
 umov    w10, v0.b[2]
 cset    w8, ne
 tst     w9, #0xff
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[3]
 and     w8, w8, w9
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[4]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[5]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[6]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[7]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[8]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[9]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[10]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[11]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[12]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[13]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[14]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[15]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 and     w8, w9, w8
 cset    w9, ne
 and     w0, w9, w8
 ret
any_8x16:
 ldr     q0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 orr     w8, w8, w9
 umov    w9, v0.b[2]
 orr     w8, w8, w9
 umov    w9, v0.b[3]
 orr     w8, w8, w9
 umov    w9, v0.b[4]
 orr     w8, w8, w9
 umov    w9, v0.b[5]
 orr     w8, w8, w9
 umov    w9, v0.b[6]
 orr     w8, w8, w9
 umov    w9, v0.b[7]
 orr     w8, w8, w9
 umov    w9, v0.b[8]
 orr     w8, w8, w9
 umov    w9, v0.b[9]
 orr     w8, w8, w9
 umov    w9, v0.b[10]
 orr     w8, w8, w9
 umov    w9, v0.b[11]
 orr     w8, w8, w9
 umov    w9, v0.b[12]
 orr     w8, w8, w9
 umov    w9, v0.b[13]
 orr     w8, w8, w9
 umov    w9, v0.b[14]
 orr     w8, w8, w9
 umov    w9, v0.b[15]
 orr     w8, w8, w9
 tst     w8, #0xff
 cset    w0, ne
 ret

After this PR:

all_8x16:
 ldr     q0, [x0]
 uminv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
any_8x16:
 ldr     q0, [x0]
 umaxv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
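
The `uminv`/`umaxv` instructions above are horizontal reductions over the whole vector; a minimal sketch with the `core::arch::aarch64` NEON intrinsics (illustrative only; stdsimd's generated code may differ):

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn all_m8x16(mask: core::arch::aarch64::uint8x16_t) -> bool {
    use core::arch::aarch64::vminvq_u8;
    // If the minimum lane is non-zero, every lane is non-zero (i.e. set).
    vminvq_u8(mask) != 0
}

#[cfg(target_arch = "aarch64")]
unsafe fn any_m8x16(mask: core::arch::aarch64::uint8x16_t) -> bool {
    use core::arch::aarch64::vmaxvq_u8;
    vmaxvq_u8(mask) != 0
}
```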

ARMv7 + neon

Before this PR:

all_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8  q0, q1, q0
 vext.8  q1, q0, q0, #8
 vand    q0, q0, q1
 vext.8  q1, q0, q0, #4
 vand    q0, q0, q1
 vext.8  q1, q0, q0, #2
 vand    q0, q0, q1
 vdup.8  q1, d0[1]
 vand    q0, q0, q1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
any_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8  q0, q1, q0
 vext.8  q1, q0, q0, #8
 vorr    q0, q0, q1
 vext.8  q1, q0, q0, #4
 vorr    q0, q0, q1
 vext.8  q1, q0, q0, #2
 vorr    q0, q0, q1
 vdup.8  q1, d0[1]
 vorr    q0, q0, q1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr

After this PR:

all_8x16:
 vld1.64 {d0, d1}, [r0]
 b       <m8x16 as All>::all
 
<m8x16 as All>::all: 
 vpmin.u8 d0, d0, d1
 b       <m8x8 as All>::all
any_8x16:
 vld1.64 {d0, d1}, [r0]
 b       <m8x16 as Any>::any
 
<m8x16 as Any>::any:
 vpmax.u8 d0, d0, d1
 b       <m8x8 as Any>::any

The inlining problems are pretty bad on ARMv7 + NEON.

256-bit wide mask types (m8x32, m16x16, m32x8, m64x4)

With SSE2 enabled

Before this PR:

all_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI17_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 movdqa  xmm2, xmmword, ptr, [rdi, +, 16]
 pand    xmm2, xmm0
 pcmpeqb xmm2, xmm0
 pcmpeqb xmm1, xmm0
 pand    xmm1, xmm2
 pmovmskb eax, xmm1
 xor     ecx, ecx
 cmp     eax, 65535
 mov     eax, -1
 cmovne  eax, ecx
 and     al, 1
 pop     rbp
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 por     xmm0, xmmword, ptr, [rdi, +, 16]
 movdqa  xmm1, xmmword, ptr, [rip, +, LCPI16_0]
 pand    xmm0, xmm1
 pcmpeqb xmm0, xmm1
 pmovmskb eax, xmm0
 neg     eax
 sbb     eax, eax
 and     al, 1
 pop     rbp
 ret

After this PR:

all_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp     eax, 65535
 jne     LBB17_1
 movdqa  xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb ecx, xmm0
 mov     al, 1
 cmp     ecx, 65535
 je      LBB17_3
LBB17_1:
 xor     eax, eax
LBB17_3:
 pop     rbp
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb ecx, xmm0
 mov     al, 1
 test    ecx, ecx
 je      LBB16_1
 pop     rbp
 ret
LBB16_1:
 movdqa  xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb eax, xmm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
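
Without AVX there is no single 256-bit instruction, so the workaround (as the asm above shows) reduces the two 128-bit halves and combines the results; a minimal sketch, assuming the mask has already been split into two `__m128i` halves (the real code operates on the 256-bit portable type directly):

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn all_m8x32(lo: core::arch::x86_64::__m128i, hi: core::arch::x86_64::__m128i) -> bool {
    use core::arch::x86_64::_mm_movemask_epi8;
    // Short-circuits on the first half, matching the branchy asm above.
    _mm_movemask_epi8(lo) == 0xFFFF && _mm_movemask_epi8(hi) == 0xFFFF
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn any_m8x32(lo: core::arch::x86_64::__m128i, hi: core::arch::x86_64::__m128i) -> bool {
    use core::arch::x86_64::_mm_movemask_epi8;
    _mm_movemask_epi8(lo) != 0 || _mm_movemask_epi8(hi) != 0
}
```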

With AVX enabled

Before this PR:

all_8x32:
 push    rbp
 mov     rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps  ymm0, ymm0, ymmword, ptr, [rip, +, LCPI25_0]
 vextractf128 xmm1, ymm0, 1
 vpxor   xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor   xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor   xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vandps  ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vandps  ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vandps  ymm0, ymm0, ymm1
 vpsrld  xmm1, xmm0, 16
 vandps  ymm0, ymm0, ymm1
 vpsrlw  xmm1, xmm0, 8
 vandps  ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and     al, 1
 pop     rbp
 vzeroupper
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps  ymm0, ymm0, ymmword, ptr, [rip, +, LCPI24_0]
 vextractf128 xmm1, ymm0, 1
 vpxor   xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor   xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor   xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vorps   ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vorps   ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vorps   ymm0, ymm0, ymm1
 vpsrld  xmm1, xmm0, 16
 vorps   ymm0, ymm0, ymm1
 vpsrlw  xmm1, xmm0, 8
 vorps   ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and     al, 1
 pop     rbp
 vzeroupper
 ret

After this PR:

all_8x32:
 push    rbp
 mov     rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vxorps  xmm1, xmm1, xmm1
 vcmptrueps ymm1, ymm1, ymm1
 vptest  ymm0, ymm1
 setb    al
 pop     rbp
 vzeroupper
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vptest  ymm0, ymm0
 setne   al
 pop     rbp
 vzeroupper
 ret
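
The AVX path uses `vptest`: testing the mask against all-ones sets the carry flag only when every bit of the mask is set (`all`), and testing it against itself sets the zero flag only when no bit is set (`any`). A minimal sketch with the `core::arch` AVX intrinsics (illustrative, not the PR's literal code):

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
unsafe fn all_m8x32(mask: core::arch::x86_64::__m256i) -> bool {
    use core::arch::x86_64::{_mm256_set1_epi8, _mm256_testc_si256};
    // CF is set iff (!mask & ones) == 0, i.e. every bit of the mask is set.
    let ones = _mm256_set1_epi8(-1);
    _mm256_testc_si256(mask, ones) != 0
}

#[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
unsafe fn any_m8x32(mask: core::arch::x86_64::__m256i) -> bool {
    use core::arch::x86_64::_mm256_testz_si256;
    // ZF is set iff (mask & mask) == 0, i.e. the mask is all zeros.
    _mm256_testz_si256(mask, mask) == 0
}
```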

Closes #362 .

@gnzlbg (Contributor Author) commented Apr 12, 2018

cc @hsivonen

--

@alexcrichton there are still some failures on x86, but I have no idea why they happen yet; the asm generated for the x86 and x86_64 functions looks identical to me. Also, it appears that on ARMv7 nothing ever gets inlined, and I don't know why. AFAICT I am setting #[inline] and #[target_feature] appropriately.

@alexcrichton (Member):

Thanks for this! I'll admit though I'm very quickly getting lost in all this...

So purely on the surface I'm curious what's going on here? For example, if we conditionalize these impls on #[cfg(target_feature = ...)] that's not too useful, right? Don't we want everything to be compiled depending on downstream users' compilation environments?

I realize that we don't really have any better solution than this today, but this to me feels like we should be pursuing a different language-level solution. For example I feel like we should be able to monomorphize these implementations into downstream crates, reconfiguring the settings dynamically as they're monomorphized. That way if you compile with -C target-feature=+neon you get all the optimized versions (and still no runtime overhead of checking) from stdsimd.

Does that make sense? Is that perhaps too far out, though?

@gnzlbg (Contributor Author) commented Apr 12, 2018

@alexcrichton

For example, if we conditionalize these impls on #[cfg(target_feature = ...)] that's not too useful, right? Don't we want everything to be compiled depending on downstream users' compilation environments?

Well sse2 is available on x86_64 by default, and neon is available on aarch64 by default, so those would work with std out of the box.

For AVX, and for NEON on ARMv7, users would need to recompile std with those features enabled.

I realize that we don't really have any better solution than this today, but this to me feels like we should be pursuing a different language-level solution. For example I feel like we should be able to monomorphize these implementations into downstream crates, reconfiguring the settings dynamically as they're monomorphized. That way if you compile with -C target-feature=+neon you get all the optimized versions (and still no runtime overhead of checking) from stdsimd.

I was talking with @rkruppe about this the other day. The main problems are that:

  • we can't do this with an attribute-based approach
  • because any crate in the ecosystem can write #[target_feature(enable = "avx")] fn foo() { ... } and call anything from there, all the code of the whole program has to be available to the current translation unit, so that all of it can be monomorphized for whichever set of target features is enabled at each call site.

Does that make sense?

Yes, this is what I want, and probably what all other users want too.

Is that perhaps too far out though?

That depends. Three main goals for 2018 are effects: async, const fn, and target-feature.

Rust code wants to be able to ask "am I being executed at compile time?" for the const effect and "am I being called synchronously or asynchronously?" for async code (e.g. whether the code is called with or without await), and to branch depending on the answer; asking "am I being compiled with AVX?" is not that different from either.

Rust's approach towards these 2018 goals has been "let's #[attribute] the hell out of these problems", but now that we have solutions to all these problems, instead of rushing towards stabilization, I think we should give the PL PhDs working on Rust enough time to propose an effect system that handles all of these in a principled way.

@Centril is preparing an RFC for an effect system for Rust. If we think of target-feature as an effect, all Rust code must be polymorphic over it. This would fix some of the hard-to-fix bugs that are currently open, like cfg(target_feature) returning false for a feature that is enabled via #[target_feature] on the function where it is used, and it would fix them across functions and crates, not only in lexical scopes. Also, with an effect system, the target-feature 1.1 RFC probably wouldn't be necessary.

Honestly, I wish we could ship std::arch without shipping #[target_feature], cfg(target_feature), is_..._feature_detected, and company.

@hanna-kruppe:

A general effect system is a huge step; it certainly won't make it into Rust 2018, and it may never be added.

But regardless of that, I don't think it's a good idea to treat this (propagating target features to libraries) as a general monomorphization-style deal. Making all code parametric over target features and monomorphizing it for different feature sets seems like a great way to make binary size explode. There has to be some sort of opt-in; otherwise any program that uses some non-trivial #[target_feature] attributes would contain multiple copies of the libraries it uses, in most cases without any appreciable performance improvement.

@alexcrichton (Member):

Hm ok this makes sense. I agree with @rkruppe that gaining an effect system is unlikely to happen any time soon so we probably don't want to plan on that happening.

I suppose another way to put my reaction is that I'm confused why we have to do this in the first place. Sort of the whole point of "portable SIMD" is to "pray the backend takes care of everything". If we end up defining everything ourselves and having massive swaths of #[cfg] then why not just do this all on crates.io?

It sounds like these are also considered LLVM bugs but presumably we're going to continuously run into these issues, right?

@hanna-kruppe:

Well, there is a fundamental issue with putting such code into libraries that are called from multiple contexts: you either duplicate the function for each context to exploit context-specific knowledge, or you share one copy of the code that serves the lowest common denominator.

There are of course intermediates between these extremes: for some kinds of operations, regular inlining can help when it kicks in, and only generating LLVM IR when compiling leaf crates (rust-lang/rust#38913) would effectively propagate -C options (but not #[target_feature]s) to all crates in the dependency graph. But if you want to always use the caller's target features, you have to duplicate the code for every set of target features, which is not obviously desirable even if restricted to functions that use the portable vector types (e.g., most algorithms on 128-bit vectors won't benefit from having AVX available).

@gnzlbg (Contributor Author) commented Apr 12, 2018

I suppose another way to put my reaction is that I'm confused why we have to do this in the first place. Sort of the whole point of "portable SIMD" is to "pray the backend takes care of everything".

I am open to better ways to proceed. The current approach has been to:

  • file these bugs (LLVM bugs: 37087, 36982, 36933, 36796, 36734, 36702, amongst others).
  • get them fixed: two are fixed already, two have patches under review.
  • add workarounds for the critical ones, like these two, which prevent Firefox from moving off the simd crate

What do you propose we do instead? We could skip the workarounds and tell people either to live with the bugs for a while or to work around them in their own crates. That would be fine with me. Since we are not following LLVM trunk, "for a while" might be 6 months or 1.5 years, depending on how unlucky we get. And for these two, I think LLVM 8 is optimistic at best.

If we end up defining everything ourselves and having massive swaths of #[cfg] then why not just do this all on crates.io?

The workarounds here account for less than 1% of the vector methods (fewer than 20 methods out of the ~2000 we currently have). Over 99% of the methods currently use the LLVM intrinsics, so we would need to stabilize those to be able to do this on crates.io.

The alternative would be to not add any methods with bugs to the vector types. Sadly, when the issue to add these methods (which were already part of the simd crate) was opened, nobody mentioned that their LLVM implementations had bugs. Nobody even mentioned that their implementations in the simd crate also had bugs.


EDIT: I would prefer to fix these directly in LLVM, but fixing these in LLVM just to discover 6 months from now that the fix wasn't enough and having to wait another 6 months to try again sucks. Having the workarounds already in place makes it suck a whole lot less.

@Centril commented Apr 12, 2018

Small note on effects: I'm not preparing an RFC for a general effect system, but rather for const specifically, though I do want to gain an understanding of effects in general as well.

@alexcrichton (Member):

@gnzlbg hm ok, good to have numbers!

In my mind the "absolute ideal" solution here is to have a special node in MIR corresponding to "expand to true or false depending on target features when monomorphized". That would be an unstable language feature, obviously, but means you could write everything here as:

if __unstable_target_feature_activated("foo") {
    // ...
} else {
    // ...
}

and that wouldn't have the same runtime penalty as is_x86_feature_detected!.

This is definitely very similar to MIR-only rlibs and what happens with overflow checking and debug assertions for primitives today. I think such a feature would at least help reduce the boilerplate here and also lift the need for recompiling std as these are all inlined and monomorphized on the fly anyway.

In any case I do think that we could probably land this in the meantime. I'd be hesitant to land anything that's not already exposed through the standard targets though. For example, I don't think the AVX-specific code here is ever tested or even compiled?

In terms of our own CI as well, perhaps we should run cargo test at a few different levels of -C target-feature=.. for the portable types?

@gnzlbg (Contributor Author) commented Apr 12, 2018 via email

@alexcrichton (Member):

Oh, I'm not too worried about the codegen for the function itself, but more so about correctness. In that sense I think it's fine to hand-inspect the assembly and otherwise just test for correctness.

@gnzlbg (Contributor Author) commented Apr 13, 2018

am worried about the correctness.

I've updated the ci/run script to compile with avx as well on all x86 targets and run the release tests.

These two reductions are tested by the portable vector type tests, and all is exercised by the std::arch tests as well (because PartialEq is implemented as a.eq(b).all()). So if those pass, that should cover correctness.

Oh I'm not too worried about the codegen for the function itself

I am not too worried either, but the whole point of these workarounds is to fix codegen bugs, so I'd rather check that they work as intended and that we don't inadvertently break this in the future.

halves: ($half, $half),
vec: $id
}
let halves = U { vec: self }.halves;
Member:

Is this a valid way to split a vector into its halves? It seems that clang uses shuffles for this purpose.

With this implementation, I see a bunch of functions whose core follows this pattern fail unit tests:

        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            let above_ascii = u8x16::splat(0x80);
            s.lt(above_ascii).all()
        }

Contributor Author:

Is this a valid way to split a vector into its halves?

It is a valid way to do this. The RFC specifies that the layout of vectors is the same as that of arrays with the same element type and the same number of lanes, e.g., f32x2 has the same layout as [f32; 2]. Since the layout of arrays is the same as the layout of tuples with the same element type and the same number of elements, e.g., (f32, f32) in the previous example, then yes, this is valid. I chose a tuple over an array here because I did not want to worry about using .get_unchecked.
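
A minimal self-contained sketch of the union-based split being discussed, using plain arrays to stand in for the vector types (this relies on the layout claim above; the union and function names here are hypothetical, not the PR's code):

```rust
// Read a value back through a union to reinterpret it as its two halves,
// mirroring the `U { vec: self }.halves` line in the diff above.
union Split {
    whole: [f32; 4],
    halves: ([f32; 2], [f32; 2]),
}

fn split(v: [f32; 4]) -> ([f32; 2], [f32; 2]) {
    unsafe { Split { whole: v }.halves }
}

fn main() {
    let (lo, hi) = split([1.0, 2.0, 3.0, 4.0]);
    assert_eq!(lo, [1.0, 2.0]);
    assert_eq!(hi, [3.0, 4.0]);
}
```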

With this implementation, I see a bunch of functions whose core follows this pattern fail unit tests:

On which arch? Can you post a minimal example that reproduces it here?

Member:

On which arch? Can you post a minimal example that reproduces it here?

ARMv7 with -C target-feature=+neon. I'll try to minimize it.

Contributor Author:

Does this work on the other targets like x86 and x86_64 ?

Member:

Yes. I tested that the pattern works on x86_64 by temporarily replacing the x86_64-specific code with the portable code.

Member:

Sorry about misattributing the bug. The bug was a misplaced parenthesis in my conditional compilation directives, which caused ARMv7+NEON to use a stride size constant intended for ARMv7 ALU code.

So: Not a blocker for getting this PR merged.

@hsivonen (Member):

I suppose another way to put my reaction is that I'm confused why we have to do this in the first place. Sort of the whole point of "portable SIMD" is to "pray the backend takes care of everything".

That position may make sense from a compiler front-end developer's point of view, but a user of the language sees the standard library interface and doesn't care which layer (standard library, compiler front end, or LLVM) does the Right Thing.

This PR makes the black box that the user of the language sees do the Right Thing (ignoring the concern I have about whether the ARMv7+NEON code path splits the vector into its halves using the right LLVM idiom; I haven't debugged it fully yet).

It would be really sad not to get the user-of-the-language-facing Right Thing at this time even if ideally it could be moved to another layer later.

If we end up defining everything ourselves and having massive swaths of #[cfg] then why not just do this all on crates.io?

These operations logically belong on the types provided by std::simd, so moving them to crates.io would be really bad API design.

The PR addresses the performance concern I've had regarding moving encoding_rs/Firefox from the simd crate to stdsimd on x86_64 and aarch64. (x86 benchmarks are still running. I'm running them on actual non-x86_64 hardware, so it's like watching paint dry.)

@hsivonen (Member):

With this implementation, I see a bunch of functions whose core follows this pattern fail unit tests:

       #[inline(always)]
       pub fn simd_is_ascii(s: u8x16) -> bool {
           let above_ascii = u8x16::splat(0x80);
           s.lt(above_ascii).all()
       }

I used

RUSTFLAGS='-C target-feature=+neon' cargo test --features simd-accel

With revision d7013a3393611b678b9f4abc99510bc875de7b68 of the stdsimd branch of encoding_rs.

(x86 benchmarks are still running. I'm running them on actual non-x86_64 hardware, so it's like watching paint dry.)

And the x86 results are in.

This PR addresses my performance concerns on all the architectures that encoding_rs uses the simd crate on at present (x86, x86_64 and aarch64). 🎉 Thank you, @gnzlbg!

@gnzlbg (Contributor Author) commented Apr 13, 2018

It would be really sad not to get the user-of-the-language-facing Right Thing at this time even if ideally it could be moved to another layer later.

@hsivonen the problem is that patching this on the front-end is brittle. For example, this patch does not work for #[target_feature(enable = "avx")] unless the user recompiles std with avx enabled. Patching this on the backend wouldn't require that. So this is at best a stop-gap solution until this gets fixed in LLVM and we upgrade to that particular LLVM version.

@hsivonen (Member):

So this is at best a stop-gap solution until this gets fixed in LLVM and we upgrade to that particular LLVM version.

To be clear, I think this ideally belongs in LLVM and should be fixed there eventually. But I think this PR should land to keep things from being unusably slow in the meantime, instead of making users of the language wait for the LLVM change plus the time for that change to make its way into rustc. In particular, failure to address the AVX case in an ideal way should not be a reason not to land the fix that makes portable 128-bit SIMD performant.

ci/run.sh Outdated
case ${TARGET} in
x86*)
RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
cargo_test "--release"
Member:

This'll somehow need to disable all the assert_instr tests as well, as anything beneath AVX is likely to generate different instructions with AVX enabled.

@alexcrichton (Member):

r=me with green CI

@gnzlbg (Contributor Author) commented Apr 20, 2018

@hsivonen this PR has pretty much stalled since I've been low on time; feel free to pick it up. Otherwise I might have some time to finish it next week.

@gnzlbg (Contributor Author) commented Apr 27, 2018

So the Intel SDE build bot is failing due to inlining failures, probably #427 but showing up on x86 :/

I have enabled all relevant target-features in the last commit; we'll see if that reduces the number of failures there.

@alexcrichton (Member):

Hm it may be that the env var to ignore assert_instr isn't working? When enabling features we should ignore the assert_instr tests I think

@gnzlbg (Contributor Author) commented Apr 27, 2018

@alexcrichton yes, that's not working yet, but the issue I mentioned is unrelated. Raising -C target-feature from SSE2 to AVX results in a lot of intrinsics that were previously inlined not being inlined anymore :/

I would expect the intrinsics to get inlined, and those assert_instr tests to fail because different code is generated (e.g. because LLVM can do something better), but that's not why they are failing. They are failing because they contain a call instruction to the #[target_feature] intrinsics :/

@alexcrichton (Member):

Right yeah but I think your diagnosis of #427 is correct, and this should be fixed by rust-lang/rust#50188

gnzlbg added 2 commits May 2, 2018 09:50

@alexcrichton (Member):

Looks like AppVeyor CI is failing for running too many tests?

@alexcrichton (Member):

64-bit osx has these failures:

failures:
---- ppsv::v64::f32x2_tests::arithmetic stdout ----
	thread 'ppsv::v64::f32x2_tests::arithmetic' panicked at 'assertion failed: `(left == right)`
  left: `f32x2(NaN, 0.0)`,
 right: `f32x2(0.0, 0.0)`', crates/coresimd/tests/../../../coresimd/ppsv/v64.rs:66:1
note: Run with `RUST_BACKTRACE=1` for a backtrace.
---- ppsv::v64::f32x2_tests::arithmetic_scalar stdout ----
	thread 'ppsv::v64::f32x2_tests::arithmetic_scalar' panicked at 'assertion failed: `(left == right)`
  left: `f32x2(NaN, 0.0)`,
 right: `f32x2(0.0, 0.0)`', crates/coresimd/tests/../../../coresimd/ppsv/v64.rs:66:1
failures:
    ppsv::v64::f32x2_tests::arithmetic
    ppsv::v64::f32x2_tests::arithmetic_scalar

The DOCUMENTATION build is going to need a few updates as well, as it's failing.

The x86_64-linux-emulated build is, I think, running too many tests like AppVeyor (and is failing).

The wasm build is failing but not due to issues here. It's ok to move that to allow failures and I'll fix it later

@@ -18,6 +18,9 @@ is-it-maintained-issue-resolution = { repository = "rust-lang-nursery/stdsimd" }
is-it-maintained-open-issues = { repository = "rust-lang-nursery/stdsimd" }
maintenance = { status = "experimental" }

[dependencies]
cfg-if = "0.1"
Member:

For ease of integration into rust-lang/rust, could this macro be inlined? (It's actually quite a small macro)

Contributor Author:

Sure, done.

@gnzlbg (Contributor Author) commented May 4, 2018

All green :/ I'll open an issue to figure out what's happening on macOS on Travis.

@@ -46,6 +46,50 @@ extern crate stdsimd_test;
#[cfg(test)]
extern crate test;

#[doc(hidden)]
macro_rules! cfg_if {
Member:

Since this won't be available in libcore, can this be moved directly to the coresimd/mod.rs file to be used there?

Contributor Author:

Sure :)

Contributor Author:

Done :)

@alexcrichton alexcrichton merged commit b517602 into rust-lang:master May 4, 2018