Workarounds for all/any mask reductions on x86, armv7, and aarch64 #425

Merged · 15 commits merged into rust-lang:master on May 4, 2018

Conversation

@gnzlbg (Contributor) commented Apr 12, 2018

This commit adds workarounds for the mask reductions: all and any.

Note:

  • not all LLVM bugs have been properly reported yet
  • workarounds for 256-bit wide vector masks on armv7 and aarch64 are
    not provided in this PR
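
For reference, the two reductions have very simple semantics; the point of the PR is getting the backends to emit the short instruction sequences shown below instead of lane-by-lane code. A scalar sketch (illustrative only, not the PR's code):

```rust
// Scalar reference semantics of the mask reductions: `all` is a lane-wise
// conjunction, `any` a lane-wise disjunction.
fn all(lanes: &[bool]) -> bool {
    lanes.iter().all(|&lane| lane)
}

fn any(lanes: &[bool]) -> bool {
    lanes.iter().any(|&lane| lane)
}
```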

64-bit wide mask types (m8x8, m16x4, m32x2)

x86_64 with MMX enabled

Before this PR:

all_8x8:
 push    rbp
 mov     rbp, rsp
 movzx   eax, byte, ptr, [rdi, +, 7]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 6]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 5]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 4]
 movd    xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   eax, byte, ptr, [rdi, +, 3]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 2]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 1]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi]
 movd    xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 pand    xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 pand    xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 pand    xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 pop     rbp
 ret
any_8x8:
 push    rbp
 mov     rbp, rsp
 movzx   eax, byte, ptr, [rdi, +, 7]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 6]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 5]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 4]
 movd    xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   eax, byte, ptr, [rdi, +, 3]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 2]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 1]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi]
 movd    xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 por     xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 por     xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 por     xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 pop     rbp
 ret

After this PR for m8x8, m16x4, m32x2:

all_8x8:
 push    rbp
 mov     rbp, rsp
 movq    mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 cmp     eax, 255
 sete    al
 pop     rbp
 ret
any_8x8:
 push    rbp
 mov     rbp, rsp
 movq    mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 test    eax, eax
 setne   al
 pop     rbp
 ret

x86 with MMX enabled

Before this PR:

all_8x8:
 call    L9$pb
L9$pb:
 pop     eax
 mov     ecx, dword, ptr, [esp, +, 4]
 movzx   edx, byte, ptr, [ecx, +, 7]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 6]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 5]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 4]
 movd    xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   edx, byte, ptr, [ecx, +, 3]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 2]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 1]
 movd    xmm0, edx
 movzx   ecx, byte, ptr, [ecx]
 movd    xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [eax, +, LCPI9_0-L9$pb]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 pand    xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 pand    xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 pand    xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 ret
any_8x8:
 call    L8$pb
L8$pb:
 pop     eax
 mov     ecx, dword, ptr, [esp, +, 4]
 movzx   edx, byte, ptr, [ecx, +, 7]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 6]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 5]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 4]
 movd    xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   edx, byte, ptr, [ecx, +, 3]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 2]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 1]
 movd    xmm0, edx
 movzx   ecx, byte, ptr, [ecx]
 movd    xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [eax, +, LCPI8_0-L8$pb]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 por     xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 por     xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 por     xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 ret

After this PR:

all_8x8:
 mov     eax, dword, ptr, [esp, +, 4]
 movq    mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 cmp     eax, 255
 sete    al
 ret
any_8x8:
 mov     eax, dword, ptr, [esp, +, 4]
 movq    mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 test    eax, eax
 setne   al
 ret 

aarch64

Before this PR:

all_8x8:
 ldr     d0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 tst     w8, #0xff
 umov    w10, v0.b[2]
 cset    w8, ne
 tst     w9, #0xff
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[3]
 and     w8, w8, w9
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[4]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[5]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[6]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[7]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 and     w8, w9, w8
 cset    w9, ne
 and     w0, w9, w8
 ret
any_8x8:
 ldr     d0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 orr     w8, w8, w9
 umov    w9, v0.b[2]
 orr     w8, w8, w9
 umov    w9, v0.b[3]
 orr     w8, w8, w9
 umov    w9, v0.b[4]
 orr     w8, w8, w9
 umov    w9, v0.b[5]
 orr     w8, w8, w9
 umov    w9, v0.b[6]
 orr     w8, w8, w9
 umov    w9, v0.b[7]
 orr     w8, w8, w9
 tst     w8, #0xff
 cset    w0, ne
 ret

After this PR:

all_8x8:
 ldr     d0, [x0]
 mov     v0.d[1], v0.d[0]
 uminv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
any_8x8:
 ldr     d0, [x0]
 mov     v0.d[1], v0.d[0]
 umaxv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret

ARMv7 + neon

Before this PR:

all_8x8:
 vmov.i8 d0, #0x1
 vldr    d1, [r0]
 vtst.8  d0, d1, d0
 vext.8  d1, d0, d0, #4
 vand    d0, d0, d1
 vext.8  d1, d0, d0, #2
 vand    d0, d0, d1
 vdup.8  d1, d0[1]
 vand    d0, d0, d1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
any_8x8:
 vmov.i8 d0, #0x1
 vldr    d1, [r0]
 vtst.8  d0, d1, d0
 vext.8  d1, d0, d0, #4
 vorr    d0, d0, d1
 vext.8  d1, d0, d0, #2
 vorr    d0, d0, d1
 vdup.8  d1, d0[1]
 vorr    d0, d0, d1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr

After this PR:

all_8x8:
 vldr    d0, [r0]
 b       <m8x8 as All>::all

<m8x8 as All>::all:
 vpmin.u8 d16, d0, d16
 vpmin.u8 d16, d16, d16
 vpmin.u8 d0, d16, d16
 b       m8x8::extract

any_8x8:
 vldr    d0, [r0]
 b       <m8x8 as Any>::any
 
<m8x8 as Any>::any:
 vpmax.u8 d16, d0, d16
 vpmax.u8 d16, d16, d16
 vpmax.u8 d0, d16, d16
 b       m8x8::extract

(note: inlining does not work properly on ARMv7)
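
The `vpmin`/`vpmax` sequences above fold the eight lanes with pairwise reductions. A rough sketch of that technique using the unstable `core::arch::arm` NEON intrinsics (nightly-only; the function name and the transmute are illustrative assumptions, not the PR's literal code):

```rust
#[cfg(all(target_arch = "arm", target_feature = "neon"))]
unsafe fn all_m8x8(mask: core::arch::arm::uint8x8_t) -> bool {
    use core::arch::arm::vpmin_u8;
    // Three pairwise mins fold the 8 lanes together; a set mask lane is
    // all-ones, so a non-zero minimum means every lane was set.
    let m = vpmin_u8(mask, mask);
    let m = vpmin_u8(m, m);
    let m = vpmin_u8(m, m);
    let bytes: [u8; 8] = core::mem::transmute(m);
    bytes[0] != 0
}
```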

128-bit wide mask types (m8x16, m16x8, m32x4, m64x2)

x86_64 with SSE2 enabled

Before this PR:

all_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 xor     ecx, ecx
 cmp     eax, 65535
 mov     eax, -1
 cmovne  eax, ecx
 and     al, 1
 pop     rbp
 ret
any_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 neg     eax
 sbb     eax, eax
 and     al, 1
 pop     rbp
 ret

After this PR:

all_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp     eax, 65535
 sete    al
 pop     rbp
 ret
any_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
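
The `pmovmskb`-based sequences above are a movemask reduction; a minimal sketch with the `core::arch` SSE2 intrinsics (the function names and taking the mask as a raw `__m128i` are illustrative assumptions, not stdsimd's actual code):

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn all_m8x16(mask: core::arch::x86_64::__m128i) -> bool {
    use core::arch::x86_64::_mm_movemask_epi8;
    // Each mask lane is all-ones or all-zeros, so the byte-wise movemask
    // collapses the 16 lanes into a 16-bit integer.
    _mm_movemask_epi8(mask) == 0xFFFF
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn any_m8x16(mask: core::arch::x86_64::__m128i) -> bool {
    use core::arch::x86_64::_mm_movemask_epi8;
    _mm_movemask_epi8(mask) != 0
}
```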

aarch64

Before this PR:

all_8x16:
 ldr     q0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 tst     w8, #0xff
 umov    w10, v0.b[2]
 cset    w8, ne
 tst     w9, #0xff
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[3]
 and     w8, w8, w9
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[4]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[5]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[6]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[7]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[8]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[9]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[10]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[11]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[12]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[13]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[14]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[15]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 and     w8, w9, w8
 cset    w9, ne
 and     w0, w9, w8
 ret
any_8x16:
 ldr     q0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 orr     w8, w8, w9
 umov    w9, v0.b[2]
 orr     w8, w8, w9
 umov    w9, v0.b[3]
 orr     w8, w8, w9
 umov    w9, v0.b[4]
 orr     w8, w8, w9
 umov    w9, v0.b[5]
 orr     w8, w8, w9
 umov    w9, v0.b[6]
 orr     w8, w8, w9
 umov    w9, v0.b[7]
 orr     w8, w8, w9
 umov    w9, v0.b[8]
 orr     w8, w8, w9
 umov    w9, v0.b[9]
 orr     w8, w8, w9
 umov    w9, v0.b[10]
 orr     w8, w8, w9
 umov    w9, v0.b[11]
 orr     w8, w8, w9
 umov    w9, v0.b[12]
 orr     w8, w8, w9
 umov    w9, v0.b[13]
 orr     w8, w8, w9
 umov    w9, v0.b[14]
 orr     w8, w8, w9
 umov    w9, v0.b[15]
 orr     w8, w8, w9
 tst     w8, #0xff
 cset    w0, ne
 ret

After this PR:

all_8x16:
 ldr     q0, [x0]
 uminv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
any_8x16:
 ldr     q0, [x0]
 umaxv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
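
The `uminv`/`umaxv` instructions above are horizontal reductions over the whole vector; a minimal sketch with the `core::arch::aarch64` NEON intrinsics (illustrative only; stdsimd's generated code may differ):

```rust
#[cfg(target_arch = "aarch64")]
unsafe fn all_m8x16(mask: core::arch::aarch64::uint8x16_t) -> bool {
    use core::arch::aarch64::vminvq_u8;
    // If the minimum lane is non-zero, every lane is non-zero (i.e. set).
    vminvq_u8(mask) != 0
}

#[cfg(target_arch = "aarch64")]
unsafe fn any_m8x16(mask: core::arch::aarch64::uint8x16_t) -> bool {
    use core::arch::aarch64::vmaxvq_u8;
    vmaxvq_u8(mask) != 0
}
```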

ARMv7 + neon

Before this PR:

all_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8  q0, q1, q0
 vext.8  q1, q0, q0, #8
 vand    q0, q0, q1
 vext.8  q1, q0, q0, #4
 vand    q0, q0, q1
 vext.8  q1, q0, q0, #2
 vand    q0, q0, q1
 vdup.8  q1, d0[1]
 vand    q0, q0, q1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
any_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8  q0, q1, q0
 vext.8  q1, q0, q0, #8
 vorr    q0, q0, q1
 vext.8  q1, q0, q0, #4
 vorr    q0, q0, q1
 vext.8  q1, q0, q0, #2
 vorr    q0, q0, q1
 vdup.8  q1, d0[1]
 vorr    q0, q0, q1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr

After this PR:

all_8x16:
 vld1.64 {d0, d1}, [r0]
 b       <m8x16 as All>::all
 
<m8x16 as All>::all: 
 vpmin.u8 d0, d0, d1
 b       <m8x8 as All>::all
any_8x16:
 vld1.64 {d0, d1}, [r0]
 b       <m8x16 as Any>::any
 
<m8x16 as Any>::any:
 vpmax.u8 d0, d0, d1
 b       <m8x8 as Any>::any

The inlining problems are pretty bad on ARMv7 + NEON.

256-bit wide mask types (m8x32, m16x16, m32x8, m64x4)

With SSE2 enabled

Before this PR:

all_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI17_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 movdqa  xmm2, xmmword, ptr, [rdi, +, 16]
 pand    xmm2, xmm0
 pcmpeqb xmm2, xmm0
 pcmpeqb xmm1, xmm0
 pand    xmm1, xmm2
 pmovmskb eax, xmm1
 xor     ecx, ecx
 cmp     eax, 65535
 mov     eax, -1
 cmovne  eax, ecx
 and     al, 1
 pop     rbp
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 por     xmm0, xmmword, ptr, [rdi, +, 16]
 movdqa  xmm1, xmmword, ptr, [rip, +, LCPI16_0]
 pand    xmm0, xmm1
 pcmpeqb xmm0, xmm1
 pmovmskb eax, xmm0
 neg     eax
 sbb     eax, eax
 and     al, 1
 pop     rbp
 ret

After this PR:

all_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp     eax, 65535
 jne     LBB17_1
 movdqa  xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb ecx, xmm0
 mov     al, 1
 cmp     ecx, 65535
 je      LBB17_3
LBB17_1:
 xor     eax, eax
LBB17_3:
 pop     rbp
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb ecx, xmm0
 mov     al, 1
 test    ecx, ecx
 je      LBB16_1
 pop     rbp
 ret
LBB16_1:
 movdqa  xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb eax, xmm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
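
Without AVX there is no single 256-bit instruction, so the workaround (as the asm above shows) reduces the two 128-bit halves and combines the results; a minimal sketch, assuming the mask has already been split into two `__m128i` halves (the real code operates on the 256-bit portable type directly):

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn all_m8x32(lo: core::arch::x86_64::__m128i, hi: core::arch::x86_64::__m128i) -> bool {
    use core::arch::x86_64::_mm_movemask_epi8;
    // Short-circuits on the first half, matching the branchy asm above.
    _mm_movemask_epi8(lo) == 0xFFFF && _mm_movemask_epi8(hi) == 0xFFFF
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
unsafe fn any_m8x32(lo: core::arch::x86_64::__m128i, hi: core::arch::x86_64::__m128i) -> bool {
    use core::arch::x86_64::_mm_movemask_epi8;
    _mm_movemask_epi8(lo) != 0 || _mm_movemask_epi8(hi) != 0
}
```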

With AVX enabled

Before this PR:

all_8x32:
 push    rbp
 mov     rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps  ymm0, ymm0, ymmword, ptr, [rip, +, LCPI25_0]
 vextractf128 xmm1, ymm0, 1
 vpxor   xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor   xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor   xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vandps  ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vandps  ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vandps  ymm0, ymm0, ymm1
 vpsrld  xmm1, xmm0, 16
 vandps  ymm0, ymm0, ymm1
 vpsrlw  xmm1, xmm0, 8
 vandps  ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and     al, 1
 pop     rbp
 vzeroupper
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps  ymm0, ymm0, ymmword, ptr, [rip, +, LCPI24_0]
 vextractf128 xmm1, ymm0, 1
 vpxor   xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor   xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor   xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vorps   ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vorps   ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vorps   ymm0, ymm0, ymm1
 vpsrld  xmm1, xmm0, 16
 vorps   ymm0, ymm0, ymm1
 vpsrlw  xmm1, xmm0, 8
 vorps   ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and     al, 1
 pop     rbp
 vzeroupper
 ret

After this PR:

all_8x32:
 push    rbp
 mov     rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vxorps  xmm1, xmm1, xmm1
 vcmptrueps ymm1, ymm1, ymm1
 vptest  ymm0, ymm1
 setb    al
 pop     rbp
 vzeroupper
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vptest  ymm0, ymm0
 setne   al
 pop     rbp
 vzeroupper
 ret
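
The AVX path uses `vptest`: testing the mask against all-ones sets the carry flag only when every bit of the mask is set (`all`), and testing it against itself sets the zero flag only when no bit is set (`any`). A minimal sketch with the `core::arch` AVX intrinsics (illustrative, not the PR's literal code):

```rust
#[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
unsafe fn all_m8x32(mask: core::arch::x86_64::__m256i) -> bool {
    use core::arch::x86_64::{_mm256_set1_epi8, _mm256_testc_si256};
    // CF is set iff (!mask & ones) == 0, i.e. every bit of the mask is set.
    let ones = _mm256_set1_epi8(-1);
    _mm256_testc_si256(mask, ones) != 0
}

#[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
unsafe fn any_m8x32(mask: core::arch::x86_64::__m256i) -> bool {
    use core::arch::x86_64::_mm256_testz_si256;
    // ZF is set iff (mask & mask) == 0, i.e. the mask is all zeros.
    _mm256_testz_si256(mask, mask) == 0
}
```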

Closes #362 .

@gnzlbg (Contributor Author) commented Apr 12, 2018

cc @hsivonen

--

@alexcrichton there are still some failures on x86, but I have no idea why they happen yet; the asm generated for the x86 and x86_64 functions looks identical to me. Also, it appears that on ARMv7 nothing ever gets inlined, and I don't know why. AFAICT I am setting #[inline] and #[target_feature] appropriately.

@alexcrichton (Member):

Thanks for this! I'll admit though I'm very quickly getting lost in all this...

So purely on the surface I'm curious what's going on here? For example, if we conditionalize these impls on #[cfg(target_feature = ...)] that's not too useful, right? Don't we want everything to be compiled depending on downstream users' compilation environments?

I realize that we don't really have any better solution than this today, but this to me feels like we should be pursuing a different language-level solution. For example I feel like we should be able to monomorphize these implementations into downstream crates, reconfiguring the settings dynamically as they're monomorphized. That way if you compile with -C target-feature=+neon you get all the optimized versions (and still no runtime overhead of checking) from stdsimd.

Does that make sense? Is that perhaps too far out, though?

@gnzlbg (Contributor Author) commented Apr 12, 2018

@alexcrichton

For example, if we conditionalize these impls on #[cfg(target_feature = ...)] that's not too useful, right? Don't we want everything to be compiled depending on downstream users' compilation environments?

Well sse2 is available on x86_64 by default, and neon is available on aarch64 by default, so those would work with std out of the box.

For AVX, and for NEON on ARMv7, users would need to recompile std with those features enabled.

I realize that we don't really have any better solution than this today, but this to me feels like we should be pursuing a different language-level solution. For example I feel like we should be able to monomorphize these implementations into downstream crates, reconfiguring the settings dynamically as they're monomorphized. That way if you compile with -C target-feature=+neon you get all the optimized versions (and still no runtime overhead of checking) from stdsimd.

I was talking with @rkruppe about this the other day. The main problems are that:

  • we can't do this with an attribute-based approach
  • because any crate in the ecosystem can write #[target_feature(enable = "avx")] fn foo() { ... } and call anything from there, all the code of the whole program has to be available to the current translation unit, so that all of it can be monomorphized for whichever set of target features is enabled at each call site.

Does that make sense?

Yes, this is what I want, and probably what all other users want too.

Is that perhaps too far out though?

That depends. Three main goals for 2018 are effects: async, const fn, and target-feature.

Rust code wants to be able to ask "am I being executed at compile time?" for the const effect and "am I being called synchronously or asynchronously?" for async code (e.g. whether the code is called with or without await), and to branch depending on the answer; asking "am I being compiled with AVX?" is not that different from either.

Rust's approach towards these 2018 goals has been "let's #[attribute] the hell out of these problems", but now that we have solutions to all these problems, instead of rushing towards stabilization, I think we should give the PL PhDs working on Rust enough time to propose an effect system that handles all of these in a principled way.

@Centril is preparing an RFC for an effect system for Rust. If we think of target-feature as an effect, all Rust code must be polymorphic over it. This would fix some of the hard-to-fix bugs that are currently open, like cfg(target_feature) returning false for a feature that is enabled via #[target_feature] on the function where it is used, and it would fix them across functions and crates, not only in lexical scopes. Also, with an effect system, the target-feature 1.1 RFC probably wouldn't be necessary.

Honestly, I wish we could ship std::arch without shipping #[target_feature], cfg(target_feature), is_..._feature_detected, and company.

@hanna-kruppe:

A general effect system is a huge step; it certainly won't make it into Rust 2018, and it may never be added.

But regardless of that, I don't think it's a good idea to treat this (propagating target features to libraries) as a general monomorphization-style deal. Making all code parametric over target features and monomorphizing it for different feature sets seems like a great way to make binary size explode. There has to be some sort of opt-in; otherwise any program that uses some non-trivial #[target_feature] attributes would contain multiple copies of the libraries it uses, in most cases without any appreciable performance improvement.

@alexcrichton (Member):

Hm ok this makes sense. I agree with @rkruppe that gaining an effect system is unlikely to happen any time soon so we probably don't want to plan on that happening.

I suppose another way to put my reaction is that I'm confused why we have to do this in the first place. Sort of the whole point of "portable SIMD" is to "pray the backend takes care of everything". If we end up defining everything ourselves and having massive swaths of #[cfg] then why not just do this all on crates.io?

It sounds like these are also considered LLVM bugs but presumably we're going to continuously run into these issues, right?

@hanna-kruppe:

Well, there is a fundamental issue with putting such code into libraries that are called from multiple contexts: you either duplicate the function for each context to exploit context-specific knowledge, or you share one copy of the code that serves the lowest common denominator.

There are of course intermediates between these extremes: for some kinds of operations, regular inlining can help when it kicks in, and only generating LLVM IR when compiling leaf crates (rust-lang/rust#38913) would effectively propagate -C options (but not #[target_feature]s) to all crates in the dependency graph. But if you want to always use the caller's target features, you have to duplicate the code for every set of target features, which is not obviously desirable even if restricted to functions that use the portable vector types (e.g., most algorithms on 128-bit vectors won't benefit from having AVX available).

@gnzlbg (Contributor Author) commented Apr 12, 2018

I suppose another way to put my reaction is that I'm confused why we have to do this in the first place. Sort of the whole point of "portable SIMD" is to "pray the backend takes care of everything".

I am open to better ways to proceed. The current approach has been to:

  • file these bugs (LLVM bugs: 37087, 36982, 36933, 36796, 36734, 36702, amongst others).
  • get them fixed: two are fixed already, two have patches under review.
  • add workarounds for the critical ones, like these two, which prevent Firefox from moving off the simd crate

What do you propose we do instead? We could skip the workarounds and tell people either to live with the bugs for a while or to work around them in their own crates. That would be fine with me. Since we are not following LLVM trunk, "for a while" might be 6 months or 1.5 years, depending on how unlucky we get. And for these two, I think LLVM 8 is optimistic at best.

If we end up defining everything ourselves and having massive swaths of #[cfg] then why not just do this all on crates.io?

The workarounds here account for less than 1% of the vector methods (fewer than 20 methods out of the ~2000 we currently have). Over 99% of the methods currently use the LLVM intrinsics, so we would need to stabilize those to be able to do this on crates.io.

The alternative would be to not add any methods with bugs to the vector types. Sadly, when the issue to add these methods (which were already part of the simd crate) was opened, nobody mentioned that their LLVM implementations had bugs. Nobody even mentioned that their implementations in the simd crate also had bugs.


EDIT: I would prefer to fix these directly in LLVM, but fixing these in LLVM just to discover 6 months from now that the fix wasn't enough and having to wait another 6 months to try again sucks. Having the workarounds already in place makes it suck a whole lot less.

@Centril commented Apr 12, 2018

Small note on effects: I'm not preparing an RFC for a general effect system, but rather for const specifically, though I do want to gain an understanding of effects in general as well.

@alexcrichton (Member):

@gnzlbg hm ok, good to have numbers!

In my mind the "absolute ideal" solution here is to have a special node in MIR corresponding to "expand to true or false depending on target features when monomorphized". That would be an unstable language feature, obviously, but means you could write everything here as:

if __unstable_target_feature_activated("foo") {
    // ...
} else {
    // ...
}

and that wouldn't have the same runtime penalty as is_x86_feature_detected!.

This is definitely very similar to MIR-only rlibs and what happens with overflow checking and debug assertions for primitives today. I think such a feature would at least help reduce the boilerplate here and also lift the need for recompiling std as these are all inlined and monomorphized on the fly anyway.

In any case I do think that we could probably land this in the meantime. I'd be hesitant to land anything that's not already exposed through the standard targets though. For example, I don't think the AVX-specific code here is ever tested or even compiled?

In terms of our own CI as well, perhaps we should run cargo test at a few different levels of -C target-feature=.. for the portable types?

@gnzlbg (Contributor Author) commented Apr 12, 2018 via email

@alexcrichton (Member):

Oh, I'm not too worried about the codegen for the function itself, but more so about correctness. In that sense I think it's fine to hand-inspect the assembly and otherwise just test for correctness.

@gnzlbg (Contributor Author) commented Apr 13, 2018

am worried about the correctness.

I've updated the ci/run script to compile with avx as well on all x86 targets and run the release tests.

These two reductions are tested by the portable vector type tests, and all is exercised by the std::arch tests as well (because PartialEq is implemented as a.eq(b).all()). So if those pass, that should cover correctness.

Oh I'm not too worried about the codegen for the function itself

I am not too worried either, but the whole point of these workarounds is to fix codegen bugs, so I'd rather check that they work as intended and that we don't inadvertently break this in the future.

halves: ($half, $half),
vec: $id
}
let halves = U { vec: self }.halves;
Member:

Is this a valid way to split a vector into its halves? It seems that clang uses shuffles for this purpose.

With this implementation, I see a bunch of functions whose core follows this pattern fail unit tests:

        #[inline(always)]
        pub fn simd_is_ascii(s: u8x16) -> bool {
            let above_ascii = u8x16::splat(0x80);
            s.lt(above_ascii).all()
        }

Contributor Author:

Is this a valid way to split a vector into its halves?

It is a valid way to do this. The RFC specifies that the layout of vectors is the same as that of arrays with the same element type and the same number of lanes, e.g., f32x2 has the same layout as [f32; 2]. Since the layout of arrays is the same as the layout of tuples with the same element type and the same number of elements, e.g., (f32, f32) in the previous example, then yes, this is valid. I chose a tuple over an array here because I did not want to worry about using .get_unchecked.
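
A minimal self-contained sketch of the union-based split being discussed, using plain arrays to stand in for the vector types (this relies on the layout claim above; the union and function names here are hypothetical, not the PR's code):

```rust
// Read a value back through a union to reinterpret it as its two halves,
// mirroring the `U { vec: self }.halves` line in the diff above.
union Split {
    whole: [f32; 4],
    halves: ([f32; 2], [f32; 2]),
}

fn split(v: [f32; 4]) -> ([f32; 2], [f32; 2]) {
    unsafe { Split { whole: v }.halves }
}

fn main() {
    let (lo, hi) = split([1.0, 2.0, 3.0, 4.0]);
    assert_eq!(lo, [1.0, 2.0]);
    assert_eq!(hi, [3.0, 4.0]);
}
```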

With this implementation, I see a bunch of functions whose core follows this pattern fail unit tests:

On which arch? Can you post a minimal example that reproduces it here?

Member:

On which arch? Can you post a minimal example that reproduces it here?

ARMv7 with -C target-feature=+neon. I'll try to minimize it.

Contributor Author:

Does this work on the other targets like x86 and x86_64 ?

Member:

Yes. I tested that the pattern works on x86_64 by temporarily replacing the x86_64-specific code with the portable code.

Member:

Sorry about misattributing the bug. The bug was a misplaced parenthesis in my conditional compilation directives, which caused ARMv7+NEON to use a stride size constant intended for ARMv7 ALU code.

So: Not a blocker for getting this PR merged.

@hsivonen (Member):

I suppose another way to put my reaction is that I'm confused why we have to do this in the first place. Sort of the whole point of "portable SIMD" is to "pray the backend takes care of everything".

That position may make sense from a compiler front-end developer's point of view, but a user of the language sees the standard library interface and doesn't care which layer (standard library, compiler front end, or LLVM) does the Right Thing.

This PR makes the black box that the user of the language sees do the Right Thing (ignoring the concern I have about whether the ARMv7+NEON code path splits the vector into its halves using the right LLVM idiom; I haven't debugged it fully yet).

It would be really sad not to get the user-of-the-language-facing Right Thing at this time even if ideally it could be moved to another layer later.

If we end up defining everything ourselves and having massive swaths of #[cfg] then why not just do this all on crates.io?

These operations logically belong on the types provided by std::simd, so moving them to crates.io would be really bad API design.

The PR addresses the performance concern I've had regarding moving encoding_rs/Firefox from the simd crate to stdsimd on x86_64 and aarch64. (x86 benchmarks are still running. I'm running them on actual non-x86_64 hardware, so it's like watching paint dry.)

@hsivonen (Member):

With this implementation, I see a bunch of functions whose core follows this pattern fail unit tests:

       #[inline(always)]
       pub fn simd_is_ascii(s: u8x16) -> bool {
           let above_ascii = u8x16::splat(0x80);
           s.lt(above_ascii).all()
       }

I used

RUSTFLAGS='-C target-feature=+neon' cargo test --features simd-accel

With revision d7013a3393611b678b9f4abc99510bc875de7b68 of the stdsimd branch of encoding_rs.

(x86 benchmarks are still running. I'm running them on actual non-x86_64 hardware, so it's like watching paint dry.)

And the x86 results are in.

This PR addresses my performance concerns on all the architectures that encoding_rs uses the simd crate on at present (x86, x86_64 and aarch64). 🎉 Thank you, @gnzlbg!

@gnzlbg (Contributor Author) commented Apr 13, 2018

It would be really sad not to get the user-of-the-language-facing Right Thing at this time even if ideally it could be moved to another layer later.

@hsivonen the problem is that patching this on the front-end is brittle. For example, this patch does not work for #[target_feature(enable = "avx")] unless the user recompiles std with avx enabled. Patching this on the backend wouldn't require that. So this is at best a stop-gap solution until this gets fixed in LLVM and we upgrade to that particular LLVM version.

@hsivonen (Member):

So this is at best a stop-gap solution until this gets fixed in LLVM and we upgrade to that particular LLVM version.

To be clear, I think this ideally belongs in LLVM and should be fixed there eventually. But I think this PR should land to keep things from being unusably slow in the meantime, instead of making users of the language wait for the LLVM change plus the time for that change to make its way into rustc. In particular, failure to address the AVX case in an ideal way should not be a reason not to land the fix that makes portable 128-bit SIMD performant.

ci/run.sh Outdated
case ${TARGET} in
x86*)
RUSTFLAGS="${RUSTFLAGS} -C target-feature=+avx"
cargo_test "--release"
Member:

This'll somehow need to disable all the assert_instr tests as well, as anything beneath AVX is likely to generate different instructions with AVX enabled.

@alexcrichton (Member):

r=me with green CI

@gnzlbg (Contributor Author) commented Apr 20, 2018

@hsivonen this PR has pretty much stalled since I've been low on time; feel free to pick it up. Otherwise I might have some time to finish it next week.

@gnzlbg (Contributor Author) commented Apr 27, 2018

So the Intel SDE build bot is failing due to inlining failures, probably #427 but showing up on x86 :/

I have enabled all relevant target-features in the last commit; we'll see if that reduces the number of failures there.

@alexcrichton (Member):

Hm it may be that the env var to ignore assert_instr isn't working? When enabling features we should ignore the assert_instr tests I think

@gnzlbg (Contributor Author) commented Apr 27, 2018

@alexcrichton yes, that's not working yet, but the issue I mentioned is unrelated. Raising -C target-feature from SSE2 to AVX results in a lot of intrinsics that were previously inlined not being inlined anymore :/

I would expect the intrinsics to get inlined, and those assert_instr tests to fail because different code is generated (e.g. because LLVM can do something better), but that's not why they are failing. They are failing because they contain a call instruction to the #[target_feature] intrinsics :/

@alexcrichton (Member):

Right yeah but I think your diagnosis of #427 is correct, and this should be fixed by rust-lang/rust#50188

gnzlbg added 2 commits May 2, 2018 09:50

@alexcrichton (Member):

Looks like AppVeyor CI is failing for running too many tests?

@alexcrichton (Member):

64-bit osx has these failures:

failures:
---- ppsv::v64::f32x2_tests::arithmetic stdout ----
	thread 'ppsv::v64::f32x2_tests::arithmetic' panicked at 'assertion failed: `(left == right)`
  left: `f32x2(NaN, 0.0)`,
 right: `f32x2(0.0, 0.0)`', crates/coresimd/tests/../../../coresimd/ppsv/v64.rs:66:1
note: Run with `RUST_BACKTRACE=1` for a backtrace.
---- ppsv::v64::f32x2_tests::arithmetic_scalar stdout ----
	thread 'ppsv::v64::f32x2_tests::arithmetic_scalar' panicked at 'assertion failed: `(left == right)`
  left: `f32x2(NaN, 0.0)`,
 right: `f32x2(0.0, 0.0)`', crates/coresimd/tests/../../../coresimd/ppsv/v64.rs:66:1
failures:
    ppsv::v64::f32x2_tests::arithmetic
    ppsv::v64::f32x2_tests::arithmetic_scalar

The DOCUMENTATION build is going to need a few updates as well, as it's failing.

The x86_64-linux-emulated build is, I think, running too many tests like AppVeyor (and is failing).

The wasm build is failing but not due to issues here. It's ok to move that to allow failures and I'll fix it later

@@ -18,6 +18,9 @@ is-it-maintained-issue-resolution = { repository = "rust-lang-nursery/stdsimd" }
is-it-maintained-open-issues = { repository = "rust-lang-nursery/stdsimd" }
maintenance = { status = "experimental" }

[dependencies]
cfg-if = "0.1"
Member:

For ease of integration into rust-lang/rust, could this macro be inlined? (It's actually quite a small macro)

Contributor Author:

Sure, done.

@gnzlbg (Contributor Author) commented May 4, 2018

All green :/ I'll open an issue to figure out what's happening on macOS on Travis.

@@ -46,6 +46,50 @@ extern crate stdsimd_test;
#[cfg(test)]
extern crate test;

#[doc(hidden)]
macro_rules! cfg_if {
Member:

Since this won't be available in libcore, can this be moved directly to the coresimd/mod.rs file to be used there?

Contributor Author:

Sure :)

Contributor Author:

Done :)

@alexcrichton alexcrichton merged commit b517602 into rust-lang:master May 4, 2018