
Boolean vectors should use hand-tuned any() and all() #362

Closed · hsivonen opened this issue Mar 9, 2018 · 17 comments

hsivonen (Member) commented Mar 9, 2018

Thank you for adding portable SIMD and boolean vectors!

In stdsimd, any() and all() are based on LLVM's cross-ISA bitwise reductions across the vector, so these operations fail to use the type-system information that is the reason for their existence: the guarantee that all the bits of each lane are the same. I.e. in b8x16, each lane is either 0x00 or 0xFF.

This means that looking at all the bits is unnecessary, since for each lane, examining any one bit is sufficient. In the SSE2 case, _mm_movemask_epi8 takes one bit from each lane of a u8x16. It's what the simd crate uses.
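
For reference, a minimal hand-tuned sketch of what this could look like on SSE2, written against std::arch directly (the function names here are mine, for illustration; this is not stdsimd's code):

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{__m128i, _mm_movemask_epi8};

// Assumes the boolean-vector invariant: every lane is 0x00 or 0xFF.
#[cfg(target_arch = "x86_64")]
unsafe fn mask_any(v: __m128i) -> bool {
    // pmovmskb gathers the top bit of each of the 16 lanes into an i32.
    _mm_movemask_epi8(v) != 0
}

#[cfg(target_arch = "x86_64")]
unsafe fn mask_all(v: __m128i) -> bool {
    // All 16 lane bits must be set.
    _mm_movemask_epi8(v) == 0xFFFF
}
```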

I've uploaded a demo crate that exports uninlined versions of any() and all() on u8x16 using both simd and stdsimd. The generated code for the simd crate versions is much simpler. Code generated using stdsimd should be as good.

```asm
    .text
    .file   "anyall0-66776763bbd7db1b14e447f414e2cf9a.rs"
    .section    .text.stdsimd_any_8x16,"ax",@progbits
    .globl  stdsimd_any_8x16
    .p2align    4, 0x90
    .type   stdsimd_any_8x16,@function
stdsimd_any_8x16:
    .cfi_startproc
    movdqa  (%rdi), %xmm0
    pshufd  $78, %xmm0, %xmm1
    por %xmm0, %xmm1
    pshufd  $229, %xmm1, %xmm0
    por %xmm1, %xmm0
    movdqa  %xmm0, %xmm1
    psrld   $16, %xmm1
    por %xmm0, %xmm1
    movdqa  %xmm1, %xmm0
    psrlw   $8, %xmm0
    por %xmm1, %xmm0
    movd    %xmm0, %eax
    testb   %al, %al
    setne   %al
    retq
.Lfunc_end0:
    .size   stdsimd_any_8x16, .Lfunc_end0-stdsimd_any_8x16
    .cfi_endproc

    .section    .text.stdsimd_all_8x16,"ax",@progbits
    .globl  stdsimd_all_8x16
    .p2align    4, 0x90
    .type   stdsimd_all_8x16,@function
stdsimd_all_8x16:
    .cfi_startproc
    movdqa  (%rdi), %xmm0
    pshufd  $78, %xmm0, %xmm1
    pand    %xmm0, %xmm1
    pshufd  $229, %xmm1, %xmm0
    pand    %xmm1, %xmm0
    movdqa  %xmm0, %xmm1
    psrld   $16, %xmm1
    pand    %xmm0, %xmm1
    movdqa  %xmm1, %xmm0
    psrlw   $8, %xmm0
    pand    %xmm1, %xmm0
    movd    %xmm0, %eax
    testb   %al, %al
    setne   %al
    retq
.Lfunc_end1:
    .size   stdsimd_all_8x16, .Lfunc_end1-stdsimd_all_8x16
    .cfi_endproc

    .section    .text.simd_any_8x16,"ax",@progbits
    .globl  simd_any_8x16
    .p2align    4, 0x90
    .type   simd_any_8x16,@function
simd_any_8x16:
    .cfi_startproc
    movdqa  (%rdi), %xmm0
    pmovmskb    %xmm0, %eax
    testl   %eax, %eax
    setne   %al
    retq
.Lfunc_end2:
    .size   simd_any_8x16, .Lfunc_end2-simd_any_8x16
    .cfi_endproc

    .section    .text.simd_all_8x16,"ax",@progbits
    .globl  simd_all_8x16
    .p2align    4, 0x90
    .type   simd_all_8x16,@function
simd_all_8x16:
    .cfi_startproc
    movdqa  (%rdi), %xmm0
    pmovmskb    %xmm0, %eax
    cmpl    $65535, %eax
    sete    %al
    retq
.Lfunc_end3:
    .size   simd_all_8x16, .Lfunc_end3-simd_all_8x16
    .cfi_endproc


    .section    ".note.GNU-stack","",@progbits
```
hsivonen mentioned this issue Mar 9, 2018

gnzlbg (Contributor) commented Mar 9, 2018

Thanks for the report @hsivonen, this is pretty much expected at this point.

The problem here is that b8x16 is actually just a <16 x i8> LLVM vector. Because you can transmute any other <16 x i8> LLVM vector into it, in this particular case, LLVM does not know that it only contains 0xffs or 0x00s.

If I add the following two functions:

```rust
pub fn stdsimd_eq_all_8x16(a: stdsimd::simd::i8x16, b: stdsimd::simd::i8x16) -> bool {
    stdsimd_all_8x16(a.eq(b))
}

pub fn simd_eq_all_8x16(a: simd::i8x16, b: simd::i8x16) -> bool {
    simd_all_8x16(a.eq(b))
}
```

You will see that they "pretty much" generate identical assembly (*).

While the current implementation makes it trivial to add custom implementations of and/or/xor for the boolean vector types, I'd rather try a bit harder to get LLVM to emit proper code-gen first. If that does not work, then I'll add custom implementations for these. EDIT: or just remove the reductions completely (they can be implemented in a crate external to stdsimd, like faster).


@rkruppe @alexcrichton I suspect that it would be enough to simd_cast the boolean vectors to LLVM vectors of type <N x i1> in the implementation of all/none/any to significantly improve code-gen here.


  • (*) The assembly is not identical, but that's probably a different issue; it needs further investigation.

hsivonen (Member, Author) commented Mar 9, 2018

Because you can transmute any other <16 x i8> LLVM vector into it,

Transmutes are unsafe. If the programmer transmutes something to a boolean vector, it's up to the programmer to make sure the invariants of the type are upheld.

The whole point of having boolean vectors is to use the type system to communicate the precondition (all bits on a lane are the same) of any() and all() to enable guaranteed-_mm_movemask_epi8-based any() and all() on SSE2.

in this particular case, LLVM does not know that it only contains 0xffs or 0x00s.

It seems more reliable to have manually-written any() and all(), since LLVM doesn't have the information that boolean vector types are trying to signal.

just remove the reductions completely (they can be implemented in a crate external

This would be unfortunate. In particular, the ARMv7+NEON implementation is subtle enough that it should be in the standard library to save application programmers from themselves. (It's still unclear to me if I got the simd crate PR for the ARMv7+NEON versions right.) Also, it would be sad if adding support for SIMD on POWER8 required hunting around the crate ecosystem for outside-core implementations of any() and all() instead of being able to provide implementations in core.
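
For comparison, the aarch64 case is the easy one. Here is a hedged sketch using today's std::arch::aarch64 names (illustrative only; on ARMv7 there is no across-vector min/max, so the reduction has to be built from pairwise vpmin/vpmax steps, which is exactly the subtle part):

```rust
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::{uint8x16_t, vmaxvq_u8, vminvq_u8};

// Assumes the boolean-vector invariant: every lane is 0x00 or 0xFF.
#[cfg(target_arch = "aarch64")]
unsafe fn mask_all(v: uint8x16_t) -> bool {
    // uminv reduces across the whole vector in one instruction:
    // the minimum lane is nonzero iff every lane is set.
    vminvq_u8(v) != 0
}

#[cfg(target_arch = "aarch64")]
unsafe fn mask_any(v: uint8x16_t) -> bool {
    // umaxv: the maximum lane is nonzero iff at least one lane is set.
    vmaxvq_u8(v) != 0
}
```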

gnzlbg (Contributor) commented Mar 9, 2018

Transmutes are unsafe. If the programmer transmutes something to a boolean vector, it's up to the programmer to make sure the invariants of the type are upheld.

As mentioned, the problem is that we are currently not telling LLVM what the invariants on std::simd's boolean vectors are.

The whole point of having boolean vectors is to use the type system to communicate the precondition (all bits on a lane are the same) of any() and all() to enable guaranteed-_mm_movemask_epi8-based any() and all() on SSE2.

I strongly disagree. That's a consequence of boolean vectors, but it's not the point. std::simd's boolean vectors try to offer a way to convey to the optimizer that the code operates on vectors of boolean values, so that the optimizer can choose whatever instructions it deems fit for whatever ISA is being targeted.

This must be done in std because the compiler intrinsics necessary for it are not available for third-party crates.

This would be unfortunate. [...] it would be sad [...] hunting around the crate ecosystem [...]
It seems more reliable to have manually-written any() and all() [...] ARMv7+NEON [...]

Anyone can do this today on nightly:

```rust
pub struct b8x16(std::arch::x86::__m128i);
impl b8x16 { fn all(self) -> bool { /* ... */ } }
```

put it on a crate, and have it work on stable Rust soon.

The point of std is to allow users to do these things in their own crates, not to save them from having to use crates.io every time they need to do X. So I fundamentally disagree with all you said there as well.

alexcrichton (Member) commented

@gnzlbg for any and all it looks like we're using the "experimental" reductions in LLVM, maybe they're just not well tuned yet? Presumably we could go ahead and call the more optimized versions in particular cases, right? (aka the direct intrinsics)

hanna-kruppe commented

Yeah, the reduction intrinsics are new-ish and originally intended for ARM SVE if I remember correctly, so it wouldn't surprise me if the x86 backend simply doesn't have good lowering for them yet.

gnzlbg (Contributor) commented Mar 9, 2018

@alexcrichton

for any and all it looks like we're using the "experimental" reductions in LLVM, maybe they're just not well tuned yet?

The reductions are working perfectly. If you write:

```rust
fn foo(x: i8x8) -> i8 { x.and() }
```

you get pretty good assembly code for it:

```asm
 punpcklbw xmm0, xmm0
 pshufd  xmm1, xmm0, 78
 pand    xmm1, xmm0
 pshufd  xmm0, xmm1, 229
 pand    xmm0, xmm1
 movdqa  xmm1, xmm0
 psrld   xmm1, 16
 pand    xmm1, xmm0
 movd    eax, xmm1
 test    al, al
 setne   al
```

The problem is that when @hsivonen writes:

```rust
fn foo(x: b8x8) -> bool { x.all() }
```

stdsimd translates it into fn foo(x: i8x8) -> bool { x.and() != 0 }, and that is exactly what LLVM is generating code for.

So at least in this case LLVM is doing exactly what we are telling it to do.

Also, if we call foo(a.eq(b)), then LLVM generates the instruction that @hsivonen wanted. Why? IMO it is because a.eq(b) maps to the generic vector-comparison intrinsic, which returns a boolean LLVM vector. Even though we then zero-extend it to i8x8, LLVM still knows that the lanes of this i8x8 contain only 0x00 and 0xff, which map to true and false, and it can optimize the and reduction from a sequence of shuffles into a single instruction (the one @hsivonen wanted).

Does this sound like something a buggy implementation would be able to do?
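
To make the contrast concrete, a hedged sketch of the two cases using the stdsimd types discussed above (illustrative only):

```rust
use stdsimd::simd::i8x8;

// The mask arrives as an opaque integer vector: LLVM must inspect all the
// bits, so the and-reduction lowers to the shuffle sequence shown above.
fn opaque_mask(x: i8x8) -> bool {
    x.and() != 0
}

// The mask comes straight out of a comparison, so LLVM knows every lane is
// 0x00 or 0xff and can lower the reduction to a single movemask-style test.
fn fresh_mask(a: i8x8, b: i8x8) -> bool {
    a.eq(b).all()
}
```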


Presumably we could go ahead and call the more optimized versions in particular cases, right?

If this were failing for one operation of one vector type on one ISA, and it were a filed and acknowledged LLVM bug, then adding a temporary workaround until the bug fix lands in Rust's LLVM would probably be the right call.

But this bug affects all portable vector operations for all boolean vectors on all ISAs, and considering all the cases above, it looks like a stdsimd/rustc bug that we might be able to fix by simd_casting the boolean vectors to i1xN types and back before calling any LLVM intrinsic on boolean vectors.

hsivonen (Member, Author) commented Mar 12, 2018

The whole point of having boolean vectors is to use the type system to communicate the precondition (all bits on a lane are the same) of any() and all() to enable guaranteed-_mm_movemask_epi8-based any() and all() on SSE2.

I strongly disagree. That's a consequence of boolean vectors, but it's not the point. std::simd's boolean vectors try to offer a way to convey to the optimizer that the code operates on vectors of boolean values, so that the optimizer can choose whatever instructions it deems fit for whatever ISA is being targeted.

When the target is SSE2, could there possibly be an instruction sequence other than pmovmskb followed by a single ALU comparison that would be more performant? Is there any scenario where a failure to generate pmovmskb (and no other SIMD instructions) when targeting SSE2 is not a performance bug?

Anyone can do this today on nightly:

```rust
pub struct b8x16(std::arch::x86::__m128i);
impl b8x16 { fn all(self) -> bool { /* ... */ } }
```

put it on a crate, and have it work on stable Rust soon.

The point of std is to allow users to do these things in their own crates, not to save them from having to use crates.io every time they need to do X. So I fundamentally disagree with all you said there as well.

If we look at std as it exists prior to all the SIMD stuff and try to infer a purpose from what's there, I don't think the conclusion is that stuff should be in std only if it couldn't be implemented outside std.

Clearly, std provides portability beyond merely exposing the primitives that would let a crates.io crate provide portability via those primitives plus conditional compilation. Example: abstracting away the fact that NT file paths are sequences of 16-bit units instead of sequences of 8-bit units as on other mainstream platforms.

Furthermore, std provides stuff that needs to be fast and correct even when there's no concern about portability abstractions or access to intrinsics. Example: Binary search.

In this light, it seems entirely reasonable to me to take the position that std should provide any() and all() implementations that are as fast as they can be and that get extended to cover new ISAs as Rust expands its ISA reach, and that it's an std bug if programmers working outside std need to reach for std::arch to get faster any() and all().

But this bug affects all portable vector operations for all boolean vectors on all ISAs,

FWIW, in the case of aarch64, you might get away with switching to a different portable reduction across the vector that "happens" to map directly to a reduction that aarch64 provides a single instruction for.

and considering all the cases above, it looks like a stdsimd/rustc bug that we might be able to fix by simd_casting the boolean vectors to i1xN types and back before calling any LLVM intrinsic on boolean vectors.

Are the semantics of LLVM's internal SIMD casts such that the cast itself trusts the invariant that all bits in each lane are the same, so that there's no need to generate code to look at all the bits?

If this can be fixed within the constraint of rustc currently using LLVM 6 such that any() and all() consistently generate no more SIMD instructions than pmovmskb when targeting SSE2, that's fine (and preferable!). If not, I think it's not OK to avoid doing the thing that the simd crate does and wait for the generic LLVM-level fix. (That is, for the purpose of getting Firefox off the simd crate and using stdsimd, I think the performance regression that currently results is not OK.)

gnzlbg (Contributor) commented Mar 13, 2018

Here is an update. I've sent a rustc PR (rust-lang/rust#48983) and a stdsimd PR (#371) that improve the situation from:

```asm
; stdsimd_all_8x16
    movdqa  (%rdi), %xmm0
    pshufd  $78, %xmm0, %xmm1
    pand    %xmm0, %xmm1
    pshufd  $229, %xmm1, %xmm0
    pand    %xmm1, %xmm0
    movdqa  %xmm0, %xmm1
    psrld   $16, %xmm1
    pand    %xmm0, %xmm1
    movdqa  %xmm1, %xmm0
    psrlw   $8, %xmm0
    pand    %xmm1, %xmm0
    movd    %xmm0, %eax
    testb   %al, %al
    setne   %al
```

to this:

```asm
  pand %xmm0, %xmm1 # xmm0 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
  pcmpeqb %xmm0, %xmm1
  pmovmskb %xmm1, %eax
  xorl %ecx, %ecx
  cmpl $65535, %eax # imm = 0xFFFF
  movl $-1, %eax
  cmovnel %ecx, %eax
  testb $1, %al
```

which is still far from perfect, but way better than the shuffle sequence that was being generated before.

The pand with ones and the subsequent pcmpeqb are no-ops if the input is a boolean vector, but I haven't been able to trick LLVM into removing them yet. I've filed an LLVM bug about this here: https://bugs.llvm.org/show_bug.cgi?id=36702 We'll see what happens with that. Maybe it's not a bug and I am just missing the obvious way of telling LLVM about the invariants of the input.

gnzlbg (Contributor) commented Apr 11, 2018

The simd crate only provides an implementation of all() for bool8ix16 in the sse2 module, which should not be available if sse2 is not available. ~~However, compiling without sse2 still produces a pmovmskb.~~

How can this even happen?

EDIT: It's a bug: the simd crate currently requires sse2.

If you fix the current compilation error and then try to compile without sse2 (e.g. via RUSTFLAGS="-C target-feature=-sse2"), the crate does not compile because, among other things, all() for bool8ix16 is missing.

gnzlbg (Contributor) commented Apr 11, 2018

Also, the workarounds in the simd crate should not work when compiling globally without sse2 and using #[target_feature] to enable it locally. That is:

```rust
#[target_feature(enable = "sse2")]
pub unsafe fn foo(b: bool8ix16) -> bool {
    b.all()
}
```

won't be able to use pmovmskb: cfg(target_feature = "sse2") inside the simd crate is false, since the simd crate was compiled without sse2 enabled globally.

Therefore, we can't really work around these issues in either rustc or stdsimd without propagating #[target_feature] across function calls such that the cfg(target_feature)s inside them are expanded again with the updated feature sets and the functions re-compiled.


EDIT: for sse2 this does not matter much because it is enabled by default on the i686 and x86_64 tier 1 targets, but sse2 is just an example. This will matter for any other intrinsics for which LLVM might produce bad code.
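
A minimal sketch of the dispatch pattern at issue (illustrative names, not the simd crate's actual code). The cfg is resolved when the library itself is compiled, so a caller's local #[target_feature(enable = "sse2")] cannot select the fast path:

```rust
#[cfg(target_feature = "sse2")]
fn all_lanes_true(mask: [u8; 16]) -> bool {
    // A pmovmskb-based fast path would live here; it is only compiled in
    // if the *library* was built with sse2 enabled globally.
    mask.iter().all(|&b| b != 0)
}

#[cfg(not(target_feature = "sse2"))]
fn all_lanes_true(mask: [u8; 16]) -> bool {
    // Scalar fallback: this is what callers get when the library was built
    // without sse2, even if they enable sse2 locally via #[target_feature].
    mask.iter().all(|&b| b != 0)
}
```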

hsivonen (Member, Author) commented

For my use cases, I care about the 128-bit reductions being fast when everything is compiled with SSE2 enabled or when everything is compiled with NEON enabled.

gnzlbg (Contributor) commented Apr 11, 2018

For my use cases, I care about the 128-bit reductions being fast when everything is compiled with SSE2 enabled or when everything is compiled with NEON enabled.

Sure, I am preparing a PR with workarounds for those. Just be aware that if you start caring about AVX/AVX2 and you can't compile everything with that enabled, then you are going to have to manually select which instructions to use until the upstream LLVM bugs are fixed, because right now we can't really work around these anywhere.

gnzlbg (Contributor) commented Apr 11, 2018

if you start caring about AVX/AVX2 and you can't compile everything with that enabled

Probably not even that. For those using std::simd, they might need to actually recompile std with avx2 enabled to benefit from this.

gnzlbg (Contributor) commented Apr 11, 2018

@hsivonen which arm targets are you using? I need to check whether they have NEON enabled in rustc by default.

hsivonen (Member, Author) commented

which arm targets are you using?

Currently, I'm testing on aarch64-unknown-linux-gnu, armv7-unknown-linux-gnueabihf, and occasionally armv7-unknown-linux-gnueabihf with the neon feature added from the command line. The reason why I don't do the latter all the time is that the ARMv7+NEON support is broken in the simd crate, and I've been hoping to switch to stdsimd instead of fixing that properly.

Mozilla ships with armv7-linux-androideabi and additionally builds nightlies but does not yet ship releases with aarch64-linux-android.

I need to check whether they have NEON enabled in rustc by default.

```console
$ rustc -Z unstable-options --target armv7-linux-androideabi --print target-spec-json | grep features
  "features": "+v7,+thumb-mode,+thumb2,+vfp3,+d16,-neon",
$ rustc -Z unstable-options --target armv7-unknown-linux-gnueabihf --print target-spec-json | grep features
  "features": "+v7,+vfp3,+d16,+thumb2,-neon",
```

C++ code in Firefox for ARMv7 Android is built with NEON unconditionally enabled, so I think Firefox would benefit from Rust re-introducing an ARMv7+NEON target both for Android and GNU/Linux (Android for shipping and GNU/Linux for testing).

gnzlbg (Contributor) commented Apr 12, 2018

@hsivonen I am going to send a PR today that fixes this issue on master, and I'd like to ask you to try it using the stdsimd master branch and let me know if it works.

You won't get good code-gen using std::simd unless you recompile std with neon enabled on both the armv7 and aarch64 targets. That is, either you need to use xargo, or we would need to add two new targets, {armv7,aarch64}neon-unknown-linux-gnu, to rustc somehow (cc @alexcrichton: what are our options here?)

hsivonen (Member, Author) commented

Thanks. I filed an issue about NEON-enabled ARMv7 targets and I intend to try to write a PR for it now. (aarch64 should be fine already.)

gnzlbg added a commit to gnzlbg/stdsimd that referenced this issue Apr 12, 2018
This commit adds workarounds for the mask reductions: `all` and `any`.

Note:

* not all LLVM bugs have been properly filed yet
* workarounds for 256-bit vector masks on armv7 and aarch64 are
  not provided in this PR

Before this PR:

```asm
all_8x8:
 push    rbp
 mov     rbp, rsp
 movzx   eax, byte, ptr, [rdi, +, 7]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 6]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 5]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 4]
 movd    xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   eax, byte, ptr, [rdi, +, 3]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 2]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 1]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi]
 movd    xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 pand    xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 pand    xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 pand    xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 pop     rbp
 ret
any_8x8:
 push    rbp
 mov     rbp, rsp
 movzx   eax, byte, ptr, [rdi, +, 7]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 6]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 5]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 4]
 movd    xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   eax, byte, ptr, [rdi, +, 3]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 2]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 1]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi]
 movd    xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 por     xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 por     xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 por     xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 pop     rbp
 ret
```

After this PR for `m8x8`, `m16x4`, `m32x2`:

```asm
all_8x8:
 push    rbp
 mov     rbp, rsp
 movq    mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 cmp     eax, 255
 sete    al
 pop     rbp
 ret
any_8x8:
 push    rbp
 mov     rbp, rsp
 movq    mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
```

Before this PR:

```asm
all_8x8:
 call    L9$pb
L9$pb:
 pop     eax
 mov     ecx, dword, ptr, [esp, +, 4]
 movzx   edx, byte, ptr, [ecx, +, 7]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 6]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 5]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 4]
 movd    xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   edx, byte, ptr, [ecx, +, 3]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 2]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 1]
 movd    xmm0, edx
 movzx   ecx, byte, ptr, [ecx]
 movd    xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [eax, +, LCPI9_0-L9$pb]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 pand    xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 pand    xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 pand    xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 ret
any_8x8:
 call    L8$pb
L8$pb:
 pop     eax
 mov     ecx, dword, ptr, [esp, +, 4]
 movzx   edx, byte, ptr, [ecx, +, 7]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 6]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 5]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 4]
 movd    xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   edx, byte, ptr, [ecx, +, 3]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 2]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 1]
 movd    xmm0, edx
 movzx   ecx, byte, ptr, [ecx]
 movd    xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [eax, +, LCPI8_0-L8$pb]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 por     xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 por     xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 por     xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 ret
```

After this PR:

```asm
all_8x8:
 mov     eax, dword, ptr, [esp, +, 4]
 movq    mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 cmp     eax, 255
 sete    al
 ret
any_8x8:
 mov     eax, dword, ptr, [esp, +, 4]
 movq    mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 test    eax, eax
 setne   al
 ret
```

Before this PR:

```asm
all_8x8:
 ldr     d0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 tst     w8, #0xff
 umov    w10, v0.b[2]
 cset    w8, ne
 tst     w9, #0xff
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[3]
 and     w8, w8, w9
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[4]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[5]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[6]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[7]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 and     w8, w9, w8
 cset    w9, ne
 and     w0, w9, w8
 ret
any_8x8:
 ldr     d0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 orr     w8, w8, w9
 umov    w9, v0.b[2]
 orr     w8, w8, w9
 umov    w9, v0.b[3]
 orr     w8, w8, w9
 umov    w9, v0.b[4]
 orr     w8, w8, w9
 umov    w9, v0.b[5]
 orr     w8, w8, w9
 umov    w9, v0.b[6]
 orr     w8, w8, w9
 umov    w9, v0.b[7]
 orr     w8, w8, w9
 tst     w8, #0xff
 cset    w0, ne
 ret
```

After this PR:

```asm
all_8x8:
 ldr     d0, [x0]
 mov     v0.d[1], v0.d[0]
 uminv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
any_8x8:
 ldr     d0, [x0]
 mov     v0.d[1], v0.d[0]
 umaxv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
```

Before this PR:

```asm
all_8x8:
 vmov.i8 d0, #0x1
 vldr    d1, [r0]
 vtst.8  d0, d1, d0
 vext.8  d1, d0, d0, #4
 vand    d0, d0, d1
 vext.8  d1, d0, d0, #2
 vand    d0, d0, d1
 vdup.8  d1, d0[1]
 vand    d0, d0, d1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
any_8x8:
 vmov.i8 d0, #0x1
 vldr    d1, [r0]
 vtst.8  d0, d1, d0
 vext.8  d1, d0, d0, #4
 vorr    d0, d0, d1
 vext.8  d1, d0, d0, #2
 vorr    d0, d0, d1
 vdup.8  d1, d0[1]
 vorr    d0, d0, d1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
```

After this PR:

```asm
all_8x8:
 vldr    d0, [r0]
 b       <m8x8 as All>::all

<m8x8 as All>::all:
 vpmin.u8 d16, d0, d16
 vpmin.u8 d16, d16, d16
 vpmin.u8 d0, d16, d16
 b       m8x8::extract

any_8x8:
 vldr    d0, [r0]
 b       <m8x8 as Any>::any

<m8x8 as Any>::any:
 vpmax.u8 d16, d0, d16
 vpmax.u8 d16, d16, d16
 vpmax.u8 d0, d16, d16
 b       m8x8::extract
```

(note: inlining does not work properly on ARMv7)

Before this PR:

```asm
all_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 xor     ecx, ecx
 cmp     eax, 65535
 mov     eax, -1
 cmovne  eax, ecx
 and     al, 1
 pop     rbp
 ret
any_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 neg     eax
 sbb     eax, eax
 and     al, 1
 pop     rbp
 ret
```

After this PR:

```asm
all_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp     eax, 65535
 sete    al
 pop     rbp
 ret
any_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
```

Before this PR:

```asm
all_8x16:
 ldr     q0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 tst     w8, #0xff
 umov    w10, v0.b[2]
 cset    w8, ne
 tst     w9, #0xff
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[3]
 and     w8, w8, w9
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[4]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[5]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[6]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[7]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[8]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[9]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[10]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[11]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[12]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[13]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[14]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[15]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 and     w8, w9, w8
 cset    w9, ne
 and     w0, w9, w8
 ret
any_8x16:
 ldr     q0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 orr     w8, w8, w9
 umov    w9, v0.b[2]
 orr     w8, w8, w9
 umov    w9, v0.b[3]
 orr     w8, w8, w9
 umov    w9, v0.b[4]
 orr     w8, w8, w9
 umov    w9, v0.b[5]
 orr     w8, w8, w9
 umov    w9, v0.b[6]
 orr     w8, w8, w9
 umov    w9, v0.b[7]
 orr     w8, w8, w9
 umov    w9, v0.b[8]
 orr     w8, w8, w9
 umov    w9, v0.b[9]
 orr     w8, w8, w9
 umov    w9, v0.b[10]
 orr     w8, w8, w9
 umov    w9, v0.b[11]
 orr     w8, w8, w9
 umov    w9, v0.b[12]
 orr     w8, w8, w9
 umov    w9, v0.b[13]
 orr     w8, w8, w9
 umov    w9, v0.b[14]
 orr     w8, w8, w9
 umov    w9, v0.b[15]
 orr     w8, w8, w9
 tst     w8, #0xff
 cset    w0, ne
 ret
```

After this PR:

```asm
all_8x16:
 ldr     q0, [x0]
 uminv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
any_8x16:
 ldr     q0, [x0]
 umaxv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
```

Before this PR:

```asm
all_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8  q0, q1, q0
 vext.8  q1, q0, q0, #8
 vand    q0, q0, q1
 vext.8  q1, q0, q0, #4
 vand    q0, q0, q1
 vext.8  q1, q0, q0, #2
 vand    q0, q0, q1
 vdup.8  q1, d0[1]
 vand    q0, q0, q1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
any_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8  q0, q1, q0
 vext.8  q1, q0, q0, #8
 vorr    q0, q0, q1
 vext.8  q1, q0, q0, #4
 vorr    q0, q0, q1
 vext.8  q1, q0, q0, #2
 vorr    q0, q0, q1
 vdup.8  q1, d0[1]
 vorr    q0, q0, q1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
```

After this PR:

```asm
all_8x16:
 vld1.64 {d0, d1}, [r0]
 b       <m8x16 as All>::all

<m8x16 as All>::all:
 vpmin.u8 d0, d0, d1
 b       <m8x8 as All>::all
any_8x16:
 vld1.64 {d0, d1}, [r0]
 b       <m8x16 as Any>::any

<m8x16 as Any>::any:
 vpmax.u8 d0, d0, d1
 b       <m8x8 as Any>::any
```

The inlining problems are pretty bad on ARMv7 + NEON.

Before this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI17_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 movdqa  xmm2, xmmword, ptr, [rdi, +, 16]
 pand    xmm2, xmm0
 pcmpeqb xmm2, xmm0
 pcmpeqb xmm1, xmm0
 pand    xmm1, xmm2
 pmovmskb eax, xmm1
 xor     ecx, ecx
 cmp     eax, 65535
 mov     eax, -1
 cmovne  eax, ecx
 and     al, 1
 pop     rbp
 ret
 any_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 por     xmm0, xmmword, ptr, [rdi, +, 16]
 movdqa  xmm1, xmmword, ptr, [rip, +, LCPI16_0]
 pand    xmm0, xmm1
 pcmpeqb xmm0, xmm1
 pmovmskb eax, xmm0
 neg     eax
 sbb     eax, eax
 and     al, 1
 pop     rbp
 ret
```

After this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp     eax, 65535
 jne     LBB17_1
 movdqa  xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb ecx, xmm0
 mov     al, 1
 cmp     ecx, 65535
 je      LBB17_3
LBB17_1:
 xor     eax, eax
LBB17_3:
 pop     rbp
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb ecx, xmm0
 mov     al, 1
 test    ecx, ecx
 je      LBB16_1
 pop     rbp
 ret
LBB16_1:
 movdqa  xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb eax, xmm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
```

Before this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps  ymm0, ymm0, ymmword, ptr, [rip, +, LCPI25_0]
 vextractf128 xmm1, ymm0, 1
 vpxor   xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor   xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor   xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vandps  ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vandps  ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vandps  ymm0, ymm0, ymm1
 vpsrld  xmm1, xmm0, 16
 vandps  ymm0, ymm0, ymm1
 vpsrlw  xmm1, xmm0, 8
 vandps  ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and     al, 1
 pop     rbp
 vzeroupper
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps  ymm0, ymm0, ymmword, ptr, [rip, +, LCPI24_0]
 vextractf128 xmm1, ymm0, 1
 vpxor   xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor   xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor   xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vorps   ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vorps   ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vorps   ymm0, ymm0, ymm1
 vpsrld  xmm1, xmm0, 16
 vorps   ymm0, ymm0, ymm1
 vpsrlw  xmm1, xmm0, 8
 vorps   ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and     al, 1
 pop     rbp
 vzeroupper
 ret
```

After this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vxorps  xmm1, xmm1, xmm1
 vcmptrueps ymm1, ymm1, ymm1
 vptest  ymm0, ymm1
 setb    al
 pop     rbp
 vzeroupper
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vptest  ymm0, ymm0
 setne   al
 pop     rbp
 vzeroupper
 ret
```

---

Closes rust-lang#362.
gnzlbg added a commit to gnzlbg/stdsimd that referenced this issue Apr 12, 2018
This commit adds workarounds for the mask reductions: `all` and `any`.

64-bit wide mask types (`m8x8`, `m16x4`, `m32x2`)

`x86_64` with `MMX` enabled

Before this PR:

```asm
all_8x8:
 push    rbp
 mov     rbp, rsp
 movzx   eax, byte, ptr, [rdi, +, 7]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 6]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 5]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 4]
 movd    xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   eax, byte, ptr, [rdi, +, 3]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 2]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 1]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi]
 movd    xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 pand    xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 pand    xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 pand    xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 pop     rbp
 ret
any_8x8:
 push    rbp
 mov     rbp, rsp
 movzx   eax, byte, ptr, [rdi, +, 7]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 6]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 5]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 4]
 movd    xmm2, eax
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   eax, byte, ptr, [rdi, +, 3]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi, +, 2]
 movd    xmm1, eax
 punpcklwd xmm1, xmm0
 movzx   eax, byte, ptr, [rdi, +, 1]
 movd    xmm0, eax
 movzx   eax, byte, ptr, [rdi]
 movd    xmm3, eax
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 por     xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 por     xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 por     xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 pop     rbp
 ret
```

After this PR for `m8x8`, `m16x4`, `m32x2`:

```asm
all_8x8:
 push    rbp
 mov     rbp, rsp
 movq    mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 cmp     eax, 255
 sete    al
 pop     rbp
 ret
any_8x8:
 push    rbp
 mov     rbp, rsp
 movq    mm0, qword, ptr, [rdi]
 pmovmskb eax, mm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
```

`x86` with `MMX` enabled

Before this PR:

```asm
all_8x8:
 call    L9$pb
L9$pb:
 pop     eax
 mov     ecx, dword, ptr, [esp, +, 4]
 movzx   edx, byte, ptr, [ecx, +, 7]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 6]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 5]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 4]
 movd    xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   edx, byte, ptr, [ecx, +, 3]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 2]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 1]
 movd    xmm0, edx
 movzx   ecx, byte, ptr, [ecx]
 movd    xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [eax, +, LCPI9_0-L9$pb]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 pand    xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 pand    xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 pand    xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 ret
any_8x8:
 call    L8$pb
L8$pb:
 pop     eax
 mov     ecx, dword, ptr, [esp, +, 4]
 movzx   edx, byte, ptr, [ecx, +, 7]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 6]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 5]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 4]
 movd    xmm2, edx
 punpcklwd xmm2, xmm0
 punpckldq xmm2, xmm1
 movzx   edx, byte, ptr, [ecx, +, 3]
 movd    xmm0, edx
 movzx   edx, byte, ptr, [ecx, +, 2]
 movd    xmm1, edx
 punpcklwd xmm1, xmm0
 movzx   edx, byte, ptr, [ecx, +, 1]
 movd    xmm0, edx
 movzx   ecx, byte, ptr, [ecx]
 movd    xmm3, ecx
 punpcklwd xmm3, xmm0
 punpckldq xmm3, xmm1
 punpcklqdq xmm3, xmm2
 movdqa  xmm0, xmmword, ptr, [eax, +, LCPI8_0-L8$pb]
 pand    xmm3, xmm0
 pcmpeqw xmm3, xmm0
 pshufd  xmm0, xmm3, 78
 por     xmm0, xmm3
 pshufd  xmm1, xmm0, 229
 por     xmm1, xmm0
 movdqa  xmm0, xmm1
 psrld   xmm0, 16
 por     xmm0, xmm1
 movd    eax, xmm0
 and     al, 1
 ret
```

After this PR:

```asm
all_8x8:
 mov     eax, dword, ptr, [esp, +, 4]
 movq    mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 cmp     eax, 255
 sete    al
 ret
any_8x8:
 mov     eax, dword, ptr, [esp, +, 4]
 movq    mm0, qword, ptr, [eax]
 pmovmskb eax, mm0
 test    eax, eax
 setne   al
 ret
```

`aarch64`

Before this PR:

```asm
all_8x8:
 ldr     d0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 tst     w8, #0xff
 umov    w10, v0.b[2]
 cset    w8, ne
 tst     w9, #0xff
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[3]
 and     w8, w8, w9
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[4]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[5]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[6]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[7]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 and     w8, w9, w8
 cset    w9, ne
 and     w0, w9, w8
 ret
any_8x8:
 ldr     d0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 orr     w8, w8, w9
 umov    w9, v0.b[2]
 orr     w8, w8, w9
 umov    w9, v0.b[3]
 orr     w8, w8, w9
 umov    w9, v0.b[4]
 orr     w8, w8, w9
 umov    w9, v0.b[5]
 orr     w8, w8, w9
 umov    w9, v0.b[6]
 orr     w8, w8, w9
 umov    w9, v0.b[7]
 orr     w8, w8, w9
 tst     w8, #0xff
 cset    w0, ne
 ret
```

After this PR:

```asm
all_8x8:
 ldr     d0, [x0]
 mov     v0.d[1], v0.d[0]
 uminv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
any_8x8:
 ldr     d0, [x0]
 mov     v0.d[1], v0.d[0]
 umaxv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
```

`ARMv7` + `neon`

Before this PR:

```asm
all_8x8:
 vmov.i8 d0, #0x1
 vldr    d1, [r0]
 vtst.8  d0, d1, d0
 vext.8  d1, d0, d0, #4
 vand    d0, d0, d1
 vext.8  d1, d0, d0, #2
 vand    d0, d0, d1
 vdup.8  d1, d0[1]
 vand    d0, d0, d1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
any_8x8:
 vmov.i8 d0, #0x1
 vldr    d1, [r0]
 vtst.8  d0, d1, d0
 vext.8  d1, d0, d0, #4
 vorr    d0, d0, d1
 vext.8  d1, d0, d0, #2
 vorr    d0, d0, d1
 vdup.8  d1, d0[1]
 vorr    d0, d0, d1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
```

After this PR:

```asm
all_8x8:
 vldr    d0, [r0]
 b       <m8x8 as All>::all

<m8x8 as All>::all:
 vpmin.u8 d16, d0, d16
 vpmin.u8 d16, d16, d16
 vpmin.u8 d0, d16, d16
 b       m8x8::extract

any_8x8:
 vldr    d0, [r0]
 b       <m8x8 as Any>::any

<m8x8 as Any>::any:
 vpmax.u8 d16, d0, d16
 vpmax.u8 d16, d16, d16
 vpmax.u8 d0, d16, d16
 b       m8x8::extract
```

(note: inlining does not work properly on ARMv7)

128-bit wide mask types (`m8x16`, `m16x8`, `m32x4`, `m64x2`)

`x86_64` with SSE2 enabled

Before this PR:

```asm
all_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI9_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 xor     ecx, ecx
 cmp     eax, 65535
 mov     eax, -1
 cmovne  eax, ecx
 and     al, 1
 pop     rbp
 ret
any_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI8_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 pcmpeqb xmm1, xmm0
 pmovmskb eax, xmm1
 neg     eax
 sbb     eax, eax
 and     al, 1
 pop     rbp
 ret
```

After this PR:

```asm
all_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp     eax, 65535
 sete    al
 pop     rbp
 ret
any_8x16:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
```

`aarch64`

Before this PR:

```asm
all_8x16:
 ldr     q0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 tst     w8, #0xff
 umov    w10, v0.b[2]
 cset    w8, ne
 tst     w9, #0xff
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[3]
 and     w8, w8, w9
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[4]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[5]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[6]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[7]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[8]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[9]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[10]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[11]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[12]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[13]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[14]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 umov    w10, v0.b[15]
 and     w8, w9, w8
 cset    w9, ne
 tst     w10, #0xff
 and     w8, w9, w8
 cset    w9, ne
 and     w0, w9, w8
 ret
any_8x16:
 ldr     q0, [x0]
 umov    w8, v0.b[0]
 umov    w9, v0.b[1]
 orr     w8, w8, w9
 umov    w9, v0.b[2]
 orr     w8, w8, w9
 umov    w9, v0.b[3]
 orr     w8, w8, w9
 umov    w9, v0.b[4]
 orr     w8, w8, w9
 umov    w9, v0.b[5]
 orr     w8, w8, w9
 umov    w9, v0.b[6]
 orr     w8, w8, w9
 umov    w9, v0.b[7]
 orr     w8, w8, w9
 umov    w9, v0.b[8]
 orr     w8, w8, w9
 umov    w9, v0.b[9]
 orr     w8, w8, w9
 umov    w9, v0.b[10]
 orr     w8, w8, w9
 umov    w9, v0.b[11]
 orr     w8, w8, w9
 umov    w9, v0.b[12]
 orr     w8, w8, w9
 umov    w9, v0.b[13]
 orr     w8, w8, w9
 umov    w9, v0.b[14]
 orr     w8, w8, w9
 umov    w9, v0.b[15]
 orr     w8, w8, w9
 tst     w8, #0xff
 cset    w0, ne
 ret
```

After this PR:

```asm
all_8x16:
 ldr     q0, [x0]
 uminv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
any_8x16:
 ldr     q0, [x0]
 umaxv   b0, v0.16b
 fmov    w8, s0
 tst     w8, #0xff
 cset    w0, ne
 ret
```
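
The aarch64 codegen is a horizontal unsigned min/max reduction. A sketch, assuming the `vminvq_u8`/`vmaxvq_u8` intrinsics from `core::arch::aarch64` (which lower to `uminv`/`umaxv`; the 64-bit mask case above uses the same idea after widening the `d` register to 16 bytes):

```rust
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::{uint8x16_t, vmaxvq_u8, vminvq_u8};

#[cfg(target_arch = "aarch64")]
unsafe fn all_8x16(m: uint8x16_t) -> bool {
    // The minimum lane is 0xFF only if every lane is 0xFF.
    vminvq_u8(m) == 0xFF
}

#[cfg(target_arch = "aarch64")]
unsafe fn any_8x16(m: uint8x16_t) -> bool {
    // The maximum lane is non-zero iff at least one lane is set.
    vmaxvq_u8(m) != 0
}
```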

`ARMv7` + `neon`

Before this PR:

```asm
all_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8  q0, q1, q0
 vext.8  q1, q0, q0, #8
 vand    q0, q0, q1
 vext.8  q1, q0, q0, #4
 vand    q0, q0, q1
 vext.8  q1, q0, q0, #2
 vand    q0, q0, q1
 vdup.8  q1, d0[1]
 vand    q0, q0, q1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
any_8x16:
 vmov.i8 q0, #0x1
 vld1.64 {d2, d3}, [r0]
 vtst.8  q0, q1, q0
 vext.8  q1, q0, q0, #8
 vorr    q0, q0, q1
 vext.8  q1, q0, q0, #4
 vorr    q0, q0, q1
 vext.8  q1, q0, q0, #2
 vorr    q0, q0, q1
 vdup.8  q1, d0[1]
 vorr    q0, q0, q1
 vmov.u8 r0, d0[0]
 and     r0, r0, #1
 bx      lr
```

After this PR:

```asm
all_8x16:
 vld1.64 {d0, d1}, [r0]
 b       <m8x16 as All>::all

<m8x16 as All>::all:
 vpmin.u8 d0, d0, d1
 b       <m8x8 as All>::all
any_8x16:
 vld1.64 {d0, d1}, [r0]
 b       <m8x16 as Any>::any

<m8x16 as Any>::any:
 vpmax.u8 d0, d0, d1
 b       <m8x8 as Any>::any
```

The inlining problems are pretty bad on ARMv7 + NEON.
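
The NEON workaround itself is just a pairwise reduction. A rough sketch, assuming the nightly-only `vpmin_u8` and `vget_lane_u8` intrinsics from `core::arch::arm` (the `any` case is the same with `vpmax_u8`):

```rust
#[cfg(target_arch = "arm")]
use core::arch::arm::{uint8x8_t, vget_lane_u8, vpmin_u8};

#[cfg(target_arch = "arm")]
unsafe fn all_8x8(m: uint8x8_t) -> bool {
    // Three pairwise mins fold 8 lanes into one (the `vpmin.u8`
    // sequence in the listing above).
    let m = vpmin_u8(m, m);
    let m = vpmin_u8(m, m);
    let m = vpmin_u8(m, m);
    vget_lane_u8::<0>(m) == 0xFF
}
```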

256-bit wide mask types (`m8x32`, `m16x16`, `m32x8`, `m64x4`)

With SSE2 enabled

Before this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rip, +, LCPI17_0]
 movdqa  xmm1, xmmword, ptr, [rdi]
 pand    xmm1, xmm0
 movdqa  xmm2, xmmword, ptr, [rdi, +, 16]
 pand    xmm2, xmm0
 pcmpeqb xmm2, xmm0
 pcmpeqb xmm1, xmm0
 pand    xmm1, xmm2
 pmovmskb eax, xmm1
 xor     ecx, ecx
 cmp     eax, 65535
 mov     eax, -1
 cmovne  eax, ecx
 and     al, 1
 pop     rbp
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 por     xmm0, xmmword, ptr, [rdi, +, 16]
 movdqa  xmm1, xmmword, ptr, [rip, +, LCPI16_0]
 pand    xmm0, xmm1
 pcmpeqb xmm0, xmm1
 pmovmskb eax, xmm0
 neg     eax
 sbb     eax, eax
 and     al, 1
 pop     rbp
 ret
```

After this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb eax, xmm0
 cmp     eax, 65535
 jne     LBB17_1
 movdqa  xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb ecx, xmm0
 mov     al, 1
 cmp     ecx, 65535
 je      LBB17_3
LBB17_1:
 xor     eax, eax
LBB17_3:
 pop     rbp
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 movdqa  xmm0, xmmword, ptr, [rdi]
 pmovmskb ecx, xmm0
 mov     al, 1
 test    ecx, ecx
 je      LBB16_1
 pop     rbp
 ret
LBB16_1:
 movdqa  xmm0, xmmword, ptr, [rdi, +, 16]
 pmovmskb eax, xmm0
 test    eax, eax
 setne   al
 pop     rbp
 ret
```
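
Without AVX there is no 256-bit register, so the reduction works on the two 128-bit halves separately and short-circuits, mirroring the branchy codegen above. A sketch, with `lo`/`hi` standing in for the two halves of the mask:

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{__m128i, _mm_movemask_epi8};

#[cfg(target_arch = "x86_64")]
unsafe fn all_8x32(lo: __m128i, hi: __m128i) -> bool {
    // Short-circuits on the first half, like the `jne` above.
    _mm_movemask_epi8(lo) == 0xFFFF && _mm_movemask_epi8(hi) == 0xFFFF
}

#[cfg(target_arch = "x86_64")]
unsafe fn any_8x32(lo: __m128i, hi: __m128i) -> bool {
    _mm_movemask_epi8(lo) != 0 || _mm_movemask_epi8(hi) != 0
}
```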

With AVX enabled

Before this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps  ymm0, ymm0, ymmword, ptr, [rip, +, LCPI25_0]
 vextractf128 xmm1, ymm0, 1
 vpxor   xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor   xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor   xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vandps  ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vandps  ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vandps  ymm0, ymm0, ymm1
 vpsrld  xmm1, xmm0, 16
 vandps  ymm0, ymm0, ymm1
 vpsrlw  xmm1, xmm0, 8
 vandps  ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and     al, 1
 pop     rbp
 vzeroupper
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 vmovaps ymm0, ymmword, ptr, [rdi]
 vandps  ymm0, ymm0, ymmword, ptr, [rip, +, LCPI24_0]
 vextractf128 xmm1, ymm0, 1
 vpxor   xmm2, xmm2, xmm2
 vpcmpeqb xmm1, xmm1, xmm2
 vpcmpeqd xmm3, xmm3, xmm3
 vpxor   xmm1, xmm1, xmm3
 vpcmpeqb xmm0, xmm0, xmm2
 vpxor   xmm0, xmm0, xmm3
 vinsertf128 ymm0, ymm0, xmm1, 1
 vorps   ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 78
 vorps   ymm0, ymm0, ymm1
 vpermilps xmm1, xmm0, 229
 vorps   ymm0, ymm0, ymm1
 vpsrld  xmm1, xmm0, 16
 vorps   ymm0, ymm0, ymm1
 vpsrlw  xmm1, xmm0, 8
 vorps   ymm0, ymm0, ymm1
 vpextrb eax, xmm0, 0
 and     al, 1
 pop     rbp
 vzeroupper
 ret
```

After this PR:

```asm
all_8x32:
 push    rbp
 mov     rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vxorps  xmm1, xmm1, xmm1
 vcmptrueps ymm1, ymm1, ymm1
 vptest  ymm0, ymm1
 setb    al
 pop     rbp
 vzeroupper
 ret
any_8x32:
 push    rbp
 mov     rbp, rsp
 vmovdqa ymm0, ymmword, ptr, [rdi]
 vptest  ymm0, ymm0
 setne   al
 pop     rbp
 vzeroupper
 ret
```
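
`vptest` sets CF from `!a & b` and ZF from `a & b`, so both reductions collapse to one instruction plus a flag check. A sketch, assuming the `_mm256_testc_si256`/`_mm256_testz_si256` intrinsics from `core::arch::x86_64`:

```rust
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::{
    __m256i, _mm256_set1_epi8, _mm256_testc_si256, _mm256_testz_si256,
};

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn all_8x32(m: __m256i) -> bool {
    // CF = ((!m) & ones) == 0, i.e. every bit of the mask is set.
    _mm256_testc_si256(m, _mm256_set1_epi8(-1)) != 0
}

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx")]
unsafe fn any_8x32(m: __m256i) -> bool {
    // ZF = (m & m) == 0; any() is its negation.
    _mm256_testz_si256(m, m) == 0
}
```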

---

Closes #362.
gnzlbg added a commit to gnzlbg/stdsimd that referenced this issue Apr 27, 2018
gnzlbg added a commit to gnzlbg/stdsimd that referenced this issue May 2, 2018
alexcrichton pushed a commit that referenced this issue May 4, 2018

* Work arounds for LLVM6 code-gen bugs in all/any reductions


* test avx on all x86 targets

* disable assert_instr on avx test

* enable all appropriate features

* disable assert_instr on x86+avx

* the fn_must_use is stable

* fix nbody example on armv7

* fixup

* fixup

* enable 64-bit wide mask MMX optimizations on x86_64 only

* remove coresimd dependency on cfg_if

* allow wasm to fail

* use an env variable to disable assert_instr tests

* disable m32x2 mask MMX optimization on macos

* move cfg_if to coresimd/macros.rs