
Small POD struct equality ineffective #83585

Closed
leonardo-m opened this issue Mar 27, 2021 · 17 comments · Fixed by #125347
Labels
A-LLVM Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
E-needs-test Call for participation: An issue has been fixed and does not reproduce, but no test has been added.
I-slow Issue: Problems and improvements with respect to performance of generated code.
T-compiler Relevant to the compiler team, which will review and decide on the PR/issue.

Comments

@leonardo-m

From a recent Reddit discussion (https://old.reddit.com/r/rust/comments/medh15/why_are_derived_partialeqimplementations_not_more/):

type T = u8;
type T1 = (T, T, T, T, T, T, T, T);
type T2 = [T; 8];

pub fn foo1a(a: T1, b: T1) -> bool {
    a == b
}
pub fn foo1b(a: &T1, b: &T1) -> bool {
    a == b
}

pub fn foo2a(a: T2, b: T2) -> bool {
    a == b
}
pub fn foo2b(a: &T2, b: &T2) -> bool {
    a == b
}

Gives (rustc 1.53.0-nightly 5e65467 2021-03-26):

foo1a:
        cmp     dil, sil
        jne     .LBB0_2
        mov     rax, rdi
        shr     rax, 8
        mov     rcx, rsi
        shr     rcx, 8
        cmp     al, cl
        jne     .LBB0_2
        mov     rax, rsi
        shr     rax, 16
        mov     rcx, rdi
        shr     rcx, 16
        cmp     cl, al
        jne     .LBB0_2
        mov     rax, rsi
        shr     rax, 24
        mov     rcx, rdi
        shr     rcx, 24
        cmp     cl, al
        jne     .LBB0_2
        mov     rax, rdi
        shr     rax, 32
        mov     rcx, rsi
        shr     rcx, 32
        cmp     al, cl
        jne     .LBB0_2
        mov     rax, rdi
        shr     rax, 40
        mov     rcx, rsi
        shr     rcx, 40
        cmp     al, cl
        jne     .LBB0_2
        mov     rax, rdi
        shr     rax, 48
        mov     rcx, rsi
        shr     rcx, 48
        cmp     al, cl
        jne     .LBB0_2
        shr     rdi, 56
        shr     rsi, 56
        mov     al, 1
        cmp     dil, sil
        jne     .LBB0_2
        ret
.LBB0_2:
        xor     eax, eax
        ret

foo1b:
        mov     al, byte ptr [rdi]
        cmp     al, byte ptr [rsi]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 1]
        cmp     al, byte ptr [rsi + 1]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 2]
        cmp     al, byte ptr [rsi + 2]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 3]
        cmp     al, byte ptr [rsi + 3]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 4]
        cmp     al, byte ptr [rsi + 4]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 5]
        cmp     al, byte ptr [rsi + 5]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 6]
        cmp     al, byte ptr [rsi + 6]
        jne     .LBB1_1
        mov     cl, byte ptr [rdi + 7]
        mov     al, 1
        cmp     cl, byte ptr [rsi + 7]
        jne     .LBB1_1
        ret
.LBB1_1:
        xor     eax, eax
        ret

foo2a:
        cmp     rdi, rsi
        sete    al
        ret

foo2b:
        mov     rax, qword ptr [rdi]
        cmp     rax, qword ptr [rsi]
        sete    al
        ret

That Reddit discussion shows other cases.
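
For illustration, a named POD struct with a derived PartialEq is expected to hit the same codegen as the tuple case; a minimal sketch (the struct and function names here are hypothetical, not from the Reddit thread):

#[derive(PartialEq)]
pub struct Pod {
    a: u8, b: u8, c: u8, d: u8,
    e: u8, f: u8, g: u8, h: u8,
}

// Expected to lower to the same per-byte comparison chain as foo1b.
pub fn foo3(x: &Pod, y: &Pod) -> bool {
    x == y
}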

@leonardo-m
Author

See also: #83623

@AngelicosPhosphoros
Contributor

AngelicosPhosphoros commented Apr 3, 2021

I'm quite surprised, but the fix for #83623 doesn't fix this issue.

ASM
playground::foo1a: # @playground::foo1a
# %bb.0:
	cmpb	%sil, %dil
	jne	.LBB0_6
# %bb.1:
	movq	%rsi, %rcx
	shrq	$8, %rcx
	movq	%rdi, %rdx
	shrq	$8, %rdx
	xorl	%eax, %eax
	cmpb	%cl, %dl
	jne	.LBB0_7
# %bb.2:
	movq	%rsi, %rcx
	shrq	$16, %rcx
	movq	%rdi, %rdx
	shrq	$16, %rdx
	cmpb	%cl, %dl
	jne	.LBB0_7
# %bb.3:
	movq	%rdi, %rcx
	shrq	$24, %rcx
	movq	%rsi, %rdx
	shrq	$24, %rdx
	xorl	%eax, %eax
	cmpb	%dl, %cl
	jne	.LBB0_7
# %bb.4:
	movq	%rdi, %rcx
	shrq	$32, %rcx
	movq	%rsi, %rdx
	shrq	$32, %rdx
	cmpb	%dl, %cl
	jne	.LBB0_7
# %bb.5:
	movq	%rdi, %r8
	shrq	$40, %r8
	movq	%rdi, %rcx
	shrq	$48, %rcx
	shrq	$56, %rdi
	movq	%rsi, %rdx
	shrq	$40, %rdx
	movq	%rsi, %rax
	shrq	$48, %rax
	shrq	$56, %rsi
	xorb	%r8b, %dl
	xorb	%cl, %al
	orb	%dl, %al
	xorb	%dil, %sil
	orb	%al, %sil
	sete	%al
                                        # kill: def $al killed $al killed $eax
	retq

.LBB0_6:
	xorl	%eax, %eax

.LBB0_7:
                                        # kill: def $al killed $al killed $eax
	retq

@leonardo-m
Author

With the latest nightly I'm seeing a probable improvement in foo1b (the second function):

foo1a:
    cmp     dil, sil
    jne     .LBB0_6
    mov     rcx, rsi
    shr     rcx, 8
    mov     rdx, rdi
    shr     rdx, 8
    xor     eax, eax
    cmp     dl, cl
    jne     .LBB0_7
    mov     rcx, rsi
    shr     rcx, 16
    mov     rdx, rdi
    shr     rdx, 16
    cmp     dl, cl
    jne     .LBB0_7
    mov     rcx, rdi
    shr     rcx, 24
    mov     rdx, rsi
    shr     rdx, 24
    xor     eax, eax
    cmp     cl, dl
    jne     .LBB0_7
    mov     rcx, rdi
    shr     rcx, 32
    mov     rdx, rsi
    shr     rdx, 32
    cmp     cl, dl
    jne     .LBB0_7
    mov     r8, rdi
    shr     r8, 40
    mov     rcx, rdi
    shr     rcx, 48
    shr     rdi, 56
    mov     rdx, rsi
    shr     rdx, 40
    mov     rax, rsi
    shr     rax, 48
    shr     rsi, 56
    xor     dl, r8b
    xor     al, cl
    or      al, dl
    xor     sil, dil
    or      sil, al
    sete    al
    ret
.LBB0_6:
    xor     eax, eax
.LBB0_7:
    ret


foo1b:
    mov     al, byte ptr [rdi]
    cmp     al, byte ptr [rsi]
    jne     .LBB1_1
    mov     eax, dword ptr [rdi + 1]
    mov     ecx, dword ptr [rdi + 4]
    xor     eax, dword ptr [rsi + 1]
    xor     ecx, dword ptr [rsi + 4]
    or      ecx, eax
    sete    al
    ret
.LBB1_1:
    xor     eax, eax
    ret


foo2a:
    cmp     rdi, rsi
    sete    al
    ret


foo2b:
    mov     rax, qword ptr [rdi]
    cmp     rax, qword ptr [rsi]
    sete    al
    ret

@AngelicosPhosphoros
Contributor

AngelicosPhosphoros commented Apr 3, 2021

It looks like one of the SROA passes transforms the tuple field accesses into roughly this:

    ((a >> (0*8)) as u8 == (b >> (0*8)) as u8) &&
    ((a >> (7*8)) as u8 == (b >> (7*8)) as u8) &&
    ((a >> (6*8)) as u8 == (b >> (6*8)) as u8) &&
    ((a >> (5*8)) as u8 == (b >> (5*8)) as u8) &&
    ((a >> (4*8)) as u8 == (b >> (4*8)) as u8) &&
    ((a >> (3*8)) as u8 == (b >> (3*8)) as u8) &&
    ((a >> (2*8)) as u8 == (b >> (2*8)) as u8) &&
    ((a >> (1*8)) as u8 == (b >> (1*8)) as u8)

Edit: removing SROA doesn't prevent this transformation; it just happens in other passes.
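
For contrast, what we would like the tuple comparison to lower to is the single wide compare the array version already gets; hand-written at the source level (a sketch of the desired lowering, not compiler output):

pub fn tuple_eq_wide(a: (u8, u8, u8, u8, u8, u8, u8, u8),
                     b: (u8, u8, u8, u8, u8, u8, u8, u8)) -> bool {
    // Pack both tuples into byte arrays and compare them as one u64,
    // mirroring the single `cmp` that foo2a compiles to.
    let a = [a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7];
    let b = [b.0, b.1, b.2, b.3, b.4, b.5, b.6, b.7];
    u64::from_ne_bytes(a) == u64::from_ne_bytes(b)
}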

@AngelicosPhosphoros
Contributor

This leads to a big overhead for tuples and structs (because tuples are structs).

Benchmark

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};

type TElem = u8;
type Tpl = (TElem, TElem, TElem, TElem, TElem, TElem, TElem, TElem, );
struct Data {
    original: Vec<Tpl>,
    diff_random_field: Vec<Tpl>,
    diff_first_field: Vec<Tpl>,
    diff_last_field: Vec<Tpl>,
    diff_every_second: Vec<Tpl>,
}

fn generate_data(n: usize) -> Data {
    use rand::prelude::{Rng, SeedableRng};
    use rand_chacha::ChaCha8Rng;
    let mut rng = ChaCha8Rng::seed_from_u64(357u64);
    let mut data: Vec<TElem> = vec![0; n * 8];
    rng.fill(data.as_mut_slice());
    // `windows(8)` would yield overlapping views (n*8 - 7 tuples instead of n),
    // so split the buffer into disjoint 8-byte chunks instead.
    let original: Vec<Tpl> = data.chunks_exact(8).map(|x| {
        (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7])
    }).collect();

    let diff_random_field: Vec<Tpl> = original
        .iter()
        .map(|&x| unsafe {
            let mut arr: [TElem; 8] = std::mem::transmute(x);
            let index = rng.gen_range(0..arr.len());
            arr[index] = arr[index].wrapping_add(5);
            std::mem::transmute(arr)
        })
        .collect();
    let diff_first_field: Vec<Tpl> = original
        .iter()
        .copied()
        .map(|mut x| {
            x.0 = x.0.wrapping_add(5);
            x
        })
        .collect();
    let diff_last_field: Vec<Tpl> = original
        .iter()
        .copied()
        .map(|mut x| {
            x.7 = x.7.wrapping_add(5);
            x
        })
        .collect();

    let mut diff_every_second: Vec<Tpl> = original.clone();
    for (a, b) in diff_every_second
        .iter_mut()
        .zip(diff_random_field.iter())
        .step_by(2)
    {
        *a = b.clone();
    }

    Data {
        original,
        diff_random_field,
        diff_first_field,
        diff_last_field,
        diff_every_second,
    }
}

fn arr_cmp(a: Tpl, b: Tpl) -> bool {
    unsafe {
        let a: [TElem; 8] = std::mem::transmute(a);
        let b: [TElem; 8] = std::mem::transmute(b);
        a == b
    }
}

pub fn criterion_benchmark(c: &mut Criterion) {
    let Data {
        original,
        diff_random_field,
        diff_first_field,
        diff_last_field,
        diff_every_second,
    } = generate_data(1000);
    let original_copy = original.clone();

    let mut group = c.benchmark_group("u8");

    let pairs = [
        ("Self", original_copy),
        ("Random field", diff_random_field),
        ("First field", diff_first_field),
        ("Last field", diff_last_field),
        ("Every Second", diff_every_second),
    ];
    for (name, to_compare) in pairs.iter().cloned() {
        group.bench_with_input(
            BenchmarkId::new("u8 tuple", name),
            &to_compare,
            |b, to_compare| {
                b.iter_batched(
                    || -> (Vec<bool>, Vec<Tpl>) {
                        let buffer = Vec::with_capacity(to_compare.len());
                        let to_compare_copy = to_compare.clone();
                        (buffer, to_compare_copy)
                    },
                    |(mut out_buff, other)| {
                        let comparison = original.iter().zip(other.iter()).map(|(a, b)| a == b);
                        out_buff.extend(comparison);
                        (out_buff, other)
                    },
                    criterion::BatchSize::LargeInput,
                );
            },
        );
    }

    for (name, to_compare) in pairs.iter().cloned() {
        group.bench_with_input(
            BenchmarkId::new("u8 arr", name),
            &to_compare,
            |b, to_compare| {
                b.iter_batched(
                    || -> (Vec<bool>, Vec<Tpl>) {
                        let buffer = Vec::with_capacity(to_compare.len());
                        let to_compare_copy = to_compare.clone();
                        (buffer, to_compare_copy)
                    },
                    |(mut out_buff, other)| {
                        let comparison = original.iter().zip(other.iter()).map(|(a, b)| arr_cmp(*a, *b));
                        out_buff.extend(comparison);
                        (out_buff, other)
                    },
                    criterion::BatchSize::LargeInput,
                );
            },
        );
    }

    group.finish();
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

Results:

u8/u8 tuple/Self        time:   [16.593 us 16.606 us 16.621 us]
                        change: [-1.3185% -1.1247% -0.9371%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high mild
u8/u8 tuple/Random field
                        time:   [39.751 us 39.880 us 39.992 us]
                        change: [-3.0130% -2.6021% -2.2518%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 6 outliers among 100 measurements (6.00%)
  6 (6.00%) low mild
u8/u8 tuple/First field time:   [3.9828 us 3.9861 us 3.9894 us]
                        change: [-2.9688% -2.7055% -2.4496%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  3 (3.00%) low mild
u8/u8 tuple/Last field  time:   [16.612 us 16.624 us 16.637 us]
                        change: [-4.1670% -3.8678% -3.5697%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) low mild
  1 (1.00%) high mild
  1 (1.00%) high severe
u8/u8 tuple/Every Second
                        time:   [29.684 us 29.718 us 29.755 us]
                        change: [-3.1898% -2.8833% -2.5668%] (p = 0.00 < 0.05)
                        Performance has improved.
u8/u8 arr/Self          time:   [1.5546 us 1.5563 us 1.5584 us]
Found 4 outliers among 100 measurements (4.00%)
  1 (1.00%) high mild
  3 (3.00%) high severe
u8/u8 arr/Random field  time:   [1.5515 us 1.5564 us 1.5612 us]
u8/u8 arr/First field   time:   [1.5567 us 1.5584 us 1.5602 us]

So, the derived PartialEq is roughly 4 to 40 times slower.

@AngelicosPhosphoros
Contributor

AngelicosPhosphoros commented Apr 3, 2021

Hack

We could change std using this weird trick, but probably shouldn't.

Hacky code
#![feature(specialization)] // the `default fn` below relies on the unstable specialization feature
type T = u8;
pub struct T1<A, B, C, D, E, F, G, H>(A, B, C, D, E, F, G, H);

impl<A, B, C, D, E, F, G, H> PartialEq for T1<A, B, C, D, E, F, G, H>
where
    A: PartialEq, B: PartialEq, C: PartialEq, D: PartialEq,
    E: PartialEq, F: PartialEq, G: PartialEq, H: PartialEq,
{
    default fn eq(&self, b: &Self) -> bool {
        let a = self;
        a.0 == b.0 &&
        a.1 == b.1 &&
        a.2 == b.2 &&
        a.3 == b.3 &&
        a.4 == b.4 &&
        a.5 == b.5 &&
        a.6 == b.6 &&
        a.7 == b.7
    }
}

impl<A: PartialEq> PartialEq for T1<A, A, A, A, A, A, A, A> {
    fn eq(&self, b: &Self) -> bool {
        assert_eq!(
            std::mem::size_of::<Self>(),
            std::mem::size_of::<[A;8]>()
        );
        unsafe {
            let a: &[A;8] = std::mem::transmute(self);
            let b: &[A;8] = std::mem::transmute(b);
            a == b
        }
    }
}

#[no_mangle]
pub fn is_eq0(a: T1<T, T, T, T, T, T, T, T>,
              b: T1<T, T, T, T, T, T, T, T>) -> bool {
    a == b
}

The problem is that we still:

  1. Don't fix the situation for structs (the problem actually affects every struct whose size is less than or equal to 8 bytes)
  2. Don't fix the situation for tuples like (u8, u8, u16) or (newtype_over_u8, u8, ...); see the sketch below
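
A sketch of point 2: with mixed field types, the all-same-type specialization above cannot apply, so the derived comparison stays field-by-field even though the whole tuple fits in a register (the types here are illustrative):

type Mixed = (u8, u8, u16);

// The hack's specialized impl needs all eight fields to share one type,
// so a mixed tuple still gets the derived field-by-field comparison.
pub fn mixed_eq(a: &Mixed, b: &Mixed) -> bool {
    a == b
}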

@leonardo-m

With the latest nightly I'm seeing a probable improvement in foo1b (the second function):

Unfortunately, argpromotion and inlining would convert this function into the foo1a case.

I suggest renaming the issue to something like "Small POD struct equality ineffective".

@AngelicosPhosphoros
Contributor

It looks like Clang is incapable of making such optimizations as well.
godbolt link

@leonardo-m leonardo-m changed the title [ER] Optimization of tuple equality vs array equality Small POD struct equality ineffective Apr 3, 2021
@nikic nikic added A-LLVM Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues. I-slow Issue: Problems and improvements with respect to performance of generated code. labels Apr 19, 2021
@nikic
Contributor

nikic commented Apr 19, 2021

On the LLVM side, we're basically missing this fold: https://alive2.llvm.org/ce/z/Ljo0nX
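
In source-level terms (the Alive2 proof itself is in LLVM IR), the missing fold takes an equality that has been split into shifted one-byte pieces and merges it back into a full-width compare; a rough Rust picture of the two forms:

// Split form, as produced for the tuple: compare each byte via shift + truncate.
fn eq_by_parts(a: u64, b: u64) -> bool {
    (0..8).all(|i| (a >> (8 * i)) as u8 == (b >> (8 * i)) as u8)
}

// Folded form, which the array version already gets.
fn eq_whole(a: u64, b: u64) -> bool {
    a == b
}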

@nikic
Contributor

nikic commented May 2, 2021

Upstream review: https://reviews.llvm.org/D101232

@nikic
Contributor

nikic commented May 10, 2021

Upstream fix: llvm/llvm-project@463ea28

@nikic nikic self-assigned this May 29, 2021
@ZuseZ4
Contributor

ZuseZ4 commented Jun 5, 2021

I ran into an issue (#85354) where using a struct of two arrays was significantly slower than passing those arrays directly.
My arrays were larger, but the struct also came with a large overhead.
Could those issues be related?

@nikic
Contributor

nikic commented Aug 22, 2021

Unfortunately, this has not been fixed by the LLVM 13 upgrade. It might be an unfortunate interaction with the migration from bitwise to logical and.
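
For background, assuming this refers to LLVM's poison-safety work: LLVM 13 started representing short-circuiting conditions as a "logical and" (select i1 %a, i1 %b, i1 false) instead of a plain bitwise and i1, and folds that pattern-match and can fail to fire on select. At the Rust level the two shapes correspond roughly to:

// Short-circuiting `&&` becomes a branch/select in the IR.
pub fn and_logical(a: bool, b: bool) -> bool {
    a && b
}

// Non-short-circuiting `&` on bools is a plain `and i1`, which
// comparison-merging folds match more easily.
pub fn and_bitwise(a: bool, b: bool) -> bool {
    a & b
}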

@nikic
Contributor

nikic commented Aug 22, 2021

Upstream fix for the logical and/or issue: llvm/llvm-project@be4b836, llvm/llvm-project@fafe5a6

Might make sense to cherry-pick this to our fork to avoid waiting another release cycle.

@nikic
Contributor

nikic commented Aug 22, 2021

For the foo1b case, the MergeICmps pass would usually handle this in the backend, but it's getting blocked by additional llvm.experimental.noalias.scope.decl() calls.
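
(MergeICmps merges chains of fragment-wise equality compares into wide loads or a memcmp; hand-written, the effect it aims for on foo1b is roughly this sketch, not the pass's actual output:)

// One 8-byte load per side and a single compare,
// instead of eight 1-byte loads and compares.
pub fn foo1b_merged(a: &[u8; 8], b: &[u8; 8]) -> bool {
    u64::from_ne_bytes(*a) == u64::from_ne_bytes(*b)
}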

@nikic
Contributor

nikic commented Sep 5, 2021

We're now generating good code for everything but foo1b: https://rust.godbolt.org/z/6rs8EbhEb

foo1b still unnecessarily splits out one comparison. Either https://reviews.llvm.org/D108517 or https://reviews.llvm.org/D108782 should address that.

@nikic
Contributor

nikic commented Feb 19, 2022

Remaining case is fixed by the LLVM upgrade in #93577.

Adding a codegen test for this may be worthwhile.
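
A sketch of what such a codegen test could look like, in the usual tests/codegen FileCheck style (the CHECK lines are illustrative guesses, not copied from the eventual test added by #125347):

// compile-flags: -Copt-level=3
#![crate_type = "lib"]

type T1 = (u8, u8, u8, u8, u8, u8, u8, u8);

// CHECK-LABEL: @tuple_eq
#[no_mangle]
pub fn tuple_eq(a: &T1, b: &T1) -> bool {
    // CHECK: icmp eq i64
    // CHECK-NOT: br
    a == b
}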

@nikic nikic added the E-needs-test Call for participation: An issue has been fixed and does not reproduce, but no test has been added. label Feb 19, 2022
@nikic nikic removed their assignment Feb 19, 2022
@klensy
Contributor

klensy commented Feb 19, 2022

Remaining case is fixed by the LLVM upgrade in #93577.

Adding a codegen test for this may be worthwhile.

That's true for -Copt-level=3 (https://rust.godbolt.org/z/bWEo5qE6d); for -Copt-level=2 it's still the same.

@Noratrieb Noratrieb added the T-compiler Relevant to the compiler team, which will review and decide on the PR/issue. label Apr 5, 2023
tesuji added a commit to tesuji/rustc that referenced this issue May 20, 2024
tesuji added a commit to tesuji/rustc that referenced this issue May 20, 2024
tesuji added a commit to tesuji/rustc that referenced this issue Jun 8, 2024
tesuji added a commit to tesuji/rustc that referenced this issue Jun 9, 2024
bors added a commit to rust-lang-ci/rust that referenced this issue Jun 9, 2024
bors added a commit to rust-lang-ci/rust that referenced this issue Jun 10, 2024
bors added a commit to rust-lang-ci/rust that referenced this issue Jun 11, 2024
bors added a commit to rust-lang-ci/rust that referenced this issue Jun 13, 2024
@bors bors closed this as completed in 7ac6c2f Jun 14, 2024