
Small POD struct equality ineffective #83585

Closed
leonardo-m opened this issue Mar 27, 2021 · 17 comments · Fixed by #125347
Labels
A-LLVM Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
E-needs-test Call for participation: An issue has been fixed and does not reproduce, but no test has been added.
I-slow Issue: Problems and improvements with respect to performance of generated code.
T-compiler Relevant to the compiler team, which will review and decide on the PR/issue.

Comments

@leonardo-m

From a recent Reddit discussion (https://old.reddit.com/r/rust/comments/medh15/why_are_derived_partialeqimplementations_not_more/):

type T = u8;
type T1 = (T, T, T, T, T, T, T, T);
type T2 = [T; 8];

pub fn foo1a(a: T1, b: T1) -> bool {
    a == b
}
pub fn foo1b(a: &T1, b: &T1) -> bool {
    a == b
}

pub fn foo2a(a: T2, b: T2) -> bool {
    a == b
}
pub fn foo2b(a: &T2, b: &T2) -> bool {
    a == b
}

Gives (rustc 1.53.0-nightly 5e65467 2021-03-26):

foo1a:
        cmp     dil, sil
        jne     .LBB0_2
        mov     rax, rdi
        shr     rax, 8
        mov     rcx, rsi
        shr     rcx, 8
        cmp     al, cl
        jne     .LBB0_2
        mov     rax, rsi
        shr     rax, 16
        mov     rcx, rdi
        shr     rcx, 16
        cmp     cl, al
        jne     .LBB0_2
        mov     rax, rsi
        shr     rax, 24
        mov     rcx, rdi
        shr     rcx, 24
        cmp     cl, al
        jne     .LBB0_2
        mov     rax, rdi
        shr     rax, 32
        mov     rcx, rsi
        shr     rcx, 32
        cmp     al, cl
        jne     .LBB0_2
        mov     rax, rdi
        shr     rax, 40
        mov     rcx, rsi
        shr     rcx, 40
        cmp     al, cl
        jne     .LBB0_2
        mov     rax, rdi
        shr     rax, 48
        mov     rcx, rsi
        shr     rcx, 48
        cmp     al, cl
        jne     .LBB0_2
        shr     rdi, 56
        shr     rsi, 56
        mov     al, 1
        cmp     dil, sil
        jne     .LBB0_2
        ret
.LBB0_2:
        xor     eax, eax
        ret

foo1b:
        mov     al, byte ptr [rdi]
        cmp     al, byte ptr [rsi]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 1]
        cmp     al, byte ptr [rsi + 1]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 2]
        cmp     al, byte ptr [rsi + 2]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 3]
        cmp     al, byte ptr [rsi + 3]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 4]
        cmp     al, byte ptr [rsi + 4]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 5]
        cmp     al, byte ptr [rsi + 5]
        jne     .LBB1_1
        mov     al, byte ptr [rdi + 6]
        cmp     al, byte ptr [rsi + 6]
        jne     .LBB1_1
        mov     cl, byte ptr [rdi + 7]
        mov     al, 1
        cmp     cl, byte ptr [rsi + 7]
        jne     .LBB1_1
        ret
.LBB1_1:
        xor     eax, eax
        ret

foo2a:
        cmp     rdi, rsi
        sete    al
        ret

foo2b:
        mov     rax, qword ptr [rdi]
        cmp     rax, qword ptr [rsi]
        sete    al
        ret

That Reddit discussion shows other cases.
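
For illustration, a named POD struct with a derived PartialEq is expected to hit the same codegen as the tuple case; a minimal sketch (the struct and function names here are hypothetical, not from the Reddit thread):

#[derive(PartialEq)]
pub struct Pod {
    a: u8, b: u8, c: u8, d: u8,
    e: u8, f: u8, g: u8, h: u8,
}

// Expected to lower to the same per-byte comparison chain as foo1b.
pub fn foo3(x: &Pod, y: &Pod) -> bool {
    x == y
}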

@leonardo-m
Author

See also: #83623

@AngelicosPhosphoros
Contributor

AngelicosPhosphoros commented Apr 3, 2021

I'm quite surprised, but the fix for #83623 doesn't fix this issue.

ASM
playground::foo1a: # @playground::foo1a
# %bb.0:
	cmpb	%sil, %dil
	jne	.LBB0_6
# %bb.1:
	movq	%rsi, %rcx
	shrq	$8, %rcx
	movq	%rdi, %rdx
	shrq	$8, %rdx
	xorl	%eax, %eax
	cmpb	%cl, %dl
	jne	.LBB0_7
# %bb.2:
	movq	%rsi, %rcx
	shrq	$16, %rcx
	movq	%rdi, %rdx
	shrq	$16, %rdx
	cmpb	%cl, %dl
	jne	.LBB0_7
# %bb.3:
	movq	%rdi, %rcx
	shrq	$24, %rcx
	movq	%rsi, %rdx
	shrq	$24, %rdx
	xorl	%eax, %eax
	cmpb	%dl, %cl
	jne	.LBB0_7
# %bb.4:
	movq	%rdi, %rcx
	shrq	$32, %rcx
	movq	%rsi, %rdx
	shrq	$32, %rdx
	cmpb	%dl, %cl
	jne	.LBB0_7
# %bb.5:
	movq	%rdi, %r8
	shrq	$40, %r8
	movq	%rdi, %rcx
	shrq	$48, %rcx
	shrq	$56, %rdi
	movq	%rsi, %rdx
	shrq	$40, %rdx
	movq	%rsi, %rax
	shrq	$48, %rax
	shrq	$56, %rsi
	xorb	%r8b, %dl
	xorb	%cl, %al
	orb	%dl, %al
	xorb	%dil, %sil
	orb	%al, %sil
	sete	%al
                                        # kill: def $al killed $al killed $eax
	retq

.LBB0_6:
	xorl	%eax, %eax

.LBB0_7:
                                        # kill: def $al killed $al killed $eax
	retq

@leonardo-m
Author

With the latest nightly I'm seeing a probable improvement in foo1b (the second function):

foo1a:
    cmp     dil, sil
    jne     .LBB0_6
    mov     rcx, rsi
    shr     rcx, 8
    mov     rdx, rdi
    shr     rdx, 8
    xor     eax, eax
    cmp     dl, cl
    jne     .LBB0_7
    mov     rcx, rsi
    shr     rcx, 16
    mov     rdx, rdi
    shr     rdx, 16
    cmp     dl, cl
    jne     .LBB0_7
    mov     rcx, rdi
    shr     rcx, 24
    mov     rdx, rsi
    shr     rdx, 24
    xor     eax, eax
    cmp     cl, dl
    jne     .LBB0_7
    mov     rcx, rdi
    shr     rcx, 32
    mov     rdx, rsi
    shr     rdx, 32
    cmp     cl, dl
    jne     .LBB0_7
    mov     r8, rdi
    shr     r8, 40
    mov     rcx, rdi
    shr     rcx, 48
    shr     rdi, 56
    mov     rdx, rsi
    shr     rdx, 40
    mov     rax, rsi
    shr     rax, 48
    shr     rsi, 56
    xor     dl, r8b
    xor     al, cl
    or      al, dl
    xor     sil, dil
    or      sil, al
    sete    al
    ret
.LBB0_6:
    xor     eax, eax
.LBB0_7:
    ret


foo1b:
    mov     al, byte ptr [rdi]
    cmp     al, byte ptr [rsi]
    jne     .LBB1_1
    mov     eax, dword ptr [rdi + 1]
    mov     ecx, dword ptr [rdi + 4]
    xor     eax, dword ptr [rsi + 1]
    xor     ecx, dword ptr [rsi + 4]
    or      ecx, eax
    sete    al
    ret
.LBB1_1:
    xor     eax, eax
    ret


foo2a:
    cmp     rdi, rsi
    sete    al
    ret


foo2b:
    mov     rax, qword ptr [rdi]
    cmp     rax, qword ptr [rsi]
    sete    al
    ret

@AngelicosPhosphoros
Contributor

AngelicosPhosphoros commented Apr 3, 2021

It looks like one of the SROA passes transforms the tuple field accesses into roughly this:

    ((a >> (0*8)) as u8 == (b >> (0*8)) as u8) &&
    ((a >> (7*8)) as u8 == (b >> (7*8)) as u8) &&
    ((a >> (6*8)) as u8 == (b >> (6*8)) as u8) &&
    ((a >> (5*8)) as u8 == (b >> (5*8)) as u8) &&
    ((a >> (4*8)) as u8 == (b >> (4*8)) as u8) &&
    ((a >> (3*8)) as u8 == (b >> (3*8)) as u8) &&
    ((a >> (2*8)) as u8 == (b >> (2*8)) as u8) &&
    ((a >> (1*8)) as u8 == (b >> (1*8)) as u8)

Edit: removing SROA doesn't prevent this transformation; it just happens in other passes.
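
For contrast, what we would like the tuple comparison to lower to is the single wide compare the array version already gets; hand-written at the source level (a sketch of the desired lowering, not compiler output):

pub fn tuple_eq_wide(a: (u8, u8, u8, u8, u8, u8, u8, u8),
                     b: (u8, u8, u8, u8, u8, u8, u8, u8)) -> bool {
    // Pack both tuples into byte arrays and compare them as one u64,
    // mirroring the single `cmp` that foo2a compiles to.
    let a = [a.0, a.1, a.2, a.3, a.4, a.5, a.6, a.7];
    let b = [b.0, b.1, b.2, b.3, b.4, b.5, b.6, b.7];
    u64::from_ne_bytes(a) == u64::from_ne_bytes(b)
}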

@AngelicosPhosphoros
Contributor

This leads to a big overhead for tuples and structs (because tuples are structs).

Benchmark

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};

type TElem = u8;
type Tpl = (TElem, TElem, TElem, TElem, TElem, TElem, TElem, TElem, );
struct Data {
    original: Vec<Tpl>,
    diff_random_field: Vec<Tpl>,
    diff_first_field: Vec<Tpl>,
    diff_last_field: Vec<Tpl>,
    diff_every_second: Vec<Tpl>,
}

fn generate_data(n: usize) -> Data {
    use rand::prelude::{Rng, SeedableRng};
    use rand_chacha::ChaCha8Rng;
    let mut rng = ChaCha8Rng::seed_from_u64(357u64);
    let mut data: Vec<TElem> = vec![0; n * 8];
    rng.fill(data.as_mut_slice());
    // `windows(8)` would yield overlapping views (n*8 - 7 tuples instead of n),
    // so split the buffer into disjoint 8-byte chunks instead.
    let original: Vec<Tpl> = data.chunks_exact(8).map(|x| {
        (x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7])
    }).collect();

    let diff_random_field: Vec<Tpl> = original
        .iter()
        .map(|&x| unsafe {
            let mut arr: [TElem; 8] = std::mem::transmute(x);
            let index = rng.gen_range(0..arr.len());
            arr[index] = arr[index].wrapping_add(5);
            std::mem::transmute(arr)
        })
        .collect();
    let diff_first_field: Vec<Tpl> = original
        .iter()
        .copied()
        .map(|mut x| {
            x.0 = x.0.wrapping_add(5);
            x
        })
        .collect();
    let diff_last_field: Vec<Tpl> = original
        .iter()
        .copied()
        .map(|mut x| {
            x.7 = x.7.wrapping_add(5);
            x
        })
        .collect();

    let mut diff_every_second: Vec<Tpl> = original.clone();
    for (a, b) in diff_every_second
        .iter_mut()
        .zip(diff_random_field.iter())
        .step_by(2)
    {
        *a = b.clone();
    }

    Data {
        original,
        diff_random_field,
        diff_first_field,
        diff_last_field,
        diff_every_second,
    }
}

fn arr_cmp(a: Tpl, b: Tpl) -> bool {
    unsafe {
        let a: [TElem; 8] = std::mem::transmute(a);
        let b: [TElem; 8] = std::mem::transmute(b);
        a == b
    }
}

pub fn criterion_benchmark(c: &mut Criterion) {
    let Data {
        original,
        diff_random_field,
        diff_first_field,
        diff_last_field,
        diff_every_second,
    } = generate_data(1000);
    let original_copy = original.clone();

    let mut group = c.benchmark_group("u8");

    let pairs = [
        ("Self", original_copy),
        ("Random field", diff_random_field),
        ("First field", diff_first_field),
        ("Last field", diff_last_field),
        ("Every Second", diff_every_second),
    ];
    for (name, to_compare) in pairs.iter().cloned() {
        group.bench_with_input(
            BenchmarkId::new("u8 tuple", name),
            &to_compare,
            |b, to_compare| {
                b.iter_batched(
                    || -> (Vec<bool>, Vec<Tpl>) {
                        let buffer = Vec::with_capacity(to_compare.len());
                        let to_compare_copy = to_compare.clone();
                        (buffer, to_compare_copy)
                    },
                    |(mut out_buff, other)| {
                        let comparison = original.iter().zip(other.iter()).map(|(a, b)| a == b);
                        out_buff.extend(comparison);
                        (out_buff, other)
                    },
                    criterion::BatchSize::LargeInput,
                );
            },
        );
    }

    for (name, to_compare) in pairs.iter().cloned() {
        group.bench_with_input(
            BenchmarkId::new("u8 arr", name),
            &to_compare,
            |b, to_compare| {
                b.iter_batched(
                    || -> (Vec<bool>, Vec<Tpl>) {
                        let buffer = Vec::with_capacity(to_compare.len());
                        let to_compare_copy = to_compare.clone();
                        (buffer, to_compare_copy)
                    },
                    |(mut out_buff, other)| {
                        let comparison = original.iter().zip(other.iter()).map(|(a, b)| arr_cmp(*a, *b));
                        out_buff.extend(comparison);
                        (out_buff, other)
                    },
                    criterion::BatchSize::LargeInput,
                );
            },
        );
    }

    group.finish();
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

Results:

u8/u8 tuple/Self        time:   [16.593 us 16.606 us 16.621 us]
                        change: [-1.3185% -1.1247% -0.9371%] (p = 0.00 < 0.05)
                        Change within noise threshold.
Found 1 outliers among 100 measurements (1.00%)
  1 (1.00%) high mild
u8/u8 tuple/Random field
                        time:   [39.751 us 39.880 us 39.992 us]
                        change: [-3.0130% -2.6021% -2.2518%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 6 outliers among 100 measurements (6.00%)
  6 (6.00%) low mild
u8/u8 tuple/First field time:   [3.9828 us 3.9861 us 3.9894 us]
                        change: [-2.9688% -2.7055% -2.4496%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  3 (3.00%) low mild
u8/u8 tuple/Last field  time:   [16.612 us 16.624 us 16.637 us]
                        change: [-4.1670% -3.8678% -3.5697%] (p = 0.00 < 0.05)
                        Performance has improved.
Found 3 outliers among 100 measurements (3.00%)
  1 (1.00%) low mild
  1 (1.00%) high mild
  1 (1.00%) high severe
u8/u8 tuple/Every Second
                        time:   [29.684 us 29.718 us 29.755 us]
                        change: [-3.1898% -2.8833% -2.5668%] (p = 0.00 < 0.05)
                        Performance has improved.
u8/u8 arr/Self          time:   [1.5546 us 1.5563 us 1.5584 us]
Found 4 outliers among 100 measurements (4.00%)
  1 (1.00%) high mild
  3 (3.00%) high severe
u8/u8 arr/Random field  time:   [1.5515 us 1.5564 us 1.5612 us]
u8/u8 arr/First field   time:   [1.5567 us 1.5584 us 1.5602 us]

So, the derived PartialEq is roughly 4 to 40 times slower.

@AngelicosPhosphoros
Contributor

AngelicosPhosphoros commented Apr 3, 2021

Hack

We could change std using this weird trick, but probably shouldn't.

Hacky code
#![feature(specialization)] // the `default fn` below relies on the unstable specialization feature
type T = u8;
pub struct T1<A, B, C, D, E, F, G, H>(A, B, C, D, E, F, G, H);

impl<A, B, C, D, E, F, G, H> PartialEq for T1<A, B, C, D, E, F, G, H>
where
    A: PartialEq, B: PartialEq, C: PartialEq, D: PartialEq,
    E: PartialEq, F: PartialEq, G: PartialEq, H: PartialEq,
{
    default fn eq(&self, b: &Self) -> bool {
        let a = self;
        a.0 == b.0 &&
        a.1 == b.1 &&
        a.2 == b.2 &&
        a.3 == b.3 &&
        a.4 == b.4 &&
        a.5 == b.5 &&
        a.6 == b.6 &&
        a.7 == b.7
    }
}

impl<A: PartialEq> PartialEq for T1<A, A, A, A, A, A, A, A> {
    fn eq(&self, b: &Self) -> bool {
        assert_eq!(
            std::mem::size_of::<Self>(),
            std::mem::size_of::<[A;8]>()
        );
        unsafe {
            let a: &[A;8] = std::mem::transmute(self);
            let b: &[A;8] = std::mem::transmute(b);
            a == b
        }
    }
}

#[no_mangle]
pub fn is_eq0(a: T1<T, T, T, T, T, T, T, T>,
              b: T1<T, T, T, T, T, T, T, T>) -> bool {
    a == b
}

The problem is that we still:

  1. Don't fix the situation for structs (the problem actually affects every struct whose size is less than or equal to 8 bytes)
  2. Don't fix the situation for tuples like (u8, u8, u16) or (newtype_over_u8, u8, ...); see the sketch below
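
A sketch of point 2: with mixed field types, the all-same-type specialization above cannot apply, so the derived comparison stays field-by-field even though the whole tuple fits in a register (the types here are illustrative):

type Mixed = (u8, u8, u16);

// The hack's specialized impl needs all eight fields to share one type,
// so a mixed tuple still gets the derived field-by-field comparison.
pub fn mixed_eq(a: &Mixed, b: &Mixed) -> bool {
    a == b
}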

@leonardo-m

With the latest nightly I'm seeing a probable improvement in foo1b (the second function):

Unfortunately, argpromotion and inlining would convert this function into the foo1a case.

I suggest renaming the issue to something like "Small POD struct equality ineffective".

@AngelicosPhosphoros
Contributor

It looks like Clang is incapable of making such optimizations as well.
godbolt link

@leonardo-m leonardo-m changed the title [ER] Optimization of tuple equality vs array equality Small POD struct equality ineffective Apr 3, 2021
@nikic nikic added A-LLVM Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues. I-slow Issue: Problems and improvements with respect to performance of generated code. labels Apr 19, 2021
@nikic
Contributor

nikic commented Apr 19, 2021

On the LLVM side, we're basically missing this fold: https://alive2.llvm.org/ce/z/Ljo0nX
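
In source-level terms (the Alive2 proof itself is in LLVM IR), the missing fold takes an equality that has been split into shifted one-byte pieces and merges it back into a full-width compare; a rough Rust picture of the two forms:

// Split form, as produced for the tuple: compare each byte via shift + truncate.
fn eq_by_parts(a: u64, b: u64) -> bool {
    (0..8).all(|i| (a >> (8 * i)) as u8 == (b >> (8 * i)) as u8)
}

// Folded form, which the array version already gets.
fn eq_whole(a: u64, b: u64) -> bool {
    a == b
}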

@nikic
Contributor

nikic commented May 2, 2021

Upstream review: https://reviews.llvm.org/D101232

@nikic
Contributor

nikic commented May 10, 2021

Upstream fix: llvm/llvm-project@463ea28

@nikic nikic self-assigned this May 29, 2021
@ZuseZ4
Contributor

ZuseZ4 commented Jun 5, 2021

I ran into an issue (#85354) where using a struct of two arrays was significantly slower than passing those arrays directly.
My arrays were larger, but the struct also came with a large overhead.
Could those issues be related?

@nikic
Contributor

nikic commented Aug 22, 2021

Unfortunately, this has not been fixed by the LLVM 13 upgrade. It might be an unfortunate interaction with the migration from bitwise to logical and.
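
For background, assuming this refers to LLVM's poison-safety work: LLVM 13 started representing short-circuiting conditions as a "logical and" (select i1 %a, i1 %b, i1 false) instead of a plain bitwise and i1, and folds that pattern-match and can fail to fire on select. At the Rust level the two shapes correspond roughly to:

// Short-circuiting `&&` becomes a branch/select in the IR.
pub fn and_logical(a: bool, b: bool) -> bool {
    a && b
}

// Non-short-circuiting `&` on bools is a plain `and i1`, which
// comparison-merging folds match more easily.
pub fn and_bitwise(a: bool, b: bool) -> bool {
    a & b
}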

@nikic
Contributor

nikic commented Aug 22, 2021

Upstream fix for the logical and/or issue: llvm/llvm-project@be4b836, llvm/llvm-project@fafe5a6

Might make sense to cherry-pick this to our fork to avoid waiting another release cycle.

@nikic
Contributor

nikic commented Aug 22, 2021

For the foo1b case, the MergeICmps pass would usually handle this in the backend, but it's getting blocked by additional llvm.experimental.noalias.scope.decl() calls.
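
(MergeICmps merges chains of fragment-wise equality compares into wide loads or a memcmp; hand-written, the effect it aims for on foo1b is roughly this sketch, not the pass's actual output:)

// One 8-byte load per side and a single compare,
// instead of eight 1-byte loads and compares.
pub fn foo1b_merged(a: &[u8; 8], b: &[u8; 8]) -> bool {
    u64::from_ne_bytes(*a) == u64::from_ne_bytes(*b)
}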

@nikic
Contributor

nikic commented Sep 5, 2021

We're now generating good code for everything but foo1b: https://rust.godbolt.org/z/6rs8EbhEb

foo1b still unnecessarily splits out one comparison. Either https://reviews.llvm.org/D108517 or https://reviews.llvm.org/D108782 should address that.

@nikic
Contributor

nikic commented Feb 19, 2022

Remaining case is fixed by the LLVM upgrade in #93577.

Adding a codegen test for this may be worthwhile.
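
A sketch of what such a codegen test could look like, in the usual tests/codegen FileCheck style (the CHECK lines are illustrative guesses, not copied from the eventual test added by #125347):

// compile-flags: -Copt-level=3
#![crate_type = "lib"]

type T1 = (u8, u8, u8, u8, u8, u8, u8, u8);

// CHECK-LABEL: @tuple_eq
#[no_mangle]
pub fn tuple_eq(a: &T1, b: &T1) -> bool {
    // CHECK: icmp eq i64
    // CHECK-NOT: br
    a == b
}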

@nikic nikic added the E-needs-test Call for participation: An issue has been fixed and does not reproduce, but no test has been added. label Feb 19, 2022
@nikic nikic removed their assignment Feb 19, 2022
@klensy
Contributor

klensy commented Feb 19, 2022

Remaining case is fixed by the LLVM upgrade in #93577.

Adding a codegen test for this may be worthwhile.

That's true for -Copt-level=3 (https://rust.godbolt.org/z/bWEo5qE6d); for -Copt-level=2 it's still the same.

@Noratrieb Noratrieb added the T-compiler Relevant to the compiler team, which will review and decide on the PR/issue. label Apr 5, 2023
tesuji added a commit to tesuji/rustc that referenced this issue May 20, 2024
tesuji added a commit to tesuji/rustc that referenced this issue May 20, 2024
tesuji added a commit to tesuji/rustc that referenced this issue Jun 8, 2024
tesuji added a commit to tesuji/rustc that referenced this issue Jun 9, 2024
bors added a commit to rust-lang-ci/rust that referenced this issue Jun 9, 2024
bors added a commit to rust-lang-ci/rust that referenced this issue Jun 10, 2024
bors added a commit to rust-lang-ci/rust that referenced this issue Jun 11, 2024
bors added a commit to rust-lang-ci/rust that referenced this issue Jun 13, 2024
@bors bors closed this as completed in 7ac6c2f Jun 14, 2024