forked from MihaZupan/runtime-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[JitDiff X64] xtqqczze/dotnet-runtime/BitCastFill #486
Comments
Top method improvements

-26 (-9.45 % of base) - System.SpanHelpers:Fill[short](byref,ulong,short)
; Assembly listing for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data
+; 0 inlinees with PGO data; 1 single block inlinees; 3 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T01] ( 18, 38 ) byref -> rdi single-def
; V01 arg1 [V01,T07] ( 10, 6.50) long -> rsi single-def
; V02 arg2 [V02,T02] ( 18, 37.75) short -> rdx single-def
; V03 loc0 [V03,T00] ( 23, 50 ) long -> rax
-; V04 loc1 [V04,T12] ( 2, 0.50) short -> rdx ld-addr-op
-; V05 loc2 [V05,T13] ( 5, 9.50) simd32 -> mm0 ld-addr-op <System.Numerics.Vector`1[ubyte]>
-; V06 loc3 [V06,T06] ( 5, 9.50) byref -> rdi single-def
-; V07 loc4 [V07,T11] ( 4, 2 ) long -> rax
-; V08 loc5 [V08,T09] ( 2, 4.50) long -> rcx
-; V09 loc6 [V09,T03] ( 7, 21 ) long -> rdx
-;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V11 loc8 [V11,T10] ( 2, 4.50) long -> rcx
-;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V13 tmp1 [V13,T05] ( 2, 16 ) long -> rax "dup spill"
-; V14 tmp2 [V14 ] ( 3, 9.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[ushort]>
-;* V15 tmp3 [V15 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V16 tmp4 [V16,T04] ( 5, 16.25) int -> rcx "Inline stloc first use temp"
-; V17 tmp5 [V17,T08] ( 2, 8.50) ushort -> rax "Inlining Arg"
+; V04 loc1 [V04,T12] ( 5, 9.50) simd32 -> mm0 <System.Numerics.Vector`1[ubyte]>
+; V05 loc2 [V05,T06] ( 5, 9.50) byref -> rdi single-def
+; V06 loc3 [V06,T11] ( 4, 2 ) long -> rax
+; V07 loc4 [V07,T08] ( 2, 4.50) long -> rcx
+; V08 loc5 [V08,T03] ( 7, 21 ) long -> rdx
+; V09 loc6 [V09,T09] ( 2, 4.50) long -> rcx
+;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V11 tmp1 [V11,T05] ( 2, 16 ) long -> rax "dup spill"
+;* V12 tmp2 [V12 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V13 tmp3 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V14 tmp4 [V14,T10] ( 2, 4.25) ushort -> rdx "Inline return value spill temp"
+;* V15 tmp5 [V15 ] ( 0, 0 ) short -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V16 tmp6 [V16 ] ( 0, 0 ) short -> zero-ref ld-addr-op "Inlining Arg"
+; V17 tmp7 [V17 ] ( 2, 4.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[ushort]>
+; V18 tmp8 [V18,T04] ( 5, 16.25) int -> rax "Inline stloc first use temp"
;
; Lcl frame size = 48
G_M24463_IG01:
push rbp
sub rsp, 48
lea rbp, [rsp+0x30]
;; size=10 bbWeight=1 PerfScore 1.75
G_M24463_IG02:
cmp rsi, 16
jb SHORT G_M24463_IG10
- ;; NOP compensation instructions of 4 bytes.
- ;; size=10 bbWeight=1 PerfScore 1.25
+ ;; size=6 bbWeight=1 PerfScore 1.25
G_M24463_IG03:
- movsx rdx, dx
- vxorps ymm0, ymm0, ymm0
- vmovups ymmword ptr [rbp-0x30], ymm0
- movzx rax, dx
- xor ecx, ecx
- align [0 bytes for IG04]
- ;; size=18 bbWeight=0.25 PerfScore 0.52
+ movzx rdx, dx
+ xor eax, eax
+ align [11 bytes for IG04]
+ ;; size=16 bbWeight=0.25 PerfScore 0.19
G_M24463_IG04:
- lea rdx, [rbp-0x30]
- movsxd r8, ecx
- mov word ptr [rdx+2*r8], ax
- inc ecx
- cmp ecx, 16
+ lea rcx, [rbp-0x30]
+ movsxd r8, eax
+ mov word ptr [rcx+2*r8], dx
+ inc eax
+ cmp eax, 16
jl SHORT G_M24463_IG04
;; size=19 bbWeight=4 PerfScore 13.00
G_M24463_IG05:
vmovups ymm0, ymmword ptr [rbp-0x30]
lea rax, [rsi+rsi]
mov rcx, rax
and rcx, -64
xor edx, edx
cmp rsi, 32
jb SHORT G_M24463_IG07
- align [15 bytes for IG06]
- ;; size=39 bbWeight=0.50 PerfScore 3.38
+ align [0 bytes for IG06]
+ ;; size=24 bbWeight=0.50 PerfScore 3.25
G_M24463_IG06:
vmovups ymmword ptr [rdi+rdx], ymm0
vmovups ymmword ptr [rdi+rdx+0x20], ymm0
add rdx, 64
cmp rdx, rcx
jb SHORT G_M24463_IG06
;; size=20 bbWeight=4 PerfScore 22.00
G_M24463_IG07:
test al, 32
je SHORT G_M24463_IG08
vmovups ymmword ptr [rdi+rdx], ymm0
;; size=9 bbWeight=0.50 PerfScore 1.62
G_M24463_IG08:
vmovups ymmword ptr [rdi+rax-0x20], ymm0
;; size=6 bbWeight=0.50 PerfScore 1.00
G_M24463_IG09:
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
G_M24463_IG10:
xor eax, eax
cmp rsi, 8
jb SHORT G_M24463_IG12
mov rcx, rsi
and rcx, -8
- align [5 bytes for IG11]
- ;; size=20 bbWeight=0.50 PerfScore 1.12
+ align [0 bytes for IG11]
+ ;; size=15 bbWeight=0.50 PerfScore 1.00
G_M24463_IG11:
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
mov word ptr [rdi+2*rax+0x04], dx
mov word ptr [rdi+2*rax+0x06], dx
mov word ptr [rdi+2*rax+0x08], dx
mov word ptr [rdi+2*rax+0x0A], dx
mov word ptr [rdi+2*rax+0x0C], dx
mov word ptr [rdi+2*rax+0x0E], dx
add rax, 8
cmp rax, rcx
jb SHORT G_M24463_IG11
;; size=48 bbWeight=4 PerfScore 38.00
G_M24463_IG12:
test sil, 4
je SHORT G_M24463_IG13
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
mov word ptr [rdi+2*rax+0x04], dx
mov word ptr [rdi+2*rax+0x06], dx
add rax, 4
;; size=29 bbWeight=0.50 PerfScore 2.75
G_M24463_IG13:
test sil, 2
je SHORT G_M24463_IG14
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
add rax, 2
;; size=19 bbWeight=0.50 PerfScore 1.75
G_M24463_IG14:
test sil, 1
je SHORT G_M24463_IG15
mov word ptr [rdi+2*rax], dx
;; size=10 bbWeight=0.50 PerfScore 1.12
G_M24463_IG15:
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
-; Total bytes of code 275, prolog size 10, PerfScore 92.02, instruction count 74, allocated bytes for code 275 (MethodHash=7507a070) for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)
+; Total bytes of code 249, prolog size 10, PerfScore 91.44, instruction count 71, allocated bytes for code 249 (MethodHash=7507a070) for method System.SpanHelpers:Fill[short](byref,ulong,short) (FullOpts)

-16 (-5.65 % of base) - System.SpanHelpers:Fill[System.Nullable`1[int]](byref,ulong,System.Nullable`1[int])
; Assembly listing for method System.SpanHelpers:Fill[System.Nullable`1[int]](byref,ulong,System.Nullable`1[int]) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data
+; 0 inlinees with PGO data; 1 single block inlinees; 4 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T00] ( 18, 38 ) byref -> rdi single-def
; V01 arg1 [V01,T07] ( 10, 6.50) long -> rsi single-def
; V02 arg2 [V02,T01] ( 18, 38 ) struct ( 8) rdx single-def <System.Nullable`1[int]>
; V03 loc0 [V03,T04] ( 12, 20 ) long -> rax
-; V04 loc1 [V04 ] ( 2, 1 ) struct ( 8) [rbp-0x08] do-not-enreg[SF] ld-addr-op <System.Nullable`1[int]>
-; V05 loc2 [V05,T17] ( 5, 9.50) simd32 -> mm0 ld-addr-op <System.Numerics.Vector`1[ubyte]>
-; V06 loc3 [V06,T06] ( 5, 9.50) byref -> rdi single-def
-; V07 loc4 [V07,T12] ( 4, 2 ) long -> rax
-; V08 loc5 [V08,T09] ( 2, 4.50) long -> rcx
-; V09 loc6 [V09,T03] ( 7, 21 ) long -> rdx
-;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V11 loc8 [V11,T10] ( 2, 4.50) long -> rcx
-;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V13 tmp1 [V13,T05] ( 2, 16 ) long -> rax "dup spill"
-; V14 tmp2 [V14 ] ( 6, 6 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[ulong]>
-;* V15 tmp3 [V15 ] ( 0, 0 ) simd32 -> zero-ref
-;* V16 tmp4 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V17 tmp5 [V17,T16] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
-; V18 tmp6 [V18,T08] ( 5, 5 ) long -> rdx "Inlining Arg"
-; V19 tmp7 [V19,T14] ( 2, 1 ) ubyte -> [rbp-0x08] do-not-enreg[] "field V04.hasValue (fldOffset=0x0)" P-DEP
-; V20 tmp8 [V20,T15] ( 2, 1 ) int -> [rbp-0x04] do-not-enreg[] "field V04.value (fldOffset=0x4)" P-DEP
-; V21 cse0 [V21,T02] ( 9, 36 ) long -> r8 "CSE #01: aggressive"
-; V22 cse1 [V22,T11] ( 5, 2.50) long -> rcx "CSE #02: moderate"
-; V23 cse2 [V23,T13] ( 3, 1.50) long -> rcx "CSE #03: moderate"
+; V04 loc1 [V04,T17] ( 5, 9.50) simd32 -> mm0 <System.Numerics.Vector`1[ubyte]>
+; V05 loc2 [V05,T06] ( 5, 9.50) byref -> rdi single-def
+; V06 loc3 [V06,T12] ( 4, 2 ) long -> rax
+; V07 loc4 [V07,T08] ( 2, 4.50) long -> rcx
+; V08 loc5 [V08,T03] ( 7, 21 ) long -> rdx
+; V09 loc6 [V09,T09] ( 2, 4.50) long -> rcx
+;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V11 tmp1 [V11,T05] ( 2, 16 ) long -> rax "dup spill"
+;* V12 tmp2 [V12 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V13 tmp3 [V13 ] ( 0, 0 ) simd32 -> zero-ref
+;* V14 tmp4 [V14 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V15 tmp5 [V15,T10] ( 5, 2.50) long -> rdx "Inline return value spill temp"
+;* V16 tmp6 [V16 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <System.Nullable`1[int]>
+; V17 tmp7 [V17 ] ( 2, 2 ) struct ( 8) [rbp-0x08] do-not-enreg[SF] ld-addr-op "Inlining Arg" <System.Nullable`1[int]>
+;* V18 tmp8 [V18 ] ( 0, 0 ) struct ( 8) zero-ref ld-addr-op "Inline ldloca(s) first use temp" <System.Nullable`1[int]>
+;* V19 tmp9 [V19 ] ( 0, 0 ) struct ( 8) zero-ref do-not-enreg[SF] ld-addr-op "Inlining Arg" <System.Nullable`1[int]>
+;* V20 tmp10 [V20 ] ( 0, 0 ) long -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+; V21 tmp11 [V21 ] ( 5, 2.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[ulong]>
+;* V22 tmp12 [V22,T16] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
+;* V23 tmp13 [V23 ] ( 0, 0 ) ubyte -> zero-ref "field V16.hasValue (fldOffset=0x0)" P-INDEP
+;* V24 tmp14 [V24 ] ( 0, 0 ) int -> zero-ref "field V16.value (fldOffset=0x4)" P-INDEP
+; V25 tmp15 [V25,T13] ( 2, 2 ) ubyte -> [rbp-0x08] do-not-enreg[] "field V17.hasValue (fldOffset=0x0)" P-DEP
+; V26 tmp16 [V26,T14] ( 2, 2 ) int -> [rbp-0x04] do-not-enreg[] "field V17.value (fldOffset=0x4)" P-DEP
+;* V27 tmp17 [V27 ] ( 0, 0 ) ubyte -> zero-ref "field V18.hasValue (fldOffset=0x0)" P-INDEP
+;* V28 tmp18 [V28 ] ( 0, 0 ) int -> zero-ref "field V18.value (fldOffset=0x4)" P-INDEP
+;* V29 tmp19 [V29 ] ( 0, 0 ) ubyte -> zero-ref do-not-enreg[] "field V19.hasValue (fldOffset=0x0)" P-DEP
+;* V30 tmp20 [V30 ] ( 0, 0 ) int -> zero-ref do-not-enreg[] "field V19.value (fldOffset=0x4)" P-DEP
+; V31 cse0 [V31,T02] ( 9, 36 ) long -> r8 "CSE #01: aggressive"
+; V32 cse1 [V32,T11] ( 5, 2.50) long -> rcx "CSE #02: moderate"
+; V33 cse2 [V33,T15] ( 3, 1.50) long -> rcx "CSE #03: moderate"
;
; Lcl frame size = 48
G_M56207_IG01:
push rbp
sub rsp, 48
lea rbp, [rsp+0x30]
;; size=10 bbWeight=1 PerfScore 1.75
G_M56207_IG02:
cmp rsi, 4
jb SHORT G_M56207_IG08
;; size=6 bbWeight=1 PerfScore 1.25
G_M56207_IG03:
mov qword ptr [rbp-0x08], rdx
- vxorps ymm0, ymm0, ymm0
- vmovups ymmword ptr [rbp-0x30], ymm0
- mov rdx, qword ptr [rbp-0x08]
mov qword ptr [rbp-0x30], rdx
mov qword ptr [rbp-0x28], rdx
mov qword ptr [rbp-0x20], rdx
mov qword ptr [rbp-0x18], rdx
vmovups ymm0, ymmword ptr [rbp-0x30]
lea rax, [8*rsi]
mov rcx, rax
and rcx, -64
xor edx, edx
cmp rsi, 8
jb SHORT G_M56207_IG05
- align [3 bytes for IG04]
- ;; size=64 bbWeight=0.50 PerfScore 7.04
+ align [0 bytes for IG04]
+ ;; size=48 bbWeight=0.50 PerfScore 5.75
G_M56207_IG04:
vmovups ymmword ptr [rdi+rdx], ymm0
vmovups ymmword ptr [rdi+rdx+0x20], ymm0
add rdx, 64
cmp rdx, rcx
jb SHORT G_M56207_IG04
;; size=20 bbWeight=4 PerfScore 22.00
G_M56207_IG05:
test al, 32
je SHORT G_M56207_IG06
vmovups ymmword ptr [rdi+rdx], ymm0
;; size=9 bbWeight=0.50 PerfScore 1.62
G_M56207_IG06:
vmovups ymmword ptr [rdi+rax-0x20], ymm0
;; size=6 bbWeight=0.50 PerfScore 1.00
G_M56207_IG07:
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
G_M56207_IG08:
xor eax, eax
cmp rsi, 8
jb SHORT G_M56207_IG10
mov rcx, rsi
and rcx, -8
align [5 bytes for IG09]
;; size=20 bbWeight=0.50 PerfScore 1.12
G_M56207_IG09:
lea r8, [8*rax]
mov qword ptr [rdi+r8], rdx
mov qword ptr [rdi+r8+0x08], rdx
mov qword ptr [rdi+r8+0x10], rdx
mov qword ptr [rdi+r8+0x18], rdx
mov qword ptr [rdi+r8+0x20], rdx
mov qword ptr [rdi+r8+0x28], rdx
mov qword ptr [rdi+r8+0x30], rdx
mov qword ptr [rdi+r8+0x38], rdx
add rax, 8
cmp rax, rcx
jb SHORT G_M56207_IG09
;; size=56 bbWeight=4 PerfScore 40.00
G_M56207_IG10:
test sil, 4
je SHORT G_M56207_IG11
lea rcx, [8*rax]
mov qword ptr [rdi+rcx], rdx
mov qword ptr [rdi+rcx+0x08], rdx
mov qword ptr [rdi+rcx+0x10], rdx
mov qword ptr [rdi+rcx+0x18], rdx
add rax, 4
;; size=37 bbWeight=0.50 PerfScore 3.00
G_M56207_IG11:
test sil, 2
je SHORT G_M56207_IG12
lea rcx, [8*rax]
mov qword ptr [rdi+rcx], rdx
mov qword ptr [rdi+rcx+0x08], rdx
add rax, 2
;; size=27 bbWeight=0.50 PerfScore 2.00
G_M56207_IG12:
test sil, 1
je SHORT G_M56207_IG13
mov qword ptr [rdi+8*rax], rdx
;; size=10 bbWeight=0.50 PerfScore 1.12
G_M56207_IG13:
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
-; Total bytes of code 283, prolog size 10, PerfScore 84.67, instruction count 73, allocated bytes for code 283 (MethodHash=b4fe2470) for method System.SpanHelpers:Fill[System.Nullable`1[int]](byref,ulong,System.Nullable`1[int]) (FullOpts)
+; Total bytes of code 267, prolog size 10, PerfScore 83.38, instruction count 70, allocated bytes for code 267 (MethodHash=b4fe2470) for method System.SpanHelpers:Fill[System.Nullable`1[int]](byref,ulong,System.Nullable`1[int]) (FullOpts)

-15 (-6.10 % of base) - System.SpanHelpers:Fill[ubyte](byref,ulong,ubyte)
; Assembly listing for method System.SpanHelpers:Fill[ubyte](byref,ulong,ubyte) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data
+; 0 inlinees with PGO data; 1 single block inlinees; 3 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T01] ( 18, 38 ) byref -> rdi single-def
-; V01 arg1 [V01,T08] ( 9, 6 ) long -> rsi single-def
+; V01 arg1 [V01,T07] ( 9, 6 ) long -> rsi single-def
; V02 arg2 [V02,T02] ( 18, 37.75) ubyte -> rdx single-def
; V03 loc0 [V03,T00] ( 23, 50 ) long -> rax
-;* V04 loc1 [V04 ] ( 0, 0 ) ubyte -> zero-ref ld-addr-op
-; V05 loc2 [V05 ] ( 5, 13 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector`1[ubyte]>
-; V06 loc3 [V06,T06] ( 5, 9.50) byref -> rdi single-def
-; V07 loc4 [V07,T11] ( 5, 2.50) long -> rsi
-; V08 loc5 [V08,T09] ( 2, 4.50) long -> rax
-; V09 loc6 [V09,T03] ( 7, 21 ) long -> rcx
-;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V11 loc8 [V11,T10] ( 2, 4.50) long -> rcx
-;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V13 tmp1 [V13,T05] ( 2, 16 ) long -> rax "dup spill"
-;* V14 tmp2 [V14 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V15 tmp3 [V15,T04] ( 5, 16.25) int -> rax "Inline stloc first use temp"
-; V16 tmp4 [V16,T07] ( 2, 8.50) ubyte -> rdx "Inlining Arg"
+; V04 loc1 [V04,T12] ( 5, 9.50) simd32 -> mm0 <System.Numerics.Vector`1[ubyte]>
+; V05 loc2 [V05,T06] ( 5, 9.50) byref -> rdi single-def
+; V06 loc3 [V06,T11] ( 5, 2.50) long -> rsi
+; V07 loc4 [V07,T08] ( 2, 4.50) long -> rax
+; V08 loc5 [V08,T03] ( 7, 21 ) long -> rcx
+; V09 loc6 [V09,T09] ( 2, 4.50) long -> rcx
+;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V11 tmp1 [V11,T05] ( 2, 16 ) long -> rax "dup spill"
+;* V12 tmp2 [V12 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V13 tmp3 [V13,T10] ( 2, 4.25) ubyte -> rdx "Inline return value spill temp"
+;* V14 tmp4 [V14 ] ( 0, 0 ) ubyte -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V15 tmp5 [V15 ] ( 0, 0 ) ubyte -> zero-ref ld-addr-op "Inlining Arg"
+; V16 tmp6 [V16 ] ( 2, 4.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[ubyte]>
+; V17 tmp7 [V17,T04] ( 5, 16.25) int -> rax "Inline stloc first use temp"
;
; Lcl frame size = 48
G_M56047_IG01:
push rbp
sub rsp, 48
lea rbp, [rsp+0x30]
;; size=10 bbWeight=1 PerfScore 1.75
G_M56047_IG02:
cmp rsi, 32
jb SHORT G_M56047_IG10
- ;; NOP compensation instructions of 4 bytes.
- ;; size=10 bbWeight=1 PerfScore 1.25
+ ;; size=6 bbWeight=1 PerfScore 1.25
G_M56047_IG03:
movzx rdx, dl
xor eax, eax
- align [7 bytes for IG04]
- ;; size=12 bbWeight=0.25 PerfScore 0.19
+ align [11 bytes for IG04]
+ ;; size=16 bbWeight=0.25 PerfScore 0.19
G_M56047_IG04:
lea rcx, [rbp-0x30]
movsxd r8, eax
mov byte ptr [rcx+r8], dl
inc eax
cmp eax, 32
jl SHORT G_M56047_IG04
;; size=18 bbWeight=4 PerfScore 13.00
G_M56047_IG05:
+ vmovups ymm0, ymmword ptr [rbp-0x30]
mov rax, rsi
and rax, -64
xor ecx, ecx
cmp rsi, 64
jb SHORT G_M56047_IG07
align [0 bytes for IG06]
- ;; size=15 bbWeight=0.50 PerfScore 1.00
+ ;; size=20 bbWeight=0.50 PerfScore 3.00
G_M56047_IG06:
- vmovups ymm0, ymmword ptr [rbp-0x30]
vmovups ymmword ptr [rdi+rcx], ymm0
- vmovups ymm0, ymmword ptr [rbp-0x30]
vmovups ymmword ptr [rdi+rcx+0x20], ymm0
add rcx, 64
cmp rcx, rax
jb SHORT G_M56047_IG06
- ;; size=30 bbWeight=4 PerfScore 54.00
+ ;; size=20 bbWeight=4 PerfScore 22.00
G_M56047_IG07:
test sil, 32
je SHORT G_M56047_IG08
- vmovups ymm0, ymmword ptr [rbp-0x30]
vmovups ymmword ptr [rdi+rcx], ymm0
- ;; size=16 bbWeight=0.50 PerfScore 3.62
+ ;; size=11 bbWeight=0.50 PerfScore 1.62
G_M56047_IG08:
- vmovups ymm0, ymmword ptr [rbp-0x30]
vmovups ymmword ptr [rdi+rsi-0x20], ymm0
- ;; size=11 bbWeight=0.50 PerfScore 3.00
+ ;; size=6 bbWeight=0.50 PerfScore 1.00
G_M56047_IG09:
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
G_M56047_IG10:
xor eax, eax
cmp rsi, 8
jb SHORT G_M56047_IG12
mov rcx, rsi
and rcx, -8
align [0 bytes for IG11]
;; size=15 bbWeight=0.50 PerfScore 1.00
G_M56047_IG11:
mov byte ptr [rdi+rax], dl
mov byte ptr [rdi+rax+0x01], dl
mov byte ptr [rdi+rax+0x02], dl
mov byte ptr [rdi+rax+0x03], dl
mov byte ptr [rdi+rax+0x04], dl
mov byte ptr [rdi+rax+0x05], dl
mov byte ptr [rdi+rax+0x06], dl
mov byte ptr [rdi+rax+0x07], dl
add rax, 8
cmp rax, rcx
jb SHORT G_M56047_IG11
;; size=40 bbWeight=4 PerfScore 38.00
G_M56047_IG12:
test sil, 4
je SHORT G_M56047_IG13
mov byte ptr [rdi+rax], dl
mov byte ptr [rdi+rax+0x01], dl
mov byte ptr [rdi+rax+0x02], dl
mov byte ptr [rdi+rax+0x03], dl
add rax, 4
;; size=25 bbWeight=0.50 PerfScore 2.75
G_M56047_IG13:
test sil, 2
je SHORT G_M56047_IG14
mov byte ptr [rdi+rax], dl
mov byte ptr [rdi+rax+0x01], dl
add rax, 2
;; size=17 bbWeight=0.50 PerfScore 1.75
G_M56047_IG14:
test sil, 1
je SHORT G_M56047_IG15
mov byte ptr [rdi+rax], dl
;; size=9 bbWeight=0.50 PerfScore 1.12
G_M56047_IG15:
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
-; Total bytes of code 246, prolog size 10, PerfScore 125.19, instruction count 73, allocated bytes for code 246 (MethodHash=d8592510) for method System.SpanHelpers:Fill[ubyte](byref,ulong,ubyte) (FullOpts)
+; Total bytes of code 231, prolog size 10, PerfScore 91.19, instruction count 70, allocated bytes for code 231 (MethodHash=d8592510) for method System.SpanHelpers:Fill[ubyte](byref,ulong,ubyte) (FullOpts)

-12 (-6.45 % of base) - System.SpanHelpers:Fill[int](byref,ulong,int)
; Assembly listing for method System.SpanHelpers:Fill[int](byref,ulong,int) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data
+; 0 inlinees with PGO data; 1 single block inlinees; 3 inlinees without PGO data
; Final local variable assignments
;
-; V00 arg0 [V00,T03] ( 10, 6 ) byref -> rdi single-def
-; V01 arg1 [V01,T05] ( 8, 5.50) long -> rsi single-def
-; V02 arg2 [V02,T04] ( 10, 6 ) int -> rdx single-def
-; V03 loc0 [V03,T06] ( 12, 6 ) long -> rax
-;* V04 loc1 [V04 ] ( 0, 0 ) int -> zero-ref ld-addr-op
-; V05 loc2 [V05,T10] ( 5, 9.50) simd32 -> mm0 ld-addr-op <System.Numerics.Vector`1[ubyte]>
-; V06 loc3 [V06,T01] ( 5, 9.50) byref -> rdi single-def
-; V07 loc4 [V07,T08] ( 4, 2 ) long -> rax
-; V08 loc5 [V08,T07] ( 2, 4.50) long -> rcx
-; V09 loc6 [V09,T00] ( 7, 21 ) long -> rdx
-;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-;* V11 loc8 [V11 ] ( 0, 0 ) long -> zero-ref
-;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;* V13 tmp1 [V13 ] ( 0, 0 ) long -> zero-ref "dup spill"
-; V14 tmp2 [V14 ] ( 10, 10 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[uint]>
-;* V15 tmp3 [V15 ] ( 0, 0 ) simd32 -> zero-ref
-;* V16 tmp4 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V17 tmp5 [V17,T09] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
-; V18 tmp6 [V18,T02] ( 9, 9 ) int -> rdx "Inlining Arg"
+; V00 arg0 [V00,T02] ( 10, 6 ) byref -> rdi single-def
+; V01 arg1 [V01,T04] ( 8, 5.50) long -> rsi single-def
+; V02 arg2 [V02,T03] ( 10, 6 ) int -> rdx single-def
+; V03 loc0 [V03,T05] ( 12, 6 ) long -> rax
+; V04 loc1 [V04,T10] ( 5, 9.50) simd32 -> mm0 <System.Numerics.Vector`1[ubyte]>
+; V05 loc2 [V05,T01] ( 5, 9.50) byref -> rdi single-def
+; V06 loc3 [V06,T08] ( 4, 2 ) long -> rax
+; V07 loc4 [V07,T07] ( 2, 4.50) long -> rcx
+; V08 loc5 [V08,T00] ( 7, 21 ) long -> rdx
+;* V09 loc6 [V09 ] ( 0, 0 ) long -> zero-ref
+;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+;* V11 tmp1 [V11 ] ( 0, 0 ) long -> zero-ref "dup spill"
+;* V12 tmp2 [V12 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V13 tmp3 [V13 ] ( 0, 0 ) simd32 -> zero-ref
+;* V14 tmp4 [V14 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V15 tmp5 [V15,T06] ( 9, 4.50) int -> rdx "Inline return value spill temp"
+;* V16 tmp6 [V16 ] ( 0, 0 ) int -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V17 tmp7 [V17 ] ( 0, 0 ) int -> zero-ref ld-addr-op "Inlining Arg"
+; V18 tmp8 [V18 ] ( 9, 4.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[uint]>
+;* V19 tmp9 [V19,T09] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
;
; Lcl frame size = 48
G_M11887_IG01:
push rbp
sub rsp, 48
lea rbp, [rsp+0x30]
;; size=10 bbWeight=1 PerfScore 1.75
G_M11887_IG02:
cmp rsi, 8
jb SHORT G_M11887_IG08
;; size=6 bbWeight=1 PerfScore 1.25
G_M11887_IG03:
- vxorps ymm0, ymm0, ymm0
- vmovups ymmword ptr [rbp-0x30], ymm0
mov dword ptr [rbp-0x30], edx
mov dword ptr [rbp-0x2C], edx
mov dword ptr [rbp-0x28], edx
mov dword ptr [rbp-0x24], edx
mov dword ptr [rbp-0x20], edx
mov dword ptr [rbp-0x1C], edx
mov dword ptr [rbp-0x18], edx
mov dword ptr [rbp-0x14], edx
vmovups ymm0, ymmword ptr [rbp-0x30]
lea rax, [4*rsi]
mov rcx, rax
and rcx, -64
xor edx, edx
cmp rsi, 16
jb SHORT G_M11887_IG05
- align [3 bytes for IG04]
- ;; size=64 bbWeight=0.50 PerfScore 8.04
+ align [0 bytes for IG04]
+ ;; size=52 bbWeight=0.50 PerfScore 7.25
G_M11887_IG04:
vmovups ymmword ptr [rdi+rdx], ymm0
vmovups ymmword ptr [rdi+rdx+0x20], ymm0
add rdx, 64
cmp rdx, rcx
jb SHORT G_M11887_IG04
;; size=20 bbWeight=4 PerfScore 22.00
G_M11887_IG05:
test al, 32
je SHORT G_M11887_IG06
vmovups ymmword ptr [rdi+rdx], ymm0
;; size=9 bbWeight=0.50 PerfScore 1.62
G_M11887_IG06:
vmovups ymmword ptr [rdi+rax-0x20], ymm0
;; size=6 bbWeight=0.50 PerfScore 1.00
G_M11887_IG07:
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
G_M11887_IG08:
xor eax, eax
test sil, 4
je SHORT G_M11887_IG09
mov dword ptr [rdi+4*rax], edx
mov dword ptr [rdi+4*rax+0x04], edx
mov dword ptr [rdi+4*rax+0x08], edx
mov dword ptr [rdi+4*rax+0x0C], edx
add rax, 4
;; size=27 bbWeight=0.50 PerfScore 2.88
G_M11887_IG09:
test sil, 2
je SHORT G_M11887_IG10
mov dword ptr [rdi+4*rax], edx
mov dword ptr [rdi+4*rax+0x04], edx
add rax, 2
;; size=17 bbWeight=0.50 PerfScore 1.75
G_M11887_IG10:
test sil, 1
je SHORT G_M11887_IG11
mov dword ptr [rdi+4*rax], edx
;; size=9 bbWeight=0.50 PerfScore 1.12
G_M11887_IG11:
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
-; Total bytes of code 186, prolog size 10, PerfScore 44.17, instruction count 56, allocated bytes for code 186 (MethodHash=2dc3d190) for method System.SpanHelpers:Fill[int](byref,ulong,int) (FullOpts)
+; Total bytes of code 174, prolog size 10, PerfScore 43.38, instruction count 54, allocated bytes for code 174 (MethodHash=2dc3d190) for method System.SpanHelpers:Fill[int](byref,ulong,int) (FullOpts)

-5 (-1.97 % of base) - System.SpanHelpers:Fill[ushort](byref,ulong,ushort)
; Assembly listing for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
-; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data
+; 0 inlinees with PGO data; 1 single block inlinees; 3 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T01] ( 18, 38 ) byref -> rdi single-def
; V01 arg1 [V01,T07] ( 10, 6.50) long -> rsi single-def
; V02 arg2 [V02,T02] ( 18, 37.75) ushort -> rdx single-def
; V03 loc0 [V03,T00] ( 23, 50 ) long -> rax
-; V04 loc1 [V04,T12] ( 2, 0.50) ushort -> rdx ld-addr-op
-; V05 loc2 [V05,T13] ( 5, 9.50) simd32 -> mm0 ld-addr-op <System.Numerics.Vector`1[ubyte]>
-; V06 loc3 [V06,T06] ( 5, 9.50) byref -> rdi single-def
-; V07 loc4 [V07,T11] ( 4, 2 ) long -> rax
-; V08 loc5 [V08,T09] ( 2, 4.50) long -> rcx
-; V09 loc6 [V09,T03] ( 7, 21 ) long -> rdx
-;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V11 loc8 [V11,T10] ( 2, 4.50) long -> rcx
-;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V13 tmp1 [V13,T05] ( 2, 16 ) long -> rax "dup spill"
-; V14 tmp2 [V14 ] ( 3, 9.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[ushort]>
-;* V15 tmp3 [V15 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-; V16 tmp4 [V16,T04] ( 5, 16.25) int -> rax "Inline stloc first use temp"
-; V17 tmp5 [V17,T08] ( 2, 8.50) ushort -> rdx "Inlining Arg"
+; V04 loc1 [V04,T12] ( 5, 9.50) simd32 -> mm0 <System.Numerics.Vector`1[ubyte]>
+; V05 loc2 [V05,T06] ( 5, 9.50) byref -> rdi single-def
+; V06 loc3 [V06,T11] ( 4, 2 ) long -> rax
+; V07 loc4 [V07,T08] ( 2, 4.50) long -> rcx
+; V08 loc5 [V08,T03] ( 7, 21 ) long -> rdx
+; V09 loc6 [V09,T09] ( 2, 4.50) long -> rcx
+;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V11 tmp1 [V11,T05] ( 2, 16 ) long -> rax "dup spill"
+;* V12 tmp2 [V12 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V13 tmp3 [V13 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V14 tmp4 [V14,T10] ( 2, 4.25) ushort -> rdx "Inline return value spill temp"
+;* V15 tmp5 [V15 ] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inline ldloca(s) first use temp"
+;* V16 tmp6 [V16 ] ( 0, 0 ) ushort -> zero-ref ld-addr-op "Inlining Arg"
+; V17 tmp7 [V17 ] ( 2, 4.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[ushort]>
+; V18 tmp8 [V18,T04] ( 5, 16.25) int -> rax "Inline stloc first use temp"
;
; Lcl frame size = 48
G_M51983_IG01:
push rbp
sub rsp, 48
lea rbp, [rsp+0x30]
;; size=10 bbWeight=1 PerfScore 1.75
G_M51983_IG02:
cmp rsi, 16
jb SHORT G_M51983_IG10
- ;; NOP compensation instructions of 4 bytes.
- ;; size=10 bbWeight=1 PerfScore 1.25
+ ;; size=6 bbWeight=1 PerfScore 1.25
G_M51983_IG03:
movzx rdx, dx
- vxorps ymm0, ymm0, ymm0
- vmovups ymmword ptr [rbp-0x30], ymm0
xor eax, eax
- align [0 bytes for IG04]
- ;; size=14 bbWeight=0.25 PerfScore 0.46
+ align [11 bytes for IG04]
+ ;; size=16 bbWeight=0.25 PerfScore 0.19
G_M51983_IG04:
lea rcx, [rbp-0x30]
movsxd r8, eax
mov word ptr [rcx+2*r8], dx
inc eax
cmp eax, 16
jl SHORT G_M51983_IG04
;; size=19 bbWeight=4 PerfScore 13.00
G_M51983_IG05:
vmovups ymm0, ymmword ptr [rbp-0x30]
lea rax, [rsi+rsi]
mov rcx, rax
and rcx, -64
xor edx, edx
cmp rsi, 32
jb SHORT G_M51983_IG07
- align [3 bytes for IG06]
- ;; size=27 bbWeight=0.50 PerfScore 3.38
+ align [0 bytes for IG06]
+ ;; size=24 bbWeight=0.50 PerfScore 3.25
G_M51983_IG06:
vmovups ymmword ptr [rdi+rdx], ymm0
vmovups ymmword ptr [rdi+rdx+0x20], ymm0
add rdx, 64
cmp rdx, rcx
jb SHORT G_M51983_IG06
;; size=20 bbWeight=4 PerfScore 22.00
G_M51983_IG07:
test al, 32
je SHORT G_M51983_IG08
vmovups ymmword ptr [rdi+rdx], ymm0
;; size=9 bbWeight=0.50 PerfScore 1.62
G_M51983_IG08:
vmovups ymmword ptr [rdi+rax-0x20], ymm0
;; size=6 bbWeight=0.50 PerfScore 1.00
G_M51983_IG09:
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
G_M51983_IG10:
xor eax, eax
cmp rsi, 8
jb SHORT G_M51983_IG12
mov rcx, rsi
and rcx, -8
align [0 bytes for IG11]
;; size=15 bbWeight=0.50 PerfScore 1.00
G_M51983_IG11:
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
mov word ptr [rdi+2*rax+0x04], dx
mov word ptr [rdi+2*rax+0x06], dx
mov word ptr [rdi+2*rax+0x08], dx
mov word ptr [rdi+2*rax+0x0A], dx
mov word ptr [rdi+2*rax+0x0C], dx
mov word ptr [rdi+2*rax+0x0E], dx
add rax, 8
cmp rax, rcx
jb SHORT G_M51983_IG11
;; size=48 bbWeight=4 PerfScore 38.00
G_M51983_IG12:
test sil, 4
je SHORT G_M51983_IG13
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
mov word ptr [rdi+2*rax+0x04], dx
mov word ptr [rdi+2*rax+0x06], dx
add rax, 4
;; size=29 bbWeight=0.50 PerfScore 2.75
G_M51983_IG13:
test sil, 2
je SHORT G_M51983_IG14
mov word ptr [rdi+2*rax], dx
mov word ptr [rdi+2*rax+0x02], dx
add rax, 2
;; size=19 bbWeight=0.50 PerfScore 1.75
G_M51983_IG14:
test sil, 1
je SHORT G_M51983_IG15
mov word ptr [rdi+2*rax], dx
;; size=10 bbWeight=0.50 PerfScore 1.12
G_M51983_IG15:
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
-; Total bytes of code 254, prolog size 10, PerfScore 91.83, instruction count 73, allocated bytes for code 254 (MethodHash=267434f0) for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts)
+; Total bytes of code 249, prolog size 10, PerfScore 91.44, instruction count 71, allocated bytes for code 249 (MethodHash=267434f0) for method System.SpanHelpers:Fill[ushort](byref,ulong,ushort) (FullOpts)

-4 (-1.53 % of base) - System.SpanHelpers:Fill[double](byref,ulong,double)
; Assembly listing for method System.SpanHelpers:Fill[double](byref,ulong,double) (FullOpts)
; Emitting BLENDED_CODE for X64 with AVX512 - Unix
; FullOpts code
; optimized code
; rbp based frame
; fully interruptible
; No PGO data
; 0 inlinees with PGO data; 1 single block inlinees; 2 inlinees without PGO data
; Final local variable assignments
;
; V00 arg0 [V00,T01] ( 18, 38 ) byref -> rdi single-def
; V01 arg1 [V01,T05] ( 10, 6.50) long -> rsi single-def
-; V02 arg2 [V02,T10] ( 18, 38 ) double -> mm0 single-def
+; V02 arg2 [V02,T10] ( 21, 39.50) double -> mm0 single-def
; V03 loc0 [V03,T00] ( 23, 50 ) long -> rax
-;* V04 loc1 [V04 ] ( 0, 0 ) double -> zero-ref ld-addr-op
-; V05 loc2 [V05,T11] ( 5, 9.50) simd32 -> mm0 ld-addr-op <System.Numerics.Vector`1[ubyte]>
-; V06 loc3 [V06,T04] ( 5, 9.50) byref -> rdi single-def
-; V07 loc4 [V07,T08] ( 4, 2 ) long -> rax
-; V08 loc5 [V08,T06] ( 2, 4.50) long -> rcx
-; V09 loc6 [V09,T02] ( 7, 21 ) long -> rdx
-;* V10 loc7 [V10 ] ( 0, 0 ) simd16 -> zero-ref <System.Runtime.Intrinsics.Vector128`1[ubyte]>
-; V11 loc8 [V11,T07] ( 2, 4.50) long -> rcx
-;# V12 OutArgs [V12 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-; V13 tmp1 [V13,T03] ( 2, 16 ) long -> rax "dup spill"
-; V14 tmp2 [V14 ] ( 6, 6 ) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[double]>
-;* V15 tmp3 [V15 ] ( 0, 0 ) simd32 -> zero-ref
-;* V16 tmp4 [V16 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
-;* V17 tmp5 [V17,T09] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
-; V18 tmp6 [V18,T12] ( 5, 5 ) double -> mm0 "Inlining Arg"
+; V04 loc1 [V04,T11] ( 5, 9.50) simd32 -> mm0 <System.Numerics.Vector`1[ubyte]>
+; V05 loc2 [V05,T04] ( 5, 9.50) byref -> rdi single-def
+; V06 loc3 [V06,T08] ( 4, 2 ) long -> rax
+; V07 loc4 [V07,T06] ( 2, 4.50) long -> rcx
+; V08 loc5 [V08,T02] ( 7, 21 ) long -> rdx
+; V09 loc6 [V09,T07] ( 2, 4.50) long -> rcx
+;# V10 OutArgs [V10 ] ( 1, 1 ) struct ( 0) [rsp+0x00] do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+; V11 tmp1 [V11,T03] ( 2, 16 ) long -> rax "dup spill"
+;* V12 tmp2 [V12 ] ( 0, 0 ) simd32 -> zero-ref "spilled call-like call argument"
+;* V13 tmp3 [V13 ] ( 0, 0 ) simd32 -> zero-ref
+;* V14 tmp4 [V14 ] ( 0, 0 ) ubyte -> zero-ref "Inline return value spill temp"
+; V15 tmp5 [V15 ] ( 5, 2.50) simd32 -> [rbp-0x30] do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Vector`1[double]>
+;* V16 tmp6 [V16,T09] ( 0, 0 ) int -> zero-ref "Inline stloc first use temp"
;
; Lcl frame size = 48
; ---------------------------------------------------------------------
; Code body of System.SpanHelpers:Fill[double](byref,ulong,double), shown
; as a base-vs-diff listing: lines prefixed with '-' exist only in the
; base codegen, lines prefixed with '+' only in the diff codegen, and
; unprefixed lines are common to both.
;
; Register roles, per the local variable table above this listing:
;   rdi  = arg0 (byref)  - destination base address
;   rsi  = arg1 (long)   - count, compared against in units of 8-byte stores
;   xmm0 = arg2 (double) - value to store
;
; Two paths are visible:
;   IG03-IG07  32-byte (ymm) stores, taken when rsi >= 4
;   IG08-IG13  8-byte scalar stores, taken when rsi < 4
; ---------------------------------------------------------------------
G_M41871_IG01:
; Prologue: save rbp, reserve the 48-byte local frame, point rbp at its top.
push rbp
sub rsp, 48
lea rbp, [rsp+0x30]
;; size=10 bbWeight=1 PerfScore 1.75
G_M41871_IG02:
; Unsigned check: fewer than 4 elements -> scalar path at IG08.
cmp rsi, 4
jb SHORT G_M41871_IG08
;; size=6 bbWeight=1 PerfScore 1.25
G_M41871_IG03:
; Base only: zero the 32-byte stack slot at [rbp-0x30] before filling it.
; The diff drops these two instructions; every byte of the slot is
; overwritten by the four 8-byte stores that follow.
- vxorps ymm1, ymm1, ymm1
- vmovups ymmword ptr [rbp-0x30], ymm1
; Write the value into each of the four 8-byte lanes of the stack slot,
; then reload the slot as one 32-byte vector: ymm0 = 4 copies of the value.
vmovsd qword ptr [rbp-0x30], xmm0
vmovsd qword ptr [rbp-0x28], xmm0
vmovsd qword ptr [rbp-0x20], xmm0
vmovsd qword ptr [rbp-0x18], xmm0
vmovups ymm0, ymmword ptr [rbp-0x30]
; rax = total byte count (rsi * 8).
lea rax, [8*rsi]
; rcx = byte count rounded down to a multiple of 64 (loop bound).
mov rcx, rax
and rcx, -64
; rdx = running byte offset, starts at 0.
xor edx, edx
; Fewer than 8 elements (< 64 bytes): skip the 64-byte loop.
cmp rsi, 8
jb SHORT G_M41871_IG05
align [0 bytes for IG04]
- ;; size=57 bbWeight=0.50 PerfScore 5.92
+ ;; size=48 bbWeight=0.50 PerfScore 5.25
G_M41871_IG04:
; Main vector loop: two 32-byte stores (64 bytes) per iteration,
; repeated while the byte offset rdx is below rcx (unsigned).
vmovups ymmword ptr [rdi+rdx], ymm0
vmovups ymmword ptr [rdi+rdx+0x20], ymm0
add rdx, 64
cmp rdx, rcx
jb SHORT G_M41871_IG04
;; size=20 bbWeight=4 PerfScore 22.00
G_M41871_IG05:
; If bit 5 (value 32) of the byte count is set, store one more 32-byte
; vector at the current offset. rdx is not advanced afterwards.
test al, 32
je SHORT G_M41871_IG06
vmovups ymmword ptr [rdi+rdx], ymm0
;; size=9 bbWeight=0.50 PerfScore 1.62
G_M41871_IG06:
; Unconditionally store the last 32 bytes of the range, i.e. the vector
; ending at rdi+rax. This may overlap bytes already written above.
; On this path rsi >= 4, so rax >= 32 and the address is not below rdi.
vmovups ymmword ptr [rdi+rax-0x20], ymm0
;; size=6 bbWeight=0.50 PerfScore 1.00
G_M41871_IG07:
; Epilogue for the vector path.
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
G_M41871_IG08:
; Scalar path: rax = element index, starts at 0.
xor eax, eax
; Fewer than 8 elements: skip the 8x unrolled loop.
; NOTE(review): in this listing IG08 is entered only via the jb at IG02
; (rsi < 4), so this branch looks always-taken and IG09 unreachable here;
; likewise the "test sil, 4" in IG10 could not be set. Confirm against the
; source method before relying on this.
cmp rsi, 8
jb SHORT G_M41871_IG10
; rcx = count rounded down to a multiple of 8 (loop bound).
mov rcx, rsi
and rcx, -8
; The only change in this block is loop-alignment padding (0 -> 5 bytes),
; which accounts for the block size going from 15 to 20.
- align [0 bytes for IG09]
- ;; size=15 bbWeight=0.50 PerfScore 1.00
+ align [5 bytes for IG09]
+ ;; size=20 bbWeight=0.50 PerfScore 1.12
G_M41871_IG09:
; 8x unrolled scalar loop: eight 8-byte stores per iteration,
; repeated while the element index rax is below rcx (unsigned).
vmovsd qword ptr [rdi+8*rax], xmm0
vmovsd qword ptr [rdi+8*rax+0x08], xmm0
vmovsd qword ptr [rdi+8*rax+0x10], xmm0
vmovsd qword ptr [rdi+8*rax+0x18], xmm0
vmovsd qword ptr [rdi+8*rax+0x20], xmm0
vmovsd qword ptr [rdi+8*rax+0x28], xmm0
vmovsd qword ptr [rdi+8*rax+0x30], xmm0
vmovsd qword ptr [rdi+8*rax+0x38], xmm0
add rax, 8
cmp rax, rcx
jb SHORT G_M41871_IG09
;; size=56 bbWeight=4 PerfScore 70.00
G_M41871_IG10:
; Remainder handling keyed on the low bits of the count.
; Bit 2 set: store 4 more elements and advance the index by 4.
test sil, 4
je SHORT G_M41871_IG11
vmovsd qword ptr [rdi+8*rax], xmm0
vmovsd qword ptr [rdi+8*rax+0x08], xmm0
vmovsd qword ptr [rdi+8*rax+0x10], xmm0
vmovsd qword ptr [rdi+8*rax+0x18], xmm0
add rax, 4
;; size=33 bbWeight=0.50 PerfScore 4.75
G_M41871_IG11:
; Bit 1 set: store 2 more elements and advance the index by 2.
test sil, 2
je SHORT G_M41871_IG12
vmovsd qword ptr [rdi+8*rax], xmm0
vmovsd qword ptr [rdi+8*rax+0x08], xmm0
add rax, 2
;; size=21 bbWeight=0.50 PerfScore 2.75
G_M41871_IG12:
; Bit 0 set: store the final single element.
test sil, 1
je SHORT G_M41871_IG13
vmovsd qword ptr [rdi+8*rax], xmm0
;; size=11 bbWeight=0.50 PerfScore 1.62
G_M41871_IG13:
; Epilogue for the scalar path.
vzeroupper
add rsp, 48
pop rbp
ret
;; size=9 bbWeight=0.50 PerfScore 1.38
-; Total bytes of code 262, prolog size 10, PerfScore 116.42, instruction count 68, allocated bytes for code 262 (MethodHash=eb775c70) for method System.SpanHelpers:Fill[double](byref,ulong,double) (FullOpts)
+; Total bytes of code 258, prolog size 10, PerfScore 115.88, instruction count 66, allocated bytes for code 258 (MethodHash=eb775c70) for method System.SpanHelpers:Fill[double](byref,ulong,double) (FullOpts) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Job completed in 14 minutes.
Diffs
Artifacts:
The text was updated successfully, but these errors were encountered: