Skip to content

Commit

Permalink
aarch64: Use LDIAPP/STILP if FEAT_LRCPC3 and FEAT_LSE2 are available
Browse files Browse the repository at this point in the history
  • Loading branch information
taiki-e committed Feb 6, 2023
1 parent e07b647 commit e83b63a
Show file tree
Hide file tree
Showing 7 changed files with 107 additions and 6 deletions.
4 changes: 4 additions & 0 deletions .github/.cspell/project-dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,14 @@ ishld
isync
ldaxp
ldclrp
ldiapp
ldsetp
ldxp
libcalls
libtcl
libtest
lqarx
lrcpc
lwsync
machdep
mclass
Expand All @@ -82,6 +84,7 @@ quadword
qword
RAII
rclass
rcpc
reentrancy
sbcs
semihosting
Expand All @@ -98,6 +101,7 @@ sreg
sstatus
stdarch
stdsimd
stilp
stlxp
stpq
stqcx
Expand Down
6 changes: 4 additions & 2 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -179,13 +179,15 @@ fn main() {
let is_macos = target_os == "macos";
let mut has_lse = is_macos;
// FEAT_LSE2 doesn't imply FEAT_LSE. FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2.
// As of rustc 1.67, target_feature "lse2"/"lse128" is not available on rustc side:
// As of rustc 1.67, target_feature "lse2"/"lse128"/"rcpc3" is not available on rustc side:
// https://github.com/rust-lang/rust/blob/1.67.0/compiler/rustc_codegen_ssa/src/target_features.rs#L47
target_feature_if("lse2", is_macos, &version, None, false);
// LLVM supports FEAT_LSE128 on only LLVM 16+:
// FEAT_LRCPC3 and FEAT_LSE128 are supported only on LLVM 16+:
// https://github.com/llvm/llvm-project/commit/a6aaa969f7caec58a994142f8d855861cf3a1463
// https://github.com/llvm/llvm-project/commit/7fea6f2e0e606e5339c3359568f680eaf64aa306
if version.llvm >= 16 {
has_lse |= target_feature_if("lse128", false, &version, None, false);
target_feature_if("rcpc3", false, &version, None, false);
}
// aarch64_target_feature stabilized in Rust 1.61.
target_feature_if("lse", has_lse, &version, Some(61), true);
Expand Down
2 changes: 1 addition & 1 deletion src/imp/atomic128/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ The table of targets that support 128-bit atomics and the instructions used:
| target_arch | load | store | CAS | RMW | note |
| ----------- | ----- | ----- | ---- | ---- | ---- |
| x86_64 | cmpxchg16b or vmovdqa | cmpxchg16b or vmovdqa | cmpxchg16b | cmpxchg16b | cmpxchg16b target feature required. vmovdqa requires Intel or AMD CPU with AVX. <br> Both compile-time and run-time detection are supported for cmpxchg16b. vmovdqa is currently run-time detection only. <br> Requires rustc 1.59+ when cmpxchg16b target feature is enabled at compile-time, otherwise requires nightly |
| aarch64 | ldxp/stxp or casp or ldp | ldxp/stxp or casp or stp | ldxp/stxp or casp | ldxp/stxp or casp/swpp/ldclrp/ldsetp | casp requires lse target feature, ldp/stp requires lse2 target feature, swpp/ldclrp/ldsetp requires lse128 target feature. <br> Both compile-time and run-time detection are supported for lse. lse2 and lse128 are currently compile-time detection only. <br> Requires rustc 1.59+ |
| aarch64 | ldxp/stxp or casp or ldp/ldiapp | ldxp/stxp or casp or stp/stilp | ldxp/stxp or casp | ldxp/stxp or casp/swpp/ldclrp/ldsetp | casp requires lse target feature, ldp/stp requires lse2 target feature, swpp/ldclrp/ldsetp requires lse128 target feature, ldiapp/stilp requires rcpc3 target feature. <br> Both compile-time and run-time detection are supported for lse. lse2, lse128, and rcpc3 are currently compile-time detection only. <br> Requires rustc 1.59+ |
| powerpc64 | lq | stq | lqarx/stqcx. | lqarx/stqcx. | Little endian or target CPU pwr8+. <br> Requires nightly |
| s390x | lpq | stpq | cdsg | cdsg | Requires nightly |

Expand Down
33 changes: 31 additions & 2 deletions src/imp/atomic128/aarch64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// - LDXP/STXP loop (DW LL/SC)
// - CASP (DWCAS) added as FEAT_LSE (mandatory from armv8.1-a)
// - LDP/STP (DW load/store) if FEAT_LSE2 (optional from armv8.2-a, mandatory from armv8.4-a) is available
// - LDIAPP/STILP (DW acquire-load/release-store) added as FEAT_LRCPC3 (optional from armv8.9-a/armv9.4-a) (if FEAT_LSE2 is also available)
// - LDCLRP/LDSETP/SWPP (DW RMW) added as FEAT_LSE128 (optional from armv9.4-a)
//
// If outline-atomics is not enabled and FEAT_LSE is not available at
Expand All @@ -14,6 +15,7 @@
// at run-time, otherwise, use LDXP/STXP loop.
// If FEAT_LSE is available at compile-time, we use CASP for load/store/CAS/RMW.
// If FEAT_LSE2 is available at compile-time, we use LDP/STP for load/store.
// If FEAT_LSE2 and FEAT_LRCPC3 are available at compile-time, we use LDIAPP/STILP for acquire-load/release-store.
// If FEAT_LSE128 is available at compile-time, we use LDCLRP/LDSETP/SWPP for fetch_and/fetch_or/swap.
//
// Note: FEAT_LSE2 doesn't imply FEAT_LSE. FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2.
Expand Down Expand Up @@ -46,6 +48,7 @@
// - aarch64 https://godbolt.org/z/8E1PanvhY
// - aarch64 (+lse) https://godbolt.org/z/4377cG4T7
// - aarch64 (+lse,+lse2) https://godbolt.org/z/6hsdMfKWv
// - aarch64 (+lse,+lse2,+rcpc3) https://godbolt.org/z/M5TxGz5xM
// - aarch64 (+lse2,+lse128) https://godbolt.org/z/Msa1s4xcd

include!("macros.rs");
Expand Down Expand Up @@ -164,7 +167,7 @@ unsafe fn atomic_load(src: *mut u128, order: Ordering) -> u128 {
}
}
}
// If CPU supports FEAT_LSE2, LDP is single-copy atomic reads,
// If the CPU supports FEAT_LSE2, LDP/LDIAPP is a single-copy atomic read,
// otherwise it is two single-copy atomic reads.
// Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile
#[inline]
Expand Down Expand Up @@ -192,6 +195,19 @@ unsafe fn _atomic_load_ldp(src: *mut u128, order: Ordering) -> u128 {
}
match order {
Ordering::Relaxed => atomic_load!("", readonly),
#[cfg(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3"))]
Ordering::Acquire => {
// SAFETY: cfg guarantees that the CPU supports FEAT_LRCPC3.
// Refs: https://developer.arm.com/documentation/ddi0602/2022-12/Base-Instructions/LDIAPP--Load-Acquire-RCpc-ordered-Pair-of-registers-
asm!(
concat!("ldiapp {prev_lo}, {prev_hi}, [{src", ptr_modifier!(), "}]"),
src = in(reg) src,
prev_hi = lateout(reg) prev_hi,
prev_lo = lateout(reg) prev_lo,
options(nostack, preserves_flags),
);
}
#[cfg(not(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3")))]
Ordering::Acquire => atomic_load!("dmb ishld"),
Ordering::SeqCst => atomic_load!("dmb ish"),
_ => unreachable_unchecked!("{:?}", order),
Expand Down Expand Up @@ -246,7 +262,7 @@ unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
atomic_swap(dst, val, order);
}
}
// If CPU supports FEAT_LSE2, STP is single-copy atomic writes,
// If the CPU supports FEAT_LSE2, STP/STILP is a single-copy atomic write,
// otherwise it is two single-copy atomic writes.
// Refs: B2.2.1 of the Arm Architecture Reference Manual Armv8, for Armv8-A architecture profile
#[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2", test))]
Expand Down Expand Up @@ -276,6 +292,19 @@ unsafe fn _atomic_store_stp(dst: *mut u128, val: u128, order: Ordering) {
}
match order {
Ordering::Relaxed => atomic_store!("", ""),
#[cfg(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3"))]
Ordering::Release => {
// SAFETY: cfg guarantees that the CPU supports FEAT_LRCPC3.
// Refs: https://developer.arm.com/documentation/ddi0602/2022-12/Base-Instructions/STILP--Store-Release-ordered-Pair-of-registers-
asm!(
concat!("stilp {val_lo}, {val_hi}, [{dst", ptr_modifier!(), "}]"),
dst = in(reg) dst,
val_lo = in(reg) val.pair.lo,
val_hi = in(reg) val.pair.hi,
options(nostack, preserves_flags),
);
}
#[cfg(not(any(target_feature = "rcpc3", portable_atomic_target_feature = "rcpc3")))]
Ordering::Release => atomic_store!("", "dmb ish"),
Ordering::SeqCst => atomic_store!("dmb ish", "dmb ish"),
_ => unreachable_unchecked!("{:?}", order),
Expand Down
41 changes: 40 additions & 1 deletion src/imp/atomic128/detect/aarch64_aa64reg.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ include!("common.rs");
// Raw 64-bit ID register values that the feature detection code inspects.
// Only `aa64isar0` is collected in non-test builds; the other fields exist
// solely under `cfg(test)`.
struct AA64Reg {
// Presumably the value of ID_AA64ISAR0_EL1 (on OpenBSD it is obtained through
// the CPU_ID_AA64ISAR0 sysctl) -- NOTE(review): confirm against the readers.
aa64isar0: u64,
// Value of ID_AA64ISAR1_EL1; bits 23..20 are checked for FEAT_LRCPC3 in tests.
#[cfg(test)]
aa64isar1: u64,
// Value of ID_AA64MMFR2_EL1; bits 35..32 are checked for FEAT_LSE2 in tests.
#[cfg(test)]
aa64mmfr2: u64,
}

Expand All @@ -45,6 +47,8 @@ fn _detect(info: &mut CpuInfo) {
let AA64Reg {
aa64isar0,
#[cfg(test)]
aa64isar1,
#[cfg(test)]
aa64mmfr2,
} = imp::aa64reg();

Expand All @@ -63,6 +67,11 @@ fn _detect(info: &mut CpuInfo) {

#[cfg(test)]
{
// ID_AA64ISAR1_EL1, Instruction Set Attribute Register 1
// https://developer.arm.com/documentation/ddi0601/2022-12/AArch64-Registers/ID-AA64ISAR1-EL1--AArch64-Instruction-Set-Attribute-Register-1?lang=en
if extract(aa64isar1, 23, 20) >= 3 {
info.set(CpuInfo::HAS_RCPC3);
}
// ID_AA64MMFR2_EL1, AArch64 Memory Model Feature Register 2
// https://developer.arm.com/documentation/ddi0601/2022-12/AArch64-Registers/ID-AA64MMFR2-EL1--AArch64-Memory-Model-Feature-Register-2?lang=en
if extract(aa64mmfr2, 35, 32) >= 1 {
Expand Down Expand Up @@ -96,6 +105,16 @@ mod imp {
options(pure, nomem, nostack, preserves_flags)
);
#[cfg(test)]
let aa64isar1: u64;
#[cfg(test)]
{
asm!(
"mrs {}, ID_AA64ISAR1_EL1",
out(reg) aa64isar1,
options(pure, nomem, nostack, preserves_flags)
);
}
#[cfg(test)]
let aa64mmfr2: u64;
#[cfg(test)]
{
Expand All @@ -108,6 +127,8 @@ mod imp {
AA64Reg {
aa64isar0,
#[cfg(test)]
aa64isar1,
#[cfg(test)]
aa64mmfr2,
}
}
Expand Down Expand Up @@ -155,6 +176,8 @@ mod imp {
// https://github.com/openbsd/src/blob/72ccc03bd11da614f31f7ff76e3f6fce99bc1c79/sys/arch/arm64/include/cpu.h#L25-L40
pub(crate) const CPU_ID_AA64ISAR0: c_int = 2;
#[cfg(test)]
pub(crate) const CPU_ID_AA64ISAR1: c_int = 3;
#[cfg(test)]
pub(crate) const CPU_ID_AA64MMFR2: c_int = 7;
}

Expand All @@ -168,10 +191,14 @@ mod imp {
pub(super) fn aa64reg() -> AA64Reg {
    // Query one `machdep` sysctl node; a read that yields no value is
    // reported as 0, i.e. "no optional features advertised".
    let read_id_reg = |node: ffi::c_int| sysctl64(&[ffi::CTL_MACHDEP, node]).unwrap_or(0);

    // Struct fields are evaluated in written order, so the registers are
    // queried in the sequence isar0, isar1 (tests only), mmfr2 (tests only).
    AA64Reg {
        aa64isar0: read_id_reg(ffi::CPU_ID_AA64ISAR0),
        #[cfg(test)]
        aa64isar1: read_id_reg(ffi::CPU_ID_AA64ISAR1),
        #[cfg(test)]
        aa64mmfr2: read_id_reg(ffi::CPU_ID_AA64MMFR2),
    }
}
Expand Down Expand Up @@ -222,8 +249,9 @@ mod tests {

#[test]
fn test_aa64reg() {
let AA64Reg { aa64isar0, aa64mmfr2 } = imp::aa64reg();
let AA64Reg { aa64isar0, aa64isar1, aa64mmfr2 } = imp::aa64reg();
std::eprintln!("aa64isar0={}", aa64isar0);
std::eprintln!("aa64isar1={}", aa64isar1);
std::eprintln!("aa64mmfr2={}", aa64mmfr2);
if cfg!(target_os = "openbsd") {
let output = Command::new("sysctl").arg("machdep").output().unwrap();
Expand All @@ -233,6 +261,10 @@ mod tests {
stdout.lines().find_map(|s| s.strip_prefix("machdep.id_aa64isar0=")).unwrap(),
aa64isar0.to_string(),
);
assert_eq!(
stdout.lines().find_map(|s| s.strip_prefix("machdep.id_aa64isar1=")).unwrap(),
aa64isar1.to_string(),
);
assert_eq!(
stdout.lines().find_map(|s| s.strip_prefix("machdep.id_aa64mmfr2=")).unwrap_or("0"),
aa64mmfr2.to_string(),
Expand All @@ -249,6 +281,9 @@ mod tests {
if detect().test(CpuInfo::HAS_LSE2) {
assert_eq!(extract(aa64mmfr2, 35, 32), 1);
}
if detect().test(CpuInfo::HAS_RCPC3) {
assert_eq!(extract(aa64isar1, 23, 20), 3);
}
}

// Static assertions for FFI bindings.
Expand Down Expand Up @@ -291,6 +326,10 @@ mod tests {
let [] =
[(); (ffi::CPU_ID_AA64ISAR0 - machine_cpu::CPU_ID_AA64ISAR0 as ffi::c_int) as usize];
// libc doesn't have this
// let [] = [(); (ffi::CPU_ID_AA64ISAR1 - libc::CPU_ID_AA64ISAR1) as usize];
let [] =
[(); (ffi::CPU_ID_AA64ISAR1 - machine_cpu::CPU_ID_AA64ISAR1 as ffi::c_int) as usize];
// libc doesn't have this
// let [] = [(); (ffi::CPU_ID_AA64MMFR2 - libc::CPU_ID_AA64MMFR2) as usize];
let [] =
[(); (ffi::CPU_ID_AA64MMFR2 - machine_cpu::CPU_ID_AA64MMFR2 as ffi::c_int) as usize];
Expand Down
24 changes: 24 additions & 0 deletions src/imp/atomic128/detect/common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ impl CpuInfo {
// This is currently only used in tests.
#[cfg(test)]
const HAS_LSE128: u32 = 3;
/// Whether FEAT_LRCPC3 is available
// This is currently only used in tests.
#[cfg(test)]
const HAS_RCPC3: u32 = 4;
}
#[cfg(target_arch = "aarch64")]
#[inline]
Expand Down Expand Up @@ -131,26 +135,37 @@ mod tests_aarch64_common {
assert!(!x.test(CpuInfo::HAS_LSE));
assert!(!x.test(CpuInfo::HAS_LSE2));
assert!(!x.test(CpuInfo::HAS_LSE128));
assert!(!x.test(CpuInfo::HAS_RCPC3));
x.set(CpuInfo::INIT);
assert!(x.test(CpuInfo::INIT));
assert!(!x.test(CpuInfo::HAS_LSE));
assert!(!x.test(CpuInfo::HAS_LSE2));
assert!(!x.test(CpuInfo::HAS_LSE128));
assert!(!x.test(CpuInfo::HAS_RCPC3));
x.set(CpuInfo::HAS_LSE);
assert!(x.test(CpuInfo::INIT));
assert!(x.test(CpuInfo::HAS_LSE));
assert!(!x.test(CpuInfo::HAS_LSE2));
assert!(!x.test(CpuInfo::HAS_LSE128));
assert!(!x.test(CpuInfo::HAS_RCPC3));
x.set(CpuInfo::HAS_LSE2);
assert!(x.test(CpuInfo::INIT));
assert!(x.test(CpuInfo::HAS_LSE));
assert!(x.test(CpuInfo::HAS_LSE2));
assert!(!x.test(CpuInfo::HAS_LSE128));
assert!(!x.test(CpuInfo::HAS_RCPC3));
x.set(CpuInfo::HAS_LSE128);
assert!(x.test(CpuInfo::INIT));
assert!(x.test(CpuInfo::HAS_LSE));
assert!(x.test(CpuInfo::HAS_LSE2));
assert!(x.test(CpuInfo::HAS_LSE128));
assert!(!x.test(CpuInfo::HAS_RCPC3));
x.set(CpuInfo::HAS_RCPC3);
assert!(x.test(CpuInfo::INIT));
assert!(x.test(CpuInfo::HAS_LSE));
assert!(x.test(CpuInfo::HAS_LSE2));
assert!(x.test(CpuInfo::HAS_LSE128));
assert!(x.test(CpuInfo::HAS_RCPC3));
}

// CPU feature detection from reading /proc/cpuinfo (Linux/NetBSD)
Expand Down Expand Up @@ -300,5 +315,14 @@ mod tests_aarch64_common {
// assert!(!std::arch::is_aarch64_feature_detected!("lse128"));
// }
}
if detect().test(CpuInfo::HAS_RCPC3) {
assert!(detect().test(CpuInfo::HAS_RCPC3));
} else {
assert!(!detect().test(CpuInfo::HAS_RCPC3));
// #[cfg(not(portable_atomic_no_aarch64_target_feature))]
// {
// assert!(!std::arch::is_aarch64_feature_detected!("rcpc3"));
// }
}
}
}
3 changes: 3 additions & 0 deletions tools/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,9 @@ build() {
x_cargo "${args[@]}" "$@"
;;
esac
CARGO_TARGET_DIR="${target_dir}/rcpc3" \
RUSTFLAGS="${target_rustflags} -C target-feature=+lse,+lse2,+rcpc3" \
x_cargo "${args[@]}" "$@"
# FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2.
CARGO_TARGET_DIR="${target_dir}/lse128" \
RUSTFLAGS="${target_rustflags} -C target-feature=+lse128" \
Expand Down

0 comments on commit e83b63a

Please sign in to comment.