diff --git a/.github/.cspell/project-dictionary.txt b/.github/.cspell/project-dictionary.txt index cf44990f..58f52bc5 100644 --- a/.github/.cspell/project-dictionary.txt +++ b/.github/.cspell/project-dictionary.txt @@ -56,6 +56,8 @@ isync kuser ldar ldaxp +ldclrp +ldsetp ldxp lghi libcalls @@ -127,6 +129,7 @@ subc subfe subfic subfze +swpp syscall sysctlbyname systemsim diff --git a/build.rs b/build.rs index 3fd18c86..4c16ef60 100644 --- a/build.rs +++ b/build.rs @@ -215,11 +215,16 @@ fn main() { // aarch64 macOS always supports FEAT_LSE and FEAT_LSE2 because it is armv8.5-a: // https://github.com/llvm/llvm-project/blob/llvmorg-17.0.0-rc2/llvm/include/llvm/TargetParser/AArch64TargetParser.h#L494 let is_macos = target_os == "macos"; - // aarch64_target_feature stabilized in Rust 1.61. - target_feature_if("lse", is_macos, &version, Some(61), true); - // As of rustc 1.70, target_feature "lse2" is not available on rustc side: + let mut has_lse = is_macos; + // FEAT_LSE2 doesn't imply FEAT_LSE. FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2. + // As of rustc 1.70, target_feature "lse2"/"lse128" is not available on rustc side: // https://github.com/rust-lang/rust/blob/1.70.0/compiler/rustc_codegen_ssa/src/target_features.rs#L58 target_feature_if("lse2", is_macos, &version, None, false); + // LLVM supports FEAT_LSE128 on LLVM 16+: + // https://github.com/llvm/llvm-project/commit/7fea6f2e0e606e5339c3359568f680eaf64aa306 + has_lse |= target_feature_if("lse128", false, &version, None, false); + // aarch64_target_feature stabilized in Rust 1.61. 
+ target_feature_if("lse", has_lse, &version, Some(61), true); // As of Apple M1/M1 Pro, on Apple hardware, CAS loop-based RMW is much slower than LL/SC // loop-based RMW: https://github.com/taiki-e/portable-atomic/pull/89 @@ -339,7 +344,7 @@ fn target_feature_if( version: &Version, stabilized: Option<u32>, is_rustc_target_feature: bool, -) { +) -> bool { // HACK: Currently, it seems that the only way to handle unstable target // features on the stable is to parse the `-C target-feature` in RUSTFLAGS. // @@ -354,7 +359,7 @@ fn target_feature_if( && (version.nightly || stabilized.map_or(false, |stabilized| version.minor >= stabilized)) { // In this case, cfg(target_feature = "...") would work, so skip emitting our own target_feature cfg. - return; + return false; } else if let Some(rustflags) = env::var_os("CARGO_ENCODED_RUSTFLAGS") { for mut flag in rustflags.to_string_lossy().split('\x1f') { flag = strip_prefix(flag, "-C").unwrap_or(flag); @@ -374,6 +379,7 @@ fn target_feature_if( if has_target_feature { println!("cargo:rustc-cfg=portable_atomic_target_feature=\"{}\"", name); } + has_target_feature } fn target_cpu() -> Option<String> { diff --git a/src/imp/atomic128/README.md b/src/imp/atomic128/README.md index 6e0c8798..a036d23d 100644 --- a/src/imp/atomic128/README.md +++ b/src/imp/atomic128/README.md @@ -7,7 +7,7 @@ Here is the table of targets that support 128-bit atomics and the instructions u | target_arch | load | store | CAS | RMW | note | | ----------- | ---- | ----- | --- | --- | ---- | | x86_64 | cmpxchg16b or vmovdqa | cmpxchg16b or vmovdqa | cmpxchg16b | cmpxchg16b | cmpxchg16b target feature required. vmovdqa requires Intel or AMD CPU with AVX.
Both compile-time and run-time detection are supported for cmpxchg16b. vmovdqa is currently run-time detection only.
Requires rustc 1.59+ when cmpxchg16b target feature is enabled at compile-time, otherwise requires rustc 1.69+ | -| aarch64 | ldxp/stxp or casp or ldp | ldxp/stxp or casp or stp | ldxp/stxp or casp | ldxp/stxp or casp | casp requires lse target feature, ldp/stp requires lse2 target feature.
Both compile-time and run-time detection are supported for lse. lse2 is currently compile-time detection only.
Requires rustc 1.59+ | +| aarch64 | ldxp/stxp or casp or ldp | ldxp/stxp or casp or stp/swpp | ldxp/stxp or casp | ldxp/stxp or casp/swpp/ldclrp/ldsetp | casp requires lse target feature, ldp/stp requires lse2 target feature, swpp/ldclrp/ldsetp requires lse128 target feature.
Both compile-time and run-time detection are supported for lse. lse2 and lse128 are currently compile-time detection only.
Requires rustc 1.59+ | | powerpc64 | lq | stq | lqarx/stqcx. | lqarx/stqcx. | Requires target-cpu pwr8+ (powerpc64le is pwr8 by default). Both compile-time and run-time detection are supported (run-time detection is currently disabled by default).
Requires nightly | | s390x | lpq | stpq | cdsg | cdsg | Requires nightly | diff --git a/src/imp/atomic128/aarch64.rs b/src/imp/atomic128/aarch64.rs index 00418dfb..5fb94b35 100644 --- a/src/imp/atomic128/aarch64.rs +++ b/src/imp/atomic128/aarch64.rs @@ -5,6 +5,7 @@ // - LDXP/STXP loop (DW LL/SC) // - CASP (DWCAS) added as FEAT_LSE (mandatory from armv8.1-a) // - LDP/STP (DW load/store) if FEAT_LSE2 (optional from armv8.2-a, mandatory from armv8.4-a) is available +// - LDCLRP/LDSETP/SWPP (DW RMW) added as FEAT_LSE128 (optional from armv9.4-a) // // If outline-atomics is not enabled and FEAT_LSE is not available at // compile-time, we use LDXP/STXP loop. @@ -15,8 +16,9 @@ // However, when portable_atomic_ll_sc_rmw cfg is set, use LDXP/STXP loop instead of CASP // loop for RMW (by default, it is set on Apple hardware; see build script for details). // If FEAT_LSE2 is available at compile-time, we use LDP/STP for load/store. +// If FEAT_LSE128 is available at compile-time, we use LDCLRP/LDSETP/SWPP for fetch_and/fetch_or/swap/{release,seqcst}-store. // -// Note: FEAT_LSE2 doesn't imply FEAT_LSE. +// Note: FEAT_LSE2 doesn't imply FEAT_LSE. FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2. // // Note that we do not separate LL and SC into separate functions, but handle // them within a single asm block. This is because it is theoretically possible @@ -53,6 +55,7 @@ // - aarch64 (+lse) https://godbolt.org/z/5GzssfTKc // - aarch64 msvc (+lse) https://godbolt.org/z/oYE87caM7 // - aarch64 (+lse,+lse2) https://godbolt.org/z/36dPjMbaG +// - aarch64 (+lse2,+lse128) https://godbolt.org/z/9MKa4ofbo include!("macros.rs"); @@ -343,10 +346,27 @@ unsafe fn _atomic_load_ldxp_stxp(src: *mut u128, order: Ordering) -> u128 { #[inline] unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) { #[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2"))] - // SAFETY: the caller must uphold the safety contract. - // cfg guarantee that the CPU supports FEAT_LSE2. 
- unsafe { - atomic_store_stp(dst, val, order); + { + #[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))] + // SAFETY: the caller must uphold the safety contract. + // cfg guarantee that the CPU supports FEAT_LSE2 and FEAT_LSE128. + unsafe { + // Use swpp if stp requires fences. + // https://reviews.llvm.org/D143506 + match order { + Ordering::Relaxed => atomic_store_stp(dst, val, order), + Ordering::Release | Ordering::SeqCst => { + _atomic_swap_swpp(dst, val, order); + } + _ => unreachable!("{:?}", order), + } + } + #[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] + // SAFETY: the caller must uphold the safety contract. + // cfg guarantee that the CPU supports FEAT_LSE2. + unsafe { + atomic_store_stp(dst, val, order); + } } #[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))] // SAFETY: the caller must uphold the safety contract. @@ -682,16 +702,50 @@ use self::atomic_compare_exchange as atomic_compare_exchange_weak; // If FEAT_LSE is available at compile-time and portable_atomic_ll_sc_rmw cfg is not set, // we use CAS-based atomic RMW. 
+#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] #[cfg(all( any(target_feature = "lse", portable_atomic_target_feature = "lse"), not(portable_atomic_ll_sc_rmw), ))] use _atomic_swap_casp as atomic_swap; +#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] #[cfg(not(all( any(target_feature = "lse", portable_atomic_target_feature = "lse"), not(portable_atomic_ll_sc_rmw), )))] use _atomic_swap_ldxp_stxp as atomic_swap; +#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))] +use _atomic_swap_swpp as atomic_swap; +#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))] +#[inline] +unsafe fn _atomic_swap_swpp(dst: *mut u128, val: u128, order: Ordering) -> u128 { + debug_assert!(dst as usize % 16 == 0); + + // SAFETY: the caller must guarantee that `dst` is valid for both writes and + // reads, 16-byte aligned, that there are no concurrent non-atomic operations, + // and the CPU supports FEAT_LSE128. + // + // Refs: + // - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/SWPP--SWPPA--SWPPAL--SWPPL--Swap-quadword-in-memory-?lang=en + unsafe { + let val = U128 { whole: val }; + let (prev_lo, prev_hi); + macro_rules! swap { + ($acquire:tt, $release:tt, $fence:tt) => { + asm!( + concat!("swpp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"), + $fence, + dst = in(reg) ptr_reg!(dst), + val_lo = inout(reg) val.pair.lo => prev_lo, + val_hi = inout(reg) val.pair.hi => prev_hi, + options(nostack, preserves_flags), + ) + }; + } + atomic_rmw!(swap, order); + U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole + } +} // Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap. #[cfg(any( test, @@ -1066,16 +1120,48 @@ atomic_rmw_cas_3! { select_le_or_be!("sbc x5, x7, {val_hi}", "sbc x4, x6, {val_lo}"), } +#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] atomic_rmw_ll_sc_3! 
{ _atomic_and_ldxp_stxp as atomic_and (preserves_flags), "and {new_lo}, {prev_lo}, {val_lo}", "and {new_hi}, {prev_hi}, {val_hi}", } +#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] atomic_rmw_cas_3! { _atomic_and_casp as atomic_and, "and x4, x6, {val_lo}", "and x5, x7, {val_hi}", } +#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))] +#[inline] +unsafe fn atomic_and(dst: *mut u128, val: u128, order: Ordering) -> u128 { + debug_assert!(dst as usize % 16 == 0); + + // SAFETY: the caller must guarantee that `dst` is valid for both writes and + // reads, 16-byte aligned, that there are no concurrent non-atomic operations, + // and the CPU supports FEAT_LSE128. + // + // Refs: + // - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/LDCLRP--LDCLRPA--LDCLRPAL--LDCLRPL--Atomic-bit-clear-on-quadword-in-memory-?lang=en + unsafe { + let val = U128 { whole: !val }; + let (prev_lo, prev_hi); + macro_rules! and { + ($acquire:tt, $release:tt, $fence:tt) => { + asm!( + concat!("ldclrp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"), + $fence, + dst = in(reg) ptr_reg!(dst), + val_lo = inout(reg) val.pair.lo => prev_lo, + val_hi = inout(reg) val.pair.hi => prev_hi, + options(nostack, preserves_flags), + ) + }; + } + atomic_rmw!(and, order); + U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole + } +} atomic_rmw_ll_sc_3! { _atomic_nand_ldxp_stxp as atomic_nand (preserves_flags), @@ -1092,16 +1178,48 @@ atomic_rmw_cas_3! { "mvn x5, x5", } +#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] atomic_rmw_ll_sc_3! { _atomic_or_ldxp_stxp as atomic_or (preserves_flags), "orr {new_lo}, {prev_lo}, {val_lo}", "orr {new_hi}, {prev_hi}, {val_hi}", } +#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))] atomic_rmw_cas_3! 
{ _atomic_or_casp as atomic_or, "orr x4, x6, {val_lo}", "orr x5, x7, {val_hi}", } +#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))] +#[inline] +unsafe fn atomic_or(dst: *mut u128, val: u128, order: Ordering) -> u128 { + debug_assert!(dst as usize % 16 == 0); + + // SAFETY: the caller must guarantee that `dst` is valid for both writes and + // reads, 16-byte aligned, that there are no concurrent non-atomic operations, + // and the CPU supports FEAT_LSE128. + // + // Refs: + // - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/LDSETP--LDSETPA--LDSETPAL--LDSETPL--Atomic-bit-set-on-quadword-in-memory-?lang=en + unsafe { + let val = U128 { whole: val }; + let (prev_lo, prev_hi); + macro_rules! or { + ($acquire:tt, $release:tt, $fence:tt) => { + asm!( + concat!("ldsetp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"), + $fence, + dst = in(reg) ptr_reg!(dst), + val_lo = inout(reg) val.pair.lo => prev_lo, + val_hi = inout(reg) val.pair.hi => prev_hi, + options(nostack, preserves_flags), + ) + }; + } + atomic_rmw!(or, order); + U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole + } +} atomic_rmw_ll_sc_3! { _atomic_xor_ldxp_stxp as atomic_xor (preserves_flags), diff --git a/tools/build.sh b/tools/build.sh index 7ff40210..d9a43e55 100755 --- a/tools/build.sh +++ b/tools/build.sh @@ -588,6 +588,13 @@ build() { x_cargo "${args[@]}" "$@" ;; esac + # Support for FEAT_LRCPC3 and FEAT_LSE128 requires LLVM 16+ (Rust 1.70+). + if [[ "${rustc_minor_version}" -ge 70 ]]; then + # FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2. + CARGO_TARGET_DIR="${target_dir}/lse128" \ + RUSTFLAGS="${target_rustflags} -C target-feature=+lse2,+lse128" \ + x_cargo "${args[@]}" "$@" + fi ;; powerpc64-*) # powerpc64le- (little-endian) is skipped because it is pwr8 by default