Skip to content

Commit

Permalink
aarch64: Use LDCLRP/LDSETP/SWPP if FEAT_LSE128 is available
Browse files Browse the repository at this point in the history
  • Loading branch information
taiki-e committed Aug 10, 2023
1 parent f73001a commit c06f3ff
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 11 deletions.
3 changes: 3 additions & 0 deletions .github/.cspell/project-dictionary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ isync
kuser
ldar
ldaxp
ldclrp
ldsetp
ldxp
lghi
libcalls
Expand Down Expand Up @@ -127,6 +129,7 @@ subc
subfe
subfic
subfze
swpp
syscall
sysctlbyname
systemsim
Expand Down
16 changes: 11 additions & 5 deletions build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,11 +215,16 @@ fn main() {
// aarch64 macOS always supports FEAT_LSE and FEAT_LSE2 because it is armv8.5-a:
// https://github.com/llvm/llvm-project/blob/llvmorg-17.0.0-rc2/llvm/include/llvm/TargetParser/AArch64TargetParser.h#L494
let is_macos = target_os == "macos";
// aarch64_target_feature stabilized in Rust 1.61.
target_feature_if("lse", is_macos, &version, Some(61), true);
// As of rustc 1.70, target_feature "lse2" is not available on rustc side:
let mut has_lse = is_macos;
// FEAT_LSE2 doesn't imply FEAT_LSE. FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2.
// As of rustc 1.70, target_feature "lse2"/"lse128" is not available on rustc side:
// https://github.com/rust-lang/rust/blob/1.70.0/compiler/rustc_codegen_ssa/src/target_features.rs#L58
target_feature_if("lse2", is_macos, &version, None, false);
// LLVM supports FEAT_LSE128 on LLVM 16+:
// https://github.com/llvm/llvm-project/commit/7fea6f2e0e606e5339c3359568f680eaf64aa306
has_lse |= target_feature_if("lse128", false, &version, None, false);
// aarch64_target_feature stabilized in Rust 1.61.
target_feature_if("lse", has_lse, &version, Some(61), true);

// As of Apple M1/M1 Pro, on Apple hardware, CAS loop-based RMW is much slower than LL/SC
// loop-based RMW: https://github.com/taiki-e/portable-atomic/pull/89
Expand Down Expand Up @@ -339,7 +344,7 @@ fn target_feature_if(
version: &Version,
stabilized: Option<u32>,
is_rustc_target_feature: bool,
) {
) -> bool {
// HACK: Currently, it seems that the only way to handle unstable target
// features on the stable is to parse the `-C target-feature` in RUSTFLAGS.
//
Expand All @@ -354,7 +359,7 @@ fn target_feature_if(
&& (version.nightly || stabilized.map_or(false, |stabilized| version.minor >= stabilized))
{
// In this case, cfg(target_feature = "...") would work, so skip emitting our own target_feature cfg.
return;
return false;
} else if let Some(rustflags) = env::var_os("CARGO_ENCODED_RUSTFLAGS") {
for mut flag in rustflags.to_string_lossy().split('\x1f') {
flag = strip_prefix(flag, "-C").unwrap_or(flag);
Expand All @@ -374,6 +379,7 @@ fn target_feature_if(
if has_target_feature {
println!("cargo:rustc-cfg=portable_atomic_target_feature=\"{}\"", name);
}
has_target_feature
}

fn target_cpu() -> Option<String> {
Expand Down
2 changes: 1 addition & 1 deletion src/imp/atomic128/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Here is the table of targets that support 128-bit atomics and the instructions u
| target_arch | load | store | CAS | RMW | note |
| ----------- | ---- | ----- | --- | --- | ---- |
| x86_64 | cmpxchg16b or vmovdqa | cmpxchg16b or vmovdqa | cmpxchg16b | cmpxchg16b | cmpxchg16b target feature required. vmovdqa requires Intel or AMD CPU with AVX. <br> Both compile-time and run-time detection are supported for cmpxchg16b. vmovdqa is currently run-time detection only. <br> Requires rustc 1.59+ when cmpxchg16b target feature is enabled at compile-time, otherwise requires rustc 1.69+ |
| aarch64 | ldxp/stxp or casp or ldp | ldxp/stxp or casp or stp | ldxp/stxp or casp | ldxp/stxp or casp | casp requires lse target feature, ldp/stp requires lse2 target feature. <br> Both compile-time and run-time detection are supported for lse. lse2 is currently compile-time detection only. <br> Requires rustc 1.59+ |
| aarch64 | ldxp/stxp or casp or ldp | ldxp/stxp or casp or stp/swpp | ldxp/stxp or casp | ldxp/stxp or casp/swpp/ldclrp/ldsetp | casp requires lse target feature, ldp/stp requires lse2 target feature, swpp/ldclrp/ldsetp requires lse128 target feature. <br> Both compile-time and run-time detection are supported for lse. lse2 and lse128 are currently compile-time detection only. <br> Requires rustc 1.59+ |
| powerpc64 | lq | stq | lqarx/stqcx. | lqarx/stqcx. | Requires target-cpu pwr8+ (powerpc64le is pwr8 by default). Both compile-time and run-time detection are supported (run-time detection is currently disabled by default). <br> Requires nightly |
| s390x | lpq | stpq | cdsg | cdsg | Requires nightly |

Expand Down
128 changes: 123 additions & 5 deletions src/imp/atomic128/aarch64.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// - LDXP/STXP loop (DW LL/SC)
// - CASP (DWCAS) added as FEAT_LSE (mandatory from armv8.1-a)
// - LDP/STP (DW load/store) if FEAT_LSE2 (optional from armv8.2-a, mandatory from armv8.4-a) is available
// - LDCLRP/LDSETP/SWPP (DW RMW) added as FEAT_LSE128 (optional from armv9.4-a)
//
// If outline-atomics is not enabled and FEAT_LSE is not available at
// compile-time, we use LDXP/STXP loop.
Expand All @@ -15,8 +16,9 @@
// However, when portable_atomic_ll_sc_rmw cfg is set, use LDXP/STXP loop instead of CASP
// loop for RMW (by default, it is set on Apple hardware; see build script for details).
// If FEAT_LSE2 is available at compile-time, we use LDP/STP for load/store.
// If FEAT_LSE128 is available at compile-time, we use LDCLRP/LDSETP/SWPP for fetch_and/fetch_or/swap/{release,seqcst}-store.
//
// Note: FEAT_LSE2 doesn't imply FEAT_LSE.
// Note: FEAT_LSE2 doesn't imply FEAT_LSE. FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2.
//
// Note that we do not separate LL and SC into separate functions, but handle
// them within a single asm block. This is because it is theoretically possible
Expand Down Expand Up @@ -53,6 +55,7 @@
// - aarch64 (+lse) https://godbolt.org/z/5GzssfTKc
// - aarch64 msvc (+lse) https://godbolt.org/z/oYE87caM7
// - aarch64 (+lse,+lse2) https://godbolt.org/z/36dPjMbaG
// - aarch64 (+lse2,+lse128) https://godbolt.org/z/9MKa4ofbo

include!("macros.rs");

Expand Down Expand Up @@ -343,10 +346,27 @@ unsafe fn _atomic_load_ldxp_stxp(src: *mut u128, order: Ordering) -> u128 {
#[inline]
unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
#[cfg(any(target_feature = "lse2", portable_atomic_target_feature = "lse2"))]
// SAFETY: the caller must uphold the safety contract.
// cfg guarantee that the CPU supports FEAT_LSE2.
unsafe {
atomic_store_stp(dst, val, order);
{
#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))]
// SAFETY: the caller must uphold the safety contract.
    // cfg guarantee that the CPU supports FEAT_LSE2 and FEAT_LSE128.
unsafe {
// Use swpp if stp requires fences.
// https://reviews.llvm.org/D143506
match order {
Ordering::Relaxed => atomic_store_stp(dst, val, order),
Ordering::Release | Ordering::SeqCst => {
_atomic_swap_swpp(dst, val, order);
}
_ => unreachable!("{:?}", order),
}
}
#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
// SAFETY: the caller must uphold the safety contract.
// cfg guarantee that the CPU supports FEAT_LSE2.
unsafe {
atomic_store_stp(dst, val, order);
}
}
#[cfg(not(any(target_feature = "lse2", portable_atomic_target_feature = "lse2")))]
// SAFETY: the caller must uphold the safety contract.
Expand Down Expand Up @@ -682,16 +702,50 @@ use self::atomic_compare_exchange as atomic_compare_exchange_weak;

// If FEAT_LSE is available at compile-time and portable_atomic_ll_sc_rmw cfg is not set,
// we use CAS-based atomic RMW.
#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
#[cfg(all(
any(target_feature = "lse", portable_atomic_target_feature = "lse"),
not(portable_atomic_ll_sc_rmw),
))]
use _atomic_swap_casp as atomic_swap;
#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
#[cfg(not(all(
any(target_feature = "lse", portable_atomic_target_feature = "lse"),
not(portable_atomic_ll_sc_rmw),
)))]
use _atomic_swap_ldxp_stxp as atomic_swap;
#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))]
use _atomic_swap_swpp as atomic_swap;
#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))]
// 128-bit atomic swap using the FEAT_LSE128 SWPP instruction.
// Writes `val` to `*dst` and returns the previous 128-bit value.
#[inline]
unsafe fn _atomic_swap_swpp(dst: *mut u128, val: u128, order: Ordering) -> u128 {
    debug_assert!(dst as usize % 16 == 0);

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, that there are no concurrent non-atomic operations,
    // and the CPU supports FEAT_LSE128.
    //
    // Refs:
    // - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/SWPP--SWPPA--SWPPAL--SWPPL--Swap-quadword-in-memory-?lang=en
    unsafe {
        // Split the 128-bit value into lo/hi 64-bit halves; SWPP operates on a
        // pair of 64-bit registers.
        let val = U128 { whole: val };
        let (prev_lo, prev_hi);
        macro_rules! swap {
            ($acquire:tt, $release:tt, $fence:tt) => {
                asm!(
                    // $acquire/$release become the instruction's ordering suffix
                    // ("a"/"l"/"al" or empty); $fence is any trailing fence.
                    // Both are chosen from `order` by atomic_rmw! (see macros.rs).
                    concat!("swpp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"),
                    $fence,
                    dst = in(reg) ptr_reg!(dst),
                    // inout: the value registers are overwritten with the old contents.
                    val_lo = inout(reg) val.pair.lo => prev_lo,
                    val_hi = inout(reg) val.pair.hi => prev_hi,
                    options(nostack, preserves_flags),
                )
            };
        }
        atomic_rmw!(swap, order);
        // Reassemble the previous value from the returned register pair.
        U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
    }
}
// Do not use atomic_rmw_cas_3 because it needs extra MOV to implement swap.
#[cfg(any(
test,
Expand Down Expand Up @@ -1066,16 +1120,48 @@ atomic_rmw_cas_3! {
select_le_or_be!("sbc x5, x7, {val_hi}", "sbc x4, x6, {val_lo}"),
}

#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
atomic_rmw_ll_sc_3! {
_atomic_and_ldxp_stxp as atomic_and (preserves_flags),
"and {new_lo}, {prev_lo}, {val_lo}",
"and {new_hi}, {prev_hi}, {val_hi}",
}
#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
atomic_rmw_cas_3! {
_atomic_and_casp as atomic_and,
"and x4, x6, {val_lo}",
"and x5, x7, {val_hi}",
}
#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))]
// 128-bit fetch_and using the FEAT_LSE128 LDCLRP instruction.
// Returns the previous 128-bit value.
#[inline]
unsafe fn atomic_and(dst: *mut u128, val: u128, order: Ordering) -> u128 {
    debug_assert!(dst as usize % 16 == 0);

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, that there are no concurrent non-atomic operations,
    // and the CPU supports FEAT_LSE128.
    //
    // Refs:
    // - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/LDCLRP--LDCLRPA--LDCLRPAL--LDCLRPL--Atomic-bit-clear-on-quadword-in-memory-?lang=en
    unsafe {
        // LDCLRP is an atomic bit-clear: it stores `mem & !operand`. Passing the
        // complement (`!val`) therefore yields `mem & !(!val)` == `mem & val`,
        // i.e. fetch_and.
        let val = U128 { whole: !val };
        let (prev_lo, prev_hi);
        macro_rules! and {
            ($acquire:tt, $release:tt, $fence:tt) => {
                asm!(
                    // Ordering suffix and fence are selected from `order` by
                    // atomic_rmw! (see macros.rs).
                    concat!("ldclrp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"),
                    $fence,
                    dst = in(reg) ptr_reg!(dst),
                    // inout: the value registers are overwritten with the old contents.
                    val_lo = inout(reg) val.pair.lo => prev_lo,
                    val_hi = inout(reg) val.pair.hi => prev_hi,
                    options(nostack, preserves_flags),
                )
            };
        }
        atomic_rmw!(and, order);
        // Reassemble the previous value from the returned register pair.
        U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
    }
}

atomic_rmw_ll_sc_3! {
_atomic_nand_ldxp_stxp as atomic_nand (preserves_flags),
Expand All @@ -1092,16 +1178,48 @@ atomic_rmw_cas_3! {
"mvn x5, x5",
}

#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
atomic_rmw_ll_sc_3! {
_atomic_or_ldxp_stxp as atomic_or (preserves_flags),
"orr {new_lo}, {prev_lo}, {val_lo}",
"orr {new_hi}, {prev_hi}, {val_hi}",
}
#[cfg(not(any(target_feature = "lse128", portable_atomic_target_feature = "lse128")))]
atomic_rmw_cas_3! {
_atomic_or_casp as atomic_or,
"orr x4, x6, {val_lo}",
"orr x5, x7, {val_hi}",
}
#[cfg(any(target_feature = "lse128", portable_atomic_target_feature = "lse128"))]
// 128-bit fetch_or using the FEAT_LSE128 LDSETP instruction.
// Returns the previous 128-bit value.
#[inline]
unsafe fn atomic_or(dst: *mut u128, val: u128, order: Ordering) -> u128 {
    debug_assert!(dst as usize % 16 == 0);

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, that there are no concurrent non-atomic operations,
    // and the CPU supports FEAT_LSE128.
    //
    // Refs:
    // - https://developer.arm.com/documentation/ddi0602/2023-03/Base-Instructions/LDSETP--LDSETPA--LDSETPAL--LDSETPL--Atomic-bit-set-on-quadword-in-memory-?lang=en
    unsafe {
        // LDSETP is an atomic bit-set: it stores `mem | operand`, so `val` can be
        // passed through unmodified (unlike the complemented operand in atomic_and).
        let val = U128 { whole: val };
        let (prev_lo, prev_hi);
        macro_rules! or {
            ($acquire:tt, $release:tt, $fence:tt) => {
                asm!(
                    // Ordering suffix and fence are selected from `order` by
                    // atomic_rmw! (see macros.rs).
                    concat!("ldsetp", $acquire, $release, " {val_lo}, {val_hi}, [{dst}]"),
                    $fence,
                    dst = in(reg) ptr_reg!(dst),
                    // inout: the value registers are overwritten with the old contents.
                    val_lo = inout(reg) val.pair.lo => prev_lo,
                    val_hi = inout(reg) val.pair.hi => prev_hi,
                    options(nostack, preserves_flags),
                )
            };
        }
        atomic_rmw!(or, order);
        // Reassemble the previous value from the returned register pair.
        U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
    }
}

atomic_rmw_ll_sc_3! {
_atomic_xor_ldxp_stxp as atomic_xor (preserves_flags),
Expand Down
7 changes: 7 additions & 0 deletions tools/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,13 @@ build() {
x_cargo "${args[@]}" "$@"
;;
esac
# Support for FEAT_LRCPC3 and FEAT_LSE128 requires LLVM 16+ (Rust 1.70+).
if [[ "${rustc_minor_version}" -ge 70 ]]; then
# FEAT_LSE128 implies FEAT_LSE but not FEAT_LSE2.
CARGO_TARGET_DIR="${target_dir}/lse128" \
RUSTFLAGS="${target_rustflags} -C target-feature=+lse2,+lse128" \
x_cargo "${args[@]}" "$@"
fi
;;
powerpc64-*)
# powerpc64le- (little-endian) is skipped because it is pwr8 by default
Expand Down

0 comments on commit c06f3ff

Please sign in to comment.