portable_atomic/imp/atomic128/x86_64.rs

// SPDX-License-Identifier: Apache-2.0 OR MIT

/*
128-bit atomic implementation on x86_64 using CMPXCHG16B (DWCAS).

Note: On Miri and ThreadSanitizer, which do not support inline assembly, we don't use
this module and use intrinsics.rs instead.

Refs:
- x86 and amd64 instruction reference https://www.felixcloutier.com/x86
- atomic-maybe-uninit https://github.com/taiki-e/atomic-maybe-uninit

Generated asm:
- x86_64 (+cmpxchg16b) https://godbolt.org/z/rfs1jxd51
*/
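
// Illustrative usage of the types this module ultimately defines (AtomicI128/AtomicU128,
// via atomic128! at the bottom of this file). A minimal sketch assuming the crate's usual
// public API; not part of this module's implementation:
//
//     use portable_atomic::{AtomicU128, Ordering};
//
//     let x = AtomicU128::new(1);
//     x.fetch_add(2, Ordering::SeqCst);
//     assert_eq!(x.load(Ordering::SeqCst), 3);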

// TODO: use core::arch::x86_64::cmpxchg16b where available and more efficient than asm

include!("macros.rs");

#[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
#[path = "../fallback/outline_atomics.rs"]
mod fallback;

#[cfg(not(portable_atomic_no_outline_atomics))]
#[cfg(not(target_env = "sgx"))]
#[cfg_attr(
    not(target_feature = "sse"),
    cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))
)]
#[path = "../detect/x86_64.rs"]
mod detect;

#[cfg(not(portable_atomic_no_asm))]
use core::arch::asm;
use core::sync::atomic::Ordering;

use crate::utils::{Pair, U128};

// Asserts that the function is called in the correct context.
macro_rules! debug_assert_cmpxchg16b {
    () => {
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            debug_assert!(detect::detect().has_cmpxchg16b());
        }
    };
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
macro_rules! debug_assert_vmovdqa_atomic {
    () => {{
        debug_assert_cmpxchg16b!();
        debug_assert!(detect::detect().has_vmovdqa_atomic());
    }};
}

#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "32")]
macro_rules! ptr_modifier {
    () => {
        ":e"
    };
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[cfg(target_pointer_width = "64")]
macro_rules! ptr_modifier {
    () => {
        ""
    };
}

// Unlike AArch64 and RISC-V, x86's assembler doesn't check instruction
// requirements for the currently enabled target features. There is in fact no
// option in x86 assembly for such a check, unlike Arm's .arch_extension,
// RISC-V's .option arch, or PowerPC's .machine.
// However, we set target_feature(enable) when available (Rust 1.69+) in case a
// new codegen backend is added that checks for it in the future, or an option
// is added to the assembler to check for it.
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn cmpxchg16b(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned (required by CMPXCHG16B), that there are no
    // concurrent non-atomic operations, and that the CPU supports CMPXCHG16B.
    //
    // If the value at `dst` (destination operand) and rdx:rax are equal, the
    // 128-bit value in rcx:rbx is stored in `dst`, otherwise the value at
    // `dst` is loaded into rdx:rax.
    //
    // The ZF flag is set if the value at `dst` and rdx:rax are equal,
    // otherwise it is cleared. Other flags are unaffected.
    //
    // Refs: https://www.felixcloutier.com/x86/cmpxchg8b:cmpxchg16b
    unsafe {
        // cmpxchg16b is always SeqCst.
        let r: u8;
        let old = U128 { whole: old };
        let new = U128 { whole: new };
        let (prev_lo, prev_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "sete cl",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) new.pair.lo => _,
                    in("rcx") new.pair.hi,
                    inout("rax") old.pair.lo => prev_lo,
                    inout("rdx") old.pair.hi => prev_hi,
                    in($rdi) dst,
                    lateout("cl") r,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        crate::utils::assert_unchecked(r == 0 || r == 1); // needed to remove extra test
        (U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole, r != 0)
    }
}
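
// What the asm above implements, as a sketch in Rust-like pseudocode (illustrative only;
// the real operation is a single atomic instruction, `lock cmpxchg16b`):
//
//     // executed as one atomic step
//     let current = *dst;
//     if current == old {
//         *dst = new;
//         (old, true)    // success: previous value equals `old`
//     } else {
//         (current, false) // failure: report the value actually observed
//     }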

// VMOVDQA is atomic on Intel, AMD, and Zhaoxin CPUs with AVX.
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104688 for details.
//
// Refs: https://www.felixcloutier.com/x86/movdqa:vmovdqa32:vmovdqa64
//
// Use cfg(target_feature = "sse") here -- SSE is included in the x86_64
// baseline and is always available, but the SSE target feature is disabled for
// use cases such as kernels and firmware that should not use vector registers.
// So, do not use vector registers unless the SSE target feature is enabled.
// See also https://github.com/rust-lang/rust/blob/1.80.0/src/doc/rustc/src/platform-support/x86_64-unknown-none.md.
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
#[inline]
unsafe fn atomic_load_vmovdqa(src: *mut u128) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_vmovdqa_atomic!();

    // SAFETY: the caller must uphold the safety contract.
    //
    // atomic load by vmovdqa is always SeqCst.
    unsafe {
        let out: core::arch::x86_64::__m128i;
        asm!(
            concat!("vmovdqa {out}, xmmword ptr [{src", ptr_modifier!(), "}]"),
            src = in(reg) src,
            out = out(xmm_reg) out,
            options(nostack, preserves_flags),
        );
        core::mem::transmute(out)
    }
}
#[cfg(not(any(portable_atomic_no_outline_atomics, target_env = "sgx")))]
#[cfg(target_feature = "sse")]
#[target_feature(enable = "avx")]
#[inline]
unsafe fn atomic_store_vmovdqa(dst: *mut u128, val: u128, order: Ordering) {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_vmovdqa_atomic!();

    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        let val: core::arch::x86_64::__m128i = core::mem::transmute(val);
        match order {
            // Relaxed and Release stores are equivalent.
            Ordering::Relaxed | Ordering::Release => {
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    options(nostack, preserves_flags),
                );
            }
            Ordering::SeqCst => {
                let p = core::cell::UnsafeCell::new(core::mem::MaybeUninit::<u64>::uninit());
                asm!(
                    concat!("vmovdqa xmmword ptr [{dst", ptr_modifier!(), "}], {val}"),
                    // Equivalent to mfence, but is up to 3.1x faster on Coffee Lake and up to 2.4x faster on Raptor Lake-H, at least in simple cases.
                    // - https://github.com/taiki-e/portable-atomic/pull/156
                    // - LLVM uses lock or for x86_32 64-bit atomic SeqCst store using SSE https://godbolt.org/z/9sKEr8YWc
                    // - Windows uses xchg for x86_32 for MemoryBarrier https://learn.microsoft.com/en-us/windows/win32/api/winnt/nf-winnt-memorybarrier
                    // - MSVC STL uses lock inc https://github.com/microsoft/STL/pull/740
                    // - boost uses lock or https://github.com/boostorg/atomic/commit/559eba81af71386cedd99f170dc6101c6ad7bf22
                    concat!("xchg qword ptr [{p", ptr_modifier!(), "}], {tmp}"),
                    dst = in(reg) dst,
                    val = in(xmm_reg) val,
                    p = inout(reg) p.get() => _,
                    tmp = lateout(reg) _,
                    options(nostack, preserves_flags),
                );
            }
            _ => unreachable!(),
        }
    }
}
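
// For reference, the conventional way to make the SeqCst store above sequentially
// consistent would be a trailing full fence; a sketch of what the dummy-location xchg
// replaces (the xchg form is used instead because it benchmarked faster, see the links
// in the comments above):
//
//     vmovdqa xmmword ptr [dst], val
//     mfence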

#[cfg(not(all(
    any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
    any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
)))]
macro_rules! load_store_detect {
    (
        vmovdqa = $vmovdqa:ident
        cmpxchg16b = $cmpxchg16b:ident
        fallback = $fallback:ident
    ) => {{
        let cpuid = detect::detect();
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        {
            // Check CMPXCHG16B first to prevent mixing atomic and non-atomic access.
            if cpuid.has_cmpxchg16b() {
                // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
                #[cfg(target_feature = "sse")]
                {
                    if cpuid.has_vmovdqa_atomic() {
                        $vmovdqa
                    } else {
                        $cmpxchg16b
                    }
                }
                #[cfg(not(target_feature = "sse"))]
                {
                    $cmpxchg16b
                }
            } else {
                fallback::$fallback
            }
        }
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        {
            if cpuid.has_vmovdqa_atomic() {
                $vmovdqa
            } else {
                $cmpxchg16b
            }
        }
    }};
}

#[inline]
unsafe fn atomic_load(src: *mut u128, _order: Ordering) -> u128 {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_load_cmpxchg16b(src)
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        ifunc!(unsafe fn(src: *mut u128) -> u128 {
            load_store_detect! {
                vmovdqa = atomic_load_vmovdqa
                cmpxchg16b = atomic_load_cmpxchg16b
                // Use SeqCst because cmpxchg16b and atomic load by vmovdqa are always SeqCst.
                fallback = atomic_load_seqcst
            }
        })
    }
}
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_load_cmpxchg16b(src: *mut u128) -> u128 {
    debug_assert!(src as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `src` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See the cmpxchg16b function for more.
    //
    // We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
    // allows omitting the store of condition flags and avoids the use of xchg to handle rbx.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let (out_lo, out_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    "xor rbx, rbx", // zero rbx
                    concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                    "mov rbx, {rbx_tmp}", // restore rbx
                    // set old/new args of cmpxchg16b to 0 (rbx is zeroed after being saved to rbx_tmp, to avoid xchg)
                    rbx_tmp = out(reg) _,
                    in("rcx") 0_u64,
                    inout("rax") 0_u64 => out_lo,
                    inout("rdx") 0_u64 => out_hi,
                    in($rdi) src,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: out_lo, hi: out_hi } }.whole
    }
}
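
// A sketch of the alternative mentioned in the comment above (a load built on this
// module's atomic_compare_exchange instead of hand-written asm); illustrative only,
// not used by this module:
//
//     unsafe fn atomic_load_via_cas(src: *mut u128) -> u128 {
//         // CAS with old == new == 0 never changes memory: if the value is 0 it stores
//         // 0 again, otherwise it fails and reports the current value.
//         match unsafe { atomic_compare_exchange(src, 0, 0, Ordering::SeqCst, Ordering::SeqCst) } {
//             Ok(v) | Err(v) => v,
//         }
//     }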

#[inline]
unsafe fn atomic_store(dst: *mut u128, val: u128, order: Ordering) {
    // We only use VMOVDQA when SSE is enabled. See atomic_load_vmovdqa() for more.
    // SGX doesn't support CPUID.
    #[cfg(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    ))]
    // SAFETY: the caller must uphold the safety contract.
    // cfg guarantees that CMPXCHG16B is available at compile-time.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let _ = order;
        atomic_store_cmpxchg16b(dst, val);
    }
    #[cfg(not(all(
        any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"),
        any(portable_atomic_no_outline_atomics, target_env = "sgx", not(target_feature = "sse")),
    )))]
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        #[cfg(target_feature = "sse")]
        fn_alias! {
            #[target_feature(enable = "avx")]
            unsafe fn(dst: *mut u128, val: u128);
            // atomic store by vmovdqa has at least release semantics.
            atomic_store_vmovdqa_non_seqcst = atomic_store_vmovdqa(Ordering::Release);
            atomic_store_vmovdqa_seqcst = atomic_store_vmovdqa(Ordering::SeqCst);
        }
        match order {
            // Relaxed and Release stores are equivalent in all implementations
            // that may be called here (vmovdqa, asm-based cmpxchg16b, and fallback).
            // core::arch's cmpxchg16b will never be called here.
            Ordering::Relaxed | Ordering::Release => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_non_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_non_seqcst
                    }
                });
            }
            Ordering::SeqCst => {
                ifunc!(unsafe fn(dst: *mut u128, val: u128) {
                    load_store_detect! {
                        vmovdqa = atomic_store_vmovdqa_seqcst
                        cmpxchg16b = atomic_store_cmpxchg16b
                        fallback = atomic_store_seqcst
                    }
                });
            }
            _ => unreachable!(),
        }
    }
}
// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_store_cmpxchg16b(dst: *mut u128, val: u128) {
    // SAFETY: the caller must uphold the safety contract.
    unsafe {
        // cmpxchg16b is always SeqCst.
        atomic_swap_cmpxchg16b(dst, val, Ordering::SeqCst);
    }
}

#[inline]
unsafe fn atomic_compare_exchange(
    dst: *mut u128,
    old: u128,
    new: u128,
    _success: Ordering,
    _failure: Ordering,
) -> Result<u128, u128> {
    #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, that there are no concurrent non-atomic operations,
    // and cfg guarantees that CMPXCHG16B is available at compile-time.
    let (prev, ok) = unsafe { cmpxchg16b(dst, old, new) };
    #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, and that there are no different kinds of concurrent accesses.
    let (prev, ok) = unsafe {
        ifunc!(unsafe fn(dst: *mut u128, old: u128, new: u128) -> (u128, bool) {
            if detect::detect().has_cmpxchg16b() {
                cmpxchg16b
            } else {
                // Use SeqCst because cmpxchg16b is always SeqCst.
                fallback::atomic_compare_exchange_seqcst
            }
        })
    };
    if ok {
        Ok(prev)
    } else {
        Err(prev)
    }
}

// cmpxchg16b is always strong.
use self::atomic_compare_exchange as atomic_compare_exchange_weak;

// See cmpxchg16b() for target_feature(enable).
#[cfg_attr(
    not(portable_atomic_no_cmpxchg16b_target_feature),
    target_feature(enable = "cmpxchg16b")
)]
#[inline]
unsafe fn atomic_swap_cmpxchg16b(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
    debug_assert!(dst as usize % 16 == 0);
    debug_assert_cmpxchg16b!();

    // SAFETY: the caller must guarantee that `dst` is valid for both writes and
    // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
    // cfg guarantees that the CPU supports CMPXCHG16B.
    //
    // See the cmpxchg16b function for more.
    //
    // We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
    // allows omitting the store/comparison of condition flags and reduces the use of
    // xchg/mov to handle rbx.
    //
    // Do not use atomic_rmw_cas_3 because it would need an extra MOV to implement swap.
    unsafe {
        // cmpxchg16b is always SeqCst.
        let val = U128 { whole: val };
        let (mut prev_lo, mut prev_hi);
        macro_rules! cmpxchg16b {
            ($rdi:tt) => {
                asm!(
                    "xchg {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                    // These are not single-copy atomic reads, but that is fine because the
                    // subsequent CAS will check for consistency.
                    //
                    // This is based on the code generated for the first load in DW RMWs by LLVM.
                    //
                    // Note that the C++20 memory model does not allow mixed-sized atomic access,
                    // so we must use inline assembly to implement this.
                    // (i.e., byte-wise atomic access based on the standard library's atomic types
                    // cannot be used here).
                    concat!("mov rax, qword ptr [", $rdi, "]"),
                    concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                    "2:",
                        concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                        "jne 2b",
                    "mov rbx, {rbx_tmp}", // restore rbx
                    rbx_tmp = inout(reg) val.pair.lo => _,
                    in("rcx") val.pair.hi,
                    out("rax") prev_lo,
                    out("rdx") prev_hi,
                    in($rdi) dst,
                    // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                    options(nostack),
                )
            };
        }
        #[cfg(target_pointer_width = "32")]
        cmpxchg16b!("edi");
        #[cfg(target_pointer_width = "64")]
        cmpxchg16b!("rdi");
        U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
    }
}
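
// The loop above is the hand-written form of the usual CAS-based swap; a sketch of the
// equivalent built on this module's atomic_load/atomic_compare_exchange (illustrative
// only, not used by this module):
//
//     unsafe fn atomic_swap_via_cas(dst: *mut u128, val: u128) -> u128 {
//         let mut prev = unsafe { atomic_load(dst, Ordering::SeqCst) };
//         loop {
//             match unsafe { atomic_compare_exchange(dst, prev, val, Ordering::SeqCst, Ordering::SeqCst) } {
//                 Ok(p) => return p,  // stored `val`, return the old value
//                 Err(p) => prev = p, // lost a race, retry with the observed value
//             }
//         }
//     }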

/// Atomic RMW by CAS loop (3 arguments)
/// `unsafe fn(dst: *mut u128, val: u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rsi/r8 pair: val argument (read-only for `$op`)
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// allows omitting the store/comparison of condition flags and reduces the use of
// xchg/mov to handle rbx.
macro_rules! atomic_rmw_cas_3 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, val: u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See the cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let val = U128 { whole: val };
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // These are not single-copy atomic reads, but that is fine because the
                            // subsequent CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic access based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            in("rsi") val.pair.lo,
                            in("r8") val.pair.hi,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}
/// Atomic RMW by CAS loop (2 arguments)
/// `unsafe fn(dst: *mut u128, order: Ordering) -> u128;`
///
/// `$op` can use the following registers:
/// - rax/rdx pair: previous value loaded (read-only for `$op`)
/// - rbx/rcx pair: new value that will be stored
// We could use a CAS loop via atomic_compare_exchange here, but using inline assembly
// allows omitting the store of condition flags and avoids the use of xchg to handle rbx.
macro_rules! atomic_rmw_cas_2 {
    ($name:ident, $($op:tt)*) => {
        // See cmpxchg16b() for target_feature(enable).
        #[cfg_attr(
            not(portable_atomic_no_cmpxchg16b_target_feature),
            target_feature(enable = "cmpxchg16b")
        )]
        #[inline]
        unsafe fn $name(dst: *mut u128, _order: Ordering) -> u128 {
            debug_assert!(dst as usize % 16 == 0);
            debug_assert_cmpxchg16b!();
            // SAFETY: the caller must guarantee that `dst` is valid for both writes and
            // reads, 16-byte aligned, and that there are no concurrent non-atomic operations.
            // cfg guarantees that the CPU supports CMPXCHG16B.
            //
            // See the cmpxchg16b function for more.
            unsafe {
                // cmpxchg16b is always SeqCst.
                let (mut prev_lo, mut prev_hi);
                macro_rules! cmpxchg16b {
                    ($rdi:tt) => {
                        asm!(
                            "mov {rbx_tmp}, rbx", // save rbx which is reserved by LLVM
                            // These are not single-copy atomic reads, but that is fine because the
                            // subsequent CAS will check for consistency.
                            //
                            // This is based on the code generated for the first load in DW RMWs by LLVM.
                            //
                            // Note that the C++20 memory model does not allow mixed-sized atomic access,
                            // so we must use inline assembly to implement this.
                            // (i.e., byte-wise atomic access based on the standard library's atomic types
                            // cannot be used here).
                            concat!("mov rax, qword ptr [", $rdi, "]"),
                            concat!("mov rdx, qword ptr [", $rdi, " + 8]"),
                            "2:",
                                $($op)*
                                concat!("lock cmpxchg16b xmmword ptr [", $rdi, "]"),
                                "jne 2b",
                            "mov rbx, {rbx_tmp}", // restore rbx
                            rbx_tmp = out(reg) _,
                            out("rcx") _,
                            out("rax") prev_lo,
                            out("rdx") prev_hi,
                            in($rdi) dst,
                            // Do not use `preserves_flags` because CMPXCHG16B modifies the ZF flag.
                            options(nostack),
                        )
                    };
                }
                #[cfg(target_pointer_width = "32")]
                cmpxchg16b!("edi");
                #[cfg(target_pointer_width = "64")]
                cmpxchg16b!("rdi");
                U128 { pair: Pair { lo: prev_lo, hi: prev_hi } }.whole
            }
        }
    };
}

atomic_rmw_cas_3! {
    atomic_add_cmpxchg16b,
    "mov rbx, rax",
    "add rbx, rsi",
    "mov rcx, rdx",
    "adc rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_sub_cmpxchg16b,
    "mov rbx, rax",
    "sub rbx, rsi",
    "mov rcx, rdx",
    "sbb rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_and_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "mov rcx, rdx",
    "and rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_nand_cmpxchg16b,
    "mov rbx, rax",
    "and rbx, rsi",
    "not rbx",
    "mov rcx, rdx",
    "and rcx, r8",
    "not rcx",
}
atomic_rmw_cas_3! {
    atomic_or_cmpxchg16b,
    "mov rbx, rax",
    "or rbx, rsi",
    "mov rcx, rdx",
    "or rcx, r8",
}
atomic_rmw_cas_3! {
    atomic_xor_cmpxchg16b,
    "mov rbx, rax",
    "xor rbx, rsi",
    "mov rcx, rdx",
    "xor rcx, r8",
}
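
// The RMW bodies above compute the new 128-bit value into rcx:rbx from the loaded value
// (rdx:rax) and the argument (r8:rsi). For example, atomic_add_cmpxchg16b's add/adc pair
// is 128-bit wrapping addition; a sketch of the same computation in Rust:
//
//     let (new_lo, carry) = prev_lo.overflowing_add(val_lo); // "add rbx, rsi"
//     let new_hi = prev_hi.wrapping_add(val_hi).wrapping_add(carry as u64); // "adc rcx, r8"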

atomic_rmw_cas_2! {
    atomic_not_cmpxchg16b,
    "mov rbx, rax",
    "not rbx",
    "mov rcx, rdx",
    "not rcx",
}
atomic_rmw_cas_2! {
    atomic_neg_cmpxchg16b,
    "mov rbx, rax",
    "neg rbx",
    "mov rcx, 0",
    "sbb rcx, rdx",
}

atomic_rmw_cas_3! {
    atomic_max_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovl rcx, rdx",
    "mov rbx, rsi",
    "cmovl rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_umax_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovb rcx, rdx",
    "mov rbx, rsi",
    "cmovb rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_min_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovge rcx, rdx",
    "mov rbx, rsi",
    "cmovge rbx, rax",
}
atomic_rmw_cas_3! {
    atomic_umin_cmpxchg16b,
    "cmp rsi, rax",
    "mov rcx, r8",
    "sbb rcx, rdx",
    "mov rcx, r8",
    "cmovae rcx, rdx",
    "mov rbx, rsi",
    "cmovae rbx, rax",
}
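
// In the four min/max variants above, cmp+sbb performs a 128-bit comparison of the
// argument (r8:rsi) against the loaded value (rdx:rax) using only the flags, and the two
// cmov instructions then select either the argument or the loaded value into rcx:rbx.
// E.g. for atomic_umax_cmpxchg16b, the selection is roughly:
//
//     let new = if val < prev { prev } else { val }; // cmovb: keep prev when val is below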

macro_rules! select_atomic_rmw {
    (
        unsafe fn $name:ident($($arg:tt)*) $(-> $ret_ty:ty)?;
        cmpxchg16b = $cmpxchg16b_fn:ident;
        fallback = $seqcst_fallback_fn:ident;
    ) => {
        // If cmpxchg16b is available at compile-time, we can always use cmpxchg16b_fn.
        #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
        use self::$cmpxchg16b_fn as $name;
        // Otherwise, we need to do run-time detection and can use cmpxchg16b_fn only if cmpxchg16b is available.
        #[cfg(not(any(
            target_feature = "cmpxchg16b",
            portable_atomic_target_feature = "cmpxchg16b",
        )))]
        #[inline]
        unsafe fn $name($($arg)*, _order: Ordering) $(-> $ret_ty)? {
            fn_alias! {
                // See cmpxchg16b() for target_feature(enable).
                #[cfg_attr(
                    not(portable_atomic_no_cmpxchg16b_target_feature),
                    target_feature(enable = "cmpxchg16b")
                )]
                unsafe fn($($arg)*) $(-> $ret_ty)?;
                // cmpxchg16b is always SeqCst.
                cmpxchg16b_seqcst_fn = $cmpxchg16b_fn(Ordering::SeqCst);
            }
            // SAFETY: the caller must uphold the safety contract.
            // We only call cmpxchg16b_fn if cmpxchg16b is available.
            unsafe {
                ifunc!(unsafe fn($($arg)*) $(-> $ret_ty)? {
                    if detect::detect().has_cmpxchg16b() {
                        cmpxchg16b_seqcst_fn
                    } else {
                        // Use SeqCst because cmpxchg16b is always SeqCst.
                        fallback::$seqcst_fallback_fn
                    }
                })
            }
        }
    };
}

select_atomic_rmw! {
    unsafe fn atomic_swap(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_swap_cmpxchg16b;
    fallback = atomic_swap_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_add(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_add_cmpxchg16b;
    fallback = atomic_add_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_sub(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_sub_cmpxchg16b;
    fallback = atomic_sub_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_and(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_and_cmpxchg16b;
    fallback = atomic_and_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_nand(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_nand_cmpxchg16b;
    fallback = atomic_nand_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_or(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_or_cmpxchg16b;
    fallback = atomic_or_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_xor(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_xor_cmpxchg16b;
    fallback = atomic_xor_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_max(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_max_cmpxchg16b;
    fallback = atomic_max_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umax(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umax_cmpxchg16b;
    fallback = atomic_umax_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_min(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_min_cmpxchg16b;
    fallback = atomic_min_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_umin(dst: *mut u128, val: u128) -> u128;
    cmpxchg16b = atomic_umin_cmpxchg16b;
    fallback = atomic_umin_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_not(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_not_cmpxchg16b;
    fallback = atomic_not_seqcst;
}
select_atomic_rmw! {
    unsafe fn atomic_neg(dst: *mut u128) -> u128;
    cmpxchg16b = atomic_neg_cmpxchg16b;
    fallback = atomic_neg_seqcst;
}

#[inline]
fn is_lock_free() -> bool {
    #[cfg(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"))]
    {
        // CMPXCHG16B is available at compile-time.
        true
    }
    #[cfg(not(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b")))]
    {
        detect::detect().has_cmpxchg16b()
    }
}
const IS_ALWAYS_LOCK_FREE: bool =
    cfg!(any(target_feature = "cmpxchg16b", portable_atomic_target_feature = "cmpxchg16b"));

atomic128!(AtomicI128, i128, atomic_max, atomic_min);
atomic128!(AtomicU128, u128, atomic_umax, atomic_umin);

#[allow(clippy::undocumented_unsafe_blocks, clippy::wildcard_imports)]
#[cfg(test)]
mod tests {
    use super::*;

    test_atomic_int!(i128);
    test_atomic_int!(u128);

    // load/store/swap implementation is not affected by signedness, so it is
    // enough to test only unsigned types.
    stress_test!(u128);
}