Module Name: src
Committed By: riastradh
Date: Fri Nov 29 22:17:24 UTC 2019
Modified Files:
src/distrib/sets/lists/comp: mi
src/share/man/man9: Makefile
src/sys/sys: atomic.h
Added Files:
src/share/man/man9: atomic_loadstore.9
Log Message:
New atomic load/store operations for the kernel.
These guarantee no fusing and no tearing, and can optionally impose
ordering relative to other memory operations.
Unordered:
- atomic_load_relaxed
- atomic_store_relaxed
Ordered:
- atomic_load_acquire
- atomic_load_consume
- atomic_store_release
These are intended to match C11 semantics, and can be defined in
terms of the C11 atomic API when ready.
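Illustrative usage (a sketch only; the names are hypothetical, see
atomic_loadstore(9) for worked examples):

	/* producer */
	d->x = 42;
	atomic_store_release(&ready, 1);

	/* consumer */
	if (atomic_load_acquire(&ready))
		use(d->x);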
To generate a diff of this commit:
cvs rdiff -u -r1.2293 -r1.2294 src/distrib/sets/lists/comp/mi
cvs rdiff -u -r1.442 -r1.443 src/share/man/man9/Makefile
cvs rdiff -u -r0 -r1.1 src/share/man/man9/atomic_loadstore.9
cvs rdiff -u -r1.17 -r1.18 src/sys/sys/atomic.h
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/distrib/sets/lists/comp/mi
diff -u src/distrib/sets/lists/comp/mi:1.2293 src/distrib/sets/lists/comp/mi:1.2294
--- src/distrib/sets/lists/comp/mi:1.2293 Fri Nov 29 20:31:35 2019
+++ src/distrib/sets/lists/comp/mi Fri Nov 29 22:17:23 2019
@@ -1,4 +1,4 @@
-# $NetBSD: mi,v 1.2293 2019/11/29 20:31:35 riastradh Exp $
+# $NetBSD: mi,v 1.2294 2019/11/29 22:17:23 riastradh Exp $
#
# Note: don't delete entries from here - mark them as "obsolete" instead.
./etc/mtree/set.comp comp-sys-root
@@ -10704,6 +10704,12 @@
./usr/share/man/cat9/arp_ifinit.0 comp-sys-catman .cat
./usr/share/man/cat9/arpintr.0 comp-sys-catman .cat
./usr/share/man/cat9/arpresolve.0 comp-sys-catman .cat
+./usr/share/man/cat9/atomic_load_acquire.0 comp-sys-catman .cat
+./usr/share/man/cat9/atomic_load_consume.0 comp-sys-catman .cat
+./usr/share/man/cat9/atomic_load_relaxed.0 comp-sys-catman .cat
+./usr/share/man/cat9/atomic_loadstore.0 comp-sys-catman .cat
+./usr/share/man/cat9/atomic_store_relaxed.0 comp-sys-catman .cat
+./usr/share/man/cat9/atomic_store_release.0 comp-sys-catman .cat
./usr/share/man/cat9/atop.0 comp-sys-catman .cat
./usr/share/man/cat9/audio.0 comp-sys-catman .cat
./usr/share/man/cat9/audio_system.0 comp-sys-catman obsolete
@@ -18652,6 +18658,12 @@
./usr/share/man/html9/arp_ifinit.html comp-sys-htmlman html
./usr/share/man/html9/arpintr.html comp-sys-htmlman html
./usr/share/man/html9/arpresolve.html comp-sys-htmlman html
+./usr/share/man/html9/atomic_load_acquire.html comp-sys-htmlman html
+./usr/share/man/html9/atomic_load_consume.html comp-sys-htmlman html
+./usr/share/man/html9/atomic_load_relaxed.html comp-sys-htmlman html
+./usr/share/man/html9/atomic_loadstore.html comp-sys-htmlman html
+./usr/share/man/html9/atomic_store_relaxed.html comp-sys-htmlman html
+./usr/share/man/html9/atomic_store_release.html comp-sys-htmlman html
./usr/share/man/html9/atop.html comp-sys-htmlman html
./usr/share/man/html9/audio.html comp-sys-htmlman html
./usr/share/man/html9/audio_system.html comp-sys-htmlman obsolete
@@ -26703,6 +26715,12 @@
./usr/share/man/man9/arp_ifinit.9 comp-sys-man .man
./usr/share/man/man9/arpintr.9 comp-sys-man .man
./usr/share/man/man9/arpresolve.9 comp-sys-man .man
+./usr/share/man/man9/atomic_load_acquire.9 comp-sys-man .man
+./usr/share/man/man9/atomic_load_consume.9 comp-sys-man .man
+./usr/share/man/man9/atomic_load_relaxed.9 comp-sys-man .man
+./usr/share/man/man9/atomic_loadstore.9 comp-sys-man .man
+./usr/share/man/man9/atomic_store_relaxed.9 comp-sys-man .man
+./usr/share/man/man9/atomic_store_release.9 comp-sys-man .man
./usr/share/man/man9/atop.9 comp-sys-man .man
./usr/share/man/man9/audio.9 comp-sys-man .man
./usr/share/man/man9/audio_system.9 comp-sys-man obsolete
Index: src/share/man/man9/Makefile
diff -u src/share/man/man9/Makefile:1.442 src/share/man/man9/Makefile:1.443
--- src/share/man/man9/Makefile:1.442 Fri Nov 29 20:31:35 2019
+++ src/share/man/man9/Makefile Fri Nov 29 22:17:23 2019
@@ -1,4 +1,4 @@
-# $NetBSD: Makefile,v 1.442 2019/11/29 20:31:35 riastradh Exp $
+# $NetBSD: Makefile,v 1.443 2019/11/29 22:17:23 riastradh Exp $
# Makefile for section 9 (kernel function and variable) manual pages.
@@ -67,6 +67,13 @@ MAN= accept_filter.9 accf_data.9 accf_ht
wsbell.9 wscons.9 wsdisplay.9 wsfont.9 wskbd.9 wsmouse.9 \
xcall.9
+MAN+= atomic_loadstore.9
+MLINKS+=atomic_loadstore.9 atomic_load_acquire.9 \
+ atomic_loadstore.9 atomic_load_consume.9 \
+ atomic_loadstore.9 atomic_load_relaxed.9 \
+ atomic_loadstore.9 atomic_store_relaxed.9 \
+ atomic_loadstore.9 atomic_store_release.9
+
MAN+= boothowto.9
MLINKS+=boothowto.9 BOOT_FLAG.9
Index: src/sys/sys/atomic.h
diff -u src/sys/sys/atomic.h:1.17 src/sys/sys/atomic.h:1.18
--- src/sys/sys/atomic.h:1.17 Thu Nov 14 16:23:53 2019
+++ src/sys/sys/atomic.h Fri Nov 29 22:17:23 2019
@@ -1,4 +1,4 @@
-/* $NetBSD: atomic.h,v 1.17 2019/11/14 16:23:53 maxv Exp $ */
+/* $NetBSD: atomic.h,v 1.18 2019/11/29 22:17:23 riastradh Exp $ */
/*-
* Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
@@ -383,4 +383,109 @@ __END_DECLS
#define atomic_inc_64_nv kmsan_atomic_inc_64_nv
#endif
+#ifdef _KERNEL
+
+#if 1	/* XXX __STDC_VERSION__ < 201112L */
+
+/* Pre-C11 definitions */
+
+#include <sys/cdefs.h>
+#include <sys/bitops.h>
+
+#ifdef _LP64
+#define __HAVE_ATOMIC64_LOADSTORE 1
+#define __ATOMIC_SIZE_MAX 8
+#else
+#define __ATOMIC_SIZE_MAX 4
+#endif
+
+/*
+ * We assume that access to an aligned pointer to a volatile object of
+ * at most __ATOMIC_SIZE_MAX bytes is guaranteed to be atomic. This is
+ * an assumption that may be wrong, but we hope it won't be wrong
+ * before we just adopt the C11 atomic API.
+ */
+#define __ATOMIC_PTR_CHECK(p) do \
+{ \
+ CTASSERT(sizeof(*(p)) <= __ATOMIC_SIZE_MAX); \
+	KASSERT(((uintptr_t)(p) & (sizeof(*(p)) - 1)) == 0);	\
+} while (0)
+
+#define atomic_load_relaxed(p) \
+({ \
+ const volatile __typeof__(*(p)) *__al_ptr = (p); \
+ __ATOMIC_PTR_CHECK(__al_ptr); \
+ *__al_ptr; \
+})
+
+#define atomic_load_consume(p) \
+({ \
+ const volatile __typeof__(*(p)) *__al_ptr = (p); \
+ __ATOMIC_PTR_CHECK(__al_ptr); \
+ __typeof__(*(p)) __al_val = *__al_ptr; \
+ membar_datadep_consumer(); \
+ __al_val; \
+})
+
+/*
+ * We want {loads}-before-{loads,stores}. It is tempting to use
+ * membar_enter, but that provides {stores}-before-{loads,stores},
+ * which may not help. So we must use membar_sync, which does the
+ * slightly stronger {loads,stores}-before-{loads,stores}.
+ */
+#define atomic_load_acquire(p) \
+({ \
+ const volatile __typeof__(*(p)) *__al_ptr = (p); \
+ __ATOMIC_PTR_CHECK(__al_ptr); \
+ __typeof__(*(p)) __al_val = *__al_ptr; \
+ membar_sync(); \
+ __al_val; \
+})
+
+#define atomic_store_relaxed(p,v) \
+({ \
+ volatile __typeof__(*(p)) *__as_ptr = (p); \
+ __ATOMIC_PTR_CHECK(__as_ptr); \
+ *__as_ptr = (v); \
+})
+
+#define atomic_store_release(p,v) \
+({ \
+ volatile __typeof__(*(p)) *__as_ptr = (p); \
+ __typeof__(*(p)) __as_val = (v); \
+ __ATOMIC_PTR_CHECK(__as_ptr); \
+ membar_exit(); \
+ *__as_ptr = __as_val; \
+})
+
+#else /* __STDC_VERSION__ >= 201112L */
+
+/* C11 definitions, not yet available */
+
+#include <stdatomic.h>
+
+#define atomic_load_relaxed(p) \
+ atomic_load_explicit((p), memory_order_relaxed)
+#if 0 /* memory_order_consume is not there yet */
+#define atomic_load_consume(p) \
+ atomic_load_explicit((p), memory_order_consume)
+#else
+#define atomic_load_consume(p) \
+({ \
+ const __typeof__(*(p)) __al_val = atomic_load_relaxed(p); \
+ membar_datadep_consumer(); \
+ __al_val; \
+})
+#endif
+#define atomic_load_acquire(p) \
+ atomic_load_explicit((p), memory_order_acquire)
+#define atomic_store_relaxed(p, v) \
+ atomic_store_explicit((p), (v), memory_order_relaxed)
+#define atomic_store_release(p, v) \
+ atomic_store_explicit((p), (v), memory_order_release)
+
+#endif /* __STDC_VERSION__ */
+
+#endif /* _KERNEL */
+
#endif /* ! _SYS_ATOMIC_H_ */
Added files:
Index: src/share/man/man9/atomic_loadstore.9
diff -u /dev/null src/share/man/man9/atomic_loadstore.9:1.1
--- /dev/null Fri Nov 29 22:17:24 2019
+++ src/share/man/man9/atomic_loadstore.9 Fri Nov 29 22:17:23 2019
@@ -0,0 +1,744 @@
+.\" $NetBSD: atomic_loadstore.9,v 1.1 2019/11/29 22:17:23 riastradh Exp $
+.\"
+.\" Copyright (c) 2019 The NetBSD Foundation
+.\" All rights reserved.
+.\"
+.\" This code is derived from software contributed to The NetBSD Foundation
+.\" by Taylor R. Campbell.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.Dd November 25, 2019
+.Dt ATOMIC_LOADSTORE 9
+.Os
+.Sh NAME
+.Nm atomic_load_relaxed ,
+.Nm atomic_load_acquire ,
+.Nm atomic_load_consume ,
+.Nm atomic_store_relaxed ,
+.Nm atomic_store_release
+.Nd atomic and ordered memory operations
+.Sh SYNOPSIS
+.In sys/atomic.h
+.Ft T
+.Fn atomic_load_relaxed "const volatile T *p"
+.Ft T
+.Fn atomic_load_acquire "const volatile T *p"
+.Ft T
+.Fn atomic_load_consume "const volatile T *p"
+.Ft void
+.Fn atomic_store_relaxed "volatile T *p" "T v"
+.Ft void
+.Fn atomic_store_release "volatile T *p" "T v"
+.Sh DESCRIPTION
+These type-generic macros implement memory operations that are
+.Em atomic
+and that have
+.Em memory ordering constraints .
+Aside from atomicity and ordering, the load operations are equivalent
+to
+.Li * Ns Fa p
+and the store operations are equivalent to
+.Li * Ns Fa p Li "=" Fa v .
+The pointer
+.Fa p
+must be aligned, even on architectures like x86 which generally lack
+strict alignment requirements; see
+.Sx SIZE AND ALIGNMENT
+for details.
+.Pp
+.Em Atomic
+means that the memory operations cannot be
+.Em fused
+or
+.Em torn :
+.Bl -bullet
+.It
+.Em Fusing
+is combining multiple memory operations on a single object into one
+memory operation, such as replacing
+.Bd -literal -compact
+ *p = v;
+ x = *p;
+.Ed
+by
+.Bd -literal -compact
+ *p = v;
+ x = v;
+.Ed
+since the compiler can prove that
+.Li * Ns Fa p
+will yield
+.Fa v
+after
+.Li * Ns Fa p Li = Fa v .
+For
+.Em atomic
+memory operations, the implementation
+.Em will not
+assume that
+.Bl -dash -compact
+.It
+consecutive loads of the same object will return the same value, or
+.It
+a store followed by a load of the same object will return the value
+stored, or
+.It
+consecutive stores of the same object are redundant.
+.El
+Thus, the implementation will not replace two consecutive atomic loads
+by one, will not elide an atomic load following a store, and will not
+combine two consecutive atomic stores into one.
+.Pp
+For example,
+.Bd -literal
+ atomic_store_relaxed(&flag, 1);
+ while (atomic_load_relaxed(&flag))
+ continue;
+.Ed
+.Pp
+may be used to set a flag and then busy-wait until another thread
+clears it, whereas
+.Bd -literal
+ flag = 1;
+ while (flag)
+ continue;
+.Ed
+.Pp
+may be transformed into the infinite loop
+.Bd -literal
+ flag = 1;
+ while (1)
+ continue;
+.Ed
+.It
+.Em Tearing
+is implementing a memory operation on a large data unit such as a
+32-bit word by issuing multiple memory operations on smaller data units
+such as 8-bit bytes.
+The implementation will not tear
+.Em atomic
+loads or stores into smaller ones.
+Thus, as far as any interrupt, other thread, or other CPU can tell, an
+atomic memory operation is issued either all at once or not at all.
+.Pp
+For example, if a 32-bit word
+.Fa w
+is written with
+.Li atomic_store_relaxed(& Ns Fa w Ns Li "," 0x00010002) ,
+then an interrupt, other thread, or other CPU reading it with
+.Li atomic_load_relaxed(& Ns Fa w Ns Li ")"
+will never witness it partially written, whereas
+.Fa w Li = 0x00010002
+might be compiled into a pair of separate 16-bit store instructions
+instead of one single word-sized store instruction, in which case other
+threads may see the intermediate state with only one of the halves
+written.
+.El
+.Pp
+Atomic operations on any single object occur in a total order shared by
+all interrupts, threads, and CPUs, which is consistent with the program
+order in every interrupt, thread, and CPU.
+A single program without interruption or other threads or CPUs will
+always observe its own loads and stores in program order, but another
+program in an interrupt handler, in another thread, or on another CPU
+may issue loads that return values as if the first program's stores
+occurred out of program order, and vice versa.
+Two different threads might each observe a third thread's memory
+operations in different orders.
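+.Pp
+For example, if one CPU issues
+.Bd -literal
+	atomic_store_relaxed(&x, 1);
+	atomic_store_relaxed(&y, 1);
+.Ed
+.Pp
+then another CPU issuing
+.Bd -literal
+	int y0 = atomic_load_relaxed(&y);
+	int x0 = atomic_load_relaxed(&x);
+.Ed
+.Pp
+may observe y0 == 1 and yet x0 == 0, as if the first CPU's stores had
+occurred out of program order; ruling this out requires the
+release/acquire ordering described below.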
+.Pp
+The
+.Em memory ordering constraints
+make limited guarantees of ordering relative to memory operations on
+.Em other
+objects as witnessed by interrupts, other threads, or other CPUs, and
+have the following meanings:
+.Bl -tag -width relaxed
+.It relaxed
+No ordering relative to memory operations on any other objects is
+guaranteed.
+Relaxed ordering is the default for ordinary non-atomic memory
+operations like
+.Li * Ns Fa p
+and
+.Li * Ns Fa p Li = Fa v .
+.Pp
+Atomic operations with relaxed ordering are cheap: they are not
+read/modify/write atomic operations, and they do not involve any kind
+of inter-CPU ordering barriers.
+.It acquire
+This memory operation happens before all subsequent memory operations
+in program order.
+However, prior memory operations in program order may be reordered to
+happen after this one.
+For example, assuming no aliasing between the pointers, the
+implementation is allowed to treat
+.Bd -literal
+ int x = *p;
+ if (atomic_load_acquire(q)) {
+ int y = *r;
+ *s = x + y;
+ return 1;
+ }
+.Ed
+.Pp
+as if it were
+.Bd -literal
+ if (atomic_load_acquire(q)) {
+ int x = *p;
+ int y = *r;
+ *s = x + y;
+ return 1;
+ }
+.Ed
+.Pp
+but
+.Em not
+as if it were
+.Bd -literal
+ int x = *p;
+ int y = *r;
+ *s = x + y;
+ if (atomic_load_acquire(q)) {
+ return 1;
+ }
+.Ed
+.It consume
+This memory operation happens before all memory operations on objects
+at addresses that are computed from the value returned by this one.
+Otherwise, no ordering relative to memory operations on other objects
+is implied.
+.Pp
+For example, the implementation is allowed to treat
+.Bd -literal
+ struct foo *foo0, *foo1;
+
+ struct foo *f0 = atomic_load_consume(&foo0);
+ struct foo *f1 = atomic_load_consume(&foo1);
+ int x = f0->x;
+ int y = f1->y;
+.Ed
+.Pp
+as if it were
+.Bd -literal
+ struct foo *foo0, *foo1;
+
+ struct foo *f1 = atomic_load_consume(&foo1);
+ struct foo *f0 = atomic_load_consume(&foo0);
+ int y = f1->y;
+ int x = f0->x;
+.Ed
+.Pp
+but loading
+.Li f0->x
+is guaranteed to happen after loading
+.Li foo0
+even if the CPU had a cached value for the address that
+.Li f0->x
+happened to be at, and likewise for
+.Li f1->y
+and
+.Li foo1 .
+.Pp
+.Fn atomic_load_consume
+functions like
+.Fn atomic_load_acquire
+as long as the memory operations that must happen after it are limited
+to addresses that depend on the value returned by it, but it is almost
+always as cheap as
+.Fn atomic_load_relaxed .
+See
+.Sx ACQUIRE OR CONSUME?
+below for more details.
+.It release
+All prior memory operations in program order happen before this one.
+However, subsequent memory operations in program order may be reordered
+to happen before this one too.
+For example, assuming no aliasing between the pointers, the
+implementation is allowed to treat
+.Bd -literal
+ int x = *p;
+ *q = x;
+ atomic_store_release(r, 0);
+ int y = *s;
+ return x + y;
+.Ed
+.Pp
+as if it were
+.Bd -literal
+ int y = *s;
+ int x = *p;
+ *q = x;
+ atomic_store_release(r, 0);
+ return x + y;
+.Ed
+.Pp
+but
+.Em not
+as if it were
+.Bd -literal
+ atomic_store_release(r, 0);
+ int x = *p;
+ int y = *s;
+ *q = x;
+ return x + y;
+.Ed
+.El
+.Ss PAIRING ORDERED MEMORY OPERATIONS
+In general, each
+.Fn atomic_store_release
+.Em must
+be paired with either
+.Fn atomic_load_acquire
+or
+.Fn atomic_load_consume
+in order to have an effect \(em it is only when a release operation
+synchronizes with an acquire or consume operation that any ordering
+is guaranteed between memory operations
+.Em before
+the release operation and memory operations
+.Em after
+the acquire/consume operation.
+.Pp
+For example, to set up an entry in a table and then mark the entry
+ready, you should:
+.Bl -enum
+.It
+Perform memory operations to initialize the data.
+.Bd -literal
+ tab[i].x = ...;
+ tab[i].y = ...;
+.Ed
+.It
+Issue
+.Fn atomic_store_release
+to mark it ready.
+.Bd -literal
+ atomic_store_release(&tab[i].ready, 1);
+.Ed
+.It
+Possibly in another thread, issue
+.Fn atomic_load_acquire
+to ascertain whether it is ready.
+.Bd -literal
+ if (atomic_load_acquire(&tab[i].ready) == 0)
+ return EWOULDBLOCK;
+.Ed
+.It
+Perform memory operations to use the data.
+.Bd -literal
+ do_stuff(tab[i].x, tab[i].y);
+.Ed
+.El
+.Pp
+Similarly, if you want to create an object, initialize it, and then
+publish it to be used by another thread, then you should:
+.Bl -enum
+.It
+Perform memory operations to initialize the object.
+.Bd -literal
+ struct mumble *m = kmem_alloc(sizeof(*m), KM_SLEEP);
+ m->x = x;
+ m->y = y;
+ m->z = m->x + m->y;
+.Ed
+.It
+Issue
+.Fn atomic_store_release
+to publish it.
+.Bd -literal
+ atomic_store_release(&the_mumble, m);
+.Ed
+.It
+Possibly in another thread, issue
+.Fn atomic_load_consume
+to get it.
+.Bd -literal
+ struct mumble *m = atomic_load_consume(&the_mumble);
+.Ed
+.It
+Perform memory operations to use the object's members.
+.Bd -literal
+ m->y &= m->x;
+ do_things(m->x, m->y, m->z);
+.Ed
+.El
+.Pp
+In both examples, assuming that the value written by
+.Fn atomic_store_release
+in step\~2
+is read by
+.Fn atomic_load_acquire
+or
+.Fn atomic_load_consume
+in step\~3, this guarantees that all of the memory operations in
+step\~1 complete before any of the memory operations in step\~4 \(em
+even if they happen on different CPUs.
+.Pp
+Without
+.Em both
+the release operation in step\~2
+.Em and
+the acquire or consume operation in step\~3, no ordering is guaranteed
+between the memory operations in steps\~1 and\~4.
+In fact, without
+.Em both
+release and acquire/consume, even the assignment
+.Li m->z = m->x + m->y
+in step\~1 might read values of
+.Li m->x
+and
+.Li m->y
+that were written in step\~4.
+.Ss ACQUIRE OR CONSUME?
+You must use
+.Fn atomic_load_acquire
+when subsequent memory operations in program order that must happen
+after the load are on objects at
+.Em addresses that might not depend arithmetically on the resulting value .
+This applies particularly when the choice of whether to do the
+subsequent memory operation depends on a
+.Em control-flow decision based on the resulting value :
+.Bd -literal
+ struct gadget {
+ int ready, x;
+ } the_gadget;
+
+ /* Producer */
+ the_gadget.x = 42;
+ atomic_store_release(&the_gadget.ready, 1);
+
+ /* Consumer */
+ if (atomic_load_acquire(&the_gadget.ready) == 0)
+ return EWOULDBLOCK;
+ int x = the_gadget.x;
+.Ed
+.Pp
+Here the
+.Em decision of whether to load
+.Li the_gadget.x
+depends on a control-flow decision based on the value loaded from
+.Li the_gadget.ready ,
+and loading
+.Li the_gadget.x
+must happen after loading
+.Li the_gadget.ready .
+Using
+.Fn atomic_load_acquire
+guarantees that the compiler and CPU do not conspire to load
+.Li the_gadget.x
+before we have ascertained that it is ready.
+.Pp
+You may use
+.Fn atomic_load_consume
+if all subsequent memory operations in program order that must happen
+after the load are performed on objects at
+.Em addresses computed arithmetically from the resulting value ,
+such as loading a pointer to a structure object and then dereferencing
+it:
+.Bd -literal
+ struct gizmo {
+ int x, y, z;
+ };
+ struct gizmo null_gizmo;
+ struct gizmo *the_gizmo = &null_gizmo;
+
+ /* Producer */
+ struct gizmo *g = kmem_alloc(sizeof(*g), KM_SLEEP);
+ g->x = 12;
+ g->y = 34;
+ g->z = 56;
+ atomic_store_release(&the_gizmo, g);
+
+ /* Consumer */
+ struct gizmo *g = atomic_load_consume(&the_gizmo);
+ int y = g->y;
+.Ed
+.Pp
+Here the
+.Em address
+of
+.Li g->y
+depends on the value of the pointer loaded from
+.Li the_gizmo .
+Using
+.Fn atomic_load_consume
+guarantees that we do not witness a stale cache for that address.
+.Pp
+In some cases it may be unclear.
+For example:
+.Bd -literal
+ int x[2];
+ bool b;
+
+ /* Producer */
+ x[0] = 42;
+ atomic_store_release(&b, 0);
+
+ /* Consumer 1 */
+ int y = atomic_load_???(&b) ? x[0] : x[1];
+
+ /* Consumer 2 */
+ int y = x[atomic_load_???(&b) ? 0 : 1];
+
+ /* Consumer 3 */
+ int y = x[atomic_load_???(&b) ^ 1];
+.Ed
+.Pp
+Although the three consumers seem to be equivalent, by the letter of
+C11 consumers\~1 and\~2 require
+.Fn atomic_load_acquire
+because the value determines the address of a subsequent load only via
+control-flow decisions in the
+.Li ?:
+operator, whereas consumer\~3 can use
+.Fn atomic_load_consume .
+However, if you're not sure, you should err on the side of
+.Fn atomic_load_acquire
+until C11 implementations have ironed out the kinks in the semantics.
+.Pp
+On all CPUs other than DEC Alpha,
+.Fn atomic_load_consume
+is cheap \(em it is identical to
+.Fn atomic_load_relaxed .
+In contrast,
+.Fn atomic_load_acquire
+usually implies an expensive memory barrier.
+.Ss SIZE AND ALIGNMENT
+The pointer
+.Fa p
+must be aligned \(em that is, if the object it points to is
+.No 2^ Ns Fa n
+bytes long, then the low-order
+.Fa n
+bits of
+.Fa p
+must be zero.
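+.Pp
+For example, a
+.Vt uint32_t
+object is four bytes long, so the low-order two bits of its address
+must be clear; a minimal sketch of the rule:
+.Bd -literal
+	uint32_t w;	/* the compiler aligns this for us */
+
+	KASSERT(((uintptr_t)&w & (sizeof(w) - 1)) == 0);
+	atomic_store_relaxed(&w, 0x00010002);
+.Ed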
+.Pp
+All
+.Nx
+ports support atomic loads and stores on units of data up to 32 bits.
+Some ports additionally support atomic loads and stores on larger
+units, such as 64-bit quantities, if
+.Dv __HAVE_ATOMIC64_LOADSTORE
+is defined.
+The macros are not allowed on larger quantities of data than the port
+supports atomically; attempts to use them for such quantities should
+result in a compile-time assertion failure.
+.Pp
+For example, as long as you use
+.Fn atomic_store_*
+to write a 32-bit quantity, you can safely use
+.Fn atomic_load_relaxed
+to optimistically read it outside a lock, but for a 64-bit quantity it
+must be conditional on
+.Dv __HAVE_ATOMIC64_LOADSTORE
+\(em otherwise it will lead to compile-time errors on platforms without
+64-bit atomic loads and stores:
+.Bd -literal
+ struct foo {
+ kmutex_t f_lock;
+ uint32_t f_refcnt;
+ uint64_t f_ticket;
+ };
+
+ if (atomic_load_relaxed(&foo->f_refcnt) == 0)
+ return 123;
+#ifdef __HAVE_ATOMIC64_LOADSTORE
+ if (atomic_load_relaxed(&foo->f_ticket) == ticket)
+ return 123;
+#endif
+ mutex_enter(&foo->f_lock);
+ if (foo->f_refcnt == 0 || foo->f_ticket == ticket)
+ ret = 123;
+ ...
+#ifdef __HAVE_ATOMIC64_LOADSTORE
+ atomic_store_relaxed(&foo->f_ticket, foo->f_ticket + 1);
+#else
+ foo->f_ticket++;
+#endif
+ ...
+ mutex_exit(&foo->f_lock);
+.Ed
+.Sh C11 COMPATIBILITY
+These macros are meant to follow
+.Tn C11
+semantics, in terms of
+.Fn atomic_load_explicit
+and
+.Fn atomic_store_explicit
+with the appropriate memory order specifiers, and are meant to make
+future adoption of the
+.Tn C11
+atomic API easier.
+Eventually it may be mandatory to use the
+.Tn C11
+.Vt _Atomic
+type qualifier or equivalent for the operands.
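+.Pp
+For illustration, a sketch of the intended correspondence, assuming a
+.Vt _Atomic Ns -qualified
+object:
+.Bd -literal
+	_Atomic int ready;
+
+	/* atomic_store_release(&ready, 1) */
+	atomic_store_explicit(&ready, 1, memory_order_release);
+
+	/* atomic_load_acquire(&ready) */
+	int r = atomic_load_explicit(&ready, memory_order_acquire);
+.Ed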
+.Sh LINUX ANALOGUES
+The Linux kernel provides two macros
+.Fn READ_ONCE x
+and
+.Fn WRITE_ONCE x v
+which are similar to
+.Li atomic_load_consume(& Ns Fa x Ns Li ")"
+and
+.Li atomic_store_relaxed(& Ns Fa x Ns Li "," Fa v Ns Li ")" ,
+respectively.
+However, while Linux's
+.Fn READ_ONCE
+and
+.Fn WRITE_ONCE
+prevent fusing, they may in some cases be torn \(em and therefore fail
+to guarantee atomicity \(em because:
+.Bl -bullet
+.It
+They do not require the address
+.Li & Ns Fa x
+to be aligned.
+.It
+They do not require
+.Li sizeof( Ns Fa x Ns Li ")"
+to be at most the largest size of available atomic loads and stores on
+the host architecture.
+.El
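+.Pp
+For illustration, a Linux fragment and a rough
+.Nx
+equivalent, assuming an aligned object of supported size:
+.Bd -literal
+	/* Linux */			/* NetBSD */
+	v = READ_ONCE(flag);		v = atomic_load_consume(&flag);
+	WRITE_ONCE(flag, v);		atomic_store_relaxed(&flag, v);
+.Ed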
+.Sh EXAMPLES
+Maintaining lossy counters.
+These may lose some counts, because the read/modify/write cycle as a
+whole is not atomic.
+But this guarantees that the count will increase by at most one each
+time.
+In contrast, without atomic operations, in principle a write to a
+32-bit counter might be torn into multiple smaller stores, which could
+appear to happen out of order from another CPU's perspective, leading
+to nonsensical counter readouts.
+(For frequent events, consider using per-CPU counters instead in
+practice.)
+.Bd -literal
+ unsigned count;
+
+ void
+ record_event(void)
+	{
+
+		atomic_store_relaxed(&count,
+		    1 + atomic_load_relaxed(&count));
+ }
+
+ unsigned
+ read_event_count(void)
+ {
+
+ return atomic_load_relaxed(&count);
+ }
+.Ed
+.Pp
+Initialization barrier.
+.Bd -literal
+ int ready;
+ struct data d;
+
+ void
+ setup_and_notify(void)
+ {
+
+ setup_data(&d.things);
+ atomic_store_release(&ready, 1);
+ }
+
+ void
+ try_if_ready(void)
+ {
+
+ if (atomic_load_acquire(&ready))
+ do_stuff(d.things);
+ }
+.Ed
+.Pp
+Publishing a pointer to the current snapshot of data.
+(Caller must arrange that only one call to take_snapshot happens at any
+given time; generally this should be done in coordination with
+.Xr pserialize 9
+or similar to enable resource reclamation.)
+.Bd -literal
+ struct data *current_d;
+
+ void
+ take_snapshot(void)
+ {
+	struct data *d = kmem_alloc(sizeof(*d), KM_SLEEP);
+
+ d->things = ...;
+
+	atomic_store_release(&current_d, d);
+ }
+
+ struct data *
+ get_snapshot(void)
+ {
+
+	return atomic_load_consume(&current_d);
+ }
+.Ed
+.Sh CODE REFERENCES
+.Pa sys/sys/atomic.h
+.Sh SEE ALSO
+.Xr atomic_ops 3 ,
+.Xr membar_ops 3 ,
+.Xr pserialize 9
+.Sh HISTORY
+These atomic operations first appeared in
+.Nx 10.0 .
+.Sh CAVEATS
+C11 formally specifies that all subexpressions, except the left
+operands of the
+.Li "&&" , "||" , "?:" ,
+and
+.Li ","
+operators and the
+.Fn kill_dependency
+macro, carry dependencies for which
+.Dv memory_order_consume
+guarantees ordering, but most or all implementations to date simply
+treat
+.Dv memory_order_consume
+as
+.Dv memory_order_acquire
+and do not take advantage of data dependencies to elide costly memory
+barriers or load-acquire CPU instructions.
+.Pp
+Instead, we implement
+.Fn atomic_load_consume
+as
+.Fn atomic_load_relaxed
+followed by
+.Xr membar_datadep_consumer 3 ,
+which is equivalent to
+.Xr membar_consumer 3
+on DEC Alpha and
+.Xr __insn_barrier 3
+elsewhere.
+.Sh BUGS
+Some idiot decided to call it
+.Em tearing ,
+depriving us of the opportunity to say that atomic operations prevent
+fusion and
+.Em fission .