Module Name:    src
Committed By:   riastradh
Date:           Fri Nov 29 22:17:24 UTC 2019

Modified Files:
        src/distrib/sets/lists/comp: mi
        src/share/man/man9: Makefile
        src/sys/sys: atomic.h
Added Files:
        src/share/man/man9: atomic_loadstore.9

Log Message:
New atomic load/store operations for the kernel.

These guarantee no fusing and no tearing, and can optionally impose
ordering relative to other memory operations.

Unordered:
- atomic_load_relaxed
- atomic_store_relaxed

Ordered:
- atomic_load_acquire
- atomic_load_consume
- atomic_store_release

These are intended to match C11 semantics, and can be defined in
terms of the C11 atomic API when ready.
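
For illustration, a minimal sketch of the intended usage (the names
struct widget, the_widget, and use() are hypothetical, not part of this
change):

	struct widget { int x; };
	struct widget *the_widget;

	/* Producer: initialize the object, then publish it with
	   release semantics. */
	struct widget *w = kmem_alloc(sizeof(*w), KM_SLEEP);
	w->x = 42;
	atomic_store_release(&the_widget, w);

	/* Consumer (possibly on another CPU): load the pointer with
	   consume semantics, then dereference it. */
	struct widget *w0 = atomic_load_consume(&the_widget);
	if (w0 != NULL)
		use(w0->x);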


To generate a diff of this commit:
cvs rdiff -u -r1.2293 -r1.2294 src/distrib/sets/lists/comp/mi
cvs rdiff -u -r1.442 -r1.443 src/share/man/man9/Makefile
cvs rdiff -u -r0 -r1.1 src/share/man/man9/atomic_loadstore.9
cvs rdiff -u -r1.17 -r1.18 src/sys/sys/atomic.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/distrib/sets/lists/comp/mi
diff -u src/distrib/sets/lists/comp/mi:1.2293 src/distrib/sets/lists/comp/mi:1.2294
--- src/distrib/sets/lists/comp/mi:1.2293	Fri Nov 29 20:31:35 2019
+++ src/distrib/sets/lists/comp/mi	Fri Nov 29 22:17:23 2019
@@ -1,4 +1,4 @@
-#	$NetBSD: mi,v 1.2293 2019/11/29 20:31:35 riastradh Exp $
+#	$NetBSD: mi,v 1.2294 2019/11/29 22:17:23 riastradh Exp $
 #
 # Note: don't delete entries from here - mark them as "obsolete" instead.
 ./etc/mtree/set.comp				comp-sys-root
@@ -10704,6 +10704,12 @@
 ./usr/share/man/cat9/arp_ifinit.0		comp-sys-catman		.cat
 ./usr/share/man/cat9/arpintr.0			comp-sys-catman		.cat
 ./usr/share/man/cat9/arpresolve.0		comp-sys-catman		.cat
+./usr/share/man/cat9/atomic_load_acquire.0	comp-sys-catman		.cat
+./usr/share/man/cat9/atomic_load_consume.0	comp-sys-catman		.cat
+./usr/share/man/cat9/atomic_load_relaxed.0	comp-sys-catman		.cat
+./usr/share/man/cat9/atomic_loadstore.0		comp-sys-catman		.cat
+./usr/share/man/cat9/atomic_store_relaxed.0	comp-sys-catman		.cat
+./usr/share/man/cat9/atomic_store_release.0	comp-sys-catman		.cat
 ./usr/share/man/cat9/atop.0			comp-sys-catman		.cat
 ./usr/share/man/cat9/audio.0			comp-sys-catman		.cat
 ./usr/share/man/cat9/audio_system.0		comp-sys-catman		obsolete
@@ -18652,6 +18658,12 @@
 ./usr/share/man/html9/arp_ifinit.html		comp-sys-htmlman	html
 ./usr/share/man/html9/arpintr.html		comp-sys-htmlman	html
 ./usr/share/man/html9/arpresolve.html		comp-sys-htmlman	html
+./usr/share/man/html9/atomic_load_acquire.html	comp-sys-htmlman	html
+./usr/share/man/html9/atomic_load_consume.html	comp-sys-htmlman	html
+./usr/share/man/html9/atomic_load_relaxed.html	comp-sys-htmlman	html
+./usr/share/man/html9/atomic_loadstore.html	comp-sys-htmlman	html
+./usr/share/man/html9/atomic_store_relaxed.html	comp-sys-htmlman	html
+./usr/share/man/html9/atomic_store_release.html	comp-sys-htmlman	html
 ./usr/share/man/html9/atop.html			comp-sys-htmlman	html
 ./usr/share/man/html9/audio.html		comp-sys-htmlman	html
 ./usr/share/man/html9/audio_system.html		comp-sys-htmlman	obsolete
@@ -26703,6 +26715,12 @@
 ./usr/share/man/man9/arp_ifinit.9		comp-sys-man		.man
 ./usr/share/man/man9/arpintr.9			comp-sys-man		.man
 ./usr/share/man/man9/arpresolve.9		comp-sys-man		.man
+./usr/share/man/man9/atomic_load_acquire.9	comp-sys-man		.man
+./usr/share/man/man9/atomic_load_consume.9	comp-sys-man		.man
+./usr/share/man/man9/atomic_load_relaxed.9	comp-sys-man		.man
+./usr/share/man/man9/atomic_loadstore.9		comp-sys-man		.man
+./usr/share/man/man9/atomic_store_relaxed.9	comp-sys-man		.man
+./usr/share/man/man9/atomic_store_release.9	comp-sys-man		.man
 ./usr/share/man/man9/atop.9			comp-sys-man		.man
 ./usr/share/man/man9/audio.9			comp-sys-man		.man
 ./usr/share/man/man9/audio_system.9		comp-sys-man		obsolete

Index: src/share/man/man9/Makefile
diff -u src/share/man/man9/Makefile:1.442 src/share/man/man9/Makefile:1.443
--- src/share/man/man9/Makefile:1.442	Fri Nov 29 20:31:35 2019
+++ src/share/man/man9/Makefile	Fri Nov 29 22:17:23 2019
@@ -1,4 +1,4 @@
-#       $NetBSD: Makefile,v 1.442 2019/11/29 20:31:35 riastradh Exp $
+#       $NetBSD: Makefile,v 1.443 2019/11/29 22:17:23 riastradh Exp $
 
 #	Makefile for section 9 (kernel function and variable) manual pages.
 
@@ -67,6 +67,13 @@ MAN=	accept_filter.9 accf_data.9 accf_ht
 	wsbell.9 wscons.9 wsdisplay.9 wsfont.9 wskbd.9 wsmouse.9 \
 	xcall.9
 
+MAN+=	atomic_loadstore.9
+MLINKS+=atomic_loadstore.9 atomic_load_acquire.9 \
+	atomic_loadstore.9 atomic_load_consume.9 \
+	atomic_loadstore.9 atomic_load_relaxed.9 \
+	atomic_loadstore.9 atomic_store_relaxed.9 \
+	atomic_loadstore.9 atomic_store_release.9
+
 MAN+=	boothowto.9
 MLINKS+=boothowto.9 BOOT_FLAG.9
 

Index: src/sys/sys/atomic.h
diff -u src/sys/sys/atomic.h:1.17 src/sys/sys/atomic.h:1.18
--- src/sys/sys/atomic.h:1.17	Thu Nov 14 16:23:53 2019
+++ src/sys/sys/atomic.h	Fri Nov 29 22:17:23 2019
@@ -1,4 +1,4 @@
-/*	$NetBSD: atomic.h,v 1.17 2019/11/14 16:23:53 maxv Exp $	*/
+/*	$NetBSD: atomic.h,v 1.18 2019/11/29 22:17:23 riastradh Exp $	*/
 
 /*-
  * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc.
@@ -383,4 +383,109 @@ __END_DECLS
 #define atomic_inc_64_nv	kmsan_atomic_inc_64_nv
 #endif
 
+#ifdef _KERNEL
+
+#if 1 // XXX: __STDC_VERSION__ < 201112L
+
+/* Pre-C11 definitions */
+
+#include <sys/cdefs.h>
+#include <sys/bitops.h>
+
+#ifdef _LP64
+#define	__HAVE_ATOMIC64_LOADSTORE	1
+#define	__ATOMIC_SIZE_MAX		8
+#else
+#define	__ATOMIC_SIZE_MAX		4
+#endif
+
+/*
+ * We assume that access through an aligned pointer to a volatile
+ * object of at most __ATOMIC_SIZE_MAX bytes is atomic.  This
+ * assumption may be wrong, but we expect to adopt the C11 atomic API
+ * before it matters.
+ */
+#define	__ATOMIC_PTR_CHECK(p) do					      \
+{									      \
+	CTASSERT(sizeof(*(p)) <= __ATOMIC_SIZE_MAX);			      \
+	KASSERT(((uintptr_t)(p) & (sizeof(*(p)) - 1)) == 0);		      \
+} while (0)
+
+#define	atomic_load_relaxed(p)						      \
+({									      \
+	const volatile __typeof__(*(p)) *__al_ptr = (p);		      \
+	__ATOMIC_PTR_CHECK(__al_ptr);					      \
+	*__al_ptr;							      \
+})
+
+#define	atomic_load_consume(p)						      \
+({									      \
+	const volatile __typeof__(*(p)) *__al_ptr = (p);		      \
+	__ATOMIC_PTR_CHECK(__al_ptr);					      \
+	__typeof__(*(p)) __al_val = *__al_ptr;				      \
+	membar_datadep_consumer();					      \
+	__al_val;							      \
+})
+
+/*
+ * We want {loads}-before-{loads,stores}.  It is tempting to use
+ * membar_enter, but that provides {stores}-before-{loads,stores},
+ * which may not help.  So we must use membar_sync, which does the
+ * slightly stronger {loads,stores}-before-{loads,stores}.
+ */
+#define	atomic_load_acquire(p)						      \
+({									      \
+	const volatile __typeof__(*(p)) *__al_ptr = (p);		      \
+	__ATOMIC_PTR_CHECK(__al_ptr);					      \
+	__typeof__(*(p)) __al_val = *__al_ptr;				      \
+	membar_sync();							      \
+	__al_val;							      \
+})
+
+#define	atomic_store_relaxed(p,v)					      \
+({									      \
+	volatile __typeof__(*(p)) *__as_ptr = (p);			      \
+	__ATOMIC_PTR_CHECK(__as_ptr);					      \
+	*__as_ptr = (v);						      \
+})
+
+#define	atomic_store_release(p,v)					      \
+({									      \
+	volatile __typeof__(*(p)) *__as_ptr = (p);			      \
+	__typeof__(*(p)) __as_val = (v);				      \
+	__ATOMIC_PTR_CHECK(__as_ptr);					      \
+	membar_exit();							      \
+	*__as_ptr = __as_val;						      \
+})
+
+#else  /* __STDC_VERSION__ >= 201112L */
+
+/* C11 definitions, not yet available */
+
+#include <stdatomic.h>
+
+#define	atomic_load_relaxed(p)						      \
+	atomic_load_explicit((p), memory_order_relaxed)
+#if 0				/* memory_order_consume is not there yet */
+#define	atomic_load_consume(p)						      \
+	atomic_load_explicit((p), memory_order_consume)
+#else
+#define	atomic_load_consume(p)						      \
+({									      \
+	const __typeof__(*(p)) __al_val = atomic_load_relaxed(p);	      \
+	membar_datadep_consumer();					      \
+	__al_val;							      \
+})
+#endif
+#define	atomic_load_acquire(p)						      \
+	atomic_load_explicit((p), memory_order_acquire)
+#define	atomic_store_relaxed(p, v)					      \
+	atomic_store_explicit((p), (v), memory_order_relaxed)
+#define	atomic_store_release(p, v)					      \
+	atomic_store_explicit((p), (v), memory_order_release)
+
+#endif	/* __STDC_VERSION__ */
+
+#endif	/* _KERNEL */
+
 #endif /* ! _SYS_ATOMIC_H_ */

Added files:

Index: src/share/man/man9/atomic_loadstore.9
diff -u /dev/null src/share/man/man9/atomic_loadstore.9:1.1
--- /dev/null	Fri Nov 29 22:17:24 2019
+++ src/share/man/man9/atomic_loadstore.9	Fri Nov 29 22:17:23 2019
@@ -0,0 +1,744 @@
+.\"	$NetBSD: atomic_loadstore.9,v 1.1 2019/11/29 22:17:23 riastradh Exp $
+.\"
+.\" Copyright (c) 2019 The NetBSD Foundation
+.\" All rights reserved.
+.\"
+.\" This code is derived from software contributed to The NetBSD Foundation
+.\" by Taylor R. Campbell.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.Dd November 25, 2019
+.Dt ATOMIC_LOADSTORE 9
+.Os
+.Sh NAME
+.Nm atomic_load_relaxed ,
+.Nm atomic_load_acquire ,
+.Nm atomic_load_consume ,
+.Nm atomic_store_relaxed ,
+.Nm atomic_store_release
+.Nd atomic and ordered memory operations
+.Sh SYNOPSIS
+.In sys/atomic.h
+.Ft T
+.Fn atomic_load_relaxed "const volatile T *p"
+.Ft T
+.Fn atomic_load_acquire "const volatile T *p"
+.Ft T
+.Fn atomic_load_consume "const volatile T *p"
+.Ft void
+.Fn atomic_store_relaxed "volatile T *p" "T v"
+.Ft void
+.Fn atomic_store_release "volatile T *p" "T v"
+.Sh DESCRIPTION
+These type-generic macros implement memory operations that are
+.Em atomic
+and that have
+.Em memory ordering constraints .
+Aside from atomicity and ordering, the load operations are equivalent
+to
+.Li * Ns Fa p
+and the store operations are equivalent to
+.Li * Ns Fa p Li "=" Fa v .
+The pointer
+.Fa p
+must be aligned, even on architectures like x86 which generally lack
+strict alignment requirements; see
+.Sx SIZE AND ALIGNMENT
+for details.
+.Pp
+.Em Atomic
+means that the memory operations cannot be
+.Em fused
+or
+.Em torn :
+.Bl -bullet
+.It
+.Em Fusing
+is combining multiple memory operations on a single object into one
+memory operation, such as replacing
+.Bd -literal -compact
+	*p = v;
+	x = *p;
+.Ed
+by
+.Bd -literal -compact
+	*p = v;
+	x = v;
+.Ed
+since the compiler can prove that
+.Li * Ns Fa p
+will yield
+.Fa v
+after
+.Li * Ns Fa p Li = Fa v .
+For
+.Em atomic
+memory operations, the implementation
+.Em will not
+assume that
+.Bl -dash -compact
+.It
+consecutive loads of the same object will return the same value, or
+.It
+a store followed by a load of the same object will return the value
+stored, or
+.It
+consecutive stores of the same object are redundant.
+.El
+Thus, the implementation will not replace two consecutive atomic loads
+by one, will not elide an atomic load following a store, and will not
+combine two consecutive atomic stores into one.
+.Pp
+For example,
+.Bd -literal
+	atomic_store_relaxed(&flag, 1);
+	while (atomic_load_relaxed(&flag))
+		continue;
+.Ed
+.Pp
+may be used to set a flag and then busy-wait until another thread
+clears it, whereas
+.Bd -literal
+	flag = 1;
+	while (flag)
+		continue;
+.Ed
+.Pp
+may be transformed into the infinite loop
+.Bd -literal
+	flag = 1;
+	while (1)
+		continue;
+.Ed
+.It
+.Em Tearing
+is implementing a memory operation on a large data unit such as a
+32-bit word by issuing multiple memory operations on smaller data units
+such as 8-bit bytes.
+The implementation will not tear
+.Em atomic
+loads or stores into smaller ones.
+Thus, as far as any interrupt, other thread, or other CPU can tell, an
+atomic memory operation is issued either all at once or not at all.
+.Pp
+For example, if a 32-bit word
+.Fa w
+is written with
+.Li atomic_store_relaxed(& Ns Fa w Ns Li "," 0x00010002) ,
+then an interrupt, other thread, or other CPU reading it with
+.Li atomic_load_relaxed(& Ns Fa w Ns Li ")"
+will never witness it partially written, whereas
+.Fa w Li = 0x00010002
+might be compiled into a pair of separate 16-bit store instructions
+instead of one single word-sized store instruction, in which case other
+threads may see the intermediate state with only one of the halves
+written.
+.El
+.Pp
+Atomic operations on any single object occur in a total order shared by
+all interrupts, threads, and CPUs, which is consistent with the program
+order in every interrupt, thread, and CPU.
+A single program without interruption or other threads or CPUs will
+always observe its own loads and stores in program order, but another
+program in an interrupt handler, in another thread, or on another CPU
+may issue loads that return values as if the first program's stores
+occurred out of program order, and vice versa.
+Two different threads might each observe a third thread's memory
+operations in different orders.
+.Pp
+The
+.Em memory ordering constraints
+make limited guarantees of ordering relative to memory operations on
+.Em other
+objects as witnessed by interrupts, other threads, or other CPUs, and
+have the following meanings:
+.Bl -tag -width relaxed
+.It relaxed
+No ordering relative to memory operations on any other objects is
+guaranteed.
+Relaxed ordering is the default for ordinary non-atomic memory
+operations like
+.Li * Ns Fa p
+and
+.Li * Ns Fa p Li = Fa v .
+.Pp
+Atomic operations with relaxed ordering are cheap: they are not
+read/modify/write atomic operations, and they do not involve any kind
+of inter-CPU ordering barriers.
+.It acquire
+This memory operation happens before all subsequent memory operations
+in program order.
+However, prior memory operations in program order may be reordered to
+happen after this one.
+For example, assuming no aliasing between the pointers, the
+implementation is allowed to treat
+.Bd -literal
+	int x = *p;
+	if (atomic_load_acquire(q)) {
+		int y = *r;
+		*s = x + y;
+		return 1;
+	}
+.Ed
+.Pp
+as if it were
+.Bd -literal
+	if (atomic_load_acquire(q)) {
+		int x = *p;
+		int y = *r;
+		*s = x + y;
+		return 1;
+	}
+.Ed
+.Pp
+but
+.Em not
+as if it were
+.Bd -literal
+	int x = *p;
+	int y = *r;
+	*s = x + y;
+	if (atomic_load_acquire(q)) {
+		return 1;
+	}
+.Ed
+.It consume
+This memory operation happens before all memory operations on objects
+at addresses that are computed from the value returned by this one.
+Otherwise, no ordering relative to memory operations on other objects
+is implied.
+.Pp
+For example, the implementation is allowed to treat
+.Bd -literal
+	struct foo *foo0, *foo1;
+
+	struct foo *f0 = atomic_load_consume(&foo0);
+	struct foo *f1 = atomic_load_consume(&foo1);
+	int x = f0->x;
+	int y = f1->y;
+.Ed
+.Pp
+as if it were
+.Bd -literal
+	struct foo *foo0, *foo1;
+
+	struct foo *f1 = atomic_load_consume(&foo1);
+	struct foo *f0 = atomic_load_consume(&foo0);
+	int y = f1->y;
+	int x = f0->x;
+.Ed
+.Pp
+but loading
+.Li f0->x
+is guaranteed to happen after loading
+.Li foo0
+even if the CPU had a cached value for the address that
+.Li f0->x
+happened to be at, and likewise for
+.Li f1->y
+and
+.Li foo1 .
+.Pp
+.Fn atomic_load_consume
+functions like
+.Fn atomic_load_acquire
+as long as the memory operations that must happen after it are limited
+to addresses that depend on the value returned by it, but it is almost
+always as cheap as
+.Fn atomic_load_relaxed .
+See
+.Sx ACQUIRE OR CONSUME?
+below for more details.
+.It release
+All prior memory operations in program order happen before this one.
+However, subsequent memory operations in program order may be reordered
+to happen before this one too.
+For example, assuming no aliasing between the pointers, the
+implementation is allowed to treat
+.Bd -literal
+	int x = *p;
+	*q = x;
+	atomic_store_release(r, 0);
+	int y = *s;
+	return x + y;
+.Ed
+.Pp
+as if it were
+.Bd -literal
+	int y = *s;
+	int x = *p;
+	*q = x;
+	atomic_store_release(r, 0);
+	return x + y;
+.Ed
+.Pp
+but
+.Em not
+as if it were
+.Bd -literal
+	atomic_store_release(r, 0);
+	int x = *p;
+	int y = *s;
+	*q = x;
+	return x + y;
+.Ed
+.El
+.Ss PAIRING ORDERED MEMORY OPERATIONS
+In general, each
+.Fn atomic_store_release
+.Em must
+be paired with either
+.Fn atomic_load_acquire
+or
+.Fn atomic_load_consume
+in order to have an effect \(em it is only when a release operation
+synchronizes with an acquire or consume operation that any ordering
+guaranteed between memory operations
+.Em before
+the release operation and memory operations
+.Em after
+the acquire/consume operation.
+.Pp
+For example, to set up an entry in a table and then mark the entry
+ready, you should:
+.Bl -enum
+.It
+Perform memory operations to initialize the data.
+.Bd -literal
+	tab[i].x = ...;
+	tab[i].y = ...;
+.Ed
+.It
+Issue
+.Fn atomic_store_release
+to mark it ready.
+.Bd -literal
+	atomic_store_release(&tab[i].ready, 1);
+.Ed
+.It
+Possibly in another thread, issue
+.Fn atomic_load_acquire
+to ascertain whether it is ready.
+.Bd -literal
+	if (atomic_load_acquire(&tab[i].ready) == 0)
+		return EWOULDBLOCK;
+.Ed
+.It
+Perform memory operations to use the data.
+.Bd -literal
+	do_stuff(tab[i].x, tab[i].y);
+.Ed
+.El
+.Pp
+Similarly, if you want to create an object, initialize it, and then
+publish it to be used by another thread, then you should:
+.Bl -enum
+.It
+Perform memory operations to initialize the object.
+.Bd -literal
+	struct mumble *m = kmem_alloc(sizeof(*m), KM_SLEEP);
+	m->x = x;
+	m->y = y;
+	m->z = m->x + m->y;
+.Ed
+.It
+Issue
+.Fn atomic_store_release
+to publish it.
+.Bd -literal
+	atomic_store_release(&the_mumble, m);
+.Ed
+.It
+Possibly in another thread, issue
+.Fn atomic_load_consume
+to get it.
+.Bd -literal
+	struct mumble *m = atomic_load_consume(&the_mumble);
+.Ed
+.It
+Perform memory operations to use the object's members.
+.Bd -literal
+	m->y &= m->x;
+	do_things(m->x, m->y, m->z);
+.Ed
+.El
+.Pp
+In both examples, assuming that the value written by
+.Fn atomic_store_release
+in step\~2
+is read by
+.Fn atomic_load_acquire
+or
+.Fn atomic_load_consume
+in step\~3, this guarantees that all of the memory operations in
+step\~1 complete before any of the memory operations in step\~4 \(em
+even if they happen on different CPUs.
+.Pp
+Without
+.Em both
+the release operation in step\~2
+.Em and
+the acquire or consume operation in step\~3, no ordering is guaranteed
+between the memory operations in steps\~1 and\~4.
+In fact, without
+.Em both
+release and acquire/consume, even the assignment
+.Li m->z = m->x + m->y
+in step\~1 might read values of
+.Li m->x
+and
+.Li m->y
+that were written in step\~4.
+.Ss ACQUIRE OR CONSUME?
+You must use
+.Fn atomic_load_acquire
+when subsequent memory operations in program order that must happen
+after the load are on objects at
+.Em addresses that might not depend arithmetically on the resulting value .
+This applies particularly when the choice of whether to do the
+subsequent memory operation depends on a
+.Em control-flow decision based on the resulting value :
+.Bd -literal
+	struct gadget {
+		int ready, x;
+	} the_gadget;
+
+	/* Producer */
+	the_gadget.x = 42;
+	atomic_store_release(&the_gadget.ready, 1);
+
+	/* Consumer */
+	if (atomic_load_acquire(&the_gadget.ready) == 0)
+		return EWOULDBLOCK;
+	int x = the_gadget.x;
+.Ed
+.Pp
+Here the
+.Em decision of whether to load
+.Li the_gadget.x
+depends on a control-flow decision based on the value loaded from
+.Li the_gadget.ready ,
+and loading
+.Li the_gadget.x
+must happen after loading
+.Li the_gadget.ready .
+Using
+.Fn atomic_load_acquire
+guarantees that the compiler and CPU do not conspire to load
+.Li the_gadget.x
+before we have ascertained that it is ready.
+.Pp
+You may use
+.Fn atomic_load_consume
+if all subsequent memory operations in program order that must happen
+after the load are performed on objects at
+.Em addresses computed arithmetically from the resulting value ,
+such as loading a pointer to a structure object and then dereferencing
+it:
+.Bd -literal
+	struct gizmo {
+		int x, y, z;
+	};
+	struct gizmo null_gizmo;
+	struct gizmo *the_gizmo = &null_gizmo;
+
+	/* Producer */
+	struct gizmo *g = kmem_alloc(sizeof(*g), KM_SLEEP);
+	g->x = 12;
+	g->y = 34;
+	g->z = 56;
+	atomic_store_release(&the_gizmo, g);
+
+	/* Consumer */
+	struct gizmo *g = atomic_load_consume(&the_gizmo);
+	int y = g->y;
+.Ed
+.Pp
+Here the
+.Em address
+of
+.Li g->y
+depends on the value of the pointer loaded from
+.Li the_gizmo .
+Using
+.Fn atomic_load_consume
+guarantees that we do not witness a stale cache for that address.
+.Pp
+In some cases it may be unclear.
+For example:
+.Bd -literal
+	int x[2];
+	bool b;
+
+	/* Producer */
+	x[0] = 42;
+	atomic_store_release(&b, 0);
+
+	/* Consumer 1 */
+	int y = atomic_load_???(&b) ? x[0] : x[1];
+
+	/* Consumer 2 */
+	int y = x[atomic_load_???(&b) ? 0 : 1];
+
+	/* Consumer 3 */
+	int y = x[atomic_load_???(&b) ^ 1];
+.Ed
+.Pp
+Although the three consumers seem to be equivalent, by the letter of
+C11, consumers\~1 and\~2 require
+.Fn atomic_load_acquire
+because the value determines the address of a subsequent load only via
+control-flow decisions in the
+.Li ?:
+operator, whereas consumer\~3 can use
+.Fn atomic_load_consume .
+However, if you're not sure, you should err on the side of
+.Fn atomic_load_acquire
+until C11 implementations have ironed out the kinks in the semantics.
+.Pp
+On all CPUs other than DEC Alpha,
+.Fn atomic_load_consume
+is cheap \(em it is identical to
+.Fn atomic_load_relaxed .
+In contrast,
+.Fn atomic_load_acquire
+usually implies an expensive memory barrier.
+.Ss SIZE AND ALIGNMENT
+The pointer
+.Fa p
+must be aligned \(em that is, if the object it points to is
+.No 2^ Ns Fa n
+bytes long, then the low-order
+.Fa n
+bits of
+.Fa p
+must be zero.
+.Pp
+All
+.Nx
+ports support atomic loads and stores on units of data up to 32 bits.
+Some ports additionally support atomic loads and stores on larger
+quantities, like 64-bit quantities, if
+.Dv __HAVE_ATOMIC64_LOADSTORE
+is defined.
+The macros may not be used on quantities of data larger than the port
+supports atomically; attempting to do so should result in a
+compile-time assertion failure.
+.Pp
+For example, as long as you use
+.Fn atomic_store_*
+to write a 32-bit quantity, you can safely use
+.Fn atomic_load_relaxed
+to optimistically read it outside a lock, but for a 64-bit quantity it
+must be conditional on
+.Dv __HAVE_ATOMIC64_LOADSTORE
+\(em otherwise it will lead to compile-time errors on platforms without
+64-bit atomic loads and stores:
+.Bd -literal
+	struct foo {
+		kmutex_t	f_lock;
+		uint32_t	f_refcnt;
+		uint64_t	f_ticket;
+	};
+
+	if (atomic_load_relaxed(&foo->f_refcnt) == 0)
+		return 123;
+#ifdef __HAVE_ATOMIC64_LOADSTORE
+	if (atomic_load_relaxed(&foo->f_ticket) == ticket)
+		return 123;
+#endif
+	mutex_enter(&foo->f_lock);
+	if (foo->f_refcnt == 0 || foo->f_ticket == ticket)
+		ret = 123;
+	...
+#ifdef __HAVE_ATOMIC64_LOADSTORE
+	atomic_store_relaxed(&foo->f_ticket, foo->f_ticket + 1);
+#else
+	foo->f_ticket++;
+#endif
+	...
+	mutex_exit(&foo->f_lock);
+.Ed
+.Sh C11 COMPATIBILITY
+These macros are meant to follow
+.Tn C11
+semantics, in terms of
+.Fn atomic_load_explicit
+and
+.Fn atomic_store_explicit
+with the appropriate memory order specifiers, and are meant to make
+future adoption of the
+.Tn C11
+atomic API easier.
+Eventually it may be mandatory to use the
+.Tn C11
+.Vt _Atomic
+type qualifier or equivalent for the operands.
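+.Pp
+For illustration, the intended correspondence is roughly:
+.Bd -literal
+atomic_load_relaxed(p)      atomic_load_explicit(p, memory_order_relaxed)
+atomic_load_consume(p)      atomic_load_explicit(p, memory_order_consume)
+atomic_load_acquire(p)      atomic_load_explicit(p, memory_order_acquire)
+atomic_store_relaxed(p, v)  atomic_store_explicit(p, v, memory_order_relaxed)
+atomic_store_release(p, v)  atomic_store_explicit(p, v, memory_order_release)
+.Ed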
+.Sh LINUX ANALOGUES
+The Linux kernel provides two macros
+.Fn READ_ONCE x
+and
+.Fn WRITE_ONCE x v
+which are similar to
+.Li atomic_load_consume(& Ns Fa x Ns Li ")"
+and
+.Li atomic_store_relaxed(& Ns Fa x Ns Li "," Fa v Ns Li ")" ,
+respectively.
+However, while Linux's
+.Fn READ_ONCE
+and
+.Fn WRITE_ONCE
+prevent fusing, they may in some cases be torn \(em and therefore fail
+to guarantee atomicity \(em because:
+.Bl -bullet
+.It
+They do not require the address
+.Li & Ns Fa x
+to be aligned.
+.It
+They do not require
+.Li sizeof( Ns Fa x Ns Li ")"
+to be at most the largest size of available atomic loads and stores on
+the host architecture.
+.El
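+.Pp
+As a rough sketch of the correspondence, assuming
+.Fa x
+is aligned and of a size the port supports atomically:
+.Bd -literal
+	/* Linux */			/* NetBSD */
+	v = READ_ONCE(x);		v = atomic_load_consume(&x);
+	WRITE_ONCE(x, v);		atomic_store_relaxed(&x, v);
+.Ed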
+.Sh EXAMPLES
+Maintaining lossy counters.
+These may lose some counts, because the read/modify/write cycle as a
+whole is not atomic.
+But this guarantees that the count will increase by at most one each
+time.
+In contrast, without atomic operations, in principle a write to a
+32-bit counter might be torn into multiple smaller stores, which could
+appear to happen out of order from another CPU's perspective, leading
+to nonsensical counter readouts.
+(For frequent events, consider using per-CPU counters instead in
+practice.)
+.Bd -literal
+	unsigned count;
+
+	void
+	record_event(void)
+	{
+		atomic_store_relaxed(&count,
+		    1 + atomic_load_relaxed(&count));
+	}
+
+	unsigned
+	read_event_count(void)
+	{
+
+		return atomic_load_relaxed(&count);
+	}
+.Ed
+.Pp
+Initialization barrier.
+.Bd -literal
+	int ready;
+	struct data d;
+
+	void
+	setup_and_notify(void)
+	{
+
+		setup_data(&d.things);
+		atomic_store_release(&ready, 1);
+	}
+
+	void
+	try_if_ready(void)
+	{
+
+		if (atomic_load_acquire(&ready))
+			do_stuff(d.things);
+	}
+.Ed
+.Pp
+Publishing a pointer to the current snapshot of data.
+(Caller must arrange that only one call to take_snapshot happens at any
+given time; generally this should be done in coordination with
+.Xr pserialize 9
+or similar to enable resource reclamation.)
+.Bd -literal
+	struct data *current_d;
+
+	void
+	take_snapshot(void)
+	{
+		struct data *d = kmem_alloc(sizeof(*d), KM_SLEEP);
+
+		d->things = ...;
+
+		atomic_store_release(&current_d, d);
+	}
+
+	struct data *
+	get_snapshot(void)
+	{
+
+		return atomic_load_consume(&current_d);
+	}
+.Ed
+.Sh CODE REFERENCES
+.Pa sys/sys/atomic.h
+.Sh SEE ALSO
+.Xr atomic_ops 3 ,
+.Xr membar_ops 3 ,
+.Xr pserialize 9
+.Sh HISTORY
+These atomic operations first appeared in
+.Nx 10.0 .
+.Sh CAVEATS
+C11 formally specifies that all subexpressions, except the left
+operands of the
+.Li "&&" , "||" , "?:" ,
+and
+.Li ","
+operators and the
+.Fn kill_dependency
+macro, carry dependencies for which
+.Dv memory_order_consume
+guarantees ordering, but most or all implementations to date simply
+treat
+.Dv memory_order_consume
+as
+.Dv memory_order_acquire
+and do not take advantage of data dependencies to elide costly memory
+barriers or load-acquire CPU instructions.
+.Pp
+Instead, we implement
+.Fn atomic_load_consume
+as
+.Fn atomic_load_relaxed
+followed by
+.Xr membar_datadep_consumer 3 ,
+which is equivalent to
+.Xr membar_consumer 3
+on DEC Alpha and
+.Xr __insn_barrier 3
+elsewhere.
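+.Pp
+Schematically, this amounts to:
+.Bd -literal
+	__typeof__(*p) v = atomic_load_relaxed(p);
+	membar_datadep_consumer();
+	/* ... use v ... */
+.Ed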
+.Sh BUGS
+Some idiot decided to call it
+.Em tearing ,
+depriving us of the opportunity to say that atomic operations prevent
+fusion and
+.Em fission .
