git: 9c0b759bf9b5 - main - x86 atomics: use lock prefix unconditionally

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Fri, 04 Feb 2022 12:01:45 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=9c0b759bf9b520537616d026f21a0a98d70acd11

commit 9c0b759bf9b520537616d026f21a0a98d70acd11
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2022-02-03 09:51:36 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2022-02-04 12:01:39 +0000

    x86 atomics: use lock prefix unconditionally
    
    Atomics have significant uses besides providing in-system primitives
    for safe memory updates.  They are also used to implement
    communication with out-of-system software or hardware that follows
    some protocol.
    
    For instance, even a UP kernel might require a protocol that uses
    atomics to communicate with a software-emulated device on an SMP
    hypervisor.  Or real hardware might need atomic accesses as part of
    its proper management protocol.
    
    Another point is that UP configurations on x86 are extinct, so the
    slight performance hit of using proper atomics unconditionally does
    not matter.  It is compensated for by less code clutter, which in
    fact improves the life expectancy of UP/i386 support.
    
    Requested by:   Elliott Mitchell <ehem+freebsd@m5p.com>
    Reviewed by:    Elliott Mitchell, imp, jhb, markj, royger
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D34153
---
 sys/amd64/include/atomic.h | 67 +++++++++++-----------------------------------
 sys/i386/include/atomic.h  | 52 +++++++++++------------------------
 2 files changed, 32 insertions(+), 87 deletions(-)
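
As a quick orientation before the diff: the change boils down to writing the
"lock; " prefix directly into each asm template instead of going through the
MPLOCKED macro, which previously expanded to nothing on non-SMP kernel builds.
The sketch below is an illustrative standalone program, not the header code;
it assumes an x86 or amd64 host with GCC- or Clang-style inline asm, and the
demo_* names are invented for the example.

/*
 * Standalone sketch of the unconditional lock prefix; x86/amd64 only,
 * GCC or Clang inline asm.  The demo_* names are not FreeBSD API.
 */
#include <stdio.h>

/* Roughly what the ATOMIC_ASM() macro in the diff generates for the
 * "add int" case, with "lock; " hard-coded instead of MPLOCKED. */
static inline void
demo_atomic_add_int(volatile unsigned int *p, unsigned int v)
{
	__asm __volatile("lock; addl %1,%0"
	    : "+m" (*p)			/* 0 */
	    : "ir" (v)			/* 1 */
	    : "cc");
}

/* lock; xaddl: add v to *p and hand back the previous value, in the
 * style of atomic_fetchadd_int() from the diff. */
static inline unsigned int
demo_atomic_fetchadd_int(volatile unsigned int *p, unsigned int v)
{
	__asm __volatile("lock; xaddl %0,%1"
	    : "+r" (v),			/* 0 */
	      "+m" (*p)			/* 1 */
	    : : "cc");
	return (v);
}

int
main(void)
{
	volatile unsigned int counter = 40;

	demo_atomic_add_int(&counter, 1);		/* counter = 41 */
	unsigned int old = demo_atomic_fetchadd_int(&counter, 1);
	printf("old=%u counter=%u\n", old, counter);	/* old=41 counter=42 */
	return (0);
}

The lock prefix is what makes each read-modify-write atomic with respect to
every other agent touching the same memory, which, per the commit message,
matters even on a UP guest whose devices are emulated by an SMP hypervisor.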

diff --git a/sys/amd64/include/atomic.h b/sys/amd64/include/atomic.h
index ed80c426add2..f0191970e8cc 100644
--- a/sys/amd64/include/atomic.h
+++ b/sys/amd64/include/atomic.h
@@ -143,16 +143,10 @@ void		atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
 #else /* !__GNUCLIKE_ASM */
 
 /*
- * For userland, always use lock prefixes so that the binaries will run
- * on both SMP and !SMP systems.
- */
-#if defined(SMP) || !defined(_KERNEL) || defined(KLD_MODULE)
-#define	MPLOCKED	"lock ; "
-#else
-#define	MPLOCKED
-#endif
-
-/*
+ * Always use lock prefixes.  The result is slightly less optimal for
+ * UP systems, but it matters less now, and sometimes UP is emulated
+ * over SMP.
+ *
  * The assembly is volatilized to avoid code chunk removal by the compiler.
  * GCC aggressively reorders operations and memory clobbering is necessary
  * in order to avoid that for memory barriers.
@@ -161,7 +155,7 @@ void		atomic_store_rel_##TYPE(volatile u_##TYPE *p, u_##TYPE v)
 static __inline void					\
 atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
 {							\
-	__asm __volatile(MPLOCKED OP			\
+	__asm __volatile("lock; " OP			\
 	: "+m" (*p)					\
 	: CONS (V)					\
 	: "cc");					\
@@ -170,7 +164,7 @@ atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
 static __inline void					\
 atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
 {							\
-	__asm __volatile(MPLOCKED OP			\
+	__asm __volatile("lock; " OP			\
 	: "+m" (*p)					\
 	: CONS (V)					\
 	: "memory", "cc");				\
@@ -199,8 +193,7 @@ atomic_cmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE expect, u_##TYPE src) \
 	u_char res;					\
 							\
 	__asm __volatile(				\
-	"	" MPLOCKED "		"		\
-	"	cmpxchg %3,%1 ;	"			\
+	" lock; cmpxchg %3,%1 ;	"			\
 	"# atomic_cmpset_" #TYPE "	"		\
 	: "=@cce" (res),		/* 0 */		\
 	  "+m" (*dst),			/* 1 */		\
@@ -216,8 +209,7 @@ atomic_fcmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE *expect, u_##TYPE src) \
 	u_char res;					\
 							\
 	__asm __volatile(				\
-	"	" MPLOCKED "		"		\
-	"	cmpxchg %3,%1 ;		"		\
+	" lock; cmpxchg %3,%1 ;		"		\
 	"# atomic_fcmpset_" #TYPE "	"		\
 	: "=@cce" (res),		/* 0 */		\
 	  "+m" (*dst),			/* 1 */		\
@@ -241,8 +233,7 @@ atomic_fetchadd_int(volatile u_int *p, u_int v)
 {
 
 	__asm __volatile(
-	"	" MPLOCKED "		"
-	"	xaddl	%0,%1 ;		"
+	" lock; xaddl	%0,%1 ;		"
 	"# atomic_fetchadd_int"
 	: "+r" (v),			/* 0 */
 	  "+m" (*p)			/* 1 */
@@ -259,8 +250,7 @@ atomic_fetchadd_long(volatile u_long *p, u_long v)
 {
 
 	__asm __volatile(
-	"	" MPLOCKED "		"
-	"	xaddq	%0,%1 ;		"
+	" lock;	xaddq	%0,%1 ;		"
 	"# atomic_fetchadd_long"
 	: "+r" (v),			/* 0 */
 	  "+m" (*p)			/* 1 */
@@ -274,8 +264,7 @@ atomic_testandset_int(volatile u_int *p, u_int v)
 	u_char res;
 
 	__asm __volatile(
-	"	" MPLOCKED "		"
-	"	btsl	%2,%1 ;		"
+	" lock;	btsl	%2,%1 ;		"
 	"# atomic_testandset_int"
 	: "=@ccc" (res),		/* 0 */
 	  "+m" (*p)			/* 1 */
@@ -290,8 +279,7 @@ atomic_testandset_long(volatile u_long *p, u_int v)
 	u_char res;
 
 	__asm __volatile(
-	"	" MPLOCKED "		"
-	"	btsq	%2,%1 ;		"
+	" lock;	btsq	%2,%1 ;		"
 	"# atomic_testandset_long"
 	: "=@ccc" (res),		/* 0 */
 	  "+m" (*p)			/* 1 */
@@ -306,8 +294,7 @@ atomic_testandclear_int(volatile u_int *p, u_int v)
 	u_char res;
 
 	__asm __volatile(
-	"	" MPLOCKED "		"
-	"	btrl	%2,%1 ;		"
+	" lock;	btrl	%2,%1 ;		"
 	"# atomic_testandclear_int"
 	: "=@ccc" (res),		/* 0 */
 	  "+m" (*p)			/* 1 */
@@ -322,8 +309,7 @@ atomic_testandclear_long(volatile u_long *p, u_int v)
 	u_char res;
 
 	__asm __volatile(
-	"	" MPLOCKED "		"
-	"	btrq	%2,%1 ;		"
+	" lock;	btrq	%2,%1 ;		"
 	"# atomic_testandclear_long"
 	: "=@ccc" (res),		/* 0 */
 	  "+m" (*p)			/* 1 */
@@ -344,39 +330,18 @@ atomic_testandclear_long(volatile u_long *p, u_int v)
  * special address for "mem".  In the kernel, we use a private per-cpu
  * cache line.  In user space, we use a word in the stack's red zone
  * (-8(%rsp)).
- *
- * For UP kernels, however, the memory of the single processor is
- * always consistent, so we only need to stop the compiler from
- * reordering accesses in a way that violates the semantics of acquire
- * and release.
  */
 
-#if defined(_KERNEL)
-
-#if defined(SMP) || defined(KLD_MODULE)
 static __inline void
 __storeload_barrier(void)
 {
-
+#if defined(_KERNEL)
 	__asm __volatile("lock; addl $0,%%gs:%0"
 	    : "+m" (*(u_int *)OFFSETOF_MONITORBUF) : : "memory", "cc");
-}
-#else /* _KERNEL && UP */
-static __inline void
-__storeload_barrier(void)
-{
-
-	__compiler_membar();
-}
-#endif /* SMP */
 #else /* !_KERNEL */
-static __inline void
-__storeload_barrier(void)
-{
-
 	__asm __volatile("lock; addl $0,-8(%%rsp)" : : : "memory", "cc");
-}
 #endif /* _KERNEL*/
+}
 
 #define	ATOMIC_LOAD(TYPE)					\
 static __inline u_##TYPE					\
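
(The i386 header below receives the same treatment.)  Two further idioms from
the hunks above can be sketched the same way, again as an illustrative
standalone program rather than the header code; it assumes an amd64 userland
host with GCC- or Clang-style inline asm, and the demo_* names are invented.
demo_cmpset_int follows the i386 header's "lock; cmpxchg" plus "sete" shape,
whereas the amd64 header returns ZF through the "=@cce" flag-output constraint.
demo_storeload_barrier mirrors the amd64 userland __storeload_barrier(): a
locked no-op add on the stack red zone serves as a full store/load fence,
while the kernel flavor targets a private per-CPU cache line
(OFFSETOF_MONITORBUF).

/*
 * Standalone sketch of the cmpset and store/load-fence idioms above.
 * amd64 userland, GCC or Clang; demo_* names are not FreeBSD API.
 */
#include <stdio.h>

/* i386-header shape: lock; cmpxchg + sete.  Returns 1 if *dst matched
 * expect and was replaced by src, 0 otherwise. */
static inline int
demo_cmpset_int(volatile unsigned int *dst, unsigned int expect,
    unsigned int src)
{
	unsigned char res;

	__asm __volatile(
	"	lock; cmpxchg	%3,%1 ;	"
	"	sete	%0"
	: "=q" (res),			/* 0 */
	  "+m" (*dst),			/* 1 */
	  "+a" (expect)			/* 2: compared value, in %eax */
	: "r" (src)			/* 3 */
	: "memory", "cc");
	return (res);
}

/* Locked no-op add on the red zone: a full store/load barrier that is
 * typically cheaper than MFENCE. */
static inline void
demo_storeload_barrier(void)
{
	__asm __volatile("lock; addl $0,-8(%%rsp)" : : : "memory", "cc");
}

int
main(void)
{
	volatile unsigned int word = 1;
	int ok;

	demo_storeload_barrier();
	ok = demo_cmpset_int(&word, 1, 2);
	printf("cmpset 1->2: %d, word=%u\n", ok, word);	/* 1, word=2 */
	ok = demo_cmpset_int(&word, 1, 3);
	printf("cmpset 1->3: %d, word=%u\n", ok, word);	/* 0, word stays 2 */
	return (0);
}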
diff --git a/sys/i386/include/atomic.h b/sys/i386/include/atomic.h
index c65a7ac83a93..2e72de935c23 100644
--- a/sys/i386/include/atomic.h
+++ b/sys/i386/include/atomic.h
@@ -141,16 +141,10 @@ void		atomic_subtract_64(volatile uint64_t *, uint64_t);
 #else /* !__GNUCLIKE_ASM */
 
 /*
- * For userland, always use lock prefixes so that the binaries will run
- * on both SMP and !SMP systems.
- */
-#if defined(SMP) || !defined(_KERNEL) || defined(KLD_MODULE)
-#define	MPLOCKED	"lock ; "
-#else
-#define	MPLOCKED
-#endif
-
-/*
+ * Always use lock prefixes.  The result is slightly less optimal for
+ * UP systems, but it matters less now, and sometimes UP is emulated
+ * over SMP.
+ *
  * The assembly is volatilized to avoid code chunk removal by the compiler.
  * GCC aggressively reorders operations and memory clobbering is necessary
  * in order to avoid that for memory barriers.
@@ -159,7 +153,7 @@ void		atomic_subtract_64(volatile uint64_t *, uint64_t);
 static __inline void					\
 atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
 {							\
-	__asm __volatile(MPLOCKED OP			\
+	__asm __volatile("lock; " OP			\
 	: "+m" (*p)					\
 	: CONS (V)					\
 	: "cc");					\
@@ -168,7 +162,7 @@ atomic_##NAME##_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
 static __inline void					\
 atomic_##NAME##_barr_##TYPE(volatile u_##TYPE *p, u_##TYPE v)\
 {							\
-	__asm __volatile(MPLOCKED OP			\
+	__asm __volatile("lock; " OP			\
 	: "+m" (*p)					\
 	: CONS (V)					\
 	: "memory", "cc");				\
@@ -197,8 +191,7 @@ atomic_cmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE expect, u_##TYPE src) \
 	u_char res;					\
 							\
 	__asm __volatile(				\
-	"	" MPLOCKED "		"		\
-	"	cmpxchg	%3,%1 ;		"		\
+	"	lock; cmpxchg	%3,%1 ;	"		\
 	"	sete	%0 ;		"		\
 	"# atomic_cmpset_" #TYPE "	"		\
 	: "=q" (res),			/* 0 */		\
@@ -215,8 +208,7 @@ atomic_fcmpset_##TYPE(volatile u_##TYPE *dst, u_##TYPE *expect, u_##TYPE src) \
 	u_char res;					\
 							\
 	__asm __volatile(				\
-	"	" MPLOCKED "		"		\
-	"	cmpxchg	%3,%1 ;		"		\
+	"	lock; cmpxchg	%3,%1 ;	"		\
 	"	sete	%0 ;		"		\
 	"# atomic_fcmpset_" #TYPE "	"		\
 	: "=q" (res),			/* 0 */		\
@@ -240,8 +232,7 @@ atomic_fetchadd_int(volatile u_int *p, u_int v)
 {
 
 	__asm __volatile(
-	"	" MPLOCKED "		"
-	"	xaddl	%0,%1 ;		"
+	"	lock; xaddl	%0,%1 ;	"
 	"# atomic_fetchadd_int"
 	: "+r" (v),			/* 0 */
 	  "+m" (*p)			/* 1 */
@@ -255,8 +246,7 @@ atomic_testandset_int(volatile u_int *p, u_int v)
 	u_char res;
 
 	__asm __volatile(
-	"	" MPLOCKED "		"
-	"	btsl	%2,%1 ;		"
+	"	lock; btsl	%2,%1 ;	"
 	"	setc	%0 ;		"
 	"# atomic_testandset_int"
 	: "=q" (res),			/* 0 */
@@ -272,8 +262,7 @@ atomic_testandclear_int(volatile u_int *p, u_int v)
 	u_char res;
 
 	__asm __volatile(
-	"	" MPLOCKED "		"
-	"	btrl	%2,%1 ;		"
+	"	lock; btrl	%2,%1 ;	"
 	"	setc	%0 ;		"
 	"# atomic_testandclear_int"
 	: "=q" (res),			/* 0 */
@@ -303,11 +292,7 @@ atomic_testandclear_int(volatile u_int *p, u_int v)
  */
 
 #if defined(_KERNEL)
-#if defined(SMP) || defined(KLD_MODULE)
 #define	__storeload_barrier()	__mbk()
-#else /* _KERNEL && UP */
-#define	__storeload_barrier()	__compiler_membar()
-#endif /* SMP */
 #else /* !_KERNEL */
 #define	__storeload_barrier()	__mbu()
 #endif /* _KERNEL*/
@@ -484,8 +469,7 @@ atomic_cmpset_64_i586(volatile uint64_t *dst, uint64_t expect, uint64_t src)
 	u_char res;
 
 	__asm __volatile(
-	"	" MPLOCKED "		"
-	"	cmpxchg8b %1 ;		"
+	"	lock; cmpxchg8b %1 ;	"
 	"	sete	%0"
 	: "=q" (res),			/* 0 */
 	  "+m" (*dst),			/* 1 */
@@ -502,8 +486,7 @@ atomic_fcmpset_64_i586(volatile uint64_t *dst, uint64_t *expect, uint64_t src)
 	u_char res;
 
 	__asm __volatile(
-	"	" MPLOCKED "		"
-	"	cmpxchg8b %1 ;		"
+	"	lock; cmpxchg8b %1 ;	"
 	"	sete	%0"
 	: "=q" (res),			/* 0 */
 	  "+m" (*dst),			/* 1 */
@@ -522,8 +505,7 @@ atomic_load_acq_64_i586(volatile uint64_t *p)
 	__asm __volatile(
 	"	movl	%%ebx,%%eax ;	"
 	"	movl	%%ecx,%%edx ;	"
-	"	" MPLOCKED "		"
-	"	cmpxchg8b %1"
+	"	lock; cmpxchg8b %1"
 	: "=&A" (res),			/* 0 */
 	  "+m" (*p)			/* 1 */
 	: : "memory", "cc");
@@ -538,8 +520,7 @@ atomic_store_rel_64_i586(volatile uint64_t *p, uint64_t v)
 	"	movl	%%eax,%%ebx ;	"
 	"	movl	%%edx,%%ecx ;	"
 	"1:				"
-	"	" MPLOCKED "		"
-	"	cmpxchg8b %0 ;		"
+	"	lock; cmpxchg8b %0 ;	"
 	"	jne	1b"
 	: "+m" (*p),			/* 0 */
 	  "+A" (v)			/* 1 */
@@ -554,8 +535,7 @@ atomic_swap_64_i586(volatile uint64_t *p, uint64_t v)
 	"	movl	%%eax,%%ebx ;	"
 	"	movl	%%edx,%%ecx ;	"
 	"1:				"
-	"	" MPLOCKED "		"
-	"	cmpxchg8b %0 ;		"
+	"	lock; cmpxchg8b %0 ;	"
 	"	jne	1b"
 	: "+m" (*p),			/* 0 */
 	  "+A" (v)			/* 1 */