git: 4a69fc16a583 - main - Add membarrier(2)

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Wed, 23 Aug 2023 00:07:38 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=4a69fc16a583face922319c476f3e739d9ce9140

commit 4a69fc16a583face922319c476f3e739d9ce9140
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2021-10-07 21:10:07 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2023-08-23 00:02:21 +0000

    Add membarrier(2)
    
    This is an attempt at a clean-room implementation of Linux's
    membarrier(2) syscall.  For documentation, you would need to read
    the Linux membarrier(2) man page and the comments in the Linux
    kernel/sched/membarrier.c implementation, and possibly look at
    actual uses.
    
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D32360
---
 lib/libc/sys/Symbol.map    |   1 +
 sys/conf/files             |   1 +
 sys/kern/kern_exec.c       |   2 +
 sys/kern/kern_membarrier.c | 239 +++++++++++++++++++++++++++++++++++++++++++++
 sys/kern/syscalls.master   |   9 +-
 sys/sys/membarrier.h       |  70 +++++++++++++
 sys/sys/proc.h             |   6 ++
 sys/sys/syscallsubr.h      |   2 +
 8 files changed, 329 insertions(+), 1 deletion(-)

diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index c09c3696c17b..9a07bb457eb8 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -419,6 +419,7 @@ FBSD_1.7 {
 	_Fork;
 	fspacectl;
 	kqueuex;
+	membarrier;
 	swapoff;
 };
 
diff --git a/sys/conf/files b/sys/conf/files
index b5cd85cba0e4..3f79ce752c80 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3780,6 +3780,7 @@ kern/kern_lockstat.c		optional kdtrace_hooks
 kern/kern_loginclass.c		standard
 kern/kern_malloc.c		standard
 kern/kern_mbuf.c		standard
+kern/kern_membarrier.c		standard
 kern/kern_mib.c			standard
 kern/kern_module.c		standard
 kern/kern_mtxpool.c		standard
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index 6b0798921da2..ae2b624c2659 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -805,6 +805,8 @@ interpret:
 		p->p_flag2 &= ~P2_NOTRACE;
 	if ((p->p_flag2 & P2_STKGAP_DISABLE_EXEC) == 0)
 		p->p_flag2 &= ~P2_STKGAP_DISABLE;
+	p->p_flag2 &= ~(P2_MEMBAR_PRIVE | P2_MEMBAR_PRIVE_SYNCORE |
+	    P2_MEMBAR_GLOBE);
 	if (p->p_flag & P_PPWAIT) {
 		p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
 		cv_broadcast(&p->p_pwait);
diff --git a/sys/kern/kern_membarrier.c b/sys/kern/kern_membarrier.c
new file mode 100644
index 000000000000..eabd00e8ddf4
--- /dev/null
+++ b/sys/kern/kern_membarrier.c
@@ -0,0 +1,239 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+#include <sys/lock.h>
+#include <sys/membarrier.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysproto.h>
+
+#include <vm/vm_param.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+
+#define MEMBARRIER_SUPPORTED_CMDS	(	/* valid cmd bits */	\
+    MEMBARRIER_CMD_GLOBAL |					\
+    MEMBARRIER_CMD_GLOBAL_EXPEDITED |				\
+    MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED |			\
+    MEMBARRIER_CMD_PRIVATE_EXPEDITED |				\
+    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED |			\
+    MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE |		\
+    MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
+
+static void	/* callback for do_membarrier_ipi(): issue a seq_cst fence */
+membarrier_action_seqcst(void *arg __unused)
+{
+	atomic_thread_fence_seq_cst();
+}
+
+static void	/* callback: seq_cst fence, then cpu_sync_core() */
+membarrier_action_seqcst_sync_core(void *arg __unused)
+{
+	atomic_thread_fence_seq_cst();
+	cpu_sync_core();
+}
+
+static void	/* hand func to smp_rendezvous_cpus() for the CPUs in *csp */
+do_membarrier_ipi(cpuset_t *csp, void (*func)(void *))
+{
+	atomic_thread_fence_seq_cst();	/* fence before the rendezvous */
+	smp_rendezvous_cpus(*csp, smp_no_rendezvous_barrier, func,
+	    smp_no_rendezvous_barrier, NULL);	/* func gets arg == NULL */
+	atomic_thread_fence_seq_cst();	/* ... and again after it */
+}
+
+static void	/* mark c in *csp once idle or pc_switchtime has changed */
+check_cpu_switched(int c, cpuset_t *csp, uint64_t *swt, bool init)
+{
+	struct pcpu *pc;
+	uint64_t sw;
+
+	if (CPU_ISSET(c, csp))		/* already accounted for */
+		return;
+
+	pc = cpuid_to_pcpu[c];
+	if (pc->pc_curthread == pc->pc_idlethread) {	/* CPU is idle */
+		CPU_SET(c, csp);
+		return;
+	}
+
+	/*
+	 * Sync with context switch to ensure that override of
+	 * pc_curthread with non-idle thread pointer is visible before
+	 * reading of pc_switchtime.
+	 */
+	atomic_thread_fence_acq();
+
+	sw = pc->pc_switchtime;
+	if (init)
+		swt[c] = sw;		/* first pass: record baseline */
+	else if (sw != swt[c])
+		CPU_SET(c, csp);	/* pc_switchtime changed since baseline */
+}
+
+/*
+ * Implements membarrier(2); returns 0 or an errno.  cpu_id is unused.
+ * XXXKIB: We execute the requested action (seq_cst and possibly
+ * sync_core) on current CPU as well.  There is no guarantee that
+ * current thread executes anything with the full fence semantics
+ * during syscall execution.  Similarly, cpu_sync_core() semantics
+ * might not be provided by the syscall return.  E.g. on amd64 we
+ * typically return without IRET.
+ */
+int
+kern_membarrier(struct thread *td, int cmd, unsigned flags, int cpu_id)
+{
+	struct proc *p, *p1;
+	struct thread *td1;
+	cpuset_t cs;
+	uint64_t *swt;
+	int c, error;
+	bool first;
+
+	if (flags != 0 || (cmd & ~MEMBARRIER_SUPPORTED_CMDS) != 0)
+		return (EINVAL);	/* no flags accepted; unknown cmd bit */
+
+	if (cmd == MEMBARRIER_CMD_QUERY) {	/* report the supported mask */
+		td->td_retval[0] = MEMBARRIER_SUPPORTED_CMDS;
+		return (0);
+	}
+
+	p = td->td_proc;
+	error = 0;
+
+	switch (cmd) {
+	case MEMBARRIER_CMD_GLOBAL:
+		swt = malloc((mp_maxid + 1) * sizeof(*swt), M_TEMP, M_WAITOK);
+		CPU_ZERO(&cs);		/* cs: CPUs already accounted for */
+		sched_pin();
+		CPU_SET(PCPU_GET(cpuid), &cs);	/* including this one */
+		for (first = true; error == 0; first = false) {
+			CPU_FOREACH(c)
+				check_cpu_switched(c, &cs, swt, first);
+			if (CPU_CMP(&cs, &all_cpus) == 0)
+				break;	/* every CPU is marked */
+			error = pause_sig("mmbr", 1);	/* wait, then re-poll */
+			if (error == EWOULDBLOCK)
+				error = 0;	/* not a failure here */
+		}
+		sched_unpin();
+		free(swt, M_TEMP);
+		atomic_thread_fence_seq_cst();
+		break;
+
+	case MEMBARRIER_CMD_GLOBAL_EXPEDITED:
+		if ((td->td_proc->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
+			error = EPERM;	/* caller has not registered */
+		} else {
+			CPU_ZERO(&cs);	/* build the IPI target set */
+			CPU_FOREACH(c) {
+				td1 = cpuid_to_pcpu[c]->pc_curthread;
+				p1 = td1->td_proc;
+				if (p1 != NULL &&	/* registered proc? */
+				    (p1->p_flag2 & P2_MEMBAR_GLOBE) != 0)
+					CPU_SET(c, &cs);
+			}
+			do_membarrier_ipi(&cs, membarrier_action_seqcst);
+		}
+		break;
+
+	case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
+		if ((p->p_flag2 & P2_MEMBAR_GLOBE) == 0) {
+			PROC_LOCK(p);	/* flag is set under the proc lock */
+			p->p_flag2 |= P2_MEMBAR_GLOBE;
+			PROC_UNLOCK(p);
+		}
+		break;
+
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
+			error = EPERM;	/* caller has not registered */
+		} else {	/* IPI the CPUs in our pmap's active mask */
+			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
+			do_membarrier_ipi(&cs, membarrier_action_seqcst);
+		}
+		break;
+
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
+		if ((p->p_flag2 & P2_MEMBAR_PRIVE) == 0) {
+			PROC_LOCK(p);
+			p->p_flag2 |= P2_MEMBAR_PRIVE;
+			PROC_UNLOCK(p);
+		}
+		break;
+
+	case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
+		if ((td->td_proc->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
+			error = EPERM;	/* caller has not registered */
+		} else {
+			/*
+			 * Calculating the IPI multicast mask from
+			 * pmap active mask means that we do not call
+			 * cpu_sync_core() on CPUs that were missed
+			 * from pmap active mask but could be switched
+			 * from or to meantime.  This is fine at least
+			 * on amd64 because threads always use slow
+			 * (IRETQ) path to return from syscall after
+			 * context switch.
+			 */
+			pmap_active_cpus(vmspace_pmap(p->p_vmspace), &cs);
+
+			do_membarrier_ipi(&cs,
+			    membarrier_action_seqcst_sync_core);
+		}
+		break;
+
+	case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
+		if ((p->p_flag2 & P2_MEMBAR_PRIVE_SYNCORE) == 0) {
+			PROC_LOCK(p);
+			p->p_flag2 |= P2_MEMBAR_PRIVE_SYNCORE;
+			PROC_UNLOCK(p);
+		}
+		break;
+
+	default:
+		error = EINVAL;	/* e.g. several command bits at once */
+		break;
+	}
+
+	return (error);
+}
+
+int	/* syscall entry point: unpack uap and call kern_membarrier() */
+sys_membarrier(struct thread *td, struct membarrier_args *uap)
+{
+	return (kern_membarrier(td, uap->cmd, uap->flags, uap->cpu_id));
+}
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index 209c3c2fd6db..158ea7036d50 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3300,7 +3300,6 @@
 581	AUE_NULL	STD|CAPENABLED {
 		int sched_getcpu(void);
 	}
-
 582	AUE_SWAPOFF	STD {
 		int swapoff(
 		    _In_z_ const char *name,
@@ -3312,5 +3311,13 @@
 		    u_int flags
 		);
 	}
+584	AUE_NULL	STD|CAPENABLED {
+		int membarrier(
+		    int cmd,
+		    unsigned flags,
+		    int cpu_id
+		);
+	}
+
 
 ; vim: syntax=off
diff --git a/sys/sys/membarrier.h b/sys/sys/membarrier.h
new file mode 100644
index 000000000000..958b769da23e
--- /dev/null
+++ b/sys/sys/membarrier.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2021 The FreeBSD Foundation
+ *
+ * This software was developed by Konstantin Belousov <kib@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef __SYS_MEMBARRIER_H__
+#define	__SYS_MEMBARRIER_H__
+
+#include <sys/cdefs.h>
+
+/*
+ * The enum membarrier_cmd values are bits.  The MEMBARRIER_CMD_QUERY
+ * command returns a bitset indicating which commands are supported.
+ * Also the value of MEMBARRIER_CMD_QUERY is zero, so it is
+ * effectively not returned by the query.
+ */
+enum membarrier_cmd {
+	MEMBARRIER_CMD_QUERY =				0x00000000,
+	MEMBARRIER_CMD_GLOBAL =				0x00000001,
+	MEMBARRIER_CMD_SHARED =				MEMBARRIER_CMD_GLOBAL,
+	MEMBARRIER_CMD_GLOBAL_EXPEDITED =		0x00000002,
+	MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED =	0x00000004,
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED =		0x00000008,
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED =	0x00000010,
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE =	0x00000020,
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE =	0x00000040,
+
+	/*
+	 * RSEQ constants are defined for source compatibility but are
+	 * not yet supported; MEMBARRIER_CMD_QUERY does not return
+	 * them in the mask.
+	 */
+	MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ =		0x00000080,
+	MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ = 0x00000100,
+};
+
+enum membarrier_cmd_flag {	/* presumably flags bits; none accepted yet */
+	MEMBARRIER_CMD_FLAG_CPU =			0x00000001,
+};
+
+#ifndef _KERNEL
+__BEGIN_DECLS
+int membarrier(int, unsigned, int);
+__END_DECLS
+#endif	/* _KERNEL */
+
+#endif /* __SYS_MEMBARRIER_H__ */
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 05ab914af409..3102cae7add0 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -883,6 +883,12 @@ struct proc {
 						   external thread_single() is
 						   permitted */
 #define	P2_REAPKILLED		0x00080000
+#define	P2_MEMBAR_PRIVE		0x00100000	/* membar private expedited
+						   registered */
+#define	P2_MEMBAR_PRIVE_SYNCORE	0x00200000	/* membar private expedited
+						   sync core registered */
+#define	P2_MEMBAR_GLOBE		0x00400000	/* membar global expedited
+						   registered */
 
 /* Flags protected by proctree_lock, kept in p_treeflags. */
 #define	P_TREE_ORPHANED		0x00000001	/* Reparented, on orphan list */
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
index 62cda6d03fd4..8be860dc0fd4 100644
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -204,6 +204,8 @@ int	kern_minherit(struct thread *td, uintptr_t addr, size_t len,
 	    int inherit);
 int	kern_mkdirat(struct thread *td, int fd, const char *path,
 	    enum uio_seg segflg, int mode);
+int	kern_membarrier(struct thread *td, int cmd, unsigned flags,
+	    int cpu_id);
 int	kern_mkfifoat(struct thread *td, int fd, const char *path,
 	    enum uio_seg pathseg, int mode);
 int	kern_mknodat(struct thread *td, int fd, const char *path,