git: aca0d0b421d3 - stable/13 - vmm: Use an sx lock to protect the memory map.

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Thu, 26 Jan 2023 22:12:07 UTC
The branch stable/13 has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=aca0d0b421d3ee7b844edf949902c0c189279ad7

commit aca0d0b421d3ee7b844edf949902c0c189279ad7
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2022-11-18 18:04:37 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2023-01-26 22:05:02 +0000

    vmm: Use an sx lock to protect the memory map.
    
    Previously bhyve obtained a "read lock" on the memory map for ioctls
    needing to read the map by locking the last vCPU.  This is now
    replaced by a new per-VM sx lock.  Modifying the map requires
    exclusively locking the sx lock as well as locking all existing vCPUs.
    Reading the map requires either locking one vCPU or the sx lock.
    
    This permits safely modifying or querying the memory map while some
    vCPUs do not exist which will be true in a future commit.
    
    Reviewed by:    corvink, markj
    Differential Revision:  https://reviews.freebsd.org/D37172
    
    (cherry picked from commit 67b69e76e8eecfd204f6de636d622a1d681c8d7e)
---
 sys/amd64/include/vmm.h |  3 +++
 sys/amd64/vmm/vmm.c     | 55 +++++++++++++++++++++++++++--------------
 sys/amd64/vmm/vmm_dev.c | 66 ++++++++++++++++++++++++-------------------------
 3 files changed, 72 insertions(+), 52 deletions(-)

diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 7c346d7ed4ed..b7506faaee92 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -231,6 +231,9 @@ int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
 /*
  * APIs that modify the guest memory map require all vcpus to be frozen.
  */
+void vm_slock_memsegs(struct vm *vm);
+void vm_xlock_memsegs(struct vm *vm);
+void vm_unlock_memsegs(struct vm *vm);
 int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off,
     size_t len, int prot, int flags);
 int vm_munmap_memseg(struct vm *vm, vm_paddr_t gpa, size_t len);
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 4659495ca5a2..87f1d9e45d58 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -46,6 +46,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/rwlock.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
+#include <sys/sx.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
@@ -155,6 +156,11 @@ struct mem_map {
  * (o) initialized the first time the VM is created
  * (i) initialized when VM is created and when it is reinitialized
  * (x) initialized before use
+ *
+ * Locking:
+ * [m] mem_segs_lock
+ * [r] rendezvous_mtx
+ * [v] reads require one frozen vcpu, writes require freezing all vcpus
  */
 struct vm {
 	void		*cookie;		/* (i) cpu-specific data */
@@ -170,13 +176,13 @@ struct vm {
 	int		suspend;		/* (i) stop VM execution */
 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
-	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
-	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
-	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
+	cpuset_t	rendezvous_req_cpus;	/* (x) [r] rendezvous requested */
+	cpuset_t	rendezvous_done_cpus;	/* (x) [r] rendezvous finished */
+	void		*rendezvous_arg;	/* (x) [r] rendezvous func/arg */
 	vm_rendezvous_func_t rendezvous_func;
 	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
-	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
-	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
+	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) [m+v] guest address space */
+	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) [m+v] guest memory regions */
 	struct vmspace	*vmspace;		/* (o) guest's address space */
 	char		name[VM_MAX_NAMELEN+1];	/* (o) virtual machine name */
 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
@@ -185,6 +191,7 @@ struct vm {
 	uint16_t	cores;			/* (o) num of cores/socket */
 	uint16_t	threads;		/* (o) num of threads/core */
 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
+	struct sx	mem_segs_lock;		/* (o) */
 };
 
 #define	VMM_CTR0(vcpu, format)						\
@@ -518,6 +525,7 @@ vm_create(const char *name, struct vm **retvm)
 	strcpy(vm->name, name);
 	vm->vmspace = vmspace;
 	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
+	sx_init(&vm->mem_segs_lock, "vm mem_segs");
 
 	vm->sockets = 1;
 	vm->cores = cores_per_package;	/* XXX backwards compatibility */
@@ -609,6 +617,7 @@ vm_cleanup(struct vm *vm, bool destroy)
 		vmmops_vmspace_free(vm->vmspace);
 		vm->vmspace = NULL;
 
+		sx_destroy(&vm->mem_segs_lock);
 		mtx_destroy(&vm->rendezvous_mtx);
 	}
 }
@@ -645,6 +654,24 @@ vm_name(struct vm *vm)
 	return (vm->name);
 }
 
+void
+vm_slock_memsegs(struct vm *vm)
+{
+	sx_slock(&vm->mem_segs_lock);
+}
+
+void
+vm_xlock_memsegs(struct vm *vm)
+{
+	sx_xlock(&vm->mem_segs_lock);
+}
+
+void
+vm_unlock_memsegs(struct vm *vm)
+{
+	sx_unlock(&vm->mem_segs_lock);
+}
+
 int
 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
 {
@@ -702,6 +729,8 @@ vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
 	struct mem_seg *seg;
 	vm_object_t obj;
 
+	sx_assert(&vm->mem_segs_lock, SX_XLOCKED);
+
 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 		return (EINVAL);
 
@@ -732,6 +761,8 @@ vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
 {
 	struct mem_seg *seg;
 
+	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
+
 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
 		return (EINVAL);
 
@@ -1075,19 +1106,7 @@ void *
 vm_gpa_hold_global(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
     void **cookie)
 {
-#ifdef INVARIANTS
-	/*
-	 * All vcpus are frozen by ioctls that modify the memory map
-	 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
-	 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
-	 */
-	int state;
-	for (int i = 0; i < vm->maxcpus; i++) {
-		state = vcpu_get_state(vm_vcpu(vm, i), NULL);
-		KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
-		    __func__, state));
-	}
-#endif
+	sx_assert(&vm->mem_segs_lock, SX_LOCKED);
 	return (_vm_gpa_hold(vm, gpa, len, reqprot, cookie));
 }
 
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index 249131b16464..1b8b1e6d388f 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -221,8 +221,6 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
 	vm_paddr_t gpa, maxaddr;
 	void *hpa, *cookie;
 	struct vmmdev_softc *sc;
-	struct vcpu *vcpu;
-	uint16_t lastcpu;
 
 	error = vmm_priv_check(curthread->td_ucred);
 	if (error)
@@ -233,13 +231,9 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
 		return (ENXIO);
 
 	/*
-	 * Get a read lock on the guest memory map by freezing any vcpu.
+	 * Get a read lock on the guest memory map.
 	 */
-	lastcpu = vm_get_maxcpus(sc->vm) - 1;
-	error = vcpu_lock_one(sc, lastcpu);
-	if (error)
-		return (error);
-	vcpu = vm_vcpu(sc->vm, lastcpu);
+	vm_slock_memsegs(sc->vm);
 
 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
@@ -256,7 +250,7 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
 		 * Since this device does not support lseek(2), dd(1) will
 		 * read(2) blocks of data to simulate the lseek(2).
 		 */
-		hpa = vm_gpa_hold(vcpu, gpa, c, prot, &cookie);
+		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
 		if (hpa == NULL) {
 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
 				error = uiomove(__DECONST(void *, zero_region),
@@ -268,7 +262,7 @@ vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
 			vm_gpa_release(cookie);
 		}
 	}
-	vcpu_unlock_one(sc, lastcpu);
+	vm_unlock_memsegs(sc->vm);
 	return (error);
 }
 
@@ -420,6 +414,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	struct vm_readwrite_kernemu_device *kernemu;
 	uint64_t *regvals;
 	int *regnums;
+	bool memsegs_locked;
 #ifdef BHYVE_SNAPSHOT
 	struct vm_snapshot_meta *snapshot_meta;
 #endif
@@ -435,6 +430,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	vcpuid = -1;
 	vcpu = NULL;
 	state_changed = 0;
+	memsegs_locked = false;
 
 	/*
 	 * For VMM ioctls that operate on a single vCPU, lookup the
@@ -476,10 +472,6 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		vcpu = vm_vcpu(sc->vm, vcpuid);
 		break;
 
-	case VM_MAP_PPTDEV_MMIO:
-	case VM_UNMAP_PPTDEV_MMIO:
-	case VM_BIND_PPTDEV:
-	case VM_UNBIND_PPTDEV:
 #ifdef COMPAT_FREEBSD12
 	case VM_ALLOC_MEMSEG_FBSD12:
 #endif
@@ -487,6 +479,21 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	case VM_MMAP_MEMSEG:
 	case VM_MUNMAP_MEMSEG:
 	case VM_REINIT:
+		/*
+		 * ioctls that modify the memory map must lock memory
+		 * segments exclusively.
+		 */
+		vm_xlock_memsegs(sc->vm);
+		memsegs_locked = true;
+		/* FALLTHROUGH */
+	case VM_MAP_PPTDEV_MMIO:
+	case VM_UNMAP_PPTDEV_MMIO:
+	case VM_BIND_PPTDEV:
+	case VM_UNBIND_PPTDEV:
+#ifdef BHYVE_SNAPSHOT
+	case VM_SNAPSHOT_REQ:
+	case VM_RESTORE_TIME:
+#endif
 		/*
 		 * ioctls that operate on the entire virtual machine must
 		 * prevent all vcpus from running.
@@ -503,14 +510,10 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	case VM_GET_MEMSEG:
 	case VM_MMAP_GETNEXT:
 		/*
-		 * Lock a vcpu to make sure that the memory map cannot be
-		 * modified while it is being inspected.
+		 * Lock the memory map while it is being inspected.
 		 */
-		vcpuid = vm_get_maxcpus(sc->vm) - 1;
-		error = vcpu_lock_one(sc, vcpuid);
-		if (error)
-			goto done;
-		state_changed = 1;
+		vm_slock_memsegs(sc->vm);
+		memsegs_locked = true;
 		break;
 
 #ifdef COMPAT_FREEBSD13
@@ -958,6 +961,8 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		vcpu_unlock_one(sc, vcpuid);
 	else if (state_changed == 2)
 		vcpu_unlock_all(sc);
+	if (memsegs_locked)
+		vm_unlock_memsegs(sc->vm);
 
 done:
 	/*
@@ -978,7 +983,6 @@ vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
 	size_t len;
 	vm_ooffset_t segoff, first, last;
 	int error, found, segid;
-	uint16_t lastcpu;
 	bool sysmem;
 
 	error = vmm_priv_check(curthread->td_ucred);
@@ -997,12 +1001,9 @@ vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
 	}
 
 	/*
-	 * Get a read lock on the guest memory map by freezing any vcpu.
+	 * Get a read lock on the guest memory map.
 	 */
-	lastcpu = vm_get_maxcpus(sc->vm) - 1;
-	error = vcpu_lock_one(sc, lastcpu);
-	if (error)
-		return (error);
+	vm_slock_memsegs(sc->vm);
 
 	gpa = 0;
 	found = 0;
@@ -1029,7 +1030,7 @@ vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
 			error = EINVAL;
 		}
 	}
-	vcpu_unlock_one(sc, lastcpu);
+	vm_unlock_memsegs(sc->vm);
 	return (error);
 }
 
@@ -1242,7 +1243,6 @@ devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
 	vm_ooffset_t first, last;
 	size_t seglen;
 	int error;
-	uint16_t lastcpu;
 	bool sysmem;
 
 	dsc = cdev->si_drv1;
@@ -1256,16 +1256,14 @@ devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
 		return (EINVAL);
 
-	lastcpu = vm_get_maxcpus(dsc->sc->vm) - 1;
-	error = vcpu_lock_one(dsc->sc, lastcpu);
-	if (error)
-		return (error);
+	vm_slock_memsegs(dsc->sc->vm);
 
 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
 	KASSERT(error == 0 && !sysmem && *objp != NULL,
 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
 
-	vcpu_unlock_one(dsc->sc, lastcpu);
+
+	vm_unlock_memsegs(dsc->sc->vm);
 
 	if (seglen >= last) {
 		vm_object_reference(*objp);