git: f39768e52e51 - main - vmm: Fix a deadlock between vm_smp_rendezvous() and vcpu_lock_all()
Date: Fri, 17 Oct 2025 14:31:12 UTC
The branch main has been updated by markj:
URL: https://cgit.FreeBSD.org/src/commit/?id=f39768e52e513264da60add0ca2412bddda271ff
commit f39768e52e513264da60add0ca2412bddda271ff
Author: Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2025-10-17 13:09:38 +0000
Commit: Mark Johnston <markj@FreeBSD.org>
CommitDate: 2025-10-17 14:28:41 +0000
vmm: Fix a deadlock between vm_smp_rendezvous() and vcpu_lock_all()
vm_smp_rendezvous() invokes a callback on all vCPUs, blocking the
initiator until all vCPUs have responded. vcpu_lock_all() blocks each
vCPU by waiting for it to go idle and setting the vCPU state to frozen.
These two operations can deadlock on each other, which is seen in
particular when booting a Windows guest: vcpu_lock_all() blocks waiting
for the rendezvous initiator to go idle, while the initiator is blocked
waiting for the vCPU thread that called vcpu_lock_all() to invoke the
rendezvous callback.
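
To make the cycle concrete, here is an illustrative interleaving (the thread
and vCPU labels are ours, not taken from the commit):

    /*
     * thread A (vCPU X's thread)        thread B (vCPU Y, rendezvous initiator)
     * --------------------------        ---------------------------------------
     * ioctl -> vcpu_lock_all()          vm_smp_rendezvous(...)
     *   waits for vCPU Y to go idle       waits for vCPU X to invoke the
     *   so that it can be frozen          rendezvous callback
     *
     * Y never goes idle: it is blocked in the rendezvous waiting on X.
     * X never invokes the callback: its thread A is stuck in vcpu_lock_all().
     */
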
Implement vcpu_lock_all() in a way that avoids deadlocks with
vm_smp_rendezvous(). In particular, when traversing vCPUs, invoke the
rendezvous callback on the vCPU's behalf to help the initiator finish.
We can only safely do so when the vCPU is IDLE or we have already locked
it; otherwise we may race with the target vCPU thread. Thus:
- Use an exclusive lock to serialize vcpu_lock_all() callers, which lets
us lock vCPUs out of order without fear of deadlock with parallel
vcpu_lock_all() callers.
- If a rendezvous is pending, lock all idle vCPUs and invoke the
callback on their behalf. If the vcpu_lock_all() caller is itself a
vCPU thread, this will handle that thread.
- Block waiting for all non-idle vCPUs to idle, or until one of them
initiates a rendezvous, in which case we go back and invoke callbacks
on behalf of already-locked vCPUs.
Note that no changes are needed on !amd64 since there is no rendezvous
mechanism there; those platforms instead get a separate
vcpu_set_state_all() based on the previous vcpu_lock_all(). The two
implementations will be merged once vcpu state handling is consolidated
into sys/dev/vmm.
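
In outline, the amd64 vcpu_set_state_all() added below behaves as follows;
this is a condensed pseudocode sketch of the code in the diff (NULL-vCPU
checks, error handling and rollback elided), not the committed source:

    mtx_lock(&vm->rendezvous_mtx);
    restart:
        if (vm->rendezvous_func != NULL)
            for each vCPU:
                if it is IDLE, freeze it and add it to 'locked';
                if it is in 'locked', run vm_rendezvous() on its behalf;
        for each vCPU not yet in 'locked':
            while it is not IDLE:
                drop rendezvous_mtx; vcpu_wait_idle(); retake rendezvous_mtx;
                if a new rendezvous has started, goto restart;
            set its state to 'newstate' and add it to 'locked';
    mtx_unlock(&vm->rendezvous_mtx);
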
Reviewed by: corvink (previous version)
MFC after: 3 weeks
Differential Revision: https://reviews.freebsd.org/D52968
---
sys/amd64/include/vmm.h | 3 +-
sys/amd64/vmm/vmm.c | 177 ++++++++++++++++++++++++++++++++++++++++--------
sys/arm64/include/vmm.h | 2 +-
sys/arm64/vmm/vmm.c | 4 +-
sys/dev/vmm/vmm_dev.c | 28 ++++++--
sys/riscv/include/vmm.h | 2 +-
sys/riscv/vmm/vmm.c | 4 +-
7 files changed, 178 insertions(+), 42 deletions(-)
diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 66d8991d36e8..ad67510fecf3 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -237,7 +237,7 @@ extern u_int vm_maxcpu; /* maximum virtual cpus */
int vm_create(const char *name, struct vm **retvm);
struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid);
void vm_disable_vcpu_creation(struct vm *vm);
-void vm_slock_vcpus(struct vm *vm);
+void vm_lock_vcpus(struct vm *vm);
void vm_unlock_vcpus(struct vm *vm);
void vm_destroy(struct vm *vm);
int vm_reinit(struct vm *vm);
@@ -362,6 +362,7 @@ enum vcpu_state {
};
int vcpu_set_state(struct vcpu *vcpu, enum vcpu_state state, bool from_idle);
+int vcpu_set_state_all(struct vm *vm, enum vcpu_state state);
enum vcpu_state vcpu_get_state(struct vcpu *vcpu, int *hostcpu);
static int __inline
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index 2ac076551165..f7c59847140b 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -562,9 +562,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid)
}
void
-vm_slock_vcpus(struct vm *vm)
+vm_lock_vcpus(struct vm *vm)
{
- sx_slock(&vm->vcpus_init_lock);
+ sx_xlock(&vm->vcpus_init_lock);
}
void
@@ -990,6 +990,54 @@ save_guest_fpustate(struct vcpu *vcpu)
static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+/*
+ * Invoke the rendezvous function on the specified vcpu if applicable. Return
+ * true if the rendezvous is finished, false otherwise.
+ */
+static bool
+vm_rendezvous(struct vcpu *vcpu)
+{
+ struct vm *vm = vcpu->vm;
+ int vcpuid;
+
+ mtx_assert(&vcpu->vm->rendezvous_mtx, MA_OWNED);
+ KASSERT(vcpu->vm->rendezvous_func != NULL,
+ ("vm_rendezvous: no rendezvous pending"));
+
+ /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
+ CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus,
+ &vm->active_cpus);
+
+ vcpuid = vcpu->vcpuid;
+ if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
+ !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
+ VMM_CTR0(vcpu, "Calling rendezvous func");
+ (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg);
+ CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
+ }
+ if (CPU_CMP(&vm->rendezvous_req_cpus,
+ &vm->rendezvous_done_cpus) == 0) {
+ VMM_CTR0(vcpu, "Rendezvous completed");
+ CPU_ZERO(&vm->rendezvous_req_cpus);
+ vm->rendezvous_func = NULL;
+ wakeup(&vm->rendezvous_func);
+ return (true);
+ }
+ return (false);
+}
+
+static void
+vcpu_wait_idle(struct vcpu *vcpu)
+{
+ KASSERT(vcpu->state != VCPU_IDLE, ("vcpu already idle"));
+
+ vcpu->reqidle = 1;
+ vcpu_notify_event_locked(vcpu, false);
+ VMM_CTR1(vcpu, "vcpu state change from %s to "
+ "idle requested", vcpu_state2str(vcpu->state));
+ msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
+}
+
static int
vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
bool from_idle)
@@ -1004,13 +1052,8 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
* ioctl() operating on a vcpu at any point.
*/
if (from_idle) {
- while (vcpu->state != VCPU_IDLE) {
- vcpu->reqidle = 1;
- vcpu_notify_event_locked(vcpu, false);
- VMM_CTR1(vcpu, "vcpu state change from %s to "
- "idle requested", vcpu_state2str(vcpu->state));
- msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
- }
+ while (vcpu->state != VCPU_IDLE)
+ vcpu_wait_idle(vcpu);
} else {
KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
"vcpu idle state"));
@@ -1062,6 +1105,95 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
return (0);
}
+/*
+ * Try to lock all of the vCPUs in the VM while taking care to avoid deadlocks
+ * with vm_smp_rendezvous().
+ *
+ * The complexity here suggests that the rendezvous mechanism needs a rethink.
+ */
+int
+vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate)
+{
+ cpuset_t locked;
+ struct vcpu *vcpu;
+ int error, i;
+ uint16_t maxcpus;
+
+ KASSERT(newstate != VCPU_IDLE,
+ ("vcpu_set_state_all: invalid target state %d", newstate));
+
+ error = 0;
+ CPU_ZERO(&locked);
+ maxcpus = vm->maxcpus;
+
+ mtx_lock(&vm->rendezvous_mtx);
+restart:
+ if (vm->rendezvous_func != NULL) {
+ /*
+ * If we have a pending rendezvous, then the initiator may be
+ * blocked waiting for other vCPUs to execute the callback. The
+ * current thread may be a vCPU thread so we must not block
+ * waiting for the initiator, otherwise we get a deadlock.
+ * Thus, execute the callback on behalf of any idle vCPUs.
+ */
+ for (i = 0; i < maxcpus; i++) {
+ vcpu = vm_vcpu(vm, i);
+ if (vcpu == NULL)
+ continue;
+ vcpu_lock(vcpu);
+ if (vcpu->state == VCPU_IDLE) {
+ (void)vcpu_set_state_locked(vcpu, VCPU_FROZEN,
+ true);
+ CPU_SET(i, &locked);
+ }
+ if (CPU_ISSET(i, &locked)) {
+ /*
+ * We can safely execute the callback on this
+ * vCPU's behalf.
+ */
+ vcpu_unlock(vcpu);
+ (void)vm_rendezvous(vcpu);
+ vcpu_lock(vcpu);
+ }
+ vcpu_unlock(vcpu);
+ }
+ }
+
+ /*
+ * Now wait for remaining vCPUs to become idle. This may include the
+ * initiator of a rendezvous that is currently blocked on the rendezvous
+ * mutex.
+ */
+ CPU_FOREACH_ISCLR(i, &locked) {
+ if (i >= maxcpus)
+ break;
+ vcpu = vm_vcpu(vm, i);
+ if (vcpu == NULL)
+ continue;
+ vcpu_lock(vcpu);
+ while (vcpu->state != VCPU_IDLE) {
+ mtx_unlock(&vm->rendezvous_mtx);
+ vcpu_wait_idle(vcpu);
+ vcpu_unlock(vcpu);
+ mtx_lock(&vm->rendezvous_mtx);
+ if (vm->rendezvous_func != NULL)
+ goto restart;
+ vcpu_lock(vcpu);
+ }
+ error = vcpu_set_state_locked(vcpu, newstate, true);
+ vcpu_unlock(vcpu);
+ if (error != 0) {
+ /* Roll back state changes. */
+ CPU_FOREACH_ISSET(i, &locked)
+ (void)vcpu_set_state(vcpu, VCPU_IDLE, false);
+ break;
+ }
+ CPU_SET(i, &locked);
+ }
+ mtx_unlock(&vm->rendezvous_mtx);
+ return (error);
+}
+
static void
vcpu_require_state(struct vcpu *vcpu, enum vcpu_state newstate)
{
@@ -1083,36 +1215,23 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
static int
vm_handle_rendezvous(struct vcpu *vcpu)
{
- struct vm *vm = vcpu->vm;
+ struct vm *vm;
struct thread *td;
- int error, vcpuid;
- error = 0;
- vcpuid = vcpu->vcpuid;
td = curthread;
+ vm = vcpu->vm;
+
mtx_lock(&vm->rendezvous_mtx);
while (vm->rendezvous_func != NULL) {
- /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
- CPU_AND(&vm->rendezvous_req_cpus, &vm->rendezvous_req_cpus, &vm->active_cpus);
-
- if (CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
- !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
- VMM_CTR0(vcpu, "Calling rendezvous func");
- (*vm->rendezvous_func)(vcpu, vm->rendezvous_arg);
- CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
- }
- if (CPU_CMP(&vm->rendezvous_req_cpus,
- &vm->rendezvous_done_cpus) == 0) {
- VMM_CTR0(vcpu, "Rendezvous completed");
- CPU_ZERO(&vm->rendezvous_req_cpus);
- vm->rendezvous_func = NULL;
- wakeup(&vm->rendezvous_func);
+ if (vm_rendezvous(vcpu))
break;
- }
+
VMM_CTR0(vcpu, "Wait for rendezvous completion");
mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
"vmrndv", hz);
if (td_ast_pending(td, TDA_SUSPEND)) {
+ int error;
+
mtx_unlock(&vm->rendezvous_mtx);
error = thread_check_susp(td, true);
if (error != 0)
diff --git a/sys/arm64/include/vmm.h b/sys/arm64/include/vmm.h
index 84b286a60b38..696a69669a2a 100644
--- a/sys/arm64/include/vmm.h
+++ b/sys/arm64/include/vmm.h
@@ -177,7 +177,7 @@ DECLARE_VMMOPS_FUNC(int, restore_tsc, (void *vcpui, uint64_t now));
int vm_create(const char *name, struct vm **retvm);
struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid);
void vm_disable_vcpu_creation(struct vm *vm);
-void vm_slock_vcpus(struct vm *vm);
+void vm_lock_vcpus(struct vm *vm);
void vm_unlock_vcpus(struct vm *vm);
void vm_destroy(struct vm *vm);
int vm_reinit(struct vm *vm);
diff --git a/sys/arm64/vmm/vmm.c b/sys/arm64/vmm/vmm.c
index aeda689f3b1a..bf52dc0fe916 100644
--- a/sys/arm64/vmm/vmm.c
+++ b/sys/arm64/vmm/vmm.c
@@ -469,9 +469,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid)
}
void
-vm_slock_vcpus(struct vm *vm)
+vm_lock_vcpus(struct vm *vm)
{
- sx_slock(&vm->vcpus_init_lock);
+ sx_xlock(&vm->vcpus_init_lock);
}
void
diff --git a/sys/dev/vmm/vmm_dev.c b/sys/dev/vmm/vmm_dev.c
index 460a508a60dc..4961b21180e1 100644
--- a/sys/dev/vmm/vmm_dev.c
+++ b/sys/dev/vmm/vmm_dev.c
@@ -120,18 +120,18 @@ vcpu_unlock_one(struct vcpu *vcpu)
vcpu_set_state(vcpu, VCPU_IDLE, false);
}
+#ifndef __amd64__
static int
-vcpu_lock_all(struct vmmdev_softc *sc)
+vcpu_set_state_all(struct vm *vm, enum vcpu_state newstate)
{
struct vcpu *vcpu;
int error;
uint16_t i, j, maxcpus;
error = 0;
- vm_slock_vcpus(sc->vm);
- maxcpus = vm_get_maxcpus(sc->vm);
+ maxcpus = vm_get_maxcpus(vm);
for (i = 0; i < maxcpus; i++) {
- vcpu = vm_vcpu(sc->vm, i);
+ vcpu = vm_vcpu(vm, i);
if (vcpu == NULL)
continue;
error = vcpu_lock_one(vcpu);
@@ -141,16 +141,32 @@ vcpu_lock_all(struct vmmdev_softc *sc)
if (error) {
for (j = 0; j < i; j++) {
- vcpu = vm_vcpu(sc->vm, j);
+ vcpu = vm_vcpu(vm, j);
if (vcpu == NULL)
continue;
vcpu_unlock_one(vcpu);
}
- vm_unlock_vcpus(sc->vm);
}
return (error);
}
+#endif
+
+static int
+vcpu_lock_all(struct vmmdev_softc *sc)
+{
+ int error;
+
+ /*
+ * Serialize vcpu_lock_all() callers. Individual vCPUs are not locked
+ * in a consistent order so we need to serialize to avoid deadlocks.
+ */
+ vm_lock_vcpus(sc->vm);
+ error = vcpu_set_state_all(sc->vm, VCPU_FROZEN);
+ if (error != 0)
+ vm_unlock_vcpus(sc->vm);
+ return (error);
+}
static void
vcpu_unlock_all(struct vmmdev_softc *sc)
diff --git a/sys/riscv/include/vmm.h b/sys/riscv/include/vmm.h
index bc00474ed0fd..e227dd825966 100644
--- a/sys/riscv/include/vmm.h
+++ b/sys/riscv/include/vmm.h
@@ -149,7 +149,7 @@ DECLARE_VMMOPS_FUNC(void, vmspace_free, (struct vmspace *vmspace));
int vm_create(const char *name, struct vm **retvm);
struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid);
void vm_disable_vcpu_creation(struct vm *vm);
-void vm_slock_vcpus(struct vm *vm);
+void vm_lock_vcpus(struct vm *vm);
void vm_unlock_vcpus(struct vm *vm);
void vm_destroy(struct vm *vm);
int vm_reinit(struct vm *vm);
diff --git a/sys/riscv/vmm/vmm.c b/sys/riscv/vmm/vmm.c
index 790dcc576507..4c9b1fa53f7a 100644
--- a/sys/riscv/vmm/vmm.c
+++ b/sys/riscv/vmm/vmm.c
@@ -346,9 +346,9 @@ vm_alloc_vcpu(struct vm *vm, int vcpuid)
}
void
-vm_slock_vcpus(struct vm *vm)
+vm_lock_vcpus(struct vm *vm)
{
- sx_slock(&vm->vcpus_init_lock);
+ sx_xlock(&vm->vcpus_init_lock);
}
void