git: 98568a005a19 - main - vmm: Allocate vCPUs on first use of a vCPU.

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Fri, 18 Nov 2022 18:26:55 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=98568a005a193ce2c37702a8377ddd10c570e452

commit 98568a005a193ce2c37702a8377ddd10c570e452
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2022-11-18 18:05:35 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2022-11-18 18:25:38 +0000

    vmm: Allocate vCPUs on first use of a vCPU.
    
    Convert the vcpu[] array in struct vm to an array of pointers and
    allocate vCPUs on first use.  Rather than always allocating VM_MAXCPU
    vCPUs for each VM, this allocates only the vCPUs actually in use.  A
    new per-VM sx lock serializes attempts to allocate vCPUs on first
    use.  However, a given vCPU is never freed while the VM is active, so
    an unlocked read of the pointer is tried first to avoid taking the
    lock in the common case once the vCPU has been created.
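
    In outline, the new vm_alloc_vcpu() (added below) does the
    following; this is a condensed sketch with the bounds check elided:

        vcpu = atomic_load_ptr(&vm->vcpu[vcpuid]);  /* unlocked fast path */
        if (vcpu != NULL)
                return (vcpu);

        sx_xlock(&vm->vcpus_init_lock);
        vcpu = vm->vcpu[vcpuid];        /* re-check under the lock */
        if (vcpu == NULL && !vm->dying) {
                vcpu = vcpu_alloc(vm, vcpuid);
                vcpu_init(vcpu);
                /* Publish only after the vCPU is fully constructed. */
                atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
                    (uintptr_t)vcpu);
        }
        sx_xunlock(&vm->vcpus_init_lock);
        return (vcpu);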
    
    Some ioctls need to lock all vCPUs.  To prevent races with ioctls
    that want to allocate a new vCPU, these ioctls also acquire the sx
    lock that protects vCPU creation.
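
    For example, the reworked vcpu_lock_all() in vmm_dev.c takes this
    shape (sketch; error handling and unwinding elided):

        vm_slock_vcpus(sc->vm);         /* holds off vCPU creation */
        maxcpus = vm_get_maxcpus(sc->vm);
        for (i = 0; i < maxcpus; i++) {
                vcpu = vm_vcpu(sc->vm, i);
                if (vcpu == NULL)       /* never allocated; skip */
                        continue;
                vcpu_set_state(vcpu, VCPU_FROZEN, true);
        }
        /* ... the ioctl runs with every existing vCPU frozen ... */
        vcpu_unlock_all(sc);            /* also drops the creation lock */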
    
    Reviewed by:    corvink, markj
    Differential Revision:  https://reviews.freebsd.org/D37174
---
 sys/amd64/include/vmm.h   |   4 ++
 sys/amd64/vmm/io/vlapic.c |   6 ++-
 sys/amd64/vmm/vmm.c       | 133 +++++++++++++++++++++++++++++++++-------------
 sys/amd64/vmm/vmm_dev.c   |  87 ++++++++++++++++--------------
 4 files changed, 150 insertions(+), 80 deletions(-)

diff --git a/sys/amd64/include/vmm.h b/sys/amd64/include/vmm.h
index 713c4a8b46e9..c6194c32b095 100644
--- a/sys/amd64/include/vmm.h
+++ b/sys/amd64/include/vmm.h
@@ -219,6 +219,10 @@ extern const struct vmm_ops vmm_ops_intel;
 extern const struct vmm_ops vmm_ops_amd;
 
 int vm_create(const char *name, struct vm **retvm);
+struct vcpu *vm_alloc_vcpu(struct vm *vm, int vcpuid);
+void vm_disable_vcpu_creation(struct vm *vm);
+void vm_slock_vcpus(struct vm *vm);
+void vm_unlock_vcpus(struct vm *vm);
 void vm_destroy(struct vm *vm);
 int vm_reinit(struct vm *vm);
 const char *vm_name(struct vm *vm);
diff --git a/sys/amd64/vmm/io/vlapic.c b/sys/amd64/vmm/io/vlapic.c
index e13cdcc63d57..6307ce341c72 100644
--- a/sys/amd64/vmm/io/vlapic.c
+++ b/sys/amd64/vmm/io/vlapic.c
@@ -1840,6 +1840,7 @@ int
 vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta)
 {
 	int ret;
+	struct vcpu *vcpu;
 	struct vlapic *vlapic;
 	struct LAPIC *lapic;
 	uint32_t ccr;
@@ -1851,7 +1852,10 @@ vlapic_snapshot(struct vm *vm, struct vm_snapshot_meta *meta)
 
 	maxcpus = vm_get_maxcpus(vm);
 	for (i = 0; i < maxcpus; i++) {
-		vlapic = vm_lapic(vm_vcpu(vm, i));
+		vcpu = vm_vcpu(vm, i);
+		if (vcpu == NULL)
+			continue;
+		vlapic = vm_lapic(vcpu);
 
 		/* snapshot the page first; timer period depends on icr_timer */
 		lapic = vlapic->apic_page;
diff --git a/sys/amd64/vmm/vmm.c b/sys/amd64/vmm/vmm.c
index ec6504772db3..7cde7b4005c4 100644
--- a/sys/amd64/vmm/vmm.c
+++ b/sys/amd64/vmm/vmm.c
@@ -127,7 +127,6 @@ struct vcpu {
 	uint64_t	tsc_offset;	/* (o) TSC offsetting */
 };
 
-#define	vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx))
 #define	vcpu_lock_init(v)	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
 #define	vcpu_lock_destroy(v)	mtx_destroy(&((v)->mtx))
 #define	vcpu_lock(v)		mtx_lock_spin(&((v)->mtx))
@@ -175,6 +174,7 @@ struct vm {
 	volatile cpuset_t debug_cpus;		/* (i) vcpus stopped for debug */
 	cpuset_t	startup_cpus;		/* (i) [r] waiting for startup */
 	int		suspend;		/* (i) stop VM execution */
+	bool		dying;			/* (o) is dying */
 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
 	cpuset_t	rendezvous_req_cpus;	/* (x) [r] rendezvous requested */
@@ -186,13 +186,14 @@ struct vm {
 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) [m+v] guest memory regions */
 	struct vmspace	*vmspace;		/* (o) guest's address space */
 	char		name[VM_MAX_NAMELEN+1];	/* (o) virtual machine name */
-	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
+	struct vcpu	*vcpu[VM_MAXCPU];	/* (x) guest vcpus */
 	/* The following describe the vm cpu topology */
 	uint16_t	sockets;		/* (o) num of sockets */
 	uint16_t	cores;			/* (o) num of cores/socket */
 	uint16_t	threads;		/* (o) num of threads/core */
 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
 	struct sx	mem_segs_lock;		/* (o) */
+	struct sx	vcpus_init_lock;	/* (o) */
 };
 
 #define	VMM_CTR0(vcpu, format)						\
@@ -319,10 +320,8 @@ vcpu_state2str(enum vcpu_state state)
 #endif
 
 static void
-vcpu_cleanup(struct vm *vm, int i, bool destroy)
+vcpu_cleanup(struct vcpu *vcpu, bool destroy)
 {
-	struct vcpu *vcpu = &vm->vcpu[i];
-
 	vmmops_vlapic_cleanup(vcpu->vlapic);
 	vmmops_vcpu_cleanup(vcpu->cookie);
 	vcpu->cookie = NULL;
@@ -333,30 +332,30 @@ vcpu_cleanup(struct vm *vm, int i, bool destroy)
 	}
 }
 
-static void
-vcpu_init(struct vm *vm, int vcpu_id, bool create)
+static struct vcpu *
+vcpu_alloc(struct vm *vm, int vcpu_id)
 {
 	struct vcpu *vcpu;
 
 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
 	    ("vcpu_init: invalid vcpu %d", vcpu_id));
-	  
-	vcpu = &vm->vcpu[vcpu_id];
-
-	if (create) {
-		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
-		    "initialized", vcpu_id));
-		vcpu_lock_init(vcpu);
-		vcpu->state = VCPU_IDLE;
-		vcpu->hostcpu = NOCPU;
-		vcpu->vcpuid = vcpu_id;
-		vcpu->vm = vm;
-		vcpu->guestfpu = fpu_save_area_alloc();
-		vcpu->stats = vmm_stat_alloc();
-		vcpu->tsc_offset = 0;
-	}
 
-	vcpu->cookie = vmmops_vcpu_init(vm->cookie, vcpu, vcpu_id);
+	vcpu = malloc(sizeof(*vcpu), M_VM, M_WAITOK | M_ZERO);
+	vcpu_lock_init(vcpu);
+	vcpu->state = VCPU_IDLE;
+	vcpu->hostcpu = NOCPU;
+	vcpu->vcpuid = vcpu_id;
+	vcpu->vm = vm;
+	vcpu->guestfpu = fpu_save_area_alloc();
+	vcpu->stats = vmm_stat_alloc();
+	vcpu->tsc_offset = 0;
+	return (vcpu);
+}
+
+static void
+vcpu_init(struct vcpu *vcpu)
+{
+	vcpu->cookie = vmmops_vcpu_init(vcpu->vm->cookie, vcpu, vcpu->vcpuid);
 	vcpu->vlapic = vmmops_vlapic_init(vcpu->cookie);
 	vm_set_x2apic_state(vcpu, X2APIC_DISABLED);
 	vcpu->reqidle = 0;
@@ -473,8 +472,6 @@ MODULE_VERSION(vmm, 1);
 static void
 vm_init(struct vm *vm, bool create)
 {
-	int i;
-
 	vm->cookie = vmmops_init(vm, vmspace_pmap(vm->vmspace));
 	vm->iommu = NULL;
 	vm->vioapic = vioapic_init(vm);
@@ -492,8 +489,61 @@ vm_init(struct vm *vm, bool create)
 	vm->suspend = 0;
 	CPU_ZERO(&vm->suspended_cpus);
 
-	for (i = 0; i < vm->maxcpus; i++)
-		vcpu_init(vm, i, create);
+	if (!create) {
+		for (int i = 0; i < vm->maxcpus; i++) {
+			if (vm->vcpu[i] != NULL)
+				vcpu_init(vm->vcpu[i]);
+		}
+	}
+}
+
+void
+vm_disable_vcpu_creation(struct vm *vm)
+{
+	sx_xlock(&vm->vcpus_init_lock);
+	vm->dying = true;
+	sx_xunlock(&vm->vcpus_init_lock);
+}
+
+struct vcpu *
+vm_alloc_vcpu(struct vm *vm, int vcpuid)
+{
+	struct vcpu *vcpu;
+
+	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
+		return (NULL);
+
+	vcpu = atomic_load_ptr(&vm->vcpu[vcpuid]);
+	if (__predict_true(vcpu != NULL))
+		return (vcpu);
+
+	sx_xlock(&vm->vcpus_init_lock);
+	vcpu = vm->vcpu[vcpuid];
+	if (vcpu == NULL && !vm->dying) {
+		vcpu = vcpu_alloc(vm, vcpuid);
+		vcpu_init(vcpu);
+
+		/*
+		 * Ensure vCPU is fully created before updating pointer
+		 * to permit unlocked reads above.
+		 */
+		atomic_store_rel_ptr((uintptr_t *)&vm->vcpu[vcpuid],
+		    (uintptr_t)vcpu);
+	}
+	sx_xunlock(&vm->vcpus_init_lock);
+	return (vcpu);
+}
+
+void
+vm_slock_vcpus(struct vm *vm)
+{
+	sx_slock(&vm->vcpus_init_lock);
+}
+
+void
+vm_unlock_vcpus(struct vm *vm)
+{
+	sx_unlock(&vm->vcpus_init_lock);
 }
 
 /*
@@ -528,6 +578,7 @@ vm_create(const char *name, struct vm **retvm)
 	vm->vmspace = vmspace;
 	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
 	sx_init(&vm->mem_segs_lock, "vm mem_segs");
+	sx_init(&vm->vcpus_init_lock, "vm vcpus");
 
 	vm->sockets = 1;
 	vm->cores = cores_per_package;	/* XXX backwards compatibility */
@@ -558,17 +609,14 @@ vm_get_maxcpus(struct vm *vm)
 
 int
 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
-    uint16_t threads, uint16_t maxcpus)
+    uint16_t threads, uint16_t maxcpus __unused)
 {
-	if (maxcpus != 0)
-		return (EINVAL);	/* XXX remove when supported */
+	/* Ignore maxcpus. */
 	if ((sockets * cores * threads) > vm->maxcpus)
 		return (EINVAL);
-	/* XXX need to check sockets * cores * threads == vCPU, how? */
 	vm->sockets = sockets;
 	vm->cores = cores;
 	vm->threads = threads;
-	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
 	return(0);
 }
 
@@ -593,8 +641,10 @@ vm_cleanup(struct vm *vm, bool destroy)
 	vatpic_cleanup(vm->vatpic);
 	vioapic_cleanup(vm->vioapic);
 
-	for (i = 0; i < vm->maxcpus; i++)
-		vcpu_cleanup(vm, i, destroy);
+	for (i = 0; i < vm->maxcpus; i++) {
+		if (vm->vcpu[i] != NULL)
+			vcpu_cleanup(vm->vcpu[i], destroy);
+	}
 
 	vmmops_cleanup(vm->cookie);
 
@@ -619,6 +669,7 @@ vm_cleanup(struct vm *vm, bool destroy)
 		vmmops_vmspace_free(vm->vmspace);
 		vm->vmspace = NULL;
 
+		sx_destroy(&vm->vcpus_init_lock);
 		sx_destroy(&vm->mem_segs_lock);
 		mtx_destroy(&vm->rendezvous_mtx);
 	}
@@ -2250,7 +2301,7 @@ vcpu_vcpuid(struct vcpu *vcpu)
 struct vcpu *
 vm_vcpu(struct vm *vm, int vcpuid)
 {
-	return (&vm->vcpu[vcpuid]);
+	return (vm->vcpu[vcpuid]);
 }
 
 struct vlapic *
@@ -2761,7 +2812,9 @@ vm_snapshot_vcpus(struct vm *vm, struct vm_snapshot_meta *meta)
 	now = rdtsc();
 	maxcpus = vm_get_maxcpus(vm);
 	for (i = 0; i < maxcpus; i++) {
-		vcpu = &vm->vcpu[i];
+		vcpu = vm->vcpu[i];
+		if (vcpu == NULL)
+			continue;
 
 		SNAPSHOT_VAR_OR_LEAVE(vcpu->x2apic_state, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(vcpu->exitintinfo, meta, ret, done);
@@ -2811,7 +2864,9 @@ vm_snapshot_vcpu(struct vm *vm, struct vm_snapshot_meta *meta)
 
 	maxcpus = vm_get_maxcpus(vm);
 	for (i = 0; i < maxcpus; i++) {
-		vcpu = &vm->vcpu[i];
+		vcpu = vm->vcpu[i];
+		if (vcpu == NULL)
+			continue;
 
 		error = vmmops_vcpu_snapshot(vcpu->cookie, meta);
 		if (error != 0) {
@@ -2894,7 +2949,9 @@ vm_restore_time(struct vm *vm)
 
 	maxcpus = vm_get_maxcpus(vm);
 	for (i = 0; i < maxcpus; i++) {
-		vcpu = &vm->vcpu[i];
+		vcpu = vm->vcpu[i];
+		if (vcpu == NULL)
+			continue;
 
 		error = vmmops_restore_tsc(vcpu->cookie,
 		    vcpu->tsc_offset - now);
diff --git a/sys/amd64/vmm/vmm_dev.c b/sys/amd64/vmm/vmm_dev.c
index a7c12da68701..b4610f4f78f4 100644
--- a/sys/amd64/vmm/vmm_dev.c
+++ b/sys/amd64/vmm/vmm_dev.c
@@ -125,24 +125,16 @@ vmm_priv_check(struct ucred *ucred)
 }
 
 static int
-vcpu_lock_one(struct vmmdev_softc *sc, int vcpu)
+vcpu_lock_one(struct vcpu *vcpu)
 {
-	int error;
-
-	if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vm))
-		return (EINVAL);
-
-	error = vcpu_set_state(vm_vcpu(sc->vm, vcpu), VCPU_FROZEN, true);
-	return (error);
+	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
 }
 
 static void
-vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid)
+vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid, struct vcpu *vcpu)
 {
-	struct vcpu *vcpu;
 	enum vcpu_state state;
 
-	vcpu = vm_vcpu(sc->vm, vcpuid);
 	state = vcpu_get_state(vcpu, NULL);
 	if (state != VCPU_FROZEN) {
 		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
@@ -155,19 +147,29 @@ vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid)
 static int
 vcpu_lock_all(struct vmmdev_softc *sc)
 {
-	int error, vcpu;
-	uint16_t maxcpus;
+	struct vcpu *vcpu;
+	int error = 0;
+	uint16_t i, maxcpus;
 
+	vm_slock_vcpus(sc->vm);
 	maxcpus = vm_get_maxcpus(sc->vm);
-	for (vcpu = 0; vcpu < maxcpus; vcpu++) {
-		error = vcpu_lock_one(sc, vcpu);
+	for (i = 0; i < maxcpus; i++) {
+		vcpu = vm_vcpu(sc->vm, i);
+		if (vcpu == NULL)
+			continue;
+		error = vcpu_lock_one(vcpu);
 		if (error)
 			break;
 	}
 
 	if (error) {
-		while (--vcpu >= 0)
-			vcpu_unlock_one(sc, vcpu);
+		while (i-- > 0) {
+			vcpu = vm_vcpu(sc->vm, i);
+			if (vcpu == NULL)
+				continue;
+			vcpu_unlock_one(sc, i, vcpu);
+		}
+		vm_unlock_vcpus(sc->vm);
 	}
 
 	return (error);
@@ -176,12 +178,17 @@ vcpu_lock_all(struct vmmdev_softc *sc)
 static void
 vcpu_unlock_all(struct vmmdev_softc *sc)
 {
-	int vcpu;
-	uint16_t maxcpus;
+	struct vcpu *vcpu;
+	uint16_t i, maxcpus;
 
 	maxcpus = vm_get_maxcpus(sc->vm);
-	for (vcpu = 0; vcpu < maxcpus; vcpu++)
-		vcpu_unlock_one(sc, vcpu);
+	for (i = 0; i < maxcpus; i++) {
+		vcpu = vm_vcpu(sc->vm, i);
+		if (vcpu == NULL)
+			continue;
+		vcpu_unlock_one(sc, i, vcpu);
+	}
+	vm_unlock_vcpus(sc->vm);
 }
 
 static struct vmmdev_softc *
@@ -363,20 +370,11 @@ vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
 	return (error);
 }
 
-static struct vcpu *
-lookup_vcpu(struct vm *vm, int vcpuid)
-{
-	if (vcpuid < 0 || vcpuid >= vm_get_maxcpus(vm))
-		return (NULL);
-
-	return (vm_vcpu(vm, vcpuid));
-}
-
 static int
 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	     struct thread *td)
 {
-	int error, vcpuid, state_changed, size;
+	int error, vcpuid, size;
 	cpuset_t *cpuset;
 	struct vmmdev_softc *sc;
 	struct vcpu *vcpu;
@@ -414,6 +412,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 	struct vm_readwrite_kernemu_device *kernemu;
 	uint64_t *regvals;
 	int *regnums;
+	enum { NONE, SINGLE, ALL } vcpus_locked;
 	bool memsegs_locked;
 #ifdef BHYVE_SNAPSHOT
 	struct vm_snapshot_meta *snapshot_meta;
@@ -429,7 +428,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 
 	vcpuid = -1;
 	vcpu = NULL;
-	state_changed = 0;
+	vcpus_locked = NONE;
 	memsegs_locked = false;
 
 	/*
@@ -465,11 +464,15 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		 * ioctls that can operate only on vcpus that are not running.
 		 */
 		vcpuid = *(int *)data;
-		error = vcpu_lock_one(sc, vcpuid);
+		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
+		if (vcpu == NULL) {
+			error = EINVAL;
+			goto done;
+		}
+		error = vcpu_lock_one(vcpu);
 		if (error)
 			goto done;
-		state_changed = 1;
-		vcpu = vm_vcpu(sc->vm, vcpuid);
+		vcpus_locked = SINGLE;
 		break;
 
 #ifdef COMPAT_FREEBSD12
@@ -501,7 +504,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		error = vcpu_lock_all(sc);
 		if (error)
 			goto done;
-		state_changed = 2;
+		vcpus_locked = ALL;
 		break;
 
 #ifdef COMPAT_FREEBSD12
@@ -528,7 +531,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		 * a specific vCPU.
 		 */
 		vcpuid = *(int *)data;
-		vcpu = lookup_vcpu(sc->vm, vcpuid);
+		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
 		if (vcpu == NULL) {
 			error = EINVAL;
 			goto done;
@@ -545,7 +548,7 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		vcpuid = *(int *)data;
 		if (vcpuid == -1)
 			break;
-		vcpu = lookup_vcpu(sc->vm, vcpuid);
+		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
 		if (vcpu == NULL) {
 			error = EINVAL;
 			goto done;
@@ -957,9 +960,9 @@ vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
 		break;
 	}
 
-	if (state_changed == 1)
-		vcpu_unlock_one(sc, vcpuid);
-	else if (state_changed == 2)
+	if (vcpus_locked == SINGLE)
+		vcpu_unlock_one(sc, vcpuid, vcpu);
+	else if (vcpus_locked == ALL)
 		vcpu_unlock_all(sc);
 	if (memsegs_locked)
 		vm_unlock_memsegs(sc->vm);
@@ -1041,8 +1044,10 @@ vmmdev_destroy(void *arg)
 	struct devmem_softc *dsc;
 	int error __diagused;
 
+	vm_disable_vcpu_creation(sc->vm);
 	error = vcpu_lock_all(sc);
 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
+	vm_unlock_vcpus(sc->vm);
 
 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));