git: 08c7dd2fbe4f - main - libvmmapi: Add support for setting up and configuring guest NUMA domains

From: Bojan Novković <bnovkov_at_FreeBSD.org>
Date: Sun, 27 Jul 2025 16:32:18 UTC
The branch main has been updated by bnovkov:

URL: https://cgit.FreeBSD.org/src/commit/?id=08c7dd2fbe4fb7ae5cd6943afef04bd4cb350c1f

commit 08c7dd2fbe4fb7ae5cd6943afef04bd4cb350c1f
Author:     Bojan Novković <bnovkov@FreeBSD.org>
AuthorDate: 2024-09-08 15:57:55 +0000
Commit:     Bojan Novković <bnovkov@FreeBSD.org>
CommitDate: 2025-07-27 16:31:48 +0000

    libvmmapi: Add support for setting up and configuring guest NUMA domains
    
    This patch reworks libvmmapi to provide support for emulating NUMA
    domains in guests.
    
    More specifically, it reworks 'vm_setup_memory' to set up system memory
    segments for each guest NUMA domain.
    
    An emulated NUMA domain is described by a 'struct vm_mem_domain' in
    vmmapi.h. Aside from its size in bytes, each domain can be configured
    to use a specific domainset(9) policy and domain mask.
    A new function, 'vm_setup_memory_domains', takes two additional
    arguments - an array of 'struct vm_mem_domain' and the array's size.
    It then proceeds to set up a memory segment for each specified domain
    using the existing memory mapping scheme. 'vm_setup_memory' keeps its
    signature and passes a single domain with no policy, which preserves
    the original, non-NUMA behaviour.
    
    Differential Revision:  https://reviews.freebsd.org/D44566
    Reviewed by:    markj
---
 lib/libvmmapi/internal.h |  11 ++-
 lib/libvmmapi/vmmapi.c   | 181 ++++++++++++++++++++++++++++++++++-------------
 lib/libvmmapi/vmmapi.h   |  12 +++-
 3 files changed, 145 insertions(+), 59 deletions(-)

diff --git a/lib/libvmmapi/internal.h b/lib/libvmmapi/internal.h
index aa7b1d8e6a93..4afe1cab3460 100644
--- a/lib/libvmmapi/internal.h
+++ b/lib/libvmmapi/internal.h
@@ -8,12 +8,7 @@
 #define	__VMMAPI_INTERNAL_H__
 
 #include <sys/types.h>
-
-enum {
-	VM_MEMSEG_LOW,
-	VM_MEMSEG_HIGH,
-	VM_MEMSEG_COUNT,
-};
+#include <dev/vmm/vmm_mem.h>
 
 struct vmctx {
 	int	fd;		/* device file descriptor */
@@ -21,7 +16,9 @@ struct vmctx {
 	struct {
 		vm_paddr_t base;
 		vm_size_t size;
-	} memsegs[VM_MEMSEG_COUNT];
+	} memsegs[VM_MAX_MEMSEGS];
+	size_t 	lowmem_size;
+	size_t 	highmem_size;
 	int	memflags;
 	char	*baseaddr;
 	char	*name;
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index a1a5d56ff8a2..77f0f8f5c581 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -28,13 +28,14 @@
 
 #include <sys/param.h>
 #include <sys/capsicum.h>
+#include <sys/cpuset.h>
+#include <sys/domainset.h>
 #include <sys/sysctl.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
 #include <sys/linker.h>
 #include <sys/module.h>
 #include <sys/_iovec.h>
-#include <sys/cpuset.h>
 
 #include <capsicum_helpers.h>
 #include <err.h>
@@ -322,8 +323,8 @@ vm_get_guestmem_from_ctx(struct vmctx *ctx, char **guest_baseaddr,
 {
 
 	*guest_baseaddr = ctx->baseaddr;
-	*lowmem_size = ctx->memsegs[VM_MEMSEG_LOW].size;
-	*highmem_size = ctx->memsegs[VM_MEMSEG_HIGH].size;
+	*lowmem_size = ctx->lowmem_size;
+	*highmem_size = ctx->highmem_size;
 	return (0);
 }
 
@@ -379,7 +380,8 @@ cmpseg(size_t len, const char *str, size_t len2, const char *str2)
 }
 
 static int
-vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
+vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name,
+    int ds_policy, domainset_t *ds_mask, size_t ds_size)
 {
 	struct vm_memseg memseg;
 	size_t n;
@@ -407,6 +409,13 @@ vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
 	bzero(&memseg, sizeof(struct vm_memseg));
 	memseg.segid = segid;
 	memseg.len = len;
+	if (ds_mask == NULL) {
+		memseg.ds_policy = DOMAINSET_POLICY_INVALID;
+	} else {
+		memseg.ds_policy = ds_policy;
+		memseg.ds_mask = ds_mask;
+		memseg.ds_mask_size = ds_size;
+	}
 	if (name != NULL) {
 		n = strlcpy(memseg.name, name, sizeof(memseg.name));
 		if (n >= sizeof(memseg.name)) {
@@ -442,13 +451,14 @@ vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
 }
 
 static int
-setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
+map_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len,
+    size_t segoff, char *base)
 {
 	char *ptr;
 	int error, flags;
 
 	/* Map 'len' bytes starting at 'gpa' in the guest address space */
-	error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
+	error = vm_mmap_memseg(ctx, gpa, segid, segoff, len, PROT_ALL);
 	if (error)
 		return (error);
 
@@ -464,65 +474,136 @@ setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
 	return (0);
 }
 
+/*
+ * Allocates and maps virtual machine memory segments according
+ * to the NUMA topology specified by the 'doms' array.
+ *
+ * The domains are laid out sequentially in the guest's physical address space.
+ * The [VM_LOWMEM_LIMIT, VM_HIGHMEM_BASE) address range is skipped and
+ * left unmapped.
+ */
 int
-vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
+vm_setup_memory_domains(struct vmctx *ctx, enum vm_mmap_style vms,
+    struct vm_mem_domain *doms, int ndoms)
 {
-	size_t objsize, len;
-	vm_paddr_t gpa;
+	size_t low_len, len, totalsize;
+	struct vm_mem_domain *dom;
+	struct vm_memseg memseg;
 	char *baseaddr, *ptr;
-	int error;
+	int error, i, segid;
+	vm_paddr_t gpa;
 
+	/* Sanity checks. */
 	assert(vms == VM_MMAP_ALL);
-
-	/*
-	 * If 'memsize' cannot fit entirely in the 'lowmem' segment then create
-	 * another 'highmem' segment above VM_HIGHMEM_BASE for the remainder.
-	 */
-	if (memsize > VM_LOWMEM_LIMIT) {
-		ctx->memsegs[VM_MEMSEG_LOW].size = VM_LOWMEM_LIMIT;
-		ctx->memsegs[VM_MEMSEG_HIGH].size = memsize - VM_LOWMEM_LIMIT;
-		objsize = VM_HIGHMEM_BASE + ctx->memsegs[VM_MEMSEG_HIGH].size;
-	} else {
-		ctx->memsegs[VM_MEMSEG_LOW].size = memsize;
-		ctx->memsegs[VM_MEMSEG_HIGH].size = 0;
-		objsize = memsize;
+	if (doms == NULL || ndoms <= 0 || ndoms > VM_MAXMEMDOM) {
+		errno = EINVAL;
+		return (-1);
 	}
 
-	error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
-	if (error)
-		return (error);
+	/* Calculate total memory size. */
+	totalsize = 0;
+	for (i = 0; i < ndoms; i++)
+		totalsize += doms[i].size;
+
+	if (totalsize > VM_LOWMEM_LIMIT)
+		totalsize = VM_HIGHMEM_BASE + (totalsize - VM_LOWMEM_LIMIT);
 
 	/*
 	 * Stake out a contiguous region covering the guest physical memory
 	 * and the adjoining guard regions.
 	 */
-	len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
+	len = VM_MMAP_GUARD_SIZE + totalsize + VM_MMAP_GUARD_SIZE;
 	ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0);
 	if (ptr == MAP_FAILED)
 		return (-1);
-
 	baseaddr = ptr + VM_MMAP_GUARD_SIZE;
-	if (ctx->memsegs[VM_MEMSEG_HIGH].size > 0) {
-		gpa = VM_HIGHMEM_BASE;
-		len = ctx->memsegs[VM_MEMSEG_HIGH].size;
-		error = setup_memory_segment(ctx, gpa, len, baseaddr);
-		if (error)
-			return (error);
-	}
 
-	if (ctx->memsegs[VM_MEMSEG_LOW].size > 0) {
-		gpa = 0;
-		len = ctx->memsegs[VM_MEMSEG_LOW].size;
-		error = setup_memory_segment(ctx, gpa, len, baseaddr);
-		if (error)
-			return (error);
-	}
+	/*
+	 * Allocate and map memory segments for the virtual machine.
+	 */
+	gpa = VM_LOWMEM_LIMIT > 0 ? 0 : VM_HIGHMEM_BASE;
+	ctx->lowmem_size = 0;
+	ctx->highmem_size = 0;
+	for (i = 0; i < ndoms; i++) {
+		segid = VM_SYSMEM + i;
+		dom = &doms[i];
+
+		/*
+		 * Check if the memory segment already exists.
+		 * If 'ndoms' is greater than one, refuse to proceed if the
+		 * memseg already exists. If only one domain was requested, use
+		 * the existing segment to preserve the behaviour of the previous
+		 * implementation.
+		 *
+		 * Splitting existing memory segments is tedious and
+		 * error-prone, which is why we don't support NUMA
+		 * domains for bhyveload(8)-loaded VMs.
+		 */
+		error = vm_get_memseg(ctx, segid, &len, memseg.name,
+		    sizeof(memseg.name));
+		if (error == 0 && len != 0) {
+			if (ndoms != 1) {
+				errno = EEXIST;
+				return (-1);
+			} else
+				doms[0].size = len;
+		} else {
+			error = vm_alloc_memseg(ctx, segid, dom->size, NULL,
+			    dom->ds_policy, dom->ds_mask, dom->ds_size);
+			if (error)
+				return (error);
+		}
 
+		/*
+		 * If a domain is split by VM_LOWMEM_LIMIT then break
+		 * its segment mapping into two parts, one below VM_LOWMEM_LIMIT
+		 * and one above VM_HIGHMEM_BASE.
+		 */
+		if (gpa <= VM_LOWMEM_LIMIT &&
+		    gpa + dom->size > VM_LOWMEM_LIMIT) {
+			low_len = VM_LOWMEM_LIMIT - gpa;
+			error = map_memory_segment(ctx, segid, gpa, low_len, 0,
+			    baseaddr);
+			if (error)
+				return (error);
+			ctx->lowmem_size = VM_LOWMEM_LIMIT;
+			/* Map the remainder. */
+			gpa = VM_HIGHMEM_BASE;
+			len = dom->size - low_len;
+			error = map_memory_segment(ctx, segid, gpa, len,
+			    low_len, baseaddr);
+			if (error)
+				return (error);
+		} else {
+			len = dom->size;
+			error = map_memory_segment(ctx, segid, gpa, len, 0,
+			    baseaddr);
+			if (error)
+				return (error);
+		}
+		if (gpa <= VM_LOWMEM_LIMIT)
+			ctx->lowmem_size += len;
+		else
+			ctx->highmem_size += len;
+		gpa += len;
+	}
 	ctx->baseaddr = baseaddr;
 
 	return (0);
 }
 
+int
+vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
+{
+	struct vm_mem_domain dom0;	/* the sole domain: all of 'memsize' */
+
+	memset(&dom0, 0, sizeof(dom0));	/* leaves ds_mask NULL, ds_size 0 */
+	dom0.ds_policy = DOMAINSET_POLICY_INVALID;
+	dom0.size = memsize;
+
+	return (vm_setup_memory_domains(ctx, vms, &dom0, 1));
+}
+
 /*
  * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in
  * the lowmem or highmem regions.
@@ -535,13 +616,13 @@ vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
 {
 	vm_size_t lowsize, highsize;
 
-	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
+	lowsize = ctx->lowmem_size;
 	if (lowsize > 0) {
 		if (gaddr < lowsize && len <= lowsize && gaddr + len <= lowsize)
 			return (ctx->baseaddr + gaddr);
 	}
 
-	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
+	highsize = ctx->highmem_size;
 	if (highsize > 0 && gaddr >= VM_HIGHMEM_BASE) {
 		if (gaddr < VM_HIGHMEM_BASE + highsize && len <= highsize &&
 		    gaddr + len <= VM_HIGHMEM_BASE + highsize)
@@ -559,12 +640,12 @@ vm_rev_map_gpa(struct vmctx *ctx, void *addr)
 
 	offaddr = (char *)addr - ctx->baseaddr;
 
-	lowsize = ctx->memsegs[VM_MEMSEG_LOW].size;
+	lowsize = ctx->lowmem_size;
 	if (lowsize > 0)
 		if (offaddr <= lowsize)
 			return (offaddr);
 
-	highsize = ctx->memsegs[VM_MEMSEG_HIGH].size;
+	highsize = ctx->highmem_size;
 	if (highsize > 0)
 		if (offaddr >= VM_HIGHMEM_BASE &&
 		    offaddr < VM_HIGHMEM_BASE + highsize)
@@ -583,8 +664,7 @@ vm_get_name(struct vmctx *ctx)
 size_t
 vm_get_lowmem_size(struct vmctx *ctx)
 {
-
-	return (ctx->memsegs[VM_MEMSEG_LOW].size);
+	return (ctx->lowmem_size);
 }
 
 vm_paddr_t
@@ -597,8 +677,7 @@ vm_get_highmem_base(struct vmctx *ctx __unused)
 size_t
 vm_get_highmem_size(struct vmctx *ctx)
 {
-
-	return (ctx->memsegs[VM_MEMSEG_HIGH].size);
+	return (ctx->highmem_size);
 }
 
 void *
@@ -616,7 +695,7 @@ vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
 		goto done;
 	}
 
-	error = vm_alloc_memseg(ctx, segid, len, name);
+	error = vm_alloc_memseg(ctx, segid, len, name, 0, NULL, 0);
 	if (error)
 		goto done;
 
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index 2072c0105e37..b637c45d1eff 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -64,6 +64,14 @@ enum vm_mmap_style {
 #define	VM_MEM_F_INCORE	0x01	/* include guest memory in core file */
 #define	VM_MEM_F_WIRED	0x02	/* guest memory is wired */
 
+/* Memory size and allocation policy for a single NUMA domain. */
+struct vm_mem_domain {
+	size_t size;		/* amount of guest memory in this domain */
+	int ds_policy;		/* copied into the memseg's ds_policy */
+	domainset_t *ds_mask;	/* NULL => DOMAINSET_POLICY_INVALID is used */
+	size_t ds_size;		/* presumably sizeof(*ds_mask) - TODO confirm */
+};
+
 __BEGIN_DECLS
 /*
  * Get the length and name of the memory segment identified by 'segid'.
@@ -115,7 +123,9 @@ struct vcpu *vm_vcpu_open(struct vmctx *ctx, int vcpuid);
 void	vm_vcpu_close(struct vcpu *vcpu);
 int	vcpu_id(struct vcpu *vcpu);
 int	vm_parse_memsize(const char *optarg, size_t *memsize);
-int	vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
+int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
+int vm_setup_memory_domains(struct vmctx *ctx, enum vm_mmap_style s,
+			    struct vm_mem_domain *doms, int ndoms);
 void	*vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
 /* inverse operation to vm_map_gpa - extract guest address from host pointer */
 vm_paddr_t vm_rev_map_gpa(struct vmctx *ctx, void *addr);