git: a9b615120628 - main - memdesc: Helper function to construct mbuf chain backed by memdesc buffer

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Tue, 09 Jan 2024 19:09:50 UTC
The branch main has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=a9b615120628411cc20af730edb291c52be02f66

commit a9b615120628411cc20af730edb291c52be02f66
Author:     John Baldwin <jhb@FreeBSD.org>
AuthorDate: 2024-01-09 19:05:03 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2024-01-09 19:09:43 +0000

    memdesc: Helper function to construct mbuf chain backed by memdesc buffer
    
    memdesc_alloc_ext_mbufs constructs a chain of external (M_EXT or
    M_EXTPG) mbufs backed by a data buffer described by a memory
    descriptor.
    
    Since memory descriptors are not an actual buffer, just a description
    of a buffer, the caller is required to supply a couple of helper
    routines to manage allocation of the raw mbufs and associate them
    with a reference to the underlying buffer.
    
    Reviewed by:    markj
    Sponsored by:   Chelsio Communications
    Differential Revision:  https://reviews.freebsd.org/D42933
---
 sys/kern/subr_memdesc.c | 474 ++++++++++++++++++++++++++++++++++++++++++++++++
 sys/sys/memdesc.h       |  36 ++++
 2 files changed, 510 insertions(+)

diff --git a/sys/kern/subr_memdesc.c b/sys/kern/subr_memdesc.c
index 9ba9d7fe031b..ff8aad7731cd 100644
--- a/sys/kern/subr_memdesc.c
+++ b/sys/kern/subr_memdesc.c
@@ -33,9 +33,14 @@
 #include <sys/uio.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
+#include <vm/vm_page.h>
 #include <vm/vm_param.h>
 #include <machine/bus.h>
 
+/*
+ * memdesc_copyback copies data from a source buffer into a buffer
+ * described by a memory descriptor.
+ */
 static void
 phys_copyback(vm_paddr_t pa, int off, int size, const void *src)
 {
@@ -180,6 +185,10 @@ memdesc_copyback(struct memdesc *mem, int off, int size, const void *src)
 	}
 }
 
+/*
+ * memdesc_copydata copies data from a buffer described by a memory
+ * descriptor into a destination buffer.
+ */
 static void
 phys_copydata(vm_paddr_t pa, int off, int size, void *dst)
 {
@@ -323,3 +332,468 @@ memdesc_copydata(struct memdesc *mem, int off, int size, void *dst)
 		__assert_unreachable();
 	}
 }
+
+/*
+ * memdesc_alloc_ext_mbufs allocates a chain of external mbufs backed
+ * by the storage of a memory descriptor's data buffer.
+ */
+
+/*
+ * Handles a buffer given as a single kernel virtual address range.  The
+ * caller's ext_alloc callback is invoked once for the whole range and
+ * its result is returned as-is (including NULL).  The request is never
+ * shortened, so *actual_len is always set to len.
+ */
+static struct mbuf *
+vaddr_ext_mbuf(memdesc_alloc_ext_mbuf_t *ext_alloc, void *cb_arg, int how,
+    void *buf, size_t len, size_t *actual_len)
+{
+	struct mbuf *m;
+
+	*actual_len = len;
+	m = ext_alloc(cb_arg, how, buf, len);
+	return (m);
+}
+
+/*
+ * Reports whether data starting at physical address 'pa' may be added
+ * to the M_EXTPG mbuf 'm'.  An empty mbuf accepts anything.  A non-empty
+ * mbuf accepts more data only if it still has a free page slot, 'pa' is
+ * page-aligned, and the data already in its last page runs up to the
+ * end of that page.
+ */
+static bool
+can_append_paddr(struct mbuf *m, vm_paddr_t pa)
+{
+	const u_int npgs = m->m_epg_npgs;
+	u_int tail_end;
+
+	if (npgs == 0)
+		return (true);
+
+	/* Reject a full mbuf and any address that starts mid-page. */
+	if (npgs == MBUF_PEXT_MAX_PGS || (pa & PAGE_MASK) != 0)
+		return (false);
+
+	/*
+	 * Work out where the data in the last page ends.  When there is
+	 * only one page, its data begins m_epg_1st_off bytes into the
+	 * page, so that offset has to be counted as well.
+	 */
+	tail_end = m->m_epg_last_len;
+	if (npgs == 1)
+		tail_end += m->m_epg_1st_off;
+	return (tail_end == PAGE_SIZE);
+}
+
+/*
+ * Adds as much as possible of the physically contiguous range of 'len'
+ * bytes starting at 'pa' to the M_EXTPG mbuf 'm', stopping once the
+ * mbuf holds MBUF_PEXT_MAX_PGS pages.  Updates the mbuf's page array,
+ * page count, first-page offset, last-page length and m_len.
+ *
+ * Returns amount of data added to an M_EXTPG mbuf.
+ */
+static size_t
+append_paddr_range(struct mbuf *m, vm_paddr_t pa, size_t len)
+{
+	size_t left;
+
+	/* 'left' counts down the bytes that have not been added yet. */
+	left = len;
+
+	/*
+	 * An empty mbuf takes its first page here.  This is the only
+	 * page that may start at a non-zero offset.
+	 */
+	if (m->m_epg_npgs == 0) {
+		m->m_epg_1st_off = pa & PAGE_MASK;
+		m->m_epg_pa[0] = trunc_page(pa);
+		m->m_epg_npgs = 1;
+		m->m_epg_last_len = PAGE_SIZE - m->m_epg_1st_off;
+		if (left < m->m_epg_last_len)
+			m->m_epg_last_len = left;
+		m->m_len = m->m_epg_last_len;
+		pa += m->m_epg_last_len;
+		left -= m->m_epg_last_len;
+	}
+	KASSERT(left == 0 || (pa & PAGE_MASK) == 0,
+	    ("PA not aligned before full pages"));
+
+	/* Whole pages, for as long as page slots remain. */
+	for (; left >= PAGE_SIZE && m->m_epg_npgs < MBUF_PEXT_MAX_PGS;
+	    pa += PAGE_SIZE, left -= PAGE_SIZE) {
+		m->m_epg_pa[m->m_epg_npgs] = pa;
+		m->m_epg_npgs++;
+		m->m_epg_last_len = PAGE_SIZE;
+		m->m_len += PAGE_SIZE;
+	}
+
+	/* A trailing partial page, if a slot is still free for it. */
+	if (left > 0 && m->m_epg_npgs < MBUF_PEXT_MAX_PGS) {
+		KASSERT(left < PAGE_SIZE, ("final page is full page"));
+		m->m_epg_pa[m->m_epg_npgs] = pa;
+		m->m_epg_npgs++;
+		m->m_epg_last_len = left;
+		m->m_len += left;
+		left = 0;
+	}
+
+	return (len - left);
+}
+
+/*
+ * Builds a chain of M_EXTPG mbufs describing the 'len' bytes of
+ * physically contiguous memory that start at 'pa'.  Empty mbufs are
+ * obtained from extpg_alloc(cb_arg, how); if it returns NULL, the chain
+ * built so far is freed and NULL is returned.
+ *
+ * With can_truncate set, a partial page at the end of the range is
+ * dropped unless it is the only page.  The length that will be
+ * described is stored in *actual_len before any mbuf is allocated.
+ */
+static struct mbuf *
+paddr_ext_mbuf(memdesc_alloc_extpg_mbuf_t *extpg_alloc, void *cb_arg, int how,
+    vm_paddr_t pa, size_t len, size_t *actual_len, bool can_truncate)
+{
+	struct mbuf *head, *cur;
+	vm_paddr_t boundary;
+	size_t n;
+
+	if (can_truncate) {
+		/* Last page boundary at or before the end of the range. */
+		boundary = trunc_page(pa + len);
+		if (boundary > pa)
+			len = boundary - pa;
+	}
+	*actual_len = len;
+
+	head = cur = extpg_alloc(cb_arg, how);
+	if (head == NULL)
+		return (NULL);
+
+	for (; len > 0; pa += n, len -= n) {
+		/* Start a fresh mbuf when the current one can't grow. */
+		if (!can_append_paddr(cur, pa)) {
+			MBUF_EXT_PGS_ASSERT_SANITY(cur);
+			cur->m_next = extpg_alloc(cb_arg, how);
+			if (cur->m_next == NULL) {
+				m_freem(head);
+				return (NULL);
+			}
+			cur = cur->m_next;
+		}
+
+		n = append_paddr_range(cur, pa, len);
+		KASSERT(n > 0, ("did not append anything"));
+		KASSERT(n <= len, ("appended too much"));
+	}
+
+	MBUF_EXT_PGS_ASSERT_SANITY(cur);
+	return (head);
+}
+
+/*
+ * Builds a chain of mbufs for 'len' bytes that start 'offset' bytes into
+ * a scatter/gather list of kernel virtual address segments.  The
+ * caller's ext_alloc callback is invoked once per segment touched, with
+ * the address and length of the part of that segment that is needed.
+ * If the callback returns NULL, the chain built so far is freed and
+ * NULL is returned.  The request is never shortened, so *actual_len is
+ * always set to len.
+ */
+static struct mbuf *
+vlist_ext_mbuf(memdesc_alloc_ext_mbuf_t *ext_alloc, void *cb_arg, int how,
+    struct bus_dma_segment *vlist, u_int sglist_cnt, size_t offset,
+    size_t len, size_t *actual_len)
+{
+	struct mbuf *head, *piece, **linkp;
+	size_t chunk;
+
+	*actual_len = len;
+
+	/* Skip the segments that end at or before the starting offset. */
+	for (; vlist->ds_len <= offset; vlist++, sglist_cnt--) {
+		KASSERT(sglist_cnt > 1, ("out of sglist entries"));
+
+		offset -= vlist->ds_len;
+	}
+
+	/* linkp always points at the pointer the next mbuf goes into. */
+	head = NULL;
+	linkp = &head;
+	for (; len > 0; len -= chunk, vlist++, sglist_cnt--, offset = 0) {
+		KASSERT(sglist_cnt >= 1, ("out of sglist entries"));
+
+		/* Take the rest of this segment, or less if len runs out. */
+		chunk = (len > vlist->ds_len - offset) ?
+		    vlist->ds_len - offset : len;
+
+		piece = ext_alloc(cb_arg, how,
+		    (char *)(uintptr_t)vlist->ds_addr + offset, chunk);
+		if (piece == NULL) {
+			m_freem(head);
+			return (NULL);
+		}
+
+		*linkp = piece;
+		linkp = &piece->m_next;
+	}
+
+	return (head);
+}
+
+/*
+ * Builds a chain of M_EXTPG mbufs describing 'len' bytes that start
+ * 'offset' bytes into a scatter/gather list of physical segments
+ * ('plist', holding 'sglist_cnt' entries).  Empty mbufs are obtained
+ * from extpg_alloc(cb_arg, how); if it returns NULL, the chain built so
+ * far is freed and NULL is returned.
+ *
+ * With can_truncate set, a request that ends part-way through a page is
+ * cut back to the preceding page boundary, as long as that still leaves
+ * some data in the chain.  The number of bytes the chain describes is
+ * stored in *actual_len on success.
+ */
+static struct mbuf *
+plist_ext_mbuf(memdesc_alloc_extpg_mbuf_t *extpg_alloc, void *cb_arg, int how,
+    struct bus_dma_segment *plist, u_int sglist_cnt, size_t offset, size_t len,
+    size_t *actual_len, bool can_truncate)
+{
+	vm_paddr_t pa;
+	struct mbuf *m, *tail;
+	size_t appended, totlen, todo;
+
+	/* Skip the segments that end at or before the starting offset. */
+	while (plist->ds_len <= offset) {
+		KASSERT(sglist_cnt > 1, ("out of sglist entries"));
+
+		offset -= plist->ds_len;
+		plist++;
+		sglist_cnt--;
+	}
+
+	totlen = 0;
+	m = tail = extpg_alloc(cb_arg, how);
+	if (m == NULL)
+		return (NULL);
+	while (len > 0) {
+		KASSERT(sglist_cnt >= 1, ("out of sglist entries"));
+
+		pa = plist->ds_addr + offset;
+		todo = len;
+		if (todo > plist->ds_len - offset) {
+			/* The request continues past this segment. */
+			todo = plist->ds_len - offset;
+		} else if (can_truncate) {
+			vm_paddr_t end;
+
+			/*
+			 * The request ends in this segment.  If
+			 * truncation is enabled, avoid sending a final
+			 * partial page.  At least some data must be
+			 * sent, though, so the tail is only dropped if
+			 * this piece reaches a page boundary first or
+			 * some other data is already queued.
+			 */
+			end = trunc_page(pa + len);
+			if (end > pa) {
+				todo = end - pa;
+			} else if (totlen != 0) {
+				/*
+				 * This last segment is only a partial
+				 * page.
+				 */
+				break;
+			}
+
+			/*
+			 * Nothing follows this piece: forget any bytes
+			 * trimmed above so that the loop ends after it.
+			 * If neither case above applied, the partial
+			 * page is all there is and is sent as is.
+			 */
+			len = todo;
+		}
+
+		/*
+		 * This segment is finished with once 'todo' bytes have
+		 * been taken from it; later segments are consumed from
+		 * their start.
+		 */
+		offset = 0;
+		plist++;
+		sglist_cnt--;
+		len -= todo;
+		totlen += todo;
+
+		while (todo > 0) {
+			/* Start a fresh mbuf when the tail can't grow. */
+			if (!can_append_paddr(tail, pa)) {
+				MBUF_EXT_PGS_ASSERT_SANITY(tail);
+				tail->m_next = extpg_alloc(cb_arg, how);
+				if (tail->m_next == NULL)
+					goto error;
+				tail = tail->m_next;
+			}
+
+			appended = append_paddr_range(tail, pa, todo);
+			KASSERT(appended > 0, ("did not append anything"));
+
+			pa += appended;
+			todo -= appended;
+		}
+	}
+
+	MBUF_EXT_PGS_ASSERT_SANITY(tail);
+	*actual_len = totlen;
+	return (m);
+error:
+	m_freem(m);
+	return (NULL);
+}
+
+/*
+ * Builds a chain of M_EXTPG mbufs describing 'len' bytes that start
+ * 'offset' bytes into the memory covered by the page array 'ma'.  Empty
+ * mbufs are obtained from extpg_alloc(cb_arg, how); if it returns NULL,
+ * the chain built so far is freed and NULL is returned.
+ *
+ * With can_truncate set, a partial page at the end of the range is
+ * dropped unless it is the only page.  The length that will be
+ * described is stored in *actual_len before any mbuf is allocated.
+ */
+static struct mbuf *
+vmpages_ext_mbuf(memdesc_alloc_extpg_mbuf_t *extpg_alloc, void *cb_arg, int how,
+    vm_page_t *ma, size_t offset, size_t len, size_t *actual_len,
+    bool can_truncate)
+{
+	struct mbuf *head, *cur;
+	size_t boundary, chunk;
+
+	/* Step over whole pages that precede the starting offset. */
+	for (; offset >= PAGE_SIZE; offset -= PAGE_SIZE)
+		ma++;
+
+	if (can_truncate) {
+		/* Last page boundary at or before the end of the range. */
+		boundary = trunc_page(offset + len);
+		if (boundary > offset)
+			len = boundary - offset;
+	}
+	*actual_len = len;
+
+	head = cur = extpg_alloc(cb_arg, how);
+	if (head == NULL)
+		return (NULL);
+
+	/*
+	 * The first page is always recorded and is the only one that may
+	 * start at a non-zero offset.
+	 */
+	head->m_epg_pa[0] = VM_PAGE_TO_PHYS(*ma);
+	ma++;
+	head->m_epg_npgs = 1;
+	head->m_epg_1st_off = offset;
+	head->m_epg_last_len = PAGE_SIZE - offset;
+	if (len < head->m_epg_last_len)
+		head->m_epg_last_len = len;
+	head->m_len = head->m_epg_last_len;
+	len -= head->m_epg_last_len;
+
+	/*
+	 * Every following page holds a full page of data, except that the
+	 * final one holds whatever is left over.
+	 */
+	while (len > 0) {
+		chunk = len < PAGE_SIZE ? len : PAGE_SIZE;
+
+		/* Move on to a fresh mbuf once the current one is full. */
+		if (cur->m_epg_npgs == MBUF_PEXT_MAX_PGS) {
+			MBUF_EXT_PGS_ASSERT_SANITY(cur);
+			cur->m_next = extpg_alloc(cb_arg, how);
+			if (cur->m_next == NULL) {
+				m_freem(head);
+				return (NULL);
+			}
+			cur = cur->m_next;
+		}
+
+		cur->m_epg_pa[cur->m_epg_npgs] = VM_PAGE_TO_PHYS(*ma);
+		ma++;
+		cur->m_epg_npgs++;
+		cur->m_epg_last_len = chunk;
+		cur->m_len += chunk;
+		len -= chunk;
+	}
+
+	MBUF_EXT_PGS_ASSERT_SANITY(cur);
+	return (head);
+}
+
+/*
+ * Somewhat similar to m_copym but optionally avoids a partial mbuf at
+ * the end.
+ *
+ * Returns a new chain describing up to 'len' bytes of the chain 'm0',
+ * starting 'offset' bytes into it.  New mbufs are allocated with
+ * m_get(how, MT_DATA).  Data held in M_EXT or M_EXTPG mbufs is
+ * referenced via mb_dupcl(); data held in other mbufs is copied.
+ *
+ * The first mbuf is always returned, even if it covers only part of a
+ * source mbuf.  After that, if can_truncate is true the chain stops
+ * before a source mbuf that the request only partly covers; otherwise
+ * the last mbuf is cut short so that exactly 'len' bytes are described.
+ * The length described is stored in *actual_len.  If an allocation
+ * fails, the partial chain is freed and NULL is returned.
+ */
+static struct mbuf *
+mbuf_subchain(struct mbuf *m0, size_t offset, size_t len,
+    size_t *actual_len, bool can_truncate, int how)
+{
+	struct mbuf *m, *tail;
+	size_t seglen, totlen;
+
+	/* Find the mbuf that holds the first requested byte. */
+	while (offset >= m0->m_len) {
+		offset -= m0->m_len;
+		m0 = m0->m_next;
+		KASSERT(m0 != NULL, ("offset beyond end of mbuf chain"));
+	}
+
+	/* Always return at least one mbuf. */
+	totlen = m0->m_len - offset;
+	if (totlen > len)
+		totlen = len;
+
+	m = m_get(how, MT_DATA);
+	if (m == NULL)
+		return (NULL);
+	m->m_len = totlen;
+	if (m0->m_flags & (M_EXT | M_EXTPG)) {
+		m->m_data = m0->m_data + offset;
+		mb_dupcl(m, m0);
+	} else
+		memcpy(mtod(m, void *), mtodo(m0, offset), m->m_len);
+
+	tail = m;
+	m0 = m0->m_next;
+	len -= totlen;
+	while (len > 0) {
+		KASSERT(m0 != NULL, ("mbuf chain too short"));
+
+		seglen = m0->m_len;
+		if (seglen > len) {
+			/*
+			 * If truncation is enabled, don't send any
+			 * partial mbufs besides the first one.
+			 */
+			if (can_truncate)
+				break;
+
+			/*
+			 * Otherwise take only what was asked for, so
+			 * that the chain is not longer than 'len' and
+			 * 'len' does not wrap around below.
+			 */
+			seglen = len;
+		}
+
+		tail->m_next = m_get(how, MT_DATA);
+		if (tail->m_next == NULL)
+			goto error;
+		tail = tail->m_next;
+		tail->m_len = seglen;
+		if (m0->m_flags & (M_EXT | M_EXTPG)) {
+			tail->m_data = m0->m_data;
+			mb_dupcl(tail, m0);
+		} else
+			memcpy(mtod(tail, void *), mtod(m0, void *),
+			    tail->m_len);
+
+		totlen += seglen;
+		m0 = m0->m_next;
+		len -= seglen;
+	}
+	*actual_len = totlen;
+	return (m);
+error:
+	m_freem(m);
+	return (NULL);
+}
+
+/*
+ * Dispatches on the type of the memory descriptor to the helper that
+ * wraps that kind of buffer in external mbufs.  'offset' and 'len'
+ * select the bytes of the described buffer to cover.  Returns the new
+ * chain, or NULL if the helper could not build one.  On success, the
+ * length of the chain is stored in *actual_len unless actual_len is
+ * NULL.  Descriptors that wrap a uio are not supported and panic.
+ */
+struct mbuf *
+memdesc_alloc_ext_mbufs(struct memdesc *mem,
+    memdesc_alloc_ext_mbuf_t *ext_alloc,
+    memdesc_alloc_extpg_mbuf_t *extpg_alloc, void *cb_arg, int how,
+    size_t offset, size_t len, size_t *actual_len, bool can_truncate)
+{
+	struct mbuf *chain;
+	size_t built;
+
+	switch (mem->md_type) {
+	case MEMDESC_VADDR:
+		chain = vaddr_ext_mbuf(ext_alloc, cb_arg, how,
+		    (char *)mem->u.md_vaddr + offset, len, &built);
+		break;
+	case MEMDESC_PADDR:
+		chain = paddr_ext_mbuf(extpg_alloc, cb_arg, how,
+		    mem->u.md_paddr + offset, len, &built, can_truncate);
+		break;
+	case MEMDESC_VLIST:
+		chain = vlist_ext_mbuf(ext_alloc, cb_arg, how,
+		    mem->u.md_list, mem->md_nseg, offset, len, &built);
+		break;
+	case MEMDESC_PLIST:
+		chain = plist_ext_mbuf(extpg_alloc, cb_arg, how,
+		    mem->u.md_list, mem->md_nseg, offset, len, &built,
+		    can_truncate);
+		break;
+	case MEMDESC_UIO:
+		panic("uio not supported");
+	case MEMDESC_MBUF:
+		/* The source is already mbufs; no callback is needed. */
+		chain = mbuf_subchain(mem->u.md_mbuf, offset, len, &built,
+		    can_truncate, how);
+		break;
+	case MEMDESC_VMPAGES:
+		chain = vmpages_ext_mbuf(extpg_alloc, cb_arg, how,
+		    mem->u.md_ma, mem->md_offset + offset, len, &built,
+		    can_truncate);
+		break;
+	default:
+		__assert_unreachable();
+	}
+
+	if (chain != NULL) {
+		/* A short chain is only acceptable if truncation is on. */
+		if (!can_truncate) {
+			KASSERT(built == len, ("short chain with no limit"));
+		} else {
+			KASSERT(built <= len, ("chain too long"));
+		}
+		KASSERT(m_length(chain, NULL) == built, ("length mismatch"));
+		if (actual_len != NULL)
+			*actual_len = built;
+	}
+	return (chain);
+}
diff --git a/sys/sys/memdesc.h b/sys/sys/memdesc.h
index 24a2c2da6347..f1880fee2cda 100644
--- a/sys/sys/memdesc.h
+++ b/sys/sys/memdesc.h
@@ -163,4 +163,40 @@ void	memdesc_copyback(struct memdesc *mem, int off, int size,
     const void *src);
 void	memdesc_copydata(struct memdesc *mem, int off, int size, void *dst);
 
+/*
+ * This routine constructs a chain of M_EXT mbufs backed by a data
+ * buffer described by a memory descriptor.  Some buffers may require
+ * multiple mbufs.  For memory descriptors using unmapped storage
+ * (e.g. memdesc_vmpages), M_EXTPG mbufs are used.
+ *
+ * The chain covers the len bytes of the described buffer that start
+ * offset bytes into it.  If actual_len is not NULL, the length of
+ * the returned chain is stored in it.
+ *
+ * Since memory descriptors are not an actual buffer, just a
+ * description of the buffer, the caller is required to supply a
+ * couple of helper routines to manage allocation of the raw mbufs and
+ * associate them with a reference to the underlying buffer.
+ *
+ * The memdesc_alloc_ext_mbuf_t callback is passed the callback
+ * argument as its first argument, the how flag as its second
+ * argument, and the pointer and length of a KVA buffer.  This
+ * callback should allocate an mbuf for the KVA buffer, either by
+ * making a copy of the data or using m_extaddref().
+ *
+ * The memdesc_alloc_extpg_mbuf_t callback is passed the callback
+ * argument as its first argument and the how flag as its second
+ * argument.  It should return an empty mbuf allocated by
+ * mb_alloc_ext_pgs.
+ *
+ * If either of the callbacks returns NULL, any partially allocated
+ * chain is freed and this routine returns NULL.
+ *
+ * If can_truncate is true, then this function might return a short
+ * chain to avoid gratuitously splitting up a page.
+ *
+ * For a memory descriptor that wraps an mbuf chain, neither callback
+ * is used: the new mbufs are allocated with m_get() and either
+ * reference or copy the data of the original chain.  Memory
+ * descriptors that wrap a uio are not supported; passing one panics.
+ */
+typedef struct mbuf *memdesc_alloc_ext_mbuf_t(void *, int, void *, size_t);
+typedef struct mbuf *memdesc_alloc_extpg_mbuf_t(void *, int);
+
+struct mbuf *memdesc_alloc_ext_mbufs(struct memdesc *mem,
+    memdesc_alloc_ext_mbuf_t *ext_alloc,
+    memdesc_alloc_extpg_mbuf_t *extpg_alloc, void *cb_arg, int how,
+    size_t offset, size_t len, size_t *actual_len, bool can_truncate);
+
 #endif /* _SYS_MEMDESC_H_ */