svn commit: r349529 - in head: sbin/ifconfig share/man/man9 sys/conf sys/kern sys/net sys/netinet sys/netinet6 sys/sys

John Baldwin jhb at FreeBSD.org
Sat Jun 29 00:48:37 UTC 2019


Author: jhb
Date: Sat Jun 29 00:48:33 2019
New Revision: 349529
URL: https://svnweb.freebsd.org/changeset/base/349529

Log:
  Add an external mbuf buffer type that holds multiple unmapped pages.
  
  Unmapped mbufs allow sendfile to carry multiple pages of data in a
  single mbuf, without mapping those pages.  It is a requirement for
  Netflix's in-kernel TLS, and provides a 5-10% CPU savings on heavy web
  serving workloads when used by sendfile, due to effectively
  compressing socket buffers by an order of magnitude, and hence
  reducing cache misses.
  
  For this new external mbuf buffer type (EXT_PGS), the ext_buf pointer
  now points to a struct mbuf_ext_pgs structure instead of a data
  buffer.  This structure contains an array of physical addresses (this
  reduces cache misses compared to an earlier version that stored an
  array of vm_page_t pointers).  It also stores additional fields needed
  for in-kernel TLS such as the TLS header and trailer data that are
  currently unused.  To more easily detect these mbufs, the M_NOMAP flag
  is set in m_flags in addition to M_EXT.
  
  Various functions like m_copydata() have been updated to safely access
  packet contents (using uiomove_fromphys()), to make things like BPF
  safe.
  
  NIC drivers advertise support for unmapped mbufs on transmit via a new
  IFCAP_NOMAP capability.  This capability can be toggled via the new
  'nomap' and '-nomap' ifconfig(8) commands.  For NIC drivers that only
  transmit packet contents via DMA and use bus_dma, adding the
  capability to if_capabilities and if_capenable should be all that is
  required.
  
  If a NIC does not support unmapped mbufs, they are converted to a
  chain of mapped mbufs (using sf_bufs to provide the mapping) in
  ip_output or ip6_output.  If an unmapped mbuf requires software
  checksums, it is also converted to a chain of mapped mbufs before
  computing the checksum.
  
  Submitted by:	gallatin (earlier version)
  Reviewed by:	gallatin, hselasky, rrs
  Discussed with:	ae, kp (firewalls)
  Relnotes:	yes
  Sponsored by:	Netflix
  Differential Revision:	https://reviews.freebsd.org/D20616

Modified:
  head/sbin/ifconfig/ifconfig.8
  head/sbin/ifconfig/ifconfig.c
  head/share/man/man9/Makefile
  head/share/man/man9/mbuf.9
  head/share/man/man9/sglist.9
  head/sys/conf/files
  head/sys/conf/kern.mk
  head/sys/kern/kern_mbuf.c
  head/sys/kern/subr_bus_dma.c
  head/sys/kern/subr_sglist.c
  head/sys/kern/uipc_mbuf.c
  head/sys/kern/uipc_sockbuf.c
  head/sys/kern/uipc_socket.c
  head/sys/net/bpf.c
  head/sys/net/bpf_buffer.c
  head/sys/net/if.h
  head/sys/netinet/ip_output.c
  head/sys/netinet/tcp_pcap.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet6/ip6_output.c
  head/sys/sys/mbuf.h
  head/sys/sys/sglist.h

Modified: head/sbin/ifconfig/ifconfig.8
==============================================================================
--- head/sbin/ifconfig/ifconfig.8	Fri Jun 28 23:40:58 2019	(r349528)
+++ head/sbin/ifconfig/ifconfig.8	Sat Jun 29 00:48:33 2019	(r349529)
@@ -28,7 +28,7 @@
 .\"     From: @(#)ifconfig.8	8.3 (Berkeley) 1/5/94
 .\" $FreeBSD$
 .\"
-.Dd May 18, 2019
+.Dd June 28, 2019
 .Dt IFCONFIG 8
 .Os
 .Sh NAME
@@ -538,6 +538,12 @@ large receive offloading, enable LRO on the interface.
 If the driver supports
 .Xr tcp 4
 large receive offloading, disable LRO on the interface.
+.It Cm nomap
+If the driver supports unmapped network buffers,
+enable them on the interface.
+.It Fl nomap
+If the driver supports unmapped network buffers,
+disable them on the interface.
 .It Cm wol , wol_ucast , wol_mcast , wol_magic
 Enable Wake On Lan (WOL) support, if available.
 WOL is a facility whereby a machine in a low power state may be woken

Modified: head/sbin/ifconfig/ifconfig.c
==============================================================================
--- head/sbin/ifconfig/ifconfig.c	Fri Jun 28 23:40:58 2019	(r349528)
+++ head/sbin/ifconfig/ifconfig.c	Sat Jun 29 00:48:33 2019	(r349529)
@@ -1257,7 +1257,7 @@ unsetifdescr(const char *val, int value, int s, const 
 "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
 "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
 "\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP"
 
 /*
  * Print the status of the interface.  If an address family was
@@ -1557,6 +1557,8 @@ static struct cmd basic_cmds[] = {
 	DEF_CMD("-link2",	-IFF_LINK2,	setifflags),
 	DEF_CMD("monitor",	IFF_MONITOR,	setifflags),
 	DEF_CMD("-monitor",	-IFF_MONITOR,	setifflags),
+	DEF_CMD("nomap",	IFCAP_NOMAP,	setifcap),
+	DEF_CMD("-nomap",	-IFCAP_NOMAP,	setifcap),
 	DEF_CMD("staticarp",	IFF_STATICARP,	setifflags),
 	DEF_CMD("-staticarp",	-IFF_STATICARP,	setifflags),
 	DEF_CMD("rxcsum6",	IFCAP_RXCSUM_IPV6,	setifcap),

Modified: head/share/man/man9/Makefile
==============================================================================
--- head/share/man/man9/Makefile	Fri Jun 28 23:40:58 2019	(r349528)
+++ head/share/man/man9/Makefile	Sat Jun 29 00:48:33 2019	(r349529)
@@ -1834,6 +1834,8 @@ MLINKS+=sf_buf.9 sf_buf_alloc.9 \
 MLINKS+=sglist.9 sglist_alloc.9 \
 	sglist.9 sglist_append.9 \
 	sglist.9 sglist_append_bio.9 \
+	sglist.9 sglist_append_ext_pgs.9 \
+	sglist.9 sglist_append_mb_ext_pgs.9 \
 	sglist.9 sglist_append_mbuf.9 \
 	sglist.9 sglist_append_phys.9 \
 	sglist.9 sglist_append_sglist.9 \
@@ -1844,6 +1846,8 @@ MLINKS+=sglist.9 sglist_alloc.9 \
 	sglist.9 sglist_clone.9 \
 	sglist.9 sglist_consume_uio.9 \
 	sglist.9 sglist_count.9 \
+	sglist.9 sglist_count_ext_pgs.9 \
+	sglist.9 sglist_count_mb_ext_pgs.9 \
 	sglist.9 sglist_count_vmpages.9 \
 	sglist.9 sglist_free.9 \
 	sglist.9 sglist_hold.9 \

Modified: head/share/man/man9/mbuf.9
==============================================================================
--- head/share/man/man9/mbuf.9	Fri Jun 28 23:40:58 2019	(r349528)
+++ head/share/man/man9/mbuf.9	Sat Jun 29 00:48:33 2019	(r349529)
@@ -213,7 +213,7 @@ flag bits are defined as follows:
 #define	M_PKTHDR	0x00000002 /* start of record */
 #define	M_EOR		0x00000004 /* end of record */
 #define	M_RDONLY	0x00000008 /* associated data marked read-only */
-#define	M_NOMAP		0x00000100 /* mbuf data is unmapped (soon from Drew) */
+#define	M_NOMAP		0x00000100 /* mbuf data is unmapped */
 #define	M_NOFREE	0x00000200 /* do not free mbuf, embedded in cluster */
 #define	M_BCAST		0x00000010 /* send/received as link-level broadcast */
 #define	M_MCAST		0x00000020 /* send/received as link-level multicast */
@@ -272,6 +272,7 @@ The available external buffer types are defined as fol
 #define	EXT_PACKET	6	/* mbuf+cluster from packet zone */
 #define	EXT_MBUF	7	/* external mbuf reference */
 #define	EXT_RXRING	8	/* data in NIC receive ring */
+#define	EXT_PGS		9	/* array of unmapped pages */
 
 #define	EXT_VENDOR1	224	/* for vendor-internal use */
 #define	EXT_VENDOR2	225	/* for vendor-internal use */

Modified: head/share/man/man9/sglist.9
==============================================================================
--- head/share/man/man9/sglist.9	Fri Jun 28 23:40:58 2019	(r349528)
+++ head/share/man/man9/sglist.9	Sat Jun 29 00:48:33 2019	(r349529)
@@ -26,7 +26,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd May 16, 2017
+.Dd June 28, 2019
 .Dt SGLIST 9
 .Os
 .Sh NAME
@@ -34,6 +34,8 @@
 .Nm sglist_alloc ,
 .Nm sglist_append ,
 .Nm sglist_append_bio ,
+.Nm sglist_append_ext_pgs ,
+.Nm sglist_append_mb_ext_pgs ,
 .Nm sglist_append_mbuf ,
 .Nm sglist_append_phys ,
 .Nm sglist_append_sglist ,
@@ -44,6 +46,8 @@
 .Nm sglist_clone ,
 .Nm sglist_consume_uio ,
 .Nm sglist_count ,
+.Nm sglist_count_ext_pgs ,
+.Nm sglist_count_mb_ext_pgs ,
 .Nm sglist_count_vmpages ,
 .Nm sglist_free ,
 .Nm sglist_hold ,
@@ -64,6 +68,10 @@
 .Ft int
 .Fn sglist_append_bio "struct sglist *sg" "struct bio *bp"
 .Ft int
+.Fn sglist_append_ext_pgs "struct sglist *sg" "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
+.Ft int
+.Fn sglist_append_mb_ext_pgs "struct sglist *sg" "struct mbuf *m"
+.Ft int
 .Fn sglist_append_mbuf "struct sglist *sg" "struct mbuf *m"
 .Ft int
 .Fn sglist_append_phys "struct sglist *sg" "vm_paddr_t paddr" "size_t len"
@@ -84,6 +92,10 @@
 .Ft int
 .Fn sglist_count "void *buf" "size_t len"
 .Ft int
+.Fn sglist_count_ext_pgs "struct mbuf_ext_pgs *ext_pgs" "size_t offset" "size_t len"
+.Ft int
+.Fn sglist_count_mb_ext_pgs "struct mbuf *m"
+.Ft int
 .Fn sglist_count_vmpages "vm_page_t *m" "size_t pgoff" "size_t len"
 .Ft void
 .Fn sglist_free "struct sglist *sg"
@@ -146,6 +158,22 @@ and is
 bytes long.
 .Pp
 The
+.Nm sglist_count_ext_pgs
+function returns the number of scatter/gather list elements needed to describe
+the unmapped external mbuf buffer
+.Fa ext_pgs .
+The ranges start at an offset of
+.Fa offset
+relative to the start of the buffer and is
+.Fa len
+bytes long.
+The
+.Nm sglist_count_mb_ext_pgs
+function returns the number of scatter/gather list elements needed to describe
+the physical address ranges of a single unmapped mbuf
+.Fa m .
+.Pp
+The
 .Nm sglist_count_vmpages
 function returns the number of scatter/gather list elements needed to describe
 the physical address ranges of a buffer backed by an array of virtual memory
@@ -237,6 +265,34 @@ to the scatter/gather list
 .Fa sg .
 .Pp
 The
+.Nm sglist_append_ext_pgs
+function appends the physical address ranges described by the unmapped
+external mbuf buffer
+.Fa ext_pgs
+to the scatter/gather list
+.Fa sg .
+The physical address ranges start at offset
+.Fa offset
+within
+.Fa ext_pgs
+and continue for
+.Fa len
+bytes.
+.Pp
+The
+.Nm sglist_append_mb_ext_pgs
+function appends the physical address ranges described by the unmapped
+mbuf
+.Fa m
+to the scatter/gather list
+.Fa sg .
+Note that unlike
+.Nm sglist_append_mbuf ,
+.Nm sglist_append_mb_ext_pgs
+only adds ranges for a single mbuf,
+not an entire mbuf chain.
+.Pp
+The
 .Nm sglist_append_mbuf
 function appends the physical address ranges described by an entire mbuf
 chain
@@ -467,8 +523,7 @@ functions return zero on success or an error on failur
 .Pp
 The
 .Nm sglist_count
-and
-.Nm sglist_count_vmpages
+family of
 functions return a count of scatter/gather list elements.
 .Pp
 The

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Fri Jun 28 23:40:58 2019	(r349528)
+++ head/sys/conf/files	Sat Jun 29 00:48:33 2019	(r349529)
@@ -4255,7 +4255,8 @@ netinet/tcp_lro.c		optional inet | inet6
 netinet/tcp_output.c		optional inet | inet6
 netinet/tcp_offload.c		optional tcp_offload inet | tcp_offload inet6
 netinet/tcp_hpts.c              optional tcphpts inet | tcphpts inet6
-netinet/tcp_pcap.c		optional inet tcppcap | inet6 tcppcap
+netinet/tcp_pcap.c		optional inet tcppcap | inet6 tcppcap \
+	compile-with "${NORMAL_C} ${NO_WNONNULL}"
 netinet/tcp_reass.c		optional inet | inet6
 netinet/tcp_sack.c		optional inet | inet6
 netinet/tcp_subr.c		optional inet | inet6

Modified: head/sys/conf/kern.mk
==============================================================================
--- head/sys/conf/kern.mk	Fri Jun 28 23:40:58 2019	(r349528)
+++ head/sys/conf/kern.mk	Sat Jun 29 00:48:33 2019	(r349529)
@@ -76,6 +76,7 @@ CWARNEXTRA?=	-Wno-uninitialized
 # GCC 4.2 doesn't have -Wno-error=cast-qual, so just disable the warning for
 # the few files that are already known to generate cast-qual warnings.
 NO_WCAST_QUAL= -Wno-cast-qual
+NO_WNONNULL=	-Wno-nonnull
 .endif
 .endif
 

Modified: head/sys/kern/kern_mbuf.c
==============================================================================
--- head/sys/kern/kern_mbuf.c	Fri Jun 28 23:40:58 2019	(r349528)
+++ head/sys/kern/kern_mbuf.c	Sat Jun 29 00:48:33 2019	(r349529)
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
+#include <sys/sf_buf.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sysctl.h>
@@ -281,6 +282,7 @@ uma_zone_t	zone_pack;
 uma_zone_t	zone_jumbop;
 uma_zone_t	zone_jumbo9;
 uma_zone_t	zone_jumbo16;
+uma_zone_t	zone_extpgs;
 
 /*
  * Local prototypes.
@@ -298,6 +300,9 @@ static void    *mbuf_jumbo_alloc(uma_zone_t, vm_size_t
 /* Ensure that MSIZE is a power of 2. */
 CTASSERT((((MSIZE - 1) ^ MSIZE) + 1) >> 1 == MSIZE);
 
+_Static_assert(sizeof(struct mbuf_ext_pgs) == 256,
+    "mbuf_ext_pgs size mismatch");
+
 /*
  * Initialize FreeBSD Network buffer allocation.
  */
@@ -379,6 +384,15 @@ mbuf_init(void *dummy)
 	uma_zone_set_warning(zone_jumbo16, "kern.ipc.nmbjumbo16 limit reached");
 	uma_zone_set_maxaction(zone_jumbo16, mb_reclaim);
 
+	zone_extpgs = uma_zcreate(MBUF_EXTPGS_MEM_NAME,
+	    sizeof(struct mbuf_ext_pgs),
+#ifdef INVARIANTS
+	    trash_ctor, trash_dtor, trash_init, trash_fini,
+#else
+	    NULL, NULL, NULL, NULL,
+#endif
+	    UMA_ALIGN_CACHE, 0);
+
 	/*
 	 * Hook event handler for low-memory situation, used to
 	 * drain protocols and push data back to the caches (UMA
@@ -824,6 +838,380 @@ mb_reclaim(uma_zone_t zone __unused, int pending __unu
 }
 
 /*
+ * Free "count" units of I/O from an mbuf chain.  They could be held
+ * in EXT_PGS or just as a normal mbuf.  This code is intended to be
+ * called in an error path (I/O error, closed connection, etc).
+ */
+void
+mb_free_notready(struct mbuf *m, int count)
+{
+	int i;
+
+	for (i = 0; i < count && m != NULL; i++) {
+		if ((m->m_flags & M_EXT) != 0 &&
+		    m->m_ext.ext_type == EXT_PGS) {
+			m->m_ext.ext_pgs->nrdy--;
+			if (m->m_ext.ext_pgs->nrdy != 0)
+				continue;
+		}
+		m = m_free(m);
+	}
+	KASSERT(i == count, ("Removed only %d items from %p", i, m));
+}
+
+/*
+ * Compress an unmapped mbuf into a simple mbuf when it holds a small
+ * amount of data.  This is used as a DOS defense to avoid having
+ * small packets tie up wired pages, an ext_pgs structure, and an
+ * mbuf.  Since this converts the existing mbuf in place, it can only
+ * be used if there are no other references to 'm'.
+ */
+int
+mb_unmapped_compress(struct mbuf *m)
+{
+	volatile u_int *refcnt;
+	struct mbuf m_temp;
+
+	/*
+	 * Assert that 'm' does not have a packet header.  If 'm' had
+	 * a packet header, it would only be able to hold MHLEN bytes
+	 * and m_data would have to be initialized differently.
+	 */
+	KASSERT((m->m_flags & M_PKTHDR) == 0 && (m->m_flags & M_EXT) &&
+	    m->m_ext.ext_type == EXT_PGS,
+            ("%s: m %p !M_EXT or !EXT_PGS or M_PKTHDR", __func__, m));
+	KASSERT(m->m_len <= MLEN, ("m_len too large %p", m));
+
+	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
+		refcnt = &m->m_ext.ext_count;
+	} else {
+		KASSERT(m->m_ext.ext_cnt != NULL,
+		    ("%s: no refcounting pointer on %p", __func__, m));
+		refcnt = m->m_ext.ext_cnt;
+	}
+
+	if (*refcnt != 1)
+		return (EBUSY);
+
+	/*
+	 * Copy mbuf header and m_ext portion of 'm' to 'm_temp' to
+	 * create a "fake" EXT_PGS mbuf that can be used with
+	 * m_copydata() as well as the ext_free callback.
+	 */
+	memcpy(&m_temp, m, offsetof(struct mbuf, m_ext) + sizeof (m->m_ext));
+	m_temp.m_next = NULL;
+	m_temp.m_nextpkt = NULL;
+
+	/* Turn 'm' into a "normal" mbuf. */
+	m->m_flags &= ~(M_EXT | M_RDONLY | M_NOMAP);
+	m->m_data = m->m_dat;
+
+	/* Copy data from template's ext_pgs. */
+	m_copydata(&m_temp, 0, m_temp.m_len, mtod(m, caddr_t));
+
+	/* Free the backing pages. */
+	m_temp.m_ext.ext_free(&m_temp);
+
+	/* Finally, free the ext_pgs struct. */
+	uma_zfree(zone_extpgs, m_temp.m_ext.ext_pgs);
+	return (0);
+}
+
+/*
+ * These next few routines are used to permit downgrading an unmapped
+ * mbuf to a chain of mapped mbufs.  This is used when an interface
+ * doesn't supported unmapped mbufs or if checksums need to be
+ * computed in software.
+ *
+ * Each unmapped mbuf is converted to a chain of mbufs.  First, any
+ * TLS header data is stored in a regular mbuf.  Second, each page of
+ * unmapped data is stored in an mbuf with an EXT_SFBUF external
+ * cluster.  These mbufs use an sf_buf to provide a valid KVA for the
+ * associated physical page.  They also hold a reference on the
+ * original EXT_PGS mbuf to ensure the physical page doesn't go away.
+ * Finally, any TLS trailer data is stored in a regular mbuf.
+ *
+ * mb_unmapped_free_mext() is the ext_free handler for the EXT_SFBUF
+ * mbufs.  It frees the associated sf_buf and releases its reference
+ * on the original EXT_PGS mbuf.
+ *
+ * _mb_unmapped_to_ext() is a helper function that converts a single
+ * unmapped mbuf into a chain of mbufs.
+ *
+ * mb_unmapped_to_ext() is the public function that walks an mbuf
+ * chain converting any unmapped mbufs to mapped mbufs.  It returns
+ * the new chain of unmapped mbufs on success.  On failure it frees
+ * the original mbuf chain and returns NULL.
+ */
+static void
+mb_unmapped_free_mext(struct mbuf *m)
+{
+	struct sf_buf *sf;
+	struct mbuf *old_m;
+
+	sf = m->m_ext.ext_arg1;
+	sf_buf_free(sf);
+
+	/* Drop the reference on the backing EXT_PGS mbuf. */
+	old_m = m->m_ext.ext_arg2;
+	mb_free_ext(old_m);
+}
+
+static struct mbuf *
+_mb_unmapped_to_ext(struct mbuf *m)
+{
+	struct mbuf_ext_pgs *ext_pgs;
+	struct mbuf *m_new, *top, *prev, *mref;
+	struct sf_buf *sf;
+	vm_page_t pg;
+	int i, len, off, pglen, pgoff, seglen, segoff;
+	volatile u_int *refcnt;
+	u_int ref_inc = 0;
+
+	MBUF_EXT_PGS_ASSERT(m);
+	ext_pgs = m->m_ext.ext_pgs;
+	len = m->m_len;
+	KASSERT(ext_pgs->tls == NULL, ("%s: can't convert TLS mbuf %p",
+	    __func__, m));
+
+	/* See if this is the mbuf that holds the embedded refcount. */
+	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
+		refcnt = &m->m_ext.ext_count;
+		mref = m;
+	} else {
+		KASSERT(m->m_ext.ext_cnt != NULL,
+		    ("%s: no refcounting pointer on %p", __func__, m));
+		refcnt = m->m_ext.ext_cnt;
+		mref = __containerof(refcnt, struct mbuf, m_ext.ext_count);
+	}
+
+	/* Skip over any data removed from the front. */
+	off = mtod(m, vm_offset_t);
+
+	top = NULL;
+	if (ext_pgs->hdr_len != 0) {
+		if (off >= ext_pgs->hdr_len) {
+			off -= ext_pgs->hdr_len;
+		} else {
+			seglen = ext_pgs->hdr_len - off;
+			segoff = off;
+			seglen = min(seglen, len);
+			off = 0;
+			len -= seglen;
+			m_new = m_get(M_NOWAIT, MT_DATA);
+			if (m_new == NULL)
+				goto fail;
+			m_new->m_len = seglen;
+			prev = top = m_new;
+			memcpy(mtod(m_new, void *), &ext_pgs->hdr[segoff],
+			    seglen);
+		}
+	}
+	pgoff = ext_pgs->first_pg_off;
+	for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
+		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+		if (off >= pglen) {
+			off -= pglen;
+			pgoff = 0;
+			continue;
+		}
+		seglen = pglen - off;
+		segoff = pgoff + off;
+		off = 0;
+		seglen = min(seglen, len);
+		len -= seglen;
+
+		pg = PHYS_TO_VM_PAGE(ext_pgs->pa[i]);
+		m_new = m_get(M_NOWAIT, MT_DATA);
+		if (m_new == NULL)
+			goto fail;
+		if (top == NULL) {
+			top = prev = m_new;
+		} else {
+			prev->m_next = m_new;
+			prev = m_new;
+		}
+		sf = sf_buf_alloc(pg, SFB_NOWAIT);
+		if (sf == NULL)
+			goto fail;
+
+		ref_inc++;
+		m_extadd(m_new, (char *)sf_buf_kva(sf), PAGE_SIZE,
+		    mb_unmapped_free_mext, sf, mref, M_RDONLY, EXT_SFBUF);
+		m_new->m_data += segoff;
+		m_new->m_len = seglen;
+
+		pgoff = 0;
+	};
+	if (len != 0) {
+		KASSERT((off + len) <= ext_pgs->trail_len,
+		    ("off + len > trail (%d + %d > %d)", off, len,
+		    ext_pgs->trail_len));
+		m_new = m_get(M_NOWAIT, MT_DATA);
+		if (m_new == NULL)
+			goto fail;
+		if (top == NULL)
+			top = m_new;
+		else
+			prev->m_next = m_new;
+		m_new->m_len = len;
+		memcpy(mtod(m_new, void *), &ext_pgs->trail[off], len);
+	}
+
+	if (ref_inc != 0) {
+		/*
+		 * Obtain an additional reference on the old mbuf for
+		 * each created EXT_SFBUF mbuf.  They will be dropped
+		 * in mb_unmapped_free_mext().
+		 */
+		if (*refcnt == 1)
+			*refcnt += ref_inc;
+		else
+			atomic_add_int(refcnt, ref_inc);
+	}
+	m_free(m);
+	return (top);
+
+fail:
+	if (ref_inc != 0) {
+		/*
+		 * Obtain an additional reference on the old mbuf for
+		 * each created EXT_SFBUF mbuf.  They will be
+		 * immediately dropped when these mbufs are freed
+		 * below.
+		 */
+		if (*refcnt == 1)
+			*refcnt += ref_inc;
+		else
+			atomic_add_int(refcnt, ref_inc);
+	}
+	m_free(m);
+	m_freem(top);
+	return (NULL);
+}
+
+struct mbuf *
+mb_unmapped_to_ext(struct mbuf *top)
+{
+	struct mbuf *m, *next, *prev = NULL;
+
+	prev = NULL;
+	for (m = top; m != NULL; m = next) {
+		/* m might be freed, so cache the next pointer. */
+		next = m->m_next;
+		if (m->m_flags & M_NOMAP) {
+			if (prev != NULL) {
+				/*
+				 * Remove 'm' from the new chain so
+				 * that the 'top' chain terminates
+				 * before 'm' in case 'top' is freed
+				 * due to an error.
+				 */
+				prev->m_next = NULL;
+			}
+			m = _mb_unmapped_to_ext(m);
+			if (m == NULL) {
+				m_freem(top);
+				m_freem(next);
+				return (NULL);
+			}
+			if (prev == NULL) {
+				top = m;
+			} else {
+				prev->m_next = m;
+			}
+
+			/*
+			 * Replaced one mbuf with a chain, so we must
+			 * find the end of chain.
+			 */
+			prev = m_last(m);
+		} else {
+			if (prev != NULL) {
+				prev->m_next = m;
+			}
+			prev = m;
+		}
+	}
+	return (top);
+}
+
+/*
+ * Allocate an empty EXT_PGS mbuf.  The ext_free routine is
+ * responsible for freeing any pages backing this mbuf when it is
+ * freed.
+ */
+struct mbuf *
+mb_alloc_ext_pgs(int how, bool pkthdr, m_ext_free_t ext_free)
+{
+	struct mbuf *m;
+	struct mbuf_ext_pgs *ext_pgs;
+
+	if (pkthdr)
+		m = m_gethdr(how, MT_DATA);
+	else
+		m = m_get(how, MT_DATA);
+	if (m == NULL)
+		return (NULL);
+
+	ext_pgs = uma_zalloc(zone_extpgs, how);
+	if (ext_pgs == NULL) {
+		m_free(m);
+		return (NULL);
+	}
+	ext_pgs->npgs = 0;
+	ext_pgs->nrdy = 0;
+	ext_pgs->first_pg_off = 0;
+	ext_pgs->last_pg_len = 0;
+	ext_pgs->hdr_len = 0;
+	ext_pgs->trail_len = 0;
+	ext_pgs->tls = NULL;
+	ext_pgs->so = NULL;
+	m->m_data = NULL;
+	m->m_flags |= (M_EXT | M_RDONLY | M_NOMAP);
+	m->m_ext.ext_type = EXT_PGS;
+	m->m_ext.ext_flags = EXT_FLAG_EMBREF;
+	m->m_ext.ext_count = 1;
+	m->m_ext.ext_pgs = ext_pgs;
+	m->m_ext.ext_size = 0;
+	m->m_ext.ext_free = ext_free;
+	return (m);
+}
+
+#ifdef INVARIANT_SUPPORT
+void
+mb_ext_pgs_check(struct mbuf_ext_pgs *ext_pgs)
+{
+
+	/*
+	 * NB: This expects a non-empty buffer (npgs > 0 and
+	 * last_pg_len > 0).
+	 */
+	KASSERT(ext_pgs->npgs > 0,
+	    ("ext_pgs with no valid pages: %p", ext_pgs));
+	KASSERT(ext_pgs->npgs <= nitems(ext_pgs->pa),
+	    ("ext_pgs with too many pages: %p", ext_pgs));
+	KASSERT(ext_pgs->nrdy <= ext_pgs->npgs,
+	    ("ext_pgs with too many ready pages: %p", ext_pgs));
+	KASSERT(ext_pgs->first_pg_off < PAGE_SIZE,
+	    ("ext_pgs with too large page offset: %p", ext_pgs));
+	KASSERT(ext_pgs->last_pg_len > 0,
+	    ("ext_pgs with zero last page length: %p", ext_pgs));
+	KASSERT(ext_pgs->last_pg_len <= PAGE_SIZE,
+	    ("ext_pgs with too large last page length: %p", ext_pgs));
+	if (ext_pgs->npgs == 1) {
+		KASSERT(ext_pgs->first_pg_off + ext_pgs->last_pg_len <=
+		    PAGE_SIZE, ("ext_pgs with single page too large: %p",
+		    ext_pgs));
+	}
+	KASSERT(ext_pgs->hdr_len <= sizeof(ext_pgs->hdr),
+	    ("ext_pgs with too large header length: %p", ext_pgs));
+	KASSERT(ext_pgs->trail_len <= sizeof(ext_pgs->trail),
+	    ("ext_pgs with too large header length: %p", ext_pgs));
+}
+#endif
+
+/*
  * Clean up after mbufs with M_EXT storage attached to them if the
  * reference count hits 1.
  */
@@ -886,6 +1274,10 @@ mb_free_ext(struct mbuf *m)
 			break;
 		case EXT_JUMBO16:
 			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
+			uma_zfree(zone_mbuf, mref);
+			break;
+		case EXT_PGS:
+			uma_zfree(zone_extpgs, mref->m_ext.ext_pgs);
 			uma_zfree(zone_mbuf, mref);
 			break;
 		case EXT_SFBUF:

Modified: head/sys/kern/subr_bus_dma.c
==============================================================================
--- head/sys/kern/subr_bus_dma.c	Fri Jun 28 23:40:58 2019	(r349528)
+++ head/sys/kern/subr_bus_dma.c	Sat Jun 29 00:48:33 2019	(r349529)
@@ -111,6 +111,67 @@ _bus_dmamap_load_plist(bus_dma_tag_t dmat, bus_dmamap_
 }
 
 /*
+ * Load an unmapped mbuf
+ */
+static int
+_bus_dmamap_load_unmapped_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
+    struct mbuf *m, bus_dma_segment_t *segs, int *nsegs, int flags)
+{
+	struct mbuf_ext_pgs *ext_pgs;
+	int error, i, off, len, pglen, pgoff, seglen, segoff;
+
+	MBUF_EXT_PGS_ASSERT(m);
+	ext_pgs = m->m_ext.ext_pgs;
+
+	len = m->m_len;
+	error = 0;
+
+	/* Skip over any data removed from the front. */
+	off = mtod(m, vm_offset_t);
+
+	if (ext_pgs->hdr_len != 0) {
+		if (off >= ext_pgs->hdr_len) {
+			off -= ext_pgs->hdr_len;
+		} else {
+			seglen = ext_pgs->hdr_len - off;
+			segoff = off;
+			seglen = min(seglen, len);
+			off = 0;
+			len -= seglen;
+			error = _bus_dmamap_load_buffer(dmat, map,
+			    &ext_pgs->hdr[segoff], seglen, kernel_pmap,
+			    flags, segs, nsegs);
+		}
+	}
+	pgoff = ext_pgs->first_pg_off;
+	for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
+		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+		if (off >= pglen) {
+			off -= pglen;
+			pgoff = 0;
+			continue;
+		}
+		seglen = pglen - off;
+		segoff = pgoff + off;
+		off = 0;
+		seglen = min(seglen, len);
+		len -= seglen;
+		error = _bus_dmamap_load_phys(dmat, map,
+		    ext_pgs->pa[i] + segoff, seglen, flags, segs, nsegs);
+		pgoff = 0;
+	};
+	if (len != 0 && error == 0) {
+		KASSERT((off + len) <= ext_pgs->trail_len,
+		    ("off + len > trail (%d + %d > %d)", off, len,
+		    ext_pgs->trail_len));
+		error = _bus_dmamap_load_buffer(dmat, map,
+		    &ext_pgs->trail[off], len, kernel_pmap, flags, segs,
+		    nsegs);
+	}
+	return (error);
+}
+
+/*
  * Load an mbuf chain.
  */
 static int
@@ -123,9 +184,13 @@ _bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmama
 	error = 0;
 	for (m = m0; m != NULL && error == 0; m = m->m_next) {
 		if (m->m_len > 0) {
-			error = _bus_dmamap_load_buffer(dmat, map, m->m_data,
-			    m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF,
-			    segs, nsegs);
+			if ((m->m_flags & M_NOMAP) != 0)
+				error = _bus_dmamap_load_unmapped_mbuf_sg(dmat,
+				    map, m, segs, nsegs, flags);
+			else
+				error = _bus_dmamap_load_buffer(dmat, map,
+				    m->m_data, m->m_len, kernel_pmap,
+				    flags | BUS_DMA_LOAD_MBUF, segs, nsegs);
 		}
 	}
 	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",

Modified: head/sys/kern/subr_sglist.c
==============================================================================
--- head/sys/kern/subr_sglist.c	Fri Jun 28 23:40:58 2019	(r349528)
+++ head/sys/kern/subr_sglist.c	Sat Jun 29 00:48:33 2019	(r349529)
@@ -219,6 +219,75 @@ sglist_count_vmpages(vm_page_t *m, size_t pgoff, size_
 }
 
 /*
+ * Determine the number of scatter/gather list elements needed to
+ * describe an EXT_PGS buffer.
+ */
+int
+sglist_count_ext_pgs(struct mbuf_ext_pgs *ext_pgs, size_t off, size_t len)
+{
+	vm_paddr_t nextaddr, paddr;
+	size_t seglen, segoff;
+	int i, nsegs, pglen, pgoff;
+
+	if (len == 0)
+		return (0);
+
+	nsegs = 0;
+	if (ext_pgs->hdr_len != 0) {
+		if (off >= ext_pgs->hdr_len) {
+			off -= ext_pgs->hdr_len;
+		} else {
+			seglen = ext_pgs->hdr_len - off;
+			segoff = off;
+			seglen = MIN(seglen, len);
+			off = 0;
+			len -= seglen;
+			nsegs += sglist_count(&ext_pgs->hdr[segoff], seglen);
+		}
+	}
+	nextaddr = 0;
+	pgoff = ext_pgs->first_pg_off;
+	for (i = 0; i < ext_pgs->npgs && len > 0; i++) {
+		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+		if (off >= pglen) {
+			off -= pglen;
+			pgoff = 0;
+			continue;
+		}
+		seglen = pglen - off;
+		segoff = pgoff + off;
+		off = 0;
+		seglen = MIN(seglen, len);
+		len -= seglen;
+		paddr = ext_pgs->pa[i] + segoff;
+		if (paddr != nextaddr)
+			nsegs++;
+		nextaddr = paddr + seglen;
+		pgoff = 0;
+	};
+	if (len != 0) {
+		seglen = MIN(len, ext_pgs->trail_len - off);
+		len -= seglen;
+		nsegs += sglist_count(&ext_pgs->trail[off], seglen);
+	}
+	KASSERT(len == 0, ("len != 0"));
+	return (nsegs);
+}
+
+/*
+ * Determine the number of scatter/gather list elements needed to
+ * describe an EXT_PGS mbuf.
+ */
+int
+sglist_count_mb_ext_pgs(struct mbuf *m)
+{
+
+	MBUF_EXT_PGS_ASSERT(m);
+	return (sglist_count_ext_pgs(m->m_ext.ext_pgs, mtod(m, vm_offset_t),
+	    m->m_len));
+}
+
+/*
  * Allocate a scatter/gather list along with 'nsegs' segments.  The
  * 'mflags' parameters are the same as passed to malloc(9).  The caller
  * should use sglist_free() to free this list.
@@ -320,6 +389,76 @@ sglist_append_phys(struct sglist *sg, vm_paddr_t paddr
 }
 
 /*
+ * Append the segments to describe an EXT_PGS buffer to a
+ * scatter/gather list.  If there are insufficient segments, then this
+ * fails with EFBIG.
+ */
+int
+sglist_append_ext_pgs(struct sglist *sg, struct mbuf_ext_pgs *ext_pgs,
+    size_t off, size_t len)
+{
+	size_t seglen, segoff;
+	vm_paddr_t paddr;
+	int error, i, pglen, pgoff;
+
+	error = 0;
+	if (ext_pgs->hdr_len != 0) {
+		if (off >= ext_pgs->hdr_len) {
+			off -= ext_pgs->hdr_len;
+		} else {
+			seglen = ext_pgs->hdr_len - off;
+			segoff = off;
+			seglen = MIN(seglen, len);
+			off = 0;
+			len -= seglen;
+			error = sglist_append(sg,
+			    &ext_pgs->hdr[segoff], seglen);
+		}
+	}
+	pgoff = ext_pgs->first_pg_off;
+	for (i = 0; i < ext_pgs->npgs && error == 0 && len > 0; i++) {
+		pglen = mbuf_ext_pg_len(ext_pgs, i, pgoff);
+		if (off >= pglen) {
+			off -= pglen;
+			pgoff = 0;
+			continue;
+		}
+		seglen = pglen - off;
+		segoff = pgoff + off;
+		off = 0;
+		seglen = MIN(seglen, len);
+		len -= seglen;
+		paddr = ext_pgs->pa[i] + segoff;
+		error = sglist_append_phys(sg, paddr, seglen);
+		pgoff = 0;
+	};
+	if (error == 0 && len > 0) {
+		seglen = MIN(len, ext_pgs->trail_len - off);
+		len -= seglen;
+		error = sglist_append(sg,
+		    &ext_pgs->trail[off], seglen);
+	}
+	if (error == 0)
+		KASSERT(len == 0, ("len != 0"));
+	return (error);
+}
+
+/*
+ * Append the segments to describe an EXT_PGS mbuf to a scatter/gather
+ * list.  If there are insufficient segments, then this fails with
+ * EFBIG.
+ */
+int
+sglist_append_mb_ext_pgs(struct sglist *sg, struct mbuf *m)
+{
+
+	/* for now, all unmapped mbufs are assumed to be EXT_PGS */
+	MBUF_EXT_PGS_ASSERT(m);
+	return (sglist_append_ext_pgs(sg, m->m_ext.ext_pgs,
+	    mtod(m, vm_offset_t), m->m_len));
+}
+
+/*
  * Append the segments that describe a single mbuf chain to a
  * scatter/gather list.  If there are insufficient segments, then this
  * fails with EFBIG.
@@ -338,7 +477,11 @@ sglist_append_mbuf(struct sglist *sg, struct mbuf *m0)
 	SGLIST_SAVE(sg, save);
 	for (m = m0; m != NULL; m = m->m_next) {
 		if (m->m_len > 0) {
-			error = sglist_append(sg, m->m_data, m->m_len);
+			if ((m->m_flags & M_NOMAP) != 0)
+				error = sglist_append_mb_ext_pgs(sg, m);
+			else
+				error = sglist_append(sg, m->m_data,
+				    m->m_len);
 			if (error) {
 				SGLIST_RESTORE(sg, save);
 				return (error);

Modified: head/sys/kern/uipc_mbuf.c
==============================================================================
--- head/sys/kern/uipc_mbuf.c	Fri Jun 28 23:40:58 2019	(r349528)
+++ head/sys/kern/uipc_mbuf.c	Sat Jun 29 00:48:33 2019	(r349529)
@@ -49,7 +49,11 @@ __FBSDID("$FreeBSD$");
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/uio.h>
+#include <sys/vmmeter.h>
 #include <sys/sdt.h>
+#include <vm/vm.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_page.h>
 
 SDT_PROBE_DEFINE5_XLATE(sdt, , , m__init,
     "struct mbuf *", "mbufinfo_t *",
@@ -202,7 +206,7 @@ mb_dupcl(struct mbuf *n, struct mbuf *m)
 	else
 		bcopy(&m->m_ext, &n->m_ext, m_ext_copylen);
 	n->m_flags |= M_EXT;
-	n->m_flags |= m->m_flags & M_RDONLY;
+	n->m_flags |= m->m_flags & (M_RDONLY | M_NOMAP);
 
 	/* See if this is the mbuf that holds the embedded refcount. */
 	if (m->m_ext.ext_flags & EXT_FLAG_EMBREF) {
@@ -246,7 +250,8 @@ m_demote(struct mbuf *m0, int all, int flags)
 		    __func__, m, m0));
 		if (m->m_flags & M_PKTHDR)
 			m_demote_pkthdr(m);
-		m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE | flags);
+		m->m_flags = m->m_flags & (M_EXT | M_RDONLY | M_NOFREE |
+		    M_NOMAP | flags);
 	}
 }
 
@@ -376,7 +381,8 @@ m_move_pkthdr(struct mbuf *to, struct mbuf *from)
 	if (to->m_flags & M_PKTHDR)
 		m_tag_delete_chain(to, NULL);
 #endif
-	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+	to->m_flags = (from->m_flags & M_COPYFLAGS) |
+	    (to->m_flags & (M_EXT | M_NOMAP));
 	if ((to->m_flags & M_EXT) == 0)
 		to->m_data = to->m_pktdat;
 	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
@@ -414,7 +420,8 @@ m_dup_pkthdr(struct mbuf *to, const struct mbuf *from,
 	if (to->m_flags & M_PKTHDR)
 		m_tag_delete_chain(to, NULL);
 #endif
-	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
+	to->m_flags = (from->m_flags & M_COPYFLAGS) |
+	    (to->m_flags & (M_EXT | M_NOMAP));
 	if ((to->m_flags & M_EXT) == 0)
 		to->m_data = to->m_pktdat;
 	to->m_pkthdr = from->m_pkthdr;
@@ -579,6 +586,30 @@ nospace:
 	return (NULL);
 }
 
+static void
+m_copyfromunmapped(const struct mbuf *m, int off, int len, caddr_t cp)
+{
+	struct iovec iov;
+	struct uio uio;
+	int error;
+
+	KASSERT(off >= 0, ("m_copyfromunmapped: negative off %d", off));
+	KASSERT(len >= 0, ("m_copyfromunmapped: negative len %d", len));
+	KASSERT(off < m->m_len,
+	    ("m_copyfromunmapped: len exceeds mbuf length"));
+	iov.iov_base = cp;
+	iov.iov_len = len;
+	uio.uio_resid = len;
+	uio.uio_iov = &iov;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_iovcnt = 1;
+	uio.uio_offset = 0;
+	uio.uio_rw = UIO_READ;
+	error = m_unmappedtouio(m, off, &uio, len);
+	KASSERT(error == 0, ("m_unmappedtouio failed: off %d, len %d", off,
+	   len));
+}
+
 /*
  * Copy data from an mbuf chain starting "off" bytes from the beginning,
  * continuing for "len" bytes, into the indicated buffer.
@@ -600,7 +631,10 @@ m_copydata(const struct mbuf *m, int off, int len, cad

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list