svn commit: r268535 - in head/sys: kern sys

Gleb Smirnoff glebius at FreeBSD.org
Fri Jul 11 19:40:52 UTC 2014


Author: glebius
Date: Fri Jul 11 19:40:50 2014
New Revision: 268535
URL: http://svnweb.freebsd.org/changeset/base/268535

Log:
  Improve reference counting of EXT_SFBUF pages attached to mbufs.
  
  o Do not use UMA refcount zone. The problem with this zone is that
    several refcounting words (16 on amd64) share the same cache line,
    and issueing atomic(9) updates on them creates cache line contention.
    Also, allocating and freeing them is extra CPU cycles.
    Instead, refcount the page directly via vm_page_wire() and the sfbuf
    via sf_buf_alloc(sf_buf_page(sf)) [1].
  
  o Call refcounting/freeing function for EXT_SFBUF via direct function
    call, instead of function pointer. This removes barrier for CPU
    branch predictor.
  
  o Do not cleanup the mbuf to be freed in mb_free_ext(), merely to
    satisfy assertion in mb_dtor_mbuf(). Remove the assertion from
    mb_dtor_mbuf(). Use bcopy() instead of manual assignments to
    copy m_ext in mb_dupcl().
  
  [1] This has some problems for now. Using sf_buf_alloc() merely to
      increase refcount is expensive, and is broken on sparc64. To be
      fixed.
  
  Sponsored by:	Netflix
  Sponsored by:	Nginx, Inc.

Modified:
  head/sys/kern/kern_mbuf.c
  head/sys/kern/uipc_mbuf.c
  head/sys/kern/uipc_syscalls.c
  head/sys/sys/mbuf.h
  head/sys/sys/sf_buf.h

Modified: head/sys/kern/kern_mbuf.c
==============================================================================
--- head/sys/kern/kern_mbuf.c	Fri Jul 11 17:31:40 2014	(r268534)
+++ head/sys/kern/kern_mbuf.c	Fri Jul 11 19:40:50 2014	(r268535)
@@ -449,7 +449,6 @@ mb_dtor_mbuf(void *mem, int size, void *
 
 	if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
 		m_tag_delete_chain(m, NULL);
-	KASSERT((m->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
 	KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
 #ifdef INVARIANTS
 	trash_dtor(mem, size, arg);

Modified: head/sys/kern/uipc_mbuf.c
==============================================================================
--- head/sys/kern/uipc_mbuf.c	Fri Jul 11 17:31:40 2014	(r268534)
+++ head/sys/kern/uipc_mbuf.c	Fri Jul 11 19:40:50 2014	(r268535)
@@ -287,19 +287,31 @@ m_extadd(struct mbuf *mb, caddr_t buf, u
 void
 mb_free_ext(struct mbuf *m)
 {
-	int skipmbuf;
+	int freembuf;
 
-	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
-	KASSERT(m->m_ext.ext_cnt != NULL, ("%s: ext_cnt not set", __func__));
+	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
 
 	/*
-	 * check if the header is embedded in the cluster
+	 * Check if the header is embedded in the cluster.
 	 */
-	skipmbuf = (m->m_flags & M_NOFREE);
+	freembuf = (m->m_flags & M_NOFREE) ? 0 : 1;
+
+	switch (m->m_ext.ext_type) {
+	case EXT_SFBUF:
+		sf_ext_free(m->m_ext.ext_arg1, m->m_ext.ext_arg2);
+		break;
+	default:
+		KASSERT(m->m_ext.ext_cnt != NULL,
+		    ("%s: no refcounting pointer on %p", __func__, m));
+		/* 
+		 * Free attached storage if this mbuf is the only
+		 * reference to it.
+		 */
+		if (*(m->m_ext.ext_cnt) != 1) {
+			if (atomic_fetchadd_int(m->m_ext.ext_cnt, -1) != 1)
+				break;
+		}
 
-	/* Free attached storage if this mbuf is the only reference to it. */
-	if (*(m->m_ext.ext_cnt) == 1 ||
-	    atomic_fetchadd_int(m->m_ext.ext_cnt, -1) == 1) {
 		switch (m->m_ext.ext_type) {
 		case EXT_PACKET:	/* The packet zone is special. */
 			if (*(m->m_ext.ext_cnt) == 0)
@@ -318,7 +330,6 @@ mb_free_ext(struct mbuf *m)
 		case EXT_JUMBO16:
 			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
 			break;
-		case EXT_SFBUF:
 		case EXT_NET_DRV:
 		case EXT_MOD_TYPE:
 		case EXT_DISPOSABLE:
@@ -337,23 +348,9 @@ mb_free_ext(struct mbuf *m)
 				("%s: unknown ext_type", __func__));
 		}
 	}
-	if (skipmbuf)
-		return;
 
-	/*
-	 * Free this mbuf back to the mbuf zone with all m_ext
-	 * information purged.
-	 */
-	m->m_ext.ext_buf = NULL;
-	m->m_ext.ext_free = NULL;
-	m->m_ext.ext_arg1 = NULL;
-	m->m_ext.ext_arg2 = NULL;
-	m->m_ext.ext_cnt = NULL;
-	m->m_ext.ext_size = 0;
-	m->m_ext.ext_type = 0;
-	m->m_ext.ext_flags = 0;
-	m->m_flags &= ~M_EXT;
-	uma_zfree(zone_mbuf, m);
+	if (freembuf)
+		uma_zfree(zone_mbuf, m);
 }
 
 /*
@@ -363,22 +360,24 @@ mb_free_ext(struct mbuf *m)
 static void
 mb_dupcl(struct mbuf *n, struct mbuf *m)
 {
-	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
-	KASSERT(m->m_ext.ext_cnt != NULL, ("%s: ext_cnt not set", __func__));
-	KASSERT((n->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
 
-	if (*(m->m_ext.ext_cnt) == 1)
-		*(m->m_ext.ext_cnt) += 1;
-	else
-		atomic_add_int(m->m_ext.ext_cnt, 1);
-	n->m_ext.ext_buf = m->m_ext.ext_buf;
-	n->m_ext.ext_free = m->m_ext.ext_free;
-	n->m_ext.ext_arg1 = m->m_ext.ext_arg1;
-	n->m_ext.ext_arg2 = m->m_ext.ext_arg2;
-	n->m_ext.ext_size = m->m_ext.ext_size;
-	n->m_ext.ext_cnt = m->m_ext.ext_cnt;
-	n->m_ext.ext_type = m->m_ext.ext_type;
-	n->m_ext.ext_flags = m->m_ext.ext_flags;
+	KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
+	KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n));
+
+	switch (m->m_ext.ext_type) {
+	case EXT_SFBUF:
+		sf_ext_ref(m->m_ext.ext_arg1, m->m_ext.ext_arg2);
+		break;
+	default:
+		KASSERT(m->m_ext.ext_cnt != NULL,
+		    ("%s: no refcounting pointer on %p", __func__, m));
+		if (*(m->m_ext.ext_cnt) == 1)
+			*(m->m_ext.ext_cnt) += 1;
+		else
+			atomic_add_int(m->m_ext.ext_cnt, 1);
+	}
+
+	bcopy(&m->m_ext, &n->m_ext, sizeof(m->m_ext));
 	n->m_flags |= M_EXT;
 	n->m_flags |= m->m_flags & M_RDONLY;
 }

Modified: head/sys/kern/uipc_syscalls.c
==============================================================================
--- head/sys/kern/uipc_syscalls.c	Fri Jul 11 17:31:40 2014	(r268534)
+++ head/sys/kern/uipc_syscalls.c	Fri Jul 11 19:40:50 2014	(r268535)
@@ -1983,32 +1983,56 @@ filt_sfsync(struct knote *kn, long hint)
 	return (ret);
 }
 
+/*
+ * Add more references to a vm_page + sf_buf + sendfile_sync.
+ */
+void
+sf_ext_ref(void *arg1, void *arg2)
+{
+	struct sf_buf *sf = arg1;
+	struct sendfile_sync *sfs = arg2;
+	vm_page_t pg = sf_buf_page(sf);
+
+	/* XXXGL: there should be sf_buf_ref() */
+	sf_buf_alloc(sf_buf_page(sf), SFB_NOWAIT);
+
+	vm_page_lock(pg);
+	vm_page_wire(pg);
+	vm_page_unlock(pg);
+
+	if (sfs != NULL) {
+		mtx_lock(&sfs->mtx);
+		KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
+		sfs->count++;
+		mtx_unlock(&sfs->mtx);
+	}
+}
 
 /*
  * Detach mapped page and release resources back to the system.
  */
 void
-sf_buf_mext(struct mbuf *mb, void *addr, void *args)
+sf_ext_free(void *arg1, void *arg2)
 {
-	vm_page_t m;
-	struct sendfile_sync *sfs;
+	struct sf_buf *sf = arg1;
+	struct sendfile_sync *sfs = arg2;
+	vm_page_t pg = sf_buf_page(sf);
+
+	sf_buf_free(sf);
 
-	m = sf_buf_page(args);
-	sf_buf_free(args);
-	vm_page_lock(m);
-	vm_page_unwire(m, PQ_INACTIVE);
+	vm_page_lock(pg);
+	vm_page_unwire(pg, PQ_INACTIVE);
 	/*
 	 * Check for the object going away on us. This can
 	 * happen since we don't hold a reference to it.
 	 * If so, we're responsible for freeing the page.
 	 */
-	if (m->wire_count == 0 && m->object == NULL)
-		vm_page_free(m);
-	vm_page_unlock(m);
-	if (addr != NULL) {
-		sfs = addr;
+	if (pg->wire_count == 0 && pg->object == NULL)
+		vm_page_free(pg);
+	vm_page_unlock(pg);
+
+	if (sfs != NULL)
 		sf_sync_deref(sfs);
-	}
 }
 
 /*
@@ -2124,7 +2148,7 @@ sf_sync_alloc(uint32_t flags)
 /*
  * Take a reference to a sfsync instance.
  *
- * This has to map 1:1 to free calls coming in via sf_buf_mext(),
+ * This has to map 1:1 to free calls coming in via sf_ext_free(),
  * so typically this will be referenced once for each mbuf allocated.
  */
 void
@@ -3062,17 +3086,19 @@ retry_space:
 			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
 			if (m0 == NULL) {
 				error = (mnw ? EAGAIN : ENOBUFS);
-				sf_buf_mext(NULL, NULL, sf);
-				break;
-			}
-			if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
-			    sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
-			    (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
-				error = (mnw ? EAGAIN : ENOBUFS);
-				sf_buf_mext(NULL, NULL, sf);
-				m_freem(m0);
+				sf_ext_free(sf, NULL);
 				break;
 			}
+			/*
+			 * Attach EXT_SFBUF external storage.
+			 */
+			m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf);
+			m0->m_ext.ext_size = PAGE_SIZE;
+			m0->m_ext.ext_arg1 = sf;
+			m0->m_ext.ext_arg2 = sfs;
+			m0->m_ext.ext_type = EXT_SFBUF;
+			m0->m_ext.ext_flags = 0;
+			m0->m_flags |= (M_EXT|M_RDONLY);
 			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
 			m0->m_len = xfsize;
 

Modified: head/sys/sys/mbuf.h
==============================================================================
--- head/sys/sys/mbuf.h	Fri Jul 11 17:31:40 2014	(r268534)
+++ head/sys/sys/mbuf.h	Fri Jul 11 19:40:50 2014	(r268535)
@@ -374,6 +374,12 @@ struct mbuf {
     "\30EXT_FLAG_EXP4"
 
 /*
+ * External reference/free functions.
+ */
+void sf_ext_ref(void *, void *);
+void sf_ext_free(void *, void *);
+
+/*
  * Flags indicating checksum, segmentation and other offload work to be
  * done, or already done, by hardware or lower layers.  It is split into
  * separate inbound and outbound flags.

Modified: head/sys/sys/sf_buf.h
==============================================================================
--- head/sys/sys/sf_buf.h	Fri Jul 11 17:31:40 2014	(r268534)
+++ head/sys/sys/sf_buf.h	Fri Jul 11 19:40:50 2014	(r268535)
@@ -52,7 +52,6 @@ struct sfstat {				/* sendfile statistic
 #include <machine/sf_buf.h>
 #include <sys/systm.h>
 #include <sys/counter.h>
-struct mbuf;	/* for sf_buf_mext() */
 
 extern counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
 #define	SFSTAT_ADD(name, val)	\
@@ -60,7 +59,4 @@ extern counter_u64_t sfstat[sizeof(struc
 	(val))
 #define	SFSTAT_INC(name)	SFSTAT_ADD(name, 1)
 #endif /* _KERNEL */
-
-void	sf_buf_mext(struct mbuf *mb, void *addr, void *args);
-
 #endif /* !_SYS_SF_BUF_H_ */


More information about the svn-src-head mailing list