svn commit: r337780 - head/sys/netinet

Jonathan T. Looney jtl at FreeBSD.org
Tue Aug 14 17:23:07 UTC 2018


Author: jtl
Date: Tue Aug 14 17:23:05 2018
New Revision: 337780
URL: https://svnweb.freebsd.org/changeset/base/337780

Log:
  Implement a limit on on the number of IPv4 reassembly queues per bucket.
  
  There is a hashing algorithm which should distribute IPv4 reassembly
  queues across the available buckets in a relatively even way. However,
  if there is a flaw in the hashing algorithm which allows a large number
  of IPv4 fragment reassembly queues to end up in a single bucket, a per-
  bucket limit could help mitigate the performance impact of this flaw.
  
  Implement such a limit, with a default of twice the maximum number of
  reassembly queues divided by the number of buckets. Recalculate the
  limit any time the maximum number of reassembly queues changes.
  However, allow the user to override the value using a sysctl
  (net.inet.ip.maxfragbucketsize).
  
  Reviewed by:	jhb
  Security:	FreeBSD-SA-18:10.ip
  Security:	CVE-2018-6923

Modified:
  head/sys/netinet/ip_reass.c

Modified: head/sys/netinet/ip_reass.c
==============================================================================
--- head/sys/netinet/ip_reass.c	Tue Aug 14 17:20:31 2018	(r337779)
+++ head/sys/netinet/ip_reass.c	Tue Aug 14 17:23:05 2018	(r337780)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/hash.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
@@ -70,6 +71,7 @@ SYSCTL_DECL(_net_inet_ip);
 struct ipqbucket {
 	TAILQ_HEAD(ipqhead, ipq) head;
 	struct mtx		 lock;
+	int			 count;
 };
 
 VNET_DEFINE_STATIC(struct ipqbucket, ipq[IPREASS_NHASH]);
@@ -82,6 +84,9 @@ VNET_DEFINE_STATIC(uint32_t, ipq_hashseed);
 #define	IPQ_UNLOCK(i)	mtx_unlock(&V_ipq[i].lock)
 #define	IPQ_LOCK_ASSERT(i)	mtx_assert(&V_ipq[i].lock, MA_OWNED)
 
+VNET_DEFINE_STATIC(int, ipreass_maxbucketsize);
+#define	V_ipreass_maxbucketsize	VNET(ipreass_maxbucketsize)
+
 void		ipreass_init(void);
 void		ipreass_drain(void);
 void		ipreass_slowtimo(void);
@@ -89,25 +94,26 @@ void		ipreass_slowtimo(void);
 void		ipreass_destroy(void);
 #endif
 static int	sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
+static int	sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
 static void	ipreass_zone_change(void *);
 static void	ipreass_drain_tomax(void);
-static void	ipq_free(struct ipqhead *, struct ipq *);
+static void	ipq_free(struct ipqbucket *, struct ipq *);
 static struct ipq * ipq_reuse(int);
 
 static inline void
-ipq_timeout(struct ipqhead *head, struct ipq *fp)
+ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
 {
 
 	IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
-	ipq_free(head, fp);
+	ipq_free(bucket, fp);
 }
 
 static inline void
-ipq_drop(struct ipqhead *head, struct ipq *fp)
+ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
 {
 
 	IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
-	ipq_free(head, fp);
+	ipq_free(bucket, fp);
 }
 
 static int		maxfrags;
@@ -136,6 +142,10 @@ VNET_DEFINE_STATIC(int, maxfragsperpacket);
 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(maxfragsperpacket), 0,
     "Maximum number of IPv4 fragments allowed per packet");
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
+    sysctl_maxfragbucketsize, "I",
+    "Maximum number of IPv4 fragment reassembly queue entries per bucket");
 
 /*
  * Take incoming datagram fragment and try to reassemble it into
@@ -241,9 +251,12 @@ ip_reass(struct mbuf *m)
 	 * If first fragment to arrive, create a reassembly queue.
 	 */
 	if (fp == NULL) {
-		fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
+		if (V_ipq[hash].count < V_ipreass_maxbucketsize)
+			fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
 		if (fp == NULL)
 			fp = ipq_reuse(hash);
+		if (fp == NULL)
+			goto dropfrag;
 #ifdef MAC
 		if (mac_ipq_init(fp, M_NOWAIT) != 0) {
 			uma_zfree(V_ipq_zone, fp);
@@ -253,6 +266,7 @@ ip_reass(struct mbuf *m)
 		mac_ipq_create(m, fp);
 #endif
 		TAILQ_INSERT_HEAD(head, fp, ipq_list);
+		V_ipq[hash].count++;
 		fp->ipq_nfrags = 1;
 		atomic_add_int(&nfrags, 1);
 		fp->ipq_ttl = IPFRAGTTL;
@@ -360,7 +374,7 @@ ip_reass(struct mbuf *m)
 	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
 		if (ntohs(GETIP(q)->ip_off) != next) {
 			if (fp->ipq_nfrags > V_maxfragsperpacket)
-				ipq_drop(head, fp);
+				ipq_drop(&V_ipq[hash], fp);
 			goto done;
 		}
 		next += ntohs(GETIP(q)->ip_len);
@@ -368,7 +382,7 @@ ip_reass(struct mbuf *m)
 	/* Make sure the last packet didn't have the IP_MF flag */
 	if (p->m_flags & M_IP_FRAG) {
 		if (fp->ipq_nfrags > V_maxfragsperpacket)
-			ipq_drop(head, fp);
+			ipq_drop(&V_ipq[hash], fp);
 		goto done;
 	}
 
@@ -379,7 +393,7 @@ ip_reass(struct mbuf *m)
 	ip = GETIP(q);
 	if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
 		IPSTAT_INC(ips_toolong);
-		ipq_drop(head, fp);
+		ipq_drop(&V_ipq[hash], fp);
 		goto done;
 	}
 
@@ -423,6 +437,7 @@ ip_reass(struct mbuf *m)
 	ip->ip_src = fp->ipq_src;
 	ip->ip_dst = fp->ipq_dst;
 	TAILQ_REMOVE(head, fp, ipq_list);
+	V_ipq[hash].count--;
 	uma_zfree(V_ipq_zone, fp);
 	m->m_len += (ip->ip_hl << 2);
 	m->m_data -= (ip->ip_hl << 2);
@@ -486,17 +501,21 @@ done:
 void
 ipreass_init(void)
 {
+	int max;
 
 	for (int i = 0; i < IPREASS_NHASH; i++) {
 		TAILQ_INIT(&V_ipq[i].head);
 		mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
 		    MTX_DEF | MTX_DUPOK);
+		V_ipq[i].count = 0;
 	}
 	V_ipq_hashseed = arc4random();
 	V_maxfragsperpacket = 16;
 	V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
 	    NULL, UMA_ALIGN_PTR, 0);
-	uma_zone_set_max(V_ipq_zone, nmbclusters / 32);
+	max = nmbclusters / 32;
+	max = uma_zone_set_max(V_ipq_zone, max);
+	V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
 
 	if (IS_DEFAULT_VNET(curvnet)) {
 		maxfrags = nmbclusters / 32;
@@ -517,7 +536,7 @@ ipreass_slowtimo(void)
 		IPQ_LOCK(i);
 		TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
 		if (--fp->ipq_ttl == 0)
-				ipq_timeout(&V_ipq[i].head, fp);
+				ipq_timeout(&V_ipq[i], fp);
 		IPQ_UNLOCK(i);
 	}
 }
@@ -532,7 +551,10 @@ ipreass_drain(void)
 	for (int i = 0; i < IPREASS_NHASH; i++) {
 		IPQ_LOCK(i);
 		while(!TAILQ_EMPTY(&V_ipq[i].head))
-			ipq_drop(&V_ipq[i].head, TAILQ_FIRST(&V_ipq[i].head));
+			ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head));
+		KASSERT(V_ipq[i].count == 0,
+		    ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
+		    V_ipq[i].count, V_ipq));
 		IPQ_UNLOCK(i);
 	}
 }
@@ -560,9 +582,23 @@ ipreass_destroy(void)
 static void
 ipreass_drain_tomax(void)
 {
+	struct ipq *fp;
 	int target;
 
 	/*
+	 * Make sure each bucket is under the new limit. If
+	 * necessary, drop enough of the oldest elements from
+	 * each bucket to get under the new limit.
+	 */
+	for (int i = 0; i < IPREASS_NHASH; i++) {
+		IPQ_LOCK(i);
+		while (V_ipq[i].count > V_ipreass_maxbucketsize &&
+		    (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
+			ipq_timeout(&V_ipq[i], fp);
+		IPQ_UNLOCK(i);
+	}
+
+	/*
 	 * If we are over the maximum number of fragments,
 	 * drain off enough to get down to the new limit,
 	 * stripping off last elements on queues.  Every
@@ -570,13 +606,11 @@ ipreass_drain_tomax(void)
 	 */
 	target = uma_zone_get_max(V_ipq_zone);
 	while (uma_zone_get_cur(V_ipq_zone) > target) {
-		struct ipq *fp;
-
 		for (int i = 0; i < IPREASS_NHASH; i++) {
 			IPQ_LOCK(i);
 			fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
 			if (fp != NULL)
-				ipq_timeout(&V_ipq[i].head, fp);
+				ipq_timeout(&V_ipq[i], fp);
 			IPQ_UNLOCK(i);
 		}
 	}
@@ -593,7 +627,8 @@ ipreass_zone_change(void *tag)
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
-		uma_zone_set_max(V_ipq_zone, max);
+		max = uma_zone_set_max(V_ipq_zone, max);
+		V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
 		ipreass_drain_tomax();
 		CURVNET_RESTORE();
 	}
@@ -625,6 +660,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
 		 * and place an extreme upper bound.
 		 */
 		max = uma_zone_set_max(V_ipq_zone, max);
+		V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
 		ipreass_drain_tomax();
 		V_noreass = 0;
 	} else if (max == 0) {
@@ -633,6 +669,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
 	} else if (max == -1) {
 		V_noreass = 0;
 		uma_zone_set_max(V_ipq_zone, 0);
+		V_ipreass_maxbucketsize = INT_MAX;
 	} else
 		return (EINVAL);
 	return (0);
@@ -646,16 +683,15 @@ static struct ipq *
 ipq_reuse(int start)
 {
 	struct ipq *fp;
-	int i;
+	int bucket, i;
 
 	IPQ_LOCK_ASSERT(start);
 
-	for (i = start;; i++) {
-		if (i == IPREASS_NHASH)
-			i = 0;
-		if (i != start && IPQ_TRYLOCK(i) == 0)
+	for (i = 0; i < IPREASS_NHASH; i++) {
+		bucket = (start + i) % IPREASS_NHASH;
+		if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
 			continue;
-		fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
+		fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
 		if (fp) {
 			struct mbuf *m;
 
@@ -666,22 +702,24 @@ ipq_reuse(int start)
 				fp->ipq_frags = m->m_nextpkt;
 				m_freem(m);
 			}
-			TAILQ_REMOVE(&V_ipq[i].head, fp, ipq_list);
-			if (i != start)
-				IPQ_UNLOCK(i);
-			IPQ_LOCK_ASSERT(start);
-			return (fp);
+			TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
+			V_ipq[bucket].count--;
+			if (bucket != start)
+				IPQ_UNLOCK(bucket);
+			break;
 		}
-		if (i != start)
-			IPQ_UNLOCK(i);
+		if (bucket != start)
+			IPQ_UNLOCK(bucket);
 	}
+	IPQ_LOCK_ASSERT(start);
+	return (fp);
 }
 
 /*
  * Free a fragment reassembly header and all associated datagrams.
  */
 static void
-ipq_free(struct ipqhead *fhp, struct ipq *fp)
+ipq_free(struct ipqbucket *bucket, struct ipq *fp)
 {
 	struct mbuf *q;
 
@@ -691,6 +729,26 @@ ipq_free(struct ipqhead *fhp, struct ipq *fp)
 		fp->ipq_frags = q->m_nextpkt;
 		m_freem(q);
 	}
-	TAILQ_REMOVE(fhp, fp, ipq_list);
+	TAILQ_REMOVE(&bucket->head, fp, ipq_list);
+	bucket->count--;
 	uma_zfree(V_ipq_zone, fp);
+}
+
+/*
+ * Get or set the maximum number of reassembly queues per bucket.
+ */
+static int
+sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
+{
+	int error, max;
+
+	max = V_ipreass_maxbucketsize;
+	error = sysctl_handle_int(oidp, &max, 0, req);
+	if (error || !req->newptr)
+		return (error);
+	if (max <= 0)
+		return (EINVAL);
+	V_ipreass_maxbucketsize = max;
+	ipreass_drain_tomax();
+	return (0);
 }


More information about the svn-src-all mailing list