svn commit: r273850 - head/sys/netinet

Julien Charbon jch at FreeBSD.org
Thu Oct 30 08:53:59 UTC 2014


Author: jch
Date: Thu Oct 30 08:53:56 2014
New Revision: 273850
URL: https://svnweb.freebsd.org/changeset/base/273850

Log:
  Fix a race condition in TCP timewait between tcp_tw_2msl_reuse() and
  tcp_tw_2msl_scan().  This race condition drives unplanned timewait
  timeout cancellation.  Also simplify implementation by holding inpcb
  reference and removing tcptw reference counting.
  
  Differential Revision:	https://reviews.freebsd.org/D826
  Submitted by:		Marc De la Gueronniere <mdelagueronniere at verisign.com>
  Submitted by:		jch
  Reviewed By:		jhb (mentor), adrian, rwatson
  Sponsored by:		Verisign, Inc.
  MFC after:		2 weeks
  X-MFC-With:		r264321

Modified:
  head/sys/netinet/tcp_timer.c
  head/sys/netinet/tcp_timer.h
  head/sys/netinet/tcp_timewait.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h

Modified: head/sys/netinet/tcp_timer.c
==============================================================================
--- head/sys/netinet/tcp_timer.c	Thu Oct 30 08:50:01 2014	(r273849)
+++ head/sys/netinet/tcp_timer.c	Thu Oct 30 08:53:56 2014	(r273850)
@@ -243,7 +243,7 @@ tcp_slowtimo(void)
 	VNET_LIST_RLOCK_NOSLEEP();
 	VNET_FOREACH(vnet_iter) {
 		CURVNET_SET(vnet_iter);
-		tcp_tw_2msl_scan();
+		(void) tcp_tw_2msl_scan(0);
 		CURVNET_RESTORE();
 	}
 	VNET_LIST_RUNLOCK_NOSLEEP();

Modified: head/sys/netinet/tcp_timer.h
==============================================================================
--- head/sys/netinet/tcp_timer.h	Thu Oct 30 08:50:01 2014	(r273849)
+++ head/sys/netinet/tcp_timer.h	Thu Oct 30 08:53:56 2014	(r273850)
@@ -178,8 +178,7 @@ extern int tcp_fast_finwait2_recycle;
 void	tcp_timer_init(void);
 void	tcp_timer_2msl(void *xtp);
 struct tcptw *
-	tcp_tw_2msl_reuse(void);	/* XXX temporary? */
-void	tcp_tw_2msl_scan(void);
+	tcp_tw_2msl_scan(int reuse);	/* XXX temporary? */
 void	tcp_timer_keep(void *xtp);
 void	tcp_timer_persist(void *xtp);
 void	tcp_timer_rexmt(void *xtp);

Modified: head/sys/netinet/tcp_timewait.c
==============================================================================
--- head/sys/netinet/tcp_timewait.c	Thu Oct 30 08:50:01 2014	(r273849)
+++ head/sys/netinet/tcp_timewait.c	Thu Oct 30 08:53:56 2014	(r273850)
@@ -49,7 +49,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
-#include <sys/refcount.h>
 
 #include <vm/uma.h>
 
@@ -101,6 +100,11 @@ static int	maxtcptw;
  * currently in the TIME_WAIT state.  The queue pointers, including the
  * queue pointers in each tcptw structure, are protected using the global
  * timewait lock, which must be held over queue iteration and modification.
+ *
+ * Rules on tcptw usage:
+ *  - a inpcb is always freed _after_ its tcptw
+ *  - a tcptw relies on its inpcb reference counting for memory stability
+ *  - a tcptw is dereferenceable only while its inpcb is locked
  */
 static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
 #define	V_twq_2msl		VNET(twq_2msl)
@@ -124,32 +128,6 @@ static void	tcp_tw_2msl_reset(struct tcp
 static void	tcp_tw_2msl_stop(struct tcptw *, int);
 static int	tcp_twrespond(struct tcptw *, int);
 
-/*
- * tw_pcbref() bumps the reference count on an tw in order to maintain
- * stability of an tw pointer despite the tw lock being released.
- */
-static void
-tw_pcbref(struct tcptw *tw)
-{
-
-	KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
-	refcount_acquire(&tw->tw_refcount);
-}
-
-/*
- * Drop a refcount on an tw elevated using tw_pcbref().
- */
-static int
-tw_pcbrele(struct tcptw *tw)
-{
-
-	KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
-	if (!refcount_release(&tw->tw_refcount))
-		return (0);
-	uma_zfree(V_tcptw_zone, tw);
-	return (1);
-}
-
 static int
 tcptw_auto_size(void)
 {
@@ -279,8 +257,11 @@ tcp_twstart(struct tcpcb *tp)
 		 * Reached limit on total number of TIMEWAIT connections
 		 * allowed. Remove a connection from TIMEWAIT queue in LRU
 		 * fashion to make room for this connection.
+		 *
+		 * pcbinfo lock is needed here to prevent deadlock as
+		 * two inpcb locks can be acquired simultaneously.
 		 */
-		tw = tcp_tw_2msl_reuse();
+		tw = tcp_tw_2msl_scan(1);
 		if (tw == NULL) {
 			tp = tcp_close(tp);
 			if (tp != NULL)
@@ -288,8 +269,12 @@ tcp_twstart(struct tcpcb *tp)
 			return;
 		}
 	}
+	/*
+	 * The tcptw will hold a reference on its inpcb until tcp_twclose
+	 * is called
+	 */
 	tw->tw_inpcb = inp;
-	refcount_init(&tw->tw_refcount, 1);
+	in_pcbref(inp);	/* Reference from tw */
 
 	/*
 	 * Recover last window size sent.
@@ -479,7 +464,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);	/* in_pcbfree() */
 	INP_WLOCK_ASSERT(inp);
 
-	tw->tw_inpcb = NULL;
 	tcp_tw_2msl_stop(tw, reuse);
 	inp->inp_ppcb = NULL;
 	in_pcbdrop(inp);
@@ -509,8 +493,13 @@ tcp_twclose(struct tcptw *tw, int reuse)
 			 */
 			INP_WUNLOCK(inp);
 		}
-	} else
+	} else {
+		/*
+		 * The socket has been already cleaned-up for us, only free the
+		 * inpcb.
+		 */
 		in_pcbfree(inp);
+	}
 	TCPSTAT_INC(tcps_closed);
 }
 
@@ -641,71 +630,94 @@ tcp_tw_2msl_reset(struct tcptw *tw, int 
 static void
 tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
 {
+	struct ucred *cred;
+	struct inpcb *inp;
+	int released;
 
 	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 
 	TW_WLOCK(V_tw_lock);
+	inp = tw->tw_inpcb;
+	tw->tw_inpcb = NULL;
+
 	TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
-	crfree(tw->tw_cred);
+	cred = tw->tw_cred;
 	tw->tw_cred = NULL;
 	TW_WUNLOCK(V_tw_lock);
 
+	if (cred != NULL)
+		crfree(cred);
+
+	released = in_pcbrele_wlocked(inp);
+	KASSERT(!released, ("%s: inp should not be released here", __func__));
+
 	if (!reuse)
-		tw_pcbrele(tw);
+		uma_zfree(V_tcptw_zone, tw);
 }
 
 struct tcptw *
-tcp_tw_2msl_reuse(void)
+tcp_tw_2msl_scan(int reuse)
 {
 	struct tcptw *tw;
+	struct inpcb *inp;
 
-	INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
-
-	TW_WLOCK(V_tw_lock);
-	tw = TAILQ_FIRST(&V_twq_2msl);
-	if (tw == NULL) {
-		TW_WUNLOCK(V_tw_lock);
-		return NULL;
+#ifdef INVARIANTS
+	if (reuse) {
+		/*
+		 * pcbinfo lock is needed in reuse case to prevent deadlock
+		 * as two inpcb locks can be acquired simultaneously:
+		 *  - the inpcb transitioning to TIME_WAIT state in
+		 *    tcp_tw_start(),
+		 *  - the inpcb closed by tcp_twclose().
+		 */
+		INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
 	}
-	TW_WUNLOCK(V_tw_lock);
-
-	INP_WLOCK(tw->tw_inpcb);
-	tcp_twclose(tw, 1);
-
-	return (tw);
-}
-
-void
-tcp_tw_2msl_scan(void)
-{
-	struct tcptw *tw;
+#endif
 
 	for (;;) {
 		TW_RLOCK(V_tw_lock);
 		tw = TAILQ_FIRST(&V_twq_2msl);
-		if (tw == NULL || tw->tw_time - ticks > 0) {
+		if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
 			TW_RUNLOCK(V_tw_lock);
 			break;
 		}
-		tw_pcbref(tw);
+		KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
+		    __func__));
+
+		inp = tw->tw_inpcb;
+		in_pcbref(inp);
 		TW_RUNLOCK(V_tw_lock);
 
-		/* Close timewait state */
 		if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
-			if (tw_pcbrele(tw)) {
+
+			INP_WLOCK(inp);
+			tw = intotw(inp);
+			if (in_pcbrele_wlocked(inp)) {
+				KASSERT(tw == NULL, ("%s: held last inp "
+				    "reference but tw not NULL", __func__));
 				INP_INFO_WUNLOCK(&V_tcbinfo);
 				continue;
 			}
 
-			KASSERT(tw->tw_inpcb != NULL,
-			    ("%s: tw->tw_inpcb == NULL", __func__));
-			INP_WLOCK(tw->tw_inpcb);
-			tcp_twclose(tw, 0);
+			if (tw == NULL) {
+				/* tcp_twclose() has already been called */
+				INP_WUNLOCK(inp);
+				INP_INFO_WUNLOCK(&V_tcbinfo);
+				continue;
+			}
+
+			tcp_twclose(tw, reuse);
 			INP_INFO_WUNLOCK(&V_tcbinfo);
+			if (reuse)
+			    return tw;
 		} else {
-			/* INP_INFO lock is busy; continue later. */
-			tw_pcbrele(tw);
+			/* INP_INFO lock is busy, continue later. */
+			INP_WLOCK(inp);
+			if (!in_pcbrele_wlocked(inp))
+				INP_WUNLOCK(inp);
 			break;
 		}
 	}
+
+	return NULL;
 }

Modified: head/sys/netinet/tcp_usrreq.c
==============================================================================
--- head/sys/netinet/tcp_usrreq.c	Thu Oct 30 08:50:01 2014	(r273849)
+++ head/sys/netinet/tcp_usrreq.c	Thu Oct 30 08:53:56 2014	(r273850)
@@ -183,6 +183,21 @@ tcp_detach(struct socket *so, struct inp
 		 * present until timewait ends.
 		 *
 		 * XXXRW: Would it be cleaner to free the tcptw here?
+		 *
+		 * Astute question indeed, from twtcp perspective there are
+		 * three cases to consider:
+		 *
+		 * #1 tcp_detach is called at tcptw creation time by
+		 *  tcp_twstart, then do not discard the newly created tcptw
+		 *  and leave inpcb present until timewait ends
+		 * #2 tcp_detach is called at timewait end (or reuse) by
+		 *  tcp_twclose, then the tcptw has already been discarded
+		 *  and inpcb is freed here
+		 * #3 tcp_detach is called() after timewait ends (or reuse)
+		 *  (e.g. by soclose), then tcptw has already been discarded
+		 *  and inpcb is freed here
+		 *
+		 *  In all three cases the tcptw should not be freed here.
 		 */
 		if (inp->inp_flags & INP_DROPPED) {
 			KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "

Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h	Thu Oct 30 08:50:01 2014	(r273849)
+++ head/sys/netinet/tcp_var.h	Thu Oct 30 08:53:56 2014	(r273850)
@@ -358,7 +358,6 @@ struct tcptw {
 	u_int		t_starttime;
 	int		tw_time;
 	TAILQ_ENTRY(tcptw) tw_2msl;
-	u_int		tw_refcount;	/* refcount */
 };
 
 #define	intotcpcb(ip)	((struct tcpcb *)(ip)->inp_ppcb)
@@ -651,7 +650,7 @@ struct tcpcb *
 	 tcp_close(struct tcpcb *);
 void	 tcp_discardcb(struct tcpcb *);
 void	 tcp_twstart(struct tcpcb *);
-void	 tcp_twclose(struct tcptw *_tw, int _reuse);
+void	 tcp_twclose(struct tcptw *, int);
 void	 tcp_ctlinput(int, struct sockaddr *, void *);
 int	 tcp_ctloutput(struct socket *, struct sockopt *);
 struct tcpcb *


More information about the svn-src-head mailing list