svn commit: r281599 - head/sys/netinet

Julien Charbon jch at FreeBSD.org
Thu Apr 16 10:00:08 UTC 2015


Author: jch
Date: Thu Apr 16 10:00:06 2015
New Revision: 281599
URL: https://svnweb.freebsd.org/changeset/base/281599

Log:
  Fix an old and well-documented use-after-free race condition in
  TCP timers:
   - Add a reference from tcpcb to its inpcb
   - Defer tcpcb deletion until TCP timers have finished
  
  Differential Revision:	https://reviews.freebsd.org/D2079
  Submitted by:		jch, Marc De La Gueronniere <mdelagueronniere at verisign.com>
  Reviewed by:		imp, rrs, adrian, jhb, bz
  Approved by:		jhb
  Sponsored by:		Verisign, Inc.

Modified:
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_timer.c
  head/sys/netinet/tcp_timer.h
  head/sys/netinet/tcp_var.h

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c	Thu Apr 16 09:41:37 2015	(r281598)
+++ head/sys/netinet/tcp_subr.c	Thu Apr 16 10:00:06 2015	(r281599)
@@ -230,6 +230,7 @@ static struct inpcb *tcp_notify(struct i
 static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
 static char *	tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
 		    void *ip4hdr, const void *ip6hdr);
+static void	tcp_timer_discard(struct tcpcb *, uint32_t);
 
 /*
  * Target size of TCP PCB hash tables. Must be a power of two.
@@ -801,7 +802,13 @@ tcp_newtcpcb(struct inpcb *inp)
 	if (V_tcp_do_sack)
 		tp->t_flags |= TF_SACK_PERMIT;
 	TAILQ_INIT(&tp->snd_holes);
-	tp->t_inpcb = inp;	/* XXX */
+	/*
+	 * The tcpcb will hold a reference on its inpcb until tcp_discardcb()
+	 * is called.
+	 */
+	in_pcbref(inp);	/* Reference for tcpcb */
+	tp->t_inpcb = inp;
+
 	/*
 	 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
 	 * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
@@ -920,6 +927,7 @@ tcp_discardcb(struct tcpcb *tp)
 #ifdef INET6
 	int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
+	int released;
 
 	INP_WLOCK_ASSERT(inp);
 
@@ -927,22 +935,15 @@ tcp_discardcb(struct tcpcb *tp)
 	 * Make sure that all of our timers are stopped before we delete the
 	 * PCB.
 	 *
-	 * XXXRW: Really, we would like to use callout_drain() here in order
-	 * to avoid races experienced in tcp_timer.c where a timer is already
-	 * executing at this point.  However, we can't, both because we're
-	 * running in a context where we can't sleep, and also because we
-	 * hold locks required by the timers.  What we instead need to do is
-	 * test to see if callout_drain() is required, and if so, defer some
-	 * portion of the remainder of tcp_discardcb() to an asynchronous
-	 * context that can callout_drain() and then continue.  Some care
-	 * will be required to ensure that no further processing takes place
-	 * on the tcpcb, even though it hasn't been freed (a flag?).
-	 */
-	callout_stop(&tp->t_timers->tt_rexmt);
-	callout_stop(&tp->t_timers->tt_persist);
-	callout_stop(&tp->t_timers->tt_keep);
-	callout_stop(&tp->t_timers->tt_2msl);
-	callout_stop(&tp->t_timers->tt_delack);
+	 * If stopping a timer fails, we schedule a discard function in same
+	 * callout, and the last discard function called will take care of
+	 * deleting the tcpcb.
+	 */
+	tcp_timer_stop(tp, TT_REXMT);
+	tcp_timer_stop(tp, TT_PERSIST);
+	tcp_timer_stop(tp, TT_KEEP);
+	tcp_timer_stop(tp, TT_2MSL);
+	tcp_timer_stop(tp, TT_DELACK);
 
 	/*
 	 * If we got enough samples through the srtt filter,
@@ -1019,8 +1020,80 @@ tcp_discardcb(struct tcpcb *tp)
 
 	CC_ALGO(tp) = NULL;
 	inp->inp_ppcb = NULL;
-	tp->t_inpcb = NULL;
-	uma_zfree(V_tcpcb_zone, tp);
+	if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
+		/* We own the last reference on tcpcb, let's free it. */
+		tp->t_inpcb = NULL;
+		uma_zfree(V_tcpcb_zone, tp);
+		released = in_pcbrele_wlocked(inp);
+		KASSERT(!released, ("%s: inp %p should not have been released "
+			"here", __func__, inp));
+	}
+}
+
+void
+tcp_timer_2msl_discard(void *xtp)
+{
+
+	tcp_timer_discard((struct tcpcb *)xtp, TT_2MSL);
+}
+
+void
+tcp_timer_keep_discard(void *xtp)
+{
+
+	tcp_timer_discard((struct tcpcb *)xtp, TT_KEEP);
+}
+
+void
+tcp_timer_persist_discard(void *xtp)
+{
+
+	tcp_timer_discard((struct tcpcb *)xtp, TT_PERSIST);
+}
+
+void
+tcp_timer_rexmt_discard(void *xtp)
+{
+
+	tcp_timer_discard((struct tcpcb *)xtp, TT_REXMT);
+}
+
+void
+tcp_timer_delack_discard(void *xtp)
+{
+
+	tcp_timer_discard((struct tcpcb *)xtp, TT_DELACK);
+}
+
+void
+tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type)
+{
+	struct inpcb *inp;
+
+	CURVNET_SET(tp->t_vnet);
+	INP_INFO_WLOCK(&V_tcbinfo);
+	inp = tp->t_inpcb;
+	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL",
+		__func__, tp));
+	INP_WLOCK(inp);
+	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0,
+		("%s: tcpcb has to be stopped here", __func__));
+	KASSERT((tp->t_timers->tt_flags & timer_type) != 0,
+		("%s: discard callout should be running", __func__));
+	tp->t_timers->tt_flags &= ~timer_type;
+	if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
+		/* We own the last reference on this tcpcb, let's free it. */
+		tp->t_inpcb = NULL;
+		uma_zfree(V_tcpcb_zone, tp);
+		if (in_pcbrele_wlocked(inp)) {
+			INP_INFO_WUNLOCK(&V_tcbinfo);
+			CURVNET_RESTORE();
+			return;
+		}
+	}
+	INP_WUNLOCK(inp);
+	INP_INFO_WUNLOCK(&V_tcbinfo);
+	CURVNET_RESTORE();
 }
 
 /*

Modified: head/sys/netinet/tcp_timer.c
==============================================================================
--- head/sys/netinet/tcp_timer.c	Thu Apr 16 09:41:37 2015	(r281598)
+++ head/sys/netinet/tcp_timer.c	Thu Apr 16 10:00:06 2015	(r281599)
@@ -258,10 +258,6 @@ int	tcp_backoff[TCP_MAXRXTSHIFT + 1] =
 
 static int tcp_totbackoff = 2559;	/* sum of tcp_backoff[] */
 
-static int tcp_timer_race;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race,
-    0, "Count of t_inpcb races on tcp_discardcb");
-
 /*
  * TCP timer processing.
  */
@@ -274,18 +270,7 @@ tcp_timer_delack(void *xtp)
 	CURVNET_SET(tp->t_vnet);
 
 	inp = tp->t_inpcb;
-	/*
-	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
-	 * tear-down mean we need it as a work-around for races between
-	 * timers and tcp_discardcb().
-	 *
-	 * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
-	 */
-	if (inp == NULL) {
-		tcp_timer_race++;
-		CURVNET_RESTORE();
-		return;
-	}
+	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	if (callout_pending(&tp->t_timers->tt_delack) ||
 	    !callout_active(&tp->t_timers->tt_delack)) {
@@ -299,6 +284,10 @@ tcp_timer_delack(void *xtp)
 		CURVNET_RESTORE();
 		return;
 	}
+	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+	KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
+		("%s: tp %p delack callout should be running", __func__, tp));
 
 	tp->t_flags |= TF_ACKNOW;
 	TCPSTAT_INC(tcps_delack);
@@ -318,24 +307,9 @@ tcp_timer_2msl(void *xtp)
 
 	ostate = tp->t_state;
 #endif
-	/*
-	 * XXXRW: Does this actually happen?
-	 */
 	INP_INFO_WLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
-	/*
-	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
-	 * tear-down mean we need it as a work-around for races between
-	 * timers and tcp_discardcb().
-	 *
-	 * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL"));
-	 */
-	if (inp == NULL) {
-		tcp_timer_race++;
-		INP_INFO_WUNLOCK(&V_tcbinfo);
-		CURVNET_RESTORE();
-		return;
-	}
+	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	tcp_free_sackholes(tp);
 	if (callout_pending(&tp->t_timers->tt_2msl) ||
@@ -352,6 +326,10 @@ tcp_timer_2msl(void *xtp)
 		CURVNET_RESTORE();
 		return;
 	}
+	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+	KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
+		("%s: tp %p 2msl callout should be running", __func__, tp));
 	/*
 	 * 2 MSL timeout in shutdown went off.  If we're closed but
 	 * still waiting for peer to close and connection has been idle
@@ -402,19 +380,7 @@ tcp_timer_keep(void *xtp)
 #endif
 	INP_INFO_WLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
-	/*
-	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
-	 * tear-down mean we need it as a work-around for races between
-	 * timers and tcp_discardcb().
-	 *
-	 * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
-	 */
-	if (inp == NULL) {
-		tcp_timer_race++;
-		INP_INFO_WUNLOCK(&V_tcbinfo);
-		CURVNET_RESTORE();
-		return;
-	}
+	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	if (callout_pending(&tp->t_timers->tt_keep) ||
 	    !callout_active(&tp->t_timers->tt_keep)) {
@@ -430,6 +396,10 @@ tcp_timer_keep(void *xtp)
 		CURVNET_RESTORE();
 		return;
 	}
+	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+	KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
+		("%s: tp %p keep callout should be running", __func__, tp));
 	/*
 	 * Keep-alive timer went off; send something
 	 * or drop connection if idle for too long.
@@ -505,19 +475,7 @@ tcp_timer_persist(void *xtp)
 #endif
 	INP_INFO_WLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
-	/*
-	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
-	 * tear-down mean we need it as a work-around for races between
-	 * timers and tcp_discardcb().
-	 *
-	 * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
-	 */
-	if (inp == NULL) {
-		tcp_timer_race++;
-		INP_INFO_WUNLOCK(&V_tcbinfo);
-		CURVNET_RESTORE();
-		return;
-	}
+	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	if (callout_pending(&tp->t_timers->tt_persist) ||
 	    !callout_active(&tp->t_timers->tt_persist)) {
@@ -533,6 +491,10 @@ tcp_timer_persist(void *xtp)
 		CURVNET_RESTORE();
 		return;
 	}
+	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+	KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
+		("%s: tp %p persist callout should be running", __func__, tp));
 	/*
 	 * Persistance timer into zero window.
 	 * Force a byte to be output, if possible.
@@ -594,19 +556,7 @@ tcp_timer_rexmt(void * xtp)
 
 	INP_INFO_RLOCK(&V_tcbinfo);
 	inp = tp->t_inpcb;
-	/*
-	 * XXXRW: While this assert is in fact correct, bugs in the tcpcb
-	 * tear-down mean we need it as a work-around for races between
-	 * timers and tcp_discardcb().
-	 *
-	 * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL"));
-	 */
-	if (inp == NULL) {
-		tcp_timer_race++;
-		INP_INFO_RUNLOCK(&V_tcbinfo);
-		CURVNET_RESTORE();
-		return;
-	}
+	KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
 	INP_WLOCK(inp);
 	if (callout_pending(&tp->t_timers->tt_rexmt) ||
 	    !callout_active(&tp->t_timers->tt_rexmt)) {
@@ -622,6 +572,10 @@ tcp_timer_rexmt(void * xtp)
 		CURVNET_RESTORE();
 		return;
 	}
+	KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+		("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+	KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
+		("%s: tp %p rexmt callout should be running", __func__, tp));
 	tcp_free_sackholes(tp);
 	/*
 	 * Retransmission timer went off.  Message has not
@@ -850,7 +804,7 @@ out:
 }
 
 void
-tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
+tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
 {
 	struct callout *t_callout;
 	timeout_t *f_callout;
@@ -862,6 +816,9 @@ tcp_timer_activate(struct tcpcb *tp, int
 		return;
 #endif
 
+	if (tp->t_timers->tt_flags & TT_STOPPED)
+		return;
+
 	switch (timer_type) {
 		case TT_DELACK:
 			t_callout = &tp->t_timers->tt_delack;
@@ -887,14 +844,23 @@ tcp_timer_activate(struct tcpcb *tp, int
 			panic("tp %p bad timer_type %#x", tp, timer_type);
 		}
 	if (delta == 0) {
-		callout_stop(t_callout);
+		if ((tp->t_timers->tt_flags & timer_type) &&
+		    callout_stop(t_callout)) {
+			tp->t_timers->tt_flags &= ~timer_type;
+		}
 	} else {
-		callout_reset_on(t_callout, delta, f_callout, tp, cpu);
+		if ((tp->t_timers->tt_flags & timer_type) == 0) {
+			tp->t_timers->tt_flags |= timer_type;
+			callout_reset_on(t_callout, delta, f_callout, tp, cpu);
+		} else {
+			/* Reset already running callout on the same CPU. */
+			callout_reset(t_callout, delta, f_callout, tp);
+		}
 	}
 }
 
 int
-tcp_timer_active(struct tcpcb *tp, int timer_type)
+tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
 {
 	struct callout *t_callout;
 
@@ -920,6 +886,58 @@ tcp_timer_active(struct tcpcb *tp, int t
 	return callout_active(t_callout);
 }
 
+void
+tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
+{
+	struct callout *t_callout;
+	timeout_t *f_callout;
+
+	tp->t_timers->tt_flags |= TT_STOPPED;
+
+	switch (timer_type) {
+		case TT_DELACK:
+			t_callout = &tp->t_timers->tt_delack;
+			f_callout = tcp_timer_delack_discard;
+			break;
+		case TT_REXMT:
+			t_callout = &tp->t_timers->tt_rexmt;
+			f_callout = tcp_timer_rexmt_discard;
+			break;
+		case TT_PERSIST:
+			t_callout = &tp->t_timers->tt_persist;
+			f_callout = tcp_timer_persist_discard;
+			break;
+		case TT_KEEP:
+			t_callout = &tp->t_timers->tt_keep;
+			f_callout = tcp_timer_keep_discard;
+			break;
+		case TT_2MSL:
+			t_callout = &tp->t_timers->tt_2msl;
+			f_callout = tcp_timer_2msl_discard;
+			break;
+		default:
+			panic("tp %p bad timer_type %#x", tp, timer_type);
+		}
+
+	if (tp->t_timers->tt_flags & timer_type) {
+		if (callout_stop(t_callout)) {
+			tp->t_timers->tt_flags &= ~timer_type;
+		} else {
+			/*
+			 * Can't stop the callout, defer tcpcb actual deletion
+			 * to the last tcp timer discard callout.
+			 * The TT_STOPPED flag will ensure that no tcp timer
+			 * callouts can be restarted on our behalf, and
+			 * past this point currently running callouts waiting
+			 * on inp lock will return right away after the
+			 * classical check for callout reset/stop events:
+			 * callout_pending() || !callout_active()
+			 */
+			callout_reset(t_callout, 1, f_callout, tp);
+		}
+	}
+}
+
 #define	ticks_to_msecs(t)	(1000*(t) / hz)
 
 void

Modified: head/sys/netinet/tcp_timer.h
==============================================================================
--- head/sys/netinet/tcp_timer.h	Thu Apr 16 09:41:37 2015	(r281598)
+++ head/sys/netinet/tcp_timer.h	Thu Apr 16 10:00:06 2015	(r281599)
@@ -146,12 +146,21 @@ struct tcp_timer {
 	struct	callout tt_keep;	/* keepalive */
 	struct	callout tt_2msl;	/* 2*msl TIME_WAIT timer */
 	struct	callout tt_delack;	/* delayed ACK timer */
+	uint32_t	tt_flags;	/* Timers flags */
+	uint32_t	tt_spare;	/* TDB */
 };
-#define TT_DELACK	0x01
-#define TT_REXMT	0x02
-#define TT_PERSIST	0x04
-#define TT_KEEP		0x08
-#define TT_2MSL		0x10
+
+/*
+ * Flags for the tt_flags field.
+ */
+#define TT_DELACK	0x0001
+#define TT_REXMT	0x0002
+#define TT_PERSIST	0x0004
+#define TT_KEEP		0x0008
+#define TT_2MSL		0x0010
+#define TT_MASK		(TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL)
+
+#define TT_STOPPED	0x00010000
 
 #define	TP_KEEPINIT(tp)	((tp)->t_keepinit ? (tp)->t_keepinit : tcp_keepinit)
 #define	TP_KEEPIDLE(tp)	((tp)->t_keepidle ? (tp)->t_keepidle : tcp_keepidle)
@@ -183,6 +192,11 @@ void	tcp_timer_keep(void *xtp);
 void	tcp_timer_persist(void *xtp);
 void	tcp_timer_rexmt(void *xtp);
 void	tcp_timer_delack(void *xtp);
+void	tcp_timer_2msl_discard(void *xtp);
+void	tcp_timer_keep_discard(void *xtp);
+void	tcp_timer_persist_discard(void *xtp);
+void	tcp_timer_rexmt_discard(void *xtp);
+void	tcp_timer_delack_discard(void *xtp);
 void	tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
 	struct xtcp_timer *xtimer);
 

Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h	Thu Apr 16 09:41:37 2015	(r281598)
+++ head/sys/netinet/tcp_var.h	Thu Apr 16 10:00:06 2015	(r281599)
@@ -708,8 +708,9 @@ void	 tcp_slowtimo(void);
 struct tcptemp *
 	 tcpip_maketemplate(struct inpcb *);
 void	 tcpip_fillheaders(struct inpcb *, void *, void *);
-void	 tcp_timer_activate(struct tcpcb *, int, u_int);
-int	 tcp_timer_active(struct tcpcb *, int);
+void	 tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
+int	 tcp_timer_active(struct tcpcb *, uint32_t);
+void	 tcp_timer_stop(struct tcpcb *, uint32_t);
 void	 tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
 /*
  * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)


More information about the svn-src-all mailing list