svn commit: r273850 - head/sys/netinet
Julien Charbon
jch at FreeBSD.org
Thu Oct 30 08:53:59 UTC 2014
Author: jch
Date: Thu Oct 30 08:53:56 2014
New Revision: 273850
URL: https://svnweb.freebsd.org/changeset/base/273850
Log:
Fix a race condition in TCP timewait between tcp_tw_2msl_reuse() and
tcp_tw_2msl_scan(). This race condition drives unplanned timewait
timeout cancellation. Also simplify implementation by holding inpcb
reference and removing tcptw reference counting.
Differential Revision: https://reviews.freebsd.org/D826
Submitted by: Marc De la Gueronniere <mdelagueronniere at verisign.com>
Submitted by: jch
Reviewed By: jhb (mentor), adrian, rwatson
Sponsored by: Verisign, Inc.
MFC after: 2 weeks
X-MFC-With: r264321
Modified:
head/sys/netinet/tcp_timer.c
head/sys/netinet/tcp_timer.h
head/sys/netinet/tcp_timewait.c
head/sys/netinet/tcp_usrreq.c
head/sys/netinet/tcp_var.h
Modified: head/sys/netinet/tcp_timer.c
==============================================================================
--- head/sys/netinet/tcp_timer.c Thu Oct 30 08:50:01 2014 (r273849)
+++ head/sys/netinet/tcp_timer.c Thu Oct 30 08:53:56 2014 (r273850)
@@ -243,7 +243,7 @@ tcp_slowtimo(void)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- tcp_tw_2msl_scan();
+ (void) tcp_tw_2msl_scan(0);
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
Modified: head/sys/netinet/tcp_timer.h
==============================================================================
--- head/sys/netinet/tcp_timer.h Thu Oct 30 08:50:01 2014 (r273849)
+++ head/sys/netinet/tcp_timer.h Thu Oct 30 08:53:56 2014 (r273850)
@@ -178,8 +178,7 @@ extern int tcp_fast_finwait2_recycle;
void tcp_timer_init(void);
void tcp_timer_2msl(void *xtp);
struct tcptw *
- tcp_tw_2msl_reuse(void); /* XXX temporary? */
-void tcp_tw_2msl_scan(void);
+ tcp_tw_2msl_scan(int reuse); /* XXX temporary? */
void tcp_timer_keep(void *xtp);
void tcp_timer_persist(void *xtp);
void tcp_timer_rexmt(void *xtp);
Modified: head/sys/netinet/tcp_timewait.c
==============================================================================
--- head/sys/netinet/tcp_timewait.c Thu Oct 30 08:50:01 2014 (r273849)
+++ head/sys/netinet/tcp_timewait.c Thu Oct 30 08:53:56 2014 (r273850)
@@ -49,7 +49,6 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>
-#include <sys/refcount.h>
#include <vm/uma.h>
@@ -101,6 +100,11 @@ static int maxtcptw;
* currently in the TIME_WAIT state. The queue pointers, including the
* queue pointers in each tcptw structure, are protected using the global
* timewait lock, which must be held over queue iteration and modification.
+ *
+ * Rules on tcptw usage:
+ * - a inpcb is always freed _after_ its tcptw
+ * - a tcptw relies on its inpcb reference counting for memory stability
+ * - a tcptw is dereferenceable only while its inpcb is locked
*/
static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
#define V_twq_2msl VNET(twq_2msl)
@@ -124,32 +128,6 @@ static void tcp_tw_2msl_reset(struct tcp
static void tcp_tw_2msl_stop(struct tcptw *, int);
static int tcp_twrespond(struct tcptw *, int);
-/*
- * tw_pcbref() bumps the reference count on an tw in order to maintain
- * stability of an tw pointer despite the tw lock being released.
- */
-static void
-tw_pcbref(struct tcptw *tw)
-{
-
- KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
- refcount_acquire(&tw->tw_refcount);
-}
-
-/*
- * Drop a refcount on an tw elevated using tw_pcbref().
- */
-static int
-tw_pcbrele(struct tcptw *tw)
-{
-
- KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
- if (!refcount_release(&tw->tw_refcount))
- return (0);
- uma_zfree(V_tcptw_zone, tw);
- return (1);
-}
-
static int
tcptw_auto_size(void)
{
@@ -279,8 +257,11 @@ tcp_twstart(struct tcpcb *tp)
* Reached limit on total number of TIMEWAIT connections
* allowed. Remove a connection from TIMEWAIT queue in LRU
* fashion to make room for this connection.
+ *
+ * pcbinfo lock is needed here to prevent deadlock as
+ * two inpcb locks can be acquired simultaneously.
*/
- tw = tcp_tw_2msl_reuse();
+ tw = tcp_tw_2msl_scan(1);
if (tw == NULL) {
tp = tcp_close(tp);
if (tp != NULL)
@@ -288,8 +269,12 @@ tcp_twstart(struct tcpcb *tp)
return;
}
}
+ /*
+ * The tcptw will hold a reference on its inpcb until tcp_twclose
+ * is called
+ */
tw->tw_inpcb = inp;
- refcount_init(&tw->tw_refcount, 1);
+ in_pcbref(inp); /* Reference from tw */
/*
* Recover last window size sent.
@@ -479,7 +464,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */
INP_WLOCK_ASSERT(inp);
- tw->tw_inpcb = NULL;
tcp_tw_2msl_stop(tw, reuse);
inp->inp_ppcb = NULL;
in_pcbdrop(inp);
@@ -509,8 +493,13 @@ tcp_twclose(struct tcptw *tw, int reuse)
*/
INP_WUNLOCK(inp);
}
- } else
+ } else {
+ /*
+ * The socket has been already cleaned-up for us, only free the
+ * inpcb.
+ */
in_pcbfree(inp);
+ }
TCPSTAT_INC(tcps_closed);
}
@@ -641,71 +630,94 @@ tcp_tw_2msl_reset(struct tcptw *tw, int
static void
tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
{
+ struct ucred *cred;
+ struct inpcb *inp;
+ int released;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
TW_WLOCK(V_tw_lock);
+ inp = tw->tw_inpcb;
+ tw->tw_inpcb = NULL;
+
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
- crfree(tw->tw_cred);
+ cred = tw->tw_cred;
tw->tw_cred = NULL;
TW_WUNLOCK(V_tw_lock);
+ if (cred != NULL)
+ crfree(cred);
+
+ released = in_pcbrele_wlocked(inp);
+ KASSERT(!released, ("%s: inp should not be released here", __func__));
+
if (!reuse)
- tw_pcbrele(tw);
+ uma_zfree(V_tcptw_zone, tw);
}
struct tcptw *
-tcp_tw_2msl_reuse(void)
+tcp_tw_2msl_scan(int reuse)
{
struct tcptw *tw;
+ struct inpcb *inp;
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
-
- TW_WLOCK(V_tw_lock);
- tw = TAILQ_FIRST(&V_twq_2msl);
- if (tw == NULL) {
- TW_WUNLOCK(V_tw_lock);
- return NULL;
+#ifdef INVARIANTS
+ if (reuse) {
+ /*
+ * pcbinfo lock is needed in reuse case to prevent deadlock
+ * as two inpcb locks can be acquired simultaneously:
+ * - the inpcb transitioning to TIME_WAIT state in
+ * tcp_tw_start(),
+ * - the inpcb closed by tcp_twclose().
+ */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
}
- TW_WUNLOCK(V_tw_lock);
-
- INP_WLOCK(tw->tw_inpcb);
- tcp_twclose(tw, 1);
-
- return (tw);
-}
-
-void
-tcp_tw_2msl_scan(void)
-{
- struct tcptw *tw;
+#endif
for (;;) {
TW_RLOCK(V_tw_lock);
tw = TAILQ_FIRST(&V_twq_2msl);
- if (tw == NULL || tw->tw_time - ticks > 0) {
+ if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) {
TW_RUNLOCK(V_tw_lock);
break;
}
- tw_pcbref(tw);
+ KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL",
+ __func__));
+
+ inp = tw->tw_inpcb;
+ in_pcbref(inp);
TW_RUNLOCK(V_tw_lock);
- /* Close timewait state */
if (INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
- if (tw_pcbrele(tw)) {
+
+ INP_WLOCK(inp);
+ tw = intotw(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ KASSERT(tw == NULL, ("%s: held last inp "
+ "reference but tw not NULL", __func__));
INP_INFO_WUNLOCK(&V_tcbinfo);
continue;
}
- KASSERT(tw->tw_inpcb != NULL,
- ("%s: tw->tw_inpcb == NULL", __func__));
- INP_WLOCK(tw->tw_inpcb);
- tcp_twclose(tw, 0);
+ if (tw == NULL) {
+ /* tcp_twclose() has already been called */
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ continue;
+ }
+
+ tcp_twclose(tw, reuse);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ if (reuse)
+ return tw;
} else {
- /* INP_INFO lock is busy; continue later. */
- tw_pcbrele(tw);
+ /* INP_INFO lock is busy, continue later. */
+ INP_WLOCK(inp);
+ if (!in_pcbrele_wlocked(inp))
+ INP_WUNLOCK(inp);
break;
}
}
+
+ return NULL;
}
Modified: head/sys/netinet/tcp_usrreq.c
==============================================================================
--- head/sys/netinet/tcp_usrreq.c Thu Oct 30 08:50:01 2014 (r273849)
+++ head/sys/netinet/tcp_usrreq.c Thu Oct 30 08:53:56 2014 (r273850)
@@ -183,6 +183,21 @@ tcp_detach(struct socket *so, struct inp
* present until timewait ends.
*
* XXXRW: Would it be cleaner to free the tcptw here?
+ *
+ * Astute question indeed, from twtcp perspective there are
+ * three cases to consider:
+ *
+ * #1 tcp_detach is called at tcptw creation time by
+ * tcp_twstart, then do not discard the newly created tcptw
+ * and leave inpcb present until timewait ends
+ * #2 tcp_detach is called at timewait end (or reuse) by
+ * tcp_twclose, then the tcptw has already been discarded
+ * and inpcb is freed here
+ * #3 tcp_detach is called() after timewait ends (or reuse)
+ * (e.g. by soclose), then tcptw has already been discarded
+ * and inpcb is freed here
+ *
+ * In all three cases the tcptw should not be freed here.
*/
if (inp->inp_flags & INP_DROPPED) {
KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h Thu Oct 30 08:50:01 2014 (r273849)
+++ head/sys/netinet/tcp_var.h Thu Oct 30 08:53:56 2014 (r273850)
@@ -358,7 +358,6 @@ struct tcptw {
u_int t_starttime;
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
- u_int tw_refcount; /* refcount */
};
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
@@ -651,7 +650,7 @@ struct tcpcb *
tcp_close(struct tcpcb *);
void tcp_discardcb(struct tcpcb *);
void tcp_twstart(struct tcpcb *);
-void tcp_twclose(struct tcptw *_tw, int _reuse);
+void tcp_twclose(struct tcptw *, int);
void tcp_ctlinput(int, struct sockaddr *, void *);
int tcp_ctloutput(struct socket *, struct sockopt *);
struct tcpcb *
More information about the svn-src-all
mailing list