svn commit: r354483 - in head/sys/netinet: . tcp_stacks

Gleb Smirnoff glebius at FreeBSD.org
Thu Nov 7 21:23:08 UTC 2019


Author: glebius
Date: Thu Nov  7 21:23:07 2019
New Revision: 354483
URL: https://svnweb.freebsd.org/changeset/base/354483

Log:
  Now that all of the tcp_input() and all its branches are executed
  in the network epoch, we can greatly simplify synchronization.
  Remove all unneccesary epoch enters hidden under INP_INFO_RLOCK macro.
  Remove some unneccesary assertions and convert necessary ones into the
  NET_EPOCH_ASSERT macro.

Modified:
  head/sys/netinet/tcp_input.c
  head/sys/netinet/tcp_stacks/bbr.c
  head/sys/netinet/tcp_stacks/rack.c
  head/sys/netinet/tcp_stacks/rack_bbr_common.c

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c	Thu Nov  7 21:14:59 2019	(r354482)
+++ head/sys/netinet/tcp_input.c	Thu Nov  7 21:23:07 2019	(r354483)
@@ -561,7 +561,6 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 	int rstreason = 0;	/* For badport_bandlim accounting purposes */
 	uint8_t iptos;
 	struct m_tag *fwd_tag = NULL;
-	struct epoch_tracker et;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 	int isipv6;
@@ -570,7 +569,6 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 #endif /* INET6 */
 	struct tcpopt to;		/* options in this segment */
 	char *s = NULL;			/* address and port logging */
-	int ti_locked;
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
@@ -581,6 +579,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 	short ostate = 0;
 #endif
 
+	NET_EPOCH_ASSERT();
+
 #ifdef INET6
 	isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
 #endif
@@ -747,19 +747,6 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 	drop_hdrlen = off0 + off;
 
 	/*
-	 * Locate pcb for segment; if we're likely to add or remove a
-	 * connection then first acquire pcbinfo lock.  There are three cases
-	 * where we might discover later we need a write lock despite the
-	 * flags: ACKs moving a connection out of the syncache, ACKs for a
-	 * connection in TIMEWAIT and SYNs not targeting a listening socket.
-	 */
-	if ((thflags & (TH_FIN | TH_RST)) != 0) {
-		INP_INFO_RLOCK_ET(&V_tcbinfo, et);
-		ti_locked = TI_RLOCKED;
-	} else
-		ti_locked = TI_UNLOCKED;
-
-	/*
 	 * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain.
 	 */
         if (
@@ -776,13 +763,6 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 		fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL);
 
 findpcb:
-#ifdef INVARIANTS
-	if (ti_locked == TI_RLOCKED) {
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-	} else {
-		INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
-	}
-#endif
 #ifdef INET6
 	if (isipv6 && fwd_tag != NULL) {
 		struct sockaddr_in6 *next_hop6;
@@ -942,12 +922,6 @@ findpcb:
 	 * XXXRW: It may be time to rethink timewait locking.
 	 */
 	if (inp->inp_flags & INP_TIMEWAIT) {
-		if (ti_locked == TI_UNLOCKED) {
-			INP_INFO_RLOCK_ET(&V_tcbinfo, et);
-			ti_locked = TI_RLOCKED;
-		}
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-
 		if (thflags & TH_SYN)
 			tcp_dooptions(&to, optp, optlen, TO_SYN);
 		/*
@@ -955,7 +929,6 @@ findpcb:
 		 */
 		if (tcp_twcheck(inp, &to, th, m, tlen))
 			goto findpcb;
-		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 		return (IPPROTO_DONE);
 	}
 	/*
@@ -977,27 +950,6 @@ findpcb:
 	}
 #endif
 
-	/*
-	 * We've identified a valid inpcb, but it could be that we need an
-	 * inpcbinfo write lock but don't hold it.  In this case, attempt to
-	 * acquire using the same strategy as the TIMEWAIT case above.  If we
-	 * relock, we have to jump back to 'relocked' as the connection might
-	 * now be in TIMEWAIT.
-	 */
-#ifdef INVARIANTS
-	if ((thflags & (TH_FIN | TH_RST)) != 0)
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-#endif
-	if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) ||
-	      (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) &&
-	       !IS_FASTOPEN(tp->t_flags)))) {
-		if (ti_locked == TI_UNLOCKED) {
-			INP_INFO_RLOCK_ET(&V_tcbinfo, et);
-			ti_locked = TI_RLOCKED;
-		}
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-	}
-
 #ifdef MAC
 	INP_WLOCK_ASSERT(inp);
 	if (mac_inpcb_check_deliver(inp, m))
@@ -1052,7 +1004,6 @@ findpcb:
 		 */
 		if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) {
 
-			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 			/*
 			 * Parse the TCP options here because
 			 * syncookies need access to the reflected
@@ -1130,8 +1081,6 @@ tfo_socket_result:
 			TCP_PROBE5(receive, NULL, tp, m, tp, th);
 			tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
 			    iptos);
-			if (ti_locked == TI_RLOCKED)
-				INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 			return (IPPROTO_DONE);
 		}
 		/*
@@ -1334,10 +1283,6 @@ tfo_socket_result:
 		 * Entry added to syncache and mbuf consumed.
 		 * Only the listen socket is unlocked by syncache_add().
 		 */
-		if (ti_locked == TI_RLOCKED) {
-			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
-			ti_locked = TI_UNLOCKED;
-		}
 		INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
 		return (IPPROTO_DONE);
 	} else if (tp->t_state == TCPS_LISTEN) {
@@ -1370,25 +1315,11 @@ tfo_socket_result:
 	 * the inpcb, and unlocks pcbinfo.
 	 */
 	tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos);
-	if (ti_locked == TI_RLOCKED)
-		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 	return (IPPROTO_DONE);
 
 dropwithreset:
 	TCP_PROBE5(receive, NULL, tp, m, tp, th);
 
-	if (ti_locked == TI_RLOCKED) {
-		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
-		ti_locked = TI_UNLOCKED;
-	}
-#ifdef INVARIANTS
-	else {
-		KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset "
-		    "ti_locked: %d", __func__, ti_locked));
-		INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
-	}
-#endif
-
 	if (inp != NULL) {
 		tcp_dropwithreset(m, th, tp, tlen, rstreason);
 		INP_WUNLOCK(inp);
@@ -1401,18 +1332,6 @@ dropunlock:
 	if (m != NULL)
 		TCP_PROBE5(receive, NULL, tp, m, tp, th);
 
-	if (ti_locked == TI_RLOCKED) {
-		INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
-		ti_locked = TI_UNLOCKED;
-	}
-#ifdef INVARIANTS
-	else {
-		KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock "
-		    "ti_locked: %d", __func__, ti_locked));
-		INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
-	}
-#endif
-
 	if (inp != NULL)
 		INP_WUNLOCK(inp);
 
@@ -1501,7 +1420,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, stru
 	struct mbuf *mfree;
 	struct tcpopt to;
 	int tfo_syn;
-	
+
 #ifdef TCPDEBUG
 	/*
 	 * The size of tcp_saveipgen must be the size of the max ip header,
@@ -1516,16 +1435,8 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, stru
 	tp->sackhint.last_sack_ack = 0;
 	sack_changed = 0;
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
-	/*
-	 * If this is either a state-changing packet or current state isn't
-	 * established, we require a write lock on tcbinfo.  Otherwise, we
-	 * allow the tcbinfo to be in either alocked or unlocked, as the
-	 * caller may have unnecessarily acquired a write lock due to a race.
-	 */
-	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
-	    tp->t_state != TCPS_ESTABLISHED) {
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-	}
+
+	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));
@@ -2047,7 +1958,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, stru
 			tcp_state_change(tp, TCPS_SYN_RECEIVED);
 		}
 
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		INP_WLOCK_ASSERT(tp->t_inpcb);
 
 		/*
@@ -2120,7 +2030,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, stru
 		    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
 		    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
 
-			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 			KASSERT(tp->t_state != TCPS_SYN_SENT,
 			    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
 			    __func__, th, tp));
@@ -2163,8 +2072,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, stru
 	 */
 	if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT &&
 	    tp->t_state != TCPS_SYN_RECEIVED) {
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-
 		TCPSTAT_INC(tcps_badsyn);
 		if (V_tcp_insecure_syn &&
 		    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
@@ -2288,8 +2195,6 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, stru
 	 */
 	if ((so->so_state & SS_NOFDREF) &&
 	    tp->t_state > TCPS_CLOSE_WAIT && tlen) {
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-
 		if ((s = tcp_log_addrs(inc, th, NULL, NULL))) {
 			log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data "
 			    "after socket was closed, "
@@ -2875,7 +2780,6 @@ process_ACK:
 		 */
 		case TCPS_CLOSING:
 			if (ourfinisacked) {
-				INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 				tcp_twstart(tp);
 				m_freem(m);
 				return;
@@ -2890,7 +2794,6 @@ process_ACK:
 		 */
 		case TCPS_LAST_ACK:
 			if (ourfinisacked) {
-				INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 				tp = tcp_close(tp);
 				goto drop;
 			}
@@ -3139,8 +3042,6 @@ dodata:							/* XXX */
 		 * standard timers.
 		 */
 		case TCPS_FIN_WAIT_2:
-			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-
 			tcp_twstart(tp);
 			return;
 		}

Modified: head/sys/netinet/tcp_stacks/bbr.c
==============================================================================
--- head/sys/netinet/tcp_stacks/bbr.c	Thu Nov  7 21:14:59 2019	(r354482)
+++ head/sys/netinet/tcp_stacks/bbr.c	Thu Nov  7 21:23:07 2019	(r354483)
@@ -8618,7 +8618,6 @@ dodata:				/* XXX */
 			bbr->rc_timer_first = 1;
 			bbr_timer_cancel(bbr,
 			    __LINE__, bbr->r_ctl.rc_rcvtime);
-			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 			INP_WLOCK_ASSERT(tp->t_inpcb);
 			tcp_twstart(tp);
 			return (1);
@@ -9619,7 +9618,6 @@ bbr_check_data_after_close(struct mbuf *m, struct tcp_
     struct tcpcb *tp, int32_t * tlen, struct tcphdr *th, struct socket *so)
 {
 
-	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	if (bbr->rc_allow_data_af_clo == 0) {
 close_now:
 		tp = tcp_close(tp);
@@ -9861,7 +9859,6 @@ bbr_do_closing(struct mbuf *m, struct tcphdr *th, stru
 		return (ret_val);
 	}
 	if (ourfinisacked) {
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		tcp_twstart(tp);
 		m_freem(m);
 		return (1);
@@ -9974,7 +9971,6 @@ bbr_do_lastack(struct mbuf *m, struct tcphdr *th, stru
 		return (ret_val);
 	}
 	if (ourfinisacked) {
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		tp = tcp_close(tp);
 		ctf_do_drop(m, tp);
 		return (1);

Modified: head/sys/netinet/tcp_stacks/rack.c
==============================================================================
--- head/sys/netinet/tcp_stacks/rack.c	Thu Nov  7 21:14:59 2019	(r354482)
+++ head/sys/netinet/tcp_stacks/rack.c	Thu Nov  7 21:23:07 2019	(r354483)
@@ -5875,7 +5875,6 @@ dodata:				/* XXX */
 		case TCPS_FIN_WAIT_2:
 			rack_timer_cancel(tp, rack,
 			    rack->r_ctl.rc_rcvtime, __LINE__);
-			INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 			tcp_twstart(tp);
 			return (1);
 		}
@@ -6353,7 +6352,6 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, st
 		tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
 		tcp_state_change(tp, TCPS_SYN_RECEIVED);
 	}
-	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	/*
 	 * Advance th->th_seq to correspond to first data byte. If data,
@@ -6847,7 +6845,6 @@ rack_check_data_after_close(struct mbuf *m, 
 {
 	struct tcp_rack *rack;
 
-	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 	rack = (struct tcp_rack *)tp->t_fb_ptr;
 	if (rack->rc_allow_data_af_clo == 0) {
 	close_now:
@@ -7079,7 +7076,6 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, str
 		return (ret_val);
 	}
 	if (ourfinisacked) {
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		tcp_twstart(tp);
 		m_freem(m);
 		return (1);
@@ -7187,7 +7183,6 @@ rack_do_lastack(struct mbuf *m, struct tcphdr *th, str
 		return (ret_val);
 	}
 	if (ourfinisacked) {
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		tp = tcp_close(tp);
 		ctf_do_drop(m, tp);
 		return (1);
@@ -7650,16 +7645,8 @@ rack_do_segment_nounlock(struct mbuf *m, struct tcphdr
 	kern_prefetch(rack, &prev_state);
 	prev_state = 0;
 	thflags = th->th_flags;
-	/*
-	 * If this is either a state-changing packet or current state isn't
-	 * established, we require a read lock on tcbinfo.  Otherwise, we
-	 * allow the tcbinfo to be in either locked or unlocked, as the
-	 * caller may have unnecessarily acquired a lock due to a race.
-	 */
-	if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
-	    tp->t_state != TCPS_ESTABLISHED) {
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
-	}
+
+	NET_EPOCH_ASSERT();
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
 	    __func__));

Modified: head/sys/netinet/tcp_stacks/rack_bbr_common.c
==============================================================================
--- head/sys/netinet/tcp_stacks/rack_bbr_common.c	Thu Nov  7 21:14:59 2019	(r354482)
+++ head/sys/netinet/tcp_stacks/rack_bbr_common.c	Thu Nov  7 21:23:07 2019	(r354483)
@@ -253,7 +253,6 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socke
 	 */
 	struct mbuf *m_save;
 	struct ether_header *eh;
-	struct epoch_tracker et;
 	struct tcphdr *th;
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
@@ -268,14 +267,8 @@ ctf_process_inbound_raw(struct tcpcb *tp, struct socke
 	uint16_t drop_hdrlen;
 	uint8_t iptos, no_vn=0, bpf_req=0;
 
-	/* 
-	 * This is a bit deceptive, we get the
-	 * "info epoch" which is really the network
-	 * epoch. This covers us on both any INP
-	 * type change but also if the ifp goes
-	 * away it covers us as well.
-	 */
-	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+	NET_EPOCH_ASSERT();
+
 	if (m && m->m_pkthdr.rcvif)
 		ifp = m->m_pkthdr.rcvif;
 	else
@@ -445,7 +438,6 @@ skip_vnet:
 			}
 			if (no_vn == 0)
 				CURVNET_RESTORE();
-			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 			return(retval);
 		}
 skipped_pkt:
@@ -453,7 +445,6 @@ skipped_pkt:
 	}
 	if (no_vn == 0)
 		CURVNET_RESTORE();
-	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
 	return(retval);
 }
 
@@ -680,7 +671,6 @@ ctf_process_rst(struct mbuf *m, struct tcphdr *th, str
 	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
 	    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
 
-		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
 		KASSERT(tp->t_state != TCPS_SYN_SENT,
 		    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
 		    __func__, th, tp));
@@ -732,7 +722,8 @@ ctf_process_rst(struct mbuf *m, struct tcphdr *th, str
 void
 ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
 {
-	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+	NET_EPOCH_ASSERT();
 
 	TCPSTAT_INC(tcps_badsyn);
 	if (V_tcp_insecure_syn &&


More information about the svn-src-all mailing list