git: fa50e98328b4 - stable/13 - mend

Michael Tuexen tuexen at FreeBSD.org
Mon Jun 7 09:01:53 UTC 2021


The branch stable/13 has been updated by tuexen:

URL: https://cgit.FreeBSD.org/src/commit/?id=fa50e98328b48da4fa8dbd97d0a787962cf249f5

commit fa50e98328b48da4fa8dbd97d0a787962cf249f5
Author:     Michael Tuexen <tuexen at FreeBSD.org>
AuthorDate: 2021-04-18 14:08:08 +0000
Commit:     Michael Tuexen <tuexen at FreeBSD.org>
CommitDate: 2021-06-07 09:01:28 +0000

    mend
---
 share/man/man4/tcp.4          |  15 +-
 sys/netinet/tcp.h             |   1 +
 sys/netinet/tcp_input.c       |  48 ++++-
 sys/netinet/tcp_output.c      |  80 ++++++--
 sys/netinet/tcp_stacks/bbr.c  |  38 +---
 sys/netinet/tcp_stacks/rack.c |  26 +--
 sys/netinet/tcp_subr.c        | 462 ++++++++++++++++++++++++++++++++++++++++--
 sys/netinet/tcp_syncache.c    | 127 +++++++++---
 sys/netinet/tcp_syncache.h    |  12 +-
 sys/netinet/tcp_timewait.c    |  84 ++++++--
 sys/netinet/tcp_usrreq.c      |  30 +++
 sys/netinet/tcp_var.h         |  27 ++-
 sys/netinet/toecore.c         |   4 +-
 sys/netinet6/tcp6_var.h       |   2 +
 sys/sys/mbuf.h                |   1 +
 usr.bin/netstat/inet.c        |   4 +
 usr.bin/sockstat/sockstat.1   |   6 +-
 usr.bin/sockstat/sockstat.c   |  13 +-
 18 files changed, 822 insertions(+), 158 deletions(-)

diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index d01505e58427..b5735a40b320 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -34,7 +34,7 @@
 .\"     From: @(#)tcp.4	8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd April 8, 2021
+.Dd April 18, 2021
 .Dt TCP 4
 .Os
 .Sh NAME
@@ -329,6 +329,9 @@ currently executing.
 This is typically used after a process or thread inherits a listen
 socket from its parent, and sets its CPU affinity to a particular core.
 .El
+.It Dv TCP_REMOTE_UDP_ENCAPS_PORT
+Set and get the remote UDP encapsulation port.
+It can only be set on a closed TCP socket.
 .El
 .Pp
 The option level for the
@@ -752,6 +755,16 @@ A CSV list of template_spec=percent key-value pairs which controls the per
 template sampling rates when
 .Xr stats 3
 sampling is enabled.
+.It Va udp_tunneling_port
+The local UDP encapsulation port.
+A value of 0 indicates that UDP encapsulation is disabled.
+The default is 0.
+.It Va udp_tunneling_overhead
+The overhead taken into account when using UDP encapsulation.
+Since MSS clamping by middleboxes will most likely not work, values larger than
+8 (the size of the UDP header) are also supported.
+Supported values are between 8 and 1024.
+The default is 8.
 .El
 .Sh ERRORS
 A socket operation may fail with one of the following errors returned:
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 0b71bd4658f8..d2bf1f8431fd 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -183,6 +183,7 @@ struct tcphdr {
 #define	TCP_RXTLS_MODE	42	/* Receive TLS mode */
 #define	TCP_CONGESTION	64	/* get/set congestion control algorithm */
 #define	TCP_CCALGOOPT	65	/* get/set cc algorithm specific options */
+#define TCP_REMOTE_UDP_ENCAPS_PORT 71	/* Enable TCP over UDP tunneling via the specified port */
 #define TCP_DELACK  	72	/* socket option for delayed ack */
 #define TCP_FIN_IS_RST 73	/* A fin from the peer is treated has a RST */
 #define TCP_LOG_LIMIT  74	/* Limit to number of records in tcp-log */
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 397cbc5084e6..d36f9566ffba 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -123,6 +123,7 @@ __FBSDID("$FreeBSD$");
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
+#include <netinet/udp.h>
 
 #include <netipsec/ipsec_support.h>
 
@@ -573,7 +574,7 @@ cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
  */
 #ifdef INET6
 int
-tcp6_input(struct mbuf **mp, int *offp, int proto)
+tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
 {
 	struct mbuf *m;
 	struct in6_ifaddr *ia6;
@@ -603,12 +604,19 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
 	}
 
 	*mp = m;
-	return (tcp_input(mp, offp, proto));
+	return (tcp_input_with_port(mp, offp, proto, port));
+}
+
+int
+tcp6_input(struct mbuf **mp, int *offp, int proto)
+{
+
+	return(tcp6_input_with_port(mp, offp, proto, 0));
 }
 #endif /* INET6 */
 
 int
-tcp_input(struct mbuf **mp, int *offp, int proto)
+tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
 {
 	struct mbuf *m = *mp;
 	struct tcphdr *th = NULL;
@@ -664,6 +672,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 		ip6 = mtod(m, struct ip6_hdr *);
 		th = (struct tcphdr *)((caddr_t)ip6 + off0);
 		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
+		if (port)
+			goto skip6_csum;
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				th->th_sum = m->m_pkthdr.csum_data;
@@ -677,7 +687,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 			TCPSTAT_INC(tcps_rcvbadsum);
 			goto drop;
 		}
-
+	skip6_csum:
 		/*
 		 * Be proactive about unspecified IPv6 address in source.
 		 * As we use all-zero to indicate unbounded/unconnected pcb,
@@ -718,6 +728,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 		tlen = ntohs(ip->ip_len) - off0;
 
 		iptos = ip->ip_tos;
+		if (port)
+			goto skip_csum;
 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
 				th->th_sum = m->m_pkthdr.csum_data;
@@ -747,8 +759,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
 			ip->ip_v = IPVERSION;
 			ip->ip_hl = off0 >> 2;
 		}
-
-		if (th->th_sum) {
+	skip_csum:
+		if (th->th_sum && (port == 0)) {
 			TCPSTAT_INC(tcps_rcvbadsum);
 			goto drop;
 		}
@@ -1006,6 +1018,11 @@ findpcb:
 		goto dropwithreset;
 	}
 
+	if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
+		rstreason = BANDLIM_RST_CLOSEDPORT;
+		goto dropwithreset;
+	}
+
 #ifdef TCP_OFFLOAD
 	if (tp->t_flags & TF_TOE) {
 		tcp_offload_input(tp, m);
@@ -1077,7 +1094,7 @@ findpcb:
 			 * NB: syncache_expand() doesn't unlock
 			 * inp and tcpinfo locks.
 			 */
-			rstreason = syncache_expand(&inc, &to, th, &so, m);
+			rstreason = syncache_expand(&inc, &to, th, &so, m, port);
 			if (rstreason < 0) {
 				/*
 				 * A failing TCP MD5 signature comparison
@@ -1157,7 +1174,7 @@ tfo_socket_result:
 		 * causes.
 		 */
 		if (thflags & TH_RST) {
-			syncache_chkrst(&inc, th, m);
+			syncache_chkrst(&inc, th, m, port);
 			goto dropunlock;
 		}
 		/*
@@ -1179,7 +1196,7 @@ tfo_socket_result:
 				log(LOG_DEBUG, "%s; %s: Listen socket: "
 				    "SYN|ACK invalid, segment rejected\n",
 				    s, __func__);
-			syncache_badack(&inc);	/* XXX: Not needed! */
+			syncache_badack(&inc, port);	/* XXX: Not needed! */
 			TCPSTAT_INC(tcps_badsyn);
 			rstreason = BANDLIM_RST_OPENPORT;
 			goto dropwithreset;
@@ -1336,7 +1353,8 @@ tfo_socket_result:
 #endif
 		TCP_PROBE3(debug__input, tp, th, m);
 		tcp_dooptions(&to, optp, optlen, TO_SYN);
-		if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL, iptos))
+		if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL, iptos,
+		    port))
 			goto tfo_socket_result;
 
 		/*
@@ -1467,6 +1485,12 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
 	return (newsize);
 }
 
+int
+tcp_input(struct mbuf **mp, int *offp, int proto)
+{
+	return(tcp_input_with_port(mp, offp, proto, 0));
+}
+
 void
 tcp_handle_wakeup(struct tcpcb *tp, struct socket *so)
 {
@@ -3671,11 +3695,13 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
 			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
 			    sizeof (struct tcpiphdr);
 #else
-	const size_t min_protoh = sizeof(struct tcpiphdr);
+	 size_t min_protoh = sizeof(struct tcpiphdr);
 #endif
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
+	if (tp->t_port)
+		min_protoh += V_tcp_udp_tunneling_overhead;
 	if (mtuoffer != -1) {
 		KASSERT(offer == -1, ("%s: conflict", __func__));
 		offer = mtuoffer - min_protoh;
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index e23cdc749e98..5bda2be14df0 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -101,6 +101,8 @@ __FBSDID("$FreeBSD$");
 
 #include <netipsec/ipsec_support.h>
 
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
 #include <machine/in_cksum.h>
 
 #include <security/mac/mac_framework.h>
@@ -207,7 +209,7 @@ tcp_output(struct tcpcb *tp)
 #endif
 	struct tcphdr *th;
 	u_char opt[TCP_MAXOLEN];
-	unsigned ipoptlen, optlen, hdrlen;
+	unsigned ipoptlen, optlen, hdrlen, ulen;
 #if defined(IPSEC) || defined(IPSEC_SUPPORT)
 	unsigned ipsec_optlen = 0;
 #endif
@@ -216,6 +218,7 @@ tcp_output(struct tcpcb *tp)
 	struct sackhole *p;
 	int tso, mtu;
 	struct tcpopt to;
+	struct udphdr *udp = NULL;
 	unsigned int wanted_cookie = 0;
 	unsigned int dont_sendalot = 0;
 #if 0
@@ -558,6 +561,7 @@ after_sack_rexmit:
 #endif
 
 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
+	    (tp->t_port == 0) &&
 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
 	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
 	    ipoptlen == 0 && !(flags & TH_SYN))
@@ -800,6 +804,8 @@ send:
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
+			if (tp->t_port)
+				to.to_mss -= V_tcp_udp_tunneling_overhead;
 			to.to_flags |= TOF_MSS;
 
 			/*
@@ -887,7 +893,14 @@ send:
 		    !(to.to_flags & TOF_FASTOPEN))
 			len = 0;
 	}
-
+	if (tp->t_port) {
+		if (V_tcp_udp_tunneling_port == 0) {
+			/* The port was removed?? */
+			SOCKBUF_UNLOCK(&so->so_snd);
+			return (EHOSTUNREACH);
+		}
+		hdrlen += sizeof(struct udphdr);
+	}
 	/*
 	 * Adjust data length if insertion of options will
 	 * bump the packet length beyond the t_maxseg length.
@@ -1140,8 +1153,17 @@ send:
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
-		th = (struct tcphdr *)(ip6 + 1);
-		tcpip_fillheaders(tp->t_inpcb, ip6, th);
+		if (tp->t_port) {
+			udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
+			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+			udp->uh_dport = tp->t_port;
+			ulen = hdrlen + len - sizeof(struct ip6_hdr);
+			udp->uh_ulen = htons(ulen);
+			th = (struct tcphdr *)(udp + 1);
+		} else {
+			th = (struct tcphdr *)(ip6 + 1);
+		}
+		tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip6, th);
 	} else
 #endif /* INET6 */
 	{
@@ -1149,8 +1171,16 @@ send:
 #ifdef TCPDEBUG
 		ipov = (struct ipovly *)ip;
 #endif
-		th = (struct tcphdr *)(ip + 1);
-		tcpip_fillheaders(tp->t_inpcb, ip, th);
+		if (tp->t_port) {
+			udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
+			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+			udp->uh_dport = tp->t_port;
+			ulen = hdrlen + len - sizeof(struct ip);
+			udp->uh_ulen = htons(ulen);
+			th = (struct tcphdr *)(udp + 1);
+		} else
+			th = (struct tcphdr *)(ip + 1);
+		tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip, th);
 	}
 
 	/*
@@ -1309,7 +1339,6 @@ send:
 	 * checksum extended header and data.
 	 */
 	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
-	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 
 #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
 	if (to.to_flags & TOF_SIGNATURE) {
@@ -1336,9 +1365,19 @@ send:
 		 * There is no need to fill in ip6_plen right now.
 		 * It will be filled later by ip6_output.
 		 */
-		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
-		th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
-		    optlen + len, IPPROTO_TCP, 0);
+		if (tp->t_port) {
+			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
+			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+			udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
+			th->th_sum = htons(0);
+			UDPSTAT_INC(udps_opackets);
+		} else {
+			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
+			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+			th->th_sum = in6_cksum_pseudo(ip6,
+			    sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
+			    0);
+		}
 	}
 #endif
 #if defined(INET6) && defined(INET)
@@ -1346,9 +1385,20 @@ send:
 #endif
 #ifdef INET
 	{
-		m->m_pkthdr.csum_flags = CSUM_TCP;
-		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
-		    htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
+		if (tp->t_port) {
+			m->m_pkthdr.csum_flags = CSUM_UDP;
+			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
+			   ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
+			th->th_sum = htons(0);
+			UDPSTAT_INC(udps_opackets);
+		} else {
+			m->m_pkthdr.csum_flags = CSUM_TCP;
+			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+			th->th_sum = in_pseudo(ip->ip_src.s_addr,
+			    ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
+			    IPPROTO_TCP + len + optlen));
+		}
 
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
@@ -1473,8 +1523,10 @@ send:
 	 * NB: Don't set DF on small MTU/MSS to have a safe fallback.
 	 */
 	if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
-		ip->ip_off |= htons(IP_DF);
 		tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+		if (tp->t_port == 0 || len < V_tcp_minmss) {
+			ip->ip_off |= htons(IP_DF);
+		}
 	} else {
 		tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
 	}
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index cc20d6bf52ca..1ee8d26446fd 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -11969,14 +11969,10 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
 #endif
 	struct tcp_bbr *bbr;
 	struct tcphdr *th;
-#ifdef NETFLIX_TCPOUDP
 	struct udphdr *udp = NULL;
-#endif
 	u_char opt[TCP_MAXOLEN];
 	unsigned ipoptlen, optlen, hdrlen;
-#ifdef NETFLIX_TCPOUDP
 	unsigned ulen;
-#endif
 	uint32_t bbr_seq;
 	uint32_t delay_calc=0;
 	uint8_t doing_tlp = 0;
@@ -12991,10 +12987,8 @@ send:
 		/* Maximum segment size. */
 		if (flags & TH_SYN) {
 			to.to_mss = tcp_mssopt(&inp->inp_inc);
-#ifdef NETFLIX_TCPOUDP
 			if (tp->t_port)
 				to.to_mss -= V_tcp_udp_tunneling_overhead;
-#endif
 			to.to_flags |= TOF_MSS;
 			/*
 			 * On SYN or SYN|ACK transmits on TFO connections,
@@ -13063,7 +13057,6 @@ send:
 		    !(to.to_flags & TOF_FASTOPEN))
 			len = 0;
 	}
-#ifdef NETFLIX_TCPOUDP
 	if (tp->t_port) {
 		if (V_tcp_udp_tunneling_port == 0) {
 			/* The port was removed?? */
@@ -13072,7 +13065,6 @@ send:
 		}
 		hdrlen += sizeof(struct udphdr);
 	}
-#endif
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
@@ -13408,7 +13400,6 @@ send:
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
-#ifdef NETFLIX_TCPOUDP
 		if (tp->t_port) {
 			udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13417,17 +13408,9 @@ send:
 			udp->uh_ulen = htons(ulen);
 			th = (struct tcphdr *)(udp + 1);
 		} else {
-#endif
 			th = (struct tcphdr *)(ip6 + 1);
-
-#ifdef NETFLIX_TCPOUDP
 		}
-#endif
-		tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
-				  tp->t_port,
-#endif
-				  ip6, th);
+		tcpip_fillheaders(inp, tp->t_port, ip6, th);
 	} else
 #endif				/* INET6 */
 	{
@@ -13435,7 +13418,6 @@ send:
 #ifdef TCPDEBUG
 		ipov = (struct ipovly *)ip;
 #endif
-#ifdef NETFLIX_TCPOUDP
 		if (tp->t_port) {
 			udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13443,14 +13425,10 @@ send:
 			ulen = hdrlen + len - sizeof(struct ip);
 			udp->uh_ulen = htons(ulen);
 			th = (struct tcphdr *)(udp + 1);
-		} else
-#endif
+		} else {
 			th = (struct tcphdr *)(ip + 1);
-		tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
-				  tp->t_port,
-#endif
-				  ip, th);
+		}
+		tcpip_fillheaders(inp, tp->t_port, ip, th);
 	}
 	/*
 	 * If we are doing retransmissions, then snd_nxt will not reflect
@@ -13600,7 +13578,6 @@ send:
 		 * ip6_plen is not need to be filled now, and will be filled
 		 * in ip6_output.
 		 */
-#ifdef NETFLIX_TCPOUDP
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
@@ -13608,14 +13585,11 @@ send:
 			th->th_sum = htons(0);
 			UDPSTAT_INC(udps_opackets);
 		} else {
-#endif
 			csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
 			    optlen + len, IPPROTO_TCP, 0);
-#ifdef NETFLIX_TCPOUDP
 		}
-#endif
 	}
 #endif
 #if defined(INET6) && defined(INET)
@@ -13623,7 +13597,6 @@ send:
 #endif
 #ifdef INET
 	{
-#ifdef NETFLIX_TCPOUDP
 		if (tp->t_port) {
 			m->m_pkthdr.csum_flags = CSUM_UDP;
 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
@@ -13632,15 +13605,12 @@ send:
 			th->th_sum = htons(0);
 			UDPSTAT_INC(udps_opackets);
 		} else {
-#endif
 			csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP;
 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
 			    ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
 			    IPPROTO_TCP + len + optlen));
-#ifdef NETFLIX_TCPOUDP
 		}
-#endif
 		/* IP version must be set here for ipv4/ipv6 checking later */
 		KASSERT(ip->ip_v == IPVERSION,
 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 0ee73a95a6d7..12827d1699d0 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -13008,10 +13008,8 @@ send:
 		if (flags & TH_SYN) {
 			tp->snd_nxt = tp->iss;
 			to.to_mss = tcp_mssopt(&inp->inp_inc);
-#ifdef NETFLIX_TCPOUDP
 			if (tp->t_port)
 				to.to_mss -= V_tcp_udp_tunneling_overhead;
-#endif
 			to.to_flags |= TOF_MSS;
 
 			/*
@@ -13088,7 +13086,6 @@ send:
 		    !(to.to_flags & TOF_FASTOPEN))
 			len = 0;
 	}
-#ifdef NETFLIX_TCPOUDP
 	if (tp->t_port) {
 		if (V_tcp_udp_tunneling_port == 0) {
 			/* The port was removed?? */
@@ -13097,7 +13094,6 @@ send:
 		}
 		hdrlen += sizeof(struct udphdr);
 	}
-#endif
 #ifdef INET6
 	if (isipv6)
 		ipoptlen = ip6_optlen(tp->t_inpcb);
@@ -13372,7 +13368,6 @@ send:
 #ifdef INET6
 	if (isipv6) {
 		ip6 = mtod(m, struct ip6_hdr *);
-#ifdef NETFLIX_TCPOUDP
 		if (tp->t_port) {
 			udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13380,14 +13375,10 @@ send:
 			ulen = hdrlen + len - sizeof(struct ip6_hdr);
 			udp->uh_ulen = htons(ulen);
 			th = (struct tcphdr *)(udp + 1);
-		} else
-#endif
+		} else {
 			th = (struct tcphdr *)(ip6 + 1);
-		tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
-				  tp->t_port,
-#endif
-				  ip6, th);
+		}
+		tcpip_fillheaders(inp, tp->t_port, ip6, th);
 	} else
 #endif				/* INET6 */
 	{
@@ -13395,7 +13386,6 @@ send:
 #ifdef TCPDEBUG
 		ipov = (struct ipovly *)ip;
 #endif
-#ifdef NETFLIX_TCPOUDP
 		if (tp->t_port) {
 			udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13403,14 +13393,10 @@ send:
 			ulen = hdrlen + len - sizeof(struct ip);
 			udp->uh_ulen = htons(ulen);
 			th = (struct tcphdr *)(udp + 1);
-		} else
-#endif
+		} else {
 			th = (struct tcphdr *)(ip + 1);
-		tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
-				  tp->t_port,
-#endif
-				  ip, th);
+		}
+		tcpip_fillheaders(inp, tp->t_port, ip, th);
 	}
 	/*
 	 * Fill in fields, remembering maximum advertised window for use in
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index dff7767cd9cf..6bdeb3984aee 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -126,6 +126,8 @@ __FBSDID("$FreeBSD$");
 #ifdef TCP_OFFLOAD
 #include <netinet/tcp_offload.h>
 #endif
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
 
 #include <netipsec/ipsec_support.h>
 
@@ -501,6 +503,80 @@ tcp_switch_back_to_default(struct tcpcb *tp)
 	}
 }
 
+static void
+tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
+    const struct sockaddr *sa, void *ctx)
+{
+	struct ip *iph;
+#ifdef INET6
+	struct ip6_hdr *ip6;
+#endif
+	struct udphdr *uh;
+	struct tcphdr *th;
+	int thlen;
+	uint16_t port;
+
+	TCPSTAT_INC(tcps_tunneled_pkts);
+	if ((m->m_flags & M_PKTHDR) == 0) {
+		/* Can't handle one that is not a pkt hdr */
+		TCPSTAT_INC(tcps_tunneled_errs);
+		goto out;
+	}
+	thlen = sizeof(struct tcphdr);
+	if (m->m_len < off + sizeof(struct udphdr) + thlen &&
+	    (m =  m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) {
+		TCPSTAT_INC(tcps_tunneled_errs);
+		goto out;
+	}
+	iph = mtod(m, struct ip *);
+	uh = (struct udphdr *)((caddr_t)iph + off);
+	th = (struct tcphdr *)(uh + 1);
+	thlen = th->th_off << 2;
+	if (m->m_len < off + sizeof(struct udphdr) + thlen) {
+		m =  m_pullup(m, off + sizeof(struct udphdr) + thlen);
+		if (m == NULL) {
+			TCPSTAT_INC(tcps_tunneled_errs);
+			goto out;
+		} else {
+			iph = mtod(m, struct ip *);
+			uh = (struct udphdr *)((caddr_t)iph + off);
+			th = (struct tcphdr *)(uh + 1);
+		}
+	}
+	m->m_pkthdr.tcp_tun_port = port = uh->uh_sport;
+	bcopy(th, uh, m->m_len - off);
+	m->m_len -= sizeof(struct udphdr);
+	m->m_pkthdr.len -= sizeof(struct udphdr);
+	/*
+	 * We use the same algorithm for
+	 * both UDP and TCP for c-sum. So
+	 * the code in tcp_input will skip
+	 * the checksum. So we do nothing
+	 * with the flag (m->m_pkthdr.csum_flags).
+	 */
+	switch (iph->ip_v) {
+#ifdef INET
+	case IPVERSION:
+		iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
+		tcp_input_with_port(&m, &off, IPPROTO_TCP, port);
+		break;
+#endif
+#ifdef INET6
+	case IPV6_VERSION >> 4:
+		ip6 = mtod(m, struct ip6_hdr *);
+		ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr));
+		tcp6_input_with_port(&m, &off, IPPROTO_TCP, port);
+		break;
+#endif
+	default:
+		goto out;
+		break;
+	}
+	return;
+out:
+	m_freem(m);
+}
+
 static int
 sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
 {
@@ -598,6 +674,183 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
     NULL, 0, sysctl_net_inet_list_available, "A",
     "list available TCP Function sets");
 
+VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT;
+
+#ifdef INET
+VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL;
+#define	V_udp4_tun_socket	VNET(udp4_tun_socket)
+#endif
+#ifdef INET6
+VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL;
+#define	V_udp6_tun_socket	VNET(udp6_tun_socket)
+#endif
+
+static void
+tcp_over_udp_stop(void)
+{
+	/*
+	 * This function assumes sysctl caller holds inp_rinfo_lock()
+	 * for writting!
+	 */
+#ifdef INET
+	if (V_udp4_tun_socket != NULL) {
+		soclose(V_udp4_tun_socket);
+		V_udp4_tun_socket = NULL;
+	}
+#endif
+#ifdef INET6
+	if (V_udp6_tun_socket != NULL) {
+		soclose(V_udp6_tun_socket);
+		V_udp6_tun_socket = NULL;
+	}
+#endif
+}
+
+static int
+tcp_over_udp_start(void)
+{
+	uint16_t port;
+	int ret;
+#ifdef INET
+	struct sockaddr_in sin;
+#endif
+#ifdef INET6
+	struct sockaddr_in6 sin6;
+#endif
+	/*
+	 * This function assumes sysctl caller holds inp_info_rlock()
+	 * for writting!
+	 */
+	port = V_tcp_udp_tunneling_port;
+	if (ntohs(port) == 0) {
+		/* Must have a port set */
+		return (EINVAL);
+	}
+#ifdef INET
+	if (V_udp4_tun_socket != NULL) {
+		/* Already running -- must stop first */
+		return (EALREADY);
+	}
+#endif
+#ifdef INET6
+	if (V_udp6_tun_socket != NULL) {
+		/* Already running -- must stop first */
+		return (EALREADY);
+	}
+#endif
+#ifdef INET
+	if ((ret = socreate(PF_INET, &V_udp4_tun_socket,
+	    SOCK_DGRAM, IPPROTO_UDP,
+	    curthread->td_ucred, curthread))) {
+		tcp_over_udp_stop();
+		return (ret);
+	}
+	/* Call the special UDP hook. */
+	if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket,
+	    tcp_recv_udp_tunneled_packet,
+	    tcp_ctlinput_viaudp,
+	    NULL))) {
+		tcp_over_udp_stop();
+		return (ret);
+	}
+	/* Ok, we have a socket, bind it to the port. */
+	memset(&sin, 0, sizeof(struct sockaddr_in));
+	sin.sin_len = sizeof(struct sockaddr_in);
+	sin.sin_family = AF_INET;
+	sin.sin_port = htons(port);
+	if ((ret = sobind(V_udp4_tun_socket,
+	    (struct sockaddr *)&sin, curthread))) {
+		tcp_over_udp_stop();
+		return (ret);
+	}
+#endif
+#ifdef INET6
+	if ((ret = socreate(PF_INET6, &V_udp6_tun_socket,
+	    SOCK_DGRAM, IPPROTO_UDP,
+	    curthread->td_ucred, curthread))) {
+		tcp_over_udp_stop();
+		return (ret);
+	}
+	/* Call the special UDP hook. */
+	if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket,
+	    tcp_recv_udp_tunneled_packet,
+	    tcp6_ctlinput_viaudp,
+	    NULL))) {
+		tcp_over_udp_stop();
+		return (ret);
+	}
+	/* Ok, we have a socket, bind it to the port. */
+	memset(&sin6, 0, sizeof(struct sockaddr_in6));
+	sin6.sin6_len = sizeof(struct sockaddr_in6);
+	sin6.sin6_family = AF_INET6;
+	sin6.sin6_port = htons(port);
+	if ((ret = sobind(V_udp6_tun_socket,
+	    (struct sockaddr *)&sin6, curthread))) {
+		tcp_over_udp_stop();
+		return (ret);
+	}
+#endif
+	return (0);
+}
+
+static int
+sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS)
+{
+	int error;
+	uint32_t old, new;
+
+	old = V_tcp_udp_tunneling_port;
+	new = old;
+	error = sysctl_handle_int(oidp, &new, 0, req);
+	if ((error == 0) &&
+	    (req->newptr != NULL)) {
+		if ((new < TCP_TUNNELING_PORT_MIN) ||
+		    (new > TCP_TUNNELING_PORT_MAX)) {
+			error = EINVAL;
+		} else {
+			V_tcp_udp_tunneling_port = new;
+			if (old != 0) {
+				tcp_over_udp_stop();
+			}
+			if (new != 0) {
+				error = tcp_over_udp_start();
+			}
+		}
+	}
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    &VNET_NAME(tcp_udp_tunneling_port),
+    0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU",
+    "Tunneling port for tcp over udp");
+
+VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT;
+
+static int
+sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS)
+{
+	int error, new;
+
+	new = V_tcp_udp_tunneling_overhead;
+	error = sysctl_handle_int(oidp, &new, 0, req);
+	if (error == 0 && req->newptr) {
+		if ((new < TCP_TUNNELING_OVERHEAD_MIN) ||
+		    (new > TCP_TUNNELING_OVERHEAD_MAX))
+			error = EINVAL;
+		else
+			V_tcp_udp_tunneling_overhead = new;
+	}
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    &VNET_NAME(tcp_udp_tunneling_overhead),
+    0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU",
+    "MSS reduction when using tcp over udp");
+
 /*
  * Exports one (struct tcp_function_info) for each alias/name.
  */
@@ -1305,7 +1558,7 @@ tcp_fini(void *xtp)
  * of the tcpcb each time to conserve mbufs.
  */
 void
-tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
+tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr)
 {
 	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
 
@@ -1320,7 +1573,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
 			(inp->inp_flow & IPV6_FLOWINFO_MASK);
 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
 			(IPV6_VERSION & IPV6_VERSION_MASK);
-		ip6->ip6_nxt = IPPROTO_TCP;
+		if (port == 0)
+			ip6->ip6_nxt = IPPROTO_TCP;
+		else
+			ip6->ip6_nxt = IPPROTO_UDP;
 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
 		ip6->ip6_src = inp->in6p_laddr;
 		ip6->ip6_dst = inp->in6p_faddr;
@@ -1342,7 +1598,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
 		ip->ip_off = 0;
 		ip->ip_ttl = inp->inp_ip_ttl;
 		ip->ip_sum = 0;
-		ip->ip_p = IPPROTO_TCP;
+		if (port == 0)
+			ip->ip_p = IPPROTO_TCP;
+		else
+			ip->ip_p = IPPROTO_UDP;
 		ip->ip_src = inp->inp_laddr;
 		ip->ip_dst = inp->inp_faddr;
 	}
@@ -1372,7 +1631,7 @@ tcpip_maketemplate(struct inpcb *inp)
 	t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
 	if (t == NULL)
 		return (NULL);
-	tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t);
+	tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t);
 	return (t);
 }
 
@@ -1398,14 +1657,16 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
 	struct inpcb *inp;
 	struct ip *ip;
 	struct mbuf *optm;
+	struct udphdr *uh = NULL;
 	struct tcphdr *nth;
 	u_char *optp;
 #ifdef INET6
 	struct ip6_hdr *ip6;
 	int isipv6;
 #endif /* INET6 */
-	int optlen, tlen, win;
+	int optlen, tlen, win, ulen;
 	bool incl_opts;
+	uint16_t port;
 
 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
 	NET_EPOCH_ASSERT();
@@ -1423,6 +1684,19 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
 	} else
 		inp = NULL;
 
+	if (m != NULL) {
+#ifdef INET6
+		if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP))
+			port = m->m_pkthdr.tcp_tun_port;
+		else
*** 1128 LINES SKIPPED ***


More information about the dev-commits-src-all mailing list