git: fa50e98328b4 - stable/13 - mend

Michael Tuexen tuexen at freebsd.org
Mon Jun 7 12:12:05 UTC 2021


> On 7. Jun 2021, at 11:01, Michael Tuexen <tuexen at freebsd.org> wrote:
> 
> The branch stable/13 has been updated by tuexen:
> 
> URL: https://cgit.FreeBSD.org/src/commit/?id=fa50e98328b48da4fa8dbd97d0a787962cf249f5
> 
> commit fa50e98328b48da4fa8dbd97d0a787962cf249f5
> Author:     Michael Tuexen <tuexen at FreeBSD.org>
> AuthorDate: 2021-04-18 14:08:08 +0000
> Commit:     Michael Tuexen <tuexen at FreeBSD.org>
> CommitDate: 2021-06-07 09:01:28 +0000
> 
>    mend
I am not sure how the commit message ended up as just "mend", but this commit is the MFC of
https://cgit.FreeBSD.org/src/commit/?id=9e644c23000c2f5028b235f6263d17ffb24d3605
with the merge conflicts resolved manually.

Best regards
Michael
> ---
> share/man/man4/tcp.4          |  15 +-
> sys/netinet/tcp.h             |   1 +
> sys/netinet/tcp_input.c       |  48 ++++-
> sys/netinet/tcp_output.c      |  80 ++++++--
> sys/netinet/tcp_stacks/bbr.c  |  38 +---
> sys/netinet/tcp_stacks/rack.c |  26 +--
> sys/netinet/tcp_subr.c        | 462 ++++++++++++++++++++++++++++++++++++++++--
> sys/netinet/tcp_syncache.c    | 127 +++++++++---
> sys/netinet/tcp_syncache.h    |  12 +-
> sys/netinet/tcp_timewait.c    |  84 ++++++--
> sys/netinet/tcp_usrreq.c      |  30 +++
> sys/netinet/tcp_var.h         |  27 ++-
> sys/netinet/toecore.c         |   4 +-
> sys/netinet6/tcp6_var.h       |   2 +
> sys/sys/mbuf.h                |   1 +
> usr.bin/netstat/inet.c        |   4 +
> usr.bin/sockstat/sockstat.1   |   6 +-
> usr.bin/sockstat/sockstat.c   |  13 +-
> 18 files changed, 822 insertions(+), 158 deletions(-)
> 
> diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
> index d01505e58427..b5735a40b320 100644
> --- a/share/man/man4/tcp.4
> +++ b/share/man/man4/tcp.4
> @@ -34,7 +34,7 @@
> .\"     From: @(#)tcp.4	8.1 (Berkeley) 6/5/93
> .\" $FreeBSD$
> .\"
> -.Dd April 8, 2021
> +.Dd April 18, 2021
> .Dt TCP 4
> .Os
> .Sh NAME
> @@ -329,6 +329,9 @@ currently executing.
> This is typically used after a process or thread inherits a listen
> socket from its parent, and sets its CPU affinity to a particular core.
> .El
> +.It Dv TCP_REMOTE_UDP_ENCAPS_PORT
> +Set and get the remote UDP encapsulation port.
> +It can only be set on a closed TCP socket.
> .El
> .Pp
> The option level for the
> @@ -752,6 +755,16 @@ A CSV list of template_spec=percent key-value pairs which controls the per
> template sampling rates when
> .Xr stats 3
> sampling is enabled.
> +.It Va udp_tunneling_port
> +The local UDP encapsulation port.
> +A value of 0 indicates that UDP encapsulation is disabled.
> +The default is 0.
> +.It Va udp_tunneling_overhead
> +The overhead taken into account when using UDP encapsulation.
> +Since MSS clamping by middleboxes will most likely not work, values larger than
> +8 (the size of the UDP header) are also supported.
> +Supported values are between 8 and 1024.
> +The default is 8.
> .El
> .Sh ERRORS
> A socket operation may fail with one of the following errors returned:
> diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
> index 0b71bd4658f8..d2bf1f8431fd 100644
> --- a/sys/netinet/tcp.h
> +++ b/sys/netinet/tcp.h
> @@ -183,6 +183,7 @@ struct tcphdr {
> #define	TCP_RXTLS_MODE	42	/* Receive TLS mode */
> #define	TCP_CONGESTION	64	/* get/set congestion control algorithm */
> #define	TCP_CCALGOOPT	65	/* get/set cc algorithm specific options */
> +#define TCP_REMOTE_UDP_ENCAPS_PORT 71	/* Enable TCP over UDP tunneling via the specified port */
> #define TCP_DELACK  	72	/* socket option for delayed ack */
> #define TCP_FIN_IS_RST 73	/* A fin from the peer is treated has a RST */
> #define TCP_LOG_LIMIT  74	/* Limit to number of records in tcp-log */
> diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
> index 397cbc5084e6..d36f9566ffba 100644
> --- a/sys/netinet/tcp_input.c
> +++ b/sys/netinet/tcp_input.c
> @@ -123,6 +123,7 @@ __FBSDID("$FreeBSD$");
> #ifdef TCP_OFFLOAD
> #include <netinet/tcp_offload.h>
> #endif
> +#include <netinet/udp.h>
> 
> #include <netipsec/ipsec_support.h>
> 
> @@ -573,7 +574,7 @@ cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
>  */
> #ifdef INET6
> int
> -tcp6_input(struct mbuf **mp, int *offp, int proto)
> +tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
> {
> 	struct mbuf *m;
> 	struct in6_ifaddr *ia6;
> @@ -603,12 +604,19 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
> 	}
> 
> 	*mp = m;
> -	return (tcp_input(mp, offp, proto));
> +	return (tcp_input_with_port(mp, offp, proto, port));
> +}
> +
> +int
> +tcp6_input(struct mbuf **mp, int *offp, int proto)
> +{
> +
> +	return(tcp6_input_with_port(mp, offp, proto, 0));
> }
> #endif /* INET6 */
> 
> int
> -tcp_input(struct mbuf **mp, int *offp, int proto)
> +tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
> {
> 	struct mbuf *m = *mp;
> 	struct tcphdr *th = NULL;
> @@ -664,6 +672,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
> 		ip6 = mtod(m, struct ip6_hdr *);
> 		th = (struct tcphdr *)((caddr_t)ip6 + off0);
> 		tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
> +		if (port)
> +			goto skip6_csum;
> 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
> 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
> 				th->th_sum = m->m_pkthdr.csum_data;
> @@ -677,7 +687,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
> 			TCPSTAT_INC(tcps_rcvbadsum);
> 			goto drop;
> 		}
> -
> +	skip6_csum:
> 		/*
> 		 * Be proactive about unspecified IPv6 address in source.
> 		 * As we use all-zero to indicate unbounded/unconnected pcb,
> @@ -718,6 +728,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
> 		tlen = ntohs(ip->ip_len) - off0;
> 
> 		iptos = ip->ip_tos;
> +		if (port)
> +			goto skip_csum;
> 		if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
> 			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
> 				th->th_sum = m->m_pkthdr.csum_data;
> @@ -747,8 +759,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
> 			ip->ip_v = IPVERSION;
> 			ip->ip_hl = off0 >> 2;
> 		}
> -
> -		if (th->th_sum) {
> +	skip_csum:
> +		if (th->th_sum && (port == 0)) {
> 			TCPSTAT_INC(tcps_rcvbadsum);
> 			goto drop;
> 		}
> @@ -1006,6 +1018,11 @@ findpcb:
> 		goto dropwithreset;
> 	}
> 
> +	if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
> +		rstreason = BANDLIM_RST_CLOSEDPORT;
> +		goto dropwithreset;
> +	}
> +
> #ifdef TCP_OFFLOAD
> 	if (tp->t_flags & TF_TOE) {
> 		tcp_offload_input(tp, m);
> @@ -1077,7 +1094,7 @@ findpcb:
> 			 * NB: syncache_expand() doesn't unlock
> 			 * inp and tcpinfo locks.
> 			 */
> -			rstreason = syncache_expand(&inc, &to, th, &so, m);
> +			rstreason = syncache_expand(&inc, &to, th, &so, m, port);
> 			if (rstreason < 0) {
> 				/*
> 				 * A failing TCP MD5 signature comparison
> @@ -1157,7 +1174,7 @@ tfo_socket_result:
> 		 * causes.
> 		 */
> 		if (thflags & TH_RST) {
> -			syncache_chkrst(&inc, th, m);
> +			syncache_chkrst(&inc, th, m, port);
> 			goto dropunlock;
> 		}
> 		/*
> @@ -1179,7 +1196,7 @@ tfo_socket_result:
> 				log(LOG_DEBUG, "%s; %s: Listen socket: "
> 				    "SYN|ACK invalid, segment rejected\n",
> 				    s, __func__);
> -			syncache_badack(&inc);	/* XXX: Not needed! */
> +			syncache_badack(&inc, port);	/* XXX: Not needed! */
> 			TCPSTAT_INC(tcps_badsyn);
> 			rstreason = BANDLIM_RST_OPENPORT;
> 			goto dropwithreset;
> @@ -1336,7 +1353,8 @@ tfo_socket_result:
> #endif
> 		TCP_PROBE3(debug__input, tp, th, m);
> 		tcp_dooptions(&to, optp, optlen, TO_SYN);
> -		if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL, iptos))
> +		if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL, iptos,
> +		    port))
> 			goto tfo_socket_result;
> 
> 		/*
> @@ -1467,6 +1485,12 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
> 	return (newsize);
> }
> 
> +int
> +tcp_input(struct mbuf **mp, int *offp, int proto)
> +{
> +	return(tcp_input_with_port(mp, offp, proto, 0));
> +}
> +
> void
> tcp_handle_wakeup(struct tcpcb *tp, struct socket *so)
> {
> @@ -3671,11 +3695,13 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
> 			    sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
> 			    sizeof (struct tcpiphdr);
> #else
> -	const size_t min_protoh = sizeof(struct tcpiphdr);
> +	 size_t min_protoh = sizeof(struct tcpiphdr);
> #endif
> 
> 	INP_WLOCK_ASSERT(tp->t_inpcb);
> 
> +	if (tp->t_port)
> +		min_protoh += V_tcp_udp_tunneling_overhead;
> 	if (mtuoffer != -1) {
> 		KASSERT(offer == -1, ("%s: conflict", __func__));
> 		offer = mtuoffer - min_protoh;
> diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
> index e23cdc749e98..5bda2be14df0 100644
> --- a/sys/netinet/tcp_output.c
> +++ b/sys/netinet/tcp_output.c
> @@ -101,6 +101,8 @@ __FBSDID("$FreeBSD$");
> 
> #include <netipsec/ipsec_support.h>
> 
> +#include <netinet/udp.h>
> +#include <netinet/udp_var.h>
> #include <machine/in_cksum.h>
> 
> #include <security/mac/mac_framework.h>
> @@ -207,7 +209,7 @@ tcp_output(struct tcpcb *tp)
> #endif
> 	struct tcphdr *th;
> 	u_char opt[TCP_MAXOLEN];
> -	unsigned ipoptlen, optlen, hdrlen;
> +	unsigned ipoptlen, optlen, hdrlen, ulen;
> #if defined(IPSEC) || defined(IPSEC_SUPPORT)
> 	unsigned ipsec_optlen = 0;
> #endif
> @@ -216,6 +218,7 @@ tcp_output(struct tcpcb *tp)
> 	struct sackhole *p;
> 	int tso, mtu;
> 	struct tcpopt to;
> +	struct udphdr *udp = NULL;
> 	unsigned int wanted_cookie = 0;
> 	unsigned int dont_sendalot = 0;
> #if 0
> @@ -558,6 +561,7 @@ after_sack_rexmit:
> #endif
> 
> 	if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
> +	    (tp->t_port == 0) &&
> 	    ((tp->t_flags & TF_SIGNATURE) == 0) &&
> 	    tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
> 	    ipoptlen == 0 && !(flags & TH_SYN))
> @@ -800,6 +804,8 @@ send:
> 		/* Maximum segment size. */
> 		if (flags & TH_SYN) {
> 			to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
> +			if (tp->t_port)
> +				to.to_mss -= V_tcp_udp_tunneling_overhead;
> 			to.to_flags |= TOF_MSS;
> 
> 			/*
> @@ -887,7 +893,14 @@ send:
> 		    !(to.to_flags & TOF_FASTOPEN))
> 			len = 0;
> 	}
> -
> +	if (tp->t_port) {
> +		if (V_tcp_udp_tunneling_port == 0) {
> +			/* The port was removed?? */
> +			SOCKBUF_UNLOCK(&so->so_snd);
> +			return (EHOSTUNREACH);
> +		}
> +		hdrlen += sizeof(struct udphdr);
> +	}
> 	/*
> 	 * Adjust data length if insertion of options will
> 	 * bump the packet length beyond the t_maxseg length.
> @@ -1140,8 +1153,17 @@ send:
> #ifdef INET6
> 	if (isipv6) {
> 		ip6 = mtod(m, struct ip6_hdr *);
> -		th = (struct tcphdr *)(ip6 + 1);
> -		tcpip_fillheaders(tp->t_inpcb, ip6, th);
> +		if (tp->t_port) {
> +			udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
> +			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> +			udp->uh_dport = tp->t_port;
> +			ulen = hdrlen + len - sizeof(struct ip6_hdr);
> +			udp->uh_ulen = htons(ulen);
> +			th = (struct tcphdr *)(udp + 1);
> +		} else {
> +			th = (struct tcphdr *)(ip6 + 1);
> +		}
> +		tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip6, th);
> 	} else
> #endif /* INET6 */
> 	{
> @@ -1149,8 +1171,16 @@ send:
> #ifdef TCPDEBUG
> 		ipov = (struct ipovly *)ip;
> #endif
> -		th = (struct tcphdr *)(ip + 1);
> -		tcpip_fillheaders(tp->t_inpcb, ip, th);
> +		if (tp->t_port) {
> +			udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
> +			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> +			udp->uh_dport = tp->t_port;
> +			ulen = hdrlen + len - sizeof(struct ip);
> +			udp->uh_ulen = htons(ulen);
> +			th = (struct tcphdr *)(udp + 1);
> +		} else
> +			th = (struct tcphdr *)(ip + 1);
> +		tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip, th);
> 	}
> 
> 	/*
> @@ -1309,7 +1339,6 @@ send:
> 	 * checksum extended header and data.
> 	 */
> 	m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
> -	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
> 
> #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
> 	if (to.to_flags & TOF_SIGNATURE) {
> @@ -1336,9 +1365,19 @@ send:
> 		 * There is no need to fill in ip6_plen right now.
> 		 * It will be filled later by ip6_output.
> 		 */
> -		m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
> -		th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
> -		    optlen + len, IPPROTO_TCP, 0);
> +		if (tp->t_port) {
> +			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
> +			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
> +			udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
> +			th->th_sum = htons(0);
> +			UDPSTAT_INC(udps_opackets);
> +		} else {
> +			m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
> +			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
> +			th->th_sum = in6_cksum_pseudo(ip6,
> +			    sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
> +			    0);
> +		}
> 	}
> #endif
> #if defined(INET6) && defined(INET)
> @@ -1346,9 +1385,20 @@ send:
> #endif
> #ifdef INET
> 	{
> -		m->m_pkthdr.csum_flags = CSUM_TCP;
> -		th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
> -		    htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
> +		if (tp->t_port) {
> +			m->m_pkthdr.csum_flags = CSUM_UDP;
> +			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
> +			udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
> +			   ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
> +			th->th_sum = htons(0);
> +			UDPSTAT_INC(udps_opackets);
> +		} else {
> +			m->m_pkthdr.csum_flags = CSUM_TCP;
> +			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
> +			th->th_sum = in_pseudo(ip->ip_src.s_addr,
> +			    ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
> +			    IPPROTO_TCP + len + optlen));
> +		}
> 
> 		/* IP version must be set here for ipv4/ipv6 checking later */
> 		KASSERT(ip->ip_v == IPVERSION,
> @@ -1473,8 +1523,10 @@ send:
> 	 * NB: Don't set DF on small MTU/MSS to have a safe fallback.
> 	 */
> 	if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
> -		ip->ip_off |= htons(IP_DF);
> 		tp->t_flags2 |= TF2_PLPMTU_PMTUD;
> +		if (tp->t_port == 0 || len < V_tcp_minmss) {
> +			ip->ip_off |= htons(IP_DF);
> +		}
> 	} else {
> 		tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
> 	}
> diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
> index cc20d6bf52ca..1ee8d26446fd 100644
> --- a/sys/netinet/tcp_stacks/bbr.c
> +++ b/sys/netinet/tcp_stacks/bbr.c
> @@ -11969,14 +11969,10 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
> #endif
> 	struct tcp_bbr *bbr;
> 	struct tcphdr *th;
> -#ifdef NETFLIX_TCPOUDP
> 	struct udphdr *udp = NULL;
> -#endif
> 	u_char opt[TCP_MAXOLEN];
> 	unsigned ipoptlen, optlen, hdrlen;
> -#ifdef NETFLIX_TCPOUDP
> 	unsigned ulen;
> -#endif
> 	uint32_t bbr_seq;
> 	uint32_t delay_calc=0;
> 	uint8_t doing_tlp = 0;
> @@ -12991,10 +12987,8 @@ send:
> 		/* Maximum segment size. */
> 		if (flags & TH_SYN) {
> 			to.to_mss = tcp_mssopt(&inp->inp_inc);
> -#ifdef NETFLIX_TCPOUDP
> 			if (tp->t_port)
> 				to.to_mss -= V_tcp_udp_tunneling_overhead;
> -#endif
> 			to.to_flags |= TOF_MSS;
> 			/*
> 			 * On SYN or SYN|ACK transmits on TFO connections,
> @@ -13063,7 +13057,6 @@ send:
> 		    !(to.to_flags & TOF_FASTOPEN))
> 			len = 0;
> 	}
> -#ifdef NETFLIX_TCPOUDP
> 	if (tp->t_port) {
> 		if (V_tcp_udp_tunneling_port == 0) {
> 			/* The port was removed?? */
> @@ -13072,7 +13065,6 @@ send:
> 		}
> 		hdrlen += sizeof(struct udphdr);
> 	}
> -#endif
> #ifdef INET6
> 	if (isipv6)
> 		ipoptlen = ip6_optlen(tp->t_inpcb);
> @@ -13408,7 +13400,6 @@ send:
> #ifdef INET6
> 	if (isipv6) {
> 		ip6 = mtod(m, struct ip6_hdr *);
> -#ifdef NETFLIX_TCPOUDP
> 		if (tp->t_port) {
> 			udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
> 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> @@ -13417,17 +13408,9 @@ send:
> 			udp->uh_ulen = htons(ulen);
> 			th = (struct tcphdr *)(udp + 1);
> 		} else {
> -#endif
> 			th = (struct tcphdr *)(ip6 + 1);
> -
> -#ifdef NETFLIX_TCPOUDP
> 		}
> -#endif
> -		tcpip_fillheaders(inp,
> -#ifdef NETFLIX_TCPOUDP
> -				  tp->t_port,
> -#endif
> -				  ip6, th);
> +		tcpip_fillheaders(inp, tp->t_port, ip6, th);
> 	} else
> #endif				/* INET6 */
> 	{
> @@ -13435,7 +13418,6 @@ send:
> #ifdef TCPDEBUG
> 		ipov = (struct ipovly *)ip;
> #endif
> -#ifdef NETFLIX_TCPOUDP
> 		if (tp->t_port) {
> 			udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
> 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> @@ -13443,14 +13425,10 @@ send:
> 			ulen = hdrlen + len - sizeof(struct ip);
> 			udp->uh_ulen = htons(ulen);
> 			th = (struct tcphdr *)(udp + 1);
> -		} else
> -#endif
> +		} else {
> 			th = (struct tcphdr *)(ip + 1);
> -		tcpip_fillheaders(inp,
> -#ifdef NETFLIX_TCPOUDP
> -				  tp->t_port,
> -#endif
> -				  ip, th);
> +		}
> +		tcpip_fillheaders(inp, tp->t_port, ip, th);
> 	}
> 	/*
> 	 * If we are doing retransmissions, then snd_nxt will not reflect
> @@ -13600,7 +13578,6 @@ send:
> 		 * ip6_plen is not need to be filled now, and will be filled
> 		 * in ip6_output.
> 		 */
> -#ifdef NETFLIX_TCPOUDP
> 		if (tp->t_port) {
> 			m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
> 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
> @@ -13608,14 +13585,11 @@ send:
> 			th->th_sum = htons(0);
> 			UDPSTAT_INC(udps_opackets);
> 		} else {
> -#endif
> 			csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
> 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
> 			th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
> 			    optlen + len, IPPROTO_TCP, 0);
> -#ifdef NETFLIX_TCPOUDP
> 		}
> -#endif
> 	}
> #endif
> #if defined(INET6) && defined(INET)
> @@ -13623,7 +13597,6 @@ send:
> #endif
> #ifdef INET
> 	{
> -#ifdef NETFLIX_TCPOUDP
> 		if (tp->t_port) {
> 			m->m_pkthdr.csum_flags = CSUM_UDP;
> 			m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
> @@ -13632,15 +13605,12 @@ send:
> 			th->th_sum = htons(0);
> 			UDPSTAT_INC(udps_opackets);
> 		} else {
> -#endif
> 			csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP;
> 			m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
> 			th->th_sum = in_pseudo(ip->ip_src.s_addr,
> 			    ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
> 			    IPPROTO_TCP + len + optlen));
> -#ifdef NETFLIX_TCPOUDP
> 		}
> -#endif
> 		/* IP version must be set here for ipv4/ipv6 checking later */
> 		KASSERT(ip->ip_v == IPVERSION,
> 		    ("%s: IP version incorrect: %d", __func__, ip->ip_v));
> diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
> index 0ee73a95a6d7..12827d1699d0 100644
> --- a/sys/netinet/tcp_stacks/rack.c
> +++ b/sys/netinet/tcp_stacks/rack.c
> @@ -13008,10 +13008,8 @@ send:
> 		if (flags & TH_SYN) {
> 			tp->snd_nxt = tp->iss;
> 			to.to_mss = tcp_mssopt(&inp->inp_inc);
> -#ifdef NETFLIX_TCPOUDP
> 			if (tp->t_port)
> 				to.to_mss -= V_tcp_udp_tunneling_overhead;
> -#endif
> 			to.to_flags |= TOF_MSS;
> 
> 			/*
> @@ -13088,7 +13086,6 @@ send:
> 		    !(to.to_flags & TOF_FASTOPEN))
> 			len = 0;
> 	}
> -#ifdef NETFLIX_TCPOUDP
> 	if (tp->t_port) {
> 		if (V_tcp_udp_tunneling_port == 0) {
> 			/* The port was removed?? */
> @@ -13097,7 +13094,6 @@ send:
> 		}
> 		hdrlen += sizeof(struct udphdr);
> 	}
> -#endif
> #ifdef INET6
> 	if (isipv6)
> 		ipoptlen = ip6_optlen(tp->t_inpcb);
> @@ -13372,7 +13368,6 @@ send:
> #ifdef INET6
> 	if (isipv6) {
> 		ip6 = mtod(m, struct ip6_hdr *);
> -#ifdef NETFLIX_TCPOUDP
> 		if (tp->t_port) {
> 			udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
> 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> @@ -13380,14 +13375,10 @@ send:
> 			ulen = hdrlen + len - sizeof(struct ip6_hdr);
> 			udp->uh_ulen = htons(ulen);
> 			th = (struct tcphdr *)(udp + 1);
> -		} else
> -#endif
> +		} else {
> 			th = (struct tcphdr *)(ip6 + 1);
> -		tcpip_fillheaders(inp,
> -#ifdef NETFLIX_TCPOUDP
> -				  tp->t_port,
> -#endif
> -				  ip6, th);
> +		}
> +		tcpip_fillheaders(inp, tp->t_port, ip6, th);
> 	} else
> #endif				/* INET6 */
> 	{
> @@ -13395,7 +13386,6 @@ send:
> #ifdef TCPDEBUG
> 		ipov = (struct ipovly *)ip;
> #endif
> -#ifdef NETFLIX_TCPOUDP
> 		if (tp->t_port) {
> 			udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
> 			udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> @@ -13403,14 +13393,10 @@ send:
> 			ulen = hdrlen + len - sizeof(struct ip);
> 			udp->uh_ulen = htons(ulen);
> 			th = (struct tcphdr *)(udp + 1);
> -		} else
> -#endif
> +		} else {
> 			th = (struct tcphdr *)(ip + 1);
> -		tcpip_fillheaders(inp,
> -#ifdef NETFLIX_TCPOUDP
> -				  tp->t_port,
> -#endif
> -				  ip, th);
> +		}
> +		tcpip_fillheaders(inp, tp->t_port, ip, th);
> 	}
> 	/*
> 	 * Fill in fields, remembering maximum advertised window for use in
> diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
> index dff7767cd9cf..6bdeb3984aee 100644
> --- a/sys/netinet/tcp_subr.c
> +++ b/sys/netinet/tcp_subr.c
> @@ -126,6 +126,8 @@ __FBSDID("$FreeBSD$");
> #ifdef TCP_OFFLOAD
> #include <netinet/tcp_offload.h>
> #endif
> +#include <netinet/udp.h>
> +#include <netinet/udp_var.h>
> 
> #include <netipsec/ipsec_support.h>
> 
> @@ -501,6 +503,80 @@ tcp_switch_back_to_default(struct tcpcb *tp)
> 	}
> }
> 
> +static void
> +tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
> +    const struct sockaddr *sa, void *ctx)
> +{
> +	struct ip *iph;
> +#ifdef INET6
> +	struct ip6_hdr *ip6;
> +#endif
> +	struct udphdr *uh;
> +	struct tcphdr *th;
> +	int thlen;
> +	uint16_t port;
> +
> +	TCPSTAT_INC(tcps_tunneled_pkts);
> +	if ((m->m_flags & M_PKTHDR) == 0) {
> +		/* Can't handle one that is not a pkt hdr */
> +		TCPSTAT_INC(tcps_tunneled_errs);
> +		goto out;
> +	}
> +	thlen = sizeof(struct tcphdr);
> +	if (m->m_len < off + sizeof(struct udphdr) + thlen &&
> +	    (m =  m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) {
> +		TCPSTAT_INC(tcps_tunneled_errs);
> +		goto out;
> +	}
> +	iph = mtod(m, struct ip *);
> +	uh = (struct udphdr *)((caddr_t)iph + off);
> +	th = (struct tcphdr *)(uh + 1);
> +	thlen = th->th_off << 2;
> +	if (m->m_len < off + sizeof(struct udphdr) + thlen) {
> +		m =  m_pullup(m, off + sizeof(struct udphdr) + thlen);
> +		if (m == NULL) {
> +			TCPSTAT_INC(tcps_tunneled_errs);
> +			goto out;
> +		} else {
> +			iph = mtod(m, struct ip *);
> +			uh = (struct udphdr *)((caddr_t)iph + off);
> +			th = (struct tcphdr *)(uh + 1);
> +		}
> +	}
> +	m->m_pkthdr.tcp_tun_port = port = uh->uh_sport;
> +	bcopy(th, uh, m->m_len - off);
> +	m->m_len -= sizeof(struct udphdr);
> +	m->m_pkthdr.len -= sizeof(struct udphdr);
> +	/*
> +	 * We use the same algorithm for
> +	 * both UDP and TCP for c-sum. So
> +	 * the code in tcp_input will skip
> +	 * the checksum. So we do nothing
> +	 * with the flag (m->m_pkthdr.csum_flags).
> +	 */
> +	switch (iph->ip_v) {
> +#ifdef INET
> +	case IPVERSION:
> +		iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
> +		tcp_input_with_port(&m, &off, IPPROTO_TCP, port);
> +		break;
> +#endif
> +#ifdef INET6
> +	case IPV6_VERSION >> 4:
> +		ip6 = mtod(m, struct ip6_hdr *);
> +		ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr));
> +		tcp6_input_with_port(&m, &off, IPPROTO_TCP, port);
> +		break;
> +#endif
> +	default:
> +		goto out;
> +		break;
> +	}
> +	return;
> +out:
> +	m_freem(m);
> +}
> +
> static int
> sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
> {
> @@ -598,6 +674,183 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
>     NULL, 0, sysctl_net_inet_list_available, "A",
>     "list available TCP Function sets");
> 
> +VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT;
> +
> +#ifdef INET
> +VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL;
> +#define	V_udp4_tun_socket	VNET(udp4_tun_socket)
> +#endif
> +#ifdef INET6
> +VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL;
> +#define	V_udp6_tun_socket	VNET(udp6_tun_socket)
> +#endif
> +
> +static void
> +tcp_over_udp_stop(void)
> +{
> +	/*
> +	 * This function assumes sysctl caller holds inp_rinfo_lock()
> +	 * for writting!
> +	 */
> +#ifdef INET
> +	if (V_udp4_tun_socket != NULL) {
> +		soclose(V_udp4_tun_socket);
> +		V_udp4_tun_socket = NULL;
> +	}
> +#endif
> +#ifdef INET6
> +	if (V_udp6_tun_socket != NULL) {
> +		soclose(V_udp6_tun_socket);
> +		V_udp6_tun_socket = NULL;
> +	}
> +#endif
> +}
> +
> +static int
> +tcp_over_udp_start(void)
> +{
> +	uint16_t port;
> +	int ret;
> +#ifdef INET
> +	struct sockaddr_in sin;
> +#endif
> +#ifdef INET6
> +	struct sockaddr_in6 sin6;
> +#endif
> +	/*
> +	 * This function assumes sysctl caller holds inp_info_rlock()
> +	 * for writting!
> +	 */
> +	port = V_tcp_udp_tunneling_port;
> +	if (ntohs(port) == 0) {
> +		/* Must have a port set */
> +		return (EINVAL);
> +	}
> +#ifdef INET
> +	if (V_udp4_tun_socket != NULL) {
> +		/* Already running -- must stop first */
> +		return (EALREADY);
> +	}
> +#endif
> +#ifdef INET6
> +	if (V_udp6_tun_socket != NULL) {
> +		/* Already running -- must stop first */
> +		return (EALREADY);
> +	}
> +#endif
> +#ifdef INET
> +	if ((ret = socreate(PF_INET, &V_udp4_tun_socket,
> +	    SOCK_DGRAM, IPPROTO_UDP,
> +	    curthread->td_ucred, curthread))) {
> +		tcp_over_udp_stop();
> +		return (ret);
> +	}
> +	/* Call the special UDP hook. */
> +	if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket,
> +	    tcp_recv_udp_tunneled_packet,
> +	    tcp_ctlinput_viaudp,
> +	    NULL))) {
> +		tcp_over_udp_stop();
> +		return (ret);
> +	}
> +	/* Ok, we have a socket, bind it to the port. */
> +	memset(&sin, 0, sizeof(struct sockaddr_in));
> +	sin.sin_len = sizeof(struct sockaddr_in);
> +	sin.sin_family = AF_INET;
> +	sin.sin_port = htons(port);
> +	if ((ret = sobind(V_udp4_tun_socket,
> +	    (struct sockaddr *)&sin, curthread))) {
> +		tcp_over_udp_stop();
> +		return (ret);
> +	}
> +#endif
> +#ifdef INET6
> +	if ((ret = socreate(PF_INET6, &V_udp6_tun_socket,
> +	    SOCK_DGRAM, IPPROTO_UDP,
> +	    curthread->td_ucred, curthread))) {
> +		tcp_over_udp_stop();
> +		return (ret);
> +	}
> +	/* Call the special UDP hook. */
> +	if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket,
> +	    tcp_recv_udp_tunneled_packet,
> +	    tcp6_ctlinput_viaudp,
> +	    NULL))) {
> +		tcp_over_udp_stop();
> +		return (ret);
> +	}
> +	/* Ok, we have a socket, bind it to the port. */
> +	memset(&sin6, 0, sizeof(struct sockaddr_in6));
> +	sin6.sin6_len = sizeof(struct sockaddr_in6);
> +	sin6.sin6_family = AF_INET6;
> +	sin6.sin6_port = htons(port);
> +	if ((ret = sobind(V_udp6_tun_socket,
> +	    (struct sockaddr *)&sin6, curthread))) {
> +		tcp_over_udp_stop();
> +		return (ret);
> +	}
> +#endif
> +	return (0);
> +}
> +
> +static int
> +sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS)
> +{
> +	int error;
> +	uint32_t old, new;
> +
> +	old = V_tcp_udp_tunneling_port;
> +	new = old;
> +	error = sysctl_handle_int(oidp, &new, 0, req);
> +	if ((error == 0) &&
> +	    (req->newptr != NULL)) {
> +		if ((new < TCP_TUNNELING_PORT_MIN) ||
> +		    (new > TCP_TUNNELING_PORT_MAX)) {
> +			error = EINVAL;
> +		} else {
> +			V_tcp_udp_tunneling_port = new;
> +			if (old != 0) {
> +				tcp_over_udp_stop();
> +			}
> +			if (new != 0) {
> +				error = tcp_over_udp_start();
> +			}
> +		}
> +	}
> +	return (error);
> +}
> +
> +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port,
> +    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
> +    &VNET_NAME(tcp_udp_tunneling_port),
> +    0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU",
> +    "Tunneling port for tcp over udp");
> +
> +VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT;
> +
> +static int
> +sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS)
> +{
> +	int error, new;
> +
> +	new = V_tcp_udp_tunneling_overhead;
> +	error = sysctl_handle_int(oidp, &new, 0, req);
> +	if (error == 0 && req->newptr) {
> +		if ((new < TCP_TUNNELING_OVERHEAD_MIN) ||
> +		    (new > TCP_TUNNELING_OVERHEAD_MAX))
> +			error = EINVAL;
> +		else
> +			V_tcp_udp_tunneling_overhead = new;
> +	}
> +	return (error);
> +}
> +
> +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead,
> +    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
> +    &VNET_NAME(tcp_udp_tunneling_overhead),
> +    0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU",
> +    "MSS reduction when using tcp over udp");
> +
> /*
>  * Exports one (struct tcp_function_info) for each alias/name.
>  */
> @@ -1305,7 +1558,7 @@ tcp_fini(void *xtp)
>  * of the tcpcb each time to conserve mbufs.
>  */
> void
> -tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
> +tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr)
> {
> 	struct tcphdr *th = (struct tcphdr *)tcp_ptr;
> 
> @@ -1320,7 +1573,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
> 			(inp->inp_flow & IPV6_FLOWINFO_MASK);
> 		ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
> 			(IPV6_VERSION & IPV6_VERSION_MASK);
> -		ip6->ip6_nxt = IPPROTO_TCP;
> +		if (port == 0)
> +			ip6->ip6_nxt = IPPROTO_TCP;
> +		else
> +			ip6->ip6_nxt = IPPROTO_UDP;
> 		ip6->ip6_plen = htons(sizeof(struct tcphdr));
> 		ip6->ip6_src = inp->in6p_laddr;
> 		ip6->ip6_dst = inp->in6p_faddr;
> @@ -1342,7 +1598,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
> 		ip->ip_off = 0;
> 		ip->ip_ttl = inp->inp_ip_ttl;
> 		ip->ip_sum = 0;
> -		ip->ip_p = IPPROTO_TCP;
> +		if (port == 0)
> +			ip->ip_p = IPPROTO_TCP;
> +		else
> +			ip->ip_p = IPPROTO_UDP;
> 		ip->ip_src = inp->inp_laddr;
> 		ip->ip_dst = inp->inp_faddr;
> 	}
> @@ -1372,7 +1631,7 @@ tcpip_maketemplate(struct inpcb *inp)
> 	t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
> 	if (t == NULL)
> 		return (NULL);
> -	tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t);
> +	tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t);
> 	return (t);
> }
> 
> @@ -1398,14 +1657,16 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
> 	struct inpcb *inp;
> 	struct ip *ip;
> 	struct mbuf *optm;
> +	struct udphdr *uh = NULL;
> 	struct tcphdr *nth;
> 	u_char *optp;
> #ifdef INET6
> 	struct ip6_hdr *ip6;
> 	int isipv6;
> #endif /* INET6 */
> -	int optlen, tlen, win;
> +	int optlen, tlen, win, ulen;
> 	bool incl_opts;
> +	uint16_t port;
> 
> 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
> 	NET_EPOCH_ASSERT();
> @@ -1423,6 +1684,19 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
> 	} else
> 		inp = NULL;
> 
> +	if (m != NULL) {
> +#ifdef INET6
> +		if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP))
> +			port = m->m_pkthdr.tcp_tun_port;
> +		else
> *** 1128 LINES SKIPPED ***



More information about the dev-commits-src-all mailing list