git: fa50e98328b4 - stable/13 - mend
Michael Tuexen
tuexen at freebsd.org
Mon Jun 7 12:12:05 UTC 2021
> On 7. Jun 2021, at 11:01, Michael Tuexen <tuexen at freebsd.org> wrote:
>
> The branch stable/13 has been updated by tuexen:
>
> URL: https://cgit.FreeBSD.org/src/commit/?id=fa50e98328b48da4fa8dbd97d0a787962cf249f5
>
> commit fa50e98328b48da4fa8dbd97d0a787962cf249f5
> Author: Michael Tuexen <tuexen at FreeBSD.org>
> AuthorDate: 2021-04-18 14:08:08 +0000
> Commit: Michael Tuexen <tuexen at FreeBSD.org>
> CommitDate: 2021-06-07 09:01:28 +0000
>
> mend
I am not sure how the commit ended up with this commit message, but it is an MFC of
https://cgit.FreeBSD.org/src/commit/?id=9e644c23000c2f5028b235f6263d17ffb24d3605
with the merge conflicts resolved manually.
Best regards
Michael
> ---
> share/man/man4/tcp.4 | 15 +-
> sys/netinet/tcp.h | 1 +
> sys/netinet/tcp_input.c | 48 ++++-
> sys/netinet/tcp_output.c | 80 ++++++--
> sys/netinet/tcp_stacks/bbr.c | 38 +---
> sys/netinet/tcp_stacks/rack.c | 26 +--
> sys/netinet/tcp_subr.c | 462 ++++++++++++++++++++++++++++++++++++++++--
> sys/netinet/tcp_syncache.c | 127 +++++++++---
> sys/netinet/tcp_syncache.h | 12 +-
> sys/netinet/tcp_timewait.c | 84 ++++++--
> sys/netinet/tcp_usrreq.c | 30 +++
> sys/netinet/tcp_var.h | 27 ++-
> sys/netinet/toecore.c | 4 +-
> sys/netinet6/tcp6_var.h | 2 +
> sys/sys/mbuf.h | 1 +
> usr.bin/netstat/inet.c | 4 +
> usr.bin/sockstat/sockstat.1 | 6 +-
> usr.bin/sockstat/sockstat.c | 13 +-
> 18 files changed, 822 insertions(+), 158 deletions(-)
>
> diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
> index d01505e58427..b5735a40b320 100644
> --- a/share/man/man4/tcp.4
> +++ b/share/man/man4/tcp.4
> @@ -34,7 +34,7 @@
> .\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
> .\" $FreeBSD$
> .\"
> -.Dd April 8, 2021
> +.Dd April 18, 2021
> .Dt TCP 4
> .Os
> .Sh NAME
> @@ -329,6 +329,9 @@ currently executing.
> This is typically used after a process or thread inherits a listen
> socket from its parent, and sets its CPU affinity to a particular core.
> .El
> +.It Dv TCP_REMOTE_UDP_ENCAPS_PORT
> +Set and get the remote UDP encapsulation port.
> +It can only be set on a closed TCP socket.
> .El
> .Pp
> The option level for the
> @@ -752,6 +755,16 @@ A CSV list of template_spec=percent key-value pairs which controls the per
> template sampling rates when
> .Xr stats 3
> sampling is enabled.
> +.It Va udp_tunneling_port
> +The local UDP encapsulation port.
> +A value of 0 indicates that UDP encapsulation is disabled.
> +The default is 0.
> +.It Va udp_tunneling_overhead
> +The overhead taken into account when using UDP encapsulation.
> +Since MSS clamping by middleboxes will most likely not work, values larger than
> +8 (the size of the UDP header) are also supported.
> +Supported values are between 8 and 1024.
> +The default is 8.
> .El
> .Sh ERRORS
> A socket operation may fail with one of the following errors returned:
> diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
> index 0b71bd4658f8..d2bf1f8431fd 100644
> --- a/sys/netinet/tcp.h
> +++ b/sys/netinet/tcp.h
> @@ -183,6 +183,7 @@ struct tcphdr {
> #define TCP_RXTLS_MODE 42 /* Receive TLS mode */
> #define TCP_CONGESTION 64 /* get/set congestion control algorithm */
> #define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
> +#define TCP_REMOTE_UDP_ENCAPS_PORT 71 /* Enable TCP over UDP tunneling via the specified port */
> #define TCP_DELACK 72 /* socket option for delayed ack */
> #define TCP_FIN_IS_RST 73 /* A fin from the peer is treated has a RST */
> #define TCP_LOG_LIMIT 74 /* Limit to number of records in tcp-log */
> diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
> index 397cbc5084e6..d36f9566ffba 100644
> --- a/sys/netinet/tcp_input.c
> +++ b/sys/netinet/tcp_input.c
> @@ -123,6 +123,7 @@ __FBSDID("$FreeBSD$");
> #ifdef TCP_OFFLOAD
> #include <netinet/tcp_offload.h>
> #endif
> +#include <netinet/udp.h>
>
> #include <netipsec/ipsec_support.h>
>
> @@ -573,7 +574,7 @@ cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
> */
> #ifdef INET6
> int
> -tcp6_input(struct mbuf **mp, int *offp, int proto)
> +tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
> {
> struct mbuf *m;
> struct in6_ifaddr *ia6;
> @@ -603,12 +604,19 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
> }
>
> *mp = m;
> - return (tcp_input(mp, offp, proto));
> + return (tcp_input_with_port(mp, offp, proto, port));
> +}
> +
> +int
> +tcp6_input(struct mbuf **mp, int *offp, int proto)
> +{
> +
> + return(tcp6_input_with_port(mp, offp, proto, 0));
> }
> #endif /* INET6 */
>
> int
> -tcp_input(struct mbuf **mp, int *offp, int proto)
> +tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
> {
> struct mbuf *m = *mp;
> struct tcphdr *th = NULL;
> @@ -664,6 +672,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
> ip6 = mtod(m, struct ip6_hdr *);
> th = (struct tcphdr *)((caddr_t)ip6 + off0);
> tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
> + if (port)
> + goto skip6_csum;
> if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
> if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
> th->th_sum = m->m_pkthdr.csum_data;
> @@ -677,7 +687,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
> TCPSTAT_INC(tcps_rcvbadsum);
> goto drop;
> }
> -
> + skip6_csum:
> /*
> * Be proactive about unspecified IPv6 address in source.
> * As we use all-zero to indicate unbounded/unconnected pcb,
> @@ -718,6 +728,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
> tlen = ntohs(ip->ip_len) - off0;
>
> iptos = ip->ip_tos;
> + if (port)
> + goto skip_csum;
> if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
> if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
> th->th_sum = m->m_pkthdr.csum_data;
> @@ -747,8 +759,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
> ip->ip_v = IPVERSION;
> ip->ip_hl = off0 >> 2;
> }
> -
> - if (th->th_sum) {
> + skip_csum:
> + if (th->th_sum && (port == 0)) {
> TCPSTAT_INC(tcps_rcvbadsum);
> goto drop;
> }
> @@ -1006,6 +1018,11 @@ findpcb:
> goto dropwithreset;
> }
>
> + if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
> + rstreason = BANDLIM_RST_CLOSEDPORT;
> + goto dropwithreset;
> + }
> +
> #ifdef TCP_OFFLOAD
> if (tp->t_flags & TF_TOE) {
> tcp_offload_input(tp, m);
> @@ -1077,7 +1094,7 @@ findpcb:
> * NB: syncache_expand() doesn't unlock
> * inp and tcpinfo locks.
> */
> - rstreason = syncache_expand(&inc, &to, th, &so, m);
> + rstreason = syncache_expand(&inc, &to, th, &so, m, port);
> if (rstreason < 0) {
> /*
> * A failing TCP MD5 signature comparison
> @@ -1157,7 +1174,7 @@ tfo_socket_result:
> * causes.
> */
> if (thflags & TH_RST) {
> - syncache_chkrst(&inc, th, m);
> + syncache_chkrst(&inc, th, m, port);
> goto dropunlock;
> }
> /*
> @@ -1179,7 +1196,7 @@ tfo_socket_result:
> log(LOG_DEBUG, "%s; %s: Listen socket: "
> "SYN|ACK invalid, segment rejected\n",
> s, __func__);
> - syncache_badack(&inc); /* XXX: Not needed! */
> + syncache_badack(&inc, port); /* XXX: Not needed! */
> TCPSTAT_INC(tcps_badsyn);
> rstreason = BANDLIM_RST_OPENPORT;
> goto dropwithreset;
> @@ -1336,7 +1353,8 @@ tfo_socket_result:
> #endif
> TCP_PROBE3(debug__input, tp, th, m);
> tcp_dooptions(&to, optp, optlen, TO_SYN);
> - if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL, iptos))
> + if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL, iptos,
> + port))
> goto tfo_socket_result;
>
> /*
> @@ -1467,6 +1485,12 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
> return (newsize);
> }
>
> +int
> +tcp_input(struct mbuf **mp, int *offp, int proto)
> +{
> + return(tcp_input_with_port(mp, offp, proto, 0));
> +}
> +
> void
> tcp_handle_wakeup(struct tcpcb *tp, struct socket *so)
> {
> @@ -3671,11 +3695,13 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
> sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
> sizeof (struct tcpiphdr);
> #else
> - const size_t min_protoh = sizeof(struct tcpiphdr);
> + size_t min_protoh = sizeof(struct tcpiphdr);
> #endif
>
> INP_WLOCK_ASSERT(tp->t_inpcb);
>
> + if (tp->t_port)
> + min_protoh += V_tcp_udp_tunneling_overhead;
> if (mtuoffer != -1) {
> KASSERT(offer == -1, ("%s: conflict", __func__));
> offer = mtuoffer - min_protoh;
> diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
> index e23cdc749e98..5bda2be14df0 100644
> --- a/sys/netinet/tcp_output.c
> +++ b/sys/netinet/tcp_output.c
> @@ -101,6 +101,8 @@ __FBSDID("$FreeBSD$");
>
> #include <netipsec/ipsec_support.h>
>
> +#include <netinet/udp.h>
> +#include <netinet/udp_var.h>
> #include <machine/in_cksum.h>
>
> #include <security/mac/mac_framework.h>
> @@ -207,7 +209,7 @@ tcp_output(struct tcpcb *tp)
> #endif
> struct tcphdr *th;
> u_char opt[TCP_MAXOLEN];
> - unsigned ipoptlen, optlen, hdrlen;
> + unsigned ipoptlen, optlen, hdrlen, ulen;
> #if defined(IPSEC) || defined(IPSEC_SUPPORT)
> unsigned ipsec_optlen = 0;
> #endif
> @@ -216,6 +218,7 @@ tcp_output(struct tcpcb *tp)
> struct sackhole *p;
> int tso, mtu;
> struct tcpopt to;
> + struct udphdr *udp = NULL;
> unsigned int wanted_cookie = 0;
> unsigned int dont_sendalot = 0;
> #if 0
> @@ -558,6 +561,7 @@ after_sack_rexmit:
> #endif
>
> if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
> + (tp->t_port == 0) &&
> ((tp->t_flags & TF_SIGNATURE) == 0) &&
> tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
> ipoptlen == 0 && !(flags & TH_SYN))
> @@ -800,6 +804,8 @@ send:
> /* Maximum segment size. */
> if (flags & TH_SYN) {
> to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
> + if (tp->t_port)
> + to.to_mss -= V_tcp_udp_tunneling_overhead;
> to.to_flags |= TOF_MSS;
>
> /*
> @@ -887,7 +893,14 @@ send:
> !(to.to_flags & TOF_FASTOPEN))
> len = 0;
> }
> -
> + if (tp->t_port) {
> + if (V_tcp_udp_tunneling_port == 0) {
> + /* The port was removed?? */
> + SOCKBUF_UNLOCK(&so->so_snd);
> + return (EHOSTUNREACH);
> + }
> + hdrlen += sizeof(struct udphdr);
> + }
> /*
> * Adjust data length if insertion of options will
> * bump the packet length beyond the t_maxseg length.
> @@ -1140,8 +1153,17 @@ send:
> #ifdef INET6
> if (isipv6) {
> ip6 = mtod(m, struct ip6_hdr *);
> - th = (struct tcphdr *)(ip6 + 1);
> - tcpip_fillheaders(tp->t_inpcb, ip6, th);
> + if (tp->t_port) {
> + udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
> + udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> + udp->uh_dport = tp->t_port;
> + ulen = hdrlen + len - sizeof(struct ip6_hdr);
> + udp->uh_ulen = htons(ulen);
> + th = (struct tcphdr *)(udp + 1);
> + } else {
> + th = (struct tcphdr *)(ip6 + 1);
> + }
> + tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip6, th);
> } else
> #endif /* INET6 */
> {
> @@ -1149,8 +1171,16 @@ send:
> #ifdef TCPDEBUG
> ipov = (struct ipovly *)ip;
> #endif
> - th = (struct tcphdr *)(ip + 1);
> - tcpip_fillheaders(tp->t_inpcb, ip, th);
> + if (tp->t_port) {
> + udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
> + udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> + udp->uh_dport = tp->t_port;
> + ulen = hdrlen + len - sizeof(struct ip);
> + udp->uh_ulen = htons(ulen);
> + th = (struct tcphdr *)(udp + 1);
> + } else
> + th = (struct tcphdr *)(ip + 1);
> + tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip, th);
> }
>
> /*
> @@ -1309,7 +1339,6 @@ send:
> * checksum extended header and data.
> */
> m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
> - m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
>
> #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
> if (to.to_flags & TOF_SIGNATURE) {
> @@ -1336,9 +1365,19 @@ send:
> * There is no need to fill in ip6_plen right now.
> * It will be filled later by ip6_output.
> */
> - m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
> - th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
> - optlen + len, IPPROTO_TCP, 0);
> + if (tp->t_port) {
> + m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
> + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
> + udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
> + th->th_sum = htons(0);
> + UDPSTAT_INC(udps_opackets);
> + } else {
> + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
> + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
> + th->th_sum = in6_cksum_pseudo(ip6,
> + sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
> + 0);
> + }
> }
> #endif
> #if defined(INET6) && defined(INET)
> @@ -1346,9 +1385,20 @@ send:
> #endif
> #ifdef INET
> {
> - m->m_pkthdr.csum_flags = CSUM_TCP;
> - th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
> - htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
> + if (tp->t_port) {
> + m->m_pkthdr.csum_flags = CSUM_UDP;
> + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
> + udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
> + ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
> + th->th_sum = htons(0);
> + UDPSTAT_INC(udps_opackets);
> + } else {
> + m->m_pkthdr.csum_flags = CSUM_TCP;
> + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
> + th->th_sum = in_pseudo(ip->ip_src.s_addr,
> + ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
> + IPPROTO_TCP + len + optlen));
> + }
>
> /* IP version must be set here for ipv4/ipv6 checking later */
> KASSERT(ip->ip_v == IPVERSION,
> @@ -1473,8 +1523,10 @@ send:
> * NB: Don't set DF on small MTU/MSS to have a safe fallback.
> */
> if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
> - ip->ip_off |= htons(IP_DF);
> tp->t_flags2 |= TF2_PLPMTU_PMTUD;
> + if (tp->t_port == 0 || len < V_tcp_minmss) {
> + ip->ip_off |= htons(IP_DF);
> + }
> } else {
> tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
> }
> diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
> index cc20d6bf52ca..1ee8d26446fd 100644
> --- a/sys/netinet/tcp_stacks/bbr.c
> +++ b/sys/netinet/tcp_stacks/bbr.c
> @@ -11969,14 +11969,10 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
> #endif
> struct tcp_bbr *bbr;
> struct tcphdr *th;
> -#ifdef NETFLIX_TCPOUDP
> struct udphdr *udp = NULL;
> -#endif
> u_char opt[TCP_MAXOLEN];
> unsigned ipoptlen, optlen, hdrlen;
> -#ifdef NETFLIX_TCPOUDP
> unsigned ulen;
> -#endif
> uint32_t bbr_seq;
> uint32_t delay_calc=0;
> uint8_t doing_tlp = 0;
> @@ -12991,10 +12987,8 @@ send:
> /* Maximum segment size. */
> if (flags & TH_SYN) {
> to.to_mss = tcp_mssopt(&inp->inp_inc);
> -#ifdef NETFLIX_TCPOUDP
> if (tp->t_port)
> to.to_mss -= V_tcp_udp_tunneling_overhead;
> -#endif
> to.to_flags |= TOF_MSS;
> /*
> * On SYN or SYN|ACK transmits on TFO connections,
> @@ -13063,7 +13057,6 @@ send:
> !(to.to_flags & TOF_FASTOPEN))
> len = 0;
> }
> -#ifdef NETFLIX_TCPOUDP
> if (tp->t_port) {
> if (V_tcp_udp_tunneling_port == 0) {
> /* The port was removed?? */
> @@ -13072,7 +13065,6 @@ send:
> }
> hdrlen += sizeof(struct udphdr);
> }
> -#endif
> #ifdef INET6
> if (isipv6)
> ipoptlen = ip6_optlen(tp->t_inpcb);
> @@ -13408,7 +13400,6 @@ send:
> #ifdef INET6
> if (isipv6) {
> ip6 = mtod(m, struct ip6_hdr *);
> -#ifdef NETFLIX_TCPOUDP
> if (tp->t_port) {
> udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
> udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> @@ -13417,17 +13408,9 @@ send:
> udp->uh_ulen = htons(ulen);
> th = (struct tcphdr *)(udp + 1);
> } else {
> -#endif
> th = (struct tcphdr *)(ip6 + 1);
> -
> -#ifdef NETFLIX_TCPOUDP
> }
> -#endif
> - tcpip_fillheaders(inp,
> -#ifdef NETFLIX_TCPOUDP
> - tp->t_port,
> -#endif
> - ip6, th);
> + tcpip_fillheaders(inp, tp->t_port, ip6, th);
> } else
> #endif /* INET6 */
> {
> @@ -13435,7 +13418,6 @@ send:
> #ifdef TCPDEBUG
> ipov = (struct ipovly *)ip;
> #endif
> -#ifdef NETFLIX_TCPOUDP
> if (tp->t_port) {
> udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
> udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> @@ -13443,14 +13425,10 @@ send:
> ulen = hdrlen + len - sizeof(struct ip);
> udp->uh_ulen = htons(ulen);
> th = (struct tcphdr *)(udp + 1);
> - } else
> -#endif
> + } else {
> th = (struct tcphdr *)(ip + 1);
> - tcpip_fillheaders(inp,
> -#ifdef NETFLIX_TCPOUDP
> - tp->t_port,
> -#endif
> - ip, th);
> + }
> + tcpip_fillheaders(inp, tp->t_port, ip, th);
> }
> /*
> * If we are doing retransmissions, then snd_nxt will not reflect
> @@ -13600,7 +13578,6 @@ send:
> * ip6_plen is not need to be filled now, and will be filled
> * in ip6_output.
> */
> -#ifdef NETFLIX_TCPOUDP
> if (tp->t_port) {
> m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
> m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
> @@ -13608,14 +13585,11 @@ send:
> th->th_sum = htons(0);
> UDPSTAT_INC(udps_opackets);
> } else {
> -#endif
> csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
> m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
> th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
> optlen + len, IPPROTO_TCP, 0);
> -#ifdef NETFLIX_TCPOUDP
> }
> -#endif
> }
> #endif
> #if defined(INET6) && defined(INET)
> @@ -13623,7 +13597,6 @@ send:
> #endif
> #ifdef INET
> {
> -#ifdef NETFLIX_TCPOUDP
> if (tp->t_port) {
> m->m_pkthdr.csum_flags = CSUM_UDP;
> m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
> @@ -13632,15 +13605,12 @@ send:
> th->th_sum = htons(0);
> UDPSTAT_INC(udps_opackets);
> } else {
> -#endif
> csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP;
> m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
> th->th_sum = in_pseudo(ip->ip_src.s_addr,
> ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
> IPPROTO_TCP + len + optlen));
> -#ifdef NETFLIX_TCPOUDP
> }
> -#endif
> /* IP version must be set here for ipv4/ipv6 checking later */
> KASSERT(ip->ip_v == IPVERSION,
> ("%s: IP version incorrect: %d", __func__, ip->ip_v));
> diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
> index 0ee73a95a6d7..12827d1699d0 100644
> --- a/sys/netinet/tcp_stacks/rack.c
> +++ b/sys/netinet/tcp_stacks/rack.c
> @@ -13008,10 +13008,8 @@ send:
> if (flags & TH_SYN) {
> tp->snd_nxt = tp->iss;
> to.to_mss = tcp_mssopt(&inp->inp_inc);
> -#ifdef NETFLIX_TCPOUDP
> if (tp->t_port)
> to.to_mss -= V_tcp_udp_tunneling_overhead;
> -#endif
> to.to_flags |= TOF_MSS;
>
> /*
> @@ -13088,7 +13086,6 @@ send:
> !(to.to_flags & TOF_FASTOPEN))
> len = 0;
> }
> -#ifdef NETFLIX_TCPOUDP
> if (tp->t_port) {
> if (V_tcp_udp_tunneling_port == 0) {
> /* The port was removed?? */
> @@ -13097,7 +13094,6 @@ send:
> }
> hdrlen += sizeof(struct udphdr);
> }
> -#endif
> #ifdef INET6
> if (isipv6)
> ipoptlen = ip6_optlen(tp->t_inpcb);
> @@ -13372,7 +13368,6 @@ send:
> #ifdef INET6
> if (isipv6) {
> ip6 = mtod(m, struct ip6_hdr *);
> -#ifdef NETFLIX_TCPOUDP
> if (tp->t_port) {
> udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
> udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> @@ -13380,14 +13375,10 @@ send:
> ulen = hdrlen + len - sizeof(struct ip6_hdr);
> udp->uh_ulen = htons(ulen);
> th = (struct tcphdr *)(udp + 1);
> - } else
> -#endif
> + } else {
> th = (struct tcphdr *)(ip6 + 1);
> - tcpip_fillheaders(inp,
> -#ifdef NETFLIX_TCPOUDP
> - tp->t_port,
> -#endif
> - ip6, th);
> + }
> + tcpip_fillheaders(inp, tp->t_port, ip6, th);
> } else
> #endif /* INET6 */
> {
> @@ -13395,7 +13386,6 @@ send:
> #ifdef TCPDEBUG
> ipov = (struct ipovly *)ip;
> #endif
> -#ifdef NETFLIX_TCPOUDP
> if (tp->t_port) {
> udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
> udp->uh_sport = htons(V_tcp_udp_tunneling_port);
> @@ -13403,14 +13393,10 @@ send:
> ulen = hdrlen + len - sizeof(struct ip);
> udp->uh_ulen = htons(ulen);
> th = (struct tcphdr *)(udp + 1);
> - } else
> -#endif
> + } else {
> th = (struct tcphdr *)(ip + 1);
> - tcpip_fillheaders(inp,
> -#ifdef NETFLIX_TCPOUDP
> - tp->t_port,
> -#endif
> - ip, th);
> + }
> + tcpip_fillheaders(inp, tp->t_port, ip, th);
> }
> /*
> * Fill in fields, remembering maximum advertised window for use in
> diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
> index dff7767cd9cf..6bdeb3984aee 100644
> --- a/sys/netinet/tcp_subr.c
> +++ b/sys/netinet/tcp_subr.c
> @@ -126,6 +126,8 @@ __FBSDID("$FreeBSD$");
> #ifdef TCP_OFFLOAD
> #include <netinet/tcp_offload.h>
> #endif
> +#include <netinet/udp.h>
> +#include <netinet/udp_var.h>
>
> #include <netipsec/ipsec_support.h>
>
> @@ -501,6 +503,80 @@ tcp_switch_back_to_default(struct tcpcb *tp)
> }
> }
>
> +static void
> +tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
> + const struct sockaddr *sa, void *ctx)
> +{
> + struct ip *iph;
> +#ifdef INET6
> + struct ip6_hdr *ip6;
> +#endif
> + struct udphdr *uh;
> + struct tcphdr *th;
> + int thlen;
> + uint16_t port;
> +
> + TCPSTAT_INC(tcps_tunneled_pkts);
> + if ((m->m_flags & M_PKTHDR) == 0) {
> + /* Can't handle one that is not a pkt hdr */
> + TCPSTAT_INC(tcps_tunneled_errs);
> + goto out;
> + }
> + thlen = sizeof(struct tcphdr);
> + if (m->m_len < off + sizeof(struct udphdr) + thlen &&
> + (m = m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) {
> + TCPSTAT_INC(tcps_tunneled_errs);
> + goto out;
> + }
> + iph = mtod(m, struct ip *);
> + uh = (struct udphdr *)((caddr_t)iph + off);
> + th = (struct tcphdr *)(uh + 1);
> + thlen = th->th_off << 2;
> + if (m->m_len < off + sizeof(struct udphdr) + thlen) {
> + m = m_pullup(m, off + sizeof(struct udphdr) + thlen);
> + if (m == NULL) {
> + TCPSTAT_INC(tcps_tunneled_errs);
> + goto out;
> + } else {
> + iph = mtod(m, struct ip *);
> + uh = (struct udphdr *)((caddr_t)iph + off);
> + th = (struct tcphdr *)(uh + 1);
> + }
> + }
> + m->m_pkthdr.tcp_tun_port = port = uh->uh_sport;
> + bcopy(th, uh, m->m_len - off);
> + m->m_len -= sizeof(struct udphdr);
> + m->m_pkthdr.len -= sizeof(struct udphdr);
> + /*
> + * We use the same algorithm for
> + * both UDP and TCP for c-sum. So
> + * the code in tcp_input will skip
> + * the checksum. So we do nothing
> + * with the flag (m->m_pkthdr.csum_flags).
> + */
> + switch (iph->ip_v) {
> +#ifdef INET
> + case IPVERSION:
> + iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
> + tcp_input_with_port(&m, &off, IPPROTO_TCP, port);
> + break;
> +#endif
> +#ifdef INET6
> + case IPV6_VERSION >> 4:
> + ip6 = mtod(m, struct ip6_hdr *);
> + ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr));
> + tcp6_input_with_port(&m, &off, IPPROTO_TCP, port);
> + break;
> +#endif
> + default:
> + goto out;
> + break;
> + }
> + return;
> +out:
> + m_freem(m);
> +}
> +
> static int
> sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
> {
> @@ -598,6 +674,183 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
> NULL, 0, sysctl_net_inet_list_available, "A",
> "list available TCP Function sets");
>
> +VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT;
> +
> +#ifdef INET
> +VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL;
> +#define V_udp4_tun_socket VNET(udp4_tun_socket)
> +#endif
> +#ifdef INET6
> +VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL;
> +#define V_udp6_tun_socket VNET(udp6_tun_socket)
> +#endif
> +
> +static void
> +tcp_over_udp_stop(void)
> +{
> + /*
> + * This function assumes sysctl caller holds inp_rinfo_lock()
> + * for writting!
> + */
> +#ifdef INET
> + if (V_udp4_tun_socket != NULL) {
> + soclose(V_udp4_tun_socket);
> + V_udp4_tun_socket = NULL;
> + }
> +#endif
> +#ifdef INET6
> + if (V_udp6_tun_socket != NULL) {
> + soclose(V_udp6_tun_socket);
> + V_udp6_tun_socket = NULL;
> + }
> +#endif
> +}
> +
> +static int
> +tcp_over_udp_start(void)
> +{
> + uint16_t port;
> + int ret;
> +#ifdef INET
> + struct sockaddr_in sin;
> +#endif
> +#ifdef INET6
> + struct sockaddr_in6 sin6;
> +#endif
> + /*
> + * This function assumes sysctl caller holds inp_info_rlock()
> + * for writting!
> + */
> + port = V_tcp_udp_tunneling_port;
> + if (ntohs(port) == 0) {
> + /* Must have a port set */
> + return (EINVAL);
> + }
> +#ifdef INET
> + if (V_udp4_tun_socket != NULL) {
> + /* Already running -- must stop first */
> + return (EALREADY);
> + }
> +#endif
> +#ifdef INET6
> + if (V_udp6_tun_socket != NULL) {
> + /* Already running -- must stop first */
> + return (EALREADY);
> + }
> +#endif
> +#ifdef INET
> + if ((ret = socreate(PF_INET, &V_udp4_tun_socket,
> + SOCK_DGRAM, IPPROTO_UDP,
> + curthread->td_ucred, curthread))) {
> + tcp_over_udp_stop();
> + return (ret);
> + }
> + /* Call the special UDP hook. */
> + if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket,
> + tcp_recv_udp_tunneled_packet,
> + tcp_ctlinput_viaudp,
> + NULL))) {
> + tcp_over_udp_stop();
> + return (ret);
> + }
> + /* Ok, we have a socket, bind it to the port. */
> + memset(&sin, 0, sizeof(struct sockaddr_in));
> + sin.sin_len = sizeof(struct sockaddr_in);
> + sin.sin_family = AF_INET;
> + sin.sin_port = htons(port);
> + if ((ret = sobind(V_udp4_tun_socket,
> + (struct sockaddr *)&sin, curthread))) {
> + tcp_over_udp_stop();
> + return (ret);
> + }
> +#endif
> +#ifdef INET6
> + if ((ret = socreate(PF_INET6, &V_udp6_tun_socket,
> + SOCK_DGRAM, IPPROTO_UDP,
> + curthread->td_ucred, curthread))) {
> + tcp_over_udp_stop();
> + return (ret);
> + }
> + /* Call the special UDP hook. */
> + if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket,
> + tcp_recv_udp_tunneled_packet,
> + tcp6_ctlinput_viaudp,
> + NULL))) {
> + tcp_over_udp_stop();
> + return (ret);
> + }
> + /* Ok, we have a socket, bind it to the port. */
> + memset(&sin6, 0, sizeof(struct sockaddr_in6));
> + sin6.sin6_len = sizeof(struct sockaddr_in6);
> + sin6.sin6_family = AF_INET6;
> + sin6.sin6_port = htons(port);
> + if ((ret = sobind(V_udp6_tun_socket,
> + (struct sockaddr *)&sin6, curthread))) {
> + tcp_over_udp_stop();
> + return (ret);
> + }
> +#endif
> + return (0);
> +}
> +
> +static int
> +sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS)
> +{
> + int error;
> + uint32_t old, new;
> +
> + old = V_tcp_udp_tunneling_port;
> + new = old;
> + error = sysctl_handle_int(oidp, &new, 0, req);
> + if ((error == 0) &&
> + (req->newptr != NULL)) {
> + if ((new < TCP_TUNNELING_PORT_MIN) ||
> + (new > TCP_TUNNELING_PORT_MAX)) {
> + error = EINVAL;
> + } else {
> + V_tcp_udp_tunneling_port = new;
> + if (old != 0) {
> + tcp_over_udp_stop();
> + }
> + if (new != 0) {
> + error = tcp_over_udp_start();
> + }
> + }
> + }
> + return (error);
> +}
> +
> +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port,
> + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
> + &VNET_NAME(tcp_udp_tunneling_port),
> + 0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU",
> + "Tunneling port for tcp over udp");
> +
> +VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT;
> +
> +static int
> +sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS)
> +{
> + int error, new;
> +
> + new = V_tcp_udp_tunneling_overhead;
> + error = sysctl_handle_int(oidp, &new, 0, req);
> + if (error == 0 && req->newptr) {
> + if ((new < TCP_TUNNELING_OVERHEAD_MIN) ||
> + (new > TCP_TUNNELING_OVERHEAD_MAX))
> + error = EINVAL;
> + else
> + V_tcp_udp_tunneling_overhead = new;
> + }
> + return (error);
> +}
> +
> +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead,
> + CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
> + &VNET_NAME(tcp_udp_tunneling_overhead),
> + 0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU",
> + "MSS reduction when using tcp over udp");
> +
> /*
> * Exports one (struct tcp_function_info) for each alias/name.
> */
> @@ -1305,7 +1558,7 @@ tcp_fini(void *xtp)
> * of the tcpcb each time to conserve mbufs.
> */
> void
> -tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
> +tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr)
> {
> struct tcphdr *th = (struct tcphdr *)tcp_ptr;
>
> @@ -1320,7 +1573,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
> (inp->inp_flow & IPV6_FLOWINFO_MASK);
> ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
> (IPV6_VERSION & IPV6_VERSION_MASK);
> - ip6->ip6_nxt = IPPROTO_TCP;
> + if (port == 0)
> + ip6->ip6_nxt = IPPROTO_TCP;
> + else
> + ip6->ip6_nxt = IPPROTO_UDP;
> ip6->ip6_plen = htons(sizeof(struct tcphdr));
> ip6->ip6_src = inp->in6p_laddr;
> ip6->ip6_dst = inp->in6p_faddr;
> @@ -1342,7 +1598,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
> ip->ip_off = 0;
> ip->ip_ttl = inp->inp_ip_ttl;
> ip->ip_sum = 0;
> - ip->ip_p = IPPROTO_TCP;
> + if (port == 0)
> + ip->ip_p = IPPROTO_TCP;
> + else
> + ip->ip_p = IPPROTO_UDP;
> ip->ip_src = inp->inp_laddr;
> ip->ip_dst = inp->inp_faddr;
> }
> @@ -1372,7 +1631,7 @@ tcpip_maketemplate(struct inpcb *inp)
> t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
> if (t == NULL)
> return (NULL);
> - tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t);
> + tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t);
> return (t);
> }
>
> @@ -1398,14 +1657,16 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
> struct inpcb *inp;
> struct ip *ip;
> struct mbuf *optm;
> + struct udphdr *uh = NULL;
> struct tcphdr *nth;
> u_char *optp;
> #ifdef INET6
> struct ip6_hdr *ip6;
> int isipv6;
> #endif /* INET6 */
> - int optlen, tlen, win;
> + int optlen, tlen, win, ulen;
> bool incl_opts;
> + uint16_t port;
>
> KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
> NET_EPOCH_ASSERT();
> @@ -1423,6 +1684,19 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
> } else
> inp = NULL;
>
> + if (m != NULL) {
> +#ifdef INET6
> + if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP))
> + port = m->m_pkthdr.tcp_tun_port;
> + else
> *** 1128 LINES SKIPPED ***
More information about the dev-commits-src-branches
mailing list