git: fa50e98328b4 - stable/13 - mend
Michael Tuexen
tuexen at FreeBSD.org
Mon Jun 7 09:01:53 UTC 2021
The branch stable/13 has been updated by tuexen:
URL: https://cgit.FreeBSD.org/src/commit/?id=fa50e98328b48da4fa8dbd97d0a787962cf249f5
commit fa50e98328b48da4fa8dbd97d0a787962cf249f5
Author: Michael Tuexen <tuexen at FreeBSD.org>
AuthorDate: 2021-04-18 14:08:08 +0000
Commit: Michael Tuexen <tuexen at FreeBSD.org>
CommitDate: 2021-06-07 09:01:28 +0000
mend
---
share/man/man4/tcp.4 | 15 +-
sys/netinet/tcp.h | 1 +
sys/netinet/tcp_input.c | 48 ++++-
sys/netinet/tcp_output.c | 80 ++++++--
sys/netinet/tcp_stacks/bbr.c | 38 +---
sys/netinet/tcp_stacks/rack.c | 26 +--
sys/netinet/tcp_subr.c | 462 ++++++++++++++++++++++++++++++++++++++++--
sys/netinet/tcp_syncache.c | 127 +++++++++---
sys/netinet/tcp_syncache.h | 12 +-
sys/netinet/tcp_timewait.c | 84 ++++++--
sys/netinet/tcp_usrreq.c | 30 +++
sys/netinet/tcp_var.h | 27 ++-
sys/netinet/toecore.c | 4 +-
sys/netinet6/tcp6_var.h | 2 +
sys/sys/mbuf.h | 1 +
usr.bin/netstat/inet.c | 4 +
usr.bin/sockstat/sockstat.1 | 6 +-
usr.bin/sockstat/sockstat.c | 13 +-
18 files changed, 822 insertions(+), 158 deletions(-)
diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
index d01505e58427..b5735a40b320 100644
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -34,7 +34,7 @@
.\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
-.Dd April 8, 2021
+.Dd April 18, 2021
.Dt TCP 4
.Os
.Sh NAME
@@ -329,6 +329,9 @@ currently executing.
This is typically used after a process or thread inherits a listen
socket from its parent, and sets its CPU affinity to a particular core.
.El
+.It Dv TCP_REMOTE_UDP_ENCAPS_PORT
+Set and get the remote UDP encapsulation port.
+It can only be set on a closed TCP socket.
.El
.Pp
The option level for the
@@ -752,6 +755,16 @@ A CSV list of template_spec=percent key-value pairs which controls the per
template sampling rates when
.Xr stats 3
sampling is enabled.
+.It Va udp_tunneling_port
+The local UDP encapsulation port.
+A value of 0 indicates that UDP encapsulation is disabled.
+The default is 0.
+.It Va udp_tunneling_overhead
+The overhead taken into account when using UDP encapsulation.
+Since MSS clamping by middleboxes will most likely not work, values larger than
+8 (the size of the UDP header) are also supported.
+Supported values are between 8 and 1024.
+The default is 8.
.El
.Sh ERRORS
A socket operation may fail with one of the following errors returned:
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 0b71bd4658f8..d2bf1f8431fd 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -183,6 +183,7 @@ struct tcphdr {
#define TCP_RXTLS_MODE 42 /* Receive TLS mode */
#define TCP_CONGESTION 64 /* get/set congestion control algorithm */
#define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */
+#define TCP_REMOTE_UDP_ENCAPS_PORT 71 /* Enable TCP over UDP tunneling via the specified port */
#define TCP_DELACK 72 /* socket option for delayed ack */
#define TCP_FIN_IS_RST 73 /* A fin from the peer is treated has a RST */
#define TCP_LOG_LIMIT 74 /* Limit to number of records in tcp-log */
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 397cbc5084e6..d36f9566ffba 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -123,6 +123,7 @@ __FBSDID("$FreeBSD$");
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
+#include <netinet/udp.h>
#include <netipsec/ipsec_support.h>
@@ -573,7 +574,7 @@ cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos)
*/
#ifdef INET6
int
-tcp6_input(struct mbuf **mp, int *offp, int proto)
+tcp6_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
struct mbuf *m;
struct in6_ifaddr *ia6;
@@ -603,12 +604,19 @@ tcp6_input(struct mbuf **mp, int *offp, int proto)
}
*mp = m;
- return (tcp_input(mp, offp, proto));
+ return (tcp_input_with_port(mp, offp, proto, port));
+}
+
+int
+tcp6_input(struct mbuf **mp, int *offp, int proto)
+{
+
+ return(tcp6_input_with_port(mp, offp, proto, 0));
}
#endif /* INET6 */
int
-tcp_input(struct mbuf **mp, int *offp, int proto)
+tcp_input_with_port(struct mbuf **mp, int *offp, int proto, uint16_t port)
{
struct mbuf *m = *mp;
struct tcphdr *th = NULL;
@@ -664,6 +672,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
ip6 = mtod(m, struct ip6_hdr *);
th = (struct tcphdr *)((caddr_t)ip6 + off0);
tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
+ if (port)
+ goto skip6_csum;
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
@@ -677,7 +687,7 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
TCPSTAT_INC(tcps_rcvbadsum);
goto drop;
}
-
+ skip6_csum:
/*
* Be proactive about unspecified IPv6 address in source.
* As we use all-zero to indicate unbounded/unconnected pcb,
@@ -718,6 +728,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
tlen = ntohs(ip->ip_len) - off0;
iptos = ip->ip_tos;
+ if (port)
+ goto skip_csum;
if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
th->th_sum = m->m_pkthdr.csum_data;
@@ -747,8 +759,8 @@ tcp_input(struct mbuf **mp, int *offp, int proto)
ip->ip_v = IPVERSION;
ip->ip_hl = off0 >> 2;
}
-
- if (th->th_sum) {
+ skip_csum:
+ if (th->th_sum && (port == 0)) {
TCPSTAT_INC(tcps_rcvbadsum);
goto drop;
}
@@ -1006,6 +1018,11 @@ findpcb:
goto dropwithreset;
}
+ if ((tp->t_port != port) && (tp->t_state > TCPS_LISTEN)) {
+ rstreason = BANDLIM_RST_CLOSEDPORT;
+ goto dropwithreset;
+ }
+
#ifdef TCP_OFFLOAD
if (tp->t_flags & TF_TOE) {
tcp_offload_input(tp, m);
@@ -1077,7 +1094,7 @@ findpcb:
* NB: syncache_expand() doesn't unlock
* inp and tcpinfo locks.
*/
- rstreason = syncache_expand(&inc, &to, th, &so, m);
+ rstreason = syncache_expand(&inc, &to, th, &so, m, port);
if (rstreason < 0) {
/*
* A failing TCP MD5 signature comparison
@@ -1157,7 +1174,7 @@ tfo_socket_result:
* causes.
*/
if (thflags & TH_RST) {
- syncache_chkrst(&inc, th, m);
+ syncache_chkrst(&inc, th, m, port);
goto dropunlock;
}
/*
@@ -1179,7 +1196,7 @@ tfo_socket_result:
log(LOG_DEBUG, "%s; %s: Listen socket: "
"SYN|ACK invalid, segment rejected\n",
s, __func__);
- syncache_badack(&inc); /* XXX: Not needed! */
+ syncache_badack(&inc, port); /* XXX: Not needed! */
TCPSTAT_INC(tcps_badsyn);
rstreason = BANDLIM_RST_OPENPORT;
goto dropwithreset;
@@ -1336,7 +1353,8 @@ tfo_socket_result:
#endif
TCP_PROBE3(debug__input, tp, th, m);
tcp_dooptions(&to, optp, optlen, TO_SYN);
- if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL, iptos))
+ if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL, iptos,
+ port))
goto tfo_socket_result;
/*
@@ -1467,6 +1485,12 @@ tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
return (newsize);
}
+int
+tcp_input(struct mbuf **mp, int *offp, int proto)
+{
+ return(tcp_input_with_port(mp, offp, proto, 0));
+}
+
void
tcp_handle_wakeup(struct tcpcb *tp, struct socket *so)
{
@@ -3671,11 +3695,13 @@ tcp_mss_update(struct tcpcb *tp, int offer, int mtuoffer,
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
sizeof (struct tcpiphdr);
#else
- const size_t min_protoh = sizeof(struct tcpiphdr);
+ size_t min_protoh = sizeof(struct tcpiphdr);
#endif
INP_WLOCK_ASSERT(tp->t_inpcb);
+ if (tp->t_port)
+ min_protoh += V_tcp_udp_tunneling_overhead;
if (mtuoffer != -1) {
KASSERT(offer == -1, ("%s: conflict", __func__));
offer = mtuoffer - min_protoh;
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index e23cdc749e98..5bda2be14df0 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -101,6 +101,8 @@ __FBSDID("$FreeBSD$");
#include <netipsec/ipsec_support.h>
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
#include <machine/in_cksum.h>
#include <security/mac/mac_framework.h>
@@ -207,7 +209,7 @@ tcp_output(struct tcpcb *tp)
#endif
struct tcphdr *th;
u_char opt[TCP_MAXOLEN];
- unsigned ipoptlen, optlen, hdrlen;
+ unsigned ipoptlen, optlen, hdrlen, ulen;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
unsigned ipsec_optlen = 0;
#endif
@@ -216,6 +218,7 @@ tcp_output(struct tcpcb *tp)
struct sackhole *p;
int tso, mtu;
struct tcpopt to;
+ struct udphdr *udp = NULL;
unsigned int wanted_cookie = 0;
unsigned int dont_sendalot = 0;
#if 0
@@ -558,6 +561,7 @@ after_sack_rexmit:
#endif
if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
+ (tp->t_port == 0) &&
((tp->t_flags & TF_SIGNATURE) == 0) &&
tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
ipoptlen == 0 && !(flags & TH_SYN))
@@ -800,6 +804,8 @@ send:
/* Maximum segment size. */
if (flags & TH_SYN) {
to.to_mss = tcp_mssopt(&tp->t_inpcb->inp_inc);
+ if (tp->t_port)
+ to.to_mss -= V_tcp_udp_tunneling_overhead;
to.to_flags |= TOF_MSS;
/*
@@ -887,7 +893,14 @@ send:
!(to.to_flags & TOF_FASTOPEN))
len = 0;
}
-
+ if (tp->t_port) {
+ if (V_tcp_udp_tunneling_port == 0) {
+ /* The port was removed?? */
+ SOCKBUF_UNLOCK(&so->so_snd);
+ return (EHOSTUNREACH);
+ }
+ hdrlen += sizeof(struct udphdr);
+ }
/*
* Adjust data length if insertion of options will
* bump the packet length beyond the t_maxseg length.
@@ -1140,8 +1153,17 @@ send:
#ifdef INET6
if (isipv6) {
ip6 = mtod(m, struct ip6_hdr *);
- th = (struct tcphdr *)(ip6 + 1);
- tcpip_fillheaders(tp->t_inpcb, ip6, th);
+ if (tp->t_port) {
+ udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
+ udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+ udp->uh_dport = tp->t_port;
+ ulen = hdrlen + len - sizeof(struct ip6_hdr);
+ udp->uh_ulen = htons(ulen);
+ th = (struct tcphdr *)(udp + 1);
+ } else {
+ th = (struct tcphdr *)(ip6 + 1);
+ }
+ tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip6, th);
} else
#endif /* INET6 */
{
@@ -1149,8 +1171,16 @@ send:
#ifdef TCPDEBUG
ipov = (struct ipovly *)ip;
#endif
- th = (struct tcphdr *)(ip + 1);
- tcpip_fillheaders(tp->t_inpcb, ip, th);
+ if (tp->t_port) {
+ udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
+ udp->uh_sport = htons(V_tcp_udp_tunneling_port);
+ udp->uh_dport = tp->t_port;
+ ulen = hdrlen + len - sizeof(struct ip);
+ udp->uh_ulen = htons(ulen);
+ th = (struct tcphdr *)(udp + 1);
+ } else
+ th = (struct tcphdr *)(ip + 1);
+ tcpip_fillheaders(tp->t_inpcb, tp->t_port, ip, th);
}
/*
@@ -1309,7 +1339,6 @@ send:
* checksum extended header and data.
*/
m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */
- m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if (to.to_flags & TOF_SIGNATURE) {
@@ -1336,9 +1365,19 @@ send:
* There is no need to fill in ip6_plen right now.
* It will be filled later by ip6_output.
*/
- m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
- th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
- optlen + len, IPPROTO_TCP, 0);
+ if (tp->t_port) {
+ m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
+ th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
+ } else {
+ m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ th->th_sum = in6_cksum_pseudo(ip6,
+ sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
+ 0);
+ }
}
#endif
#if defined(INET6) && defined(INET)
@@ -1346,9 +1385,20 @@ send:
#endif
#ifdef INET
{
- m->m_pkthdr.csum_flags = CSUM_TCP;
- th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
- htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen));
+ if (tp->t_port) {
+ m->m_pkthdr.csum_flags = CSUM_UDP;
+ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
+ udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
+ th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
+ } else {
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
+ IPPROTO_TCP + len + optlen));
+ }
/* IP version must be set here for ipv4/ipv6 checking later */
KASSERT(ip->ip_v == IPVERSION,
@@ -1473,8 +1523,10 @@ send:
* NB: Don't set DF on small MTU/MSS to have a safe fallback.
*/
if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) {
- ip->ip_off |= htons(IP_DF);
tp->t_flags2 |= TF2_PLPMTU_PMTUD;
+ if (tp->t_port == 0 || len < V_tcp_minmss) {
+ ip->ip_off |= htons(IP_DF);
+ }
} else {
tp->t_flags2 &= ~TF2_PLPMTU_PMTUD;
}
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index cc20d6bf52ca..1ee8d26446fd 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -11969,14 +11969,10 @@ bbr_output_wtime(struct tcpcb *tp, const struct timeval *tv)
#endif
struct tcp_bbr *bbr;
struct tcphdr *th;
-#ifdef NETFLIX_TCPOUDP
struct udphdr *udp = NULL;
-#endif
u_char opt[TCP_MAXOLEN];
unsigned ipoptlen, optlen, hdrlen;
-#ifdef NETFLIX_TCPOUDP
unsigned ulen;
-#endif
uint32_t bbr_seq;
uint32_t delay_calc=0;
uint8_t doing_tlp = 0;
@@ -12991,10 +12987,8 @@ send:
/* Maximum segment size. */
if (flags & TH_SYN) {
to.to_mss = tcp_mssopt(&inp->inp_inc);
-#ifdef NETFLIX_TCPOUDP
if (tp->t_port)
to.to_mss -= V_tcp_udp_tunneling_overhead;
-#endif
to.to_flags |= TOF_MSS;
/*
* On SYN or SYN|ACK transmits on TFO connections,
@@ -13063,7 +13057,6 @@ send:
!(to.to_flags & TOF_FASTOPEN))
len = 0;
}
-#ifdef NETFLIX_TCPOUDP
if (tp->t_port) {
if (V_tcp_udp_tunneling_port == 0) {
/* The port was removed?? */
@@ -13072,7 +13065,6 @@ send:
}
hdrlen += sizeof(struct udphdr);
}
-#endif
#ifdef INET6
if (isipv6)
ipoptlen = ip6_optlen(tp->t_inpcb);
@@ -13408,7 +13400,6 @@ send:
#ifdef INET6
if (isipv6) {
ip6 = mtod(m, struct ip6_hdr *);
-#ifdef NETFLIX_TCPOUDP
if (tp->t_port) {
udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13417,17 +13408,9 @@ send:
udp->uh_ulen = htons(ulen);
th = (struct tcphdr *)(udp + 1);
} else {
-#endif
th = (struct tcphdr *)(ip6 + 1);
-
-#ifdef NETFLIX_TCPOUDP
}
-#endif
- tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
- tp->t_port,
-#endif
- ip6, th);
+ tcpip_fillheaders(inp, tp->t_port, ip6, th);
} else
#endif /* INET6 */
{
@@ -13435,7 +13418,6 @@ send:
#ifdef TCPDEBUG
ipov = (struct ipovly *)ip;
#endif
-#ifdef NETFLIX_TCPOUDP
if (tp->t_port) {
udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13443,14 +13425,10 @@ send:
ulen = hdrlen + len - sizeof(struct ip);
udp->uh_ulen = htons(ulen);
th = (struct tcphdr *)(udp + 1);
- } else
-#endif
+ } else {
th = (struct tcphdr *)(ip + 1);
- tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
- tp->t_port,
-#endif
- ip, th);
+ }
+ tcpip_fillheaders(inp, tp->t_port, ip, th);
}
/*
* If we are doing retransmissions, then snd_nxt will not reflect
@@ -13600,7 +13578,6 @@ send:
* ip6_plen is not need to be filled now, and will be filled
* in ip6_output.
*/
-#ifdef NETFLIX_TCPOUDP
if (tp->t_port) {
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
@@ -13608,14 +13585,11 @@ send:
th->th_sum = htons(0);
UDPSTAT_INC(udps_opackets);
} else {
-#endif
csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) +
optlen + len, IPPROTO_TCP, 0);
-#ifdef NETFLIX_TCPOUDP
}
-#endif
}
#endif
#if defined(INET6) && defined(INET)
@@ -13623,7 +13597,6 @@ send:
#endif
#ifdef INET
{
-#ifdef NETFLIX_TCPOUDP
if (tp->t_port) {
m->m_pkthdr.csum_flags = CSUM_UDP;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
@@ -13632,15 +13605,12 @@ send:
th->th_sum = htons(0);
UDPSTAT_INC(udps_opackets);
} else {
-#endif
csum_flags = m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
IPPROTO_TCP + len + optlen));
-#ifdef NETFLIX_TCPOUDP
}
-#endif
/* IP version must be set here for ipv4/ipv6 checking later */
KASSERT(ip->ip_v == IPVERSION,
("%s: IP version incorrect: %d", __func__, ip->ip_v));
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 0ee73a95a6d7..12827d1699d0 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -13008,10 +13008,8 @@ send:
if (flags & TH_SYN) {
tp->snd_nxt = tp->iss;
to.to_mss = tcp_mssopt(&inp->inp_inc);
-#ifdef NETFLIX_TCPOUDP
if (tp->t_port)
to.to_mss -= V_tcp_udp_tunneling_overhead;
-#endif
to.to_flags |= TOF_MSS;
/*
@@ -13088,7 +13086,6 @@ send:
!(to.to_flags & TOF_FASTOPEN))
len = 0;
}
-#ifdef NETFLIX_TCPOUDP
if (tp->t_port) {
if (V_tcp_udp_tunneling_port == 0) {
/* The port was removed?? */
@@ -13097,7 +13094,6 @@ send:
}
hdrlen += sizeof(struct udphdr);
}
-#endif
#ifdef INET6
if (isipv6)
ipoptlen = ip6_optlen(tp->t_inpcb);
@@ -13372,7 +13368,6 @@ send:
#ifdef INET6
if (isipv6) {
ip6 = mtod(m, struct ip6_hdr *);
-#ifdef NETFLIX_TCPOUDP
if (tp->t_port) {
udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13380,14 +13375,10 @@ send:
ulen = hdrlen + len - sizeof(struct ip6_hdr);
udp->uh_ulen = htons(ulen);
th = (struct tcphdr *)(udp + 1);
- } else
-#endif
+ } else {
th = (struct tcphdr *)(ip6 + 1);
- tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
- tp->t_port,
-#endif
- ip6, th);
+ }
+ tcpip_fillheaders(inp, tp->t_port, ip6, th);
} else
#endif /* INET6 */
{
@@ -13395,7 +13386,6 @@ send:
#ifdef TCPDEBUG
ipov = (struct ipovly *)ip;
#endif
-#ifdef NETFLIX_TCPOUDP
if (tp->t_port) {
udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -13403,14 +13393,10 @@ send:
ulen = hdrlen + len - sizeof(struct ip);
udp->uh_ulen = htons(ulen);
th = (struct tcphdr *)(udp + 1);
- } else
-#endif
+ } else {
th = (struct tcphdr *)(ip + 1);
- tcpip_fillheaders(inp,
-#ifdef NETFLIX_TCPOUDP
- tp->t_port,
-#endif
- ip, th);
+ }
+ tcpip_fillheaders(inp, tp->t_port, ip, th);
}
/*
* Fill in fields, remembering maximum advertised window for use in
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index dff7767cd9cf..6bdeb3984aee 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -126,6 +126,8 @@ __FBSDID("$FreeBSD$");
#ifdef TCP_OFFLOAD
#include <netinet/tcp_offload.h>
#endif
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
#include <netipsec/ipsec_support.h>
@@ -501,6 +503,80 @@ tcp_switch_back_to_default(struct tcpcb *tp)
}
}
+static void
+tcp_recv_udp_tunneled_packet(struct mbuf *m, int off, struct inpcb *inp,
+ const struct sockaddr *sa, void *ctx)
+{
+ struct ip *iph;
+#ifdef INET6
+ struct ip6_hdr *ip6;
+#endif
+ struct udphdr *uh;
+ struct tcphdr *th;
+ int thlen;
+ uint16_t port;
+
+ TCPSTAT_INC(tcps_tunneled_pkts);
+ if ((m->m_flags & M_PKTHDR) == 0) {
+ /* Can't handle one that is not a pkt hdr */
+ TCPSTAT_INC(tcps_tunneled_errs);
+ goto out;
+ }
+ thlen = sizeof(struct tcphdr);
+ if (m->m_len < off + sizeof(struct udphdr) + thlen &&
+ (m = m_pullup(m, off + sizeof(struct udphdr) + thlen)) == NULL) {
+ TCPSTAT_INC(tcps_tunneled_errs);
+ goto out;
+ }
+ iph = mtod(m, struct ip *);
+ uh = (struct udphdr *)((caddr_t)iph + off);
+ th = (struct tcphdr *)(uh + 1);
+ thlen = th->th_off << 2;
+ if (m->m_len < off + sizeof(struct udphdr) + thlen) {
+ m = m_pullup(m, off + sizeof(struct udphdr) + thlen);
+ if (m == NULL) {
+ TCPSTAT_INC(tcps_tunneled_errs);
+ goto out;
+ } else {
+ iph = mtod(m, struct ip *);
+ uh = (struct udphdr *)((caddr_t)iph + off);
+ th = (struct tcphdr *)(uh + 1);
+ }
+ }
+ m->m_pkthdr.tcp_tun_port = port = uh->uh_sport;
+ bcopy(th, uh, m->m_len - off);
+ m->m_len -= sizeof(struct udphdr);
+ m->m_pkthdr.len -= sizeof(struct udphdr);
+ /*
+ * We use the same algorithm for
+ * both UDP and TCP for c-sum. So
+ * the code in tcp_input will skip
+ * the checksum. So we do nothing
+ * with the flag (m->m_pkthdr.csum_flags).
+ */
+ switch (iph->ip_v) {
+#ifdef INET
+ case IPVERSION:
+ iph->ip_len = htons(ntohs(iph->ip_len) - sizeof(struct udphdr));
+ tcp_input_with_port(&m, &off, IPPROTO_TCP, port);
+ break;
+#endif
+#ifdef INET6
+ case IPV6_VERSION >> 4:
+ ip6 = mtod(m, struct ip6_hdr *);
+ ip6->ip6_plen = htons(ntohs(ip6->ip6_plen) - sizeof(struct udphdr));
+ tcp6_input_with_port(&m, &off, IPPROTO_TCP, port);
+ break;
+#endif
+ default:
+ goto out;
+ break;
+ }
+ return;
+out:
+ m_freem(m);
+}
+
static int
sysctl_net_inet_default_tcp_functions(SYSCTL_HANDLER_ARGS)
{
@@ -598,6 +674,183 @@ SYSCTL_PROC(_net_inet_tcp, OID_AUTO, functions_available,
NULL, 0, sysctl_net_inet_list_available, "A",
"list available TCP Function sets");
+VNET_DEFINE(int, tcp_udp_tunneling_port) = TCP_TUNNELING_PORT_DEFAULT;
+
+#ifdef INET
+VNET_DEFINE(struct socket *, udp4_tun_socket) = NULL;
+#define V_udp4_tun_socket VNET(udp4_tun_socket)
+#endif
+#ifdef INET6
+VNET_DEFINE(struct socket *, udp6_tun_socket) = NULL;
+#define V_udp6_tun_socket VNET(udp6_tun_socket)
+#endif
+
+static void
+tcp_over_udp_stop(void)
+{
+ /*
+ * This function assumes sysctl caller holds inp_rinfo_lock()
+ * for writing!
+ */
+#ifdef INET
+ if (V_udp4_tun_socket != NULL) {
+ soclose(V_udp4_tun_socket);
+ V_udp4_tun_socket = NULL;
+ }
+#endif
+#ifdef INET6
+ if (V_udp6_tun_socket != NULL) {
+ soclose(V_udp6_tun_socket);
+ V_udp6_tun_socket = NULL;
+ }
+#endif
+}
+
+static int
+tcp_over_udp_start(void)
+{
+ uint16_t port;
+ int ret;
+#ifdef INET
+ struct sockaddr_in sin;
+#endif
+#ifdef INET6
+ struct sockaddr_in6 sin6;
+#endif
+ /*
+ * This function assumes sysctl caller holds inp_info_rlock()
+ * for writing!
+ */
+ port = V_tcp_udp_tunneling_port;
+ if (ntohs(port) == 0) {
+ /* Must have a port set */
+ return (EINVAL);
+ }
+#ifdef INET
+ if (V_udp4_tun_socket != NULL) {
+ /* Already running -- must stop first */
+ return (EALREADY);
+ }
+#endif
+#ifdef INET6
+ if (V_udp6_tun_socket != NULL) {
+ /* Already running -- must stop first */
+ return (EALREADY);
+ }
+#endif
+#ifdef INET
+ if ((ret = socreate(PF_INET, &V_udp4_tun_socket,
+ SOCK_DGRAM, IPPROTO_UDP,
+ curthread->td_ucred, curthread))) {
+ tcp_over_udp_stop();
+ return (ret);
+ }
+ /* Call the special UDP hook. */
+ if ((ret = udp_set_kernel_tunneling(V_udp4_tun_socket,
+ tcp_recv_udp_tunneled_packet,
+ tcp_ctlinput_viaudp,
+ NULL))) {
+ tcp_over_udp_stop();
+ return (ret);
+ }
+ /* Ok, we have a socket, bind it to the port. */
+ memset(&sin, 0, sizeof(struct sockaddr_in));
+ sin.sin_len = sizeof(struct sockaddr_in);
+ sin.sin_family = AF_INET;
+ sin.sin_port = htons(port);
+ if ((ret = sobind(V_udp4_tun_socket,
+ (struct sockaddr *)&sin, curthread))) {
+ tcp_over_udp_stop();
+ return (ret);
+ }
+#endif
+#ifdef INET6
+ if ((ret = socreate(PF_INET6, &V_udp6_tun_socket,
+ SOCK_DGRAM, IPPROTO_UDP,
+ curthread->td_ucred, curthread))) {
+ tcp_over_udp_stop();
+ return (ret);
+ }
+ /* Call the special UDP hook. */
+ if ((ret = udp_set_kernel_tunneling(V_udp6_tun_socket,
+ tcp_recv_udp_tunneled_packet,
+ tcp6_ctlinput_viaudp,
+ NULL))) {
+ tcp_over_udp_stop();
+ return (ret);
+ }
+ /* Ok, we have a socket, bind it to the port. */
+ memset(&sin6, 0, sizeof(struct sockaddr_in6));
+ sin6.sin6_len = sizeof(struct sockaddr_in6);
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_port = htons(port);
+ if ((ret = sobind(V_udp6_tun_socket,
+ (struct sockaddr *)&sin6, curthread))) {
+ tcp_over_udp_stop();
+ return (ret);
+ }
+#endif
+ return (0);
+}
+
+static int
+sysctl_net_inet_tcp_udp_tunneling_port_check(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ uint32_t old, new;
+
+ old = V_tcp_udp_tunneling_port;
+ new = old;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if ((error == 0) &&
+ (req->newptr != NULL)) {
+ if ((new < TCP_TUNNELING_PORT_MIN) ||
+ (new > TCP_TUNNELING_PORT_MAX)) {
+ error = EINVAL;
+ } else {
+ V_tcp_udp_tunneling_port = new;
+ if (old != 0) {
+ tcp_over_udp_stop();
+ }
+ if (new != 0) {
+ error = tcp_over_udp_start();
+ }
+ }
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_port,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &VNET_NAME(tcp_udp_tunneling_port),
+ 0, &sysctl_net_inet_tcp_udp_tunneling_port_check, "IU",
+ "Tunneling port for tcp over udp");
+
+VNET_DEFINE(int, tcp_udp_tunneling_overhead) = TCP_TUNNELING_OVERHEAD_DEFAULT;
+
+static int
+sysctl_net_inet_tcp_udp_tunneling_overhead_check(SYSCTL_HANDLER_ARGS)
+{
+ int error, new;
+
+ new = V_tcp_udp_tunneling_overhead;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ if ((new < TCP_TUNNELING_OVERHEAD_MIN) ||
+ (new > TCP_TUNNELING_OVERHEAD_MAX))
+ error = EINVAL;
+ else
+ V_tcp_udp_tunneling_overhead = new;
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, udp_tunneling_overhead,
+ CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &VNET_NAME(tcp_udp_tunneling_overhead),
+ 0, &sysctl_net_inet_tcp_udp_tunneling_overhead_check, "IU",
+ "MSS reduction when using tcp over udp");
+
/*
* Exports one (struct tcp_function_info) for each alias/name.
*/
@@ -1305,7 +1558,7 @@ tcp_fini(void *xtp)
* of the tcpcb each time to conserve mbufs.
*/
void
-tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
+tcpip_fillheaders(struct inpcb *inp, uint16_t port, void *ip_ptr, void *tcp_ptr)
{
struct tcphdr *th = (struct tcphdr *)tcp_ptr;
@@ -1320,7 +1573,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
(inp->inp_flow & IPV6_FLOWINFO_MASK);
ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
(IPV6_VERSION & IPV6_VERSION_MASK);
- ip6->ip6_nxt = IPPROTO_TCP;
+ if (port == 0)
+ ip6->ip6_nxt = IPPROTO_TCP;
+ else
+ ip6->ip6_nxt = IPPROTO_UDP;
ip6->ip6_plen = htons(sizeof(struct tcphdr));
ip6->ip6_src = inp->in6p_laddr;
ip6->ip6_dst = inp->in6p_faddr;
@@ -1342,7 +1598,10 @@ tcpip_fillheaders(struct inpcb *inp, void *ip_ptr, void *tcp_ptr)
ip->ip_off = 0;
ip->ip_ttl = inp->inp_ip_ttl;
ip->ip_sum = 0;
- ip->ip_p = IPPROTO_TCP;
+ if (port == 0)
+ ip->ip_p = IPPROTO_TCP;
+ else
+ ip->ip_p = IPPROTO_UDP;
ip->ip_src = inp->inp_laddr;
ip->ip_dst = inp->inp_faddr;
}
@@ -1372,7 +1631,7 @@ tcpip_maketemplate(struct inpcb *inp)
t = malloc(sizeof(*t), M_TEMP, M_NOWAIT);
if (t == NULL)
return (NULL);
- tcpip_fillheaders(inp, (void *)&t->tt_ipgen, (void *)&t->tt_t);
+ tcpip_fillheaders(inp, 0, (void *)&t->tt_ipgen, (void *)&t->tt_t);
return (t);
}
@@ -1398,14 +1657,16 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
struct inpcb *inp;
struct ip *ip;
struct mbuf *optm;
+ struct udphdr *uh = NULL;
struct tcphdr *nth;
u_char *optp;
#ifdef INET6
struct ip6_hdr *ip6;
int isipv6;
#endif /* INET6 */
- int optlen, tlen, win;
+ int optlen, tlen, win, ulen;
bool incl_opts;
+ uint16_t port;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
NET_EPOCH_ASSERT();
@@ -1423,6 +1684,19 @@ tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
} else
inp = NULL;
+ if (m != NULL) {
+#ifdef INET6
+ if (isipv6 && ip6 && (ip6->ip6_nxt == IPPROTO_UDP))
+ port = m->m_pkthdr.tcp_tun_port;
+ else
*** 1128 LINES SKIPPED ***
More information about the dev-commits-src-all
mailing list