git: 28d0a740dd9a - main - ktls: auto-disable ifnet (inline hw) kTLS

Andrew Gallatin gallatin at FreeBSD.org
Tue Jul 6 14:29:29 UTC 2021


The branch main has been updated by gallatin:

URL: https://cgit.FreeBSD.org/src/commit/?id=28d0a740dd9a67e4a4fa9fda5bb39b5963316f35

commit 28d0a740dd9a67e4a4fa9fda5bb39b5963316f35
Author:     Andrew Gallatin <gallatin at FreeBSD.org>
AuthorDate: 2021-07-06 14:17:33 +0000
Commit:     Andrew Gallatin <gallatin at FreeBSD.org>
CommitDate: 2021-07-06 14:28:32 +0000

    ktls: auto-disable ifnet (inline hw) kTLS
    
    Ifnet (inline) hw kTLS NICs typically keep state within
    a TLS record, so that when transmitting in-order,
    they can continue encryption on each segment sent without
    DMA'ing extra state from the host.
    
    This breaks down when transmits are out of order (eg,
    TCP retransmits).  In this case, the NIC must re-DMA
    the entire TLS record up to and including the segment
    being retransmitted.  This means that when re-transmitting
    the last 1448 byte segment of a TLS record, the NIC will
    have to re-DMA the entire 16KB TLS record. This can lead
    to the NIC running out of PCIe bus bandwidth well before
    it saturates the network link if a lot of TCP connections have
    a high retransmoit rate.
    
    This change introduces a new sysctl (kern.ipc.tls.ifnet_max_rexmit_pct),
    where TCP connections with higher retransmit rate will be
    switched to SW kTLS so as to conserve PCIe bandwidth.
    
    Reviewed by:    hselasky, markj, rrs
    Sponsored by:   Netflix
    Differential Revision:  https://reviews.freebsd.org/D30908
---
 sys/kern/uipc_ktls.c  | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++
 sys/netinet/tcp_var.h |  13 +++++-
 sys/sys/ktls.h        |  15 ++++++-
 3 files changed, 133 insertions(+), 2 deletions(-)

diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
index 7e87e7c740e3..88e29157289d 100644
--- a/sys/kern/uipc_ktls.c
+++ b/sys/kern/uipc_ktls.c
@@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$");
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
+#include "opt_kern_tls.h"
 #include "opt_ratelimit.h"
 #include "opt_rss.h"
 
@@ -121,6 +122,11 @@ SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads, CTLFLAG_RD,
     &ktls_number_threads, 0,
     "Number of TLS threads in thread-pool");
 
+unsigned int ktls_ifnet_max_rexmit_pct = 2;
+SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
+    &ktls_ifnet_max_rexmit_pct, 2,
+    "Max percent bytes retransmitted before ifnet TLS is disabled");
+
 static bool ktls_offload_enable;
 SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
     &ktls_offload_enable, 0,
@@ -184,6 +190,14 @@ static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
 SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed, CTLFLAG_RD,
     &ktls_switch_failed, "TLS sessions unable to switch between SW and ifnet");
 
+static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
+SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed, CTLFLAG_RD,
+    &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from ifnet");
+
+static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
+SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok, CTLFLAG_RD,
+    &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from ifnet");
+
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "Software TLS session stats");
 SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
@@ -2187,3 +2201,96 @@ ktls_work_thread(void *ctx)
 		}
 	}
 }
+
+static void
+ktls_disable_ifnet_help(void *context, int pending __unused)
+{
+	struct ktls_session *tls;
+	struct inpcb *inp;
+	struct tcpcb *tp;
+	struct socket *so;
+	int err;
+
+	tls = context;
+	inp = tls->inp;
+	if (inp == NULL)
+		return;
+	INP_WLOCK(inp);
+	so = inp->inp_socket;
+	MPASS(so != NULL);
+	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+	    (inp->inp_flags2 & INP_FREED)) {
+		goto out;
+	}
+
+	if (so->so_snd.sb_tls_info != NULL)
+		err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
+	else
+		err = ENXIO;
+	if (err == 0) {
+		counter_u64_add(ktls_ifnet_disable_ok, 1);
+		/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
+		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
+		    (inp->inp_flags2 & INP_FREED) == 0 &&
+		    (tp = intotcpcb(inp)) != NULL &&
+		    tp->t_fb->tfb_hwtls_change != NULL)
+			(*tp->t_fb->tfb_hwtls_change)(tp, 0);
+	} else {
+		counter_u64_add(ktls_ifnet_disable_fail, 1);
+	}
+
+out:
+	SOCK_LOCK(so);
+	sorele(so);
+	if (!in_pcbrele_wlocked(inp))
+		INP_WUNLOCK(inp);
+	ktls_free(tls);
+}
+
+/*
+ * Called when re-transmits are becoming a substantial portion of the
+ * sends on this connection.  When this happens, we transition the
+ * connection to software TLS.  This is needed because most inline TLS
+ * NICs keep crypto state only for in-order transmits.  This means
+ * that to handle a TCP rexmit (which is out-of-order), the NIC must
+ * re-DMA the entire TLS record up to and including the current
+ * segment.  This means that when re-transmitting the last ~1448 byte
+ * segment of a 16KB TLS record, we could wind up re-DMA'ing an order
+ * of magnitude more data than we are sending.  This can cause the
+ * PCIe link to saturate well before the network, which can cause
+ * output drops, and a general loss of capacity.
+ */
+void
+ktls_disable_ifnet(void *arg)
+{
+	struct tcpcb *tp;
+	struct inpcb *inp;
+	struct socket *so;
+	struct ktls_session *tls;
+
+	tp = arg;
+	inp = tp->t_inpcb;
+	INP_WLOCK_ASSERT(inp);
+	so = inp->inp_socket;
+	SOCK_LOCK(so);
+	tls = so->so_snd.sb_tls_info;
+	if (tls->disable_ifnet_pending) {
+		SOCK_UNLOCK(so);
+		return;
+	}
+
+	/*
+	 * note that disable_ifnet_pending is never cleared; disabling
+	 * ifnet can only be done once per session, so we never want
+	 * to do it again
+	 */
+
+	(void)ktls_hold(tls);
+	in_pcbref(inp);
+	soref(so);
+	tls->disable_ifnet_pending = true;
+	tls->inp = inp;
+	SOCK_UNLOCK(so);
+	TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
+	(void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
+}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index dd30f89896d2..3f72a821e71f 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -39,8 +39,10 @@
 #include <netinet/tcp_fsm.h>
 
 #ifdef _KERNEL
+#include "opt_kern_tls.h"
 #include <net/vnet.h>
 #include <sys/mbuf.h>
+#include <sys/ktls.h>
 #endif
 
 #define TCP_END_BYTE_INFO 8	/* Bytes that makeup the "end information array" */
@@ -1139,8 +1141,10 @@ tcp_fields_to_net(struct tcphdr *th)
 
 static inline void
 tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
-    uint8_t is_tlp, int hw_tls __unused)
+    uint8_t is_tlp, int hw_tls)
 {
+	uint64_t rexmit_percent;
+
 	if (is_tlp) {
 		tp->t_sndtlppack++;
 		tp->t_sndtlpbyte += len;
@@ -1150,6 +1154,13 @@ tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
 		tp->t_snd_rxt_bytes += len;
 	else
 		tp->t_sndbytes += len;
+
+	if (hw_tls && is_rxt) {
+		rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL * (tp->t_snd_rxt_bytes + tp->t_sndbytes));
+		if (rexmit_percent > ktls_ifnet_max_rexmit_pct)
+			ktls_disable_ifnet(tp);
+	}
+
 }
 #endif /* _KERNEL */
 
diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h
index b28c94965c97..7fd8831878b4 100644
--- a/sys/sys/ktls.h
+++ b/sys/sys/ktls.h
@@ -189,10 +189,12 @@ struct ktls_session {
 	u_int	wq_index;
 	volatile u_int refcount;
 	int mode;
-	bool reset_pending;
 
 	struct task reset_tag_task;
+	struct task disable_ifnet_task;
 	struct inpcb *inp;
+	bool reset_pending;
+	bool disable_ifnet_pending;
 } __aligned(CACHE_LINE_SIZE);
 
 void ktls_check_rx(struct sockbuf *sb);
@@ -231,5 +233,16 @@ ktls_free(struct ktls_session *tls)
 		ktls_destroy(tls);
 }
 
+#ifdef KERN_TLS
+extern unsigned int ktls_ifnet_max_rexmit_pct;
+void ktls_disable_ifnet(void *arg);
+#else
+#define ktls_ifnet_max_rexmit_pct 1
+inline void
+ktls_disable_ifnet(void *arg __unused)
+{
+}
+#endif
+
 #endif /* !_KERNEL */
 #endif /* !_SYS_KTLS_H_ */


More information about the dev-commits-src-main mailing list