git: 28d0a740dd9a - main - ktls: auto-disable ifnet (inline hw) kTLS

Andrew Gallatin gallatin at cs.duke.edu
Wed Jul 7 14:49:20 UTC 2021


On 7/7/21 7:00 AM, Mateusz Guzik wrote:
> This breaks NOIP kernel builds.

Thanks for pointing this out; it should be fixed in 4150a5a87ed.

> On 7/6/21, Andrew Gallatin <gallatin at freebsd.org> wrote:
>> The branch main has been updated by gallatin:
>>
>> URL:
>> https://urldefense.com/v3/__https://cgit.FreeBSD.org/src/commit/?id=28d0a740dd9a67e4a4fa9fda5bb39b5963316f35__;!!OToaGQ!_d4pkzhNaWowgMsR4-c1qtLXr1H9SC_kBWNDvXvVV15lerMV4elltm-V6OZj3iET-A$
>>
>> commit 28d0a740dd9a67e4a4fa9fda5bb39b5963316f35
>> Author:     Andrew Gallatin <gallatin at FreeBSD.org>
>> AuthorDate: 2021-07-06 14:17:33 +0000
>> Commit:     Andrew Gallatin <gallatin at FreeBSD.org>
>> CommitDate: 2021-07-06 14:28:32 +0000
>>
>>      ktls: auto-disable ifnet (inline hw) kTLS
>>
>>      Ifnet (inline) hw kTLS NICs typically keep state within
>>      a TLS record, so that when transmitting in-order,
>>      they can continue encryption on each segment sent without
>>      DMA'ing extra state from the host.
>>
>>      This breaks down when transmits are out of order (eg,
>>      TCP retransmits).  In this case, the NIC must re-DMA
>>      the entire TLS record up to and including the segment
>>      being retransmitted.  This means that when re-transmitting
>>      the last 1448 byte segment of a TLS record, the NIC will
>>      have to re-DMA the entire 16KB TLS record. This can lead
>>      to the NIC running out of PCIe bus bandwidth well before
>>      it saturates the network link if a lot of TCP connections have
>>      a high retransmit rate.
>>
>>      This change introduces a new sysctl
>> (kern.ipc.tls.ifnet_max_rexmit_pct),
>>      where TCP connections with a higher retransmit rate will be
>>      switched to SW kTLS so as to conserve PCIe bandwidth.
>>
>>      Reviewed by:    hselasky, markj, rrs
>>      Sponsored by:   Netflix
>>      Differential Revision:  https://urldefense.com/v3/__https://reviews.freebsd.org/D30908__;!!OToaGQ!_d4pkzhNaWowgMsR4-c1qtLXr1H9SC_kBWNDvXvVV15lerMV4elltm-V6OYOYLaV0A$
>> ---
>>   sys/kern/uipc_ktls.c  | 107
>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>>   sys/netinet/tcp_var.h |  13 +++++-
>>   sys/sys/ktls.h        |  15 ++++++-
>>   3 files changed, 133 insertions(+), 2 deletions(-)
>>
>> diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
>> index 7e87e7c740e3..88e29157289d 100644
>> --- a/sys/kern/uipc_ktls.c
>> +++ b/sys/kern/uipc_ktls.c
>> @@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$");
>>
>>   #include "opt_inet.h"
>>   #include "opt_inet6.h"
>> +#include "opt_kern_tls.h"
>>   #include "opt_ratelimit.h"
>>   #include "opt_rss.h"
>>
>> @@ -121,6 +122,11 @@ SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads,
>> CTLFLAG_RD,
>>       &ktls_number_threads, 0,
>>       "Number of TLS threads in thread-pool");
>>
>> +unsigned int ktls_ifnet_max_rexmit_pct = 2;
>> +SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
>> +    &ktls_ifnet_max_rexmit_pct, 2,
>> +    "Max percent bytes retransmitted before ifnet TLS is disabled");
>> +
>>   static bool ktls_offload_enable;
>>   SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
>>       &ktls_offload_enable, 0,
>> @@ -184,6 +190,14 @@ static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
>>   SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed,
>> CTLFLAG_RD,
>>       &ktls_switch_failed, "TLS sessions unable to switch between SW and
>> ifnet");
>>
>> +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
>> +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed,
>> CTLFLAG_RD,
>> +    &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from
>> ifnet");
>> +
>> +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
>> +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok,
>> CTLFLAG_RD,
>> +    &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from
>> ifnet");
>> +
>>   SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
>>       "Software TLS session stats");
>>   SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE,
>> 0,
>> @@ -2187,3 +2201,96 @@ ktls_work_thread(void *ctx)
>>   		}
>>   	}
>>   }
>> +
>> +static void
>> +ktls_disable_ifnet_help(void *context, int pending __unused)
>> +{
>> +	struct ktls_session *tls;
>> +	struct inpcb *inp;
>> +	struct tcpcb *tp;
>> +	struct socket *so;
>> +	int err;
>> +
>> +	tls = context;
>> +	inp = tls->inp;
>> +	if (inp == NULL)
>> +		return;
>> +	INP_WLOCK(inp);
>> +	so = inp->inp_socket;
>> +	MPASS(so != NULL);
>> +	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
>> +	    (inp->inp_flags2 & INP_FREED)) {
>> +		goto out;
>> +	}
>> +
>> +	if (so->so_snd.sb_tls_info != NULL)
>> +		err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
>> +	else
>> +		err = ENXIO;
>> +	if (err == 0) {
>> +		counter_u64_add(ktls_ifnet_disable_ok, 1);
>> +		/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
>> +		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
>> +		    (inp->inp_flags2 & INP_FREED) == 0 &&
>> +		    (tp = intotcpcb(inp)) != NULL &&
>> +		    tp->t_fb->tfb_hwtls_change != NULL)
>> +			(*tp->t_fb->tfb_hwtls_change)(tp, 0);
>> +	} else {
>> +		counter_u64_add(ktls_ifnet_disable_fail, 1);
>> +	}
>> +
>> +out:
>> +	SOCK_LOCK(so);
>> +	sorele(so);
>> +	if (!in_pcbrele_wlocked(inp))
>> +		INP_WUNLOCK(inp);
>> +	ktls_free(tls);
>> +}
>> +
>> +/*
>> + * Called when re-transmits are becoming a substantial portion of the
>> + * sends on this connection.  When this happens, we transition the
>> + * connection to software TLS.  This is needed because most inline TLS
>> + * NICs keep crypto state only for in-order transmits.  This means
>> + * that to handle a TCP rexmit (which is out-of-order), the NIC must
>> + * re-DMA the entire TLS record up to and including the current
>> + * segment.  This means that when re-transmitting the last ~1448 byte
>> + * segment of a 16KB TLS record, we could wind up re-DMA'ing an order
>> + * of magnitude more data than we are sending.  This can cause the
>> + * PCIe link to saturate well before the network, which can cause
>> + * output drops, and a general loss of capacity.
>> + */
>> +void
>> +ktls_disable_ifnet(void *arg)
>> +{
>> +	struct tcpcb *tp;
>> +	struct inpcb *inp;
>> +	struct socket *so;
>> +	struct ktls_session *tls;
>> +
>> +	tp = arg;
>> +	inp = tp->t_inpcb;
>> +	INP_WLOCK_ASSERT(inp);
>> +	so = inp->inp_socket;
>> +	SOCK_LOCK(so);
>> +	tls = so->so_snd.sb_tls_info;
>> +	if (tls->disable_ifnet_pending) {
>> +		SOCK_UNLOCK(so);
>> +		return;
>> +	}
>> +
>> +	/*
>> +	 * note that disable_ifnet_pending is never cleared; disabling
>> +	 * ifnet can only be done once per session, so we never want
>> +	 * to do it again
>> +	 */
>> +
>> +	(void)ktls_hold(tls);
>> +	in_pcbref(inp);
>> +	soref(so);
>> +	tls->disable_ifnet_pending = true;
>> +	tls->inp = inp;
>> +	SOCK_UNLOCK(so);
>> +	TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
>> +	(void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
>> +}
>> diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
>> index dd30f89896d2..3f72a821e71f 100644
>> --- a/sys/netinet/tcp_var.h
>> +++ b/sys/netinet/tcp_var.h
>> @@ -39,8 +39,10 @@
>>   #include <netinet/tcp_fsm.h>
>>
>>   #ifdef _KERNEL
>> +#include "opt_kern_tls.h"
>>   #include <net/vnet.h>
>>   #include <sys/mbuf.h>
>> +#include <sys/ktls.h>
>>   #endif
>>
>>   #define TCP_END_BYTE_INFO 8	/* Bytes that makeup the "end information
>> array" */
>> @@ -1139,8 +1141,10 @@ tcp_fields_to_net(struct tcphdr *th)
>>
>>   static inline void
>>   tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
>> -    uint8_t is_tlp, int hw_tls __unused)
>> +    uint8_t is_tlp, int hw_tls)
>>   {
>> +	uint64_t rexmit_percent;
>> +
>>   	if (is_tlp) {
>>   		tp->t_sndtlppack++;
>>   		tp->t_sndtlpbyte += len;
>> @@ -1150,6 +1154,13 @@ tcp_account_for_send(struct tcpcb *tp, uint32_t len,
>> uint8_t is_rxt,
>>   		tp->t_snd_rxt_bytes += len;
>>   	else
>>   		tp->t_sndbytes += len;
>> +
>> +	if (hw_tls && is_rxt) {
>> +		rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL *
>> (tp->t_snd_rxt_bytes + tp->t_sndbytes));
>> +		if (rexmit_percent > ktls_ifnet_max_rexmit_pct)
>> +			ktls_disable_ifnet(tp);
>> +	}
>> +
>>   }
>>   #endif /* _KERNEL */
>>
>> diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h
>> index b28c94965c97..7fd8831878b4 100644
>> --- a/sys/sys/ktls.h
>> +++ b/sys/sys/ktls.h
>> @@ -189,10 +189,12 @@ struct ktls_session {
>>   	u_int	wq_index;
>>   	volatile u_int refcount;
>>   	int mode;
>> -	bool reset_pending;
>>
>>   	struct task reset_tag_task;
>> +	struct task disable_ifnet_task;
>>   	struct inpcb *inp;
>> +	bool reset_pending;
>> +	bool disable_ifnet_pending;
>>   } __aligned(CACHE_LINE_SIZE);
>>
>>   void ktls_check_rx(struct sockbuf *sb);
>> @@ -231,5 +233,16 @@ ktls_free(struct ktls_session *tls)
>>   		ktls_destroy(tls);
>>   }
>>
>> +#ifdef KERN_TLS
>> +extern unsigned int ktls_ifnet_max_rexmit_pct;
>> +void ktls_disable_ifnet(void *arg);
>> +#else
>> +#define ktls_ifnet_max_rexmit_pct 1
>> +inline void
>> +ktls_disable_ifnet(void *arg __unused)
>> +{
>> +}
>> +#endif
>> +
>>   #endif /* !_KERNEL */
>>   #endif /* !_SYS_KTLS_H_ */
>>
> 
> 



More information about the dev-commits-src-main mailing list