git: 28d0a740dd9a - main - ktls: auto-disable ifnet (inline hw) kTLS

Mateusz Guzik mjguzik at gmail.com
Wed Jul 7 11:00:42 UTC 2021


This breaks NOIP kernel builds.

On 7/6/21, Andrew Gallatin <gallatin at freebsd.org> wrote:
> The branch main has been updated by gallatin:
>
> URL:
> https://cgit.FreeBSD.org/src/commit/?id=28d0a740dd9a67e4a4fa9fda5bb39b5963316f35
>
> commit 28d0a740dd9a67e4a4fa9fda5bb39b5963316f35
> Author:     Andrew Gallatin <gallatin at FreeBSD.org>
> AuthorDate: 2021-07-06 14:17:33 +0000
> Commit:     Andrew Gallatin <gallatin at FreeBSD.org>
> CommitDate: 2021-07-06 14:28:32 +0000
>
>     ktls: auto-disable ifnet (inline hw) kTLS
>
>     Ifnet (inline) hw kTLS NICs typically keep state within
>     a TLS record, so that when transmitting in-order,
>     they can continue encryption on each segment sent without
>     DMA'ing extra state from the host.
>
>     This breaks down when transmits are out of order (e.g.,
>     TCP retransmits).  In this case, the NIC must re-DMA
>     the entire TLS record up to and including the segment
>     being retransmitted.  This means that when re-transmitting
>     the last 1448 byte segment of a TLS record, the NIC will
>     have to re-DMA the entire 16KB TLS record. This can lead
>     to the NIC running out of PCIe bus bandwidth well before
>     it saturates the network link if a lot of TCP connections have
>     a high retransmit rate.
>
>     This change introduces a new sysctl
> (kern.ipc.tls.ifnet_max_rexmit_pct),
>     where TCP connections with higher retransmit rate will be
>     switched to SW kTLS so as to conserve PCIe bandwidth.
>
>     Reviewed by:    hselasky, markj, rrs
>     Sponsored by:   Netflix
>     Differential Revision:  https://reviews.freebsd.org/D30908
> ---
>  sys/kern/uipc_ktls.c  | 107
> ++++++++++++++++++++++++++++++++++++++++++++++++++
>  sys/netinet/tcp_var.h |  13 +++++-
>  sys/sys/ktls.h        |  15 ++++++-
>  3 files changed, 133 insertions(+), 2 deletions(-)
>
> diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
> index 7e87e7c740e3..88e29157289d 100644
> --- a/sys/kern/uipc_ktls.c
> +++ b/sys/kern/uipc_ktls.c
> @@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$");
>
>  #include "opt_inet.h"
>  #include "opt_inet6.h"
> +#include "opt_kern_tls.h"
>  #include "opt_ratelimit.h"
>  #include "opt_rss.h"
>
> @@ -121,6 +122,11 @@ SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads,
> CTLFLAG_RD,
>      &ktls_number_threads, 0,
>      "Number of TLS threads in thread-pool");
>
> +unsigned int ktls_ifnet_max_rexmit_pct = 2;
> +SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
> +    &ktls_ifnet_max_rexmit_pct, 2,
> +    "Max percent bytes retransmitted before ifnet TLS is disabled");
> +
>  static bool ktls_offload_enable;
>  SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
>      &ktls_offload_enable, 0,
> @@ -184,6 +190,14 @@ static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
>  SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed,
> CTLFLAG_RD,
>      &ktls_switch_failed, "TLS sessions unable to switch between SW and
> ifnet");
>
> +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
> +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed,
> CTLFLAG_RD,
> +    &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from
> ifnet");
> +
> +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
> +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok,
> CTLFLAG_RD,
> +    &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from
> ifnet");
> +
>  SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
>      "Software TLS session stats");
>  SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE,
> 0,
> @@ -2187,3 +2201,96 @@ ktls_work_thread(void *ctx)
>  		}
>  	}
>  }
> +
> +static void
> +ktls_disable_ifnet_help(void *context, int pending __unused)
> +{
> +	struct ktls_session *tls;
> +	struct inpcb *inp;
> +	struct tcpcb *tp;
> +	struct socket *so;
> +	int err;
> +
> +	tls = context;
> +	inp = tls->inp;
> +	if (inp == NULL)
> +		return;
> +	INP_WLOCK(inp);
> +	so = inp->inp_socket;
> +	MPASS(so != NULL);
> +	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
> +	    (inp->inp_flags2 & INP_FREED)) {
> +		goto out;
> +	}
> +
> +	if (so->so_snd.sb_tls_info != NULL)
> +		err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
> +	else
> +		err = ENXIO;
> +	if (err == 0) {
> +		counter_u64_add(ktls_ifnet_disable_ok, 1);
> +		/* ktls_set_tx_mode() drops inp wlock, so recheck flags */
> +		if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
> +		    (inp->inp_flags2 & INP_FREED) == 0 &&
> +		    (tp = intotcpcb(inp)) != NULL &&
> +		    tp->t_fb->tfb_hwtls_change != NULL)
> +			(*tp->t_fb->tfb_hwtls_change)(tp, 0);
> +	} else {
> +		counter_u64_add(ktls_ifnet_disable_fail, 1);
> +	}
> +
> +out:
> +	SOCK_LOCK(so);
> +	sorele(so);
> +	if (!in_pcbrele_wlocked(inp))
> +		INP_WUNLOCK(inp);
> +	ktls_free(tls);
> +}
> +
> +/*
> + * Called when re-transmits are becoming a substantial portion of the
> + * sends on this connection.  When this happens, we transition the
> + * connection to software TLS.  This is needed because most inline TLS
> + * NICs keep crypto state only for in-order transmits.  This means
> + * that to handle a TCP rexmit (which is out-of-order), the NIC must
> + * re-DMA the entire TLS record up to and including the current
> + * segment.  This means that when re-transmitting the last ~1448 byte
> + * segment of a 16KB TLS record, we could wind up re-DMA'ing an order
> + * of magnitude more data than we are sending.  This can cause the
> + * PCIe link to saturate well before the network, which can cause
> + * output drops, and a general loss of capacity.
> + */
> +void
> +ktls_disable_ifnet(void *arg)
> +{
> +	struct tcpcb *tp;
> +	struct inpcb *inp;
> +	struct socket *so;
> +	struct ktls_session *tls;
> +
> +	tp = arg;
> +	inp = tp->t_inpcb;
> +	INP_WLOCK_ASSERT(inp);
> +	so = inp->inp_socket;
> +	SOCK_LOCK(so);
> +	tls = so->so_snd.sb_tls_info;
> +	if (tls->disable_ifnet_pending) {
> +		SOCK_UNLOCK(so);
> +		return;
> +	}
> +
> +	/*
> +	 * note that disable_ifnet_pending is never cleared; disabling
> +	 * ifnet can only be done once per session, so we never want
> +	 * to do it again
> +	 */
> +
> +	(void)ktls_hold(tls);
> +	in_pcbref(inp);
> +	soref(so);
> +	tls->disable_ifnet_pending = true;
> +	tls->inp = inp;
> +	SOCK_UNLOCK(so);
> +	TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
> +	(void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
> +}
> diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
> index dd30f89896d2..3f72a821e71f 100644
> --- a/sys/netinet/tcp_var.h
> +++ b/sys/netinet/tcp_var.h
> @@ -39,8 +39,10 @@
>  #include <netinet/tcp_fsm.h>
>
>  #ifdef _KERNEL
> +#include "opt_kern_tls.h"
>  #include <net/vnet.h>
>  #include <sys/mbuf.h>
> +#include <sys/ktls.h>
>  #endif
>
>  #define TCP_END_BYTE_INFO 8	/* Bytes that makeup the "end information
> array" */
> @@ -1139,8 +1141,10 @@ tcp_fields_to_net(struct tcphdr *th)
>
>  static inline void
>  tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
> -    uint8_t is_tlp, int hw_tls __unused)
> +    uint8_t is_tlp, int hw_tls)
>  {
> +	uint64_t rexmit_percent;
> +
>  	if (is_tlp) {
>  		tp->t_sndtlppack++;
>  		tp->t_sndtlpbyte += len;
> @@ -1150,6 +1154,13 @@ tcp_account_for_send(struct tcpcb *tp, uint32_t len,
> uint8_t is_rxt,
>  		tp->t_snd_rxt_bytes += len;
>  	else
>  		tp->t_sndbytes += len;
> +
> +	if (hw_tls && is_rxt) {
> +		rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL *
> (tp->t_snd_rxt_bytes + tp->t_sndbytes));
> +		if (rexmit_percent > ktls_ifnet_max_rexmit_pct)
> +			ktls_disable_ifnet(tp);
> +	}
> +
>  }
>  #endif /* _KERNEL */
>
> diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h
> index b28c94965c97..7fd8831878b4 100644
> --- a/sys/sys/ktls.h
> +++ b/sys/sys/ktls.h
> @@ -189,10 +189,12 @@ struct ktls_session {
>  	u_int	wq_index;
>  	volatile u_int refcount;
>  	int mode;
> -	bool reset_pending;
>
>  	struct task reset_tag_task;
> +	struct task disable_ifnet_task;
>  	struct inpcb *inp;
> +	bool reset_pending;
> +	bool disable_ifnet_pending;
>  } __aligned(CACHE_LINE_SIZE);
>
>  void ktls_check_rx(struct sockbuf *sb);
> @@ -231,5 +233,16 @@ ktls_free(struct ktls_session *tls)
>  		ktls_destroy(tls);
>  }
>
> +#ifdef KERN_TLS
> +extern unsigned int ktls_ifnet_max_rexmit_pct;
> +void ktls_disable_ifnet(void *arg);
> +#else
> +#define ktls_ifnet_max_rexmit_pct 1
> +inline void
> +ktls_disable_ifnet(void *arg __unused)
> +{
> +}
> +#endif
> +
>  #endif /* !_KERNEL */
>  #endif /* !_SYS_KTLS_H_ */
>


-- 
Mateusz Guzik <mjguzik gmail.com>


More information about the dev-commits-src-all mailing list