git: 28d0a740dd9a - main - ktls: auto-disable ifnet (inline hw) kTLS
Mateusz Guzik
mjguzik at gmail.com
Wed Jul 7 11:00:42 UTC 2021
This breaks NOIP kernel builds.
On 7/6/21, Andrew Gallatin <gallatin at freebsd.org> wrote:
> The branch main has been updated by gallatin:
>
> URL:
> https://cgit.FreeBSD.org/src/commit/?id=28d0a740dd9a67e4a4fa9fda5bb39b5963316f35
>
> commit 28d0a740dd9a67e4a4fa9fda5bb39b5963316f35
> Author: Andrew Gallatin <gallatin at FreeBSD.org>
> AuthorDate: 2021-07-06 14:17:33 +0000
> Commit: Andrew Gallatin <gallatin at FreeBSD.org>
> CommitDate: 2021-07-06 14:28:32 +0000
>
> ktls: auto-disable ifnet (inline hw) kTLS
>
> Ifnet (inline) hw kTLS NICs typically keep state within
> a TLS record, so that when transmitting in-order,
> they can continue encryption on each segment sent without
> DMA'ing extra state from the host.
>
> This breaks down when transmits are out of order (e.g.,
> TCP retransmits). In this case, the NIC must re-DMA
> the entire TLS record up to and including the segment
> being retransmitted. This means that when re-transmitting
> the last 1448 byte segment of a TLS record, the NIC will
> have to re-DMA the entire 16KB TLS record. This can lead
> to the NIC running out of PCIe bus bandwidth well before
> it saturates the network link if a lot of TCP connections have
> a high retransmit rate.
>
> This change introduces a new sysctl
> (kern.ipc.tls.ifnet_max_rexmit_pct),
> where TCP connections with higher retransmit rate will be
> switched to SW kTLS so as to conserve PCIe bandwidth.
>
> Reviewed by: hselasky, markj, rrs
> Sponsored by: Netflix
> Differential Revision: https://reviews.freebsd.org/D30908
> ---
> sys/kern/uipc_ktls.c | 107
> ++++++++++++++++++++++++++++++++++++++++++++++++++
> sys/netinet/tcp_var.h | 13 +++++-
> sys/sys/ktls.h | 15 ++++++-
> 3 files changed, 133 insertions(+), 2 deletions(-)
>
> diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
> index 7e87e7c740e3..88e29157289d 100644
> --- a/sys/kern/uipc_ktls.c
> +++ b/sys/kern/uipc_ktls.c
> @@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$");
>
> #include "opt_inet.h"
> #include "opt_inet6.h"
> +#include "opt_kern_tls.h"
> #include "opt_ratelimit.h"
> #include "opt_rss.h"
>
> @@ -121,6 +122,11 @@ SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads,
> CTLFLAG_RD,
> &ktls_number_threads, 0,
> "Number of TLS threads in thread-pool");
>
> +unsigned int ktls_ifnet_max_rexmit_pct = 2;
> +SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
> + &ktls_ifnet_max_rexmit_pct, 2,
> + "Max percent bytes retransmitted before ifnet TLS is disabled");
> +
> static bool ktls_offload_enable;
> SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
> &ktls_offload_enable, 0,
> @@ -184,6 +190,14 @@ static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
> SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed,
> CTLFLAG_RD,
> &ktls_switch_failed, "TLS sessions unable to switch between SW and
> ifnet");
>
> +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
> +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed,
> CTLFLAG_RD,
> + &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from
> ifnet");
> +
> +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
> +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok,
> CTLFLAG_RD,
> + &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from
> ifnet");
> +
> SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
> "Software TLS session stats");
> SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE,
> 0,
> @@ -2187,3 +2201,96 @@ ktls_work_thread(void *ctx)
> }
> }
> }
> +
> +static void
> +ktls_disable_ifnet_help(void *context, int pending __unused)
> +{
> + struct ktls_session *tls;
> + struct inpcb *inp;
> + struct tcpcb *tp;
> + struct socket *so;
> + int err;
> +
> + tls = context;
> + inp = tls->inp;
> + if (inp == NULL)
> + return;
> + INP_WLOCK(inp);
> + so = inp->inp_socket;
> + MPASS(so != NULL);
> + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
> + (inp->inp_flags2 & INP_FREED)) {
> + goto out;
> + }
> +
> + if (so->so_snd.sb_tls_info != NULL)
> + err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
> + else
> + err = ENXIO;
> + if (err == 0) {
> + counter_u64_add(ktls_ifnet_disable_ok, 1);
> + /* ktls_set_tx_mode() drops inp wlock, so recheck flags */
> + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
> + (inp->inp_flags2 & INP_FREED) == 0 &&
> + (tp = intotcpcb(inp)) != NULL &&
> + tp->t_fb->tfb_hwtls_change != NULL)
> + (*tp->t_fb->tfb_hwtls_change)(tp, 0);
> + } else {
> + counter_u64_add(ktls_ifnet_disable_fail, 1);
> + }
> +
> +out:
> + SOCK_LOCK(so);
> + sorele(so);
> + if (!in_pcbrele_wlocked(inp))
> + INP_WUNLOCK(inp);
> + ktls_free(tls);
> +}
> +
> +/*
> + * Called when re-transmits are becoming a substantial portion of the
> + * sends on this connection. When this happens, we transition the
> + * connection to software TLS. This is needed because most inline TLS
> + * NICs keep crypto state only for in-order transmits. This means
> + * that to handle a TCP rexmit (which is out-of-order), the NIC must
> + * re-DMA the entire TLS record up to and including the current
> + * segment. This means that when re-transmitting the last ~1448 byte
> + * segment of a 16KB TLS record, we could wind up re-DMA'ing an order
> + * of magnitude more data than we are sending. This can cause the
> + * PCIe link to saturate well before the network, which can cause
> + * output drops, and a general loss of capacity.
> + */
> +void
> +ktls_disable_ifnet(void *arg)
> +{
> + struct tcpcb *tp;
> + struct inpcb *inp;
> + struct socket *so;
> + struct ktls_session *tls;
> +
> + tp = arg;
> + inp = tp->t_inpcb;
> + INP_WLOCK_ASSERT(inp);
> + so = inp->inp_socket;
> + SOCK_LOCK(so);
> + tls = so->so_snd.sb_tls_info;
> + if (tls->disable_ifnet_pending) {
> + SOCK_UNLOCK(so);
> + return;
> + }
> +
> + /*
> + * note that disable_ifnet_pending is never cleared; disabling
> + * ifnet can only be done once per session, so we never want
> + * to do it again
> + */
> +
> + (void)ktls_hold(tls);
> + in_pcbref(inp);
> + soref(so);
> + tls->disable_ifnet_pending = true;
> + tls->inp = inp;
> + SOCK_UNLOCK(so);
> + TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
> + (void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
> +}
> diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
> index dd30f89896d2..3f72a821e71f 100644
> --- a/sys/netinet/tcp_var.h
> +++ b/sys/netinet/tcp_var.h
> @@ -39,8 +39,10 @@
> #include <netinet/tcp_fsm.h>
>
> #ifdef _KERNEL
> +#include "opt_kern_tls.h"
> #include <net/vnet.h>
> #include <sys/mbuf.h>
> +#include <sys/ktls.h>
> #endif
>
> #define TCP_END_BYTE_INFO 8 /* Bytes that makeup the "end information
> array" */
> @@ -1139,8 +1141,10 @@ tcp_fields_to_net(struct tcphdr *th)
>
> static inline void
> tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
> - uint8_t is_tlp, int hw_tls __unused)
> + uint8_t is_tlp, int hw_tls)
> {
> + uint64_t rexmit_percent;
> +
> if (is_tlp) {
> tp->t_sndtlppack++;
> tp->t_sndtlpbyte += len;
> @@ -1150,6 +1154,13 @@ tcp_account_for_send(struct tcpcb *tp, uint32_t len,
> uint8_t is_rxt,
> tp->t_snd_rxt_bytes += len;
> else
> tp->t_sndbytes += len;
> +
> + if (hw_tls && is_rxt) {
> + rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL *
> (tp->t_snd_rxt_bytes + tp->t_sndbytes));
> + if (rexmit_percent > ktls_ifnet_max_rexmit_pct)
> + ktls_disable_ifnet(tp);
> + }
> +
> }
> #endif /* _KERNEL */
>
> diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h
> index b28c94965c97..7fd8831878b4 100644
> --- a/sys/sys/ktls.h
> +++ b/sys/sys/ktls.h
> @@ -189,10 +189,12 @@ struct ktls_session {
> u_int wq_index;
> volatile u_int refcount;
> int mode;
> - bool reset_pending;
>
> struct task reset_tag_task;
> + struct task disable_ifnet_task;
> struct inpcb *inp;
> + bool reset_pending;
> + bool disable_ifnet_pending;
> } __aligned(CACHE_LINE_SIZE);
>
> void ktls_check_rx(struct sockbuf *sb);
> @@ -231,5 +233,16 @@ ktls_free(struct ktls_session *tls)
> ktls_destroy(tls);
> }
>
> +#ifdef KERN_TLS
> +extern unsigned int ktls_ifnet_max_rexmit_pct;
> +void ktls_disable_ifnet(void *arg);
> +#else
> +#define ktls_ifnet_max_rexmit_pct 1
> +inline void
> +ktls_disable_ifnet(void *arg __unused)
> +{
> +}
> +#endif
> +
> #endif /* !_KERNEL */
> #endif /* !_SYS_KTLS_H_ */
>
--
Mateusz Guzik <mjguzik gmail.com>
More information about the dev-commits-src-all
mailing list