git: 28d0a740dd9a - main - ktls: auto-disable ifnet (inline hw) kTLS
Andrew Gallatin
gallatin at cs.duke.edu
Wed Jul 7 14:49:20 UTC 2021
On 7/7/21 7:00 AM, Mateusz Guzik wrote:
> This breaks NOIP kernel builds.
Thanks for pointing this out, it should be fixed in 4150a5a87ed
> On 7/6/21, Andrew Gallatin <gallatin at freebsd.org> wrote:
>> The branch main has been updated by gallatin:
>>
>> URL:
>> https://urldefense.com/v3/__https://cgit.FreeBSD.org/src/commit/?id=28d0a740dd9a67e4a4fa9fda5bb39b5963316f35__;!!OToaGQ!_d4pkzhNaWowgMsR4-c1qtLXr1H9SC_kBWNDvXvVV15lerMV4elltm-V6OZj3iET-A$
>>
>> commit 28d0a740dd9a67e4a4fa9fda5bb39b5963316f35
>> Author: Andrew Gallatin <gallatin at FreeBSD.org>
>> AuthorDate: 2021-07-06 14:17:33 +0000
>> Commit: Andrew Gallatin <gallatin at FreeBSD.org>
>> CommitDate: 2021-07-06 14:28:32 +0000
>>
>> ktls: auto-disable ifnet (inline hw) kTLS
>>
>> Ifnet (inline) hw kTLS NICs typically keep state within
>> a TLS record, so that when transmitting in-order,
>> they can continue encryption on each segment sent without
>> DMA'ing extra state from the host.
>>
>> This breaks down when transmits are out of order (eg,
>> TCP retransmits). In this case, the NIC must re-DMA
>> the entire TLS record up to and including the segment
>> being retransmitted. This means that when re-transmitting
>> the last 1448 byte segment of a TLS record, the NIC will
>> have to re-DMA the entire 16KB TLS record. This can lead
>> to the NIC running out of PCIe bus bandwidth well before
>> it saturates the network link if a lot of TCP connections have
>> a high retransmoit rate.
>>
>> This change introduces a new sysctl
>> (kern.ipc.tls.ifnet_max_rexmit_pct),
>> where TCP connections with higher retransmit rate will be
>> switched to SW kTLS so as to conserve PCIe bandwidth.
>>
>> Reviewed by: hselasky, markj, rrs
>> Sponsored by: Netflix
>> Differential Revision: https://urldefense.com/v3/__https://reviews.freebsd.org/D30908__;!!OToaGQ!_d4pkzhNaWowgMsR4-c1qtLXr1H9SC_kBWNDvXvVV15lerMV4elltm-V6OYOYLaV0A$
>> ---
>> sys/kern/uipc_ktls.c | 107
>> ++++++++++++++++++++++++++++++++++++++++++++++++++
>> sys/netinet/tcp_var.h | 13 +++++-
>> sys/sys/ktls.h | 15 ++++++-
>> 3 files changed, 133 insertions(+), 2 deletions(-)
>>
>> diff --git a/sys/kern/uipc_ktls.c b/sys/kern/uipc_ktls.c
>> index 7e87e7c740e3..88e29157289d 100644
>> --- a/sys/kern/uipc_ktls.c
>> +++ b/sys/kern/uipc_ktls.c
>> @@ -30,6 +30,7 @@ __FBSDID("$FreeBSD$");
>>
>> #include "opt_inet.h"
>> #include "opt_inet6.h"
>> +#include "opt_kern_tls.h"
>> #include "opt_ratelimit.h"
>> #include "opt_rss.h"
>>
>> @@ -121,6 +122,11 @@ SYSCTL_INT(_kern_ipc_tls_stats, OID_AUTO, threads,
>> CTLFLAG_RD,
>> &ktls_number_threads, 0,
>> "Number of TLS threads in thread-pool");
>>
>> +unsigned int ktls_ifnet_max_rexmit_pct = 2;
>> +SYSCTL_UINT(_kern_ipc_tls, OID_AUTO, ifnet_max_rexmit_pct, CTLFLAG_RWTUN,
>> + &ktls_ifnet_max_rexmit_pct, 2,
>> + "Max percent bytes retransmitted before ifnet TLS is disabled");
>> +
>> static bool ktls_offload_enable;
>> SYSCTL_BOOL(_kern_ipc_tls, OID_AUTO, enable, CTLFLAG_RWTUN,
>> &ktls_offload_enable, 0,
>> @@ -184,6 +190,14 @@ static COUNTER_U64_DEFINE_EARLY(ktls_switch_failed);
>> SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, switch_failed,
>> CTLFLAG_RD,
>> &ktls_switch_failed, "TLS sessions unable to switch between SW and
>> ifnet");
>>
>> +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_fail);
>> +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_failed,
>> CTLFLAG_RD,
>> + &ktls_ifnet_disable_fail, "TLS sessions unable to switch to SW from
>> ifnet");
>> +
>> +static COUNTER_U64_DEFINE_EARLY(ktls_ifnet_disable_ok);
>> +SYSCTL_COUNTER_U64(_kern_ipc_tls_stats, OID_AUTO, ifnet_disable_ok,
>> CTLFLAG_RD,
>> + &ktls_ifnet_disable_ok, "TLS sessions able to switch to SW from
>> ifnet");
>> +
>> SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, sw, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
>> "Software TLS session stats");
>> SYSCTL_NODE(_kern_ipc_tls, OID_AUTO, ifnet, CTLFLAG_RD | CTLFLAG_MPSAFE,
>> 0,
>> @@ -2187,3 +2201,96 @@ ktls_work_thread(void *ctx)
>> }
>> }
>> }
>> +
>> +static void
>> +ktls_disable_ifnet_help(void *context, int pending __unused)
>> +{
>> + struct ktls_session *tls;
>> + struct inpcb *inp;
>> + struct tcpcb *tp;
>> + struct socket *so;
>> + int err;
>> +
>> + tls = context;
>> + inp = tls->inp;
>> + if (inp == NULL)
>> + return;
>> + INP_WLOCK(inp);
>> + so = inp->inp_socket;
>> + MPASS(so != NULL);
>> + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
>> + (inp->inp_flags2 & INP_FREED)) {
>> + goto out;
>> + }
>> +
>> + if (so->so_snd.sb_tls_info != NULL)
>> + err = ktls_set_tx_mode(so, TCP_TLS_MODE_SW);
>> + else
>> + err = ENXIO;
>> + if (err == 0) {
>> + counter_u64_add(ktls_ifnet_disable_ok, 1);
>> + /* ktls_set_tx_mode() drops inp wlock, so recheck flags */
>> + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 &&
>> + (inp->inp_flags2 & INP_FREED) == 0 &&
>> + (tp = intotcpcb(inp)) != NULL &&
>> + tp->t_fb->tfb_hwtls_change != NULL)
>> + (*tp->t_fb->tfb_hwtls_change)(tp, 0);
>> + } else {
>> + counter_u64_add(ktls_ifnet_disable_fail, 1);
>> + }
>> +
>> +out:
>> + SOCK_LOCK(so);
>> + sorele(so);
>> + if (!in_pcbrele_wlocked(inp))
>> + INP_WUNLOCK(inp);
>> + ktls_free(tls);
>> +}
>> +
>> +/*
>> + * Called when re-transmits are becoming a substantial portion of the
>> + * sends on this connection. When this happens, we transition the
>> + * connection to software TLS. This is needed because most inline TLS
>> + * NICs keep crypto state only for in-order transmits. This means
>> + * that to handle a TCP rexmit (which is out-of-order), the NIC must
>> + * re-DMA the entire TLS record up to and including the current
>> + * segment. This means that when re-transmitting the last ~1448 byte
>> + * segment of a 16KB TLS record, we could wind up re-DMA'ing an order
>> + * of magnitude more data than we are sending. This can cause the
>> + * PCIe link to saturate well before the network, which can cause
>> + * output drops, and a general loss of capacity.
>> + */
>> +void
>> +ktls_disable_ifnet(void *arg)
>> +{
>> + struct tcpcb *tp;
>> + struct inpcb *inp;
>> + struct socket *so;
>> + struct ktls_session *tls;
>> +
>> + tp = arg;
>> + inp = tp->t_inpcb;
>> + INP_WLOCK_ASSERT(inp);
>> + so = inp->inp_socket;
>> + SOCK_LOCK(so);
>> + tls = so->so_snd.sb_tls_info;
>> + if (tls->disable_ifnet_pending) {
>> + SOCK_UNLOCK(so);
>> + return;
>> + }
>> +
>> + /*
>> + * note that disable_ifnet_pending is never cleared; disabling
>> + * ifnet can only be done once per session, so we never want
>> + * to do it again
>> + */
>> +
>> + (void)ktls_hold(tls);
>> + in_pcbref(inp);
>> + soref(so);
>> + tls->disable_ifnet_pending = true;
>> + tls->inp = inp;
>> + SOCK_UNLOCK(so);
>> + TASK_INIT(&tls->disable_ifnet_task, 0, ktls_disable_ifnet_help, tls);
>> + (void)taskqueue_enqueue(taskqueue_thread, &tls->disable_ifnet_task);
>> +}
>> diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
>> index dd30f89896d2..3f72a821e71f 100644
>> --- a/sys/netinet/tcp_var.h
>> +++ b/sys/netinet/tcp_var.h
>> @@ -39,8 +39,10 @@
>> #include <netinet/tcp_fsm.h>
>>
>> #ifdef _KERNEL
>> +#include "opt_kern_tls.h"
>> #include <net/vnet.h>
>> #include <sys/mbuf.h>
>> +#include <sys/ktls.h>
>> #endif
>>
>> #define TCP_END_BYTE_INFO 8 /* Bytes that makeup the "end information
>> array" */
>> @@ -1139,8 +1141,10 @@ tcp_fields_to_net(struct tcphdr *th)
>>
>> static inline void
>> tcp_account_for_send(struct tcpcb *tp, uint32_t len, uint8_t is_rxt,
>> - uint8_t is_tlp, int hw_tls __unused)
>> + uint8_t is_tlp, int hw_tls)
>> {
>> + uint64_t rexmit_percent;
>> +
>> if (is_tlp) {
>> tp->t_sndtlppack++;
>> tp->t_sndtlpbyte += len;
>> @@ -1150,6 +1154,13 @@ tcp_account_for_send(struct tcpcb *tp, uint32_t len,
>> uint8_t is_rxt,
>> tp->t_snd_rxt_bytes += len;
>> else
>> tp->t_sndbytes += len;
>> +
>> + if (hw_tls && is_rxt) {
>> + rexmit_percent = (1000ULL * tp->t_snd_rxt_bytes) / (10ULL *
>> (tp->t_snd_rxt_bytes + tp->t_sndbytes));
>> + if (rexmit_percent > ktls_ifnet_max_rexmit_pct)
>> + ktls_disable_ifnet(tp);
>> + }
>> +
>> }
>> #endif /* _KERNEL */
>>
>> diff --git a/sys/sys/ktls.h b/sys/sys/ktls.h
>> index b28c94965c97..7fd8831878b4 100644
>> --- a/sys/sys/ktls.h
>> +++ b/sys/sys/ktls.h
>> @@ -189,10 +189,12 @@ struct ktls_session {
>> u_int wq_index;
>> volatile u_int refcount;
>> int mode;
>> - bool reset_pending;
>>
>> struct task reset_tag_task;
>> + struct task disable_ifnet_task;
>> struct inpcb *inp;
>> + bool reset_pending;
>> + bool disable_ifnet_pending;
>> } __aligned(CACHE_LINE_SIZE);
>>
>> void ktls_check_rx(struct sockbuf *sb);
>> @@ -231,5 +233,16 @@ ktls_free(struct ktls_session *tls)
>> ktls_destroy(tls);
>> }
>>
>> +#ifdef KERN_TLS
>> +extern unsigned int ktls_ifnet_max_rexmit_pct;
>> +void ktls_disable_ifnet(void *arg);
>> +#else
>> +#define ktls_ifnet_max_rexmit_pct 1
>> +inline void
>> +ktls_disable_ifnet(void *arg __unused)
>> +{
>> +}
>> +#endif
>> +
>> #endif /* !_KERNEL */
>> #endif /* !_SYS_KTLS_H_ */
>>
>
>
More information about the dev-commits-src-main
mailing list