git: 1f628be888b7 - main - tcp_ratelimit: provide an api for drivers to release ratesets at detach

From: Andrew Gallatin <gallatin_at_FreeBSD.org>
Date: Mon, 05 Aug 2024 16:52:07 UTC
The branch main has been updated by gallatin:

URL: https://cgit.FreeBSD.org/src/commit/?id=1f628be888b74f1219b3ea7ccea1e7a3d1db77a2

commit 1f628be888b74f1219b3ea7ccea1e7a3d1db77a2
Author:     Andrew Gallatin <gallatin@FreeBSD.org>
AuthorDate: 2024-08-05 15:45:42 +0000
Commit:     Andrew Gallatin <gallatin@FreeBSD.org>
CommitDate: 2024-08-05 16:51:35 +0000

    tcp_ratelimit: provide an api for drivers to release ratesets at detach
    
    When the kernel is compiled with options RATELIMIT, the
    mlx5en driver cannot detach. It gets stuck waiting for all
    kernel users of its rates to drop to zero before finally calling
    ether_ifdetach.
    
    The tcp ratelimit code has an eventhandler for ifnet departure
    which causes rates to be released. However, this is called as an
    ifnet departure eventhandler, which is invoked as part of
    ifdetach(), via either_ifdetach(). This means that the tcp
    ratelimit code holds down many hw rates when the mlx5en driver
    is waiting for the rate count to go to 0. Thus devctl detach
    will deadlock on mlx5 with this stack:
    mi_switch+0xcf sleepq_timedwait+0x2f _sleep+0x1a3 pause_sbt+0x77 mlx5e_destroy_ifp+0xaf mlx5_remove_device+0xa7 mlx5_unregister_device+0x78 mlx5_unload_one+0x10a remove_one+0x1e linux_pci_detach_device+0x36 linux_pci_detach+0x24 device_detach+0x180 devctl2_ioctl+0x3dc devfs_ioctl+0xbb vn_ioctl+0xca devfs_ioctl_f+0x1e kern_ioctl+0x1c3 sys_ioctl+0x10a
    
    To fix this, provide an explicit API for a driver to call the tcp
    ratelimit code telling it to detach itself from an ifnet. This
    allows the mlx5 driver to unload cleanly. I considered adding an
    ifnet pre-departure eventhandler. However, that would need to be
    invoked by the driver, so a simple function call seemed better.
    
    The mlx5en driver has been updated to call this function.
    
    Reviewed by: kib, rrs
    
    Differential Revision:  https://reviews.freebsd.org/D46221
    Sponsored by: Netflix
---
 sys/dev/mlx5/mlx5_en/mlx5_en_main.c | 8 +++++++-
 sys/netinet/tcp_ratelimit.c         | 6 ++++++
 sys/netinet/tcp_ratelimit.h         | 9 +++++++++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
index ccbdf11a1dd5..a80235f0f347 100644
--- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
+++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
@@ -36,6 +36,7 @@
 #include <machine/atomic.h>
 
 #include <net/debugnet.h>
+#include <netinet/tcp_ratelimit.h>
 #include <netipsec/keydb.h>
 #include <netipsec/ipsec_offload.h>
 
@@ -4876,7 +4877,12 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vpriv)
 
 #ifdef RATELIMIT
 	/*
-	 * The kernel can have reference(s) via the m_snd_tag's into
+	 * Tell the TCP ratelimit code to release the rate-sets attached
+	 * to our ifnet.
+	 */
+	tcp_rl_release_ifnet(ifp);
+	/*
+	 * The kernel can still have reference(s) via the m_snd_tag's into
 	 * the ratelimit channels, and these must go away before
 	 * detaching:
 	 */
diff --git a/sys/netinet/tcp_ratelimit.c b/sys/netinet/tcp_ratelimit.c
index 1834c702c493..22bdf707fa89 100644
--- a/sys/netinet/tcp_ratelimit.c
+++ b/sys/netinet/tcp_ratelimit.c
@@ -1298,6 +1298,12 @@ tcp_rl_ifnet_departure(void *arg __unused, struct ifnet *ifp)
 	NET_EPOCH_EXIT(et);
 }
 
+void
+tcp_rl_release_ifnet(struct ifnet *ifp)
+{
+	tcp_rl_ifnet_departure(NULL, ifp);
+}
+
 static void
 tcp_rl_shutdown(void *arg __unused, int howto __unused)
 {
diff --git a/sys/netinet/tcp_ratelimit.h b/sys/netinet/tcp_ratelimit.h
index cd540d1164e1..0ce42dea0d90 100644
--- a/sys/netinet/tcp_ratelimit.h
+++ b/sys/netinet/tcp_ratelimit.h
@@ -94,6 +94,8 @@ CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set);
 #ifndef ETHERNET_SEGMENT_SIZE
 #define ETHERNET_SEGMENT_SIZE 1514
 #endif
+struct tcpcb;
+
 #ifdef RATELIMIT
 #define DETAILED_RATELIMIT_SYSCTL 1	/*
 					 * Undefine this if you don't want
@@ -131,6 +133,9 @@ tcp_get_pacing_burst_size_w_divisor(struct tcpcb *tp, uint64_t bw, uint32_t segs
 void
 tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte);
 
+void
+tcp_rl_release_ifnet(struct ifnet *ifp);
+
 #else
 static inline const struct tcp_hwrate_limit_table *
 tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *ifp,
@@ -218,6 +223,10 @@ tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte)
 {
 }
 
+static inline void
+tcp_rl_release_ifnet(struct ifnet *ifp)
+{
+}
 #endif
 
 /*