git: d2b96f654a67 - main - iflib: Fix panic observed while doing sysctl -a with if_bnxt unload

From: Sumit Saxena <ssaxena_at_FreeBSD.org>
Date: Tue, 14 Apr 2026 09:14:18 UTC
The branch main has been updated by ssaxena:

URL: https://cgit.FreeBSD.org/src/commit/?id=d2b96f654a672f6059c5c623c276dcd76841ed12

commit d2b96f654a672f6059c5c623c276dcd76841ed12
Author:     Sreekanth Reddy <sreekanth.reddy@broadcom.com>
AuthorDate: 2026-04-13 06:28:08 +0000
Commit:     Sumit Saxena <ssaxena@FreeBSD.org>
CommitDate: 2026-04-14 09:13:34 +0000

    iflib: Fix panic observed while doing sysctl -a with if_bnxt unload
    
    The following kernel panic call trace was observed when running
    sysctl -a while the if_bnxt driver was being unloaded:
    
    Fatal trap 9: general protection fault while in kernel mode
    
    KDB: stack backtrace:
    db_trace_self_wrapper() at db_trace_self_wrapper+0x2b/frame 0xfffffe02a7569940
    vpanic() at vpanic+0x136/frame 0xfffffe02a7569a70
    panic() at panic+0x43/frame 0xfffffe02a7569ad0
    trap_fatal() at trap_fatal+0x68/frame 0xfffffe02a7569af0
    calltrap() at calltrap+0x8/frame 0xfffffe02a7569af0
    
    trap 0x9, rip = 0xffffffff80c0b411, rsp = 0xfffffe02a7569bc0, rbp = 0xfffffe02a7569be0 ---
    sysctl_handle_counter_u64() at sysctl_handle_counter_u64+0x61/frame 0xfffffe02a7569be0
    sysctl_root_handler_locked() at sysctl_root_handler_locked+0x9c/frame 0xfffffe02a7569c30
    sysctl_root() at sysctl_root+0x22f/frame 0xfffffe02a7569cb0
    userland_sysctl() at userland_sysctl+0x196/frame 0xfffffe02a7569d50
    sys___sysctl() at sys___sysctl+0x65/frame 0xfffffe02a7569e00
    amd64_syscall() at amd64_syscall+0x169/frame 0xfffffe02a7569f30
    fast_syscall_common() at fast_syscall_common+0xf8/frame 0xfffffe02a7569f30
    
    Root Cause:
    iflib adds per-device sysctl nodes under the device tree using the device
    sysctl context. Some of those nodes are counter sysctls that point at fields
    inside txq->ift_br. When the if_bnxt driver is unloaded, iflib_device_deregister
    runs and calls iflib_tx_structures_free, which frees each txq's ift_br. The device
    sysctl tree is only freed when the device is destroyed. If sysctl -a runs during
    unload, it can still traverse the device tree and call sysctl_handle_counter_u64
    for those nodes. The handler does counter_u64_fetch(*(counter_u64_t *)arg1).
    By then arg1 can point into freed memory, which leads to a use-after-free kernel panic.
    
    Fix:
    iflib now uses its own sysctl context for all iflib-related nodes
    instead of using the device's context, and the iflib sysctl context is
    now removed before any queue/ring memory is freed.
    
    MFC after:      2 weeks
    Reviewed by:    gallatin, ssaxena, #iflib
    Differential Revision: https://reviews.freebsd.org/D55981
---
 sys/net/iflib.c | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/sys/net/iflib.c b/sys/net/iflib.c
index f9d0b1af0f83..186c41d9f839 100644
--- a/sys/net/iflib.c
+++ b/sys/net/iflib.c
@@ -190,6 +190,7 @@ struct iflib_ctx {
 	struct ifmedia	ifc_media;
 	struct ifmedia	*ifc_mediap;
 
+	struct sysctl_ctx_list ifc_sysctl_ctx;
 	struct sysctl_oid *ifc_sysctl_node;
 	uint16_t ifc_sysctl_ntxqs;
 	uint16_t ifc_sysctl_nrxqs;
@@ -5293,6 +5294,8 @@ iflib_device_register(device_t dev, void *sc, if_shared_ctx_t sctx, if_ctx_t *ct
 fail_detach:
 	ether_ifdetach(ctx->ifc_ifp);
 fail_queues:
+	sysctl_ctx_free(&ctx->ifc_sysctl_ctx);
+	ctx->ifc_sysctl_node = NULL;
 	taskqueue_free(ctx->ifc_tq);
 	iflib_tqg_detach(ctx);
 	iflib_tx_structures_free(ctx);
@@ -5332,6 +5335,9 @@ iflib_device_deregister(if_ctx_t ctx)
 	if_t ifp = ctx->ifc_ifp;
 	device_t dev = ctx->ifc_dev;
 
+	sysctl_ctx_free(&ctx->ifc_sysctl_ctx);
+	ctx->ifc_sysctl_node = NULL;
+
 	/* Make sure VLANS are not using driver */
 	if (if_vlantrunkinuse(ifp)) {
 		device_printf(dev, "Vlan in use, detach first\n");
@@ -6787,62 +6793,61 @@ iflib_add_device_sysctl_pre(if_ctx_t ctx)
 {
 	device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child, *oid_list;
-	struct sysctl_ctx_list *ctx_list;
 	struct sysctl_oid *node;
 
-	ctx_list = device_get_sysctl_ctx(dev);
+	sysctl_ctx_init(&ctx->ifc_sysctl_ctx);
 	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
-	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(ctx_list, child,
+	ctx->ifc_sysctl_node = node = SYSCTL_ADD_NODE(&ctx->ifc_sysctl_ctx, child,
 	    OID_AUTO, "iflib", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
 	    "IFLIB fields");
 	oid_list = SYSCTL_CHILDREN(node);
 
-	SYSCTL_ADD_CONST_STRING(ctx_list, oid_list, OID_AUTO, "driver_version",
+	SYSCTL_ADD_CONST_STRING(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "driver_version",
 	    CTLFLAG_RD, ctx->ifc_sctx->isc_driver_version, "driver version");
 
-	SYSCTL_ADD_BOOL(ctx_list, oid_list, OID_AUTO, "simple_tx",
+	SYSCTL_ADD_BOOL(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "simple_tx",
 	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_simple_tx, 0,
 	    "use simple tx ring");
-	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_ntxqs",
+	SYSCTL_ADD_U16(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "override_ntxqs",
 	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_ntxqs, 0,
 	    "# of txqs to use, 0 => use default #");
-	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_nrxqs",
+	SYSCTL_ADD_U16(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "override_nrxqs",
 	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_nrxqs, 0,
 	    "# of rxqs to use, 0 => use default #");
-	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "override_qs_enable",
+	SYSCTL_ADD_U16(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "override_qs_enable",
 	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_qs_eq_override, 0,
 	    "permit #txq != #rxq");
-	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "disable_msix",
+	SYSCTL_ADD_INT(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "disable_msix",
 	    CTLFLAG_RWTUN, &ctx->ifc_softc_ctx.isc_disable_msix, 0,
 	    "disable MSI-X (default 0)");
-	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "rx_budget",
+	SYSCTL_ADD_U16(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "rx_budget",
 	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_rx_budget, 0, "set the RX budget");
-	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "tx_abdicate",
+	SYSCTL_ADD_U16(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "tx_abdicate",
 	    CTLFLAG_RWTUN, &ctx->ifc_sysctl_tx_abdicate, 0,
 	    "cause TX to abdicate instead of running to completion");
 	ctx->ifc_sysctl_core_offset = CORE_OFFSET_UNSPECIFIED;
-	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "core_offset",
+	SYSCTL_ADD_U16(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "core_offset",
 	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_core_offset, 0,
 	    "offset to start using cores at");
-	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "separate_txrx",
+	SYSCTL_ADD_U8(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "separate_txrx",
 	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_separate_txrx, 0,
 	    "use separate cores for TX and RX");
-	SYSCTL_ADD_U8(ctx_list, oid_list, OID_AUTO, "use_logical_cores",
+	SYSCTL_ADD_U8(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "use_logical_cores",
 	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_use_logical_cores, 0,
 	    "try to make use of logical cores for TX and RX");
-	SYSCTL_ADD_U16(ctx_list, oid_list, OID_AUTO, "use_extra_msix_vectors",
+	SYSCTL_ADD_U16(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "use_extra_msix_vectors",
 	    CTLFLAG_RDTUN, &ctx->ifc_sysctl_extra_msix_vectors, 0,
 	    "attempt to reserve the given number of extra MSI-X vectors during driver load for the creation of additional interfaces later");
-	SYSCTL_ADD_INT(ctx_list, oid_list, OID_AUTO, "allocated_msix_vectors",
+	SYSCTL_ADD_INT(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "allocated_msix_vectors",
 	    CTLFLAG_RDTUN, &ctx->ifc_softc_ctx.isc_vectors, 0,
 	    "total # of MSI-X vectors allocated by driver");
 
 	/* XXX change for per-queue sizes */
-	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_ntxds",
+	SYSCTL_ADD_PROC(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "override_ntxds",
 	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
 	    IFLIB_NTXD_HANDLER, mp_ndesc_handler, "A",
 	    "list of # of TX descriptors to use, 0 = use default #");
-	SYSCTL_ADD_PROC(ctx_list, oid_list, OID_AUTO, "override_nrxds",
+	SYSCTL_ADD_PROC(&ctx->ifc_sysctl_ctx, oid_list, OID_AUTO, "override_nrxds",
 	    CTLTYPE_STRING | CTLFLAG_RWTUN | CTLFLAG_NEEDGIANT, ctx,
 	    IFLIB_NRXD_HANDLER, mp_ndesc_handler, "A",
 	    "list of # of RX descriptors to use, 0 = use default #");
@@ -6853,9 +6858,8 @@ iflib_add_device_sysctl_post(if_ctx_t ctx)
 {
 	if_shared_ctx_t sctx = ctx->ifc_sctx;
 	if_softc_ctx_t scctx = &ctx->ifc_softc_ctx;
-	device_t dev = iflib_get_dev(ctx);
 	struct sysctl_oid_list *child;
-	struct sysctl_ctx_list *ctx_list;
+	struct sysctl_ctx_list *ctx_list = &ctx->ifc_sysctl_ctx;
 	iflib_fl_t fl;
 	iflib_txq_t txq;
 	iflib_rxq_t rxq;
@@ -6864,7 +6868,6 @@ iflib_add_device_sysctl_post(if_ctx_t ctx)
 	char *qfmt;
 	struct sysctl_oid *queue_node, *fl_node, *node;
 	struct sysctl_oid_list *queue_list, *fl_list;
-	ctx_list = device_get_sysctl_ctx(dev);
 
 	node = ctx->ifc_sysctl_node;
 	child = SYSCTL_CHILDREN(node);