git: 468a6b5055f0 - main - ibcore: Fix use-after-free in IB mad completion handling.

Hans Petter Selasky hselasky at FreeBSD.org
Mon Jul 12 13:09:43 UTC 2021


The branch main has been updated by hselasky:

URL: https://cgit.FreeBSD.org/src/commit/?id=468a6b5055f0b6ea0bdb1ee8cbdf749204cb3b25

commit 468a6b5055f0b6ea0bdb1ee8cbdf749204cb3b25
Author:     Hans Petter Selasky <hselasky at FreeBSD.org>
AuthorDate: 2021-06-16 13:01:44 +0000
Commit:     Hans Petter Selasky <hselasky at FreeBSD.org>
CommitDate: 2021-07-12 12:22:31 +0000

    ibcore: Fix use-after-free in IB mad completion handling.
    
    We encountered a use-after-free bug when unloading the driver:
    
    BUG: KASAN: use-after-free in ib_mad_post_receive_mads+0xddc/0xed0 [ib_core]
    Read of size 4 at addr ffff8882ca5aa868 by task kworker/u13:2/23862
    
    Workqueue: ib-comp-unb-wq ib_cq_poll_work [ib_core]
    Call Trace:
    dump_stack+0x9a/0xeb
    print_address_description+0xe3/0x2e0
    ib_mad_post_receive_mads+0xddc/0xed0 [ib_core]
    __kasan_report+0x15c/0x1df
    ib_mad_post_receive_mads+0xddc/0xed0 [ib_core]
    kasan_report+0xe/0x20
    ib_mad_post_receive_mads+0xddc/0xed0 [ib_core]
    find_mad_agent+0xa00/0xa00 [ib_core]
    qlist_free_all+0x51/0xb0
    mlx4_ib_sqp_comp_worker+0x1970/0x1970 [mlx4_ib]
    quarantine_reduce+0x1fa/0x270
    kasan_unpoison_shadow+0x30/0x40
    ib_mad_recv_done+0xdf6/0x3000 [ib_core]
    _raw_spin_unlock_irqrestore+0x46/0x70
    ib_mad_send_done+0x1810/0x1810 [ib_core]
    mlx4_ib_destroy_cq+0x2a0/0x2a0 [mlx4_ib]
    _raw_spin_unlock_irqrestore+0x46/0x70
    debug_object_deactivate+0x2b9/0x4a0
    __ib_process_cq+0xe2/0x1d0 [ib_core]
    ib_cq_poll_work+0x45/0xf0 [ib_core]
    process_one_work+0x90c/0x1860
    pwq_dec_nr_in_flight+0x320/0x320
    worker_thread+0x87/0xbb0
    __kthread_parkme+0xb6/0x180
    process_one_work+0x1860/0x1860
    kthread+0x320/0x3e0
    kthread_park+0x120/0x120
    ret_from_fork+0x24/0x30
    ...
    Freed by task 31682:
    save_stack+0x19/0x80
    __kasan_slab_free+0x11d/0x160
    kfree+0xf5/0x2f0
    ib_mad_port_close+0x200/0x380 [ib_core]
    ib_mad_remove_device+0xf0/0x230 [ib_core]
    remove_client_context+0xa6/0xe0 [ib_core]
    disable_device+0x14e/0x260 [ib_core]
    __ib_unregister_device+0x79/0x150 [ib_core]
    ib_unregister_device+0x21/0x30 [ib_core]
    mlx4_ib_remove+0x162/0x690 [mlx4_ib]
    mlx4_remove_device+0x204/0x2c0 [mlx4_core]
    mlx4_unregister_interface+0x49/0x1d0 [mlx4_core]
    mlx4_ib_cleanup+0xc/0x1d [mlx4_ib]
    __x64_sys_delete_module+0x2d2/0x400
    do_syscall_64+0x95/0x470
    entry_SYSCALL_64_after_hwframe+0x49/0xbe
    
    The problem was that the MAD PD was deallocated before the MAD CQ.
    There was completion work pending for the CQ when the PD got deallocated.
    When the mad completion handling reached procedure
    ib_mad_post_receive_mads(), we got a use-after-free bug in the following
    line of code in that procedure:
    sg_list.lkey = qp_info->port_priv->pd->local_dma_lkey;
    (the pd pointer in the above line is no longer valid, because the
    pd has been deallocated).
    
    We fix this by allocating the PD before the CQ in procedure
    ib_mad_port_open(), and deallocating the PD after freeing the CQ
    in procedure ib_mad_port_close().
    
    Since the CQ completion work queue is flushed during ib_free_cq(),
    no completions will be pending for that CQ when the PD is later
    deallocated.
    
    Note that freeing the CQ before deallocating the PD is the practice
    in the ULPs.
    
    Linux commit:
    770b7d96cfff6a8bf6c9f261ba6f135dc9edf484
    
    MFC after:      1 week
    Reviewed by:    kib
    Sponsored by:   Mellanox Technologies // NVIDIA Networking
---
 sys/ofed/drivers/infiniband/core/ib_mad.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/sys/ofed/drivers/infiniband/core/ib_mad.c b/sys/ofed/drivers/infiniband/core/ib_mad.c
index 2f14535688aa..b9f45ff5e41d 100644
--- a/sys/ofed/drivers/infiniband/core/ib_mad.c
+++ b/sys/ofed/drivers/infiniband/core/ib_mad.c
@@ -3146,10 +3146,8 @@ static int ib_mad_port_open(struct ib_device *device,
 
 	/* Create new device info */
 	port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
-	if (!port_priv) {
-		dev_err(&device->dev, "No memory for ib_mad_port_private\n");
+	if (!port_priv)
 		return -ENOMEM;
-	}
 
 	port_priv->device = device;
 	port_priv->port_num = port_num;
@@ -3163,18 +3161,18 @@ static int ib_mad_port_open(struct ib_device *device,
 	if (has_smi)
 		cq_size *= 2;
 
+	port_priv->pd = ib_alloc_pd(device, 0);
+	if (IS_ERR(port_priv->pd)) {
+		dev_err(&device->dev, "Couldn't create ib_mad PD\n");
+		ret = PTR_ERR(port_priv->pd);
+		goto error3;
+	}
+
 	port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0,
 			IB_POLL_WORKQUEUE);
 	if (IS_ERR(port_priv->cq)) {
 		dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
 		ret = PTR_ERR(port_priv->cq);
-		goto error3;
-	}
-
-	port_priv->pd = ib_alloc_pd(device, 0);
-	if (IS_ERR(port_priv->pd)) {
-		dev_err(&device->dev, "Couldn't create ib_mad PD\n");
-		ret = PTR_ERR(port_priv->pd);
 		goto error4;
 	}
 
@@ -3217,11 +3215,11 @@ error8:
 error7:
 	destroy_mad_qp(&port_priv->qp_info[0]);
 error6:
-	ib_dealloc_pd(port_priv->pd);
-error4:
 	ib_free_cq(port_priv->cq);
 	cleanup_recv_queue(&port_priv->qp_info[1]);
 	cleanup_recv_queue(&port_priv->qp_info[0]);
+error4:
+	ib_dealloc_pd(port_priv->pd);
 error3:
 	kfree(port_priv);
 
@@ -3251,8 +3249,8 @@ static int ib_mad_port_close(struct ib_device *device, int port_num)
 	destroy_workqueue(port_priv->wq);
 	destroy_mad_qp(&port_priv->qp_info[1]);
 	destroy_mad_qp(&port_priv->qp_info[0]);
-	ib_dealloc_pd(port_priv->pd);
 	ib_free_cq(port_priv->cq);
+	ib_dealloc_pd(port_priv->pd);
 	cleanup_recv_queue(&port_priv->qp_info[1]);
 	cleanup_recv_queue(&port_priv->qp_info[0]);
 	/* XXX: Handle deallocation of MAD registration tables */


More information about the dev-commits-src-all mailing list