git: f9e1d9471077 - main - ena: Add sysctl support for spreading IRQs

From: Arthur Kiyanovski <akiyano_at_FreeBSD.org>
Date: Thu, 28 Dec 2023 13:57:13 UTC
The branch main has been updated by akiyano:

URL: https://cgit.FreeBSD.org/src/commit/?id=f9e1d9471077109c19fd7d6dc9c1d35432efdede

commit f9e1d9471077109c19fd7d6dc9c1d35432efdede
Author:     Osama Abboud <osamaabb@amazon.com>
AuthorDate: 2023-10-30 11:27:03 +0000
Commit:     Arthur Kiyanovski <akiyano@FreeBSD.org>
CommitDate: 2023-12-28 13:55:47 +0000

    ena: Add sysctl support for spreading IRQs
    
    This commit allows spreading IO IRQs over different CPUs through sysctl.
    Two sysctl nodes are introduced:
    1- base_cpu: servers as the first CPU to which the first IO IRQ
    will be bound.
    2- cpu_stride: sets the distance between every two CPUs to which every
    two consecutive IO IRQs are bound.
    
    For example for doing the following IO IRQs / CPU binding:
    
    IRQ idx |  CPU
    ----------------
       1    |   0
       2    |   2
       3    |   4
       4    |   6
    
    Run the following commands:
    sysctl dev.ena.<device index>.irq_affinity.base_cpu=0
    sysctl dev.ena.<device_index>.irq_affinity.cpu_stride=2
    
    Also introduced rss_enabled field, which is intended to replace
    '#ifdef RSS' in multiple places, in order to prevent code duplication.
    
    We want to bind interrupts to CPUs in case of rss set OR in case
    the newly defined sysctl paremeter is set. This requires to remove a
    couple of '#ifdef RSS' as well in the structs, since we'll be using the
    relevant parameters in the CPU binding code.
    
    Approved by: cperciva (mentor)
    MFC after: 2 weeks
    Sponsored by: Amazon, Inc.
---
 sys/dev/ena/ena.c        | 126 ++++++++++++++++++++++++++++++++++------
 sys/dev/ena/ena.h        |  14 +++--
 sys/dev/ena/ena_sysctl.c | 145 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 263 insertions(+), 22 deletions(-)

diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c
index 455386a4f152..067c8a6f3a08 100644
--- a/sys/dev/ena/ena.c
+++ b/sys/dev/ena/ena.c
@@ -1237,6 +1237,84 @@ ena_update_io_rings(struct ena_adapter *adapter, uint32_t num)
 	ena_init_io_rings(adapter);
 }
 
+int
+ena_update_base_cpu(struct ena_adapter *adapter, int new_num)
+{
+	int old_num;
+	int rc = 0;
+	bool dev_was_up;
+
+	dev_was_up = ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter);
+	old_num = adapter->irq_cpu_base;
+
+	ena_down(adapter);
+
+	adapter->irq_cpu_base = new_num;
+
+	if (dev_was_up) {
+		rc = ena_up(adapter);
+		if (unlikely(rc != 0)) {
+			ena_log(adapter->pdev, ERR,
+			    "Failed to configure device %d IRQ base CPU. "
+			    "Reverting to previous value: %d\n",
+			    new_num, old_num);
+
+			adapter->irq_cpu_base = old_num;
+
+			rc = ena_up(adapter);
+			if (unlikely(rc != 0)) {
+				ena_log(adapter->pdev, ERR,
+				    "Failed to revert to previous setup."
+				    "Triggering device reset.\n");
+				ENA_FLAG_SET_ATOMIC(
+				    ENA_FLAG_DEV_UP_BEFORE_RESET, adapter);
+				ena_trigger_reset(adapter,
+				    ENA_REGS_RESET_OS_TRIGGER);
+			}
+		}
+	}
+	return (rc);
+}
+
+int
+ena_update_cpu_stride(struct ena_adapter *adapter, uint32_t new_num)
+{
+	uint32_t old_num;
+	int rc = 0;
+	bool dev_was_up;
+
+	dev_was_up = ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter);
+	old_num = adapter->irq_cpu_stride;
+
+	ena_down(adapter);
+
+	adapter->irq_cpu_stride = new_num;
+
+	if (dev_was_up) {
+		rc = ena_up(adapter);
+		if (unlikely(rc != 0)) {
+			ena_log(adapter->pdev, ERR,
+			    "Failed to configure device %d IRQ CPU stride. "
+			    "Reverting to previous value: %d\n",
+			    new_num, old_num);
+
+			adapter->irq_cpu_stride = old_num;
+
+			rc = ena_up(adapter);
+			if (unlikely(rc != 0)) {
+				ena_log(adapter->pdev, ERR,
+				    "Failed to revert to previous setup."
+				    "Triggering device reset.\n");
+				ENA_FLAG_SET_ATOMIC(
+				    ENA_FLAG_DEV_UP_BEFORE_RESET, adapter);
+				ena_trigger_reset(adapter,
+				    ENA_REGS_RESET_OS_TRIGGER);
+			}
+		}
+	}
+	return (rc);
+}
+
 /* Caller should sanitize new_num */
 int
 ena_update_io_queue_nb(struct ena_adapter *adapter, uint32_t new_num)
@@ -1683,6 +1761,13 @@ ena_setup_io_intr(struct ena_adapter *adapter)
 		ena_log(adapter->pdev, DBG, "ena_setup_io_intr vector: %d\n",
 		    adapter->msix_entries[irq_idx].vector);
 
+		if (adapter->irq_cpu_base > ENA_BASE_CPU_UNSPECIFIED) {
+			adapter->que[i].cpu = adapter->irq_tbl[irq_idx].cpu =
+			    (unsigned)(adapter->irq_cpu_base +
+			    i * adapter->irq_cpu_stride) % (unsigned)mp_ncpus;
+			CPU_SETOF(adapter->que[i].cpu, &adapter->que[i].cpu_mask);
+		}
+
 #ifdef RSS
 		adapter->que[i].cpu = adapter->irq_tbl[irq_idx].cpu =
 		    rss_getcpu(cur_bind);
@@ -1790,20 +1875,19 @@ ena_request_io_irq(struct ena_adapter *adapter)
 		}
 		irq->requested = true;
 
-#ifdef RSS
-		rc = bus_bind_intr(adapter->pdev, irq->res, irq->cpu);
-		if (unlikely(rc != 0)) {
-			ena_log(pdev, ERR,
-			    "failed to bind interrupt handler for irq %ju to cpu %d: %d\n",
-			    rman_get_start(irq->res), irq->cpu, rc);
-			goto err;
-		}
+		if (adapter->rss_enabled || adapter->irq_cpu_base > ENA_BASE_CPU_UNSPECIFIED) {
+			rc = bus_bind_intr(adapter->pdev, irq->res, irq->cpu);
+			if (unlikely(rc != 0)) {
+				ena_log(pdev, ERR,
+				    "failed to bind interrupt handler for irq %ju to cpu %d: %d\n",
+				    rman_get_start(irq->res), irq->cpu, rc);
+				goto err;
+			}
 
-		ena_log(pdev, INFO, "queue %d - cpu %d\n",
-		    i - ENA_IO_IRQ_FIRST_IDX, irq->cpu);
-#endif
+			ena_log(pdev, INFO, "queue %d - cpu %d\n",
+			    i - ENA_IO_IRQ_FIRST_IDX, irq->cpu);
+		}
 	}
-
 	return (rc);
 
 err:
@@ -1814,13 +1898,14 @@ err:
 
 		/* Once we entered err: section and irq->requested is true we
 		   free both intr and resources */
-		if (irq->requested)
+		if (irq->requested) {
 			rcc = bus_teardown_intr(adapter->pdev, irq->res,
 			    irq->cookie);
-		if (unlikely(rcc != 0))
-			ena_log(pdev, ERR,
-			    "could not release irq: %d, error: %d\n",
-			    irq->vector, rcc);
+			if (unlikely(rcc != 0))
+				ena_log(pdev, ERR,
+				    "could not release irq: %d, error: %d\n",
+				    irq->vector, rcc);
+		}
 
 		/* If we entered err: section without irq->requested set we know
 		   it was bus_alloc_resource_any() that needs cleanup, provided
@@ -3523,6 +3608,13 @@ ena_attach(device_t pdev)
 	adapter->missing_tx_max_queues = ENA_DEFAULT_TX_MONITORED_QUEUES;
 	adapter->missing_tx_threshold = ENA_DEFAULT_TX_CMP_THRESHOLD;
 
+	adapter->irq_cpu_base = ENA_BASE_CPU_UNSPECIFIED;
+	adapter->irq_cpu_stride = 0;
+
+#ifdef RSS
+	adapter->rss_enabled = 1;
+#endif
+
 	if (version_printed++ == 0)
 		ena_log(pdev, INFO, "%s\n", ena_version);
 
diff --git a/sys/dev/ena/ena.h b/sys/dev/ena/ena.h
index b6e791d6ff6a..3687e9b2522e 100644
--- a/sys/dev/ena/ena.h
+++ b/sys/dev/ena/ena.h
@@ -69,6 +69,7 @@
 #define ENA_DEFAULT_RING_SIZE		1024
 #define ENA_MIN_RING_SIZE		256
 
+#define ENA_BASE_CPU_UNSPECIFIED 	-1
 /*
  * Refill Rx queue when number of required descriptors is above
  * QUEUE_SIZE / ENA_RX_REFILL_THRESH_DIVIDER or ENA_RX_REFILL_THRESH_PACKET
@@ -201,9 +202,7 @@ struct ena_irq {
 	void *cookie;
 	unsigned int vector;
 	bool requested;
-#ifdef RSS
 	int cpu;
-#endif
 	char name[ENA_IRQNAME_SIZE];
 };
 
@@ -216,10 +215,8 @@ struct ena_que {
 	struct taskqueue *cleanup_tq;
 
 	uint32_t id;
-#ifdef RSS
 	int cpu;
 	cpuset_t cpu_mask;
-#endif
 	int domain;
 	struct sysctl_oid *oid;
 };
@@ -448,6 +445,12 @@ struct ena_adapter {
 
 	ena_state_t flags;
 
+	/* IRQ CPU affinity */
+	int irq_cpu_base;
+	uint32_t irq_cpu_stride;
+
+	uint8_t rss_enabled;
+
 	/* Queue will represent one TX and one RX ring */
 	struct ena_que que[ENA_MAX_NUM_IO_QUEUES]
 	    __aligned(CACHE_LINE_SIZE);
@@ -524,7 +527,8 @@ int	ena_update_buf_ring_size(struct ena_adapter *adapter,
 int	ena_update_queue_size(struct ena_adapter *adapter, uint32_t new_tx_size,
     uint32_t new_rx_size);
 int	ena_update_io_queue_nb(struct ena_adapter *adapter, uint32_t new_num);
-
+int     ena_update_base_cpu(struct ena_adapter *adapter, int new_num);
+int     ena_update_cpu_stride(struct ena_adapter *adapter, uint32_t new_num);
 static inline int
 ena_mbuf_count(struct mbuf *mbuf)
 {
diff --git a/sys/dev/ena/ena_sysctl.c b/sys/dev/ena/ena_sysctl.c
index 62a38c2a4b82..f3b59d047bff 100644
--- a/sys/dev/ena/ena_sysctl.c
+++ b/sys/dev/ena/ena_sysctl.c
@@ -38,6 +38,7 @@ static void ena_sysctl_add_wd(struct ena_adapter *);
 static void ena_sysctl_add_stats(struct ena_adapter *);
 static void ena_sysctl_add_eni_metrics(struct ena_adapter *);
 static void ena_sysctl_add_tuneables(struct ena_adapter *);
+static void ena_sysctl_add_irq_affinity(struct ena_adapter *);
 /* Kernel option RSS prevents manipulation of key hash and indirection table. */
 #ifndef RSS
 static void ena_sysctl_add_rss(struct ena_adapter *);
@@ -45,6 +46,8 @@ static void ena_sysctl_add_rss(struct ena_adapter *);
 static int ena_sysctl_buf_ring_size(SYSCTL_HANDLER_ARGS);
 static int ena_sysctl_rx_queue_size(SYSCTL_HANDLER_ARGS);
 static int ena_sysctl_io_queues_nb(SYSCTL_HANDLER_ARGS);
+static int ena_sysctl_irq_base_cpu(SYSCTL_HANDLER_ARGS);
+static int ena_sysctl_irq_cpu_stride(SYSCTL_HANDLER_ARGS);
 static int ena_sysctl_eni_metrics_interval(SYSCTL_HANDLER_ARGS);
 #ifndef RSS
 static int ena_sysctl_rss_key(SYSCTL_HANDLER_ARGS);
@@ -102,6 +105,7 @@ ena_sysctl_add_nodes(struct ena_adapter *adapter)
 	ena_sysctl_add_stats(adapter);
 	ena_sysctl_add_eni_metrics(adapter);
 	ena_sysctl_add_tuneables(adapter);
+	ena_sysctl_add_irq_affinity(adapter);
 #ifndef RSS
 	ena_sysctl_add_rss(adapter);
 #endif
@@ -448,6 +452,36 @@ ena_sysctl_add_rss(struct ena_adapter *adapter)
 }
 #endif /* RSS */
 
+static void
+ena_sysctl_add_irq_affinity(struct ena_adapter *adapter)
+{
+	device_t dev;
+
+	struct sysctl_ctx_list *ctx;
+	struct sysctl_oid *tree;
+	struct sysctl_oid_list *child;
+
+	dev = adapter->pdev;
+
+	ctx = device_get_sysctl_ctx(dev);
+	tree = device_get_sysctl_tree(dev);
+	child = SYSCTL_CHILDREN(tree);
+
+	tree = SYSCTL_ADD_NODE(ctx, child, OID_AUTO, "irq_affinity",
+	    CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Decide base CPU and stride for irqs affinity.");
+	child = SYSCTL_CHILDREN(tree);
+
+	/* Add base cpu leaf */
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "base_cpu",
+	    CTLTYPE_S32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
+	    ena_sysctl_irq_base_cpu, "I", "Base cpu index for setting irq affinity.");
+
+	/* Add cpu stride leaf */
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "cpu_stride",
+	    CTLTYPE_S32 | CTLFLAG_RW | CTLFLAG_MPSAFE, adapter, 0,
+	    ena_sysctl_irq_cpu_stride, "I", "Distance between irqs when setting affinity.");
+}
+
 
 /*
  * ena_sysctl_update_queue_node_nb - Register/unregister sysctl queue nodes.
@@ -707,6 +741,117 @@ unlock:
 	return (0);
 }
 
+static int
+ena_sysctl_irq_base_cpu(SYSCTL_HANDLER_ARGS)
+{
+	struct ena_adapter *adapter = arg1;
+	int irq_base_cpu = 0;
+	int error;
+
+	ENA_LOCK_LOCK();
+	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
+		error = ENODEV;
+		goto unlock;
+	}
+
+	error = sysctl_wire_old_buffer(req, sizeof(irq_base_cpu));
+	if (error == 0) {
+		irq_base_cpu = adapter->irq_cpu_base;
+		error = sysctl_handle_int(oidp, &irq_base_cpu, 0, req);
+	}
+	if (error != 0 || req->newptr == NULL)
+		goto unlock;
+
+	if (irq_base_cpu <= ENA_BASE_CPU_UNSPECIFIED) {
+		ena_log(adapter->pdev, ERR,
+		    "Requested base CPU is less than zero.\n");
+		error = EINVAL;
+		goto unlock;
+	}
+
+	if (irq_base_cpu > mp_ncpus) {
+		ena_log(adapter->pdev, INFO,
+		    "Requested base CPU is larger than the number of available CPUs. \n");
+		error = EINVAL;
+		goto unlock;
+
+	}
+
+	if (irq_base_cpu == adapter->irq_cpu_base) {
+		ena_log(adapter->pdev, INFO,
+		    "Requested IRQ base CPU is equal to current value "
+		    "(%d)\n",
+		    adapter->irq_cpu_base);
+		goto unlock;
+	}
+
+	ena_log(adapter->pdev, INFO,
+	    "Requested new IRQ base CPU: %d, current value: %d\n",
+	    irq_base_cpu, adapter->irq_cpu_base);
+
+	error = ena_update_base_cpu(adapter, irq_base_cpu);
+
+unlock:
+	ENA_LOCK_UNLOCK();
+
+	return (error);
+}
+
+static int
+ena_sysctl_irq_cpu_stride(SYSCTL_HANDLER_ARGS)
+{
+	struct ena_adapter *adapter = arg1;
+	int32_t irq_cpu_stride = 0;
+	int error;
+
+	ENA_LOCK_LOCK();
+	if (unlikely(!ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter))) {
+		error = ENODEV;
+		goto unlock;
+	}
+
+	error = sysctl_wire_old_buffer(req, sizeof(irq_cpu_stride));
+	if (error == 0) {
+		irq_cpu_stride = adapter->irq_cpu_stride;
+		error = sysctl_handle_int(oidp, &irq_cpu_stride, 0, req);
+	}
+	if (error != 0 || req->newptr == NULL)
+		goto unlock;
+
+	if (irq_cpu_stride < 0) {
+		ena_log(adapter->pdev, ERR,
+		    "Requested IRQ stride is less than zero.\n");
+		error = EINVAL;
+		goto unlock;
+	}
+
+	if (irq_cpu_stride > mp_ncpus) {
+		ena_log(adapter->pdev, INFO,
+		    "Warning: Requested IRQ stride is larger than the number of available CPUs.\n");
+	}
+
+	if (irq_cpu_stride == adapter->irq_cpu_stride) {
+		ena_log(adapter->pdev, INFO,
+		    "Requested IRQ CPU stride is equal to current value "
+		    "(%u)\n",
+		    adapter->irq_cpu_stride);
+		goto unlock;
+	}
+
+	ena_log(adapter->pdev, INFO,
+	    "Requested new IRQ CPU stride: %u, current value: %u\n",
+	    irq_cpu_stride, adapter->irq_cpu_stride);
+
+	error = ena_update_cpu_stride(adapter, irq_cpu_stride);
+	if (error != 0)
+		goto unlock;
+
+unlock:
+	ENA_LOCK_UNLOCK();
+
+	return (error);
+}
+
 #ifndef RSS
 /*
  * Change the Receive Side Scaling hash key.