git: a33ec635d1f6 - main - ena: Add differentiation for missing TX completions reset
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Tue, 15 Oct 2024 17:43:10 UTC
The branch main has been updated by osamaabb:
URL: https://cgit.FreeBSD.org/src/commit/?id=a33ec635d1f6d574d54e6f6d74766d070183be4c
commit a33ec635d1f6d574d54e6f6d74766d070183be4c
Author: Osama Abboud <osamaabb@amazon.com>
AuthorDate: 2024-08-07 06:24:19 +0000
Commit: Osama Abboud <osamaabb@FreeBSD.org>
CommitDate: 2024-10-15 17:38:31 +0000
ena: Add differentiation for missing TX completions reset
This commit adds differentiation for a reset caused by missing tx
completions, by verifying if the driver didn't receive tx
completions caused by missing interrupts.
The cleanup_running field was added to ena_ring because
cleanup_task.ta_pending is zeroed before ena_cleanup() runs.
Also ena_increment_reset_counter() API was added in order to support
only incrementing the reset counter.
Approved by: cperciva (mentor)
MFC after: 2 weeks
Sponsored by: Amazon, Inc.
---
sys/dev/ena/ena.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
sys/dev/ena/ena.h | 29 ++++++++++++++++++-----------
sys/dev/ena/ena_datapath.c | 21 +++++++++++++++------
3 files changed, 77 insertions(+), 18 deletions(-)
diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c
index 8c20596d3d23..bbc857004671 100644
--- a/sys/dev/ena/ena.c
+++ b/sys/dev/ena/ena.c
@@ -169,6 +169,9 @@ static int ena_copy_eni_metrics(struct ena_adapter *);
static int ena_copy_srd_metrics(struct ena_adapter *);
static int ena_copy_customer_metrics(struct ena_adapter *);
static void ena_timer_service(void *);
+static enum ena_regs_reset_reason_types check_cdesc_in_tx_cq(struct ena_adapter *,
+ struct ena_ring *);
+
static char ena_version[] = ENA_DEVICE_NAME ENA_DRV_MODULE_NAME
" v" ENA_DRV_MODULE_VERSION;
@@ -3088,6 +3091,31 @@ check_for_rx_interrupt_queue(struct ena_adapter *adapter,
return (0);
}
+static enum ena_regs_reset_reason_types
+check_cdesc_in_tx_cq(struct ena_adapter *adapter,
+ struct ena_ring *tx_ring)
+{
+ device_t pdev = adapter->pdev;
+ int rc;
+ u16 req_id;
+
+ rc = ena_com_tx_comp_req_id_get(tx_ring->ena_com_io_cq, &req_id);
+ /* TX CQ is empty */
+ if (rc == ENA_COM_TRY_AGAIN) {
+ ena_log(pdev, ERR,
+ "No completion descriptors found in CQ %d\n",
+ tx_ring->qid);
+ return ENA_REGS_RESET_MISS_TX_CMPL;
+ }
+
+ /* TX CQ has cdescs */
+ ena_log(pdev, ERR,
+ "Completion descriptors found in CQ %d",
+ tx_ring->qid);
+
+ return ENA_REGS_RESET_MISS_INTERRUPT;
+}
+
static int
check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
struct ena_ring *tx_ring)
@@ -3100,6 +3128,8 @@ check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
int missing_tx_comp_to;
sbintime_t time_offset;
int i, rc = 0;
+ enum ena_regs_reset_reason_types reset_reason = ENA_REGS_RESET_MISS_TX_CMPL;
+ bool cleanup_scheduled, cleanup_running;
getbinuptime(&curtime);
@@ -3155,7 +3185,19 @@ check_missing_comp_in_tx_queue(struct ena_adapter *adapter,
"The number of lost tx completion is above the threshold "
"(%d > %d). Reset the device\n",
missed_tx, adapter->missing_tx_threshold);
- ena_trigger_reset(adapter, ENA_REGS_RESET_MISS_TX_CMPL);
+ /* Set the reset flag to prevent ena_cleanup() from running */
+ ENA_FLAG_SET_ATOMIC(ENA_FLAG_TRIGGER_RESET, adapter);
+ /* Need to make sure that ENA_FLAG_TRIGGER_RESET is visible to ena_cleanup() and
+ * that cleanup_running is visible to check_missing_comp_in_tx_queue() to
+ * prevent the case of accessing CQ concurrently with check_cdesc_in_tx_cq()
+ */
+ mb();
+ cleanup_scheduled = !!(atomic_load_16(&tx_ring->que->cleanup_task.ta_pending));
+ cleanup_running = !!(atomic_load_8((&tx_ring->cleanup_running)));
+ if (!(cleanup_scheduled || cleanup_running))
+ reset_reason = check_cdesc_in_tx_cq(adapter, tx_ring);
+
+ adapter->reset_reason = reset_reason;
rc = EIO;
}
/* Add the newly discovered missing TX completions */
@@ -3618,6 +3660,7 @@ ena_reset_task(void *arg, int pending)
ENA_LOCK_LOCK();
if (likely(ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) {
+ ena_increment_reset_counter(adapter);
ena_destroy_device(adapter, false);
ena_restore_device(adapter);
diff --git a/sys/dev/ena/ena.h b/sys/dev/ena/ena.h
index 876c3cd258aa..c9eb9e8c43d3 100644
--- a/sys/dev/ena/ena.h
+++ b/sys/dev/ena/ena.h
@@ -327,6 +327,7 @@ struct ena_ring {
};
uint8_t first_interrupt;
+ uint8_t cleanup_running;
uint16_t no_interrupt_event_cnt;
struct ena_com_rx_buf_info ena_bufs[ENA_PKT_MAX_BUFS];
@@ -584,21 +585,27 @@ ena_mbuf_count(struct mbuf *mbuf)
}
static inline void
-ena_trigger_reset(struct ena_adapter *adapter,
- enum ena_regs_reset_reason_types reset_reason)
+ena_increment_reset_counter(struct ena_adapter *adapter)
{
- if (likely(!ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) {
- const struct ena_reset_stats_offset *ena_reset_stats_offset =
- &resets_to_stats_offset_map[reset_reason];
+ enum ena_regs_reset_reason_types reset_reason = adapter->reset_reason;
+ const struct ena_reset_stats_offset *ena_reset_stats_offset =
+ &resets_to_stats_offset_map[reset_reason];
- if (ena_reset_stats_offset->has_counter) {
- uint64_t *stat_ptr = (uint64_t *)&adapter->dev_stats +
- ena_reset_stats_offset->stat_offset;
+ if (ena_reset_stats_offset->has_counter) {
+ uint64_t *stat_ptr = (uint64_t *)&adapter->dev_stats +
+ ena_reset_stats_offset->stat_offset;
- counter_u64_add((counter_u64_t)(*stat_ptr), 1);
- }
+ counter_u64_add((counter_u64_t)(*stat_ptr), 1);
+ }
+
+ counter_u64_add(adapter->dev_stats.total_resets, 1);
+}
- counter_u64_add(adapter->dev_stats.total_resets, 1);
+static inline void
+ena_trigger_reset(struct ena_adapter *adapter,
+ enum ena_regs_reset_reason_types reset_reason)
+{
+ if (likely(!ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))) {
adapter->reset_reason = reset_reason;
ENA_FLAG_SET_ATOMIC(ENA_FLAG_TRIGGER_RESET, adapter);
}
diff --git a/sys/dev/ena/ena_datapath.c b/sys/dev/ena/ena_datapath.c
index 6cbe46cead3e..20864d0d2df6 100644
--- a/sys/dev/ena/ena_datapath.c
+++ b/sys/dev/ena/ena_datapath.c
@@ -77,17 +77,24 @@ ena_cleanup(void *arg, int pending)
int qid, ena_qid;
int txc, rxc, i;
- if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0))
- return;
-
- ena_log_io(adapter->pdev, DBG, "MSI-X TX/RX routine\n");
-
tx_ring = que->tx_ring;
rx_ring = que->rx_ring;
qid = que->id;
ena_qid = ENA_IO_TXQ_IDX(qid);
io_cq = &adapter->ena_dev->io_cq_queues[ena_qid];
+ atomic_store_8(&tx_ring->cleanup_running, 1);
+ /* Need to make sure that ENA_FLAG_TRIGGER_RESET is visible to ena_cleanup() and
+ * that cleanup_running is visible to check_missing_comp_in_tx_queue() to
+ * prevent the case of accessing CQ concurrently with check_cdesc_in_tx_cq()
+ */
+ mb();
+ if (unlikely(((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) ||
+ (ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))))
+ return;
+
+ ena_log_io(adapter->pdev, DBG, "MSI-X TX/RX routine\n");
+
atomic_store_8(&tx_ring->first_interrupt, 1);
atomic_store_8(&rx_ring->first_interrupt, 1);
@@ -95,7 +102,8 @@ ena_cleanup(void *arg, int pending)
rxc = ena_rx_cleanup(rx_ring);
txc = ena_tx_cleanup(tx_ring);
- if (unlikely((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0))
+ if (unlikely(((if_getdrvflags(ifp) & IFF_DRV_RUNNING) == 0) ||
+ (ENA_FLAG_ISSET(ENA_FLAG_TRIGGER_RESET, adapter))))
return;
if ((txc != ENA_TX_BUDGET) && (rxc != ENA_RX_BUDGET))
@@ -107,6 +115,7 @@ ena_cleanup(void *arg, int pending)
ENA_TX_IRQ_INTERVAL, true, false);
counter_u64_add(tx_ring->tx_stats.unmask_interrupt_num, 1);
ena_com_unmask_intr(io_cq, &intr_reg);
+ atomic_store_8(&tx_ring->cleanup_running, 0);
}
void