git: ed4368c216c6 - stable/13 - ena: start timer service on attach

From: Marcin Wojtas <mw_at_FreeBSD.org>
Date: Thu, 24 Feb 2022 12:55:47 UTC
The branch stable/13 has been updated by mw:

URL: https://cgit.FreeBSD.org/src/commit/?id=ed4368c216c6d278f6544b8855a587b520b70ed0

commit ed4368c216c6d278f6544b8855a587b520b70ed0
Author:     Dawid Gorecki <dgr@semihalf.com>
AuthorDate: 2022-01-03 13:50:13 +0000
Commit:     Marcin Wojtas <mw@FreeBSD.org>
CommitDate: 2022-02-24 12:53:44 +0000

    ena: start timer service on attach
    
    The timer service was started when the interface was brought up and it
    was stopped when it was brought down. Since ena_up requires the device
    to be responsive, triggering the reset would become impossible if the
    device became unresponsive with the interface down.
    
    Since most of the functions in timer service already perform the check
    to see if the device is running, this only requires starting the callout
    in attach and stopping it when bringing the interface up or down to
    avoid race between different admin queue calls.
    
    Since callout functions for timer service are always called with the
    same arguments, replace callout_{init,reset,drain} calls with
    ENA_TIMER_{INIT,RESET,DRAIN} macros.
    
    Submitted by: Dawid Gorecki <dgr@semihalf.com>
    Obtained from: Semihalf
    MFC after: 2 weeks
    Sponsored by: Amazon, Inc.
    
    (cherry picked from commit 78554d0c707c9401dbae53fb8bc65d291a5a09a5)
---
 sys/dev/ena/ena.c | 64 ++++++++++++++++++++++++++++++++-----------------------
 sys/dev/ena/ena.h |  8 +++++++
 2 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/sys/dev/ena/ena.c b/sys/dev/ena/ena.c
index 63b4598a9352..f4abe61f08ae 100644
--- a/sys/dev/ena/ena.c
+++ b/sys/dev/ena/ena.c
@@ -2101,6 +2101,13 @@ ena_up(struct ena_adapter *adapter)
 
 	ena_log(adapter->pdev, INFO, "device is going UP\n");
 
+	/*
+	 * ena_timer_service can use functions, which write to the admin queue.
+	 * Those calls are not protected by ENA_LOCK, and because of that, the
+	 * timer should be stopped when bringing the device up or down.
+	 */
+	ENA_TIMER_DRAIN(adapter);
+
 	/* setup interrupts for IO queues */
 	rc = ena_setup_io_intr(adapter);
 	if (unlikely(rc != 0)) {
@@ -2143,19 +2150,12 @@ ena_up(struct ena_adapter *adapter)
 	if_setdrvflagbits(adapter->ifp, IFF_DRV_RUNNING,
 		IFF_DRV_OACTIVE);
 
-	/* Activate timer service only if the device is running.
-		* If this flag is not set, it means that the driver is being
-		* reset and timer service will be activated afterwards.
-		*/
-	if (ENA_FLAG_ISSET(ENA_FLAG_DEVICE_RUNNING, adapter)) {
-		callout_reset_sbt(&adapter->timer_service, SBT_1S,
-			SBT_1S, ena_timer_service, (void *)adapter, 0);
-	}
-
 	ENA_FLAG_SET_ATOMIC(ENA_FLAG_DEV_UP, adapter);
 
 	ena_unmask_all_io_irqs(adapter);
 
+	ENA_TIMER_RESET(adapter);
+
 	return (0);
 
 err_up_complete:
@@ -2165,6 +2165,8 @@ err_up_complete:
 err_create_queues_with_backoff:
 	ena_free_io_irq(adapter);
 error:
+	ENA_TIMER_RESET(adapter);
+
 	return (rc);
 }
 
@@ -2469,9 +2471,10 @@ ena_down(struct ena_adapter *adapter)
 	if (!ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter))
 		return;
 
-	ena_log(adapter->pdev, INFO, "device is going DOWN\n");
+	/* Drain timer service to avoid admin queue race condition. */
+	ENA_TIMER_DRAIN(adapter);
 
-	callout_drain(&adapter->timer_service);
+	ena_log(adapter->pdev, INFO, "device is going DOWN\n");
 
 	ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_DEV_UP, adapter);
 	if_setdrvflagbits(adapter->ifp, IFF_DRV_OACTIVE,
@@ -2495,6 +2498,8 @@ ena_down(struct ena_adapter *adapter)
 	ena_free_all_rx_resources(adapter);
 
 	counter_u64_add(adapter->dev_stats.interface_down, 1);
+
+	ENA_TIMER_RESET(adapter);
 }
 
 static uint32_t
@@ -3249,8 +3254,10 @@ ena_timer_service(void *data)
 	     adapter->eni_metrics_sample_interval)) {
 		/*
 		 * There is no race with other admin queue calls, as:
-		 *   - Timer service runs after interface is up, so all
+		 *   - Timer service runs after attach function ends, so all
 		 *     configuration calls to the admin queue are finished.
+		 *   - Timer service is temporarily stopped when bringing
+		 *     the interface up or down.
 		 *   - After interface is up, the driver doesn't use (at least
 		 *     for now) other functions writing to the admin queue.
 		 *
@@ -3279,7 +3286,7 @@ ena_timer_service(void *data)
 	/*
 	 * Schedule another timeout one second from now.
 	 */
-	callout_schedule_sbt(&adapter->timer_service, SBT_1S, SBT_1S, 0);
+	ENA_TIMER_RESET(adapter);
 }
 
 void
@@ -3294,7 +3301,7 @@ ena_destroy_device(struct ena_adapter *adapter, bool graceful)
 
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 
-	callout_drain(&adapter->timer_service);
+	ENA_TIMER_DRAIN(adapter);
 
 	dev_up = ENA_FLAG_ISSET(ENA_FLAG_DEV_UP, adapter);
 	if (dev_up)
@@ -3425,17 +3432,15 @@ ena_restore_device(struct ena_adapter *adapter)
 	/* Indicate that device is running again and ready to work */
 	ENA_FLAG_SET_ATOMIC(ENA_FLAG_DEVICE_RUNNING, adapter);
 
-	if (ENA_FLAG_ISSET(ENA_FLAG_DEV_UP_BEFORE_RESET, adapter)) {
-		/*
-		 * As the AENQ handlers weren't executed during reset because
-		 * the flag ENA_FLAG_DEVICE_RUNNING was turned off, the
-		 * timestamp must be updated again That will prevent next reset
-		 * caused by missing keep alive.
-		 */
-		adapter->keep_alive_timestamp = getsbinuptime();
-		callout_reset_sbt(&adapter->timer_service, SBT_1S, SBT_1S,
-		    ena_timer_service, (void *)adapter, 0);
-	}
+	/*
+	 * As the AENQ handlers weren't executed during reset because
+	 * the flag ENA_FLAG_DEVICE_RUNNING was turned off, the
+	 * timestamp must be updated again That will prevent next reset
+	 * caused by missing keep alive.
+	 */
+	adapter->keep_alive_timestamp = getsbinuptime();
+	ENA_TIMER_RESET(adapter);
+
 	ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_DEV_UP_BEFORE_RESET, adapter);
 
 	ena_log(dev, INFO,
@@ -3457,6 +3462,8 @@ err:
 	ENA_FLAG_CLEAR_ATOMIC(ENA_FLAG_ONGOING_RESET, adapter);
 	ena_log(dev, ERR, "Reset attempt failed. Can not reset the device\n");
 
+	ENA_TIMER_RESET(adapter);
+
 	return (rc);
 }
 
@@ -3504,7 +3511,7 @@ ena_attach(device_t pdev)
 	 * Set up the timer service - driver is responsible for avoiding
 	 * concurrency, as the callout won't be using any locking inside.
 	 */
-	callout_init(&adapter->timer_service, true);
+	ENA_TIMER_INIT(adapter);
 	adapter->keep_alive_timeout = DEFAULT_KEEP_ALIVE_TO;
 	adapter->missing_tx_timeout = DEFAULT_TX_CMP_TO;
 	adapter->missing_tx_max_queues = DEFAULT_TX_MONITORED_QUEUES;
@@ -3689,6 +3696,9 @@ ena_attach(device_t pdev)
 	if_setdrvflagbits(adapter->ifp, IFF_DRV_OACTIVE, IFF_DRV_RUNNING);
 	ENA_FLAG_SET_ATOMIC(ENA_FLAG_DEVICE_RUNNING, adapter);
 
+	/* Run the timer service */
+	ENA_TIMER_RESET(adapter);
+
 	return (0);
 
 #ifdef DEV_NETMAP
@@ -3742,7 +3752,7 @@ ena_detach(device_t pdev)
 
 	/* Stop timer service */
 	ENA_LOCK_LOCK();
-	callout_drain(&adapter->timer_service);
+	ENA_TIMER_DRAIN(adapter);
 	ENA_LOCK_UNLOCK();
 
 	/* Release reset task */
diff --git a/sys/dev/ena/ena.h b/sys/dev/ena/ena.h
index 260c26482898..bd87f39a3297 100644
--- a/sys/dev/ena/ena.h
+++ b/sys/dev/ena/ena.h
@@ -499,6 +499,14 @@ struct ena_adapter {
 #define ENA_LOCK_UNLOCK()		sx_unlock(&ena_global_lock)
 #define ENA_LOCK_ASSERT()		sx_assert(&ena_global_lock, SA_XLOCKED)
 
+#define	ENA_TIMER_INIT(_adapter)					\
+	callout_init(&(_adapter)->timer_service, true)
+#define	ENA_TIMER_DRAIN(_adapter)					\
+	callout_drain(&(_adapter)->timer_service)
+#define	ENA_TIMER_RESET(_adapter)					\
+	callout_reset_sbt(&(_adapter)->timer_service, SBT_1S, SBT_1S,	\
+			ena_timer_service, (void*)(_adapter), 0)
+
 #define clamp_t(type, _x, min, max)	min_t(type, max_t(type, _x, min), max)
 #define clamp_val(val, lo, hi)		clamp_t(__typeof(val), val, lo, hi)