svn commit: r347253 - in head/sys/dev/mlx5: . mlx5_core

Hans Petter Selasky hselasky at FreeBSD.org
Wed May 8 10:30:49 UTC 2019


Author: hselasky
Date: Wed May  8 10:30:47 2019
New Revision: 347253
URL: https://svnweb.freebsd.org/changeset/base/347253

Log:
  Protect from infinite sw-reset loop in mlx5core.
  
  Avoid an infinite software firmware reset loop that may be caused by a
  hardware bug by limiting how frequently resets may be performed.
  The timer between resets is restarted by every reset request, not only
  by a successful reset.
  The interval between two resets can be configured via sysctl:
  hw.mlx5.sw_reset_timeout
  which is global to all mlx5 devices in the system.
  
  Submitted by:	slavash@
  MFC after:	3 days
  Sponsored by:	Mellanox Technologies

Modified:
  head/sys/dev/mlx5/driver.h
  head/sys/dev/mlx5/mlx5_core/mlx5_health.c

Modified: head/sys/dev/mlx5/driver.h
==============================================================================
--- head/sys/dev/mlx5/driver.h	Wed May  8 10:30:18 2019	(r347252)
+++ head/sys/dev/mlx5/driver.h	Wed May  8 10:30:47 2019	(r347253)
@@ -536,6 +536,7 @@ struct mlx5_core_health {
 	unsigned long			flags;
 	struct work_struct		work;
 	struct delayed_work		recover_work;
+	unsigned int			last_reset_req;
 };
 
 #ifdef RATELIMIT

Modified: head/sys/dev/mlx5/mlx5_core/mlx5_health.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_core/mlx5_health.c	Wed May  8 10:30:18 2019	(r347252)
+++ head/sys/dev/mlx5/mlx5_core/mlx5_health.c	Wed May  8 10:30:47 2019	(r347253)
@@ -64,6 +64,12 @@ SYSCTL_INT(_hw_mlx5, OID_AUTO, fw_reset_enable, CTLFLA
     &mlx5_fw_reset_enable, 0,
     "Enable firmware reset");
 
+static unsigned int sw_reset_to = 1200;
+SYSCTL_UINT(_hw_mlx5, OID_AUTO, sw_reset_timeout, CTLFLAG_RWTUN,
+    &sw_reset_to, 0,
+    "Minimum timeout in seconds between two firmware resets");
+
+
 static int lock_sem_sw_reset(struct mlx5_core_dev *dev)
 {
 	int ret;
@@ -218,6 +224,32 @@ static void reset_fw_if_needed(struct mlx5_core_dev *d
 		    &dev->iseg->cmdq_addr_l_sz);
 }
 
+static bool
+mlx5_health_allow_reset(struct mlx5_core_dev *dev)
+{
+	struct mlx5_core_health *health = &dev->priv.health;
+	unsigned int delta;
+	bool ret;
+
+	if (health->last_reset_req != 0) {
+		delta = ticks - health->last_reset_req;
+		delta /= hz;
+		ret = delta >= sw_reset_to;
+	} else {
+		ret = true;
+	}
+
+	/*
+	 * In principle, ticks may be 0. Store -1 (off by one tick) in
+	 * that case, since 0 would guarantee a reset on the next request.
+	 */
+	health->last_reset_req = ticks ? : -1;
+	if (!ret)
+		mlx5_core_warn(dev, "Firmware reset elided due to "
+		    "auto-reset frequency threshold.\n");
+	return (ret);
+}
+
 #define MLX5_CRDUMP_WAIT_MS	60000
 #define MLX5_FW_RESET_WAIT_MS	1000
 #define MLX5_NIC_STATE_POLL_MS	5
@@ -243,7 +275,8 @@ void mlx5_enter_error_state(struct mlx5_core_dev *dev,
 	if (force)
 		goto err_state_done;
 
-	if (fatal_error == MLX5_SENSOR_FW_SYND_RFR) {
+	if (fatal_error == MLX5_SENSOR_FW_SYND_RFR &&
+	    mlx5_health_allow_reset(dev)) {
 		/* Get cr-dump and reset FW semaphore */
 		if (mlx5_core_is_pf(dev))
 			lock = lock_sem_sw_reset(dev);


More information about the svn-src-head mailing list