svn commit: r353205 - in stable/12/sys/dev/mlx5: . mlx5_core mlx5_en

Hans Petter Selasky hselasky at FreeBSD.org
Mon Oct 7 09:01:23 UTC 2019


Author: hselasky
Date: Mon Oct  7 09:01:21 2019
New Revision: 353205
URL: https://svnweb.freebsd.org/changeset/base/353205

Log:
  MFC r352966:
  Add port module event (PME) software counters in mlx5core.
  While at it, fix up the PME definitions based on the latest PRM defines.
  
  Submitted by:	slavash@
  Sponsored by:	Mellanox Technologies

Modified:
  stable/12/sys/dev/mlx5/device.h
  stable/12/sys/dev/mlx5/driver.h
  stable/12/sys/dev/mlx5/mlx5_core/mlx5_eq.c
  stable/12/sys/dev/mlx5/mlx5_core/mlx5_main.c
  stable/12/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/sys/dev/mlx5/device.h
==============================================================================
--- stable/12/sys/dev/mlx5/device.h	Mon Oct  7 09:00:08 2019	(r353204)
+++ stable/12/sys/dev/mlx5/device.h	Mon Oct  7 09:01:21 2019	(r353205)
@@ -537,7 +537,7 @@ enum {
 	MLX5_MODULE_STATUS_PLUGGED_ENABLED      = 0x1,
 	MLX5_MODULE_STATUS_UNPLUGGED            = 0x2,
 	MLX5_MODULE_STATUS_ERROR                = 0x3,
-	MLX5_MODULE_STATUS_PLUGGED_DISABLED     = 0x4,
+	MLX5_MODULE_STATUS_NUM			,
 };
 
 enum {
@@ -549,7 +549,7 @@ enum {
 	MLX5_MODULE_EVENT_ERROR_UNSUPPORTED_CABLE                     = 0x5,
 	MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE                      = 0x6,
 	MLX5_MODULE_EVENT_ERROR_CABLE_IS_SHORTED                      = 0x7,
-	MLX5_MODULE_EVENT_ERROR_PCIE_SYSTEM_POWER_SLOT_EXCEEDED       = 0xc,
+	MLX5_MODULE_EVENT_ERROR_NUM		                      ,
 };
 
 struct mlx5_eqe_port_module_event {

Modified: stable/12/sys/dev/mlx5/driver.h
==============================================================================
--- stable/12/sys/dev/mlx5/driver.h	Mon Oct  7 09:00:08 2019	(r353204)
+++ stable/12/sys/dev/mlx5/driver.h	Mon Oct  7 09:01:21 2019	(r353205)
@@ -569,6 +569,11 @@ struct mlx5_rl_table {
 };
 #endif
 
+struct mlx5_pme_stats {
+	u64			status_counters[MLX5_MODULE_STATUS_NUM];
+	u64			error_counters[MLX5_MODULE_EVENT_ERROR_NUM];
+};
+
 struct mlx5_priv {
 	char			name[MLX5_MAX_NAME_LEN];
 	struct mlx5_eq_table	eq_table;
@@ -624,6 +629,7 @@ struct mlx5_priv {
 #ifdef RATELIMIT
 	struct mlx5_rl_table	rl_table;
 #endif
+	struct mlx5_pme_stats pme_stats;
 };
 
 enum mlx5_device_state {

Modified: stable/12/sys/dev/mlx5/mlx5_core/mlx5_eq.c
==============================================================================
--- stable/12/sys/dev/mlx5/mlx5_core/mlx5_eq.c	Mon Oct  7 09:00:08 2019	(r353204)
+++ stable/12/sys/dev/mlx5/mlx5_core/mlx5_eq.c	Mon Oct  7 09:01:21 2019	(r353205)
@@ -639,9 +639,9 @@ static const char *mlx5_port_module_event_error_type_t
 {
 	switch (error_type) {
 	case MLX5_MODULE_EVENT_ERROR_POWER_BUDGET_EXCEEDED:
-		return "Power Budget Exceeded";
+		return "Power budget exceeded";
 	case MLX5_MODULE_EVENT_ERROR_LONG_RANGE_FOR_NON_MLNX_CABLE_MODULE:
-		return "Long Range for non MLNX cable/module";
+		return "Long Range for non MLNX cable";
 	case MLX5_MODULE_EVENT_ERROR_BUS_STUCK:
 		return "Bus stuck(I2C or data shorted)";
 	case MLX5_MODULE_EVENT_ERROR_NO_EEPROM_RETRY_TIMEOUT:
@@ -649,18 +649,11 @@ static const char *mlx5_port_module_event_error_type_t
 	case MLX5_MODULE_EVENT_ERROR_ENFORCE_PART_NUMBER_LIST:
 		return "Enforce part number list";
 	case MLX5_MODULE_EVENT_ERROR_UNSUPPORTED_CABLE:
-		return "Unsupported Cable";
+		return "Unknown identifier";
 	case MLX5_MODULE_EVENT_ERROR_HIGH_TEMPERATURE:
 		return "High Temperature";
 	case MLX5_MODULE_EVENT_ERROR_CABLE_IS_SHORTED:
-		return "Cable is shorted";
-	case MLX5_MODULE_EVENT_ERROR_PCIE_SYSTEM_POWER_SLOT_EXCEEDED:
-		return "One or more network ports have been powered "
-			"down due to insufficient/unadvertised power on "
-			"the PCIe slot. Please refer to the card's user "
-			"manual for power specifications or contact "
-			"Mellanox support.";
-
+		return "Bad or shorted cable/module";
 	default:
 		return "Unknown error type";
 	}
@@ -686,29 +679,36 @@ static void mlx5_port_module_event(struct mlx5_core_de
 
 	module_num = (unsigned int)module_event_eqe->module;
 	module_status = (unsigned int)module_event_eqe->module_status &
-			PORT_MODULE_EVENT_MODULE_STATUS_MASK;
+	    PORT_MODULE_EVENT_MODULE_STATUS_MASK;
 	error_type = (unsigned int)module_event_eqe->error_type &
-		     PORT_MODULE_EVENT_ERROR_TYPE_MASK;
+	    PORT_MODULE_EVENT_ERROR_TYPE_MASK;
 
+	if (module_status < MLX5_MODULE_STATUS_NUM)
+		dev->priv.pme_stats.status_counters[module_status]++;
 	switch (module_status) {
 	case MLX5_MODULE_STATUS_PLUGGED_ENABLED:
-		device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: plugged and enabled\n", module_num);
+		device_printf((&pdev->dev)->bsddev,
+		    "INFO: Module %u, status: plugged and enabled\n",
+		    module_num);
 		break;
 
 	case MLX5_MODULE_STATUS_UNPLUGGED:
-		device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: unplugged\n", module_num);
+		device_printf((&pdev->dev)->bsddev,
+		    "INFO: Module %u, status: unplugged\n", module_num);
 		break;
 
 	case MLX5_MODULE_STATUS_ERROR:
-		device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: error, %s\n", module_num, mlx5_port_module_event_error_type_to_string(error_type));
+		device_printf((&pdev->dev)->bsddev,
+		    "ERROR: Module %u, status: error, %s\n",
+		    module_num,
+		    mlx5_port_module_event_error_type_to_string(error_type));
+		if (error_type < MLX5_MODULE_EVENT_ERROR_NUM)
+			dev->priv.pme_stats.error_counters[error_type]++;
 		break;
 
-	case MLX5_MODULE_STATUS_PLUGGED_DISABLED:
-		device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, status: plugged but disabled\n", module_num);
-		break;
-
 	default:
-		device_printf((&pdev->dev)->bsddev, "INFO: ""Module %u, unknown status\n", module_num);
+		device_printf((&pdev->dev)->bsddev,
+		    "INFO: Module %u, unknown status\n", module_num);
 	}
 	/* store module status */
 	if (module_num < MLX5_MAX_PORTS)

Modified: stable/12/sys/dev/mlx5/mlx5_core/mlx5_main.c
==============================================================================
--- stable/12/sys/dev/mlx5/mlx5_core/mlx5_main.c	Mon Oct  7 09:00:08 2019	(r353204)
+++ stable/12/sys/dev/mlx5/mlx5_core/mlx5_main.c	Mon Oct  7 09:01:21 2019	(r353205)
@@ -1244,13 +1244,31 @@ struct mlx5_core_event_handler {
 		      void *data);
 };
 
+#define	MLX5_STATS_DESC(a, b, c, d, e, ...) d, e,
+
+#define	MLX5_PORT_MODULE_ERROR_STATS(m)				\
+m(+1, u64, power_budget_exceeded, "power_budget", "Module Power Budget Exceeded") \
+m(+1, u64, long_range, "long_range", "Module Long Range for non MLNX cable/module") \
+m(+1, u64, bus_stuck, "bus_stuck", "Module Bus stuck(I2C or data shorted)") \
+m(+1, u64, no_eeprom, "no_eeprom", "No EEPROM/retry timeout") \
+m(+1, u64, enforce_part_number, "enforce_part_number", "Module Enforce part number list") \
+m(+1, u64, unknown_id, "unknown_id", "Module Unknown identifier") \
+m(+1, u64, high_temp, "high_temp", "Module High Temperature") \
+m(+1, u64, cable_shorted, "cable_shorted", "Module Cable is shorted")
+
+static const char *mlx5_pme_err_desc[] = {
+	MLX5_PORT_MODULE_ERROR_STATS(MLX5_STATS_DESC)
+};
+
 static int init_one(struct pci_dev *pdev,
 		    const struct pci_device_id *id)
 {
 	struct mlx5_core_dev *dev;
 	struct mlx5_priv *priv;
 	device_t bsddev = pdev->dev.bsddev;
-	int err;
+	int i,err;
+	struct sysctl_oid *pme_sysctl_node;
+	struct sysctl_oid *pme_err_sysctl_node;
 
 	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 	priv = &dev->priv;
@@ -1282,6 +1300,41 @@ static int init_one(struct pci_dev *pdev,
 	    OID_AUTO, "power_value", CTLFLAG_RD, &dev->pwr_value, 0,
 	    "Current power value in Watts");
 
+	pme_sysctl_node = SYSCTL_ADD_NODE(&dev->sysctl_ctx,
+	    SYSCTL_CHILDREN(device_get_sysctl_tree(bsddev)),
+	    OID_AUTO, "pme_stats", CTLFLAG_RD, NULL,
+	    "Port module event statistics");
+	if (pme_sysctl_node == NULL) {
+		err = -ENOMEM;
+		goto clean_sysctl_ctx;
+	}
+	pme_err_sysctl_node = SYSCTL_ADD_NODE(&dev->sysctl_ctx,
+	    SYSCTL_CHILDREN(pme_sysctl_node),
+	    OID_AUTO, "errors", CTLFLAG_RD, NULL,
+	    "Port module event error statistics");
+	if (pme_err_sysctl_node == NULL) {
+		err = -ENOMEM;
+		goto clean_sysctl_ctx;
+	}
+	SYSCTL_ADD_U64(&dev->sysctl_ctx,
+	    SYSCTL_CHILDREN(pme_sysctl_node), OID_AUTO,
+	    "module_plug", CTLFLAG_RD | CTLFLAG_MPSAFE,
+	    &dev->priv.pme_stats.status_counters[MLX5_MODULE_STATUS_PLUGGED_ENABLED],
+	    0, "Number of time module plugged");
+	SYSCTL_ADD_U64(&dev->sysctl_ctx,
+	    SYSCTL_CHILDREN(pme_sysctl_node), OID_AUTO,
+	    "module_unplug", CTLFLAG_RD | CTLFLAG_MPSAFE,
+	    &dev->priv.pme_stats.status_counters[MLX5_MODULE_STATUS_UNPLUGGED],
+	    0, "Number of time module unplugged");
+	for (i = 0 ; i < MLX5_MODULE_EVENT_ERROR_NUM; i++) {
+		SYSCTL_ADD_U64(&dev->sysctl_ctx,
+		    SYSCTL_CHILDREN(pme_err_sysctl_node), OID_AUTO,
+		    mlx5_pme_err_desc[2 * i], CTLFLAG_RD | CTLFLAG_MPSAFE,
+		    &dev->priv.pme_stats.error_counters[i],
+		    0, mlx5_pme_err_desc[2 * i + 1]);
+	}
+
+
 	INIT_LIST_HEAD(&priv->ctx_list);
 	spin_lock_init(&priv->ctx_lock);
 	mutex_init(&dev->pci_status_mutex);
@@ -1320,8 +1373,9 @@ clean_health:
 close_pci:
 	mlx5_pci_close(dev, priv);
 clean_dev:
-	sysctl_ctx_free(&dev->sysctl_ctx);
 	mtx_destroy(&dev->dump_lock);
+clean_sysctl_ctx:
+	sysctl_ctx_free(&dev->sysctl_ctx);
 	kfree(dev);
 	return err;
 }

Modified: stable/12/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==============================================================================
--- stable/12/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	Mon Oct  7 09:00:08 2019	(r353204)
+++ stable/12/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	Mon Oct  7 09:01:21 2019	(r353205)
@@ -3387,8 +3387,7 @@ out:
 		}
 		/* Check if module is present before doing an access */
 		module_status = mlx5_query_module_status(priv->mdev, module_num);
-		if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED &&
-		    module_status != MLX5_MODULE_STATUS_PLUGGED_DISABLED) {
+		if (module_status != MLX5_MODULE_STATUS_PLUGGED_ENABLED) {
 			error = EINVAL;
 			goto err_i2c;
 		}


More information about the svn-src-all mailing list