svn commit: r334332 - in head: share/man/man4 sys/conf sys/dev/mlx5 sys/dev/mlx5/mlx5_core sys/dev/mlx5/mlx5_en sys/modules/mlx5 sys/modules/mlx5en sys/modules/mlx5ib

Hans Petter Selasky hselasky at FreeBSD.org
Tue May 29 14:05:00 UTC 2018


Author: hselasky
Date: Tue May 29 14:04:57 2018
New Revision: 334332
URL: https://svnweb.freebsd.org/changeset/base/334332

Log:
  Add support for hardware rate limiting to mlx5en(4).
  
  The hardware rate limiting feature is enabled by the RATELIMIT kernel
  option. Please refer to the txrtlmt option in ifconfig(8) and to the
  SO_MAX_PACING_RATE socket option for more information. This
  feature is compatible with hardware TCP segmentation offload, TSO.
  
  A set of sysctl(8) knobs under dev.mce.<N>.rate_limit is provided to
  set up the rate limit table and also to fine-tune various rate limit
  related parameters.
  
  Sponsored by:	Mellanox Technologies

Added:
  head/sys/dev/mlx5/mlx5_core/mlx5_rl.c   (contents, props changed)
  head/sys/dev/mlx5/mlx5_en/en_rl.h   (contents, props changed)
  head/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c   (contents, props changed)
Modified:
  head/share/man/man4/mlx5en.4
  head/sys/conf/files
  head/sys/conf/options
  head/sys/dev/mlx5/driver.h
  head/sys/dev/mlx5/mlx5_core/mlx5_main.c
  head/sys/dev/mlx5/mlx5_en/en.h
  head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
  head/sys/dev/mlx5/mlx5_en/mlx5_en_tx.c
  head/sys/modules/mlx5/Makefile
  head/sys/modules/mlx5en/Makefile
  head/sys/modules/mlx5ib/Makefile

Modified: head/share/man/man4/mlx5en.4
==============================================================================
--- head/share/man/man4/mlx5en.4	Tue May 29 14:04:50 2018	(r334331)
+++ head/share/man/man4/mlx5en.4	Tue May 29 14:04:57 2018	(r334332)
@@ -24,18 +24,19 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd December 3, 2015
+.Dd May 29, 2018
 .Dt MLX5EN 4
 .Os
 .Sh NAME
 .Nm mlx5en
-.Nd "Mellanox ConnectX-4 and ConnectX-4 LX based 100Gb, 50Gb, 40Gb, 25Gb and 10Gb Ethernet adapter driver"
+.Nd "Mellanox ConnectX-4, ConnectX-4 LX and ConnectX-5 based 100Gb, 50Gb, 40Gb, 25Gb and 10Gb Ethernet adapter driver"
 .Sh SYNOPSIS
 To compile this driver into the kernel,
 place the following lines in your
 kernel configuration file:
 .Bd -ragged -offset indent
 .Cd "options COMPAT_LINUXKPI"
+.Cd "options RATELIMIT"
 .Cd "device mlx5"
 .Cd "device mlx5en"
 .Ed
@@ -56,11 +57,12 @@ mlx5en_load="YES"
 The
 .Nm
 driver provides support for PCI Express Ethernet adapters based on
-ConnectX-4 and ConnectX-4 LX.
+ConnectX-4, ConnectX-4 LX and ConnectX-5.
 The driver supports Jumbo Frames, Transmit/Receive checksum offload,
 TCP segmentation offload (TSO), Large Receive Offload (LRO),
 HW Large Receive Offload (HW LRO), VLAN tag insertion/extraction,
-VLAN checksum offload, VLAN TSO, and Receive Side Steering (RSS).
+VLAN checksum offload, VLAN TSO, hardware rate limiting (TXRTLMT)
+and Receive Side Steering (RSS).
 .br
 The network interface is named mce.
 .br
@@ -74,6 +76,7 @@ For more information on configuring this device, see
 The
 .Nm
 driver supports 100Gb, 50Gb, 40Gb, 25Gb and 10Gb Ethernet adapters.
+ConnectX-5 supports:10/20/25/40/50/56/100Gb/s speeds.
 ConnectX-4 supports:10/20/25/40/50/56/100Gb/s speeds.
 ConnectX-4 LX supports:10/25/40/50Gb/s speeds (and reduced power consumption) :
 .Pp

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Tue May 29 14:04:50 2018	(r334331)
+++ head/sys/conf/files	Tue May 29 14:04:57 2018	(r334332)
@@ -4775,6 +4775,8 @@ dev/mlx5/mlx5_core/mlx5_port.c			optional mlx5 pci	\
 	compile-with "${OFED_C}"
 dev/mlx5/mlx5_core/mlx5_qp.c			optional mlx5 pci	\
 	compile-with "${OFED_C}"
+dev/mlx5/mlx5_core/mlx5_rl.c			optional mlx5 pci	\
+	compile-with "${OFED_C}"
 dev/mlx5/mlx5_core/mlx5_srq.c			optional mlx5 pci	\
 	compile-with "${OFED_C}"
 dev/mlx5/mlx5_core/mlx5_transobj.c		optional mlx5 pci	\
@@ -4797,6 +4799,8 @@ dev/mlx5/mlx5_en/mlx5_en_tx.c			optional mlx5en pci in
 dev/mlx5/mlx5_en/mlx5_en_flow_table.c		optional mlx5en pci inet inet6	\
 	compile-with "${OFED_C}"
 dev/mlx5/mlx5_en/mlx5_en_rx.c			optional mlx5en pci inet inet6	\
+	compile-with "${OFED_C}"
+dev/mlx5/mlx5_en/mlx5_en_rl.c			optional mlx5en pci inet inet6	\
 	compile-with "${OFED_C}"
 dev/mlx5/mlx5_en/mlx5_en_txrx.c			optional mlx5en pci inet inet6	\
 	compile-with "${OFED_C}"

Modified: head/sys/conf/options
==============================================================================
--- head/sys/conf/options	Tue May 29 14:04:50 2018	(r334331)
+++ head/sys/conf/options	Tue May 29 14:04:57 2018	(r334332)
@@ -420,6 +420,7 @@ BOOTP_WIRED_TO		opt_bootp.h
 DEVICE_POLLING
 DUMMYNET		opt_ipdn.h
 RATELIMIT		opt_ratelimit.h
+RATELIMIT_DEBUG		opt_ratelimit.h
 INET			opt_inet.h
 INET6			opt_inet6.h
 IPDIVERT

Modified: head/sys/dev/mlx5/driver.h
==============================================================================
--- head/sys/dev/mlx5/driver.h	Tue May 29 14:04:50 2018	(r334331)
+++ head/sys/dev/mlx5/driver.h	Tue May 29 14:04:57 2018	(r334332)
@@ -28,6 +28,8 @@
 #ifndef MLX5_DRIVER_H
 #define MLX5_DRIVER_H
 
+#include "opt_ratelimit.h"
+
 #include <linux/kernel.h>
 #include <linux/completion.h>
 #include <linux/pci.h>
@@ -500,7 +502,11 @@ struct mlx5_core_health {
 	struct delayed_work		recover_work;
 };
 
+#ifdef RATELIMIT
+#define	MLX5_CQ_LINEAR_ARRAY_SIZE	(128 * 1024)
+#else
 #define	MLX5_CQ_LINEAR_ARRAY_SIZE	1024
+#endif
 
 struct mlx5_cq_linear_array_entry {
 	spinlock_t	lock;
@@ -540,6 +546,23 @@ struct mlx5_irq_info {
 	char name[MLX5_MAX_IRQ_NAME];
 };
 
+#ifdef RATELIMIT
+struct mlx5_rl_entry {
+	u32			rate;		/* rate value; 0 = unused */
+	u16			burst;		/* burst bound for the rate */
+	u16			index;		/* HW rate table index */
+	u32			refcount;	/* users of this entry */
+};
+
+struct mlx5_rl_table {
+	struct mutex		rl_lock;	/* taken by rate add/remove */
+	u16			max_size;	/* entries; 0 = unsupported */
+	u32			max_rate;	/* upper bound for rates */
+	u32			min_rate;	/* lower bound for rates */
+	struct mlx5_rl_entry   *rl_entry;	/* array of max_size entries */
+};
+#endif
+
 struct mlx5_priv {
 	char			name[MLX5_MAX_NAME_LEN];
 	struct mlx5_eq_table	eq_table;
@@ -592,6 +615,9 @@ struct mlx5_priv {
 	struct list_head        ctx_list;
 	spinlock_t              ctx_lock;
 	unsigned long		pci_dev_data;
+#ifdef RATELIMIT
+	struct mlx5_rl_table	rl_table;
+#endif
 };
 
 enum mlx5_device_state {
@@ -1084,5 +1110,17 @@ static inline int mlx5_core_is_pf(struct mlx5_core_dev
 {
 	return !(dev->priv.pci_dev_data & MLX5_PCI_DEV_IS_VF);
 }
+#ifdef RATELIMIT
+int mlx5_init_rl_table(struct mlx5_core_dev *dev);
+void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
+int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst, u16 *index);
+void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst);
+bool mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u32 rate, u32 burst);
+
+static inline bool mlx5_rl_is_supported(struct mlx5_core_dev *dev)
+{
+	return (dev->priv.rl_table.max_size != 0);	/* empty = no support */
+}
+#endif
 
 #endif /* MLX5_DRIVER_H */

Modified: head/sys/dev/mlx5/mlx5_core/mlx5_main.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_core/mlx5_main.c	Tue May 29 14:04:50 2018	(r334331)
+++ head/sys/dev/mlx5/mlx5_core/mlx5_main.c	Tue May 29 14:04:57 2018	(r334332)
@@ -905,8 +905,23 @@ static int mlx5_init_once(struct mlx5_core_dev *dev, s
 	mlx5_init_srq_table(dev);
 	mlx5_init_mr_table(dev);
 
+#ifdef RATELIMIT
+	err = mlx5_init_rl_table(dev);
+	if (err) {
+		dev_err(&pdev->dev, "Failed to init rate limiting\n");
+		goto err_tables_cleanup;
+	}
+#endif
 	return 0;
 
+#ifdef RATELIMIT
+err_tables_cleanup:
+	mlx5_cleanup_mr_table(dev);
+	mlx5_cleanup_srq_table(dev);
+	mlx5_cleanup_qp_table(dev);
+	mlx5_cleanup_cq_table(dev);
+#endif
+
 err_eq_cleanup:
 	mlx5_eq_cleanup(dev);
 
@@ -916,6 +931,9 @@ out:
 
 static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
 {
+#ifdef RATELIMIT
+	mlx5_cleanup_rl_table(dev);
+#endif
 	mlx5_cleanup_mr_table(dev);
 	mlx5_cleanup_srq_table(dev);
 	mlx5_cleanup_qp_table(dev);

Added: head/sys/dev/mlx5/mlx5_core/mlx5_rl.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/dev/mlx5/mlx5_core/mlx5_rl.c	Tue May 29 14:04:57 2018	(r334332)
@@ -0,0 +1,206 @@
+/*-
+ * Copyright (c) 2013-2017, Mellanox Technologies, Ltd.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <dev/mlx5/driver.h>
+#include "mlx5_core.h"
+
+#ifdef RATELIMIT
+
+/*
+ * Look up the table slot for the given rate and burst pair. A slot
+ * already holding exactly this pair is returned immediately; failing
+ * that, the lowest-indexed unused slot (rate of zero) is returned.
+ * NULL is returned when there is neither a match nor an unused slot.
+ */
+static struct mlx5_rl_entry *find_rl_entry(struct mlx5_rl_table *table,
+					   u32 rate, u16 burst)
+{
+	struct mlx5_rl_entry *first_free = NULL;
+	struct mlx5_rl_entry *cur;
+	u16 n;
+
+	for (n = 0; n < table->max_size; n++) {
+		cur = &table->rl_entry[n];
+		if (cur->rate == rate && cur->burst == burst)
+			return cur;
+		if (cur->rate == 0 && first_free == NULL)
+			first_free = cur;
+	}
+	return first_free;
+}
+
+static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev,
+				   u32 rate, u32 burst, u16 index)
+{
+	u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {0};
+	u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)] = {0};
+
+	MLX5_SET(set_rate_limit_in, in, opcode, MLX5_CMD_OP_SET_RATE_LIMIT);
+	MLX5_SET(set_rate_limit_in, in, rate_limit, rate);
+	MLX5_SET(set_rate_limit_in, in, rate_limit_index, index);
+
+	/* the burst bound is only filled in when the QoS capability is set */
+	if (MLX5_CAP_QOS(dev, packet_pacing_burst_bound))
+		MLX5_SET(set_rate_limit_in, in, burst_upper_bound, burst);
+
+	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+}
+
+bool mlx5_rl_is_in_range(const struct mlx5_core_dev *dev, u32 rate, u32 burst)
+{
+	const struct mlx5_rl_table *t = &dev->priv.rl_table;
+
+	/* rate must lie in [min_rate, max_rate]; burst must fit in 16 bits */
+	return (rate >= t->min_rate && rate <= t->max_rate && burst <= 65535);
+}
+EXPORT_SYMBOL(mlx5_rl_is_in_range);
+
+int mlx5_rl_add_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst, u16 *index)
+{
+	struct mlx5_rl_table *rlt = &dev->priv.rl_table;
+	struct mlx5_rl_entry *slot;
+	int ret;
+
+	mutex_lock(&rlt->rl_lock);
+
+	/* zero means "unlimited" and can never be added to the table */
+	if (rate == 0 || !mlx5_rl_is_in_range(dev, rate, burst)) {
+		mlx5_core_err(dev, "Invalid rate: %u, should be %u to %u\n",
+			      rate, rlt->min_rate, rlt->max_rate);
+		ret = -ERANGE;
+		goto unlock;
+	}
+
+	slot = find_rl_entry(rlt, rate, burst);
+	if (slot == NULL) {
+		mlx5_core_err(dev, "Max number of %u rates reached\n",
+			      rlt->max_size);
+		ret = -ENOSPC;
+		goto unlock;
+	}
+
+	if (slot->refcount == 0) {
+		/* unused slot: program the new rate into the hardware */
+		ret = mlx5_set_rate_limit_cmd(dev, rate, burst, slot->index);
+		if (ret != 0) {
+			mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n",
+				      rate, ret);
+			goto unlock;
+		}
+		slot->rate = rate;
+		slot->burst = burst;
+	} else if (slot->refcount == 0xFFFFFFFFU) {
+		/* one more user would wrap the reference counter */
+		ret = -ENOMEM;
+		goto unlock;
+	}
+	/* either the first user of a new rate or one more user of an old one */
+	slot->refcount++;
+	*index = slot->index;
+	ret = 0;
+unlock:
+	mutex_unlock(&rlt->rl_lock);
+	return ret;
+}
+EXPORT_SYMBOL(mlx5_rl_add_rate);
+
+void mlx5_rl_remove_rate(struct mlx5_core_dev *dev, u32 rate, u32 burst)
+{
+	struct mlx5_rl_table *rlt = &dev->priv.rl_table;
+	struct mlx5_rl_entry *slot;
+
+	/* the unlimited rate, zero, never occupies a table slot */
+	if (rate == 0)
+		return;
+
+	mutex_lock(&rlt->rl_lock);
+
+	slot = find_rl_entry(rlt, rate, burst);
+	if (slot == NULL || slot->refcount == 0) {
+		/* no match; find_rl_entry() may hand back an unused slot */
+		mlx5_core_warn(dev, "Rate %u is not configured\n", rate);
+	} else if (--slot->refcount == 0) {
+		/*
+		 * Last user is gone: clear the hardware entry and mark
+		 * the slot as unused. The command status is ignored.
+		 */
+		mlx5_set_rate_limit_cmd(dev, 0, 0, slot->index);
+		slot->rate = 0;
+		slot->burst = 0;
+	}
+
+	mutex_unlock(&rlt->rl_lock);
+}
+EXPORT_SYMBOL(mlx5_rl_remove_rate);
+
+int mlx5_init_rl_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_rl_table *rlt = &dev->priv.rl_table;
+	int n;
+
+	mutex_init(&rlt->rl_lock);
+
+	/* a table size of zero marks rate limiting as unsupported */
+	if (!(MLX5_CAP_GEN(dev, qos) && MLX5_CAP_QOS(dev, packet_pacing))) {
+		rlt->max_size = 0;
+		return 0;
+	}
+
+	/* hardware entry zero means unlimited rate and is not tracked */
+	rlt->max_size = MLX5_CAP_QOS(dev, packet_pacing_rate_table_size) - 1;
+	rlt->min_rate = MLX5_CAP_QOS(dev, packet_pacing_min_rate);
+	rlt->max_rate = MLX5_CAP_QOS(dev, packet_pacing_max_rate);
+
+	rlt->rl_entry = kcalloc(rlt->max_size, sizeof(*rlt->rl_entry),
+				GFP_KERNEL);
+	if (rlt->rl_entry == NULL)
+		return -ENOMEM;
+
+	/* software slot "n" maps to hardware rate table index "n + 1" */
+	for (n = 0; n != rlt->max_size; n++)
+		rlt->rl_entry[n].index = n + 1;
+
+	return 0;
+}
+
+void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev)
+{
+	struct mlx5_rl_table *rlt = &dev->priv.rl_table;
+	int n;
+
+	/* reset every hardware entry which still has a rate programmed */
+	for (n = 0; n < rlt->max_size; n++) {
+		if (rlt->rl_entry[n].rate == 0)
+			continue;
+		mlx5_set_rate_limit_cmd(dev, 0, 0, rlt->rl_entry[n].index);
+	}
+	kfree(rlt->rl_entry);
+}
+
+#endif

Modified: head/sys/dev/mlx5/mlx5_en/en.h
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/en.h	Tue May 29 14:04:50 2018	(r334331)
+++ head/sys/dev/mlx5/mlx5_en/en.h	Tue May 29 14:04:57 2018	(r334332)
@@ -49,6 +49,7 @@
 #include <netinet/udp.h>
 #include <net/ethernet.h>
 #include <sys/buf_ring.h>
+#include <sys/kthread.h>
 
 #include "opt_rss.h"
 
@@ -711,6 +712,10 @@ struct mlx5e_flow_tables {
 	struct mlx5e_flow_table inner_rss;
 };
 
+#ifdef RATELIMIT
+#include "en_rl.h"
+#endif
+
 #define	MLX5E_TSTMP_PREC 10
 
 struct mlx5e_clbr_point {
@@ -778,6 +783,9 @@ struct mlx5e_priv {
 	int	media_active_last;
 
 	struct callout watchdog;
+#ifdef RATELIMIT
+	struct mlx5e_rl_priv_data rl;
+#endif
 
 	struct callout tstmp_clbr;
 	int	clbr_done;

Added: head/sys/dev/mlx5/mlx5_en/en_rl.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/dev/mlx5/mlx5_en/en_rl.h	Tue May 29 14:04:57 2018	(r334332)
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef __MLX5_EN_RL_H__
+#define __MLX5_EN_RL_H__
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/proc.h>
+#include <sys/condvar.h>
+#include <sys/interrupt.h>
+#include <sys/unistd.h>
+
+#include <sys/queue.h>
+
+#define	MLX5E_RL_MAX_WORKERS		128	/* limited by Toeplitz hash */
+#define	MLX5E_RL_MAX_TX_RATES		(64 * 1024)	/* software limit */
+#define	MLX5E_RL_DEF_SQ_PER_WORKER	(12 * 1024)	/* software limit */
+#define	MLX5E_RL_MAX_SQS		(120 * 1024)	/* software limit */
+
+#define	MLX5E_RL_TX_COAL_USEC_DEFAULT	32
+#define	MLX5E_RL_TX_COAL_PKTS_DEFAULT	4
+#define	MLX5E_RL_TX_COAL_MODE_DEFAULT	0
+#define	MLX5E_RL_TX_COMP_FACT_DEFAULT	1
+
+#define	MLX5E_RL_WORKER_LOCK(rlw)		mtx_lock(&(rlw)->mtx)
+#define	MLX5E_RL_WORKER_UNLOCK(rlw)		mtx_unlock(&(rlw)->mtx)
+
+#define	MLX5E_RL_RLOCK(rl) sx_slock(&(rl)->rl_sxlock)
+#define	MLX5E_RL_RUNLOCK(rl) sx_sunlock(&(rl)->rl_sxlock)
+
+#define	MLX5E_RL_WLOCK(rl) sx_xlock(&(rl)->rl_sxlock)
+#define	MLX5E_RL_WUNLOCK(rl) sx_xunlock(&(rl)->rl_sxlock)
+
+#define	MLX5E_RL_PARAMS(m) \
+  m(+1, u64 tx_queue_size, "tx_queue_size", "Default send queue size") \
+  m(+1, u64 tx_coalesce_usecs, "tx_coalesce_usecs", "Limit in usec for joining TX packets") \
+  m(+1, u64 tx_coalesce_pkts, "tx_coalesce_pkts", "Maximum number of TX packets to join") \
+  m(+1, u64 tx_coalesce_mode, "tx_coalesce_mode", "0: EQE mode 1: CQE mode") \
+  m(+1, u64 tx_completion_fact, "tx_completion_fact", "1..MAX: Completion event ratio") \
+  m(+1, u64 tx_completion_fact_max, "tx_completion_fact_max", "Maximum completion event ratio") \
+  m(+1, u64 tx_worker_threads_max, "tx_worker_threads_max", "Max number of TX worker threads") \
+  m(+1, u64 tx_worker_threads_def, "tx_worker_threads_def", "Default number of TX worker threads") \
+  m(+1, u64 tx_channels_per_worker_max, "tx_channels_per_worker_max", "Max number of TX channels per worker") \
+  m(+1, u64 tx_channels_per_worker_def, "tx_channels_per_worker_def", "Default number of TX channels per worker") \
+  m(+1, u64 tx_rates_max, "tx_rates_max", "Max number of TX rates") \
+  m(+1, u64 tx_rates_def, "tx_rates_def", "Default number of TX rates") \
+  m(+1, u64 tx_limit_min, "tx_limit_min", "Minimum TX rate in bits/s") \
+  m(+1, u64 tx_limit_max, "tx_limit_max", "Maximum TX rate in bits/s") \
+  m(+1, u64 tx_burst_size, "tx_burst_size", "Current burst size in number of packets. A value of zero means use firmware default.") \
+  m(+1, u64 tx_burst_size_max, "tx_burst_size_max", "Maximum burst size in number of packets") \
+  m(+1, u64 tx_burst_size_min, "tx_burst_size_min", "Minimum burst size in number of packets")
+
+#define	MLX5E_RL_PARAMS_NUM (0 MLX5E_RL_PARAMS(MLX5E_STATS_COUNT))
+
+#define MLX5E_RL_STATS(m) \
+  m(+1, u64 tx_allocate_resource_failure, "tx_allocate_resource_failure", "Number of times firmware resource allocation failed") \
+  m(+1, u64 tx_add_new_rate_failure, "tx_add_new_rate_failure", "Number of times adding a new firmware rate failed") \
+  m(+1, u64 tx_modify_rate_failure, "tx_modify_rate_failure", "Number of times modifying a firmware rate failed") \
+  m(+1, u64 tx_active_connections, "tx_active_connections", "Number of active connections") \
+  m(+1, u64 tx_open_queues, "tx_open_queues", "Number of open TX queues") \
+  m(+1, u64 tx_available_resource_failure, "tx_available_resource_failure", "Number of times TX resources were not available")
+
+#define MLX5E_RL_STATS_NUM (0 MLX5E_RL_STATS(MLX5E_STATS_COUNT))
+
+#define	MLX5E_RL_TABLE_PARAMS(m) \
+  m(+1, u64 tx_limit_add, "tx_limit_add", "Add TX rate limit in bits/s to empty slot") \
+  m(+1, u64 tx_limit_clr, "tx_limit_clr", "Clear all TX rates in table") \
+  m(+1, u64 tx_allowed_deviation, "tx_allowed_deviation", "Relative rate deviation allowed in 1/1000") \
+  m(+1, u64 tx_allowed_deviation_min, "tx_allowed_deviation_min", "Minimum allowed rate deviation in 1/1000") \
+  m(+1, u64 tx_allowed_deviation_max, "tx_allowed_deviation_max", "Maximum allowed rate deviation in 1/1000")
+
+#define	MLX5E_RL_TABLE_PARAMS_NUM (0 MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_COUNT))
+
+#define	MLX5E_RL_PARAMS_INDEX(n)			\
+    (__offsetof(struct mlx5e_rl_params, n) / sizeof(uint64_t))
+
+struct mlx5e_priv;
+
+/* Indicates channel's state */
+enum {
+	MLX5E_RL_ST_FREE,
+	MLX5E_RL_ST_USED,
+	MLX5E_RL_ST_MODIFY,
+	MLX5E_RL_ST_DESTROY,
+};
+
+struct mlx5e_rl_stats {
+	u64	arg [0];
+	MLX5E_RL_STATS(MLX5E_STATS_VAR)
+};
+
+struct mlx5e_rl_params {
+	u64	arg [0];
+	MLX5E_RL_PARAMS(MLX5E_STATS_VAR)
+	u64	table_arg [0];
+	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_VAR)
+};
+
+struct mlx5e_rl_channel_param {
+	struct mlx5e_sq_param sq;
+	struct mlx5e_cq_param cq;
+};
+
+struct mlx5e_rl_channel {
+	struct m_snd_tag m_snd_tag;
+	STAILQ_ENTRY(mlx5e_rl_channel) entry;
+	struct mlx5e_sq * volatile sq;
+	struct mlx5e_rl_worker *worker;
+	uint64_t new_rate;
+	uint64_t init_rate;
+	uint64_t last_rate;
+	uint16_t last_burst;
+	uint16_t state;
+};
+
+struct mlx5e_rl_worker {
+	struct mtx mtx;
+	struct cv cv;
+	STAILQ_HEAD(, mlx5e_rl_channel) index_list_head;
+	STAILQ_HEAD(, mlx5e_rl_channel) process_head;
+	struct mlx5e_priv *priv;
+	struct mlx5e_rl_channel *channels;
+	unsigned worker_done;
+};
+
+struct mlx5e_rl_priv_data {
+	struct sx rl_sxlock;
+	struct sysctl_ctx_list ctx;
+	struct mlx5e_rl_channel_param chan_param;
+	struct mlx5e_rl_params param;
+	struct mlx5e_rl_stats stats;
+	struct mlx5_uar sq_uar;
+	struct mlx5e_rl_worker *workers;
+	struct mlx5e_priv *priv;
+	uint64_t *rate_limit_table;
+	unsigned opened;
+	uint32_t tisn;
+};
+
+int mlx5e_rl_init(struct mlx5e_priv *priv);
+void mlx5e_rl_cleanup(struct mlx5e_priv *priv);
+if_snd_tag_alloc_t mlx5e_rl_snd_tag_alloc;
+if_snd_tag_modify_t mlx5e_rl_snd_tag_modify;
+if_snd_tag_query_t mlx5e_rl_snd_tag_query;
+if_snd_tag_free_t mlx5e_rl_snd_tag_free;
+
+#endif		/* __MLX5_EN_RL_H__ */

Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	Tue May 29 14:04:50 2018	(r334331)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c	Tue May 29 14:04:57 2018	(r334332)
@@ -3507,6 +3507,13 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
 	ifp->if_capabilities |= IFCAP_LRO;
 	ifp->if_capabilities |= IFCAP_TSO | IFCAP_VLAN_HWTSO;
 	ifp->if_capabilities |= IFCAP_HWSTATS | IFCAP_HWRXTSTMP;
+#ifdef RATELIMIT
+	ifp->if_capabilities |= IFCAP_TXRTLMT;
+	ifp->if_snd_tag_alloc = mlx5e_rl_snd_tag_alloc;
+	ifp->if_snd_tag_free = mlx5e_rl_snd_tag_free;
+	ifp->if_snd_tag_modify = mlx5e_rl_snd_tag_modify;
+	ifp->if_snd_tag_query = mlx5e_rl_snd_tag_query;
+#endif
 
 	/* set TSO limits so that we don't have to drop TX packets */
 	ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
@@ -3588,6 +3595,14 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
 		random_ether_addr(dev_addr);
 		if_printf(ifp, "Assigned random MAC address\n");
 	}
+#ifdef RATELIMIT
+	err = mlx5e_rl_init(priv);
+	if (err) {
+		if_printf(ifp, "%s: mlx5e_rl_init failed, %d\n",
+		    __func__, err);
+		goto err_create_mkey;
+	}
+#endif
 
 	/* set default MTU */
 	mlx5e_set_dev_port_mtu(ifp, ifp->if_mtu);
@@ -3673,6 +3688,10 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
 
 	return (priv);
 
+#ifdef RATELIMIT
+err_create_mkey:
+	mlx5_core_destroy_mkey(priv->mdev, &priv->mr);
+#endif
 err_dealloc_transport_domain:
 	mlx5_dealloc_transport_domain(mdev, priv->tdn);
 
@@ -3715,6 +3734,18 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vp
 	/* XXX wait a bit to allow IOCTL handlers to complete */
 	pause("W", hz);
 
+#ifdef RATELIMIT
+	/*
+	 * The kernel can have reference(s) via the m_snd_tag's into
+	 * the ratelimit channels, and these must go away before
+	 * detaching:
+	 */
+	while (READ_ONCE(priv->rl.stats.tx_active_connections) != 0) {
+		if_printf(priv->ifp, "Waiting for all ratelimit connections "
+		    "to terminate\n");
+		pause("W", hz);
+	}
+#endif
 	/* stop watchdog timer */
 	callout_drain(&priv->watchdog);
 
@@ -3735,6 +3766,9 @@ mlx5e_destroy_ifp(struct mlx5_core_dev *mdev, void *vp
 	ether_ifdetach(ifp);
 	if_free(ifp);
 
+#ifdef RATELIMIT
+	mlx5e_rl_cleanup(priv);
+#endif
 	/* destroy all remaining sysctl nodes */
 	if (priv->sysctl_debug)
 		sysctl_ctx_free(&priv->stats.port_stats_debug.ctx);

Added: head/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c	Tue May 29 14:04:57 2018	(r334332)
@@ -0,0 +1,1539 @@
+/*-
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include "en.h"
+
+#ifdef RATELIMIT
+
+static int mlx5e_rl_open_workers(struct mlx5e_priv *);
+static void mlx5e_rl_close_workers(struct mlx5e_priv *);
+static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
+static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
+    struct sysctl_oid *, const char *name, const char *desc);
+static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
+      struct sysctl_oid *node, const char *name, const char *desc);
+static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
+static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
+
+/*
+ * Fill in the send queue parameters shared by all rate limit channels.
+ *
+ * The work queue size is programmed as a log2 value computed from the
+ * "tx_queue_size" parameter by order_base_2(), the stride is the log2
+ * of MLX5_SEND_WQE_BB and the protection domain is taken from the
+ * owning mlx5e_priv. The work queue is marked as linear and both NUMA
+ * node hints are set to zero.
+ */
+static void
+mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
+    struct mlx5e_sq_param *param)
+{
+	void *sqc = param->sqc;
+	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
+
+	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
+	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
+	MLX5_SET(wq, wq, pd, rl->priv->pdn);
+
+	param->wq.buf_numa_node = 0;
+	param->wq.db_numa_node = 0;
+	param->wq.linear = 1;
+}
+
+/*
+ * Fill in the completion queue parameters shared by all rate limit
+ * channels. The CQ gets the same log2 size as the send queue and its
+ * moderation settings come from the "tx_coalesce_*" parameters.
+ */
+static void
+mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
+    struct mlx5e_cq_param *param)
+{
+	void *cqc = param->cqc;
+	uint8_t log_size = order_base_2(rl->param.tx_queue_size);
+	int period_mode;
+
+	/*
+	 * The START_FROM_CQE period mode is only selected when a
+	 * non-zero coalesce mode is configured and the device reports
+	 * the capability. Everything else uses START_FROM_EQE.
+	 */
+	if (rl->param.tx_coalesce_mode != 0 &&
+	    MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
+		period_mode = MLX5_CQ_PERIOD_MODE_START_FROM_CQE;
+	else
+		period_mode = MLX5_CQ_PERIOD_MODE_START_FROM_EQE;
+
+	MLX5_SET(cqc, cqc, log_cq_size, log_size);
+	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
+	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
+	MLX5_SET(cqc, cqc, cq_period_mode, period_mode);
+}
+
+/*
+ * Build the complete parameter set for a rate limit channel. The
+ * structure is zeroed first and then the send queue and completion
+ * queue parts are filled in from the current "rl->param" values.
+ */
+static void
+mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
+    struct mlx5e_rl_channel_param *cparam)
+{
+	memset(cparam, 0, sizeof(*cparam));
+
+	mlx5e_rl_build_sq_param(rl, &cparam->sq);
+	mlx5e_rl_build_cq_param(rl, &cparam->cq);
+}
+
+/*
+ * Set up the software side of a rate limit send queue: a DMA tag, the
+ * cyclic work queue and the SQ database from mlx5e_alloc_sq_db(). The
+ * UAR is not allocated here but copied from "priv->rl.sq_uar".
+ *
+ * On failure the resources set up so far are released again, in
+ * reverse order, and the error of the failing step is returned. Zero
+ * is returned on success.
+ *
+ * NOTE(review): the "ix" argument is not used by this function.
+ */
+static int
+mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
+    struct mlx5e_sq_param *param, int ix)
+{
+	struct mlx5_core_dev *mdev = priv->mdev;
+	void *sqc = param->sqc;
+	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
+	int err;
+
+	/*
+	 * Create DMA descriptor TAG. The result is negated, presumably
+	 * to match the sign convention of the other calls below (TODO
+	 * confirm).
+	 */
+	if ((err = -bus_dma_tag_create(
+	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
+	    1,				/* any alignment */
+	    0,				/* no boundary */
+	    BUS_SPACE_MAXADDR,		/* lowaddr */
+	    BUS_SPACE_MAXADDR,		/* highaddr */
+	    NULL, NULL,			/* filter, filterarg */
+	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
+	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
+	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
+	    0,				/* flags */
+	    NULL, NULL,			/* lockfunc, lockfuncarg */
+	    &sq->dma_tag)))
+		goto done;
+
+	/* use shared UAR */
+	sq->uar = priv->rl.sq_uar;
+
+	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
+	    &sq->wq_ctrl);
+	if (err)
+		goto err_free_dma_tag;
+
+	/* from here on "db" refers to the send doorbell record only */
+	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
+	/*
+	 * The sq->bf_buf_size variable is intentionally left zero so
+	 * that the doorbell writes will occur at the same memory
+	 * location.
+	 */
+
+	err = mlx5e_alloc_sq_db(sq);
+	if (err)
+		goto err_sq_wq_destroy;
+
+	sq->mkey_be = cpu_to_be32(priv->mr.key);
+	sq->ifp = priv->ifp;
+	sq->priv = priv;
+
+	return (0);
+
+	/* unwind in the reverse order of the setup above */
+err_sq_wq_destroy:
+	mlx5_wq_destroy(&sq->wq_ctrl);
+err_free_dma_tag:
+	bus_dma_tag_destroy(sq->dma_tag);
+done:
+	return (err);
+}
+
+/*
+ * Release everything mlx5e_rl_create_sq() set up for the given send
+ * queue: the SQ database, the work queue and the DMA tag.
+ *
+ * The DMA tag is destroyed last, mirroring the unwind order used by
+ * mlx5e_rl_create_sq(). Without this step the tag would be leaked
+ * every time a rate limit channel is closed, because the caller frees
+ * the send queue structure right afterwards.
+ */
+static void
+mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
+{
+
+	mlx5e_free_sq_db(sq);
+	mlx5_wq_destroy(&sq->wq_ctrl);
+	bus_dma_tag_destroy(sq->dma_tag);
+}
+
+/*
+ * Bring up a rate limit send queue in three steps: create the software
+ * resources, enable the queue using the rate limit TIS number
+ * ("priv->rl.tisn") and move it from the RST to the RDY state.
+ *
+ * Returns zero on success. On failure the steps already completed are
+ * undone and the error of the failing step is returned.
+ */
+static int
+mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
+    struct mlx5e_sq_param *param, int ix)
+{
+	int err;
+
+	err = mlx5e_rl_create_sq(priv, sq, param, ix);
+	if (err)
+		return (err);
+
+	err = mlx5e_enable_sq(sq, param, priv->rl.tisn);
+	if (err)
+		goto err_destroy_sq;
+
+	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
+	if (err)
+		goto err_disable_sq;
+
+	return (0);
+
+	/* unwind in the reverse order of the setup above */
+err_disable_sq:
+	mlx5e_disable_sq(sq);
+err_destroy_sq:
+	mlx5e_rl_destroy_sq(sq);
+
+	return (err);
+}
+
+/*
+ * Initialize the locking primitives of a freshly allocated send queue:
+ * the TX lock, the completion lock and the completion event callout,
+ * which is tied to the TX lock. The completion event factor is copied
+ * from the "tx_completion_fact" rate limit parameter.
+ */
+static void
+mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
+{
+	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
+	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);
+
+	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);
+
+	sq->cev_factor = priv->rl.param.tx_completion_fact;
+
+	/*
+	 * ensure the TX completion event factor is not zero; the check
+	 * is done on the stored value, after the assignment above
+	 */
+	if (sq->cev_factor == 0)
+		sq->cev_factor = 1;
+}
+
+/*
+ * Allocate and open one rate limit channel, which consists of a send
+ * queue and its TX completion queue.
+ *
+ * The completion queue is opened on the event queue given by "eq_ix"
+ * with mlx5e_tx_cq_comp() as its completion handler. On success the
+ * new send queue pointer is published through "ppsq", the completion
+ * handler is invoked once and zero is returned.
+ *
+ * On failure everything allocated here is released again, the
+ * "tx_allocate_resource_failure" statistic is incremented and the
+ * error of the failing step is returned; "*ppsq" is left untouched.
+ */
+static int
+mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
+    struct mlx5e_rl_channel_param *cparam,
+    struct mlx5e_sq *volatile *ppsq)
+{
+	struct mlx5e_priv *priv = rlw->priv;
+	struct mlx5e_sq *sq;
+	int err;
+
+	/* M_WAITOK: the result is used without a NULL check */
+	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);
+
+	/* init mutexes */
+	mlx5e_rl_chan_mtx_init(priv, sq);
+
+	/* open TX completion queue */
+	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
+	    &mlx5e_tx_cq_comp, eq_ix);
+	if (err)
+		goto err_free;
+
+	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
+	if (err)
+		goto err_close_tx_cq;
+
+	/* store TX channel pointer */
+	*ppsq = sq;
+
+	/* poll TX queue initially */
+	sq->cq.mcq.comp(&sq->cq.mcq);
+
+	return (0);
+
+err_close_tx_cq:
+	mlx5e_close_cq(&sq->cq);
+
+err_free:
+	/* destroy mutexes */
+	mtx_destroy(&sq->lock);
+	mtx_destroy(&sq->comp_lock);
+	free(sq, M_MLX5EN);
+	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
+	return (err);
+}
+
+/*
+ * Close and free the rate limit channel referenced by "*ppsq". Nothing
+ * is done when the pointer is already NULL. Otherwise the pointer is
+ * cleared before the send queue is drained, disabled and destroyed,
+ * followed by its completion queue, its mutexes and finally the send
+ * queue structure itself.
+ */
+static void
+mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
+{
+	struct mlx5e_sq *sq = *ppsq;
+
+	/* check if channel is already closed */
+	if (sq == NULL)
+		return;
+	/* ensure channel pointer is no longer used */
+	*ppsq = NULL;
+
+	/* teardown and destroy SQ */
+	mlx5e_drain_sq(sq);
+	mlx5e_disable_sq(sq);
+	mlx5e_rl_destroy_sq(sq);
+
+	/* close CQ */
+	mlx5e_close_cq(&sq->cq);
+
+	/* destroy mutexes */
+	mtx_destroy(&sq->lock);
+	mtx_destroy(&sq->comp_lock);
+
+	free(sq, M_MLX5EN);
+}
+
+/*
+ * Recompute the upper limit of the TX completion factor from the
+ * current TX queue size and clamp the configured completion factor
+ * into the range from one up to that limit.
+ */
+static void
+mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
+{
+	uint64_t fact_max;
+
+	/*
+	 * Completion events may be at most half a TX queue apart. A
+	 * single IP packet can take up to MLX5_SEND_WQE_MAX_WQEBBS
+	 * queue entries, which gives the worst case limit below:
+	 */
+	fact_max = rl->param.tx_queue_size /
+	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);
+
+	/* keep the limit non-zero and representable in 16 bits */
+	if (fact_max == 0)
+		fact_max = 1;
+	if (fact_max > 65535)
+		fact_max = 65535;
+
+	/* publish the new limit, the queue size may have changed */
+	rl->param.tx_completion_fact_max = fact_max;
+
+	/* pull the configured factor back into the valid range */
+	if (rl->param.tx_completion_fact == 0)
+		rl->param.tx_completion_fact = 1;
+	else if (rl->param.tx_completion_fact > fact_max)
+		rl->param.tx_completion_fact = fact_max;
+}
+
+static int
+mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
+{
+	struct mlx5e_priv *priv = sq->priv;
+	struct mlx5_core_dev *mdev = priv->mdev;
+
+	void *in;
+	void *sqc;
+	int inlen;
+	int err;
+
+	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
+	in = mlx5_vzalloc(inlen);
+	if (in == NULL)

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-head mailing list