git: 152fa6f5b00b - stable/13 - ice(4): Add RDMA Client Interface

From: Eric Joyner <erj@FreeBSD.org>
Date: Tue, 18 Oct 2022 04:55:06 UTC
The branch stable/13 has been updated by erj:

URL: https://cgit.FreeBSD.org/src/commit/?id=152fa6f5b00b0b024369a37599172c55c6fca654

commit 152fa6f5b00b0b024369a37599172c55c6fca654
Author:     Eric Joyner <erj@FreeBSD.org>
AuthorDate: 2021-12-02 00:50:06 +0000
Commit:     Eric Joyner <erj@FreeBSD.org>
CommitDate: 2022-10-18 04:19:36 +0000

    ice(4): Add RDMA Client Interface
    
    This allows the "irdma" driver to communicate with the ice(4)
    driver, giving it access to the underlying device's hardware
    resources and synchronizing access to shared resources.
    
    This interface already existed in the standalone out-of-tree
    1.34.2 driver; this commit adds and enables it in the in-kernel
    driver.
    
    Note:
    
    Adds a hack to the module Makefile to compile the interface .m files.
    
    These are required for the RDMA client interface, but they don't
    build as-is the way the normal .c files do. The source directory
    doesn't seem to be included by default, so add lines that explicitly
    add them as libraries so that ice_rdma.h can be found and the
    interface files compile.
    
    Signed-off-by: Eric Joyner <erj@FreeBSD.org>
    
    Sponsored by:   Intel Corporation
    Differential Revision:  https://reviews.freebsd.org/D30889
    
    (cherry picked from commit 8a13362d49bf07dfc654e25976d057adbe0ac9c1)
    (cherry picked from commit d8cce8145c39812cc31b50070c44d66ca21a5127)
---
 sys/conf/files.amd64             |   6 +
 sys/conf/files.arm64             |   6 +
 sys/conf/files.powerpc           |   6 +
 sys/dev/ice/ice_adminq_cmd.h     |  54 +++
 sys/dev/ice/ice_common.c         | 219 +++++++++-
 sys/dev/ice/ice_common.h         |  14 +
 sys/dev/ice/ice_common_sysctls.h |  24 ++
 sys/dev/ice/ice_iflib.h          |   5 +
 sys/dev/ice/ice_lib.c            |  27 ++
 sys/dev/ice/ice_lib.h            |   2 +
 sys/dev/ice/ice_rdma.c           | 859 +++++++++++++++++++++++++++++++++++++++
 sys/dev/ice/ice_rdma.h           | 311 ++++++++++++++
 sys/dev/ice/ice_rdma_internal.h  | 102 +++++
 sys/dev/ice/ice_sched.c          |  80 +++-
 sys/dev/ice/ice_sched.h          |   1 +
 sys/dev/ice/ice_switch.c         |  45 ++
 sys/dev/ice/ice_switch.h         |   4 +
 sys/dev/ice/ice_type.h           |   3 +
 sys/dev/ice/if_ice_iflib.c       |  74 +++-
 sys/dev/ice/irdma_di_if.m        |  97 +++++
 sys/dev/ice/irdma_if.m           | 106 +++++
 sys/modules/ice/Makefile         |  13 +
 22 files changed, 2050 insertions(+), 8 deletions(-)
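
For orientation before the diff: the client side of this interface is the
"irdma" driver, which connects by calling ice_rdma_register() and then talks
to ice(4) through kobject methods generated from irdma_di_if.m. Below is a
minimal sketch of a qset-allocation request; the field names are taken from
ice_rdma_qset_register_request() in the diff, but the exact prototypes live
in ice_rdma.h and irdma_di_if.m (only partially shown in this excerpt), so
treat it as illustrative rather than as actual irdma driver code.

/*
 * Illustrative only: ask the ice(4) driver to allocate one Tx RDMA qset
 * for this peer.  IRDMA_DI_QSET_REGISTER_REQUEST() is the wrapper kobj(9)
 * generates for the irdma_di_qset_register_request method, which
 * dispatches to ice_rdma_qset_register_request() in ice_rdma.c.
 */
#include "ice_rdma.h"
#include "irdma_di_if.h"

static int
example_alloc_one_qset(struct ice_rdma_peer *peer, uint16_t qs_handle,
		       uint8_t tc)
{
	struct ice_rdma_qset_update res = { 0 };
	int err;

	res.res_type = ICE_RDMA_QSET_ALLOC;	/* ICE_RDMA_QSET_FREE to tear down */
	res.cnt_req = 1;			/* capped at ICE_MAX_TXQ_PER_TXQG */
	res.qsets.vsi_id = peer->pf_vsi_num;	/* must match the peer's PF VSI */
	res.qsets.tc = tc;
	res.qsets.qs_handle = qs_handle;

	err = IRDMA_DI_QSET_REGISTER_REQUEST(peer, &res);
	/*
	 * On success the driver writes the scheduler node TEID back into
	 * res.qsets.teid (see ice_rdma_qset_register_request()).
	 */
	return (err);
}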

diff --git a/sys/conf/files.amd64 b/sys/conf/files.amd64
index 990104441326..d8f7a1837f6a 100644
--- a/sys/conf/files.amd64
+++ b/sys/conf/files.amd64
@@ -199,6 +199,12 @@ dev/ice/ice_fw_logging.c	optional	ice pci \
 	compile-with "${NORMAL_C} -I$S/dev/ice"
 dev/ice/ice_fwlog.c		optional	ice pci \
 	compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_rdma.c		optional	ice pci \
+	compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/irdma_if.m		optional	ice pci \
+	compile-with "${NORMAL_M} -I$S/dev/ice"
+dev/ice/irdma_di_if.m		optional	ice pci \
+	compile-with "${NORMAL_M} -I$S/dev/ice"
 ice_ddp.c			optional ice_ddp	\
 	compile-with	"${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01031b00 -mice_ddp -c${.TARGET}"	\
 	no-ctfconvert no-implicit-rule before-depend local	\
diff --git a/sys/conf/files.arm64 b/sys/conf/files.arm64
index 221ec6d0e409..05ee9b183df7 100644
--- a/sys/conf/files.arm64
+++ b/sys/conf/files.arm64
@@ -209,6 +209,12 @@ dev/ice/ice_fw_logging.c			optional ice pci \
 	compile-with "${NORMAL_C} -I$S/dev/ice"
 dev/ice/ice_fwlog.c				optional ice pci \
 	compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_rdma.c				optional ice pci \
+	compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/irdma_if.m				optional ice pci \
+	compile-with "${NORMAL_M} -I$S/dev/ice"
+dev/ice/irdma_di_if.m				optional ice pci \
+	compile-with "${NORMAL_M} -I$S/dev/ice"
 ice_ddp.c					optional ice_ddp	\
 	compile-with	"${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01031b00 -mice_ddp -c${.TARGET}"	\
 	no-ctfconvert no-implicit-rule before-depend local	\
diff --git a/sys/conf/files.powerpc b/sys/conf/files.powerpc
index 6491c8ae3d9a..889310c0afaa 100644
--- a/sys/conf/files.powerpc
+++ b/sys/conf/files.powerpc
@@ -74,6 +74,12 @@ dev/ice/ice_fw_logging.c	optional	ice pci powerpc64 \
 	compile-with "${NORMAL_C} -I$S/dev/ice"
 dev/ice/ice_fwlog.c		optional	ice pci powerpc64 \
 	compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/ice_rdma.c		optional	ice pci powerpc64 \
+	compile-with "${NORMAL_C} -I$S/dev/ice"
+dev/ice/irdma_if.m		optional	ice pci powerpc64 \
+	compile-with "${NORMAL_M} -I$S/dev/ice"
+dev/ice/irdma_di_if.m		optional	ice pci powerpc64 \
+	compile-with "${NORMAL_M} -I$S/dev/ice"
 ice_ddp.c			optional	ice_ddp powerpc64 \
 	compile-with	"${AWK} -f $S/tools/fw_stub.awk ice_ddp.fw:ice_ddp:0x01031b00 -mice_ddp -c${.TARGET}"	\
 	no-ctfconvert no-implicit-rule before-depend local  \
diff --git a/sys/dev/ice/ice_adminq_cmd.h b/sys/dev/ice/ice_adminq_cmd.h
index eae8a412d1cc..a07ca6780a3c 100644
--- a/sys/dev/ice/ice_adminq_cmd.h
+++ b/sys/dev/ice/ice_adminq_cmd.h
@@ -2547,6 +2547,57 @@ struct ice_aqc_move_txqs_data {
 	struct ice_aqc_move_txqs_elem txqs[STRUCT_HACK_VAR_LEN];
 };
 
+/* Add Tx RDMA Queue Set (indirect 0x0C33) */
+struct ice_aqc_add_rdma_qset {
+	u8 num_qset_grps;
+	u8 reserved[7];
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
+/* This is the descriptor of each qset entry for the Add Tx RDMA Queue Set
+ * command (0x0C33). Only used within struct ice_aqc_add_rdma_qset.
+ */
+struct ice_aqc_add_tx_rdma_qset_entry {
+	__le16 tx_qset_id;
+	u8 rsvd[2];
+	__le32 qset_teid;
+	struct ice_aqc_txsched_elem info;
+};
+
+/* The format of the command buffer for Add Tx RDMA Queue Set (0x0C33)
+ * is an array of the following structs. Please note that the length of
+ * each struct ice_aqc_add_rdma_qset_data is variable due to the variable
+ * number of queues in each group!
+ */
+struct ice_aqc_add_rdma_qset_data {
+	__le32 parent_teid;
+	__le16 num_qsets;
+	u8 rsvd[2];
+	struct ice_aqc_add_tx_rdma_qset_entry rdma_qsets[STRUCT_HACK_VAR_LEN];
+};
+
+/* Move RDMA Queue Set (indirect 0x0C34) */
+struct ice_aqc_move_rdma_qset_cmd {
+	u8 num_rdma_qset;	/* Used by commands and response */
+	u8 flags;
+	u8 reserved[6];
+	__le32 addr_high;
+	__le32 addr_low;
+};
+
+/* Buffer */
+struct ice_aqc_move_rdma_qset_buffer_desc {
+	__le16 tx_qset_id;
+	__le16 qset_teid;
+};
+
+struct ice_aqc_move_rdma_qset_buffer {
+	__le32 src_parent_teid;
+	__le32 dest_parent_teid;
+	struct ice_aqc_move_rdma_qset_buffer_desc descs[STRUCT_HACK_VAR_LEN];
+};
+
 /* Download Package (indirect 0x0C40) */
 /* Also used for Update Package (indirect 0x0C42 and 0x0C41) */
 struct ice_aqc_download_pkg {
@@ -2897,6 +2948,7 @@ struct ice_aq_desc {
 		struct ice_aqc_add_txqs add_txqs;
 		struct ice_aqc_dis_txqs dis_txqs;
 		struct ice_aqc_move_txqs move_txqs;
+		struct ice_aqc_add_rdma_qset add_rdma_qset;
 		struct ice_aqc_txqs_cleanup txqs_cleanup;
 		struct ice_aqc_add_get_update_free_vsi vsi_cmd;
 		struct ice_aqc_add_update_free_vsi_resp add_update_free_vsi_res;
@@ -3156,6 +3208,8 @@ enum ice_adminq_opc {
 	ice_aqc_opc_dis_txqs				= 0x0C31,
 	ice_aqc_opc_txqs_cleanup			= 0x0C31,
 	ice_aqc_opc_move_recfg_txqs			= 0x0C32,
+	ice_aqc_opc_add_rdma_qset			= 0x0C33,
+	ice_aqc_opc_move_rdma_qset			= 0x0C34,
 
 	/* package commands */
 	ice_aqc_opc_download_pkg			= 0x0C40,
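
The comment above describes the 0x0C33 command buffer as an array of
variable-length groups: each struct ice_aqc_add_rdma_qset_data is followed
directly by its own rdma_qsets[] entries, and the next group starts right
after them. The following condensed sketch shows the walk that
ice_aq_add_rdma_qsets() performs over such a buffer (see the ice_common.c
hunk below); names other than the structures and macros appearing in this
patch are illustrative.

/*
 * Illustrative only: total length of a buffer packing num_qset_grps
 * variable-length qset groups back to back, the same walk that
 * ice_aq_add_rdma_qsets() uses to validate the caller's buf_size.
 */
#include "ice_common.h"	/* pulls in the adminq structures and helpers */

static u16
example_rdma_qset_buf_len(struct ice_aqc_add_rdma_qset_data *qset_list,
			  u8 num_qset_grps)
{
	struct ice_aqc_add_rdma_qset_data *grp = qset_list;
	u16 i, total = 0;

	for (i = 0; i < num_qset_grps; i++) {
		u16 nq = LE16_TO_CPU(grp->num_qsets);

		/* fixed header plus nq flexible rdma_qsets[] entries */
		total += ice_struct_size(grp, rdma_qsets, nq);
		/* the next group begins right after this group's last entry */
		grp = (struct ice_aqc_add_rdma_qset_data *)(grp->rdma_qsets + nq);
	}
	return (total);
}
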
diff --git a/sys/dev/ice/ice_common.c b/sys/dev/ice/ice_common.c
index 80aa3557bf75..3ae266b72d1f 100644
--- a/sys/dev/ice/ice_common.c
+++ b/sys/dev/ice/ice_common.c
@@ -1198,7 +1198,8 @@ enum ice_status ice_check_reset(struct ice_hw *hw)
 				 GLNVM_ULD_POR_DONE_1_M |\
 				 GLNVM_ULD_PCIER_DONE_2_M)
 
-	uld_mask = ICE_RESET_DONE_MASK;
+	uld_mask = ICE_RESET_DONE_MASK | (hw->func_caps.common_cap.iwarp ?
+					  GLNVM_ULD_PE_DONE_M : 0);
 
 	/* Device is Active; check Global Reset processes are done */
 	for (cnt = 0; cnt < ICE_PF_RESET_WAIT_COUNT; cnt++) {
@@ -2364,6 +2365,10 @@ ice_parse_common_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps,
 		ice_debug(hw, ICE_DBG_INIT, "%s: mgmt_cem = %d\n", prefix,
 			  caps->mgmt_cem);
 		break;
+	case ICE_AQC_CAPS_IWARP:
+		caps->iwarp = (number == 1);
+		ice_debug(hw, ICE_DBG_INIT, "%s: iwarp = %d\n", prefix, caps->iwarp);
+		break;
 	case ICE_AQC_CAPS_LED:
 		if (phys_id < ICE_MAX_SUPPORTED_GPIO_LED) {
 			caps->led[phys_id] = true;
@@ -2481,6 +2486,16 @@ ice_recalc_port_limited_caps(struct ice_hw *hw, struct ice_hw_common_caps *caps)
 		caps->maxtc = 4;
 		ice_debug(hw, ICE_DBG_INIT, "reducing maxtc to %d (based on #ports)\n",
 			  caps->maxtc);
+		if (caps->iwarp) {
+			ice_debug(hw, ICE_DBG_INIT, "forcing RDMA off\n");
+			caps->iwarp = 0;
+		}
+
+		/* print message only when processing device capabilities
+		 * during initialization.
+		 */
+		if (caps == &hw->dev_caps.common_cap)
+			ice_info(hw, "RDMA functionality is not available with the current device configuration.\n");
 	}
 }
 
@@ -4338,6 +4353,56 @@ ice_aq_move_recfg_lan_txq(struct ice_hw *hw, u8 num_qs, bool is_move,
 	return status;
 }
 
+/**
+ * ice_aq_add_rdma_qsets
+ * @hw: pointer to the hardware structure
+ * @num_qset_grps: Number of RDMA Qset groups
+ * @qset_list: list of qset groups to be added
+ * @buf_size: size of buffer for indirect command
+ * @cd: pointer to command details structure or NULL
+ *
+ * Add Tx RDMA Qsets (0x0C33)
+ */
+enum ice_status
+ice_aq_add_rdma_qsets(struct ice_hw *hw, u8 num_qset_grps,
+		      struct ice_aqc_add_rdma_qset_data *qset_list,
+		      u16 buf_size, struct ice_sq_cd *cd)
+{
+	struct ice_aqc_add_rdma_qset_data *list;
+	struct ice_aqc_add_rdma_qset *cmd;
+	struct ice_aq_desc desc;
+	u16 i, sum_size = 0;
+
+	ice_debug(hw, ICE_DBG_TRACE, "%s\n", __func__);
+
+	cmd = &desc.params.add_rdma_qset;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_add_rdma_qset);
+
+	if (!qset_list)
+		return ICE_ERR_PARAM;
+
+	if (num_qset_grps > ICE_LAN_TXQ_MAX_QGRPS)
+		return ICE_ERR_PARAM;
+
+	for (i = 0, list = qset_list; i < num_qset_grps; i++) {
+		u16 num_qsets = LE16_TO_CPU(list->num_qsets);
+
+		sum_size += ice_struct_size(list, rdma_qsets, num_qsets);
+		list = (struct ice_aqc_add_rdma_qset_data *)(list->rdma_qsets +
+							     num_qsets);
+	}
+
+	if (buf_size != sum_size)
+		return ICE_ERR_PARAM;
+
+	desc.flags |= CPU_TO_LE16(ICE_AQ_FLAG_RD);
+
+	cmd->num_qset_grps = num_qset_grps;
+
+	return ice_aq_send_cmd(hw, &desc, qset_list, buf_size, cd);
+}
+
 /* End of FW Admin Queue command wrappers */
 
 /**
@@ -5100,6 +5165,158 @@ ice_cfg_vsi_lan(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap,
 			      ICE_SCHED_NODE_OWNER_LAN);
 }
 
+/**
+ * ice_cfg_vsi_rdma - configure the VSI RDMA queues
+ * @pi: port information structure
+ * @vsi_handle: software VSI handle
+ * @tc_bitmap: TC bitmap
+ * @max_rdmaqs: max RDMA queues array per TC
+ *
+ * This function adds/updates the VSI RDMA queues per TC.
+ */
+enum ice_status
+ice_cfg_vsi_rdma(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap,
+		 u16 *max_rdmaqs)
+{
+	return ice_cfg_vsi_qs(pi, vsi_handle, tc_bitmap, max_rdmaqs,
+			      ICE_SCHED_NODE_OWNER_RDMA);
+}
+
+/**
+ * ice_ena_vsi_rdma_qset
+ * @pi: port information structure
+ * @vsi_handle: software VSI handle
+ * @tc: TC number
+ * @rdma_qset: pointer to RDMA qset
+ * @num_qsets: number of RDMA qsets
+ * @qset_teid: pointer to qset node teids
+ *
+ * This function adds RDMA qset
+ */
+enum ice_status
+ice_ena_vsi_rdma_qset(struct ice_port_info *pi, u16 vsi_handle, u8 tc,
+		      u16 *rdma_qset, u16 num_qsets, u32 *qset_teid)
+{
+	struct ice_aqc_txsched_elem_data node = { 0 };
+	struct ice_aqc_add_rdma_qset_data *buf;
+	struct ice_sched_node *parent;
+	enum ice_status status;
+	struct ice_hw *hw;
+	u16 i, buf_size;
+
+	if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY)
+		return ICE_ERR_CFG;
+	hw = pi->hw;
+
+	if (!ice_is_vsi_valid(hw, vsi_handle))
+		return ICE_ERR_PARAM;
+
+	buf_size = ice_struct_size(buf, rdma_qsets, num_qsets);
+	buf = (struct ice_aqc_add_rdma_qset_data *)ice_malloc(hw, buf_size);
+	if (!buf)
+		return ICE_ERR_NO_MEMORY;
+	ice_acquire_lock(&pi->sched_lock);
+
+	parent = ice_sched_get_free_qparent(pi, vsi_handle, tc,
+					    ICE_SCHED_NODE_OWNER_RDMA);
+	if (!parent) {
+		status = ICE_ERR_PARAM;
+		goto rdma_error_exit;
+	}
+	buf->parent_teid = parent->info.node_teid;
+	node.parent_teid = parent->info.node_teid;
+
+	buf->num_qsets = CPU_TO_LE16(num_qsets);
+	for (i = 0; i < num_qsets; i++) {
+		buf->rdma_qsets[i].tx_qset_id = CPU_TO_LE16(rdma_qset[i]);
+		buf->rdma_qsets[i].info.valid_sections =
+			ICE_AQC_ELEM_VALID_GENERIC | ICE_AQC_ELEM_VALID_CIR |
+			ICE_AQC_ELEM_VALID_EIR;
+		buf->rdma_qsets[i].info.generic = 0;
+		buf->rdma_qsets[i].info.cir_bw.bw_profile_idx =
+			CPU_TO_LE16(ICE_SCHED_DFLT_RL_PROF_ID);
+		buf->rdma_qsets[i].info.cir_bw.bw_alloc =
+			CPU_TO_LE16(ICE_SCHED_DFLT_BW_WT);
+		buf->rdma_qsets[i].info.eir_bw.bw_profile_idx =
+			CPU_TO_LE16(ICE_SCHED_DFLT_RL_PROF_ID);
+		buf->rdma_qsets[i].info.eir_bw.bw_alloc =
+			CPU_TO_LE16(ICE_SCHED_DFLT_BW_WT);
+	}
+	status = ice_aq_add_rdma_qsets(hw, 1, buf, buf_size, NULL);
+	if (status != ICE_SUCCESS) {
+		ice_debug(hw, ICE_DBG_RDMA, "add RDMA qset failed\n");
+		goto rdma_error_exit;
+	}
+	node.data.elem_type = ICE_AQC_ELEM_TYPE_LEAF;
+	for (i = 0; i < num_qsets; i++) {
+		node.node_teid = buf->rdma_qsets[i].qset_teid;
+		status = ice_sched_add_node(pi, hw->num_tx_sched_layers - 1,
+					    &node);
+		if (status)
+			break;
+		qset_teid[i] = LE32_TO_CPU(node.node_teid);
+	}
+rdma_error_exit:
+	ice_release_lock(&pi->sched_lock);
+	ice_free(hw, buf);
+	return status;
+}
+
+/**
+ * ice_dis_vsi_rdma_qset - free RDMA resources
+ * @pi: port_info struct
+ * @count: number of RDMA qsets to free
+ * @qset_teid: TEID of qset node
+ * @q_id: list of queue IDs being disabled
+ */
+enum ice_status
+ice_dis_vsi_rdma_qset(struct ice_port_info *pi, u16 count, u32 *qset_teid,
+		      u16 *q_id)
+{
+	struct ice_aqc_dis_txq_item *qg_list;
+	enum ice_status status = ICE_SUCCESS;
+	struct ice_hw *hw;
+	u16 qg_size;
+	int i;
+
+	if (!pi || pi->port_state != ICE_SCHED_PORT_STATE_READY)
+		return ICE_ERR_CFG;
+
+	hw = pi->hw;
+
+	qg_size = ice_struct_size(qg_list, q_id, 1);
+	qg_list = (struct ice_aqc_dis_txq_item *)ice_malloc(hw, qg_size);
+	if (!qg_list)
+		return ICE_ERR_NO_MEMORY;
+
+	ice_acquire_lock(&pi->sched_lock);
+
+	for (i = 0; i < count; i++) {
+		struct ice_sched_node *node;
+
+		node = ice_sched_find_node_by_teid(pi->root, qset_teid[i]);
+		if (!node)
+			continue;
+
+		qg_list->parent_teid = node->info.parent_teid;
+		qg_list->num_qs = 1;
+		qg_list->q_id[0] =
+			CPU_TO_LE16(q_id[i] |
+				    ICE_AQC_Q_DIS_BUF_ELEM_TYPE_RDMA_QSET);
+
+		status = ice_aq_dis_lan_txq(hw, 1, qg_list, qg_size,
+					    ICE_NO_RESET, 0, NULL);
+		if (status)
+			break;
+
+		ice_free_sched_node(pi, node);
+	}
+
+	ice_release_lock(&pi->sched_lock);
+	ice_free(hw, qg_list);
+	return status;
+}
+
 /**
  * ice_is_main_vsi - checks whether the VSI is main VSI
  * @hw: pointer to the HW struct
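
The new scheduler helpers in this hunk form a pair: ice_cfg_vsi_rdma() sets
up the per-TC RDMA queue counts, ice_ena_vsi_rdma_qset() programs qsets under
the RDMA parent nodes and reports each qset's TEID, and
ice_dis_vsi_rdma_qset() later consumes those TEIDs to tear the qsets down.
A minimal sketch of that lifecycle, using the prototypes added to
ice_common.h just below (the surrounding function and variable names are
illustrative):

#include "ice_common.h"

/* Illustrative only: enable one RDMA qset for a VSI/TC, then disable it. */
static enum ice_status
example_rdma_qset_cycle(struct ice_port_info *pi, u16 vsi_handle, u8 tc,
			u16 qset_id, u16 *max_rdmaqs)
{
	enum ice_status status;
	u32 qset_teid = 0;

	/* Configure the VSI's RDMA queue counts per TC first */
	status = ice_cfg_vsi_rdma(pi, vsi_handle, BIT(tc), max_rdmaqs);
	if (status)
		return (status);

	/* Add one qset under the TC's RDMA parent node; the TEID comes back */
	status = ice_ena_vsi_rdma_qset(pi, vsi_handle, tc, &qset_id, 1,
				       &qset_teid);
	if (status)
		return (status);

	/* ... RDMA traffic runs ... */

	/* Tear it down again using the TEID captured above */
	return (ice_dis_vsi_rdma_qset(pi, 1, &qset_teid, &qset_id));
}
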
diff --git a/sys/dev/ice/ice_common.h b/sys/dev/ice/ice_common.h
index 48fd52cb2484..b113082b2394 100644
--- a/sys/dev/ice/ice_common.h
+++ b/sys/dev/ice/ice_common.h
@@ -147,6 +147,11 @@ ice_aq_move_recfg_lan_txq(struct ice_hw *hw, u8 num_qs, bool is_move,
 			  struct ice_aqc_move_txqs_data *buf, u16 buf_size,
 			  u8 *txqs_moved, struct ice_sq_cd *cd);
 
+enum ice_status
+ice_aq_add_rdma_qsets(struct ice_hw *hw, u8 num_qset_grps,
+		      struct ice_aqc_add_rdma_qset_data *qset_list,
+		      u16 buf_size, struct ice_sq_cd *cd);
+
 bool ice_check_sq_alive(struct ice_hw *hw, struct ice_ctl_q_info *cq);
 enum ice_status ice_aq_q_shutdown(struct ice_hw *hw, bool unloading);
 void ice_fill_dflt_direct_cmd_desc(struct ice_aq_desc *desc, u16 opcode);
@@ -257,6 +262,15 @@ __ice_write_sr_word(struct ice_hw *hw, u32 offset, const u16 *data);
 enum ice_status
 __ice_write_sr_buf(struct ice_hw *hw, u32 offset, u16 words, const u16 *data);
 enum ice_status
+ice_cfg_vsi_rdma(struct ice_port_info *pi, u16 vsi_handle, u16 tc_bitmap,
+		 u16 *max_rdmaqs);
+enum ice_status
+ice_ena_vsi_rdma_qset(struct ice_port_info *pi, u16 vsi_handle, u8 tc,
+		      u16 *rdma_qset, u16 num_qsets, u32 *qset_teid);
+enum ice_status
+ice_dis_vsi_rdma_qset(struct ice_port_info *pi, u16 count, u32 *qset_teid,
+		      u16 *q_id);
+enum ice_status
 ice_dis_vsi_txq(struct ice_port_info *pi, u16 vsi_handle, u8 tc, u8 num_queues,
 		u16 *q_handle, u16 *q_ids, u32 *q_teids,
 		enum ice_disq_rst_src rst_src, u16 vmvf_num,
diff --git a/sys/dev/ice/ice_common_sysctls.h b/sys/dev/ice/ice_common_sysctls.h
index f1a23ce828ff..0d149a5bc25c 100644
--- a/sys/dev/ice/ice_common_sysctls.h
+++ b/sys/dev/ice/ice_common_sysctls.h
@@ -45,6 +45,15 @@
 
 #include <sys/sysctl.h>
 
+/**
+ * @var ice_enable_irdma
+ * @brief boolean indicating if the iRDMA client interface is enabled
+ *
+ * Global sysctl variable indicating whether the RDMA client interface feature
+ * is enabled.
+ */
+bool ice_enable_irdma = true;
+
 /**
  * @var ice_enable_tx_fc_filter
  * @brief boolean indicating if the Tx Flow Control filter should be enabled
@@ -85,6 +94,15 @@ bool ice_enable_tx_lldp_filter = true;
  */
 bool ice_enable_health_events = true;
 
+/**
+ * @var ice_rdma_max_msix
+ * @brief maximum number of MSI-X vectors to reserve for RDMA interface
+ *
+ * Global sysctl variable indicating the maximum number of MSI-X vectors to
+ * reserve for a single RDMA interface.
+ */
+static uint16_t ice_rdma_max_msix = ICE_RDMA_MAX_MSIX;
+
 /* sysctls marked as tunable, (i.e. with the CTLFLAG_TUN set) will
  * automatically load tunable values, without the need to manually create the
  * TUNABLE definition.
@@ -105,6 +123,12 @@ SYSCTL_BOOL(_hw_ice, OID_AUTO, enable_health_events, CTLFLAG_RDTUN,
 	    &ice_enable_health_events, 0,
 	    "Enable FW health event reporting globally");
 
+SYSCTL_BOOL(_hw_ice, OID_AUTO, irdma, CTLFLAG_RDTUN, &ice_enable_irdma, 0,
+	    "Enable iRDMA client interface");
+
+SYSCTL_U16(_hw_ice, OID_AUTO, rdma_max_msix, CTLFLAG_RDTUN, &ice_rdma_max_msix,
+	   0, "Maximum number of MSI-X vectors to reserve per RDMA interface");
+
 SYSCTL_BOOL(_hw_ice_debug, OID_AUTO, enable_tx_fc_filter, CTLFLAG_RDTUN,
 	    &ice_enable_tx_fc_filter, 0,
 	    "Drop Ethertype 0x8808 control frames originating from non-HW sources");
diff --git a/sys/dev/ice/ice_iflib.h b/sys/dev/ice/ice_iflib.h
index 947881f1d076..07654afe4539 100644
--- a/sys/dev/ice/ice_iflib.h
+++ b/sys/dev/ice/ice_iflib.h
@@ -236,6 +236,11 @@ struct ice_softc {
 	struct mtx admin_mtx; /* mutex to protect the admin timer */
 	struct callout admin_timer; /* timer to trigger admin task */
 
+	/* iRDMA peer interface */
+	struct ice_rdma_entry rdma_entry;
+	int irdma_vectors;
+	u16 *rdma_imap;
+
 	struct ice_vsi **all_vsi;	/* Array of VSI pointers */
 	u16 num_available_vsi;		/* Size of VSI array */
 
diff --git a/sys/dev/ice/ice_lib.c b/sys/dev/ice/ice_lib.c
index 4212a0c76c88..f562b3b55b63 100644
--- a/sys/dev/ice/ice_lib.c
+++ b/sys/dev/ice/ice_lib.c
@@ -3984,6 +3984,11 @@ ice_config_pfc(struct ice_softc *sc, u8 new_mode)
 	local_dcbx_cfg->pfc.willing = 0;
 	local_dcbx_cfg->pfc.mbc = 0;
 
+	/* Warn if PFC is being disabled with RoCE v2 in use */
+	if (new_mode == 0 && sc->rdma_entry.attached)
+		device_printf(dev,
+		    "WARNING: Recommended that Priority Flow Control is enabled when RoCEv2 is in use\n");
+
 	status = ice_set_dcb_cfg(pi);
 	if (status) {
 		device_printf(dev,
@@ -7800,6 +7805,8 @@ ice_do_dcb_reconfig(struct ice_softc *sc)
 	pi = sc->hw.port_info;
 	local_dcbx_cfg = &pi->qos_cfg.local_dcbx_cfg;
 
+	ice_rdma_notify_dcb_qos_change(sc);
+
 	/* Set state when there's more than one TC */
 	tc_map = ice_dcb_get_tc_map(local_dcbx_cfg);
 	if (ice_dcb_num_tc(tc_map) > 1) {
@@ -7826,6 +7833,9 @@ ice_do_dcb_reconfig(struct ice_softc *sc)
 	/* Change PF VSI configuration */
 	ice_dcb_recfg(sc);
 
+	/* Send new configuration to RDMA client driver */
+	ice_rdma_dcb_qos_update(sc, pi);
+
 	ice_request_stack_reinit(sc);
 }
 
@@ -8663,6 +8673,7 @@ ice_init_saved_phy_cfg(struct ice_softc *sc)
 static int
 ice_module_init(void)
 {
+	ice_rdma_init();
 	return (0);
 }
 
@@ -8679,6 +8690,7 @@ ice_module_init(void)
 static int
 ice_module_exit(void)
 {
+	ice_rdma_exit();
 	return (0);
 }
 
@@ -9029,8 +9041,17 @@ ice_alloc_intr_tracking(struct ice_softc *sc)
 		err = ENOMEM;
 		goto free_imgr;
 	}
+	if (!(sc->rdma_imap =
+	      (u16 *)malloc(sizeof(u16) * hw->func_caps.common_cap.num_msix_vectors,
+	      M_ICE, M_NOWAIT))) {
+		device_printf(dev, "Unable to allocate RDMA imap memory\n");
+		err = ENOMEM;
+		free(sc->pf_imap, M_ICE);
+		goto free_imgr;
+	}
 	for (u32 i = 0; i < hw->func_caps.common_cap.num_msix_vectors; i++) {
 		sc->pf_imap[i] = ICE_INVALID_RES_IDX;
+		sc->rdma_imap[i] = ICE_INVALID_RES_IDX;
 	}
 
 	return (0);
@@ -9058,6 +9079,12 @@ ice_free_intr_tracking(struct ice_softc *sc)
 		free(sc->pf_imap, M_ICE);
 		sc->pf_imap = NULL;
 	}
+	if (sc->rdma_imap) {
+		ice_resmgr_release_map(&sc->imgr, sc->rdma_imap,
+				       sc->lan_vectors);
+		free(sc->rdma_imap, M_ICE);
+		sc->rdma_imap = NULL;
+	}
 
 	ice_resmgr_destroy(&sc->imgr);
 }
diff --git a/sys/dev/ice/ice_lib.h b/sys/dev/ice/ice_lib.h
index d0514a58a745..948f9858d43d 100644
--- a/sys/dev/ice/ice_lib.h
+++ b/sys/dev/ice/ice_lib.h
@@ -65,6 +65,8 @@
 #include "ice_sched.h"
 #include "ice_resmgr.h"
 
+#include "ice_rdma_internal.h"
+
 #include "ice_rss.h"
 
 /* Hide debug sysctls unless INVARIANTS is enabled */
diff --git a/sys/dev/ice/ice_rdma.c b/sys/dev/ice/ice_rdma.c
new file mode 100644
index 000000000000..5d89deed0f90
--- /dev/null
+++ b/sys/dev/ice/ice_rdma.c
@@ -0,0 +1,859 @@
+/* SPDX-License-Identifier: BSD-3-Clause */
+/*  Copyright (c) 2022, Intel Corporation
+ *  All rights reserved.
+ *
+ *  Redistribution and use in source and binary forms, with or without
+ *  modification, are permitted provided that the following conditions are met:
+ *
+ *   1. Redistributions of source code must retain the above copyright notice,
+ *      this list of conditions and the following disclaimer.
+ *
+ *   2. Redistributions in binary form must reproduce the above copyright
+ *      notice, this list of conditions and the following disclaimer in the
+ *      documentation and/or other materials provided with the distribution.
+ *
+ *   3. Neither the name of the Intel Corporation nor the names of its
+ *      contributors may be used to endorse or promote products derived from
+ *      this software without specific prior written permission.
+ *
+ *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ *  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *  POSSIBILITY OF SUCH DAMAGE.
+ */
+/*$FreeBSD$*/
+
+/**
+ * @file ice_rdma.c
+ * @brief RDMA client driver interface
+ *
+ * Functions to interface with the RDMA client driver, for enabling RDMA
+ * functionality for the ice driver.
+ *
+ * The RDMA client interface is based on a simple kobject interface which is
+ * defined by the irdma_if.m and irdma_di_if.m interfaces.
+ *
+ * The ice device driver provides the irdma_di_if.m interface methods, while
+ * the client RDMA driver provides the irdma_if.m interface methods as an
+ * extension on top of the irdma_di_if kobject.
+ *
+ * The initial connection between drivers is done via the RDMA client driver
+ * calling ice_rdma_register.
+ */
+
+#include "ice_iflib.h"
+#include "ice_rdma_internal.h"
+
+#include "irdma_if.h"
+#include "irdma_di_if.h"
+
+/**
+ * @var ice_rdma
+ * @brief global RDMA driver state
+ *
+ * Contains global state the driver uses to connect to a client RDMA interface
+ * driver.
+ */
+static struct ice_rdma_state ice_rdma;
+
+/*
+ * Helper function prototypes
+ */
+static int ice_rdma_pf_attach_locked(struct ice_softc *sc);
+static void ice_rdma_pf_detach_locked(struct ice_softc *sc);
+static int ice_rdma_check_version(struct ice_rdma_info *info);
+static void ice_rdma_cp_qos_info(struct ice_hw *hw,
+				 struct ice_dcbx_cfg *dcbx_cfg,
+				 struct ice_qos_params *qos_info);
+
+/*
+ * RDMA Device Interface prototypes
+ */
+static int ice_rdma_pf_reset(struct ice_rdma_peer *peer);
+static int ice_rdma_pf_msix_init(struct ice_rdma_peer *peer,
+				 struct ice_rdma_msix_mapping *msix_info);
+static int ice_rdma_qset_register_request(struct ice_rdma_peer *peer,
+			     struct ice_rdma_qset_update *res);
+static int ice_rdma_update_vsi_filter(struct ice_rdma_peer *peer_dev,
+				      bool enable);
+static void ice_rdma_request_handler(struct ice_rdma_peer *peer,
+				     struct ice_rdma_request *req);
+
+
+/**
+ * @var ice_rdma_di_methods
+ * @brief RDMA driver interface methods
+ *
+ * Kobject methods implementing the driver-side interface for the RDMA peer
+ * clients. This method table contains the operations which the client can
+ * request from the driver.
+ *
+ * The client driver will then extend this kobject class with methods that the
+ * driver can request from the client.
+ */
+static kobj_method_t ice_rdma_di_methods[] = {
+	KOBJMETHOD(irdma_di_reset, ice_rdma_pf_reset),
+	KOBJMETHOD(irdma_di_msix_init, ice_rdma_pf_msix_init),
+	KOBJMETHOD(irdma_di_qset_register_request, ice_rdma_qset_register_request),
+	KOBJMETHOD(irdma_di_vsi_filter_update, ice_rdma_update_vsi_filter),
+	KOBJMETHOD(irdma_di_req_handler, ice_rdma_request_handler),
+	KOBJMETHOD_END
+};
+
+/* Define ice_rdma_di class which will be extended by the iRDMA driver */
+DEFINE_CLASS_0(ice_rdma_di, ice_rdma_di_class, ice_rdma_di_methods, sizeof(struct ice_rdma_peer));
+
+/**
+ * ice_rdma_pf_reset - RDMA client interface requested a reset
+ * @peer: the RDMA peer client structure
+ *
+ * Implements IRDMA_DI_RESET, called by the RDMA client driver to request
+ * a reset of an ice driver device.
+ */
+static int
+ice_rdma_pf_reset(struct ice_rdma_peer *peer)
+{
+	struct ice_softc *sc = ice_rdma_peer_to_sc(peer);
+
+	/*
+	 * Request that the driver re-initialize by bringing the interface
+	 * down and up.
+	 */
+	ice_request_stack_reinit(sc);
+
+	return (0);
+}
+
+/**
+ * ice_rdma_pf_msix_init - RDMA client interface request MSI-X initialization
+ * @peer: the RDMA peer client structure
+ * @msix_info: requested MSI-X mapping
+ *
+ * Implements IRDMA_DI_MSIX_INIT, called by the RDMA client driver to
+ * initialize the MSI-X resources required for RDMA functionality.
+ */
+static int
+ice_rdma_pf_msix_init(struct ice_rdma_peer *peer,
+		      struct ice_rdma_msix_mapping __unused *msix_info)
+{
+	struct ice_softc *sc = ice_rdma_peer_to_sc(peer);
+
+	MPASS(msix_info != NULL);
+
+	device_printf(sc->dev, "%s: iRDMA MSI-X initialization request is not yet implemented\n", __func__);
+
+	/* TODO: implement MSI-X initialization for RDMA */
+	return (ENOSYS);
+}
+
+/**
+ * ice_rdma_qset_register_request - RDMA client interface request qset
+ *                                  registration or unregistration
+ * @peer: the RDMA peer client structure
+ * @res: resources to be registered or unregistered
+ */
+static int
+ice_rdma_qset_register_request(struct ice_rdma_peer *peer, struct ice_rdma_qset_update *res)
+{
+	struct ice_softc *sc = ice_rdma_peer_to_sc(peer);
+	struct ice_vsi *vsi = NULL;
+	struct ice_dcbx_cfg *dcbx_cfg;
+	struct ice_hw *hw = &sc->hw;
+	enum ice_status status;
+	int count, i, ret = 0;
+	uint32_t *qset_teid;
+	uint16_t *qs_handle;
+	uint16_t max_rdmaqs[ICE_MAX_TRAFFIC_CLASS];
+	uint16_t vsi_id;
+	uint8_t ena_tc = 0;
+
+	if (!res)
+		return -EINVAL;
+
+	if (res->cnt_req > ICE_MAX_TXQ_PER_TXQG)
+		return -EINVAL;
+
+	switch(res->res_type) {
+	case ICE_RDMA_QSET_ALLOC:
+		count = res->cnt_req;
+		vsi_id = peer->pf_vsi_num;
+		break;
+	case ICE_RDMA_QSET_FREE:
+		count = res->res_allocated;
+		vsi_id = res->qsets.vsi_id;
+		break;
+	default:
+		return -EINVAL;
+	}
+	qset_teid = (uint32_t *)ice_calloc(hw, count, sizeof(*qset_teid));
+	if (!qset_teid)
+		return -ENOMEM;
+
+	qs_handle = (uint16_t *)ice_calloc(hw, count, sizeof(*qs_handle));
+	if (!qs_handle) {
+		ice_free(hw, qset_teid);
+		return -ENOMEM;
+	}
+
+	ice_for_each_traffic_class(i)
+		max_rdmaqs[i] = 0;
+	for (i = 0; i < sc->num_available_vsi; i++) {
+		if (sc->all_vsi[i] &&
+		    ice_get_hw_vsi_num(hw, sc->all_vsi[i]->idx) == vsi_id) {
+			vsi = sc->all_vsi[i];
+			break;
+		}
+	}
+
+	if (!vsi) {
+		ice_debug(hw, ICE_DBG_RDMA, "RDMA QSet invalid VSI\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	if (sc != vsi->sc) {
+		ice_debug(hw, ICE_DBG_RDMA, "VSI is tied to unexpected device\n");
+		ret = -EXDEV;
+		goto out;
+	}
+
+	for (i = 0; i < count; i++) {
+		struct ice_rdma_qset_params *qset;
+
+		qset = &res->qsets;
+		if (qset->vsi_id != peer->pf_vsi_num) {
+			ice_debug(hw, ICE_DBG_RDMA, "RDMA QSet invalid VSI requested %d %d\n",
+				  qset->vsi_id, peer->pf_vsi_num);
+			ret = -EINVAL;
+			goto out;
+		}
+		max_rdmaqs[qset->tc]++;
+		qs_handle[i] = qset->qs_handle;
+		qset_teid[i] = qset->teid;
+	}
+
+	switch(res->res_type) {
+	case ICE_RDMA_QSET_ALLOC:
+		dcbx_cfg = &hw->port_info->qos_cfg.local_dcbx_cfg;
+		for (i = 0; i < ICE_MAX_TRAFFIC_CLASS; i++) {
+			ena_tc |= BIT(dcbx_cfg->etscfg.prio_table[i]);
+		}
+
+		ice_debug(hw, ICE_DBG_RDMA, "%s:%d ena_tc=%x\n", __func__, __LINE__, ena_tc);
+		status = ice_cfg_vsi_rdma(hw->port_info, vsi->idx, ena_tc,
+					  max_rdmaqs);
+		if (status) {
+			ice_debug(hw, ICE_DBG_RDMA, "Failed VSI RDMA qset config\n");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		for (i = 0; i < count; i++) {
+			struct ice_rdma_qset_params *qset;
+
+			qset = &res->qsets;
+			status = ice_ena_vsi_rdma_qset(hw->port_info, vsi->idx,
+						       qset->tc, &qs_handle[i], 1,
+						       &qset_teid[i]);
+			if (status) {
+				ice_debug(hw, ICE_DBG_RDMA, "Failed VSI RDMA qset enable\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			qset->teid = qset_teid[i];
+		}
+		break;
+	case ICE_RDMA_QSET_FREE:
+		status = ice_dis_vsi_rdma_qset(hw->port_info, count, qset_teid, qs_handle);
+		if (status)
+			ret = -EINVAL;
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+out:
+	ice_free(hw, qs_handle);
+	ice_free(hw, qset_teid);
+
+	return ret;
+}
+
+/**
+ * ice_rdma_update_vsi_filter - configure VSI information
+ *                              when opening or closing the RDMA driver
+ * @peer: the RDMA peer client structure
+ * @enable: enable or disable the RDMA filter
+ */
+static int
+ice_rdma_update_vsi_filter(struct ice_rdma_peer *peer,
+			   bool enable)
+{
+	struct ice_softc *sc = ice_rdma_peer_to_sc(peer);
+	struct ice_vsi *vsi;
+	int ret;
+
+	vsi = &sc->pf_vsi;
+	if (!vsi)
+		return -EINVAL;
+
+	ret = ice_cfg_iwarp_fltr(&sc->hw, vsi->idx, enable);
+	if (ret) {
+		device_printf(sc->dev, "Failed to %sable iWARP filtering\n",
+				enable ? "en" : "dis");
+	} else {
+		if (enable)
+			vsi->info.q_opt_flags |= ICE_AQ_VSI_Q_OPT_PE_FLTR_EN;
+		else
+			vsi->info.q_opt_flags &= ~ICE_AQ_VSI_Q_OPT_PE_FLTR_EN;
+	}
+
+	return ret;
+}
+
+/**
+ * ice_rdma_request_handler - handle requests incoming from RDMA driver
+ * @peer: the RDMA peer client structure
+ * @req: structure containing request
+ */
+static void
+ice_rdma_request_handler(struct ice_rdma_peer *peer,
+			 struct ice_rdma_request *req)
+{
+	if (!req || !peer) {
+		log(LOG_WARNING, "%s: peer or req are not valid\n", __func__);
+		return;
+	}
+
+	switch(req->type) {
+	case ICE_RDMA_EVENT_RESET:
+		break;
+	case ICE_RDMA_EVENT_QSET_REGISTER:
+		ice_rdma_qset_register_request(peer, &req->res);
*** 1657 LINES SKIPPED ***