svn commit: r300723 - in head/sys: dev/iser modules/iser

Edward Tomasz Napierala trasz at FreeBSD.org
Thu May 26 09:49:30 UTC 2016


Author: trasz
Date: Thu May 26 09:49:29 2016
New Revision: 300723
URL: https://svnweb.freebsd.org/changeset/base/300723

Log:
  Bring in the Mellanox implementation of iSER (iSCSI over RDMA) initiator,
  written by Sagi Grimberg <sagig at mellanox.com> and Max Gurtovoy
  <maxg at mellanox.com>.
  
  This code comes from https://github.com/sagigrimberg/iser-freebsd, branch
  iser-rebase-11-current-r291993.  It's not connected to the build just yet;
  it still needs some tweaks to adapt to my changes to iSCSI infrastructure.
  
  Big thanks to Mellanox for their support for FreeBSD!
  
  Obtained from:	Mellanox Technologies
  MFC after:	1 month
  Relnotes:	yes

Added:
  head/sys/dev/iser/
  head/sys/dev/iser/icl_iser.c   (contents, props changed)
  head/sys/dev/iser/icl_iser.h   (contents, props changed)
  head/sys/dev/iser/iser_initiator.c   (contents, props changed)
  head/sys/dev/iser/iser_memory.c   (contents, props changed)
  head/sys/dev/iser/iser_verbs.c   (contents, props changed)
  head/sys/modules/iser/
  head/sys/modules/iser/Makefile   (contents, props changed)

Added: head/sys/dev/iser/icl_iser.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/dev/iser/icl_iser.c	Thu May 26 09:49:29 2016	(r300723)
@@ -0,0 +1,582 @@
+/* $FreeBSD$ */
+/*-
+ * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "icl_iser.h"
+
+SYSCTL_NODE(_kern, OID_AUTO, iser, CTLFLAG_RW, 0, "iSER module");
+/* Verbosity knob for the ISER_DBG/INFO/WARN macros (kern.iser.debug). */
+int iser_debug = 0;
+SYSCTL_INT(_kern_iser, OID_AUTO, debug, CTLFLAG_RWTUN,
+    &iser_debug, 0, "Enable iser debug messages");
+
+static MALLOC_DEFINE(M_ICL_ISER, "icl_iser", "iSCSI iser backend");
+/* UMA zone backing struct icl_iser_pdu allocations; created at module load. */
+static uma_zone_t icl_pdu_zone;
+
+/* Count of live iser connections; module unload returns EBUSY while non-zero. */
+static volatile u_int	icl_iser_ncons;
+/* Global state: device list, connection list and their locks. */
+struct iser_global ig;
+
+static icl_conn_new_pdu_t	iser_conn_new_pdu;
+static icl_conn_pdu_free_t	iser_conn_pdu_free;
+static icl_conn_pdu_data_segment_length_t iser_conn_pdu_data_segment_length;
+static icl_conn_pdu_append_data_t	iser_conn_pdu_append_data;
+static icl_conn_pdu_queue_t	iser_conn_pdu_queue;
+static icl_conn_handoff_t	iser_conn_handoff;
+static icl_conn_free_t		iser_conn_free;
+static icl_conn_close_t		iser_conn_close;
+static icl_conn_release_t	iser_conn_release;
+static icl_conn_connect_t	iser_conn_connect;
+static icl_conn_connected_t	iser_conn_connected;
+static icl_conn_task_setup_t	iser_conn_task_setup;
+static icl_conn_task_done_t	iser_conn_task_done;
+static icl_conn_pdu_get_data_t	iser_conn_pdu_get_data;
+
+/* kobj method table binding the generic icl_conn KPI to this backend. */
+static kobj_method_t icl_iser_methods[] = {
+	KOBJMETHOD(icl_conn_new_pdu, iser_conn_new_pdu),
+	KOBJMETHOD(icl_conn_pdu_free, iser_conn_pdu_free),
+	KOBJMETHOD(icl_conn_pdu_data_segment_length, iser_conn_pdu_data_segment_length),
+	KOBJMETHOD(icl_conn_pdu_append_data, iser_conn_pdu_append_data),
+	KOBJMETHOD(icl_conn_pdu_queue, iser_conn_pdu_queue),
+	KOBJMETHOD(icl_conn_handoff, iser_conn_handoff),
+	KOBJMETHOD(icl_conn_free, iser_conn_free),
+	KOBJMETHOD(icl_conn_close, iser_conn_close),
+	KOBJMETHOD(icl_conn_release, iser_conn_release),
+	KOBJMETHOD(icl_conn_connect, iser_conn_connect),
+	KOBJMETHOD(icl_conn_connected, iser_conn_connected),
+	KOBJMETHOD(icl_conn_task_setup, iser_conn_task_setup),
+	KOBJMETHOD(icl_conn_task_done, iser_conn_task_done),
+	KOBJMETHOD(icl_conn_pdu_get_data, iser_conn_pdu_get_data),
+	{ 0, 0 }
+};
+
+DEFINE_CLASS(icl_iser, icl_iser_methods, sizeof(struct iser_conn));
+
+/**
+ * iser_initialize_headers() - Initialize task headers
+ * @pdu:       iser pdu
+ * @iser_conn:    iser connection
+ *
+ * Notes:
+ * This routine may race with iser teardown flow for scsi
+ * error handling TMFs. So for TMF we should acquire the
+ * state mutex to avoid dereferencing the IB device which
+ * may have already been terminated (racing teardown sequence).
+ */
+int
+iser_initialize_headers(struct icl_iser_pdu *pdu, struct iser_conn *iser_conn)
+{
+	struct iser_tx_desc *tx_desc = &pdu->desc;
+	struct iser_device *device = iser_conn->ib_conn.device;
+	u64 dma_addr;
+	int ret = 0;
+
+	/* DMA-map the iSER + iSCSI header area of the TX descriptor. */
+	dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
+				ISER_HEADERS_LEN, DMA_TO_DEVICE);
+	if (ib_dma_mapping_error(device->ib_device, dma_addr)) {
+		/* Linux-style negative errno; callers only test for non-zero. */
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* Point sg[0] of the send WR at the freshly mapped headers. */
+	tx_desc->mapped = true;
+	tx_desc->dma_addr = dma_addr;
+	tx_desc->tx_sg[0].addr   = tx_desc->dma_addr;
+	tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
+	tx_desc->tx_sg[0].lkey   = device->mr->lkey;
+
+out:
+
+	return (ret);
+}
+
+/*
+ * Append payload to a request PDU.  iSER moves normal SCSI data via
+ * RDMA, so only login and text requests copy their payload into the
+ * pre-allocated login buffer; all other opcodes are a no-op here.
+ * Always succeeds (returns 0).
+ */
+int
+iser_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request,
+			  const void *addr, size_t len, int flags)
+{
+	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+	/*
+	 * Compare the opcode for equality.  The previous bitwise AND was
+	 * a bug: ISCSI_BHS_OPCODE_SCSI_COMMAND shares bits with
+	 * ISCSI_BHS_OPCODE_LOGIN_REQUEST, so unrelated PDUs would have
+	 * had their data copied over the login buffer.
+	 */
+	if (request->ip_bhs->bhs_opcode == ISCSI_BHS_OPCODE_LOGIN_REQUEST ||
+	    request->ip_bhs->bhs_opcode == ISCSI_BHS_OPCODE_TEXT_REQUEST) {
+		ISER_DBG("copy to login buff");
+		memcpy(iser_conn->login_req_buf, addr, len);
+		request->ip_data_len = len;
+	}
+
+	return (0);
+}
+
+/* Copy 'len' bytes of received PDU data at offset 'off' into 'addr'. */
+void
+iser_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
+		       size_t off, void *addr, size_t len)
+{
+	/* If we have a receive data, copy it to upper layer buffer */
+	/* NOTE(review): ip_data_mbuf is treated as a flat byte buffer
+	 * here, not a struct mbuf chain -- confirm against the rx path. */
+	if (ip->ip_data_mbuf)
+		memcpy(addr, ip->ip_data_mbuf + off, len);
+}
+
+/*
+ * Allocate icl_pdu with empty BHS to fill up by the caller.
+ */
+struct icl_pdu *
+iser_new_pdu(struct icl_conn *ic, int flags)
+{
+	struct icl_iser_pdu *iser_pdu;
+	struct icl_pdu *ip;
+	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+	/* M_ZERO so the embedded icl_pdu and TX descriptor start clean. */
+	iser_pdu = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
+	if (iser_pdu == NULL) {
+		ISER_WARN("failed to allocate %zd bytes", sizeof(*iser_pdu));
+		return (NULL);
+	}
+
+	/* The BHS lives inside the TX descriptor; expose it via the icl_pdu. */
+	iser_pdu->iser_conn = iser_conn;
+	ip = &iser_pdu->icl_pdu;
+	ip->ip_conn = ic;
+	ip->ip_bhs = &iser_pdu->desc.iscsi_header;
+
+	return (ip);
+}
+
+/* icl_conn method: allocate a new, zeroed PDU for this connection. */
+struct icl_pdu *
+iser_conn_new_pdu(struct icl_conn *ic, int flags)
+{
+	return (iser_new_pdu(ic, flags));
+}
+
+/* Return a PDU to the UMA zone; 'ic' is unused but part of the KPI. */
+void
+iser_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
+{
+	struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);
+
+	uma_zfree(icl_pdu_zone, iser_pdu);
+}
+
+/* Decode the 24-bit big-endian DataSegmentLength field from the BHS. */
+size_t
+iser_conn_pdu_data_segment_length(struct icl_conn *ic,
+				  const struct icl_pdu *request)
+{
+	uint32_t len = 0;
+
+	len += request->ip_bhs->bhs_data_segment_len[0];
+	len <<= 8;
+	len += request->ip_bhs->bhs_data_segment_len[1];
+	len <<= 8;
+	len += request->ip_bhs->bhs_data_segment_len[2];
+
+	return (len);
+}
+
+/* icl_conn method wrapper around iser_pdu_free(). */
+void
+iser_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
+{
+	iser_pdu_free(ic, ip);
+}
+
+/*
+ * Classify an opcode as control (sent via iser_send_control()) or
+ * command (sent via iser_send_command()).  Unknown opcodes are logged
+ * and fall through as non-control.
+ */
+static bool
+is_control_opcode(uint8_t opcode)
+{
+	bool is_control = false;
+
+	switch (opcode & ISCSI_OPCODE_MASK) {
+		case ISCSI_BHS_OPCODE_NOP_OUT:
+		case ISCSI_BHS_OPCODE_LOGIN_REQUEST:
+		case ISCSI_BHS_OPCODE_LOGOUT_REQUEST:
+		case ISCSI_BHS_OPCODE_TEXT_REQUEST:
+			is_control = true;
+			break;
+		case ISCSI_BHS_OPCODE_SCSI_COMMAND:
+			is_control = false;
+			break;
+		default:
+			ISER_ERR("unknown opcode %d", opcode);
+	}
+
+	return (is_control);
+}
+
+/* icl_conn method: map the PDU headers and post it on the send queue. */
+void
+iser_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
+{
+	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+	struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);
+	int ret;
+
+	ret = iser_initialize_headers(iser_pdu, iser_conn);
+	if (ret) {
+		/* NOTE(review): the PDU is silently dropped on failure. */
+		ISER_ERR("Failed to map TX descriptor pdu %p", iser_pdu);
+		return;
+	}
+
+	/* Control PDUs and SCSI commands take different send paths. */
+	if (is_control_opcode(ip->ip_bhs->bhs_opcode)) {
+		ret = iser_send_control(iser_conn, iser_pdu);
+		if (unlikely(ret))
+			ISER_ERR("Failed to send control pdu %p", iser_pdu);
+	} else {
+		ret = iser_send_command(iser_conn, iser_pdu);
+		if (unlikely(ret))
+			ISER_ERR("Failed to send command pdu %p", iser_pdu);
+	}
+}
+
+/* icl backend constructor, registered with the icl layer at module load. */
+static struct icl_conn *
+iser_new_conn(const char *name, struct mtx *lock)
+{
+	struct iser_conn *iser_conn;
+	struct icl_conn *ic;
+
+	refcount_acquire(&icl_iser_ncons);
+
+	iser_conn = (struct iser_conn *)kobj_create(&icl_iser_class, M_ICL_ISER, M_WAITOK | M_ZERO);
+	/* NOTE(review): with M_WAITOK the allocation cannot fail, so this
+	 * check is effectively dead code. */
+	if (!iser_conn) {
+		ISER_ERR("failed to allocate iser conn");
+		refcount_release(&icl_iser_ncons);
+		return (NULL);
+	}
+
+	cv_init(&iser_conn->up_cv, "iser_cv");
+	sx_init(&iser_conn->state_mutex, "iser_conn_state_mutex");
+	mtx_init(&iser_conn->ib_conn.beacon.flush_lock, "flush_lock", NULL, MTX_DEF);
+	cv_init(&iser_conn->ib_conn.beacon.flush_cv, "flush_cv");
+	mtx_init(&iser_conn->ib_conn.lock, "lock", NULL, MTX_DEF);
+
+	ic = &iser_conn->icl_conn;
+	ic->ic_lock = lock;
+	ic->ic_name = name;
+	ic->ic_driver = strdup("iser", M_TEMP);
+	ic->ic_iser = true;
+
+	return (ic);
+}
+
+/* icl_conn method: destroy synchronization primitives and free the conn. */
+void
+iser_conn_free(struct icl_conn *ic)
+{
+	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+	/* NOTE(review): ib_conn.lock is mtx_init'ed in iser_new_conn() but
+	 * never mtx_destroy'ed here -- confirm this is not a lock leak. */
+	cv_destroy(&iser_conn->ib_conn.beacon.flush_cv);
+	mtx_destroy(&iser_conn->ib_conn.beacon.flush_lock);
+	sx_destroy(&iser_conn->state_mutex);
+	cv_destroy(&iser_conn->up_cv);
+	kobj_delete((struct kobj *)iser_conn, M_ICL_ISER);
+	refcount_release(&icl_iser_ncons);
+}
+
+/*
+ * icl_conn method: move the connection to full-feature phase.
+ * Allocates RX descriptors and posts the initial receive work requests,
+ * unless this is a discovery session (login buffer only).
+ */
+int
+iser_conn_handoff(struct icl_conn *ic, int cmds_max)
+{
+	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+	int error = 0;
+
+	sx_xlock(&iser_conn->state_mutex);
+	if (iser_conn->state != ISER_CONN_UP) {
+		error = EINVAL;
+		ISER_ERR("iser_conn %p state is %d, teardown started\n",
+			 iser_conn, iser_conn->state);
+		goto out;
+	}
+
+	/*
+	 * In discovery session no need to allocate rx desc and posting recv
+	 * work request
+	 */
+	if (ic->ic_session_type_discovery(ic))
+		goto out;
+
+	error = iser_alloc_rx_descriptors(iser_conn, cmds_max);
+	if (error)
+		goto out;
+
+	error = iser_post_recvm(iser_conn, iser_conn->min_posted_rx);
+	if (error)
+		goto post_error;
+
+	sx_xunlock(&iser_conn->state_mutex);
+	return (error);
+
+post_error:
+	iser_free_rx_descriptors(iser_conn);
+out:
+	sx_xunlock(&iser_conn->state_mutex);
+	return (error);
+
+}
+
+/**
+ * Frees all conn objects
+ */
+void
+iser_conn_release(struct icl_conn *ic)
+{
+	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	struct iser_conn *curr, *tmp;
+
+	mtx_lock(&ig.connlist_mutex);
+	/*
+	 * Search for iser connection in global list.
+	 * It may not be there in case of failure in connection establishment
+	 * stage.
+	 */
+	list_for_each_entry_safe(curr, tmp, &ig.connlist, conn_list) {
+		if (iser_conn == curr) {
+			ISER_WARN("found iser_conn %p", iser_conn);
+			list_del(&iser_conn->conn_list);
+		}
+	}
+	mtx_unlock(&ig.connlist_mutex);
+
+	/*
+	 * In case we reconnecting or removing session, we need to
+	 * release IB resources (which is safe to call more than once).
+	 */
+	sx_xlock(&iser_conn->state_mutex);
+	iser_free_ib_conn_res(iser_conn, true);
+	sx_xunlock(&iser_conn->state_mutex);
+
+	/* Tear down the RDMA CM id last; skip if it was never created. */
+	if (ib_conn->cma_id != NULL) {
+		rdma_destroy_id(ib_conn->cma_id);
+		ib_conn->cma_id = NULL;
+	}
+
+}
+
+/* icl_conn method: initiate orderly teardown of an iser connection. */
+void
+iser_conn_close(struct icl_conn *ic)
+{
+	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+	ISER_INFO("closing conn %p", iser_conn);
+
+	sx_xlock(&iser_conn->state_mutex);
+	/*
+	 * In case iser connection is waiting on conditional variable
+	 * (state PENDING) and we try to close it before connection establishment,
+	 * we need to signal it to continue releasing connection properly.
+	 */
+	if (!iser_conn_terminate(iser_conn) && iser_conn->state == ISER_CONN_PENDING)
+		cv_signal(&iser_conn->up_cv);
+	sx_xunlock(&iser_conn->state_mutex);
+
+}
+
+/*
+ * icl_conn method: establish the RDMA connection.  Resolves the address,
+ * then sleeps on up_cv until the CM event handler (iser_cma_handler)
+ * drives the connection to ISER_CONN_UP or fails it.  Returns 0 on
+ * success or a positive errno.
+ */
+int
+iser_conn_connect(struct icl_conn *ic, int domain, int socktype,
+		int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa)
+{
+	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+	struct ib_conn *ib_conn = &iser_conn->ib_conn;
+	int err = 0;
+
+	sx_xlock(&iser_conn->state_mutex);
+	 /* the device is known only --after-- address resolution */
+	ib_conn->device = NULL;
+
+	iser_conn->state = ISER_CONN_PENDING;
+
+	ib_conn->cma_id = rdma_create_id(iser_cma_handler, (void *)iser_conn,
+			RDMA_PS_TCP, IB_QPT_RC);
+	if (IS_ERR(ib_conn->cma_id)) {
+		/* Convert the Linux-style negative error to a positive errno. */
+		err = -PTR_ERR(ib_conn->cma_id);
+		ISER_ERR("rdma_create_id failed: %d", err);
+		goto id_failure;
+	}
+
+	err = rdma_resolve_addr(ib_conn->cma_id, from_sa, to_sa, 1000);
+	if (err) {
+		ISER_ERR("rdma_resolve_addr failed: %d", err);
+		if (err < 0)
+			err = -err;
+		goto addr_failure;
+	}
+
+	/* cv_wait drops and reacquires state_mutex; re-check state below. */
+	ISER_DBG("before cv_wait: %p", iser_conn);
+	cv_wait(&iser_conn->up_cv, &iser_conn->state_mutex);
+	ISER_DBG("after cv_wait: %p", iser_conn);
+
+	if (iser_conn->state != ISER_CONN_UP) {
+		err = EIO;
+		goto addr_failure;
+	}
+
+	err = iser_alloc_login_buf(iser_conn);
+	if (err)
+		goto addr_failure;
+	sx_xunlock(&iser_conn->state_mutex);
+
+	/* Connection is live; publish it on the global list. */
+	mtx_lock(&ig.connlist_mutex);
+	list_add(&iser_conn->conn_list, &ig.connlist);
+	mtx_unlock(&ig.connlist_mutex);
+
+	return (0);
+
+id_failure:
+	ib_conn->cma_id = NULL;
+addr_failure:
+	sx_xunlock(&iser_conn->state_mutex);
+	return (err);
+}
+
+/**
+ * Called with session spinlock held.
+ * No need to lock state mutex on an advisory check.
+ **/
+/* Lockless advisory check: is the connection in full-feature phase? */
+bool
+iser_conn_connected(struct icl_conn *ic)
+{
+	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+	return (iser_conn->state == ISER_CONN_UP);
+}
+
+/* icl_conn method: bind a CCB to its PDU before command submission. */
+int
+iser_conn_task_setup(struct icl_conn *ic, struct ccb_scsiio *csio,
+		     uint32_t *task_tagp, void **prvp, struct icl_pdu *ip)
+{
+	struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);
+
+	/* Stash the PDU as the task's private data for iser_conn_task_done(). */
+	*prvp = ip;
+	iser_pdu->csio = csio;
+
+	return (0);
+}
+
+/*
+ * icl_conn method: release per-task resources -- RDMA registrations and
+ * DMA mappings for both directions, the header mapping, and the PDU.
+ */
+void
+iser_conn_task_done(struct icl_conn *ic, void *prv)
+{
+	struct icl_pdu *ip = prv;
+	struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);
+	struct iser_device *device = iser_pdu->iser_conn->ib_conn.device;
+	struct iser_tx_desc *tx_desc = &iser_pdu->desc;
+
+	if (iser_pdu->dir[ISER_DIR_IN]) {
+		iser_unreg_rdma_mem(iser_pdu, ISER_DIR_IN);
+		iser_dma_unmap_task_data(iser_pdu,
+					 &iser_pdu->data[ISER_DIR_IN],
+					 DMA_FROM_DEVICE);
+	}
+
+	if (iser_pdu->dir[ISER_DIR_OUT]) {
+		iser_unreg_rdma_mem(iser_pdu, ISER_DIR_OUT);
+		iser_dma_unmap_task_data(iser_pdu,
+					 &iser_pdu->data[ISER_DIR_OUT],
+					 DMA_TO_DEVICE);
+	}
+
+	/* Undo the header mapping made in iser_initialize_headers(). */
+	if (likely(tx_desc->mapped)) {
+		ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
+				    ISER_HEADERS_LEN, DMA_TO_DEVICE);
+		tx_desc->mapped = false;
+	}
+
+	iser_pdu_free(ic, ip);
+}
+
+/*
+ * Report HBA capability flags to CAM; this backend supports unmapped
+ * I/O.  Note: the parameter list is (void) -- an empty () would declare
+ * an old-style unprototyped function in C11.
+ */
+static u_int32_t
+iser_hba_misc(void)
+{
+	return (PIM_UNMAPPED);
+}
+
+/* Report the maximum data transfer size (128 KB) to the icl layer. */
+static int
+iser_limits(size_t *limitp)
+{
+	*limitp = 128 * 1024;
+
+	return (0);
+}
+
+/* MOD_LOAD handler: create the PDU zone, register with icl, init globals. */
+static int
+icl_iser_load(void)
+{
+	int error;
+
+	ISER_DBG("Starting iSER datamover...");
+
+	icl_pdu_zone = uma_zcreate("icl_iser_pdu", sizeof(struct icl_iser_pdu),
+				   NULL, NULL, NULL, NULL,
+				   UMA_ALIGN_PTR, 0);
+	/* FIXME: Check rc */
+
+	refcount_init(&icl_iser_ncons, 0);
+
+	error = icl_register("iser", 0, iser_limits, iser_new_conn, iser_hba_misc);
+	/* NOTE(review): KASSERT is compiled out without INVARIANTS, leaving
+	 * 'error' unchecked on production kernels; consider a real error path. */
+	KASSERT(error == 0, ("failed to register iser"));
+
+	memset(&ig, 0, sizeof(struct iser_global));
+
+	/* device init is called only after the first addr resolution */
+	sx_init(&ig.device_list_mutex,  "global_device_lock");
+	INIT_LIST_HEAD(&ig.device_list);
+	mtx_init(&ig.connlist_mutex, "global_conn_lock", NULL, MTX_DEF);
+	INIT_LIST_HEAD(&ig.connlist);
+	sx_init(&ig.close_conns_mutex,  "global_close_conns_lock");
+
+	return (error);
+}
+
+/* MOD_UNLOAD handler: refuse while connections exist, then tear down. */
+static int
+icl_iser_unload(void)
+{
+	ISER_DBG("Removing iSER datamover...");
+
+	/* NOTE(review): unsynchronized read of icl_iser_ncons; presumably
+	 * the module framework prevents racing connection creation -- confirm. */
+	if (icl_iser_ncons != 0)
+		return (EBUSY);
+
+	sx_destroy(&ig.close_conns_mutex);
+	mtx_destroy(&ig.connlist_mutex);
+	sx_destroy(&ig.device_list_mutex);
+
+	icl_unregister("iser");
+
+	uma_zdestroy(icl_pdu_zone);
+
+	return (0);
+}
+
+/* Module event handler dispatching load/unload requests. */
+static int
+icl_iser_modevent(module_t mod, int what, void *arg)
+{
+	switch (what) {
+	case MOD_LOAD:
+		return (icl_iser_load());
+	case MOD_UNLOAD:
+		return (icl_iser_unload());
+	default:
+		return (EINVAL);
+	}
+}
+
+moduledata_t icl_iser_data = {
+	.name = "icl_iser",
+	.evhand = icl_iser_modevent,
+	.priv = 0
+};
+
+/* Attach at driver-init time; depends on the iSCSI stack (icl, iscsi),
+ * the RDMA core (ibcore) and the Linux KPI compatibility shim. */
+DECLARE_MODULE(icl_iser, icl_iser_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
+MODULE_DEPEND(icl_iser, icl, 1, 1, 1);
+MODULE_DEPEND(icl_iser, iscsi, 1, 1, 1);
+MODULE_DEPEND(icl_iser, ibcore, 1, 1, 1);
+MODULE_DEPEND(icl_iser, linuxkpi, 1, 1, 1);
+MODULE_VERSION(icl_iser, 1);
+

Added: head/sys/dev/iser/icl_iser.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/dev/iser/icl_iser.h	Thu May 26 09:49:29 2016	(r300723)
@@ -0,0 +1,547 @@
+/* $FreeBSD$ */
+/*-
+ * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef ICL_ISER_H
+#define ICL_ISER_H
+
+/*
+ * iSCSI Common Layer for RDMA.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/capsicum.h>
+#include <sys/condvar.h>
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/uio.h>
+#include <sys/taskqueue.h>
+#include <sys/bio.h>
+#include <vm/uma.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <dev/iscsi/icl.h>
+#include <dev/iscsi/iscsi_proto.h>
+#include <icl_conn_if.h>
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+#include <rdma/rdma_cm.h>
+
+
+/*
+ * Logging helpers gated on the kern.iser.debug sysctl:
+ * level > 2 enables DBG, > 1 INFO, > 0 WARN; errors always print.
+ */
+#define	ISER_DBG(X, ...)						\
+	do {								\
+		if (unlikely(iser_debug > 2))				\
+			printf("DEBUG: %s: " X "\n",			\
+				__func__, ## __VA_ARGS__);		\
+	} while (0)
+
+#define	ISER_INFO(X, ...)						\
+	do {								\
+		if (unlikely(iser_debug > 1))				\
+			printf("INFO: %s: " X "\n",			\
+				__func__, ## __VA_ARGS__);		\
+	} while (0)
+
+#define	ISER_WARN(X, ...)						\
+	do {								\
+		if (unlikely(iser_debug > 0)) {				\
+			printf("WARNING: %s: " X "\n",			\
+				__func__, ## __VA_ARGS__);		\
+		}							\
+	} while (0)
+
+#define	ISER_ERR(X, ...) 						\
+	printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__)
+
+#define ISER_VER			0x10
+#define ISER_WSV			0x08
+#define ISER_RSV			0x04
+
+#define ISER_FASTREG_LI_WRID		0xffffffffffffffffULL
+#define ISER_BEACON_WRID		0xfffffffffffffffeULL
+
+#define SHIFT_4K	12
+#define SIZE_4K	(1ULL << SHIFT_4K)
+#define MASK_4K	(~(SIZE_4K-1))
+
+/* support up to 512KB in one RDMA */
+#define ISCSI_ISER_SG_TABLESIZE         (0x80000 >> SHIFT_4K)
+#define ISER_DEF_XMIT_CMDS_MAX 256
+
+/* the max RX (recv) WR supported by the iSER QP is defined by                 *
+ * max_recv_wr = commands_max + recv_beacon                                    */
+#define ISER_QP_MAX_RECV_DTOS  (ISER_DEF_XMIT_CMDS_MAX + 1)
+#define ISER_MIN_POSTED_RX		(ISER_DEF_XMIT_CMDS_MAX >> 2)
+
+/* QP settings */
+/* Maximal bounds on received asynchronous PDUs */
+#define ISER_MAX_RX_MISC_PDUS           4 /* NOOP_IN(2) , ASYNC_EVENT(2)   */
+#define ISER_MAX_TX_MISC_PDUS           6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */
+
+/* the max TX (send) WR supported by the iSER QP is defined by                 *
+ * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect   *
+ * to have at max for SCSI command. The tx posting & completion handling code  *
+ * supports -EAGAIN scheme where tx is suspended till the QP has room for more *
+ * send WR. D=8 comes from 64K/8K                                              */
+
+#define ISER_INFLIGHT_DATAOUTS		8
+
+/* the send_beacon increase the max_send_wr by 1  */
+#define ISER_QP_MAX_REQ_DTOS		(ISER_DEF_XMIT_CMDS_MAX *    \
+					(1 + ISER_INFLIGHT_DATAOUTS) + \
+					ISER_MAX_TX_MISC_PDUS        + \
+					ISER_MAX_RX_MISC_PDUS + 1)
+
+#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr			\
+					 - ISER_MAX_TX_MISC_PDUS	\
+					 - ISER_MAX_RX_MISC_PDUS - 1) /	\
+					 (1 + ISER_INFLIGHT_DATAOUTS))
+
+#define ISER_WC_BATCH_COUNT   16
+#define ISER_SIGNAL_CMD_COUNT 32
+
+/* Maximal QP's recommended per CQ. In case we use more QP's per CQ we might   *
+ * encounter a CQ overrun state.                                               */
+#define ISCSI_ISER_MAX_CONN	8
+#define ISER_MAX_RX_LEN		(ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_TX_LEN		(ISER_QP_MAX_REQ_DTOS  * ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_CQ_LEN		(ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
+				 ISCSI_ISER_MAX_CONN)
+
+#define ISER_ZBVA_NOT_SUPPORTED                0x80
+#define ISER_SEND_W_INV_NOT_SUPPORTED	0x40
+
+#define icl_to_iser_conn(ic) \
+	container_of(ic, struct iser_conn, icl_conn)
+#define icl_to_iser_pdu(ip) \
+	container_of(ip, struct icl_iser_pdu, icl_pdu)
+
+/**
+ * struct iser_hdr - iSER header
+ *
+ * @flags:        flags support (zbva, remote_inv)
+ * @rsvd:         reserved
+ * @write_stag:   write rkey
+ * @write_va:     write virtual address
+ * @read_stag:    read rkey
+ * @read_va:      read virtual address
+ */
+struct iser_hdr {
+	u8      flags;
+	u8      rsvd[3];
+	__be32  write_stag;
+	__be64  write_va;
+	__be32  read_stag;
+	__be64  read_va;
+} __attribute__((packed));
+
+/* iSER RDMA-CM private-data header exchanged at connection establishment. */
+struct iser_cm_hdr {
+	u8      flags;
+	u8      rsvd[3];
+} __packed;
+
+/* Constant PDU lengths calculations */
+#define ISER_HEADERS_LEN  (sizeof(struct iser_hdr) + ISCSI_BHS_SIZE)
+
+#define ISER_RECV_DATA_SEG_LEN	128
+#define ISER_RX_PAYLOAD_SIZE	(ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+
+#define ISER_RX_LOGIN_SIZE	(ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
+
+enum iser_conn_state {
+	ISER_CONN_INIT,		   /* descriptor allocd, no conn          */
+	ISER_CONN_PENDING,	   /* in the process of being established */
+	ISER_CONN_UP,		   /* up and running                      */
+	ISER_CONN_TERMINATING,	   /* in the process of being terminated  */
+	ISER_CONN_DOWN,		   /* shut down                           */
+	ISER_CONN_STATES_NUM
+};
+
+/* Lifecycle states of an iser task (struct icl_iser_pdu). */
+enum iser_task_status {
+	ISER_TASK_STATUS_INIT = 0,
+	ISER_TASK_STATUS_STARTED,
+	ISER_TASK_STATUS_COMPLETED
+};
+
+enum iser_data_dir {
+	ISER_DIR_IN = 0,	   /* to initiator */
+	ISER_DIR_OUT,		   /* from initiator */
+	ISER_DIRS_NUM
+};
+
+/**
+ * struct iser_mem_reg - iSER memory registration info
+ *
+ * @sge:          memory region sg element
+ * @rkey:         memory region remote key
+ * @mem_h:        pointer to registration context (FMR/Fastreg)
+ */
+struct iser_mem_reg {
+	struct ib_sge	 sge;
+	u32		 rkey;
+	void		*mem_h;
+};
+
+/* Flavors of TX descriptor; see struct iser_tx_desc. */
+enum iser_desc_type {
+	ISCSI_TX_CONTROL ,
+	ISCSI_TX_SCSI_COMMAND,
+	ISCSI_TX_DATAOUT
+};
+
+/**
+ * struct iser_data_buf - iSER data buffer
+ *
+ * @sg:           pointer to the sg list
+ * @size:         num entries of this sg
+ * @data_len:     total buffer byte len
+ * @dma_nents:    returned by dma_map_sg
+ * @copy_buf:     allocated copy buf for SGs unaligned
+ *                for rdma which are copied
+ * @orig_sg:      pointer to the original sg list (in case
+ *                we used a copy)
+ * @sg_single:    SG-ified clone of a non SG SC or
+ *                unaligned SG
+ */
+struct iser_data_buf {
+	struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE];
+	void               *sg;
+	unsigned int       size;
+	unsigned long      data_len;
+	unsigned int       dma_nents;
+	char               *copy_buf;
+	struct scatterlist *orig_sg;
+	struct scatterlist sg_single;
+  };
+
+/* fwd declarations */
+struct iser_conn;
+struct ib_conn;
+struct iser_device;
+
+/**
+ * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
+ *
+ * @iser_header:   iser header
+ * @iscsi_header:  iscsi header (bhs)
+ * @type:          command/control/dataout
+ * @dma_addr:      header buffer dma_address
+ * @tx_sg:         sg[0] points to iser/iscsi headers
+ *                 sg[1] optionally points to either of immediate data
+ *                 unsolicited data-out or control
+ * @num_sge:       number sges used on this TX task
+ * @mapped:        indicates if the descriptor is dma mapped
+ */
+struct iser_tx_desc {
+	struct iser_hdr              iser_header;
+	struct iscsi_bhs             iscsi_header __attribute__((packed));
+	enum   iser_desc_type        type;
+	u64		             dma_addr;
+	struct ib_sge		     tx_sg[2];
+	int                          num_sge;
+	bool                         mapped;
+};
+
+#define ISER_RX_PAD_SIZE	(256 - (ISER_RX_PAYLOAD_SIZE + \
+					sizeof(u64) + sizeof(struct ib_sge)))
+/**
+ * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
+ *
+ * @iser_header:   iser header
+ * @iscsi_header:  iscsi header
+ * @data:          received data segment
+ * @dma_addr:      receive buffer dma address
+ * @rx_sg:         ib_sge of receive buffer
+ * @pad:           for sense data TODO: Modify to maximum sense length supported
+ */
+struct iser_rx_desc {
+	struct iser_hdr              iser_header;
+	struct iscsi_bhs             iscsi_header;
+	char		             data[ISER_RECV_DATA_SEG_LEN];
+	u64		             dma_addr;
+	struct ib_sge		     rx_sg;
+	char		             pad[ISER_RX_PAD_SIZE];
+} __attribute__((packed));
+
+/*
+ * Per-PDU state for the iser backend: the generic icl_pdu, the TX
+ * descriptor holding its headers, and per-direction RDMA registration
+ * and data-buffer bookkeeping.
+ */
+struct icl_iser_pdu {
+	struct icl_pdu               icl_pdu;
+	struct iser_tx_desc          desc;
+	struct iser_conn             *iser_conn;
+	enum iser_task_status        status;
+	struct ccb_scsiio 			 *csio;
+	int                          command_sent;
+	int                          dir[ISER_DIRS_NUM];
+	struct iser_mem_reg          rdma_reg[ISER_DIRS_NUM];
+	struct iser_data_buf         data[ISER_DIRS_NUM];
+};
+
+/**
+ * struct iser_comp - iSER completion context
+ *
+ * @device:     pointer to device handle
+ * @cq:         completion queue
+ * @wcs:        work completion array
+ * @tq:    	taskqueue handle
+ * @task:    	task to run task_fn
+ * @active_qps: Number of active QPs attached
+ *              to completion context
+ */
+struct iser_comp {
+	struct iser_device      *device;
+	struct ib_cq		*cq;
+	struct ib_wc		 wcs[ISER_WC_BATCH_COUNT];
+	struct taskqueue        *tq;
+	struct task             task;
+	int                      active_qps;
+};
+
+/**
+ * struct iser_device - iSER device handle
+ *
+ * @ib_device:     RDMA device
+ * @pd:            Protection Domain for this device
+ * @dev_attr:      Device attributes container
+ * @mr:            Global DMA memory region
+ * @event_handler: IB events handle routine
+ * @ig_list:	   entry in devices list
+ * @refcount:      Reference counter, dominated by open iser connections
+ * @comps_used:    Number of completion contexts used, Min between online
+ *                 cpus and device max completion vectors
+ * @comps:         Dynamically allocated array of completion handlers
+ */
+struct iser_device {
+	struct ib_device             *ib_device;
+	struct ib_pd	             *pd;
+	struct ib_device_attr	     dev_attr;
+	struct ib_mr	             *mr;
+	struct ib_event_handler      event_handler;
+	struct list_head             ig_list;
+	int                          refcount;
+	int			     comps_used;
+	struct iser_comp	     *comps;
+};
+
+/**
+ * struct iser_reg_resources - Fast registration resources
+ *
+ * @mr:         memory region
+ * @frpl:       fast reg page list
+ * @mr_valid:   is mr valid indicator
+ */
+struct iser_reg_resources {
+	struct ib_mr                     *mr;
+	struct ib_fast_reg_page_list     *frpl;
+	u8                                mr_valid:1;
+};
+
+/**
+ * struct fast_reg_descriptor - Fast registration descriptor
+ *
+ * @list:           entry in connection fastreg pool
+ * @rsc:            data buffer registration resources
+ */
+struct fast_reg_descriptor {
+	struct list_head		  list;
+	struct iser_reg_resources	  rsc;
+};
+
+
+/**
+ * struct iser_beacon - beacon to signal all flush errors were drained
+ *
+ * @send:           send wr
+ * @recv:           recv wr
+ * @flush_lock:     protects flush_cv
+ * @flush_cv:       condition variable for beacon flush
+ */
+struct iser_beacon {
+	union {
+		struct ib_send_wr	send;
+		struct ib_recv_wr	recv;
+	};
+	struct mtx		     flush_lock;
+	struct cv		     flush_cv;
+};
+
+/**
+ * struct ib_conn - Infiniband related objects
+ *
+ * @cma_id:              rdma_cm connection manager handle
+ * @qp:                  Connection Queue-pair
+ * @device:              reference to iser device
+ * @comp:                iser completion context
+  */
+struct ib_conn {
+	struct rdma_cm_id           *cma_id;
+	struct ib_qp	            *qp;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list