svn commit: r300723 - in head/sys: dev/iser modules/iser
Edward Tomasz Napierala
trasz at FreeBSD.org
Thu May 26 09:49:30 UTC 2016
Author: trasz
Date: Thu May 26 09:49:29 2016
New Revision: 300723
URL: https://svnweb.freebsd.org/changeset/base/300723
Log:
Bring in the Mellanox implementation of iSER (iSCSI over RDMA) initiator,
written by Sagi Grimberg <sagig at mellanox.com> and Max Gurtovoy
<maxg at mellanox.com>.
This code comes from https://github.com/sagigrimberg/iser-freebsd, branch
iser-rebase-11-current-r291993. It's not connected to the build just yet;
it still needs some tweaks to adapt to my changes to iSCSI infrastructure.
Big thanks to Mellanox for their support for FreeBSD!
Obtained from: Mellanox Technologies
MFC after: 1 month
Relnotes: yes
Added:
head/sys/dev/iser/
head/sys/dev/iser/icl_iser.c (contents, props changed)
head/sys/dev/iser/icl_iser.h (contents, props changed)
head/sys/dev/iser/iser_initiator.c (contents, props changed)
head/sys/dev/iser/iser_memory.c (contents, props changed)
head/sys/dev/iser/iser_verbs.c (contents, props changed)
head/sys/modules/iser/
head/sys/modules/iser/Makefile (contents, props changed)
Added: head/sys/dev/iser/icl_iser.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ head/sys/dev/iser/icl_iser.c Thu May 26 09:49:29 2016 (r300723)
@@ -0,0 +1,582 @@
+/* $FreeBSD$ */
+/*-
+ * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "icl_iser.h"
+
/* sysctl node kern.iser with a tunable debug verbosity (kern.iser.debug). */
SYSCTL_NODE(_kern, OID_AUTO, iser, CTLFLAG_RW, 0, "iSER module");
int iser_debug = 0;
SYSCTL_INT(_kern_iser, OID_AUTO, debug, CTLFLAG_RWTUN,
    &iser_debug, 0, "Enable iser debug messages");

/* malloc(9) type for connection objects; UMA zone for PDU allocation. */
static MALLOC_DEFINE(M_ICL_ISER, "icl_iser", "iSCSI iser backend");
static uma_zone_t icl_pdu_zone;

/* Count of live iser connections; module unload is refused while nonzero. */
static volatile u_int icl_iser_ncons;
/* Global state: device list, connection list and their locks. */
struct iser_global ig;
+
/*
 * Forward declarations of the icl_conn kobj method implementations
 * (interface defined in icl_conn_if.m).
 */
static icl_conn_new_pdu_t iser_conn_new_pdu;
static icl_conn_pdu_free_t iser_conn_pdu_free;
static icl_conn_pdu_data_segment_length_t iser_conn_pdu_data_segment_length;
static icl_conn_pdu_append_data_t iser_conn_pdu_append_data;
static icl_conn_pdu_queue_t iser_conn_pdu_queue;
static icl_conn_handoff_t iser_conn_handoff;
static icl_conn_free_t iser_conn_free;
static icl_conn_close_t iser_conn_close;
static icl_conn_release_t iser_conn_release;
static icl_conn_connect_t iser_conn_connect;
static icl_conn_connected_t iser_conn_connected;
static icl_conn_task_setup_t iser_conn_task_setup;
static icl_conn_task_done_t iser_conn_task_done;
static icl_conn_pdu_get_data_t iser_conn_pdu_get_data;

/* Method table binding the icl_conn interface to the iser backend. */
static kobj_method_t icl_iser_methods[] = {
    KOBJMETHOD(icl_conn_new_pdu, iser_conn_new_pdu),
    KOBJMETHOD(icl_conn_pdu_free, iser_conn_pdu_free),
    KOBJMETHOD(icl_conn_pdu_data_segment_length, iser_conn_pdu_data_segment_length),
    KOBJMETHOD(icl_conn_pdu_append_data, iser_conn_pdu_append_data),
    KOBJMETHOD(icl_conn_pdu_queue, iser_conn_pdu_queue),
    KOBJMETHOD(icl_conn_handoff, iser_conn_handoff),
    KOBJMETHOD(icl_conn_free, iser_conn_free),
    KOBJMETHOD(icl_conn_close, iser_conn_close),
    KOBJMETHOD(icl_conn_release, iser_conn_release),
    KOBJMETHOD(icl_conn_connect, iser_conn_connect),
    KOBJMETHOD(icl_conn_connected, iser_conn_connected),
    KOBJMETHOD(icl_conn_task_setup, iser_conn_task_setup),
    KOBJMETHOD(icl_conn_task_done, iser_conn_task_done),
    KOBJMETHOD(icl_conn_pdu_get_data, iser_conn_pdu_get_data),
    { 0, 0 }
};

/* kobj class; instances are struct iser_conn, created in iser_new_conn(). */
DEFINE_CLASS(icl_iser, icl_iser_methods, sizeof(struct iser_conn));
+
+/**
+ * iser_initialize_headers() - Initialize task headers
+ * @pdu: iser pdu
+ * @iser_conn: iser connection
+ *
+ * Notes:
+ * This routine may race with iser teardown flow for scsi
+ * error handling TMFs. So for TMF we should acquire the
+ * state mutex to avoid dereferencing the IB device which
+ * may have already been terminated (racing teardown sequence).
+ */
+int
+iser_initialize_headers(struct icl_iser_pdu *pdu, struct iser_conn *iser_conn)
+{
+ struct iser_tx_desc *tx_desc = &pdu->desc;
+ struct iser_device *device = iser_conn->ib_conn.device;
+ u64 dma_addr;
+ int ret = 0;
+
+ dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
+ ISER_HEADERS_LEN, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(device->ib_device, dma_addr)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ tx_desc->mapped = true;
+ tx_desc->dma_addr = dma_addr;
+ tx_desc->tx_sg[0].addr = tx_desc->dma_addr;
+ tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
+ tx_desc->tx_sg[0].lkey = device->mr->lkey;
+
+out:
+
+ return (ret);
+}
+
+int
+iser_conn_pdu_append_data(struct icl_conn *ic, struct icl_pdu *request,
+ const void *addr, size_t len, int flags)
+{
+ struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+ if (request->ip_bhs->bhs_opcode & ISCSI_BHS_OPCODE_LOGIN_REQUEST ||
+ request->ip_bhs->bhs_opcode & ISCSI_BHS_OPCODE_TEXT_REQUEST) {
+ ISER_DBG("copy to login buff");
+ memcpy(iser_conn->login_req_buf, addr, len);
+ request->ip_data_len = len;
+ }
+
+ return (0);
+}
+
/*
 * icl_conn_pdu_get_data method: copy received payload (login/text
 * response data) out to the caller's buffer.  Full-feature SCSI data
 * arrives via RDMA and never passes through here.
 */
void
iser_conn_pdu_get_data(struct icl_conn *ic, struct icl_pdu *ip,
		       size_t off, void *addr, size_t len)
{
	/* If we have a receive data, copy it to upper layer buffer */
	/*
	 * NOTE(review): ip_data_mbuf is declared struct mbuf *, so
	 * 'ip->ip_data_mbuf + off' advances by off * sizeof(struct mbuf),
	 * not off bytes.  Correct only if off is always 0 here, or if
	 * the field actually holds a flat buffer that should be cast to
	 * char * first — confirm against the iser receive path.
	 */
	if (ip->ip_data_mbuf)
		memcpy(addr, ip->ip_data_mbuf + off, len);
}
+
+/*
+ * Allocate icl_pdu with empty BHS to fill up by the caller.
+ */
+struct icl_pdu *
+iser_new_pdu(struct icl_conn *ic, int flags)
+{
+ struct icl_iser_pdu *iser_pdu;
+ struct icl_pdu *ip;
+ struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+ iser_pdu = uma_zalloc(icl_pdu_zone, flags | M_ZERO);
+ if (iser_pdu == NULL) {
+ ISER_WARN("failed to allocate %zd bytes", sizeof(*iser_pdu));
+ return (NULL);
+ }
+
+ iser_pdu->iser_conn = iser_conn;
+ ip = &iser_pdu->icl_pdu;
+ ip->ip_conn = ic;
+ ip->ip_bhs = &iser_pdu->desc.iscsi_header;
+
+ return (ip);
+}
+
/*
 * icl_conn_new_pdu method: thin wrapper around iser_new_pdu().
 */
struct icl_pdu *
iser_conn_new_pdu(struct icl_conn *ic, int flags)
{
	return (iser_new_pdu(ic, flags));
}
+
/*
 * Return a PDU to the zone.  The TX descriptor is expected to be
 * unmapped by this point (see iser_conn_task_done()).
 */
void
iser_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
{
	struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);

	uma_zfree(icl_pdu_zone, iser_pdu);
}
+
+size_t
+iser_conn_pdu_data_segment_length(struct icl_conn *ic,
+ const struct icl_pdu *request)
+{
+ uint32_t len = 0;
+
+ len += request->ip_bhs->bhs_data_segment_len[0];
+ len <<= 8;
+ len += request->ip_bhs->bhs_data_segment_len[1];
+ len <<= 8;
+ len += request->ip_bhs->bhs_data_segment_len[2];
+
+ return (len);
+}
+
/*
 * icl_conn_pdu_free method: thin wrapper around iser_pdu_free().
 */
void
iser_conn_pdu_free(struct icl_conn *ic, struct icl_pdu *ip)
{
	iser_pdu_free(ic, ip);
}
+
+static bool
+is_control_opcode(uint8_t opcode)
+{
+ bool is_control = false;
+
+ switch (opcode & ISCSI_OPCODE_MASK) {
+ case ISCSI_BHS_OPCODE_NOP_OUT:
+ case ISCSI_BHS_OPCODE_LOGIN_REQUEST:
+ case ISCSI_BHS_OPCODE_LOGOUT_REQUEST:
+ case ISCSI_BHS_OPCODE_TEXT_REQUEST:
+ is_control = true;
+ break;
+ case ISCSI_BHS_OPCODE_SCSI_COMMAND:
+ is_control = false;
+ break;
+ default:
+ ISER_ERR("unknown opcode %d", opcode);
+ }
+
+ return (is_control);
+}
+
/*
 * icl_conn_pdu_queue method: DMA-map the PDU's header descriptor and
 * post it on the QP.  Control PDUs go through iser_send_control(),
 * SCSI commands through iser_send_command().  Errors are logged but
 * not propagated (the method returns void); recovery is left to the
 * connection teardown path.
 */
void
iser_conn_pdu_queue(struct icl_conn *ic, struct icl_pdu *ip)
{
	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
	struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);
	int ret;

	ret = iser_initialize_headers(iser_pdu, iser_conn);
	if (ret) {
		ISER_ERR("Failed to map TX descriptor pdu %p", iser_pdu);
		return;
	}

	if (is_control_opcode(ip->ip_bhs->bhs_opcode)) {
		ret = iser_send_control(iser_conn, iser_pdu);
		if (unlikely(ret))
			ISER_ERR("Failed to send control pdu %p", iser_pdu);
	} else {
		ret = iser_send_command(iser_conn, iser_pdu);
		if (unlikely(ret))
			ISER_ERR("Failed to send command pdu %p", iser_pdu);
	}
}
+
/*
 * icl "new connection" backend, registered via icl_register() in
 * icl_iser_load().  Allocates a kobj-backed iser_conn and initializes
 * its synchronization primitives.
 */
static struct icl_conn *
iser_new_conn(const char *name, struct mtx *lock)
{
	struct iser_conn *iser_conn;
	struct icl_conn *ic;

	/* Account for the new connection; released again on failure. */
	refcount_acquire(&icl_iser_ncons);

	iser_conn = (struct iser_conn *)kobj_create(&icl_iser_class, M_ICL_ISER, M_WAITOK | M_ZERO);
	/*
	 * NOTE(review): kobj_create() with M_WAITOK sleeps until memory
	 * is available, so this failure branch is believed unreachable;
	 * harmless, but confirm before relying on the NULL path.
	 */
	if (!iser_conn) {
		ISER_ERR("failed to allocate iser conn");
		refcount_release(&icl_iser_ncons);
		return (NULL);
	}

	/* Synchronization primitives; torn down in iser_conn_free(). */
	cv_init(&iser_conn->up_cv, "iser_cv");
	sx_init(&iser_conn->state_mutex, "iser_conn_state_mutex");
	mtx_init(&iser_conn->ib_conn.beacon.flush_lock, "flush_lock", NULL, MTX_DEF);
	cv_init(&iser_conn->ib_conn.beacon.flush_cv, "flush_cv");
	mtx_init(&iser_conn->ib_conn.lock, "lock", NULL, MTX_DEF);

	ic = &iser_conn->icl_conn;
	ic->ic_lock = lock;
	/* NOTE(review): stores the caller's pointer; assumes 'name'
	 * outlives the connection — confirm with the icl layer. */
	ic->ic_name = name;
	ic->ic_driver = strdup("iser", M_TEMP);
	ic->ic_iser = true;

	return (ic);
}
+
+void
+iser_conn_free(struct icl_conn *ic)
+{
+ struct iser_conn *iser_conn = icl_to_iser_conn(ic);
+
+ cv_destroy(&iser_conn->ib_conn.beacon.flush_cv);
+ mtx_destroy(&iser_conn->ib_conn.beacon.flush_lock);
+ sx_destroy(&iser_conn->state_mutex);
+ cv_destroy(&iser_conn->up_cv);
+ kobj_delete((struct kobj *)iser_conn, M_ICL_ISER);
+ refcount_release(&icl_iser_ncons);
+}
+
/*
 * icl_conn_handoff method: move the connection into full-feature mode.
 * Allocates RX descriptors sized for cmds_max outstanding commands and
 * pre-posts the initial batch of receive work requests.  Runs under
 * state_mutex so it cannot race connection teardown.  Returns 0 on
 * success or an errno.
 */
int
iser_conn_handoff(struct icl_conn *ic, int cmds_max)
{
	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
	int error = 0;

	sx_xlock(&iser_conn->state_mutex);
	if (iser_conn->state != ISER_CONN_UP) {
		error = EINVAL;
		ISER_ERR("iser_conn %p state is %d, teardown started\n",
			 iser_conn, iser_conn->state);
		goto out;
	}

	/*
	 * In discovery session no need to allocate rx desc and posting recv
	 * work request
	 */
	if (ic->ic_session_type_discovery(ic))
		goto out;

	error = iser_alloc_rx_descriptors(iser_conn, cmds_max);
	if (error)
		goto out;

	error = iser_post_recvm(iser_conn, iser_conn->min_posted_rx);
	if (error)
		goto post_error;

	sx_xunlock(&iser_conn->state_mutex);
	return (error);

post_error:
	/* Undo the descriptor allocation if posting the receives failed. */
	iser_free_rx_descriptors(iser_conn);
out:
	sx_xunlock(&iser_conn->state_mutex);
	return (error);

}
+
/**
 * Frees all conn objects
 *
 * icl_conn_release method: unlink the connection from the global
 * connection list (if present), release its IB resources and destroy
 * the RDMA CM id.
 */
void
iser_conn_release(struct icl_conn *ic)
{
	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
	struct ib_conn *ib_conn = &iser_conn->ib_conn;
	struct iser_conn *curr, *tmp;

	mtx_lock(&ig.connlist_mutex);
	/*
	 * Search for iser connection in global list.
	 * It may not be there in case of failure in connection establishment
	 * stage.
	 */
	list_for_each_entry_safe(curr, tmp, &ig.connlist, conn_list) {
		if (iser_conn == curr) {
			ISER_WARN("found iser_conn %p", iser_conn);
			list_del(&iser_conn->conn_list);
		}
	}
	mtx_unlock(&ig.connlist_mutex);

	/*
	 * In case we are reconnecting or removing the session, we need to
	 * release IB resources (which is safe to call more than once).
	 */
	sx_xlock(&iser_conn->state_mutex);
	iser_free_ib_conn_res(iser_conn, true);
	sx_xunlock(&iser_conn->state_mutex);

	if (ib_conn->cma_id != NULL) {
		rdma_destroy_id(ib_conn->cma_id);
		ib_conn->cma_id = NULL;
	}

}
+
/*
 * icl_conn_close method: initiate connection termination via
 * iser_conn_terminate(), under state_mutex.
 */
void
iser_conn_close(struct icl_conn *ic)
{
	struct iser_conn *iser_conn = icl_to_iser_conn(ic);

	ISER_INFO("closing conn %p", iser_conn);

	sx_xlock(&iser_conn->state_mutex);
	/*
	 * In case iser connection is waiting on conditional variable
	 * (state PENDING) and we try to close it before connection establishment,
	 * we need to signal it to continue releasing connection properly.
	 */
	if (!iser_conn_terminate(iser_conn) && iser_conn->state == ISER_CONN_PENDING)
		cv_signal(&iser_conn->up_cv);
	sx_xunlock(&iser_conn->state_mutex);

}
+
/*
 * icl_conn_connect method: establish the RDMA connection.  Creates the
 * CM id, starts address resolution, and sleeps on up_cv until the CM
 * event handler (iser_cma_handler) either brings the connection UP or
 * signals failure; then allocates the login buffer and publishes the
 * connection on the global list.  Returns 0 or a positive errno.
 */
int
iser_conn_connect(struct icl_conn *ic, int domain, int socktype,
		int protocol, struct sockaddr *from_sa, struct sockaddr *to_sa)
{
	struct iser_conn *iser_conn = icl_to_iser_conn(ic);
	struct ib_conn *ib_conn = &iser_conn->ib_conn;
	int err = 0;

	sx_xlock(&iser_conn->state_mutex);
	/* the device is known only --after-- address resolution */
	ib_conn->device = NULL;

	iser_conn->state = ISER_CONN_PENDING;

	ib_conn->cma_id = rdma_create_id(iser_cma_handler, (void *)iser_conn,
					 RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(ib_conn->cma_id)) {
		/* Convert the negative linuxkpi error into a positive errno. */
		err = -PTR_ERR(ib_conn->cma_id);
		ISER_ERR("rdma_create_id failed: %d", err);
		goto id_failure;
	}

	err = rdma_resolve_addr(ib_conn->cma_id, from_sa, to_sa, 1000);
	if (err) {
		ISER_ERR("rdma_resolve_addr failed: %d", err);
		if (err < 0)
			err = -err;
		goto addr_failure;
	}

	/* state_mutex is dropped while sleeping and reacquired on wakeup. */
	ISER_DBG("before cv_wait: %p", iser_conn);
	cv_wait(&iser_conn->up_cv, &iser_conn->state_mutex);
	ISER_DBG("after cv_wait: %p", iser_conn);

	if (iser_conn->state != ISER_CONN_UP) {
		err = EIO;
		goto addr_failure;
	}

	err = iser_alloc_login_buf(iser_conn);
	if (err)
		goto addr_failure;
	sx_xunlock(&iser_conn->state_mutex);

	/* Make the connection visible to iser_conn_release(). */
	mtx_lock(&ig.connlist_mutex);
	list_add(&iser_conn->conn_list, &ig.connlist);
	mtx_unlock(&ig.connlist_mutex);

	return (0);

id_failure:
	ib_conn->cma_id = NULL;
addr_failure:
	sx_xunlock(&iser_conn->state_mutex);
	return (err);
}
+
/**
 * Called with session spinlock held.
 * No need to lock state mutex on an advisory check.
 *
 * icl_conn_connected method: report whether the connection is in the
 * UP state.
 **/
bool
iser_conn_connected(struct icl_conn *ic)
{
	struct iser_conn *iser_conn = icl_to_iser_conn(ic);

	return (iser_conn->state == ISER_CONN_UP);
}
+
/*
 * icl_conn_task_setup method: bind the CCB to the PDU so the RDMA
 * completion path can complete the SCSI I/O, and return the PDU
 * pointer as the task's private cookie (consumed by
 * iser_conn_task_done()).  task_tagp is not used by this backend.
 */
int
iser_conn_task_setup(struct icl_conn *ic, struct ccb_scsiio *csio,
		     uint32_t *task_tagp, void **prvp, struct icl_pdu *ip)
{
	struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);

	*prvp = ip;
	iser_pdu->csio = csio;

	return (0);
}
+
/*
 * icl_conn_task_done method: release all per-task DMA state and free
 * the PDU.  Unregisters RDMA memory and unmaps the data buffers for
 * whichever directions were used, then unmaps the header descriptor
 * mapped by iser_initialize_headers().
 */
void
iser_conn_task_done(struct icl_conn *ic, void *prv)
{
	struct icl_pdu *ip = prv;
	struct icl_iser_pdu *iser_pdu = icl_to_iser_pdu(ip);
	struct iser_device *device = iser_pdu->iser_conn->ib_conn.device;
	struct iser_tx_desc *tx_desc = &iser_pdu->desc;

	/* Read direction: data flowed target -> initiator. */
	if (iser_pdu->dir[ISER_DIR_IN]) {
		iser_unreg_rdma_mem(iser_pdu, ISER_DIR_IN);
		iser_dma_unmap_task_data(iser_pdu,
					 &iser_pdu->data[ISER_DIR_IN],
					 DMA_FROM_DEVICE);
	}

	/* Write direction: data flowed initiator -> target. */
	if (iser_pdu->dir[ISER_DIR_OUT]) {
		iser_unreg_rdma_mem(iser_pdu, ISER_DIR_OUT);
		iser_dma_unmap_task_data(iser_pdu,
					 &iser_pdu->data[ISER_DIR_OUT],
					 DMA_TO_DEVICE);
	}

	if (likely(tx_desc->mapped)) {
		ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
				    ISER_HEADERS_LEN, DMA_TO_DEVICE);
		tx_desc->mapped = false;
	}

	iser_pdu_free(ic, ip);
}
+
+static u_int32_t
+iser_hba_misc()
+{
+ return (PIM_UNMAPPED);
+}
+
/*
 * Report the maximum data segment length (in bytes) supported by this
 * backend to the icl layer.  Always succeeds.
 */
static int
iser_limits(size_t *limitp)
{
	const size_t iser_max_data_segment_len = 128 * 1024;

	*limitp = iser_max_data_segment_len;

	return (0);
}
+
+static int
+icl_iser_load(void)
+{
+ int error;
+
+ ISER_DBG("Starting iSER datamover...");
+
+ icl_pdu_zone = uma_zcreate("icl_iser_pdu", sizeof(struct icl_iser_pdu),
+ NULL, NULL, NULL, NULL,
+ UMA_ALIGN_PTR, 0);
+ /* FIXME: Check rc */
+
+ refcount_init(&icl_iser_ncons, 0);
+
+ error = icl_register("iser", 0, iser_limits, iser_new_conn, iser_hba_misc);
+ KASSERT(error == 0, ("failed to register iser"));
+
+ memset(&ig, 0, sizeof(struct iser_global));
+
+ /* device init is called only after the first addr resolution */
+ sx_init(&ig.device_list_mutex, "global_device_lock");
+ INIT_LIST_HEAD(&ig.device_list);
+ mtx_init(&ig.connlist_mutex, "global_conn_lock", NULL, MTX_DEF);
+ INIT_LIST_HEAD(&ig.connlist);
+ sx_init(&ig.close_conns_mutex, "global_close_conns_lock");
+
+ return (error);
+}
+
/*
 * Module unload: refuse while any connection exists, then tear down
 * the global locks, unregister from the icl layer and destroy the
 * PDU zone.
 */
static int
icl_iser_unload(void)
{
	ISER_DBG("Removing iSER datamover...");

	/*
	 * NOTE(review): icl_iser_ncons is tested without a lock and new
	 * connections can still be created until icl_unregister() runs
	 * below — presumably the module framework / icl layer closes
	 * this window; confirm.
	 */
	if (icl_iser_ncons != 0)
		return (EBUSY);

	sx_destroy(&ig.close_conns_mutex);
	mtx_destroy(&ig.connlist_mutex);
	sx_destroy(&ig.device_list_mutex);

	icl_unregister("iser");

	uma_zdestroy(icl_pdu_zone);

	return (0);
}
+
+static int
+icl_iser_modevent(module_t mod, int what, void *arg)
+{
+ switch (what) {
+ case MOD_LOAD:
+ return (icl_iser_load());
+ case MOD_UNLOAD:
+ return (icl_iser_unload());
+ default:
+ return (EINVAL);
+ }
+}
+
/*
 * Module glue: declare the module, its event handler, and its
 * dependencies on the iSCSI common layer (icl), the iSCSI initiator,
 * ibcore and linuxkpi.
 */
moduledata_t icl_iser_data = {
	.name = "icl_iser",
	.evhand = icl_iser_modevent,
	.priv = 0
};

DECLARE_MODULE(icl_iser, icl_iser_data, SI_SUB_DRIVERS, SI_ORDER_MIDDLE);
MODULE_DEPEND(icl_iser, icl, 1, 1, 1);
MODULE_DEPEND(icl_iser, iscsi, 1, 1, 1);
MODULE_DEPEND(icl_iser, ibcore, 1, 1, 1);
MODULE_DEPEND(icl_iser, linuxkpi, 1, 1, 1);
MODULE_VERSION(icl_iser, 1);
+
Added: head/sys/dev/iser/icl_iser.h
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ head/sys/dev/iser/icl_iser.h Thu May 26 09:49:29 2016 (r300723)
@@ -0,0 +1,547 @@
+/* $FreeBSD$ */
+/*-
+ * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef ICL_ISER_H
+#define ICL_ISER_H
+
+/*
+ * iSCSI Common Layer for RDMA.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/capsicum.h>
+#include <sys/condvar.h>
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/kernel.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/module.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/sx.h>
+#include <sys/uio.h>
+#include <sys/taskqueue.h>
+#include <sys/bio.h>
+#include <vm/uma.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <dev/iscsi/icl.h>
+#include <dev/iscsi/iscsi_proto.h>
+#include <icl_conn_if.h>
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+#include <rdma/rdma_cm.h>
+
+
/*
 * Debug print macros, gated on the kern.iser.debug sysctl level:
 * level > 2 enables ISER_DBG, level > 1 ISER_INFO, level > 0
 * ISER_WARN.  ISER_ERR always prints.
 */
#define ISER_DBG(X, ...) \
	do { \
		if (unlikely(iser_debug > 2)) \
			printf("DEBUG: %s: " X "\n", \
				__func__, ## __VA_ARGS__); \
	} while (0)

#define ISER_INFO(X, ...) \
	do { \
		if (unlikely(iser_debug > 1)) \
			printf("INFO: %s: " X "\n", \
				__func__, ## __VA_ARGS__); \
	} while (0)

#define ISER_WARN(X, ...) \
	do { \
		if (unlikely(iser_debug > 0)) { \
			printf("WARNING: %s: " X "\n", \
				__func__, ## __VA_ARGS__); \
		} \
	} while (0)

#define ISER_ERR(X, ...) \
	printf("ERROR: %s: " X "\n", __func__, ## __VA_ARGS__)

/* iSER protocol version and write/read STag-valid flag bits. */
#define ISER_VER 0x10
#define ISER_WSV 0x08
#define ISER_RSV 0x04

/* Reserved wr_id values for fastreg invalidation and the drain beacon. */
#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL
#define ISER_BEACON_WRID 0xfffffffffffffffeULL

/* 4KB page constants used for registration alignment. */
#define SHIFT_4K 12
#define SIZE_4K (1ULL << SHIFT_4K)
#define MASK_4K (~(SIZE_4K-1))

/* support up to 512KB in one RDMA */
#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K)
#define ISER_DEF_XMIT_CMDS_MAX 256

/* the max RX (recv) WR supported by the iSER QP is defined by *
 * max_recv_wr = commands_max + recv_beacon */
#define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX + 1)
#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2)

/* QP settings */
/* Maximal bounds on received asynchronous PDUs */
#define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */
#define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), SCSI_TMFUNC(2), LOGOUT(1) */

/* the max TX (send) WR supported by the iSER QP is defined by *
 * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect *
 * to have at max for SCSI command. The tx posting & completion handling code *
 * supports -EAGAIN scheme where tx is suspended till the QP has room for more *
 * send WR. D=8 comes from 64K/8K */

#define ISER_INFLIGHT_DATAOUTS 8

/* the send_beacon increase the max_send_wr by 1 */
#define ISER_QP_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \
				(1 + ISER_INFLIGHT_DATAOUTS) + \
				ISER_MAX_TX_MISC_PDUS + \
				ISER_MAX_RX_MISC_PDUS + 1)

#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \
					 - ISER_MAX_TX_MISC_PDUS \
					 - ISER_MAX_RX_MISC_PDUS - 1) / \
					 (1 + ISER_INFLIGHT_DATAOUTS))

/* Completions polled per CQ callback; send-signaling interval. */
#define ISER_WC_BATCH_COUNT 16
#define ISER_SIGNAL_CMD_COUNT 32

/* Maximal QP's recommended per CQ. In case we use more QP's per CQ we might *
 * encounter a CQ overrun state. */
#define ISCSI_ISER_MAX_CONN 8
#define ISER_MAX_RX_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
#define ISER_MAX_TX_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN)
#define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
			 ISCSI_ISER_MAX_CONN)

/* iSER header 'flags' bits (see struct iser_hdr below). */
#define ISER_ZBVA_NOT_SUPPORTED 0x80
#define ISER_SEND_W_INV_NOT_SUPPORTED 0x40

/* Convert generic icl objects to their embedding iser objects. */
#define icl_to_iser_conn(ic) \
	container_of(ic, struct iser_conn, icl_conn)
#define icl_to_iser_pdu(ip) \
	container_of(ip, struct icl_iser_pdu, icl_pdu)
+
/**
 * struct iser_hdr - iSER header
 *
 * @flags: flags support (zbva, remote_inv)
 * @rsvd: reserved
 * @write_stag: write rkey
 * @write_va: write virtual address
 * @read_stag: read rkey
 * @read_va: read virtual address
 */
struct iser_hdr {
	u8 flags;
	u8 rsvd[3];
	__be32 write_stag;
	__be64 write_va;
	__be32 read_stag;
	__be64 read_va;
} __attribute__((packed));

/* iSER CM REQ/REP private-data header exchanged at connect time. */
struct iser_cm_hdr {
	u8 flags;
	u8 rsvd[3];
} __packed;

/* Constant PDU lengths calculations */
#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + ISCSI_BHS_SIZE)

#define ISER_RECV_DATA_SEG_LEN 128
#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)

#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)

/* Connection life cycle; transitions are guarded by state_mutex. */
enum iser_conn_state {
	ISER_CONN_INIT, /* descriptor allocd, no conn */
	ISER_CONN_PENDING, /* in the process of being established */
	ISER_CONN_UP, /* up and running */
	ISER_CONN_TERMINATING, /* in the process of being terminated */
	ISER_CONN_DOWN, /* shut down */
	ISER_CONN_STATES_NUM
};

/* Per-task progress state. */
enum iser_task_status {
	ISER_TASK_STATUS_INIT = 0,
	ISER_TASK_STATUS_STARTED,
	ISER_TASK_STATUS_COMPLETED
};

/* Data transfer direction, also used to index per-task arrays. */
enum iser_data_dir {
	ISER_DIR_IN = 0, /* to initiator */
	ISER_DIR_OUT, /* from initiator */
	ISER_DIRS_NUM
};

/**
 * struct iser_mem_reg - iSER memory registration info
 *
 * @sge: memory region sg element
 * @rkey: memory region remote key
 * @mem_h: pointer to registration context (FMR/Fastreg)
 */
struct iser_mem_reg {
	struct ib_sge sge;
	u32 rkey;
	void *mem_h;
};

/* Kind of TX descriptor being posted. */
enum iser_desc_type {
	ISCSI_TX_CONTROL,
	ISCSI_TX_SCSI_COMMAND,
	ISCSI_TX_DATAOUT
};

/**
 * struct iser_data_buf - iSER data buffer
 *
 * @sg: pointer to the sg list
 * @size: num entries of this sg
 * @data_len: total buffer byte len
 * @dma_nents: returned by dma_map_sg
 * @copy_buf: allocated copy buf for SGs unaligned
 * for rdma which are copied
 * @orig_sg: pointer to the original sg list (in case
 * we used a copy)
 * @sg_single: SG-ified clone of a non SG SC or
 * unaligned SG
 */
struct iser_data_buf {
	/* NOTE(review): sgl is presumably the backing storage that
	 * 'sg' points at — confirm against the mapping code. */
	struct scatterlist sgl[ISCSI_ISER_SG_TABLESIZE];
	void *sg;
	unsigned int size;
	unsigned long data_len;
	unsigned int dma_nents;
	char *copy_buf;
	struct scatterlist *orig_sg;
	struct scatterlist sg_single;
};
+
/* fwd declarations */
struct iser_conn;
struct ib_conn;
struct iser_device;

/**
 * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
 *
 * @iser_header: iser header
 * @iscsi_header: iscsi header (bhs)
 * @type: command/control/dataout
 * @dma_addr: header buffer dma_address
 * @tx_sg: sg[0] points to iser/iscsi headers
 * sg[1] optionally points to either of immediate data
 * unsolicited data-out or control
 * @num_sge: number sges used on this TX task
 * @mapped: indicates if the descriptor is dma mapped
 * (set by iser_initialize_headers(), cleared on unmap)
 */
struct iser_tx_desc {
	struct iser_hdr iser_header;
	struct iscsi_bhs iscsi_header __attribute__((packed));
	enum iser_desc_type type;
	u64 dma_addr;
	struct ib_sge tx_sg[2];
	int num_sge;
	bool mapped;
};

/* Pads each RX descriptor out to a fixed 256-byte-aligned footprint. */
#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \
				 sizeof(u64) + sizeof(struct ib_sge)))
/**
 * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
 *
 * @iser_header: iser header
 * @iscsi_header: iscsi header
 * @data: received data segment
 * @dma_addr: receive buffer dma address
 * @rx_sg: ib_sge of receive buffer
 * @pad: for sense data TODO: Modify to maximum sense length supported
 */
struct iser_rx_desc {
	struct iser_hdr iser_header;
	struct iscsi_bhs iscsi_header;
	char data[ISER_RECV_DATA_SEG_LEN];
	u64 dma_addr;
	struct ib_sge rx_sg;
	char pad[ISER_RX_PAD_SIZE];
} __attribute__((packed));

/*
 * Per-task PDU: embeds the generic icl_pdu plus the TX descriptor,
 * the owning connection, the bound CCB and per-direction RDMA
 * registration/mapping state.
 */
struct icl_iser_pdu {
	struct icl_pdu icl_pdu;
	struct iser_tx_desc desc;
	struct iser_conn *iser_conn;
	enum iser_task_status status;
	struct ccb_scsiio *csio;
	int command_sent;
	int dir[ISER_DIRS_NUM];
	struct iser_mem_reg rdma_reg[ISER_DIRS_NUM];
	struct iser_data_buf data[ISER_DIRS_NUM];
};

/**
 * struct iser_comp - iSER completion context
 *
 * @device: pointer to device handle
 * @cq: completion queue
 * @wcs: work completion array
 * @tq: taskqueue handle
 * @task: task to run task_fn
 * @active_qps: Number of active QPs attached
 * to completion context
 */
struct iser_comp {
	struct iser_device *device;
	struct ib_cq *cq;
	struct ib_wc wcs[ISER_WC_BATCH_COUNT];
	struct taskqueue *tq;
	struct task task;
	int active_qps;
};

/**
 * struct iser_device - iSER device handle
 *
 * @ib_device: RDMA device
 * @pd: Protection Domain for this device
 * @dev_attr: Device attributes container
 * @mr: Global DMA memory region
 * @event_handler: IB events handle routine
 * @ig_list: entry in devices list
 * @refcount: Reference counter, dominated by open iser connections
 * @comps_used: Number of completion contexts used, Min between online
 * cpus and device max completion vectors
 * @comps: Dynamically allocated array of completion handlers
 */
struct iser_device {
	struct ib_device *ib_device;
	struct ib_pd *pd;
	struct ib_device_attr dev_attr;
	struct ib_mr *mr;
	struct ib_event_handler event_handler;
	struct list_head ig_list;
	int refcount;
	int comps_used;
	struct iser_comp *comps;
};

/**
 * struct iser_reg_resources - Fast registration resources
 *
 * @mr: memory region
 * @frpl: fast reg page list
 * @mr_valid: is mr valid indicator
 */
struct iser_reg_resources {
	struct ib_mr *mr;
	struct ib_fast_reg_page_list *frpl;
	u8 mr_valid:1;
};

/**
 * struct fast_reg_descriptor - Fast registration descriptor
 *
 * @list: entry in connection fastreg pool
 * @rsc: data buffer registration resources
 */
struct fast_reg_descriptor {
	struct list_head list;
	struct iser_reg_resources rsc;
};


/**
 * struct iser_beacon - beacon to signal all flush errors were drained
 *
 * @send: send wr
 * @recv: recv wr
 * @flush_lock: protects flush_cv
 * @flush_cv: condition variable for beacon flush
 */
struct iser_beacon {
	union {
		struct ib_send_wr send;
		struct ib_recv_wr recv;
	};
	struct mtx flush_lock;
	struct cv flush_cv;
};
+
+/**
+ * struct ib_conn - Infiniband related objects
+ *
+ * @cma_id:          rdma_cm connection manager handle
+ * @qp: Connection Queue-pair
+ * @device: reference to iser device
+ * @comp: iser completion context
+ */
+struct ib_conn {
+ struct rdma_cm_id *cma_id;
+ struct ib_qp *qp;
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-all
mailing list