svn commit: r213824 - projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib

Jeff Roberson jeff at FreeBSD.org
Thu Oct 14 02:02:55 UTC 2010


Author: jeff
Date: Thu Oct 14 02:02:55 2010
New Revision: 213824
URL: http://svn.freebsd.org/changeset/base/213824

Log:
   - Port and enable CM-based ipoib.
   - Enable multicast support.
   - Generally polish the ipoib port: fix potential memory leaks and
     improve locking and comments.
  
  Sponsored by:	Isilon Systems, iX Systems, and Panasas.
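
For reference, the port replaces the Linux skb queue and netif_tx lock
idioms with native ifqueue primitives: IF_ENQUEUE/IF_DEQUEUE take the
queue's ifq_mtx internally, and the unlocked _IF_* variants are used
where priv->lock is already held.  A condensed sketch of the pattern,
drawn from the ipoib_cm.c hunks below (error generation trimmed):

	/* Producer (ipoib_cm_mb_too_long): locked enqueue, kick the task. */
	int e = priv->cm.mb_queue.ifq_len;	/* depth before enqueue */

	IF_ENQUEUE(&priv->cm.mb_queue, mb);
	if (e == 0)				/* was empty: schedule the reap task */
		queue_work(ipoib_workqueue, &priv->cm.mb_task);

	/* Consumer (ipoib_cm_mb_reap): drain until empty. */
	for (;;) {
		IF_DEQUEUE(&priv->cm.mb_queue, mb);
		if (mb == NULL)
			break;
		/* ... send the ICMP/ICMP6 too-big error for mb ... */
		m_freem(mb);
	}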

Modified:
  projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
  projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c
  projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
  projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_main.c
  projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_multicast.c

Modified: projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h
==============================================================================
--- projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h	Thu Oct 14 01:49:40 2010	(r213823)
+++ projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib.h	Thu Oct 14 02:02:55 2010	(r213824)
@@ -92,7 +92,7 @@
 #define	INFINIBAND_ALEN		20	/* Octets in IPoIB HW addr */
 #define	MAX_MB_FRAGS		(8192 / MCLBYTES)
 
-#define	CONFIG_INFINIBAND_IPOIB_DEBUG
+#define	CONFIG_INFINIBAND_IPOIB_CM
 
 enum ipoib_flush_level {
 	IPOIB_FLUSH_LIGHT,
@@ -104,12 +104,12 @@ enum {
 	IPOIB_ENCAP_LEN		  = 4,
 	IPOIB_HEADER_LEN	  = IPOIB_ENCAP_LEN + INFINIBAND_ALEN,
 	IPOIB_UD_HEAD_SIZE	  = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
-	IPOIB_UD_RX_SG		  = 2, /* max buffer needed for 4K mtu */
+	IPOIB_UD_RX_SG		  = 1, /* max buffer needed for 4K mtu */
 
-	IPOIB_CM_MTU		  = 0x10000 - 0x10, /* padding to align header to 16 */
-	IPOIB_CM_BUF_SIZE	  = IPOIB_CM_MTU  + IPOIB_ENCAP_LEN,
+	IPOIB_CM_MTU		  = (16 * 1024) - 0x14,
+	IPOIB_CM_BUF_SIZE	  = IPOIB_CM_MTU + IPOIB_ENCAP_LEN,
 	IPOIB_CM_HEAD_SIZE	  = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
-	IPOIB_CM_RX_SG		  = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
+	IPOIB_CM_RX_SG		  = 1,	/* We only allocate a single mbuf. */
 	IPOIB_RX_RING_SIZE	  = 256,
 	IPOIB_TX_RING_SIZE	  = 128,
 	IPOIB_MAX_QUEUE_SIZE	  = 8192,
@@ -400,6 +400,7 @@ struct ipoib_path {
 	struct rb_node	      rb_node;
 	struct list_head      list;
 #ifdef CONFIG_INFINIBAND_IPOIB_CM
+	uint8_t		      hwaddr[INFINIBAND_ALEN];
 	struct ipoib_cm_tx   *cm;
 #endif
 	struct ipoib_ah      *ah;
@@ -437,6 +438,8 @@ int ipoib_open(struct ifnet *dev);
 int ipoib_add_pkey_attr(struct ifnet *dev);
 int ipoib_add_umcast_attr(struct ifnet *dev);
 
+void ipoib_demux(struct ifnet *ifp, struct mbuf *m, u_short proto);
+
 void ipoib_send(struct ifnet *dev, struct mbuf *mb,
 		struct ipoib_ah *address, u32 qpn);
 void ipoib_reap_ah(struct work_struct *work);
@@ -471,6 +474,7 @@ int ipoib_mcast_stop_thread(struct ifnet
 void ipoib_mcast_dev_down(struct ifnet *dev);
 void ipoib_mcast_dev_flush(struct ifnet *dev);
 
+void ipoib_path_free(struct ifnet *dev, struct ipoib_path *path);
 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
 struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct ifnet *dev);
 int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
@@ -524,10 +528,10 @@ static inline int ipoib_cm_admin_enabled
 		test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
 }
 
-static inline int ipoib_cm_enabled(struct ifnet *dev, struct llentry *n)
+static inline int ipoib_cm_enabled(struct ifnet *dev, uint8_t *hwaddr)
 {
 	struct ipoib_dev_priv *priv = dev->if_softc;
-	return IPOIB_CM_SUPPORTED(rt_key(n)->sa_data) &&
+	return IPOIB_CM_SUPPORTED(hwaddr) &&
 		test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
 }
 
@@ -581,7 +585,7 @@ static inline int ipoib_cm_admin_enabled
 {
 	return 0;
 }
-static inline int ipoib_cm_enabled(struct ifnet *dev, struct llentry *n)
+static inline int ipoib_cm_enabled(struct ifnet *dev, uint8_t *hwaddr)
 
 {
 	return 0;
@@ -663,7 +667,7 @@ int ipoib_cm_add_mode_attr(struct ifnet 
 static inline void ipoib_cm_mb_too_long(struct ifnet *dev, struct mbuf *mb,
 					 unsigned int mtu)
 {
-	m_free(mb);
+	m_freem(mb);
 }
 
 static inline void ipoib_cm_handle_rx_wc(struct ifnet *dev, struct ib_wc *wc)
@@ -730,30 +734,4 @@ extern int ipoib_debug_level;
 
 #define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
 
-static inline long
-test_and_clear_bit(long bit, long *var)
-{
-	long val;
-
-	bit = 1 << bit;
-	do {
-		val = *(volatile long *)var;
-	} while (atomic_cmpset_long(var, val, val & ~bit) == 0);
-
-	return !!(val & bit);
-}
-
-static inline long
-test_and_set_bit(long bit, long *var)
-{
-	long val;
-
-	bit = 1 << bit;
-	do {
-		val = *(volatile long *)var;
-	} while (atomic_cmpset_long(var, val, val | bit) == 0);
-
-	return !!(val & bit);
-}
-
 #endif /* _IPOIB_H */

Modified: projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c
==============================================================================
--- projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c	Thu Oct 14 01:49:40 2010	(r213823)
+++ projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_cm.c	Thu Oct 14 02:02:55 2010	(r213824)
@@ -32,10 +32,13 @@
 
 #include "ipoib.h"
 
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>
+#include <netinet/icmp6.h>
+
 #include <rdma/ib_cm.h>
 #include <rdma/ib_cache.h>
 #include <linux/delay.h>
-#include <linux/vmalloc.h>
 
 int ipoib_max_conn_qp = 128;
 
@@ -73,34 +76,29 @@ static struct ib_send_wr ipoib_cm_rx_dra
 static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
 			       struct ib_cm_event *event);
 
-static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
+static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv,
 				  u64 mapping[IPOIB_CM_RX_SG])
 {
-	int i;
 
 	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
 
-	for (i = 0; i < frags; ++i)
-		ib_dma_unmap_single(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
 }
 
 static int ipoib_cm_post_receive_srq(struct ifnet *dev, int id)
 {
 	struct ipoib_dev_priv *priv = dev->if_softc;
 	struct ib_recv_wr *bad_wr;
-	int i, ret;
+	int ret;
 
 	priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
-	for (i = 0; i < priv->cm.num_frags; ++i)
-		priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
+	priv->cm.rx_sge[0].addr = priv->cm.srq_ring[id].mapping[0];
 
 	ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
-		ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
-				      priv->cm.srq_ring[id].mapping);
-		m_free(priv->cm.srq_ring[id].mb);
+		ipoib_cm_dma_unmap_rx(priv, priv->cm.srq_ring[id].mapping);
+		m_freem(priv->cm.srq_ring[id].mb);
 		priv->cm.srq_ring[id].mb = NULL;
 	}
 
@@ -114,19 +112,17 @@ static int ipoib_cm_post_receive_nonsrq(
 {
 	struct ipoib_dev_priv *priv = dev->if_softc;
 	struct ib_recv_wr *bad_wr;
-	int i, ret;
+	int ret;
 
 	wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
 
-	for (i = 0; i < IPOIB_CM_RX_SG; ++i)
-		sge[i].addr = rx->rx_ring[id].mapping[i];
+	sge[0].addr = rx->rx_ring[id].mapping[0];
 
 	ret = ib_post_recv(rx->qp, wr, &bad_wr);
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
-		ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
-				      rx->rx_ring[id].mapping);
-		m_free(rx->rx_ring[id].mb);
+		ipoib_cm_dma_unmap_rx(priv, rx->rx_ring[id].mapping);
+		m_freem(rx->rx_ring[id].mb);
 		rx->rx_ring[id].mb = NULL;
 	}
 
@@ -135,55 +131,36 @@ static int ipoib_cm_post_receive_nonsrq(
 
 static struct mbuf *ipoib_cm_alloc_rx_mb(struct ifnet *dev,
 					     struct ipoib_cm_rx_buf *rx_ring,
-					     int id, int frags,
+					     int id,
 					     u64 mapping[IPOIB_CM_RX_SG])
 {
 	struct ipoib_dev_priv *priv = dev->if_softc;
 	struct mbuf *mb;
-	int i;
+	int buf_size;
 
-	mb = dev_alloc_mb(IPOIB_CM_HEAD_SIZE + 12);
+	buf_size = IPOIB_CM_HEAD_SIZE + 12;
+	if (buf_size <= MCLBYTES)
+		buf_size = MCLBYTES;
+	else if (buf_size <= MJUMPAGESIZE)
+		buf_size = MJUMPAGESIZE;
+	else if (buf_size <= MJUM9BYTES)
+		buf_size = MJUM9BYTES;
+	else if (buf_size < MJUM16BYTES)
+		buf_size = MJUM16BYTES;
+
+	mb = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, buf_size);
 	if (unlikely(!mb))
 		return NULL;
 
-	/*
-	 * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
-	 * IP header to a multiple of 16.
-	 */
-	mb_reserve(mb, 12);
-
 	mapping[0] = ib_dma_map_single(priv->ca, mtod(mb, void *),
 				       IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
 	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
-		m_free(mb);
+		m_freem(mb);
 		return NULL;
 	}
 
-	for (i = 0; i < frags; i++) {
-		struct page *page = alloc_page(GFP_ATOMIC);
-
-		if (!page)
-			goto partial_error;
-		mb_fill_page_desc(mb, i, page, 0, PAGE_SIZE);
-
-		mapping[i + 1] = ib_dma_map_page(priv->ca, mb_shinfo(mb)->frags[i].page,
-						 0, PAGE_SIZE, DMA_FROM_DEVICE);
-		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
-			goto partial_error;
-	}
-
 	rx_ring[id].mb = mb;
 	return mb;
-
-partial_error:
-
-	ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
-
-	for (; i > 0; --i)
-		ib_dma_unmap_single(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);
-
-	m_free(mb);
-	return NULL;
 }
 
 static void ipoib_cm_free_rx_ring(struct ifnet *dev,
@@ -194,12 +171,11 @@ static void ipoib_cm_free_rx_ring(struct
 
 	for (i = 0; i < ipoib_recvq_size; ++i)
 		if (rx_ring[i].mb) {
-			ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
-					      rx_ring[i].mapping);
-			m_free(rx_ring[i].mb);
+			ipoib_cm_dma_unmap_rx(priv, rx_ring[i].mapping);
+			m_freem(rx_ring[i].mb);
 		}
 
-	vfree(rx_ring);
+	kfree(rx_ring);
 }
 
 static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
@@ -349,7 +325,7 @@ static int ipoib_cm_nonsrq_init_rx(struc
 	int ret;
 	int i;
 
-	rx->rx_ring = vmalloc(ipoib_recvq_size * sizeof *rx->rx_ring);
+	rx->rx_ring = kzalloc(ipoib_recvq_size * sizeof *rx->rx_ring, GFP_KERNEL);
 	if (!rx->rx_ring) {
 		printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n",
 		       priv->ca->name, ipoib_recvq_size);
@@ -379,7 +355,7 @@ static int ipoib_cm_nonsrq_init_rx(struc
 	spin_unlock_irq(&priv->lock);
 
 	for (i = 0; i < ipoib_recvq_size; ++i) {
-		if (!ipoib_cm_alloc_rx_mb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
+		if (!ipoib_cm_alloc_rx_mb(dev, rx->rx_ring, i,
 					   rx->rx_ring[i].mapping)) {
 			ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
 				ret = -ENOMEM;
@@ -458,7 +434,7 @@ static int ipoib_cm_req_handler(struct i
 		goto err_qp;
 	}
 
-	psn = random32() & 0xffffff;
+	psn = random() & 0xffffff;
 	ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
 	if (ret)
 		goto err_modify;
@@ -521,33 +497,9 @@ static int ipoib_cm_rx_handler(struct ib
 static void mb_put_frags(struct mbuf *mb, unsigned int hdr_space,
 			  unsigned int length, struct mbuf *tomb)
 {
-	int i, num_frags;
-	unsigned int size;
 
-	/* put header into mb */
-	size = min(length, hdr_space);
-	mb->tail += size;
-	mb->len += size;
-	length -= size;
-
-	num_frags = mb_shinfo(mb)->nr_frags;
-	for (i = 0; i < num_frags; i++) {
-		mb_frag_t *frag = &mb_shinfo(mb)->frags[i];
-
-		if (length == 0) {
-			/* don't need this page */
-			mb_fill_page_desc(tomb, i, frag->page, 0, PAGE_SIZE);
-			--mb_shinfo(mb)->nr_frags;
-		} else {
-			size = min(length, (unsigned) PAGE_SIZE);
-
-			frag->size = size;
-			mb->data_len += size;
-			mb->truesize += size;
-			mb->len += size;
-			length -= size;
-		}
-	}
+	mb->m_pkthdr.len = length;
+	mb->m_len = length;
 }
 
 void ipoib_cm_handle_rx_wc(struct ifnet *dev, struct ib_wc *wc)
@@ -557,22 +509,20 @@ void ipoib_cm_handle_rx_wc(struct ifnet 
 	unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
 	struct mbuf *mb, *newmb;
 	struct ipoib_cm_rx *p;
-	unsigned long flags;
 	u64 mapping[IPOIB_CM_RX_SG];
-	int frags;
 	int has_srq;
-	struct mbuf *small_mb;
+	u_short proto;
 
 	ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
 		       wr_id, wc->status);
 
 	if (unlikely(wr_id >= ipoib_recvq_size)) {
 		if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
-			spin_lock_irqsave(&priv->lock, flags);
 			list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
 			ipoib_cm_start_rx_drain(priv);
-			queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
-			spin_unlock_irqrestore(&priv->lock, flags);
+			if (priv->cm.id != NULL)
+				queue_work(ipoib_workqueue,
+				    &priv->cm.rx_reap_task);
 		} else
 			ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
 				   wr_id, ipoib_recvq_size);
@@ -595,9 +545,7 @@ void ipoib_cm_handle_rx_wc(struct ifnet 
 			goto repost;
 		else {
 			if (!--p->recv_count) {
-				spin_lock_irqsave(&priv->lock, flags);
 				list_move(&p->list, &priv->cm.rx_reap_list);
-				spin_unlock_irqrestore(&priv->lock, flags);
 				queue_work(ipoib_workqueue, &priv->cm.rx_reap_task);
 			}
 			return;
@@ -606,38 +554,15 @@ void ipoib_cm_handle_rx_wc(struct ifnet 
 
 	if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
 		if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
-			spin_lock_irqsave(&priv->lock, flags);
 			p->jiffies = jiffies;
 			/* Move this entry to list head, but do not re-add it
 			 * if it has been moved out of list. */
 			if (p->state == IPOIB_CM_RX_LIVE)
 				list_move(&p->list, &priv->cm.passive_ids);
-			spin_unlock_irqrestore(&priv->lock, flags);
-		}
-	}
-
-	if (wc->byte_len < IPOIB_CM_COPYBREAK) {
-		int dlen = wc->byte_len;
-
-		small_mb = dev_alloc_mb(dlen + 12);
-		if (small_mb) {
-			mb_reserve(small_mb, 12);
-			ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
-						   dlen, DMA_FROM_DEVICE);
-			mb_copy_from_linear_data(mb, mtod(small_mb, void *),
-						 dlen);
-			ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0],
-						      dlen, DMA_FROM_DEVICE);
-			mb_put(small_mb, dlen);
-			mb = small_mb;
-			goto copied;
 		}
 	}
 
-	frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
-					      (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
-
-	newmb = ipoib_cm_alloc_rx_mb(dev, rx_ring, wr_id, frags, mapping);
+	newmb = ipoib_cm_alloc_rx_mb(dev, rx_ring, wr_id, mapping);
 	if (unlikely(!newmb)) {
 		/*
 		 * If we can't allocate a new RX buffer, dump
@@ -648,27 +573,21 @@ void ipoib_cm_handle_rx_wc(struct ifnet 
 		goto repost;
 	}
 
-	ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
-	memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
+	ipoib_cm_dma_unmap_rx(priv, rx_ring[wr_id].mapping);
+	memcpy(rx_ring[wr_id].mapping, mapping, sizeof *mapping);
 
 	ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
 		       wc->byte_len, wc->slid);
 
 	mb_put_frags(mb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newmb);
 
-copied:
-	mb->protocol = mtod(mb, (struct ipoib_header *))->proto;
-	mb_reset_mac_header(mb);
-	m_adj(mb, IPOIB_ENCAP_LEN);
-
-	dev->last_rx = jiffies;
 	++dev->if_opackets;
-	dev->if_obytes += mb->len;
+	dev->if_obytes += mb->m_pkthdr.len;
 
 	mb->m_pkthdr.rcvif = dev;
-	/* XXX get correct PACKET_ type here */
-	mb->pkt_type = PACKET_HOST;
-	netif_receive_mb(mb);
+	proto = ntohs(*mtod(mb, uint16_t *));
+	m_adj(mb, IPOIB_ENCAP_LEN);
+	ipoib_demux(dev, mb, proto);
 
 repost:
 	if (has_srq) {
@@ -709,16 +628,17 @@ void ipoib_cm_send(struct ifnet *dev, st
 	struct ipoib_cm_tx_buf *tx_req;
 	u64 addr;
 
-	if (unlikely(mb->len > tx->mtu)) {
+	m_adj(mb, INFINIBAND_ALEN);
+	if (unlikely(mb->m_pkthdr.len > tx->mtu)) {
 		ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
-			   mb->len, tx->mtu);
+			   mb->m_pkthdr.len, tx->mtu);
 		++dev->if_oerrors;
 		ipoib_cm_mb_too_long(dev, mb, tx->mtu - IPOIB_ENCAP_LEN);
 		return;
 	}
 
 	ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
-		       tx->tx_head, mb->len, tx->qp->qp_num);
+		       tx->tx_head, mb->m_pkthdr.len, tx->qp->qp_num);
 
 	/*
 	 * We put the mb into the tx_ring _before_ we call post_send()
@@ -729,24 +649,23 @@ void ipoib_cm_send(struct ifnet *dev, st
 	 */
 	tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
 	tx_req->mb = mb;
-	addr = ib_dma_map_single(priv->ca, mtod(mb, void *), mb->len,
+	addr = ib_dma_map_single(priv->ca, mtod(mb, void *), mb->m_pkthdr.len,
 				 DMA_TO_DEVICE);
 	if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
 		++dev->if_oerrors;
-		m_free(mb);
+		m_freem(mb);
 		return;
 	}
 
 	tx_req->mapping = addr;
 
 	if (unlikely(post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
-			       addr, mb->len))) {
+			       addr, mb->m_pkthdr.len))) {
 		ipoib_warn(priv, "post_send failed\n");
 		++dev->if_oerrors;
-		ib_dma_unmap_single(priv->ca, addr, mb->len, DMA_TO_DEVICE);
-		m_free(mb);
+		ib_dma_unmap_single(priv->ca, addr, mb->m_pkthdr.len, DMA_TO_DEVICE);
+		m_freem(mb);
 	} else {
-		dev->trans_start = jiffies;
 		++tx->tx_head;
 
 		if (++priv->tx_outstanding == ipoib_sendq_size) {
@@ -754,7 +673,7 @@ void ipoib_cm_send(struct ifnet *dev, st
 				  tx->qp->qp_num);
 			if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
 				ipoib_warn(priv, "request notify on send CQ failed\n");
-			netif_stop_queue(dev);
+			dev->if_drv_flags |= IFF_DRV_OACTIVE;
 		}
 	}
 }
@@ -778,21 +697,19 @@ void ipoib_cm_handle_tx_wc(struct ifnet 
 
 	tx_req = &tx->tx_ring[wr_id];
 
-	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->mb->len, DMA_TO_DEVICE);
+	ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->mb->m_pkthdr.len, DMA_TO_DEVICE);
 
 	/* FIXME: is this right? Shouldn't we only increment on success? */
 	++dev->if_opackets;
-	dev->if_obytes += tx_req->mb->len;
-
-	m_free(tx_req->mb);
+	dev->if_obytes += tx_req->mb->m_pkthdr.len;
 
-	netif_tx_lock(dev);
+	m_freem(tx_req->mb);
 
 	++tx->tx_tail;
 	if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
-	    netif_queue_stopped(dev) &&
+	    (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 &&
 	    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
-		netif_wake_queue(dev);
+		dev->if_drv_flags |= IFF_DRV_OACTIVE;
 
 	if (wc->status != IB_WC_SUCCESS &&
 	    wc->status != IB_WC_WR_FLUSH_ERR) {
@@ -808,6 +725,9 @@ void ipoib_cm_handle_tx_wc(struct ifnet 
 		if (path) {
 			path->cm = NULL;
 			tx->path = NULL;
+			rb_erase(&path->rb_node, &priv->path_tree);
+			list_del(&path->list);
+			ipoib_path_free(dev, path);
 		}
 
 		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
@@ -820,7 +740,6 @@ void ipoib_cm_handle_tx_wc(struct ifnet 
 		spin_unlock_irqrestore(&priv->lock, flags);
 	}
 
-	netif_tx_unlock(dev);
 }
 
 int ipoib_cm_dev_open(struct ifnet *dev)
@@ -891,6 +810,8 @@ void ipoib_cm_dev_stop(struct ifnet *dev
 	ib_destroy_cm_id(priv->cm.id);
 	priv->cm.id = NULL;
 
+	cancel_work_sync(&priv->cm.rx_reap_task);
+
 	spin_lock_irq(&priv->lock);
 	while (!list_empty(&priv->cm.passive_ids)) {
 		p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
@@ -946,6 +867,7 @@ static int ipoib_cm_rep_handler(struct i
 	int qp_attr_mask, ret;
 	struct mbuf *mb;
 
+	ipoib_dbg(priv, "cm rep handler\n");
 	p->mtu = be32_to_cpu(data->mtu);
 
 	if (p->mtu <= IPOIB_ENCAP_LEN) {
@@ -985,13 +907,20 @@ static int ipoib_cm_rep_handler(struct i
 	spin_lock_irq(&priv->lock);
 	set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
 	if (p->path)
-		while ((mb = __mb_dequeue(&p->path->queue)))
-			__mb_queue_tail(&mbqueue, mb);
+		for (;;) {
+			_IF_DEQUEUE(&p->path->queue, mb);
+			if (mb == NULL)
+				break;
+			_IF_ENQUEUE(&mbqueue, mb);
+		}
 	spin_unlock_irq(&priv->lock);
 
-	while ((mb = __mb_dequeue(&mbqueue))) {
+	for (;;) {
+		_IF_DEQUEUE(&mbqueue, mb);
+		if (mb == NULL)
+			break;
 		mb->m_pkthdr.rcvif = p->dev;
-		if (dev_queue_xmit(mb))
+		if (p->dev->if_transmit(p->dev, mb))
 			ipoib_warn(priv, "dev_queue_xmit failed "
 				   "to requeue packet\n");
 	}
@@ -1030,6 +959,8 @@ static int ipoib_cm_send_req(struct ifne
 	struct ipoib_cm_data data = {};
 	struct ib_cm_req_param req = {};
 
+	ipoib_dbg(priv, "cm send req\n");
+
 	data.qpn = cpu_to_be32(priv->qp->qp_num);
 	data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
 
@@ -1089,7 +1020,7 @@ static int ipoib_cm_tx_init(struct ipoib
 	struct ipoib_dev_priv *priv = p->dev->if_softc;
 	int ret;
 
-	p->tx_ring = vmalloc(ipoib_sendq_size * sizeof *p->tx_ring);
+	p->tx_ring = kzalloc(ipoib_sendq_size * sizeof *p->tx_ring, GFP_KERNEL);
 	if (!p->tx_ring) {
 		ipoib_warn(priv, "failed to allocate tx ring\n");
 		ret = -ENOMEM;
@@ -1136,14 +1067,15 @@ err_id:
 	ib_destroy_qp(p->qp);
 err_qp:
 	p->qp = NULL;
-	vfree(p->tx_ring);
+	kfree(p->tx_ring);
 err_tx:
 	return ret;
 }
 
 static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
 {
-	struct ipoib_dev_priv *priv = p->dev->if_softc;
+	struct ifnet *dev = p->dev;
+	struct ipoib_dev_priv *priv = dev->if_softc;
 	struct ipoib_cm_tx_buf *tx_req;
 	unsigned long begin;
 
@@ -1171,22 +1103,20 @@ timeout:
 
 	while ((int) p->tx_tail - (int) p->tx_head < 0) {
 		tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
-		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->mb->len,
+		ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->mb->m_pkthdr.len,
 				    DMA_TO_DEVICE);
-		m_free(tx_req->mb);
+		m_freem(tx_req->mb);
 		++p->tx_tail;
-		netif_tx_lock_bh(p->dev);
 		if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
-		    netif_queue_stopped(p->dev) &&
+		    (dev->if_drv_flags & IFF_DRV_OACTIVE) != 0 &&
 		    test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
-			netif_wake_queue(p->dev);
-		netif_tx_unlock_bh(p->dev);
+			dev->if_drv_flags |= IFF_DRV_OACTIVE;
 	}
 
 	if (p->qp)
 		ib_destroy_qp(p->qp);
 
-	vfree(p->tx_ring);
+	kfree(p->tx_ring);
 	kfree(p);
 }
 
@@ -1195,7 +1125,6 @@ static int ipoib_cm_tx_handler(struct ib
 {
 	struct ipoib_cm_tx *tx = cm_id->context;
 	struct ipoib_dev_priv *priv = tx->dev->if_softc;
-	struct ifnet *dev = priv->dev;
 	struct ipoib_path *path;
 	unsigned long flags;
 	int ret;
@@ -1216,13 +1145,15 @@ static int ipoib_cm_tx_handler(struct ib
 	case IB_CM_REJ_RECEIVED:
 	case IB_CM_TIMEWAIT_EXIT:
 		ipoib_dbg(priv, "CM error %d.\n", event->event);
-		netif_tx_lock_bh(dev);
 		spin_lock_irqsave(&priv->lock, flags);
 		path = tx->path;
 
 		if (path) {
 			path->cm = NULL;
 			tx->path = NULL;
+			rb_erase(&path->rb_node, &priv->path_tree);
+			list_del(&path->list);
+			ipoib_path_free(tx->dev, path);
 		}
 
 		if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
@@ -1231,7 +1162,6 @@ static int ipoib_cm_tx_handler(struct ib
 		}
 
 		spin_unlock_irqrestore(&priv->lock, flags);
-		netif_tx_unlock_bh(dev);
 		break;
 	default:
 		break;
@@ -1249,6 +1179,7 @@ struct ipoib_cm_tx *ipoib_cm_create_tx(s
 	if (!tx)
 		return NULL;
 
+	ipoib_dbg(priv, "Creating cm tx\n");
 	path->cm = tx;
 	tx->path = path;
 	tx->dev = dev;
@@ -1265,7 +1196,7 @@ void ipoib_cm_destroy_tx(struct ipoib_cm
 		list_move(&tx->list, &priv->cm.reap_list);
 		queue_work(ipoib_workqueue, &priv->cm.reap_task);
 		ipoib_dbg(priv, "Reap connection for gid %pI6\n",
-			  tx->path->dgid.raw);
+			  tx->path->pathrec.dgid.raw);
 		tx->path = NULL;
 	}
 }
@@ -1274,7 +1205,6 @@ static void ipoib_cm_tx_start(struct wor
 {
 	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 						   cm.start_task);
-	struct ifnet *dev = priv->dev;
 	struct ipoib_path *path;
 	struct ipoib_cm_tx *p;
 	unsigned long flags;
@@ -1283,7 +1213,7 @@ static void ipoib_cm_tx_start(struct wor
 	struct ib_sa_path_rec pathrec;
 	u32 qpn;
 
-	netif_tx_lock_bh(dev);
+	ipoib_dbg(priv, "cm start task\n");
 	spin_lock_irqsave(&priv->lock, flags);
 
 	while (!list_empty(&priv->cm.start_list)) {
@@ -1294,94 +1224,94 @@ static void ipoib_cm_tx_start(struct wor
 		memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
 
 		spin_unlock_irqrestore(&priv->lock, flags);
-		netif_tx_unlock_bh(dev);
 
 		ret = ipoib_cm_tx_init(p, qpn, &pathrec);
 
-		netif_tx_lock_bh(dev);
 		spin_lock_irqsave(&priv->lock, flags);
 
 		if (ret) {
 			path = p->path;
-			if (path)
+			if (path) {
 				path->cm = NULL;
+				rb_erase(&path->rb_node, &priv->path_tree);
+				list_del(&path->list);
+				ipoib_path_free(priv->dev, path);
+			}
 			list_del(&p->list);
 			kfree(p);
 		}
 	}
 
 	spin_unlock_irqrestore(&priv->lock, flags);
-	netif_tx_unlock_bh(dev);
 }
 
 static void ipoib_cm_tx_reap(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 						   cm.reap_task);
-	struct ifnet *dev = priv->dev;
 	struct ipoib_cm_tx *p;
 	unsigned long flags;
 
-	netif_tx_lock_bh(dev);
 	spin_lock_irqsave(&priv->lock, flags);
 
 	while (!list_empty(&priv->cm.reap_list)) {
 		p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
 		list_del(&p->list);
 		spin_unlock_irqrestore(&priv->lock, flags);
-		netif_tx_unlock_bh(dev);
 		ipoib_cm_tx_destroy(p);
-		netif_tx_lock_bh(dev);
 		spin_lock_irqsave(&priv->lock, flags);
 	}
 
 	spin_unlock_irqrestore(&priv->lock, flags);
-	netif_tx_unlock_bh(dev);
 }
 
 static void ipoib_cm_mb_reap(struct work_struct *work)
 {
 	struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
 						   cm.mb_task);
-	struct ifnet *dev = priv->dev;
 	struct mbuf *mb;
 	unsigned long flags;
 	unsigned mtu = priv->mcast_mtu;
+	uint16_t proto;
 
-	netif_tx_lock_bh(dev);
 	spin_lock_irqsave(&priv->lock, flags);
 
-	while ((mb = mb_dequeue(&priv->cm.mb_queue))) {
+	for (;;) {
+		IF_DEQUEUE(&priv->cm.mb_queue, mb);
+		if (mb == NULL)
+			break;
 		spin_unlock_irqrestore(&priv->lock, flags);
-		netif_tx_unlock_bh(dev);
 
-		if (mb->protocol == htons(ETH_P_IP))
-			icmp_send(mb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-		else if (mb->protocol == htons(ETH_P_IPV6))
-			icmpv6_send(mb, ICMPV6_PKT_TOOBIG, 0, mtu, priv->dev);
+		proto = htons(*mtod(mb, uint16_t *));
+		m_adj(mb, IPOIB_ENCAP_LEN);
+		if (proto == ETHERTYPE_IP)
+			icmp_error(mb, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu);
+#if defined(INET6)
+		else if (proto == ETHERTYPE_IPV6)
+			icmp6_error(mb, ICMP6_PACKET_TOO_BIG, 0, mtu);
 #endif
-		m_free(mb);
+		m_freem(mb);
 
-		netif_tx_lock_bh(dev);
 		spin_lock_irqsave(&priv->lock, flags);
 	}
 
 	spin_unlock_irqrestore(&priv->lock, flags);
-	netif_tx_unlock_bh(dev);
 }
 
-void ipoib_cm_mb_too_long(struct ifnet *dev, struct mbuf *mb,
-			   unsigned int mtu)
+void
+ipoib_cm_mb_too_long(struct ifnet *dev, struct mbuf *mb, unsigned int mtu)
 {
 	struct ipoib_dev_priv *priv = dev->if_softc;
-	int e = mb_queue_empty(&priv->cm.mb_queue);
+	int e = priv->cm.mb_queue.ifq_len; 
 
+/* XXX */
+#if 0
 	if (mb->dst)
 		mb->dst->ops->update_pmtu(mb->dst, mtu);
+#endif
 
-	mb_queue_tail(&priv->cm.mb_queue, mb);
-	if (e)
+	IF_ENQUEUE(&priv->cm.mb_queue, mb);
+	if (e == 0)
 		queue_work(ipoib_workqueue, &priv->cm.mb_task);
 }
 
@@ -1421,69 +1351,6 @@ static void ipoib_cm_stale_task(struct w
 }
 
 
-static ssize_t show_mode(struct device *d, struct device_attribute *attr,
-			 char *buf)
-{
-	struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d));
-
-	if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
-		return sprintf(buf, "connected\n");
-	else
-		return sprintf(buf, "datagram\n");
-}
-
-static ssize_t set_mode(struct device *d, struct device_attribute *attr,
-			const char *buf, size_t count)
-{
-	struct ifnet *dev = to_net_dev(d);
-	struct ipoib_dev_priv *priv = dev->if_softc;
-
-	/* flush paths if we switch modes so that connections are restarted */
-	if (IPOIB_CM_SUPPORTED(IF_LLADDR(dev)) && !strcmp(buf, "connected\n")) {
-		set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
-		ipoib_warn(priv, "enabling connected mode "
-			   "will cause multicast packet drops\n");
-
-		rtnl_lock();
-		dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_TSO);
-		priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
-
-		if (ipoib_cm_max_mtu(dev) > priv->mcast_mtu)
-			ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
-				   priv->mcast_mtu);
-		dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
-		rtnl_unlock();
-
-		ipoib_flush_paths(dev);
-		return count;
-	}
-
-	if (!strcmp(buf, "datagram\n")) {
-		clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
-
-		rtnl_lock();
-		if (test_bit(IPOIB_FLAG_CSUM, &priv->flags)) {
-			dev->features |= NETIF_F_IP_CSUM | NETIF_F_SG;
-			if (priv->hca_caps & IB_DEVICE_UD_TSO)
-				dev->features |= NETIF_F_TSO;
-		}
-		dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
-		rtnl_unlock();
-		ipoib_flush_paths(dev);
-
-		return count;
-	}
-
-	return -EINVAL;
-}
-
-static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode);
-
-int ipoib_cm_add_mode_attr(struct ifnet *dev)
-{
-	return device_create_file(&dev->dev, &dev_attr_mode);
-}
-
 static void ipoib_cm_create_srq(struct ifnet *dev, int max_sge)
 {
 	struct ipoib_dev_priv *priv = dev->if_softc;
@@ -1503,7 +1370,7 @@ static void ipoib_cm_create_srq(struct i
 		return;
 	}
 
-	priv->cm.srq_ring = vmalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring);
+	priv->cm.srq_ring = kzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring, GFP_KERNEL);
 	if (!priv->cm.srq_ring) {
 		printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
 		       priv->ca->name, ipoib_recvq_size);
@@ -1534,8 +1401,8 @@ int ipoib_cm_dev_init(struct ifnet *dev)
 	INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
 	INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
 
-	bzero(&priv->cm.mb_queue, sizeof(priv->cm.mb_queue);
-	mtx_init(&priv->cm.mb_queue->ifq_mtx,
+	bzero(&priv->cm.mb_queue, sizeof(priv->cm.mb_queue));
+	mtx_init(&priv->cm.mb_queue.ifq_mtx,
 	    dev->if_xname, "if send queue", MTX_DEF);
 
 	ret = ib_query_device(priv->ca, &attr);
@@ -1564,7 +1431,6 @@ int ipoib_cm_dev_init(struct ifnet *dev)
 	if (ipoib_cm_has_srq(dev)) {
 		for (i = 0; i < ipoib_recvq_size; ++i) {
 			if (!ipoib_cm_alloc_rx_mb(dev, priv->cm.srq_ring, i,
-						   priv->cm.num_frags - 1,
 						   priv->cm.srq_ring[i].mapping)) {
 				ipoib_warn(priv, "failed to allocate "
 					   "receive buffer %d\n", i);

Modified: projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c
==============================================================================
--- projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c	Thu Oct 14 01:49:40 2010	(r213823)
+++ projects/ofed/head/sys/ofed/drivers/infiniband/ulp/ipoib/ipoib_ib.c	Thu Oct 14 02:02:55 2010	(r213824)
@@ -112,14 +112,12 @@ static int ipoib_ib_post_receive(struct 
 
 	priv->rx_wr.wr_id   = id | IPOIB_OP_RECV;
 	priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
-	priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1];
-
 
 	ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
 	if (unlikely(ret)) {
 		ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
 		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
-		m_free(priv->rx_ring[id].mb);
+		m_freem(priv->rx_ring[id].mb);
 		priv->rx_ring[id].mb = NULL;
 	}
 
@@ -134,11 +132,7 @@ static struct mbuf *ipoib_alloc_rx_mb(st
 	u64 *mapping;
 
 	/*
-	 * XXX This could be done more efficiently.  ipoib adds 44 bytes of
-	 * headers on to the mtu.  We could do the ib header seperate from
-	 * the data and use more efficient allocations.
-	 *
-	 * XXX Should be calculated once and stashed.
+	 * XXX Should be calculated once and cached.
 	 */
 	buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
 	if (buf_size <= MCLBYTES)
@@ -164,7 +158,7 @@ static struct mbuf *ipoib_alloc_rx_mb(st
 	return mb;
 
 error:
-	m_free(mb);
+	m_freem(mb);
 	return NULL;
 }
 
@@ -212,7 +206,7 @@ static void ipoib_ib_handle_rx_wc(struct
 				   "(status=%d, wrid=%d vend_err %x)\n",
 				   wc->status, wr_id, wc->vendor_err);
 		ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
-		m_free(mb);
+		m_freem(mb);
 		priv->rx_ring[wr_id].mb = NULL;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

