svn commit: r246482 - in head/sys: dev/bxe dev/e1000 dev/ixgbe dev/oce net ofed/drivers/net/mlx4 sys

Randall Stewart rrs at FreeBSD.org
Thu Feb 7 15:20:57 UTC 2013


Author: rrs
Date: Thu Feb  7 15:20:54 2013
New Revision: 246482
URL: http://svnweb.freebsd.org/changeset/base/246482

Log:
  This fixes a out-of-order problem with several
  of the newer drivers. The basic problem was
  that the driver was pulling the mbuf off the
  drbr ring and then when sending with xmit(), encounting
  a full transmit ring. Thus the lower layer
  xmit() function would return an error, and the
  drivers would then append the data back on to the ring.
  For TCP this is a horrible scenario sure to bring
  on a fast-retransmit.
  
  The fix is to use drbr_peek() to pull the data pointer
  but not remove it from the ring. If it fails then
  we either call the new drbr_putback or drbr_advance
  method. Advance moves it forward (we do this sometimes
  when the xmit() function frees the mbuf). When
  we succeed we always call advance. The
  putback will always copy the mbuf back to the top
  of the ring. Note that the putback *cannot* be used
  with a drbr_dequeue() only with drbr_peek(). We most
  of the time, in putback, would not need to copy it
  back since most likey the mbuf is still the same, but
  sometimes xmit() functions will change the mbuf via
  a pullup or other call. So the optimial case for
  the single consumer is to always copy it back. If
  we ever do a multiple_consumer (for lagg?) we
  will  need a test and atomic in the put back possibly
  a seperate putback_mc() in the ring buf.
  
  Reviewed by:	jhb at freebsd.org, jlv at freebsd.org

Modified:
  head/sys/dev/bxe/if_bxe.c
  head/sys/dev/e1000/if_em.c
  head/sys/dev/e1000/if_igb.c
  head/sys/dev/ixgbe/ixgbe.c
  head/sys/dev/ixgbe/ixv.c
  head/sys/dev/oce/oce_if.c
  head/sys/net/if_var.h
  head/sys/ofed/drivers/net/mlx4/en_tx.c
  head/sys/sys/buf_ring.h

Modified: head/sys/dev/bxe/if_bxe.c
==============================================================================
--- head/sys/dev/bxe/if_bxe.c	Thu Feb  7 15:19:12 2013	(r246481)
+++ head/sys/dev/bxe/if_bxe.c	Thu Feb  7 15:20:54 2013	(r246482)
@@ -9506,24 +9506,15 @@ bxe_tx_mq_start_locked(struct ifnet *ifp
 
 	BXE_FP_LOCK_ASSERT(fp);
 
-	if (m == NULL) {
-		/* No new work, check for pending frames. */
-		next = drbr_dequeue(ifp, fp->br);
-	} else if (drbr_needs_enqueue(ifp, fp->br)) {
-		/* Both new and pending work, maintain packet order. */
+	if (m != NULL) {
 		rc = drbr_enqueue(ifp, fp->br, m);
 		if (rc != 0) {
 			fp->tx_soft_errors++;
 			goto bxe_tx_mq_start_locked_exit;
 		}
-		next = drbr_dequeue(ifp, fp->br);
-	} else
-		/* New work only, nothing pending. */
-		next = m;
-
+	}
  	/* Keep adding entries while there are frames to send. */
-	while (next != NULL) {
-
+	while ((next = drbr_peek(ifp, fp->br)) != NULL) {
 		/* The transmit mbuf now belongs to us, keep track of it. */
 		fp->tx_mbuf_alloc++;
 
@@ -9537,23 +9528,22 @@ bxe_tx_mq_start_locked(struct ifnet *ifp
 		if (__predict_false(rc != 0)) {
 			fp->tx_encap_failures++;
 			/* Very Bad Frames(tm) may have been dropped. */
-			if (next != NULL) {
+			if (next == NULL) {
+				drbr_advance(ifp, fp->br);
+			} else {
+				drbr_putback(ifp, fp->br, next);
 				/*
 				 * Mark the TX queue as full and save
 				 * the frame.
 				 */
 				ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 				fp->tx_frame_deferred++;
-
-				/* This may reorder frame. */
-				rc = drbr_enqueue(ifp, fp->br, next);
 				fp->tx_mbuf_alloc--;
 			}
-
 			/* Stop looking for more work. */
 			break;
 		}
-
+		drbr_advance(ifp, fp->br);
 		/* The transmit frame was enqueued successfully. */
 		tx_count++;
 
@@ -9574,8 +9564,6 @@ bxe_tx_mq_start_locked(struct ifnet *ifp
 			ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 			break;
 		}
-
-		next = drbr_dequeue(ifp, fp->br);
 	}
 
 	/* No TX packets were dequeued. */

Modified: head/sys/dev/e1000/if_em.c
==============================================================================
--- head/sys/dev/e1000/if_em.c	Thu Feb  7 15:19:12 2013	(r246481)
+++ head/sys/dev/e1000/if_em.c	Thu Feb  7 15:20:54 2013	(r246482)
@@ -905,22 +905,24 @@ em_mq_start_locked(struct ifnet *ifp, st
 	}
 
 	enq = 0;
-	if (m == NULL) {
-		next = drbr_dequeue(ifp, txr->br);
-	} else if (drbr_needs_enqueue(ifp, txr->br)) {
-		if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
+	if (m != NULL) {
+		err = drbr_enqueue(ifp, txr->br, m);
+		if (err) {
 			return (err);
-		next = drbr_dequeue(ifp, txr->br);
-	} else
-		next = m;
+		}
+	} 
 
 	/* Process the queue */
-	while (next != NULL) {
+	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
 		if ((err = em_xmit(txr, &next)) != 0) {
-                        if (next != NULL)
-                                err = drbr_enqueue(ifp, txr->br, next);
-                        break;
+			if (next == NULL) {
+				drbr_advance(ifp, txr->br);
+			} else {
+				drbr_putback(ifp, txr->br, next);
+			}
+			break;
 		}
+		drbr_advance(ifp, txr->br);
 		enq++;
 		ifp->if_obytes += next->m_pkthdr.len;
 		if (next->m_flags & M_MCAST)
@@ -928,7 +930,6 @@ em_mq_start_locked(struct ifnet *ifp, st
 		ETHER_BPF_MTAP(ifp, next);
 		if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
                         break;
-		next = drbr_dequeue(ifp, txr->br);
 	}
 
 	if (enq > 0) {

Modified: head/sys/dev/e1000/if_igb.c
==============================================================================
--- head/sys/dev/e1000/if_igb.c	Thu Feb  7 15:19:12 2013	(r246481)
+++ head/sys/dev/e1000/if_igb.c	Thu Feb  7 15:20:54 2013	(r246482)
@@ -350,6 +350,16 @@ TUNABLE_INT("hw.igb.max_interrupt_rate",
 SYSCTL_INT(_hw_igb, OID_AUTO, max_interrupt_rate, CTLFLAG_RDTUN,
     &igb_max_interrupt_rate, 0, "Maximum interrupts per second");
 
+#if __FreeBSD_version >= 800000
+/*
+** Tuneable number of buffers in the buf-ring (drbr_xxx)
+*/
+static int igb_buf_ring_size = IGB_BR_SIZE;
+TUNABLE_INT("hw.igb.buf_ring_size", &igb_buf_ring_size);
+SYSCTL_INT(_hw_igb, OID_AUTO, buf_ring_size, CTLFLAG_RDTUN,
+    &igb_buf_ring_size, 0, "Size of the bufring");
+#endif
+
 /*
 ** Header split causes the packet header to
 ** be dma'd to a seperate mbuf from the payload.
@@ -965,12 +975,13 @@ igb_mq_start(struct ifnet *ifp, struct m
 		** out-of-order delivery, but 
 		** settle for it if that fails
 		*/
-		if (m)
+		if (m != NULL)
 			drbr_enqueue(ifp, txr->br, m);
 		err = igb_mq_start_locked(ifp, txr);
 		IGB_TX_UNLOCK(txr);
 	} else {
-		err = drbr_enqueue(ifp, txr->br, m);
+		if (m != NULL)
+			err = drbr_enqueue(ifp, txr->br, m);
 		taskqueue_enqueue(que->tq, &txr->txq_task);
 	}
 
@@ -994,12 +1005,22 @@ igb_mq_start_locked(struct ifnet *ifp, s
 	enq = 0;
 
 	/* Process the queue */
-	while ((next = drbr_dequeue(ifp, txr->br)) != NULL) {
+	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
 		if ((err = igb_xmit(txr, &next)) != 0) {
-			if (next != NULL)
-				err = drbr_enqueue(ifp, txr->br, next);
+			if (next == NULL) {
+				/* It was freed, move forward */
+				drbr_advance(ifp, txr->br);
+			} else {
+				/* 
+				 * Still have one left, it may not be
+				 * the same since the transmit function
+				 * may have changed it.
+				 */
+				drbr_putback(ifp, txr->br, next);
+			}
 			break;
 		}
+		drbr_advance(ifp, txr->br);
 		enq++;
 		ifp->if_obytes += next->m_pkthdr.len;
 		if (next->m_flags & M_MCAST)
@@ -3301,7 +3322,7 @@ igb_allocate_queues(struct adapter *adap
         	}
 #if __FreeBSD_version >= 800000
 		/* Allocate a buf ring */
-		txr->br = buf_ring_alloc(IGB_BR_SIZE, M_DEVBUF,
+		txr->br = buf_ring_alloc(igb_buf_ring_size, M_DEVBUF,
 		    M_WAITOK, &txr->tx_mtx);
 #endif
 	}

Modified: head/sys/dev/ixgbe/ixgbe.c
==============================================================================
--- head/sys/dev/ixgbe/ixgbe.c	Thu Feb  7 15:19:12 2013	(r246481)
+++ head/sys/dev/ixgbe/ixgbe.c	Thu Feb  7 15:20:54 2013	(r246482)
@@ -832,22 +832,24 @@ ixgbe_mq_start_locked(struct ifnet *ifp,
 	}
 
 	enqueued = 0;
-	if (m == NULL) {
-		next = drbr_dequeue(ifp, txr->br);
-	} else if (drbr_needs_enqueue(ifp, txr->br)) {
-		if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
+	if (m != NULL) {
+		err = drbr_enqueue(ifp, txr->br, m);
+		if (err) {
 			return (err);
-		next = drbr_dequeue(ifp, txr->br);
-	} else
-		next = m;
+		}
+	}
 
 	/* Process the queue */
-	while (next != NULL) {
+	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
 		if ((err = ixgbe_xmit(txr, &next)) != 0) {
-			if (next != NULL)
-				err = drbr_enqueue(ifp, txr->br, next);
+			if (next == NULL) {
+				drbr_advance(ifp, txr->br);
+			} else {
+				drbr_putback(ifp, txr->br, next);
+			}
 			break;
 		}
+		drbr_advance(ifp, txr->br);
 		enqueued++;
 		/* Send a copy of the frame to the BPF listener */
 		ETHER_BPF_MTAP(ifp, next);
@@ -855,7 +857,6 @@ ixgbe_mq_start_locked(struct ifnet *ifp,
 			break;
 		if (txr->tx_avail < IXGBE_TX_OP_THRESHOLD)
 			ixgbe_txeof(txr);
-		next = drbr_dequeue(ifp, txr->br);
 	}
 
 	if (enqueued > 0) {

Modified: head/sys/dev/ixgbe/ixv.c
==============================================================================
--- head/sys/dev/ixgbe/ixv.c	Thu Feb  7 15:19:12 2013	(r246481)
+++ head/sys/dev/ixgbe/ixv.c	Thu Feb  7 15:20:54 2013	(r246482)
@@ -620,22 +620,23 @@ ixv_mq_start_locked(struct ifnet *ifp, s
 		ixv_txeof(txr);
 
 	enqueued = 0;
-	if (m == NULL) {
-		next = drbr_dequeue(ifp, txr->br);
-	} else if (drbr_needs_enqueue(ifp, txr->br)) {
-		if ((err = drbr_enqueue(ifp, txr->br, m)) != 0)
+	if (m != NULL) {
+		err = drbr_enqueue(ifp, txr->br, m);
+		if (err) {
 			return (err);
-		next = drbr_dequeue(ifp, txr->br);
-	} else
-		next = m;
-
+		}
+	}
 	/* Process the queue */
-	while (next != NULL) {
+	while ((next = drbr_peek(ifp, txr->br)) != NULL) {
 		if ((err = ixv_xmit(txr, &next)) != 0) {
-			if (next != NULL)
-				err = drbr_enqueue(ifp, txr->br, next);
+			if (next == NULL) {
+				drbr_advance(ifp, txr->br);
+			} else {
+				drbr_putback(ifp, txr->br, next);
+			}
 			break;
 		}
+		drbr_advance(ifp, txr->br);
 		enqueued++;
 		ifp->if_obytes += next->m_pkthdr.len;
 		if (next->m_flags & M_MCAST)
@@ -648,7 +649,6 @@ ixv_mq_start_locked(struct ifnet *ifp, s
 			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 			break;
 		}
-		next = drbr_dequeue(ifp, txr->br);
 	}
 
 	if (enqueued > 0) {

Modified: head/sys/dev/oce/oce_if.c
==============================================================================
--- head/sys/dev/oce/oce_if.c	Thu Feb  7 15:19:12 2013	(r246481)
+++ head/sys/dev/oce/oce_if.c	Thu Feb  7 15:20:54 2013	(r246482)
@@ -1166,29 +1166,27 @@ oce_multiq_transmit(struct ifnet *ifp, s
 		return status;
 	}
 
-	if (m == NULL)
-		next = drbr_dequeue(ifp, br);		
-	else if (drbr_needs_enqueue(ifp, br)) {
+	 if (m != NULL) {
 		if ((status = drbr_enqueue(ifp, br, m)) != 0)
 			return status;
-		next = drbr_dequeue(ifp, br);
-	} else
-		next = m;
-
-	while (next != NULL) {
+	} 
+	while ((next = drbr_peek(ifp, br)) != NULL) {
 		if (oce_tx(sc, &next, queue_index)) {
-			if (next != NULL) {
+			if (next == NULL) {
+				drbr_advance(ifp, br);
+			} else {
+				drbr_putback(ifp, br, next);
 				wq->tx_stats.tx_stops ++;
 				ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 				status = drbr_enqueue(ifp, br, next);
 			}  
 			break;
 		}
+		drbr_advance(ifp, br);
 		ifp->if_obytes += next->m_pkthdr.len;
 		if (next->m_flags & M_MCAST)
 			ifp->if_omcasts++;
 		ETHER_BPF_MTAP(ifp, next);
-		next = drbr_dequeue(ifp, br);
 	}
 
 	return status;

Modified: head/sys/net/if_var.h
==============================================================================
--- head/sys/net/if_var.h	Thu Feb  7 15:19:12 2013	(r246481)
+++ head/sys/net/if_var.h	Thu Feb  7 15:20:54 2013	(r246482)
@@ -622,6 +622,45 @@ drbr_enqueue(struct ifnet *ifp, struct b
 }
 
 static __inline void
+drbr_putback(struct ifnet *ifp, struct buf_ring *br, struct mbuf *new)
+{
+	/*
+	 * The top of the list needs to be swapped 
+	 * for this one.
+	 */
+#ifdef ALTQ
+	if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+		/* 
+		 * Peek in altq case dequeued it
+		 * so put it back.
+		 */
+		IFQ_DRV_PREPEND(&ifp->if_snd, new);
+		return;
+	}
+#endif
+	buf_ring_putback_sc(br, new);
+}
+
+static __inline struct mbuf *
+drbr_peek(struct ifnet *ifp, struct buf_ring *br)
+{
+#ifdef ALTQ
+	struct mbuf *m;
+	if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {
+		/* 
+		 * Pull it off like a dequeue
+		 * since drbr_advance() does nothing
+		 * for altq and drbr_putback() will
+		 * use the old prepend function.
+		 */
+		IFQ_DEQUEUE(&ifp->if_snd, m);
+		return (m);
+	}
+#endif
+	return(buf_ring_peek(br));
+}
+
+static __inline void
 drbr_flush(struct ifnet *ifp, struct buf_ring *br)
 {
 	struct mbuf *m;
@@ -648,7 +687,7 @@ drbr_dequeue(struct ifnet *ifp, struct b
 #ifdef ALTQ
 	struct mbuf *m;
 
-	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {	
+	if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd)) {	
 		IFQ_DEQUEUE(&ifp->if_snd, m);
 		return (m);
 	}
@@ -656,6 +695,18 @@ drbr_dequeue(struct ifnet *ifp, struct b
 	return (buf_ring_dequeue_sc(br));
 }
 
+static __inline void
+drbr_advance(struct ifnet *ifp, struct buf_ring *br)
+{
+#ifdef ALTQ
+	/* Nothing to do here since peek dequeues in altq case */
+	if (ifp != NULL && ALTQ_IS_ENABLED(&ifp->if_snd))
+		return;
+#endif
+	return (buf_ring_advance_sc(br));
+}
+
+
 static __inline struct mbuf *
 drbr_dequeue_cond(struct ifnet *ifp, struct buf_ring *br,
     int (*func) (struct mbuf *, void *), void *arg) 

Modified: head/sys/ofed/drivers/net/mlx4/en_tx.c
==============================================================================
--- head/sys/ofed/drivers/net/mlx4/en_tx.c	Thu Feb  7 15:19:12 2013	(r246481)
+++ head/sys/ofed/drivers/net/mlx4/en_tx.c	Thu Feb  7 15:20:54 2013	(r246482)
@@ -931,22 +931,21 @@ mlx4_en_transmit_locked(struct ifnet *de
 	}
 
 	enqueued = 0;
-	if (m == NULL) {
-		next = drbr_dequeue(dev, ring->br);
-	} else if (drbr_needs_enqueue(dev, ring->br)) {
+	if (m != NULL) {
 		if ((err = drbr_enqueue(dev, ring->br, m)) != 0)
 			return (err);
-		next = drbr_dequeue(dev, ring->br);
-	} else
-		next = m;
-
+	}
 	/* Process the queue */
-	while (next != NULL) {
+	while ((next = drbr_peek(ifp, ring->br)) != NULL) {
 		if ((err = mlx4_en_xmit(dev, tx_ind, &next)) != 0) {
-			if (next != NULL)
-				err = drbr_enqueue(dev, ring->br, next);
+			if (next == NULL) {
+				drbr_advance(ifp, ring->br);
+			} else {
+				drbr_putback(ifp, ring->br, next);
+			}
 			break;
 		}
+		drbr_advance(ifp, ring->br);
 		enqueued++;
 		dev->if_obytes += next->m_pkthdr.len;
 		if (next->m_flags & M_MCAST)
@@ -955,7 +954,6 @@ mlx4_en_transmit_locked(struct ifnet *de
 		ETHER_BPF_MTAP(dev, next);
 		if ((dev->if_drv_flags & IFF_DRV_RUNNING) == 0)
 			break;
-		next = drbr_dequeue(dev, ring->br);
 	}
 
 	if (enqueued > 0)

Modified: head/sys/sys/buf_ring.h
==============================================================================
--- head/sys/sys/buf_ring.h	Thu Feb  7 15:19:12 2013	(r246481)
+++ head/sys/sys/buf_ring.h	Thu Feb  7 15:20:54 2013	(r246482)
@@ -208,6 +208,54 @@ buf_ring_dequeue_sc(struct buf_ring *br)
 }
 
 /*
+ * single-consumer advance after a peek
+ * use where it is protected by a lock
+ * e.g. a network driver's tx queue lock
+ */
+static __inline void
+buf_ring_advance_sc(struct buf_ring *br)
+{
+	uint32_t cons_head, cons_next;
+	uint32_t prod_tail;
+	
+	cons_head = br->br_cons_head;
+	prod_tail = br->br_prod_tail;
+	
+	cons_next = (cons_head + 1) & br->br_cons_mask;
+	if (cons_head == prod_tail) 
+		return;
+	br->br_cons_head = cons_next;
+#ifdef DEBUG_BUFRING
+	br->br_ring[cons_head] = NULL;
+#endif
+	br->br_cons_tail = cons_next;
+}
+
+/*
+ * Used to return a buffer (most likely already there)
+ * to the top od the ring. The caller should *not*
+ * have used any dequeue to pull it out of the ring
+ * but instead should have used the peek() function.
+ * This is normally used where the transmit queue
+ * of a driver is full, and an mubf must be returned.
+ * Most likely whats in the ring-buffer is what
+ * is being put back (since it was not removed), but
+ * sometimes the lower transmit function may have
+ * done a pullup or other function that will have
+ * changed it. As an optimzation we always put it
+ * back (since jhb says the store is probably cheaper),
+ * if we have to do a multi-queue version we will need
+ * the compare and an atomic.
+ */
+static __inline void
+buf_ring_putback_sc(struct buf_ring *br, void *new)
+{
+	KASSERT(br->br_cons_head != br->br_prod_tail, 
+		("Buf-Ring has none in putback")) ;
+	br->br_ring[br->br_cons_head] = new;
+}
+
+/*
  * return a pointer to the first entry in the ring
  * without modifying it, or NULL if the ring is empty
  * race-prone if not protected by a lock


More information about the svn-src-all mailing list