svn commit: r208901 - head/sys/dev/xen/netfront

Tue Jun 8 03:13:27 UTC 2010

Author: ken
Date: Tue Jun  8 03:13:26 2010
New Revision: 208901
URL: http://svn.freebsd.org/changeset/base/208901

Log:
  A number of netfront fixes and stability improvements:
  
   - Re-enable TSO.  This was broken previously due to CSUM_TSO clearing the
     CSUM_TCP flag, so our checksum flags were incorrectly set going to the
     netback driver.  That was fixed in r206844 in tcp_output.c, so we can
     turn TSO back on here.
  
   - Fix the way transmit slots are calculated, so that we can't overfill
     the ring.
  
   - Avoid sending packets with more fragments/segments than netback can
     handle.  The Linux netback code can only handle packets of
     MAX_SKB_FRAGS, which turns out to be 18 on machines with 4K pages.  We
     can easily generate packets with 32 or so fragments with TSO turned on.
     Right now the solution is just to drop the packets (since netback
     doesn't seem to handle it gracefully), but we should come up with a way
     to allow a driver to tell the TCP stack the maximum number of fragments
     it can handle in a single packet.
  
   - Fix the way the consumer is tracked in the receive path.  It could get
     out of sync fairly easily.
  
   - Use standard Xen ring macros to make it clearer how netfront is using
     the rings.
  
   - Get rid of Linux-ish negative errno return values.
  
   - Added more documentation to the driver.
  
   - Refactored code to make it easier to read.
  
   - Some other minor fixes.
  
  Reviewed by:	gibbs
  
  Reviewed by:	gibbs
  Sponsored by:	Spectra Logic
  MFC after:	7 days

Modified:
  head/sys/dev/xen/netfront/netfront.c

Modified: head/sys/dev/xen/netfront/netfront.c
==============================================================================

--- head/sys/dev/xen/netfront/netfront.c	Mon Jun  7 22:43:37 2010	(r208900)
+++ head/sys/dev/xen/netfront/netfront.c	Tue Jun  8 03:13:26 2010	(r208901)
@@ -89,7 +89,7 @@ __FBSDID("$FreeBSD$");
 
 #include "xenbus_if.h"
 
-#define XN_CSUM_FEATURES	(CSUM_TCP | CSUM_UDP)
+#define XN_CSUM_FEATURES	(CSUM_TCP | CSUM_UDP | CSUM_TSO)
 
 #define GRANT_INVALID_REF	0
 
@@ -124,7 +124,16 @@ static const int MODPARM_rx_copy = 1;
 static const int MODPARM_rx_flip = 0;
 #endif
 
-#define MAX_SKB_FRAGS	(65536/PAGE_SIZE + 2)
+/**
+ * \brief The maximum allowed data fragments in a single transmit
+ *        request.
+ *
+ * This limit is imposed by the backend driver.  We assume here that
+ * we are dealing with a Linux driver domain and have set our limit
+ * to mirror the Linux MAX_SKB_FRAGS constant.
+ */
+#define	MAX_TX_REQ_FRAGS (65536 / PAGE_SIZE + 2)
+
 #define RX_COPY_THRESHOLD 256
 
 #define net_ratelimit() 0
@@ -140,6 +149,9 @@ static void xn_tick_locked(struct netfro
 static void xn_tick(void *);
 
 static void xn_intr(void *);
+static inline int xn_count_frags(struct mbuf *m);
+static int  xn_assemble_tx_request(struct netfront_info *sc,
+				   struct mbuf *m_head);
 static void xn_start_locked(struct ifnet *);
 static void xn_start(struct ifnet *);
 static int  xn_ioctl(struct ifnet *, u_long, caddr_t);
@@ -174,8 +186,8 @@ static void xn_free_rx_ring(struct netfr
 static void xn_free_tx_ring(struct netfront_info *);
 
 static int xennet_get_responses(struct netfront_info *np,
-	struct netfront_rx_info *rinfo, RING_IDX rp, struct mbuf **list,
-	int *pages_flipped_p);
+	struct netfront_rx_info *rinfo, RING_IDX rp, RING_IDX *cons,
+	struct mbuf **list, int *pages_flipped_p);
 
 #define virt_to_mfn(x) (vtomach(x) >> PAGE_SHIFT)
 
@@ -187,11 +199,12 @@ static int xennet_get_responses(struct n
  * not the other way around.  The size must track the free index arrays.
  */
 struct xn_chain_data {
-		struct mbuf		*xn_tx_chain[NET_TX_RING_SIZE+1];
-		int			xn_tx_chain_cnt;
-		struct mbuf		*xn_rx_chain[NET_RX_RING_SIZE+1];
+	struct mbuf    *xn_tx_chain[NET_TX_RING_SIZE+1];
+	int		xn_tx_chain_cnt;
+	struct mbuf    *xn_rx_chain[NET_RX_RING_SIZE+1];
 };
 
+#define NUM_ELEMENTS(x) (sizeof(x)/sizeof(*x))
 
 struct net_device_stats
 {
@@ -255,17 +268,11 @@ struct netfront_info {
 	int rx_max_target;
 	int rx_target;
 
-	/*
-	 * {tx,rx}_skbs store outstanding skbuffs. The first entry in each
-	 * array is an index into a chain of free entries.
-	 */
-
 	grant_ref_t gref_tx_head;
 	grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1]; 
 	grant_ref_t gref_rx_head;
 	grant_ref_t grant_rx_ref[NET_TX_RING_SIZE + 1]; 
 
-#define TX_MAX_TARGET min(NET_RX_RING_SIZE, 256)
 	device_t		xbdev;
 	int			tx_ring_ref;
 	int			rx_ring_ref;
@@ -288,7 +295,7 @@ struct netfront_info {
 #define XN_LOCK_INIT(_sc, _name) \
         mtx_init(&(_sc)->tx_lock, #_name"_tx", "network transmit lock", MTX_DEF); \
         mtx_init(&(_sc)->rx_lock, #_name"_rx", "network receive lock", MTX_DEF);  \
-    mtx_init(&(_sc)->sc_lock, #_name"_sc", "netfront softc lock", MTX_DEF)
+        mtx_init(&(_sc)->sc_lock, #_name"_sc", "netfront softc lock", MTX_DEF)
 
 #define XN_RX_LOCK(_sc)           mtx_lock(&(_sc)->rx_lock)
 #define XN_RX_UNLOCK(_sc)         mtx_unlock(&(_sc)->rx_lock)
@@ -324,18 +331,22 @@ struct netfront_rx_info {
  */
 
 static inline void
-add_id_to_freelist(struct mbuf **list, unsigned short id)
+add_id_to_freelist(struct mbuf **list, uintptr_t id)
 {
-	KASSERT(id != 0, ("add_id_to_freelist: the head item (0) must always be free."));
+	KASSERT(id != 0,
+		("%s: the head item (0) must always be free.", __func__));
 	list[id] = list[0];
-	list[0]  = (void *)(u_long)id;
+	list[0]  = (struct mbuf *)id;
 }
 
 static inline unsigned short
 get_id_from_freelist(struct mbuf **list)
 {
-	u_int id = (u_int)(u_long)list[0];
-	KASSERT(id != 0, ("get_id_from_freelist: the head item (0) must always remain free."));
+	uintptr_t id;
+
+	id = (uintptr_t)list[0];
+	KASSERT(id != 0,
+		("%s: the head item (0) must always remain free.", __func__));
 	list[0] = list[id];
 	return (id);
 }
@@ -347,8 +358,7 @@ xennet_rxidx(RING_IDX idx)
 }
 
 static inline struct mbuf *
-xennet_get_rx_mbuf(struct netfront_info *np,
-						RING_IDX ri)
+xennet_get_rx_mbuf(struct netfront_info *np, RING_IDX ri)
 {
 	int i = xennet_rxidx(ri);
 	struct mbuf *m;
@@ -495,25 +505,25 @@ talk_to_backend(device_t dev, struct net
 		goto destroy_ring;
 	}
 	err = xenbus_printf(xbt, node, "tx-ring-ref","%u",
-			    info->tx_ring_ref);
+			info->tx_ring_ref);
 	if (err) {
 		message = "writing tx ring-ref";
 		goto abort_transaction;
 	}
 	err = xenbus_printf(xbt, node, "rx-ring-ref","%u",
-			    info->rx_ring_ref);
+			info->rx_ring_ref);
 	if (err) {
 		message = "writing rx ring-ref";
 		goto abort_transaction;
 	}
 	err = xenbus_printf(xbt, node,
-		"event-channel", "%u", irq_to_evtchn_port(info->irq));
+			"event-channel", "%u", irq_to_evtchn_port(info->irq));
 	if (err) {
 		message = "writing event-channel";
 		goto abort_transaction;
 	}
 	err = xenbus_printf(xbt, node, "request-rx-copy", "%u",
-			    info->copying_receiver);
+			info->copying_receiver);
 	if (err) {
 		message = "writing request-rx-copy";
 		goto abort_transaction;
@@ -674,9 +684,9 @@ xn_free_rx_ring(struct netfront_info *sc
 	int i;
 	
 	for (i = 0; i < NET_RX_RING_SIZE; i++) {
-		if (sc->xn_cdata.xn_rx_chain[i] != NULL) {
-			m_freem(sc->xn_cdata.xn_rx_chain[i]);
-			sc->xn_cdata.xn_rx_chain[i] = NULL;
+		if (sc->xn_cdata.rx_mbufs[i] != NULL) {
+			m_freem(sc->rx_mbufs[i]);
+			sc->rx_mbufs[i] = NULL;
 		}
 	}
 	
@@ -693,8 +703,8 @@ xn_free_tx_ring(struct netfront_info *sc
 	int i;
 	
 	for (i = 0; i < NET_TX_RING_SIZE; i++) {
-		if (sc->xn_cdata.xn_tx_chain[i] != NULL) {
-			m_freem(sc->xn_cdata.xn_tx_chain[i]);
+		if (sc->tx_mbufs[i] != NULL) {
+			m_freem(sc->tx_mbufs[i]);
 			sc->xn_cdata.xn_tx_chain[i] = NULL;
 		}
 	}
@@ -703,39 +713,36 @@ xn_free_tx_ring(struct netfront_info *sc
 #endif
 }
 
-/*
- * Do some brief math on the number of descriptors available to
- * determine how many slots are available.
- *
- * Firstly - wouldn't something with RING_FREE_REQUESTS() be more applicable?
- * Secondly - MAX_SKB_FRAGS is a Linux construct which may not apply here.
- * Thirdly - it isn't used here anyway; the magic constant '24' is possibly
- *   wrong?
- * The "2" is presumably to ensure there are also enough slots available for
- * the ring entries used for "options" (eg, the TSO entry before a packet
- * is queued); I'm not sure why its 2 and not 1. Perhaps to make sure there's
- * a "free" node in the tx mbuf list (node 0) to represent the freelist?
+/**
+ * \brief Verify that there is sufficient space in the Tx ring
+ *        buffer for a maximally sized request to be enqueued.
  *
- * This only figures out whether any xenbus ring descriptors are available;
- * it doesn't at all reflect how many tx mbuf ring descriptors are also
- * available.
+ * A transmit request requires a transmit descriptor for each packet
+ * fragment, plus up to 2 entries for "options" (e.g. TSO).
  */
 static inline int
-netfront_tx_slot_available(struct netfront_info *np)
+xn_tx_slot_available(struct netfront_info *np)
 {
-	return ((np->tx.req_prod_pvt - np->tx.rsp_cons) <
-		(TX_MAX_TARGET - /* MAX_SKB_FRAGS */ 24 - 2));
+	return (RING_FREE_REQUESTS(&np->tx) > (MAX_TX_REQ_FRAGS + 2));
 }
+
 static void
 netif_release_tx_bufs(struct netfront_info *np)
 {
-	struct mbuf *m;
 	int i;
 
 	for (i = 1; i <= NET_TX_RING_SIZE; i++) {
-		m = np->xn_cdata.xn_tx_chain[i];
+		struct mbuf *m;
+
+		m = np->tx_mbufs[i];
 
-		if (((u_long)m) < KERNBASE)
+		/*
+		 * We assume that no kernel addresses are
+		 * less than NET_TX_RING_SIZE.  Any entry
+		 * in the table that is below this number
+		 * must be an index from free-list tracking.
+		 */
+		if (((uintptr_t)m) <= NET_TX_RING_SIZE)
 			continue;
 		gnttab_grant_foreign_access_ref(np->grant_tx_ref[i],
 		    xenbus_get_otherend_id(np->xbdev),
@@ -774,19 +781,25 @@ network_alloc_rx_buffers(struct netfront
 		return;
 	
 	/*
-	 * Allocate skbuffs greedily, even though we batch updates to the
+	 * Allocate mbufs greedily, even though we batch updates to the
 	 * receive ring. This creates a less bursty demand on the memory
-	 * allocator, so should reduce the chance of failed allocation
+	 * allocator, and so should reduce the chance of failed allocation
 	 * requests both for ourself and for other kernel subsystems.
+	 *
+	 * Here we attempt to maintain rx_target buffers in flight, counting
+	 * buffers that we have yet to process in the receive ring.
 	 */
 	batch_target = sc->rx_target - (req_prod - sc->rx.rsp_cons);
 	for (i = mbufq_len(&sc->xn_rx_batch); i < batch_target; i++) {
 		MGETHDR(m_new, M_DONTWAIT, MT_DATA);
-		if (m_new == NULL) 
+		if (m_new == NULL) {
+			printf("%s: MGETHDR failed\n", __func__);
 			goto no_mbuf;
+		}
 
 		m_cljget(m_new, M_DONTWAIT, MJUMPAGESIZE);
 		if ((m_new->m_flags & M_EXT) == 0) {
+			printf("%s: m_cljget failed\n", __func__);
 			m_freem(m_new);
 
 no_mbuf:
@@ -803,16 +816,29 @@ no_mbuf:
 		mbufq_tail(&sc->xn_rx_batch, m_new);
 	}
 	
-	/* Is the batch large enough to be worthwhile? */
+	/*
+	 * If we've allocated at least half of our target number of entries,
+	 * submit them to the backend - we have enough to make the overhead
+	 * of submission worthwhile.  Otherwise wait for more mbufs and
+	 * request entries to become available.
+	 */
 	if (i < (sc->rx_target/2)) {
 		if (req_prod >sc->rx.sring->req_prod)
 			goto push;
 		return;
 	}
-	/* Adjust floating fill target if we risked running out of buffers. */
-	if ( ((req_prod - sc->rx.sring->rsp_prod) < (sc->rx_target / 4)) &&
-	     ((sc->rx_target *= 2) > sc->rx_max_target) )
-		sc->rx_target = sc->rx_max_target;
+
+	/*
+	 * Double floating fill target if we risked having the backend
+	 * run out of empty buffers for receive traffic.  We define "running
+	 * low" as having less than a fourth of our target buffers free
+	 * at the time we refilled the queue. 
+	 */
+	if ((req_prod - sc->rx.sring->rsp_prod) < (sc->rx_target / 4)) {
+		sc->rx_target *= 2;
+		if (sc->rx_target > sc->rx_max_target)
+			sc->rx_target = sc->rx_max_target;
+	}
 
 refill:
 	for (nr_flips = i = 0; ; i++) {
@@ -824,9 +850,8 @@ refill:
 
 		id = xennet_rxidx(req_prod + i);
 
-		KASSERT(sc->xn_cdata.xn_rx_chain[id] == NULL,
-		    ("non-NULL xm_rx_chain"));
-		sc->xn_cdata.xn_rx_chain[id] = m_new;
+		KASSERT(sc->rx_mbufs[id] == NULL, ("non-NULL xm_rx_chain"));
+		sc->rx_mbufs[id] = m_new;
 
 		ref = gnttab_claim_grant_reference(&sc->gref_rx_head);
 		KASSERT((short)ref >= 0, ("negative ref"));
@@ -950,14 +975,13 @@ xn_rxeof(struct netfront_info *np)
 			memset(extras, 0, sizeof(rinfo.extras));
 
 			m = NULL;
-			err = xennet_get_responses(np, &rinfo, rp, &m,
+			err = xennet_get_responses(np, &rinfo, rp, &i, &m,
 			    &pages_flipped);
 
 			if (unlikely(err)) {
 				if (m)
 					mbufq_tail(&errq, m);
 				np->stats.rx_errors++;
-				i = np->rx.rsp_cons;
 				continue;
 			}
 
@@ -979,7 +1003,7 @@ xn_rxeof(struct netfront_info *np)
 			np->stats.rx_bytes += m->m_pkthdr.len;
 
 			mbufq_tail(&rxq, m);
-			np->rx.rsp_cons = ++i;
+			np->rx.rsp_cons = i;
 		}
 
 		if (pages_flipped) {
@@ -1084,9 +1108,16 @@ xn_txeof(struct netfront_info *np)
 			if (txr->status == NETIF_RSP_NULL)
 				continue;
 
+			if (txr->status != NETIF_RSP_OKAY) {
+				printf("%s: WARNING: response is %d!\n",
+				       __func__, txr->status);
+			}
 			id = txr->id;
-			m = np->xn_cdata.xn_tx_chain[id]; 
+			m = np->tx_mbufs[id]; 
 			KASSERT(m != NULL, ("mbuf not found in xn_tx_chain"));
+			KASSERT((uintptr_t)m > NET_TX_RING_SIZE,
+				("mbuf already on the free list, but we're "
+				"trying to free it again!"));
 			M_ASSERTVALID(m);
 			
 			/*
@@ -1097,10 +1128,8 @@ xn_txeof(struct netfront_info *np)
 				ifp->if_opackets++;
 			if (unlikely(gnttab_query_foreign_access(
 			    np->grant_tx_ref[id]) != 0)) {
-				WPRINTK("network_tx_buf_gc: warning "
-				    "-- grant still in use by backend "
-				    "domain.\n");
-				goto out; 
+				panic("grant id %u still in use by the backend",
+				      id);
 			}
 			gnttab_end_foreign_access_ref(
 				np->grant_tx_ref[id]);
@@ -1108,12 +1137,9 @@ xn_txeof(struct netfront_info *np)
 				&np->gref_tx_head, np->grant_tx_ref[id]);
 			np->grant_tx_ref[id] = GRANT_INVALID_REF;
 			
-			np->xn_cdata.xn_tx_chain[id] = NULL;
-			add_id_to_freelist(np->xn_cdata.xn_tx_chain, id);
+			np->tx_mbufs[id] = NULL;
+			add_id_to_freelist(np->tx_mbufs, id);
 			np->xn_cdata.xn_tx_chain_cnt--;
-			if (np->xn_cdata.xn_tx_chain_cnt < 0) {
-				panic("netif_release_tx_bufs: tx_chain_cnt must be >= 0");
-			}
 			m_free(m);
 			/* Only mark the queue active if we've freed up at least one slot to try */
 			ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
@@ -1135,7 +1161,6 @@ xn_txeof(struct netfront_info *np)
 		mb();
 	} while (prod != np->tx.sring->rsp_prod);
 	
- out: 
 	if (np->tx_full &&
 	    ((np->tx.sring->req_prod - prod) < NET_TX_RING_SIZE)) {
 		np->tx_full = 0;
@@ -1159,7 +1184,7 @@ xn_intr(void *xsc)
 	    ifp->if_drv_flags & IFF_DRV_RUNNING))
 		return;
 #endif
-	if (np->tx.rsp_cons != np->tx.sring->rsp_prod) {
+	if (RING_HAS_UNCONSUMED_RESPONSES(&np->tx)) {
 		XN_TX_LOCK(np);
 		xn_txeof(np);
 		XN_TX_UNLOCK(np);			
@@ -1191,10 +1216,9 @@ xennet_move_rx_slot(struct netfront_info
 
 static int
 xennet_get_extras(struct netfront_info *np,
-    struct netif_extra_info *extras, RING_IDX rp)
+    struct netif_extra_info *extras, RING_IDX rp, RING_IDX *cons)
 {
 	struct netif_extra_info *extra;
-	RING_IDX cons = np->rx.rsp_cons;
 
 	int err = 0;
 
@@ -1202,17 +1226,17 @@ xennet_get_extras(struct netfront_info *
 		struct mbuf *m;
 		grant_ref_t ref;
 
-		if (unlikely(cons + 1 == rp)) {
+		if (unlikely(*cons + 1 == rp)) {
 #if 0			
 			if (net_ratelimit())
 				WPRINTK("Missing extra info\n");
 #endif			
-			err = -EINVAL;
+			err = EINVAL;
 			break;
 		}
 
 		extra = (struct netif_extra_info *)
-		RING_GET_RESPONSE(&np->rx, ++cons);
+		RING_GET_RESPONSE(&np->rx, ++(*cons));
 
 		if (unlikely(!extra->type ||
 			extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
@@ -1221,23 +1245,22 @@ xennet_get_extras(struct netfront_info *
 				WPRINTK("Invalid extra type: %d\n",
 					extra->type);
 #endif			
-			err = -EINVAL;
+			err = EINVAL;
 		} else {
 			memcpy(&extras[extra->type - 1], extra, sizeof(*extra));
 		}
 
-		m = xennet_get_rx_mbuf(np, cons);
-		ref = xennet_get_rx_ref(np, cons);
+		m = xennet_get_rx_mbuf(np, *cons);
+		ref = xennet_get_rx_ref(np, *cons);
 		xennet_move_rx_slot(np, m, ref);
 	} while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE);
 
-	np->rx.rsp_cons = cons;
 	return err;
 }
 
 static int
 xennet_get_responses(struct netfront_info *np,
-	struct netfront_rx_info *rinfo, RING_IDX rp,
+	struct netfront_rx_info *rinfo, RING_IDX rp, RING_IDX *cons,
 	struct mbuf  **list,
 	int *pages_flipped_p)
 {
@@ -1246,26 +1269,25 @@ xennet_get_responses(struct netfront_inf
 	struct multicall_entry *mcl;
 	struct netif_rx_response *rx = &rinfo->rx;
 	struct netif_extra_info *extras = rinfo->extras;
-	RING_IDX cons = np->rx.rsp_cons;
 	struct mbuf *m, *m0, *m_prev;
-	grant_ref_t ref = xennet_get_rx_ref(np, cons);
-	int max = 5 /* MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD) */;
+	grant_ref_t ref = xennet_get_rx_ref(np, *cons);
+	RING_IDX ref_cons = *cons;
+	int max = 5 /* MAX_TX_REQ_FRAGS + (rx->status <= RX_COPY_THRESHOLD) */;
 	int frags = 1;
 	int err = 0;
 	u_long ret;
 
-	m0 = m = m_prev = xennet_get_rx_mbuf(np, cons);
+	m0 = m = m_prev = xennet_get_rx_mbuf(np, *cons);
 
 	
 	if (rx->flags & NETRXF_extra_info) {
-		err = xennet_get_extras(np, extras, rp);
-		cons = np->rx.rsp_cons;
+		err = xennet_get_extras(np, extras, rp, cons);
 	}
 
 
 	if (m0 != NULL) {
-			m0->m_pkthdr.len = 0;
-			m0->m_next = NULL;
+		m0->m_pkthdr.len = 0;
+		m0->m_next = NULL;
 	}
 	
 	for (;;) {
@@ -1277,14 +1299,18 @@ xennet_get_responses(struct netfront_inf
 #endif
 		if (unlikely(rx->status < 0 ||
 			rx->offset + rx->status > PAGE_SIZE)) {
+
 #if 0						
 			if (net_ratelimit())
 				WPRINTK("rx->offset: %x, size: %u\n",
 					rx->offset, rx->status);
 #endif						
 			xennet_move_rx_slot(np, m, ref);
-			err = -EINVAL;
-			goto next;
+			if (m0 == m)
+				m0 = NULL;
+			m = NULL;
+			err = EINVAL;
+			goto next_skip_queue;
 		}
 		
 		/*
@@ -1293,11 +1319,12 @@ xennet_get_responses(struct netfront_inf
 		 * situation to the system controller to reboot the backed.
 		 */
 		if (ref == GRANT_INVALID_REF) {
+
 #if 0 				
 			if (net_ratelimit())
 				WPRINTK("Bad rx response id %d.\n", rx->id);
 #endif			
-			err = -EINVAL;
+			err = EINVAL;
 			goto next;
 		}
 
@@ -1306,12 +1333,10 @@ xennet_get_responses(struct netfront_inf
 			 * headroom, ...
 			 */
 			if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) {
-				if (net_ratelimit())
-					WPRINTK("Unfulfilled rx req "
-						"(id=%d, st=%d).\n",
-						rx->id, rx->status);
+				WPRINTK("Unfulfilled rx req (id=%d, st=%d).\n",
+					rx->id, rx->status);
 				xennet_move_rx_slot(np, m, ref);
-				err = -ENOMEM;
+				err = ENOMEM;
 				goto next;
 			}
 
@@ -1349,23 +1374,43 @@ next:
 		m->m_data += rx->offset;
 		m0->m_pkthdr.len += rx->status;
 		
+next_skip_queue:
 		if (!(rx->flags & NETRXF_more_data))
 			break;
 
-		if (cons + frags == rp) {
+		if (*cons + frags == rp) {
 			if (net_ratelimit())
 				WPRINTK("Need more frags\n");
-			err = -ENOENT;
+			err = ENOENT;
+			printf("%s: cons %u frags %u rp %u, not enough frags\n",
+			       __func__, *cons, frags, rp);
 				break;
 		}
+		/*
+		 * Note that m can be NULL, if rx->status < 0 or if
+		 * rx->offset + rx->status > PAGE_SIZE above.  
+		 */
 		m_prev = m;
 		
-		rx = RING_GET_RESPONSE(&np->rx, cons + frags);
-		m = xennet_get_rx_mbuf(np, cons + frags);
+		rx = RING_GET_RESPONSE(&np->rx, *cons + frags);
+		m = xennet_get_rx_mbuf(np, *cons + frags);
 
-		m_prev->m_next = m;
+		/*
+		 * m_prev == NULL can happen if rx->status < 0 or if
+		 * rx->offset + * rx->status > PAGE_SIZE above.  
+		 */
+		if (m_prev != NULL)
+			m_prev->m_next = m;
+
+		/*
+		 * m0 can be NULL if rx->status < 0 or if * rx->offset +
+		 * rx->status > PAGE_SIZE above.  
+		 */
+		if (m0 == NULL)
+			m0 = m;
 		m->m_next = NULL;
-		ref = xennet_get_rx_ref(np, cons + frags);
+		ref = xennet_get_rx_ref(np, *cons + frags);
+		ref_cons = *cons + frags;
 		frags++;
 	}
 	*list = m0;
@@ -1373,11 +1418,12 @@ next:
 	if (unlikely(frags > max)) {
 		if (net_ratelimit())
 			WPRINTK("Too many frags\n");
-		err = -E2BIG;
+		printf("%s: too many frags %d > max %d\n", __func__, frags,
+		       max);
+		err = E2BIG;
 	}
 
-	if (unlikely(err))
-		np->rx.rsp_cons = cons + frags;
+	*cons += frags;
 
 	*pages_flipped_p = pages_flipped;
 
@@ -1406,209 +1452,228 @@ xn_tick(void *xsc) 
 	XN_RX_UNLOCK(sc);
      
 }
-static void
-xn_start_locked(struct ifnet *ifp) 
+
+/**
+ * \brief Count the number of fragments in an mbuf chain.
+ *
+ * Surprisingly, there isn't an M* macro for this.
+ */
+static inline int
+xn_count_frags(struct mbuf *m)
 {
-	int otherend_id;
-	unsigned short id;
-	struct mbuf *m_head, *m;
-	struct netfront_info *sc;
-	netif_tx_request_t *tx;
+	int nfrags;
+
+	for (nfrags = 0; m != NULL; m = m->m_next)
+		nfrags++;
+
+	return (nfrags);
+}
+
+/**
+ * Given an mbuf chain, make sure we have enough room and then push
+ * it onto the transmit ring.
+ */
+static int
+xn_assemble_tx_request(struct netfront_info *sc, struct mbuf *m_head)
+{
+	struct ifnet *ifp;
+	struct mbuf *m;
+	u_int nfrags;
 	netif_extra_info_t *extra;
-	RING_IDX i;
-	grant_ref_t ref;
-	u_long mfn, tx_bytes;
-	int notify, nfrags;
+	int otherend_id;
 
-	sc = ifp->if_softc;
-	otherend_id = xenbus_get_otherend_id(sc->xbdev);
-	tx_bytes = 0;
+	ifp = sc->xn_ifp;
 
-	if (!netfront_carrier_ok(sc))
-		return;
-	
-	for (i = sc->tx.req_prod_pvt; TRUE; i++) {
-		IF_DEQUEUE(&ifp->if_snd, m_head);
-		if (m_head == NULL) 
-			break;
-		
-		/*
-		 * netfront_tx_slot_available() tries to do some math to
-		 * ensure that there'll be enough xenbus ring slots available
-		 * for the maximum number of packet fragments (and a couple more
-		 * for what I guess are TSO and other ring entry items.)
-		 */
-		if (!netfront_tx_slot_available(sc)) {
-			IF_PREPEND(&ifp->if_snd, m_head);
-			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
-			break;
-		}
+	/**
+	 * Defragment the mbuf if necessary.
+	 */
+	nfrags = xn_count_frags(m_head);
 
-		/*
-		 * Defragment the mbuf if necessary.
-		 */
-		for (m = m_head, nfrags = 0; m; m = m->m_next)
-			nfrags++;
-		if (nfrags > MAX_SKB_FRAGS) {
-			m = m_defrag(m_head, M_DONTWAIT);
-			if (!m) {
-				m_freem(m_head);
-				break;
-			}
-			m_head = m;
+	/*
+	 * Check to see whether this request is longer than netback
+	 * can handle, and try to defrag it.
+	 */
+	/**
+	 * It is a bit lame, but the netback driver in Linux can't
+	 * deal with nfrags > MAX_TX_REQ_FRAGS, which is a quirk of
+	 * the Linux network stack.
+	 */
+	if (nfrags > MAX_TX_REQ_FRAGS) {
+		m = m_defrag(m_head, M_DONTWAIT);
+		if (!m) {
+			/*
+			 * Defrag failed, so free the mbuf and
+			 * therefore drop the packet.
+			 */
+			m_freem(m_head);
+			return (EMSGSIZE);
 		}
+		m_head = m;
+	}
 
-		/* Determine how many fragments now exist */
-		for (m = m_head, nfrags = 0; m; m = m->m_next)
-			nfrags++;
+	/* Determine how many fragments now exist */
+	nfrags = xn_count_frags(m_head);
 
-		/*
-		 * Don't attempt to queue this packet if there aren't
-		 * enough free entries in the chain.
-		 *
-		 * There isn't a 1:1 correspondance between the mbuf TX ring
-		 * and the xenbus TX ring.
-		 * xn_txeof() may need to be called to free up some slots.
-		 *
-		 * It is quite possible that this can be later eliminated if
-		 * it turns out that partial * packets can be pushed into
-		 * the ringbuffer, with fragments pushed in when further slots
-		 * free up.
-		 *
-		 * It is also quite possible that the driver will lock up
-		 * if the TX queue fills up with no RX traffic, and
-		 * the mbuf ring is exhausted. The queue may need
-		 * a swift kick to continue.
-		 */
+	/*
+	 * Check to see whether the defragmented packet has too many
+	 * segments for the Linux netback driver.
+	 */
+	/**
+	 * The FreeBSD TCP stack, with TSO enabled, can produce a chain
+	 * of mbufs longer than Linux can handle.  Make sure we don't
+	 * pass a too-long chain over to the other side by dropping the
+	 * packet.  It doesn't look like there is currently a way to
+	 * tell the TCP stack to generate a shorter chain of packets.
+	 */
+	if (nfrags > MAX_TX_REQ_FRAGS) {
+		m_freem(m_head);
+		return (EMSGSIZE);
+	}
 
-		/*
-		 * It is not +1 like the allocation because we need to keep
-		 * slot [0] free for the freelist head
-		 */
-		if (sc->xn_cdata.xn_tx_chain_cnt + nfrags >= NET_TX_RING_SIZE) {
-			WPRINTK("xn_start_locked: xn_tx_chain_cnt (%d) + nfrags %d >= NET_TX_RING_SIZE (%d); must be full!\n",
-			    (int) sc->xn_cdata.xn_tx_chain_cnt,
-			    (int) nfrags, (int) NET_TX_RING_SIZE);
-			IF_PREPEND(&ifp->if_snd, m_head);
-			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
-			break;
-		}
+	/*
+	 * This check should be redundant.  We've already verified that we
+	 * have enough slots in the ring to handle a packet of maximum
+	 * size, and that our packet is less than the maximum size.  Keep
+	 * it in here as an assert for now just to make certain that
+	 * xn_tx_chain_cnt is accurate.
+	 */
+	KASSERT((sc->xn_cdata.xn_tx_chain_cnt + nfrags) <= NET_TX_RING_SIZE,
+		("%s: xn_tx_chain_cnt (%d) + nfrags (%d) > NET_TX_RING_SIZE "
+		 "(%d)!", __func__, (int) sc->xn_cdata.xn_tx_chain_cnt,
+                    (int) nfrags, (int) NET_TX_RING_SIZE));
 
-		/*
-		 * Make sure there's actually space available in the
-		 * Xen TX ring for this. Overcompensate for the possibility
-		 * of having a TCP offload fragment just in case for now
-		 * (the +1) rather than adding logic to accurately calculate
-		 * the required size.
-		 */
-		if (RING_FREE_REQUESTS(&sc->tx) < (nfrags + 1)) {
-			WPRINTK("xn_start_locked: free ring slots (%d) < (nfrags + 1) (%d); must be full!\n",
-			    (int) RING_FREE_REQUESTS(&sc->tx),
-			    (int) (nfrags + 1));
-			IF_PREPEND(&ifp->if_snd, m_head);
-			ifp->if_drv_flags |= IFF_DRV_OACTIVE;
-			break;
-		}
+	/*
+	 * Start packing the mbufs in this chain into
+	 * the fragment pointers. Stop when we run out
+	 * of fragments or hit the end of the mbuf chain.
+	 */
+	m = m_head;
+	extra = NULL;
+	otherend_id = xenbus_get_otherend_id(sc->xbdev);
+	for (m = m_head; m; m = m->m_next) {
+		netif_tx_request_t *tx;
+		uintptr_t id;
+		grant_ref_t ref;
+		u_long mfn; /* XXX Wrong type? */
 
-		/*
-		 * Start packing the mbufs in this chain into
-		 * the fragment pointers. Stop when we run out
-		 * of fragments or hit the end of the mbuf chain.
-		 */
-		m = m_head;
-		extra = NULL;
-		for (m = m_head; m; m = m->m_next) {
-			tx = RING_GET_REQUEST(&sc->tx, i);
-			id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain);
-			if (id == 0)
-				panic("xn_start_locked: was allocated the freelist head!\n");
-			sc->xn_cdata.xn_tx_chain_cnt++;
-			if (sc->xn_cdata.xn_tx_chain_cnt >= NET_TX_RING_SIZE+1)
-				panic("xn_start_locked: tx_chain_cnt must be < NET_TX_RING_SIZE+1\n");
-			sc->xn_cdata.xn_tx_chain[id] = m;
-			tx->id = id;
-			ref = gnttab_claim_grant_reference(&sc->gref_tx_head);
-			KASSERT((short)ref >= 0, ("Negative ref"));
-			mfn = virt_to_mfn(mtod(m, vm_offset_t));
-			gnttab_grant_foreign_access_ref(ref, otherend_id,
-			    mfn, GNTMAP_readonly);
-			tx->gref = sc->grant_tx_ref[id] = ref;
-			tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1);
-			tx->flags = 0;
-			if (m == m_head) {
-				/*
-				 * The first fragment has the entire packet
-				 * size, subsequent fragments have just the
-				 * fragment size. The backend works out the
-				 * true size of the first fragment by
-				 * subtracting the sizes of the other
-				 * fragments.
-				 */
-				tx->size = m->m_pkthdr.len;
+		tx = RING_GET_REQUEST(&sc->tx, sc->tx.req_prod_pvt);
+		id = get_id_from_freelist(sc->tx_mbufs);
+		if (id == 0)
+			panic("xn_start_locked: was allocated the freelist head!\n");
+		sc->xn_cdata.xn_tx_chain_cnt++;
+		if (sc->xn_cdata.xn_tx_chain_cnt > NET_TX_RING_SIZE)
+			panic("xn_start_locked: tx_chain_cnt must be <= NET_TX_RING_SIZE\n");
+		sc->tx_mbufs[id] = m;
+		tx->id = id;
+		ref = gnttab_claim_grant_reference(&sc->gref_tx_head);
+		KASSERT((short)ref >= 0, ("Negative ref"));
+		mfn = virt_to_mfn(mtod(m, vm_offset_t));
+		gnttab_grant_foreign_access_ref(ref, otherend_id,
+		    mfn, GNTMAP_readonly);
+		tx->gref = sc->grant_tx_ref[id] = ref;
+		tx->offset = mtod(m, vm_offset_t) & (PAGE_SIZE - 1);
+		tx->flags = 0;
+		if (m == m_head) {
+			/*
+			 * The first fragment has the entire packet
+			 * size, subsequent fragments have just the
+			 * fragment size. The backend works out the
+			 * true size of the first fragment by
+			 * subtracting the sizes of the other
+			 * fragments.
+			 */
+			tx->size = m->m_pkthdr.len;
 
-				/*
-				 * The first fragment contains the
-				 * checksum flags and is optionally
-				 * followed by extra data for TSO etc.
-				 */
-				if (m->m_pkthdr.csum_flags
-				    & CSUM_DELAY_DATA) {
-					tx->flags |= (NETTXF_csum_blank
-					    | NETTXF_data_validated);
-				}
-#if __FreeBSD_version >= 700000
-				if (m->m_pkthdr.csum_flags & CSUM_TSO) {
-					struct netif_extra_info *gso =
-						(struct netif_extra_info *)
-						RING_GET_REQUEST(&sc->tx, ++i);
-
-					if (extra)
-						extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE;
-					else
-						tx->flags |= NETTXF_extra_info;
-
-					gso->u.gso.size = m->m_pkthdr.tso_segsz;
-					gso->u.gso.type =
-						XEN_NETIF_GSO_TYPE_TCPV4;
-					gso->u.gso.pad = 0;
-					gso->u.gso.features = 0;
-
-					gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
-					gso->flags = 0;
-					extra = gso;
-				}
-#endif
-			} else {
-				tx->size = m->m_len;
+			/*
+			 * The first fragment contains the checksum flags
+			 * and is optionally followed by extra data for
+			 * TSO etc.
+			 */
+			/**
+			 * CSUM_TSO requires checksum offloading.
+			 * Some versions of FreeBSD fail to
+			 * set CSUM_TCP in the CSUM_TSO case,
+			 * so we have to test for CSUM_TSO
+			 * explicitly.
+			 */
+			if (m->m_pkthdr.csum_flags
+			    & (CSUM_DELAY_DATA | CSUM_TSO)) {
+				tx->flags |= (NETTXF_csum_blank
+				    | NETTXF_data_validated);
 			}
-			if (m->m_next) {
-				tx->flags |= NETTXF_more_data;
-				i++;
+#if __FreeBSD_version >= 700000
+			if (m->m_pkthdr.csum_flags & CSUM_TSO) {
+				struct netif_extra_info *gso =
+					(struct netif_extra_info *)
+					RING_GET_REQUEST(&sc->tx,
+							 ++sc->tx.req_prod_pvt);
+
+				tx->flags |= NETTXF_extra_info;
+
+				gso->u.gso.size = m->m_pkthdr.tso_segsz;
+				gso->u.gso.type =
+					XEN_NETIF_GSO_TYPE_TCPV4;
+				gso->u.gso.pad = 0;
+				gso->u.gso.features = 0;
+
+				gso->type = XEN_NETIF_EXTRA_TYPE_GSO;
+				gso->flags = 0;
 			}
+#endif
+		} else {
+			tx->size = m->m_len;
 		}
+		if (m->m_next)
+			tx->flags |= NETTXF_more_data;
 
-		BPF_MTAP(ifp, m_head);
+		sc->tx.req_prod_pvt++;
+	}
+	BPF_MTAP(ifp, m_head);
+
+	sc->stats.tx_bytes += m_head->m_pkthdr.len;
+	sc->stats.tx_packets++;
+
+	return (0);
+}
 
-		sc->stats.tx_bytes += m_head->m_pkthdr.len;
-		sc->stats.tx_packets++;
+static void
+xn_start_locked(struct ifnet *ifp) 
+{
+	struct netfront_info *sc;
+	struct mbuf *m_head;
+	int notify;
+
+	sc = ifp->if_softc;
+
+	if (!netfront_carrier_ok(sc))
+		return;
+
+	/*
+	 * While we have enough transmit slots available for at least one
+	 * maximum-sized packet, pull mbufs off the queue and put them on
+	 * the transmit ring.
+	 */
+	while (xn_tx_slot_available(sc)) {
+		IF_DEQUEUE(&ifp->if_snd, m_head);
+		if (m_head == NULL)
+			break;
+
+		if (xn_assemble_tx_request(sc, m_head) != 0)
+			break;
 	}

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***