PERFORCE change 134065 for review
Kip Macy <kmacy at FreeBSD.org>
Thu Jan 24 23:08:25 PST 2008
http://perforce.freebsd.org/chv.cgi?CH=134065
Change 134065 by kmacy at kmacy:storage:toehead on 2008/01/25 07:07:45
split ddp support and vm functions into separate files
Affected files ...
.. //depot/projects/toehead/sys/dev/cxgb/sys/mbufq.h#3 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#7 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#6 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#6 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h#3 edit
.. //depot/projects/toehead/sys/modules/cxgb/tom/Makefile#2 edit
Differences ...
==== //depot/projects/toehead/sys/dev/cxgb/sys/mbufq.h#3 (text+ko) ====
@@ -103,7 +103,7 @@
}
static __inline struct mbuf *
-mbufq_peek(struct mbuf_head *l)
+mbufq_peek(const struct mbuf_head *l)
{
return (l->head);
}
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#7 (text+ko) ====
@@ -490,12 +490,10 @@
m = m_gethdr_nofail(sizeof(*req));
-#ifdef notyet
- req = (struct cpl_rx_data_ack *)__skb_put(skb, sizeof(*req));
+ req = mtod(m, struct cpl_rx_data_ack *);
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
-#else
- req = mtod(m, struct cpl_rx_data_ack *);
-#endif
+ m->m_pkthdr.len = m->m_len = sizeof(*req);
+
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, toep->tp_tid));
req->credit_dack = htonl(F_RX_MODULATE | F_RX_DACK_CHANGE |
V_RX_DACK_MODE(1) |
@@ -1163,6 +1161,20 @@
return V_FLAVORS_VALID(flv_valid) |
V_CONG_CONTROL_FLAVOR(flv_valid ? TOM_TUNABLE(dev, cong_alg) : 0);
}
+
+#if DEBUG_WR > 1
+static int
+count_pending_wrs(const struct toepcb *toep)
+{
+ const struct mbuf *m;
+ int n = 0;
+
+ wr_queue_walk(toep, m)
+ n += m->m_pkthdr.csum_data;
+ return (n);
+}
+#endif
+
#if 0
(((*(struct tom_data **)&(dev)->l4opt)->conf.cong_alg) != -1)
#endif
@@ -2132,7 +2144,7 @@
unsigned int rcv_nxt = ntohl(req->rcv_nxt) - 1; /* exclude FIN */
if (tp->rcv_nxt == rcv_nxt) /* no data */
- return 0;
+ return (0);
if (__predict_false(so_no_receive(so))) {
handle_excess_rx(toep, m);
@@ -2189,7 +2201,6 @@
goto out;
}
-
if (toep->tp_ulp_mode == ULP_MODE_TCPDDP) {
keep = handle_peer_close_data(so, m);
if (keep < 0)
@@ -3533,7 +3544,6 @@
if (__predict_false(credits < p->m_pkthdr.csum_data)) {
#if DEBUG_WR > 1
struct tx_data_wr *w = cplhdr(p);
-#ifdef notyet
log(LOG_ERR,
"TID %u got %u WR credits, need %u, len %u, "
"main body %u, frags %u, seq # %u, ACK una %u,"
@@ -3541,8 +3551,7 @@
toep->tp_tid, credits, p->csum, p->len,
p->len - p->data_len, skb_shinfo(p)->nr_frags,
ntohl(w->sndseq), snd_una, ntohl(hdr->snd_nxt),
- WR_AVAIL(tp), count_pending_wrs(tp) - credits);
-#endif
+ toep->tp_wr_avail, count_pending_wrs(tp) - credits);
#endif
p->m_pkthdr.csum_data -= credits;
break;
@@ -3880,11 +3889,9 @@
wrlen = sizeof(*wr) + 3 * sizeof(*req) + sizeof(*getreq);
m = m_gethdr_nofail(wrlen);
m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-#ifdef notyet
- wr = (struct work_request_hdr *)__skb_put(skb, wrlen);
-#else
wr = mtod(m, struct work_request_hdr *);
-#endif
+ m->m_pkthdr.len = m->m_len = wrlen;
+
/* Set the ATOMIC flag to make sure that TP processes the following
* CPLs in an atomic manner and no wire segments can be interleaved.
*/
@@ -3955,12 +3962,10 @@
(modulate ? sizeof(struct cpl_rx_data_ack) : 0);
m = m_gethdr_nofail(wrlen);
m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, toep));
-#ifdef notyet
- wr = (struct work_request_hdr *)__skb_put(skb, wrlen);
+ wr = mtod(m, struct work_request_hdr *);
wr->wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
-#else
- wr = mtod(m, struct work_request_hdr *);
-#endif
+ m->m_pkthdr.len = m->m_len = wrlen;
+
req = (struct cpl_set_tcb_field *)(wr + 1);
if (len0) { /* program buffer 0 offset and length */
mk_set_tcb_field_ulp(req, toep->tp_tid, W_TCB_RX_DDP_BUF0_OFFSET,
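
The cxgb_cpl_io.c hunks above repeatedly replace the leftover Linux __skb_put() idiom (previously parked under #ifdef notyet) with the native mbuf pattern: point at the data area with mtod() and record the request size in both m_len and m_pkthdr.len. A minimal self-contained sketch of that pattern follows; the request structure and field values are illustrative only, not the real CPL headers, and plain m_gethdr() stands in for the driver's m_gethdr_nofail():

#include <sys/param.h>
#include <sys/mbuf.h>

/* Illustrative fixed-size request; the real CPL structs live in the firmware headers. */
struct example_req {
        uint32_t wr_hi;
        uint32_t wr_lo;
};

static struct mbuf *
build_example_req(void)
{
        struct mbuf *m;
        struct example_req *req;

        m = m_gethdr(M_NOWAIT, MT_DATA);        /* driver uses m_gethdr_nofail() */
        if (m == NULL)
                return (NULL);

        /*
         * Where Linux code would call __skb_put(skb, sizeof(*req)) to
         * reserve space and advance skb->len, an mbuf consumer takes the
         * data pointer with mtod() and then sets m_len and m_pkthdr.len
         * explicitly, exactly as the hunks above now do.
         */
        req = mtod(m, struct example_req *);
        req->wr_hi = 0;
        req->wr_lo = 0;
        m->m_pkthdr.len = m->m_len = sizeof(*req);

        return (m);
}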
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#6 (text+ko) ====
@@ -73,6 +73,7 @@
#include <dev/cxgb/common/cxgb_ctl_defs.h>
#include <dev/cxgb/cxgb_l2t.h>
#include <dev/cxgb/cxgb_offload.h>
+
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
@@ -86,6 +87,7 @@
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
+#include <dev/cxgb/ulp/tom/cxgb_vm.h>
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
struct uio *uio, struct mbuf *top, struct mbuf *control,
@@ -95,9 +97,6 @@
struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
int *flagsp);
-#define VM_HOLD_WRITEABLE 0x1
-static int vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags);
-static void vm_fault_unhold_pages(vm_page_t *m, int count);
#define TMP_IOV_MAX 16
#ifndef PG_FRAME
#define PG_FRAME ~PAGE_MASK
@@ -240,6 +239,29 @@
return (0);
}
+/*
+ * Returns whether a connection should enable DDP. This happens when all of
+ * the following conditions are met:
+ * - the connection's ULP mode is DDP
+ * - DDP is not already enabled
+ * - the last receive was above the DDP threshold
+ * - receive buffers are in user space
+ * - receive side isn't shutdown (handled by caller)
+ * - the connection's receive window is big enough so that sizable buffers
+ * can be posted without closing the window in the middle of DDP (checked
+ * when the connection is offloaded)
+ */
+#ifdef notyet
+static int
+so_should_ddp(const struct toepcb *toep, int last_recv_len)
+{
+ return toep->tp_ulp_mode == ULP_MODE_TCPDDP && (toep->tp_ddp_state.ubuf == NULL) &&
+ last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) &&
+ toep->tp_tp->rcv_wnd >
+ (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN);
+}
+#endif
+
static void
cxgb_wait_dma_completion(struct toepcb *toep)
{
@@ -501,749 +523,3 @@
so->so_proto->pr_usrreqs->pru_sosend = cxgb_sosend;
so->so_proto->pr_usrreqs->pru_soreceive = cxgb_soreceive;
}
-
-/*
- * This routine takes a user address range and does the following:
- * - validate that the user has access to those pages (flags indicates read or write) - if not fail
- * - validate that count is enough to hold range number of pages - if not fail
- * - fault in any non-resident pages
- * - if the user is doing a read force a write fault for any COWed pages
- * - if the user is doing a read mark all pages as dirty
- * - hold all pages
- * - return number of pages in count
- */
-static int
-vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags)
-{
-
- vm_offset_t start, va;
- vm_paddr_t pa;
- int pageslen, faults, rv;
-
- struct thread *td;
- vm_map_t map;
- pmap_t pmap;
- vm_page_t m, *pages;
- vm_prot_t prot;
-
- /*
- * Check that virtual address range is legal
- * This check is somewhat bogus as on some architectures kernel
- * and user do not share VA - however, it appears that all FreeBSD
- * architectures define it
- */
- pageslen = count * PAGE_SIZE;
- if (addr + pageslen > VM_MAXUSER_ADDRESS)
- return (EFAULT);
-
- td = curthread;
- map = &td->td_proc->p_vmspace->vm_map;
- pmap = &td->td_proc->p_vmspace->vm_pmap;
- pages = mp;
-
- prot = (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : VM_PROT_READ;
- bzero(pages, sizeof(vm_page_t *) * count);
-retry:
-
- /*
- * First optimistically assume that all pages are resident (and R/W if for write)
- * if so just mark pages as held (and dirty if for write) and return
- */
- vm_page_lock_queues();
- for (start = addr, pages = mp, faults = 0, va = addr; va < start + pageslen; va += PAGE_SIZE, pages++) {
- /*
- * Assure that we only hold the page once
- */
- if (*pages == NULL) {
- /*
- * page queue mutex is recursable so this is OK
- * it would be really nice if we had an unlocked version of this so
- * we were only acquiring the pmap lock 1 time as opposed to potentially
- * many dozens of times
- */
- m = pmap_extract_and_hold(pmap, va, prot);
- if (m == NULL) {
- faults++;
- continue;
- }
-
- *pages = m;
- if (flags & VM_HOLD_WRITEABLE)
- vm_page_dirty(m);
- }
- }
- vm_page_unlock_queues();
-
- if (faults == 0)
- return (0);
- /*
- * Pages either have insufficient permissions or are not present
- * trigger a fault where neccessary
- *
- */
- for (va = start; va < pageslen; va += PAGE_SIZE) {
- m = NULL;
- pa = pmap_extract(pmap, va);
- rv = 0;
- if (pa)
- m = PHYS_TO_VM_PAGE(pa);
- if (flags & VM_HOLD_WRITEABLE) {
- if (m == NULL || (m->flags & PG_WRITEABLE) == 0)
- rv = vm_fault(map, va, VM_PROT_WRITE, VM_FAULT_DIRTY);
- } else if (m == NULL)
- rv = vm_fault(map, va, VM_PROT_READ, VM_FAULT_NORMAL);
- if (rv)
- goto error;
- }
- goto retry;
-
-error:
- vm_page_lock_queues();
- for (pages = mp,
- va = start; va < start + pageslen;
- va += PAGE_SIZE,
- pages++)
- if (*pages)
- vm_page_unhold(*pages);
- vm_page_unlock_queues();
- return (EFAULT);
-}
-
-static void
-vm_fault_unhold_pages(vm_page_t *mp, int count)
-{
-
- KASSERT(count >= 0, ("negative count %d", count));
- vm_page_lock_queues();
- while (count--) {
- vm_page_unhold(*mp);
- mp++;
- }
- vm_page_unlock_queues();
-}
-
-/**
- * t3_pin_pages - pin a user memory range and prepare it for DDP
- * @addr - the starting address
- * @len - the length of the range
- * @newgl - contains the pages and physical addresses of the pinned range
- * @gl - an existing gather list, may be %NULL
- *
- * Pins the pages in the user-space memory range [addr, addr + len) and
- * maps them for DMA. Returns a gather list with the pinned pages and
- * their physical addresses. If @gl is non NULL the pages it describes
- * are compared against the pages for [addr, addr + len), and if the
- * existing gather list already covers the range a new list is not
- * allocated. Returns 0 on success, or a negative errno. On success if
- * a new gather list was allocated it is returned in @newgl.
- */
-static int
-t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t map, unsigned long addr,
- size_t len, struct ddp_gather_list **newgl,
- const struct ddp_gather_list *gl)
-{
- int i, err;
- size_t pg_off;
- unsigned int npages;
- struct ddp_gather_list *p;
-
- if (addr >= VM_MAXUSER_ADDRESS)
- return (EINVAL);
-#if 0
- if (!access_ok(VERIFY_WRITE, addr, len))
- return (EFAULT);
-#endif
- pg_off = addr & ~PAGE_MASK;
- npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *),
- M_DEVBUF, M_NOWAIT);
- if (!p)
- return (ENOMEM);
-
-
- err = vm_fault_hold_user_pages(addr, p->dgl_pages, npages, VM_HOLD_WRITEABLE);
-
- if (err)
- goto free_gl;
-
- if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages &&
- gl->dgl_length >= len) {
- for (i = 0; i < npages; ++i)
- if (p->dgl_pages[i] != gl->dgl_pages[i])
- goto different_gl;
- err = 0;
- goto unpin;
- }
-
-different_gl:
- p->dgl_length = len;
- p->dgl_offset = pg_off;
- p->dgl_nelem = npages;
-#ifdef notyet
- p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off,
- PAGE_SIZE - pg_off,
- PCI_DMA_FROMDEVICE) - pg_off;
- for (i = 1; i < npages; ++i)
- p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE,
- PCI_DMA_FROMDEVICE);
-#endif
-
- *newgl = p;
- return 0;
-unpin:
- vm_fault_unhold_pages(p->dgl_pages, npages);
-
-free_gl:
- free(p, M_DEVBUF);
- *newgl = NULL;
- return err;
-}
-
-/*
- * Return the # of page pods needed to accommodate a # of pages.
- */
-static inline unsigned int
-pages2ppods(unsigned int pages)
-{
- return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS;
-}
-
-
-static void
-unmap_ddp_gl(const struct ddp_gather_list *gl)
-{
-#ifdef notyet
- int i;
-
- if (!gl->nelem)
- return;
-
- pci_unmap_page(pdev, gl->phys_addr[0] + gl->offset,
- PAGE_SIZE - gl->offset, PCI_DMA_FROMDEVICE);
- for (i = 1; i < gl->nelem; ++i)
- pci_unmap_page(pdev, gl->phys_addr[i], PAGE_SIZE,
- PCI_DMA_FROMDEVICE);
-
-#endif
-}
-
-static void
-ddp_gl_free_pages(struct ddp_gather_list *gl, int dirty)
-{
-#ifdef notyet
- int i;
-
- for (i = 0; i < gl->nelem; ++i) {
- if (dirty)
- set_page_dirty_lock(gl->pages[i]);
- put_page(gl->pages[i]);
- }
-#endif
-}
-
-void
-t3_free_ddp_gl(struct ddp_gather_list *gl)
-{
- unmap_ddp_gl(gl);
- ddp_gl_free_pages(gl, 0);
- free(gl, M_DEVBUF);
-}
-
-/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */
-#define MAX_PPODS 64U
-
-/*
- * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in
- * the TCB. We allocate page pods in multiples of PPOD_CLUSTER_SIZE. First we
- * try to allocate enough page pods to accommodate the whole buffer, subject to
- * the MAX_PPODS limit. If that fails we try to allocate PPOD_CLUSTER_SIZE page
- * pods before failing entirely.
- */
-static int
-alloc_buf1_ppods(struct socket *so, struct ddp_state *p,
- unsigned long addr, unsigned int len)
-{
- int tag, npages, nppods;
- struct tom_data *d = TOM_DATA(TOE_DEV(so));
-
- npages = ((addr & ~PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- nppods = min(pages2ppods(npages), MAX_PPODS);
-#ifdef notyet
- nppods = ALIGN(nppods, PPOD_CLUSTER_SIZE);
-#endif
- tag = t3_alloc_ppods(d, nppods);
- if (tag < 0 && nppods > PPOD_CLUSTER_SIZE) {
- nppods = PPOD_CLUSTER_SIZE;
- tag = t3_alloc_ppods(d, nppods);
- }
- if (tag < 0)
- return (ENOMEM);
-
- p->ubuf_nppods = nppods;
- p->ubuf_tag = tag;
-#if NUM_DDP_KBUF == 1
- t3_set_ddp_tag(so, 1, tag << 6);
-#endif
- return 0;
-}
-
-
-
-/*
- * Reposts the kernel DDP buffer after it has been previously become full and
- * invalidated. We just need to reset the offset and adjust the DDP flags.
- * Conveniently, we can set the flags and the offset with a single message.
- * Note that this function does not set the buffer length. Again conveniently
- * our kernel buffer is of fixed size. If the length needs to be changed it
- * needs to be done separately.
- */
-static void
-t3_repost_kbuf(struct socket *so, unsigned int bufidx, int modulate,
- int activate)
-{
- struct toepcb *toep = sototcpcb(so)->t_toe;
- struct ddp_state *p = &toep->tp_ddp_state;
-
- p->buf_state[bufidx].cur_offset = p->kbuf[bufidx]->dgl_offset;
- p->buf_state[bufidx].flags = p->kbuf_noinval ? DDP_BF_NOINVAL : 0;
- p->buf_state[bufidx].gl = p->kbuf[bufidx];
- p->cur_buf = bufidx;
- p->kbuf_idx = bufidx;
- if (!bufidx)
- t3_setup_ddpbufs(toep, 0, 0, 0, 0,
- V_TF_DDP_PSH_NO_INVALIDATE(p->kbuf_noinval) |
- V_TF_DDP_BUF0_VALID(1),
- V_TF_DDP_PSH_NO_INVALIDATE(1) | V_TF_DDP_OFF(1) |
- V_TF_DDP_BUF0_VALID(1) |
- V_TF_DDP_ACTIVE_BUF(activate), modulate);
- else
- t3_setup_ddpbufs(toep, 0, 0, 0, 0,
- V_TF_DDP_PSH_NO_INVALIDATE(p->kbuf_noinval) |
- V_TF_DDP_BUF1_VALID(1) |
- V_TF_DDP_ACTIVE_BUF(activate),
- V_TF_DDP_PSH_NO_INVALIDATE(1) | V_TF_DDP_OFF(1) |
- V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_ACTIVE_BUF(1),
- modulate);
-
-}
-
-/*
- * Starting offset for the user DDP buffer. A non-0 value ensures a DDP flush
- * won't block indefinitely if there's nothing to place (which should be rare).
- */
-#define UBUF_OFFSET 1
-
-static __inline unsigned long
-select_ddp_flags(const struct socket *so, int buf_idx,
- int nonblock, int rcv_flags)
-{
- if (buf_idx == 1) {
- if (__predict_false(rcv_flags & MSG_WAITALL))
- return V_TF_DDP_PSH_NO_INVALIDATE(1) |
- V_TF_DDP_PUSH_DISABLE_1(1);
- if (nonblock)
- return V_TF_DDP_BUF1_FLUSH(1);
-
- return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(TOE_DEV(so),
- ddp_push_wait));
- }
-
- if (__predict_false(rcv_flags & MSG_WAITALL))
- return V_TF_DDP_PSH_NO_INVALIDATE(1) |
- V_TF_DDP_PUSH_DISABLE_0(1);
- if (nonblock)
- return V_TF_DDP_BUF0_FLUSH(1);
-
- return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(TOE_DEV(so), ddp_push_wait));
-}
-
-/**
- * setup_iovec_ppods - setup HW page pods for a user iovec
- * @sk: the associated socket
- * @iov: the iovec
- * @oft: additional bytes to map before the start of the buffer
- *
- * Pins a user iovec and sets up HW page pods for DDP into it. We allocate
- * page pods for user buffers on the first call per socket. Afterwards we
- * limit the buffer length to whatever the existing page pods can accommodate.
- * Returns a negative error code or the length of the mapped buffer.
- *
- * The current implementation handles iovecs with only one entry.
- */
-static int
-setup_iovec_ppods(struct socket *so, const struct iovec *iov, int oft)
-{
- int err;
- unsigned int len;
- struct ddp_gather_list *gl = NULL;
- struct toepcb *toep = sototcpcb(so)->t_toe;
- struct ddp_state *p = &toep->tp_ddp_state;
- unsigned long addr = (unsigned long)iov->iov_base - oft;
-
- if (__predict_false(!p->ubuf_nppods)) {
- err = alloc_buf1_ppods(so, p, addr, iov->iov_len + oft);
- if (err)
- return err;
- }
-
- len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE;
- len -= addr & ~PAGE_MASK;
- if (len > M_TCB_RX_DDP_BUF0_LEN)
- len = M_TCB_RX_DDP_BUF0_LEN;
- len = min(len, sototcpcb(so)->rcv_wnd - 32768);
- len = min(len, iov->iov_len + oft);
-
- if (len <= p->kbuf[0]->dgl_length)
- return -EINVAL;
-
- err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf);
- if (err < 0)
- return err;
- if (gl) {
- if (p->ubuf)
- t3_free_ddp_gl(p->ubuf);
- p->ubuf = gl;
- t3_setup_ppods(so, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len,
- gl->dgl_offset, 0);
- }
- return len;
-}
-
-#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE(1) | \
- V_TF_DDP_BUF1_FLUSH(1) | \
- V_TF_DDP_BUF0_FLUSH(1) | \
- V_TF_DDP_PUSH_DISABLE_1(1) | \
- V_TF_DDP_PUSH_DISABLE_0(1) | \
- V_TF_DDP_INDICATE_OUT(1))
-
-/*
- * Post a user buffer as an overlay on top of the current kernel buffer.
- */
-int
-t3_overlay_ubuf(struct socket *so, const struct iovec *iov,
- int nonblock, int rcv_flags, int modulate, int post_kbuf)
-{
- int len, ubuf_idx;
- unsigned long flags;
- struct toepcb *toep = sototcpcb(so)->t_toe;
- struct ddp_state *p = &toep->tp_ddp_state;
-
- if (!p)
- return -1;
-
- len = setup_iovec_ppods(so, iov, 0);
- if (len < 0)
- return len;
-
- ubuf_idx = p->kbuf_idx;
- p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP;
- /* Use existing offset */
- /* Don't need to update .gl, user buffer isn't copied. */
- p->cur_buf = ubuf_idx;
-
- flags = select_ddp_flags(so, ubuf_idx, nonblock, rcv_flags);
-
- if (post_kbuf) {
- struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1];
-
- dbs->cur_offset = 0;
- dbs->flags = 0;
- dbs->gl = p->kbuf[ubuf_idx ^ 1];
- p->kbuf_idx ^= 1;
- flags |= p->kbuf_idx ?
- V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) :
- V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0);
- }
-
- if (ubuf_idx == 0) {
- t3_overlay_ddpbuf(toep, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6,
- len);
- t3_setup_ddpbufs(toep, 0, 0, p->kbuf[1]->dgl_length, 0,
- flags,
- OVERLAY_MASK | flags, 1);
- } else {
- t3_overlay_ddpbuf(toep, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6,
- len);
- t3_setup_ddpbufs(toep, p->kbuf[0]->dgl_length, 0, 0, 0,
- flags,
- OVERLAY_MASK | flags, 1);
- }
-#ifdef T3_TRACE
- T3_TRACE5(TIDTB(so),
- "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d "
- " kbuf_idx %d",
- p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx);
-#endif
- return 0;
-}
-
-
-
-/*
- * Returns whether a connection should enable DDP. This happens when all of
- * the following conditions are met:
- * - the connection's ULP mode is DDP
- * - DDP is not already enabled
- * - the last receive was above the DDP threshold
- * - receive buffers are in user space
- * - receive side isn't shutdown (handled by caller)
- * - the connection's receive window is big enough so that sizable buffers
- * can be posted without closing the window in the middle of DDP (checked
- * when the connection is offloaded)
- */
-#ifdef notyet
-static int
-so_should_ddp(const struct toepcb *toep, int last_recv_len)
-{
- return toep->tp_ulp_mode == ULP_MODE_TCPDDP && !toep->tp_dpp_state.cur_buf &&
- last_recv_len > TOM_TUNABLE(toep->tp_toedev, ddp_thres) &&
- toep->tp_tp->rcv_wnd >
- (TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) +
- DDP_RSVD_WIN);
-}
-
-static inline int
-is_ddp(const struct mbuf *m)
-{
- return (m->m_flags & M_DDP);
-}
-
-static inline int
-is_ddp_psh(const struct mbuf *m)
-{
- return is_ddp(skb) && (m->m_pkthdr.csum_flags & DDP_BF_PSH);
-}
-
-/*
- * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the
- * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a
- * DDP buffer.
- */
-static inline int
-copy_data(const struct mbuf *m, int offset, struct iovec *to, int len)
-{
- if (__predict_true(!is_ddp(m))) /* RX_DATA */
- return mbuf_copy_datagram_iovec(m, offset, to, len);
- if (__predict_true(m->pkthdr.csum_flags & DDP_BF_NOCOPY)) { /* user DDP */
- to->iov_len -= len;
- to->iov_base += len;
- return 0;
- }
- return t3_ddp_copy(m, offset, to, len); /* kernel DDP */
-}
-
-
-#endif
-/*
- * Clean up DDP state that needs to survive until socket close time, such as the
- * DDP buffers. The buffers are already unmapped at this point as unmapping
- * needs the PCI device and a socket may close long after the device is removed.
- */
-void
-t3_cleanup_ddp(struct socket *so)
-{
- struct toepcb *toep = sototcpcb(so)->t_toe;
- struct ddp_state *p = &toep->tp_ddp_state;
- int idx;
-
- for (idx = 0; idx < NUM_DDP_KBUF; idx++)
- if (p->kbuf[idx]) {
- ddp_gl_free_pages(p->kbuf[idx], 0);
- free(p->kbuf[idx], M_DEVBUF);
- }
-
- if (p->ubuf) {
- ddp_gl_free_pages(p->ubuf, 0);
- free(p->ubuf, M_DEVBUF);
- p->ubuf = NULL;
- }
- toep->tp_ulp_mode = 0;
-}
-
-/*
- * This is a companion to t3_cleanup_ddp() and releases the HW resources
- * associated with a connection's DDP state, such as the page pods.
- * It's called when HW is done with a connection. The rest of the state
- * remains available until both HW and the app are done with the connection.
- */
-void
-t3_release_ddp_resources(struct toepcb *toep)
-{
- struct ddp_state *p = &toep->tp_ddp_state;
- struct tom_data *d = TOM_DATA(toep->tp_toedev);
- int idx;
-
- for (idx = 0; idx < NUM_DDP_KBUF; idx++) {
- t3_free_ppods(d, p->kbuf_tag[idx],
- p->kbuf_nppods[idx]);
- unmap_ddp_gl(p->kbuf[idx]);
- }
-
- if (p->ubuf_nppods) {
- t3_free_ppods(d, p->ubuf_tag, p->ubuf_nppods);
- p->ubuf_nppods = 0;
- }
- if (p->ubuf)
- unmap_ddp_gl(p->ubuf);
-
-}
-
-void
-t3_post_kbuf(struct socket *so, int modulate)
-{
- struct toepcb *toep = sototcpcb(so)->t_toe;
- struct ddp_state *p = &toep->tp_ddp_state;
-
- t3_set_ddp_tag(so, p->cur_buf, p->kbuf_tag[p->cur_buf] << 6);
- t3_set_ddp_buf(so, p->cur_buf, 0, p->kbuf[p->cur_buf]->dgl_length);
- t3_repost_kbuf(so, p->cur_buf, modulate, 1);
-
-#ifdef T3_TRACE
- T3_TRACE1(TIDTB(so),
- "t3_post_kbuf: cur_buf = kbuf_idx = %u ", p->cur_buf);
-#endif
-}
-
-/*
- * Prepare a socket for DDP. Must be called when the socket is known to be
- * open.
- */
-int
-t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall)
-{
- int err = ENOMEM;
- unsigned int nppods, kbuf_pages, idx = 0;
- struct toepcb *toep = sototcpcb(so)->t_toe;
- struct ddp_state *p = &toep->tp_ddp_state;
- struct tom_data *d = TOM_DATA(toep->tp_toedev);
-
- if (kbuf_size > M_TCB_RX_DDP_BUF0_LEN)
- return (EINVAL);
-
- kbuf_pages = (kbuf_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- nppods = pages2ppods(kbuf_pages);
-
- p->kbuf_noinval = !!waitall;
-
- p->kbuf_tag[NUM_DDP_KBUF - 1] = -1;
- for (idx = 0; idx < NUM_DDP_KBUF; idx++) {
- p->kbuf[idx] =
- malloc(sizeof (struct ddp_gather_list) + kbuf_pages *
- sizeof(vm_page_t *), M_DEVBUF, M_NOWAIT|M_ZERO);
- if (!p->kbuf[idx])
- goto err;
-
- p->kbuf_tag[idx] = t3_alloc_ppods(d, nppods);
- if (p->kbuf_tag[idx] < 0)
- goto err;
-
- p->kbuf_nppods[idx] = nppods;
- p->kbuf[idx]->dgl_length = kbuf_size;
- p->kbuf[idx]->dgl_offset = 0;
- p->kbuf[idx]->dgl_nelem = kbuf_pages;
-#ifdef notyet
- p->kbuf[idx]->pages =
- (struct page **)&p->kbuf[idx]->phys_addr[kbuf_pages];
-
-
- for (i = 0; i < kbuf_pages; ++i) {
-
- p->kbuf[idx]->pages[i] = alloc_page(sk->sk_allocation);
- if (!p->kbuf[idx]->pages[i]) {
- p->kbuf[idx]->nelem = i;
- goto err;
- }
-
- }
-
- for (i = 0; i < kbuf_pages; ++i)
- p->kbuf[idx]->phys_addr[i] =
- pci_map_page(p->pdev, p->kbuf[idx]->pages[i],
- 0, PAGE_SIZE, PCI_DMA_FROMDEVICE);
-#endif
- t3_setup_ppods(so, p->kbuf[idx], nppods, p->kbuf_tag[idx],
- p->kbuf[idx]->dgl_length, 0, 0);
- }
- t3_set_ddp_tag(so, 0, p->kbuf_tag[0] << 6);
- t3_set_ddp_buf(so, 0, 0, p->kbuf[0]->dgl_length);
- t3_repost_kbuf(so, 0, 0, 1);
- t3_set_rcv_coalesce_enable(so,
- TOM_TUNABLE(TOE_DEV(so), ddp_rcvcoalesce));
-
-#ifdef T3_TRACE
- T3_TRACE4(TIDTB(so),
- "t3_enter_ddp: kbuf_size %u waitall %u tag0 %d tag1 %d",
- kbuf_size, waitall, p->kbuf_tag[0], p->kbuf_tag[1]);
-#endif
-
- return 0;
-
-err:
- t3_release_ddp_resources(toep);
- t3_cleanup_ddp(so);
- return err;
-}
-
-int
-t3_ddp_copy(const struct mbuf *m, int offset, struct iovec *to, int len)
-{
-#ifdef notyet
- int err, page_no, page_off;
- struct ddp_gather_list *gl = (struct ddp_gather_list *)skb->mac.raw;
-
- if (!gl->pages) {
- dump_stack();
- BUG_ON(1);
- }
-
- offset += gl->offset + TCP_SKB_CB(skb)->when;
- page_no = offset >> PAGE_SHIFT;
- page_off = offset & ~PAGE_MASK;
-
- while (len) {
- int copy = min_t(int, len, PAGE_SIZE - page_off);
-
- err = memcpy_toiovec(to, page_address(gl->pages[page_no]) +
- page_off, copy);
- if (err)
- return -EFAULT;
- page_no++;
- page_off = 0;
- len -= copy;
- }
-#endif
- return 0;
-}
-
-/*
- * Allocate n page pods. Returns -1 on failure or the page pod tag.
- */
-int t3_alloc_ppods(struct tom_data *td, unsigned int n)
-{
- unsigned int i, j;
-
- if (__predict_false(!td->ppod_map))
- return -1;
-
- mtx_lock(&td->ppod_map_lock);
- for (i = 0; i < td->nppods; ) {
- for (j = 0; j < n; ++j) /* scan ppod_map[i..i+n-1] */
- if (td->ppod_map[i + j]) {
- i = i + j + 1;
- goto next;
- }
-
- memset(&td->ppod_map[i], 1, n); /* allocate range */
- mtx_unlock(&td->ppod_map_lock);
- return i;
-next: ;
- }
- mtx_unlock(&td->ppod_map_lock);
- return (0);
-}
-
-void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n)
-{
- /* No need to take ppod_lock here */
- memset(&td->ppod_map[tag], 0, n);
-}
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#6 (text+ko) ====
@@ -150,20 +150,20 @@
int t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
unsigned int nppods, unsigned int tag, unsigned int maxoff,
unsigned int pg_off, unsigned int color);
-int t3_alloc_ppods(struct tom_data *td, unsigned int n);
+int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag);
void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n);
void t3_free_ddp_gl(struct ddp_gather_list *gl);
-int t3_ddp_copy(const struct mbuf *skb, int offset, struct iovec *to,
+int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio,
int len);
//void t3_repost_kbuf(struct socket *so, int modulate, int activate);
void t3_post_kbuf(struct socket *so, int modulate);
int t3_post_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
int rcv_flags, int modulate, int post_kbuf);
-void t3_cancel_ubuf(struct socket *so);
+void t3_cancel_ubuf(struct toepcb *toep);
int t3_overlay_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
int rcv_flags, int modulate, int post_kbuf);
int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall);
-void t3_cleanup_ddp(struct socket *so);
+void t3_cleanup_ddp(struct toepcb *toep);
void t3_release_ddp_resources(struct toepcb *toep);
void t3_cancel_ddpbuf(struct toepcb *, unsigned int bufidx);
void t3_overlay_ddpbuf(struct toepcb *, unsigned int bufidx, unsigned int tag0,
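
The t3_alloc_ppods() prototype now returns a status and passes the page-pod tag back through an int *tag out parameter. In the old inline implementation removed above, the function is documented to return -1 on failure yet falls through to return (0) when the map is exhausted, which cannot be distinguished from a successful allocation of tag 0; the new signature removes that ambiguity. The real replacement body lives in the new cxgb_ddp.c, which is truncated from this mail, so the following is only a hypothetical sketch of an allocator matching the new prototype, reusing the ppod_map/nppods/ppod_map_lock fields from the old code and tightening the loop bound so the scan cannot run past the end of the map:

/*
 * Hypothetical allocator for the new prototype: returns 0 and stores the
 * tag in *tag on success, or ENOMEM when no run of n free page pods exists.
 */
int
t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag)
{
        unsigned int i, j;

        if (__predict_false(td->ppod_map == NULL))
                return (ENOMEM);

        mtx_lock(&td->ppod_map_lock);
        for (i = 0; i + n <= td->nppods; ) {
                for (j = 0; j < n; ++j)         /* scan ppod_map[i..i+n-1] */
                        if (td->ppod_map[i + j]) {
                                i = i + j + 1;
                                goto next;
                        }
                memset(&td->ppod_map[i], 1, n); /* claim the range */
                mtx_unlock(&td->ppod_map_lock);
                *tag = i;
                return (0);
next:           ;
        }
        mtx_unlock(&td->ppod_map_lock);
        return (ENOMEM);
}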
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_toepcb.h#3 (text+ko) ====
@@ -99,7 +99,7 @@
}
static inline struct mbuf *
-peek_wr(struct toepcb *toep)
+peek_wr(const struct toepcb *toep)
{
return (mbufq_peek(&toep->wr_list));
@@ -112,5 +112,10 @@
return (mbufq_dequeue(&toep->wr_list));
}
+#define wr_queue_walk(toep, m) \
+ for (m = peek_wr(toep); m; m = m->m_nextpkt)
+
+
+
#endif
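
wr_queue_walk() gives read-only iteration over the pending work-request queue and, together with the const-qualified peek_wr()/mbufq_peek() changes above, lets diagnostics take a const struct toepcb *. The new count_pending_wrs() in the cxgb_cpl_io.c hunk is the in-tree user; a further, purely hypothetical example of the same idiom would be a debug dump of the queued WRs:

#if DEBUG_WR > 1
/* Hypothetical helper: log the size and credit count of every pending WR. */
static void
dump_pending_wrs(const struct toepcb *toep)
{
        const struct mbuf *m;
        int i = 0;

        wr_queue_walk(toep, m)
                log(LOG_DEBUG, "WR %d: %d bytes, %u credits\n",
                    i++, m->m_pkthdr.len, m->m_pkthdr.csum_data);
}
#endif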
==== //depot/projects/toehead/sys/modules/cxgb/tom/Makefile#2 (text+ko) ====
@@ -4,7 +4,7 @@
KMOD= tom
SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c
-#SRCS+= cxgb_tcp_subr.c cxgb_tcp_usrreq.c
+SRCS+= cxgb_ddp.c cxgb_vm.c
>>> TRUNCATED FOR MAIL (1000 lines) <<<