PERFORCE change 133915 for review
Kip Macy
kmacy at FreeBSD.org
Tue Jan 22 21:09:01 PST 2008
http://perforce.freebsd.org/chv.cgi?CH=133915
Change 133915 by kmacy at kmacy:storage:toehead on 2008/01/23 05:08:06
basic zero-copy send and some infrastructure for DDP
Affected files ...
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#3 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#2 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#2 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_tom.c#2 edit
Differences ...
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#3 (text+ko) ====
@@ -3324,6 +3324,53 @@
SOCK_UNLOCK(lctx->lso);
}
+
+int
+t3_setup_ppods(struct socket *so, const struct ddp_gather_list *gl,
+ unsigned int nppods, unsigned int tag, unsigned int maxoff,
+ unsigned int pg_off, unsigned int color)
+{
+ unsigned int i, j, pidx;
+ struct pagepod *p;
+ struct mbuf *m;
+ struct ulp_mem_io *req;
+ struct tcpcb *tp = sototcpcb(so);
+ struct toepcb *toep = tp->t_toe;
+ unsigned int tid = toep->tp_tid;
+ const struct tom_data *td = TOM_DATA(TOE_DEV(so));
+ unsigned int ppod_addr = tag * PPOD_SIZE + td->ddp_llimit;
+
+ for (i = 0; i < nppods; ++i) {
+ m = m_gethdr_nofail(sizeof(*req) + PPOD_SIZE);
+ m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
+ req = mtod(m, struct ulp_mem_io *);
+ m->m_pkthdr.len = m->m_len = sizeof(*req) + PPOD_SIZE;
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_BYPASS));
+ req->cmd_lock_addr = htonl(V_ULP_MEMIO_ADDR(ppod_addr >> 5) |
+ V_ULPTX_CMD(ULP_MEM_WRITE));
+ req->len = htonl(V_ULP_MEMIO_DATA_LEN(PPOD_SIZE / 32) |
+ V_ULPTX_NFLITS(PPOD_SIZE / 8 + 1));
+
+ p = (struct pagepod *)(req + 1);
+ if (__predict_true(i < nppods - NUM_SENTINEL_PPODS)) {
+ p->pp_vld_tid = htonl(F_PPOD_VALID | V_PPOD_TID(tid));
+ p->pp_pgsz_tag_color = htonl(V_PPOD_TAG(tag) |
+ V_PPOD_COLOR(color));
+ p->pp_max_offset = htonl(maxoff);
+ p->pp_page_offset = htonl(pg_off);
+ p->pp_rsvd = 0;
+ for (pidx = 4 * i, j = 0; j < 5; ++j, ++pidx)
+ p->pp_addr[j] = pidx < gl->dgl_nelem ?
+ htobe64(VM_PAGE_TO_PHYS(gl->dgl_pages[pidx])) : 0;
+ } else
+ p->pp_vld_tid = 0; /* mark sentinel page pods invalid */
+ send_or_defer(so, tp, m, 0);
+ ppod_addr += PPOD_SIZE;
+ }
+ return (0);
+}
+
+
void
t3_init_wr_tab(unsigned int wr_len)
{
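For reference, the ULP_MEM_WRITE sizing in t3_setup_ppods above (PPOD_SIZE / 32 for the data length, PPOD_SIZE / 8 + 1 for the flit count) is consistent with a 64-byte page pod. A standalone sketch under that assumption; the field layout mirrors the pp_* names used above and the five-entry pp_addr implied by the j < 5 loop, but the authoritative definition lives in cxgb_t3_ddp.h:

#include <assert.h>
#include <stdint.h>

struct ex_pagepod {			/* assumed layout; see cxgb_t3_ddp.h */
	uint32_t pp_vld_tid;
	uint32_t pp_pgsz_tag_color;
	uint32_t pp_max_offset;
	uint32_t pp_page_offset;
	uint64_t pp_rsvd;
	uint64_t pp_addr[5];
};

#define EX_PPOD_SIZE	sizeof(struct ex_pagepod)

int
main(void)
{
	assert(EX_PPOD_SIZE == 64);
	assert(EX_PPOD_SIZE / 32 == 2);	   /* V_ULP_MEMIO_DATA_LEN: 32-byte units */
	assert(EX_PPOD_SIZE / 8 + 1 == 9); /* V_ULPTX_NFLITS: 8-byte flits + header */
	return (0);
}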
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#2 (text+ko) ====
@@ -38,6 +38,7 @@
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
+#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/socket.h>
@@ -94,13 +95,13 @@
struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
int *flagsp);
-#ifdef notyet
#define VM_HOLD_WRITEABLE 0x1
-static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp,
- int *count, int flags);
-#endif
+static int vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp,
+    int count, int flags);
static void vm_fault_unhold_pages(vm_page_t *m, int count);
#define TMP_IOV_MAX 16
+#ifndef PG_FRAME
+#define PG_FRAME ~PAGE_MASK
+#endif
void
t3_init_socket_ops(void)
@@ -123,7 +124,6 @@
#endif
}
-
struct cxgb_dma_info {
size_t cdi_mapped;
int cdi_nsegs;
@@ -182,21 +182,72 @@
}
}
+static void
+cxgb_zero_copy_free(void *cl, void *arg)
+{
+ struct mbuf_vec *mv;
+ struct mbuf *m = (struct mbuf *)cl;
+
+ mv = mtomv(m);
+ /*
+ * The vector entries are physical addresses of held user
+ * pages; don't try to free them here. They must be unheld
+ * separately from sbdrop.
+ */
+ mv->mv_count = 0;
+ m_free_iovec(m, m->m_type);
+}
-static void
-cxgb_zero_copy_free(void *cl, void *arg) {}
static int
cxgb_hold_iovec_pages(struct uio *uio, vm_page_t *m, int *held, int flags)
{
+ struct iovec *iov = uio->uio_iov;
+ int iovcnt = uio->uio_iovcnt;
+ int err, i, count, totcount, maxcount, totbytes, npages, curbytes;
+ uint64_t start, end;
+ vm_page_t *mp;
+
+ totbytes = totcount = 0;
+ maxcount = *held;
+
+ mp = m;
+ for (i = 0; (i < iovcnt) && (totcount < maxcount); i++, iov++) {
+ count = maxcount - totcount;
+
+ start = (uint64_t)iov->iov_base;
+ end = (uint64_t)((caddr_t)iov->iov_base + iov->iov_len);
+ start &= PG_FRAME;
+ end += PAGE_MASK;
+ end &= PG_FRAME;
+ npages = (end - start) >> PAGE_SHIFT;
+
+ count = min(count, npages);
- return (EINVAL);
+ err = vm_fault_hold_user_pages((vm_offset_t)iov->iov_base, mp, count, flags);
+ if (err) {
+ vm_fault_unhold_pages(m, totcount);
+ return (err);
+ }
+ mp += count;
+ totcount += count;
+ curbytes = iov->iov_len;
+ if (count != npages)
+ curbytes = count * PAGE_SIZE - (((uint64_t)iov->iov_base) & PAGE_MASK);
+ totbytes += curbytes;
+ }
+ uio->uio_resid -= totbytes;
+
+ return (0);
}
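A worked example of the page-span rounding above, as a standalone sketch with hypothetical values: an iovec starting at 0x1234 with length 0x3000 touches four 4KB pages, not three, because it straddles a page boundary at each end. The EX_ names are local stand-ins for the kernel's PAGE_* macros:

#include <assert.h>
#include <stdint.h>

#define EX_PAGE_SIZE	4096UL
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)	/* FreeBSD convention: low bits */
#define EX_PG_FRAME	(~EX_PAGE_MASK)

int
main(void)
{
	uint64_t start = 0x1234, end = start + 0x3000;

	start &= EX_PG_FRAME;			  /* 0x1000 */
	end = (end + EX_PAGE_MASK) & EX_PG_FRAME; /* 0x4234 -> 0x5000 */
	assert(((end - start) >> 12) == 4);	  /* npages */
	return (0);
}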
static void
-cxgb_wait_dma_completion(struct toepcb *tp)
+cxgb_wait_dma_completion(struct toepcb *toep)
{
+ struct mtx *lock;
+ lock = &toep->tp_tp->t_inpcb->inp_mtx;
+ INP_LOCK(toep->tp_tp->t_inpcb);
+ cv_wait_unlock(&toep->tp_cv, lock);
}
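The wakeup half of tp_cv is not part of this change; presumably whichever path notices the last pending DMA completing signals it under the same inpcb lock, along the lines of this hypothetical sketch (the function name is invented):

/* Hypothetical counterpart, not in this change. */
static void
cxgb_dma_completion_signal(struct toepcb *toep)
{
	INP_LOCK(toep->tp_tp->t_inpcb);
	cv_signal(&toep->tp_cv);
	INP_UNLOCK(toep->tp_tp->t_inpcb);
}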
static int
@@ -233,7 +284,13 @@
mi_collapse_sge(mi, segs);
*m = m0;
-
+
+ /*
+ * This appears to be a no-op at the moment, as busdma
+ * mapping is all or nothing; we need to make sure the
+ * tag limits are large enough.
+ */
if (cdi.cdi_mapped < uio->uio_resid) {
uio->uio_resid -= cdi.cdi_mapped;
} else
@@ -304,10 +361,11 @@
}
uio->uio_resid -= m->m_pkthdr.len;
sent += m->m_pkthdr.len;
- sbappend_locked(&so->so_snd, m);
+ sbappend(&so->so_snd, m);
t3_push_frames(so, TRUE);
iov_adj(&uiotmp.uio_iov, &iovcnt, uiotmp.uio_resid);
}
+
/*
* Wait for pending I/O to be DMA'd to the card
*
@@ -454,51 +512,45 @@
* - hold all pages
* - return number of pages in count
*/
-#ifdef notyet
static int
-vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp, int *count, int flags)
+vm_fault_hold_user_pages(vm_offset_t addr, vm_page_t *mp, int count, int flags)
{
vm_offset_t start, va;
vm_paddr_t pa;
int pageslen, faults, rv;
-
+
struct thread *td;
vm_map_t map;
pmap_t pmap;
vm_page_t m, *pages;
vm_prot_t prot;
-
- start = addr & ~PAGE_MASK;
- pageslen = roundup2(addr + len, PAGE_SIZE);
- if (*count < (pageslen >> PAGE_SHIFT))
- return (EFBIG);
- *count = pageslen >> PAGE_SHIFT;
/*
* Check that virtual address range is legal
* This check is somewhat bogus as on some architectures kernel
* and user do not share VA - however, it appears that all FreeBSD
* architectures define it
*/
- if (addr + len > VM_MAXUSER_ADDRESS)
+ pageslen = count * PAGE_SIZE;
+ if (addr + pageslen > VM_MAXUSER_ADDRESS)
return (EFAULT);
-
+
td = curthread;
map = &td->td_proc->p_vmspace->vm_map;
pmap = &td->td_proc->p_vmspace->vm_pmap;
pages = mp;
prot = (flags & VM_HOLD_WRITEABLE) ? VM_PROT_WRITE : VM_PROT_READ;
- bzero(pages, sizeof(vm_page_t *) * (*count));
+ bzero(pages, sizeof(vm_page_t *) * count);
retry:
-
+
/*
* First optimistically assume that all pages are resident (and R/W if for write)
* if so just mark pages as held (and dirty if for write) and return
*/
vm_page_lock_queues();
- for (pages = mp, faults = 0, va = start; va < pageslen; va += PAGE_SIZE, pages++) {
+ for (start = addr, pages = mp, faults = 0, va = addr; va < start + pageslen; va += PAGE_SIZE, pages++) {
/*
* Assure that we only hold the page once
*/
@@ -514,9 +566,10 @@
faults++;
continue;
}
+
*pages = m;
- if (flags & VM_HOLD_WRITEABLE)
- vm_page_dirty(m);
+ if (flags & VM_HOLD_WRITEABLE)
+ vm_page_dirty(m);
}
}
vm_page_unlock_queues();
@@ -546,13 +599,15 @@
error:
vm_page_lock_queues();
- for (pages = mp, va = start; va < pageslen; va += PAGE_SIZE, pages++)
+ for (pages = mp, va = start; va < start + pageslen; va += PAGE_SIZE, pages++)
if (*pages)
vm_page_unhold(*pages);
vm_page_unlock_queues();
return (EFAULT);
}
-#endif
static void
vm_fault_unhold_pages(vm_page_t *mp, int count)
@@ -567,3 +622,276 @@
vm_page_unlock_queues();
}
+/**
+ * t3_pin_pages - pin a user memory range and prepare it for DDP
+ * @tag: the busdma tag for the mapping
+ * @map: the busdma map to use
+ * @addr: the starting address
+ * @len: the length of the range
+ * @newgl: returns the pages and physical addresses of the pinned range
+ * @gl: an existing gather list, may be NULL
+ *
+ * Pins the pages in the user-space memory range [addr, addr + len) and
+ * maps them for DMA. Returns a gather list with the pinned pages and
+ * their physical addresses. If @gl is non-NULL the pages it describes
+ * are compared against the pages for [addr, addr + len), and if the
+ * existing gather list already covers the range a new list is not
+ * allocated. Returns 0 on success or an error code. On success, if
+ * a new gather list was allocated it is returned in @newgl.
+ */
+static int
+t3_pin_pages(bus_dma_tag_t tag, bus_dmamap_t map, unsigned long addr,
+ size_t len, struct ddp_gather_list **newgl,
+ const struct ddp_gather_list *gl)
+{
+ int i, err;
+ size_t pg_off;
+ unsigned int npages;
+ struct ddp_gather_list *p;
+
+ if (addr >= VM_MAXUSER_ADDRESS)
+ return (EINVAL);
+#if 0
+ if (!access_ok(VERIFY_WRITE, addr, len))
+ return (EFAULT);
+#endif
+ pg_off = addr & PAGE_MASK; /* offset within the first page */
+ npages = (pg_off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ p = malloc(sizeof(struct ddp_gather_list) + npages * sizeof(vm_page_t *),
+ M_DEVBUF, M_NOWAIT);
+ if (!p)
+ return (ENOMEM);
+
+
+ err = vm_fault_hold_user_pages(addr, p->dgl_pages, npages, VM_HOLD_WRITEABLE);
+
+ if (err)
+ goto free_gl;
+
+ if (gl && gl->dgl_offset == pg_off && gl->dgl_nelem >= npages &&
+ gl->dgl_length >= len) {
+ for (i = 0; i < npages; ++i)
+ if (p->dgl_pages[i] != gl->dgl_pages[i])
+ goto different_gl;
+ err = 0;
+ goto unpin;
+ }
+
+different_gl:
+ p->dgl_length = len;
+ p->dgl_offset = pg_off;
+ p->dgl_nelem = npages;
+#if 0
+ p->phys_addr[0] = pci_map_page(pdev, p->pages[0], pg_off,
+ PAGE_SIZE - pg_off,
+ PCI_DMA_FROMDEVICE) - pg_off;
+ for (i = 1; i < npages; ++i)
+ p->phys_addr[i] = pci_map_page(pdev, p->pages[i], 0, PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
+#endif
+
+ *newgl = p;
+ return 0;
+unpin:
+ vm_fault_unhold_pages(p->dgl_pages, npages);
+
+free_gl:
+ free(p, M_DEVBUF);
+ *newgl = NULL;
+ return err;
+}
+
+/*
+ * Return the # of page pods needed to accommodate a # of pages.
+ */
+static inline unsigned int
+pages2ppods(unsigned int pages)
+{
+ return (pages + PPOD_PAGES - 1) / PPOD_PAGES + NUM_SENTINEL_PPODS;
+}
+
+/* Max # of page pods for a buffer, enough for 1MB buffer at 4KB page size */
+#define MAX_PPODS 64U
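A quick check of the 1MB claim, as a standalone sketch: PPOD_PAGES is taken as 4 and NUM_SENTINEL_PPODS as 0 for simplicity; the real values live in cxgb_t3_ddp.h and sentinels would add a constant on top:

#include <assert.h>

#define EX_PPOD_PAGES		4	/* pages per pod, assumed */
#define EX_NUM_SENTINEL_PPODS	0	/* assumed zero here */

static unsigned int
ex_pages2ppods(unsigned int pages)
{
	return (pages + EX_PPOD_PAGES - 1) / EX_PPOD_PAGES +
	    EX_NUM_SENTINEL_PPODS;
}

int
main(void)
{
	/* 1MB / 4KB = 256 pages -> 64 page pods == MAX_PPODS. */
	assert(ex_pages2ppods(256) == 64);
	return (0);
}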
+
+/*
+ * Allocate page pods for DDP buffer 1 (the user buffer) and set up the tag in
+ * the TCB. We allocate page pods in multiples of PPOD_CLUSTER_SIZE. First we
+ * try to allocate enough page pods to accommodate the whole buffer, subject to
+ * the MAX_PPODS limit. If that fails we try to allocate PPOD_CLUSTER_SIZE page
+ * pods before failing entirely.
+ */
+static int
+alloc_buf1_ppods(struct socket *so, struct ddp_state *p,
+ unsigned long addr, unsigned int len)
+{
+ int tag, npages, nppods;
+ struct tom_data *d = TOM_DATA(TOE_DEV(so));
+
+ npages = ((addr & PAGE_MASK) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ nppods = min(pages2ppods(npages), MAX_PPODS);
+#ifdef notyet
+ nppods = ALIGN(nppods, PPOD_CLUSTER_SIZE);
+#endif
+ tag = t3_alloc_ppods(d, nppods);
+ if (tag < 0 && nppods > PPOD_CLUSTER_SIZE) {
+ nppods = PPOD_CLUSTER_SIZE;
+ tag = t3_alloc_ppods(d, nppods);
+ }
+ if (tag < 0)
+ return (ENOMEM);
+
+ p->ubuf_nppods = nppods;
+ p->ubuf_tag = tag;
+#if NUM_DDP_KBUF == 1
+ t3_set_ddp_tag(so, 1, tag << 6);
+#endif
+ return 0;
+}
+
+/*
+ * Starting offset for the user DDP buffer. A non-0 value ensures a DDP flush
+ * won't block indefinitely if there's nothing to place (which should be rare).
+ */
+#define UBUF_OFFSET 1
+
+static __inline unsigned long
+select_ddp_flags(const struct socket *so, int buf_idx,
+ int nonblock, int rcv_flags)
+{
+ if (buf_idx == 1) {
+ if (__predict_false(rcv_flags & MSG_WAITALL))
+ return V_TF_DDP_PSH_NO_INVALIDATE(1) |
+ V_TF_DDP_PUSH_DISABLE_1(1);
+ if (nonblock)
+ return V_TF_DDP_BUF1_FLUSH(1);
+
+ return V_TF_DDP_BUF1_FLUSH(!TOM_TUNABLE(TOE_DEV(so),
+ ddp_push_wait));
+ }
+
+ if (__predict_false(rcv_flags & MSG_WAITALL))
+ return V_TF_DDP_PSH_NO_INVALIDATE(1) |
+ V_TF_DDP_PUSH_DISABLE_0(1);
+ if (nonblock)
+ return V_TF_DDP_BUF0_FLUSH(1);
+
+ return V_TF_DDP_BUF0_FLUSH(!TOM_TUNABLE(TOE_DEV(so), ddp_push_wait));
+}
+
+/**
+ * setup_iovec_ppods - setup HW page pods for a user iovec
+ * @so: the associated socket
+ * @iov: the iovec
+ * @oft: additional bytes to map before the start of the buffer
+ *
+ * Pins a user iovec and sets up HW page pods for DDP into it. We allocate
+ * page pods for user buffers on the first call per socket. Afterwards we
+ * limit the buffer length to whatever the existing page pods can accommodate.
+ * Returns a negative error code or the length of the mapped buffer.
+ *
+ * The current implementation handles iovecs with only one entry.
+ */
+static int
+setup_iovec_ppods(struct socket *so, const struct iovec *iov, int oft)
+{
+ int err;
+ unsigned int len;
+ struct ddp_gather_list *gl = NULL;
+ struct toepcb *toep = sototcpcb(so)->t_toe;
+ struct ddp_state *p = &toep->tp_ddp_state;
+ unsigned long addr = (unsigned long)iov->iov_base - oft;
+
+ if (__predict_false(!p->ubuf_nppods)) {
+ err = alloc_buf1_ppods(so, p, addr, iov->iov_len + oft);
+ if (err)
+ return (-err);
+ }
+
+ len = (p->ubuf_nppods - NUM_SENTINEL_PPODS) * PPOD_PAGES * PAGE_SIZE;
+ len -= addr & PAGE_MASK;
+ if (len > M_TCB_RX_DDP_BUF0_LEN)
+ len = M_TCB_RX_DDP_BUF0_LEN;
+ len = min(len, sototcpcb(so)->rcv_wnd - 32768);
+ len = min(len, iov->iov_len + oft);
+
+ if (len <= p->kbuf[0]->dgl_length)
+ return -EINVAL;
+
+ err = t3_pin_pages(toep->tp_rx_dmat, toep->tp_dmamap, addr, len, &gl, p->ubuf);
+ if (err)
+ return (-err);
+ if (gl) {
+ if (p->ubuf)
+ t3_free_ddp_gl(p->pdev, p->ubuf);
+ p->ubuf = gl;
+ t3_setup_ppods(so, gl, pages2ppods(gl->dgl_nelem), p->ubuf_tag, len,
+ gl->dgl_offset, 0);
+ }
+ return len;
+}
+
+#define OVERLAY_MASK (V_TF_DDP_PSH_NO_INVALIDATE(1) | \
+ V_TF_DDP_BUF1_FLUSH(1) | \
+ V_TF_DDP_BUF0_FLUSH(1) | \
+ V_TF_DDP_PUSH_DISABLE_1(1) | \
+ V_TF_DDP_PUSH_DISABLE_0(1) | \
+ V_TF_DDP_INDICATE_OUT(1))
+
+/*
+ * Post a user buffer as an overlay on top of the current kernel buffer.
+ */
+int
+t3_overlay_ubuf(struct socket *so, const struct iovec *iov,
+ int nonblock, int rcv_flags, int modulate, int post_kbuf)
+{
+ int len, ubuf_idx;
+ unsigned long flags;
+ struct toepcb *toep = sototcpcb(so)->t_toe;
+ struct ddp_state *p = &toep->tp_ddp_state;
+
+ if (!p || !p->pdev)
+ return -1;
+
+ len = setup_iovec_ppods(so, iov, 0);
+ if (len < 0)
+ return len;
+
+ ubuf_idx = p->kbuf_idx;
+ p->buf_state[ubuf_idx].flags = DDP_BF_NOFLIP;
+ /* Use existing offset */
+ /* Don't need to update .gl, user buffer isn't copied. */
+ p->cur_buf = ubuf_idx;
+
+ flags = select_ddp_flags(so, ubuf_idx, nonblock, rcv_flags);
+
+ if (post_kbuf) {
+ struct ddp_buf_state *dbs = &p->buf_state[ubuf_idx ^ 1];
+
+ dbs->cur_offset = 0;
+ dbs->flags = 0;
+ dbs->gl = p->kbuf[ubuf_idx ^ 1];
+ p->kbuf_idx ^= 1;
+ flags |= p->kbuf_idx ?
+ V_TF_DDP_BUF1_VALID(1) | V_TF_DDP_PUSH_DISABLE_1(0) :
+ V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_PUSH_DISABLE_0(0);
+ }
+
+ if (ubuf_idx == 0) {
+ t3_overlay_ddpbuf(so, 0, p->ubuf_tag << 6, p->kbuf_tag[1] << 6,
+ len);
+ t3_setup_ddpbufs(so, 0, 0, p->kbuf[1]->dgl_length, 0,
+ flags,
+ OVERLAY_MASK | flags, 1);
+ } else {
+ t3_overlay_ddpbuf(so, 1, p->kbuf_tag[0] << 6, p->ubuf_tag << 6,
+ len);
+ t3_setup_ddpbufs(so, p->kbuf[0]->dgl_length, 0, 0, 0,
+ flags,
+ OVERLAY_MASK | flags, 1);
+ }
+#ifdef T3_TRACE
+ T3_TRACE5(TIDTB(so),
+ "t3_overlay_ubuf: tag %u flags 0x%x mask 0x%x ubuf_idx %d "
+ " kbuf_idx %d",
+ p->ubuf_tag, flags, OVERLAY_MASK, ubuf_idx, p->kbuf_idx);
+#endif
+ return 0;
+}
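As an aside, the tag << 6 shifts here and in alloc_buf1_ppods suggest that the low six bits of the hardware DDP tag carry the page-pod color (t3_setup_ppods takes a separate color argument). A hypothetical helper making the assumed encoding explicit; the real bit layout is defined by the T3 hardware documentation:

/* Hypothetical; inferred from the "tag << 6" uses above. */
static __inline unsigned int
ex_ddp_hw_tag(unsigned int tag, unsigned int color)
{
	return ((tag << 6) | (color & 0x3f));
}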
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#2 (text+ko) ====
@@ -1,4 +1,3 @@
-
/**************************************************************************
Copyright (c) 2007, Chelsio Inc.
@@ -96,8 +95,7 @@
unsigned int dgl_length;
unsigned int dgl_offset;
unsigned int dgl_nelem;
- vm_page_t *dgl_pages;
- bus_addr_t dgl_phys_addr[0];
+ vm_page_t dgl_pages[0];
};
struct ddp_buf_state {
@@ -161,9 +159,6 @@
int t3_alloc_ppods(struct tom_data *td, unsigned int n);
void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n);
void t3_free_ddp_gl(struct pci_dev *pdev, struct ddp_gather_list *gl);
-int t3_pin_pages(struct pci_dev *pdev, unsigned long uaddr, size_t len,
- struct ddp_gather_list **newgl,
- const struct ddp_gather_list *gl);
int t3_ddp_copy(const struct mbuf *skb, int offset, struct iovec *to,
int len);
//void t3_repost_kbuf(struct socket *so, int modulate, int activate);
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_tom.c#2 (text+ko) ====
@@ -39,6 +39,7 @@
#include <sys/eventhandler.h>
#include <sys/mbuf.h>
#include <sys/module.h>
+#include <sys/condvar.h>
#include <sys/mutex.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
@@ -152,6 +153,7 @@
{
bzero(toep, sizeof(*toep));
toep->tp_refcount = 1;
+ cv_init(&toep->tp_cv, "toep cv");
}
void