PERFORCE change 134128 for review
Kip Macy
kmacy at FreeBSD.org
Sat Jan 26 00:00:17 PST 2008
http://perforce.freebsd.org/chv.cgi?CH=134128
Change 134128 by kmacy at kmacy:storage:toehead on 2008/01/26 07:59:26
first cut at implementing zero-copy soreceive
Affected files ...
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#7 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_ddp.c#2 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#7 edit
Differences ...
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#7 (text+ko) ====
@@ -101,6 +101,7 @@
#ifndef PG_FRAME
#define PG_FRAME ~PAGE_MASK
#endif
+#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
void
t3_init_socket_ops(void)
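For reference, SBLOCKWAIT above mirrors the macro the stock soreceive() path uses: MSG_DONTWAIT maps to a non-sleeping sockbuf lock request. A minimal sketch of the locking pattern t3_soreceive() relies on below (the function name here is hypothetical, not part of the change):

    static int
    rcvbuf_lock_example(struct socket *so, int flags)
    {
            int err;

            /* Sleep for the lock unless the caller passed MSG_DONTWAIT. */
            err = sblock(&so->so_rcv, SBLOCKWAIT(flags));
            if (err)
                    return (err);
            /* ... consume data from so->so_rcv ... */
            sbunlock(&so->so_rcv);
            return (0);
    }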
@@ -251,7 +252,6 @@
* can be posted without closing the window in the middle of DDP (checked
* when the connection is offloaded)
*/
-#ifdef notyet
static int
so_should_ddp(const struct toepcb *toep, int last_recv_len)
{
@@ -260,7 +260,67 @@
toep->tp_tp->rcv_wnd >
(TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) + DDP_RSVD_WIN);
}
-#endif
+
+static inline int
+is_ddp(const struct mbuf *m)
+{
+ return (m->m_flags & M_DDP);
+}
+
+static inline int
+is_ddp_psh(const struct mbuf *m)
+{
+ return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH);
+}
+
+static int
+m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
+{
+ int curlen, err = 0;
+ caddr_t buf;
+
+ while (m && len) {
+ buf = mtod(m, caddr_t);
+ curlen = m->m_len;
+ if (offset < curlen) {
+ curlen -= offset;
+ buf += offset;
+ offset = 0;
+ } else {
+ offset -= curlen;
+ m = m->m_next;
+ continue;
+ }
+
+ /* uiomove() copies and advances the uio; consume what was copied. */
+ err = uiomove(buf, min(len, curlen), uio);
+ if (err)
+ return (err);
+ len -= min(len, curlen);
+ m = m->m_next;
+ }
+ return (err);
+}
+
+/*
+ * Copy data from an mbuf chain to a uio. Deals with RX_DATA, which carries
+ * the data in the mbuf body, and with RX_DATA_DDP, which places the data in
+ * a DDP buffer.
+ */
+static inline int
+copy_data(const struct mbuf *m, int offset, int len, struct uio *uio)
+{
+ struct iovec *to = uio->uio_iov;
+
+ if (__predict_true(!is_ddp(m))) /* RX_DATA */
+ return m_uiomove(m, offset, len, uio);
+ if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */
+ to->iov_len -= len;
+ to->iov_base = ((caddr_t)to->iov_base) + len;
+ uio->uio_iov = to;
+ return (0);
+ }
+ return t3_ddp_copy(m, offset, uio, len); /* kernel DDP */
+}
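The helpers above split delivery into three paths: plain RX_DATA payload is copied out of the mbuf chain by m_uiomove(), kernel-buffer DDP is copied out by t3_ddp_copy(), and user-buffer DDP (DDP_BF_NOCOPY) needs no copy at all because the adapter already wrote the payload into user memory, so only the iovec is advanced. A hedged usage sketch, with variable names as in t3_soreceive() below:

    /*
     * Deliver 'avail' bytes that begin 'offset' bytes into mbuf 'm':
     *  - !is_ddp(m): RX_DATA, m_uiomove() copies from the chain
     *  - DDP_BF_NOCOPY set: user DDP, data already in place; only
     *    uio->uio_iov is advanced
     *  - otherwise: kernel DDP, t3_ddp_copy() copies it out
     */
    err = copy_data(m, offset, avail, uio);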
static void
cxgb_wait_dma_completion(struct toepcb *toep)
@@ -449,34 +509,258 @@
static int
-t3_soreceive(struct socket *so, struct uio *uio)
+t3_soreceive(struct socket *so, int *flagsp, struct uio *uio)
{
-#ifdef notyet
- int i, rv, count, hold_resid, sent, iovcnt;
- struct iovec iovtmp[TMP_IOV_MAX], *iovtmpp, *iov;
struct tcpcb *tp = sototcpcb(so);
struct toepcb *toep = tp->t_toe;
struct mbuf *m;
- struct uio uiotmp;
+ uint32_t offset;
+ int err, flags, avail, len, buffers_freed = 0, copied = 0;
+ int target; /* Read at least this many bytes */
+ long timeo;
+ int user_ddp_ok, user_ddp_pending = 0;
+ struct ddp_state *p;
+ struct inpcb *inp = sotoinpcb(so);
+
+ flags = flagsp ? (*flagsp &~ MSG_EOR) : 0;
+
+ err = sblock(&so->so_rcv, SBLOCKWAIT(flags));
+ if (err)
+ return (err);
+restart:
+ SOCKBUF_LOCK(&so->so_rcv);
+ len = uio->uio_resid;
+ m = so->so_rcv.sb_mb;
+ target = (flags & MSG_WAITALL) ? min(len, so->so_rcv.sb_hiwat) : so->so_rcv.sb_lowat;
+ timeo = so->so_rcv.sb_timeo;
+ p = &toep->tp_ddp_state;
+ user_ddp_ok = p->ubuf_ddp_ready;
+ p->cancel_ubuf = 0;
+
+ /*
+ * XXX check timeo/signal/urgent
+ */
+ if (m)
+ goto got_mbuf;
+
+ /* empty receive queue */
+ if (copied >= target && /* !sk->sk_backlog.tail && */
+ !user_ddp_pending)
+ goto done;
+ if (copied) {
+ if (so->so_error || tp->t_state == TCPS_CLOSED ||
+ (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)))
+ goto done;
+ } else {
+ if (so->so_state & SS_NOFDREF)
+ goto done;
+ if (so->so_error) {
+ err = so->so_error;
+ so->so_error = 0;
+ goto done;
+ }
+ if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
+ goto done;
+ if (tp->t_state == TCPS_CLOSED) {
+ err = ENOTCONN;
+ goto done;
+ }
+ }
+ if (so->so_rcv.sb_mb && !user_ddp_pending) {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ INP_LOCK(inp);
+ t3_cleanup_rbuf(tp);
+ INP_UNLOCK(inp);
+ goto restart;
+ }
+ if (p->ubuf && user_ddp_ok && !user_ddp_pending &&
+ uio->uio_iov->iov_len > p->kbuf[0]->dgl_length &&
+ p->ubuf_ddp_ready) {
+ user_ddp_pending =
+ !t3_overlay_ubuf(so, uio, (so->so_state & SS_NBIO), flags, 1, 1);
+ if (user_ddp_pending) {
+ p->kbuf_posted++;
+ user_ddp_ok = 0;
+ }
+ }
+ if (user_ddp_pending) {
+ /* One shot at DDP if we already have enough data */
+ if (copied >= target)
+ user_ddp_ok = 0;
+ if ((err = sbwait(&so->so_rcv)) != 0)
+ goto done;
+ /* XXX for timers to work: await_ddp_completion(sk, flags, &timeo); */
+ } else if (copied >= target)
+ goto done;
+ else {
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ INP_LOCK(inp);
+ t3_cleanup_rbuf(tp);
+ INP_UNLOCK(inp);
+ if ((err = sbwait(&so->so_rcv)) != 0)
+ goto done;
+ }
+ goto restart;
+got_mbuf:
+ if (m->m_pkthdr.len == 0) {
+ if ((m->m_ddp_flags & DDP_BF_NOCOPY) == 0)
+ panic("empty mbuf and NOCOPY not set\n");
+ user_ddp_pending = 0;
+ sbfree(&so->so_rcv, m);
+ m = so->so_rcv.sb_mb = m_free(m);
+ goto done;
+ }
+ offset = toep->tp_copied_seq - m->m_seq;
+ if (offset > m->m_pkthdr.len)
+ panic("t3_soreceive: BUG: OFFSET > LEN seq 0x%x "
+ "skb->len %d flags 0x%x", m->m_seq,
+ m->m_pkthdr.len, m->m_ddp_flags);
+ avail = m->m_pkthdr.len - offset;
+ if (len < avail) {
+ if (is_ddp(m) && (m->m_ddp_flags & DDP_BF_NOCOPY))
+ panic("bad state in t3_soreceive\n");
+ avail = len;
+ }
+#ifdef notyet
/*
- * Events requiring iteration:
- * - number of pages exceeds max hold pages for process or system
- * - number of pages exceeds maximum sg entries for a single WR
- *
- * We're limited to holding 128 pages at once - and we're limited to
- * 34 SG entries per work request, but each SG entry can be any number
- * of contiguous pages
- *
+ * Check if the data we are preparing to copy contains urgent
+ * data. Either stop short of urgent data or skip it if it's
+ * first and we are not delivering urgent data inline.
+ */
+ if (unlikely(tp->urg_data)) {
+ u32 urg_offset = tp->urg_seq - tp->copied_seq;
+
+ if (urg_offset < avail) {
+ if (urg_offset) {
+ /* stop short of the urgent data */
+ avail = urg_offset;
+ } else if (!sock_flag(sk, SOCK_URGINLINE)) {
+ /* First byte is urgent, skip */
+ tp->copied_seq++;
+ offset++;
+ avail--;
+ if (!avail)
+ goto skip_copy;
+ }
+ }
+ }
+#endif
+ if (is_ddp_psh(m) || offset) {
+ user_ddp_ok = 0;
+#ifdef T3_TRACE
+ T3_TRACE0(TIDTB(so), "t3_soreceive: PSH");
+#endif
+ }
+
+ if (user_ddp_ok && !user_ddp_pending &&
+ /*
+ * XXX
+ */
+#ifdef notyet
+ uio->uio_iovlen > p->kbuf[0]->length &&
+#endif
+ p->ubuf_ddp_ready) {
+ user_ddp_pending =
+ !t3_overlay_ubuf(so, uio, (so->so_state & SS_NBIO), flags, 1, 1);
+ if (user_ddp_pending) {
+ p->kbuf_posted++;
+ user_ddp_ok = 0;
+ }
+ }
+
+ /*
+ * If MSG_TRUNC is specified the data is discarded.
+ * XXX need to check pr_atomic
+ */
+ if (__predict_true(!(flags & MSG_TRUNC))) {
+ if ((err = copy_data(m, offset, avail, uio)) != 0) {
+ err = EFAULT;
+ goto done;
+ }
+ }
+
+ toep->tp_copied_seq += avail;
+ copied += avail;
+ len -= avail;
+#ifdef notyet
+skip_copy:
+ if (tp->urg_data && after(tp->copied_seq, tp->urg_seq))
+ tp->urg_data = 0;
+#endif
+ /*
+ * If the buffer is fully consumed free it. If it's a DDP
+ * buffer also handle any events it indicates.
+ */
+ if (avail + offset >= m->m_pkthdr.len) {
+ unsigned int fl = m->m_ddp_flags;
+ int got_psh = 0;
+
+ if (p->ubuf != NULL && is_ddp(m) && (fl & 1)) {
+ if (is_ddp_psh(m) && user_ddp_pending)
+ got_psh = 1;
+
+ if (fl & DDP_BF_NOCOPY)
+ user_ddp_pending = 0;
+ else {
+ p->kbuf_posted--;
+ p->ubuf_ddp_ready = 1;
+ }
+ }
+ sbfree(&so->so_rcv, m);
+ m = so->so_rcv.sb_mb = m_free(m);
+ buffers_freed++;
+
+ if ((so->so_rcv.sb_mb == NULL) && got_psh)
+ goto done;
+ }
+ if (len > 0)
+ goto restart;
+
+done:
+ /*
+ * If we can still receive, decide what to do in preparation for the
+ * next receive. Note that the disconnected state flags are set if the
+ * connection transitioned to CLOSE but not if it was in that state to
+ * begin with.
*/
+ if (__predict_true((so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED)) == 0)) {
+ if (user_ddp_pending) {
+ user_ddp_ok = 0;
+ t3_cancel_ubuf(toep);
+ if (so->so_rcv.sb_mb) {
+ if (copied < 0)
+ copied = 0;
+ goto restart;
+ }
+ user_ddp_pending = 0;
+ }
+ if (p->kbuf_posted == 0) {
+#ifdef T3_TRACE
+ T3_TRACE0(TIDTB(so),
+ "chelsio_recvmsg: about to exit, repost kbuf");
+#endif
+
+ t3_post_kbuf(so, 1);
+ p->kbuf_posted++;
+ } else if (so_should_ddp(toep, copied)) {
+ t3_enter_ddp(so, TOM_TUNABLE(TOE_DEV(so),
+ ddp_copy_limit), 0);
+ p->kbuf_posted = 1;
+ }
+ }
+ if (buffers_freed)
+ t3_cleanup_rbuf(tp);
+#ifdef T3_TRACE
+ T3_TRACE5(TIDTB(so),
+ "chelsio_recvmsg <-: copied %d len %d buffers_freed %d "
+ "kbuf_posted %d user_ddp_pending %u",
+ copied, len, buffers_freed, p ? p->kbuf_posted : -1,
+ user_ddp_pending);
+#endif
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ sbunlock(&so->so_rcv);
- uiotmp = *uio;
- iovcnt = uio->uio_iovcnt;
- iov = uio->uio_iov;
- sent = 0;
- re;
-#endif
- return (0);
+ return (err);
}
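The loop above returns once 'target' bytes have been delivered. The target computation follows the stock soreceive() rule; a compilable sketch of just that decision (the helper name is hypothetical):

    /*
     * MSG_WAITALL waits for the full request, capped at the receive
     * buffer size so the request can ever be satisfied; otherwise the
     * sockbuf low-water mark applies.
     */
    static int
    recv_target(int flags, int resid, const struct sockbuf *sb)
    {
            return ((flags & MSG_WAITALL) ?
                min(resid, sb->sb_hiwat) : sb->sb_lowat);
    }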
static int
@@ -484,9 +768,11 @@
struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
struct toedev *tdev;
- int rv, zcopy_thres, zcopy_enabled;
+ int rv, zcopy_thres, zcopy_enabled, flags;
struct tcpcb *tp = sototcpcb(so);
+ flags = flagsp ? *flagsp &~ MSG_EOR : 0;
+
/*
* In order to use DMA direct from userspace the following
* conditions must be met:
@@ -500,14 +786,16 @@
* - iovcnt is 1
*
*/
- if (tp->t_flags & TF_TOE) {
+ if ((tp->t_flags & TF_TOE) && ((flags & (MSG_WAITALL|MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0)
+ && ((so->so_state & SS_NBIO) == 0) && (uio->uio_iovcnt == 1) &&
+ ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) && (mp0 == NULL)) {
tdev = TOE_DEV(so);
zcopy_thres = TOM_TUNABLE(tdev, ddp_thres);
zcopy_enabled = TOM_TUNABLE(tdev, ddp);
if ((uio->uio_resid > zcopy_thres) &&
(uio->uio_iovcnt == 1) && ((so->so_state & SS_NBIO) == 0)
&& zcopy_enabled) {
- rv = t3_soreceive(so, uio);
+ rv = t3_soreceive(so, flagsp, uio);
if (rv != EAGAIN)
return (rv);
}
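Taken together, the gate added above only attempts the zero-copy path when every precondition for direct DMA into user memory holds; the inner test then re-checks the size threshold and the ddp tunable. The same socket-level checks collected into one predicate (a sketch; the helper name is hypothetical and not part of the change):

    static int
    can_try_user_ddp(struct socket *so, struct uio *uio, int flags,
        struct mbuf **mp0)
    {
            struct tcpcb *tp = sototcpcb(so);

            return ((tp->t_flags & TF_TOE) != 0 &&
                (flags & (MSG_WAITALL|MSG_OOB|MSG_PEEK|MSG_DONTWAIT)) == 0 &&
                (so->so_state & SS_NBIO) == 0 &&
                uio->uio_iovcnt == 1 &&
                (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 &&
                mp0 == NULL);
    }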
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_ddp.c#2 (text+ko) ====
@@ -326,9 +326,9 @@
}
/**
- * setup_iovec_ppods - setup HW page pods for a user iovec
+ * setup_uio_ppods - setup HW page pods for a user uio
* @sk: the associated socket
- * @iov: the iovec
+ * @uio: the uio describing the user buffer
* @oft: additional bytes to map before the start of the buffer
*
* Pins a user iovec and sets up HW page pods for DDP into it. We allocate
@@ -339,13 +339,14 @@
* The current implementation handles iovecs with only one entry.
*/
static int
-setup_iovec_ppods(struct socket *so, const struct iovec *iov, int oft, int *length)
+setup_uio_ppods(struct socket *so, const struct uio *uio, int oft, int *length)
{
int err;
unsigned int len;
struct ddp_gather_list *gl = NULL;
struct toepcb *toep = sototcpcb(so)->t_toe;
struct ddp_state *p = &toep->tp_ddp_state;
+ struct iovec *iov = uio->uio_iov;
unsigned long addr = (unsigned long)iov->iov_base - oft;
if (__predict_false(!p->ubuf_nppods)) {
@@ -424,7 +425,7 @@
* Post a user buffer as an overlay on top of the current kernel buffer.
*/
int
-t3_overlay_ubuf(struct socket *so, const struct iovec *iov,
+t3_overlay_ubuf(struct socket *so, const struct uio *uio,
int nonblock, int rcv_flags, int modulate, int post_kbuf)
{
int err, len, ubuf_idx;
@@ -435,7 +436,7 @@
if (p->ubuf == NULL)
return (EINVAL);
- err = setup_iovec_ppods(so, iov, 0, &len);
+ err = setup_uio_ppods(so, uio, 0, &len);
if (err)
return (err);
@@ -481,67 +482,6 @@
return (0);
}
-static inline int
-is_ddp(const struct mbuf *m)
-{
- return (m->m_flags & M_DDP);
-}
-
-static inline int
-is_ddp_psh(const struct mbuf *m)
-{
- return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH);
-}
-
-static int
-m_uiomove(const struct mbuf *m, int offset, int len, struct uio *uio)
-{
- int curlen, err = 0;
- caddr_t buf;
-
- while (m && len) {
- buf = mtod(m, caddr_t);
- curlen = m->m_len;
- if (offset < curlen) {
- curlen -= offset;
- buf += offset;
- offset = 0;
- } else {
- offset -= curlen;
- m = m->m_next;
- continue;
- }
-
- err = uiomove_frombuf(buf, min(len, curlen), uio);
- if (err)
- return (err);
- len -= min(len, m->m_len);
- m = m->m_next;
- }
- return (err);
-}
-
-/*
- * Copy data from an sk_buff to an iovec. Deals with RX_DATA, which carry the
- * data in the sk_buff body, and with RX_DATA_DDP, which place the data in a
- * DDP buffer.
- */
-static inline int
-copy_data(const struct mbuf *m, int offset, struct uio *uio, int len)
-{
- struct iovec *to = uio->uio_iov;
-
- if (__predict_true(!is_ddp(m))) /* RX_DATA */
- return m_uiomove(m, offset, len, uio);
- if (__predict_true(m->m_ddp_flags & DDP_BF_NOCOPY)) { /* user DDP */
- to->iov_len -= len;
- to->iov_base = ((caddr_t)to->iov_base) + len;
- uio->uio_iov = to;
- return (0);
- }
- return t3_ddp_copy(m, offset, uio, len); /* kernel DDP */
-}
-
/*
* Clean up DDP state that needs to survive until socket close time, such as the
* DDP buffers. The buffers are already unmapped at this point as unmapping
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#7 (text+ko) ====
@@ -153,14 +153,13 @@
int t3_alloc_ppods(struct tom_data *td, unsigned int n, int *tag);
void t3_free_ppods(struct tom_data *td, unsigned int tag, unsigned int n);
void t3_free_ddp_gl(struct ddp_gather_list *gl);
-int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio,
- int len);
+int t3_ddp_copy(const struct mbuf *m, int offset, struct uio *uio, int len);
//void t3_repost_kbuf(struct socket *so, int modulate, int activate);
void t3_post_kbuf(struct socket *so, int modulate);
-int t3_post_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
+int t3_post_ubuf(struct socket *so, const struct uio *uio, int nonblock,
int rcv_flags, int modulate, int post_kbuf);
void t3_cancel_ubuf(struct toepcb *toep);
-int t3_overlay_ubuf(struct socket *so, const struct iovec *iov, int nonblock,
+int t3_overlay_ubuf(struct socket *so, const struct uio *uio, int nonblock,
int rcv_flags, int modulate, int post_kbuf);
int t3_enter_ddp(struct socket *so, unsigned int kbuf_size, unsigned int waitall);
void t3_cleanup_ddp(struct toepcb *toep);
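The prototype changes above follow from passing the whole uio instead of a bare iovec: the uio carries the iovec array plus count and residual bookkeeping, so callees stay consistent with the caller's progress. A minimal sketch of the only case the DDP code accepts (hypothetical helper, not part of the change):

    static __inline size_t
    ddp_ubuf_len(const struct uio *uio)
    {
            /* The DDP path handles single-entry iovecs only. */
            if (uio->uio_iovcnt != 1)
                    return (0);
            return (uio->uio_iov[0].iov_len);
    }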