PERFORCE change 133977 for review
Kip Macy
kmacy at FreeBSD.org
Wed Jan 23 23:36:02 PST 2008
http://perforce.freebsd.org/chv.cgi?CH=133977
Change 133977 by kmacy at kmacy:storage:toehead on 2008/01/24 07:35:33
import cpl_io ddp support
Affected files ...
.. //depot/projects/toehead/sys/dev/cxgb/sys/mvec.h#6 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#5 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#4 edit
.. //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#4 edit
Differences ...
==== //depot/projects/toehead/sys/dev/cxgb/sys/mvec.h#6 (text+ko) ====
@@ -48,9 +48,10 @@
extern int cxgb_mbufs_outstanding;
extern int cxgb_pack_outstanding;
-#define mtomv(m) ((struct mbuf_vec *)((m)->m_pktdat))
-#define M_IOVEC 0x100000 /* mbuf immediate data area is used for cluster ptrs */
-#define EXT_PHYS 10 /* physical/bus address */
+#define mtomv(m) ((struct mbuf_vec *)((m)->m_pktdat))
+#define M_IOVEC 0x100000 /* mbuf immediate data area is used for cluster ptrs */
+#define M_DDP 0x200000 /* direct data placement mbuf */
+#define EXT_PHYS 10 /* physical/bus address */
/*
@@ -74,6 +75,11 @@
#define EXT_CLIOVEC 9
#define EXT_JMPIOVEC 10
+#define m_cur_offset m_ext.ext_size /* override to provide ddp offset */
+#define m_seq m_pkthdr.csum_data /* stored sequence */
+#define m_ddp_gl m_ext.ext_buf /* ddp list */
+#define m_ddp_flags m_pkthdr.csum_flags /* ddp flags */
+#define m_ulp_mode m_ext.ext_type /* upper level protocol */
extern uma_zone_t zone_miovec;
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#5 (text+ko) ====
@@ -478,6 +478,14 @@
return (credits);
}
+/*
+ * Returns true if a socket cannot accept new Rx data.
+ */
+static inline int
+so_no_receive(const struct socket *so)
+{
+ return (so->so_state & (SS_ISDISCONNECTED|SS_ISDISCONNECTING));
+}
/*
* Set of states for which we should return RX credits.
@@ -1465,6 +1473,253 @@
}
/*
+ * Returns true if we need to explicitly request RST when we receive new data
+ * on an RX-closed connection.
+ */
+static inline int
+need_rst_on_excess_rx(const struct toepcb *toep)
+{
+ return (1);
+}
+
+/*
+ * Handles Rx data that arrives in a state where the socket isn't accepting
+ * new data.
+ */
+static void
+handle_excess_rx(struct toepcb *toep, struct mbuf *m)
+{
+
+ if (need_rst_on_excess_rx(toep) && !(toep->tp_flags & TP_ABORT_SHUTDOWN))
+ t3_send_reset(toep);
+ m_freem(m);
+}
+
+/*
+ * Process a get_tcb_rpl as a DDP completion (similar to RX_DDP_COMPLETE)
+ * by getting the DDP offset from the TCB.
+ */
+static void
+tcb_rpl_as_ddp_complete(struct toepcb *toep, struct mbuf *m)
+{
+ struct ddp_state *q = &toep->tp_ddp_state;
+ struct ddp_buf_state *bsp;
+ struct cpl_get_tcb_rpl *hdr;
+ unsigned int ddp_offset;
+ struct socket *so;
+ struct tcpcb *tp;
+
+ uint64_t t;
+ __be64 *tcb;
+
+
+ /* Note that we only account for CPL_GET_TCB issued by the DDP code. We
+ * really need a cookie in order to dispatch the RPLs.
+ */
+ q->get_tcb_count--;
+
+ /* It is possible that a previous CPL already invalidated UBUF DDP
+ * and moved the cur_buf idx and hence no further processing of this
+ * skb is required. However, the app might be sleeping on
+ * !q->get_tcb_count and we need to wake it up.
+ */
+ if (q->cancel_ubuf && !t3_ddp_ubuf_pending(toep)) {
+ struct socket *so = toeptoso(toep);
+
+ m_freem(m);
+ if (__predict_true((so->so_state & SS_NOFDREF) == 0))
+ sorwakeup(so);
+
+ return;
+ }
+
+ bsp = &q->buf_state[q->cur_buf];
+ hdr = cplhdr(m);
+ tcb = (__be64 *)(hdr + 1);
+ if (q->cur_buf == 0) {
+ t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF0_OFFSET) / 2]);
+ ddp_offset = t >> (32 + S_TCB_RX_DDP_BUF0_OFFSET);
+ } else {
+ t = be64toh(tcb[(31 - W_TCB_RX_DDP_BUF1_OFFSET) / 2]);
+ ddp_offset = t >> S_TCB_RX_DDP_BUF1_OFFSET;
+ }
+ ddp_offset &= M_TCB_RX_DDP_BUF0_OFFSET;
+
+#ifdef T3_TRACE
+ T3_TRACE3(TIDTB(so),
+ "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u ddp_offset %u",
+ tp->rcv_nxt, q->cur_buf, ddp_offset);
+#endif
+
+#if 0
+{
+ unsigned int ddp_flags, rcv_nxt, rx_hdr_offset, buf_idx;
+
+ t = be64toh(tcb[(31 - W_TCB_RX_DDP_FLAGS) / 2]);
+ ddp_flags = (t >> S_TCB_RX_DDP_FLAGS) & M_TCB_RX_DDP_FLAGS;
+
+ t = be64toh(tcb[(31 - W_TCB_RCV_NXT) / 2]);
+ rcv_nxt = t >> S_TCB_RCV_NXT;
+ rcv_nxt &= M_TCB_RCV_NXT;
+
+ t = be64toh(tcb[(31 - W_TCB_RX_HDR_OFFSET) / 2]);
+ rx_hdr_offset = t >> (32 + S_TCB_RX_HDR_OFFSET);
+ rx_hdr_offset &= M_TCB_RX_HDR_OFFSET;
+
+ T3_TRACE2(TIDTB(sk),
+ "tcb_rpl_as_ddp_complete: DDP FLAGS 0x%x dma up to 0x%x",
+ ddp_flags, rcv_nxt - rx_hdr_offset);
+ T3_TRACE4(TB(q),
+ "tcb_rpl_as_ddp_complete: rcvnxt 0x%x hwbuf %u cur_offset %u cancel %u",
+ tp->rcv_nxt, q->cur_buf, bsp->cur_offset, q->cancel_ubuf);
+ T3_TRACE3(TB(q),
+ "tcb_rpl_as_ddp_complete: TCB rcvnxt 0x%x hwbuf 0x%x ddp_offset %u",
+ rcv_nxt - rx_hdr_offset, ddp_flags, ddp_offset);
+ T3_TRACE2(TB(q),
+ "tcb_rpl_as_ddp_complete: flags0 0x%x flags1 0x%x",
+ q->buf_state[0].flags, q->buf_state[1].flags);
+
+}
+#endif
+ m->m_cur_offset = bsp->cur_offset;
+ bsp->cur_offset = ddp_offset;
+ m->m_pkthdr.len = ddp_offset - m->m_cur_offset;
+ so = toeptoso(toep);
+
+ if (__predict_false(so_no_receive(so) && m->m_pkthdr.len)) {
+ handle_excess_rx(toep, m);
+ return;
+ }
+
+#ifdef T3_TRACE
+ if ((int)m->m_pkthdr.len < 0) {
+ t3_ddp_error(so, "tcb_rpl_as_ddp_complete: neg len");
+ }
+#endif
+ if (bsp->flags & DDP_BF_NOCOPY) {
+#ifdef T3_TRACE
+ T3_TRACE0(TB(q),
+ "tcb_rpl_as_ddp_complete: CANCEL UBUF");
+
+ if (!q->cancel_ubuf && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ printk("!cancel_ubuf");
+ t3_ddp_error(sk, "tcb_rpl_as_ddp_complete: !cancel_ubuf");
+ }
+#endif
+ m->m_ddp_flags = DDP_BF_PSH | DDP_BF_NOCOPY | 1;
+ bsp->flags &= ~DDP_BF_NOCOPY;
+ q->cur_buf ^= 1;
+ } else if (bsp->flags & DDP_BF_NOFLIP) {
+
+ m->m_ddp_flags = 1; /* always a kernel buffer */
+
+ /* now HW buffer carries a user buffer */
+ bsp->flags &= ~DDP_BF_NOFLIP;
+ bsp->flags |= DDP_BF_NOCOPY;
+
+ /* It is possible that the CPL_GET_TCB_RPL doesn't indicate
+ * any new data in which case we're done. If in addition the
+ * offset is 0, then there wasn't a completion for the kbuf
+ * and we need to decrement the posted count.
+ */
+ if (m->m_pkthdr.len == 0) {
+ if (ddp_offset == 0)
+ q->kbuf_posted--;
+ panic("length not set");
+ m_free(m);
+ return;
+ }
+ } else {
+ /* This reply is for a CPL_GET_TCB_RPL to cancel the UBUF DDP,
+ * but it got here way late and nobody cares anymore.
+ */
+ m_free(m);
+ return;
+ }
+
+ tp = toep->tp_tp;
+ m->m_ddp_gl = (unsigned char *)bsp->gl;
+ m->m_seq = tp->rcv_nxt;
+ tp->rcv_nxt += m->m_pkthdr.len;
+ tp->t_rcvtime = ticks;
+
+#if 0
+ skb->h.th = tcphdr_skb->h.th;
+#endif
+#ifdef T3_TRACE
+ T3_TRACE3(TB(q),
+ "tcb_rpl_as_ddp_complete: seq 0x%x hwbuf %u lskb->len %u",
+ m->m_seq, q->cur_buf, m->m_pkthdr.len);
+#endif
+#ifdef notyet
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+#endif
+ if (__predict_true((so->so_state & SS_NOFDREF) == 0))
+ sorwakeup(so);
+}
+
+/*
+ * Process a CPL_GET_TCB_RPL. These can also be generated by the DDP code,
+ * in that case they are similar to DDP completions.
+ */
+static int
+do_get_tcb_rpl(struct t3cdev *cdev, struct mbuf *m, void *ctx)
+{
+ struct toepcb *toep = (struct toepcb *)ctx;
+
+ /* OK if socket doesn't exist */
+ if (toep == NULL)
+ return (CPL_RET_BUF_DONE);
+
+ tcb_rpl_as_ddp_complete(toep, m);
+
+ return (0);
+}
+
+static void
+handle_ddp_data(struct toepcb *toep, struct mbuf *m)
+{
+ struct tcpcb *tp = toep->tp_tp;
+ struct ddp_state *q;
+ struct ddp_buf_state *bsp;
+ struct cpl_rx_data *hdr = cplhdr(m);
+ unsigned int rcv_nxt = ntohl(hdr->seq);
+
+ if (tp->rcv_nxt == rcv_nxt)
+ return;
+
+ q = &toep->tp_ddp_state;
+ bsp = &q->buf_state[q->cur_buf];
+ m->m_pkthdr.len = rcv_nxt - tp->rcv_nxt;
+
+#ifdef T3_TRACE
+ if ((int)m->m_pkthdr.len < 0) {
+ t3_ddp_error(so, "handle_ddp_data: neg len");
+ }
+#endif
+
+ m->m_ddp_gl = (unsigned char *)bsp->gl;
+ m->m_cur_offset = bsp->cur_offset;
+ m->m_ddp_flags = DDP_BF_PSH | (bsp->flags & DDP_BF_NOCOPY) | 1;
+ if (bsp->flags & DDP_BF_NOCOPY)
+ bsp->flags &= ~DDP_BF_NOCOPY;
+
+ m->m_seq = tp->rcv_nxt;
+ tp->rcv_nxt = rcv_nxt;
+ bsp->cur_offset += m->m_pkthdr.len;
+ if (!(bsp->flags & DDP_BF_NOFLIP))
+ q->cur_buf ^= 1;
+ tp->t_rcvtime = ticks;
+#ifdef notyet
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+#endif
+ /* For now, don't re-enable DDP after a connection fell out of DDP
+ * mode.
+ */
+ q->ubuf_ddp_ready = 0;
+}
+
+/*
* Process new data received for a connection.
*/
static void
@@ -1477,26 +1732,25 @@
INP_LOCK(tp->t_inpcb);
-#ifdef notyet
- if (__predict_false(sk_no_receive(sk))) {
- handle_excess_rx(so, skb);
+ if (__predict_false(so_no_receive(so))) {
+ handle_excess_rx(toep, m);
return;
}
- if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
- handle_ddp_data(so, skb);
+ if (toep->tp_ulp_mode == ULP_MODE_TCPDDP)
+ handle_ddp_data(toep, m);
+
+ m->m_seq = ntohl(hdr->seq);
+ m->m_ddp_flags = 0;
+ m->m_ulp_mode = 0; /* for iSCSI */
- TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
- TCP_SKB_CB(skb)->flags = 0;
- skb_ulp_mode(skb) = 0; /* for iSCSI */
-#endif
#if VALIDATE_SEQ
- if (__predict_false(TCP_SKB_CB(skb)->seq != tp->rcv_nxt)) {
- printk(KERN_ERR
+ if (__predict_false(m->m_seq != tp->rcv_nxt)) {
+ log(LOG_ERR,
"%s: TID %u: Bad sequence number %u, expected %u\n",
- TOE_DEV(sk)->name, TID(tp), TCP_SKB_CB(skb)->seq,
+ TOE_DEV(toeptoso(toep))->name, toep->tp_tid, m->m_seq,
tp->rcv_nxt);
- __kfree_skb(skb);
+ m_freem(m);
return;
}
#endif
@@ -1528,8 +1782,8 @@
toep->tp_enqueued_bytes += m->m_pkthdr.len;
#ifdef T3_TRACE
T3_TRACE2(TIDTB(sk),
- "new_rx_data: seq 0x%x len %u",
- TCP_SKB_CB(skb)->seq, skb->len);
+ "new_rx_data: seq 0x%x len %u",
+ m->m_seq, m->m_pkthdr.len);
#endif
SOCKBUF_LOCK(&so->so_rcv);
if (sb_notify(&so->so_rcv))
@@ -1567,21 +1821,20 @@
}
static void
-new_rx_data_ddp(struct socket *so, struct mbuf *m)
+new_rx_data_ddp(struct toepcb *toep, struct mbuf *m)
{
- struct tcpcb *tp = sototcpcb(so);
- struct toepcb *toep = tp->t_toe;
+ struct tcpcb *tp;
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_rx_data_ddp *hdr;
unsigned int ddp_len, rcv_nxt, ddp_report, end_offset, buf_idx;
+ struct socket *so = toeptoso(toep);
-#ifdef notyet
- if (unlikely(sk_no_receive(sk))) {
- handle_excess_rx(so, m);
+ if (__predict_false(so_no_receive(so))) {
+ handle_excess_rx(toep, m);
return;
}
-#endif
+
tp = sototcpcb(so);
q = &toep->tp_ddp_state;
hdr = cplhdr(m);
@@ -1604,7 +1857,7 @@
rcv_nxt = ntohl(hdr->seq) + ddp_len;
/*
- * Overload to store old rcv_next
+ * Overload to store old RCV_NXT
*/
m->m_pkthdr.csum_data = tp->rcv_nxt;
tp->rcv_nxt = rcv_nxt;
@@ -1622,15 +1875,8 @@
* account for page pod's pg_offset.
*/
end_offset = G_DDP_OFFSET(ddp_report) + ddp_len;
-#ifdef notyet
- TCP_SKB_CB(skb)->when = end_offset - skb->len;
-
- /*
- * We store in mac.raw the address of the gather list where the
- * placement happened.
- */
- skb->mac.raw = (unsigned char *)bsp->gl;
-#endif
+ m->m_cur_offset = end_offset - m->m_pkthdr.len;
+ m->m_ddp_gl = (unsigned char *)bsp->gl;
bsp->cur_offset = end_offset;
/*
@@ -1638,9 +1884,6 @@
* Note that other parts of the code depend on this being in bit 0.
*/
if ((bsp->flags & DDP_BF_NOINVAL) && end_offset != bsp->gl->dgl_length) {
-#if 0
- TCP_SKB_CB(skb)->flags = 0; /* potential spurious completion */
-#endif
panic("spurious ddp completion");
} else {
m->m_pkthdr.csum_flags = !!(ddp_report & F_DDP_BUF_COMPLETE);
@@ -1676,7 +1919,6 @@
do_rx_data_ddp(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = ctx;
- struct socket *so = toeptoso(toep);
const struct cpl_rx_data_ddp *hdr = cplhdr(m);
VALIDATE_SOCK(so);
@@ -1689,26 +1931,25 @@
#if 0
skb->h.th = tcphdr_skb->h.th;
#endif
- new_rx_data_ddp(so, m);
+ new_rx_data_ddp(toep, m);
return (0);
}
static void
-process_ddp_complete(struct socket *so, struct mbuf *m)
+process_ddp_complete(struct toepcb *toep, struct mbuf *m)
{
- struct tcpcb *tp = sototcpcb(so);
- struct toepcb *toep = tp->t_toe;
+ struct tcpcb *tp = toep->tp_tp;
+ struct socket *so = toeptoso(toep);
struct ddp_state *q;
struct ddp_buf_state *bsp;
struct cpl_rx_ddp_complete *hdr;
unsigned int ddp_report, buf_idx, when;
-#ifdef notyet
- if (unlikely(sk_no_receive(sk))) {
- handle_excess_rx(sk, skb);
+ if (__predict_false(so_no_receive(so))) {
+ handle_excess_rx(toep, m);
return;
}
-#endif
+
q = &toep->tp_ddp_state;
hdr = cplhdr(m);
ddp_report = ntohl(hdr->ddp_report);
@@ -1748,11 +1989,11 @@
tp->rcv_nxt += m->m_len;
tp->t_rcvtime = ticks;
- sbappendstream_locked(&so->so_rcv, m);
-#ifdef notyet
- if (!sock_flag(sk, SOCK_DEAD))
- sk->sk_data_ready(sk, 0);
-#endif
+ sbappendstream_locked(&so->so_rcv, m)
+ ;
+ if ((so->so_state & SS_NOFDREF) == 0)
+ sorwakeup_locked(so);
+
}
/*
@@ -1762,13 +2003,12 @@
do_rx_ddp_complete(struct t3cdev *cdev, struct mbuf *m, void *ctx)
{
struct toepcb *toep = ctx;
- struct socket *so = toeptoso(toep);
VALIDATE_SOCK(so);
#if 0
skb->h.th = tcphdr_skb->h.th;
#endif
- process_ddp_complete(so, m);
+ process_ddp_complete(toep, m);
return (0);
}
@@ -3413,8 +3653,8 @@
#ifdef notyet
t3tom_register_cpl_handler(CPL_RX_URG_NOTIFY, do_rx_urg_notify);
t3tom_register_cpl_handler(CPL_TRACE_PKT, do_trace_pkt);
+#endif
t3tom_register_cpl_handler(CPL_GET_TCB_RPL, do_get_tcb_rpl);
-#endif
return (0);
}
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c#4 (text+ko) ====
@@ -1001,6 +1001,38 @@
(TOM_TUNABLE(toep->tp_toedev, ddp_copy_limit) +
DDP_RSVD_WIN);
}
+
+static inline int
+is_ddp(const struct mbuf *m)
+{
+ return (m->m_flags & M_DDP);
+}
+
+static inline int
+is_ddp_psh(const struct mbuf *m)
+{
+ return is_ddp(m) && (m->m_pkthdr.csum_flags & DDP_BF_PSH);
+}
+
+/*
+ * Copy data from an mbuf to an iovec. Deals with RX_DATA, which carry the
+ * data in the mbuf body, and with RX_DATA_DDP, which place the data in a
+ * DDP buffer.
+ */
+static inline int
+copy_data(const struct mbuf *m, int offset, struct iovec *to, int len)
+{
+ if (__predict_true(!is_ddp(m))) /* RX_DATA */
+ return mbuf_copy_datagram_iovec(m, offset, to, len);
+ if (__predict_true(m->m_pkthdr.csum_flags & DDP_BF_NOCOPY)) { /* user DDP */
+ to->iov_len -= len;
+ to->iov_base += len;
+ return 0;
+ }
+ return t3_ddp_copy(m, offset, to, len); /* kernel DDP */
+}
+
+
#endif
/*
* Clean up DDP state that needs to survive until socket close time, such as the
@@ -1014,9 +1046,6 @@
struct ddp_state *p = &toep->tp_ddp_state;
int idx;
- if (!p)
- return;
-
for (idx = 0; idx < NUM_DDP_KBUF; idx++)
if (p->kbuf[idx]) {
ddp_gl_free_pages(p->kbuf[idx], 0);
@@ -1026,6 +1055,7 @@
if (p->ubuf) {
ddp_gl_free_pages(p->ubuf, 0);
free(p->ubuf, M_DEVBUF);
+ p->ubuf = NULL;
}
toep->tp_ulp_mode = 0;
}
==== //depot/projects/toehead/sys/dev/cxgb/ulp/tom/cxgb_t3_ddp.h#4 (text+ko) ====
@@ -135,9 +135,8 @@
/*
* Returns 1 if a UBUF DMA buffer might be active.
*/
-static inline int t3_ddp_ubuf_pending(struct socket *so)
+static inline int t3_ddp_ubuf_pending(struct toepcb *toep)
{
- struct toepcb *toep = sototcpcb(so)->t_toe;
struct ddp_state *p = &toep->tp_ddp_state;
/* When the TOM_TUNABLE(ddp) is enabled, we're always in ULP_MODE DDP,
More information about the p4-projects
mailing list