PERFORCE change 128722 for review
Kip Macy
kmacy at FreeBSD.org
Mon Nov 5 20:44:30 PST 2007
http://perforce.freebsd.org/chv.cgi?CH=128722
Change 128722 by kmacy at kmacy:storage:toestack on 2007/11/06 04:43:31
add interface for setting socket options
add functions to set values in the tcb for options
Affected files ...
.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#15 edit
.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_defs.h#6 edit
.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_tom.h#6 edit
Differences ...
==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#15 (text+ko) ====
@@ -42,6 +42,7 @@
#include <sys/syslog.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
+#include <sys/priv.h>
#include <net/if.h>
#include <net/route.h>
@@ -55,6 +56,7 @@
#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/sys/mbufq.h>
+#include <netinet/ip.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
@@ -62,6 +64,7 @@
#include <netinet/tcp_seq.h>
#include <net/route.h>
+
#include <dev/cxgb/t3cdev.h>
#include <dev/cxgb/common/cxgb_firmware_exports.h>
#include <dev/cxgb/common/cxgb_t3_cpl.h>
@@ -122,6 +125,7 @@
* coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
*/
#define MIN_RCV_WND (24 * 1024U)
+#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)
#define VALIDATE_SEQ 0
#define VALIDATE_SOCK(so)
@@ -134,6 +138,26 @@
static void t3_send_reset(struct socket *so);
+/*
+ * Dispatch a CPL message for a connection, or park it for later.
+ *
+ * While the connection is still in SYN_SENT its TID is not known yet, so the
+ * mbuf is appended to the toepcb's out_of_order_queue instead of being sent.
+ * In every other state the message goes out right away: through l2t_send()
+ * (and therefore subject to ARP processing) when through_l2t is non-zero, or
+ * directly through cxgb_ofld_send() otherwise.
+ */
+static inline void
+send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t)
+{
+	struct toepcb *toep = tp->t_toe;
+
+	if (__predict_false(tp->t_state == TCPS_SYN_SENT)) {
+		/* No TID yet: hold the message back. */
+		mbufq_tail(&toep->out_of_order_queue, m);
+		return;
+	}
+
+	if (through_l2t)
+		l2t_send(T3C_DEV(so), m, toep->tp_l2t);
+	else
+		cxgb_ofld_send(T3C_DEV(so), m);
+}
+
static inline unsigned int
mkprio(unsigned int cntrl, const struct socket *so)
{
@@ -481,11 +505,191 @@
.tu_rcvd = cxgb_toe_rcvd,
};
+
+/*
+ * Build a CPL_SET_TCB_FIELD request in the caller-supplied mbuf and pass it
+ * to send_or_defer() (not through L2T).
+ *
+ * word selects the TCB word; mask and val are stored in big-endian form;
+ * no_reply is encoded into the request's reply field with V_NO_REPLY().
+ * The mbuf must be valid: it is dereferenced via mtod() without a check.
+ */
+static void
+__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word,
+    uint64_t mask, uint64_t val, int no_reply)
+{
+	struct tcpcb *tp = sototcpcb(so);
+	struct toepcb *toep = tp->t_toe;
+	struct cpl_set_tcb_field *req = mtod(m, struct cpl_set_tcb_field *);
+
+	/* The request is the whole packet. */
+	m->m_len = sizeof(*req);
+	m->m_pkthdr.len = m->m_len;
+
+	/* Work request header and opcode/TID. */
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
+
+	/* Payload: which word to touch and how. */
+	req->reply = V_NO_REPLY(no_reply);
+	req->cpu_idx = 0;
+	req->word = htons(word);
+	req->mask = htobe64(mask);
+	req->val = htobe64(val);
+
+	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
+	send_or_defer(so, tp, m, 0);
+}
+
+/*
+ * Allocate an mbuf and send a no-reply SET_TCB_FIELD request that updates
+ * the bits selected by mask in TCB word `word' to val.
+ *
+ * Nothing is sent when the connection is in TCPS_CLOSED, when the toepcb has
+ * TP_ABORT_SHUTDOWN set, or when no mbuf can be allocated.
+ */
+static void
+t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
+{
+	struct mbuf *m;
+	struct tcpcb *tp = sototcpcb(so);
+	struct toepcb *toep = tp->t_toe;
+
+	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN))
+		return;
+
+	m = m_gethdr(M_NOWAIT, MT_DATA);
+	if (m == NULL) {
+		/*
+		 * XXX need lowmem cache
+		 *
+		 * Until one exists the update has to be dropped:
+		 * __set_tcb_field() does mtod() on the mbuf and would
+		 * dereference NULL.
+		 */
+		return;
+	}
+
+	__set_tcb_field(so, m, word, mask, val, 1);
+}
+
+/*
+ * Set or clear one of the t_flags bits in the TCB.
+ *
+ * bit_pos is the position of the flag relative to W_TCB_T_FLAGS1; any
+ * non-zero val sets the bit and zero clears it.
+ */
+static void
+set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
+{
+	/*
+	 * Build the value in 64 bits, like the mask.  Shifting a plain int
+	 * is undefined once bit_pos reaches the width of int, and a val
+	 * other than 0/1 would otherwise land outside the masked bit.
+	 */
+	t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos,
+	    (uint64_t)(val != 0) << bit_pos);
+}
+
+/*
+ * Push the connection's current Nagle setting to the TCB with a
+ * SET_TCB_FIELD CPL message: Nagle is on exactly when TF_NODELAY is clear
+ * in the tcpcb's t_flags.
+ */
+static void
+t3_set_nagle(struct socket *so)
+{
+	int nagle_on = (sototcpcb(so)->t_flags & TF_NODELAY) == 0;
+
+	set_tcb_tflag(so, S_TF_NAGLE, nagle_on);
+}
+
+/*
+ * Update a connection's keepalive setting by writing on_off to the
+ * TF_KEEPALIVE t_flags bit in the TCB (via a SET_TCB_FIELD CPL message).
+ */
+void
+t3_set_keepalive(struct socket *so, int on_off)
+{
+	const unsigned int keepalive_bit = S_TF_KEEPALIVE;
+
+	set_tcb_tflag(so, keepalive_bit, on_off);
+}
+
+/*
+ * Write on_off to the TF_RCV_COALESCE_ENABLE t_flags bit in the
+ * connection's TCB.
+ */
void
+t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
+{
+ set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
+}
+
+/*
+ * Propagate the socket's TOS (as computed by SO_TOS()) to the TOS field of
+ * the connection's TCB with a SET_TCB_FIELD CPL message.
+ */
+static void
+t3_set_tos(struct socket *so)
+{
+	uint64_t tos_mask = V_TCB_TOS(M_TCB_TOS);
+	uint64_t tos_val = V_TCB_TOS(SO_TOS(so));
+
+	t3_set_tcb_field(so, W_TCB_TOS, tos_mask, tos_val);
+}
+
+
+/*
+ * In DDP mode, TP fails to schedule a timer to push RX data to the host when
+ * DDP is disabled (data is delivered to the freelist). [Note that the peer
+ * should set the PSH bit in the last segment, which would trigger delivery.]
+ * We work around the issue by putting a DDP buffer into a partially placed
+ * state, which guarantees that TP will schedule a timer.
+ */
+/*
+ * Mask and value written to the RX DDP flags word when DDP is switched off.
+ * The low 32 bits carry DDP flag bits; the buffer 0 offset/length fields are
+ * shifted into the upper 32 bits.  Every operand of the "<< 32" is widened
+ * to u64 first so the shift is never performed in a 32-bit type.
+ */
+#define TP_DDP_TIMER_WORKAROUND_MASK\
+    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
+     ((V_TCB_RX_DDP_BUF0_OFFSET((u64)M_TCB_RX_DDP_BUF0_OFFSET) |\
+       V_TCB_RX_DDP_BUF0_LEN((u64)3)) << 32))
+#define TP_DDP_TIMER_WORKAROUND_VAL\
+    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
+     ((V_TCB_RX_DDP_BUF0_OFFSET((u64)1) | V_TCB_RX_DDP_BUF0_LEN((u64)2)) <<\
+      32))
+
+/*
+ * Turn DDP on or off for a connection by updating the TF_DDP_OFF bit in the
+ * RX DDP flags word of the TCB.  When turning DDP off, the
+ * TP_DDP_TIMER_WORKAROUND mask/value are applied in the same request.
+ */
+static void
t3_enable_ddp(struct socket *so, int on)
{
- printf("t3_enable_ddp unimplemented !!!! \n");
-
+ if (on)
+ t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, V_TF_DDP_OFF(1),
+ V_TF_DDP_OFF(0));
+ else
+ t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS,
+ V_TF_DDP_OFF(1) |
+ TP_DDP_TIMER_WORKAROUND_MASK,
+ V_TF_DDP_OFF(1) |
+ TP_DDP_TIMER_WORKAROUND_VAL);
+
+}
+
+
+/*
+ * Write tag_color into the DDP buffer tag field of the TCB.  The TCB word
+ * is W_TCB_RX_DDP_BUF0_TAG offset by buf_idx; the mask is the full
+ * RX_DDP_BUF0_TAG field.
+ */
+void
+t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
+{
+	uint16_t tag_word = W_TCB_RX_DDP_BUF0_TAG + buf_idx;
+	uint64_t tag_mask = V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG);
+
+	t3_set_tcb_field(so, tag_word, tag_mask, tag_color);
+}
+
+/*
+ * Program the offset and length of a DDP buffer in the TCB.
+ *
+ * buf_idx 0 updates the BUF0 offset/length fields; any other index updates
+ * the BUF1 fields, whose length operand is shifted left by 32 before being
+ * passed to V_TCB_RX_DDP_BUF1_LEN().
+ */
+void
+t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
+    unsigned int len)
+{
+	uint16_t word;
+	uint64_t mask, val;
+
+	if (buf_idx == 0) {
+		word = W_TCB_RX_DDP_BUF0_OFFSET;
+		mask = V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
+		    V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN);
+		val = V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
+		    V_TCB_RX_DDP_BUF0_LEN((uint64_t)len);
+	} else {
+		word = W_TCB_RX_DDP_BUF1_OFFSET;
+		mask = V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
+		    V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32);
+		val = V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
+		    V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32);
+	}
+
+	t3_set_tcb_field(so, word, mask, val);
+}
+
+/*
+ * Validate a congestion control algorithm name.
+ *
+ * The lookup against t3_cong_ops is compiled only when "notyet" is defined;
+ * until then every name is accepted.  Returns 0 on success, or EINVAL for an
+ * unknown name.  The error is a positive errno because the caller hands it
+ * straight back as a ctloutput result.
+ */
+static int
+t3_set_cong_control(struct socket *so, const char *name)
+{
+#ifdef notyet
+	int cong_algo;
+
+	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
+		if (strcmp(name, t3_cong_ops[cong_algo].name) == 0)
+			break;
+
+	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
+		return (EINVAL);
+#endif
+	return (0);
+}
+
+/*
+ * Issue a CPL_GET_TCB request for the connection.
+ *
+ * Returns ENOMEM when no mbuf is available, 0 otherwise.  As with other
+ * control messages, the request is queued on the out-of-order queue while
+ * the connection is in SYN_SENT and sent directly (not through L2T) in any
+ * other state.
+ */
+int
+t3_get_tcb(struct socket *so)
+{
+	struct tcpcb *tp = sototcpcb(so);
+	struct toepcb *toep = tp->t_toe;
+	struct cpl_get_tcb *req;
+	struct mbuf *m;
+
+	m = m_gethdr(M_NOWAIT, MT_DATA);
+	if (m == NULL)
+		return (ENOMEM);
+
+	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
+
+	req = mtod(m, struct cpl_get_tcb *);
+	m->m_len = sizeof(*req);
+	m->m_pkthdr.len = m->m_len;
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
+	req->cpuno = htons(toep->tp_qset);
+
+	/* Same defer-or-send decision the SET_TCB_FIELD path uses. */
+	send_or_defer(so, tp, m, 0);
+	return (0);
}
static inline void
@@ -607,7 +811,7 @@
toepcb_release(toep);
}
#ifdef notyet
- t3_set_ca_ops(sk, &tcp_init_congestion_ops);
+ t3_set_ca_ops(so, &tcp_init_congestion_ops);
#endif
TOE_DEV(so) = NULL;
#if 0
@@ -716,7 +920,6 @@
return (0);
}
-#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)
/*
* The next two functions calculate the option 0 value for a socket.
*/
@@ -837,7 +1040,7 @@
if (rpl->status == CPL_ERR_CONN_EXIST &&
icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
icsk->icsk_retransmit_timer.function = act_open_retry_timer;
- sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+ sk_reset_timer(so, &icsk->icsk_retransmit_timer,
jiffies + HZ / 2);
} else
#endif
@@ -979,7 +1182,7 @@
/* Purge the send queue so we don't send anything after an abort. */
sbflush(&so->so_snd);
#ifdef notyet
- if (sock_flag(sk, CLOSE_CON_REQUESTED) && is_t3a(TOE_DEV(sk)))
+ if (sock_flag(so, CLOSE_CON_REQUESTED) && is_t3a(TOE_DEV(sk)))
mode |= CPL_ABORT_POST_CLOSE_REQ;
#endif
m = m_gethdr(M_NOWAIT, MT_DATA);
@@ -1005,6 +1208,113 @@
l2t_send(T3C_DEV(so), m, toep->tp_l2t);
}
+/*
+ * Handle IP-level socket options for an offloaded connection.
+ *
+ * Only setsockopt(IPPROTO_IP, IP_TOS) is processed here: the new TOS is
+ * stored in the inpcb and pushed to the TCB.  IP_OPTIONS is rejected with
+ * ENOPROTOOPT.  Anything else (other levels, other options, getsockopt
+ * requests) returns EOPNOTSUPP so the caller can fall back to the original
+ * protocol handler.
+ */
+static int
+t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	struct inpcb *inp;
+	int error, optval;
+
+	/* Option numbers are only meaningful within their own level. */
+	if (sopt->sopt_level != IPPROTO_IP)
+		return (EOPNOTSUPP);
+
+	if (sopt->sopt_name == IP_OPTIONS)
+		return (ENOPROTOOPT);
+
+	/* A get request must not be treated as a set. */
+	if (sopt->sopt_name != IP_TOS || sopt->sopt_dir != SOPT_SET)
+		return (EOPNOTSUPP);
+
+	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
+	if (error)
+		return (error);
+
+	/*
+	 * suser() returns 0 for a privileged thread and an errno otherwise,
+	 * so a non-zero result is the "not allowed" case.
+	 */
+	if (optval > IPTOS_PREC_CRITIC_ECP && suser(curthread) != 0)
+		return (EPERM);
+
+	inp = sotoinpcb(so);
+	inp->inp_ip_tos = optval;
+
+	t3_set_tos(so);
+
+	return (0);
+}
+
+/*
+ * Handle TCP-level socket options for an offloaded connection.
+ *
+ * Only setsockopt() of TCP_CONGESTION and TCP_NODELAY is processed here;
+ * everything else (including getsockopt requests) returns EOPNOTSUPP so the
+ * caller can fall back to the original protocol handler.
+ *
+ * TCP_CONGESTION: copies the algorithm name in from the caller, validates
+ * it with t3_set_cong_control() and stores a duplicate in the tcpcb.
+ * TCP_NODELAY: updates TF_NODELAY under the inpcb lock and, if the flag
+ * actually changed, pushes the new Nagle setting to the TCB.
+ */
+static int
+t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	int err = 0;
+	size_t copied;
+
+	/* A get request must not be treated as a set. */
+	if (sopt->sopt_dir != SOPT_SET)
+		return (EOPNOTSUPP);
+
+	if (sopt->sopt_name != TCP_CONGESTION &&
+	    sopt->sopt_name != TCP_NODELAY)
+		return (EOPNOTSUPP);
+
+	if (sopt->sopt_name == TCP_CONGESTION) {
+		char name[TCP_CA_NAME_MAX];
+		int optlen = sopt->sopt_valsize;
+		struct tcpcb *tp;
+
+		if (optlen < 1)
+			return (EINVAL);
+
+		err = copyinstr(sopt->sopt_val, name,
+		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
+		if (err)
+			return (err);
+		if (copied < 1)
+			return (EINVAL);
+
+		tp = sototcpcb(so);
+		err = t3_set_cong_control(so, name);
+		if (err != 0)
+			return (err);
+
+		/*
+		 * NOTE(review): the previous t_cong_control string is
+		 * overwritten without being freed.  If it is always
+		 * allocated by this path it leaks on every call; confirm
+		 * ownership before adding a free() here.
+		 */
+		tp->t_cong_control = strdup(name, M_DEVBUF);
+	} else {
+		int optval, changed;
+		struct inpcb *inp;
+		struct tcpcb *tp;
+
+		err = sooptcopyin(sopt, &optval, sizeof optval,
+		    sizeof optval);
+		if (err)
+			return (err);
+
+		inp = sotoinpcb(so);
+		tp = intotcpcb(inp);
+
+		INP_LOCK(inp);
+		/*
+		 * Decide whether the flag changes while the lock is held;
+		 * re-reading t_flags after unlocking could race with
+		 * another update.
+		 */
+		changed = ((tp->t_flags & TF_NODELAY) != 0) != (optval != 0);
+		if (optval)
+			tp->t_flags |= TF_NODELAY;
+		else
+			tp->t_flags &= ~TF_NODELAY;
+		INP_UNLOCK(inp);
+
+		if (changed)
+			t3_set_nagle(so);
+	}
+
+	return (0);
+}
+
+/*
+ * Socket option entry point for offloaded connections.
+ *
+ * IPPROTO_TCP options go to t3_tcp_ctloutput(); every other level goes to
+ * t3_ip_ctloutput().  When the chosen handler reports EOPNOTSUPP, the
+ * request is forwarded to the handler saved in the toepcb (tp_ctloutput);
+ * any other result is returned as is.
+ *
+ * NOTE(review): the fallback dereferences tp->t_toe unconditionally --
+ * confirm this function is only ever reached for sockets that have a toepcb.
+ */
+static int
+t3_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	struct tcpcb *tp = sototcpcb(so);
+	struct toepcb *toep = tp->t_toe;
+	int err;
+
+	err = (sopt->sopt_level == IPPROTO_TCP) ?
+	    t3_tcp_ctloutput(so, sopt) : t3_ip_ctloutput(so, sopt);
+
+	if (err == EOPNOTSUPP)
+		return (toep->tp_ctloutput(so, sopt));
+
+	return (err);
+}
+
/*
* Process new data received for a connection.
*/
@@ -1018,12 +1328,12 @@
#ifdef notyet
if (__predict_false(sk_no_receive(sk))) {
- handle_excess_rx(sk, skb);
+ handle_excess_rx(so, skb);
return;
}
if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
- handle_ddp_data(sk, skb);
+ handle_ddp_data(so, skb);
TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
TCP_SKB_CB(skb)->flags = 0;
@@ -1046,7 +1356,7 @@
* We don't handle urgent data yet
*/
if (__predict_false(hdr->urg))
- handle_urg_ptr(sk, tp->rcv_nxt + ntohs(hdr->urg));
+ handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
tp->urg_seq - tp->rcv_nxt < skb->len))
tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
@@ -1129,16 +1439,16 @@
T3_TRACE0(TIDTB(sk),"do_peer_fin:");
#endif
#ifdef notyet
- if (!is_t3a(TOE_DEV(sk)) && sock_flag(sk, ABORT_RPL_PENDING))
+ if (!is_t3a(TOE_DEV(sk)) && sock_flag(so, ABORT_RPL_PENDING))
goto out;
if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
- keep = handle_peer_close_data(sk, skb);
+ keep = handle_peer_close_data(so, skb);
if (keep < 0)
return;
}
sk->sk_shutdown |= RCV_SHUTDOWN;
- sock_set_flag(sk, SOCK_DONE);
+ sock_set_flag(so, SOCK_DONE);
#endif
switch (tp->t_state) {
case TCPS_SYN_RECEIVED:
@@ -1177,9 +1487,9 @@
/* Do not send POLL_HUP for half duplex close. */
if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
sk->sk_state == TCP_CLOSE)
- sk_wake_async(sk, 1, POLL_HUP);
+ sk_wake_async(so, 1, POLL_HUP);
else
- sk_wake_async(sk, 1, POLL_IN);
+ sk_wake_async(so, 1, POLL_IN);
#endif
}
#ifdef notyet
@@ -1250,8 +1560,8 @@
#if 0
else if (tcp_sk(sk)->linger2 < 0 &&
- !sock_flag(sk, ABORT_SHUTDOWN))
- abort_conn(sk, skb, LINUX_MIB_TCPABORTONLINGER);
+ !sock_flag(so, ABORT_SHUTDOWN))
+ abort_conn(so, skb, LINUX_MIB_TCPABORTONLINGER);
#endif
break;
default:
@@ -1351,6 +1661,9 @@
toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
assign_rxopt(so, opt);
+ toep->tp_ctloutput = so->so_proto->pr_ctloutput;
+ so->so_proto->pr_ctloutput = t3_ctloutput;
+
#if 0
inet_sk(sk)->id = tp->write_seq ^ jiffies;
#endif
@@ -1406,7 +1719,7 @@
*/
if (unlikely(sk->sk_socket)) { // simultaneous opens only
sk->sk_state_change(sk);
- sk_wake_async(sk, 0, POLL_OUT);
+ sk_wake_async(so, 0, POLL_OUT);
}
/*
* The state for the new connection is now up to date.
@@ -1490,7 +1803,7 @@
toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
-
+
/*
* Now that we finally have a TID send any CPL messages that we had to
* defer for lack of a TID.
@@ -1505,11 +1818,9 @@
* appears to correspond to sorwakeup_locked
*/
sk->sk_state_change(sk);
- sk_wake_async(sk, 0, POLL_OUT);
+ sk_wake_async(so, 0, POLL_OUT);
#endif
}
- printf("freeing %p\n", m);
-
m_free(m);
#ifdef notyet
/*
@@ -1526,7 +1837,7 @@
* them on their way.
*/
fixup_pending_writeq_buffers(sk);
- if (t3_push_frames(sk, 1))
+ if (t3_push_frames(so, 1))
sk->sk_write_space(sk);
#endif
==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_defs.h#6 (text+ko) ====
@@ -12,7 +12,6 @@
void t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev);
void t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev);
int t3_push_frames(struct socket *so, int req_completion);
-void t3_enable_ddp(struct socket *so, int on);
int t3_connect(struct toedev *tdev, struct socket *so, struct ifnet *egress_ifp);
void t3_init_listen_cpl_handlers(void);
int t3_init_cpl_io(void);
@@ -28,4 +27,11 @@
void toepcb_release(struct toepcb *);
void toepcb_init(struct toepcb *);
+void t3_set_rcv_coalesce_enable(struct socket *so, int on_off);
+void t3_set_keepalive(struct socket *so, int on_off);
+void t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag);
+void t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
+ unsigned int len);
+int t3_get_tcb(struct socket *so);
+
#endif
==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_tom.h#6 (text+ko) ====
@@ -1,6 +1,6 @@
#ifndef CXGB_TOM_H_
#define CXGB_TOM_H_
-
+#include <sys/protosw.h>
#define LISTEN_INFO_HASH_SIZE 32
@@ -99,8 +99,9 @@
struct toepcb {
struct toedev *tp_toedev;
+ struct l2t_entry *tp_l2t;
+ pr_ctloutput_t *tp_ctloutput;
int tp_tid;
- struct l2t_entry *tp_l2t;
int tp_wr_max;
int tp_wr_avail;
int tp_wr_unacked;
More information about the p4-projects
mailing list