PERFORCE change 128722 for review

Kip Macy kmacy at FreeBSD.org
Mon Nov 5 20:44:30 PST 2007


http://perforce.freebsd.org/chv.cgi?CH=128722

Change 128722 by kmacy at kmacy:storage:toestack on 2007/11/06 04:43:31

	add interface for setting socket options
	add functions to set values in the tcb for options

Affected files ...

.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#15 edit
.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_defs.h#6 edit
.. //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_tom.h#6 edit

Differences ...

==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c#15 (text+ko) ====

@@ -42,6 +42,7 @@
 #include <sys/syslog.h>
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
+#include <sys/priv.h>
 
 #include <net/if.h>
 #include <net/route.h>
@@ -55,6 +56,7 @@
 #include <dev/cxgb/cxgb_osdep.h>
 #include <dev/cxgb/sys/mbufq.h>
 
+#include <netinet/ip.h>
 #include <netinet/tcp.h>
 #include <netinet/tcp_var.h>
 #include <netinet/tcp_fsm.h>
@@ -62,6 +64,7 @@
 #include <netinet/tcp_seq.h>
 #include <net/route.h>
 
+
 #include <dev/cxgb/t3cdev.h>
 #include <dev/cxgb/common/cxgb_firmware_exports.h>
 #include <dev/cxgb/common/cxgb_t3_cpl.h>
@@ -122,6 +125,7 @@
  * coalescing, handle jumbo frames, and not trigger sender SWS avoidance.
  */
 #define MIN_RCV_WND (24 * 1024U)
+#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)
 
 #define VALIDATE_SEQ 0
 #define VALIDATE_SOCK(so)
@@ -134,6 +138,26 @@
 
 static void t3_send_reset(struct socket *so);
 
+/*
+ * Transmit a CPL message now, or queue it for later.  While the connection
+ * is in SYN_SENT the TID is not yet known, so the message is appended to the
+ * toepcb's out_of_order_queue.  In every other state it is sent right away:
+ * through the L2T (and thus subject to ARP processing) when through_l2t is
+ * non-zero, or straight to the offload device when it is zero.
+ */
+static inline void
+send_or_defer(struct socket *so, struct tcpcb *tp, struct mbuf *m, int through_l2t)
+{
+	struct toepcb *toep = tp->t_toe;
+
+	if (__predict_false(tp->t_state == TCPS_SYN_SENT))
+		mbufq_tail(&toep->out_of_order_queue, m);	/* defer */
+	else if (!through_l2t)
+		cxgb_ofld_send(T3C_DEV(so), m);			/* direct */
+	else
+		l2t_send(T3C_DEV(so), m, toep->tp_l2t);		/* via L2T */
+}
+
 static inline unsigned int
 mkprio(unsigned int cntrl, const struct socket *so)
 {
@@ -481,11 +505,191 @@
 	.tu_rcvd = cxgb_toe_rcvd,
 };
 
+/* Build a CPL_SET_TCB_FIELD request in m and send it, or defer it. */
+static void
+__set_tcb_field(struct socket *so, struct mbuf *m, uint16_t word,
+			    uint64_t mask, uint64_t val, int no_reply)
+{
+	struct tcpcb *tp = sototcpcb(so);
+	struct toepcb *toep = tp->t_toe;
+	struct cpl_set_tcb_field *req = mtod(m, struct cpl_set_tcb_field *);
+
+	/* The request is the only payload carried by the mbuf. */
+	m->m_pkthdr.len = m->m_len = sizeof(*req);
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, toep->tp_tid));
+	req->word = htons(word);
+	req->mask = htobe64(mask);
+	req->val = htobe64(val);
+	req->reply = V_NO_REPLY(no_reply);
+	req->cpu_idx = 0;
+
+	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
+	send_or_defer(so, tp, m, 0);	/* 0: not routed through the L2T */
+}
+
+static void
+t3_set_tcb_field(struct socket *so, uint16_t word, uint64_t mask, uint64_t val)
+{
+	/* Set the bits selected by mask in TCB word `word` to val (no reply). */
+	struct mbuf *m;
+	struct tcpcb *tp = sototcpcb(so);
+	struct toepcb *toep = tp->t_toe;
+
+	if (tp->t_state == TCPS_CLOSED || (toep->tp_flags & TP_ABORT_SHUTDOWN))
+		return;		/* closed or aborting: nothing to update */
+	m = m_gethdr(M_NOWAIT, MT_DATA);
+	if (m == NULL) {
+		/* XXX need lowmem cache; for now the update is dropped */
+		/* instead of passing a NULL mbuf on to be dereferenced. */
+		return;
+	}
+
+	__set_tcb_field(so, m, word, mask, val, 1);
+}
+
+/* Set one of the t_flags bits in the TCB to val (0 or 1). */
+static void
+set_tcb_tflag(struct socket *so, unsigned int bit_pos, int val)
+{
+	/* Widen before shifting: a plain int shift overflows for bit_pos >= 31. */
+	t3_set_tcb_field(so, W_TCB_T_FLAGS1, 1ULL << bit_pos,
+	    (uint64_t)val << bit_pos);
+}
+
+/*
+ * Mirror TF_NODELAY into the TCB: Nagle is enabled iff the flag is clear.
+ */
+static void
+t3_set_nagle(struct socket *so)
+{
+	int nagle_on = (sototcpcb(so)->t_flags & TF_NODELAY) == 0;
+
+	set_tcb_tflag(so, S_TF_NAGLE, nagle_on);
+}
+
+/* Write on_off to the keepalive flag (S_TF_KEEPALIVE) in the TCB. */
+void
+t3_set_keepalive(struct socket *so, int on_off)
+{
+	set_tcb_tflag(so, S_TF_KEEPALIVE, on_off);
+}
+
+/* Write on_off to the receive-coalescing flag in the TCB. */
 void
+t3_set_rcv_coalesce_enable(struct socket *so, int on_off)
+{
+	/* set_tcb_tflag() sends the change as a SET_TCB_FIELD CPL message. */
+	set_tcb_tflag(so, S_TF_RCV_COALESCE_ENABLE, on_off);
+}
+
+/*
+ * Push the socket's TOS (SO_TOS(): inp_ip_tos >> 2, masked) into the TCB.
+ */
+static void
+t3_set_tos(struct socket *so)
+{
+	t3_set_tcb_field(so, W_TCB_TOS, V_TCB_TOS(M_TCB_TOS),
+			 V_TCB_TOS(SO_TOS(so)));
+}
+
+
+/*
+ * In DDP mode, TP fails to schedule a timer to push RX data to the host when
+ * DDP is disabled (data is delivered to the freelist).  [Note that the peer
+ * should set the PSH bit in the last segment, which would trigger delivery.]
+ * We work around the issue by leaving DDP buffer 0 in a partially placed
+ * state (valid, offset 1, length 2), so that TP will schedule a timer.
+ */
+#define TP_DDP_TIMER_WORKAROUND_MASK\
+    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(1) |\
+     ((V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |\
+       V_TCB_RX_DDP_BUF0_LEN(3)) << 32))
+#define TP_DDP_TIMER_WORKAROUND_VAL\
+    (V_TF_DDP_BUF0_VALID(1) | V_TF_DDP_ACTIVE_BUF(0) |\
+     ((V_TCB_RX_DDP_BUF0_OFFSET((u64)1) | V_TCB_RX_DDP_BUF0_LEN((u64)2)) <<\
+      32))
+
+static void
 t3_enable_ddp(struct socket *so, int on)
 {
-	printf("t3_enable_ddp unimplemented !!!! \n");
-		
+	uint64_t mask = V_TF_DDP_OFF(1);	/* the DDP_OFF bit is always written */
+	uint64_t val = V_TF_DDP_OFF(0);		/* on: clear DDP_OFF */
+
+	if (!on) {
+		/* off: set DDP_OFF and apply the TP timer workaround bits too */
+		mask |= TP_DDP_TIMER_WORKAROUND_MASK;
+		val = V_TF_DDP_OFF(1) | TP_DDP_TIMER_WORKAROUND_VAL;
+	}
+
+	t3_set_tcb_field(so, W_TCB_RX_DDP_FLAGS, mask, val);
+}
+
+/* Write tag_color to the TCB's DDP tag field for buffer buf_idx. */
+void
+t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag_color)
+{
+	t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_TAG + buf_idx,
+			 V_TCB_RX_DDP_BUF0_TAG(M_TCB_RX_DDP_BUF0_TAG),
+			 tag_color);	/* NOTE(review): not shifted like the mask; confirm */
+}
+
+void
+t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
+		    unsigned int len)
+{
+	if (buf_idx == 0)	/* program offset and length of DDP buffer 0 */
+		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF0_OFFSET,
+			 V_TCB_RX_DDP_BUF0_OFFSET(M_TCB_RX_DDP_BUF0_OFFSET) |
+			 V_TCB_RX_DDP_BUF0_LEN(M_TCB_RX_DDP_BUF0_LEN),
+			 V_TCB_RX_DDP_BUF0_OFFSET((uint64_t)offset) |
+			 V_TCB_RX_DDP_BUF0_LEN((uint64_t)len));
+	else	/* DDP buffer 1; its length field is shifted a further 32 bits */
+		t3_set_tcb_field(so, W_TCB_RX_DDP_BUF1_OFFSET,
+			 V_TCB_RX_DDP_BUF1_OFFSET(M_TCB_RX_DDP_BUF1_OFFSET) |
+			 V_TCB_RX_DDP_BUF1_LEN(M_TCB_RX_DDP_BUF1_LEN << 32),
+			 V_TCB_RX_DDP_BUF1_OFFSET((uint64_t)offset) |
+			 V_TCB_RX_DDP_BUF1_LEN(((uint64_t)len) << 32));
+}
+
+static int
+t3_set_cong_control(struct socket *so, const char *name)
+{
+	/* Lookup of name in t3_cong_ops is not wired up yet: accept anything. */
+#ifdef notyet
+	int cong_algo;
+
+	for (cong_algo = 0; cong_algo < ARRAY_SIZE(t3_cong_ops); cong_algo++)
+		if (!strcmp(name, t3_cong_ops[cong_algo].name))
+			break;
+	if (cong_algo >= ARRAY_SIZE(t3_cong_ops))
+		return (EINVAL);	/* positive errno, as the caller expects */
+#endif
+	return (0);
+}
+
+int
+t3_get_tcb(struct socket *so)
+{
+	struct tcpcb *tp = sototcpcb(so);
+	struct toepcb *toep = tp->t_toe;
+	struct cpl_get_tcb *req;
+	struct mbuf *m;
+
+	m = m_gethdr(M_NOWAIT, MT_DATA);
+	if (m == NULL)
+		return (ENOMEM);
+
+	/* Build the CPL_GET_TCB request; cpuno carries the toepcb's qset. */
+	req = mtod(m, struct cpl_get_tcb *);
+	m->m_pkthdr.len = m->m_len = sizeof(*req);
+	req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+	OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_GET_TCB, toep->tp_tid));
+	req->cpuno = htons(toep->tp_qset);
+	m_set_priority(m, mkprio(CPL_PRIORITY_CONTROL, so));
+	/* Deferred while in SYN_SENT (no TID yet), otherwise sent directly. */
+	send_or_defer(so, tp, m, 0);
+	return (0);
 }
 
 static inline void
@@ -607,7 +811,7 @@
 		toepcb_release(toep);
 	}
 #ifdef notyet
-	t3_set_ca_ops(sk, &tcp_init_congestion_ops);
+	t3_set_ca_ops(so, &tcp_init_congestion_ops);
 #endif	
 	TOE_DEV(so) = NULL;
 #if 0
@@ -716,7 +920,6 @@
 	return (0);
 }
 
-#define SO_TOS(so) ((sotoinpcb(so)->inp_ip_tos >> 2) & M_TOS)
 /*
  * The next two functions calculate the option 0 value for a socket.
  */
@@ -837,7 +1040,7 @@
 	if (rpl->status == CPL_ERR_CONN_EXIST &&
 	    icsk->icsk_retransmit_timer.function != act_open_retry_timer) {
 		icsk->icsk_retransmit_timer.function = act_open_retry_timer;
-		sk_reset_timer(sk, &icsk->icsk_retransmit_timer,
+		sk_reset_timer(so, &icsk->icsk_retransmit_timer,
 			       jiffies + HZ / 2);
 	} else
 #endif		
@@ -979,7 +1182,7 @@
 	/* Purge the send queue so we don't send anything after an abort. */
 	sbflush(&so->so_snd);
 #ifdef notyet
-	if (sock_flag(sk, CLOSE_CON_REQUESTED) && is_t3a(TOE_DEV(sk)))
+	if (sock_flag(so, CLOSE_CON_REQUESTED) && is_t3a(TOE_DEV(sk)))
 		mode |= CPL_ABORT_POST_CLOSE_REQ;
 #endif
 	m = m_gethdr(M_NOWAIT, MT_DATA);
@@ -1005,6 +1208,113 @@
 		l2t_send(T3C_DEV(so), m, toep->tp_l2t);
 }
 
+static int
+t3_ip_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	struct inpcb *inp;
+	int error, optval;
+
+	if (sopt->sopt_name == IP_OPTIONS)
+		return (ENOPROTOOPT);
+
+	/* Only setting IP_TOS is offloaded; EOPNOTSUPP defers to the stack. */
+	if (sopt->sopt_name != IP_TOS || sopt->sopt_dir != SOPT_SET)
+		return (EOPNOTSUPP);
+
+	error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
+	if (error)
+		return (error);
+
+	/* suser() returns 0 when privileged; non-zero means not permitted. */
+	if (optval > IPTOS_PREC_CRITIC_ECP && suser(curthread) != 0)
+		return (EPERM);
+
+	inp = sotoinpcb(so);
+	inp->inp_ip_tos = optval;
+
+	t3_set_tos(so);		/* push the new TOS to the TCB */
+	return (0);
+}
+
+static int
+t3_tcp_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	int err = 0;
+	size_t copied;
+
+	/* Only setting these two options is offloaded; the rest is passed on. */
+	if (sopt->sopt_dir != SOPT_SET || (sopt->sopt_name != TCP_CONGESTION &&
+	    sopt->sopt_name != TCP_NODELAY))
+		return (EOPNOTSUPP);
+
+	if (sopt->sopt_name == TCP_CONGESTION) {
+		char name[TCP_CA_NAME_MAX];
+		int optlen = sopt->sopt_valsize;
+		struct tcpcb *tp;
+
+		if (optlen < 1)
+			return (EINVAL);
+
+		err = copyinstr(sopt->sopt_val, name,
+		    min(TCP_CA_NAME_MAX - 1, optlen), &copied);
+		if (err)
+			return (err);
+		if (copied < 1)
+			return (EINVAL);
+
+		err = t3_set_cong_control(so, name);
+		if (err)
+			return (err);
+		/* NOTE(review): a previously stored name is not freed here. */
+		tp = sototcpcb(so);
+		tp->t_cong_control = strdup(name, M_DEVBUF);
+	} else {
+		int optval, changed;
+		struct inpcb *inp;
+		struct tcpcb *tp;
+
+		err = sooptcopyin(sopt, &optval, sizeof optval,
+		    sizeof optval);
+		if (err)
+			return (err);
+
+		inp = sotoinpcb(so);
+		tp = intotcpcb(inp);
+
+		/* Work out whether TF_NODELAY changes while the lock is held. */
+		INP_LOCK(inp);
+		changed = ((tp->t_flags & TF_NODELAY) != 0) != (optval != 0);
+		if (optval)
+			tp->t_flags |= TF_NODELAY;
+		else
+			tp->t_flags &= ~TF_NODELAY;
+		INP_UNLOCK(inp);
+
+		if (changed)
+			t3_set_nagle(so);
+	}
+
+	return (0);
+}
+
+static int
+t3_ctloutput(struct socket *so, struct sockopt *sopt)
+{
+	struct tcpcb *tp = sototcpcb(so);
+	struct toepcb *toep = tp->t_toe;
+	int err = EOPNOTSUPP;	/* levels not offloaded go straight to the stack */
+
+	if (sopt->sopt_level == IPPROTO_TCP)
+		err = t3_tcp_ctloutput(so, sopt);
+	else if (sopt->sopt_level == IPPROTO_IP)
+		err = t3_ip_ctloutput(so, sopt);
+
+	/* EOPNOTSUPP means "not handled here": use the saved pr_ctloutput. */
+	if (err != EOPNOTSUPP)
+		return (err);
+	return (toep->tp_ctloutput(so, sopt));
+}
+
 /*
  * Process new data received for a connection.
  */
@@ -1018,12 +1328,12 @@
 	
 #ifdef notyet	
 	if (__predict_false(sk_no_receive(sk))) {
-		handle_excess_rx(sk, skb);
+		handle_excess_rx(so, skb);
 		return;
 	}
 
 	if (ULP_MODE(tp) == ULP_MODE_TCPDDP)
-		handle_ddp_data(sk, skb);
+		handle_ddp_data(so, skb);
 
 	TCP_SKB_CB(skb)->seq = ntohl(hdr->seq);
 	TCP_SKB_CB(skb)->flags = 0;
@@ -1046,7 +1356,7 @@
 	 * We don't handle urgent data yet
 	 */
 	if (__predict_false(hdr->urg))
-		handle_urg_ptr(sk, tp->rcv_nxt + ntohs(hdr->urg));
+		handle_urg_ptr(so, tp->rcv_nxt + ntohs(hdr->urg));
 	if (__predict_false(tp->urg_data == TCP_URG_NOTYET &&
 		     tp->urg_seq - tp->rcv_nxt < skb->len))
 		tp->urg_data = TCP_URG_VALID | skb->data[tp->urg_seq -
@@ -1129,16 +1439,16 @@
 	T3_TRACE0(TIDTB(sk),"do_peer_fin:");
 #endif
 #ifdef notyet
-	if (!is_t3a(TOE_DEV(sk)) && sock_flag(sk, ABORT_RPL_PENDING))
+	if (!is_t3a(TOE_DEV(sk)) && sock_flag(so, ABORT_RPL_PENDING))
 		goto out;
 
 	if (ULP_MODE(tp) == ULP_MODE_TCPDDP) {
-		keep = handle_peer_close_data(sk, skb);
+		keep = handle_peer_close_data(so, skb);
 		if (keep < 0)
 			return;
 	}
 	sk->sk_shutdown |= RCV_SHUTDOWN;
-	sock_set_flag(sk, SOCK_DONE);
+	sock_set_flag(so, SOCK_DONE);
 #endif
 	switch (tp->t_state) {
 	case TCPS_SYN_RECEIVED:
@@ -1177,9 +1487,9 @@
 		/* Do not send POLL_HUP for half duplex close. */
 		if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
 		    sk->sk_state == TCP_CLOSE)
-			sk_wake_async(sk, 1, POLL_HUP);
+			sk_wake_async(so, 1, POLL_HUP);
 		else
-			sk_wake_async(sk, 1, POLL_IN);
+			sk_wake_async(so, 1, POLL_IN);
 #endif
 	}
 #ifdef notyet	
@@ -1250,8 +1560,8 @@
 		
 #if 0		
 		else if (tcp_sk(sk)->linger2 < 0 &&
-			 !sock_flag(sk, ABORT_SHUTDOWN))
-			abort_conn(sk, skb, LINUX_MIB_TCPABORTONLINGER);
+			 !sock_flag(so, ABORT_SHUTDOWN))
+			abort_conn(so, skb, LINUX_MIB_TCPABORTONLINGER);
 #endif		
 		break;
 	default:
@@ -1351,6 +1661,9 @@
 	
 	toep->tp_write_seq = tp->iss = tp->snd_max = tp->snd_nxt = tp->snd_una = snd_isn;
 	assign_rxopt(so, opt);
+	toep->tp_ctloutput = so->so_proto->pr_ctloutput;
+	so->so_proto->pr_ctloutput = t3_ctloutput;
+	
 #if 0	
 	inet_sk(sk)->id = tp->write_seq ^ jiffies;
 #endif	
@@ -1406,7 +1719,7 @@
 	 */
 	if (unlikely(sk->sk_socket)) {   // simultaneous opens only
 		sk->sk_state_change(sk);
-		sk_wake_async(sk, 0, POLL_OUT);
+		sk_wake_async(so, 0, POLL_OUT);
 	}
 	/*
 	 * The state for the new connection is now up to date.
@@ -1490,7 +1803,7 @@
 	toep->tp_delack_seq = toep->tp_rcv_wup = toep->tp_copied_seq = tp->irs;
 
 	make_established(so, ntohl(req->snd_isn), ntohs(req->tcp_opt));
-
+	
 	/*
 	 * Now that we finally have a TID send any CPL messages that we had to
 	 * defer for lack of a TID.
@@ -1505,11 +1818,9 @@
 		 * appears to correspond to sorwakeup_locked
 		 */
 		sk->sk_state_change(sk);
-		sk_wake_async(sk, 0, POLL_OUT);
+		sk_wake_async(so, 0, POLL_OUT);
 #endif
 	}
-	printf("freeing %p\n", m);
-	
 	m_free(m);
 #ifdef notyet
 /*
@@ -1526,7 +1837,7 @@
 	 * them on their way.
 	 */
 	fixup_pending_writeq_buffers(sk);
-	if (t3_push_frames(sk, 1))
+	if (t3_push_frames(so, 1))
 		sk->sk_write_space(sk);
 #endif
 

==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_defs.h#6 (text+ko) ====

@@ -12,7 +12,6 @@
 void t3_listen_start(struct toedev *dev, struct socket *so, struct t3cdev *cdev);
 void t3_listen_stop(struct toedev *dev, struct socket *so, struct t3cdev *cdev);
 int t3_push_frames(struct socket *so, int req_completion);
-void t3_enable_ddp(struct socket *so, int on);
 int t3_connect(struct toedev *tdev, struct socket *so, struct ifnet *egress_ifp);
 void t3_init_listen_cpl_handlers(void);
 int t3_init_cpl_io(void);
@@ -28,4 +27,11 @@
 void toepcb_release(struct toepcb *);
 void toepcb_init(struct toepcb *);
 
+void t3_set_rcv_coalesce_enable(struct socket *so, int on_off);
+void t3_set_keepalive(struct socket *so, int on_off);
+void t3_set_ddp_tag(struct socket *so, int buf_idx, unsigned int tag);
+void t3_set_ddp_buf(struct socket *so, int buf_idx, unsigned int offset,
+		    unsigned int len);
+int t3_get_tcb(struct socket *so);
+
 #endif

==== //depot/projects/toestack/sys/dev/cxgb/ulp/tom/cxgb_tom.h#6 (text+ko) ====

@@ -1,6 +1,6 @@
 #ifndef CXGB_TOM_H_
 #define CXGB_TOM_H_
-
+#include <sys/protosw.h>
 
 #define LISTEN_INFO_HASH_SIZE 32 
 
@@ -99,8 +99,9 @@
 
 struct toepcb {
 	struct toedev *tp_toedev;
+	struct l2t_entry *tp_l2t;
+	pr_ctloutput_t *tp_ctloutput;
 	int tp_tid;
-	struct l2t_entry *tp_l2t;
 	int tp_wr_max;
 	int tp_wr_avail;
 	int tp_wr_unacked;


More information about the p4-projects mailing list