svn commit: r304223 - in head: share/man/man4 share/man/man9 sys/netinet

Randall Stewart rrs at FreeBSD.org
Tue Aug 16 15:11:48 UTC 2016


Author: rrs
Date: Tue Aug 16 15:11:46 2016
New Revision: 304223
URL: https://svnweb.freebsd.org/changeset/base/304223

Log:
  Here we update the modular TCP to be able to switch to an
  alternate TCP stack in other than the closed state (pre-listen/connect).
  The idea is that *if* that is supported by the alternate stack, it
  is asked if it's ok to switch. If it approves the "handoff" then we
  allow the switch to happen. Also the fini() function now gets a flag
  to tell if you are switching away *or* the tcb is destroyed. The
  init() call into the alternate stack is moved to the end so the
  tcb is more fully formed before the init transpires.
  
  Sponsored by:	Netflix Inc.
  Differential Revision:	D6790

Modified:
  head/share/man/man4/tcp.4
  head/share/man/man9/tcp_functions.9
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_syncache.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h

Modified: head/share/man/man4/tcp.4
==============================================================================
--- head/share/man/man4/tcp.4	Tue Aug 16 14:33:25 2016	(r304222)
+++ head/share/man/man4/tcp.4	Tue Aug 16 15:11:46 2016	(r304223)
@@ -633,7 +633,8 @@ when trying to use a TCP function block 
 .Xr mod_cc 4 ,
 .Xr siftr 4 ,
 .Xr syncache 4 ,
-.Xr setkey 8
+.Xr setkey 8 ,
+.Xr tcp_functions 9
 .Rs
 .%A "V. Jacobson"
 .%A "R. Braden"

Modified: head/share/man/man9/tcp_functions.9
==============================================================================
--- head/share/man/man9/tcp_functions.9	Tue Aug 16 14:33:25 2016	(r304222)
+++ head/share/man/man9/tcp_functions.9	Tue Aug 16 15:11:46 2016	(r304223)
@@ -114,14 +114,17 @@ struct tcp_function_block {
 			    struct inpcb *inp, struct tcpcb *tp);
 	/* Optional memory allocation/free routine */
 	void	(*tfb_tcp_fb_init)(struct tcpcb *);
-	void	(*tfb_tcp_fb_fini)(struct tcpcb *);
+	void	(*tfb_tcp_fb_fini)(struct tcpcb *, int);
 	/* Optional timers, must define all if you define one */
 	int	(*tfb_tcp_timer_stop_all)(struct tcpcb *);
 	void	(*tfb_tcp_timer_activate)(struct tcpcb *,
 			    uint32_t, u_int);
 	int	(*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
 	void	(*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
+	/* Optional functions */
 	void	(*tfb_tcp_rexmit_tmr)(struct tcpcb *);
+	int	(*tfb_tcp_handoff_ok)(struct tcpcb *);
+	/* System use */
 	volatile uint32_t tfb_refcnt;
 	uint32_t  tfb_flags;
 };
@@ -157,6 +160,16 @@ in the
 .Va tfb_tcp_fb_fini
 field.
 .Pp
+If the
+.Va tfb_tcp_fb_fini
+argument is non-NULL, the function to which it points is called when the
+kernel is destroying the TCP control block or when the socket is transitioning
+to use a different TCP stack.
+The function is called with arguments of the TCP control block and an integer
+flag.
+The flag will be zero if the socket is transitioning to use another TCP stack
+or one if the TCP control block is being destroyed.
+.Pp
 If the TCP stack implements additional timers, the TCP stack should set a
 non-NULL pointer in the
 .Va tfb_tcp_timer_stop_all ,
@@ -193,6 +206,37 @@ However, care must be taken to ensure th
 TCP control block in a valid state for the remainder of the retransmit
 timer logic.
 .Pp
+A user may select a new TCP stack before calling
+.Xr connect 2
+or
+.Xr listen 2 .
+Optionally, a TCP stack may also allow a user to begin using the TCP stack for
+a connection that is in a later state by setting a non-NULL function pointer in
+the
+.Va tfb_tcp_handoff_ok
+field.
+If this field is non-NULL and a user attempts to select that TCP stack after
+calling
+.Xr connect 2
+or
+.Xr listen 2
+for that socket, the kernel will call the function pointed to by the
+.Va tfb_tcp_handoff_ok
+field.
+The function should return 0 if the user is allowed to switch the socket to use
+the TCP stack. Otherwise, the function should return an error code, which will
+be returned to the user.
+If the
+.Va tfb_tcp_handoff_ok
+field is
+.Dv NULL
+and a user attempts to select the TCP stack after calling
+.Xr connect 2
+or
+.Xr listen 2
+for that socket, the operation will fail and the kernel will return
+.Er EINVAL .
+.Pp
 The
 .Va tfb_refcnt
 and
@@ -269,8 +313,10 @@ The
 .Fa blk
 argument references a function block that is not currently registered.
 .Sh SEE ALSO
-.Xr malloc 9 ,
-.Xr tcp 4
+.Xr connect 2 ,
+.Xr listen 2 ,
+.Xr tcp 4 ,
+.Xr malloc 9
 .Sh HISTORY
 This framework first appeared in
 .Fx 11.0 .

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c	Tue Aug 16 14:33:25 2016	(r304222)
+++ head/sys/netinet/tcp_subr.c	Tue Aug 16 15:11:46 2016	(r304223)
@@ -1187,9 +1187,6 @@ tcp_newtcpcb(struct inpcb *inp)
 	tp->t_fb = tcp_func_set_ptr;
 	refcount_acquire(&tp->t_fb->tfb_refcnt);
 	rw_runlock(&tcp_function_lock);
-	if (tp->t_fb->tfb_tcp_fb_init) {
-		(*tp->t_fb->tfb_tcp_fb_init)(tp);
-	}
 	/*
 	 * Use the current system default CC algorithm.
 	 */
@@ -1201,7 +1198,7 @@ tcp_newtcpcb(struct inpcb *inp)
 	if (CC_ALGO(tp)->cb_init != NULL)
 		if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
 			if (tp->t_fb->tfb_tcp_fb_fini)
-				(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+				(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 			refcount_release(&tp->t_fb->tfb_refcnt);
 			uma_zfree(V_tcpcb_zone, tm);
 			return (NULL);
@@ -1210,7 +1207,7 @@ tcp_newtcpcb(struct inpcb *inp)
 	tp->osd = &tm->osd;
 	if (khelp_init_osd(HELPER_CLASS_TCP, tp->osd)) {
 		if (tp->t_fb->tfb_tcp_fb_fini)
-			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		uma_zfree(V_tcpcb_zone, tm);
 		return (NULL);
@@ -1271,6 +1268,9 @@ tcp_newtcpcb(struct inpcb *inp)
 	 */
 	tcp_pcap_tcpcb_init(tp);
 #endif
+	if (tp->t_fb->tfb_tcp_fb_init) {
+		(*tp->t_fb->tfb_tcp_fb_init)(tp);
+	}
 	return (tp);		/* XXX */
 }
 
@@ -1484,7 +1484,7 @@ tcp_discardcb(struct tcpcb *tp)
 	if (tp->t_timers->tt_draincnt == 0) {
 		/* We own the last reference on tcpcb, let's free it. */
 		if (tp->t_fb->tfb_tcp_fb_fini)
-			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_inpcb = NULL;
 		uma_zfree(V_tcpcb_zone, tp);
@@ -1513,7 +1513,7 @@ tcp_timer_discard(void *ptp)
 	if (tp->t_timers->tt_draincnt == 0) {
 		/* We own the last reference on this tcpcb, let's free it. */
 		if (tp->t_fb->tfb_tcp_fb_fini)
-			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_inpcb = NULL;
 		uma_zfree(V_tcpcb_zone, tp);

Modified: head/sys/netinet/tcp_syncache.c
==============================================================================
--- head/sys/netinet/tcp_syncache.c	Tue Aug 16 14:33:25 2016	(r304222)
+++ head/sys/netinet/tcp_syncache.c	Tue Aug 16 15:11:46 2016	(r304223)
@@ -842,7 +842,7 @@ syncache_socket(struct syncache *sc, str
 		KASSERT(rblk != NULL,
 		    ("cannot find blk %p out of syncache?", blk));
 		if (tp->t_fb->tfb_tcp_fb_fini)
-			(*tp->t_fb->tfb_tcp_fb_fini)(tp);
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
 		refcount_release(&tp->t_fb->tfb_refcnt);
 		tp->t_fb = rblk;
 		if (tp->t_fb->tfb_tcp_fb_init) {

Modified: head/sys/netinet/tcp_usrreq.c
==============================================================================
--- head/sys/netinet/tcp_usrreq.c	Tue Aug 16 14:33:25 2016	(r304222)
+++ head/sys/netinet/tcp_usrreq.c	Tue Aug 16 15:11:46 2016	(r304223)
@@ -1420,40 +1420,59 @@ tcp_ctloutput(struct socket *so, struct 
 		if (error)
 			return (error);
 		INP_WLOCK_RECHECK(inp);
-		if (tp->t_state != TCPS_CLOSED) {
-			/* 
-			 * The user has advanced the state
-			 * past the initial point, we can't
-			 * switch since we are down the road
-			 * and a new set of functions may
-			 * not be compatibile.
-			 */
-			INP_WUNLOCK(inp);
-			return(EINVAL);
-		}
 		blk = find_and_ref_tcp_functions(&fsn);
 		if (blk == NULL) {
 			INP_WUNLOCK(inp);
 			return (ENOENT);
 		}
-		if (tp->t_fb != blk) {
-			if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
+		if (tp->t_fb == blk) {
+			/* You already have this */
+			refcount_release(&blk->tfb_refcnt);
+			INP_WUNLOCK(inp);
+			return (0);
+		}
+		if (tp->t_state != TCPS_CLOSED) {
+			int error=EINVAL;
+			/* 
+			 * The user has advanced the state
+			 * past the initial point, we may not
+			 * be able to switch. 
+			 */
+			if (blk->tfb_tcp_handoff_ok != NULL) {
+				/* 
+				 * If the stack provides a
+				 * query mechanism, the switch
+				 * may still be possible.
+				 */
+				error = (*blk->tfb_tcp_handoff_ok)(tp);
+			}
+			if (error) {
 				refcount_release(&blk->tfb_refcnt);
 				INP_WUNLOCK(inp);
-				return (ENOENT);
+				return(error);
 			}
+		}
+		if (blk->tfb_flags & TCP_FUNC_BEING_REMOVED) {
+			refcount_release(&blk->tfb_refcnt);
+			INP_WUNLOCK(inp);
+			return (ENOENT);
+		}
+		/* 
+		 * Release the old refcnt, the
+		 * lookup acquired a ref on the
+		 * new one already.
+		 */
+		if (tp->t_fb->tfb_tcp_fb_fini) {
 			/* 
-			 * Release the old refcnt, the
-			 * lookup acquires a ref on the
-			 * new one.
+			 * Tell the stack to cleanup with 0 i.e.
+			 * the tcb is not going away.
 			 */
-			if (tp->t_fb->tfb_tcp_fb_fini)
-				(*tp->t_fb->tfb_tcp_fb_fini)(tp);
-			refcount_release(&tp->t_fb->tfb_refcnt);
-			tp->t_fb = blk;
-			if (tp->t_fb->tfb_tcp_fb_init) {
-				(*tp->t_fb->tfb_tcp_fb_init)(tp);
-			}
+			(*tp->t_fb->tfb_tcp_fb_fini)(tp, 0);
+		}
+		refcount_release(&tp->t_fb->tfb_refcnt);
+		tp->t_fb = blk;
+		if (tp->t_fb->tfb_tcp_fb_init) {
+			(*tp->t_fb->tfb_tcp_fb_init)(tp);
 		}
 #ifdef TCP_OFFLOAD
 		if (tp->t_flags & TF_TOE) {

Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h	Tue Aug 16 14:33:25 2016	(r304222)
+++ head/sys/netinet/tcp_var.h	Tue Aug 16 15:11:46 2016	(r304223)
@@ -116,6 +116,18 @@ struct socket;
  * does not know your callbacks you must provide a
  * stop_all function that loops through and calls
  * tcp_timer_stop() with each of your defined timers.
+ * Adding a tfb_tcp_handoff_ok function allows the socket
+ * option to change stacks to query you even if the
+ * connection is in a later stage. You return 0 to
+ * say you can take over and run your stack, you return
+ * non-zero (an error number) to say no you can't.
+ * If the function is undefined you can only change
+ * in the early states (before connect or listen).
+ * tfb_tcp_fb_fini is changed to add a flag to tell
+ * the old stack if the tcb is being destroyed or
+ * not. A one in the flag means the TCB is being
+ * destroyed, a zero indicates it is transitioning to
+ * another stack (via socket option).
  */
 struct tcp_function_block {
 	char tfb_tcp_block_name[TCP_FUNCTION_NAME_LEN_MAX];
@@ -128,7 +140,7 @@ struct tcp_function_block {
 			    struct inpcb *inp, struct tcpcb *tp);
 	/* Optional memory allocation/free routine */
 	void	(*tfb_tcp_fb_init)(struct tcpcb *);
-	void	(*tfb_tcp_fb_fini)(struct tcpcb *);
+	void	(*tfb_tcp_fb_fini)(struct tcpcb *, int);
 	/* Optional timers, must define all if you define one */
 	int	(*tfb_tcp_timer_stop_all)(struct tcpcb *);
 	void	(*tfb_tcp_timer_activate)(struct tcpcb *,
@@ -136,6 +148,7 @@ struct tcp_function_block {
 	int	(*tfb_tcp_timer_active)(struct tcpcb *, uint32_t);
 	void	(*tfb_tcp_timer_stop)(struct tcpcb *, uint32_t);
 	void	(*tfb_tcp_rexmit_tmr)(struct tcpcb *);
+	int	(*tfb_tcp_handoff_ok)(struct tcpcb *);
 	volatile uint32_t tfb_refcnt;
 	uint32_t  tfb_flags;
 };


More information about the svn-src-all mailing list