git: ea9017fb25ff - main - tcp: Congestion control move to using reference counting.

From: Randall Stewart <rrs_at_FreeBSD.org>
Date: Mon, 21 Feb 2022 11:31:11 UTC
The branch main has been updated by rrs:

URL: https://cgit.FreeBSD.org/src/commit/?id=ea9017fb25ff51fcb022a5835b1e472e57490d34

commit ea9017fb25ff51fcb022a5835b1e472e57490d34
Author:     Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2022-02-21 11:30:17 +0000
Commit:     Randall Stewart <rrs@FreeBSD.org>
CommitDate: 2022-02-21 11:30:17 +0000

    tcp: Congestion control move to using reference counting.
    
    In the transport call on 12/3 Gleb asked to move the CC modules towards
    using reference counting to prevent folks from unloading a module in use.
    It was also agreed that Michael would do a user space utility like tcp_drop
    that could be used to move all connections that are using a specific CC
    to some other CC.
    
    This is the half I committed to doing, making it so that we maintain a refcount
    on a cc module every time a pcb refers to it and decrementing that every
    time a pcb no longer uses a cc module. This also helps us simplify the
    whole unloading process by getting rid of tcp_ccunload() which munged
    through all the tcb's. Instead we mark a module as being removed and
    prevent further references to it. We also make sure that if a module is
    marked as being removed it cannot be made as the default and also
    the opposite of that, if its a default it fails and does not mark it as being
    removed.
    
    Reviewed by: Michael Tuexen, Gleb Smirnoff
    Sponsored by: Netflix Inc.
    Differential Revision: https://reviews.freebsd.org/D33249
---
 sys/netinet/cc/cc.c      | 196 +++++++++++++++++++++++++++++++----------------
 sys/netinet/cc/cc.h      |  11 +++
 sys/netinet/tcp_subr.c   |  95 +----------------------
 sys/netinet/tcp_usrreq.c |  48 ++++++------
 sys/netinet/tcp_var.h    |   1 -
 5 files changed, 171 insertions(+), 180 deletions(-)

diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c
index 647ecb47f18d..55a5f6ef652e 100644
--- a/sys/netinet/cc/cc.c
+++ b/sys/netinet/cc/cc.c
@@ -107,6 +107,45 @@ VNET_DEFINE(struct cc_algo *, default_cc_ptr) = NULL;
 VNET_DEFINE(uint32_t, newreno_beta) = 50;
 #define V_newreno_beta VNET(newreno_beta)
 
+void
+cc_refer(struct cc_algo *algo)
+{
+	CC_LIST_LOCK_ASSERT();
+	refcount_acquire(&algo->cc_refcount);
+}
+
+void
+cc_release(struct cc_algo *algo)
+{
+	CC_LIST_LOCK_ASSERT();
+	refcount_release(&algo->cc_refcount);
+}
+
+
+void
+cc_attach(struct tcpcb *tp, struct cc_algo *algo)
+{
+	/*
+	 * Attach the tcpcb to the algorithm.
+	 */
+	CC_LIST_RLOCK();
+	CC_ALGO(tp) = algo;
+	cc_refer(algo);
+	CC_LIST_RUNLOCK();
+}
+
+void
+cc_detach(struct tcpcb *tp)
+{
+	struct cc_algo *algo;
+
+	CC_LIST_RLOCK();
+	algo = CC_ALGO(tp);
+	CC_ALGO(tp) = NULL;
+	cc_release(algo);
+	CC_LIST_RUNLOCK();
+}
+
 /*
  * Sysctl handler to show and change the default CC algorithm.
  */
@@ -137,6 +176,10 @@ cc_default_algo(SYSCTL_HANDLER_ARGS)
 	STAILQ_FOREACH(funcs, &cc_list, entries) {
 		if (strncmp(default_cc, funcs->name, sizeof(default_cc)))
 			continue;
+		if (funcs->flags & CC_MODULE_BEING_REMOVED) {
+			/* Its being removed, its not eligible */
+			continue;
+		}
 		V_default_cc_ptr = funcs;
 		error = 0;
 		break;
@@ -153,12 +196,12 @@ static int
 cc_list_available(SYSCTL_HANDLER_ARGS)
 {
 	struct cc_algo *algo;
-	struct sbuf *s;
-	int err, first, nalgos;
-
-	err = nalgos = 0;
-	first = 1;
+	int error, nalgos;
+	int linesz;
+	char *buffer, *cp;
+	size_t bufsz, outsz;
 
+	error = nalgos = 0;
 	CC_LIST_RLOCK();
 	STAILQ_FOREACH(algo, &cc_list, entries) {
 		nalgos++;
@@ -167,37 +210,34 @@ cc_list_available(SYSCTL_HANDLER_ARGS)
 	if (nalgos == 0) {
 		return (ENOENT);
 	}
-	s = sbuf_new(NULL, NULL, nalgos * TCP_CA_NAME_MAX, SBUF_FIXEDLEN);
-
-	if (s == NULL)
-		return (ENOMEM);
-
-	/*
-	 * It is theoretically possible for the CC list to have grown in size
-	 * since the call to sbuf_new() and therefore for the sbuf to be too
-	 * small. If this were to happen (incredibly unlikely), the sbuf will
-	 * reach an overflow condition, sbuf_printf() will return an error and
-	 * the sysctl will fail gracefully.
-	 */
+	bufsz = (nalgos+2) * ((TCP_CA_NAME_MAX + 13) + 1);
+	buffer = malloc(bufsz, M_TEMP, M_WAITOK);
+	cp = buffer;
+
+	linesz = snprintf(cp, bufsz, "\n%-16s%c %s\n", "CCmod", 'D',
+	    "PCB count");
+	cp += linesz;
+	bufsz -= linesz;
+	outsz = linesz;
 	CC_LIST_RLOCK();
 	STAILQ_FOREACH(algo, &cc_list, entries) {
-		err = sbuf_printf(s, first ? "%s" : ", %s", algo->name);
-		if (err) {
-			/* Sbuf overflow condition. */
-			err = EOVERFLOW;
+		linesz = snprintf(cp, bufsz, "%-16s%c %u\n",
+		    algo->name,
+		    (algo == CC_DEFAULT_ALGO()) ? '*' : ' ',
+		    algo->cc_refcount);
+		if (linesz >= bufsz) {
+			error = EOVERFLOW;
 			break;
 		}
-		first = 0;
+		cp += linesz;
+		bufsz -= linesz;
+		outsz += linesz;
 	}
 	CC_LIST_RUNLOCK();
-
-	if (!err) {
-		sbuf_finish(s);
-		err = sysctl_handle_string(oidp, sbuf_data(s), 0, req);
-	}
-
-	sbuf_delete(s);
-	return (err);
+	if (error == 0)
+		error = sysctl_handle_string(oidp, buffer, outsz + 1, req);
+	free(buffer, M_TEMP);
+	return (error);
 }
 
 /*
@@ -243,41 +283,36 @@ cc_init(void)
 int
 cc_deregister_algo(struct cc_algo *remove_cc)
 {
-	struct cc_algo *funcs, *tmpfuncs;
-	int err;
-
-	err = ENOENT;
+	struct cc_algo *funcs;
+	int found = 0;
 
 	/* Remove algo from cc_list so that new connections can't use it. */
 	CC_LIST_WLOCK();
-	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
-		if (funcs == remove_cc) {
-			if (cc_check_default(remove_cc)) {
-				CC_LIST_WUNLOCK();
-				return(EBUSY);
-			}
-			break;
-		}
+	
+	/* This is unlikely to fail */
+	STAILQ_FOREACH(funcs, &cc_list, entries) {
+		if (funcs == remove_cc)
+			found = 1;
 	}
-	remove_cc->flags |= CC_MODULE_BEING_REMOVED;
-	CC_LIST_WUNLOCK();
-	err = tcp_ccalgounload(remove_cc);
-	/*
-	 * Now back through and we either remove the temp flag
-	 * or pull the registration.
-	 */
-	CC_LIST_WLOCK();
-	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
-		if (funcs == remove_cc) {
-			if (err == 0)
-				STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
-			else
-				funcs->flags &= ~CC_MODULE_BEING_REMOVED;
-			break;
-		}
+	if (found == 0) {
+		/* Nothing to remove? */
+		CC_LIST_WUNLOCK();
+		return (ENOENT);
+	}
+	/* We assert it should have been MOD_QUIESCE'd */
+	KASSERT((remove_cc->flags & CC_MODULE_BEING_REMOVED),
+		("remove_cc:%p does not have CC_MODULE_BEING_REMOVED flag", remove_cc));
+	if (cc_check_default(remove_cc)) {
+		CC_LIST_WUNLOCK();
+		return(EBUSY);
+	}
+	if (remove_cc->cc_refcount != 0) {
+		CC_LIST_WUNLOCK();
+		return (EBUSY);
 	}
+	STAILQ_REMOVE(&cc_list, remove_cc, cc_algo, entries);
 	CC_LIST_WUNLOCK();
-	return (err);
+	return (0);
 }
 
 /*
@@ -304,6 +339,9 @@ cc_register_algo(struct cc_algo *add_cc)
 			break;
 		}
 	}
+	/* Init its reference count */
+	if (err == 0)
+		refcount_init(&add_cc->cc_refcount, 0);
 	/*
 	 * The first loaded congestion control module will become
 	 * the default until we find the "CC_DEFAULT" defined in
@@ -526,6 +564,20 @@ newreno_cc_ack_received(struct cc_var *ccv, uint16_t type)
 	}
 }
 
+static int
+cc_stop_new_assignments(struct cc_algo *algo)
+{
+	CC_LIST_WLOCK();
+	if (cc_check_default(algo)) {
+		/* A default cannot be removed */
+		CC_LIST_WUNLOCK();
+		return (EBUSY);
+	}
+	algo->flags |= CC_MODULE_BEING_REMOVED;
+	CC_LIST_WUNLOCK();
+	return (0);
+}
+
 /*
  * Handles kld related events. Returns 0 on success, non-zero on failure.
  */
@@ -555,16 +607,32 @@ cc_modevent(module_t mod, int event_type, void *data)
 			err = cc_register_algo(algo);
 		break;
 
-	case MOD_QUIESCE:
 	case MOD_SHUTDOWN:
+		break;
+	case MOD_QUIESCE:
+		/* Stop any new assigments */
+		err = cc_stop_new_assignments(algo);
+		break;
 	case MOD_UNLOAD:
+		/* 
+		 * Deregister and remove the module from the list 
+		 */
+		CC_LIST_WLOCK();
+		/* Even with -f we can't unload if its the default */
+		if (cc_check_default(algo)) {
+			/* A default cannot be removed */
+			CC_LIST_WUNLOCK();
+			return (EBUSY);
+		}
+		/*
+		 * If -f was used and users are still attached to
+		 * the algorithm things are going to go boom.
+		 */
 		err = cc_deregister_algo(algo);
-		if (!err && algo->mod_destroy != NULL)
+		if ((err == 0) && (algo->mod_destroy != NULL)) {
 			algo->mod_destroy();
-		if (err == ENOENT)
-			err = 0;
+		}
 		break;
-
 	default:
 		err = EINVAL;
 		break;
diff --git a/sys/netinet/cc/cc.h b/sys/netinet/cc/cc.h
index 6f942da7aa83..1906207bddb4 100644
--- a/sys/netinet/cc/cc.h
+++ b/sys/netinet/cc/cc.h
@@ -200,6 +200,7 @@ struct cc_algo {
 	int     (*ctl_output)(struct cc_var *, struct sockopt *, void *);
 
 	STAILQ_ENTRY (cc_algo) entries;
+	u_int	cc_refcount;
 	uint8_t flags;
 };
 
@@ -236,5 +237,15 @@ void newreno_cc_after_idle(struct cc_var *);
 void newreno_cc_cong_signal(struct cc_var *, uint32_t );
 void newreno_cc_ack_received(struct cc_var *, uint16_t);
 
+/* Called to temporarily keep an algo from going away during change */
+void cc_refer(struct cc_algo *algo);
+/* Called to release the temporary hold */
+void cc_release(struct cc_algo *algo);
+
+/* Called to attach a CC algorithm to a tcpcb */
+void cc_attach(struct tcpcb *, struct cc_algo *);
+/* Called to detach a CC algorithm from a tcpcb */
+void cc_detach(struct tcpcb *);
+
 #endif /* _KERNEL */
 #endif /* _NETINET_CC_CC_H_ */
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 298ae38d5b27..95c34c581e59 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -2171,10 +2171,7 @@ tcp_newtcpcb(struct inpcb *inp)
 	/*
 	 * Use the current system default CC algorithm.
 	 */
-	CC_LIST_RLOCK();
-	KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!"));
-	CC_ALGO(tp) = CC_DEFAULT_ALGO();
-	CC_LIST_RUNLOCK();
+	cc_attach(tp, CC_DEFAULT_ALGO());
 
 	/*
 	 * The tcpcb will hold a reference on its inpcb until tcp_discardcb()
@@ -2185,6 +2182,7 @@ tcp_newtcpcb(struct inpcb *inp)
 
 	if (CC_ALGO(tp)->cb_init != NULL)
 		if (CC_ALGO(tp)->cb_init(tp->ccv, NULL) > 0) {
+			cc_detach(tp);
 			if (tp->t_fb->tfb_tcp_fb_fini)
 				(*tp->t_fb->tfb_tcp_fb_fini)(tp, 1);
 			in_pcbrele_wlocked(inp);
@@ -2276,93 +2274,6 @@ tcp_newtcpcb(struct inpcb *inp)
 	return (tp);		/* XXX */
 }
 
-/*
- * Switch the congestion control algorithm back to Vnet default for any active
- * control blocks using an algorithm which is about to go away. If the algorithm
- * has a cb_init function and it fails (no memory) then the operation fails and
- * the unload will not succeed.
- *
- */
-int
-tcp_ccalgounload(struct cc_algo *unload_algo)
-{
-	struct cc_algo *oldalgo, *newalgo;
-	struct inpcb *inp;
-	struct tcpcb *tp;
-	VNET_ITERATOR_DECL(vnet_iter);
-
-	/*
-	 * Check all active control blocks across all network stacks and change
-	 * any that are using "unload_algo" back to its default. If "unload_algo"
-	 * requires cleanup code to be run, call it.
-	 */
-	VNET_LIST_RLOCK();
-	VNET_FOREACH(vnet_iter) {
-		CURVNET_SET(vnet_iter);
-		struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo,
-		    INPLOOKUP_WLOCKPCB);
-		/*
-		 * XXXGL: would new accept(2)d connections use algo being
-		 * unloaded?
-		 */
-		newalgo = CC_DEFAULT_ALGO();
-		while ((inp = inp_next(&inpi)) != NULL) {
-			/* Important to skip tcptw structs. */
-			if (!(inp->inp_flags & INP_TIMEWAIT) &&
-			    (tp = intotcpcb(inp)) != NULL) {
-				/*
-				 * By holding INP_WLOCK here, we are assured
-				 * that the connection is not currently
-				 * executing inside the CC module's functions.
-				 * We attempt to switch to the Vnets default,
-				 * if the init fails then we fail the whole
-				 * operation and the module unload will fail.
-				 */
-				if (CC_ALGO(tp) == unload_algo) {
-					struct cc_var cc_mem;
-					int err;
-
-					oldalgo = CC_ALGO(tp);
-					memset(&cc_mem, 0, sizeof(cc_mem));
-					cc_mem.ccvc.tcp = tp;
-					if (newalgo->cb_init == NULL) {
-						/*
-						 * No init we can skip the
-						 * dance around a possible failure.
-						 */
-						CC_DATA(tp) = NULL;
-						goto proceed;
-					}
-					err = (newalgo->cb_init)(&cc_mem, NULL);
-					if (err) {
-						/*
-						 * Presumably no memory the caller will
-						 * need to try again.
-						 */
-						INP_WUNLOCK(inp);
-						CURVNET_RESTORE();
-						VNET_LIST_RUNLOCK();
-						return (err);
-					}
-proceed:
-					if (oldalgo->cb_destroy != NULL)
-						oldalgo->cb_destroy(tp->ccv);
-					CC_ALGO(tp) = newalgo;
-					memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var));
-					if (TCPS_HAVEESTABLISHED(tp->t_state) &&
-					    (CC_ALGO(tp)->conn_init != NULL)) {
-						/* Yep run the connection init for the new CC */
-						CC_ALGO(tp)->conn_init(tp->ccv);
-					}
-				}
-			}
-		}
-		CURVNET_RESTORE();
-	}
-	VNET_LIST_RUNLOCK();
-	return (0);
-}
-
 /*
  * Drop a TCP connection, reporting
  * the specified error.  If connection is synchronized,
@@ -2443,6 +2354,8 @@ tcp_discardcb(struct tcpcb *tp)
 	if (CC_ALGO(tp)->cb_destroy != NULL)
 		CC_ALGO(tp)->cb_destroy(tp->ccv);
 	CC_DATA(tp) = NULL;
+	/* Detach from the CC algorithm */
+	cc_detach(tp);
 
 #ifdef TCP_HHOOK
 	khelp_destroy_osd(tp->osd);
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index ac9e79aed917..8c8cba8ea5ad 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -2004,7 +2004,7 @@ copyin_tls_enable(struct sockopt *sopt, struct tls_enable *tls)
 extern struct cc_algo newreno_cc_algo;
 
 static int
-tcp_congestion(struct inpcb *inp, struct sockopt *sopt)
+tcp_set_cc_mod(struct inpcb *inp, struct sockopt *sopt)
 {
 	struct cc_algo *algo;
 	void *ptr = NULL;
@@ -2020,7 +2020,7 @@ tcp_congestion(struct inpcb *inp, struct sockopt *sopt)
 		return(error);
 	buf[sopt->sopt_valsize] = '\0';
 	CC_LIST_RLOCK();
-	STAILQ_FOREACH(algo, &cc_list, entries)
+	STAILQ_FOREACH(algo, &cc_list, entries) {
 		if (strncmp(buf, algo->name,
 			    TCP_CA_NAME_MAX) == 0) {
 			if (algo->flags & CC_MODULE_BEING_REMOVED) {
@@ -2029,30 +2029,24 @@ tcp_congestion(struct inpcb *inp, struct sockopt *sopt)
 			}
 			break;
 		}
+	}
 	if (algo == NULL) {
 		CC_LIST_RUNLOCK();
 		return(ESRCH);
 	}
-do_over:
+	/* 
+	 * With a reference the algorithm cannot be removed
+	 * so we hold a reference through the change process.
+	 */
+	cc_refer(algo);
+	CC_LIST_RUNLOCK();
 	if (algo->cb_init != NULL) {
 		/* We can now pre-get the memory for the CC */
 		mem_sz = (*algo->cc_data_sz)();
 		if (mem_sz == 0) {
 			goto no_mem_needed;
 		}
-		CC_LIST_RUNLOCK();
 		ptr = malloc(mem_sz, M_CC_MEM, M_WAITOK);
-		CC_LIST_RLOCK();
-		STAILQ_FOREACH(algo, &cc_list, entries)
-			if (strncmp(buf, algo->name,
-				    TCP_CA_NAME_MAX) == 0)
-				break;
-		if (algo == NULL) {
-			if (ptr)
-				free(ptr, M_CC_MEM);
-			CC_LIST_RUNLOCK();
-			return(ESRCH);
-		}
 	} else {
 no_mem_needed:
 		mem_sz = 0;
@@ -2063,22 +2057,20 @@ no_mem_needed:
 	 * back the inplock.
 	 */
 	memset(&cc_mem, 0, sizeof(cc_mem));
-	if (mem_sz != (*algo->cc_data_sz)()) {
-		if (ptr)
-			free(ptr, M_CC_MEM);
-		goto do_over;
-	}
 	INP_WLOCK(inp);
 	if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
 		INP_WUNLOCK(inp);
+		if (ptr)
+			free(ptr, M_CC_MEM);
+		/* Release our temp reference */
+		CC_LIST_RLOCK();
+		cc_release(algo);
 		CC_LIST_RUNLOCK();
-		free(ptr, M_CC_MEM);
 		return (ECONNRESET);
 	}
 	tp = intotcpcb(inp);
 	if (ptr != NULL)
 		memset(ptr, 0, mem_sz);
-	CC_LIST_RUNLOCK();
 	cc_mem.ccvc.tcp = tp;
 	/*
 	 * We once again hold a write lock over the tcb so it's
@@ -2103,8 +2095,12 @@ no_mem_needed:
 		 */
 		if (CC_ALGO(tp)->cb_destroy != NULL)
 			CC_ALGO(tp)->cb_destroy(tp->ccv);
+		/* Detach the old CC from the tcpcb  */
+		cc_detach(tp);
+		/* Copy in our temp memory that was inited */
 		memcpy(tp->ccv, &cc_mem, sizeof(struct cc_var));
-		tp->cc_algo = algo;
+		/* Now attach the new, which takes a reference */
+		cc_attach(tp, algo);
 		/* Ok now are we where we have gotten past any conn_init? */
 		if (TCPS_HAVEESTABLISHED(tp->t_state) && (CC_ALGO(tp)->conn_init != NULL)) {
 			/* Yep run the connection init for the new CC */
@@ -2113,6 +2109,10 @@ no_mem_needed:
 	} else if (ptr)
 		free(ptr, M_CC_MEM);
 	INP_WUNLOCK(inp);
+	/* Now lets release our temp reference */
+	CC_LIST_RLOCK();
+	cc_release(algo);
+	CC_LIST_RUNLOCK();
 	return (error);
 }
 
@@ -2336,7 +2336,7 @@ unlock_and_done:
 			break;
 
 		case TCP_CONGESTION:
-			error = tcp_congestion(inp, sopt);
+			error = tcp_set_cc_mod(inp, sopt);
 			break;
 
 		case TCP_REUSPORT_LB_NUMA:
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 6b1f79a0b9ed..b9d37471771c 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -1076,7 +1076,6 @@ VNET_DECLARE(struct hhook_head *, tcp_hhh[HHOOK_TCP_LAST + 1]);
 #endif
 
 int	 tcp_addoptions(struct tcpopt *, u_char *);
-int	 tcp_ccalgounload(struct cc_algo *unload_algo);
 struct tcpcb *
 	 tcp_close(struct tcpcb *);
 void	 tcp_discardcb(struct tcpcb *);