svn commit: r357819 - head/sys/netinet

Randall Stewart rrs at FreeBSD.org
Wed Feb 12 13:37:54 UTC 2020


Author: rrs
Date: Wed Feb 12 13:37:53 2020
New Revision: 357819
URL: https://svnweb.freebsd.org/changeset/base/357819

Log:
  Oops, committed the wrong ratelimit version in the
  whitespace cleanup. Restore it to the proper version.
  
  Sponsored by:	Netflix Inc.

Modified:
  head/sys/netinet/tcp_ratelimit.c
  head/sys/netinet/tcp_ratelimit.h

Modified: head/sys/netinet/tcp_ratelimit.c
==============================================================================
--- head/sys/netinet/tcp_ratelimit.c	Wed Feb 12 13:31:36 2020	(r357818)
+++ head/sys/netinet/tcp_ratelimit.c	Wed Feb 12 13:37:53 2020	(r357819)
@@ -49,11 +49,9 @@ __FBSDID("$FreeBSD$");
 #include <sys/eventhandler.h>
 #include <sys/mutex.h>
 #include <sys/ck.h>
-#include <net/if.h>
-#include <net/if_var.h>
+#define TCPSTATES		/* for logging */
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
-#define TCPSTATES		/* for logging */
 #include <netinet/tcp_var.h>
 #ifdef INET6
 #include <netinet6/tcp6_var.h>
@@ -66,186 +64,45 @@ __FBSDID("$FreeBSD$");
  * For the purposes of each send, what is the size
  * of an ethernet frame.
  */
+#ifndef ETHERNET_SEGMENT_SIZE
+#define ETHERNET_SEGMENT_SIZE 1500
+#endif
 MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
 #ifdef RATELIMIT
 
-/*
- * The following preferred table will seem weird to
- * the casual viewer. Why do we not have any rates below
- * 1Mbps? Why do we have a rate at 1.44Mbps called common?
- * Why do the rates cluster in the 1-100Mbps range more
- * than others? Why does the table jump around at the beginnign
- * and then be more consistently raising?
- *
- * Let me try to answer those questions. A lot of
- * this is dependant on the hardware. We have three basic
- * supporters of rate limiting
- *
- * Chelsio - Supporting 16 configurable rates.
- * Mlx  - c4 supporting 13 fixed rates.
- * Mlx  - c5 & c6 supporting 127 configurable rates.
- *
- * The c4 is why we have a common rate that is available
- * in all rate tables. This is a selected rate from the
- * c4 table and we assure its available in all ratelimit
- * tables. This way the tcp_ratelimit code has an assured
- * rate it should always be able to get. This answers a
- * couple of the questions above.
- *
- * So what about the rest, well the table is built to
- * try to get the most out of a joint hardware/software
- * pacing system.  The software pacer will always pick
- * a rate higher than the b/w that it is estimating
- *
- * on the path. This is done for two reasons.
- * a) So we can discover more b/w
- * and
- * b) So we can send a block of MSS's down and then
- *    have the software timer go off after the previous
- *    send is completely out of the hardware.
- *
- * But when we do <b> we don't want to have the delay
- * between the last packet sent by the hardware be
- * excessively long (to reach our desired rate).
- *
- * So let me give an example for clarity.
- *
- * Lets assume that the tcp stack sees that 29,110,000 bps is
- * what the bw of the path is. The stack would select the
- * rate 31Mbps. 31Mbps means that each send that is done
- * by the hardware will cause a 387 micro-second gap between
- * the pacets sent at that rate. For 29,110,000 bps we
- * would need 412 micro-seconds gap between each send.
- *
- * Now we pick a MSS size based on the delta between the
- * two rates (412 - 387) divided into the rate we really
- * wish to send at rounded up.  That results in a MSS
- * send of 17 mss's at once. The hardware then will
- * run out of data in a single 17MSS send in 6,579 micro-seconds.
- * On the other hand the software pacer will send more data
- * in 7,004 micro-seconds. This means that we will refill
- * the hardware 25 microseconds after it would have sent
- * next. This is a win since we no are only sending every
- * 7ms or so and yet all the packets are spaced on
- * the wire with 94% of what they should be and only
- * the last packet is delayed extra to make up for the
- * difference.  Note that the above formula has two
- * important caveat. If we are above (b/w wise) over
- * 100Mbps we double the result of the MSS calculation.
- * The second caveat is if we are 500Mbps or more
- * we just send the maximum MSS at once i.e. 45MSS
- *
- */
 #define COMMON_RATE 180500
 uint64_t desired_rates[] = {
-	122500,			/* 1Mbps  - rate 1 */
-	180500,			/* 1.44Mpbs - rate 2  common rate */
-	375000,			/* 3Mbps    - rate 3 */
-	625000,			/* 5Mbps    - rate 4 */
-	875000,			/* 7Mbps    - rate 5 */
-	1125000,		/* 9Mbps    - rate 6 */
-	1375000,		/* 11Mbps   - rate 7 */
-	1625000,	       	/* 13Mbps   - rate 8 */
-	2625000,		/* 21Mbps   - rate 9 */
-	3875000,		/* 31Mbps   - rate 10 */
-	5125000,		/* 41Meg    - rate 11 */
-	12500000,		/* 100Mbps  - rate 12 */
-	25000000,		/* 200Mbps  - rate 13 */
-	50000000,		/* 400Mbps  - rate 14 */
-	63750000,		/* 51Mbps   - rate 15 */
-	100000000,		/* 800Mbps  - rate 16 */
-	1875000,		/* 15Mbps   - rate 17 */
-	2125000,		/* 17Mbps   - rate 18 */
-	2375000,		/* 19Mbps   - rate 19 */
-	2875000,		/* 23Mbps   - rate 20 */
-	3125000,		/* 25Mbps   - rate 21 */
-	3375000,		/* 27Mbps   - rate 22 */
-	3625000,		/* 29Mbps   - rate 23 */
-	4125000,		/* 33Mbps   - rate 24 */
-	4375000,		/* 35Mbps   - rate 25 */
-	4625000,		/* 37Mbps   - rate 26 */
-	4875000,		/* 39Mbps   - rate 27 */
-	5375000,		/* 43Mbps   - rate 28 */
-	5625000,		/* 45Mbps   - rate 29 */
-	5875000,		/* 47Mbps   - rate 30 */
-	6125000,		/* 49Mbps   - rate 31 */
-	6625000,		/* 53Mbps   - rate 32 */
-	6875000,		/* 55Mbps   - rate 33 */
-	7125000,		/* 57Mbps   - rate 34 */
-	7375000,		/* 59Mbps   - rate 35 */
-	7625000,		/* 61Mbps   - rate 36 */
-	7875000,		/* 63Mbps   - rate 37 */
-	8125000,		/* 65Mbps   - rate 38 */
-	8375000,		/* 67Mbps   - rate 39 */
-	8625000,		/* 69Mbps   - rate 40 */
-	8875000,		/* 71Mbps   - rate 41 */
-	9125000,		/* 73Mbps   - rate 42 */
-	9375000,		/* 75Mbps   - rate 43 */
-	9625000,		/* 77Mbps   - rate 44 */
-	9875000,		/* 79Mbps   - rate 45 */
-	10125000,		/* 81Mbps   - rate 46 */
-	10375000,		/* 83Mbps   - rate 47 */
-	10625000,		/* 85Mbps   - rate 48 */
-	10875000,		/* 87Mbps   - rate 49 */
-	11125000,		/* 89Mbps   - rate 50 */
-	11375000,		/* 91Mbps   - rate 51 */
-	11625000,		/* 93Mbps   - rate 52 */
-	11875000,		/* 95Mbps   - rate 53 */
-	13125000,		/* 105Mbps  - rate 54 */
-	13750000,		/* 110Mbps  - rate 55 */
-	14375000,		/* 115Mbps  - rate 56 */
-	15000000,		/* 120Mbps  - rate 57 */
-	15625000,		/* 125Mbps  - rate 58 */
-	16250000,		/* 130Mbps  - rate 59 */
-	16875000,		/* 135Mbps  - rate 60 */
-	17500000,		/* 140Mbps  - rate 61 */
-	18125000,		/* 145Mbps  - rate 62 */
-	18750000,		/* 150Mbps  - rate 64 */
-	20000000,		/* 160Mbps  - rate 65 */
-	21250000,		/* 170Mbps  - rate 66 */
-	22500000,		/* 180Mbps  - rate 67 */
-	23750000,		/* 190Mbps  - rate 68 */
-	26250000,		/* 210Mbps  - rate 69 */
-	27500000,		/* 220Mbps  - rate 70 */
-	28750000,		/* 230Mbps  - rate 71 */
-	30000000,	       	/* 240Mbps  - rate 72 */
-	31250000,		/* 250Mbps  - rate 73 */
-	34375000,		/* 275Mbps  - rate 74 */
-	37500000,		/* 300Mbps  - rate 75 */
-	40625000,		/* 325Mbps  - rate 76 */
-	43750000,		/* 350Mbps  - rate 77 */
-	46875000,		/* 375Mbps  - rate 78 */
-	53125000,		/* 425Mbps  - rate 79 */
-	56250000,		/* 450Mbps  - rate 80 */
-	59375000,		/* 475Mbps  - rate 81 */
-	62500000,		/* 500Mbps  - rate 82 */
-	68750000,		/* 550Mbps  - rate 83 */
-	75000000,		/* 600Mbps  - rate 84 */
-	81250000,		/* 650Mbps  - rate 85 */
-	87500000,		/* 700Mbps  - rate 86 */
-	93750000,		/* 750Mbps  - rate 87 */
-	106250000,		/* 850Mbps  - rate 88 */
-	112500000,		/* 900Mbps  - rate 89 */
-	125000000,		/* 1Gbps    - rate 90 */
-	156250000,		/* 1.25Gps  - rate 91 */
-	187500000,		/* 1.5Gps   - rate 92 */
-	218750000,		/* 1.75Gps  - rate 93 */
-	250000000,		/* 2Gbps    - rate 94 */
-	281250000,		/* 2.25Gps  - rate 95 */
-	312500000,		/* 2.5Gbps  - rate 96 */
-	343750000,		/* 2.75Gbps - rate 97 */
-	375000000,		/* 3Gbps    - rate 98 */
-	500000000,		/* 4Gbps    - rate 99 */
-	625000000,		/* 5Gbps    - rate 100 */
-	750000000,		/* 6Gbps    - rate 101 */
-	875000000,		/* 7Gbps    - rate 102 */
-	1000000000,		/* 8Gbps    - rate 103 */
-	1125000000,		/* 9Gbps    - rate 104 */
-	1250000000,		/* 10Gbps   - rate 105 */
-	1875000000,		/* 15Gbps   - rate 106 */
-	2500000000		/* 20Gbps   - rate 107 */
+	62500,			/* 500Kbps */
+	180500,			/* 1.44Mpbs */
+	375000,			/* 3Mbps */
+	500000,			/* 4Mbps */
+	625000,			/* 5Mbps */
+	750000,			/* 6Mbps */
+	1000000,		/* 8Mbps */
+	1250000,		/* 10Mbps */
+	2500000,		/* 20Mbps */
+	3750000,		/* 30Mbps */
+	5000000,		/* 40Meg */
+	6250000,		/* 50Mbps */
+	12500000,		/* 100Mbps */
+	25000000,		/* 200Mbps */
+	50000000,		/* 400Mbps */
+	100000000,		/* 800Mbps */
+	12500,			/* 100kbps */
+	25000,			/* 200kbps */
+	875000,			/* 7Mbps */
+	1125000,		/* 9Mbps */
+	1875000,		/* 15Mbps */
+	3125000,		/* 25Mbps */
+	8125000,		/* 65Mbps */
+	10000000,		/* 80Mbps */
+	18750000,		/* 150Mbps */
+	20000000,		/* 250Mbps */
+	37500000,		/* 350Mbps */
+	62500000,		/* 500Mbps */
+	78125000,		/* 625Mbps */
+	125000000,		/* 1Gbps */
 };
-
 #define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
 #define RS_ORDERED_COUNT 16	/*
 				 * Number that are in order
@@ -427,7 +284,7 @@ rs_defer_destroy(struct tcp_rate_set *rs)
 
 	/* Set flag to only defer once. */
 	rs->rs_flags |= RS_FUNERAL_SCHD;
-	NET_EPOCH_CALL(rs_destroy, &rs->rs_epoch_ctx);
+	epoch_call(net_epoch, &rs->rs_epoch_ctx, rs_destroy);
 }
 
 #ifdef INET
@@ -522,24 +379,16 @@ rt_setup_new_rs(struct ifnet *ifp, int *error)
 		 * We can do nothing if we cannot
 		 * get a query back from the driver.
 		 */
-		printf("No query functions for %s:%d-- failed\n",
-		       ifp->if_dname, ifp->if_dunit);
 		return (NULL);
 	}
 	rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
 	if (rs == NULL) {
 		if (error)
 			*error = ENOMEM;
-		printf("No memory for malloc\n");
 		return (NULL);
 	}
-	memset(&rl, 0, sizeof(rl));
 	rl.flags = RT_NOSUPPORT;
 	ifp->if_ratelimit_query(ifp, &rl);
-	printf("if:%s:%d responds with flags:0x%x rate count:%d\n",
-	       ifp->if_dname,
-	       ifp->if_dunit,
-	       rl.flags, rl.number_of_rates);
 	if (rl.flags & RT_IS_UNUSABLE) {
 		/*
 		 * The interface does not really support
@@ -582,7 +431,7 @@ rt_setup_new_rs(struct ifnet *ifp, int *error)
 		mtx_unlock(&rs_mtx);
 		return (rs);
 	} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
-		/* Mellanox C4 likely */
+		/* Mellanox most likely */
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_rate_cnt = rl.number_of_rates;
@@ -593,7 +442,7 @@ rt_setup_new_rs(struct ifnet *ifp, int *error)
 		rs->rs_disable = 0;
 		rate_table_act = rl.rate_table;
 	} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
-		/* Chelsio, C5 and C6 of Mellanox? */
+		/* Chelsio */
 		rs->rs_ifp = ifp;
 		rs->rs_if_dunit = ifp->if_dunit;
 		rs->rs_rate_cnt = rl.number_of_rates;
@@ -685,14 +534,6 @@ bail:
 			rs->rs_lowest_valid = i;
 		} else {
 			int err;
-
-			if ((rl.flags & RT_IS_SETUP_REQ)  &&
-			    (ifp->if_ratelimit_query)) {
-				err = ifp->if_ratelimit_setup(ifp,
-  				         rs->rs_rlt[i].rate, i);
-				if (err)
-					goto handle_err;
-			}
 #ifdef RSS
 			hash_type = M_HASHTYPE_RSS_TCP_IPV4;
 #else
@@ -704,7 +545,6 @@ bail:
 			    rs->rs_rlt[i].rate,
 			    &rs->rs_rlt[i].tag);
 			if (err) {
-handle_err:
 				if (i == (rs->rs_rate_cnt - 1)) {
 					/*
 					 * Huh - first rate and we can't get
@@ -1038,7 +878,7 @@ rt_setup_rate(struct inpcb *inp, struct ifnet *ifp, ui
 	struct epoch_tracker et;
 	int err;
 
-	NET_EPOCH_ENTER(et);
+	epoch_enter_preempt(net_epoch_preempt, &et);
 use_real_interface:
 	CK_LIST_FOREACH(rs, &int_rs, next) {
 		/*
@@ -1071,14 +911,14 @@ use_real_interface:
 		 */
 		if (rs->rs_disable && error)
 			*error = ENODEV;
-		NET_EPOCH_EXIT(et);
+		epoch_exit_preempt(net_epoch_preempt, &et);
 		return (NULL);
 	}
 
 	if ((rs == NULL) || (rs->rs_disable != 0)) {
 		if (rs->rs_disable && error)
 			*error = ENOSPC;
-		NET_EPOCH_EXIT(et);
+		epoch_exit_preempt(net_epoch_preempt, &et);
 		return (NULL);
 	}
 	if (rs->rs_flags & RS_IS_DEFF) {
@@ -1089,7 +929,7 @@ use_real_interface:
 		if (tifp == NULL) {
 			if (rs->rs_disable && error)
 				*error = ENOTSUP;
-			NET_EPOCH_EXIT(et);
+			epoch_exit_preempt(net_epoch_preempt, &et);
 			return (NULL);
 		}
 		goto use_real_interface;
@@ -1098,7 +938,7 @@ use_real_interface:
 	    ((rs->rs_flows_using + 1) > rs->rs_flow_limit)) {
 		if (error)
 			*error = ENOSPC;
-		NET_EPOCH_EXIT(et);
+		epoch_exit_preempt(net_epoch_preempt, &et);
 		return (NULL);
 	}
 	rte = tcp_find_suitable_rate(rs, bytes_per_sec, flags);
@@ -1122,7 +962,7 @@ use_real_interface:
 		 */
 		atomic_add_64(&rs->rs_flows_using, 1);
 	}
-	NET_EPOCH_EXIT(et);
+	epoch_exit_preempt(net_epoch_preempt, &et);
 	return (rte);
 }
 
@@ -1138,22 +978,13 @@ tcp_rl_ifnet_link(void *arg __unused, struct ifnet *if
 		 * We only care on an interface going up that is rate-limit
 		 * capable.
 		 */
-		printf("ifp:%s.%d does not support rate-limit(0x%x) or link_state is not UP(state:%d)\n",
-		       ifp->if_dname,
-		       ifp->if_dunit,
-		       ifp->if_capabilities,
-		       link_state);
 		return;
 	}
 	mtx_lock(&rs_mtx);
-	printf("Link UP on interface %s.%d\n",
-	       ifp->if_dname,
-	       ifp->if_dunit);
 	CK_LIST_FOREACH(rs, &int_rs, next) {
 		if ((rs->rs_ifp == ifp) &&
 		    (rs->rs_if_dunit == ifp->if_dunit)) {
 			/* We already have initialized this guy */
-			printf("Interface already initialized\n");
 			mtx_unlock(&rs_mtx);
 			return;
 		}
@@ -1254,7 +1085,6 @@ tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *if
 			*error = EINVAL;
 		rte = NULL;
 	}
-	*error = 0;
 	return (rte);
 }
 
@@ -1362,112 +1192,6 @@ tcp_rel_pacing_rate(const struct tcp_hwrate_limit_tabl
 		mtx_unlock(&rs_mtx);
 	}
 	in_pcbdetach_txrtlmt(tp->t_inpcb);
-}
-
-#define ONE_POINT_TWO_MEG 150000 /* 1.2 megabits in bytes */
-#define ONE_HUNDRED_MBPS 12500000	/* 100Mbps in bytes per second */
-#define FIVE_HUNDRED_MBPS 62500000	/* 500Mbps in bytes per second */
-#define MAX_MSS_SENT 43	/* 43 mss = 43 x 1500 = 64,500 bytes */
-
-uint32_t
-tcp_get_pacing_mss(uint64_t bw, uint32_t segsiz, int can_use_1mss,
-		   const struct tcp_hwrate_limit_table *te)
-{
-	/*
-	 * We use the google formula to calculate the
-	 * TSO size. I.E.
-	 * bw < 24Meg
-	 *   tso = 2mss
-	 * else
-	 *   tso = min(bw/1000, 64k)
-	 *
-	 * Note for these calculations we ignore the
-	 * packet overhead (enet hdr, ip hdr and tcp hdr).
-	 */
-	uint64_t lentim, res, bytes;
-	uint32_t new_tso, min_tso_segs;
-
-	bytes = bw / 1000;
-	if (bytes > (64 * 1000))
-		bytes = 64 * 1000;
-	/* Round up */
-	new_tso = (bytes + segsiz - 1) / segsiz;
-	if (can_use_1mss && (bw < ONE_POINT_TWO_MEG))
-		min_tso_segs = 1;
-	else
-		min_tso_segs = 2;
-	if (new_tso < min_tso_segs)
-		new_tso = min_tso_segs;
-	if (new_tso > MAX_MSS_SENT)
-		new_tso = MAX_MSS_SENT;
-	new_tso *= segsiz;
-	/*
-	 * If we are not doing hardware pacing
-	 * then we are done.
-	 */
-	if (te == NULL)
-		return(new_tso);
-	/*
-	 * For hardware pacing we look at the
-	 * rate you are sending at and compare
-	 * that to the rate you have in hardware.
-	 *
-	 * If the hardware rate is slower than your
-	 * software rate then you are in error and
-	 * we will build a queue in our hardware whic
-	 * is probably not desired, in such a case
-	 * just return the non-hardware TSO size.
-	 *
-	 * If the rate in hardware is faster (which
-	 * it should be) then look at how long it
-	 * takes to send one ethernet segment size at
-	 * your b/w and compare that to the time it
-	 * takes to send at the rate you had selected.
-	 *
-	 * If your time is greater (which we hope it is)
-	 * we get the delta between the two, and then
-	 * divide that into your pacing time. This tells
-	 * us how many MSS you can send down at once (rounded up).
-	 *
-	 * Note we also double this value if the b/w is over
-	 * 100Mbps. If its over 500meg we just set you to the
-	 * max (43 segments).
-	 */
-	if (te->rate > FIVE_HUNDRED_MBPS)
-		return (segsiz * MAX_MSS_SENT);
-	if (te->rate == bw) {
-		/* We are pacing at exactly the hdwr rate */
-		return (segsiz * MAX_MSS_SENT);
-	}
-	lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
-	res = lentim / bw;
-	if (res > te->time_between) {
-		uint32_t delta, segs;
-
-		delta = res - te->time_between;
-		segs = (res + delta - 1)/delta;
-		if (te->rate > ONE_HUNDRED_MBPS)
-			segs *= 2;
-		if (segs < min_tso_segs)
-			segs = min_tso_segs;
-		if (segs > MAX_MSS_SENT)
-			segs = MAX_MSS_SENT;
-		segs *= segsiz;
-		if (segs < new_tso) {
-			/* unexpected ? */
-			return(new_tso);
-		} else {
-			return (segs);
-		}
-	} else {
-		/*
-		 * Your time is smaller which means
-		 * we will grow a queue on our
-		 * hardware. Send back the non-hardware
-		 * rate.
-		 */
-		return (new_tso);
-	}
 }
 
 static eventhandler_tag rl_ifnet_departs;

Modified: head/sys/netinet/tcp_ratelimit.h
==============================================================================
--- head/sys/netinet/tcp_ratelimit.h	Wed Feb 12 13:31:36 2020	(r357818)
+++ head/sys/netinet/tcp_ratelimit.h	Wed Feb 12 13:37:53 2020	(r357819)
@@ -88,9 +88,6 @@ CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set);
 #define RS_PACING_SUB_OK	0x0010	/* If a rate can't be found get the
 					 * next best rate (highest or lowest). */
 #ifdef _KERNEL
-#ifndef ETHERNET_SEGMENT_SIZE
-#define ETHERNET_SEGMENT_SIZE 1514
-#endif
 #ifdef RATELIMIT
 #define DETAILED_RATELIMIT_SYSCTL 1	/*
 					 * Undefine this if you don't want
@@ -138,17 +135,7 @@ tcp_rel_pacing_rate(const struct tcp_hwrate_limit_tabl
 {
 	return;
 }
+
 #endif
-/*
- * Given a b/w and a segsiz, and optional hardware
- * rate limit, return the ideal size to burst
- * out at once. Note the parameter can_use_1mss
- * dictates if the transport will tolerate a 1mss
- * limit, if not it will bottom out at 2mss (think
- * delayed ack).
- */
-uint32_t
-tcp_get_pacing_mss(uint64_t bw, uint32_t segsiz, int can_use_1mss,
-		   const struct tcp_hwrate_limit_table *te);
 #endif
 #endif


More information about the svn-src-head mailing list