svn commit: r358332 - in head/sys: net netinet
Randall Stewart
rrs at FreeBSD.org
Wed Feb 26 13:48:35 UTC 2020
Author: rrs
Date: Wed Feb 26 13:48:33 2020
New Revision: 358332
URL: https://svnweb.freebsd.org/changeset/base/358332
Log:
This commit expands tcp_ratelimit to handle cards, such as the
mlx c5 and c6, that require a "setup" routine to run before the
tcp_ratelimit code can declare and use a rate. I add the setup
routine to if_var.h and fix tcp_ratelimit to call it. I also
revisit the rates so that for an mlx card of type c5/c6 we use
about 100 rates concentrated in the range where the most gain
can be had (1-200Mbps). Note that I have tested these on a c5
and they work and perform well; in fact, on an unloaded system
they pace right at the correct rate (great job mlx!). There will
be a further commit from Hans adding the respective changes to
the mlx driver to support this work (which I was testing with).
Sponsored by: Netflix Inc.
Differential Revision: https://reviews.freebsd.org/D23647
Modified:
head/sys/net/if_var.h
head/sys/netinet/tcp_ratelimit.c
head/sys/netinet/tcp_ratelimit.h
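
For illustration, here is a rough sketch of what the driver side of the
new hook could look like. The mxx_ names and the firmware call below are
hypothetical; only the if_ratelimit_setup_t signature and the
RT_IS_SETUP_REQ flag are part of this commit:

	#include <sys/param.h>
	#include <net/if.h>
	#include <net/if_var.h>

	struct mxx_softc;	/* hypothetical driver softc */
	int mxx_fw_add_rate(struct mxx_softc *, uint64_t rate, uint32_t idx);

	static int
	mxx_ratelimit_setup(struct ifnet *ifp, uint64_t rate, uint32_t idx)
	{
		struct mxx_softc *sc = ifp->if_softc;

		/* Pre-program the rate in firmware before any tag uses it. */
		return (mxx_fw_add_rate(sc, rate, idx));
	}

	static void
	mxx_ratelimit_attach(struct ifnet *ifp)
	{
		/* Declare that rates need a setup call before use; the
		 * driver's if_ratelimit_query routine would also set
		 * RT_IS_SETUP_REQ in if_ratelimit_query_results.flags. */
		ifp->if_ratelimit_setup = mxx_ratelimit_setup;
	}
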
Modified: head/sys/net/if_var.h
==============================================================================
--- head/sys/net/if_var.h Wed Feb 26 13:23:52 2020 (r358331)
+++ head/sys/net/if_var.h Wed Feb 26 13:48:33 2020 (r358332)
@@ -252,6 +252,7 @@ union if_snd_tag_query_params {
*/
#define RT_IS_FIXED_TABLE 0x00000004 /* A fixed table is attached */
#define RT_IS_UNUSABLE 0x00000008 /* It is not usable for this */
+#define RT_IS_SETUP_REQ 0x00000010 /* The interface setup must be called before use */
struct if_ratelimit_query_results {
const uint64_t *rate_table; /* Pointer to table if present */
@@ -268,8 +269,8 @@ typedef int (if_snd_tag_query_t)(struct m_snd_tag *, u
typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
typedef void (if_ratelimit_query_t)(struct ifnet *,
struct if_ratelimit_query_results *);
+typedef int (if_ratelimit_setup_t)(struct ifnet *, uint64_t, uint32_t);
-
/*
* Structure defining a network interface.
*/
@@ -368,7 +369,7 @@ struct ifnet {
if_init_fn_t if_init; /* Init routine */
int (*if_resolvemulti) /* validate/resolve multicast */
(struct ifnet *, struct sockaddr **, struct sockaddr *);
- if_qflush_fn_t if_qflush; /* flush any queue */
+ if_qflush_fn_t if_qflush; /* flush any queue */
if_transmit_fn_t if_transmit; /* initiate output routine */
void (*if_reassign) /* reassign to vnet routine */
@@ -411,6 +412,7 @@ struct ifnet {
if_snd_tag_query_t *if_snd_tag_query;
if_snd_tag_free_t *if_snd_tag_free;
if_ratelimit_query_t *if_ratelimit_query;
+ if_ratelimit_setup_t *if_ratelimit_setup;
/* Ethernet PCP */
uint8_t if_pcp;
@@ -555,7 +557,7 @@ struct ifaddr {
u_int ifa_refcnt; /* references to this structure */
counter_u64_t ifa_ipackets;
- counter_u64_t ifa_opackets;
+ counter_u64_t ifa_opackets;
counter_u64_t ifa_ibytes;
counter_u64_t ifa_obytes;
struct epoch_context ifa_epoch_ctx;
@@ -769,7 +771,7 @@ void if_setstartfn(if_t ifp, void (*)(if_t));
void if_settransmitfn(if_t ifp, if_transmit_fn_t);
void if_setqflushfn(if_t ifp, if_qflush_fn_t);
void if_setgetcounterfn(if_t ifp, if_get_counter_t);
-
+
/* Revisit the below. These are inline functions originally */
int drbr_inuse_drv(if_t ifp, struct buf_ring *br);
struct mbuf* drbr_dequeue_drv(if_t ifp, struct buf_ring *br);
Modified: head/sys/netinet/tcp_ratelimit.c
==============================================================================
--- head/sys/netinet/tcp_ratelimit.c Wed Feb 26 13:23:52 2020 (r358331)
+++ head/sys/netinet/tcp_ratelimit.c Wed Feb 26 13:48:33 2020 (r358332)
@@ -66,45 +66,199 @@ __FBSDID("$FreeBSD$");
* For the purposes of each send, what is the size
* of an ethernet frame.
*/
-#ifndef ETHERNET_SEGMENT_SIZE
-#define ETHERNET_SEGMENT_SIZE 1500
-#endif
MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
#ifdef RATELIMIT
+/*
+ * The following preferred table will seem weird to
+ * the casual viewer. Why do we not have any rates below
+ * 1Mbps? Why do we have a rate at 1.44Mbps called common?
+ * Why do the rates cluster in the 1-100Mbps range more
+ * than others? Why does the table jump around at the beginning
+ * and then rise more consistently?
+ *
+ * Let me try to answer those questions. A lot of
+ * this is dependent on the hardware. We have three basic
+ * classes of hardware that support rate limiting:
+ *
+ * Chelsio - Supporting 16 configurable rates.
+ * Mlx - c4 supporting 13 fixed rates.
+ * Mlx - c5 & c6 supporting 127 configurable rates.
+ *
+ * The c4 is why we have a common rate that is available
+ * in all rate tables. This is a selected rate from the
+ * c4 table, and we assure it is available in all ratelimit
+ * tables. This way the tcp_ratelimit code has an assured
+ * rate it should always be able to get. This answers a
+ * couple of the questions above.
+ *
+ * So what about the rest, well the table is built to
+ * try to get the most out of a joint hardware/software
+ * pacing system. The software pacer will always pick
+ * a rate higher than the b/w that it is estimating on the
+ * path. This is done for two reasons:
+ * a) So we can discover more b/w
+ * and
+ * b) So we can send a block of MSS's down and then
+ * have the software timer go off after the previous
+ * send is completely out of the hardware.
+ *
+ * But when we do (b) we don't want the delay between the last
+ * packet sent by the hardware and the next send to be
+ * excessively long (to reach our desired rate).
+ *
+ * So let me give an example for clarity.
+ *
+ * Let's assume that the tcp stack sees that 29,110,000 bps is
+ * what the bw of the path is. The stack would select the
+ * rate 31Mbps. 31Mbps means that each send that is done
+ * by the hardware will cause a 390 micro-second gap between
+ * the packets sent at that rate. For 29,110,000 bps we
+ * would need 416 micro-seconds gap between each send.
+ *
+ * Note that we are calculating a complete time for pacing
+ * which includes the ethernet, IP and TCP overhead. So
+ * a full 1514 bytes is used for the above calculations.
+ * My testing has shown that both cards are also using this
+ * as their basis i.e. full payload size of the ethernet frame.
+ * The TCP stack caller needs to be aware of this and make the
+ * appropriate overhead calculations be included in its choices.
+ *
+ * Now, continuing our example, we pick a burst size based on
+ * the delta between the two gaps (416 - 390 = 26 micro-seconds)
+ * divided into the gap we really wish to send at, rounded up.
+ * That results in a send of 17 MSS at once. The hardware then
+ * will run out of data on a single 17 MSS send in 6,630
+ * micro-seconds (17 x 390).
+ *
+ * On the other hand the software pacer will not send more data
+ * until 7,072 micro-seconds (17 x 416) have passed. This means
+ * that we will refill the hardware 52 micro-seconds after it
+ * would have sent the next packet had it not run out of data. This is a win since we are
+ * only sending every 7ms or so and yet all the packets are spaced on
+ * the wire with 94% of what they should be and only
+ * the last packet is delayed extra to make up for the
+ * difference.
+ *
+ * Note that the above formula has two important caveats.
+ * If we are (b/w wise) above 100Mbps we double the result
+ * of the MSS calculation. The second caveat is that at 500Mbps
+ * or more we just send the maximum burst at once, i.e. 43 MSS.
+ * At the higher b/w's even the cards have limits (timer
+ * granularity) on the gaps they can insert between packets,
+ * and they start to send more than one packet at a time on
+ * the wire.
+ *
+ */
#define COMMON_RATE 180500
-uint64_t desired_rates[] = {
- 62500, /* 500Kbps */
- 180500, /* 1.44Mpbs */
- 375000, /* 3Mbps */
- 500000, /* 4Mbps */
- 625000, /* 5Mbps */
- 750000, /* 6Mbps */
- 1000000, /* 8Mbps */
- 1250000, /* 10Mbps */
- 2500000, /* 20Mbps */
- 3750000, /* 30Mbps */
- 5000000, /* 40Meg */
- 6250000, /* 50Mbps */
- 12500000, /* 100Mbps */
- 25000000, /* 200Mbps */
- 50000000, /* 400Mbps */
- 100000000, /* 800Mbps */
- 12500, /* 100kbps */
- 25000, /* 200kbps */
- 875000, /* 7Mbps */
- 1125000, /* 9Mbps */
- 1875000, /* 15Mbps */
- 3125000, /* 25Mbps */
- 8125000, /* 65Mbps */
- 10000000, /* 80Mbps */
- 18750000, /* 150Mbps */
- 20000000, /* 250Mbps */
- 37500000, /* 350Mbps */
- 62500000, /* 500Mbps */
- 78125000, /* 625Mbps */
- 125000000, /* 1Gbps */
+const uint64_t desired_rates[] = {
+ 122500, /* 1Mbps - rate 1 */
+ 180500, /* 1.44Mbps - rate 2 common rate */
+ 375000, /* 3Mbps - rate 3 */
+ 625000, /* 5Mbps - rate 4 */
+ 875000, /* 7Mbps - rate 5 */
+ 1125000, /* 9Mbps - rate 6 */
+ 1375000, /* 11Mbps - rate 7 */
+ 1625000, /* 13Mbps - rate 8 */
+ 2625000, /* 21Mbps - rate 9 */
+ 3875000, /* 31Mbps - rate 10 */
+ 5125000, /* 41Mbps - rate 11 */
+ 12500000, /* 100Mbps - rate 12 */
+ 25000000, /* 200Mbps - rate 13 */
+ 50000000, /* 400Mbps - rate 14 */
+ 63750000, /* 510Mbps - rate 15 */
+ 100000000, /* 800Mbps - rate 16 */
+ 1875000, /* 15Mbps - rate 17 */
+ 2125000, /* 17Mbps - rate 18 */
+ 2375000, /* 19Mbps - rate 19 */
+ 2875000, /* 23Mbps - rate 20 */
+ 3125000, /* 25Mbps - rate 21 */
+ 3375000, /* 27Mbps - rate 22 */
+ 3625000, /* 29Mbps - rate 23 */
+ 4125000, /* 33Mbps - rate 24 */
+ 4375000, /* 35Mbps - rate 25 */
+ 4625000, /* 37Mbps - rate 26 */
+ 4875000, /* 39Mbps - rate 27 */
+ 5375000, /* 43Mbps - rate 28 */
+ 5625000, /* 45Mbps - rate 29 */
+ 5875000, /* 47Mbps - rate 30 */
+ 6125000, /* 49Mbps - rate 31 */
+ 6625000, /* 53Mbps - rate 32 */
+ 6875000, /* 55Mbps - rate 33 */
+ 7125000, /* 57Mbps - rate 34 */
+ 7375000, /* 59Mbps - rate 35 */
+ 7625000, /* 61Mbps - rate 36 */
+ 7875000, /* 63Mbps - rate 37 */
+ 8125000, /* 65Mbps - rate 38 */
+ 8375000, /* 67Mbps - rate 39 */
+ 8625000, /* 69Mbps - rate 40 */
+ 8875000, /* 71Mbps - rate 41 */
+ 9125000, /* 73Mbps - rate 42 */
+ 9375000, /* 75Mbps - rate 43 */
+ 9625000, /* 77Mbps - rate 44 */
+ 9875000, /* 79Mbps - rate 45 */
+ 10125000, /* 81Mbps - rate 46 */
+ 10375000, /* 83Mbps - rate 47 */
+ 10625000, /* 85Mbps - rate 48 */
+ 10875000, /* 87Mbps - rate 49 */
+ 11125000, /* 89Mbps - rate 50 */
+ 11375000, /* 91Mbps - rate 51 */
+ 11625000, /* 93Mbps - rate 52 */
+ 11875000, /* 95Mbps - rate 53 */
+ 13125000, /* 105Mbps - rate 54 */
+ 13750000, /* 110Mbps - rate 55 */
+ 14375000, /* 115Mbps - rate 56 */
+ 15000000, /* 120Mbps - rate 57 */
+ 15625000, /* 125Mbps - rate 58 */
+ 16250000, /* 130Mbps - rate 59 */
+ 16875000, /* 135Mbps - rate 60 */
+ 17500000, /* 140Mbps - rate 61 */
+ 18125000, /* 145Mbps - rate 62 */
+ 18750000, /* 150Mbps - rate 63 */
+ 20000000, /* 160Mbps - rate 64 */
+ 21250000, /* 170Mbps - rate 65 */
+ 22500000, /* 180Mbps - rate 66 */
+ 23750000, /* 190Mbps - rate 67 */
+ 26250000, /* 210Mbps - rate 68 */
+ 27500000, /* 220Mbps - rate 69 */
+ 28750000, /* 230Mbps - rate 70 */
+ 30000000, /* 240Mbps - rate 71 */
+ 31250000, /* 250Mbps - rate 72 */
+ 34375000, /* 275Mbps - rate 73 */
+ 37500000, /* 300Mbps - rate 74 */
+ 40625000, /* 325Mbps - rate 75 */
+ 43750000, /* 350Mbps - rate 76 */
+ 46875000, /* 375Mbps - rate 77 */
+ 53125000, /* 425Mbps - rate 78 */
+ 56250000, /* 450Mbps - rate 79 */
+ 59375000, /* 475Mbps - rate 80 */
+ 62500000, /* 500Mbps - rate 81 */
+ 68750000, /* 550Mbps - rate 82 */
+ 75000000, /* 600Mbps - rate 83 */
+ 81250000, /* 650Mbps - rate 84 */
+ 87500000, /* 700Mbps - rate 85 */
+ 93750000, /* 750Mbps - rate 86 */
+ 106250000, /* 850Mbps - rate 87 */
+ 112500000, /* 900Mbps - rate 88 */
+ 125000000, /* 1Gbps - rate 89 */
+ 156250000, /* 1.25Gbps - rate 90 */
+ 187500000, /* 1.5Gbps - rate 91 */
+ 218750000, /* 1.75Gbps - rate 92 */
+ 250000000, /* 2Gbps - rate 93 */
+ 281250000, /* 2.25Gbps - rate 94 */
+ 312500000, /* 2.5Gbps - rate 95 */
+ 343750000, /* 2.75Gbps - rate 96 */
+ 375000000, /* 3Gbps - rate 97 */
+ 500000000, /* 4Gbps - rate 98 */
+ 625000000, /* 5Gbps - rate 99 */
+ 750000000, /* 6Gbps - rate 100 */
+ 875000000, /* 7Gbps - rate 101 */
+ 1000000000, /* 8Gbps - rate 102 */
+ 1125000000, /* 9Gbps - rate 103 */
+ 1250000000, /* 10Gbps - rate 104 */
+ 1875000000, /* 15Gbps - rate 105 */
+ 2500000000 /* 20Gbps - rate 106 */
};
+
#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
#define RS_ORDERED_COUNT 16 /*
* Number that are in order
@@ -381,14 +535,18 @@ rt_setup_new_rs(struct ifnet *ifp, int *error)
* We can do nothing if we cannot
* get a query back from the driver.
*/
+ printf("Warning:No query functions for %s:%d-- failed\n",
+ ifp->if_dname, ifp->if_dunit);
return (NULL);
}
rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
if (rs == NULL) {
if (error)
*error = ENOMEM;
+ printf("Warning:No memory for malloc of tcp_rate_set\n");
return (NULL);
}
+ memset(&rl, 0, sizeof(rl));
rl.flags = RT_NOSUPPORT;
ifp->if_ratelimit_query(ifp, &rl);
if (rl.flags & RT_IS_UNUSABLE) {
@@ -433,7 +591,7 @@ rt_setup_new_rs(struct ifnet *ifp, int *error)
mtx_unlock(&rs_mtx);
return (rs);
} else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
- /* Mellanox most likely */
+ /* Mellanox C4 likely */
rs->rs_ifp = ifp;
rs->rs_if_dunit = ifp->if_dunit;
rs->rs_rate_cnt = rl.number_of_rates;
@@ -444,7 +602,7 @@ rt_setup_new_rs(struct ifnet *ifp, int *error)
rs->rs_disable = 0;
rate_table_act = rl.rate_table;
} else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
- /* Chelsio */
+ /* Chelsio, C5 and C6 of Mellanox? */
rs->rs_ifp = ifp;
rs->rs_if_dunit = ifp->if_dunit;
rs->rs_rate_cnt = rl.number_of_rates;
@@ -467,9 +625,6 @@ rt_setup_new_rs(struct ifnet *ifp, int *error)
if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
rs->rs_rate_cnt = ALL_HARDWARE_RATES;
} else {
- printf("Interface:%s unit:%d not one known to have rate-limits\n",
- ifp->if_dname,
- ifp->if_dunit);
free(rs, M_TCPPACE);
return (NULL);
}
@@ -536,6 +691,14 @@ bail:
rs->rs_lowest_valid = i;
} else {
int err;
+
+ if ((rl.flags & RT_IS_SETUP_REQ) &&
+ (ifp->if_ratelimit_setup)) {
+ err = ifp->if_ratelimit_setup(ifp,
+ rs->rs_rlt[i].rate, i);
+ if (err)
+ goto handle_err;
+ }
#ifdef RSS
hash_type = M_HASHTYPE_RSS_TCP_IPV4;
#else
@@ -547,6 +710,7 @@ bail:
rs->rs_rlt[i].rate,
&rs->rs_rlt[i].tag);
if (err) {
+handle_err:
if (i == (rs->rs_rate_cnt - 1)) {
/*
* Huh - first rate and we can't get
@@ -1087,6 +1251,7 @@ tcp_set_pacing_rate(struct tcpcb *tp, struct ifnet *if
*error = EINVAL;
rte = NULL;
}
+ if (rte)
+ *error = 0;
return (rte);
}
@@ -1194,6 +1359,120 @@ tcp_rel_pacing_rate(const struct tcp_hwrate_limit_tabl
mtx_unlock(&rs_mtx);
}
in_pcbdetach_txrtlmt(tp->t_inpcb);
+}
+
+#define ONE_POINT_TWO_MEG 150000 /* 1.2Mbps in bytes per second */
+#define ONE_HUNDRED_MBPS 12500000 /* 100Mbps in bytes per second */
+#define FIVE_HUNDRED_MBPS 62500000 /* 500Mbps in bytes per second */
+#define MAX_MSS_SENT 43 /* 43 mss = 43 x 1500 = 64,500 bytes */
+
+
+uint32_t
+tcp_get_pacing_burst_size(uint64_t bw, uint32_t segsiz, int can_use_1mss,
+ const struct tcp_hwrate_limit_table *te, int *err)
+{
+ /*
+ * We use the google formula to calculate the
+ * TSO size. I.E.
+ * bw < 24Meg
+ * tso = 2mss
+ * else
+ * tso = min(bw/1000, 64k)
+ *
+ * Note for these calculations we ignore the
+ * packet overhead (enet hdr, ip hdr and tcp hdr).
+ */
+ uint64_t lentim, res, bytes;
+ uint32_t new_tso, min_tso_segs;
+
+ bytes = bw / 1000;
+ if (bytes > (64 * 1000))
+ bytes = 64 * 1000;
+ /* Round up */
+ new_tso = (bytes + segsiz - 1) / segsiz;
+ if (can_use_1mss && (bw < ONE_POINT_TWO_MEG))
+ min_tso_segs = 1;
+ else
+ min_tso_segs = 2;
+ if (new_tso < min_tso_segs)
+ new_tso = min_tso_segs;
+ if (new_tso > MAX_MSS_SENT)
+ new_tso = MAX_MSS_SENT;
+ new_tso *= segsiz;
+ /*
+ * If we are not doing hardware pacing
+ * then we are done.
+ */
+ if (te == NULL) {
+ if (err)
+ *err = 0;
+ return (new_tso);
+ }
+ /*
+ * For hardware pacing we look at the
+ * rate you are sending at and compare
+ * that to the rate you have in hardware.
+ *
+ * If the hardware rate is slower than your
+ * software rate then you are in error and
+ * we will build a queue in our hardware which
+ * is probably not desired; in such a case
+ * just return the non-hardware TSO size.
+ *
+ * If the rate in hardware is faster (which
+ * it should be) then look at how long it
+ * takes to send one ethernet segment size at
+ * your b/w and compare that to the time it
+ * takes to send at the rate you had selected.
+ *
+ * If your time is greater (which we hope it is)
+ * we get the delta between the two, and then
+ * divide that into your pacing time. This tells
+ * us how many MSS you can send down at once (rounded up).
+ *
+ * Note we also double this value if the b/w is over
+ * 100Mbps. If it's over 500Mbps we just set you to the
+ * max (43 segments).
+ */
+ if (te->rate > FIVE_HUNDRED_MBPS)
+ return (segsiz * MAX_MSS_SENT);
+ if (te->rate == bw) {
+ /* We are pacing at exactly the hdwr rate */
+ return (segsiz * MAX_MSS_SENT);
+ }
+ lentim = ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
+ res = lentim / bw;
+ if (res > te->time_between) {
+ uint32_t delta, segs;
+
+ delta = res - te->time_between;
+ segs = (res + delta - 1)/delta;
+ if (te->rate > ONE_HUNDRED_MBPS)
+ segs *= 2;
+ if (segs < min_tso_segs)
+ segs = min_tso_segs;
+ if (segs > MAX_MSS_SENT)
+ segs = MAX_MSS_SENT;
+ segs *= segsiz;
+ if (err)
+ *err = 0;
+ if (segs < new_tso) {
+ /* unexpected ? */
+ return (new_tso);
+ } else {
+ return (segs);
+ }
+ } else {
+ /*
+ * Your time is smaller which means
+ * we will grow a queue on our
+ * hardware. Send back the non-hardware
+ * rate.
+ */
+ if (err)
+ *err = -1;
+ return (new_tso);
+ }
}
static eventhandler_tag rl_ifnet_departs;
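
As a cross-check of the arithmetic in the big comment above, the
burst-size math can be reproduced in a few lines of userland C. The
constants mirror the kernel code, and the example bandwidths are the
ones from the comment (29,110,000 bps path estimate against the 31Mbps
hardware rate); with pure integer math the burst lands at 16-17 MSS
depending on rounding:

	#include <stdio.h>
	#include <stdint.h>

	#define ETHERNET_SEGMENT_SIZE 1514	/* full frame, as in the header */
	#define USECS_IN_SECOND 1000000

	int
	main(void)
	{
		uint64_t bw = 29110000 / 8;	/* path estimate in bytes/sec */
		uint64_t hw_rate = 3875000;	/* 31Mbps hw rate in bytes/sec */
		uint64_t lentim, res, between, delta, segs;

		lentim = (uint64_t)ETHERNET_SEGMENT_SIZE * USECS_IN_SECOND;
		res = lentim / bw;		/* ~416 usec gap at the path rate */
		between = lentim / hw_rate;	/* ~390 usec gap at the hw rate */
		delta = res - between;		/* ~26 usec difference per frame */
		segs = (res + delta - 1) / delta; /* frames per burst, rounded up */
		printf("sw gap %ju usec, hw gap %ju usec, burst %ju MSS\n",
		    (uintmax_t)res, (uintmax_t)between, (uintmax_t)segs);
		return (0);
	}
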
Modified: head/sys/netinet/tcp_ratelimit.h
==============================================================================
--- head/sys/netinet/tcp_ratelimit.h Wed Feb 26 13:23:52 2020 (r358331)
+++ head/sys/netinet/tcp_ratelimit.h Wed Feb 26 13:48:33 2020 (r358332)
@@ -88,6 +88,9 @@ CK_LIST_HEAD(head_tcp_rate_set, tcp_rate_set);
#define RS_PACING_SUB_OK 0x0010 /* If a rate can't be found get the
* next best rate (highest or lowest). */
#ifdef _KERNEL
+#ifndef ETHERNET_SEGMENT_SIZE
+#define ETHERNET_SEGMENT_SIZE 1514
+#endif
#ifdef RATELIMIT
#define DETAILED_RATELIMIT_SYSCTL 1 /*
* Undefine this if you don't want
@@ -135,7 +138,18 @@ tcp_rel_pacing_rate(const struct tcp_hwrate_limit_tabl
{
return;
}
-
#endif
+/*
+ * Given a b/w and a segsiz, and optional hardware
+ * rate limit, return the ideal size to burst
+ * out at once. Note the parameter can_use_1mss
+ * dictates whether the transport will tolerate a 1mss
+ * limit; if not, it will bottom out at 2mss (think
+ * delayed ack).
+ */
+uint32_t
+tcp_get_pacing_burst_size(uint64_t bw, uint32_t segsiz, int can_use_1mss,
+ const struct tcp_hwrate_limit_table *te, int *err);
+
#endif
#endif
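
For completeness, a sketch of how a stack might call the new helper.
The function example_pick_burst and its arguments are illustrative;
the tcp_get_pacing_burst_size prototype and the err == -1 convention
(hardware rate slower than the software rate, so the software TSO size
is returned) are from this commit:

	#include <netinet/tcp_ratelimit.h>

	static uint32_t
	example_pick_burst(uint64_t bw_est, uint32_t mss,
	    const struct tcp_hwrate_limit_table *rte)
	{
		uint32_t burst;
		int err;

		/* rte may be NULL when no hardware pacing rate is held. */
		burst = tcp_get_pacing_burst_size(bw_est, mss, 0, rte, &err);
		if (err == -1) {
			/* Hardware rate is slower than bw_est; the helper
			 * fell back to the software TSO sizing. */
		}
		return (burst);
	}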