svn commit: r350501 - in head/sys: conf dev/cxgbe dev/mlx5/mlx5_en net netinet
Randall Stewart
rrs at FreeBSD.org
Thu Aug 1 14:17:34 UTC 2019
Author: rrs
Date: Thu Aug 1 14:17:31 2019
New Revision: 350501
URL: https://svnweb.freebsd.org/changeset/base/350501
Log:
This adds the third step in getting BBR into the tree. BBR and
an updated rack depend on having access to the new
ratelimit api in this commit.
Sponsored by: Netflix Inc.
Differential Revision: https://reviews.freebsd.org/D20953
Added:
head/sys/netinet/tcp_ratelimit.c (contents, props changed)
head/sys/netinet/tcp_ratelimit.h (contents, props changed)
Modified:
head/sys/conf/files
head/sys/dev/cxgbe/adapter.h
head/sys/dev/cxgbe/t4_main.c
head/sys/dev/cxgbe/t4_sched.c
head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
head/sys/net/if_dead.c
head/sys/net/if_lagg.c
head/sys/net/if_var.h
head/sys/netinet/in_pcb.c
head/sys/netinet/in_pcb.h
Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files Thu Aug 1 14:13:04 2019 (r350500)
+++ head/sys/conf/files Thu Aug 1 14:17:31 2019 (r350501)
@@ -4276,6 +4276,7 @@ netinet/tcp_lro.c optional inet | inet6
netinet/tcp_output.c optional inet | inet6
netinet/tcp_offload.c optional tcp_offload inet | tcp_offload inet6
netinet/tcp_hpts.c optional tcphpts inet | tcphpts inet6
+netinet/tcp_ratelimit.c optional ratelimit inet | ratelimit inet6
netinet/tcp_pcap.c optional inet tcppcap | inet6 tcppcap \
compile-with "${NORMAL_C} ${NO_WNONNULL}"
netinet/tcp_reass.c optional inet | inet6
Modified: head/sys/dev/cxgbe/adapter.h
==============================================================================
--- head/sys/dev/cxgbe/adapter.h Thu Aug 1 14:13:04 2019 (r350500)
+++ head/sys/dev/cxgbe/adapter.h Thu Aug 1 14:17:31 2019 (r350501)
@@ -1247,6 +1247,7 @@ int cxgbe_snd_tag_modify(struct m_snd_tag *, union if_
int cxgbe_snd_tag_query(struct m_snd_tag *, union if_snd_tag_query_params *);
void cxgbe_snd_tag_free(struct m_snd_tag *);
void cxgbe_snd_tag_free_locked(struct cxgbe_snd_tag *);
+void cxgbe_ratelimit_query(struct ifnet *, struct if_ratelimit_query_results *);
#endif
/* t4_filter.c */
Modified: head/sys/dev/cxgbe/t4_main.c
==============================================================================
--- head/sys/dev/cxgbe/t4_main.c Thu Aug 1 14:13:04 2019 (r350500)
+++ head/sys/dev/cxgbe/t4_main.c Thu Aug 1 14:17:31 2019 (r350501)
@@ -1658,6 +1658,7 @@ cxgbe_vi_attach(device_t dev, struct vi_info *vi)
ifp->if_snd_tag_modify = cxgbe_snd_tag_modify;
ifp->if_snd_tag_query = cxgbe_snd_tag_query;
ifp->if_snd_tag_free = cxgbe_snd_tag_free;
+ ifp->if_ratelimit_query = cxgbe_ratelimit_query;
#endif
ifp->if_capabilities = T4_CAP;
Modified: head/sys/dev/cxgbe/t4_sched.c
==============================================================================
--- head/sys/dev/cxgbe/t4_sched.c Thu Aug 1 14:13:04 2019 (r350500)
+++ head/sys/dev/cxgbe/t4_sched.c Thu Aug 1 14:17:31 2019 (r350501)
@@ -903,4 +903,35 @@ cxgbe_snd_tag_free(struct m_snd_tag *mst)
}
mtx_unlock(&cst->lock);
}
+
+#define CXGBE_MAX_FLOWS 4000 /* Testing shows so far that's all this adapter can do */
+#define CXGBE_UNIQUE_RATE_COUNT 16 /* Number of unique rates that can be setup */
+
+void
+cxgbe_ratelimit_query(struct ifnet *ifp __unused,
+ struct if_ratelimit_query_results *q)
+{
+ /*
+ * This is a skeleton and needs future work
+ * by the driver supporters. It should be
+ * enhanced to look at the specific type of
+	 * interface and select appropriate values
+ * for these settings. This example goes
+	 * with an earlier card (t5); it has a maximum
+	 * of 16 rates, which the first requesters get to
+	 * select (thus the flags value RT_IS_SELECTABLE).
+ * If it was a fixed table then we would setup a
+ * const array (example mlx5). Note the card tested
+ * can only support reasonably 4000 flows before
+ * the adapter has issues with sending so here
+ * we limit the number of flows using hardware
+ * pacing to that number, other cards may
+ * be able to raise or eliminate this limit.
+ */
+ q->rate_table = NULL;
+ q->flags = RT_IS_SELECTABLE;
+ q->max_flows = CXGBE_MAX_FLOWS;
+ q->number_of_rates = CXGBE_UNIQUE_RATE_COUNT;
+ q->min_segment_burst = 4; /* Driver emits 4 in a burst */
+}
#endif
Modified: head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
==============================================================================
--- head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c Thu Aug 1 14:13:04 2019 (r350500)
+++ head/sys/dev/mlx5/mlx5_en/mlx5_en_main.c Thu Aug 1 14:17:31 2019 (r350501)
@@ -4070,7 +4070,49 @@ mlx5e_snd_tag_query(struct m_snd_tag *pmt, union if_sn
}
}
+#define NUM_HDWR_RATES_MLX 13
+static const uint64_t adapter_rates_mlx[NUM_HDWR_RATES_MLX] = {
+ 135375, /* 1,083,000 */
+ 180500, /* 1,444,000 */
+ 270750, /* 2,166,000 */
+ 361000, /* 2,888,000 */
+ 541500, /* 4,332,000 */
+ 721875, /* 5,775,000 */
+ 1082875, /* 8,663,000 */
+ 1443875, /* 11,551,000 */
+ 2165750, /* 17,326,000 */
+ 2887750, /* 23,102,000 */
+ 4331625, /* 34,653,000 */
+ 5775500, /* 46,204,000 */
+ 8663125 /* 69,305,000 */
+};
+
static void
+mlx5e_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+ /*
+ * This function needs updating by the driver maintainer!
+	 * For the MLX card there are currently (ConnectX-4?) 13
+ * pre-set rates and others i.e. ConnectX-5, 6, 7??
+ *
+ * This will change based on later adapters
+ * and this code should be updated to look at ifp
+ * and figure out the specific adapter type
+ * settings i.e. how many rates as well
+ * as if they are fixed (as is shown here) or
+ * if they are dynamic (example chelsio t4). Also if there
+ * is a maximum number of flows that the adapter
+ * can handle that too needs to be updated in
+ * the max_flows field.
+ */
+ q->rate_table = adapter_rates_mlx;
+ q->flags = RT_IS_FIXED_TABLE;
+ q->max_flows = 0; /* mlx has no limit */
+ q->number_of_rates = NUM_HDWR_RATES_MLX;
+ q->min_segment_burst = 1;
+}
+
+static void
mlx5e_snd_tag_free(struct m_snd_tag *pmt)
{
struct mlx5e_snd_tag *tag =
@@ -4155,7 +4197,9 @@ mlx5e_create_ifp(struct mlx5_core_dev *mdev)
ifp->if_snd_tag_free = mlx5e_snd_tag_free;
ifp->if_snd_tag_modify = mlx5e_snd_tag_modify;
ifp->if_snd_tag_query = mlx5e_snd_tag_query;
-
+#ifdef RATELIMIT
+ ifp->if_ratelimit_query = mlx5e_ratelimit_query;
+#endif
/* set TSO limits so that we don't have to drop TX packets */
ifp->if_hw_tsomax = MLX5E_MAX_TX_PAYLOAD_SIZE - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
ifp->if_hw_tsomaxsegcount = MLX5E_MAX_TX_MBUF_FRAGS - 1 /* hdr */;
Modified: head/sys/net/if_dead.c
==============================================================================
--- head/sys/net/if_dead.c Thu Aug 1 14:13:04 2019 (r350500)
+++ head/sys/net/if_dead.c Thu Aug 1 14:17:31 2019 (r350501)
@@ -126,6 +126,23 @@ ifdead_snd_tag_free(struct m_snd_tag *pmt)
{
}
+static void
+ifdead_ratelimit_query(struct ifnet *ifp __unused,
+ struct if_ratelimit_query_results *q)
+{
+ /*
+ * This guy does not support
+ * this interface. Not sure
+ * why we would specify a
+ * flag on the interface
+ * that says we do.
+ */
+ q->rate_table = NULL;
+ q->flags = RT_NOSUPPORT;
+ q->max_flows = 0;
+ q->number_of_rates = 0;
+}
+
void
if_dead(struct ifnet *ifp)
{
@@ -142,4 +159,5 @@ if_dead(struct ifnet *ifp)
ifp->if_snd_tag_modify = ifdead_snd_tag_modify;
ifp->if_snd_tag_query = ifdead_snd_tag_query;
ifp->if_snd_tag_free = ifdead_snd_tag_free;
+ ifp->if_ratelimit_query = ifdead_ratelimit_query;
}
Modified: head/sys/net/if_lagg.c
==============================================================================
--- head/sys/net/if_lagg.c Thu Aug 1 14:13:04 2019 (r350500)
+++ head/sys/net/if_lagg.c Thu Aug 1 14:17:31 2019 (r350501)
@@ -144,6 +144,8 @@ static int lagg_snd_tag_modify(struct m_snd_tag *,
static int lagg_snd_tag_query(struct m_snd_tag *,
union if_snd_tag_query_params *);
static void lagg_snd_tag_free(struct m_snd_tag *);
+static void lagg_ratelimit_query(struct ifnet *,
+ struct if_ratelimit_query_results *);
#endif
static int lagg_setmulti(struct lagg_port *);
static int lagg_clrmulti(struct lagg_port *);
@@ -537,6 +539,7 @@ lagg_clone_create(struct if_clone *ifc, int unit, cadd
ifp->if_snd_tag_modify = lagg_snd_tag_modify;
ifp->if_snd_tag_query = lagg_snd_tag_query;
ifp->if_snd_tag_free = lagg_snd_tag_free;
+ ifp->if_ratelimit_query = lagg_ratelimit_query;
#endif
ifp->if_capenable = ifp->if_capabilities = IFCAP_HWSTATS;
@@ -1670,6 +1673,20 @@ lagg_snd_tag_free(struct m_snd_tag *mst)
free(lst, M_LAGG);
}
+static void
+lagg_ratelimit_query(struct ifnet *ifp __unused, struct if_ratelimit_query_results *q)
+{
+ /*
+ * For lagg, we have an indirect
+ * interface. The caller needs to
+ * get a ratelimit tag on the actual
+ * interface the flow will go on.
+ */
+ q->rate_table = NULL;
+ q->flags = RT_IS_INDIRECT;
+ q->max_flows = 0;
+ q->number_of_rates = 0;
+}
#endif
static int
Modified: head/sys/net/if_var.h
==============================================================================
--- head/sys/net/if_var.h Thu Aug 1 14:13:04 2019 (r350500)
+++ head/sys/net/if_var.h Thu Aug 1 14:17:31 2019 (r350501)
@@ -203,6 +203,8 @@ struct if_snd_tag_alloc_header {
struct if_snd_tag_alloc_rate_limit {
struct if_snd_tag_alloc_header hdr;
uint64_t max_rate; /* in bytes/s */
+ uint32_t flags; /* M_NOWAIT or M_WAITOK */
+ uint32_t reserved; /* alignment */
};
struct if_snd_tag_rate_limit_params {
@@ -210,7 +212,7 @@ struct if_snd_tag_rate_limit_params {
uint32_t queue_level; /* 0 (empty) .. 65535 (full) */
#define IF_SND_QUEUE_LEVEL_MIN 0
#define IF_SND_QUEUE_LEVEL_MAX 65535
- uint32_t reserved; /* padding */
+ uint32_t flags; /* M_NOWAIT or M_WAITOK */
};
union if_snd_tag_alloc_params {
@@ -229,12 +231,38 @@ union if_snd_tag_query_params {
struct if_snd_tag_rate_limit_params unlimited;
};
+/* Query return flags */
+#define RT_NOSUPPORT 0x00000000 /* Not supported */
+#define RT_IS_INDIRECT 0x00000001 /*
+ * Interface like a lagg, select
+ * the actual interface for
+ * capabilities.
+ */
+#define RT_IS_SELECTABLE 0x00000002 /*
+ * No rate table, you select
+ * rates and the first
+ * number_of_rates are created.
+ */
+#define RT_IS_FIXED_TABLE 0x00000004 /* A fixed table is attached */
+#define RT_IS_UNUSABLE 0x00000008 /* It is not usable for this */
+
+struct if_ratelimit_query_results {
+ const uint64_t *rate_table; /* Pointer to table if present */
+ uint32_t flags; /* Flags indicating results */
+ uint32_t max_flows; /* Max flows using, 0=unlimited */
+ uint32_t number_of_rates; /* How many unique rates can be created */
+ uint32_t min_segment_burst; /* The amount the adapter bursts at each send */
+};
+
typedef int (if_snd_tag_alloc_t)(struct ifnet *, union if_snd_tag_alloc_params *,
struct m_snd_tag **);
typedef int (if_snd_tag_modify_t)(struct m_snd_tag *, union if_snd_tag_modify_params *);
typedef int (if_snd_tag_query_t)(struct m_snd_tag *, union if_snd_tag_query_params *);
typedef void (if_snd_tag_free_t)(struct m_snd_tag *);
+typedef void (if_ratelimit_query_t)(struct ifnet *,
+ struct if_ratelimit_query_results *);
+
/*
* Structure defining a network interface.
*/
@@ -374,6 +402,7 @@ struct ifnet {
if_snd_tag_modify_t *if_snd_tag_modify;
if_snd_tag_query_t *if_snd_tag_query;
if_snd_tag_free_t *if_snd_tag_free;
+ if_ratelimit_query_t *if_ratelimit_query;
/* Ethernet PCP */
uint8_t if_pcp;
Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c Thu Aug 1 14:13:04 2019 (r350500)
+++ head/sys/netinet/in_pcb.c Thu Aug 1 14:17:31 2019 (r350501)
@@ -210,6 +210,22 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtim
&VNET_NAME(ipport_randomtime), 0,
"Minimum time to keep sequental port "
"allocation before switching to a random one");
+
+#ifdef RATELIMIT
+counter_u64_t rate_limit_active;
+counter_u64_t rate_limit_alloc_fail;
+counter_u64_t rate_limit_set_ok;
+
+static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD, 0,
+ "IP Rate Limiting");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
+ &rate_limit_active, "Active rate limited connections");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
+ &rate_limit_alloc_fail, "Rate limited connection failures");
+SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
+ &rate_limit_set_ok, "Rate limited setting succeeded");
+#endif /* RATELIMIT */
+
#endif /* INET */
/*
@@ -3170,6 +3186,7 @@ in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_p
{
union if_snd_tag_modify_params params = {
.rate_limit.max_rate = max_pacing_rate,
+ .rate_limit.flags = M_NOWAIT,
};
struct m_snd_tag *mst;
struct ifnet *ifp;
@@ -3256,7 +3273,8 @@ in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_tx
*/
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
- uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate)
+ uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
+
{
union if_snd_tag_alloc_params params = {
.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
@@ -3264,22 +3282,47 @@ in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *
.rate_limit.hdr.flowid = flowid,
.rate_limit.hdr.flowtype = flowtype,
.rate_limit.max_rate = max_pacing_rate,
+ .rate_limit.flags = M_NOWAIT,
};
int error;
INP_WLOCK_ASSERT(inp);
- if (inp->inp_snd_tag != NULL)
+ if (*st != NULL)
return (EINVAL);
if (ifp->if_snd_tag_alloc == NULL) {
error = EOPNOTSUPP;
} else {
error = ifp->if_snd_tag_alloc(ifp, ¶ms, &inp->inp_snd_tag);
+
+ if (error == 0) {
+ counter_u64_add(rate_limit_set_ok, 1);
+ counter_u64_add(rate_limit_active, 1);
+ } else
+ counter_u64_add(rate_limit_alloc_fail, 1);
}
return (error);
}
+void
+in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst)
+{
+ if (ifp == NULL)
+ return;
+
+ /*
+ * If the device was detached while we still had reference(s)
+ * on the ifp, we assume if_snd_tag_free() was replaced with
+ * stubs.
+ */
+ ifp->if_snd_tag_free(mst);
+
+ /* release reference count on network interface */
+ if_rele(ifp);
+ counter_u64_add(rate_limit_active, -1);
+}
+
/*
* Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
* if any:
@@ -3300,49 +3343,12 @@ in_pcbdetach_txrtlmt(struct inpcb *inp)
m_snd_tag_rele(mst);
}
-/*
- * This function should be called when the INP_RATE_LIMIT_CHANGED flag
- * is set in the fast path and will attach/detach/modify the TX rate
- * limit send tag based on the socket's so_max_pacing_rate value.
- */
-void
-in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
{
- struct socket *socket;
- uint32_t max_pacing_rate;
- bool did_upgrade;
int error;
- if (inp == NULL)
- return;
-
- socket = inp->inp_socket;
- if (socket == NULL)
- return;
-
- if (!INP_WLOCKED(inp)) {
- /*
- * NOTE: If the write locking fails, we need to bail
- * out and use the non-ratelimited ring for the
- * transmit until there is a new chance to get the
- * write lock.
- */
- if (!INP_TRY_UPGRADE(inp))
- return;
- did_upgrade = 1;
- } else {
- did_upgrade = 0;
- }
-
/*
- * NOTE: The so_max_pacing_rate value is read unlocked,
- * because atomic updates are not required since the variable
- * is checked at every mbuf we send. It is assumed that the
- * variable read itself will be atomic.
- */
- max_pacing_rate = socket->so_max_pacing_rate;
-
- /*
* If the existing send tag is for the wrong interface due to
* a route change, first drop the existing tag. Set the
* CHANGED flag so that we will keep trying to allocate a new
@@ -3376,13 +3382,61 @@ in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *
error = EAGAIN;
} else {
error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
- mb->m_pkthdr.flowid, max_pacing_rate);
+ mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
}
} else {
error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
}
if (error == 0 || error == EOPNOTSUPP)
inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;
+
+ return (error);
+}
+
+/*
+ * This function should be called when the INP_RATE_LIMIT_CHANGED flag
+ * is set in the fast path and will attach/detach/modify the TX rate
+ * limit send tag based on the socket's so_max_pacing_rate value.
+ */
+void
+in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
+{
+ struct socket *socket;
+ uint32_t max_pacing_rate;
+ bool did_upgrade;
+ int error;
+
+ if (inp == NULL)
+ return;
+
+ socket = inp->inp_socket;
+ if (socket == NULL)
+ return;
+
+ if (!INP_WLOCKED(inp)) {
+ /*
+ * NOTE: If the write locking fails, we need to bail
+ * out and use the non-ratelimited ring for the
+ * transmit until there is a new chance to get the
+ * write lock.
+ */
+ if (!INP_TRY_UPGRADE(inp))
+ return;
+ did_upgrade = 1;
+ } else {
+ did_upgrade = 0;
+ }
+
+ /*
+ * NOTE: The so_max_pacing_rate value is read unlocked,
+ * because atomic updates are not required since the variable
+ * is checked at every mbuf we send. It is assumed that the
+ * variable read itself will be atomic.
+ */
+ max_pacing_rate = socket->so_max_pacing_rate;
+
+ error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);
+
if (did_upgrade)
INP_DOWNGRADE(inp);
}
@@ -3424,4 +3478,14 @@ in_pcboutput_eagain(struct inpcb *inp)
if (did_upgrade)
INP_DOWNGRADE(inp);
}
+
+static void
+rl_init(void *st)
+{
+ rate_limit_active = counter_u64_alloc(M_WAITOK);
+ rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
+ rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
+}
+
+SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
#endif /* RATELIMIT */
Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h Thu Aug 1 14:13:04 2019 (r350500)
+++ head/sys/netinet/in_pcb.h Thu Aug 1 14:17:31 2019 (r350501)
@@ -883,8 +883,13 @@ struct sockaddr *
in_sockaddr(in_port_t port, struct in_addr *addr);
void in_pcbsosetlabel(struct socket *so);
#ifdef RATELIMIT
-int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t, uint32_t);
+int
+in_pcboutput_txrtlmt_locked(struct inpcb *, struct ifnet *,
+ struct mbuf *, uint32_t);
+int in_pcbattach_txrtlmt(struct inpcb *, struct ifnet *, uint32_t, uint32_t,
+ uint32_t, struct m_snd_tag **);
void in_pcbdetach_txrtlmt(struct inpcb *);
+void in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst);
int in_pcbmodify_txrtlmt(struct inpcb *, uint32_t);
int in_pcbquery_txrtlmt(struct inpcb *, uint32_t *);
int in_pcbquery_txrlevel(struct inpcb *, uint32_t *);
Added: head/sys/netinet/tcp_ratelimit.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ head/sys/netinet/tcp_ratelimit.c Thu Aug 1 14:17:31 2019 (r350501)
@@ -0,0 +1,1234 @@
+/*-
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Copyright (c) 2018-2019
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/**
+ * Author: Randall Stewart <rrs at netflix.com>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/eventhandler.h>
+#include <sys/mutex.h>
+#include <sys/ck.h>
+#define TCPSTATES /* for logging */
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_ratelimit.h>
+#ifndef USECS_IN_SECOND
+#define USECS_IN_SECOND 1000000
+#endif
+/*
+ * For the purposes of each send, what is the size
+ * of an ethernet frame.
+ */
+#ifndef ETHERNET_SEGMENT_SIZE
+#define ETHERNET_SEGMENT_SIZE 1500
+#endif
+MALLOC_DEFINE(M_TCPPACE, "tcp_hwpace", "TCP Hardware pacing memory");
+#ifdef RATELIMIT
+
+#define COMMON_RATE 180500
+uint64_t desired_rates[] = {
+ 62500, /* 500Kbps */
+ 180500, /* 1.44Mpbs */
+ 375000, /* 3Mbps */
+ 500000, /* 4Mbps */
+ 625000, /* 5Mbps */
+ 750000, /* 6Mbps */
+ 1000000, /* 8Mbps */
+ 1250000, /* 10Mbps */
+ 2500000, /* 20Mbps */
+ 3750000, /* 30Mbps */
+ 5000000, /* 40Meg */
+ 6250000, /* 50Mbps */
+ 12500000, /* 100Mbps */
+ 25000000, /* 200Mbps */
+ 50000000, /* 400Mbps */
+ 100000000, /* 800Mbps */
+ 12500, /* 100kbps */
+ 25000, /* 200kbps */
+ 875000, /* 7Mbps */
+ 1125000, /* 9Mbps */
+ 1875000, /* 15Mbps */
+ 3125000, /* 25Mbps */
+ 8125000, /* 65Mbps */
+ 10000000, /* 80Mbps */
+ 18750000, /* 150Mbps */
+ 20000000, /* 250Mbps */
+ 37500000, /* 350Mbps */
+ 62500000, /* 500Mbps */
+ 78125000, /* 625Mbps */
+ 125000000, /* 1Gbps */
+};
+#define MAX_HDWR_RATES (sizeof(desired_rates)/sizeof(uint64_t))
+#define RS_ORDERED_COUNT 16 /*
+ * Number that are in order
+ * at the beginning of the table,
+ * over this a sort is required.
+ */
+#define RS_NEXT_ORDER_GROUP 16 /*
+ * The point in our table where
+ * we come fill in a second ordered
+ * group (index wise means -1).
+ */
+#define ALL_HARDWARE_RATES 1004 /*
+ * 1Meg - 1Gig in 1 Meg steps
+ * plus 100, 200k and 500k and
+ * 10Gig
+ */
+
+#define RS_ONE_MEGABIT_PERSEC 1000000
+#define RS_ONE_GIGABIT_PERSEC 1000000000
+#define RS_TEN_GIGABIT_PERSEC 10000000000
+
+static struct head_tcp_rate_set int_rs;
+static struct mtx rs_mtx;
+uint32_t rs_number_alive;
+uint32_t rs_number_dead;
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, rl, CTLFLAG_RW, 0,
+ "TCP Ratelimit stats");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, alive, CTLFLAG_RW,
+ &rs_number_alive, 0,
+ "Number of interfaces initialized for ratelimiting");
+SYSCTL_UINT(_net_inet_tcp_rl, OID_AUTO, dead, CTLFLAG_RW,
+ &rs_number_dead, 0,
+ "Number of interfaces departing from ratelimiting");
+
+static void
+rl_add_syctl_entries(struct sysctl_oid *rl_sysctl_root, struct tcp_rate_set *rs)
+{
+ /*
+	 * Add sysctl entries for this interface.
+ */
+ if (rs->rs_flags & RS_INTF_NO_SUP) {
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "disable", CTLFLAG_RD,
+ &rs->rs_disable, 0,
+ "Disable this interface from new hdwr limiting?");
+ } else {
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "disable", CTLFLAG_RW,
+ &rs->rs_disable, 0,
+ "Disable this interface from new hdwr limiting?");
+ }
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "minseg", CTLFLAG_RW,
+ &rs->rs_min_seg, 0,
+ "What is the minimum we need to send on this interface?");
+ SYSCTL_ADD_U64(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "flow_limit", CTLFLAG_RW,
+ &rs->rs_flow_limit, 0,
+ "What is the limit for number of flows (0=unlimited)?");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "highest", CTLFLAG_RD,
+ &rs->rs_highest_valid, 0,
+ "Highest valid rate");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "lowest", CTLFLAG_RD,
+ &rs->rs_lowest_valid, 0,
+ "Lowest valid rate");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "flags", CTLFLAG_RD,
+ &rs->rs_flags, 0,
+ "What lags are on the entry?");
+ SYSCTL_ADD_S32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "numrates", CTLFLAG_RD,
+ &rs->rs_rate_cnt, 0,
+ "How many rates re there?");
+ SYSCTL_ADD_U64(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO, "flows_using", CTLFLAG_RD,
+ &rs->rs_flows_using, 0,
+ "How many flows are using this interface now?");
+#ifdef DETAILED_RATELIMIT_SYSCTL
+ if (rs->rs_rlt && rs->rs_rate_cnt > 0) {
+ /* Lets display the rates */
+ int i;
+ struct sysctl_oid *rl_rates;
+ struct sysctl_oid *rl_rate_num;
+ char rate_num[16];
+ rl_rates = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_sysctl_root),
+ OID_AUTO,
+ "rate",
+ CTLFLAG_RW, 0,
+ "Ratelist");
+ for( i = 0; i < rs->rs_rate_cnt; i++) {
+ sprintf(rate_num, "%d", i);
+ rl_rate_num = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rates),
+ OID_AUTO,
+ rate_num,
+ CTLFLAG_RW, 0,
+ "Individual Rate");
+ SYSCTL_ADD_U32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rate_num),
+ OID_AUTO, "flags", CTLFLAG_RD,
+ &rs->rs_rlt[i].flags, 0,
+ "Flags on this rate");
+ SYSCTL_ADD_U32(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rate_num),
+ OID_AUTO, "pacetime", CTLFLAG_RD,
+ &rs->rs_rlt[i].time_between, 0,
+ "Time hardware inserts between 1500 byte sends");
+ SYSCTL_ADD_U64(&rs->sysctl_ctx,
+ SYSCTL_CHILDREN(rl_rate_num),
+ OID_AUTO, "rate", CTLFLAG_RD,
+ &rs->rs_rlt[i].rate, 0,
+ "Rate in bytes per second");
+ }
+ }
+#endif
+}
+
+static void
+rs_destroy(epoch_context_t ctx)
+{
+ struct tcp_rate_set *rs;
+
+ rs = __containerof(ctx, struct tcp_rate_set, rs_epoch_ctx);
+ mtx_lock(&rs_mtx);
+ rs->rs_flags &= ~RS_FUNERAL_SCHD;
+ if (rs->rs_flows_using == 0) {
+ /*
+ * In theory its possible (but unlikely)
+		 * that while the delete was occurring
+ * and we were applying the DEAD flag
+ * someone slipped in and found the
+ * interface in a lookup. While we
+ * decided rs_flows_using were 0 and
+ * scheduling the epoch_call, the other
+ * thread incremented rs_flow_using. This
+ * is because users have a pointer and
+ * we only use the rs_flows_using in an
+ * atomic fashion, i.e. the other entities
+ * are not protected. To assure this did
+ * not occur, we check rs_flows_using here
+		 * before deleting.
+ */
+ sysctl_ctx_free(&rs->sysctl_ctx);
+ free(rs->rs_rlt, M_TCPPACE);
+ free(rs, M_TCPPACE);
+ rs_number_dead--;
+ }
+ mtx_unlock(&rs_mtx);
+
+}
+
+extern counter_u64_t rate_limit_set_ok;
+extern counter_u64_t rate_limit_active;
+extern counter_u64_t rate_limit_alloc_fail;
+
+static int
+rl_attach_txrtlmt(struct ifnet *ifp,
+ uint32_t flowtype,
+ int flowid,
+ uint64_t cfg_rate,
+ struct m_snd_tag **tag)
+{
+ int error;
+ union if_snd_tag_alloc_params params = {
+ .rate_limit.hdr.type = IF_SND_TAG_TYPE_RATE_LIMIT,
+ .rate_limit.hdr.flowid = flowid,
+ .rate_limit.hdr.flowtype = flowtype,
+ .rate_limit.max_rate = cfg_rate,
+ .rate_limit.flags = M_NOWAIT,
+ };
+
+ if (ifp->if_snd_tag_alloc == NULL) {
+ error = EOPNOTSUPP;
+ } else {
+ error = ifp->if_snd_tag_alloc(ifp, ¶ms, tag);
+ if (error == 0) {
+ if_ref((*tag)->ifp);
+ counter_u64_add(rate_limit_set_ok, 1);
+ counter_u64_add(rate_limit_active, 1);
+ } else
+ counter_u64_add(rate_limit_alloc_fail, 1);
+ }
+ return (error);
+}
+
+static void
+populate_canned_table(struct tcp_rate_set *rs, const uint64_t *rate_table_act)
+{
+ /*
+ * The internal table is "special", it
+	 * is two separate ordered tables that
+ * must be merged. We get here when the
+ * adapter specifies a number of rates that
+ * covers both ranges in the table in some
+ * form.
+ */
+ int i, at_low, at_high;
+ uint8_t low_disabled = 0, high_disabled = 0;
+
+ for(i = 0, at_low = 0, at_high = RS_NEXT_ORDER_GROUP; i < rs->rs_rate_cnt; i++) {
+ rs->rs_rlt[i].flags = 0;
+ rs->rs_rlt[i].time_between = 0;
+ if ((low_disabled == 0) &&
+ (high_disabled ||
+ (rate_table_act[at_low] < rate_table_act[at_high]))) {
+ rs->rs_rlt[i].rate = rate_table_act[at_low];
+ at_low++;
+ if (at_low == RS_NEXT_ORDER_GROUP)
+ low_disabled = 1;
+ } else if (high_disabled == 0) {
+ rs->rs_rlt[i].rate = rate_table_act[at_high];
+ at_high++;
+ if (at_high == MAX_HDWR_RATES)
+ high_disabled = 1;
+ }
+ }
+}
+
+static struct tcp_rate_set *
+rt_setup_new_rs(struct ifnet *ifp, int *error)
+{
+ struct tcp_rate_set *rs;
+ const uint64_t *rate_table_act;
+ uint64_t lentim, res;
+ size_t sz;
+ uint32_t hash_type;
+ int i;
+ struct if_ratelimit_query_results rl;
+ struct sysctl_oid *rl_sysctl_root;
+ /*
+ * We expect to enter with the
+ * mutex locked.
+ */
+
+ if (ifp->if_ratelimit_query == NULL) {
+ /*
+ * We can do nothing if we cannot
+ * get a query back from the driver.
+ */
+ return (NULL);
+ }
+ rs = malloc(sizeof(struct tcp_rate_set), M_TCPPACE, M_NOWAIT | M_ZERO);
+ if (rs == NULL) {
+ if (error)
+ *error = ENOMEM;
+ return (NULL);
+ }
+ rl.flags = RT_NOSUPPORT;
+ ifp->if_ratelimit_query(ifp, &rl);
+ if (rl.flags & RT_IS_UNUSABLE) {
+ /*
+ * The interface does not really support
+ * the rate-limiting.
+ */
+ memset(rs, 0, sizeof(struct tcp_rate_set));
+ rs->rs_ifp = ifp;
+ rs->rs_if_dunit = ifp->if_dunit;
+ rs->rs_flags = RS_INTF_NO_SUP;
+ rs->rs_disable = 1;
+ rs_number_alive++;
+ sysctl_ctx_init(&rs->sysctl_ctx);
+ rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+ SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+ OID_AUTO,
+ rs->rs_ifp->if_xname,
+ CTLFLAG_RW, 0,
+ "");
+ CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+ /* Unlock to allow the sysctl stuff to allocate */
+ mtx_unlock(&rs_mtx);
+ rl_add_syctl_entries(rl_sysctl_root, rs);
+ /* re-lock for our caller */
+ mtx_lock(&rs_mtx);
+ return (rs);
+ } else if ((rl.flags & RT_IS_INDIRECT) == RT_IS_INDIRECT) {
+ memset(rs, 0, sizeof(struct tcp_rate_set));
+ rs->rs_ifp = ifp;
+ rs->rs_if_dunit = ifp->if_dunit;
+ rs->rs_flags = RS_IS_DEFF;
+ rs_number_alive++;
+ sysctl_ctx_init(&rs->sysctl_ctx);
+ rl_sysctl_root = SYSCTL_ADD_NODE(&rs->sysctl_ctx,
+ SYSCTL_STATIC_CHILDREN(_net_inet_tcp_rl),
+ OID_AUTO,
+ rs->rs_ifp->if_xname,
+ CTLFLAG_RW, 0,
+ "");
+ CK_LIST_INSERT_HEAD(&int_rs, rs, next);
+ /* Unlock to allow the sysctl stuff to allocate */
+ mtx_unlock(&rs_mtx);
+ rl_add_syctl_entries(rl_sysctl_root, rs);
+ /* re-lock for our caller */
+ mtx_lock(&rs_mtx);
+ return (rs);
+ } else if ((rl.flags & RT_IS_FIXED_TABLE) == RT_IS_FIXED_TABLE) {
+ /* Mellanox most likely */
+ rs->rs_ifp = ifp;
+ rs->rs_if_dunit = ifp->if_dunit;
+ rs->rs_rate_cnt = rl.number_of_rates;
+ rs->rs_min_seg = rl.min_segment_burst;
+ rs->rs_highest_valid = 0;
+ rs->rs_flow_limit = rl.max_flows;
+ rs->rs_flags = RS_IS_INTF | RS_NO_PRE;
+ rs->rs_disable = 0;
+ rate_table_act = rl.rate_table;
+ } else if ((rl.flags & RT_IS_SELECTABLE) == RT_IS_SELECTABLE) {
+ /* Chelsio */
+ rs->rs_ifp = ifp;
+ rs->rs_if_dunit = ifp->if_dunit;
+ rs->rs_rate_cnt = rl.number_of_rates;
+ rs->rs_min_seg = rl.min_segment_burst;
+ rs->rs_disable = 0;
+ rs->rs_flow_limit = rl.max_flows;
+ rate_table_act = desired_rates;
+ if ((rs->rs_rate_cnt > MAX_HDWR_RATES) &&
+ (rs->rs_rate_cnt < ALL_HARDWARE_RATES)) {
+ /*
+ * Our desired table is not big
+ * enough, do what we can.
+ */
+ rs->rs_rate_cnt = MAX_HDWR_RATES;
+ }
+ if (rs->rs_rate_cnt <= RS_ORDERED_COUNT)
+ rs->rs_flags = RS_IS_INTF;
+ else
+ rs->rs_flags = RS_IS_INTF | RS_INT_TBL;
+ if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES)
+ rs->rs_rate_cnt = ALL_HARDWARE_RATES;
+ } else {
+ printf("Interface:%s unit:%d not one known to have rate-limits\n",
+ ifp->if_dname,
+ ifp->if_dunit);
+ free(rs, M_TCPPACE);
+ return (NULL);
+ }
+ sz = sizeof(struct tcp_hwrate_limit_table) * rs->rs_rate_cnt;
+ rs->rs_rlt = malloc(sz, M_TCPPACE, M_NOWAIT);
+ if (rs->rs_rlt == NULL) {
+ if (error)
+ *error = ENOMEM;
+bail:
+ free(rs, M_TCPPACE);
+ return (NULL);
+ }
+ if (rs->rs_rate_cnt >= ALL_HARDWARE_RATES) {
+ /*
+ * The interface supports all
+ * the rates we could possibly want.
+ */
+ uint64_t rat;
+
+ rs->rs_rlt[0].rate = 12500; /* 100k */
+ rs->rs_rlt[1].rate = 25000; /* 200k */
+ rs->rs_rlt[2].rate = 62500; /* 500k */
+ /* Note 125000 == 1Megabit
+ * populate 1Meg - 1000meg.
+ */
+ for(i = 3, rat = 125000; i< (ALL_HARDWARE_RATES-1); i++) {
+ rs->rs_rlt[i].rate = rat;
+ rat += 125000;
+ }
+ rs->rs_rlt[(ALL_HARDWARE_RATES-1)].rate = 1250000000;
+ } else if (rs->rs_flags & RS_INT_TBL) {
+ /* We populate this in a special way */
+ populate_canned_table(rs, rate_table_act);
+ } else {
+ /*
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-all
mailing list