svn commit: r355304 - in head: lib/libstats share/man/man4 sys/conf sys/netinet sys/netinet/cc sys/sys

Edward Tomasz Napierala trasz at FreeBSD.org
Mon Dec 2 20:58:07 UTC 2019


Author: trasz
Date: Mon Dec  2 20:58:04 2019
New Revision: 355304
URL: https://svnweb.freebsd.org/changeset/base/355304

Log:
  Make use of the stats(3) framework in the TCP stack.
  
  This makes it possible to retrieve per-connection statistical
  information such as the receive window size, RTT, or goodput,
  using a newly added TCP_STATS getsockopt(2) option, and to extract
  individual values using the stats_voistat_fetch(3) API.
  
  See the net/tcprtt port for an example consumer of this API.
  
  Compared to the existing TCP_INFO system, the main differences
  are that this mechanism is easy to extend without breaking ABI,
  and provides statistical information instead of raw "snapshots"
  of values at a given point in time.  stats(3) is more generic
  and can be used in both userland and the kernel.
  
  Reviewed by:	thj
  Tested by:	thj
  Obtained from:	Netflix
  Relnotes:	yes
  Sponsored by:	Klara Inc, Netflix
  Differential Revision:	https://reviews.freebsd.org/D20655

Added:
  head/sys/netinet/tcp_stats.c   (contents, props changed)
Modified:
  head/lib/libstats/Makefile
  head/share/man/man4/tcp.4
  head/sys/conf/files
  head/sys/netinet/cc/cc.h
  head/sys/netinet/tcp.h
  head/sys/netinet/tcp_input.c
  head/sys/netinet/tcp_log_buf.c
  head/sys/netinet/tcp_output.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h
  head/sys/sys/stats.h

Modified: head/lib/libstats/Makefile
==============================================================================
--- head/lib/libstats/Makefile	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/lib/libstats/Makefile	Mon Dec  2 20:58:04 2019	(r355304)
@@ -3,12 +3,12 @@
 LIB=		stats
 SHLIBDIR?= 	/lib
 SHLIB_MAJOR=	0
-SRCS=		subr_stats.c
+SRCS=		subr_stats.c tcp_stats.c
 
 # To debug, comment WITHOUT_ASSERT_DEBUG= and uncomment CFLAGS:=
 WITHOUT_ASSERT_DEBUG=
 #CFLAGS:=${CFLAGS:C/-O[0-9]/-O0 -g3/} -DDIAGNOSTIC
 
-.PATH:	${.CURDIR}/../../sys/kern
+.PATH:	${.CURDIR}/../../sys/kern ${.CURDIR}/../../sys/netinet
 
 .include <bsd.lib.mk>

Modified: head/share/man/man4/tcp.4
==============================================================================
--- head/share/man/man4/tcp.4	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/share/man/man4/tcp.4	Mon Dec  2 20:58:04 2019	(r355304)
@@ -34,7 +34,7 @@
 .\"     From: @(#)tcp.4	8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd December 1, 2019
+.Dd December 2, 2019
 .Dt TCP 4
 .Os
 .Sh NAME
@@ -291,6 +291,10 @@ This entry can only be specified on a per-host basis a
 .Pp
 If an SADB entry cannot be found for the destination,
 the system does not send any outgoing segments and drops any inbound segments.
 .Pp
 Each dropped segment is taken into account in the TCP protocol statistics.
+.It Dv TCP_STATS
+Manage collection of connection level statistics using the
+.Xr stats 3
+framework.
 .It Dv TCP_TXTLS_ENABLE
@@ -664,6 +668,17 @@ Default is false.
 When initializing the TCP timestamps, use a per connection offset instead of a
 per host pair offset.
 Default is to use per connection offsets as recommended in RFC 7323.
+.It Va perconn_stats_enable
+Controls the default collection of statistics for all connections using the
+.Xr stats 3
+framework.
+0 disables, 1 enables, 2 enables random sampling across log id connection
+groups with all connections in a group receiving the same setting.
+.It Va perconn_stats_sample_rates
+A CSV list of template_spec=percent key-value pairs which controls the per
+template sampling rates when
+.Xr stats 3
+sampling is enabled.
 .El
 .Sh ERRORS
 A socket operation may fail with one of the following errors returned:
@@ -703,6 +718,7 @@ when trying to use a TCP function block that is not av
 .Sh SEE ALSO
 .Xr getsockopt 2 ,
 .Xr socket 2 ,
+.Xr stats 3 ,
 .Xr sysctl 3 ,
 .Xr blackhole 4 ,
 .Xr inet 4 ,

Modified: head/sys/conf/files
==============================================================================
--- head/sys/conf/files	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/sys/conf/files	Mon Dec  2 20:58:04 2019	(r355304)
@@ -4295,6 +4295,7 @@ netinet/tcp_pcap.c		optional inet tcppcap | inet6 tcpp
 	compile-with "${NORMAL_C} ${NO_WNONNULL}"
 netinet/tcp_reass.c		optional inet | inet6
 netinet/tcp_sack.c		optional inet | inet6
+netinet/tcp_stats.c		optional stats inet | stats inet6
 netinet/tcp_subr.c		optional inet | inet6
 netinet/tcp_syncache.c		optional inet | inet6
 netinet/tcp_timer.c		optional inet | inet6

Modified: head/sys/netinet/cc/cc.h
==============================================================================
--- head/sys/netinet/cc/cc.h	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/sys/netinet/cc/cc.h	Mon Dec  2 20:58:04 2019	(r355304)
@@ -51,9 +51,7 @@
 #ifndef _NETINET_CC_CC_H_
 #define _NETINET_CC_CC_H_
 
-#if !defined(_KERNEL)
-#error "no user-serviceable parts inside"
-#endif
+#ifdef _KERNEL
 
 /* Global CC vars. */
 extern STAILQ_HEAD(cc_head, cc_algo) cc_list;
@@ -108,6 +106,7 @@ struct cc_var {
 #define	CC_DUPACK	0x0002	/* Duplicate ACK. */
 #define	CC_PARTIALACK	0x0004	/* Not yet. */
 #define	CC_SACK		0x0008	/* Not yet. */
+#endif /* _KERNEL */
 
 /*
  * Congestion signal types passed to the cong_signal() hook. The highest order 8
@@ -121,6 +120,7 @@ struct cc_var {
 
 #define	CC_SIGPRIVMASK	0xFF000000	/* Mask to check if sig is private. */
 
+#ifdef _KERNEL
 /*
  * Structure to hold data and function pointers that together represent a
  * congestion control algorithm.
@@ -184,4 +184,5 @@ extern struct rwlock cc_list_lock;
 
 #define CC_ALGOOPT_LIMIT	2048
 
+#endif /* _KERNEL */
 #endif /* _NETINET_CC_CC_H_ */

Modified: head/sys/netinet/tcp.h
==============================================================================
--- head/sys/netinet/tcp.h	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/sys/netinet/tcp.h	Mon Dec  2 20:58:04 2019	(r355304)
@@ -168,6 +168,7 @@ struct tcphdr {
 #define TCP_NOOPT	8	/* don't use TCP options */
 #define TCP_MD5SIG	16	/* use MD5 digests (RFC2385) */
 #define	TCP_INFO	32	/* retrieve tcp_info structure */
+#define	TCP_STATS	33	/* retrieve stats blob structure */
 #define	TCP_LOG		34	/* configure event logging for connection */
 #define	TCP_LOGBUF	35	/* retrieve event log for connection */
 #define	TCP_LOGID	36	/* configure log ID to correlate connections */
@@ -363,5 +364,19 @@ struct tcp_function_set {
  * TCP Control message types
  */
 #define	TLS_SET_RECORD_TYPE	1
+
+/*
+ * TCP specific variables of interest for tp->t_stats stats(9) accounting.
+ */
+#define	VOI_TCP_TXPB		0 /* Transmit payload bytes */
+#define	VOI_TCP_RETXPB		1 /* Retransmit payload bytes */
+#define	VOI_TCP_FRWIN		2 /* Foreign receive window */
+#define	VOI_TCP_LCWIN		3 /* Local congestion window */
+#define	VOI_TCP_RTT		4 /* Round trip time */
+#define	VOI_TCP_CSIG		5 /* Congestion signal */
+#define	VOI_TCP_GPUT		6 /* Goodput */
+#define	VOI_TCP_CALCFRWINDIFF	7 /* Congestion avoidance LCWIN - FRWIN */
+#define	VOI_TCP_GPUT_ND		8 /* Goodput normalised delta */
+#define	VOI_TCP_ACKLEN		9 /* Average ACKed bytes per ACK */
 
 #endif /* !_NETINET_TCP_H_ */

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/sys/netinet/tcp_input.c	Mon Dec  2 20:58:04 2019	(r355304)
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
 #include "opt_tcpdebug.h"
 
 #include <sys/param.h>
+#include <sys/arb.h>
 #include <sys/kernel.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
@@ -66,6 +67,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mbuf.h>
 #include <sys/proc.h>		/* for proc0 declaration */
 #include <sys/protosw.h>
+#include <sys/qmath.h>
 #include <sys/sdt.h>
 #include <sys/signalvar.h>
 #include <sys/socket.h>
@@ -73,6 +75,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/sysctl.h>
 #include <sys/syslog.h>
 #include <sys/systm.h>
+#include <sys/stats.h>
 
 #include <machine/cpu.h>	/* before tcp_seq.h, for tcp_random18() */
 
@@ -298,6 +301,10 @@ void
 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t nsegs,
     uint16_t type)
 {
+#ifdef STATS
+	int32_t gput;
+#endif
+
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	tp->ccv->nsegs = nsegs;
@@ -310,6 +317,35 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, u
 		tp->ccv->flags &= ~CCF_CWND_LIMITED;
 
 	if (type == CC_ACK) {
+#ifdef STATS
+		stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF,
+		    ((int32_t)tp->snd_cwnd) - tp->snd_wnd);
+		if (!IN_RECOVERY(tp->t_flags))
+			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_ACKLEN,
+			   tp->ccv->bytes_this_ack / (tcp_maxseg(tp) * nsegs));
+		if ((tp->t_flags & TF_GPUTINPROG) &&
+		    SEQ_GEQ(th->th_ack, tp->gput_ack)) {
+			/*
+			 * Compute goodput in bits per millisecond.
+			 */
+			gput = (((int64_t)(th->th_ack - tp->gput_seq)) << 3) /
+			    max(1, tcp_ts_getticks() - tp->gput_ts);
+			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT,
+			    gput);
+			/*
+			 * XXXLAS: This is a temporary hack, and should be
+			 * chained off VOI_TCP_GPUT when stats(9) grows an API
+			 * to deal with chained VOIs.
+			 */
+			if (tp->t_stats_gput_prev > 0)
+				stats_voi_update_abs_s32(tp->t_stats,
+				    VOI_TCP_GPUT_ND,
+				    ((gput - tp->t_stats_gput_prev) * 100) /
+				    tp->t_stats_gput_prev);
+			tp->t_flags &= ~TF_GPUTINPROG;
+			tp->t_stats_gput_prev = gput;
+		}
+#endif /* STATS */
 		if (tp->snd_cwnd > tp->snd_ssthresh) {
 			tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
 			     nsegs * V_tcp_abc_l_var * tcp_maxseg(tp));
@@ -328,6 +364,9 @@ cc_ack_received(struct tcpcb *tp, struct tcphdr *th, u
 		tp->ccv->curack = th->th_ack;
 		CC_ALGO(tp)->ack_received(tp->ccv, type);
 	}
+#ifdef STATS
+	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, tp->snd_cwnd);
+#endif
 }
 
 void 
@@ -393,6 +432,10 @@ cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, ui
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
+#ifdef STATS
+	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type);
+#endif
+
 	switch(type) {
 	case CC_NDUPACK:
 		if (!IN_FASTRECOVERY(tp->t_flags)) {
@@ -1496,6 +1539,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, stru
 	 * For the SYN_SENT state the scale is zero.
 	 */
 	tiwin = th->th_win << tp->snd_scale;
+#ifdef STATS
+	stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin);
+#endif
 
 	/*
 	 * TCP ECN processing.
@@ -3359,6 +3405,10 @@ tcp_xmit_timer(struct tcpcb *tp, int rtt)
 
 	TCPSTAT_INC(tcps_rttupdated);
 	tp->t_rttupdated++;
+#ifdef STATS
+	stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT,
+	    imax(0, rtt * 1000 / hz));
+#endif
 	if ((tp->t_srtt != 0) && (tp->t_rxtshift <= TCP_RTT_INVALIDATE)) {
 		/*
 		 * srtt is stored as fixed point with 5 bits after the

Modified: head/sys/netinet/tcp_log_buf.c
==============================================================================
--- head/sys/netinet/tcp_log_buf.c	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/sys/netinet/tcp_log_buf.c	Mon Dec  2 20:58:04 2019	(r355304)
@@ -30,10 +30,12 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/arb.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/qmath.h>
 #include <sys/queue.h>
 #include <sys/refcount.h>
 #include <sys/rwlock.h>
@@ -41,6 +43,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/tree.h>
+#include <sys/stats.h>
 #include <sys/counter.h>
 
 #include <dev/tcp_log/tcp_log_dev.h>
@@ -475,7 +478,7 @@ tcp_log_grow_tlb(char *tlb_id, struct tcpcb *tp)
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
-#ifdef NETFLIX
+#ifdef STATS
 	if (V_tcp_perconn_stats_enable == 2 && tp->t_stats == NULL)
 		(void)tcp_stats_sample_rollthedice(tp, tlb_id, strlen(tlb_id));
 #endif

Modified: head/sys/netinet/tcp_output.c
==============================================================================
--- head/sys/netinet/tcp_output.c	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/sys/netinet/tcp_output.c	Mon Dec  2 20:58:04 2019	(r355304)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/arb.h>
 #include <sys/domain.h>
 #ifdef TCP_HHOOK
 #include <sys/hhook.h>
@@ -54,10 +55,12 @@ __FBSDID("$FreeBSD$");
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 #include <sys/protosw.h>
+#include <sys/qmath.h>
 #include <sys/sdt.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
+#include <sys/stats.h>
 
 #include <net/if.h>
 #include <net/route.h>
@@ -991,15 +994,31 @@ send:
 		struct sockbuf *msb;
 		u_int moff;
 
-		if ((tp->t_flags & TF_FORCEDATA) && len == 1)
+		if ((tp->t_flags & TF_FORCEDATA) && len == 1) {
 			TCPSTAT_INC(tcps_sndprobe);
-		else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
+#ifdef STATS
+			if (SEQ_LT(tp->snd_nxt, tp->snd_max))
+				stats_voi_update_abs_u32(tp->t_stats,
+				VOI_TCP_RETXPB, len);
+			else
+				stats_voi_update_abs_u64(tp->t_stats,
+				    VOI_TCP_TXPB, len);
+#endif /* STATS */
+		} else if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) {
 			tp->t_sndrexmitpack++;
 			TCPSTAT_INC(tcps_sndrexmitpack);
 			TCPSTAT_ADD(tcps_sndrexmitbyte, len);
+#ifdef STATS
+			stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB,
+			    len);
+#endif /* STATS */
 		} else {
 			TCPSTAT_INC(tcps_sndpack);
 			TCPSTAT_ADD(tcps_sndbyte, len);
+#ifdef STATS
+			stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB,
+			    len);
+#endif /* STATS */
 		}
 #ifdef INET6
 		if (MHLEN < hdrlen + max_linkhdr)
@@ -1472,6 +1491,15 @@ out:
 				tp->t_rtseq = startseq;
 				TCPSTAT_INC(tcps_segstimed);
 			}
+#ifdef STATS
+			if (!(tp->t_flags & TF_GPUTINPROG) && len) {
+				tp->t_flags |= TF_GPUTINPROG;
+				tp->gput_seq = startseq;
+				tp->gput_ack = startseq +
+				    ulmin(sbavail(&so->so_snd) - off, sendwin);
+				tp->gput_ts = tcp_ts_getticks();
+			}
+#endif /* STATS */
 		}
 
 		/*

Added: head/sys/netinet/tcp_stats.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/netinet/tcp_stats.c	Mon Dec  2 20:58:04 2019	(r355304)
@@ -0,0 +1,274 @@
+/*-
+ * Copyright (c) 2016-2018 Netflix, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Author: Lawrence Stewart <lstewart at netflix.com>
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/arb.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/qmath.h>
+#include <sys/queue.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#ifdef _KERNEL
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
+#include <sys/systm.h>
+#endif
+#include <sys/stats.h>
+
+#include <net/vnet.h>
+
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+
+#include <netinet/cc/cc.h>
+
+VNET_DEFINE(int, tcp_perconn_stats_dflt_tpl) = -1;	/* set by tcp_stats_init() */
+
+#ifndef _KERNEL	/* userland build: provide the V_ accessors used below */
+#define	V_tcp_perconn_stats_enable	VNET(tcp_perconn_stats_enable)
+#define	V_tcp_perconn_stats_dflt_tpl	VNET(tcp_perconn_stats_dflt_tpl)
+#else /* _KERNEL */
+
+VNET_DEFINE(int, tcp_perconn_stats_enable) = 2;	/* 0 off, 1 all, 2 sampled */
+VNET_DEFINE_STATIC(struct stats_tpl_sample_rate *, tcp_perconn_stats_sample_rates);
+VNET_DEFINE_STATIC(int, tcp_stats_nrates) = 0;	/* count paired with the list */
+#define	V_tcp_perconn_stats_sample_rates VNET(tcp_perconn_stats_sample_rates)
+#define	V_tcp_stats_nrates		VNET(tcp_stats_nrates)
+
+static struct rmlock tcp_stats_tpl_sampling_lock;	/* guards list + count */
+static int tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action,
+    struct stats_tpl_sample_rate **rates, int *nrates, void *ctx);
+/* Sysctl knobs under net.inet.tcp; both are described in tcp(4). */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, perconn_stats_enable,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_perconn_stats_enable), 0,
+    "Enable per-connection TCP stats gathering; 1 enables for all connections, "
+    "2 enables random sampling across log id connection groups");
+SYSCTL_PROC(_net_inet_tcp, OID_AUTO, perconn_stats_sample_rates,
+    CTLTYPE_STRING | CTLFLAG_RW, tcp_stats_tpl_sr_cb,
+    sizeof(struct rm_priotracker), stats_tpl_sample_rates, "A",
+    "TCP stats per template random sampling rates, in CSV tpl_spec=percent "
+    "key-value pairs (see stats(9) for template spec details)");
+#endif /* _KERNEL */
+
+/*
+ * Create the "TCP_DEFAULT" template and add a voistat spec for every VOI_TCP_*
+ * VOI.  Returns 0 on success, else the most recent non-zero error.
+ */
+#ifdef _KERNEL
+int
+#else
+static int
+__attribute__ ((constructor))	/* also fill the userland template list */
+#endif
+tcp_stats_init(void)
+{
+	int err = 0, lasterr = 0;
+
+	V_tcp_perconn_stats_dflt_tpl = stats_tpl_alloc("TCP_DEFAULT", 0);
+	if (V_tcp_perconn_stats_dflt_tpl < 0)
+		return (-V_tcp_perconn_stats_dflt_tpl);	/* failure is < 0 */
+
+	struct voistatspec vss_sum[] = {
+		STATS_VSS_SUM(),
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_TXPB, "TCP_TXPB", VSD_DTYPE_INT_U64,
+	    NVSS(vss_sum), vss_sum, 0);
+	lasterr = err ? err : lasterr;	/* keep one error code, never a mix */
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_RETXPB, "TCP_RETXPB", VSD_DTYPE_INT_U32,
+	    NVSS(vss_sum), vss_sum, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_max[] = {
+		STATS_VSS_MAX(),
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_FRWIN, "TCP_FRWIN", VSD_DTYPE_INT_ULONG,
+	    NVSS(vss_max), vss_max, 0);
+	lasterr = err ? err : lasterr;
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_LCWIN, "TCP_LCWIN", VSD_DTYPE_INT_ULONG,
+	    NVSS(vss_max), vss_max, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_rtt[] = {
+		STATS_VSS_MAX(),
+		STATS_VSS_MIN(),
+		STATS_VSS_TDGSTCLUST32(20, 4),
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_RTT, "TCP_RTT", VSD_DTYPE_INT_U32,
+	    NVSS(vss_rtt), vss_rtt, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_congsig[] = {
+		STATS_VSS_DVHIST32_USR(HBKTS(DVBKT(CC_ECN), DVBKT(CC_RTO),
+		    DVBKT(CC_RTO_ERR), DVBKT(CC_NDUPACK)), 0)
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_CSIG, "TCP_CSIG", VSD_DTYPE_INT_U32,
+	    NVSS(vss_congsig), vss_congsig, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_gput[] = {
+		STATS_VSS_MAX(),
+		STATS_VSS_TDGSTCLUST32(20, 4),
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_GPUT, "TCP_GPUT", VSD_DTYPE_INT_U32,
+	    NVSS(vss_gput), vss_gput, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_gput_nd[] = {
+		STATS_VSS_TDGSTCLUST32(10, 4),
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_GPUT_ND, "TCP_GPUT_ND", VSD_DTYPE_INT_S32,
+	    NVSS(vss_gput_nd), vss_gput_nd, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_windiff[] = {
+		STATS_VSS_CRHIST32_USR(HBKTS(CRBKT(0)), VSD_HIST_LBOUND_INF)
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_CALCFRWINDIFF, "TCP_CALCFRWINDIFF", VSD_DTYPE_INT_S32,
+	    NVSS(vss_windiff), vss_windiff, 0);
+	lasterr = err ? err : lasterr;
+
+	struct voistatspec vss_acklen[] = {
+		STATS_VSS_MAX(),
+		STATS_VSS_CRHIST32_LIN(0, 9, 1, VSD_HIST_UBOUND_INF)
+	};
+	err = stats_tpl_add_voistats(V_tcp_perconn_stats_dflt_tpl,
+	    VOI_TCP_ACKLEN, "TCP_ACKLEN", VSD_DTYPE_INT_U32,
+	    NVSS(vss_acklen), vss_acklen, 0);
+	lasterr = err ? err : lasterr;
+	return (lasterr);
+}
+
+#ifdef _KERNEL
+/*
+ * Roll the sampling dice for tp and, on a hit, give tp a fresh stats blob.
+ * Returns the template rolled (>= 0), -ENOMEM, or another negative value.
+ */
+int
+tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
+    size_t seed_len)
+{
+	struct rm_priotracker rmpt;
+	int picked;
+
+	if (V_tcp_stats_nrates <= 0)
+		return (-1);
+
+	rm_rlock(&tcp_stats_tpl_sampling_lock, &rmpt);
+	picked = stats_tpl_sample_rollthedice(V_tcp_perconn_stats_sample_rates,
+	    V_tcp_stats_nrates, seed_bytes, seed_len);
+	rm_runlock(&tcp_stats_tpl_sampling_lock, &rmpt);
+	if (picked < 0)
+		return (picked);
+	/* Template selected: drop any old blob, then allocate from it. */
+	INP_WLOCK_ASSERT(tp->t_inpcb);
+	if (tp->t_stats != NULL)
+		stats_blob_destroy(tp->t_stats);
+	tp->t_stats = stats_blob_alloc(picked, 0);
+	return (tp->t_stats == NULL ? -ENOMEM : picked);
+}
+
+/*
+ * Callback handed to stats_tpl_sample_rates() so it can read, lock/unlock and
+ * replace the TCP sample rates list.  ctx points at a struct rm_priotracker.
+ * Returns 0, ENOMEM if ctx is NULL, or EINVAL for a bad action or PUT args.
+ */
+static int
+tcp_stats_tpl_sr_cb(enum stats_tpl_sr_cb_action action,
+    struct stats_tpl_sample_rate **rates, int *nrates, void *ctx)
+{
+	struct stats_tpl_sample_rate *old_rates;
+	int old_nrates;
+
+	if (ctx == NULL)
+		return (ENOMEM);
+
+	switch (action) {
+	case TPL_SR_RLOCKED_GET:
+		/*
+		 * Return with rlock held i.e. this call must be paired with a
+		 * "action == TPL_SR_RUNLOCK" call.
+		 */
+		rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED);
+		rm_rlock(&tcp_stats_tpl_sampling_lock,
+		    (struct rm_priotracker *)ctx);
+		/* FALLTHROUGH */
+	case TPL_SR_UNLOCKED_GET:
+		if (rates != NULL)
+			*rates = V_tcp_perconn_stats_sample_rates;
+		if (nrates != NULL)
+			*nrates = V_tcp_stats_nrates;
+		break;
+	case TPL_SR_RUNLOCK:
+		rm_assert(&tcp_stats_tpl_sampling_lock, RA_RLOCKED);
+		rm_runlock(&tcp_stats_tpl_sampling_lock,
+		    (struct rm_priotracker *)ctx);
+		break;
+	case TPL_SR_PUT:
+		KASSERT(rates != NULL && nrates != NULL,
+		    ("%s: PUT without new rates", __func__));
+		rm_assert(&tcp_stats_tpl_sampling_lock, RA_UNLOCKED);
+		if (rates == NULL || nrates == NULL)
+			return (EINVAL);
+		rm_wlock(&tcp_stats_tpl_sampling_lock);
+		old_rates = V_tcp_perconn_stats_sample_rates;
+		old_nrates = V_tcp_stats_nrates;
+		V_tcp_perconn_stats_sample_rates = *rates;
+		V_tcp_stats_nrates = *nrates;
+		rm_wunlock(&tcp_stats_tpl_sampling_lock);
+		*rates = old_rates;	/* hand the previous list back */
+		*nrates = old_nrates;	/* ...together with its count */
+		break;
+	default:
+		return (EINVAL);
+	}
+
+	return (0);
+}
+
+RM_SYSINIT(tcp_stats_tpl_sampling_lock, &tcp_stats_tpl_sampling_lock,
+    "tcp_stats_tpl_sampling_lock");	/* rmlock is set up via SYSINIT */
+#endif /* _KERNEL */

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/sys/netinet/tcp_subr.c	Mon Dec  2 20:58:04 2019	(r355304)
@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/arb.h>
 #include <sys/callout.h>
 #include <sys/eventhandler.h>
 #ifdef TCP_HHOOK
@@ -54,6 +55,8 @@ __FBSDID("$FreeBSD$");
 #ifdef KERN_TLS
 #include <sys/ktls.h>
 #endif
+#include <sys/qmath.h>
+#include <sys/stats.h>
 #include <sys/sysctl.h>
 #include <sys/jail.h>
 #include <sys/malloc.h>
@@ -1005,6 +1008,11 @@ tcp_init(void)
 	    &V_tcp_hhh[HHOOK_TCP_EST_OUT], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0)
 		printf("%s: WARNING: unable to register helper hook\n", __func__);
 #endif
+#ifdef STATS
+	if (tcp_stats_init())
+		printf("%s: WARNING: unable to initialise TCP stats\n",
+		    __func__);
+#endif
 	hashsize = TCBHASHSIZE;
 	TUNABLE_INT_FETCH(tcbhash_tuneable, &hashsize);
 	if (hashsize == 0) {
@@ -1694,6 +1702,10 @@ tcp_newtcpcb(struct inpcb *inp)
 	if (tp->t_fb->tfb_tcp_fb_init) {
 		(*tp->t_fb->tfb_tcp_fb_init)(tp);
 	}
+#ifdef STATS
+	if (V_tcp_perconn_stats_enable == 1)
+		tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0);
+#endif
 	return (tp);		/* XXX */
 }
 
@@ -1911,6 +1923,9 @@ tcp_discardcb(struct tcpcb *tp)
 
 #ifdef TCP_HHOOK
 	khelp_destroy_osd(tp->osd);
+#endif
+#ifdef STATS
+	stats_blob_destroy(tp->t_stats);
 #endif
 
 	CC_ALGO(tp) = NULL;

Modified: head/sys/netinet/tcp_usrreq.c
==============================================================================
--- head/sys/netinet/tcp_usrreq.c	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/sys/netinet/tcp_usrreq.c	Mon Dec  2 20:58:04 2019	(r355304)
@@ -49,11 +49,13 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/arb.h>
 #include <sys/limits.h>
 #include <sys/malloc.h>
 #include <sys/refcount.h>
 #include <sys/kernel.h>
 #include <sys/ktls.h>
+#include <sys/qmath.h>
 #include <sys/sysctl.h>
 #include <sys/mbuf.h>
 #ifdef INET6
@@ -65,6 +67,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/jail.h>
 #include <sys/syslog.h>
+#include <sys/stats.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -108,6 +111,13 @@ __FBSDID("$FreeBSD$");
 #endif
 #include <netipsec/ipsec_support.h>
 
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
 /*
  * TCP protocol interface to socket abstraction.
  */
@@ -1816,6 +1826,9 @@ tcp_default_ctloutput(struct socket *so, struct sockop
 #endif
 	struct cc_algo *algo;
 	char	*pbuf, buf[TCP_LOG_ID_LEN];
+#ifdef STATS
+	struct statsblob *sbp;
+#endif
 	size_t	len;
 
 	/*
@@ -1933,6 +1946,35 @@ unlock_and_done:
 			error = EINVAL;
 			break;
 
+		case TCP_STATS:
+			INP_WUNLOCK(inp);
+#ifdef STATS
+			error = sooptcopyin(sopt, &optval, sizeof optval,
+			    sizeof optval);
+			if (error)
+				return (error);
+
+			if (optval > 0)
+				sbp = stats_blob_alloc(
+				    V_tcp_perconn_stats_dflt_tpl, 0);
+			else
+				sbp = NULL;
+
+			INP_WLOCK_RECHECK(inp);
+			if ((tp->t_stats != NULL && sbp == NULL) ||
+			    (tp->t_stats == NULL && sbp != NULL)) {
+				struct statsblob *t = tp->t_stats;
+				tp->t_stats = sbp;
+				sbp = t;
+			}
+			INP_WUNLOCK(inp);
+
+			stats_blob_destroy(sbp);
+#else
+			return (EOPNOTSUPP);
+#endif /* !STATS */
+			break;
+
 		case TCP_CONGESTION:
 			INP_WUNLOCK(inp);
 			error = sooptcopyin(sopt, buf, TCP_CA_NAME_MAX - 1, 1);
@@ -2217,6 +2259,55 @@ unlock_and_done:
 			INP_WUNLOCK(inp);
 			error = sooptcopyout(sopt, &ti, sizeof ti);
 			break;
+		case TCP_STATS:
+			{
+#ifdef STATS
+			int nheld;
+			TYPEOF_MEMBER(struct statsblob, flags) sbflags = 0;
+
+			error = 0;
+			socklen_t outsbsz = sopt->sopt_valsize;
+			if (tp->t_stats == NULL)
+				error = ENOENT;
+			else if (outsbsz >= tp->t_stats->cursz)
+				outsbsz = tp->t_stats->cursz;
+			else if (outsbsz >= sizeof(struct statsblob))
+				outsbsz = sizeof(struct statsblob);
+			else
+				error = EINVAL;
+			INP_WUNLOCK(inp);
+			if (error)
+				break;
+
+			sbp = sopt->sopt_val;
+			nheld = atop(round_page(((vm_offset_t)sbp) +
+			    (vm_size_t)outsbsz) - trunc_page((vm_offset_t)sbp));
+			vm_page_t ma[nheld];
+			if (vm_fault_quick_hold_pages(
+			    &curproc->p_vmspace->vm_map, (vm_offset_t)sbp,
+			    outsbsz, VM_PROT_READ | VM_PROT_WRITE, ma,
+			    nheld) < 0) {
+				error = EFAULT;
+				break;
+			}
+
+			if ((error = copyin_nofault(&(sbp->flags), &sbflags,
+			    SIZEOF_MEMBER(struct statsblob, flags))))
+				goto unhold;
+
+			INP_WLOCK_RECHECK(inp);
+			error = stats_blob_snapshot(&sbp, outsbsz, tp->t_stats,
+			    sbflags | SB_CLONE_USRDSTNOFAULT);
+			INP_WUNLOCK(inp);
+			sopt->sopt_valsize = outsbsz;
+unhold:
+			vm_page_unhold_pages(ma, nheld);
+#else
+			INP_WUNLOCK(inp);
+			error = EOPNOTSUPP;
+#endif /* !STATS */
+			break;
+			}
 		case TCP_CONGESTION:
 			len = strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
 			INP_WUNLOCK(inp);

Modified: head/sys/netinet/tcp_var.h
==============================================================================
--- head/sys/netinet/tcp_var.h	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/sys/netinet/tcp_var.h	Mon Dec  2 20:58:04 2019	(r355304)
@@ -210,7 +210,12 @@ struct tcpcb {
 	struct tcp_log_id_node *t_lin;
 	struct tcp_log_id_bucket *t_lib;
 	const char *t_output_caller;	/* Function that called tcp_output */
+	struct statsblob *t_stats;	/* Per-connection stats */
 	uint32_t t_logsn;		/* Log "serial number" */
+	uint32_t gput_ts;		/* Time goodput measurement started */
+	tcp_seq gput_seq;		/* Outbound measurement seq */
+	tcp_seq gput_ack;		/* Inbound measurement ack */
+	int32_t t_stats_gput_prev;	/* XXXLAS: Prev gput measurement */
 	uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */
 	unsigned int *t_tfo_pending;	/* TCP Fast Open server pending counter */
 	union {
@@ -327,7 +332,7 @@ TAILQ_HEAD(tcp_funchead, tcp_function);
 #define	TF_NOPUSH	0x00001000	/* don't push */
 #define	TF_PREVVALID	0x00002000	/* saved values for bad rxmit valid */
 #define	TF_UNUSED1	0x00004000	/* unused */
-#define	TF_UNUSED2	0x00008000	/* unused */
+#define	TF_GPUTINPROG	0x00008000	/* Goodput measurement in progress */
 #define	TF_MORETOCOME	0x00010000	/* More data to be appended to sock */
 #define	TF_LQ_OVERFLOW	0x00020000	/* listen queue overflow */
 #define	TF_LASTIDLE	0x00040000	/* connection was previously idle */
@@ -787,6 +792,10 @@ VNET_DECLARE(int, tcp_insecure_rst);
 VNET_DECLARE(int, tcp_insecure_syn);
 VNET_DECLARE(int, tcp_minmss);
 VNET_DECLARE(int, tcp_mssdflt);
+#ifdef STATS
+VNET_DECLARE(int, tcp_perconn_stats_dflt_tpl);
+VNET_DECLARE(int, tcp_perconn_stats_enable);
+#endif /* STATS */
 VNET_DECLARE(int, tcp_recvspace);
 VNET_DECLARE(int, tcp_sack_globalholes);
 VNET_DECLARE(int, tcp_sack_globalmaxholes);
@@ -823,6 +832,10 @@ VNET_DECLARE(struct inpcbinfo, tcbinfo);
 #define	V_tcp_insecure_syn		VNET(tcp_insecure_syn)
 #define	V_tcp_minmss			VNET(tcp_minmss)
 #define	V_tcp_mssdflt			VNET(tcp_mssdflt)
+#ifdef STATS
+#define	V_tcp_perconn_stats_dflt_tpl	VNET(tcp_perconn_stats_dflt_tpl)
+#define	V_tcp_perconn_stats_enable	VNET(tcp_perconn_stats_enable)
+#endif /* STATS */
 #define	V_tcp_recvspace			VNET(tcp_recvspace)
 #define	V_tcp_sack_globalholes		VNET(tcp_sack_globalholes)
 #define	V_tcp_sack_globalmaxholes	VNET(tcp_sack_globalmaxholes)
@@ -966,10 +979,13 @@ int	 tcp_newreno(struct tcpcb *, struct tcphdr *);
 int	 tcp_compute_pipe(struct tcpcb *);
 uint32_t tcp_compute_initwnd(uint32_t);
 void	 tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t);
+int	 tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes,
+    size_t seed_len);
 struct mbuf *
 	 tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen,
 	   int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls);
 
+int	tcp_stats_init(void);
 
 static inline void
 tcp_fields_to_host(struct tcphdr *th)

Modified: head/sys/sys/stats.h
==============================================================================
--- head/sys/sys/stats.h	Mon Dec  2 20:57:13 2019	(r355303)
+++ head/sys/sys/stats.h	Mon Dec  2 20:58:04 2019	(r355304)
@@ -58,6 +58,9 @@
 #define _SYS_STATS_H_
 
 #include <sys/limits.h>
+#ifdef DIAGNOSTIC
+#include <sys/tree.h>
+#endif
 
 #ifndef _KERNEL
 /*


More information about the svn-src-head mailing list