svn commit: r316676 - in head/sys/netinet: . tcp_stacks

Julian Elischer julian at freebsd.org
Mon Apr 10 14:51:42 UTC 2017


If possible, an MFC to stable/10 as well would be nice.
Thanks.


On 10/4/17 4:19 pm, Steven Hartland wrote:
> Author: smh
> Date: Mon Apr 10 08:19:35 2017
> New Revision: 316676
> URL: https://svnweb.freebsd.org/changeset/base/316676
>
> Log:
>    Use estimated RTT for receive buffer auto resizing instead of timestamps
>    
>    Switched from using timestamps to RTT estimates when performing TCP receive
>    buffer auto resizing, as not all hosts support / enable TCP timestamps.
>    
>    Disabled reset of receive buffer auto scaling when not in bulk receive mode,
>    which gives an extra 20% performance increase.
>    
>    Also extracted auto resizing to a common method shared between standard and
>    fastpath modules.
>    
>    With this AWS S3 downloads at ~17ms latency on a 1Gbps connection jump from
>    ~3MB/s to ~100MB/s using the default settings.
>    
>    Reviewed by:    lstewart, gnn
>    MFC after:      2 weeks
>    Relnotes:       Yes
>    Sponsored by:   Multiplay
>    Differential Revision:  https://reviews.freebsd.org/D9668
>
> Modified:
>    head/sys/netinet/in_kdtrace.c
>    head/sys/netinet/in_kdtrace.h
>    head/sys/netinet/tcp_input.c
>    head/sys/netinet/tcp_output.c
>    head/sys/netinet/tcp_stacks/fastpath.c
>    head/sys/netinet/tcp_var.h
>
> Modified: head/sys/netinet/in_kdtrace.c
> ==============================================================================
> --- head/sys/netinet/in_kdtrace.c	Mon Apr 10 06:19:09 2017	(r316675)
> +++ head/sys/netinet/in_kdtrace.c	Mon Apr 10 08:19:35 2017	(r316676)
> @@ -132,6 +132,14 @@ SDT_PROBE_DEFINE6_XLATE(tcp, , , state__
>       "void *", "void *",
>       "int", "tcplsinfo_t *");
>   
> +SDT_PROBE_DEFINE6_XLATE(tcp, , , receive__autoresize,
> +    "void *", "void *",
> +    "struct tcpcb *", "csinfo_t *",
> +    "struct mbuf *", "ipinfo_t *",
> +    "struct tcpcb *", "tcpsinfo_t *" ,
> +    "struct tcphdr *", "tcpinfoh_t *",
> +    "int", "int");
> +
>   SDT_PROBE_DEFINE5_XLATE(udp, , , receive,
>       "void *", "pktinfo_t *",
>       "struct inpcb *", "csinfo_t *",
>
> Modified: head/sys/netinet/in_kdtrace.h
> ==============================================================================
> --- head/sys/netinet/in_kdtrace.h	Mon Apr 10 06:19:09 2017	(r316675)
> +++ head/sys/netinet/in_kdtrace.h	Mon Apr 10 08:19:35 2017	(r316676)
> @@ -65,6 +65,7 @@ SDT_PROBE_DECLARE(tcp, , , debug__input)
>   SDT_PROBE_DECLARE(tcp, , , debug__output);
>   SDT_PROBE_DECLARE(tcp, , , debug__user);
>   SDT_PROBE_DECLARE(tcp, , , debug__drop);
> +SDT_PROBE_DECLARE(tcp, , , receive__autoresize);
>   
>   SDT_PROBE_DECLARE(udp, , , receive);
>   SDT_PROBE_DECLARE(udp, , , send);
>
> Modified: head/sys/netinet/tcp_input.c
> ==============================================================================
> --- head/sys/netinet/tcp_input.c	Mon Apr 10 06:19:09 2017	(r316675)
> +++ head/sys/netinet/tcp_input.c	Mon Apr 10 08:19:35 2017	(r316676)
> @@ -1486,6 +1486,68 @@ drop:
>   	return (IPPROTO_DONE);
>   }
>   
> +/*
> + * Automatic sizing of receive socket buffer.  Often the send
> + * buffer size is not optimally adjusted to the actual network
> + * conditions at hand (delay bandwidth product).  Setting the
> + * buffer size too small limits throughput on links with high
> + * bandwidth and high delay (eg. trans-continental/oceanic links).
> + *
> + * On the receive side the socket buffer memory is only rarely
> + * used to any significant extent.  This allows us to be much
> + * more aggressive in scaling the receive socket buffer.  For
> + * the case that the buffer space is actually used to a large
> + * extent and we run out of kernel memory we can simply drop
> + * the new segments; TCP on the sender will just retransmit it
> + * later.  Setting the buffer size too big may only consume too
> + * much kernel memory if the application doesn't read() from
> + * the socket or packet loss or reordering makes use of the
> + * reassembly queue.
> + *
> + * The criteria to step up the receive buffer one notch are:
> + *  1. Application has not set receive buffer size with
> + *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
> + *  2. the number of bytes received during the time it takes
> + *     one timestamp to be reflected back to us (the RTT);
> + *  3. received bytes per RTT is within seven eighth of the
> + *     current socket buffer size;
> + *  4. receive buffer size has not hit maximal automatic size;
> + *
> + * This algorithm does one step per RTT at most and only if
> + * we receive a bulk stream w/o packet losses or reorderings.
> + * Shrinking the buffer during idle times is not necessary as
> + * it doesn't consume any memory when idle.
> + *
> + * TODO: Only step up if the application is actually serving
> + * the buffer to better manage the socket buffer resources.
> + */
> +int
> +tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
> +    struct tcpcb *tp, int tlen)
> +{
> +	int newsize = 0;
> +
> +	if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
> +	    tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
> +	    TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
> +	    (tp->t_srtt >> TCP_RTT_SHIFT)) {
> +		if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
> +		    so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
> +			newsize = min(so->so_rcv.sb_hiwat +
> +			    V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max);
> +		}
> +		TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);
> +
> +		/* Start over with next RTT. */
> +		tp->rfbuf_ts = 0;
> +		tp->rfbuf_cnt = 0;
> +	} else {
> +		tp->rfbuf_cnt += tlen;	/* add up */
> +	}
> +
> +	return (newsize);
> +}
> +
>   void
>   tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
>       struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
> @@ -1849,62 +1911,7 @@ tcp_do_segment(struct mbuf *m, struct tc
>   #endif
>   			TCP_PROBE3(debug__input, tp, th, m);
>   
> -		/*
> -		 * Automatic sizing of receive socket buffer.  Often the send
> -		 * buffer size is not optimally adjusted to the actual network
> -		 * conditions at hand (delay bandwidth product).  Setting the
> -		 * buffer size too small limits throughput on links with high
> -		 * bandwidth and high delay (eg. trans-continental/oceanic links).
> -		 *
> -		 * On the receive side the socket buffer memory is only rarely
> -		 * used to any significant extent.  This allows us to be much
> -		 * more aggressive in scaling the receive socket buffer.  For
> -		 * the case that the buffer space is actually used to a large
> -		 * extent and we run out of kernel memory we can simply drop
> -		 * the new segments; TCP on the sender will just retransmit it
> -		 * later.  Setting the buffer size too big may only consume too
> -		 * much kernel memory if the application doesn't read() from
> -		 * the socket or packet loss or reordering makes use of the
> -		 * reassembly queue.
> -		 *
> -		 * The criteria to step up the receive buffer one notch are:
> -		 *  1. Application has not set receive buffer size with
> -		 *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
> -		 *  2. the number of bytes received during the time it takes
> -		 *     one timestamp to be reflected back to us (the RTT);
> -		 *  3. received bytes per RTT is within seven eighth of the
> -		 *     current socket buffer size;
> -		 *  4. receive buffer size has not hit maximal automatic size;
> -		 *
> -		 * This algorithm does one step per RTT at most and only if
> -		 * we receive a bulk stream w/o packet losses or reorderings.
> -		 * Shrinking the buffer during idle times is not necessary as
> -		 * it doesn't consume any memory when idle.
> -		 *
> -		 * TODO: Only step up if the application is actually serving
> -		 * the buffer to better manage the socket buffer resources.
> -		 */
> -			if (V_tcp_do_autorcvbuf &&
> -			    (to.to_flags & TOF_TS) &&
> -			    to.to_tsecr &&
> -			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
> -				if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
> -				    to.to_tsecr - tp->rfbuf_ts < hz) {
> -					if (tp->rfbuf_cnt >
> -					    (so->so_rcv.sb_hiwat / 8 * 7) &&
> -					    so->so_rcv.sb_hiwat <
> -					    V_tcp_autorcvbuf_max) {
> -						newsize =
> -						    min(so->so_rcv.sb_hiwat +
> -						    V_tcp_autorcvbuf_inc,
> -						    V_tcp_autorcvbuf_max);
> -					}
> -					/* Start over with next RTT. */
> -					tp->rfbuf_ts = 0;
> -					tp->rfbuf_cnt = 0;
> -				} else
> -					tp->rfbuf_cnt += tlen;	/* add up */
> -			}
> +			newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
>   
>   			/* Add data to socket buffer. */
>   			SOCKBUF_LOCK(&so->so_rcv);
> @@ -1945,10 +1952,6 @@ tcp_do_segment(struct mbuf *m, struct tc
>   		win = 0;
>   	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
>   
> -	/* Reset receive buffer auto scaling when not in bulk receive mode. */
> -	tp->rfbuf_ts = 0;
> -	tp->rfbuf_cnt = 0;
> -
>   	switch (tp->t_state) {
>   
>   	/*
>
> Modified: head/sys/netinet/tcp_output.c
> ==============================================================================
> --- head/sys/netinet/tcp_output.c	Mon Apr 10 06:19:09 2017	(r316675)
> +++ head/sys/netinet/tcp_output.c	Mon Apr 10 08:19:35 2017	(r316676)
> @@ -831,11 +831,13 @@ send:
>   			to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
>   			to.to_tsecr = tp->ts_recent;
>   			to.to_flags |= TOF_TS;
> -			/* Set receive buffer autosizing timestamp. */
> -			if (tp->rfbuf_ts == 0 &&
> -			    (so->so_rcv.sb_flags & SB_AUTOSIZE))
> -				tp->rfbuf_ts = tcp_ts_getticks();
>   		}
> +
> +		/* Set receive buffer autosizing timestamp. */
> +		if (tp->rfbuf_ts == 0 &&
> +		    (so->so_rcv.sb_flags & SB_AUTOSIZE))
> +			tp->rfbuf_ts = tcp_ts_getticks();
> +
>   		/* Selective ACK's. */
>   		if (tp->t_flags & TF_SACK_PERMIT) {
>   			if (flags & TH_SYN)
>
> Modified: head/sys/netinet/tcp_stacks/fastpath.c
> ==============================================================================
> --- head/sys/netinet/tcp_stacks/fastpath.c	Mon Apr 10 06:19:09 2017	(r316675)
> +++ head/sys/netinet/tcp_stacks/fastpath.c	Mon Apr 10 08:19:35 2017	(r316676)
> @@ -399,62 +399,8 @@ tcp_do_fastnewdata(struct mbuf *m, struc
>   			  (void *)tcp_saveipgen, &tcp_savetcp, 0);
>   #endif
>   	TCP_PROBE3(debug__input, tp, th, m);
> -	/*
> -	 * Automatic sizing of receive socket buffer.  Often the send
> -	 * buffer size is not optimally adjusted to the actual network
> -	 * conditions at hand (delay bandwidth product).  Setting the
> -	 * buffer size too small limits throughput on links with high
> -	 * bandwidth and high delay (eg. trans-continental/oceanic links).
> -	 *
> -	 * On the receive side the socket buffer memory is only rarely
> -	 * used to any significant extent.  This allows us to be much
> -	 * more aggressive in scaling the receive socket buffer.  For
> -	 * the case that the buffer space is actually used to a large
> -	 * extent and we run out of kernel memory we can simply drop
> -	 * the new segments; TCP on the sender will just retransmit it
> -	 * later.  Setting the buffer size too big may only consume too
> -	 * much kernel memory if the application doesn't read() from
> -	 * the socket or packet loss or reordering makes use of the
> -	 * reassembly queue.
> -	 *
> -	 * The criteria to step up the receive buffer one notch are:
> -	 *  1. Application has not set receive buffer size with
> -	 *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
> -	 *  2. the number of bytes received during the time it takes
> -	 *     one timestamp to be reflected back to us (the RTT);
> -	 *  3. received bytes per RTT is within seven eighth of the
> -	 *     current socket buffer size;
> -	 *  4. receive buffer size has not hit maximal automatic size;
> -	 *
> -	 * This algorithm does one step per RTT at most and only if
> -	 * we receive a bulk stream w/o packet losses or reorderings.
> -	 * Shrinking the buffer during idle times is not necessary as
> -	 * it doesn't consume any memory when idle.
> -	 *
> -	 * TODO: Only step up if the application is actually serving
> -	 * the buffer to better manage the socket buffer resources.
> -	 */
> -	if (V_tcp_do_autorcvbuf &&
> -	    (to->to_flags & TOF_TS) &&
> -	    to->to_tsecr &&
> -	    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
> -		if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
> -		    to->to_tsecr - tp->rfbuf_ts < hz) {
> -			if (tp->rfbuf_cnt >
> -			    (so->so_rcv.sb_hiwat / 8 * 7) &&
> -			    so->so_rcv.sb_hiwat <
> -			    V_tcp_autorcvbuf_max) {
> -				newsize =
> -					min(so->so_rcv.sb_hiwat +
> -					    V_tcp_autorcvbuf_inc,
> -					    V_tcp_autorcvbuf_max);
> -			}
> -			/* Start over with next RTT. */
> -			tp->rfbuf_ts = 0;
> -			tp->rfbuf_cnt = 0;
> -		} else
> -			tp->rfbuf_cnt += tlen;	/* add up */
> -	}
> +
> +	newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
>   
>   	/* Add data to socket buffer. */
>   	SOCKBUF_LOCK(&so->so_rcv);
> @@ -532,10 +478,6 @@ tcp_do_slowpath(struct mbuf *m, struct t
>   		win = 0;
>   	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
>   
> -	/* Reset receive buffer auto scaling when not in bulk receive mode. */
> -	tp->rfbuf_ts = 0;
> -	tp->rfbuf_cnt = 0;
> -
>   	switch (tp->t_state) {
>   
>   	/*
>
> Modified: head/sys/netinet/tcp_var.h
> ==============================================================================
> --- head/sys/netinet/tcp_var.h	Mon Apr 10 06:19:09 2017	(r316675)
> +++ head/sys/netinet/tcp_var.h	Mon Apr 10 08:19:35 2017	(r316676)
> @@ -778,6 +778,8 @@ void	hhook_run_tcp_est_in(struct tcpcb *
>   #endif
>   
>   int	 tcp_input(struct mbuf **, int *, int);
> +int	 tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
> +	    struct tcpcb *, int);
>   void	 tcp_do_segment(struct mbuf *, struct tcphdr *,
>   			struct socket *, struct tcpcb *, int, int, uint8_t,
>   			int);
>
>



More information about the svn-src-head mailing list