svn commit: r316676 - in head/sys/netinet: . tcp_stacks
Steven Hartland
steven at multiplay.co.uk
Mon Apr 10 15:18:02 UTC 2017
I don't tend to MFC 10.x now, but do agree given the impact that for
this one it should be done.
The fix is a little different, due to code restructuring in 11 / head,
but I do have a 10.x version already.
Regards
Steve
On 10/04/2017 15:51, Julian Elischer wrote:
> If possible MFC to 10 too would be nice..
> thanks
>
>
> On 10/4/17 4:19 pm, Steven Hartland wrote:
>> Author: smh
>> Date: Mon Apr 10 08:19:35 2017
>> New Revision: 316676
>> URL: https://svnweb.freebsd.org/changeset/base/316676
>>
>> Log:
>> Use estimated RTT for receive buffer auto resizing instead of timestamps
>> Switched from using timestamps to RTT estimates when performing TCP receive
>> buffer auto resizing, as not all hosts support / enable TCP timestamps.
>> Disabled reset of receive buffer auto scaling when not in bulk receive mode,
>> which gives an extra 20% performance increase.
>> Also extracted auto resizing to a common method shared between standard and
>> fastpath modules.
>> With this, AWS S3 downloads at ~17ms latency on a 1Gbps connection jump from
>> ~3MB/s to ~100MB/s using the default settings.
>> Reviewed by: lstewart, gnn
>> MFC after: 2 weeks
>> Relnotes: Yes
>> Sponsored by: Multiplay
>> Differential Revision: https://reviews.freebsd.org/D9668
>>
>> Modified:
>> head/sys/netinet/in_kdtrace.c
>> head/sys/netinet/in_kdtrace.h
>> head/sys/netinet/tcp_input.c
>> head/sys/netinet/tcp_output.c
>> head/sys/netinet/tcp_stacks/fastpath.c
>> head/sys/netinet/tcp_var.h
>>
>> Modified: head/sys/netinet/in_kdtrace.c
>> ==============================================================================
>>
>> --- head/sys/netinet/in_kdtrace.c Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/in_kdtrace.c Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -132,6 +132,14 @@ SDT_PROBE_DEFINE6_XLATE(tcp, , , state__
>> "void *", "void *",
>> "int", "tcplsinfo_t *");
>> +SDT_PROBE_DEFINE6_XLATE(tcp, , , receive__autoresize,
>> + "void *", "void *",
>> + "struct tcpcb *", "csinfo_t *",
>> + "struct mbuf *", "ipinfo_t *",
>> + "struct tcpcb *", "tcpsinfo_t *" ,
>> + "struct tcphdr *", "tcpinfoh_t *",
>> + "int", "int");
>> +
>> SDT_PROBE_DEFINE5_XLATE(udp, , , receive,
>> "void *", "pktinfo_t *",
>> "struct inpcb *", "csinfo_t *",
>>
>> Modified: head/sys/netinet/in_kdtrace.h
>> ==============================================================================
>>
>> --- head/sys/netinet/in_kdtrace.h Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/in_kdtrace.h Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -65,6 +65,7 @@ SDT_PROBE_DECLARE(tcp, , , debug__input)
>> SDT_PROBE_DECLARE(tcp, , , debug__output);
>> SDT_PROBE_DECLARE(tcp, , , debug__user);
>> SDT_PROBE_DECLARE(tcp, , , debug__drop);
>> +SDT_PROBE_DECLARE(tcp, , , receive__autoresize);
>> SDT_PROBE_DECLARE(udp, , , receive);
>> SDT_PROBE_DECLARE(udp, , , send);
>>
>> Modified: head/sys/netinet/tcp_input.c
>> ==============================================================================
>>
>> --- head/sys/netinet/tcp_input.c Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/tcp_input.c Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -1486,6 +1486,68 @@ drop:
>> return (IPPROTO_DONE);
>> }
>> +/*
>> + * Automatic sizing of receive socket buffer. Often the send
>> + * buffer size is not optimally adjusted to the actual network
>> + * conditions at hand (delay bandwidth product). Setting the
>> + * buffer size too small limits throughput on links with high
>> + * bandwidth and high delay (eg. trans-continental/oceanic links).
>> + *
>> + * On the receive side the socket buffer memory is only rarely
>> + * used to any significant extent. This allows us to be much
>> + * more aggressive in scaling the receive socket buffer. For
>> + * the case that the buffer space is actually used to a large
>> + * extent and we run out of kernel memory we can simply drop
>> + * the new segments; TCP on the sender will just retransmit it
>> + * later. Setting the buffer size too big may only consume too
>> + * much kernel memory if the application doesn't read() from
>> + * the socket or packet loss or reordering makes use of the
>> + * reassembly queue.
>> + *
>> + * The criteria to step up the receive buffer one notch are:
>> + * 1. Application has not set receive buffer size with
>> + * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
>> + * 2. the number of bytes received during the time it takes
>> + * one timestamp to be reflected back to us (the RTT);
>> + * 3. received bytes per RTT is within seven eighth of the
>> + * current socket buffer size;
>> + * 4. receive buffer size has not hit maximal automatic size;
>> + *
>> + * This algorithm does one step per RTT at most and only if
>> + * we receive a bulk stream w/o packet losses or reorderings.
>> + * Shrinking the buffer during idle times is not necessary as
>> + * it doesn't consume any memory when idle.
>> + *
>> + * TODO: Only step up if the application is actually serving
>> + * the buffer to better manage the socket buffer resources.
>> + */
>> +int
>> +tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
>> + struct tcpcb *tp, int tlen)
>> +{
>> + int newsize = 0;
>> +
>> + if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
>> + tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
>> + TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
>> + (tp->t_srtt >> TCP_RTT_SHIFT)) {
>> + if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
>> + so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
>> + newsize = min(so->so_rcv.sb_hiwat +
>> + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max);
>> + }
>> + TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);
>> +
>> + /* Start over with next RTT. */
>> + tp->rfbuf_ts = 0;
>> + tp->rfbuf_cnt = 0;
>> + } else {
>> + tp->rfbuf_cnt += tlen; /* add up */
>> + }
>> +
>> + return (newsize);
>> +}
>> +
>> void
>> tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
>> struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
>> @@ -1849,62 +1911,7 @@ tcp_do_segment(struct mbuf *m, struct tc
>> #endif
>> TCP_PROBE3(debug__input, tp, th, m);
>> - /*
>> - * Automatic sizing of receive socket buffer. Often the send
>> - * buffer size is not optimally adjusted to the actual network
>> - * conditions at hand (delay bandwidth product). Setting the
>> - * buffer size too small limits throughput on links with high
>> - * bandwidth and high delay (eg. trans-continental/oceanic links).
>> - *
>> - * On the receive side the socket buffer memory is only rarely
>> - * used to any significant extent. This allows us to be much
>> - * more aggressive in scaling the receive socket buffer. For
>> - * the case that the buffer space is actually used to a large
>> - * extent and we run out of kernel memory we can simply drop
>> - * the new segments; TCP on the sender will just retransmit it
>> - * later. Setting the buffer size too big may only consume too
>> - * much kernel memory if the application doesn't read() from
>> - * the socket or packet loss or reordering makes use of the
>> - * reassembly queue.
>> - *
>> - * The criteria to step up the receive buffer one notch are:
>> - * 1. Application has not set receive buffer size with
>> - * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
>> - * 2. the number of bytes received during the time it takes
>> - * one timestamp to be reflected back to us (the RTT);
>> - * 3. received bytes per RTT is within seven eighth of the
>> - * current socket buffer size;
>> - * 4. receive buffer size has not hit maximal automatic size;
>> - *
>> - * This algorithm does one step per RTT at most and only if
>> - * we receive a bulk stream w/o packet losses or reorderings.
>> - * Shrinking the buffer during idle times is not necessary as
>> - * it doesn't consume any memory when idle.
>> - *
>> - * TODO: Only step up if the application is actually serving
>> - * the buffer to better manage the socket buffer resources.
>> - */
>> - if (V_tcp_do_autorcvbuf &&
>> - (to.to_flags & TOF_TS) &&
>> - to.to_tsecr &&
>> - (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
>> - if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
>> - to.to_tsecr - tp->rfbuf_ts < hz) {
>> - if (tp->rfbuf_cnt >
>> - (so->so_rcv.sb_hiwat / 8 * 7) &&
>> - so->so_rcv.sb_hiwat <
>> - V_tcp_autorcvbuf_max) {
>> - newsize =
>> - min(so->so_rcv.sb_hiwat +
>> - V_tcp_autorcvbuf_inc,
>> - V_tcp_autorcvbuf_max);
>> - }
>> - /* Start over with next RTT. */
>> - tp->rfbuf_ts = 0;
>> - tp->rfbuf_cnt = 0;
>> - } else
>> - tp->rfbuf_cnt += tlen; /* add up */
>> - }
>> + newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
>> /* Add data to socket buffer. */
>> SOCKBUF_LOCK(&so->so_rcv);
>> @@ -1945,10 +1952,6 @@ tcp_do_segment(struct mbuf *m, struct tc
>> win = 0;
>> tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
>> - /* Reset receive buffer auto scaling when not in bulk receive mode. */
>> - tp->rfbuf_ts = 0;
>> - tp->rfbuf_cnt = 0;
>> -
>> switch (tp->t_state) {
>> /*
>>
>> Modified: head/sys/netinet/tcp_output.c
>> ==============================================================================
>>
>> --- head/sys/netinet/tcp_output.c Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/tcp_output.c Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -831,11 +831,13 @@ send:
>> to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
>> to.to_tsecr = tp->ts_recent;
>> to.to_flags |= TOF_TS;
>> - /* Set receive buffer autosizing timestamp. */
>> - if (tp->rfbuf_ts == 0 &&
>> - (so->so_rcv.sb_flags & SB_AUTOSIZE))
>> - tp->rfbuf_ts = tcp_ts_getticks();
>> }
>> +
>> + /* Set receive buffer autosizing timestamp. */
>> + if (tp->rfbuf_ts == 0 &&
>> + (so->so_rcv.sb_flags & SB_AUTOSIZE))
>> + tp->rfbuf_ts = tcp_ts_getticks();
>> +
>> /* Selective ACK's. */
>> if (tp->t_flags & TF_SACK_PERMIT) {
>> if (flags & TH_SYN)
>>
>> Modified: head/sys/netinet/tcp_stacks/fastpath.c
>> ==============================================================================
>>
>> --- head/sys/netinet/tcp_stacks/fastpath.c Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/tcp_stacks/fastpath.c Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -399,62 +399,8 @@ tcp_do_fastnewdata(struct mbuf *m, struc
>> (void *)tcp_saveipgen, &tcp_savetcp, 0);
>> #endif
>> TCP_PROBE3(debug__input, tp, th, m);
>> - /*
>> - * Automatic sizing of receive socket buffer. Often the send
>> - * buffer size is not optimally adjusted to the actual network
>> - * conditions at hand (delay bandwidth product). Setting the
>> - * buffer size too small limits throughput on links with high
>> - * bandwidth and high delay (eg. trans-continental/oceanic links).
>> - *
>> - * On the receive side the socket buffer memory is only rarely
>> - * used to any significant extent. This allows us to be much
>> - * more aggressive in scaling the receive socket buffer. For
>> - * the case that the buffer space is actually used to a large
>> - * extent and we run out of kernel memory we can simply drop
>> - * the new segments; TCP on the sender will just retransmit it
>> - * later. Setting the buffer size too big may only consume too
>> - * much kernel memory if the application doesn't read() from
>> - * the socket or packet loss or reordering makes use of the
>> - * reassembly queue.
>> - *
>> - * The criteria to step up the receive buffer one notch are:
>> - * 1. Application has not set receive buffer size with
>> - * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
>> - * 2. the number of bytes received during the time it takes
>> - * one timestamp to be reflected back to us (the RTT);
>> - * 3. received bytes per RTT is within seven eighth of the
>> - * current socket buffer size;
>> - * 4. receive buffer size has not hit maximal automatic size;
>> - *
>> - * This algorithm does one step per RTT at most and only if
>> - * we receive a bulk stream w/o packet losses or reorderings.
>> - * Shrinking the buffer during idle times is not necessary as
>> - * it doesn't consume any memory when idle.
>> - *
>> - * TODO: Only step up if the application is actually serving
>> - * the buffer to better manage the socket buffer resources.
>> - */
>> - if (V_tcp_do_autorcvbuf &&
>> - (to->to_flags & TOF_TS) &&
>> - to->to_tsecr &&
>> - (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
>> - if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
>> - to->to_tsecr - tp->rfbuf_ts < hz) {
>> - if (tp->rfbuf_cnt >
>> - (so->so_rcv.sb_hiwat / 8 * 7) &&
>> - so->so_rcv.sb_hiwat <
>> - V_tcp_autorcvbuf_max) {
>> - newsize =
>> - min(so->so_rcv.sb_hiwat +
>> - V_tcp_autorcvbuf_inc,
>> - V_tcp_autorcvbuf_max);
>> - }
>> - /* Start over with next RTT. */
>> - tp->rfbuf_ts = 0;
>> - tp->rfbuf_cnt = 0;
>> - } else
>> - tp->rfbuf_cnt += tlen; /* add up */
>> - }
>> +
>> + newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
>> /* Add data to socket buffer. */
>> SOCKBUF_LOCK(&so->so_rcv);
>> @@ -532,10 +478,6 @@ tcp_do_slowpath(struct mbuf *m, struct t
>> win = 0;
>> tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
>> - /* Reset receive buffer auto scaling when not in bulk receive mode. */
>> - tp->rfbuf_ts = 0;
>> - tp->rfbuf_cnt = 0;
>> -
>> switch (tp->t_state) {
>> /*
>>
>> Modified: head/sys/netinet/tcp_var.h
>> ==============================================================================
>>
>> --- head/sys/netinet/tcp_var.h Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/tcp_var.h Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -778,6 +778,8 @@ void hhook_run_tcp_est_in(struct tcpcb *
>> #endif
>> int tcp_input(struct mbuf **, int *, int);
>> +int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
>> + struct tcpcb *, int);
>> void tcp_do_segment(struct mbuf *, struct tcphdr *,
>> struct socket *, struct tcpcb *, int, int, uint8_t,
>> int);
>>
>>
>
More information about the svn-src-all
mailing list