svn commit: r316676 - in head/sys/netinet: . tcp_stacks
Steven Hartland
steven at multiplay.co.uk
Mon Apr 10 15:18:02 UTC 2017
I don't tend to MFC 10.x now, but do agree given the impact that for
this one it should be done.
The fix is a little different, due to code restructuring in 11 / head,
but I do have a 10.x version already.
Regards
Steve
On 10/04/2017 15:51, Julian Elischer wrote:
> If possible MFC to 10 too would be nice..
> thanks
>
>
> On 10/4/17 4:19 pm, Steven Hartland wrote:
>> Author: smh
>> Date: Mon Apr 10 08:19:35 2017
>> New Revision: 316676
>> URL: https://svnweb.freebsd.org/changeset/base/316676
>>
>> Log:
>> Use estimated RTT for receive buffer auto resizing instead of timestamps
>> Switched from using timestamps to RTT estimates when performing TCP receive
>> buffer auto resizing, as not all hosts support / enable TCP timestamps.
>> Disabled reset of receive buffer auto scaling when not in bulk receive mode,
>> which gives an extra 20% performance increase.
>> Also extracted auto resizing to a common method shared between standard and
>> fastpath modules.
>> With this, AWS S3 downloads at ~17ms latency on a 1Gbps connection jump from
>> ~3MB/s to ~100MB/s using the default settings.
>> Reviewed by: lstewart, gnn
>> MFC after: 2 weeks
>> Relnotes: Yes
>> Sponsored by: Multiplay
>> Differential Revision: https://reviews.freebsd.org/D9668
>>
>> Modified:
>> head/sys/netinet/in_kdtrace.c
>> head/sys/netinet/in_kdtrace.h
>> head/sys/netinet/tcp_input.c
>> head/sys/netinet/tcp_output.c
>> head/sys/netinet/tcp_stacks/fastpath.c
>> head/sys/netinet/tcp_var.h
>>
>> Modified: head/sys/netinet/in_kdtrace.c
>> ==============================================================================
>>
>> --- head/sys/netinet/in_kdtrace.c Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/in_kdtrace.c Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -132,6 +132,14 @@ SDT_PROBE_DEFINE6_XLATE(tcp, , , state__
>> "void *", "void *",
>> "int", "tcplsinfo_t *");
>> +SDT_PROBE_DEFINE6_XLATE(tcp, , , receive__autoresize,
>> + "void *", "void *",
>> + "struct tcpcb *", "csinfo_t *",
>> + "struct mbuf *", "ipinfo_t *",
>> + "struct tcpcb *", "tcpsinfo_t *" ,
>> + "struct tcphdr *", "tcpinfoh_t *",
>> + "int", "int");
>> +
>> SDT_PROBE_DEFINE5_XLATE(udp, , , receive,
>> "void *", "pktinfo_t *",
>> "struct inpcb *", "csinfo_t *",
>>
>> Modified: head/sys/netinet/in_kdtrace.h
>> ==============================================================================
>>
>> --- head/sys/netinet/in_kdtrace.h Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/in_kdtrace.h Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -65,6 +65,7 @@ SDT_PROBE_DECLARE(tcp, , , debug__input)
>> SDT_PROBE_DECLARE(tcp, , , debug__output);
>> SDT_PROBE_DECLARE(tcp, , , debug__user);
>> SDT_PROBE_DECLARE(tcp, , , debug__drop);
>> +SDT_PROBE_DECLARE(tcp, , , receive__autoresize);
>> SDT_PROBE_DECLARE(udp, , , receive);
>> SDT_PROBE_DECLARE(udp, , , send);
>>
>> Modified: head/sys/netinet/tcp_input.c
>> ==============================================================================
>>
>> --- head/sys/netinet/tcp_input.c Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/tcp_input.c Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -1486,6 +1486,68 @@ drop:
>> return (IPPROTO_DONE);
>> }
>> +/*
>> + * Automatic sizing of receive socket buffer. Often the send
>> + * buffer size is not optimally adjusted to the actual network
>> + * conditions at hand (delay bandwidth product). Setting the
>> + * buffer size too small limits throughput on links with high
>> + * bandwidth and high delay (eg. trans-continental/oceanic links).
>> + *
>> + * On the receive side the socket buffer memory is only rarely
>> + * used to any significant extent. This allows us to be much
>> + * more aggressive in scaling the receive socket buffer. For
>> + * the case that the buffer space is actually used to a large
>> + * extent and we run out of kernel memory we can simply drop
>> + * the new segments; TCP on the sender will just retransmit it
>> + * later. Setting the buffer size too big may only consume too
>> + * much kernel memory if the application doesn't read() from
>> + * the socket or packet loss or reordering makes use of the
>> + * reassembly queue.
>> + *
>> + * The criteria to step up the receive buffer one notch are:
>> + * 1. Application has not set receive buffer size with
>> + * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
>> + * 2. the number of bytes received during the time it takes
>> + * one timestamp to be reflected back to us (the RTT);
>> + * 3. received bytes per RTT is within seven eighth of the
>> + * current socket buffer size;
>> + * 4. receive buffer size has not hit maximal automatic size;
>> + *
>> + * This algorithm does one step per RTT at most and only if
>> + * we receive a bulk stream w/o packet losses or reorderings.
>> + * Shrinking the buffer during idle times is not necessary as
>> + * it doesn't consume any memory when idle.
>> + *
>> + * TODO: Only step up if the application is actually serving
>> + * the buffer to better manage the socket buffer resources.
>> + */
>> +int
>> +tcp_autorcvbuf(struct mbuf *m, struct tcphdr *th, struct socket *so,
>> + struct tcpcb *tp, int tlen)
>> +{
>> + int newsize = 0;
>> +
>> + if (V_tcp_do_autorcvbuf && (so->so_rcv.sb_flags & SB_AUTOSIZE) &&
>> + tp->t_srtt != 0 && tp->rfbuf_ts != 0 &&
>> + TCP_TS_TO_TICKS(tcp_ts_getticks() - tp->rfbuf_ts) >
>> + (tp->t_srtt >> TCP_RTT_SHIFT)) {
>> + if (tp->rfbuf_cnt > (so->so_rcv.sb_hiwat / 8 * 7) &&
>> + so->so_rcv.sb_hiwat < V_tcp_autorcvbuf_max) {
>> + newsize = min(so->so_rcv.sb_hiwat +
>> + V_tcp_autorcvbuf_inc, V_tcp_autorcvbuf_max);
>> + }
>> + TCP_PROBE6(receive__autoresize, NULL, tp, m, tp, th, newsize);
>> +
>> + /* Start over with next RTT. */
>> + tp->rfbuf_ts = 0;
>> + tp->rfbuf_cnt = 0;
>> + } else {
>> + tp->rfbuf_cnt += tlen; /* add up */
>> + }
>> +
>> + return (newsize);
>> +}
>> +
>> void
>> tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
>> struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
>> @@ -1849,62 +1911,7 @@ tcp_do_segment(struct mbuf *m, struct tc
>> #endif
>> TCP_PROBE3(debug__input, tp, th, m);
>> - /*
>> - * Automatic sizing of receive socket buffer. Often the send
>> - * buffer size is not optimally adjusted to the actual network
>> - * conditions at hand (delay bandwidth product). Setting the
>> - * buffer size too small limits throughput on links with high
>> - * bandwidth and high delay (eg. trans-continental/oceanic links).
>> - *
>> - * On the receive side the socket buffer memory is only rarely
>> - * used to any significant extent. This allows us to be much
>> - * more aggressive in scaling the receive socket buffer. For
>> - * the case that the buffer space is actually used to a large
>> - * extent and we run out of kernel memory we can simply drop
>> - * the new segments; TCP on the sender will just retransmit it
>> - * later. Setting the buffer size too big may only consume too
>> - * much kernel memory if the application doesn't read() from
>> - * the socket or packet loss or reordering makes use of the
>> - * reassembly queue.
>> - *
>> - * The criteria to step up the receive buffer one notch are:
>> - * 1. Application has not set receive buffer size with
>> - * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
>> - * 2. the number of bytes received during the time it takes
>> - * one timestamp to be reflected back to us (the RTT);
>> - * 3. received bytes per RTT is within seven eighth of the
>> - * current socket buffer size;
>> - * 4. receive buffer size has not hit maximal automatic size;
>> - *
>> - * This algorithm does one step per RTT at most and only if
>> - * we receive a bulk stream w/o packet losses or reorderings.
>> - * Shrinking the buffer during idle times is not necessary as
>> - * it doesn't consume any memory when idle.
>> - *
>> - * TODO: Only step up if the application is actually serving
>> - * the buffer to better manage the socket buffer resources.
>> - */
>> - if (V_tcp_do_autorcvbuf &&
>> - (to.to_flags & TOF_TS) &&
>> - to.to_tsecr &&
>> - (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
>> - if (TSTMP_GT(to.to_tsecr, tp->rfbuf_ts) &&
>> - to.to_tsecr - tp->rfbuf_ts < hz) {
>> - if (tp->rfbuf_cnt >
>> - (so->so_rcv.sb_hiwat / 8 * 7) &&
>> - so->so_rcv.sb_hiwat <
>> - V_tcp_autorcvbuf_max) {
>> - newsize =
>> - min(so->so_rcv.sb_hiwat +
>> - V_tcp_autorcvbuf_inc,
>> - V_tcp_autorcvbuf_max);
>> - }
>> - /* Start over with next RTT. */
>> - tp->rfbuf_ts = 0;
>> - tp->rfbuf_cnt = 0;
>> - } else
>> - tp->rfbuf_cnt += tlen; /* add up */
>> - }
>> + newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
>> /* Add data to socket buffer. */
>> SOCKBUF_LOCK(&so->so_rcv);
>> @@ -1945,10 +1952,6 @@ tcp_do_segment(struct mbuf *m, struct tc
>> win = 0;
>> tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
>> - /* Reset receive buffer auto scaling when not in bulk receive mode. */
>> - tp->rfbuf_ts = 0;
>> - tp->rfbuf_cnt = 0;
>> -
>> switch (tp->t_state) {
>> /*
>>
>> Modified: head/sys/netinet/tcp_output.c
>> ==============================================================================
>>
>> --- head/sys/netinet/tcp_output.c Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/tcp_output.c Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -831,11 +831,13 @@ send:
>> to.to_tsval = tcp_ts_getticks() + tp->ts_offset;
>> to.to_tsecr = tp->ts_recent;
>> to.to_flags |= TOF_TS;
>> - /* Set receive buffer autosizing timestamp. */
>> - if (tp->rfbuf_ts == 0 &&
>> - (so->so_rcv.sb_flags & SB_AUTOSIZE))
>> - tp->rfbuf_ts = tcp_ts_getticks();
>> }
>> +
>> + /* Set receive buffer autosizing timestamp. */
>> + if (tp->rfbuf_ts == 0 &&
>> + (so->so_rcv.sb_flags & SB_AUTOSIZE))
>> + tp->rfbuf_ts = tcp_ts_getticks();
>> +
>> /* Selective ACK's. */
>> if (tp->t_flags & TF_SACK_PERMIT) {
>> if (flags & TH_SYN)
>>
>> Modified: head/sys/netinet/tcp_stacks/fastpath.c
>> ==============================================================================
>>
>> --- head/sys/netinet/tcp_stacks/fastpath.c Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/tcp_stacks/fastpath.c Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -399,62 +399,8 @@ tcp_do_fastnewdata(struct mbuf *m, struc
>> (void *)tcp_saveipgen, &tcp_savetcp, 0);
>> #endif
>> TCP_PROBE3(debug__input, tp, th, m);
>> - /*
>> - * Automatic sizing of receive socket buffer. Often the send
>> - * buffer size is not optimally adjusted to the actual network
>> - * conditions at hand (delay bandwidth product). Setting the
>> - * buffer size too small limits throughput on links with high
>> - * bandwidth and high delay (eg. trans-continental/oceanic links).
>> - *
>> - * On the receive side the socket buffer memory is only rarely
>> - * used to any significant extent. This allows us to be much
>> - * more aggressive in scaling the receive socket buffer. For
>> - * the case that the buffer space is actually used to a large
>> - * extent and we run out of kernel memory we can simply drop
>> - * the new segments; TCP on the sender will just retransmit it
>> - * later. Setting the buffer size too big may only consume too
>> - * much kernel memory if the application doesn't read() from
>> - * the socket or packet loss or reordering makes use of the
>> - * reassembly queue.
>> - *
>> - * The criteria to step up the receive buffer one notch are:
>> - * 1. Application has not set receive buffer size with
>> - * SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
>> - * 2. the number of bytes received during the time it takes
>> - * one timestamp to be reflected back to us (the RTT);
>> - * 3. received bytes per RTT is within seven eighth of the
>> - * current socket buffer size;
>> - * 4. receive buffer size has not hit maximal automatic size;
>> - *
>> - * This algorithm does one step per RTT at most and only if
>> - * we receive a bulk stream w/o packet losses or reorderings.
>> - * Shrinking the buffer during idle times is not necessary as
>> - * it doesn't consume any memory when idle.
>> - *
>> - * TODO: Only step up if the application is actually serving
>> - * the buffer to better manage the socket buffer resources.
>> - */
>> - if (V_tcp_do_autorcvbuf &&
>> - (to->to_flags & TOF_TS) &&
>> - to->to_tsecr &&
>> - (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
>> - if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
>> - to->to_tsecr - tp->rfbuf_ts < hz) {
>> - if (tp->rfbuf_cnt >
>> - (so->so_rcv.sb_hiwat / 8 * 7) &&
>> - so->so_rcv.sb_hiwat <
>> - V_tcp_autorcvbuf_max) {
>> - newsize =
>> - min(so->so_rcv.sb_hiwat +
>> - V_tcp_autorcvbuf_inc,
>> - V_tcp_autorcvbuf_max);
>> - }
>> - /* Start over with next RTT. */
>> - tp->rfbuf_ts = 0;
>> - tp->rfbuf_cnt = 0;
>> - } else
>> - tp->rfbuf_cnt += tlen; /* add up */
>> - }
>> +
>> + newsize = tcp_autorcvbuf(m, th, so, tp, tlen);
>> /* Add data to socket buffer. */
>> SOCKBUF_LOCK(&so->so_rcv);
>> @@ -532,10 +478,6 @@ tcp_do_slowpath(struct mbuf *m, struct t
>> win = 0;
>> tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
>> - /* Reset receive buffer auto scaling when not in bulk receive mode. */
>> - tp->rfbuf_ts = 0;
>> - tp->rfbuf_cnt = 0;
>> -
>> switch (tp->t_state) {
>> /*
>>
>> Modified: head/sys/netinet/tcp_var.h
>> ==============================================================================
>>
>> --- head/sys/netinet/tcp_var.h Mon Apr 10 06:19:09 2017 (r316675)
>> +++ head/sys/netinet/tcp_var.h Mon Apr 10 08:19:35 2017 (r316676)
>> @@ -778,6 +778,8 @@ void hhook_run_tcp_est_in(struct tcpcb *
>> #endif
>> int tcp_input(struct mbuf **, int *, int);
>> +int tcp_autorcvbuf(struct mbuf *, struct tcphdr *, struct socket *,
>> + struct tcpcb *, int);
>> void tcp_do_segment(struct mbuf *, struct tcphdr *,
>> struct socket *, struct tcpcb *, int, int, uint8_t,
>> int);
>>
>>
>
More information about the svn-src-all
mailing list