git: 62ce18fc9a8e - main - tcp: Rack rwnd collapse.
Date: Tue, 23 Aug 2022 13:18:19 UTC
The branch main has been updated by rrs:
URL: https://cgit.FreeBSD.org/src/commit/?id=62ce18fc9a8e46ea72ce3a106e7b0cd1ad6a712b
commit 62ce18fc9a8e46ea72ce3a106e7b0cd1ad6a712b
Author: Randall Stewart <rrs@FreeBSD.org>
AuthorDate: 2022-08-23 13:17:05 +0000
Commit: Randall Stewart <rrs@FreeBSD.org>
CommitDate: 2022-08-23 13:17:05 +0000
tcp: Rack rwnd collapse.
Currently when the peer collapses its rwnd, we mark packets to be retransmitted
and use the must_retran flags, just as we do when the PMTU shrinks, to retransmit
the collapsed packets. However this causes a problem with some middleboxes that
play with the rwnd to control flow. As soon as the rwnd increases we start
resending, possibly less than an RTT later, even though the peer may in fact have
gotten the packets, which means we gratuitously retransmit packets we should not.

The fix here is to make sure that a rack time has passed before retransmitting the
packets. This ensures that the rwnd collapse was real and that the packets do need
retransmission.
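In sketch form, the gate amounts to the following (a minimal, self-contained
sketch with illustrative names; the committed check is rack_check_collapsed()
in the diff below, which derives the threshold via rack_calc_thresh_rack() and
the last-sent time from rsm->r_tim_lastsent[]):

#include <stdint.h>
#include <stdbool.h>

/*
 * Only permit retransmission of a collapsed-window segment once more
 * than a rack reorder-window threshold has elapsed since it was last
 * sent. Unsigned subtraction handles timestamp wrap.
 */
static bool
collapsed_rxt_allowed(uint32_t now_usec, uint32_t last_sent_usec,
    uint32_t rack_thresh_usec)
{
	return ((now_usec - last_sent_usec) > rack_thresh_usec);
}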
Reviewed by: tuexen
Sponsored by: Netflix Inc
Differential Revision: https://reviews.freebsd.org/D35166
---
sys/netinet/tcp_log_buf.h | 4 +-
sys/netinet/tcp_stacks/rack.c | 403 ++++++++++++++++++++++++++------------
sys/netinet/tcp_stacks/tcp_rack.h | 19 +-
3 files changed, 300 insertions(+), 126 deletions(-)
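As a reading aid for the large rack.c hunks below, the new collapse handling
reduces to a small state machine. The following is an assumption-laden toy
condensation: the toy_* names and stand-in struct are invented here, the field
names last_collapse_point, high_collapse_point, rc_has_collapsed and
r_collapse_point_valid are taken from the diff, and the kernel uses SEQ_GT()
for sequence-space comparisons where this toy uses plain >.

#include <stdint.h>
#include <stdbool.h>

struct toy_rack {
	uint32_t snd_una, snd_wnd, snd_max;	/* stand-ins for tcpcb fields */
	uint32_t last_collapse_point;	/* lowest seq the peer collapsed us to */
	uint32_t high_collapse_point;	/* snd_max when the collapse was seen */
	bool	rc_has_collapsed;
	bool	r_collapse_point_valid;
};

/* Peer shrank rwnd below what is outstanding: record it, split nothing yet. */
static void
toy_collapsed_window(struct toy_rack *r)
{
	r->last_collapse_point = r->snd_una + r->snd_wnd;
	r->high_collapse_point = r->snd_max;
	r->rc_has_collapsed = true;
	r->r_collapse_point_valid = true;
}

/* Window re-opened: in the real code, entries past the collapse point are
 * flagged RACK_RWND_COLLAPSED rather than forced into an immediate
 * must-retransmit pass; rack_check_collapsed() then gates their
 * retransmission on a rack time having passed. */
static void
toy_un_collapse_window(struct toy_rack *r)
{
	r->rc_has_collapsed = false;
}

/* An ack beyond the high point ends the collapse episode. */
static void
toy_ack(struct toy_rack *r, uint32_t th_ack)
{
	if (r->r_collapse_point_valid && th_ack > r->high_collapse_point)
		r->r_collapse_point_valid = false;
}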
diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h
index 1290a8ce6b29..c11757099c5d 100644
--- a/sys/netinet/tcp_log_buf.h
+++ b/sys/netinet/tcp_log_buf.h
@@ -236,7 +236,9 @@ enum tcp_log_events {
TCP_LOG_FSB, /* FSB information 63 */
RACK_DSACK_HANDLING, /* Handling of DSACK in rack for reordering window 64 */
TCP_HYSTART, /* TCP Hystart logging 65 */
- TCP_LOG_END /* End (keep at end) 66 */
+ TCP_CHG_QUERY, /* Change query during fnc_init() 66 */
+ TCP_RACK_LOG_COLLAPSE, /* Window collapse by peer 67 */
+ TCP_LOG_END /* End (keep at end) 68 */
};
enum tcp_log_states {
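The new TCP_RACK_LOG_COLLAPSE event stores a direction code in log.u_bbr.flex8.
Collected from the rack_log_collapse() call sites in the rack.c diff below, the
observed values are as follows (the enum and symbol names are invented here as
a decoding aid; note the in-code flex8 comment mentions 2 for a marked rsm,
while the call sites actually pass 4):

enum collapse_log_dir {		/* logged in log.u_bbr.flex8 */
	DIR_UNCOLLAPSED	= 0,	/* un-collapse pass, or nothing found */
	DIR_COLLAPSED	= 1,	/* collapse point recorded */
	DIR_RSM_MARKED	= 2,	/* per the flex8 comment; unused at call sites */
	DIR_SPLIT	= 3,	/* sendmap entry split at the collapse point */
	DIR_MARK	= 4,	/* entry flagged RACK_RWND_COLLAPSED */
	DIR_RXT		= 5,	/* collapsed entry being retransmitted */
	DIR_TIME_OK	= 6,	/* rack time has passed, rxt permitted */
	DIR_TOO_SOON	= 7	/* rack time has not yet passed */
};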
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
index 0c91c9c6703f..ea370fe9247c 100644
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -385,6 +385,9 @@ counter_u64_t rack_move_some;
counter_u64_t rack_input_idle_reduces;
counter_u64_t rack_collapsed_win;
+counter_u64_t rack_collapsed_win_seen;
+counter_u64_t rack_collapsed_win_rxt;
+counter_u64_t rack_collapsed_win_rxt_bytes;
counter_u64_t rack_try_scwnd;
counter_u64_t rack_hw_pace_init_fail;
counter_u64_t rack_hw_pace_lost;
@@ -790,6 +793,9 @@ sysctl_rack_clear(SYSCTL_HANDLER_ARGS)
counter_u64_zero(rack_move_some);
counter_u64_zero(rack_try_scwnd);
counter_u64_zero(rack_collapsed_win);
+ counter_u64_zero(rack_collapsed_win_rxt);
+ counter_u64_zero(rack_collapsed_win_seen);
+ counter_u64_zero(rack_collapsed_win_rxt_bytes);
}
rack_clear_counter = 0;
return (0);
@@ -1757,12 +1763,31 @@ rack_init_sysctls(void)
OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD,
&rack_input_idle_reduces,
"Total number of idle reductions on input");
+ rack_collapsed_win_seen = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "collapsed_win_seen", CTLFLAG_RD,
+ &rack_collapsed_win_seen,
+ "Total number of collapsed window events seen (where our window shrinks)");
+
rack_collapsed_win = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
OID_AUTO, "collapsed_win", CTLFLAG_RD,
&rack_collapsed_win,
- "Total number of collapsed windows");
+ "Total number of collapsed window events where we mark packets");
+ rack_collapsed_win_rxt = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "collapsed_win_rxt", CTLFLAG_RD,
+ &rack_collapsed_win_rxt,
+ "Total number of packets that were retransmitted");
+ rack_collapsed_win_rxt_bytes = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_counters),
+ OID_AUTO, "collapsed_win_bytes", CTLFLAG_RD,
+ &rack_collapsed_win_rxt_bytes,
+ "Total number of bytes that were retransmitted");
rack_try_scwnd = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_counters),
@@ -2772,6 +2797,9 @@ rack_counter_destroy(void)
counter_u64_free(rack_sack_splits);
counter_u64_free(rack_input_idle_reduces);
counter_u64_free(rack_collapsed_win);
+ counter_u64_free(rack_collapsed_win_rxt);
+ counter_u64_free(rack_collapsed_win_rxt_bytes);
+ counter_u64_free(rack_collapsed_win_seen);
counter_u64_free(rack_try_scwnd);
counter_u64_free(rack_persists_sends);
counter_u64_free(rack_persists_acks);
@@ -5295,7 +5323,9 @@ activate_rxt:
goto activate_rxt;
}
/* Convert from ms to usecs */
- if ((rsm->r_flags & RACK_SACK_PASSED) || (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
+ if ((rsm->r_flags & RACK_SACK_PASSED) ||
+ (rsm->r_flags & RACK_RWND_COLLAPSED) ||
+ (rsm->r_dupack >= DUP_ACK_THRESHOLD)) {
if ((tp->t_flags & TF_SENTFIN) &&
((tp->snd_max - tp->snd_una) == 1) &&
(rsm->r_flags & RACK_HAS_FIN)) {
@@ -5757,7 +5787,7 @@ rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts,
* real pacing. And the tlp or rxt is smaller
* than the pacing calculation. Lets not
* pace that long since we know the calculation
- * so far is not accurate.
+ * so far is not accurate.
*/
slot = hpts_timeout;
}
@@ -6501,7 +6531,7 @@ rack_remxt_tmr(struct tcpcb *tp)
trsm = rsm;
if (rsm->r_flags & RACK_ACKED)
rsm->r_flags |= RACK_WAS_ACKED;
- rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS);
+ rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED);
rsm->r_flags |= RACK_MUST_RXT;
}
/* Clear the count (we just un-acked them) */
@@ -8040,6 +8070,13 @@ rack_log_sack_passed(struct tcpcb *tp,
*/
continue;
}
+ if (nrsm->r_flags & RACK_RWND_COLLAPSED) {
+ /*
+ * If the peer dropped the rwnd on
+ * these then we don't worry about them.
+ */
+ continue;
+ }
if (nrsm->r_flags & RACK_SACK_PASSED) {
/*
* We found one that is already marked
@@ -9797,7 +9834,7 @@ rack_strike_dupack(struct tcp_rack *rack)
/* Sendmap entries that are marked to
* be retransmitted do not need dupack's
* struck. We get these marks for a number
- * of reasons (rxt timeout with no sack,
+ * of reasons (rxt timeout with no sack,
* mtu change, or rwnd collapses). When
* these events occur, we know we must retransmit
* them and mark the sendmap entries. Dupack counting
@@ -10308,47 +10345,83 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so,
return (0);
}
+
static void
-rack_collapsed_window(struct tcp_rack *rack)
+rack_log_collapse(struct tcp_rack *rack, uint32_t cnt, uint32_t split, uint32_t out, int line,
+ int dir, uint32_t flags, struct rack_sendmap *rsm)
+{
+ if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.flex1 = cnt;
+ log.u_bbr.flex2 = split;
+ log.u_bbr.flex3 = out;
+ log.u_bbr.flex4 = line;
+ log.u_bbr.flex5 = rack->r_must_retran;
+ log.u_bbr.flex6 = flags;
+ log.u_bbr.flex7 = rack->rc_has_collapsed;
+ log.u_bbr.flex8 = dir; /*
+ * 1 is collapsed, 0 is uncollapsed,
+ * 2 is log of a rsm being marked, 3 is a split.
+ */
+ if (rsm == NULL)
+ log.u_bbr.rttProp = 0;
+ else
+ log.u_bbr.rttProp = (uint64_t)rsm;
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ TCP_LOG_EVENTP(rack->rc_tp, NULL,
+ &rack->rc_inp->inp_socket->so_rcv,
+ &rack->rc_inp->inp_socket->so_snd,
+ TCP_RACK_LOG_COLLAPSE, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+static void
+rack_collapsed_window(struct tcp_rack *rack, uint32_t out, int line)
{
/*
- * Now we must walk the
- * send map and divide the
- * ones left stranded. These
- * guys can't cause us to abort
- * the connection and are really
- * "unsent". However if a buggy
- * client actually did keep some
- * of the data i.e. collapsed the win
- * and refused to ack and then opened
- * the win and acked that data. We would
- * get into an ack war, the simplier
- * method then of just pretending we
- * did not send those segments something
- * won't work.
+ * Here all we do is mark the collapsed point and set the flag.
+ * This may happen again and again, but there is no
+ * sense splitting our map until we know where the
+ * peer finally lands in the collapse.
*/
- struct rack_sendmap *rsm, *nrsm, fe;
+ rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
+ if ((rack->rc_has_collapsed == 0) ||
+ (rack->r_ctl.last_collapse_point != (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)))
+ counter_u64_add(rack_collapsed_win_seen, 1);
+ rack->r_ctl.last_collapse_point = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
+ rack->r_ctl.high_collapse_point = rack->rc_tp->snd_max;
+ rack->rc_has_collapsed = 1;
+ rack->r_collapse_point_valid = 1;
+ rack_log_collapse(rack, 0, 0, rack->r_ctl.last_collapse_point, line, 1, 0, NULL);
+}
+
+static void
+rack_un_collapse_window(struct tcp_rack *rack, int line)
+{
+ struct rack_sendmap *nrsm, *rsm, fe;
+ int cnt = 0, split = 0;
#ifdef INVARIANTS
struct rack_sendmap *insret;
#endif
- tcp_seq max_seq;
- rack_trace_point(rack, RACK_TP_COLLAPSED_WND);
- max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd;
memset(&fe, 0, sizeof(fe));
- fe.r_start = max_seq;
- /* Find the first seq past or at maxseq */
+ rack->rc_has_collapsed = 0;
+ fe.r_start = rack->r_ctl.last_collapse_point;
rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
if (rsm == NULL) {
- /* Nothing to do strange */
- rack->rc_has_collapsed = 0;
+ /* Nothing to do, maybe the peer ack'ed it all */
+ rack_log_collapse(rack, 0, 0, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
return;
}
- /*
- * Now do we need to split at
- * the collapse point?
- */
- if (SEQ_GT(max_seq, rsm->r_start)) {
+ /* Now do we need to split this one? */
+ if (SEQ_GT(rack->r_ctl.last_collapse_point, rsm->r_start)) {
+ rack_log_collapse(rack, rsm->r_start, rsm->r_end,
+ rack->r_ctl.last_collapse_point, line, 3, rsm->r_flags, rsm);
nrsm = rack_alloc_limit(rack, RACK_LIMIT_TYPE_SPLIT);
if (nrsm == NULL) {
/* We can't get a rsm, mark all? */
@@ -10356,7 +10429,8 @@ rack_collapsed_window(struct tcp_rack *rack)
goto no_split;
}
/* Clone it */
- rack_clone_rsm(rack, nrsm, rsm, max_seq);
+ split = 1;
+ rack_clone_rsm(rack, nrsm, rsm, rack->r_ctl.last_collapse_point);
#ifndef INVARIANTS
(void)RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm);
#else
@@ -10366,7 +10440,8 @@ rack_collapsed_window(struct tcp_rack *rack)
nrsm, insret, rack, rsm);
}
#endif
- rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__);
+ rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT,
+ rack->r_ctl.last_collapse_point, __LINE__);
if (rsm->r_in_tmap) {
TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext);
nrsm->r_in_tmap = 1;
@@ -10378,38 +10453,15 @@ rack_collapsed_window(struct tcp_rack *rack)
rsm = nrsm;
}
no_split:
- counter_u64_add(rack_collapsed_win, 1);
RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) {
nrsm->r_flags |= RACK_RWND_COLLAPSED;
+ rack_log_collapse(rack, nrsm->r_start, nrsm->r_end, 0, line, 4, nrsm->r_flags, nrsm);
+ cnt++;
}
- rack->rc_has_collapsed = 1;
-}
-
-static void
-rack_un_collapse_window(struct tcp_rack *rack)
-{
- struct rack_sendmap *rsm;
- int cnt = 0;;
-
- rack->r_ctl.rc_out_at_rto = 0;
- rack->r_ctl.rc_snd_max_at_rto = rack->rc_tp->snd_una;
- RB_FOREACH_REVERSE(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) {
- if (rsm->r_flags & RACK_RWND_COLLAPSED) {
- rsm->r_flags &= ~RACK_RWND_COLLAPSED;
- rsm->r_flags |= RACK_MUST_RXT;
- if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) {
- rack->r_ctl.rc_snd_max_at_rto = rsm->r_end;
- rack->r_ctl.rc_out_at_rto += (rsm->r_end - rsm->r_start);
- }
- cnt++;
- }
- else
- break;
- }
- rack->rc_has_collapsed = 0;
if (cnt) {
- rack->r_must_retran = 1;
+ counter_u64_add(rack_collapsed_win, 1);
}
+ rack_log_collapse(rack, cnt, split, ctf_outstanding(rack->rc_tp), line, 0, 0, NULL);
}
static void
@@ -10518,9 +10570,12 @@ rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so,
}
if (tp->snd_wnd < ctf_outstanding(tp))
/* The peer collapsed the window */
- rack_collapsed_window(rack);
+ rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
else if (rack->rc_has_collapsed)
- rack_un_collapse_window(rack);
+ rack_un_collapse_window(rack, __LINE__);
+ if ((rack->r_collapse_point_valid) &&
+ (SEQ_GT(th->th_ack, rack->r_ctl.high_collapse_point)))
+ rack->r_collapse_point_valid = 0;
/* Was persist timer active and now we have window space? */
if ((rack->rc_in_persist != 0) &&
(tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
@@ -11076,10 +11131,12 @@ rack_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (tp->snd_wnd < ctf_outstanding(tp)) {
/* The peer collapsed the window */
- rack_collapsed_window(rack);
+ rack_collapsed_window(rack, ctf_outstanding(tp), __LINE__);
} else if (rack->rc_has_collapsed)
- rack_un_collapse_window(rack);
-
+ rack_un_collapse_window(rack, __LINE__);
+ if ((rack->r_collapse_point_valid) &&
+ (SEQ_GT(tp->snd_una, rack->r_ctl.high_collapse_point)))
+ rack->r_collapse_point_valid = 0;
/*
* Pull snd_wl2 up to prevent seq wrap relative to th_ack.
*/
@@ -13066,13 +13123,6 @@ rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uin
/* Not a valid win update */
return;
}
- if (tp->snd_wnd > tp->max_sndwnd)
- tp->max_sndwnd = tp->snd_wnd;
- if (tp->snd_wnd < (tp->snd_max - high_seq)) {
- /* The peer collapsed the window */
- rack_collapsed_window(rack);
- } else if (rack->rc_has_collapsed)
- rack_un_collapse_window(rack);
/* Do we exit persists? */
if ((rack->rc_in_persist != 0) &&
(tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2),
@@ -13609,6 +13659,15 @@ rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mb
#ifdef TCP_ACCOUNTING
ts_val = get_cyclecount();
#endif
+ /* Tend to any collapsed window */
+ if (SEQ_GT(tp->snd_max, high_seq) && (tp->snd_wnd < (tp->snd_max - high_seq))) {
+ /* The peer collapsed the window */
+ rack_collapsed_window(rack, (tp->snd_max - high_seq), __LINE__);
+ } else if (rack->rc_has_collapsed)
+ rack_un_collapse_window(rack, __LINE__);
+ if ((rack->r_collapse_point_valid) &&
+ (SEQ_GT(high_seq, rack->r_ctl.high_collapse_point)))
+ rack->r_collapse_point_valid = 0;
acked_amount = acked = (high_seq - tp->snd_una);
if (acked) {
/*
@@ -15930,6 +15989,11 @@ rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendma
if (tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+ if (rsm->r_flags & RACK_RWND_COLLAPSED) {
+ rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
+ counter_u64_add(rack_collapsed_win_rxt, 1);
+ counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
+ }
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = tcp_in_hpts(rack->rc_inp);
if (rack->rack_no_prr)
@@ -16538,6 +16602,58 @@ failed:
return (-1);
}
+static struct rack_sendmap *
+rack_check_collapsed(struct tcp_rack *rack, uint32_t cts)
+{
+ struct rack_sendmap *rsm = NULL;
+ struct rack_sendmap fe;
+ int thresh;
+
+restart:
+ fe.r_start = rack->r_ctl.last_collapse_point;
+ rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe);
+ if ((rsm == NULL) || ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0)) {
+ /* Nothing there, which is strange; turn off validity */
+ rack->r_collapse_point_valid = 0;
+ return (NULL);
+ }
+ /* Can we send it yet? */
+ if (rsm->r_end > (rack->rc_tp->snd_una + rack->rc_tp->snd_wnd)) {
+ /*
+ * Receiver window has not grown enough for
+ * the segment to be put on the wire.
+ */
+ return (NULL);
+ }
+ if (rsm->r_flags & RACK_ACKED) {
+ /*
+ * It has been sacked, lets move to the
+ * next one if possible.
+ */
+ rack->r_ctl.last_collapse_point = rsm->r_end;
+ /* Are we done? */
+ if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
+ rack->r_ctl.high_collapse_point)) {
+ rack->r_collapse_point_valid = 0;
+ return (NULL);
+ }
+ goto restart;
+ }
+ /* Now, has it been long enough? */
+ thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts);
+ if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) {
+ rack_log_collapse(rack, rsm->r_start,
+ (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
+ thresh, __LINE__, 6, rsm->r_flags, rsm);
+ return (rsm);
+ }
+ /* Not enough time */
+ rack_log_collapse(rack, rsm->r_start,
+ (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])),
+ thresh, __LINE__, 7, rsm->r_flags, rsm);
+ return (NULL);
+}
+
static int
rack_output(struct tcpcb *tp)
{
@@ -16598,7 +16714,6 @@ rack_output(struct tcpcb *tp)
struct ip6_hdr *ip6 = NULL;
int32_t isipv6;
#endif
- uint8_t filled_all = 0;
bool hw_tls = false;
/* setup and take the cache hits here */
@@ -16863,6 +16978,29 @@ again:
sb_offset = rsm->r_start - tp->snd_una;
if (len >= segsiz)
len = segsiz;
+ } else if (rack->r_collapse_point_valid &&
+ ((rsm = rack_check_collapsed(rack, cts)) != NULL)) {
+ /*
+ * If an RSM is returned then enough time has passed
+ * for us to retransmit it. Move up the collapse point,
+ * since this rsm has its chance to retransmit now.
+ */
+ rack_trace_point(rack, RACK_TP_COLLAPSED_RXT);
+ rack->r_ctl.last_collapse_point = rsm->r_end;
+ /* Are we done? */
+ if (SEQ_GEQ(rack->r_ctl.last_collapse_point,
+ rack->r_ctl.high_collapse_point))
+ rack->r_collapse_point_valid = 0;
+ sack_rxmit = 1;
+ /* We are not doing a TLP */
+ doing_tlp = 0;
+ len = rsm->r_end - rsm->r_start;
+ sb_offset = rsm->r_start - tp->snd_una;
+ sendalot = 0;
+ if ((rack->full_size_rxt == 0) &&
+ (rack->shape_rxt_to_pacing_min == 0) &&
+ (len >= segsiz))
+ len = segsiz;
} else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) {
/* We have a retransmit that takes precedence */
if ((!IN_FASTRECOVERY(tp->t_flags)) &&
@@ -16921,53 +17059,72 @@ again:
}
if (rack->r_must_retran &&
(doing_tlp == 0) &&
+ (SEQ_GT(tp->snd_max, tp->snd_una)) &&
(rsm == NULL)) {
/*
- * Non-Sack and we had a RTO or Sack/non-Sack and a
- * MTU change, we need to retransmit until we reach
- * the former snd_max (rack->r_ctl.rc_snd_max_at_rto).
+ * There are two different ways that we
+ * can get into this block:
+ * a) This is a non-sack connection, we had a time-out
+ * and thus r_must_retran was set and everything
+ * left outstanding has been marked for retransmit.
+ * b) The MTU of the path shrank, so that everything
+ * was marked to be retransmitted with the smaller
+ * mtu and r_must_retran was set.
+ *
+ * This means that we expect the sendmap (outstanding)
+ * to all be marked must. We can use the tmap to
+ * look at them.
+ *
*/
- if (SEQ_GT(tp->snd_max, tp->snd_una)) {
- int sendwin, flight;
-
- sendwin = min(tp->snd_wnd, tp->snd_cwnd);
- flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
- if (flight >= sendwin) {
- so = inp->inp_socket;
- sb = &so->so_snd;
- goto just_return_nolock;
- }
- rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
- if (rsm == NULL) {
- /* TSNH */
- rack->r_must_retran = 0;
- rack->r_ctl.rc_out_at_rto = 0;
- so = inp->inp_socket;
- sb = &so->so_snd;
- goto just_return_nolock;
- }
- if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
- /* It does not have the flag, we are done */
- rack->r_must_retran = 0;
- rack->r_ctl.rc_out_at_rto = 0;
- } else {
- sack_rxmit = 1;
- len = rsm->r_end - rsm->r_start;
- sendalot = 0;
- sb_offset = rsm->r_start - tp->snd_una;
- if (len >= segsiz)
- len = segsiz;
- /*
- * Delay removing the flag RACK_MUST_RXT so
- * that the fastpath for retransmit will
- * work with this rsm.
- */
+ int sendwin, flight;
- }
- } else {
- /* We must be done if there is nothing outstanding */
+ sendwin = min(tp->snd_wnd, tp->snd_cwnd);
+ flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto);
+ if (flight >= sendwin) {
+ /*
+ * We can't send yet.
+ */
+ so = inp->inp_socket;
+ sb = &so->so_snd;
+ goto just_return_nolock;
+ }
+ /*
+ * This is the case a/b mentioned above. All
+ * outstanding/not-acked should be marked.
+ * We can use the tmap to find them.
+ */
+ rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
+ if (rsm == NULL) {
+ /* TSNH */
+ rack->r_must_retran = 0;
+ rack->r_ctl.rc_out_at_rto = 0;
+ so = inp->inp_socket;
+ sb = &so->so_snd;
+ goto just_return_nolock;
+ }
+ if ((rsm->r_flags & RACK_MUST_RXT) == 0) {
+ /*
+ * The first one does not have the flag, did we collapse
+ * further up in our list?
+ */
rack->r_must_retran = 0;
rack->r_ctl.rc_out_at_rto = 0;
+ rsm = NULL;
+ sack_rxmit = 0;
+ } else {
+ sack_rxmit = 1;
+ len = rsm->r_end - rsm->r_start;
+ sb_offset = rsm->r_start - tp->snd_una;
+ sendalot = 0;
+ if ((rack->full_size_rxt == 0) &&
+ (rack->shape_rxt_to_pacing_min == 0) &&
+ (len >= segsiz))
+ len = segsiz;
+ /*
+ * Delay removing the flag RACK_MUST_RXT so
+ * that the fastpath for retransmit will
+ * work with this rsm.
+ */
}
}
/*
@@ -18177,7 +18334,7 @@ send:
if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb,
((rsm == NULL) ? hw_tls : 0)
#ifdef NETFLIX_COPY_ARGS
- , &filled_all
+ , &s_mb, &s_moff
#endif
);
if (len <= (tp->t_maxseg - optlen)) {
@@ -18548,15 +18705,17 @@ send:
log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs;
log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs;
log.u_bbr.flex4 = orig_len;
- if (filled_all)
- log.u_bbr.flex5 = 0x80000000;
- else
- log.u_bbr.flex5 = 0;
/* Save off the early/late values */
log.u_bbr.flex6 = rack->r_ctl.rc_agg_early;
log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed;
log.u_bbr.bw_inuse = rack_get_bw(rack);
- if (rsm || sack_rxmit) {
+ log.u_bbr.flex8 = 0;
+ if (rsm) {
+ if (rsm->r_flags & RACK_RWND_COLLAPSED) {
+ rack_log_collapse(rack, rsm->r_start, rsm->r_end, 0, __LINE__, 5, rsm->r_flags, rsm);
+ counter_u64_add(rack_collapsed_win_rxt, 1);
+ counter_u64_add(rack_collapsed_win_rxt_bytes, (rsm->r_end - rsm->r_start));
+ }
if (doing_tlp)
log.u_bbr.flex8 = 2;
else
diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h
index e8560446b798..c747ceac7628 100644
--- a/sys/netinet/tcp_stacks/tcp_rack.h
+++ b/sys/netinet/tcp_stacks/tcp_rack.h
@@ -275,7 +275,7 @@ struct rack_opts_stats {
* non-zero, the default is 4 for continuous tracing.
* You also set in the number of connections you want
* have get BB logs in net.inet.tcp.<stack>.tp.count.
- *
+ *
* Count will decrement every time BB logging is assigned
* to a connection that hit your tracepoint.
*
@@ -291,6 +291,7 @@ struct rack_opts_stats {
#define RACK_TP_HWENOBUF 0x00000001 /* When we are doing hardware pacing and hit enobufs */
#define RACK_TP_ENOBUF 0x00000002 /* When we hit enobufs with software pacing */
#define RACK_TP_COLLAPSED_WND 0x00000003 /* When a peer collapses its rwnd on us */
+#define RACK_TP_COLLAPSED_RXT 0x00000004 /* When we actually retransmit a collapsed window rsm */
#define MIN_GP_WIN 6 /* We need at least 6 MSS in a GP measurement */
#ifdef _KERNEL
@@ -472,6 +473,8 @@ struct rack_control {
uint32_t roundends; /* acked value above which round ends */
uint32_t num_dsack; /* Count of dsack's seen (1 per window)*/
uint32_t forced_ack_ts;
+ uint32_t last_collapse_point; /* Last point peer collapsed to */
+ uint32_t high_collapse_point;
uint32_t rc_lower_rtt_us_cts; /* Time our GP rtt was last lowered */
uint32_t rc_time_probertt_entered;
uint32_t rc_time_probertt_starts;
@@ -546,7 +549,15 @@ struct tcp_rack {
struct inpcb *rc_inp; /* The inpcb Lock(a) */
uint8_t rc_free_cnt; /* Number of free entries on the rc_free list
* Lock(a) */
- uint8_t client_bufferlvl; /* 0 - 5 normaly, less than or at 2 means its real low */
+ uint8_t client_bufferlvl : 4, /* Expected range [0,5]: 0=unset, 1=low/empty */
+ rack_deferred_inited : 1,
+ /* ******************************************************************** */
+ /* Note for details of next two fields see rack_init_retransmit_rate() */
+ /* ******************************************************************** */
+ full_size_rxt: 1,
+ shape_rxt_to_pacing_min : 1,
+ /* ******************************************************************** */
+ spare : 1;
uint8_t no_prr_addback : 1,
gp_ready : 1,
defer_options: 1,
@@ -647,7 +658,9 @@ struct tcp_rack {
r_late : 1,
r_wanted_output: 1,
r_rr_config : 2,
- rc_avail_bit : 3;
+ r_persist_lt_bw_off : 1,
+ r_collapse_point_valid : 1,
+ rc_avail_bit : 2;
uint16_t rc_init_win : 8,
rc_gp_rtt_set : 1,
rc_gp_dyn_mul : 1,