PERFORCE change 54368 for review
Paul Saab
ps at FreeBSD.org
Mon Jun 7 23:58:24 GMT 2004
http://perforce.freebsd.org/chv.cgi?CH=54368
Change 54368 by ps at butter.corp on 2004/06/07 23:58:02
Commit the SACK work done at Yahoo! on RELENG_4 and ported
to -current.
The scoreboarding code was obtained from OpenBSD, and many
of the remaining changes were inspired by OpenBSD, but not
taken directly from there.
Affected files ...
.. //depot/projects/sack/conf/files#2 edit
.. //depot/projects/sack/conf/options#2 edit
.. //depot/projects/sack/netinet/tcp.h#2 edit
.. //depot/projects/sack/netinet/tcp_input.c#2 edit
.. //depot/projects/sack/netinet/tcp_output.c#2 edit
.. //depot/projects/sack/netinet/tcp_sack.c#1 add
.. //depot/projects/sack/netinet/tcp_seq.h#2 edit
.. //depot/projects/sack/netinet/tcp_subr.c#2 edit
.. //depot/projects/sack/netinet/tcp_syncache.c#2 edit
.. //depot/projects/sack/netinet/tcp_timer.c#2 edit
.. //depot/projects/sack/netinet/tcp_var.h#2 edit
Differences ...
==== //depot/projects/sack/conf/files#2 (text+ko) ====
@@ -1450,6 +1450,7 @@
netinet/tcp_hostcache.c optional inet
netinet/tcp_input.c optional inet
netinet/tcp_output.c optional inet
+netinet/tcp_sack.c optional inet
netinet/tcp_subr.c optional inet
netinet/tcp_syncache.c optional inet
netinet/tcp_timer.c optional inet
==== //depot/projects/sack/conf/options#2 (text+ko) ====
@@ -346,6 +346,7 @@
SLIP_IFF_OPTS opt_slip.h
TCPDEBUG
TCP_SIGNATURE opt_inet.h
+TCP_SACK_DEBUG opt_tcp_sack.h
TCP_DROP_SYNFIN opt_tcp_input.h
XBONEHACK
==== //depot/projects/sack/netinet/tcp.h#2 (text+ko) ====
@@ -85,14 +85,17 @@
#define TCPOPT_SACK_PERMITTED 4 /* Experimental */
#define TCPOLEN_SACK_PERMITTED 2
#define TCPOPT_SACK 5 /* Experimental */
+#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */
#define TCPOPT_TIMESTAMP 8
#define TCPOLEN_TIMESTAMP 10
#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */
#define TCPOPT_TSTAMP_HDR \
(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP)
+#define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */
+
#define TCPOPT_CC 11 /* CC options: RFC-1644 */
-#define TCPOPT_CCNEW 12
+#define TCPOPT_CCNEW 12
#define TCPOPT_CCECHO 13
#define TCPOLEN_CC 6
#define TCPOLEN_CC_APPA (TCPOLEN_CC+2)
@@ -101,6 +104,15 @@
#define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */
#define TCPOLEN_SIGNATURE 18
+/* Option definitions */
+#define TCPOPT_SACK_PERMIT_HDR \
+(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED)
+#define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8)
+/* Miscellaneous constants */
+#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at sender side */
+#define TCP_MAX_SACK 3 /* MAX # SACKs sent in any segment */
+
+
/*
* Default maximum segment size for TCP.
* With an IP MTU of 576, this is 536,
==== //depot/projects/sack/netinet/tcp_input.c#2 (text+ko) ====
@@ -37,6 +37,7 @@
#include "opt_mac.h"
#include "opt_tcpdebug.h"
#include "opt_tcp_input.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/kernel.h>
@@ -159,7 +160,9 @@
struct inpcbinfo tcbinfo;
struct mtx *tcbinfo_mtx;
-static void tcp_dooptions(struct tcpopt *, u_char *, int, int);
+static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *,
+ int, int, struct tcphdr *);
+
static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static int tcp_reass(struct tcpcb *, struct tcphdr *, int *,
@@ -721,7 +724,7 @@
* present in a SYN segment. See tcp_timewait().
*/
if (thflags & TH_SYN)
- tcp_dooptions(&to, optp, optlen, 1);
+ tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th);
if (tcp_timewait((struct tcptw *)inp->inp_ppcb,
&to, th, m, tlen))
goto findpcb;
@@ -934,7 +937,7 @@
tcp_trace(TA_INPUT, ostate, tp,
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
- tcp_dooptions(&to, optp, optlen, 1);
+ tcp_dooptions(tp, &to, optp, optlen, 1, th);
if (!syncache_add(&inc, &to, th, &so, m))
goto drop;
if (so == NULL) {
@@ -1050,7 +1053,7 @@
* for incoming connections is handled in tcp_syncache.
* XXX this is traditional behavior, may need to be cleaned up.
*/
- tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
+ tcp_dooptions(tp,&to, optp, optlen, thflags & TH_SYN,th);
if (thflags & TH_SYN) {
if (to.to_flags & TOF_SCALE) {
tp->t_flags |= TF_RCVD_SCALE;
@@ -1065,8 +1068,22 @@
tp->t_flags |= TF_RCVD_CC;
if (to.to_flags & TOF_MSS)
tcp_mss(tp, to.to_mss);
+ if (tp->sack_enable) {
+ if (!(to.to_flags & TOF_SACK))
+ tp->sack_enable = 0;
+ else
+ tp->t_flags |= TF_SACK_PERMIT;
+ }
+
}
+ if (tp->sack_enable) {
+ /* Delete stale (cumulatively acked) SACK holes */
+ tcp_del_sackholes(tp, th);
+ tp->rcv_laststart = th->th_seq; /* last rec'vd segment*/
+ tp->rcv_lastend = th->th_seq + tlen;
+ }
+
/*
* Header prediction: check for the two common cases
* of a uni-directional data xfer. If the packet has
@@ -1116,9 +1133,10 @@
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
tp->snd_cwnd >= tp->snd_wnd &&
- ((!tcp_do_newreno &&
+ ((!tcp_do_newreno && !tp->sack_enable &&
tp->t_dupacks < tcprexmtthresh) ||
- (tcp_do_newreno && !IN_FASTRECOVERY(tp)))) {
+ ((tcp_do_newreno || tp->sack_enable) &&
+ !IN_FASTRECOVERY(tp)))) {
KASSERT(headlocked, ("headlocked"));
INP_INFO_WUNLOCK(&tcbinfo);
/*
@@ -1214,6 +1232,9 @@
* with nothing on the reassembly queue and
* we have enough buffer space to take it.
*/
+ /* Clean receiver SACK report if present */
+ if (tp->sack_enable && tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
++tcpstat.tcps_preddat;
tp->rcv_nxt += tlen;
/*
@@ -1892,7 +1913,7 @@
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
- (tcp_do_newreno &&
+ ((tcp_do_newreno || tp->sack_enable) &&
IN_FASTRECOVERY(tp))) {
tp->snd_cwnd += tp->t_maxseg;
(void) tcp_output(tp);
@@ -1900,7 +1921,8 @@
} else if (tp->t_dupacks == tcprexmtthresh) {
tcp_seq onxt = tp->snd_nxt;
u_int win;
- if (tcp_do_newreno &&
+ if ((tcp_do_newreno ||
+ tp->sack_enable) &&
SEQ_LEQ(th->th_ack,
tp->snd_recover)) {
tp->t_dupacks = 0;
@@ -1915,6 +1937,17 @@
tp->snd_recover = tp->snd_max;
callout_stop(tp->tt_rexmt);
tp->t_rtttime = 0;
+ if (tp->sack_enable) {
+ tcpstat.tcps_sack_recovery_episode++;
+ tp->snd_cwnd =
+ tp->t_maxseg *
+ tp->t_dupacks;
+ (void) tcp_output(tp);
+ tp->snd_cwnd =
+ tp->snd_ssthresh;
+ goto drop;
+ }
+
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
(void) tcp_output(tp);
@@ -1965,12 +1998,16 @@
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
- if (tcp_do_newreno) {
+ if (tcp_do_newreno || tp->sack_enable) {
if (IN_FASTRECOVERY(tp)) {
if (SEQ_LT(th->th_ack, tp->snd_recover)) {
- tcp_newreno_partial_ack(tp, th);
+ if (tp->sack_enable)
+ tcp_sack_partialack(tp, th);
+ else
+ tcp_newreno_partial_ack(tp, th);
} else {
/*
+ * Out of fast recovery.
* Window inflation should have left us
* with approximately snd_ssthresh
* outstanding data.
@@ -2092,7 +2129,8 @@
* Otherwise open linearly: maxseg per window
* (maxseg^2 / cwnd per packet).
*/
- if (!tcp_do_newreno || !IN_FASTRECOVERY(tp)) {
+ if ((!tcp_do_newreno && !tp->sack_enable) ||
+ !IN_FASTRECOVERY(tp)) {
register u_int cw = tp->snd_cwnd;
register u_int incr = tp->t_maxseg;
if (cw > tp->snd_ssthresh)
@@ -2110,14 +2148,20 @@
}
sowwakeup(so);
/* detect una wraparound */
- if (tcp_do_newreno && !IN_FASTRECOVERY(tp) &&
+ if ((tcp_do_newreno || tp->sack_enable) &&
+ !IN_FASTRECOVERY(tp) &&
SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
- if (tcp_do_newreno && IN_FASTRECOVERY(tp) &&
+ if ((tcp_do_newreno || tp->sack_enable) &&
+ IN_FASTRECOVERY(tp) &&
SEQ_GEQ(th->th_ack, tp->snd_recover))
EXIT_FASTRECOVERY(tp);
tp->snd_una = th->th_ack;
+ if (tp->sack_enable) {
+ if (SEQ_GT(tp->snd_una, tp->snd_recover))
+ tp->snd_recover = tp->snd_una;
+ }
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
@@ -2318,7 +2362,8 @@
thflags = tcp_reass(tp, th, &tlen, m);
tp->t_flags |= TF_ACKNOW;
}
-
+ if (tp->sack_enable)
+ tcp_update_sack_list(tp);
/*
* Note the amount of data that peer has sent into
* our window, in order to estimate the sender's
@@ -2521,11 +2566,13 @@
* Parse TCP options and place in tcpopt.
*/
static void
-tcp_dooptions(to, cp, cnt, is_syn)
+tcp_dooptions(tp, to, cp, cnt, is_syn, th)
+ struct tcpcb *tp;
struct tcpopt *to;
- u_char *cp;
+ u_char *cp;
int cnt;
int is_syn;
+ struct tcphdr *th;
{
int opt, optlen;
@@ -2614,6 +2661,20 @@
to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN);
break;
#endif
+ case TCPOPT_SACK_PERMITTED:
+ if (!tcp_do_sack ||
+ optlen != TCPOLEN_SACK_PERMITTED)
+ continue;
+ if (is_syn) {
+ /* MUST only be set on SYN */
+ to->to_flags |= TOF_SACK;
+ }
+ break;
+
+ case TCPOPT_SACK:
+ if (!tp || tcp_sack_option(tp, th, cp, optlen))
+ continue;
+ break;
default:
continue;
}
==== //depot/projects/sack/netinet/tcp_output.c#2 (text+ko) ====
@@ -35,6 +35,7 @@
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -122,6 +123,8 @@
u_char opt[TCP_MAXOLEN];
unsigned ipoptlen, optlen, hdrlen;
int idle, sendalot;
+ int i, sack_rxmit;
+ struct sackhole *p;
#if 0
int maxburst = TCP_MAXBURST;
#endif
@@ -171,6 +174,13 @@
}
}
again:
+ /*
+ * If we've recently taken a timeout, snd_max will be greater than
+ * snd_nxt. There may be SACK information that allows us to avoid
+ * resending already delivered data. Adjust snd_nxt accordingly.
+ */
+ if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
+ tcp_sack_adjust(tp);
sendalot = 0;
off = tp->snd_nxt - tp->snd_una;
sendwin = min(tp->snd_wnd, tp->snd_cwnd);
@@ -178,6 +188,36 @@
flags = tcp_outflags[tp->t_state];
/*
+ * Send any SACK-generated retransmissions. If we're explicitly trying
+ * to send out new data (when sendalot is 1), bypass this function.
+ * If we retransmit in fast recovery mode, decrement snd_cwnd, since
+ * we're replacing a (future) new transmission with a retransmission
+ * now, and we previously incremented snd_cwnd in tcp_input().
+ */
+ /*
+ * Reset the SACK retransmit state before checking the scoreboard.
+ */
+ sack_rxmit = 0;
+ len = 0;
+ p = NULL;
+ if (tp->sack_enable && IN_FASTRECOVERY(tp) &&
+ (p = tcp_sack_output(tp))) {
+ sack_rxmit = 1;
+ sendalot = 1;
+ off = p->rxmit - tp->snd_una;
+ KASSERT(tp->snd_cwnd >= 0,("%s: CWIN is negative: %ld", __func__, tp->snd_cwnd));
+ /* Do not retransmit SACK segments beyond snd_recover */
+ if (SEQ_GT(p->end, tp->snd_recover))
+ len = min(tp->snd_cwnd, tp->snd_recover - p->rxmit);
+ else
+ len = min(tp->snd_cwnd, p->end - p->rxmit);
+ if (len > 0) {
+ tcpstat.tcps_sack_rexmits++;
+ tcpstat.tcps_sack_rexmit_bytes +=
+ min(len, tp->t_maxseg);
+ }
+ }
+ /*
* Get standard flags, and add SYN or FIN if requested by 'hidden'
* state flags.
*/
@@ -230,9 +270,12 @@
* In the normal retransmit-FIN-only case, however, snd_nxt will
* be set to snd_una, the offset will be 0, and the length may
* wind up 0.
+ *
+ * If sack_rxmit is true we are retransmitting from the scoreboard
+ * in which case len is already set.
*/
- len = (long)ulmin(so->so_snd.sb_cc, sendwin) - off;
-
+ if (!sack_rxmit)
+ len = ((long)ulmin(so->so_snd.sb_cc, sendwin) - off);
/*
* Lop off SYN bit if it has already been sent. However, if this
@@ -331,6 +374,8 @@
goto send;
if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */
goto send;
+ if (sack_rxmit)
+ goto send;
}
/*
@@ -374,7 +419,18 @@
if (flags & TH_FIN &&
((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
goto send;
-
+ /*
+ * In SACK, it is possible for tcp_output to fail to send a segment
+ * after the retransmission timer has been turned off. Make sure
+ * that the retransmission timer is set.
+ */
+ if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) &&
+ !callout_active(tp->tt_rexmt) &&
+ !callout_active(tp->tt_persist)) {
+ callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+ return (0);
+ }
/*
* TCP window updates are not reliable, rather a polling protocol
* using ``persist'' packets is used to insure receipt of window
@@ -435,6 +491,19 @@
(void)memcpy(opt + 2, &mss, sizeof(mss));
optlen = TCPOLEN_MAXSEG;
+ /*
+ * If this is the first SYN of the connection (not a SYN
+ * ACK), include SACK_PERMIT_HDR option. If this is a
+ * SYN ACK, include SACK_PERMIT_HDR option if peer has
+ * already done so. This is only for active connect,
+ * since the syncache takes care of the passive connect.
+ */
+ if (tp->sack_enable && ((flags & TH_ACK) == 0 ||
+ (tp->t_flags & TF_SACK_PERMIT))) {
+ *((u_int32_t *) (opt + optlen)) =
+ htonl(TCPOPT_SACK_PERMIT_HDR);
+ optlen += 4;
+ }
if ((tp->t_flags & TF_REQ_SCALE) &&
((flags & TH_ACK) == 0 ||
(tp->t_flags & TF_RCVD_SCALE))) {
@@ -466,6 +535,32 @@
optlen += TCPOLEN_TSTAMP_APPA;
}
+ /*
+ * Send SACKs if necessary. This should be the last option processed.
+ * Only as many SACKs are sent as are permitted by the maximum options
+ * size. No more than three SACKs are sent.
+ */
+ if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED &&
+ (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
+ tp->rcv_numsacks) {
+ u_int32_t *lp = (u_int32_t *)(opt + optlen);
+ u_int32_t *olp = lp++;
+ int count = 0; /* actual number of SACKs inserted */
+ int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;
+
+ tcpstat.tcps_sack_send_blocks++;
+ maxsack = min(maxsack, TCP_MAX_SACK);
+ for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
+ struct sackblk sack = tp->sackblks[i];
+ if (sack.start == 0 && sack.end == 0)
+ continue;
+ *lp++ = htonl(sack.start);
+ *lp++ = htonl(sack.end);
+ count++;
+ }
+ *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
+ optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
+ }
/*
* Send `CC-family' options if our side wants to use them (TF_REQ_CC),
* options are allowed (!TF_NOOPT) and it's not a RST.
@@ -734,6 +829,10 @@
th->th_seq = htonl(tp->snd_nxt);
else
th->th_seq = htonl(tp->snd_max);
+ if (sack_rxmit) {
+ th->th_seq = htonl(p->rxmit);
+ p->rxmit += len;
+ }
th->th_ack = htonl(tp->rcv_nxt);
if (optlen) {
bcopy(opt, th + 1, optlen);
@@ -831,6 +930,8 @@
tp->t_flags |= TF_SENTFIN;
}
}
+ if (tp->sack_enable && sack_rxmit && (p->rxmit != tp->snd_nxt))
+ goto timer;
tp->snd_nxt += len;
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
tp->snd_max = tp->snd_nxt;
@@ -853,6 +954,17 @@
* Initialize shift counter which is used for backoff
* of retransmit time.
*/
+timer:
+ if (tp->sack_enable && sack_rxmit &&
+ !callout_active(tp->tt_rexmt) &&
+ tp->snd_nxt != tp->snd_max) {
+ callout_reset(tp->tt_rexmt, tp->t_rxtcur,
+ tcp_timer_rexmt, tp);
+ if (callout_active(tp->tt_persist)) {
+ callout_stop(tp->tt_persist);
+ tp->t_rxtshift = 0;
+ }
+ }
if (!callout_active(tp->tt_rexmt) &&
tp->snd_nxt != tp->snd_una) {
if (callout_active(tp->tt_persist)) {
==== //depot/projects/sack/netinet/tcp_seq.h#2 (text+ko) ====
@@ -42,6 +42,9 @@
#define SEQ_GT(a,b) ((int)((a)-(b)) > 0)
#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0)
+#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b))
+#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b))
+
/* for modulo comparisons of timestamps */
#define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
#define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)
==== //depot/projects/sack/netinet/tcp_subr.c#2 (text+ko) ====
@@ -36,6 +36,7 @@
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -201,6 +202,17 @@
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
+
+int tcp_do_sack = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_sack, CTLFLAG_RW,
+ &tcp_do_sack, 0, "Enable/Disable TCP SACK support");
+
+int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sackhole_limit, CTLFLAG_RW,
+ &tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements");
+
+uma_zone_t sack_hole_zone;
+
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
static void tcp_isn_tick(void *);
@@ -292,6 +304,8 @@
tcp_isn_tick(NULL);
EVENTHANDLER_REGISTER(shutdown_pre_sync, tcp_fini, NULL,
SHUTDOWN_PRI_DEFAULT);
+ sack_hole_zone = uma_zcreate("sackhole", sizeof(struct sackhole),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
}
void
@@ -599,6 +613,7 @@
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
if (tcp_do_rfc1644)
tp->t_flags |= TF_REQ_CC;
+ tp->sack_enable = tcp_do_sack;
tp->t_inpcb = inp; /* XXX */
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
@@ -732,6 +747,7 @@
tp->t_segqlen--;
tcp_reass_qsize--;
}
+ tcp_free_sackholes(tp);
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
uma_zfree(tcpcb_zone, tp);
@@ -752,7 +768,6 @@
#ifdef INET6
struct socket *so = inp->inp_socket;
#endif
-
tcp_discardcb(tp);
#ifdef INET6
if (INP_CHECK_SOCKAF(so, AF_INET6))
==== //depot/projects/sack/netinet/tcp_syncache.c#2 (text+ko) ====
@@ -39,6 +39,7 @@
#include "opt_ipsec.h"
#include "opt_mac.h"
#include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -702,7 +703,10 @@
if (sc->sc_flags & SCF_SIGNATURE)
tp->t_flags |= TF_SIGNATURE;
#endif
-
+ if (sc->sc_flags & SCF_SACK) {
+ tp->sack_enable = 1;
+ tp->t_flags |= TF_SACK_PERMIT;
+ }
/*
* Set up MSS and get cached values from tcp_hostcache.
* This might overwrite some of the defaults we just set.
@@ -989,6 +993,9 @@
sc->sc_flags = SCF_SIGNATURE;
#endif
+ if (to->to_flags & TOF_SACK)
+ sc->sc_flags |= SCF_SACK;
+
/*
* XXX
* We have the option here of not doing TAO (even if the segment
@@ -1105,6 +1112,7 @@
optlen += (sc->sc_flags & SCF_SIGNATURE) ?
TCPOLEN_SIGNATURE + 2 : 0;
#endif
+ optlen += ((sc->sc_flags & SCF_SACK) ? 4 : 0);
}
tlen = hlen + sizeof(struct tcphdr) + optlen;
@@ -1242,6 +1250,11 @@
optp += TCPOLEN_SIGNATURE + 2;
}
#endif /* TCP_SIGNATURE */
+
+ if (sc->sc_flags & SCF_SACK) {
+ *(u_int32_t *)optp = htonl(TCPOPT_SACK_PERMIT_HDR);
+ optp += 4;
+ }
}
#ifdef INET6
==== //depot/projects/sack/netinet/tcp_timer.c#2 (text+ko) ====
@@ -32,6 +32,7 @@
#include "opt_inet6.h"
#include "opt_tcpdebug.h"
+#include "opt_tcp_sack.h"
#include <sys/param.h>
#include <sys/kernel.h>
@@ -217,6 +218,7 @@
return;
}
INP_LOCK(inp);
+ tcp_free_sackholes(tp);
if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) {
INP_UNLOCK(tp->t_inpcb);
INP_INFO_WUNLOCK(&tcbinfo);
@@ -497,6 +499,7 @@
return;
}
callout_deactivate(tp->tt_rexmt);
+ tcp_free_sackholes(tp);
/*
* Retransmission timer went off. Message has not
* been acked within retransmit interval. Back off
==== //depot/projects/sack/netinet/tcp_var.h#2 (text+ko) ====
@@ -52,6 +52,17 @@
extern int tcp_reass_qsize;
extern struct uma_zone *tcp_reass_zone;
+struct sackblk {
+ tcp_seq start; /* start seq no. of sack block */
+ tcp_seq end; /* end seq no. */
+};
+
+struct sackhole {
+ tcp_seq start; /* start seq no. of hole */
+ tcp_seq end; /* end seq no. */
+ tcp_seq rxmit; /* next seq. no in hole to be retransmitted */
+ struct sackhole *next; /* next in list */
+};
struct tcptemp {
u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */
struct tcphdr tt_t;
@@ -179,6 +190,16 @@
u_long rcv_second; /* start of interval second */
u_long rcv_pps; /* received packets per second */
u_long rcv_byps; /* received bytes per second */
+ /* SACK related state */
+ int sack_enable; /* enable SACK for this connection */
+ int snd_numholes; /* number of holes seen by sender */
+ struct sackhole *snd_holes; /* linked list of holes (sorted) */
+
+ tcp_seq rcv_laststart; /* start of last segment recd. */
+ tcp_seq rcv_lastend; /* end of ... */
+ tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/
+ int rcv_numsacks; /* # distinct sack blks present */
+ struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. of sack blocks */
};
#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
@@ -216,6 +237,7 @@
#define TOF_SCALE 0x0020
#define TOF_SIGNATURE 0x0040 /* signature option present */
#define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */
+#define TOF_SACK 0x0100 /* Peer sent SACK option */
u_int32_t to_tsval;
u_int32_t to_tsecr;
tcp_cc to_cc; /* holds CC or CCnew */
@@ -249,6 +271,7 @@
#define SCF_CC 0x08 /* negotiated CC */
#define SCF_UNREACH 0x10 /* icmp unreachable received */
#define SCF_SIGNATURE 0x20 /* send MD5 digests */
+#define SCF_SACK 0x80 /* send SACK option */
TAILQ_ENTRY(syncache) sc_hash;
TAILQ_ENTRY(syncache) sc_timerq;
};
@@ -434,6 +457,13 @@
u_long tcps_hc_added; /* entry added to hostcache */
u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */
+
+ /* SACK related stats */
+ u_long tcps_sack_recovery_episode; /* SACK recovery episodes */
+ u_long tcps_sack_rexmits; /* SACK rexmit segments */
+ u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */
+ u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */
+ u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */
};
/*
@@ -467,7 +497,8 @@
#define TCPCTL_PCBLIST 11 /* list of all outstanding PCBs */
#define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */
#define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */
-#define TCPCTL_MAXID 14
+#define TCPCTL_SACK 14 /* Selective Acknowledgement,rfc 2018 */
+#define TCPCTL_MAXID 15
#define TCPCTL_NAMES { \
{ 0, 0 }, \
@@ -505,6 +536,8 @@
extern int ss_fltsz;
extern int ss_fltsz_local;
+extern int tcp_do_sack; /* SACK enabled/disabled */
+
void tcp_canceltimers(struct tcpcb *);
struct tcpcb *
tcp_close(struct tcpcb *);
@@ -578,6 +611,23 @@
extern u_long tcp_recvspace;
tcp_seq tcp_new_isn(struct tcpcb *);
+int tcp_sack_option(struct tcpcb *,struct tcphdr *,u_char *,int);
+void tcp_update_sack_list(struct tcpcb *tp);
+void tcp_del_sackholes(struct tcpcb *, struct tcphdr *);
+void tcp_clean_sackreport(struct tcpcb *tp);
+void tcp_sack_adjust(struct tcpcb *tp);
+struct sackhole *tcp_sack_output(struct tcpcb *tp);
+void tcp_sack_partialack(struct tcpcb *, struct tcphdr *);
+void tcp_free_sackholes(struct tcpcb *tp);
+#ifdef DEBUG
+void tcp_print_holes(struct tcpcb *tp);
+#endif
+int tcp_newreno(struct tcpcb *, struct tcphdr *);
+u_long tcp_seq_subtract(u_long, u_long );
+#ifdef TCP_SACK_DEBUG
+void tcp_print_holes(struct tcpcb *tp);
+#endif /* TCP_SACK_DEBUG */
+
#endif /* _KERNEL */
#endif /* _NETINET_TCP_VAR_H_ */
More information about the p4-projects
mailing list