PERFORCE change 197225 for review
Catalin Nicutar
cnicutar at FreeBSD.org
Fri Aug 5 16:09:10 UTC 2011
http://p4web.freebsd.org/@@197225?ac=10
Change 197225 by cnicutar at cnicutar_cronos on 2011/08/05 16:08:46
Forward-port UTO kernel changes from 8 to HEAD.
Affected files ...
.. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp.h#2 edit
.. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_input.c#2 edit
.. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_output.c#2 edit
.. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_subr.c#2 edit
.. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_syncache.c#2 edit
.. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_syncache.h#2 edit
.. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_timer.c#2 edit
.. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_usrreq.c#2 edit
.. //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_var.h#2 edit
Differences ...
==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp.h#2 (text+ko) ====
@@ -96,6 +96,8 @@
#define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */
#define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */
#define TCPOLEN_SIGNATURE 18
+#define TCPOPT_UTO 28
+#define TCPOLEN_UTO 4
/* Miscellaneous constants */
#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at receiver side */
@@ -103,6 +105,14 @@
/*
+ * The timeout ranges for TCP UTO have security implications; in particular,
+ * long timeouts might allow for denial-of-service attacks.
+ */
+#define TCP_UTOMIN 100 /* Minimum acceptable timeout. */
+#define TCP_UTOMAX 600 /* Maximum advertised timeout. */
+
+
+/*
* The default maximum segment size (MSS) to be used for new TCP connections
* when path MTU discovery is not enabled.
*
@@ -158,6 +168,8 @@
#define TCP_MD5SIG 0x10 /* use MD5 digests (RFC2385) */
#define TCP_INFO 0x20 /* retrieve tcp_info structure */
#define TCP_CONGESTION 0x40 /* get/set congestion control algorithm */
+#define TCP_SNDUTO_TIMEOUT 0x80 /* get/set sent UTO value */
+#define TCP_RCVUTO_TIMEOUT 0x100 /* accept UTO suggestion */
#define TCP_CA_NAME_MAX 16 /* max congestion control name length */
==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_input.c#2 (text+ko) ====
@@ -1324,6 +1324,21 @@
(void *)tcp_saveipgen, &tcp_savetcp, 0);
#endif
tcp_dooptions(&to, optp, optlen, TO_SYN);
+
+ if (to.to_flags & TOF_UTO) {
+ /*
+ * Storing the value even if the user might not
+ * accept it. Also, not clamping it just yet.
+ */
+ tp->rcv_uto = (to.to_uto & UTO_MINS) ?
+ (to.to_uto & ~(UTO_MINS)) * 60 : to.to_uto;
+ /*
+ * XXX-CN Using option both for send and receive.
+ * Clear it for syncache.
+ */
+ to.to_flags &= ~TOF_UTO;
+ }
+
syncache_add(&inc, &to, th, inp, &so, m);
/*
* Entry added to syncache and mbuf consumed.
@@ -1511,6 +1526,18 @@
(thflags & TH_SYN) ? TO_SYN : 0);
/*
+ * Processing received UTO even if the user doesn't accept it
+ * yet. The user might want to accept it later (perhaps after
+ * authentication) but the peer need not send it again.
+ * The value is converted to seconds and not clamped (the user
+ * needs to know the real value received).
+ */
+ if (to.to_flags & TOF_UTO) {
+ tp->rcv_uto = (to.to_uto & UTO_MINS) ?
+ (to.to_uto & ~(UTO_MINS)) * 60 : to.to_uto;
+ }
+
+ /*
* If echoed timestamp is later than the current time,
* fall back to non RFC1323 RTT calculation. Normalize
* timestamp if syncookies were used when this connection
@@ -3169,6 +3196,17 @@
to->to_sacks = cp + 2;
TCPSTAT_INC(tcps_sack_rcv_blocks);
break;
+ case TCPOPT_UTO:
+ if (optlen != TCPOLEN_UTO)
+ continue;
+ if (!V_uto_enable)
+ continue;
+ to->to_flags |= TOF_UTO;
+ bcopy((char *)cp + 2,
+ (char *)&to->to_uto, sizeof(to->to_uto));
+ to->to_uto = htons(to->to_uto);
+ /* Avoid converting to seconds: it might overflow. */
+ break;
default:
continue;
}
==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_output.c#2 (text+ko) ====
@@ -705,6 +705,18 @@
to.to_sacks = (u_char *)tp->sackblks;
}
}
+ /* UTO */
+ if (tp->t_flags & TF_SND_UTO) {
+ to.to_uto = tp->snd_uto;
+ to.to_flags |= TOF_UTO;
+ /*
+ * The option is sent with the SYN and with the first
+ * non-SYN segment.
+ */
+ if (!(flags & TH_SYN))
+ tp->t_flags &= ~TF_SND_UTO;
+
+ }
#ifdef TCP_SIGNATURE
/* TCP-MD5 (RFC2385). */
if (tp->t_flags & TF_SIGNATURE)
@@ -1491,6 +1503,39 @@
TCPSTAT_INC(tcps_sack_send_blocks);
break;
}
+ case TOF_UTO:
+ while (optlen % 4) {
+ optlen += TCPOLEN_NOP;
+ *optp++ = TCPOPT_NOP;
+ }
+ if (TCP_MAXOLEN - optlen < TCPOLEN_UTO)
+ continue;
+ optlen += TCPOLEN_UTO;
+ *optp++ = TCPOPT_UTO;
+ *optp++ = TCPOLEN_UTO;
+
+ if (to->to_uto > UTO_MINS_TH) {
+ /*
+ * If the timeout is larger than UTO_MINS_TH
+ * we'll specify minutes.
+ * XXX-CN UTO_MINS_TH is arbitrary.
+ */
+ to->to_uto /= 60;
+ to->to_uto |= UTO_MINS;
+ }
+
+ /*
+ * XXX-CN to_uto is 32b because the user is allowed
+ * to specify more than 16b of seconds (dividing the
+ * value by 60 will make it fit).
+ */
+ {
+ uint16_t uto = to->to_uto;
+ uto = htons(uto);
+ bcopy((u_char *)&uto, optp, sizeof(uto));
+ optp += sizeof(uto);
+ }
+ break;
default:
panic("%s: unknown TCP option type", __func__);
break;
==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_subr.c#2 (text+ko) ====
@@ -161,6 +161,24 @@
"Default TCP Maximum Segment Size for IPv6");
#endif /* INET6 */
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, uto, CTLFLAG_RW, 0, "TCP UTO");
+
+VNET_DEFINE(int, uto_enable) = 1;
+SYSCTL_VNET_INT(_net_inet_tcp_uto, OID_AUTO, enable, CTLFLAG_RW,
+ &VNET_NAME(uto_enable), 0,
+ "Enable TCP UTO for all connections");
+
+VNET_DEFINE(int, uto_min_timeout) = TCP_UTOMIN;
+SYSCTL_VNET_INT(_net_inet_tcp_uto, OID_AUTO, min_timeout, CTLFLAG_RW,
+ &VNET_NAME(uto_min_timeout), 0,
+ "Minimum accepted timeout for a connection");
+
+VNET_DEFINE(int, uto_max_timeout) = 600;
+SYSCTL_VNET_INT(_net_inet_tcp_uto, OID_AUTO, max_timeout, CTLFLAG_RW,
+ &VNET_NAME(uto_max_timeout), 0,
+ "Maximum accepted timeout for a connection");
+
+
/*
* Minimum MSS we accept and use. This prevents DoS attacks where
* we are forced to a ridiculous low MSS like 20 and send hundreds
==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_syncache.c#2 (text+ko) ====
@@ -827,6 +827,14 @@
#endif
if (sc->sc_flags & SCF_SACK)
tp->t_flags |= TF_SACK_PERMIT;
+ if (sc->sc_flags & SCF_SND_UTO) {
+ tp->t_flags |= TF_SND_UTO;
+ tp->snd_uto = sc->sc_snd_uto;
+ }
+ if (sc->sc_flags & SCF_RCV_UTO) {
+ tp->t_flags |= TF_RCV_UTO;
+ tp->rcv_uto = sc->sc_rcv_uto;
+ }
}
if (sc->sc_flags & SCF_ECN)
@@ -1039,6 +1047,14 @@
struct syncache scs;
struct ucred *cred;
+ /*
+ * The client may have sent us a UTO suggestion; even if it hasn't,
+ * we need to inherit the current disposition (i.e. will the resulting
+ * socket accept suggestions?).
+ */
+ uint8_t rcv_uto_tf = 0;
+ uint32_t rcv_uto = 0;
+
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp); /* listen socket */
KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_SYN,
@@ -1063,6 +1079,19 @@
sb_hiwat = so->so_rcv.sb_hiwat;
ltflags = (tp->t_flags & (TF_NOOPT | TF_SIGNATURE));
+ /* Set User Timeout to send in SYN-ACK. */
+ if (tp->t_flags & TF_SND_UTO) {
+ /* Also inherited after connection is established. */
+ to->to_uto = tp->snd_uto;
+ to->to_flags |= TOF_UTO;
+ }
+
+ if (tp->t_flags & TF_RCV_UTO) {
+ /* Remember received timeout to pass on. */
+ rcv_uto_tf = 1;
+ rcv_uto = tp->rcv_uto;
+ }
+
/* By the time we drop the lock these should no longer be used. */
so = NULL;
tp = NULL;
@@ -1271,7 +1300,17 @@
sc->sc_flags |= SCF_NOOPT;
if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn)
sc->sc_flags |= SCF_ECN;
+ if (to->to_flags & TOF_UTO) {
+ sc->sc_snd_uto = to->to_uto;
+ sc->sc_flags |= SCF_SND_UTO;
+ }
+ /* Inherit received UTO. */
+ if (rcv_uto_tf) {
+ sc->sc_rcv_uto = rcv_uto;
+ sc->sc_flags |= SCF_RCV_UTO;
+ }
+
if (V_tcp_syncookies) {
syncookie_generate(sch, sc, &flowtmp);
#ifdef INET6
@@ -1438,6 +1477,10 @@
}
if (sc->sc_flags & SCF_SACK)
to.to_flags |= TOF_SACKPERM;
+ if (sc->sc_flags & SCF_SND_UTO) {
+ to.to_uto = sc->sc_snd_uto;
+ to.to_flags |= TOF_UTO;
+ }
#ifdef TCP_SIGNATURE
if (sc->sc_flags & SCF_SIGNATURE)
to.to_flags |= TOF_SIGNATURE;
==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_syncache.h#2 (text+ko) ====
@@ -82,7 +82,8 @@
struct label *sc_label; /* MAC label reference */
struct ucred *sc_cred; /* cred cache for jail checks */
- u_int32_t sc_spare[2]; /* UTO */
+ u_int32_t sc_snd_uto; /* user timeout to send */
+ u_int32_t sc_rcv_uto; /* user timeout received */
};
/*
@@ -96,6 +97,8 @@
#define SCF_SIGNATURE 0x20 /* send MD5 digests */
#define SCF_SACK 0x80 /* send SACK option */
#define SCF_ECN 0x100 /* send ECN setup packet */
+#define SCF_SND_UTO 0x200 /* send UTO */
+#define SCF_RCV_UTO 0x400 /* receive UTO suggestions */
#define SYNCOOKIE_SECRET_SIZE 8 /* dwords */
#define SYNCOOKIE_LIFETIME 16 /* seconds */
==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_timer.c#2 (text+ko) ====
@@ -67,6 +67,9 @@
#include <netinet/tcp_debug.h>
#endif
+/* XXX-CN this will have to move */
+#define ticks_to_secs(t) ((t) / hz)
+
int tcp_keepinit;
SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit, CTLTYPE_INT|CTLFLAG_RW,
&tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "time to establish connection");
@@ -309,6 +312,18 @@
return;
}
callout_deactivate(&tp->t_timers->tt_keep);
+ if ((tp->snd_uto) || ((tp->t_flags & TF_RCV_UTO) && tp->rcv_uto)) {
+ /*
+ * This connection is using UTO (either sending or has
+ * received a value). We need to stop sending keepalives
+ * (RFC 5482 4.2).
+ * Returning without resetting the timer.
+ */
+ INP_WUNLOCK(inp);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ CURVNET_RESTORE();
+ return;
+ }
/*
* Keep-alive timer went off; send something
* or drop connection if idle for too long.
@@ -447,6 +462,7 @@
int rexmt;
int headlocked;
struct inpcb *inp;
+ int uto_left = 0;
#ifdef TCPDEBUG
int ostate;
@@ -477,12 +493,45 @@
}
callout_deactivate(&tp->t_timers->tt_rexmt);
tcp_free_sackholes(tp);
+
+ if (tp->t_rxtshift == 0)
+ /* UTO starting again since it's the first retransmit. */
+ tp->t_suto = 0;
+
+ if (tp->snd_uto || ((tp->t_flags & TF_RCV_UTO) && tp->rcv_uto)) {
+ /*
+ * Since we're using UTO for this connection we need to
+ * compute how much time we've got left.
+ */
+ uto_left = 0;
+ if (tp->t_flags & TF_RCV_UTO)
+ /* Clamping the received value. */
+ uto_left = min(V_uto_max_timeout,
+ max(V_uto_min_timeout, tp->rcv_uto));
+
+ /* Taking the longer timeout. */
+ uto_left = max(tp->snd_uto, uto_left);
+
+ /* Subtract time that has passed since the first retransmit. */
+ if (tp->t_suto)
+ uto_left -= ticks_to_secs(ticks - tp->t_suto);
+
+ /*
+ * The user may choose a value that's less than TCP_MAXRXTSHIFT
+ * retransmits.
+ */
+ if (uto_left <= 0)
+ /* Before or after the retransmits, UTO was exceeded. */
+ goto timeoutdrop;
+ }
+
/*
* Retransmission timer went off. Message has not
* been acked within retransmit interval. Back off
* to a longer retransmit interval and retransmit one segment.
*/
- if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) {
+ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT && uto_left <= 0) {
+timeoutdrop:
tp->t_rxtshift = TCP_MAXRXTSHIFT;
TCPSTAT_INC(tcps_timeoutdrop);
in_pcbref(inp);
@@ -525,13 +574,22 @@
tp->t_flags &= ~TF_WASCRECOVERY;
tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
tp->t_flags |= TF_PREVVALID;
+ tp->t_suto = ticks; /* Keep track of UTO start. */
} else
tp->t_flags &= ~TF_PREVVALID;
TCPSTAT_INC(tcps_rexmttimeo);
if (tp->t_state == TCPS_SYN_SENT)
rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
- else
- rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+ else {
+ if (tp->t_rxtshift <= TCP_MAXRXTSHIFT)
+ rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
+ else
+ rexmt = TCPTV_REXMTMAX;
+ }
+ /* We might want to wait less than an entire backoff. */
+ if (uto_left)
+ rexmt = min(rexmt, uto_left * hz);
+
TCPT_RANGESET(tp->t_rxtcur, rexmt,
tp->t_rttmin, TCPTV_REXMTMAX);
/*
==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_usrreq.c#2 (text+ko) ====
@@ -1322,6 +1322,44 @@
INP_WUNLOCK(inp);
break;
#endif /* TCP_SIGNATURE */
+ case TCP_SNDUTO_TIMEOUT:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ if (optval == 0) {
+ /* Disable sending the option. */
+ tp->t_flags &= ~TF_SND_UTO;
+ tp->snd_uto = 0;
+ } else if (optval >= V_uto_min_timeout &&
+ optval <= V_uto_max_timeout) {
+ /* The timeout is acceptable. */
+ tp->snd_uto = optval;
+ tp->t_flags |= TF_SND_UTO;
+ } else
+ error = EINVAL;
+
+ INP_WUNLOCK(inp);
+ break;
+
+ case TCP_RCVUTO_TIMEOUT:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof optval,
+ sizeof optval);
+ if (error)
+ return (error);
+
+ INP_WLOCK_RECHECK(inp);
+ if (optval <= 0)
+ /* This connection will ignore suggestions. */
+ tp->t_flags &= ~TF_RCV_UTO;
+ else
+ tp->t_flags |= TF_RCV_UTO;
+ INP_WUNLOCK(inp);
+ break;
case TCP_NODELAY:
case TCP_NOOPT:
INP_WUNLOCK(inp);
@@ -1454,7 +1492,16 @@
error = sooptcopyout(sopt, &optval, sizeof optval);
break;
#endif
-
+ case TCP_SNDUTO_TIMEOUT:
+ optval = tp->snd_uto;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
+ case TCP_RCVUTO_TIMEOUT:
+ optval = tp->rcv_uto;
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, &optval, sizeof optval);
+ break;
case TCP_NODELAY:
optval = tp->t_flags & TF_NODELAY;
INP_WUNLOCK(inp);
==== //depot/projects/soc2011/cnicutar_tcputo_9/src/sys/netinet/tcp_var.h#2 (text+ko) ====
@@ -203,9 +203,13 @@
struct cc_var *ccv; /* congestion control specific vars */
struct osd *osd; /* storage for Khelp module data */
- uint32_t t_ispare[12]; /* 4 keep timers, 5 UTO, 3 TBD */
+ uint32_t t_ispare[9]; /* 4 keep timers, 2 UTO, 3 TBD */
void *t_pspare2[4]; /* 4 TBD */
uint64_t _pad[6]; /* 6 TBD (1-2 CC/RTT?) */
+
+ uint32_t snd_uto; /* sent timeout */
+ uint32_t rcv_uto; /* received suggestion from peer */
+ int t_suto; /* uto starting time */
};
/*
@@ -225,6 +229,8 @@
#define TF_NEEDFIN 0x000800 /* send FIN (implicit state) */
#define TF_NOPUSH 0x001000 /* don't push */
#define TF_PREVVALID 0x002000 /* saved values for bad rxmit valid */
+#define TF_SND_UTO 0x004000 /* send UTO option */
+#define TF_RCV_UTO 0x008000 /* accept UTO suggestions */
#define TF_MORETOCOME 0x010000 /* More data to be appended to sock */
#define TF_LQ_OVERFLOW 0x020000 /* listen queue overflow */
#define TF_LASTIDLE 0x040000 /* connection was previously idle */
@@ -292,7 +298,8 @@
#define TOF_TS 0x0010 /* timestamp */
#define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */
#define TOF_SACK 0x0080 /* Peer sent SACK option */
-#define TOF_MAXOPT 0x0100
+#define TOF_UTO 0x0100 /* user timeout option */
+#define TOF_MAXOPT 0x0200
u_int32_t to_tsval; /* new timestamp */
u_int32_t to_tsecr; /* reflected timestamp */
u_char *to_sacks; /* pointer to the first SACK blocks */
@@ -300,7 +307,7 @@
u_int16_t to_mss; /* maximum segment size */
u_int8_t to_wscale; /* window scaling */
u_int8_t to_nsacks; /* number of SACK blocks */
- u_int32_t to_spare; /* UTO */
+ u_int32_t to_uto; /* UTO */
};
/*
@@ -308,6 +315,12 @@
*/
#define TO_SYN 0x01 /* parse SYN-only options */
+/*
+ * Values for TCP UTO.
+ */
+#define UTO_MINS 0x8000 /* Highest bit set means "minutes". */
+#define UTO_MINS_TH 3600 /* Send minutes if > one hour. */
+
struct hc_metrics_lite { /* must stay in sync with hc_metrics */
u_long rmx_mtu; /* MTU for this path */
u_long rmx_ssthresh; /* outbound gateway buffer limit */
@@ -611,6 +624,10 @@
VNET_DECLARE(int, ss_fltsz_local);
VNET_DECLARE(int, tcp_do_rfc3465);
VNET_DECLARE(int, tcp_abc_l_var);
+VNET_DECLARE(int, uto_enable);
+VNET_DECLARE(int, uto_min_timeout);
+VNET_DECLARE(int, uto_max_timeout);
+
#define V_tcb VNET(tcb)
#define V_tcbinfo VNET(tcbinfo)
#define V_tcpstat VNET(tcpstat)
@@ -623,6 +640,9 @@
#define V_ss_fltsz_local VNET(ss_fltsz_local)
#define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465)
#define V_tcp_abc_l_var VNET(tcp_abc_l_var)
+#define V_uto_enable VNET(uto_enable)
+#define V_uto_min_timeout VNET(uto_min_timeout)
+#define V_uto_max_timeout VNET(uto_max_timeout)
VNET_DECLARE(int, tcp_do_sack); /* SACK enabled/disabled */
VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */
More information about the p4-projects
mailing list