PERFORCE change 113893 for review

Roman Divacky rdivacky at FreeBSD.org
Fri Feb 2 09:19:43 UTC 2007


http://perforce.freebsd.org/chv.cgi?CH=113893

Change 113893 by rdivacky at rdivacky_witten on 2007/02/02 09:18:56

	IFC

Affected files ...

.. //depot/projects/linuxolator/src/sys/arm/xscale/ixp425/avila_machdep.c#3 integrate
.. //depot/projects/linuxolator/src/sys/compat/linux/linux_emul.c#32 integrate
.. //depot/projects/linuxolator/src/sys/dev/iwi/if_iwi.c#6 integrate
.. //depot/projects/linuxolator/src/sys/kern/sched_4bsd.c#10 integrate
.. //depot/projects/linuxolator/src/sys/kern/subr_witness.c#6 integrate
.. //depot/projects/linuxolator/src/sys/kern/uipc_socket.c#10 integrate
.. //depot/projects/linuxolator/src/sys/net80211/_ieee80211.h#4 integrate
.. //depot/projects/linuxolator/src/sys/netinet/tcp_input.c#7 integrate
.. //depot/projects/linuxolator/src/sys/netinet/tcp_output.c#5 integrate
.. //depot/projects/linuxolator/src/sys/netinet/tcp_syncache.c#5 integrate
.. //depot/projects/linuxolator/src/sys/netinet/tcp_usrreq.c#4 integrate
.. //depot/projects/linuxolator/src/sys/netinet/tcp_var.h#3 integrate
.. //depot/projects/linuxolator/src/sys/sun4v/include/intr_machdep.h#3 integrate
.. //depot/projects/linuxolator/src/sys/sun4v/include/smp.h#4 integrate
.. //depot/projects/linuxolator/src/sys/sun4v/sun4v/intr_machdep.c#4 integrate
.. //depot/projects/linuxolator/src/sys/sun4v/sun4v/mp_machdep.c#5 integrate
.. //depot/projects/linuxolator/src/sys/sun4v/sun4v/tte.c#4 integrate
.. //depot/projects/linuxolator/src/sys/sys/socketvar.h#2 integrate

Differences ...

==== //depot/projects/linuxolator/src/sys/arm/xscale/ixp425/avila_machdep.c#3 (text+ko) ====

@@ -49,7 +49,7 @@
 #include "opt_ddb.h"
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/arm/xscale/ixp425/avila_machdep.c,v 1.2 2006/12/06 06:34:54 julian Exp $");
+__FBSDID("$FreeBSD: src/sys/arm/xscale/ixp425/avila_machdep.c,v 1.3 2007/02/02 05:14:21 kevlo Exp $");
 
 #define _ARM32_BUS_DMA_PRIVATE
 #include <sys/param.h>
@@ -274,7 +274,7 @@
 #ifdef DDB
 	vm_offset_t zstart = 0, zend = 0;
 #endif
-	int i = 0;
+	int i;
 	uint32_t fake_preload[35];
 	uint32_t memsize;
 

==== //depot/projects/linuxolator/src/sys/compat/linux/linux_emul.c#32 (text+ko) ====

@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/compat/linux/linux_emul.c,v 1.14 2007/02/01 13:29:27 kib Exp $");
+__FBSDID("$FreeBSD: src/sys/compat/linux/linux_emul.c,v 1.15 2007/02/02 08:58:16 kib Exp $");
 
 #include "opt_compat.h"
 

==== //depot/projects/linuxolator/src/sys/dev/iwi/if_iwi.c#6 (text+ko) ====

@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/dev/iwi/if_iwi.c,v 1.44 2006/12/07 15:24:38 kevlo Exp $");
+__FBSDID("$FreeBSD: src/sys/dev/iwi/if_iwi.c,v 1.45 2007/02/02 05:17:18 kevlo Exp $");
 
 /*-
  * Intel(R) PRO/Wireless 2200BG/2225BG/2915ABG driver
@@ -545,9 +545,10 @@
 	ring->queued = 0;
 	ring->cur = ring->next = 0;
 
-	error = bus_dma_tag_create(NULL, 4, 0, BUS_SPACE_MAXADDR_32BIT,
-	    BUS_SPACE_MAXADDR, NULL, NULL, count * IWI_CMD_DESC_SIZE, 1,
-	    count * IWI_CMD_DESC_SIZE, 0, NULL, NULL, &ring->desc_dmat);
+	error = bus_dma_tag_create(bus_get_dma_tag(sc->sc_dev), 4, 0,
+	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
+	    count * IWI_CMD_DESC_SIZE, 1, count * IWI_CMD_DESC_SIZE, 0, 
+	    NULL, NULL, &ring->desc_dmat);
 	if (error != 0) {
 		device_printf(sc->sc_dev, "could not create desc DMA tag\n");
 		goto fail;
@@ -606,9 +607,10 @@
 	ring->csr_ridx = csr_ridx;
 	ring->csr_widx = csr_widx;
 
-	error = bus_dma_tag_create(NULL, 4, 0, BUS_SPACE_MAXADDR_32BIT,
-	    BUS_SPACE_MAXADDR, NULL, NULL, count * IWI_TX_DESC_SIZE, 1,
-	    count * IWI_TX_DESC_SIZE, 0, NULL, NULL, &ring->desc_dmat);
+	error = bus_dma_tag_create(bus_get_dma_tag(sc->sc_dev), 4, 0,
+	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL,
+	    count * IWI_TX_DESC_SIZE, 1, count * IWI_TX_DESC_SIZE, 0, NULL, 
+	    NULL, &ring->desc_dmat);
 	if (error != 0) {
 		device_printf(sc->sc_dev, "could not create desc DMA tag\n");
 		goto fail;
@@ -636,9 +638,9 @@
 		goto fail;
 	}
 
-	error = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR_32BIT,
-	    BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, IWI_MAX_NSEG,
-	    MCLBYTES, 0, NULL, NULL, &ring->data_dmat);
+	error = bus_dma_tag_create(bus_get_dma_tag(sc->sc_dev), 1, 0,
+	BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES,
+	IWI_MAX_NSEG, MCLBYTES, 0, NULL, NULL, &ring->data_dmat);
 	if (error != 0) {
 		device_printf(sc->sc_dev, "could not create data DMA tag\n");
 		goto fail;
@@ -744,9 +746,9 @@
 		goto fail;
 	}
 
-	error = bus_dma_tag_create(NULL, 1, 0, BUS_SPACE_MAXADDR_32BIT,
-	    BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES, 1, MCLBYTES, 0, NULL,
-	    NULL, &ring->data_dmat);
+	error = bus_dma_tag_create(bus_get_dma_tag(sc->sc_dev), 1, 0,
+	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, MCLBYTES,
+	    1, MCLBYTES, 0, NULL, NULL, &ring->data_dmat);
 	if (error != 0) {
 		device_printf(sc->sc_dev, "could not create data DMA tag\n");
 		goto fail;
@@ -3111,9 +3113,10 @@
 	if (sc->fw_uc.size > sc->fw_dma_size)
 		sc->fw_dma_size = sc->fw_uc.size;
 
-	if (bus_dma_tag_create(NULL, 4, 0, BUS_SPACE_MAXADDR_32BIT,
-	    BUS_SPACE_MAXADDR, NULL, NULL, sc->fw_dma_size, 1, sc->fw_dma_size,
-	    0, NULL, NULL, &sc->fw_dmat) != 0) {
+	if (bus_dma_tag_create(bus_get_dma_tag(sc->sc_dev), 4, 0, 
+	    BUS_SPACE_MAXADDR_32BIT, BUS_SPACE_MAXADDR, NULL, NULL, 
+	    sc->fw_dma_size, 1, sc->fw_dma_size, 0, NULL, NULL, 
+	    &sc->fw_dmat) != 0) {
 		device_printf(sc->sc_dev,
 		    "could not create firmware DMA tag\n");
 		IWI_LOCK(sc);

==== //depot/projects/linuxolator/src/sys/kern/sched_4bsd.c#10 (text+ko) ====

@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.95 2007/01/23 08:46:50 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.96 2007/02/02 05:14:21 julian Exp $");
 
 #include "opt_hwpmc_hooks.h"
 
@@ -866,9 +866,12 @@
 	 * or stopped or any thing else similar.  We never put the idle
 	 * threads on the run queue, however.
 	 */
-	if (td == PCPU_GET(idlethread))
+	if (td->td_flags & TDF_IDLETD) {
 		TD_SET_CAN_RUN(td);
-	else {
+#ifdef SMP
+		idle_cpus_mask &= ~PCPU_GET(cpumask);
+#endif
+	} else {
 		if (TD_IS_RUNNING(td)) {
 			/* Put us back on the run queue. */
 			sched_add(td, (flags & SW_PREEMPT) ?
@@ -901,13 +904,33 @@
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
 
+                /* I feel sleepy */
 		cpu_switch(td, newtd);
+		/*
+		 * Where am I?  What year is it?
+		 * We are in the same thread that went to sleep above,
+		 * but any amount of time may have passed. All our context
+		 * will still be available as will local variables.
+		 * PCPU values however may have changed as we may have
+		 * changed CPU so don't trust cached values of them.
+		 * New threads will go to fork_exit() instead of here
+		 * so if you change things here you may need to change
+		 * things there too.
+		 * If the thread above was exiting it will never wake
+		 * up again here, so either it has saved everything it
+		 * needed to, or the thread_wait() or wait() will
+		 * need to reap it.
+		 */
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
 	}
 
+#ifdef SMP
+	if (td->td_flags & TDF_IDLETD)
+		idle_cpus_mask |= PCPU_GET(cpumask);
+#endif
 	sched_lock.mtx_lock = (uintptr_t)td;
 	td->td_oncpu = PCPU_GET(cpuid);
 }
@@ -1326,18 +1349,9 @@
 {
 	struct proc *p;
 	struct thread *td;
-#ifdef SMP
-	cpumask_t mycpu;
-#endif
 
 	td = curthread;
 	p = td->td_proc;
-#ifdef SMP
-	mycpu = PCPU_GET(cpumask);
-	mtx_lock_spin(&sched_lock);
-	idle_cpus_mask |= mycpu;
-	mtx_unlock_spin(&sched_lock);
-#endif
 	for (;;) {
 		mtx_assert(&Giant, MA_NOTOWNED);
 
@@ -1345,13 +1359,7 @@
 			cpu_idle();
 
 		mtx_lock_spin(&sched_lock);
-#ifdef SMP
-		idle_cpus_mask &= ~mycpu;
-#endif
 		mi_switch(SW_VOL, NULL);
-#ifdef SMP
-		idle_cpus_mask |= mycpu;
-#endif
 		mtx_unlock_spin(&sched_lock);
 	}
 }

==== //depot/projects/linuxolator/src/sys/kern/subr_witness.c#6 (text+ko) ====

@@ -82,7 +82,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_witness.c,v 1.221 2007/01/16 22:56:28 ssouhlal Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_witness.c,v 1.222 2007/02/02 09:02:18 kib Exp $");
 
 #include "opt_ddb.h"
 #include "opt_witness.h"
@@ -370,6 +370,13 @@
 	{ "cdev", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
+	 * kqueue/VFS interaction
+	 */
+	{ "kqueue", &lock_class_mtx_sleep },
+	{ "struct mount mtx", &lock_class_mtx_sleep },
+	{ "vnode interlock", &lock_class_mtx_sleep },
+	{ NULL, NULL },
+	/*
 	 * spin locks
 	 */
 #ifdef SMP

==== //depot/projects/linuxolator/src/sys/kern/uipc_socket.c#10 (text+ko) ====

@@ -95,7 +95,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_socket.c,v 1.289 2007/01/22 14:50:28 andre Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_socket.c,v 1.290 2007/02/01 17:53:40 andre Exp $");
 
 #include "opt_inet.h"
 #include "opt_mac.h"
@@ -368,6 +368,10 @@
 	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
 	    NULL, NULL, NULL);
 	so->so_count = 1;
+	/*
+	 * Auto-sizing of socket buffers is managed by the protocols and
+	 * the appropriate flags must be set in the pru_attach function.
+	 */
 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
 	if (error) {
 		KASSERT(so->so_count == 1, ("socreate: so_count %d",
@@ -442,6 +446,8 @@
 	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
 	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
 	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
+	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
+	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
 	so->so_state |= connstatus;
 	ACCEPT_LOCK();
 	if (connstatus) {
@@ -2116,6 +2122,8 @@
 					error = ENOBUFS;
 					goto bad;
 				}
+				(sopt->sopt_name == SO_SNDBUF ? &so->so_snd :
+				    &so->so_rcv)->sb_flags &= ~SB_AUTOSIZE;
 				break;
 
 			/*

==== //depot/projects/linuxolator/src/sys/net80211/_ieee80211.h#4 (text+ko) ====

@@ -29,7 +29,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/net80211/_ieee80211.h,v 1.6 2007/01/15 01:12:28 sam Exp $
+ * $FreeBSD: src/sys/net80211/_ieee80211.h,v 1.7 2007/02/02 02:45:33 sam Exp $
  */
 #ifndef _NET80211__IEEE80211_H_
 #define _NET80211__IEEE80211_H_
@@ -186,6 +186,8 @@
 	(((_c)->ic_flags & (IEEE80211_CHAN_QUARTER | IEEE80211_CHAN_HALF)) == 0)
 #define	IEEE80211_IS_CHAN_GSM(_c) \
 	(((_c)->ic_flags & IEEE80211_CHAN_GSM) != 0)
+#define	IEEE80211_IS_CHAN_PASSIVE(_c) \
+	(((_c)->ic_flags & IEEE80211_CHAN_PASSIVE) != 0)
 
 /* ni_chan encoding for FH phy */
 #define	IEEE80211_FH_CHANMOD	80

==== //depot/projects/linuxolator/src/sys/netinet/tcp_input.c#7 (text+ko) ====

@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)tcp_input.c	8.12 (Berkeley) 5/24/95
- * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.311 2006/12/12 12:17:56 bz Exp $
+ * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.312 2007/02/01 18:32:13 andre Exp $
  */
 
 #include "opt_ipfw.h"		/* for ipfw_fwd		*/
@@ -161,6 +161,18 @@
 	   &tcp_reass_overflows, 0,
 	   "Global number of TCP Segment Reassembly Queue Overflows");
 
+int	tcp_do_autorcvbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_auto, CTLFLAG_RW,
+	   &tcp_do_autorcvbuf, 0, "Enable automatic receive buffer sizing");
+
+int	tcp_autorcvbuf_inc = 16*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_inc, CTLFLAG_RW,
+	   &tcp_autorcvbuf_inc, 0, "Incrementor step size of automatic receive buffer");
+
+int	tcp_autorcvbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, recvbuf_max, CTLFLAG_RW,
+	   &tcp_autorcvbuf_max, 0, "Max size of automatic receive buffer");
+
 struct inpcbhead tcb;
 #define	tcb6	tcb  /* for KAME src sync over BSD*'s */
 struct inpcbinfo tcbinfo;
@@ -1295,6 +1307,8 @@
 		} else if (th->th_ack == tp->snd_una &&
 		    LIST_EMPTY(&tp->t_segq) &&
 		    tlen <= sbspace(&so->so_rcv)) {
+			int newsize = 0;	/* automatic sockbuf scaling */
+
 			KASSERT(headlocked, ("headlocked"));
 			INP_INFO_WUNLOCK(&tcbinfo);
 			headlocked = 0;
@@ -1321,18 +1335,78 @@
 			tcpstat.tcps_rcvpack++;
 			tcpstat.tcps_rcvbyte += tlen;
 			ND6_HINT(tp);	/* some progress has been done */
-			/*
 #ifdef TCPDEBUG
 			if (so->so_options & SO_DEBUG)
 				tcp_trace(TA_INPUT, ostate, tp,
 				    (void *)tcp_saveipgen, &tcp_savetcp, 0);
 #endif
-			 * Add data to socket buffer.
-			 */
+		/*
+		 * Automatic sizing of receive socket buffer.  Often the send
+		 * buffer size is not optimally adjusted to the actual network
+		 * conditions at hand (delay bandwidth product).  Setting the
+		 * buffer size too small limits throughput on links with high
+		 * bandwidth and high delay (eg. trans-continental/oceanic links).
+		 *
+		 * On the receive side the socket buffer memory is only rarely
+		 * used to any significant extent.  This allows us to be much
+		 * more aggressive in scaling the receive socket buffer.  For
+		 * the case that the buffer space is actually used to a large
+		 * extent and we run out of kernel memory we can simply drop
+		 * the new segments; TCP on the sender will just retransmit it
+		 * later.  Setting the buffer size too big may only consume too
+		 * much kernel memory if the application doesn't read() from
+		 * the socket or packet loss or reordering makes use of the
+		 * reassembly queue.
+		 *
+		 * The criteria to step up the receive buffer one notch are:
+		 *  1. the number of bytes received during the time it takes
+		 *     one timestamp to be reflected back to us (the RTT);
+		 *  2. received bytes per RTT is within seven eighths of the
+		 *     current socket buffer size;
+		 *  3. receive buffer size has not hit maximal automatic size;
+		 *
+		 * This algorithm does one step per RTT at most and only if
+		 * we receive a bulk stream w/o packet losses or reorderings.
+		 * Shrinking the buffer during idle times is not necessary as
+		 * it doesn't consume any memory when idle.
+		 *
+		 * TODO: Only step up if the application is actually serving
+		 * the buffer to better manage the socket buffer resources.
+		 */
+			if (tcp_do_autorcvbuf &&
+			    to.to_tsecr &&
+			    (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+				if (to.to_tsecr > tp->rfbuf_ts &&
+				    to.to_tsecr - tp->rfbuf_ts < hz) {
+					if (tp->rfbuf_cnt >
+					    (so->so_rcv.sb_hiwat / 8 * 7) &&
+					    so->so_rcv.sb_hiwat <
+					    tcp_autorcvbuf_max) {
+						newsize =
+						    min(so->so_rcv.sb_hiwat +
+						    tcp_autorcvbuf_inc,
+						    tcp_autorcvbuf_max);
+					}
+					/* Start over with next RTT. */
+					tp->rfbuf_ts = 0;
+					tp->rfbuf_cnt = 0;
+				} else
+					tp->rfbuf_cnt += tlen;	/* add up */
+			}
+
+			/* Add data to socket buffer. */
 			SOCKBUF_LOCK(&so->so_rcv);
 			if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
 				m_freem(m);
 			} else {
+				/*
+				 * Set new socket buffer size.
+				 * Give up when limit is reached.
+				 */
+				if (newsize)
+					if (!sbreserve_locked(&so->so_rcv,
+					    newsize, so, curthread))
+						so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
 				m_adj(m, drop_hdrlen);	/* delayed header drop */
 				sbappendstream_locked(&so->so_rcv, m);
 			}
@@ -1361,6 +1435,10 @@
 	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
 	}
 
+	/* Reset receive buffer auto scaling when not in bulk receive mode. */
+	tp->rfbuf_ts = 0;
+	tp->rfbuf_cnt = 0;
+
 	switch (tp->t_state) {
 
 	/*

==== //depot/projects/linuxolator/src/sys/netinet/tcp_output.c#5 (text+ko) ====

@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)tcp_output.c	8.4 (Berkeley) 5/24/95
- * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.121 2006/10/22 11:52:16 rwatson Exp $
+ * $FreeBSD: src/sys/netinet/tcp_output.c,v 1.122 2007/02/01 18:32:13 andre Exp $
  */
 
 #include "opt_inet.h"
@@ -110,6 +110,19 @@
 SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
 	&tcp_do_tso, 0, "Enable TCP Segmentation Offload");
 
+int	tcp_do_autosndbuf = 1;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_auto, CTLFLAG_RW,
+	&tcp_do_autosndbuf, 0, "Enable automatic send buffer sizing");
+
+int	tcp_autosndbuf_inc = 8*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_inc, CTLFLAG_RW,
+	&tcp_autosndbuf_inc, 0, "Incrementor step size of automatic send buffer");
+
+int	tcp_autosndbuf_max = 256*1024;
+SYSCTL_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
+	&tcp_autosndbuf_max, 0, "Max size of automatic send buffer");
+
+
 /*
  * Tcp output routine: figure out what should be sent and send it.
  */
@@ -380,11 +393,60 @@
 		}
 	}
 
+	/* len will be >= 0 after this point. */
+	KASSERT(len >= 0, ("%s: len < 0", __func__));
+
+	/*
+	 * Automatic sizing of send socket buffer.  Often the send buffer
+	 * size is not optimally adjusted to the actual network conditions
+	 * at hand (delay bandwidth product).  Setting the buffer size too
+	 * small limits throughput on links with high bandwidth and high
+	 * delay (eg. trans-continental/oceanic links).  Setting the
+	 * buffer size too big consumes too much real kernel memory,
+	 * especially with many connections on busy servers.
+	 *
+	 * The criteria to step up the send buffer one notch are:
+	 *  1. receive window of remote host is larger than send buffer
+	 *     (with a fudge factor of 5/4th);
+	 *  2. send buffer is filled to 7/8th with data (so we actually
+	 *     have data to make use of it);
+	 *  3. send buffer fill has not hit maximal automatic size;
+	 *  4. our send window (slow start and congestion controlled) is
+	 *     larger than sent but unacknowledged data in send buffer.
+	 *
+	 * The remote host receive window scaling factor may limit the
+	 * growing of the send buffer before it reaches its allowed
+	 * maximum.
+	 *
+	 * It scales directly with slow start or congestion window
+	 * and does at most one step per received ACK.  This fast
+	 * scaling has the drawback of growing the send buffer beyond
+	 * what is strictly necessary to make full use of a given
+	 * delay*bandwidth product.  However testing has shown this not
+	 * to be much of a problem.  At worst we are trading wasting
+	 * of available bandwidth (the non-use of it) for wasting some
+	 * socket buffer memory.
+	 *
+	 * TODO: Shrink send buffer during idle periods together
+	 * with congestion window.  Requires another timer.  Has to
+	 * wait for upcoming tcp timer rewrite.
+	 */
+	if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
+		if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
+		    so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
+		    so->so_snd.sb_cc < tcp_autosndbuf_max &&
+		    sendwin >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
+			if (!sbreserve_locked(&so->so_snd,
+			    min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
+			     tcp_autosndbuf_max), so, curthread))
+				so->so_snd.sb_flags &= ~SB_AUTOSIZE;
+		}
+	}
+
 	/*
-	 * len will be >= 0 after this point.  Truncate to the maximum
-	 * segment length or enable TCP Segmentation Offloading (if supported
-	 * by hardware) and ensure that FIN is removed if the length no longer
-	 * contains the last data byte.
+	 * Truncate to the maximum segment length or enable TCP Segmentation
+	 * Offloading (if supported by hardware) and ensure that FIN is removed
+	 * if the length no longer contains the last data byte.
 	 *
 	 * TSO may only be used if we are in a pure bulk sending state.  The
 	 * presence of TCP-MD5, SACK retransmits, SACK advertizements and
@@ -606,6 +668,10 @@
 		optlen += TCPOLEN_TSTAMP_APPA;
 	}
 
+	/* Set receive buffer autosizing timestamp. */
+	if (tp->rfbuf_ts == 0 && (so->so_rcv.sb_flags & SB_AUTOSIZE))
+		tp->rfbuf_ts = ticks;
+
 #ifdef TCP_SIGNATURE
 #ifdef INET6
 	if (!isipv6)

==== //depot/projects/linuxolator/src/sys/netinet/tcp_syncache.c#5 (text+ko) ====

@@ -29,7 +29,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/netinet/tcp_syncache.c,v 1.103 2006/12/13 06:00:56 csjp Exp $
+ * $FreeBSD: src/sys/netinet/tcp_syncache.c,v 1.104 2007/02/01 17:39:18 andre Exp $
  */
 
 #include "opt_inet.h"
@@ -1014,9 +1014,15 @@
 		if (to->to_flags & TOF_SCALE) {
 			int wscale = 0;
 
-			/* Compute proper scaling value from buffer space */
+			/*
+			 * Compute proper scaling value from buffer space.
+			 * Leave enough room for the socket buffer to grow
+			 * with auto sizing.  This allows us to scale the
+			 * receive buffer over a wide range while not losing
+			 * any efficiency or fine granularity.
+			 */
 			while (wscale < TCP_MAX_WINSHIFT &&
-			    (TCP_MAXWIN << wscale) < sb_hiwat)
+			    (0x1 << wscale) < tcp_minmss)
 				wscale++;
 			sc->sc_requested_r_scale = wscale;
 			sc->sc_requested_s_scale = to->to_requested_s_scale;

==== //depot/projects/linuxolator/src/sys/netinet/tcp_usrreq.c#4 (text+ko) ====

@@ -29,7 +29,7 @@
  * SUCH DAMAGE.
  *
  *	From: @(#)tcp_usrreq.c	8.2 (Berkeley) 1/3/94
- * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.142 2006/11/22 17:16:54 sam Exp $
+ * $FreeBSD: src/sys/netinet/tcp_usrreq.c,v 1.144 2007/02/01 18:32:13 andre Exp $
  */
 
 #include "opt_inet.h"
@@ -1131,9 +1131,14 @@
 	inp->inp_laddr = laddr;
 	in_pcbrehash(inp);
 
-	/* Compute window scaling to request.  */
+	/*
+	 * Compute window scaling to request:
+	 * Scale to fit into sweet spot.  See tcp_syncache.c.
+	 * XXX: This should move to tcp_output().
+	 * XXX: This should be based on the actual MSS.
+	 */
 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
-	    (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
+	    (0x1 << tp->request_r_scale) < tcp_minmss)
 		tp->request_r_scale++;
 
 	soisconnecting(so);
@@ -1441,6 +1446,8 @@
 		if (error)
 			return (error);
 	}
+	so->so_rcv.sb_flags |= SB_AUTOSIZE;
+	so->so_snd.sb_flags |= SB_AUTOSIZE;
 	INP_INFO_WLOCK(&tcbinfo);
 	error = in_pcballoc(so, &tcbinfo);
 	if (error) {

==== //depot/projects/linuxolator/src/sys/netinet/tcp_var.h#3 (text+ko) ====

@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)tcp_var.h	8.4 (Berkeley) 5/24/95
- * $FreeBSD: src/sys/netinet/tcp_var.h,v 1.137 2006/09/13 13:08:27 andre Exp $
+ * $FreeBSD: src/sys/netinet/tcp_var.h,v 1.138 2007/02/01 18:32:13 andre Exp $
  */
 
 #ifndef _NETINET_TCP_VAR_H_
@@ -202,6 +202,8 @@
 					   episode starts at this seq number */
 	struct sackhint	sackhint;	/* SACK scoreboard hint */
 	int	t_rttlow;		/* smallest observerved RTT */
+	u_int32_t	rfbuf_ts;	/* recv buffer autoscaling timestamp */
+	int	rfbuf_cnt;		/* recv buffer autoscaling byte count */
 };
 
 #define IN_FASTRECOVERY(tp)	(tp->t_flags & TF_FASTRECOVERY)

==== //depot/projects/linuxolator/src/sys/sun4v/include/intr_machdep.h#3 (text+ko) ====

@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/sun4v/include/intr_machdep.h,v 1.2 2007/01/19 11:15:33 marius Exp $
+ * $FreeBSD: src/sys/sun4v/include/intr_machdep.h,v 1.3 2007/02/02 05:00:21 kmacy Exp $
  */
 
 #ifndef	_MACHINE_INTR_MACHDEP_H_
@@ -46,6 +46,7 @@
 #define	PIL_RENDEZVOUS	3	/* smp rendezvous ipi */
 #define	PIL_AST		4	/* ast ipi */
 #define	PIL_STOP	5	/* stop cpu ipi */
+#define	PIL_PREEMPT	6	/* preempt idle thread cpu ipi */
 #define	PIL_FAST	13	/* fast interrupts */
 #define	PIL_TICK	14
 

==== //depot/projects/linuxolator/src/sys/sun4v/include/smp.h#4 (text+ko) ====

@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/sun4v/include/smp.h,v 1.3 2006/12/25 02:05:52 kmacy Exp $
+ * $FreeBSD: src/sys/sun4v/include/smp.h,v 1.4 2007/02/02 05:00:21 kmacy Exp $
  */
 
 #ifndef	_MACHINE_SMP_H_
@@ -44,7 +44,9 @@
 #define	IPI_AST		PIL_AST
 #define	IPI_RENDEZVOUS	PIL_RENDEZVOUS
 #define	IPI_STOP	PIL_STOP
+#define IPI_PREEMPT     PIL_PREEMPT
 
+
 #define	IPI_RETRIES	5000
 
 struct cpu_start_args {
@@ -79,6 +81,7 @@
 
 void cpu_ipi_ast(struct trapframe *tf);
 void cpu_ipi_stop(struct trapframe *tf);
+void cpu_ipi_preempt(struct trapframe *tf);
 
 void	ipi_selected(u_int cpus, u_int ipi);
 void	ipi_all(u_int ipi);

==== //depot/projects/linuxolator/src/sys/sun4v/sun4v/intr_machdep.c#4 (text+ko) ====

@@ -59,7 +59,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/sun4v/sun4v/intr_machdep.c,v 1.3 2006/11/24 05:27:49 kmacy Exp $");
+__FBSDID("$FreeBSD: src/sys/sun4v/sun4v/intr_machdep.c,v 1.4 2007/02/02 05:00:21 kmacy Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -114,7 +114,8 @@
 	"rndzvs",	/* PIL_RENDEZVOUS */
 	"ast",		/* PIL_AST */
 	"stop",		/* PIL_STOP */
-	"stray", "stray", "stray", "stray", "stray", "stray", "stray",
+	"preempt",      /* PIL_PREEMPT */
+	"stray", "stray", "stray", "stray", "stray", "stray",
 	"fast",		/* PIL_FAST */
 	"tick",		/* PIL_TICK */
 };
@@ -266,6 +267,7 @@
 	intr_handlers[PIL_AST] = cpu_ipi_ast;
 	intr_handlers[PIL_RENDEZVOUS] = (ih_func_t *)smp_rendezvous_action;
 	intr_handlers[PIL_STOP]= cpu_ipi_stop;
+	intr_handlers[PIL_PREEMPT]= cpu_ipi_preempt;
 #endif
 	mtx_init(&intr_table_lock, "intr table", NULL, MTX_SPIN);
 	cpu_intrq_alloc();

==== //depot/projects/linuxolator/src/sys/sun4v/sun4v/mp_machdep.c#5 (text+ko) ====

@@ -55,7 +55,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/sun4v/sun4v/mp_machdep.c,v 1.5 2006/12/17 01:31:56 kmacy Exp $");
+__FBSDID("$FreeBSD: src/sys/sun4v/sun4v/mp_machdep.c,v 1.6 2007/02/02 05:00:21 kmacy Exp $");
 
 #include "opt_trap_trace.h"
 
@@ -456,6 +456,20 @@
 }
 
 void
+cpu_ipi_preempt(struct trapframe *tf)
+{
+	struct thread *running_thread = curthread;
+
+	mtx_lock_spin(&sched_lock);
+	if (running_thread->td_critnest > 1)
+		running_thread->td_owepreempt = 1;
+	else
+		mi_switch(SW_INVOL | SW_PREEMPT, NULL);
+	mtx_unlock_spin(&sched_lock);
+
+}
+
+void
 cpu_ipi_selected(int cpu_count, uint16_t *cpulist, u_long d0, u_long d1, u_long d2, uint64_t *ackmask)
 {
 

==== //depot/projects/linuxolator/src/sys/sun4v/sun4v/tte.c#4 (text+ko) ====

@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/sun4v/sun4v/tte.c,v 1.3 2006/12/24 08:03:27 kmacy Exp $");
+__FBSDID("$FreeBSD: src/sys/sun4v/sun4v/tte.c,v 1.4 2007/02/02 04:57:11 kmacy Exp $");
 
 #include "opt_ddb.h"
 #include "opt_pmap.h"
@@ -74,7 +74,7 @@
 		PMAP_LOCK(pmap);
 		otte_data = tte_hash_clear_bits(pmap->pm_hash, pv->pv_va, flags);
 		if ((matchbits = (otte_data & active_flags)) != 0) {
-			if (matchbits == VTD_W) 
+			if ((otte_data & (VTD_SW_W|VTD_W)) == (VTD_SW_W|VTD_W)) 
 				vm_page_dirty(m);
 			pmap_invalidate_page(pmap, pv->pv_va, TRUE);
 		}

==== //depot/projects/linuxolator/src/sys/sys/socketvar.h#2 (text+ko) ====

@@ -27,7 +27,7 @@
  * SUCH DAMAGE.
  *
  *	@(#)socketvar.h	8.3 (Berkeley) 2/19/95
- * $FreeBSD: src/sys/sys/socketvar.h,v 1.154 2006/08/01 10:30:26 rwatson Exp $
+ * $FreeBSD: src/sys/sys/socketvar.h,v 1.155 2007/02/01 17:53:41 andre Exp $
  */
 
 #ifndef _SYS_SOCKETVAR_H_
@@ -128,6 +128,7 @@
 #define	SB_NOINTR	0x40		/* operations not interruptible */
 #define SB_AIO		0x80		/* AIO operations queued */
 #define SB_KNOTE	0x100		/* kernel note attached */
+#define	SB_AUTOSIZE	0x800		/* automatically size socket buffer */
 
 	void	(*so_upcall)(struct socket *, void *, int);
 	void	*so_upcallarg;


More information about the p4-projects mailing list