svn commit: r332770 - in head/sys: conf netinet netinet/tcp_stacks sys

Thu Apr 19 13:38:01 UTC 2018

Author: rrs
Date: Thu Apr 19 13:37:59 2018
New Revision: 332770
URL: https://svnweb.freebsd.org/changeset/base/332770

Log:
  This commit brings in the TCP high precision timer system (tcp_hpts).
  It is the forerunner/foundational work of bringing in both Rack and BBR
  which use hpts for pacing out packets. The feature is optional and requires
  the TCPHPTS option to be enabled before the feature will be active. TCP
  modules that use it must ensure that the base component is compiled into
  the kernel in which they are loaded.
  
  MFC after:	Never
  Sponsored by:	Netflix Inc.
  Differential Revision:	https://reviews.freebsd.org/D15020

Added:
  head/sys/netinet/tcp_hpts.c   (contents, props changed)
  head/sys/netinet/tcp_hpts.h   (contents, props changed)
  head/sys/sys/kern_prefetch.h   (contents, props changed)
Modified:
  head/sys/conf/files
  head/sys/conf/options
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h
  head/sys/netinet/tcp_stacks/fastpath.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_syncache.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h
  head/sys/sys/mbuf.h

Modified: head/sys/conf/files
==============================================================================

--- head/sys/conf/files	Thu Apr 19 12:50:49 2018	(r332769)
+++ head/sys/conf/files	Thu Apr 19 13:37:59 2018	(r332770)
@@ -4355,6 +4355,7 @@ netinet/tcp_log_buf.c		optional tcp_blackbox inet | tc
 netinet/tcp_lro.c		optional inet | inet6
 netinet/tcp_output.c		optional inet | inet6
 netinet/tcp_offload.c		optional tcp_offload inet | tcp_offload inet6
+netinet/tcp_hpts.c              optional tcphpts inet | tcphpts inet6
 netinet/tcp_pcap.c		optional inet tcppcap | inet6 tcppcap
 netinet/tcp_reass.c		optional inet | inet6
 netinet/tcp_sack.c		optional inet | inet6

Modified: head/sys/conf/options
==============================================================================
--- head/sys/conf/options	Thu Apr 19 12:50:49 2018	(r332769)
+++ head/sys/conf/options	Thu Apr 19 13:37:59 2018	(r332770)
@@ -218,6 +218,7 @@ SYSVMSG		opt_sysvipc.h
 SYSVSEM		opt_sysvipc.h
 SYSVSHM		opt_sysvipc.h
 SW_WATCHDOG	opt_watchdog.h
+TCPHPTS         opt_inet.h
 TURNSTILE_PROFILING
 UMTX_PROFILING
 UMTX_CHAINS	opt_global.h

Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c	Thu Apr 19 12:50:49 2018	(r332769)
+++ head/sys/netinet/in_pcb.c	Thu Apr 19 13:37:59 2018	(r332770)
@@ -58,6 +58,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/domain.h>
 #include <sys/protosw.h>
 #include <sys/rmlock.h>
+#include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/sockio.h>
@@ -87,6 +88,9 @@ __FBSDID("$FreeBSD$");
 #include <netinet/in_pcb.h>
 #include <netinet/ip_var.h>
 #include <netinet/tcp_var.h>
+#ifdef TCPHPTS
+#include <netinet/tcp_hpts.h>
+#endif
 #include <netinet/udp.h>
 #include <netinet/udp_var.h>
 #endif
@@ -1224,9 +1228,28 @@ in_pcbrele_rlocked(struct inpcb *inp)
 		}
 		return (0);
 	}
-
+	
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-
+#ifdef TCPHPTS
+	if (inp->inp_in_hpts || inp->inp_in_input) {
+		struct tcp_hpts_entry *hpts;
+		/*
+		 * We should not be on the hpts at 
+		 * this point in any form. we must
+		 * get the lock to be sure.
+		 */
+		hpts = tcp_hpts_lock(inp);
+		if (inp->inp_in_hpts)
+			panic("Hpts:%p inp:%p at free still on hpts",
+			      hpts, inp);
+		mtx_unlock(&hpts->p_mtx);
+		hpts = tcp_input_lock(inp);
+		if (inp->inp_in_input) 
+			panic("Hpts:%p inp:%p at free still on input hpts",
+			      hpts, inp);
+		mtx_unlock(&hpts->p_mtx);
+	}
+#endif
 	INP_RUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);
@@ -1255,7 +1278,26 @@ in_pcbrele_wlocked(struct inpcb *inp)
 	}
 
 	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
-
+#ifdef TCPHPTS
+	if (inp->inp_in_hpts || inp->inp_in_input) {
+		struct tcp_hpts_entry *hpts;
+		/*
+		 * We should not be on the hpts at 
+		 * this point in any form. we must
+		 * get the lock to be sure.
+		 */
+		hpts = tcp_hpts_lock(inp);
+		if (inp->inp_in_hpts)
+			panic("Hpts:%p inp:%p at free still on hpts",
+			      hpts, inp);
+		mtx_unlock(&hpts->p_mtx);
+		hpts = tcp_input_lock(inp);
+		if (inp->inp_in_input) 
+			panic("Hpts:%p inp:%p at free still on input hpts",
+			      hpts, inp);
+		mtx_unlock(&hpts->p_mtx);
+	}
+#endif
 	INP_WUNLOCK(inp);
 	pcbinfo = inp->inp_pcbinfo;
 	uma_zfree(pcbinfo->ipi_zone, inp);

Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h	Thu Apr 19 12:50:49 2018	(r332769)
+++ head/sys/netinet/in_pcb.h	Thu Apr 19 13:37:59 2018	(r332770)
@@ -156,6 +156,7 @@ struct in_conninfo {
  * from the global list.
  *
  * Key:
+ * (b) - Protected by the hpts lock.
  * (c) - Constant after initialization
  * (g) - Protected by the pcbgroup lock
  * (i) - Protected by the inpcb lock
@@ -164,7 +165,52 @@ struct in_conninfo {
  * (h) - Protected by the pcbhash lock for the inpcb
  * (s) - Protected by another subsystem's locks
  * (x) - Undefined locking
+ * 
+ * Notes on the tcp_hpts:
+ * 
+ * First, the hpts lock order is:
+ * 1) INP_WLOCK()
+ * 2) HPTS_LOCK(), i.e. hpts->p_mtx
  *
+ * To insert a TCB on the hpts you *must* be holding the INP_WLOCK(). 
+ * You may check the inp->inp_in_hpts flag without the hpts lock. 
+ * The hpts is the only one that will clear this flag holding 
+ * only the hpts lock. This means that in your tcp_output()
+ * routine when you test for the inp_in_hpts flag to be 1 
+ * it may be transitioning to 0 (by the hpts). 
+ * That's ok since that will just mean an extra call to tcp_output()
+ * that most likely will find that the call you executed
+ * (when the mismatch occurred) has put the TCB back
+ * on the hpts, and it will return. If your
+ * call did not add the inp back to the hpts then you will either
+ * over-send or the cwnd will block you from sending more.
+ *
+ * Note you should also be holding the INP_WLOCK() when you
+ * call the remove from the hpts as well. Though usually
+ * you are either doing this from a timer, where you need and have
+ * the INP_WLOCK() or from destroying your TCB where again
+ * you should already have the INP_WLOCK().
+ *
+ * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and 
+ * inp_input_cpu_set fields are controlled completely by
+ * the hpts. Do not ever set these. The inp_hpts_cpu_set
+ * and inp_input_cpu_set fields indicate if the hpts has
+ * setup the respective cpu field. It is advised if this
+ * field is 0, to enqueue the packet with the appropriate
+ * hpts_immediate() call. If the _set field is 1, then
+ * you may compare the inp_*_cpu field to the curcpu and
+ * may want to again insert onto the hpts if these fields
+ * are not equal (i.e. you are not on the expected CPU).
+ *
+ * A note on inp_hpts_calls and inp_input_calls, these
+ * flags are set when the hpts calls either the output
+ * or do_segment routines respectively. If the routine
+ * being called wants to use this, then it needs to
+ * clear the flag before returning. The hpts will not
+ * clear the flag. The flags can be used to tell if
+ * the hpts is the function calling the respective
+ * routine.
+ *
  * A few other notes:
  *
  * When a read lock is held, stability of the field is guaranteed; to write
@@ -190,14 +236,45 @@ struct inpcb {
 	LIST_ENTRY(inpcb) inp_pcbgrouphash;	/* (g/i) hash list */
 	struct rwlock	inp_lock;
 	/* Cache line #2 (amd64) */
-#define	inp_start_zero	inp_refcount
+#define	inp_start_zero	inp_hpts
 #define	inp_zero_size	(sizeof(struct inpcb) - \
 			    offsetof(struct inpcb, inp_start_zero))
+	TAILQ_ENTRY(inpcb) inp_hpts;	/* pacing out queue next lock(b) */
+
+	uint32_t inp_hpts_request;	/* Current hpts request, zero if
+					 * fits in the pacing window (i&b). */
+	/*
+	 * Note the next fields are protected by a
+	 * different lock (hpts-lock). This means that
+	 * they must correspond in size to the smallest
+	 * protectable bit field (uint8_t on x86, and
+	 * other platforms potentially uint32_t?). Also,
+	 * since CPU switches can occur at different times, the two
+	 * fields can *not* be collapsed into a single bit field.
+	 */
+#if defined(__amd64__) || defined(__i386__)	
+	volatile uint8_t inp_in_hpts; /* on output hpts (lock b) */
+	volatile uint8_t inp_in_input; /* on input hpts (lock b) */
+#else
+	volatile uint32_t inp_in_hpts; /* on output hpts (lock b) */
+	volatile uint32_t inp_in_input; /* on input hpts (lock b) */
+#endif
+	volatile uint16_t  inp_hpts_cpu; /* Lock (i) */
 	u_int	inp_refcount;		/* (i) refcount */
 	int	inp_flags;		/* (i) generic IP/datagram flags */
 	int	inp_flags2;		/* (i) generic IP/datagram flags #2*/
+	volatile uint16_t  inp_input_cpu; /* Lock (i) */
+	volatile uint8_t inp_hpts_cpu_set :1,  /* on output hpts (i) */
+			 inp_input_cpu_set : 1,	/* on input hpts (i) */
+			 inp_hpts_calls :1,	/* (i) from output hpts */
+			 inp_input_calls :1,	/* (i) from input hpts */
+			 inp_spare_bits2 : 4;
+	uint8_t inp_spare_byte;		/* Compiler hole */
 	void	*inp_ppcb;		/* (i) pointer to per-protocol pcb */
 	struct	socket *inp_socket;	/* (i) back pointer to socket */
+	uint32_t 	 inp_hptsslot;	/* Hpts wheel slot this tcb is Lock(i&b) */
+	uint32_t         inp_hpts_drop_reas;	/* reason we are dropping the PCB (lock i&b) */
+	TAILQ_ENTRY(inpcb) inp_input;	/* pacing in  queue next lock(b) */
 	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
 	struct	inpcbgroup *inp_pcbgroup; /* (g/i) PCB group list */
 	LIST_ENTRY(inpcb) inp_pcbgroup_wild; /* (g/i/h) group wildcard entry */
@@ -638,6 +715,7 @@ short	inp_so_options(const struct inpcb *inp);
 #define	INP_RECVRSSBUCKETID	0x00000200 /* populate recv datagram with bucket id */
 #define	INP_RATE_LIMIT_CHANGED	0x00000400 /* rate limit needs attention */
 #define	INP_ORIGDSTADDR		0x00000800 /* receive IP dst address/port */
+#define INP_CANNOT_DO_ECN	0x00001000 /* The stack does not do ECN */
 
 /*
  * Flags passed to in_pcblookup*() functions.

Added: head/sys/netinet/tcp_hpts.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/sys/netinet/tcp_hpts.c	Thu Apr 19 13:37:59 2018	(r332770)
@@ -0,0 +1,1964 @@
+/*-
+ * Copyright (c) 2016-8
+ *	Netflix Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+/**
+ * Some notes about usage.
+ *
+ * The tcp_hpts system is designed to provide a high precision timer
+ * system for tcp. Its main purpose is to provide a mechanism for 
+ * pacing packets out onto the wire. It can be used in two ways
+ * by a given TCP stack (and those two methods can be used simultaneously).
+ *
+ * First, and probably the main thing it's used by Rack and BBR for, it can
+ * be used to call tcp_output() of a transport stack at some time in the future.
+ * The normal way this is done is that tcp_output() of the stack schedules
+ * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
+ * slot is the time from now that the stack wants to be called but it
+ * must be converted to tcp_hpts's notion of slot. This is done with
+ * one of the macros HPTS_MS_TO_SLOTS or HPTS_USEC_TO_SLOTS. So a typical
+ * call from the tcp_output() routine might look like:
+ *
+ * tcp_hpts_insert(tp, HPTS_USEC_TO_SLOTS(550));
+ *
+ * The above would schedule tcp_output() to be called in 550 microseconds.
+ * Note that if using this mechanism the stack will want to add near
+ * its top a check to prevent unwanted calls (from user land or the
+ * arrival of incoming ack's). So it would add something like:
+ *
+ * if (inp->inp_in_hpts)
+ *    return;
+ *
+ * to prevent output processing until the time allotted has gone by.
+ * Of course this is a bare bones example and the stack will probably
+ * have more considerations than just the above.
+ *
+ * Now the tcp_hpts system will call tcp_output in one of two forms:
+ * it will first check to see if the stack has defined a
+ * tfb_tcp_output_wtime() function, if so that is the routine it
+ * will call, if that function is not defined then it will call the
+ * tfb_tcp_output() function. The only difference between these
+ * two calls is that the former passes the time in to the function
+ * so the function does not have to access the time (which tcp_hpts
+ * already has). What these functions do is of course totally up
+ * to the individual tcp stack.
+ *
+ * Now the second function (actually two functions I guess :D)
+ * the tcp_hpts system provides is the  ability to either abort 
+ * a connection (later) or process  input on a connection. 
+ * Why would you want to do this? To keep processor locality.
+ *
+ * So in order to use the input redirection function the
+ * stack changes its tcp_do_segment() routine to instead
+ * of process the data call the function:
+ *
+ * tcp_queue_pkt_to_input()
+ *
+ * You will note that the arguments to this function look
+ * a lot like tcp_do_segment()'s arguments. This function
+ * will ensure that the tcp_hpts system will
+ * call the function tfb_tcp_hpts_do_segment() from the
+ * correct CPU. Note that multiple calls can get pushed
+ * into the tcp_hpts system this will be indicated by
+ * the next to last argument to tfb_tcp_hpts_do_segment()
+ * (nxt_pkt). If nxt_pkt is a 1 then another packet is
+ * coming. If nxt_pkt is a 0 then this is the last call
+ * that the tcp_hpts system has available for the tcp stack.
+ * 
+ * The other point of the input system is to be able to safely
+ * drop a tcp connection without worrying about the recursive
+ * locking that may be occurring on the INP_WLOCK. So if
+ * a stack wants to drop a connection it calls:
+ *
+ *     tcp_set_inp_to_drop(tp, ETIMEDOUT)
+ * 
+ * To schedule the tcp_hpts system to call 
+ * 
+ *    tcp_drop(tp, drop_reason)
+ *
+ * at a future point. This is quite handy to prevent locking
+ * issues when dropping connections.
+ *
+ */
+
+#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>		/* for proc0 declaration */
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/refcount.h>
+#include <sys/sched.h>
+#include <sys/queue.h>
+#include <sys/smp.h>
+#include <sys/counter.h>
+#include <sys/time.h>
+#include <sys/kthread.h>
+#include <sys/kern_prefetch.h>
+
+#include <vm/uma.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES		/* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>	/* required for icmp_var.h */
+#include <netinet/icmp_var.h>	/* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define	TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/cc/cc.h>
+#include <netinet/tcp_hpts.h>
+
+#ifdef tcpdebug
+#include <netinet/tcp_debug.h>
+#endif				/* tcpdebug */
+#ifdef tcp_offload
+#include <netinet/tcp_offload.h>
+#endif
+
+#ifdef ipsec
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif				/* ipsec */
+#include "opt_rss.h"
+
+MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
+#ifdef RSS
+static int tcp_bind_threads = 1;
+#else
+static int tcp_bind_threads = 0;
+#endif
+TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
+
+static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;
+
+TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
+
+static struct tcp_hptsi tcp_pace;
+
+static int
+tcp_hptsi_lock_inpinfo(struct inpcb *inp,
+    struct tcpcb **tp);
+static void tcp_wakehpts(struct tcp_hpts_entry *p);
+static void tcp_wakeinput(struct tcp_hpts_entry *p);
+static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
+static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
+static void tcp_hpts_thread(void *ctx);
+static void tcp_init_hptsi(void *st);
+
+int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
+static int32_t tcp_hpts_callout_skip_swi = 0;
+
+SYSCTL_DECL(_net_inet_tcp);
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW, 0, "TCP Hpts controls");
+
+#define	timersub(tvp, uvp, vvp)						\
+	do {								\
+		(vvp)->tv_sec = (tvp)->tv_sec - (uvp)->tv_sec;		\
+		(vvp)->tv_usec = (tvp)->tv_usec - (uvp)->tv_usec;	\
+		if ((vvp)->tv_usec < 0) {				\
+			(vvp)->tv_sec--;				\
+			(vvp)->tv_usec += 1000000;			\
+		}							\
+	} while (0)
+
+static int32_t logging_on = 0;
+static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
+static int32_t tcp_hpts_precision = 120;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
+    &tcp_hpts_precision, 120,
+    "Value for PRE() precision of callout");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
+    &logging_on, 0,
+    "Turn on logging if compiled in");
+
+counter_u64_t hpts_loops;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
+    &hpts_loops, "Number of times hpts had to loop to catch up");
+
+counter_u64_t back_tosleep;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
+    &back_tosleep, "Number of times hpts found no tcbs");
+
+static int32_t in_newts_every_tcb = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
+    &in_newts_every_tcb, 0,
+    "Do we have a new cts every tcb we process for input");
+static int32_t in_ts_percision = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
+    &in_ts_percision, 0,
+    "Do we use percise timestamp for clients on input");
+static int32_t out_newts_every_tcb = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
+    &out_newts_every_tcb, 0,
+    "Do we have a new cts every tcb we process for output");
+static int32_t out_ts_percision = 0;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
+    &out_ts_percision, 0,
+    "Do we use a percise timestamp for every output cts");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
+    &hpts_sleep_max, 0,
+    "The maximum time the hpts will sleep <1 - 254>");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
+    &tcp_min_hptsi_time, 0,
+    "The minimum time the hpts must sleep before processing more slots");
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
+    &tcp_hpts_callout_skip_swi, 0,
+    "Do we have the callout call directly to the hpts?");
+
+/*
+ * Append one event record to the circular log of the given hpts.
+ *
+ * The hpts mutex must be held (asserted).  If the hpts has no log buffer
+ * the call does nothing.  "inp" may be NULL, in which case the per-inpcb
+ * fields of the record are written as zero.  "line" is the line number of
+ * the call site, normally supplied by the tcp_hpts_log_it() wrapper macro.
+ */
+static void
+__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
+    uint32_t ticknow, int32_t line)
+{
+	struct hpts_log *rec;
+
+	HPTS_MTX_ASSERT(hpts);
+	if (hpts->p_log == NULL)
+		return;
+
+	/* Take the current record, then advance the index with wrap-around. */
+	rec = &hpts->p_log[hpts->p_log_at];
+	hpts->p_log_at++;
+	if (hpts->p_log_at >= hpts->p_logsize) {
+		hpts->p_log_wrapped = 1;
+		hpts->p_log_at = 0;
+	}
+
+	/* Snapshot the pacing state of the inpcb, if one was supplied. */
+	rec->inp = inp;
+	rec->t_paceslot = (inp != NULL) ? inp->inp_hptsslot : 0;
+	rec->t_hptsreq = (inp != NULL) ? inp->inp_hpts_request : 0;
+	rec->p_onhpts = (inp != NULL) ? inp->inp_in_hpts : 0;
+	rec->p_oninput = (inp != NULL) ? inp->inp_in_input : 0;
+
+	/* What happened, where and when. */
+	rec->is_notempty = 1;
+	rec->event = event;
+	rec->line = line;
+	rec->cts = tcp_get_usecs(NULL);
+	rec->ticknow = ticknow;
+	rec->slot_req = slot;
+
+	/* Snapshot of the hpts wheel state. */
+	rec->p_curtick = hpts->p_curtick;
+	rec->p_prevtick = hpts->p_prevtick;
+	rec->p_on_queue_cnt = hpts->p_on_queue_cnt;
+	rec->p_nxt_slot = hpts->p_nxt_slot;
+	rec->p_cur_slot = hpts->p_cur_slot;
+	rec->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
+
+	/*
+	 * Pack the low 7 bits of the cpu, then the low 7 bits of the hpts
+	 * number, and leave the bottom two bits free for flags.
+	 */
+	rec->p_flags = (hpts->p_cpu & 0x7f);
+	rec->p_flags <<= 7;
+	rec->p_flags |= (hpts->p_num & 0x7f);
+	rec->p_flags <<= 2;
+	if (hpts->p_hpts_active)
+		rec->p_flags |= HPTS_HPTS_ACTIVE;
+}
+
+#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)
+
+/*
+ * Timeout handler (presumably installed on the hpts callout -- confirm
+ * against the caller) that defers the work: "arg" is the hpts entry, and
+ * its software interrupt (ie_cookie) is scheduled.
+ */
+static void
+hpts_timeout_swi(void *arg)
+{
+	struct tcp_hpts_entry *entry = (struct tcp_hpts_entry *)arg;
+
+	swi_sched(entry->ie_cookie, 0);
+}
+
+/*
+ * Timeout handler that does the work directly: "arg" is passed
+ * unchanged to tcp_hpts_thread() instead of scheduling the swi.
+ */
+static void
+hpts_timeout_dir(void *arg)
+{
+
+	tcp_hpts_thread(arg);
+}
+
+/*
+ * Unlink inp from the pacing-wheel bucket "head" and decrement the hpts
+ * queue count.  Under INVARIANTS the caller's locking and bookkeeping are
+ * verified first.  When "clear" is non-zero the inpcb's pending request
+ * and its on-hpts flag are reset as well.
+ */
+static inline void
+hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
+{
+#ifdef INVARIANTS
+	/* The hpts mutex must be owned by the caller. */
+	if (!mtx_owned(&hpts->p_mtx))
+		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+	/* The inpcb must be assigned to the cpu of this hpts. */
+	if (inp->inp_hpts_cpu != hpts->p_cpu)
+		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+	/* The inpcb must actually be queued. */
+	if (inp->inp_in_hpts == 0)
+		panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
+	/* An empty bucket with a non-zero queue count is inconsistent. */
+	if ((hpts->p_on_queue_cnt != 0) && TAILQ_EMPTY(head))
+		panic("%s hpts:%p hpts bucket empty but cnt:%d",
+		    __FUNCTION__, hpts, hpts->p_on_queue_cnt);
+#endif
+	TAILQ_REMOVE(head, inp, inp_hpts);
+	hpts->p_on_queue_cnt -= 1;
+	if (hpts->p_on_queue_cnt < 0) {
+		/* The count must never go negative; repair it if it did. */
+#ifdef INVARIANTS
+		panic("Hpts goes negative inp:%p hpts:%p",
+		    inp, hpts);
+#endif
+		hpts->p_on_queue_cnt = 0;
+	}
+	if (clear != 0) {
+		inp->inp_in_hpts = 0;
+		inp->inp_hpts_request = 0;
+	}
+}
+
+/*
+ * Append inp to the pacing-wheel bucket "head", mark it as being on the
+ * hpts and bump the hpts queue count.  Unless "noref" is set, an inpcb
+ * reference is taken on behalf of the hpts.  Under INVARIANTS the
+ * caller's locking and the inpcb state are verified first.  "line" is
+ * not used here.
+ */
+static inline void
+hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
+{
+#ifdef INVARIANTS
+	/* The hpts mutex must be owned by the caller. */
+	if (!mtx_owned(&hpts->p_mtx))
+		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+	/* The inpcb must be assigned to the cpu of this hpts. */
+	if (inp->inp_hpts_cpu != hpts->p_cpu)
+		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+	/* A referenced insert of an already queued inpcb is a bug. */
+	if ((inp->inp_in_hpts == 1) && (noref == 0))
+		panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
+#endif
+	TAILQ_INSERT_TAIL(head, inp, inp_hpts);
+	inp->inp_in_hpts = 1;
+	hpts->p_on_queue_cnt += 1;
+	if (noref == 0)
+		in_pcbref(inp);
+}
+
+/*
+ * Unlink inp from the input queue of the hpts and decrement the input
+ * queue count.  Under INVARIANTS the caller's locking and bookkeeping
+ * are verified.  When "clear" is non-zero the on-input flag of the inpcb
+ * is reset as well.
+ */
+static inline void
+hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
+{
+#ifdef INVARIANTS
+	/* The hpts mutex must be owned by the caller. */
+	if (!mtx_owned(&hpts->p_mtx))
+		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+	/* The inpcb must be assigned to the cpu of this hpts for input. */
+	if (inp->inp_input_cpu != hpts->p_cpu)
+		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+	/* The inpcb must actually be on the input queue. */
+	if (inp->inp_in_input == 0)
+		panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
+#endif
+	TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
+	hpts->p_on_inqueue_cnt -= 1;
+	if (hpts->p_on_inqueue_cnt < 0) {
+		/* The count must never go negative; repair it if it did. */
+#ifdef INVARIANTS
+		panic("Hpts in goes negative inp:%p hpts:%p",
+		    inp, hpts);
+#endif
+		hpts->p_on_inqueue_cnt = 0;
+	}
+#ifdef INVARIANTS
+	/* An empty input queue with a non-zero count is inconsistent. */
+	if ((hpts->p_on_inqueue_cnt != 0) && TAILQ_EMPTY(&hpts->p_input))
+		panic("%s hpts:%p in_hpts input empty but cnt:%d",
+		    __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
+#endif
+	if (clear != 0)
+		inp->inp_in_input = 0;
+}
+
+/*
+ * Append inp to the input queue of the hpts, mark it as being on the
+ * input queue, bump the input queue count and take an inpcb reference on
+ * behalf of the hpts.  Under INVARIANTS the caller's locking and the
+ * inpcb state are verified first.  "line" is not used here.
+ */
+static inline void
+hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
+{
+#ifdef INVARIANTS
+	/* The hpts mutex must be owned by the caller. */
+	if (!mtx_owned(&hpts->p_mtx))
+		panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
+	/* The inpcb must be assigned to the cpu of this hpts for input. */
+	if (inp->inp_input_cpu != hpts->p_cpu)
+		panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
+	/* Inserting an already queued inpcb is a bug. */
+	if (inp->inp_in_input == 1)
+		panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
+#endif
+	TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
+	inp->inp_in_input = 1;
+	hpts->p_on_inqueue_cnt += 1;
+	in_pcbref(inp);
+}
+
+/*
+ * Sysctl handler that exports the per-hpts event logs.
+ *
+ * When no output buffer is supplied (req->oldptr is NULL) only the total
+ * size of all allocated log buffers is reported.  Otherwise the recorded
+ * part of every hpts log is copied out in hpts order: the whole buffer
+ * once the log has wrapped, else only the first p_log_at entries.
+ *
+ * Copy-out stops at the first SYSCTL_OUT() failure and that error is
+ * returned, so that a later successful (possibly zero-length) copy can
+ * not hide it.
+ */
+static int
+sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
+{
+	struct tcp_hpts_entry *hpts;
+	size_t sz;
+	int32_t logging_was, i;
+	int32_t error = 0;
+
+	/*
+	 * HACK: Turn off logging so no locks are required; this really
+	 * needs a memory barrier :)
+	 */
+	logging_was = logging_on;
+	logging_on = 0;
+	if (!req->oldptr) {
+		/* Size probe: add up every allocated log buffer. */
+		sz = 0;
+		for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+			hpts = tcp_pace.rp_ent[i];
+			if (hpts->p_log == NULL)
+				continue;
+			sz += (sizeof(struct hpts_log) * hpts->p_logsize);
+		}
+		error = SYSCTL_OUT(req, 0, sz);
+	} else {
+		for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+			hpts = tcp_pace.rp_ent[i];
+			if (hpts->p_log == NULL)
+				continue;
+			if (hpts->p_log_wrapped)
+				sz = (sizeof(struct hpts_log) * hpts->p_logsize);
+			else
+				sz = (sizeof(struct hpts_log) * hpts->p_log_at);
+			error = SYSCTL_OUT(req, hpts->p_log, sz);
+			if (error != 0) {
+				/* Do not let a later copy overwrite the error. */
+				break;
+			}
+		}
+	}
+	/* Restore the logging state on every path. */
+	logging_on = logging_was;
+	return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
+    0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
+
+
+/*
+ * Get the INP_INFO read lock while the inpcb is write-locked.
+ *
+ * This function always returns with the INP_INFO read lock held.  It
+ * first tries to take it without blocking.  If that fails, the inpcb
+ * write lock is dropped, the INP_INFO read lock is taken, and the inpcb
+ * write lock is taken again.
+ *
+ * Returns (0) and leaves *tpp unchanged if nothing critical changed.
+ * Returns (1) and sets *tpp to NULL if, while the inpcb was unlocked,
+ * the session went away (INP_TIMEWAIT, INP_DROPPED or INP_FREED) or
+ * either the function block or the stack-specific data block of the
+ * tcpcb changed.
+ *
+ * This function relies on the fact that the hpts always holds a
+ * reference on the inpcb while the segment is on the hptsi wheel and
+ * in the input queue.
+ */
+static int
+tcp_hptsi_lock_inpinfo(struct inpcb *inp, struct tcpcb **tpp)
+{
+	struct tcp_function_block *tfb;
+	struct tcpcb *tp;
+	void *ptr;
+
+	/* Try the easy way. */
+	if (INP_INFO_TRY_RLOCK(&V_tcbinfo))
+		return (0);
+
+	/*
+	 * OK, let's try the hard way. We'll save the function pointer block
+	 * and the stack-specific pointer to make sure they don't change
+	 * while we aren't holding the lock.
+	 */
+	tp = *tpp;
+	tfb = tp->t_fb;
+	ptr = tp->t_fb_ptr;
+	INP_WUNLOCK(inp);
+	INP_INFO_RLOCK(&V_tcbinfo);
+	INP_WLOCK(inp);
+	/* If the session went away, return an error. */
+	if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+	    (inp->inp_flags2 & INP_FREED)) {
+		*tpp = NULL;
+		return (1);
+	}
+	/*
+	 * If the function block or stack-specific data block changed,
+	 * report an error. Either change alone invalidates what the
+	 * caller knew about the tcpcb, so test with "or".
+	 */
+	tp = intotcpcb(inp);
+	if ((tp->t_fb != tfb) || (tp->t_fb_ptr != ptr)) {
+		*tpp = NULL;
+		return (1);
+	}
+	return (0);
+}
+
+
+/*
+ * Kick the hpts: schedule its software interrupt and, if it is in the
+ * rare "sleeping on ENOBUF" state (p_hpts_active == 2), also wake the
+ * sleeper.  The hpts mutex must be held.
+ */
+static void
+tcp_wakehpts(struct tcp_hpts_entry *hpts)
+{
+	HPTS_MTX_ASSERT(hpts);
+	swi_sched(hpts->ie_cookie, 0);
+	if (hpts->p_hpts_active != 2)
+		return;
+	/* Rare sleeping on a ENOBUF */
+	wakeup_one(hpts);
+}
+
+/*
+ * Kick the hpts for input work: schedule its software interrupt and, if
+ * it is in the rare "sleeping on ENOBUF" state (p_hpts_active == 2),
+ * also wake the sleeper.  The hpts mutex must be held.
+ */
+static void
+tcp_wakeinput(struct tcp_hpts_entry *hpts)
+{
+	HPTS_MTX_ASSERT(hpts);
+	swi_sched(hpts->ie_cookie, 0);
+	if (hpts->p_hpts_active != 2)
+		return;
+	/* Rare sleeping on a ENOBUF */
+	wakeup_one(hpts);
+}
+
+/*
+ * Return the hpts entry selected by inp->inp_hpts_cpu.  No lock is
+ * taken here.
+ */
+struct tcp_hpts_entry *
+tcp_cur_hpts(struct inpcb *inp)
+{
+	int32_t idx = inp->inp_hpts_cpu;
+
+	return (tcp_pace.rp_ent[idx]);
+}
+
+/*
+ * Lock and return the hpts entry selected by inp->inp_hpts_cpu.
+ *
+ * inp_hpts_cpu is sampled before the mutex is taken, so it is checked
+ * again once the mutex is held; if it changed in between, the mutex is
+ * released and the whole sequence is retried.  The caller must unlock
+ * hpts->p_mtx when done.
+ */
+struct tcp_hpts_entry *
+tcp_hpts_lock(struct inpcb *inp)
+{
+	struct tcp_hpts_entry *hpts;
+	int32_t cpu;
+
+	for (;;) {
+		cpu = inp->inp_hpts_cpu;
+		hpts = tcp_pace.rp_ent[cpu];
+#ifdef INVARIANTS
+		if (mtx_owned(&hpts->p_mtx)) {
+			panic("Hpts:%p owns mtx prior-to lock line:%d",
+			    hpts, __LINE__);
+		}
+#endif
+		mtx_lock(&hpts->p_mtx);
+		if (cpu == inp->inp_hpts_cpu)
+			return (hpts);
+		/* The cpu changed under us, try again. */
+		mtx_unlock(&hpts->p_mtx);
+	}
+}
+
+/*
+ * Lock and return the hpts entry selected by inp->inp_input_cpu.
+ *
+ * inp_input_cpu is sampled before the mutex is taken, so it is checked
+ * again once the mutex is held; if it changed in between, the mutex is
+ * released and the whole sequence is retried.  The caller must unlock
+ * hpts->p_mtx when done.
+ */
+struct tcp_hpts_entry *
+tcp_input_lock(struct inpcb *inp)
+{
+	struct tcp_hpts_entry *hpts;
+	int32_t cpu;
+
+	for (;;) {
+		cpu = inp->inp_input_cpu;
+		hpts = tcp_pace.rp_ent[cpu];
+#ifdef INVARIANTS
+		if (mtx_owned(&hpts->p_mtx)) {
+			panic("Hpts:%p owns mtx prior-to lock line:%d",
+			    hpts, __LINE__);
+		}
+#endif
+		mtx_lock(&hpts->p_mtx);
+		if (cpu == inp->inp_input_cpu)
+			return (hpts);
+		/* The cpu changed under us, try again. */
+		mtx_unlock(&hpts->p_mtx);
+	}
+}
+
+/*
+ * Drop the inpcb reference that was held on behalf of the hpts.
+ *
+ * The release is expected to return 0; a non-zero return is treated as
+ * fatal.  INP_FREED is hidden from the release routine for the duration
+ * of the call and put back afterwards.  "line" is only passed on when
+ * INP_REF_DEBUG is defined; "hpts" is not used here.
+ */
+static void
+tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
+{
+	int32_t restore_freed = 0;
+	int released;
+
+	if (inp->inp_flags2 & INP_FREED) {
+		/*
+		 * Need to play a special trick so that in_pcbrele_wlocked
+		 * does not return 1 when it really should have returned 0.
+		 */
+		inp->inp_flags2 &= ~INP_FREED;
+		restore_freed = 1;
+	}
+#ifndef INP_REF_DEBUG
+	released = in_pcbrele_wlocked(inp);
+#else
+	released = __in_pcbrele_wlocked(inp, line);
+#endif
+	if (released) {
+		/*
+		 * This should not happen. We have the inpcb referred to by
+		 * the main socket (why we are called) and the hpts. It
+		 * should always return 0.
+		 */
+		panic("inpcb:%p release ret 1",
+		    inp);
+	}
+	if (restore_freed)
+		inp->inp_flags2 |= INP_FREED;
+}
+
+/*
+ * If inp is on the output wheel, unlink it from its slot (clearing its
+ * on-hpts state) and drop the reference of the hpts.  Otherwise do
+ * nothing.  "flags" is not used here.
+ */
+static void
+tcp_hpts_remove_locked_output(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
+{
+	if (inp->inp_in_hpts == 0)
+		return;
+	hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], 1);
+	tcp_remove_hpts_ref(inp, hpts, line);
+}
+
+/*
+ * If inp is on the input queue, unlink it (clearing its on-input flag)
+ * and drop the reference of the hpts.  Otherwise do nothing.  The hpts
+ * mutex must be held.  "flags" is not used here.
+ */
+static void
+tcp_hpts_remove_locked_input(struct tcp_hpts_entry *hpts, struct inpcb *inp, int32_t flags, int32_t line)
+{
+	HPTS_MTX_ASSERT(hpts);
+	if (inp->inp_in_input == 0)
+		return;
+	hpts_sane_input_remove(hpts, inp, 1);
+	tcp_remove_hpts_ref(inp, hpts, line);
+}
+
+/*
+ * Called with the INP write lock held (this is asserted).
+ * The hpts lock is the key for the queues, and the lock
+ * order allows us to hold the INP lock and then get the
+ * hpts lock.
+ *
+ * Valid values in the flags are
+ * HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
+ * HPTS_REMOVE_INPUT - remove from the input of the hpts.
+ * Note that you can OR both values together and get two
+ * actions.
+ */
+void
+__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
+{
+	struct tcp_hpts_entry *entry;
+
+	INP_WLOCK_ASSERT(inp);
+	if ((flags & HPTS_REMOVE_OUTPUT) != 0) {
+		/* Take the output hpts lock only for the removal itself. */
+		entry = tcp_hpts_lock(inp);
+		tcp_hpts_remove_locked_output(entry, inp, flags, line);
+		mtx_unlock(&entry->p_mtx);
+	}
+	if ((flags & HPTS_REMOVE_INPUT) != 0) {
+		/* The input side may be handled by a different hpts. */
+		entry = tcp_input_lock(inp);
+		tcp_hpts_remove_locked_input(entry, inp, flags, line);
+		mtx_unlock(&entry->p_mtx);
+	}
+}
+
+/*
+ * Return the wheel slot that lies "plus" ticks after the previous tick
+ * of the hpts, wrapped to the number of slots on the wheel.
+ */
+static inline int
+hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
+{
+
+	return ((plus + hpts->p_prevtick) % NUM_OF_HPTSI_SLOTS);
+}
+
+/*
+ * Put inp on the output wheel of the hpts so that it runs as soon as
+ * possible.  The hpts mutex must be held.  Nothing is done when the
+ * inpcb is already on the wheel.
+ *
+ * A sleeping hpts (p_hpts_active == 0) gets the inpcb in the slot after
+ * its previous tick and is woken up.  Otherwise the inpcb goes into the
+ * current slot, unless it is the one recorded in hpts->p_inp, in which
+ * case it goes into the next slot.
+ *
+ * "line" and "noref" are handed on to hpts_sane_pace_insert().
+ * Returns 1 if the hpts was woken up, else 0.
+ */
+static int
+tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
+{
+	uint32_t ticknow = 0;
+	int32_t wake_needed = 0;
+
+	HPTS_MTX_ASSERT(hpts);
+	if (inp->inp_in_hpts != 0) {
+		/* Already on the wheel, nothing to do. */
+		return (wake_needed);
+	}
+
+	/* Ok we need to set it on the hpts in the current slot */
+	if (hpts->p_hpts_active == 0) {
+		/* A sleeping hpts we want in next slot to run */
+		if (logging_on) {
+			tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
+			    hpts_tick(hpts, 1));
+		}
+		inp->inp_hptsslot = hpts_tick(hpts, 1);
+		inp->inp_hpts_request = 0;
+		if (logging_on) {
+			tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
+		}
+		wake_needed = 1;
+	} else if (hpts->p_inp == (void *)inp) {
+		/*
+		 * We can't allow you to go into the same slot we
+		 * are in. We must put you out.
+		 */
+		inp->inp_hptsslot = hpts->p_nxt_slot;
+	} else {
+		inp->inp_hptsslot = hpts->p_cur_slot;
+	}
+
+	hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
+	inp->inp_hpts_request = 0;
+	if (logging_on) {
+		tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
+	}
+	if (wake_needed == 0)
+		return (wake_needed);
+
+	/*
+	 * Activate the hpts if it is sleeping and its
+	 * timeout is not 1.
+	 */
+	if (logging_on) {
+		tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
+	}
+	hpts->p_direct_wake = 1;
+	tcp_wakehpts(hpts);
+	return (wake_needed);
+}
+

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***