git: c2a69e846fff - main - tcp_hpts: move HPTS related fields from inpcb to tcpcb
Date: Tue, 25 Apr 2023 19:21:53 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=c2a69e846fffb95271c0299e0a81e2033382e9c2

commit c2a69e846fffb95271c0299e0a81e2033382e9c2
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2023-04-25 19:18:33 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2023-04-25 19:18:33 +0000

    tcp_hpts: move HPTS related fields from inpcb to tcpcb

    This makes inpcb lighter and allows future cache line optimizations
    of tcpcb.  The reason why HPTS originally used inpcb is the compressed
    TIME-WAIT state (see 0d7445193ab), that used to free a tcpcb, while
    the associated connection is still on the HPTS ring.

    Reviewed by:            rrs
    Differential Revision:  https://reviews.freebsd.org/D39697
---
 sys/netinet/in_pcb.c          |   2 -
 sys/netinet/in_pcb.h          |  74 +-------
 sys/netinet/tcp_hpts.c        | 384 ++++++++++++++++++++----------------
 sys/netinet/tcp_hpts.h        |  16 +-
 sys/netinet/tcp_lro.c         |   6 +-
 sys/netinet/tcp_stacks/bbr.c  |  66 ++++----
 sys/netinet/tcp_stacks/rack.c | 116 ++++++-------
 sys/netinet/tcp_subr.c        |  11 +-
 sys/netinet/tcp_timewait.c    |   1 -
 sys/netinet/tcp_usrreq.c      |   2 +-
 sys/netinet/tcp_var.h         |  20 ++-
 11 files changed, 311 insertions(+), 387 deletions(-)

diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 9193dfb2372b..350d08360105 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -1692,7 +1692,6 @@ in_pcbrele_rlocked(struct inpcb *inp)
 
 	MPASS(inp->inp_flags & INP_FREED);
 	MPASS(inp->inp_socket == NULL);
-	MPASS(inp->inp_in_hpts == 0);
 	crfree(inp->inp_cred);
 #ifdef INVARIANTS
 	inp->inp_cred = NULL;
@@ -1713,7 +1712,6 @@ in_pcbrele_wlocked(struct inpcb *inp)
 
 	MPASS(inp->inp_flags & INP_FREED);
 	MPASS(inp->inp_socket == NULL);
-	MPASS(inp->inp_in_hpts == 0);
 	crfree(inp->inp_cred);
 #ifdef INVARIANTS
 	inp->inp_cred = NULL;
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 984cb9e26561..62c5758268a7 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -145,7 +145,6 @@ struct in_conninfo {
  * lock is to be obtained and SMR section exited.
  *
  * Key:
- * (b) - Protected by the hpts lock.
  * (c) - Constant after initialization
  * (e) - Protected by the SMR section
  * (i) - Protected by the inpcb lock
@@ -154,51 +153,6 @@ struct in_conninfo {
  * (s) - Protected by another subsystem's locks
  * (x) - Undefined locking
  *
- * Notes on the tcp_hpts:
- *
- * First Hpts lock order is
- * 1) INP_WLOCK()
- * 2) HPTS_LOCK() i.e. hpts->pmtx
- *
- * To insert a TCB on the hpts you *must* be holding the INP_WLOCK().
- * You may check the inp->inp_in_hpts flag without the hpts lock.
- * The hpts is the only one that will clear this flag holding
- * only the hpts lock. This means that in your tcp_output()
- * routine when you test for the inp_in_hpts flag to be 1
- * it may be transitioning to 0 (by the hpts).
- * That's ok since that will just mean an extra call to tcp_output
- * that most likely will find the call you executed
- * (when the mis-match occurred) will have put the TCB back
- * on the hpts and it will return. If your
- * call did not add the inp back to the hpts then you will either
- * over-send or the cwnd will block you from sending more.
- *
- * Note you should also be holding the INP_WLOCK() when you
- * call the remove from the hpts as well. Though usually
- * you are either doing this from a timer, where you need and have
- * the INP_WLOCK() or from destroying your TCB where again
- * you should already have the INP_WLOCK().
- *
- * The inp_hpts_cpu, inp_hpts_cpu_set, inp_input_cpu and
- * inp_input_cpu_set fields are controlled completely by
- * the hpts. Do not ever set these. The inp_hpts_cpu_set
- * and inp_input_cpu_set fields indicate if the hpts has
- * setup the respective cpu field. It is advised if this
- * field is 0, to enqueue the packet with the appropriate
- * hpts_immediate() call. If the _set field is 1, then
- * you may compare the inp_*_cpu field to the curcpu and
- * may want to again insert onto the hpts if these fields
- * are not equal (i.e. you are not on the expected CPU).
- *
- * A note on inp_hpts_calls and inp_input_calls, these
- * flags are set when the hpts calls either the output
- * or do_segment routines respectively. If the routine
- * being called wants to use this, then it needs to
- * clear the flag before returning. The hpts will not
- * clear the flag. The flags can be used to tell if
- * the hpts is the function calling the respective
- * routine.
- *
  * A few other notes:
  *
  * When a read lock is held, stability of the field is guaranteed; to write
@@ -219,41 +173,15 @@ struct inpcb {
 	CK_LIST_ENTRY(inpcb) inp_hash_wild;	/* hash table linkage */
 	struct rwlock inp_lock;
 	/* Cache line #2 (amd64) */
-#define	inp_start_zero	inp_hpts
+#define	inp_start_zero	inp_refcount
 #define	inp_zero_size	(sizeof(struct inpcb) - \
 			    offsetof(struct inpcb, inp_start_zero))
-	TAILQ_ENTRY(inpcb) inp_hpts;	/* pacing out queue next lock(b) */
-	uint32_t inp_hpts_gencnt;	/* XXXGL */
-	uint32_t inp_hpts_request;	/* Current hpts request, zero if
-					 * fits in the pacing window (i&b). */
-	/*
-	 * Note the next fields are protected by a
-	 * different lock (hpts-lock). This means that
-	 * they must correspond in size to the smallest
-	 * protectable bit field (uint8_t on x86, and
-	 * other platfomrs potentially uint32_t?). Also
-	 * since CPU switches can occur at different times the two
-	 * fields can *not* be collapsed into a signal bit field.
-	 */
-#if defined(__amd64__) || defined(__i386__)
-	uint8_t inp_in_hpts; /* on output hpts (lock b) */
-#else
-	uint32_t inp_in_hpts; /* on output hpts (lock b) */
-#endif
-	volatile uint16_t inp_hpts_cpu;	/* Lock (i) */
-	volatile uint16_t inp_irq_cpu;	/* Set by LRO in behalf of or the driver */
 	u_int	inp_refcount;		/* (i) refcount */
 	int	inp_flags;		/* (i) generic IP/datagram flags */
 	int	inp_flags2;		/* (i) generic IP/datagram flags #2*/
-	uint8_t inp_hpts_cpu_set :1,	/* on output hpts (i) */
-		inp_hpts_calls :1,	/* (i) from output hpts */
-		inp_irq_cpu_set :1,	/* (i) from LRO/Driver */
-		inp_spare_bits2 : 3;
 	uint8_t inp_numa_domain;	/* numa domain */
 	void	*inp_ppcb;		/* (i) pointer to per-protocol pcb */
 	struct	socket *inp_socket;	/* (i) back pointer to socket */
-	int32_t inp_hptsslot;		/* Hpts wheel slot this tcb is Lock(i&b) */
-	uint32_t inp_hpts_drop_reas;	/* reason we are dropping the PCB (lock i&b) */
 	struct	inpcbinfo *inp_pcbinfo;	/* (c) PCB list info */
 	struct	ucred	*inp_cred;	/* (c) cache of socket cred */
 	u_int32_t inp_flow;		/* (i) IPv6 flow information */
diff --git a/sys/netinet/tcp_hpts.c b/sys/netinet/tcp_hpts.c
index cc1bd71d0d43..59122bb242b9 100644
--- a/sys/netinet/tcp_hpts.c
+++ b/sys/netinet/tcp_hpts.c
@@ -199,7 +199,7 @@ struct tcp_hpts_entry {
 	uint8_t p_fill[3];	  /* Fill to 32 bits */
 	/* Cache line 0x40 */
 	struct hptsh {
-		TAILQ_HEAD(, inpcb)	head;
+		TAILQ_HEAD(, tcpcb)	head;
 		uint32_t		count;
 		uint32_t		gencnt;
 	} *p_hptss;			/* Hptsi wheel */
@@ -273,12 +273,6 @@ static struct hpts_domain_info {
 	int cpu[MAXCPU];
 } hpts_domains[MAXMEMDOM];
 
-enum {
-	IHPTS_NONE = 0,
-	IHPTS_ONQUEUE,
-	IHPTS_MOVING,
-};
-
 counter_u64_t hpts_hopelessly_behind;
 
 SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
@@ -426,6 +420,17 @@ SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
     &tcp_hpts_no_wake_over_thresh, 0,
     "When we are over the threshold on the pacer do we prohibit wakeups?");
 
+static uint16_t
+hpts_random_cpu(void)
+{
+	uint16_t cpuid;
+	uint32_t ran;
+
+	ran = arc4random();
+	cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
+	return (cpuid);
+}
+
 static void
 tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
 	int slots_to_run, int idx, int from_callout)
@@ -489,54 +494,67 @@ hpts_timeout_swi(void *arg)
 }
 
 static void
-inp_hpts_insert(struct inpcb *inp, struct tcp_hpts_entry *hpts)
+tcp_hpts_insert_internal(struct tcpcb *tp, struct tcp_hpts_entry *hpts)
 {
+	struct inpcb *inp = tptoinpcb(tp);
 	struct hptsh *hptsh;
 
 	INP_WLOCK_ASSERT(inp);
 	HPTS_MTX_ASSERT(hpts);
-	MPASS(hpts->p_cpu == inp->inp_hpts_cpu);
+	MPASS(hpts->p_cpu == tp->t_hpts_cpu);
 	MPASS(!(inp->inp_flags & INP_DROPPED));
 
-	hptsh = &hpts->p_hptss[inp->inp_hptsslot];
+	hptsh = &hpts->p_hptss[tp->t_hpts_slot];
 
-	if (inp->inp_in_hpts == IHPTS_NONE) {
-		inp->inp_in_hpts = IHPTS_ONQUEUE;
+	if (tp->t_in_hpts == IHPTS_NONE) {
+		tp->t_in_hpts = IHPTS_ONQUEUE;
 		in_pcbref(inp);
-	} else if (inp->inp_in_hpts == IHPTS_MOVING) {
-		inp->inp_in_hpts = IHPTS_ONQUEUE;
+	} else if (tp->t_in_hpts == IHPTS_MOVING) {
+		tp->t_in_hpts = IHPTS_ONQUEUE;
 	} else
-		MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
-	inp->inp_hpts_gencnt = hptsh->gencnt;
+		MPASS(tp->t_in_hpts == IHPTS_ONQUEUE);
+	tp->t_hpts_gencnt = hptsh->gencnt;
 
-	TAILQ_INSERT_TAIL(&hptsh->head, inp, inp_hpts);
+	TAILQ_INSERT_TAIL(&hptsh->head, tp, t_hpts);
 	hptsh->count++;
 	hpts->p_on_queue_cnt++;
 }
 
 static struct tcp_hpts_entry *
-tcp_hpts_lock(struct inpcb *inp)
+tcp_hpts_lock(struct tcpcb *tp)
 {
 	struct tcp_hpts_entry *hpts;
 
-	INP_LOCK_ASSERT(inp);
+	INP_LOCK_ASSERT(tptoinpcb(tp));
 
-	hpts = tcp_pace.rp_ent[inp->inp_hpts_cpu];
+	hpts = tcp_pace.rp_ent[tp->t_hpts_cpu];
 	HPTS_LOCK(hpts);
 
 	return (hpts);
 }
 
 static void
-inp_hpts_release(struct inpcb *inp)
+tcp_hpts_release(struct tcpcb *tp)
 {
 	bool released __diagused;
 
-	inp->inp_in_hpts = IHPTS_NONE;
-	released = in_pcbrele_wlocked(inp);
+	tp->t_in_hpts = IHPTS_NONE;
+	released = in_pcbrele_wlocked(tptoinpcb(tp));
 	MPASS(released == false);
 }
 
+/*
+ * Initialize newborn tcpcb to get ready for use with HPTS.
+ */
+void
+tcp_hpts_init(struct tcpcb *tp)
+{
+
+	tp->t_hpts_cpu = hpts_random_cpu();
+	tp->t_lro_cpu = HPTS_CPU_NONE;
+	MPASS(!(tp->t_flags2 & TF2_HPTS_CPU_SET));
+}
+
 /*
  * Called normally with the INP_LOCKED but it
  * does not matter, the hpts lock is the key
@@ -544,39 +562,39 @@ inp_hpts_release(struct inpcb *inp)
  * INP lock and then get the hpts lock.
  */
 void
-tcp_hpts_remove(struct inpcb *inp)
+tcp_hpts_remove(struct tcpcb *tp)
 {
 	struct tcp_hpts_entry *hpts;
 	struct hptsh *hptsh;
 
-	INP_WLOCK_ASSERT(inp);
+	INP_WLOCK_ASSERT(tptoinpcb(tp));
 
-	hpts = tcp_hpts_lock(inp);
-	if (inp->inp_in_hpts == IHPTS_ONQUEUE) {
-		hptsh = &hpts->p_hptss[inp->inp_hptsslot];
-		inp->inp_hpts_request = 0;
-		if (__predict_true(inp->inp_hpts_gencnt == hptsh->gencnt)) {
-			TAILQ_REMOVE(&hptsh->head, inp, inp_hpts);
+	hpts = tcp_hpts_lock(tp);
+	if (tp->t_in_hpts == IHPTS_ONQUEUE) {
+		hptsh = &hpts->p_hptss[tp->t_hpts_slot];
+		tp->t_hpts_request = 0;
+		if (__predict_true(tp->t_hpts_gencnt == hptsh->gencnt)) {
+			TAILQ_REMOVE(&hptsh->head, tp, t_hpts);
 			MPASS(hptsh->count > 0);
 			hptsh->count--;
 			MPASS(hpts->p_on_queue_cnt > 0);
 			hpts->p_on_queue_cnt--;
-			inp_hpts_release(inp);
+			tcp_hpts_release(tp);
 		} else {
 			/*
 			 * tcp_hptsi() now owns the TAILQ head of this inp.
 			 * Can't TAILQ_REMOVE, just mark it.
 			 */
 #ifdef INVARIANTS
-			struct inpcb *tmp;
+			struct tcpcb *tmp;
 
-			TAILQ_FOREACH(tmp, &hptsh->head, inp_hpts)
-				MPASS(tmp != inp);
+			TAILQ_FOREACH(tmp, &hptsh->head, t_hpts)
+				MPASS(tmp != tp);
 #endif
-			inp->inp_in_hpts = IHPTS_MOVING;
-			inp->inp_hptsslot = -1;
+			tp->t_in_hpts = IHPTS_MOVING;
+			tp->t_hpts_slot = -1;
 		}
-	} else if (inp->inp_in_hpts == IHPTS_MOVING) {
+	} else if (tp->t_in_hpts == IHPTS_MOVING) {
 		/*
 		 * Handle a special race condition:
 		 * tcp_hptsi() moves inpcb to detached tailq
@@ -585,18 +603,11 @@ tcp_hpts_remove(struct inpcb *inp)
 		 * tcp_hpts_remove() again (we are here!), then in_pcbdrop()
 		 * tcp_hptsi() finds pcb with meaningful slot and INP_DROPPED
 		 */
-		inp->inp_hptsslot = -1;
+		tp->t_hpts_slot = -1;
 	}
 	HPTS_UNLOCK(hpts);
 }
 
-bool
-tcp_in_hpts(struct inpcb *inp)
-{
-
-	return (inp->inp_in_hpts == IHPTS_ONQUEUE);
-}
-
 static inline int
 hpts_slot(uint32_t wheel_slot, uint32_t plus)
 {
@@ -762,15 +773,15 @@ max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *
 
 #ifdef INVARIANTS
 static void
-check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
+check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct tcpcb *tp,
+    uint32_t hptsslot, int line)
 {
 	/*
 	 * Sanity checks for the pacer with invariants
 	 * on insert.
 	 */
-	KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS,
-	    ("hpts:%p inp:%p slot:%d > max",
-	     hpts, inp, inp_hptsslot));
+	KASSERT(hptsslot < NUM_OF_HPTSI_SLOTS,
+	    ("hpts:%p tp:%p slot:%d > max", hpts, tp, hptsslot));
 	if ((hpts->p_hpts_active) &&
 	    (hpts->p_wheel_complete == 0)) {
 		/*
@@ -781,22 +792,21 @@ check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uin
 		 */
 		int distance, yet_to_run;
 
-		distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot);
+		distance = hpts_slots_diff(hpts->p_runningslot, hptsslot);
 		if (hpts->p_runningslot != hpts->p_cur_slot)
 			yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
 		else
 			yet_to_run = 0;	/* processing last slot */
-		KASSERT(yet_to_run <= distance,
-		    ("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
-		     hpts, inp, inp_hptsslot,
-		     distance, yet_to_run,
-		     hpts->p_runningslot, hpts->p_cur_slot));
+		KASSERT(yet_to_run <= distance, ("hpts:%p tp:%p slot:%d "
+		    "distance:%d yet_to_run:%d rs:%d cs:%d", hpts, tp,
+		    hptsslot, distance, yet_to_run, hpts->p_runningslot,
+		    hpts->p_cur_slot));
 	}
 }
 #endif
 
 uint32_t
-tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line, struct hpts_diag *diag)
 {
 	struct tcp_hpts_entry *hpts;
 	struct timeval tv;
@@ -804,16 +814,16 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts
 	int32_t wheel_slot, maxslots;
 	bool need_wakeup = false;
 
-	INP_WLOCK_ASSERT(inp);
-	MPASS(!tcp_in_hpts(inp));
-	MPASS(!(inp->inp_flags & INP_DROPPED));
+	INP_WLOCK_ASSERT(tptoinpcb(tp));
+	MPASS(!(tptoinpcb(tp)->inp_flags & INP_DROPPED));
+	MPASS(!tcp_in_hpts(tp));
 
 	/*
 	 * We now return the next-slot the hpts will be on, beyond its
 	 * current run (if up) or where it was when it stopped if it is
 	 * sleeping.
 	 */
-	hpts = tcp_hpts_lock(inp);
+	hpts = tcp_hpts_lock(tp);
 	microuptime(&tv);
 	if (diag) {
 		memset(diag, 0, sizeof(struct hpts_diag));
@@ -830,20 +840,20 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts
 	}
 	if (slot == 0) {
 		/* Ok we need to set it on the hpts in the current slot */
-		inp->inp_hpts_request = 0;
+		tp->t_hpts_request = 0;
 		if ((hpts->p_hpts_active == 0) || (hpts->p_wheel_complete)) {
 			/*
 			 * A sleeping hpts we want in next slot to run
 			 * note that in this state p_prev_slot == p_cur_slot
 			 */
-			inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1);
+			tp->t_hpts_slot = hpts_slot(hpts->p_prev_slot, 1);
 			if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
 				need_wakeup = true;
 		} else
-			inp->inp_hptsslot = hpts->p_runningslot;
-		if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING))
-			inp_hpts_insert(inp, hpts);
+			tp->t_hpts_slot = hpts->p_runningslot;
+		if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))
+			tcp_hpts_insert_internal(tp, hpts);
 		if (need_wakeup) {
 			/*
 			 * Activate the hpts if it is sleeping and its
@@ -880,28 +890,28 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts
 			 */
 			slot--;
 		}
-		inp->inp_hptsslot = last_slot;
-		inp->inp_hpts_request = slot;
+		tp->t_hpts_slot = last_slot;
+		tp->t_hpts_request = slot;
 	} else if (maxslots >= slot) {
 		/* It all fits on the wheel */
-		inp->inp_hpts_request = 0;
-		inp->inp_hptsslot = hpts_slot(wheel_slot, slot);
+		tp->t_hpts_request = 0;
+		tp->t_hpts_slot = hpts_slot(wheel_slot, slot);
 	} else {
 		/* It does not fit */
-		inp->inp_hpts_request = slot - maxslots;
-		inp->inp_hptsslot = last_slot;
+		tp->t_hpts_request = slot - maxslots;
+		tp->t_hpts_slot = last_slot;
 	}
 	if (diag) {
-		diag->slot_remaining = inp->inp_hpts_request;
-		diag->inp_hptsslot = inp->inp_hptsslot;
+		diag->slot_remaining = tp->t_hpts_request;
+		diag->inp_hptsslot = tp->t_hpts_slot;
 	}
 #ifdef INVARIANTS
-	check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
+	check_if_slot_would_be_wrong(hpts, tp, tp->t_hpts_slot, line);
 #endif
-	if (__predict_true(inp->inp_in_hpts != IHPTS_MOVING))
-		inp_hpts_insert(inp, hpts);
+	if (__predict_true(tp->t_in_hpts != IHPTS_MOVING))
+		tcp_hpts_insert_internal(tp, hpts);
 	if ((hpts->p_hpts_active == 0) &&
-	    (inp->inp_hpts_request == 0) &&
+	    (tp->t_hpts_request == 0) &&
 	    (hpts->p_on_min_sleep == 0)) {
 		/*
 		 * The hpts is sleeping and NOT on a minimum
@@ -972,54 +982,35 @@ tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts
 	return (slot_on);
 }
 
-uint16_t
-hpts_random_cpu(struct inpcb *inp){
-	/*
-	 * No flow type set distribute the load randomly.
-	 */
-	uint16_t cpuid;
-	uint32_t ran;
-
-	/*
-	 * Shortcut if it is already set. XXXGL: does it happen?
-	 */
-	if (inp->inp_hpts_cpu_set) {
-		return (inp->inp_hpts_cpu);
-	}
-	/* Nothing set use a random number */
-	ran = arc4random();
-	cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
-	return (cpuid);
-}
-
 static uint16_t
-hpts_cpuid(struct inpcb *inp, int *failed)
+hpts_cpuid(struct tcpcb *tp, int *failed)
 {
+	struct inpcb *inp = tptoinpcb(tp);
 	u_int cpuid;
 #ifdef NUMA
 	struct hpts_domain_info *di;
 #endif
 
 	*failed = 0;
-	if (inp->inp_hpts_cpu_set) {
-		return (inp->inp_hpts_cpu);
+	if (tp->t_flags2 & TF2_HPTS_CPU_SET) {
+		return (tp->t_hpts_cpu);
 	}
 	/*
 	 * If we are using the irq cpu set by LRO or
 	 * the driver then it overrides all other domains.
 	 */
 	if (tcp_use_irq_cpu) {
-		if (inp->inp_irq_cpu_set == 0) {
+		if (tp->t_lro_cpu == HPTS_CPU_NONE) {
 			*failed = 1;
-			return(0);
+			return (0);
 		}
-		return(inp->inp_irq_cpu);
+		return (tp->t_lro_cpu);
 	}
 	/* If one is set the other must be the same */
 #ifdef RSS
 	cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
 	if (cpuid == NETISR_CPUID_NONE)
-		return (hpts_random_cpu(inp));
+		return (hpts_random_cpu());
 	else
 		return (cpuid);
 #endif
@@ -1030,7 +1021,7 @@ hpts_cpuid(struct inpcb *inp, int *failed)
 	 */
 	if (inp->inp_flowtype == M_HASHTYPE_NONE) {
 		counter_u64_add(cpu_uses_random, 1);
-		return (hpts_random_cpu(inp));
+		return (hpts_random_cpu());
 	}
 	/*
 	 * Hash to a thread based on the flowid.  If we are using numa,
@@ -1081,12 +1072,10 @@ static int32_t
 tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout)
 {
 	struct tcpcb *tp;
-	struct inpcb *inp;
 	struct timeval tv;
 	int32_t slots_to_run, i, error;
 	int32_t loop_cnt = 0;
 	int32_t did_prefetch = 0;
-	int32_t prefetch_ninp = 0;
 	int32_t prefetch_tp = 0;
 	int32_t wrap_loop_cnt = 0;
 	int32_t slot_pos_of_endpoint = 0;
@@ -1154,25 +1143,25 @@ again:
		 * run them, the extra 10usecs of late (by being
		 * put behind) does not really matter in this situation.
		 */
-		TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot].head,
-		    inp_hpts) {
-			MPASS(inp->inp_hptsslot == hpts->p_nxt_slot);
-			MPASS(inp->inp_hpts_gencnt ==
+		TAILQ_FOREACH(tp, &hpts->p_hptss[hpts->p_nxt_slot].head,
+		    t_hpts) {
+			MPASS(tp->t_hpts_slot == hpts->p_nxt_slot);
+			MPASS(tp->t_hpts_gencnt ==
			    hpts->p_hptss[hpts->p_nxt_slot].gencnt);
-			MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
+			MPASS(tp->t_in_hpts == IHPTS_ONQUEUE);
			/*
			 * Update gencnt and nextslot accordingly to match
			 * the new location. This is safe since it takes both
			 * the INP lock and the pacer mutex to change the
-			 * inp_hptsslot and inp_hpts_gencnt.
+			 * t_hptsslot and t_hpts_gencnt.
			 */
-			inp->inp_hpts_gencnt =
+			tp->t_hpts_gencnt =
			    hpts->p_hptss[hpts->p_runningslot].gencnt;
-			inp->inp_hptsslot = hpts->p_runningslot;
+			tp->t_hpts_slot = hpts->p_runningslot;
		}
		TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot].head,
-		    &hpts->p_hptss[hpts->p_nxt_slot].head, inp_hpts);
+		    &hpts->p_hptss[hpts->p_nxt_slot].head, t_hpts);
		hpts->p_hptss[hpts->p_runningslot].count +=
		    hpts->p_hptss[hpts->p_nxt_slot].count;
		hpts->p_hptss[hpts->p_nxt_slot].count = 0;
@@ -1191,8 +1180,8 @@ again:
		goto no_one;
	}
	for (i = 0; i < slots_to_run; i++) {
-		struct inpcb *inp, *ninp;
-		TAILQ_HEAD(, inpcb) head = TAILQ_HEAD_INITIALIZER(head);
+		struct tcpcb *tp, *ntp;
+		TAILQ_HEAD(, tcpcb) head = TAILQ_HEAD_INITIALIZER(head);
		struct hptsh *hptsh;
		uint32_t runningslot;
@@ -1205,20 +1194,54 @@ again:
		runningslot = hpts->p_runningslot;
		hptsh = &hpts->p_hptss[runningslot];
-		TAILQ_SWAP(&head, &hptsh->head, inpcb, inp_hpts);
+		TAILQ_SWAP(&head, &hptsh->head, tcpcb, t_hpts);
		hpts->p_on_queue_cnt -= hptsh->count;
		hptsh->count = 0;
		hptsh->gencnt++;
 
		HPTS_UNLOCK(hpts);
 
-		TAILQ_FOREACH_SAFE(inp, &head, inp_hpts, ninp) {
+		TAILQ_FOREACH_SAFE(tp, &head, t_hpts, ntp) {
+			struct inpcb *inp = tptoinpcb(tp);
			bool set_cpu;
 
-			if (ninp != NULL) {
-				/* We prefetch the next inp if possible */
-				kern_prefetch(ninp, &prefetch_ninp);
-				prefetch_ninp = 1;
+			if (ntp != NULL) {
+				/*
+				 * If we have a next tcpcb, see if we can
+				 * prefetch it. Note this may seem
+				 * "risky" since we have no locks (other
+				 * than the previous inp) and there no
+				 * assurance that ntp was not pulled while
+				 * we were processing tp and freed. If this
+				 * occurred it could mean that either:
+				 *
+				 * a) Its NULL (which is fine we won't go
+				 * here) <or> b) Its valid (which is cool we
+				 * will prefetch it) <or> c) The inp got
+				 * freed back to the slab which was
+				 * reallocated. Then the piece of memory was
+				 * re-used and something else (not an
+				 * address) is in inp_ppcb. If that occurs
+				 * we don't crash, but take a TLB shootdown
+				 * performance hit (same as if it was NULL
+				 * and we tried to pre-fetch it).
+				 *
+				 * Considering that the likelyhood of <c> is
+				 * quite rare we will take a risk on doing
+				 * this. If performance drops after testing
+				 * we can always take this out. NB: the
+				 * kern_prefetch on amd64 actually has
+				 * protection against a bad address now via
+				 * the DMAP_() tests. This will prevent the
+				 * TLB hit, and instead if <c> occurs just
+				 * cause us to load cache with a useless
+				 * address (to us).
+				 *
+				 * XXXGL: this comment and the prefetch action
+				 * could be outdated after tp == inp change.
+				 */
+				kern_prefetch(ntp, &prefetch_tp);
+				prefetch_tp = 1;
			}
 
			/* For debugging */
@@ -1232,33 +1255,33 @@ again:
			}
 
			INP_WLOCK(inp);
-			if (inp->inp_hpts_cpu_set == 0) {
+			if ((tp->t_flags2 & TF2_HPTS_CPU_SET) == 0) {
				set_cpu = true;
			} else {
				set_cpu = false;
			}
 
-			if (__predict_false(inp->inp_in_hpts == IHPTS_MOVING)) {
-				if (inp->inp_hptsslot == -1) {
-					inp->inp_in_hpts = IHPTS_NONE;
+			if (__predict_false(tp->t_in_hpts == IHPTS_MOVING)) {
+				if (tp->t_hpts_slot == -1) {
+					tp->t_in_hpts = IHPTS_NONE;
					if (in_pcbrele_wlocked(inp) == false)
						INP_WUNLOCK(inp);
				} else {
					HPTS_LOCK(hpts);
-					inp_hpts_insert(inp, hpts);
+					tcp_hpts_insert_internal(tp, hpts);
					HPTS_UNLOCK(hpts);
					INP_WUNLOCK(inp);
				}
				continue;
			}
 
-			MPASS(inp->inp_in_hpts == IHPTS_ONQUEUE);
+			MPASS(tp->t_in_hpts == IHPTS_ONQUEUE);
			MPASS(!(inp->inp_flags & INP_DROPPED));
-			KASSERT(runningslot == inp->inp_hptsslot,
+			KASSERT(runningslot == tp->t_hpts_slot,
				("Hpts:%p inp:%p slot mis-aligned %u vs %u",
-				 hpts, inp, runningslot, inp->inp_hptsslot));
+				 hpts, inp, runningslot, tp->t_hpts_slot));
 
-			if (inp->inp_hpts_request) {
+			if (tp->t_hpts_request) {
				/*
				 * This guy is deferred out further in time
				 * then our wheel had available on it.
@@ -1268,38 +1291,36 @@ again:
				uint32_t maxslots, last_slot, remaining_slots;
 
				remaining_slots = slots_to_run - (i + 1);
-				if (inp->inp_hpts_request > remaining_slots) {
+				if (tp->t_hpts_request > remaining_slots) {
					HPTS_LOCK(hpts);
					/*
					 * How far out can we go?
					 */
					maxslots = max_slots_available(hpts,
					    hpts->p_cur_slot, &last_slot);
-					if (maxslots >= inp->inp_hpts_request) {
+					if (maxslots >= tp->t_hpts_request) {
						/* We can place it finally to
						 * be processed.  */
-						inp->inp_hptsslot = hpts_slot(
+						tp->t_hpts_slot = hpts_slot(
						    hpts->p_runningslot,
-						    inp->inp_hpts_request);
-						inp->inp_hpts_request = 0;
+						    tp->t_hpts_request);
+						tp->t_hpts_request = 0;
					} else {
						/* Work off some more time */
-						inp->inp_hptsslot = last_slot;
-						inp->inp_hpts_request -=
+						tp->t_hpts_slot = last_slot;
+						tp->t_hpts_request -=
						    maxslots;
					}
-					inp_hpts_insert(inp, hpts);
+					tcp_hpts_insert_internal(tp, hpts);
					HPTS_UNLOCK(hpts);
					INP_WUNLOCK(inp);
					continue;
				}
-				inp->inp_hpts_request = 0;
+				tp->t_hpts_request = 0;
				/* Fall through we will so do it now */
			}
 
-			inp_hpts_release(inp);
-			tp = intotcpcb(inp);
-			MPASS(tp);
+			tcp_hpts_release(tp);
			if (set_cpu) {
				/*
				 * Setup so the next time we will move to
@@ -1318,7 +1339,7 @@ again:
				 * gets added to the hpts (not this one)
				 * :-)
				 */
-				tcp_set_hpts(inp);
+				tcp_set_hpts(tp);
			}
			CURVNET_SET(inp->inp_vnet);
			/* Lets do any logging that we might want to */
@@ -1331,16 +1352,17 @@ again:
				did_prefetch = 1;
			}
			/*
-			 * We set inp_hpts_calls to 1 before any possible output.
-			 * The contract with the transport is that if it cares about
-			 * hpts calling it should clear the flag. That way next time
-			 * it is called it will know it is hpts.
+			 * We set TF2_HPTS_CALLS before any possible output.
+			 * The contract with the transport is that if it cares
+			 * about hpts calling it should clear the flag. That
+			 * way next time it is called it will know it is hpts.
			 *
-			 * We also only call tfb_do_queued_segments() <or> tcp_output()
-			 * it is expected that if segments are queued and come in that
-			 * the final input mbuf will cause a call to output if it is needed.
+			 * We also only call tfb_do_queued_segments() <or>
+			 * tcp_output(). It is expected that if segments are
+			 * queued and come in that the final input mbuf will
+			 * cause a call to output if it is needed.
			 */
-			inp->inp_hpts_calls = 1;
+			tp->t_flags2 |= TF2_HPTS_CALLS;
			if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) &&
			    !STAILQ_EMPTY(&tp->t_inqueue)) {
				error = (*tp->t_fb->tfb_do_queued_segments)(tp, 0);
@@ -1352,44 +1374,6 @@ again:
			error = tcp_output(tp);
			if (error < 0)
				goto skip_pacing;
-			if (ninp) {
-				/*
-				 * If we have a nxt inp, see if we can
-				 * prefetch it. Note this may seem
-				 * "risky" since we have no locks (other
-				 * than the previous inp) and there no
-				 * assurance that ninp was not pulled while
-				 * we were processing inp and freed. If this
-				 * occurred it could mean that either:
-				 *
-				 * a) Its NULL (which is fine we won't go
-				 * here) <or> b) Its valid (which is cool we
-				 * will prefetch it) <or> c) The inp got
-				 * freed back to the slab which was
-				 * reallocated. Then the piece of memory was
-				 * re-used and something else (not an
-				 * address) is in inp_ppcb. If that occurs
-				 * we don't crash, but take a TLB shootdown
-				 * performance hit (same as if it was NULL
-				 * and we tried to pre-fetch it).
-				 *
-				 * Considering that the likelyhood of <c> is
-				 * quite rare we will take a risk on doing
-				 * this. If performance drops after testing
-				 * we can always take this out. NB: the
-				 * kern_prefetch on amd64 actually has
-				 * protection against a bad address now via
-				 * the DMAP_() tests. This will prevent the
-				 * TLB hit, and instead if <c> occurs just
-				 * cause us to load cache with a useless
-				 * address (to us).
-				 *
-				 * XXXGL: with tcpcb == inpcb, I'm unsure this
-				 * prefetch is still correct and useful.
-				 */
-				kern_prefetch(ninp, &prefetch_tp);
-				prefetch_tp = 1;
-			}
			INP_WUNLOCK(inp);
		skip_pacing:
			CURVNET_RESTORE();
@@ -1491,18 +1475,18 @@ no_run:
 }
 
 void
-__tcp_set_hpts(struct inpcb *inp, int32_t line)
+__tcp_set_hpts(struct tcpcb *tp, int32_t line)
 {
 	struct tcp_hpts_entry *hpts;
 	int failed;
 
-	INP_WLOCK_ASSERT(inp);
-	hpts = tcp_hpts_lock(inp);
-	if ((inp->inp_in_hpts == 0) &&
-	    (inp->inp_hpts_cpu_set == 0)) {
-		inp->inp_hpts_cpu = hpts_cpuid(inp, &failed);
+	INP_WLOCK_ASSERT(tptoinpcb(tp));
+
+	hpts = tcp_hpts_lock(tp);
+	if (tp->t_in_hpts == IHPTS_NONE && !(tp->t_flags2 & TF2_HPTS_CPU_SET)) {
+		tp->t_hpts_cpu = hpts_cpuid(tp, &failed);
 		if (failed == 0)
-			inp->inp_hpts_cpu_set = 1;
+			tp->t_flags2 |= TF2_HPTS_CPU_SET;
 	}
 	mtx_unlock(&hpts->p_mtx);
 }
diff --git a/sys/netinet/tcp_hpts.h b/sys/netinet/tcp_hpts.h
index 9bceca0fd340..dfa6eaf79bdc 100644
--- a/sys/netinet/tcp_hpts.h
+++ b/sys/netinet/tcp_hpts.h
@@ -111,10 +111,14 @@ struct hpts_diag {
  *
  */
 
-
 #ifdef _KERNEL
-void tcp_hpts_remove(struct inpcb *);
-bool tcp_in_hpts(struct inpcb *);
+void tcp_hpts_init(struct tcpcb *);
+void tcp_hpts_remove(struct tcpcb *);
+static bool
+tcp_in_hpts(struct tcpcb *tp)
+{
+	return (tp->t_in_hpts == IHPTS_ONQUEUE);
+}
 
 /*
  * To insert a TCB on the hpts you *must* be holding the
@@ -140,20 +144,18 @@ bool tcp_in_hpts(struct inpcb *);
  * that INP_WLOCK() or from destroying your TCB where again
  * you should already have the INP_WLOCK().
  */
-uint32_t tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line,
+uint32_t tcp_hpts_insert_diag(struct tcpcb *tp, uint32_t slot, int32_t line,
    struct hpts_diag *diag);
 #define	tcp_hpts_insert(inp, slot)	\
	tcp_hpts_insert_diag((inp), (slot), __LINE__, NULL)
 
-void __tcp_set_hpts(struct inpcb *inp, int32_t line);
+void __tcp_set_hpts(struct tcpcb *tp, int32_t line);
 #define	tcp_set_hpts(a)	__tcp_set_hpts(a, __LINE__)
 
 void tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason);
 
 void tcp_run_hpts(void);
 
-uint16_t hpts_random_cpu(struct inpcb *inp);
-
 extern int32_t tcp_min_hptsi_time;
 
 #endif /* _KERNEL */
diff --git a/sys/netinet/tcp_lro.c b/sys/netinet/tcp_lro.c
index 7cbf535a9263..76c345add1f8 100644
--- a/sys/netinet/tcp_lro.c
+++ b/sys/netinet/tcp_lro.c
@@ -1380,10 +1380,8 @@ tcp_lro_flush_tcphpts(struct lro_ctrl *lc, struct lro_entry *le)
		INP_WUNLOCK(inp);
		return (TCP_LRO_CANNOT);
	}
-	if ((inp->inp_irq_cpu_set == 0) && (lc->lro_cpu_is_set == 1)) {
-		inp->inp_irq_cpu = lc->lro_last_cpu;
-		inp->inp_irq_cpu_set = 1;
-	}
+	if (tp->t_lro_cpu == HPTS_CPU_NONE && lc->lro_cpu_is_set == 1)
+		tp->t_lro_cpu = lc->lro_last_cpu;
 
	/* Check if the transport doesn't support the needed optimizations. */
	if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
		INP_WUNLOCK(inp);
diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c
index f5cf362a57dc..f8c7557150dd 100644
--- a/sys/netinet/tcp_stacks/bbr.c
+++ b/sys/netinet/tcp_stacks/bbr.c
@@ -739,7 +739,7 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
	int32_t delay_calc = 0;
	uint32_t prev_delay = 0;
 
-	if (tcp_in_hpts(inp)) {
+	if (tcp_in_hpts(tp)) {
		/* A previous call is already set up */
		return;
	}
@@ -904,14 +904,14 @@ bbr_start_hpts_timer(struct tcp_bbr *bbr, struct tcpcb *tp, uint32_t cts, int32_
			inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE;
		bbr->rc_pacer_started = cts;
 
-		(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(slot),
+		(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(slot),
		    __LINE__, &diag);
		bbr->rc_timer_first = 0;
		bbr->bbr_timer_src = frm;
		bbr_log_to_start(bbr, cts, hpts_timeout, slot, 1);
		bbr_log_hpts_diag(bbr, cts, &diag);
	} else if (hpts_timeout) {
-		(void)tcp_hpts_insert_diag(inp, HPTS_USEC_TO_SLOTS(hpts_timeout),
+		(void)tcp_hpts_insert_diag(tp, HPTS_USEC_TO_SLOTS(hpts_timeout),
		    __LINE__, &diag);
		/*
		 * We add the flag here as well if the slot is set,
@@ -1050,8 +1050,8 @@ bbr_timer_audit(struct tcpcb *tp, struct tcp_bbr *bbr, uint32_t cts, struct sock
	 */
wrong_timer:
	if ((bbr->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) {
-		if (tcp_in_hpts(inp))
*** 791 LINES SKIPPED ***