svn commit: r335356 - in head/sys: kern netinet netinet/tcp_stacks
Matt Macy
mmacy at FreeBSD.org
Tue Jun 19 01:54:02 UTC 2018
Author: mmacy
Date: Tue Jun 19 01:54:00 2018
New Revision: 335356
URL: https://svnweb.freebsd.org/changeset/base/335356
Log:
convert inpcbinfo hash and info rwlocks to epoch + mutex
- Convert inpcbinfo info & hash locks to epoch for read and mutex for write
- Garbage collect code that handled INP_INFO_TRY_RLOCK failures as
INP_INFO_RLOCK which can no longer fail
When running 64 netperfs sending minimal sized packets on a 2x8x2 reduces
unhalted core cycles samples in rwlock rlock/runlock in udp_send from 51% to
3%.
Overall packet throughput rate limited by CPU affinity and NIC driver design
choices.
On the receiver unhalted core cycles samples in in_pcblookup_hash went from
13% to to 1.6%
Tested by LLNW and pho@
Reviewed by: jtl
Sponsored by: Limelight Networks
Differential Revision: https://reviews.freebsd.org/D15686
Modified:
head/sys/kern/subr_witness.c
head/sys/netinet/in_pcb.h
head/sys/netinet/tcp_hpts.c
head/sys/netinet/tcp_input.c
head/sys/netinet/tcp_stacks/rack.c
head/sys/netinet/tcp_timewait.c
Modified: head/sys/kern/subr_witness.c
==============================================================================
--- head/sys/kern/subr_witness.c Tue Jun 19 01:33:03 2018 (r335355)
+++ head/sys/kern/subr_witness.c Tue Jun 19 01:54:00 2018 (r335356)
@@ -561,14 +561,14 @@ static struct witness_order_list_entry order_lists[] =
/*
* UDP/IP
*/
- { "udp", &lock_class_rw },
+ { "udp", &lock_class_mtx_sleep },
{ "udpinp", &lock_class_rw },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
* TCP/IP
*/
- { "tcp", &lock_class_rw },
+ { "tcp", &lock_class_mtx_sleep },
{ "tcpinp", &lock_class_rw },
{ "so_snd", &lock_class_mtx_sleep },
{ NULL, NULL },
Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h Tue Jun 19 01:33:03 2018 (r335355)
+++ head/sys/netinet/in_pcb.h Tue Jun 19 01:54:00 2018 (r335356)
@@ -51,6 +51,8 @@
#include <sys/lock.h>
#include <sys/rwlock.h>
#include <net/vnet.h>
+#include <net/if.h>
+#include <net/if_var.h>
#include <vm/uma.h>
#endif
#include <sys/ck.h>
@@ -157,6 +159,7 @@ struct in_conninfo {
* Key:
* (b) - Protected by the hpts lock.
* (c) - Constant after initialization
+ * (e) - Protected by the net_epoch_prempt epoch
* (g) - Protected by the pcbgroup lock
* (i) - Protected by the inpcb lock
* (p) - Protected by the pcbinfo lock for the inpcb
@@ -231,7 +234,7 @@ struct inpcbpolicy;
struct m_snd_tag;
struct inpcb {
/* Cache line #1 (amd64) */
- CK_LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */
+ CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */
CK_LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */
struct rwlock inp_lock;
/* Cache line #2 (amd64) */
@@ -324,8 +327,8 @@ struct inpcb {
struct route_in6 inp_route6;
};
CK_LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */
- /* (p[w]) for list iteration */
- /* (p[r]/l) for addition/removal */
+ /* (e[r]) for list iteration */
+ /* (p[w]/l) for addition/removal */
struct epoch_context inp_epoch_ctx;
};
#endif /* _KERNEL */
@@ -436,22 +439,23 @@ struct in_pcblist {
* Locking key:
*
* (c) Constant or nearly constant after initialisation
+ * (e) - Protected by the net_epoch_prempt epoch
* (g) Locked by ipi_lock
* (l) Locked by ipi_list_lock
- * (h) Read using either ipi_hash_lock or inpcb lock; write requires both
+ * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock
* (p) Protected by one or more pcbgroup locks
* (x) Synchronisation properties poorly defined
*/
struct inpcbinfo {
/*
- * Global lock protecting full inpcb list traversal
+ * Global lock protecting inpcb list modification
*/
- struct rwlock ipi_lock;
+ struct mtx ipi_lock;
/*
* Global list of inpcbs on the protocol.
*/
- struct inpcbhead *ipi_listhead; /* (g/l) */
+ struct inpcbhead *ipi_listhead; /* [r](e) [w](g/l) */
u_int ipi_count; /* (l) */
/*
@@ -482,9 +486,9 @@ struct inpcbinfo {
u_int ipi_hashfields; /* (c) */
/*
- * Global lock protecting non-pcbgroup hash lookup tables.
+ * Global lock protecting modification non-pcbgroup hash lookup tables.
*/
- struct rwlock ipi_hash_lock;
+ struct mtx ipi_hash_lock;
/*
* Global hash of inpcbs, hashed by local and foreign addresses and
@@ -626,20 +630,18 @@ int inp_so_options(const struct inpcb *inp);
#endif /* _KERNEL */
#define INP_INFO_LOCK_INIT(ipi, d) \
- rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE)
-#define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock)
-#define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock)
-#define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock)
-#define INP_INFO_WLOCKED(ipi) rw_wowned(&(ipi)->ipi_lock)
-#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock)
-#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock)
-#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED)
-#define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED)
-#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED)
-#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED)
+ mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE)
+#define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock)
+#define INP_INFO_RLOCK(ipi) NET_EPOCH_ENTER()
+#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock)
+#define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock)
+#define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock)
+#define INP_INFO_RUNLOCK(ipi) NET_EPOCH_EXIT()
+#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock)
+#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_lock))
+#define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch())
+#define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED)
+#define INP_INFO_UNLOCK_ASSERT(ipi) MPASS(!in_epoch() && !mtx_owned(&(ipi)->ipi_lock))
#define INP_LIST_LOCK_INIT(ipi, d) \
rw_init_flags(&(ipi)->ipi_list_lock, (d), 0)
@@ -660,17 +662,14 @@ int inp_so_options(const struct inpcb *inp);
#define INP_LIST_UNLOCK_ASSERT(ipi) \
rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED)
-#define INP_HASH_LOCK_INIT(ipi, d) \
- rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0)
-#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock)
-#define INP_HASH_RLOCK(ipi) rw_rlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_WLOCK(ipi) rw_wlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_hash_lock)
-#define INP_HASH_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \
- RA_LOCKED)
-#define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \
- RA_WLOCKED)
+#define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF)
+#define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock)
+#define INP_HASH_RLOCK(ipi) NET_EPOCH_ENTER()
+#define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock)
+#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT()
+#define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock)
+#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_hash_lock))
+#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED);
#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \
MTX_DEF | MTX_DUPOK)
Modified: head/sys/netinet/tcp_hpts.c
==============================================================================
--- head/sys/netinet/tcp_hpts.c Tue Jun 19 01:33:03 2018 (r335355)
+++ head/sys/netinet/tcp_hpts.c Tue Jun 19 01:54:00 2018 (r335356)
@@ -184,9 +184,6 @@ TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_
static struct tcp_hptsi tcp_pace;
-static int
-tcp_hptsi_lock_inpinfo(struct inpcb *inp,
- struct tcpcb **tp);
static void tcp_wakehpts(struct tcp_hpts_entry *p);
static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
@@ -498,59 +495,6 @@ SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE
0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
-/*
- * Try to get the INP_INFO lock.
- *
- * This function always succeeds in getting the lock. It will clear
- * *tpp and return (1) if something critical changed while the inpcb
- * was unlocked. Otherwise, it will leave *tpp unchanged and return (0).
- *
- * This function relies on the fact that the hpts always holds a
- * reference on the inpcb while the segment is on the hptsi wheel and
- * in the input queue.
- *
- */
-static int
-tcp_hptsi_lock_inpinfo(struct inpcb *inp, struct tcpcb **tpp)
-{
- struct tcp_function_block *tfb;
- struct tcpcb *tp;
- void *ptr;
-
- /* Try the easy way. */
- if (INP_INFO_TRY_RLOCK(&V_tcbinfo))
- return (0);
-
- /*
- * OK, let's try the hard way. We'll save the function pointer block
- * to make sure that doesn't change while we aren't holding the
- * lock.
- */
- tp = *tpp;
- tfb = tp->t_fb;
- ptr = tp->t_fb_ptr;
- INP_WUNLOCK(inp);
- INP_INFO_RLOCK(&V_tcbinfo);
- INP_WLOCK(inp);
- /* If the session went away, return an error. */
- if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
- (inp->inp_flags2 & INP_FREED)) {
- *tpp = NULL;
- return (1);
- }
- /*
- * If the function block or stack-specific data block changed,
- * report an error.
- */
- tp = intotcpcb(inp);
- if ((tp->t_fb != tfb) && (tp->t_fb_ptr != ptr)) {
- *tpp = NULL;
- return (1);
- }
- return (0);
-}
-
-
static void
tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
@@ -1290,10 +1234,7 @@ out:
(m->m_pkthdr.pace_lock == TI_RLOCKED ||
tp->t_state != TCPS_ESTABLISHED)) {
ti_locked = TI_RLOCKED;
- if (tcp_hptsi_lock_inpinfo(inp, &tp)) {
- CURVNET_RESTORE();
- goto out;
- }
+ INP_INFO_RLOCK(&V_tcbinfo);
m = tp->t_in_pkt;
}
if (in_newts_every_tcb) {
@@ -1360,7 +1301,6 @@ out:
*/
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
(inp->inp_flags2 & INP_FREED)) {
- out_free:
while (m) {
m_freem(m);
m = n;
@@ -1376,8 +1316,7 @@ out:
if (ti_locked == TI_UNLOCKED &&
(tp->t_state != TCPS_ESTABLISHED)) {
ti_locked = TI_RLOCKED;
- if (tcp_hptsi_lock_inpinfo(inp, &tp))
- goto out_free;
+ INP_INFO_RLOCK(&V_tcbinfo);
}
} /** end while(m) */
} /** end if ((m != NULL) && (m == tp->t_in_pkt)) */
Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c Tue Jun 19 01:33:03 2018 (r335355)
+++ head/sys/netinet/tcp_input.c Tue Jun 19 01:54:00 2018 (r335356)
@@ -960,25 +960,10 @@ findpcb:
*
* XXXRW: It may be time to rethink timewait locking.
*/
-relocked:
if (inp->inp_flags & INP_TIMEWAIT) {
if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
- in_pcbref(inp);
- INP_WUNLOCK(inp);
- INP_INFO_RLOCK(&V_tcbinfo);
- ti_locked = TI_RLOCKED;
- INP_WLOCK(inp);
- if (in_pcbrele_wlocked(inp)) {
- inp = NULL;
- goto findpcb;
- } else if (inp->inp_flags & INP_DROPPED) {
- INP_WUNLOCK(inp);
- inp = NULL;
- goto findpcb;
- }
- } else
- ti_locked = TI_RLOCKED;
+ INP_INFO_RLOCK();
+ ti_locked = TI_RLOCKED;
}
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
@@ -1026,23 +1011,8 @@ relocked:
(tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) &&
!IS_FASTOPEN(tp->t_flags)))) {
if (ti_locked == TI_UNLOCKED) {
- if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) {
- in_pcbref(inp);
- INP_WUNLOCK(inp);
- INP_INFO_RLOCK(&V_tcbinfo);
- ti_locked = TI_RLOCKED;
- INP_WLOCK(inp);
- if (in_pcbrele_wlocked(inp)) {
- inp = NULL;
- goto findpcb;
- } else if (inp->inp_flags & INP_DROPPED) {
- INP_WUNLOCK(inp);
- inp = NULL;
- goto findpcb;
- }
- goto relocked;
- } else
- ti_locked = TI_RLOCKED;
+ INP_INFO_RLOCK();
+ ti_locked = TI_RLOCKED;
}
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
}
Modified: head/sys/netinet/tcp_stacks/rack.c
==============================================================================
--- head/sys/netinet/tcp_stacks/rack.c Tue Jun 19 01:33:03 2018 (r335355)
+++ head/sys/netinet/tcp_stacks/rack.c Tue Jun 19 01:54:00 2018 (r335356)
@@ -6837,34 +6837,8 @@ rack_do_segment(struct mbuf *m, struct tcphdr *th, str
* Initial input (ACK to SYN-ACK etc)lets go ahead and get
* it processed
*/
- if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo))
- ti_locked = TI_RLOCKED;
- if (ti_locked != TI_RLOCKED) {
- inp = tp->t_inpcb;
- tfb = tp->t_fb;
- in_pcbref(inp);
- INP_WUNLOCK(inp);
- INP_INFO_RLOCK(&V_tcbinfo);
- ti_locked = TI_RLOCKED;
- INP_WLOCK(inp);
- if (in_pcbrele_wlocked(inp))
- inp = NULL;
- if (inp == NULL || (inp->inp_flags2 & INP_FREED) ||
- (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) {
- /* The TCPCB went away. Free the packet. */
- INP_INFO_RUNLOCK(&V_tcbinfo);
- if (inp)
- INP_WUNLOCK(inp);
- m_freem(m);
- return;
- }
- /* If the stack changed, call the correct stack. */
- if (tp->t_fb != tfb) {
- tp->t_fb->tfb_tcp_do_segment(m, th, so, tp,
- drop_hdrlen, tlen, iptos, ti_locked);
- return;
- }
- }
+ INP_INFO_RLOCK();
+ ti_locked = TI_RLOCKED;
tcp_get_usecs(&tv);
rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
tlen, iptos, ti_locked, 0, &tv);
Modified: head/sys/netinet/tcp_timewait.c
==============================================================================
--- head/sys/netinet/tcp_timewait.c Tue Jun 19 01:33:03 2018 (r335355)
+++ head/sys/netinet/tcp_timewait.c Tue Jun 19 01:54:00 2018 (r335356)
@@ -707,54 +707,46 @@ tcp_tw_2msl_scan(int reuse)
in_pcbref(inp);
TW_RUNLOCK(V_tw_lock);
- if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) {
-
- INP_WLOCK(inp);
- tw = intotw(inp);
- if (in_pcbrele_wlocked(inp)) {
- if (__predict_true(tw == NULL)) {
- INP_INFO_RUNLOCK(&V_tcbinfo);
- continue;
- } else {
- /* This should not happen as in TIMEWAIT
- * state the inp should not be destroyed
- * before its tcptw. If INVARIANTS is
- * defined panic.
- */
+ INP_INFO_RLOCK(&V_tcbinfo);
+ INP_WLOCK(inp);
+ tw = intotw(inp);
+ if (in_pcbrele_wlocked(inp)) {
+ if (__predict_true(tw == NULL)) {
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ continue;
+ } else {
+ /* This should not happen as in TIMEWAIT
+ * state the inp should not be destroyed
+ * before its tcptw. If INVARIANTS is
+ * defined panic.
+ */
#ifdef INVARIANTS
- panic("%s: Panic before an infinite "
- "loop: INP_TIMEWAIT && (INP_FREED "
- "|| inp last reference) && tw != "
- "NULL", __func__);
+ panic("%s: Panic before an infinite "
+ "loop: INP_TIMEWAIT && (INP_FREED "
+ "|| inp last reference) && tw != "
+ "NULL", __func__);
#else
- log(LOG_ERR, "%s: Avoid an infinite "
- "loop: INP_TIMEWAIT && (INP_FREED "
- "|| inp last reference) && tw != "
- "NULL", __func__);
+ log(LOG_ERR, "%s: Avoid an infinite "
+ "loop: INP_TIMEWAIT && (INP_FREED "
+ "|| inp last reference) && tw != "
+ "NULL", __func__);
#endif
- INP_INFO_RUNLOCK(&V_tcbinfo);
- break;
- }
- }
-
- if (tw == NULL) {
- /* tcp_twclose() has already been called */
- INP_WUNLOCK(inp);
INP_INFO_RUNLOCK(&V_tcbinfo);
- continue;
+ break;
}
+ }
- tcp_twclose(tw, reuse);
+ if (tw == NULL) {
+ /* tcp_twclose() has already been called */
+ INP_WUNLOCK(inp);
INP_INFO_RUNLOCK(&V_tcbinfo);
- if (reuse)
- return tw;
- } else {
- /* INP_INFO lock is busy, continue later. */
- INP_WLOCK(inp);
- if (!in_pcbrele_wlocked(inp))
- INP_WUNLOCK(inp);
- break;
+ continue;
}
+
+ tcp_twclose(tw, reuse);
+ INP_INFO_RUNLOCK(&V_tcbinfo);
+ if (reuse)
+ return tw;
}
return NULL;
More information about the svn-src-head
mailing list