svn commit: r341646 - in head: share/man/man4 sys/netpfil/pf

Kristof Provost kp at FreeBSD.org
Thu Dec 6 19:27:16 UTC 2018


Author: kp
Date: Thu Dec  6 19:27:15 2018
New Revision: 341646
URL: https://svnweb.freebsd.org/changeset/base/341646

Log:
  pfsync: Performance improvement
  
  pfsync code is called for every new state, state update and state
  deletion in pf. While pf itself can operate on multiple states at the
  same time (on different cores, assuming the states hash to a different
  hashrow), pfsync only had a single lock.
  This greatly reduced throughput on multicore systems.
  
  Address this by splitting the pfsync queues into buckets, based on the
  state id. This ensures that updates for a given connection always end up
  in the same bucket, which allows pfsync to still collapse multiple
  updates into one, while allowing multiple cores to proceed at the same
  time.
  
  The number of buckets is tunable, but defaults to 2 x number of cpus.
  Benchmarking has shown improvement, depending on hardware and setup, from ~30%
  to ~100%.
  
  MFC after:	1 week
  Sponsored by:	Orange Business Services
  Differential Revision:	https://reviews.freebsd.org/D18373

Modified:
  head/share/man/man4/pfsync.4
  head/sys/netpfil/pf/if_pfsync.c

Modified: head/share/man/man4/pfsync.4
==============================================================================
--- head/share/man/man4/pfsync.4	Thu Dec  6 19:20:00 2018	(r341645)
+++ head/share/man/man4/pfsync.4	Thu Dec  6 19:27:15 2018	(r341646)
@@ -26,7 +26,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd August 18, 2017
+.Dd December 6, 2018
 .Dt PFSYNC 4
 .Os
 .Sh NAME
@@ -130,6 +130,13 @@ See
 .Xr carp 4
 for more information.
 Default value is 240.
+.It Va net.pfsync.pfsync_buckets
+The number of
+.Nm
+buckets.
+This affects the performance and memory tradeoff.
+Defaults to twice the number of CPUs.
+Change only if benchmarks show this helps on your workload.
 .El
 .Sh EXAMPLES
 .Nm

Modified: head/sys/netpfil/pf/if_pfsync.c
==============================================================================
--- head/sys/netpfil/pf/if_pfsync.c	Thu Dec  6 19:20:00 2018	(r341645)
+++ head/sys/netpfil/pf/if_pfsync.c	Thu Dec  6 19:27:15 2018	(r341646)
@@ -77,6 +77,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/protosw.h>
+#include <sys/smp.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
 #include <sys/sysctl.h>
@@ -106,6 +107,8 @@ __FBSDID("$FreeBSD$");
 	sizeof(struct pfsync_header) + \
 	sizeof(struct pfsync_subheader) )
 
+struct pfsync_bucket;
+
 struct pfsync_pkt {
 	struct ip *ip;
 	struct in_addr src;
@@ -164,7 +167,7 @@ static struct pfsync_q pfsync_qs[] = {
 };
 
 static void	pfsync_q_ins(struct pf_state *, int, bool);
-static void	pfsync_q_del(struct pf_state *, bool);
+static void	pfsync_q_del(struct pf_state *, bool, struct pfsync_bucket *);
 
 static void	pfsync_update_state(struct pf_state *);
 
@@ -183,6 +186,28 @@ struct pfsync_deferral {
 	struct mbuf			*pd_m;
 };
 
+struct pfsync_softc;
+
+struct pfsync_bucket
+{
+	int			b_id;
+	struct pfsync_softc	*b_sc;
+	struct mtx		b_mtx;
+	struct callout		b_tmo;
+	int			b_flags;
+#define	PFSYNCF_BUCKET_PUSH	0x00000001
+
+	size_t			b_len;
+	TAILQ_HEAD(, pf_state)			b_qs[PFSYNC_S_COUNT];
+	TAILQ_HEAD(, pfsync_upd_req_item)	b_upd_req_list;
+	TAILQ_HEAD(, pfsync_deferral)		b_deferrals;
+	u_int			b_deferred;
+	void			*b_plus;
+	size_t			b_pluslen;
+
+	struct  ifaltq b_snd;
+};
+
 struct pfsync_softc {
 	/* Configuration */
 	struct ifnet		*sc_ifp;
@@ -192,20 +217,12 @@ struct pfsync_softc {
 	uint32_t		sc_flags;
 #define	PFSYNCF_OK		0x00000001
 #define	PFSYNCF_DEFER		0x00000002
-#define	PFSYNCF_PUSH		0x00000004
 	uint8_t			sc_maxupdates;
 	struct ip		sc_template;
-	struct callout		sc_tmo;
 	struct mtx		sc_mtx;
 
 	/* Queued data */
-	size_t			sc_len;
-	TAILQ_HEAD(, pf_state)			sc_qs[PFSYNC_S_COUNT];
-	TAILQ_HEAD(, pfsync_upd_req_item)	sc_upd_req_list;
-	TAILQ_HEAD(, pfsync_deferral)		sc_deferrals;
-	u_int			sc_deferred;
-	void			*sc_plus;
-	size_t			sc_pluslen;
+	struct pfsync_bucket	*sc_buckets;
 
 	/* Bulk update info */
 	struct mtx		sc_bulk_mtx;
@@ -223,6 +240,10 @@ struct pfsync_softc {
 #define	PFSYNC_UNLOCK(sc)	mtx_unlock(&(sc)->sc_mtx)
 #define	PFSYNC_LOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_mtx, MA_OWNED)
 
+#define PFSYNC_BUCKET_LOCK(b)		mtx_lock(&(b)->b_mtx)
+#define PFSYNC_BUCKET_UNLOCK(b)		mtx_unlock(&(b)->b_mtx)
+#define PFSYNC_BUCKET_LOCK_ASSERT(b)	mtx_assert(&(b)->b_mtx, MA_OWNED)
+
 #define	PFSYNC_BLOCK(sc)	mtx_lock(&(sc)->sc_bulk_mtx)
 #define	PFSYNC_BUNLOCK(sc)	mtx_unlock(&(sc)->sc_bulk_mtx)
 #define	PFSYNC_BLOCK_ASSERT(sc)	mtx_assert(&(sc)->sc_bulk_mtx, MA_OWNED)
@@ -239,7 +260,8 @@ VNET_DEFINE_STATIC(int, pfsync_carp_adj) = CARP_MAXSKE
 #define	V_pfsync_carp_adj	VNET(pfsync_carp_adj)
 
 static void	pfsync_timeout(void *);
-static void	pfsync_push(struct pfsync_softc *);
+static void	pfsync_push(struct pfsync_bucket *);
+static void	pfsync_push_all(struct pfsync_softc *);
 static void	pfsyncintr(void *);
 static int	pfsync_multicast_setup(struct pfsync_softc *, struct ifnet *,
 		    void *);
@@ -249,12 +271,16 @@ static void	pfsync_pointers_uninit(void);
 static int	pfsync_init(void);
 static void	pfsync_uninit(void);
 
+static unsigned long pfsync_buckets;
+
 SYSCTL_NODE(_net, OID_AUTO, pfsync, CTLFLAG_RW, 0, "PFSYNC");
 SYSCTL_STRUCT(_net_pfsync, OID_AUTO, stats, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(pfsyncstats), pfsyncstats,
     "PFSYNC statistics (struct pfsyncstats, net/if_pfsync.h)");
 SYSCTL_INT(_net_pfsync, OID_AUTO, carp_demotion_factor, CTLFLAG_RW,
     &VNET_NAME(pfsync_carp_adj), 0, "pfsync's CARP demotion factor adjustment");
+SYSCTL_ULONG(_net_pfsync, OID_AUTO, pfsync_buckets, CTLFLAG_RDTUN,
+    &pfsync_buckets, 0, "Number of pfsync hash buckets");
 
 static int	pfsync_clone_create(struct if_clone *, int, caddr_t);
 static void	pfsync_clone_destroy(struct ifnet *);
@@ -270,10 +296,10 @@ static void	pfsync_undefer_state(struct pf_state *, in
 static void	pfsync_defer_tmo(void *);
 
 static void	pfsync_request_update(u_int32_t, u_int64_t);
-static void	pfsync_update_state_req(struct pf_state *);
+static bool	pfsync_update_state_req(struct pf_state *);
 
 static void	pfsync_drop(struct pfsync_softc *);
-static void	pfsync_sendout(int);
+static void	pfsync_sendout(int, int);
 static void	pfsync_send_plus(void *, size_t);
 
 static void	pfsync_bulk_start(void);
@@ -285,7 +311,10 @@ static void	pfsync_detach_ifnet(struct ifnet *);
 #ifdef IPSEC
 static void	pfsync_update_net_tdb(struct pfsync_tdb *);
 #endif
+static struct pfsync_bucket	*pfsync_get_bucket(struct pfsync_softc *,
+		    struct pf_state *);
 
+
 #define PFSYNC_MAX_BULKTRIES	12
 
 VNET_DEFINE(struct if_clone *, pfsync_cloner);
@@ -296,21 +325,16 @@ pfsync_clone_create(struct if_clone *ifc, int unit, ca
 {
 	struct pfsync_softc *sc;
 	struct ifnet *ifp;
-	int q;
+	struct pfsync_bucket *b;
+	int c, q;
 
 	if (unit != 0)
 		return (EINVAL);
 
-	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
-	sc->sc_flags |= PFSYNCF_OK;
+	if (!pfsync_buckets)
+		pfsync_buckets = mp_ncpus * 2;
 
-	for (q = 0; q < PFSYNC_S_COUNT; q++)
-		TAILQ_INIT(&sc->sc_qs[q]);
-
-	TAILQ_INIT(&sc->sc_upd_req_list);
-	TAILQ_INIT(&sc->sc_deferrals);
-
-	sc->sc_len = PFSYNC_MINPKT;
+	sc = malloc(sizeof(struct pfsync_softc), M_PFSYNC, M_WAITOK | M_ZERO);
 	sc->sc_maxupdates = 128;
 
 	ifp = sc->sc_ifp = if_alloc(IFT_PFSYNC);
@@ -323,12 +347,10 @@ pfsync_clone_create(struct if_clone *ifc, int unit, ca
 	ifp->if_ioctl = pfsyncioctl;
 	ifp->if_output = pfsyncoutput;
 	ifp->if_type = IFT_PFSYNC;
-	ifp->if_snd.ifq_maxlen = ifqmaxlen;
 	ifp->if_hdrlen = sizeof(struct pfsync_header);
 	ifp->if_mtu = ETHERMTU;
 	mtx_init(&sc->sc_mtx, pfsyncname, NULL, MTX_DEF);
 	mtx_init(&sc->sc_bulk_mtx, "pfsync bulk", NULL, MTX_DEF);
-	callout_init(&sc->sc_tmo, 1);
 	callout_init_mtx(&sc->sc_bulk_tmo, &sc->sc_bulk_mtx, 0);
 	callout_init_mtx(&sc->sc_bulkfail_tmo, &sc->sc_bulk_mtx, 0);
 
@@ -336,6 +358,27 @@ pfsync_clone_create(struct if_clone *ifc, int unit, ca
 
 	bpfattach(ifp, DLT_PFSYNC, PFSYNC_HDRLEN);
 
+	sc->sc_buckets = mallocarray(pfsync_buckets, sizeof(*sc->sc_buckets),
+	    M_PFSYNC, M_ZERO | M_WAITOK);
+	for (c = 0; c < pfsync_buckets; c++) {
+		b = &sc->sc_buckets[c];
+		mtx_init(&b->b_mtx, pfsyncname, NULL, MTX_DEF);
+
+		b->b_id = c;
+		b->b_sc = sc;
+		b->b_len = PFSYNC_MINPKT;
+
+		for (q = 0; q < PFSYNC_S_COUNT; q++)
+			TAILQ_INIT(&b->b_qs[q]);
+
+		TAILQ_INIT(&b->b_upd_req_list);
+		TAILQ_INIT(&b->b_deferrals);
+
+		callout_init(&b->b_tmo, 1);
+
+		b->b_snd.ifq_maxlen = ifqmaxlen;
+	}
+
 	V_pfsyncif = sc;
 
 	return (0);
@@ -345,29 +388,36 @@ static void
 pfsync_clone_destroy(struct ifnet *ifp)
 {
 	struct pfsync_softc *sc = ifp->if_softc;
+	struct pfsync_bucket *b;
+	int c;
 
-	/*
-	 * At this stage, everything should have already been
-	 * cleared by pfsync_uninit(), and we have only to
-	 * drain callouts.
-	 */
-	while (sc->sc_deferred > 0) {
-		struct pfsync_deferral *pd = TAILQ_FIRST(&sc->sc_deferrals);
+	for (c = 0; c < pfsync_buckets; c++) {
+		b = &sc->sc_buckets[c];
+		/*
+		 * At this stage, everything should have already been
+		 * cleared by pfsync_uninit(), and we have only to
+		 * drain callouts.
+		 */
+		while (b->b_deferred > 0) {
+			struct pfsync_deferral *pd =
+			    TAILQ_FIRST(&b->b_deferrals);
 
-		TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
-		sc->sc_deferred--;
-		if (callout_stop(&pd->pd_tmo) > 0) {
-			pf_release_state(pd->pd_st);
-			m_freem(pd->pd_m);
-			free(pd, M_PFSYNC);
-		} else {
-			pd->pd_refs++;
-			callout_drain(&pd->pd_tmo);
-			free(pd, M_PFSYNC);
+			TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
+			b->b_deferred--;
+			if (callout_stop(&pd->pd_tmo) > 0) {
+				pf_release_state(pd->pd_st);
+				m_freem(pd->pd_m);
+				free(pd, M_PFSYNC);
+			} else {
+				pd->pd_refs++;
+				callout_drain(&pd->pd_tmo);
+				free(pd, M_PFSYNC);
+			}
 		}
+
+		callout_drain(&b->b_tmo);
 	}
 
-	callout_drain(&sc->sc_tmo);
 	callout_drain(&sc->sc_bulkfail_tmo);
 	callout_drain(&sc->sc_bulk_tmo);
 
@@ -383,6 +433,8 @@ pfsync_clone_destroy(struct ifnet *ifp)
 		pfsync_multicast_cleanup(sc);
 	mtx_destroy(&sc->sc_mtx);
 	mtx_destroy(&sc->sc_bulk_mtx);
+
+	free(sc->sc_buckets, M_PFSYNC);
 	free(sc, M_PFSYNC);
 
 	V_pfsyncif = NULL;
@@ -546,7 +598,7 @@ pfsync_state_import(struct pfsync_state *sp, u_int8_t 
 		st->state_flags &= ~PFSTATE_NOSYNC;
 		if (st->state_flags & PFSTATE_ACK) {
 			pfsync_q_ins(st, PFSYNC_S_IACK, true);
-			pfsync_push(sc);
+			pfsync_push_all(sc);
 		}
 	}
 	st->state_flags &= ~PFSTATE_ACK;
@@ -785,9 +837,7 @@ pfsync_in_iack(struct pfsync_pkt *pkt, struct mbuf *m,
 			continue;
 
 		if (st->state_flags & PFSTATE_ACK) {
-			PFSYNC_LOCK(V_pfsyncif);
 			pfsync_undefer_state(st, 0);
-			PFSYNC_UNLOCK(V_pfsyncif);
 		}
 		PF_STATE_UNLOCK(st);
 	}
@@ -876,9 +926,7 @@ pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, 
 		}
 
 		if (st->state_flags & PFSTATE_ACK) {
-			PFSYNC_LOCK(sc);
 			pfsync_undefer_state(st, 1);
-			PFSYNC_UNLOCK(sc);
 		}
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
@@ -912,9 +960,7 @@ pfsync_in_upd(struct pfsync_pkt *pkt, struct mbuf *m, 
 
 			pfsync_update_state(st);
 			PF_STATE_UNLOCK(st);
-			PFSYNC_LOCK(sc);
-			pfsync_push(sc);
-			PFSYNC_UNLOCK(sc);
+			pfsync_push_all(sc);
 			continue;
 		}
 		PF_STATE_UNLOCK(st);
@@ -960,16 +1006,14 @@ pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m
 		st = pf_find_state_byid(up->id, up->creatorid);
 		if (st == NULL) {
 			/* We don't have this state. Ask for it. */
-			PFSYNC_LOCK(sc);
+			PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
 			pfsync_request_update(up->creatorid, up->id);
-			PFSYNC_UNLOCK(sc);
+			PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
 			continue;
 		}
 
 		if (st->state_flags & PFSTATE_ACK) {
-			PFSYNC_LOCK(sc);
 			pfsync_undefer_state(st, 1);
-			PFSYNC_UNLOCK(sc);
 		}
 
 		if (st->key[PF_SK_WIRE]->proto == IPPROTO_TCP)
@@ -1003,9 +1047,7 @@ pfsync_in_upd_c(struct pfsync_pkt *pkt, struct mbuf *m
 
 			pfsync_update_state(st);
 			PF_STATE_UNLOCK(st);
-			PFSYNC_LOCK(sc);
-			pfsync_push(sc);
-			PFSYNC_UNLOCK(sc);
+			pfsync_push_all(sc);
 			continue;
 		}
 		PF_STATE_UNLOCK(st);
@@ -1283,6 +1325,7 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t dat
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct pfsyncreq pfsyncr;
 	int error;
+	int c;
 
 	switch (cmd) {
 	case SIOCSIFFLAGS:
@@ -1303,10 +1346,12 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t dat
 		    ifr->ifr_mtu > sc->sc_sync_if->if_mtu)
 			return (EINVAL);
 		if (ifr->ifr_mtu < ifp->if_mtu) {
-			PFSYNC_LOCK(sc);
-			if (sc->sc_len > PFSYNC_MINPKT)
-				pfsync_sendout(1);
-			PFSYNC_UNLOCK(sc);
+			for (c = 0; c < pfsync_buckets; c++) {
+				PFSYNC_BUCKET_LOCK(&sc->sc_buckets[c]);
+				if (sc->sc_buckets[c].b_len > PFSYNC_MINPKT)
+					pfsync_sendout(1, c);
+				PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]);
+			}
 		}
 		ifp->if_mtu = ifr->ifr_mtu;
 		break;
@@ -1379,12 +1424,16 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t dat
 			break;
 		}
 
-		if (sc->sc_len > PFSYNC_MINPKT &&
-		    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
-		    (sc->sc_sync_if != NULL &&
-		    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
-		    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
-			pfsync_sendout(1);
+		for (c = 0; c < pfsync_buckets; c++) {
+			PFSYNC_BUCKET_LOCK(&sc->sc_buckets[c]);
+			if (sc->sc_buckets[c].b_len > PFSYNC_MINPKT &&
+			    (sifp->if_mtu < sc->sc_ifp->if_mtu ||
+			    (sc->sc_sync_if != NULL &&
+			    sifp->if_mtu < sc->sc_sync_if->if_mtu) ||
+			    sifp->if_mtu < MCLBYTES - sizeof(struct ip)))
+				pfsync_sendout(1, c);
+			PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[c]);
+		}
 
 		if (imo->imo_membership)
 			pfsync_multicast_cleanup(sc);
@@ -1421,8 +1470,10 @@ pfsyncioctl(struct ifnet *ifp, u_long cmd, caddr_t dat
 		sc->sc_flags &= ~PFSYNCF_OK;
 		if (V_pf_status.debug >= PF_DEBUG_MISC)
 			printf("pfsync: requesting bulk update\n");
-		pfsync_request_update(0, 0);
 		PFSYNC_UNLOCK(sc);
+		PFSYNC_BUCKET_LOCK(&sc->sc_buckets[0]);
+		pfsync_request_update(0, 0);
+		PFSYNC_BUCKET_UNLOCK(&sc->sc_buckets[0]);
 		PFSYNC_BLOCK(sc);
 		sc->sc_ureq_sent = time_uptime;
 		callout_reset(&sc->sc_bulkfail_tmo, 5 * hz, pfsync_bulk_fail,
@@ -1483,33 +1534,37 @@ pfsync_drop(struct pfsync_softc *sc)
 {
 	struct pf_state *st, *next;
 	struct pfsync_upd_req_item *ur;
-	int q;
+	struct pfsync_bucket *b;
+	int c, q;
 
-	for (q = 0; q < PFSYNC_S_COUNT; q++) {
-		if (TAILQ_EMPTY(&sc->sc_qs[q]))
-			continue;
+	for (c = 0; c < pfsync_buckets; c++) {
+		b = &sc->sc_buckets[c];
+		for (q = 0; q < PFSYNC_S_COUNT; q++) {
+			if (TAILQ_EMPTY(&b->b_qs[q]))
+				continue;
 
-		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, next) {
-			KASSERT(st->sync_state == q,
-				("%s: st->sync_state == q",
-					__func__));
-			st->sync_state = PFSYNC_S_NONE;
-			pf_release_state(st);
+			TAILQ_FOREACH_SAFE(st, &b->b_qs[q], sync_list, next) {
+				KASSERT(st->sync_state == q,
+					("%s: st->sync_state == q",
+						__func__));
+				st->sync_state = PFSYNC_S_NONE;
+				pf_release_state(st);
+			}
+			TAILQ_INIT(&b->b_qs[q]);
 		}
-		TAILQ_INIT(&sc->sc_qs[q]);
-	}
 
-	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
-		TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
-		free(ur, M_PFSYNC);
-	}
+		while ((ur = TAILQ_FIRST(&b->b_upd_req_list)) != NULL) {
+			TAILQ_REMOVE(&b->b_upd_req_list, ur, ur_entry);
+			free(ur, M_PFSYNC);
+		}
 
-	sc->sc_plus = NULL;
-	sc->sc_len = PFSYNC_MINPKT;
+		b->b_len = PFSYNC_MINPKT;
+		b->b_plus = NULL;
+	}
 }
 
 static void
-pfsync_sendout(int schedswi)
+pfsync_sendout(int schedswi, int c)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct ifnet *ifp = sc->sc_ifp;
@@ -1519,27 +1574,28 @@ pfsync_sendout(int schedswi)
 	struct pfsync_subheader *subh;
 	struct pf_state *st, *st_next;
 	struct pfsync_upd_req_item *ur;
+	struct pfsync_bucket *b = &sc->sc_buckets[c];
 	int offset;
 	int q, count = 0;
 
 	KASSERT(sc != NULL, ("%s: null sc", __func__));
-	KASSERT(sc->sc_len > PFSYNC_MINPKT,
-	    ("%s: sc_len %zu", __func__, sc->sc_len));
-	PFSYNC_LOCK_ASSERT(sc);
+	KASSERT(b->b_len > PFSYNC_MINPKT,
+	    ("%s: sc_len %zu", __func__, b->b_len));
+	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	if (ifp->if_bpf == NULL && sc->sc_sync_if == NULL) {
 		pfsync_drop(sc);
 		return;
 	}
 
-	m = m_get2(max_linkhdr + sc->sc_len, M_NOWAIT, MT_DATA, M_PKTHDR);
+	m = m_get2(max_linkhdr + b->b_len, M_NOWAIT, MT_DATA, M_PKTHDR);
 	if (m == NULL) {
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OERRORS, 1);
 		V_pfsyncstats.pfsyncs_onomem++;
 		return;
 	}
 	m->m_data += max_linkhdr;
-	m->m_len = m->m_pkthdr.len = sc->sc_len;
+	m->m_len = m->m_pkthdr.len = b->b_len;
 
 	/* build the ip header */
 	ip = (struct ip *)m->m_data;
@@ -1555,19 +1611,19 @@ pfsync_sendout(int schedswi)
 	offset += sizeof(*ph);
 
 	ph->version = PFSYNC_VERSION;
-	ph->len = htons(sc->sc_len - sizeof(*ip));
+	ph->len = htons(b->b_len - sizeof(*ip));
 	bcopy(V_pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH);
 
 	/* walk the queues */
 	for (q = 0; q < PFSYNC_S_COUNT; q++) {
-		if (TAILQ_EMPTY(&sc->sc_qs[q]))
+		if (TAILQ_EMPTY(&b->b_qs[q]))
 			continue;
 
 		subh = (struct pfsync_subheader *)(m->m_data + offset);
 		offset += sizeof(*subh);
 
 		count = 0;
-		TAILQ_FOREACH_SAFE(st, &sc->sc_qs[q], sync_list, st_next) {
+		TAILQ_FOREACH_SAFE(st, &b->b_qs[q], sync_list, st_next) {
 			KASSERT(st->sync_state == q,
 				("%s: st->sync_state == q",
 					__func__));
@@ -1581,7 +1637,7 @@ pfsync_sendout(int schedswi)
 			pf_release_state(st);
 			count++;
 		}
-		TAILQ_INIT(&sc->sc_qs[q]);
+		TAILQ_INIT(&b->b_qs[q]);
 
 		bzero(subh, sizeof(*subh));
 		subh->action = pfsync_qs[q].action;
@@ -1589,13 +1645,13 @@ pfsync_sendout(int schedswi)
 		V_pfsyncstats.pfsyncs_oacts[pfsync_qs[q].action] += count;
 	}
 
-	if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
+	if (!TAILQ_EMPTY(&b->b_upd_req_list)) {
 		subh = (struct pfsync_subheader *)(m->m_data + offset);
 		offset += sizeof(*subh);
 
 		count = 0;
-		while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
-			TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
+		while ((ur = TAILQ_FIRST(&b->b_upd_req_list)) != NULL) {
+			TAILQ_REMOVE(&b->b_upd_req_list, ur, ur_entry);
 
 			bcopy(&ur->ur_msg, m->m_data + offset,
 			    sizeof(ur->ur_msg));
@@ -1611,11 +1667,11 @@ pfsync_sendout(int schedswi)
 	}
 
 	/* has someone built a custom region for us to add? */
-	if (sc->sc_plus != NULL) {
-		bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen);
-		offset += sc->sc_pluslen;
+	if (b->b_plus != NULL) {
+		bcopy(b->b_plus, m->m_data + offset, b->b_pluslen);
+		offset += b->b_pluslen;
 
-		sc->sc_plus = NULL;
+		b->b_plus = NULL;
 	}
 
 	subh = (struct pfsync_subheader *)(m->m_data + offset);
@@ -1629,24 +1685,24 @@ pfsync_sendout(int schedswi)
 	/* we're done, let's put it on the wire */
 	if (ifp->if_bpf) {
 		m->m_data += sizeof(*ip);
-		m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip);
+		m->m_len = m->m_pkthdr.len = b->b_len - sizeof(*ip);
 		BPF_MTAP(ifp, m);
 		m->m_data -= sizeof(*ip);
-		m->m_len = m->m_pkthdr.len = sc->sc_len;
+		m->m_len = m->m_pkthdr.len = b->b_len;
 	}
 
 	if (sc->sc_sync_if == NULL) {
-		sc->sc_len = PFSYNC_MINPKT;
+		b->b_len = PFSYNC_MINPKT;
 		m_freem(m);
 		return;
 	}
 
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OPACKETS, 1);
 	if_inc_counter(sc->sc_ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
-	sc->sc_len = PFSYNC_MINPKT;
+	b->b_len = PFSYNC_MINPKT;
 
-	if (!_IF_QFULL(&sc->sc_ifp->if_snd))
-		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
+	if (!_IF_QFULL(&b->b_snd))
+		_IF_ENQUEUE(&b->b_snd, m);
 	else {
 		m_freem(m);
 		if_inc_counter(sc->sc_ifp, IFCOUNTER_OQDROPS, 1);
@@ -1659,6 +1715,7 @@ static void
 pfsync_insert_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	if (st->state_flags & PFSTATE_NOSYNC)
 		return;
@@ -1672,12 +1729,12 @@ pfsync_insert_state(struct pf_state *st)
 	KASSERT(st->sync_state == PFSYNC_S_NONE,
 		("%s: st->sync_state %u", __func__, st->sync_state));
 
-	PFSYNC_LOCK(sc);
-	if (sc->sc_len == PFSYNC_MINPKT)
-		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+	PFSYNC_BUCKET_LOCK(b);
+	if (b->b_len == PFSYNC_MINPKT)
+		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	pfsync_q_ins(st, PFSYNC_S_INS, true);
-	PFSYNC_UNLOCK(sc);
+	PFSYNC_BUCKET_UNLOCK(b);
 
 	st->sync_updates = 0;
 }
@@ -1687,6 +1744,7 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_deferral *pd;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	if (m->m_flags & (M_BCAST|M_MCAST))
 		return (0);
@@ -1699,13 +1757,13 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
 		return (0);
 	}
 
-	 if (sc->sc_deferred >= 128)
-		pfsync_undefer(TAILQ_FIRST(&sc->sc_deferrals), 0);
+	if (b->b_deferred >= 128)
+		pfsync_undefer(TAILQ_FIRST(&b->b_deferrals), 0);
 
 	pd = malloc(sizeof(*pd), M_PFSYNC, M_NOWAIT);
 	if (pd == NULL)
 		return (0);
-	sc->sc_deferred++;
+	b->b_deferred++;
 
 	m->m_flags |= M_SKIP_FIREWALL;
 	st->state_flags |= PFSTATE_ACK;
@@ -1716,11 +1774,11 @@ pfsync_defer(struct pf_state *st, struct mbuf *m)
 	pf_ref_state(st);
 	pd->pd_m = m;
 
-	TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry);
-	callout_init_mtx(&pd->pd_tmo, &sc->sc_mtx, CALLOUT_RETURNUNLOCKED);
+	TAILQ_INSERT_TAIL(&b->b_deferrals, pd, pd_entry);
+	callout_init_mtx(&pd->pd_tmo, &b->b_mtx, CALLOUT_RETURNUNLOCKED);
 	callout_reset(&pd->pd_tmo, 10, pfsync_defer_tmo, pd);
 
-	pfsync_push(sc);
+	pfsync_push(b);
 
 	return (1);
 }
@@ -1731,11 +1789,12 @@ pfsync_undefer(struct pfsync_deferral *pd, int drop)
 	struct pfsync_softc *sc = pd->pd_sc;
 	struct mbuf *m = pd->pd_m;
 	struct pf_state *st = pd->pd_st;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
-	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
-	sc->sc_deferred--;
+	TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
+	b->b_deferred--;
 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
 	free(pd, M_PFSYNC);
 	pf_release_state(st);
@@ -1743,8 +1802,8 @@ pfsync_undefer(struct pfsync_deferral *pd, int drop)
 	if (drop)
 		m_freem(m);
 	else {
-		_IF_ENQUEUE(&sc->sc_ifp->if_snd, m);
-		pfsync_push(sc);
+		_IF_ENQUEUE(&b->b_snd, m);
+		pfsync_push(b);
 	}
 }
 
@@ -1755,13 +1814,14 @@ pfsync_defer_tmo(void *arg)
 	struct pfsync_softc *sc = pd->pd_sc;
 	struct mbuf *m = pd->pd_m;
 	struct pf_state *st = pd->pd_st;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
 
-	TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry);
-	sc->sc_deferred--;
+	TAILQ_REMOVE(&b->b_deferrals, pd, pd_entry);
+	b->b_deferred--;
 	pd->pd_st->state_flags &= ~PFSTATE_ACK;	/* XXX: locking! */
 	if (pd->pd_refs == 0)
 		free(pd, M_PFSYNC);
@@ -1779,40 +1839,52 @@ pfsync_undefer_state(struct pf_state *st, int drop)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	struct pfsync_deferral *pd;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_LOCK(b);
 
-	TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) {
+	TAILQ_FOREACH(pd, &b->b_deferrals, pd_entry) {
 		 if (pd->pd_st == st) {
 			if (callout_stop(&pd->pd_tmo) > 0)
 				pfsync_undefer(pd, drop);
+
+			PFSYNC_BUCKET_UNLOCK(b);
 			return;
 		}
 	}
+	PFSYNC_BUCKET_UNLOCK(b);
 
 	panic("%s: unable to find deferred state", __func__);
 }
 
+static struct pfsync_bucket*
+pfsync_get_bucket(struct pfsync_softc *sc, struct pf_state *st)
+{
+	int c = PF_IDHASH(st) % pfsync_buckets;
+	return &sc->sc_buckets[c];
+}
+
 static void
 pfsync_update_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	bool sync = false, ref = true;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PF_STATE_LOCK_ASSERT(st);
-	PFSYNC_LOCK(sc);
+	PFSYNC_BUCKET_LOCK(b);
 
 	if (st->state_flags & PFSTATE_ACK)
 		pfsync_undefer_state(st, 0);
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
-			pfsync_q_del(st, true);
-		PFSYNC_UNLOCK(sc);
+			pfsync_q_del(st, true, b);
+		PFSYNC_BUCKET_UNLOCK(b);
 		return;
 	}
 
-	if (sc->sc_len == PFSYNC_MINPKT)
-		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+	if (b->b_len == PFSYNC_MINPKT)
+		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	switch (st->sync_state) {
 	case PFSYNC_S_UPD_C:
@@ -1828,7 +1900,7 @@ pfsync_update_state(struct pf_state *st)
 		break;
 
 	case PFSYNC_S_IACK:
-		pfsync_q_del(st, false);
+		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
@@ -1842,26 +1914,27 @@ pfsync_update_state(struct pf_state *st)
 	}
 
 	if (sync || (time_uptime - st->pfsync_time) < 2)
-		pfsync_push(sc);
+		pfsync_push(b);
 
-	PFSYNC_UNLOCK(sc);
+	PFSYNC_BUCKET_UNLOCK(b);
 }
 
 static void
 pfsync_request_update(u_int32_t creatorid, u_int64_t id)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_bucket *b = &sc->sc_buckets[0];
 	struct pfsync_upd_req_item *item;
 	size_t nlen = sizeof(struct pfsync_upd_req);
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	/*
 	 * This code does a bit to prevent multiple update requests for the
 	 * same state being generated. It searches current subheader queue,
 	 * but it doesn't lookup into queue of already packed datagrams.
 	 */
-	TAILQ_FOREACH(item, &sc->sc_upd_req_list, ur_entry)
+	TAILQ_FOREACH(item, &b->b_upd_req_list, ur_entry)
 		if (item->ur_msg.id == id &&
 		    item->ur_msg.creatorid == creatorid)
 			return;
@@ -1873,46 +1946,47 @@ pfsync_request_update(u_int32_t creatorid, u_int64_t i
 	item->ur_msg.id = id;
 	item->ur_msg.creatorid = creatorid;
 
-	if (TAILQ_EMPTY(&sc->sc_upd_req_list))
+	if (TAILQ_EMPTY(&b->b_upd_req_list))
 		nlen += sizeof(struct pfsync_subheader);
 
-	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
-		pfsync_sendout(1);
+	if (b->b_len + nlen > sc->sc_ifp->if_mtu) {
+		pfsync_sendout(1, 0);
 
 		nlen = sizeof(struct pfsync_subheader) +
 		    sizeof(struct pfsync_upd_req);
 	}
 
-	TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry);
-	sc->sc_len += nlen;
+	TAILQ_INSERT_TAIL(&b->b_upd_req_list, item, ur_entry);
+	b->b_len += nlen;
 }
 
-static void
+static bool
 pfsync_update_state_req(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
-	bool ref = true;
+	bool ref = true, full = false;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
 	PF_STATE_LOCK_ASSERT(st);
-	PFSYNC_LOCK(sc);
+	PFSYNC_BUCKET_LOCK(b);
 
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
-			pfsync_q_del(st, true);
-		PFSYNC_UNLOCK(sc);
-		return;
+			pfsync_q_del(st, true, b);
+		PFSYNC_BUCKET_UNLOCK(b);
+		return (full);
 	}
 
 	switch (st->sync_state) {
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_IACK:
-		pfsync_q_del(st, false);
+		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
 	case PFSYNC_S_NONE:
 		pfsync_q_ins(st, PFSYNC_S_UPD, ref);
-		pfsync_push(sc);
+		pfsync_push(b);
 		break;
 
 	case PFSYNC_S_INS:
@@ -1925,38 +1999,44 @@ pfsync_update_state_req(struct pf_state *st)
 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
 	}
 
-	PFSYNC_UNLOCK(sc);
+	if ((sc->sc_ifp->if_mtu - b->b_len) < sizeof(struct pfsync_state))
+		full = true;
+
+	PFSYNC_BUCKET_UNLOCK(b);
+
+	return (full);
 }
 
 static void
 pfsync_delete_state(struct pf_state *st)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 	bool ref = true;
 
-	PFSYNC_LOCK(sc);
+	PFSYNC_BUCKET_LOCK(b);
 	if (st->state_flags & PFSTATE_ACK)
 		pfsync_undefer_state(st, 1);
 	if (st->state_flags & PFSTATE_NOSYNC) {
 		if (st->sync_state != PFSYNC_S_NONE)
-			pfsync_q_del(st, true);
-		PFSYNC_UNLOCK(sc);
+			pfsync_q_del(st, true, b);
+		PFSYNC_BUCKET_UNLOCK(b);
 		return;
 	}
 
-	if (sc->sc_len == PFSYNC_MINPKT)
-		callout_reset(&sc->sc_tmo, 1 * hz, pfsync_timeout, V_pfsyncif);
+	if (b->b_len == PFSYNC_MINPKT)
+		callout_reset(&b->b_tmo, 1 * hz, pfsync_timeout, b);
 
 	switch (st->sync_state) {
 	case PFSYNC_S_INS:
 		/* We never got to tell the world so just forget about it. */
-		pfsync_q_del(st, true);
+		pfsync_q_del(st, true, b);
 		break;
 
 	case PFSYNC_S_UPD_C:
 	case PFSYNC_S_UPD:
 	case PFSYNC_S_IACK:
-		pfsync_q_del(st, false);
+		pfsync_q_del(st, false, b);
 		ref = false;
 		/* FALLTHROUGH */
 
@@ -1968,13 +2048,12 @@ pfsync_delete_state(struct pf_state *st)
 		panic("%s: unexpected sync state %d", __func__, st->sync_state);
 	}
 
-	PFSYNC_UNLOCK(sc);
+	PFSYNC_BUCKET_UNLOCK(b);
 }
 
 static void
 pfsync_clear_states(u_int32_t creatorid, const char *ifname)
 {
-	struct pfsync_softc *sc = V_pfsyncif;
 	struct {
 		struct pfsync_subheader subh;
 		struct pfsync_clr clr;
@@ -1989,9 +2068,7 @@ pfsync_clear_states(u_int32_t creatorid, const char *i
 	strlcpy(r.clr.ifname, ifname, sizeof(r.clr.ifname));
 	r.clr.creatorid = creatorid;
 
-	PFSYNC_LOCK(sc);
 	pfsync_send_plus(&r, sizeof(r));
-	PFSYNC_UNLOCK(sc);
 }
 
 static void
@@ -1999,48 +2076,48 @@ pfsync_q_ins(struct pf_state *st, int q, bool ref)
 {
 	struct pfsync_softc *sc = V_pfsyncif;
 	size_t nlen = pfsync_qs[q].len;
+	struct pfsync_bucket *b = pfsync_get_bucket(sc, st);
 
-	PFSYNC_LOCK_ASSERT(sc);
+	PFSYNC_BUCKET_LOCK_ASSERT(b);
 
 	KASSERT(st->sync_state == PFSYNC_S_NONE,
 		("%s: st->sync_state %u", __func__, st->sync_state));
-	KASSERT(sc->sc_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
-	    sc->sc_len));
+	KASSERT(b->b_len >= PFSYNC_MINPKT, ("pfsync pkt len is too low %zu",
+	    b->b_len));
 
-	if (TAILQ_EMPTY(&sc->sc_qs[q]))
+	if (TAILQ_EMPTY(&b->b_qs[q]))
 		nlen += sizeof(struct pfsync_subheader);
 
-	if (sc->sc_len + nlen > sc->sc_ifp->if_mtu) {
-		pfsync_sendout(1);
+	if (b->b_len + nlen > sc->sc_ifp->if_mtu) {
+		pfsync_sendout(1, b->b_id);
 
 		nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len;
 	}
 
-	sc->sc_len += nlen;
-	TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list);
+	b->b_len += nlen;
+	TAILQ_INSERT_TAIL(&b->b_qs[q], st, sync_list);
 	st->sync_state = q;
 	if (ref)
 		pf_ref_state(st);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list