svn commit: r193102 - in projects/pnet/sys: conf i386/conf kern net netatalk netinet netinet6 netipsec netipx netnatm

Sat May 30 17:39:51 UTC 2009

Author: rwatson
Date: Sat May 30 17:39:50 2009
New Revision: 193102
URL: http://svn.freebsd.org/changeset/base/193102

Log:
  The netisr2 code is now settling out, so replace the existing netisr code
  with it entirely, and disable various compatibility mode parts for the
  old code.  More to do but this is the basic file move and make it build
  commit.
  
  Discussed with:	bz

Deleted:
  projects/pnet/sys/i386/conf/NETISR2
  projects/pnet/sys/net/netisr2.c
  projects/pnet/sys/net/netisr2.h
Modified:
  projects/pnet/sys/conf/NOTES
  projects/pnet/sys/conf/files
  projects/pnet/sys/conf/options
  projects/pnet/sys/kern/kern_poll.c
  projects/pnet/sys/net/netisr.c
  projects/pnet/sys/net/netisr.h
  projects/pnet/sys/net/rtsock.c
  projects/pnet/sys/netatalk/ddp_usrreq.c
  projects/pnet/sys/netinet/if_ether.c
  projects/pnet/sys/netinet/igmp.c
  projects/pnet/sys/netinet/ip_divert.c
  projects/pnet/sys/netinet/ip_input.c
  projects/pnet/sys/netinet6/ip6_input.c
  projects/pnet/sys/netipsec/ipsec_input.c
  projects/pnet/sys/netipx/ipx_input.c
  projects/pnet/sys/netnatm/natm_proto.c

Modified: projects/pnet/sys/conf/NOTES
==============================================================================

--- projects/pnet/sys/conf/NOTES	Sat May 30 17:26:55 2009	(r193101)
+++ projects/pnet/sys/conf/NOTES	Sat May 30 17:39:50 2009	(r193102)
@@ -512,11 +512,6 @@ options 	HWPMC_HOOKS		# Other necessary 
 # NETWORKING OPTIONS
 
 #
-# Support for parallel netisr threads.
-#
-options 	NETISR2
-
-#
 # Protocol families
 #
 options 	INET			#Internet communications protocols

Modified: projects/pnet/sys/conf/files
==============================================================================
--- projects/pnet/sys/conf/files	Sat May 30 17:26:55 2009	(r193101)
+++ projects/pnet/sys/conf/files	Sat May 30 17:39:50 2009	(r193102)
@@ -2157,7 +2157,6 @@ net/if_vlan.c			optional vlan
 net/mppcc.c			optional netgraph_mppc_compression
 net/mppcd.c			optional netgraph_mppc_compression
 net/netisr.c			standard
-net/netisr2.c			optional netisr2
 net/pfil.c			optional ether | inet
 net/radix.c			standard
 net/radix_mpath.c		standard

Modified: projects/pnet/sys/conf/options
==============================================================================
--- projects/pnet/sys/conf/options	Sat May 30 17:26:55 2009	(r193101)
+++ projects/pnet/sys/conf/options	Sat May 30 17:39:50 2009	(r193102)
@@ -413,7 +413,6 @@ MBUF_STRESS_TEST
 MROUTING		opt_mrouting.h
 NCP
 NETATALK		opt_atalk.h
-NETISR2			opt_netisr.h
 NFSLOCKD
 RADIX_MPATH		opt_mpath.h
 ROUTETABLES		opt_route.h

Modified: projects/pnet/sys/kern/kern_poll.c
==============================================================================
--- projects/pnet/sys/kern/kern_poll.c	Sat May 30 17:26:55 2009	(r193101)
+++ projects/pnet/sys/kern/kern_poll.c	Sat May 30 17:39:50 2009	(r193102)
@@ -28,7 +28,6 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_netisr.h"
 #include "opt_route.h"
 #include "opt_device_polling.h"
 
@@ -46,7 +45,6 @@ __FBSDID("$FreeBSD$");
 
 #include <net/if.h>			/* for IFF_* flags		*/
 #include <net/netisr.h>			/* for NETISR_POLL		*/
-#include <net/netisr2.h>
 #include <net/route.h>
 #include <net/vnet.h>
 
@@ -110,10 +108,8 @@ SYSCTL_NODE(_kern, OID_AUTO, polling, CT
 SYSCTL_UINT(_kern_polling, OID_AUTO, burst, CTLFLAG_RD,
 	&poll_burst, 0, "Current polling burst size");
 
-#ifdef NETISR2
 static int	netisr_poll_scheduled;
 static int	netisr_pollmore_scheduled;
-#endif
 
 static int poll_burst_max_sysctl(SYSCTL_HANDLER_ARGS)
 {
@@ -269,10 +265,6 @@ init_device_poll(void)
 {
 
 	mtx_init(&poll_mtx, "polling", NULL, MTX_DEF);
-#ifndef NETISR2
-	netisr_register(NETISR_POLL, (netisr_t *)netisr_poll, NULL, 0);
-	netisr_register(NETISR_POLLMORE, (netisr_t *)netisr_pollmore, NULL, 0);
-#endif
 }
 SYSINIT(device_poll, SI_SUB_CLOCKS, SI_ORDER_MIDDLE, init_device_poll, NULL);
 
@@ -321,13 +313,9 @@ hardclock_device_poll(void)
 		if (phase != 0)
 			suspect++;
 		phase = 1;
-#ifdef NETISR2
 		netisr_poll_scheduled = 1;
 		netisr_pollmore_scheduled = 1;
 		netisr2_sched_poll();
-#else
-		schednetisrbits(1 << NETISR_POLL | 1 << NETISR_POLLMORE);
-#endif
 		phase = 2;
 	}
 	if (pending_polls++ > 0)
@@ -378,22 +366,16 @@ netisr_pollmore()
 	int kern_load;
 
 	mtx_lock(&poll_mtx);
-#ifdef NETISR2
 	if (!netisr_pollmore_scheduled) {
 		mtx_unlock(&poll_mtx);
 		return;
 	}
 	netisr_pollmore_scheduled = 0;
-#endif
 	phase = 5;
 	if (residual_burst > 0) {
-#ifdef NETISR2
 		netisr_poll_scheduled = 1;
 		netisr_pollmore_scheduled = 1;
 		netisr2_sched_poll();
-#else
-		schednetisrbits(1 << NETISR_POLL | 1 << NETISR_POLLMORE);
-#endif
 		mtx_unlock(&poll_mtx);
 		/* will run immediately on return, followed by netisrs */
 		return;
@@ -423,21 +405,16 @@ netisr_pollmore()
 		poll_burst -= (poll_burst / 8);
 		if (poll_burst < 1)
 			poll_burst = 1;
-#ifdef NETISR2
 		netisr_poll_scheduled = 1;
 		netisr_pollmore_scheduled = 1;
 		netisr2_sched_poll();
-#else
-		schednetisrbits(1 << NETISR_POLL | 1 << NETISR_POLLMORE);
-#endif
 		phase = 6;
 	}
 	mtx_unlock(&poll_mtx);
 }
 
 /*
- * netisr_poll is scheduled by schednetisr when appropriate, typically once
- * per tick.
+ * netisr_poll is typically scheduled once per tick.
  */
 void
 netisr_poll(void)
@@ -446,13 +423,11 @@ netisr_poll(void)
 	enum poll_cmd arg = POLL_ONLY;
 
 	mtx_lock(&poll_mtx);
-#ifdef NETISR2
 	if (!netisr_poll_scheduled) {
 		mtx_unlock(&poll_mtx);
 		return;
 	}
 	netisr_poll_scheduled = 0;
-#endif
 	phase = 3;
 	if (residual_burst == 0) { /* first call in this tick */
 		microuptime(&poll_start_t);

Modified: projects/pnet/sys/net/netisr.c
==============================================================================
--- projects/pnet/sys/net/netisr.c	Sat May 30 17:26:55 2009	(r193101)
+++ projects/pnet/sys/net/netisr.c	Sat May 30 17:39:50 2009	(r193102)
@@ -1,6 +1,5 @@
 /*-
- * Copyright (c) 2001,2002,2003 Jonathan Lemon <jlemon at FreeBSD.org>
- * Copyright (c) 1997, Stefan Esser <se at freebsd.org>
+ * Copyright (c) 2007-2009 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -23,233 +22,1069 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+/*
+ * netisr2 is a packet dispatch service, allowing synchronous (directly
+ * dispatched) and asynchronous (deferred dispatch) processing of packets by
+ * registered protocol handlers.  Callers pass a protocol identifier and
+ * packet to netisr2, along with a direct dispatch hint, and work will either
+ * be immediately processed with the registered handler, or passed to a
+ * kernel software interrupt (SWI) thread for deferred dispatch.  Callers
+ * will generally select one or the other based on:
+ *
+ * - Might directly dispatching a netisr handler lead to code reentrance or
+ *   lock recursion, such as entering the socket code from the socket code.
+ * - Might directly dispatching a netisr handler lead to recursive
+ *   processing, such as when decapsulating several wrapped layers of tunnel
+ *   information (IPSEC within IPSEC within ...).
  *
- * $FreeBSD$
+ * Maintaining ordering for protocol streams is a critical design concern.
+ * Enforcing ordering limits the opportunity for concurrency, but maintains
+ * the strong ordering requirements found in some protocols, such as TCP.  Of
+ * related concern is CPU affinity--it is desirable to process all data
+ * associated with a particular stream on the same CPU over time in order to
+ * avoid acquiring locks associated with the connection on different CPUs,
+ * keep connection data in one cache, and to generally encourage associated
+ * user threads to live on the same CPU as the stream.  It's also desirable
+ * to avoid lock migration and contention where locks are associated with
+ * more than one flow.
+ *
+ * netisr2 supports several policy variations, represented by the
+ * NETISR_POLICY_* constants, allowing protocols to play a varying role in
+ * identifying flows, assigning work to CPUs, etc.  These are described in
+ * detail in netisr.h.
  */
 
+#include "opt_ddb.h"
 #include "opt_device_polling.h"
-#include "opt_netisr.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
-#include <sys/rtprio.h>
-#include <sys/systm.h>
-#include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
+#include <sys/interrupt.h>
 #include <sys/lock.h>
-#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
 #include <sys/proc.h>
-#include <sys/random.h>
-#include <sys/resourcevar.h>
+#include <sys/rmlock.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/socket.h>
 #include <sys/sysctl.h>
-#include <sys/unistd.h>
-#include <sys/vimage.h>
-#include <machine/atomic.h>
-#include <machine/cpu.h>
-#include <machine/stdarg.h>
+#include <sys/systm.h>
 
-#include <sys/mbuf.h>
-#include <sys/socket.h>
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
 
 #include <net/if.h>
-#include <net/if_types.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
 
-volatile unsigned int	netisr;	/* scheduling bits for network */
+/*-
+ * Synchronize use and modification of the registered netisr data structures;
+ * acquire a write lock while modifying the set of registered protocols to
+ * prevent partially registered or unregistered protocols from being run.
+ *
+ * The following data structures and fields are protected by this lock:
+ *
+ * - The np array, including all fields of struct netisr_proto.
+ * - The nws array, including all fields of struct netisr_workstream.
+ * - The nws_array array.
+ *
+ * Note: the NETISR2_LOCKING define controls whether read locks are acquired
+ * in packet processing paths requiring netisr registration stability.  This
+ * is disabled by default as it can lead to a measurable performance
+ * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and
+ * because netisr registration and unregistration is extremely rare at
+ * runtime.  If it becomes more common, this decision should be revisited.
+ *
+ * XXXRW: rmlocks don't support assertions.
+ */
+static struct rmlock	netisr_rmlock;
+#define	NETISR_LOCK_INIT()	rm_init_flags(&netisr_rmlock, "netisr", \
+				    RM_NOWITNESS)
+#define	NETISR_LOCK_ASSERT()
+#define	NETISR_RLOCK(tracker)	rm_rlock(&netisr_rmlock, (tracker))
+#define	NETISR_RUNLOCK(tracker)	rm_runlock(&netisr_rmlock, (tracker))
+#define	NETISR_WLOCK()		rm_wlock(&netisr_rmlock)
+#define	NETISR_WUNLOCK()	rm_wunlock(&netisr_rmlock)
+/* #define	NETISR2_LOCKING */
+
+SYSCTL_NODE(_net, OID_AUTO, isr2, CTLFLAG_RW, 0, "netisr2");
+
+/*-
+ * Three direct dispatch policies are supported:
+ *
+ * - Always defer: all work is scheduled for a netisr, regardless of context.
+ *   (!direct_enable)
+ *
+ * - Hybrid: if the executing context allows direct dispatch, and we're
+ *   running on the CPU the work would be done on, then direct dispatch if it
+ *   wouldn't violate ordering constraints on the workstream.
+ *   (direct_enable && !direct_force)
+ *
+ * - Always direct: if the executing context allows direct dispatch, always
+ *   direct dispatch.  (direct_enable && direct_force)
+ *
+ * Notice that changing the global policy could lead to short periods of
+ * misordered processing, but this is considered acceptable as compared to
+ * the complexity of enforcing ordering during policy changes.
+ */
+static int	netisr_direct_force = 1;	/* Always direct dispatch. */
+SYSCTL_INT(_net_isr2, OID_AUTO, direct_force, CTLFLAG_RW,
+    &netisr_direct_force, 0, "Force direct dispatch");
+
+static int	netisr_direct_enable = 1;	/* Enable direct dispatch. */
+SYSCTL_INT(_net_isr2, OID_AUTO, direct_enable, CTLFLAG_RW,
+    &netisr_direct_enable, 0, "Enable direct dispatch");
+
+/*
+ * Allow the administrator to limit the number of threads (CPUs) to use for
+ * netisr2.  We don't check netisr_maxthreads before creating the thread for
+ * CPU 0, so in practice we ignore values <= 1.  This must be set at boot.
+ * We will create at most one thread per CPU.
+ */
+static int	netisr_maxthreads = 1;		/* Max number of threads. */
+TUNABLE_INT("net.isr2.maxthreads", &netisr_maxthreads);
+SYSCTL_INT(_net_isr2, OID_AUTO, maxthreads, CTLFLAG_RD,
+    &netisr_maxthreads, 0,
+    "Use at most this many CPUs for netisr2 processing");
+
+static int	netisr_bindthreads = 0;		/* Bind threads to CPUs. */
+TUNABLE_INT("net.isr2.bindthreads", &netisr_bindthreads);
+SYSCTL_INT(_net_isr2, OID_AUTO, bindthreads, CTLFLAG_RD,
+    &netisr_bindthreads, 0, "Bind netisr2 threads to CPUs.");
+
+/*
+ * Limit per-workstream queues to at most net.isr2.maxqlimit, both for
+ * initial configuration and later modification using netisr2_setqlimit().
+ */
+#define	NETISR_DEFAULT_MAXQLIMIT	10240
+static int	netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT;
+SYSCTL_INT(_net_isr2, OID_AUTO, maxqlimit, CTLFLAG_RD,
+    &netisr_maxqlimit, 0,
+    "Maximum netisr2 per-protocol, per-CPU queue depth.");
+
+/*
+ * Each protocol is described by a struct netisr_proto, which holds all
+ * global per-protocol information.  This data structure is set up by
+ * netisr2_register(), and derived from the public struct netisr_handler.
+ */
+struct netisr_proto {
+	const char	*np_name;	/* Character string protocol name. */
+	netisr_t	*np_handler;	/* Protocol handler. */
+	netisr_m2flow_t	*np_m2flow;	/* Query flow for untagged packet. */
+	netisr_m2cpuid_t *np_m2cpuid;	/* Query CPU to process packet on. */
+	u_int		 np_qlimit;	/* Maximum per-CPU queue depth. */
+	u_int		 np_policy;	/* Work placement policy. */
+};
+
+#define	NETISR_MAXPROT		32		/* Compile-time limit. */
+
+/*
+ * The np array describes all registered protocols, indexed by protocol
+ * number.
+ */
+static struct netisr_proto	np[NETISR_MAXPROT];
+
+/*
+ * Protocol-specific work for each workstream is described by struct
+ * netisr_work.  Each work descriptor consists of an mbuf queue and
+ * statistics.
+ */
+struct netisr_work {
+	/*
+	 * Packet queue, linked by m_nextpkt.
+	 */
+	struct mbuf	*nw_head;
+	struct mbuf	*nw_tail;
+	u_int		 nw_len;
+	u_int		 nw_qlimit;
+	u_int		 nw_watermark;
 
-struct netisr {
-	netisr_t	*ni_handler;
-	struct ifqueue	*ni_queue;
-	int		ni_flags;
-} netisrs[32];
+	/*
+	 * Statistics -- written unlocked, but mostly from curcpu.
+	 */
+	u_int64_t	 nw_dispatched; /* Number of direct dispatches. */
+	u_int64_t	 nw_hybrid_dispatched; /* "" hybrid dispatches. */
+	u_int64_t	 nw_qdrops;	/* "" drops. */
+	u_int64_t	 nw_queued;	/* "" enqueues. */
+	u_int64_t	 nw_handled;	/* "" handled in worker. */
+};
 
-static void *net_ih;
+/*
+ * Workstreams hold a set of ordered work across each protocol, and are
+ * described by netisr_workstream.  Each workstream is associated with a
+ * worker thread, which in turn is pinned to a CPU.  Work associated with a
+ * workstream can be processed in other threads during direct dispatch;
+ * concurrent processing is prevented by the NWS_RUNNING flag, which
+ * indicates that a thread is already processing the work queue.
+ */
+struct netisr_workstream {
+	struct intr_event *nws_intr_event;	/* Handler for stream. */
+	void		*nws_swi_cookie;	/* swi(9) cookie for stream. */
+	struct mtx	 nws_mtx;		/* Synchronize work. */
+	u_int		 nws_cpu;		/* CPU pinning. */
+	u_int		 nws_flags;		/* Wakeup flags. */
+	u_int		 nws_pendingbits;	/* Scheduled protocols. */
 
+	/*
+	 * Each protocol has per-workstream data.
+	 */
+	struct netisr_work	nws_work[NETISR_MAXPROT];
+} __aligned(CACHE_LINE_SIZE);
+
+/*
+ * Per-CPU workstream data, indexed by CPU ID.
+ */
+static struct netisr_workstream		 nws[MAXCPU];
+
+/*
+ * Map contiguous values between 0 and nws_count into CPU IDs appropriate for
+ * indexing the nws[] array.  This allows constructions of the form
+ * nws[nws_array[arbitraryvalue % nws_count]].
+ */
+static u_int				 nws_array[MAXCPU];
+
+/*
+ * Number of registered workstreams.  Will be at most the number of running
+ * CPUs once fully started.
+ */
+static u_int				 nws_count;
+
+/*
+ * Per-workstream flags.
+ */
+#define	NWS_RUNNING	0x00000001	/* Currently running in a thread. */
+#define	NWS_DISPATCHING	0x00000002	/* Currently being direct-dispatched. */
+#define	NWS_SCHEDULED	0x00000004	/* Signal issued. */
+
+/*
+ * Synchronization for each workstream: a mutex protects all mutable fields
+ * in each stream, including per-protocol state (mbuf queues).  The SWI is
+ * woken up if asynchronous dispatch is required.
+ */
+#define	NWS_LOCK(s)		mtx_lock(&(s)->nws_mtx)
+#define	NWS_LOCK_ASSERT(s)	mtx_assert(&(s)->nws_mtx, MA_OWNED)
+#define	NWS_UNLOCK(s)		mtx_unlock(&(s)->nws_mtx)
+#define	NWS_SIGNAL(s)		swi_sched((s)->nws_swi_cookie, 0)
+
+/*
+ * Utility routines for protocols that implement their own mapping of flows
+ * to CPUs.
+ */
+u_int
+netisr2_get_cpucount(void)
+{
+
+	return (nws_count);
+}
+
+u_int
+netisr2_get_cpuid(u_int cpunumber)
+{
+
+	KASSERT(cpunumber < nws_count, ("netisr2_get_cpuid: %u > %u",
+	    cpunumber, nws_count));
+
+	return (nws_array[cpunumber]);
+}
+
+/*
+ * The default implementation of flow -> CPU ID mapping.
+ *
+ * Non-static so that protocols can use it to map their own work to specific
+ * CPUs in a manner consistent with netisr2 for affinity purposes.
+ */
+u_int
+netisr2_default_flow2cpu(u_int flowid)
+{
+
+	return (nws_array[flowid % nws_count]);
+}
+
+/*
+ * Register a new netisr handler, which requires initializing per-protocol
+ * fields for each workstream.  All netisr2 work is briefly suspended while
+ * the protocol is installed.
+ */
 void
-legacy_setsoftnet(void)
+netisr2_register(const struct netisr_handler *nhp)
 {
-	swi_sched(net_ih, 0);
+	struct netisr_work *npwp;
+	const char *name;
+	u_int i, proto;
+
+	proto = nhp->nh_proto;
+	name = nhp->nh_name;
+
+	/*
+	 * Test that the requested registration is valid.
+	 */
+	KASSERT(nhp->nh_name != NULL,
+	    ("netisr2_register: nh_name NULL for %d", proto));
+	KASSERT(nhp->nh_handler != NULL,
+	    ("netisr2_register: nh_handler NULL for %s", name));
+	KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE ||
+	    nhp->nh_policy == NETISR_POLICY_FLOW ||
+	    nhp->nh_policy == NETISR_POLICY_CPU,
+	    ("netisr2_register: unsupported nh_policy %u for %s",
+	    nhp->nh_policy, name));
+	KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW ||
+	    nhp->nh_m2flow == NULL,
+	    ("netisr2_register: nh_policy != FLOW but m2flow defined for %s",
+	    name));
+	KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL,
+	    ("netisr2_register: nh_policy != CPU but m2cpuid defined for %s",
+	    name));
+	KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL,
+	    ("netisr2_register: nh_policy == CPU but m2cpuid not defined for "
+	    "%s", name));
+	KASSERT(nhp->nh_qlimit != 0,
+	    ("netisr2_register: nh_qlimit 0 for %s", name));
+	KASSERT(proto < NETISR_MAXPROT,
+	    ("netisr2_register(%d, %s): protocol too big", proto, name));
+
+	/*
+	 * Test that no existing registration exists for this protocol.
+	 */
+	NETISR_WLOCK();
+	KASSERT(np[proto].np_name == NULL,
+	    ("netisr2_register(%d, %s): name present", proto, name));
+	KASSERT(np[proto].np_handler == NULL,
+	    ("netisr2_register(%d, %s): handler present", proto, name));
+
+	np[proto].np_name = name;
+	np[proto].np_handler = nhp->nh_handler;
+	np[proto].np_m2flow = nhp->nh_m2flow;
+	np[proto].np_m2cpuid = nhp->nh_m2cpuid;
+	if (nhp->nh_qlimit > netisr_maxqlimit) {
+		printf("netisr2_register: %s requested queue limit %u "
+		    "capped to net.isr2.maxqlimit %u\n", name,
+		    nhp->nh_qlimit, netisr_maxqlimit);
+		np[proto].np_qlimit = netisr_maxqlimit;
+	} else
+		np[proto].np_qlimit = nhp->nh_qlimit;
+	np[proto].np_policy = nhp->nh_policy;
+	for (i = 0; i < MAXCPU; i++) {
+		npwp = &nws[i].nws_work[proto];
+		bzero(npwp, sizeof(*npwp));
+		npwp->nw_qlimit = nhp->nh_qlimit;
+	}
+	NETISR_WUNLOCK();
 }
 
+/*
+ * Clear drop counters across all workstreams for a protocol.
+ */
 void
-netisr_register(int num, netisr_t *handler, struct ifqueue *inq, int flags)
+netisr2_clearqdrops(const struct netisr_handler *nhp)
 {
-	
-	KASSERT(!(num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs))),
-	    ("bad isr %d", num));
-	KASSERT(flags == 0, ("netisr_register: bad flags 0x%x\n", flags));
-	netisrs[num].ni_handler = handler;
-	netisrs[num].ni_queue = inq;
-	netisrs[num].ni_flags = flags;
+	struct netisr_work *npwp;
+#ifdef INVARIANTS
+	const char *name;
+#endif
+	u_int i, proto;
+
+	proto = nhp->nh_proto;
+#ifdef INVARIANTS
+	name = nhp->nh_name;
+#endif
+	KASSERT(proto < NETISR_MAXPROT,
+	    ("netisr_clearqdrops(%d): protocol too big for %s", proto, name));
+
+	NETISR_WLOCK();
+	KASSERT(np[proto].np_handler != NULL,
+	    ("netisr_clearqdrops(%d): protocol not registered for %s", proto,
+	    name));
+
+	for (i = 0; i < MAXCPU; i++) {
+		npwp = &nws[i].nws_work[proto];
+		npwp->nw_qdrops = 0;
+	}
+	NETISR_WUNLOCK();
 }
 
+/*
+ * Query the current drop counters across all workstreams for a protocol.
+ */
 void
-netisr_unregister(int num)
+netisr2_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp)
 {
-	struct netisr *ni;
-	
-	KASSERT(!(num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs))),
-	    ("bad isr %d", num));
-	ni = &netisrs[num];
-	ni->ni_handler = NULL;
-	if (ni->ni_queue != NULL)
-		IF_DRAIN(ni->ni_queue);
-	ni->ni_queue = NULL;
+	struct netisr_work *npwp;
+	struct rm_priotracker tracker;
+#ifdef INVARIANTS
+	const char *name;
+#endif
+	u_int i, proto;
+
+	*qdropp = 0;
+	proto = nhp->nh_proto;
+#ifdef INVARIANTS
+	name = nhp->nh_name;
+#endif
+	KASSERT(proto < NETISR_MAXPROT,
+	    ("netisr_getqdrops(%d): protocol too big for %s", proto, name));
+
+	NETISR_RLOCK(&tracker);
+	KASSERT(np[proto].np_handler != NULL,
+	    ("netisr_getqdrops(%d): protocol not registered for %s", proto,
+	    name));
+
+	for (i = 0; i < MAXCPU; i++) {
+		npwp = &nws[i].nws_work[proto];
+		*qdropp += npwp->nw_qdrops;
+	}
+	NETISR_RUNLOCK(&tracker);
 }
 
-struct isrstat {
-	int	isrs_count;			/* dispatch count */
-	int	isrs_directed;			/* ...directly dispatched */
-	int	isrs_deferred;			/* ...queued instead */
-	int	isrs_queued;			/* intentionally queueued */
-	int	isrs_drop;			/* dropped 'cuz no handler */
-	int	isrs_swi_count;			/* swi_net handlers called */
-};
-static struct isrstat isrstat;
+/*
+ * Query the current queue limit for per-workstream queues for a protocol.
+ */
+void
+netisr2_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp)
+{
+	struct rm_priotracker tracker;
+#ifdef INVARIANTS
+	const char *name;
+#endif
+	u_int proto;
+
+	proto = nhp->nh_proto;
+#ifdef INVARIANTS
+	name = nhp->nh_name;
+#endif
+	KASSERT(proto < NETISR_MAXPROT,
+	    ("netisr_getqlimit(%d): protocol too big for %s", proto, name));
+
+	NETISR_RLOCK(&tracker);
+	KASSERT(np[proto].np_handler != NULL,
+	    ("netisr_getqlimit(%d): protocol not registered for %s", proto,
+	    name));
+	*qlimitp = np[proto].np_qlimit;
+	NETISR_RUNLOCK(&tracker);
+}
 
-SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr counters");
+/*
+ * Update the queue limit across per-workstream queues for a protocol.  We
+ * simply change the limits, and don't drain overflowed packets as they will
+ * (hopefully) take care of themselves shortly.
+ */
+int
+netisr2_setqlimit(const struct netisr_handler *nhp, u_int qlimit)
+{
+	struct netisr_work *npwp;
+#ifdef INVARIANTS
+	const char *name;
+#endif
+	u_int i, proto;
+
+	if (qlimit > netisr_maxqlimit)
+		return (EINVAL);
+
+	proto = nhp->nh_proto;
+#ifdef INVARIANTS
+	name = nhp->nh_name;
+#endif
+	KASSERT(proto < NETISR_MAXPROT,
+	    ("netisr_setqlimit(%d): protocol too big for %s", proto, name));
 
-static int	netisr_direct = 1;
-SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RW, 
-    &netisr_direct, 0, "enable direct dispatch");
-TUNABLE_INT("net.isr.direct", &netisr_direct);
-
-SYSCTL_INT(_net_isr, OID_AUTO, count, CTLFLAG_RD,
-    &isrstat.isrs_count, 0, "");
-SYSCTL_INT(_net_isr, OID_AUTO, directed, CTLFLAG_RD, 
-    &isrstat.isrs_directed, 0, "");
-SYSCTL_INT(_net_isr, OID_AUTO, deferred, CTLFLAG_RD, 
-    &isrstat.isrs_deferred, 0, "");
-SYSCTL_INT(_net_isr, OID_AUTO, queued, CTLFLAG_RD, 
-    &isrstat.isrs_queued, 0, "");
-SYSCTL_INT(_net_isr, OID_AUTO, drop, CTLFLAG_RD, 
-    &isrstat.isrs_drop, 0, "");
-SYSCTL_INT(_net_isr, OID_AUTO, swi_count, CTLFLAG_RD, 
-    &isrstat.isrs_swi_count, 0, "");
-
-/*
- * Process all packets currently present in a netisr queue.  Used to
- * drain an existing set of packets waiting for processing when we
- * begin direct dispatch, to avoid processing packets out of order.
+	NETISR_WLOCK();
+	KASSERT(np[proto].np_handler != NULL,
+	    ("netisr_setqlimit(%d): protocol not registered for %s", proto,
+	    name));
+
+	np[proto].np_qlimit = qlimit;
+	for (i = 0; i < MAXCPU; i++) {
+		npwp = &nws[i].nws_work[proto];
+		npwp->nw_qlimit = qlimit;
+	}
+	NETISR_WUNLOCK();
+	return (0);
+}
+
+/*
+ * Drain all packets currently held in a particular protocol work queue.
  */
 static void
-netisr_processqueue(struct netisr *ni)
+netisr2_drain_proto(struct netisr_work *npwp)
 {
 	struct mbuf *m;
 
-	for (;;) {
-		IF_DEQUEUE(ni->ni_queue, m);
-		if (m == NULL)
-			break;
-		VNET_ASSERT(m->m_pkthdr.rcvif != NULL);
-		CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
-		ni->ni_handler(m);
-		CURVNET_RESTORE();
+	while ((m = npwp->nw_head) != NULL) {
+		npwp->nw_head = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		if (npwp->nw_head == NULL)
+			npwp->nw_tail = NULL;
+		npwp->nw_len--;
+		m_freem(m);
 	}
+	KASSERT(npwp->nw_tail == NULL, ("netisr_drain_proto: tail"));
+	KASSERT(npwp->nw_len == 0, ("netisr_drain_proto: len"));
 }
 
 /*
- * Call the netisr directly instead of queueing the packet, if possible.
+ * Remove the registration of a network protocol, which requires clearing
+ * per-protocol fields across all workstreams, including freeing all mbufs in
+ * the queues at time of unregister.  All work in netisr2 is briefly
+ * suspended while this takes place.
  */
-#ifndef NETISR2
 void
-netisr_dispatch(int num, struct mbuf *m)
+netisr2_unregister(const struct netisr_handler *nhp)
 {
-	struct netisr *ni;
-	
-	isrstat.isrs_count++;		/* XXX redundant */
-	KASSERT(!(num < 0 || num >= (sizeof(netisrs)/sizeof(*netisrs))),
-	    ("bad isr %d", num));
-	ni = &netisrs[num];
-	if (ni->ni_queue == NULL) {
-		isrstat.isrs_drop++;
-		m_freem(m);
-		return;
+	struct netisr_work *npwp;
+#ifdef INVARIANTS
+	const char *name;
+#endif
+	u_int i, proto;
+
+	proto = nhp->nh_proto;
+#ifdef INVARIANTS
+	name = nhp->nh_name;
+#endif
+	KASSERT(proto < NETISR_MAXPROT,
+	    ("netisr_unregister(%d): protocol too big for %s", proto, name));
+
+	NETISR_WLOCK();
+	KASSERT(np[proto].np_handler != NULL,
+	    ("netisr_unregister(%d): protocol not registered for %s", proto,
+	    name));
+
+	np[proto].np_name = NULL;
+	np[proto].np_handler = NULL;
+	np[proto].np_m2flow = NULL;
+	np[proto].np_m2cpuid = NULL;
+	np[proto].np_qlimit = 0;
+	np[proto].np_policy = 0;
+	for (i = 0; i < MAXCPU; i++) {
+		npwp = &nws[i].nws_work[proto];
+		netisr2_drain_proto(npwp);
+		bzero(npwp, sizeof(*npwp));
+	}
+	NETISR_WUNLOCK();
+}
+
+/*
+ * Look up the workstream given a packet and source identifier.  Do this by
+ * checking the protocol's policy, and optionally call out to the protocol
+ * for assistance if required.
+ */
+static struct mbuf *
+netisr2_select_cpuid(struct netisr_proto *npp, uintptr_t source,
+    struct mbuf *m, u_int *cpuidp)
+{
+	struct ifnet *ifp;
+
+	NETISR_LOCK_ASSERT();
+
+	/*
+	 * In the event we have only one worker, shortcut and deliver to it
+	 * without further ado.
+	 */
+	if (nws_count == 1) {
+		*cpuidp = nws_array[0];
+		return (m);
 	}
 
 	/*
-	 * Directly dispatch handling of this packet, if permitted by global
-	 * policy.  Source ordering is maintained by virtue of callers
-	 * consistently calling one of queued or direct dispatch.
+	 * What happens next depends on the policy selected by the protocol.
+	 * If we want to support per-interface policies, we should do that
+	 * here first.
 	 */
-	if (netisr_direct) {
-		isrstat.isrs_directed++;
-		ni->ni_handler(m);
+	switch (npp->np_policy) {
+	case NETISR_POLICY_CPU:
+		return (npp->np_m2cpuid(m, source, cpuidp));
+
+	case NETISR_POLICY_FLOW:
+		if (!(m->m_flags & M_FLOWID) && npp->np_m2flow != NULL) {
+			m = npp->np_m2flow(m, source);
+			if (m == NULL)
+				return (NULL);
+		}
+		if (m->m_flags & M_FLOWID) {
+			*cpuidp =
+			    netisr2_default_flow2cpu(m->m_pkthdr.flowid);
+			return (m);
+		}
+		/* FALLTHROUGH */
+
+	case NETISR_POLICY_SOURCE:
+		ifp = m->m_pkthdr.rcvif;
+		if (ifp != NULL)
+			*cpuidp = nws_array[(ifp->if_index + source) %
+			    nws_count];
+		else
+			*cpuidp = nws_array[source % nws_count];
+		return (m);
+
+	default:
+		panic("netisr2_select_cpuid: invalid policy %u for %s",
+		    npp->np_policy, npp->np_name);
+	}
+}
+
+/*
+ * Process packets associated with a workstream and protocol.  For reasons of
+ * fairness, we process up to one complete netisr queue at a time, moving the
+ * queue to a stack-local queue for processing, but do not loop refreshing
+ * from the global queue.  The caller is responsible for deciding whether to
+ * loop, and for setting the NWS_RUNNING flag.  The passed workstream will be
+ * locked on entry and relocked before return, but will be released while
+ * processing.  The number of packets processed is returned.
+ */
+static u_int
+netisr2_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto)
+{
+	struct netisr_work local_npw, *npwp;
+	u_int handled;
+	struct mbuf *m;
+
+	NWS_LOCK_ASSERT(nwsp);
+
+	KASSERT(nwsp->nws_flags & NWS_RUNNING,
+	    ("netisr_process_workstream_proto(%d): not running", proto));
+	KASSERT(proto >= 0 && proto < NETISR_MAXPROT,
+	    ("netisr_process_workstream_proto(%d): invalid proto\n", proto));
+
+	npwp = &nwsp->nws_work[proto];
+	if (npwp->nw_len == 0)
+		return (0);
+
+	/*
+	 * Move the global work queue to a thread-local work queue.
+	 *
+	 * Notice that this means the effective maximum length of the queue
+	 * is actually twice that of the maximum queue length specified in
+	 * the protocol registration call.
+	 */
+	handled = npwp->nw_len;
+	local_npw = *npwp;
+	npwp->nw_head = NULL;
+	npwp->nw_tail = NULL;
+	npwp->nw_len = 0;
+	nwsp->nws_pendingbits &= ~(1 << proto);
+	NWS_UNLOCK(nwsp);
+	while ((m = local_npw.nw_head) != NULL) {
+		local_npw.nw_head = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		if (local_npw.nw_head == NULL)
+			local_npw.nw_tail = NULL;
+		local_npw.nw_len--;
+		np[proto].np_handler(m);
+	}
+	KASSERT(local_npw.nw_len == 0,
+	    ("netisr_process_proto(%d): len %d", proto, local_npw.nw_len));
+	NWS_LOCK(nwsp);
+	npwp->nw_handled += handled;
+	return (handled);
+}
+
+/*
+ * SWI handler for netisr2 -- processes packets in a set of workstreams that
+ * it owns, woken up by calls to NWS_SIGNAL().  If this workstream is already
+ * being direct dispatched, go back to sleep and wait for the dispatching
+ * thread to wake us up again.
+ */
+static void
+swi_net(void *arg)
+{
+#ifdef NETISR2_LOCKING
+	struct rm_priotracker tracker;
+#endif
+	struct netisr_workstream *nwsp;
+	u_int bits, prot;
+
+	nwsp = arg;
+
+#ifdef DEVICE_POLLING
+	KASSERT(nws_count == 1,
+	    ("swi_net: device_polling but nws_count != 1"));
+	netisr_poll();
+#endif
+#ifdef NETISR2_LOCKING
+	NETISR_RLOCK(&tracker);
+#endif
+	NWS_LOCK(nwsp);
+	KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running"));
+	if (nwsp->nws_flags & NWS_DISPATCHING)
+		goto out;
+	nwsp->nws_flags |= NWS_RUNNING;
+	nwsp->nws_flags &= ~NWS_SCHEDULED;
+	while ((bits = nws->nws_pendingbits) != 0) {
+		while ((prot = ffs(bits)) != 0) {
+			prot--;
+			bits &= ~(1 << prot);
+			(void)netisr2_process_workstream_proto(nwsp, prot);
+		}
+	}
+	nwsp->nws_flags &= ~NWS_RUNNING;
+out:
+	NWS_UNLOCK(nwsp);
+#ifdef NETISR2_LOCKING
+	NETISR_RUNLOCK(&tracker);
+#endif
+#ifdef DEVICE_POLLING
+	netisr_pollmore();
+#endif
+}
+
+static int
+netisr2_queue_workstream(struct netisr_workstream *nwsp, u_int proto,
+    struct netisr_work *npwp, struct mbuf *m, int *dosignalp)
+{
+
+	NWS_LOCK_ASSERT(nwsp);
+
+	*dosignalp = 0;
+	if (npwp->nw_len < npwp->nw_qlimit) {
+		m->m_nextpkt = NULL;
+		if (npwp->nw_head == NULL) {
+			npwp->nw_head = m;
+			npwp->nw_tail = m;
+		} else {
+			npwp->nw_tail->m_nextpkt = m;
+			npwp->nw_tail = m;
+		}
+		npwp->nw_len++;
+		if (npwp->nw_len > npwp->nw_watermark)
+			npwp->nw_watermark = npwp->nw_len;
+		nwsp->nws_pendingbits |= (1 << proto);
+		if (!(nwsp->nws_flags & (NWS_SCHEDULED | NWS_RUNNING))) {
+			nwsp->nws_flags |= NWS_SCHEDULED;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***