svn commit: r222249 - head/sys/net

Robert Watson rwatson at FreeBSD.org
Tue May 24 12:34:19 UTC 2011


Author: rwatson
Date: Tue May 24 12:34:19 2011
New Revision: 222249
URL: http://svn.freebsd.org/changeset/base/222249

Log:
  Rework netisr policy mechanism so that per-protocol dispatch policies can
  be represented:
  
  - A single policy namespace is defined, consisting of four possible
    policies: "default" to use the global default, "deferred" to force
    deferred dispatch, "direct" to employ direct dispatch where possible, and
    "hybrid" which makes a dynamic decision based on CPU affinity, ordering,
    etc.  Routines are implemented to convert between strings and an integer
    namespace.
  
  - A new global variable, netisr_dispatch_policy, subsumes the existing
    global variables for direct dispatch, forced direct dispatch, etc., and
    is used for explicit policy interpretation and composition.  The old
    variables remain so that they can be exported by legacy sysctls for use
    by old netstat(1) binaries.  A new sysctl and tunable, net.isr.dispatch,
    accepts the above strings for specifying a global policy default (see
    the usage example after this list).
  
  - The protocol registration structure, netisr_handler, grows an nh_dispatch
    field, which accepts a per-protocol dispatch policy override.  The
    default value is '0', which corresponds to "default", meaning that a
    protocol follows the global default policy unless otherwise specified
    (see the registration sketch after this list).
  
  - Policies are now interpreted and composed explicitly at various points in
    packet dispatch; protocol policies override global policies.
  
  - Protocols grow the ability to express a non-opinion about affinity, even
    when implementing m2cpuid, by returning NETISR_CPUID_NONE.  In that case,
    the framework falls back on source ordering rather than simply using the
    current CPU (sketched below).
  
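  The new global policy can be changed at run time or set as a boot-time
  tunable; both paths reject "default" as a global setting, so the accepted
  strings are "deferred", "hybrid", and "direct".  For example:

    # At run time:
    sysctl net.isr.dispatch=hybrid
    # At boot, in /boot/loader.conf:
    net.isr.dispatch="deferred"

  A protocol opts out of the global policy at registration time via the new
  nh_dispatch field.  The following is a hedged sketch: the protocol name,
  handler body, queue limit, and NETISR_EXAMPLE ID are hypothetical, while
  the netisr_handler fields and dispatch constants are those added by this
  change:

    #include <sys/param.h>
    #include <sys/mbuf.h>
    #include <net/netisr.h>

    #define NETISR_EXAMPLE  15      /* Hypothetical ID, < NETISR_MAXPROT. */

    /* Hypothetical protocol input handler; consumes the mbuf. */
    static void
    example_handler(struct mbuf *m)
    {

            /* ... protocol input processing ... */
            m_freem(m);
    }

    static const struct netisr_handler example_nh = {
            .nh_name = "example",
            .nh_handler = example_handler,
            .nh_proto = NETISR_EXAMPLE,
            .nh_qlimit = 256,
            .nh_policy = NETISR_POLICY_FLOW,
            /* Always defer, regardless of the global net.isr.dispatch. */
            .nh_dispatch = NETISR_DISPATCH_DEFERRED,
    };

    static void
    example_init(void)
    {

            netisr_register(&example_nh);
    }
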
  These changes are in support of allowing link-layer re-dispatch based on
  RSS or similar hashes provided by NICs, especially in the case where the
  number of hardware receive queues matches the hardware core count rather
  than the hardware thread count, requiring further software redistribution
  (e.g., on the RMI XLR).
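
  As a companion illustration, a hedged sketch of an m2cpuid callback that
  uses a NIC-provided hash when one is present and otherwise declines an
  opinion via NETISR_CPUID_NONE (the function name and modulo mapping are
  hypothetical; the signature matches netisr_m2cpuid_t, and the M_FLOWID
  test mirrors the framework's own):

    #include <sys/param.h>
    #include <sys/mbuf.h>
    #include <sys/smp.h>
    #include <net/netisr.h>

    static struct mbuf *
    example_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuidp)
    {

            if (m->m_flags & M_FLOWID) {
                    /* Illustrative only: fold the hash onto a CPU ID. */
                    *cpuidp = m->m_pkthdr.flowid % mp_ncpus;
            } else {
                    /*
                     * No opinion: under hybrid dispatch the framework
                     * substitutes curcpu for an immediate direct dispatch;
                     * otherwise it falls back on source ordering.
                     */
                    *cpuidp = NETISR_CPUID_NONE;
            }
            return (m);
    }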
  
  MFC after:      3 weeks
  Reviewed by:    bz
  Sponsored by:   Juniper Networks, Inc.

Modified:
  head/sys/net/netisr.c
  head/sys/net/netisr.h
  head/sys/net/netisr_internal.h

Modified: head/sys/net/netisr.c
==============================================================================
--- head/sys/net/netisr.c	Tue May 24 09:01:56 2011	(r222248)
+++ head/sys/net/netisr.c	Tue May 24 12:34:19 2011	(r222249)
@@ -1,6 +1,6 @@
 /*-
  * Copyright (c) 2007-2009 Robert N. M. Watson
- * Copyright (c) 2010 Juniper Networks, Inc.
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson under contract
@@ -127,32 +127,44 @@ static struct rmlock	netisr_rmlock;
 SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr");
 
 /*-
- * Three direct dispatch policies are supported:
+ * Three global direct dispatch policies are supported:
  *
- * - Always defer: all work is scheduled for a netisr, regardless of context.
- *   (!direct)
+ * NETISR_DISPATCH_DEFERRED: All work is deferred for a netisr, regardless
+ * of context (may be overridden by protocols).
  *
- * - Hybrid: if the executing context allows direct dispatch, and we're
- *   running on the CPU the work would be done on, then direct dispatch if it
- *   wouldn't violate ordering constraints on the workstream.
- *   (direct && !direct_force)
+ * NETISR_DISPATCH_HYBRID: If the executing context allows direct dispatch,
+ * and we're running on the CPU the work would be performed on, then direct
+ * dispatch it if it wouldn't violate ordering constraints on the workstream.
  *
- * - Always direct: if the executing context allows direct dispatch, always
- *   direct dispatch.  (direct && direct_force)
+ * NETISR_DISPATCH_DIRECT: If the executing context allows direct dispatch,
+ * always direct dispatch.  (The default.)
  *
  * Notice that changing the global policy could lead to short periods of
  * misordered processing, but this is considered acceptable as compared to
- * the complexity of enforcing ordering during policy changes.
- */
-static int	netisr_direct_force = 1;	/* Always direct dispatch. */
-TUNABLE_INT("net.isr.direct_force", &netisr_direct_force);
-SYSCTL_INT(_net_isr, OID_AUTO, direct_force, CTLFLAG_RW,
-    &netisr_direct_force, 0, "Force direct dispatch");
-
-static int	netisr_direct = 1;	/* Enable direct dispatch. */
-TUNABLE_INT("net.isr.direct", &netisr_direct);
-SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RW,
-    &netisr_direct, 0, "Enable direct dispatch");
+ * the complexity of enforcing ordering during policy changes.  Protocols can
+ * override the global policy (when they're not doing that, they select
+ * NETISR_DISPATCH_DEFAULT).
+ */
+#define	NETISR_DISPATCH_POLICY_DEFAULT	NETISR_DISPATCH_DIRECT
+#define	NETISR_DISPATCH_POLICY_MAXSTR	20 /* Used for temporary buffers. */
+static u_int	netisr_dispatch_policy = NETISR_DISPATCH_POLICY_DEFAULT;
+static int	sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_net_isr, OID_AUTO, dispatch, CTLTYPE_STRING | CTLFLAG_RW |
+    CTLFLAG_TUN, 0, 0, sysctl_netisr_dispatch_policy, "A",
+    "netisr dispatch policy");
+
+/*
+ * These sysctls were used in previous versions to control and export
+ * dispatch policy state.  Now, we provide read-only export via them so that
+ * older netstat binaries work.  At some point they can be garbage collected.
+ */
+static int	netisr_direct_force;
+SYSCTL_INT(_net_isr, OID_AUTO, direct_force, CTLFLAG_RD,
+    &netisr_direct_force, 0, "compat: force direct dispatch");
+
+static int	netisr_direct;
+SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RD, &netisr_direct, 0,
+    "compat: enable direct dispatch");
 
 /*
  * Allow the administrator to limit the number of threads (CPUs) to use for
@@ -276,6 +288,106 @@ netisr_default_flow2cpu(u_int flowid)
 }
 
 /*
+ * Dispatch tunable and sysctl configuration.
+ */
+struct netisr_dispatch_table_entry {
+	u_int		 ndte_policy;
+	const char	*ndte_policy_str;
+};
+static const struct netisr_dispatch_table_entry netisr_dispatch_table[] = {
+	{ NETISR_DISPATCH_DEFAULT, "default" },
+	{ NETISR_DISPATCH_DEFERRED, "deferred" },
+	{ NETISR_DISPATCH_HYBRID, "hybrid" },
+	{ NETISR_DISPATCH_DIRECT, "direct" },
+};
+static const u_int netisr_dispatch_table_len =
+    (sizeof(netisr_dispatch_table) / sizeof(netisr_dispatch_table[0]));
+
+static void
+netisr_dispatch_policy_to_str(u_int dispatch_policy, char *buffer,
+    u_int buflen)
+{
+	const struct netisr_dispatch_table_entry *ndtep;
+	const char *str;
+	u_int i;
+
+	str = "unknown";
+	for (i = 0; i < netisr_dispatch_table_len; i++) {
+		ndtep = &netisr_dispatch_table[i];
+		if (ndtep->ndte_policy == dispatch_policy) {
+			str = ndtep->ndte_policy_str;
+			break;
+		}
+	}
+	snprintf(buffer, buflen, "%s", str);
+}
+
+static int
+netisr_dispatch_policy_from_str(const char *str, u_int *dispatch_policyp)
+{
+	const struct netisr_dispatch_table_entry *ndtep;
+	u_int i;
+
+	for (i = 0; i < netisr_dispatch_table_len; i++) {
+		ndtep = &netisr_dispatch_table[i];
+		if (strcmp(ndtep->ndte_policy_str, str) == 0) {
+			*dispatch_policyp = ndtep->ndte_policy;
+			return (0);
+		}
+	}
+	return (EINVAL);
+}
+
+static void
+netisr_dispatch_policy_compat(void)
+{
+
+	switch (netisr_dispatch_policy) {
+	case NETISR_DISPATCH_DEFERRED:
+		netisr_direct_force = 0;
+		netisr_direct = 0;
+		break;
+
+	case NETISR_DISPATCH_HYBRID:
+		netisr_direct_force = 0;
+		netisr_direct = 1;
+		break;
+
+	case NETISR_DISPATCH_DIRECT:
+		netisr_direct_force = 1;
+		netisr_direct = 1;
+		break;
+
+	default:
+		panic("%s: unknown policy %u", __func__,
+		    netisr_dispatch_policy);
+	}
+}
+
+static int
+sysctl_netisr_dispatch_policy(SYSCTL_HANDLER_ARGS)
+{
+	char tmp[NETISR_DISPATCH_POLICY_MAXSTR];
+	u_int dispatch_policy;
+	int error;
+
+	netisr_dispatch_policy_to_str(netisr_dispatch_policy, tmp,
+	    sizeof(tmp));
+	error = sysctl_handle_string(oidp, tmp, sizeof(tmp), req);
+	if (error == 0 && req->newptr != NULL) {
+		error = netisr_dispatch_policy_from_str(tmp,
+		    &dispatch_policy);
+		if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT)
+			error = EINVAL;
+		if (error == 0) {
+			netisr_dispatch_policy = dispatch_policy;
+			netisr_dispatch_policy_compat();
+		}
+	}
+	return (error);
+}
+
+/*
  * Register a new netisr handler, which requires initializing per-protocol
  * fields for each workstream.  All netisr work is briefly suspended while
  * the protocol is installed.
@@ -312,6 +424,12 @@ netisr_register(const struct netisr_hand
 	KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL,
 	    ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__,
 	    name));
+	KASSERT(nhp->nh_dispatch == NETISR_DISPATCH_DEFAULT ||
+	    nhp->nh_dispatch == NETISR_DISPATCH_DEFERRED ||
+	    nhp->nh_dispatch == NETISR_DISPATCH_HYBRID ||
+	    nhp->nh_dispatch == NETISR_DISPATCH_DIRECT,
+	    ("%s: invalid nh_dispatch (%u)", __func__, nhp->nh_dispatch));
+
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s(%u, %s): protocol too big", __func__, proto, name));
 
@@ -339,6 +457,7 @@ netisr_register(const struct netisr_hand
 	} else
 		netisr_proto[proto].np_qlimit = nhp->nh_qlimit;
 	netisr_proto[proto].np_policy = nhp->nh_policy;
+	netisr_proto[proto].np_dispatch = nhp->nh_dispatch;
 	CPU_FOREACH(i) {
 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
 		bzero(npwp, sizeof(*npwp));
@@ -541,15 +660,32 @@ netisr_unregister(const struct netisr_ha
 }
 
 /*
+ * Compose the global and per-protocol policies on dispatch, and return the
+ * dispatch policy to use.
+ */
+static u_int
+netisr_get_dispatch(struct netisr_proto *npp)
+{
+
+	/*
+	 * Protocol-specific configuration overrides the global default.
+	 */
+	if (npp->np_dispatch != NETISR_DISPATCH_DEFAULT)
+		return (npp->np_dispatch);
+	return (netisr_dispatch_policy);
+}
+
+/*
  * Look up the workstream given a packet and source identifier.  Do this by
  * checking the protocol's policy, and optionally call out to the protocol
  * for assistance if required.
  */
 static struct mbuf *
-netisr_select_cpuid(struct netisr_proto *npp, uintptr_t source,
-    struct mbuf *m, u_int *cpuidp)
+netisr_select_cpuid(struct netisr_proto *npp, u_int dispatch_policy,
+    uintptr_t source, struct mbuf *m, u_int *cpuidp)
 {
 	struct ifnet *ifp;
+	u_int policy;
 
 	NETISR_LOCK_ASSERT();
 
@@ -567,11 +703,30 @@ netisr_select_cpuid(struct netisr_proto 
 	 * If we want to support per-interface policies, we should do that
 	 * here first.
 	 */
-	switch (npp->np_policy) {
-	case NETISR_POLICY_CPU:
-		return (npp->np_m2cpuid(m, source, cpuidp));
+	policy = npp->np_policy;
+	if (policy == NETISR_POLICY_CPU) {
+		m = npp->np_m2cpuid(m, source, cpuidp);
+		if (m == NULL)
+			return (NULL);
 
-	case NETISR_POLICY_FLOW:
+		/*
+		 * It's possible for a protocol not to have a good idea about
+		 * where to process a packet, in which case we fall back on
+		 * the netisr code to decide.  In the hybrid case, return the
+		 * current CPU ID, which will force an immediate direct
+		 * dispatch.  In the queued case, fall back on the SOURCE
+		 * policy.
+		 */
+		if (*cpuidp != NETISR_CPUID_NONE)
+			return (m);
+		if (dispatch_policy == NETISR_DISPATCH_HYBRID) {
+			*cpuidp = curcpu;
+			return (m);
+		}
+		policy = NETISR_POLICY_SOURCE;
+	}
+
+	if (policy == NETISR_POLICY_FLOW) {
 		if (!(m->m_flags & M_FLOWID) && npp->np_m2flow != NULL) {
 			m = npp->np_m2flow(m, source);
 			if (m == NULL)
@@ -582,21 +737,19 @@ netisr_select_cpuid(struct netisr_proto 
 			    netisr_default_flow2cpu(m->m_pkthdr.flowid);
 			return (m);
 		}
-		/* FALLTHROUGH */
-
-	case NETISR_POLICY_SOURCE:
-		ifp = m->m_pkthdr.rcvif;
-		if (ifp != NULL)
-			*cpuidp = nws_array[(ifp->if_index + source) %
-			    nws_count];
-		else
-			*cpuidp = nws_array[source % nws_count];
-		return (m);
-
-	default:
-		panic("%s: invalid policy %u for %s", __func__,
-		    npp->np_policy, npp->np_name);
+		policy = NETISR_POLICY_SOURCE;
 	}
+
+	KASSERT(policy == NETISR_POLICY_SOURCE,
+	    ("%s: invalid policy %u for %s", __func__, npp->np_policy,
+	    npp->np_name));
+
+	ifp = m->m_pkthdr.rcvif;
+	if (ifp != NULL)
+		*cpuidp = nws_array[(ifp->if_index + source) % nws_count];
+	else
+		*cpuidp = nws_array[source % nws_count];
+	return (m);
 }
 
 /*
@@ -795,7 +948,8 @@ netisr_queue_src(u_int proto, uintptr_t 
 	KASSERT(netisr_proto[proto].np_handler != NULL,
 	    ("%s: invalid proto %u", __func__, proto));
 
-	m = netisr_select_cpuid(&netisr_proto[proto], source, m, &cpuid);
+	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_DEFERRED,
+	    source, m, &cpuid);
 	if (m != NULL) {
 		KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__,
 		    cpuid));
@@ -826,23 +980,23 @@ netisr_dispatch_src(u_int proto, uintptr
 	struct rm_priotracker tracker;
 #endif
 	struct netisr_workstream *nwsp;
+	struct netisr_proto *npp;
 	struct netisr_work *npwp;
 	int dosignal, error;
-	u_int cpuid;
-
-	/*
-	 * If direct dispatch is entirely disabled, fall back on queueing.
-	 */
-	if (!netisr_direct)
-		return (netisr_queue_src(proto, source, m));
+	u_int cpuid, dispatch_policy;
 
 	KASSERT(proto < NETISR_MAXPROT,
 	    ("%s: invalid proto %u", __func__, proto));
 #ifdef NETISR_LOCKING
 	NETISR_RLOCK(&tracker);
 #endif
-	KASSERT(netisr_proto[proto].np_handler != NULL,
-	    ("%s: invalid proto %u", __func__, proto));
+	npp = &netisr_proto[proto];
+	KASSERT(npp->np_handler != NULL, ("%s: invalid proto %u", __func__,
+	    proto));
+
+	dispatch_policy = netisr_get_dispatch(npp);
+	if (dispatch_policy == NETISR_DISPATCH_DEFERRED)
+		return (netisr_queue_src(proto, source, m));
 
 	/*
 	 * If direct dispatch is forced, then unconditionally dispatch
@@ -851,7 +1005,7 @@ netisr_dispatch_src(u_int proto, uintptr
 	 * nws_flags because all netisr processing will be source ordered due
 	 * to always being forced to directly dispatch.
 	 */
-	if (netisr_direct_force) {
+	if (dispatch_policy == NETISR_DISPATCH_DIRECT) {
 		nwsp = DPCPU_PTR(nws);
 		npwp = &nwsp->nws_work[proto];
 		npwp->nw_dispatched++;
@@ -861,18 +1015,22 @@ netisr_dispatch_src(u_int proto, uintptr
 		goto out_unlock;
 	}
 
+	KASSERT(dispatch_policy == NETISR_DISPATCH_HYBRID,
+	    ("%s: unknown dispatch policy (%u)", __func__, dispatch_policy));
+
 	/*
 	 * Otherwise, we execute in a hybrid mode where we will try to direct
 	 * dispatch if we're on the right CPU and the netisr worker isn't
 	 * already running.
 	 */
-	m = netisr_select_cpuid(&netisr_proto[proto], source, m, &cpuid);
+	sched_pin();
+	m = netisr_select_cpuid(&netisr_proto[proto], NETISR_DISPATCH_HYBRID,
+	    source, m, &cpuid);
 	if (m == NULL) {
 		error = ENOBUFS;
-		goto out_unlock;
+		goto out_unpin;
 	}
 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
-	sched_pin();
 	if (cpuid != curcpu)
 		goto queue_fallback;
 	nwsp = DPCPU_PTR(nws);
@@ -1003,6 +1161,9 @@ netisr_start_swi(u_int cpuid, struct pcp
 static void
 netisr_init(void *arg)
 {
+	char tmp[NETISR_DISPATCH_POLICY_MAXSTR];
+	u_int dispatch_policy;
+	int error;
 
 	KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__));
 
@@ -1033,6 +1194,20 @@ netisr_init(void *arg)
 	}
 #endif
 
+	if (TUNABLE_STR_FETCH("net.isr.dispatch", tmp, sizeof(tmp))) {
+		error = netisr_dispatch_policy_from_str(tmp,
+		    &dispatch_policy);
+		if (error == 0 && dispatch_policy == NETISR_DISPATCH_DEFAULT)
+			error = EINVAL;
+		if (error == 0) {
+			netisr_dispatch_policy = dispatch_policy;
+			netisr_dispatch_policy_compat();
+		} else
+			printf(
+			    "%s: invalid dispatch policy %s, using default\n",
+			    __func__, tmp);
+	}
+
 	netisr_start_swi(curcpu, pcpu_find(curcpu));
 }
 SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL);
@@ -1088,6 +1263,7 @@ sysctl_netisr_proto(SYSCTL_HANDLER_ARGS)
 		snpp->snp_proto = proto;
 		snpp->snp_qlimit = npp->np_qlimit;
 		snpp->snp_policy = npp->np_policy;
+		snpp->snp_dispatch = npp->np_dispatch;
 		if (npp->np_m2flow != NULL)
 			snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW;
 		if (npp->np_m2cpuid != NULL)

Modified: head/sys/net/netisr.h
==============================================================================
--- head/sys/net/netisr.h	Tue May 24 09:01:56 2011	(r222248)
+++ head/sys/net/netisr.h	Tue May 24 12:34:19 2011	(r222249)
@@ -1,6 +1,6 @@
 /*-
  * Copyright (c) 2007-2009 Robert N. M. Watson
- * Copyright (c) 2010 Juniper Networks, Inc.
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson under contract
@@ -71,6 +71,15 @@
 #define	NETISR_POLICY_CPU	3	/* Protocol determines CPU placement. */
 
 /*
+ * Protocol dispatch policy constants; selects whether and when direct
+ * dispatch is permitted.
+ */
+#define	NETISR_DISPATCH_DEFAULT		0	/* Use global default. */
+#define	NETISR_DISPATCH_DEFERRED	1	/* Always defer dispatch. */
+#define	NETISR_DISPATCH_HYBRID		2	/* Allow hybrid dispatch. */
+#define	NETISR_DISPATCH_DIRECT		3	/* Always direct dispatch. */
+
+/*
  * Monitoring data structures, exported by sysctl(2).
  *
  * Three sysctls are defined.  First, a per-protocol structure exported by
@@ -84,7 +93,8 @@ struct sysctl_netisr_proto {
 	u_int	snp_qlimit;			/* nh_qlimit */
 	u_int	snp_policy;			/* nh_policy */
 	u_int	snp_flags;			/* Various flags. */
-	u_int	_snp_ispare[7];
+	u_int	snp_dispatch;			/* Dispatch policy. */
+	u_int	_snp_ispare[6];
 };
 
 /*
@@ -173,6 +183,8 @@ typedef struct mbuf	*netisr_m2cpuid_t(st
 typedef	struct mbuf	*netisr_m2flow_t(struct mbuf *m, uintptr_t source);
 typedef void		 netisr_drainedcpu_t(u_int cpuid);
 
+#define	NETISR_CPUID_NONE	((u_int)-1)	/* No affinity returned. */
+
 /*
  * Data structure describing a protocol handler.
  */
@@ -185,7 +197,8 @@ struct netisr_handler {
 	u_int		 nh_proto;	/* Integer protocol ID. */
 	u_int		 nh_qlimit;	/* Maximum per-CPU queue depth. */
 	u_int		 nh_policy;	/* Work placement policy. */
-	u_int		 nh_ispare[5];	/* For future use. */
+	u_int		 nh_dispatch;	/* Dispatch policy. */
+	u_int		 nh_ispare[4];	/* For future use. */
 	void		*nh_pspare[4];	/* For future use. */
 };
 

Modified: head/sys/net/netisr_internal.h
==============================================================================
--- head/sys/net/netisr_internal.h	Tue May 24 09:01:56 2011	(r222248)
+++ head/sys/net/netisr_internal.h	Tue May 24 12:34:19 2011	(r222249)
@@ -1,6 +1,6 @@
 /*-
  * Copyright (c) 2007-2009 Robert N. M. Watson
- * Copyright (c) 2010 Juniper Networks, Inc.
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
  * All rights reserved.
  *
  * This software was developed by Robert N. M. Watson under contract
@@ -64,6 +64,7 @@ struct netisr_proto {
 	netisr_drainedcpu_t *np_drainedcpu; /* Callback when drained a queue. */
 	u_int		 np_qlimit;	/* Maximum per-CPU queue depth. */
 	u_int		 np_policy;	/* Work placement policy. */
+	u_int		 np_dispatch;	/* Work dispatch policy. */
 };
 
 #define	NETISR_MAXPROT	16		/* Compile-time limit. */

