svn commit: r251139 - in head: sys/dev/netmap sys/net tools/tools/netmap

Luigi Rizzo luigi at FreeBSD.org
Thu May 30 14:07:14 UTC 2013


Author: luigi
Date: Thu May 30 14:07:14 2013
New Revision: 251139
URL: http://svnweb.freebsd.org/changeset/base/251139

Log:
  Bring in a number of new features, mostly implemented by Michio Honda:
  
  - the VALE switch now supports up to 254 destinations per switch,
    unicast or broadcast (multicast goes to all ports).
  
  - we can attach hw interfaces and the host stack to a VALE switch,
    which means we will be able to use it more or less as a native bridge
    (minor tweaks still necessary).
    A 'vale-ctl' program is supplied in tools/tools/netmap
    to attach/detach ports to/from the switch, and list the current configuration.
  
  - the lookup function in the VALE switch can be reassigned to
    something else, similar to the pf hooks. This will enable
    attaching the firewall, or other processing functions (e.g. in-kernel
    openvswitch) directly on the netmap port.
  
  The internal API used by device drivers does not change.
  
  Userspace applications should be recompiled because we
  bump NETMAP_API as we now use some fields in the struct nmreq
  that were previously ignored -- otherwise, data structures
  are the same.
  
  Manpages will be committed separately.

Added:
  head/tools/tools/netmap/vale-ctl.c   (contents, props changed)
Modified:
  head/sys/dev/netmap/netmap.c
  head/sys/dev/netmap/netmap_kern.h
  head/sys/net/netmap.h
  head/tools/tools/netmap/Makefile

Modified: head/sys/dev/netmap/netmap.c
==============================================================================
--- head/sys/dev/netmap/netmap.c	Thu May 30 13:41:19 2013	(r251138)
+++ head/sys/dev/netmap/netmap.c	Thu May 30 14:07:14 2013	(r251139)
@@ -119,6 +119,9 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, mitiga
 int netmap_no_pendintr = 1;
 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
+int netmap_txsync_retry = 2;
+SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
+    &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
 
 int netmap_drop = 0;	/* debugging */
 int netmap_flags = 0;	/* debug flags */
@@ -128,25 +131,30 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, drop, 
 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
 
-#ifdef NM_BRIDGE /* support for netmap bridge */
+#ifdef NM_BRIDGE /* support for netmap virtual switch, called VALE */
 
 /*
- * system parameters.
+ * system parameters (most of them in netmap_kern.h)
+ * NM_NAME	prefix for switch port names, default "vale"
+ * NM_MAXPORTS	number of ports
+ * NM_BRIDGES	max number of switches in the system.
+ *	XXX should become a sysctl or tunable
  *
- * All switched ports have prefix NM_NAME.
- * The switch has a max of NM_BDG_MAXPORTS ports (often stored in a bitmap,
- * so a practical upper bound is 64).
- * Each tx ring is read-write, whereas rx rings are readonly (XXX not done yet).
+ * Switch ports are named valeX:Y where X is the switch name and Y
+ * is the port. If Y matches a physical interface name, the port is
+ * connected to a physical device.
+ *
+ * Unlike physical interfaces, switch ports use their own memory region
+ * for rings and buffers.
  * The virtual interfaces use per-queue lock instead of core lock.
  * In the tx loop, we aggregate traffic in batches to make all operations
  * faster. The batch size is NM_BDG_BATCH
  */
-#define	NM_NAME			"vale"	/* prefix for the interface */
-#define NM_BDG_MAXPORTS		16	/* up to 64 ? */
+#define NM_BDG_MAXRINGS		16	/* XXX unclear how many. */
 #define NM_BRIDGE_RINGSIZE	1024	/* in the device */
 #define NM_BDG_HASH		1024	/* forwarding table entries */
 #define NM_BDG_BATCH		1024	/* entries in the forwarding buffer */
-#define	NM_BRIDGES		4	/* number of bridges */
+#define	NM_BRIDGES		8	/* number of bridges */
 
 
 int netmap_bridge = NM_BDG_BATCH; /* bridge batch size */
@@ -174,14 +182,27 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, bridge
 #define	ADD_BDG_REF(ifp)	refcount_acquire(&NA(ifp)->na_bdg_refcount)
 #define	DROP_BDG_REF(ifp)	refcount_release(&NA(ifp)->na_bdg_refcount)
 
-static void bdg_netmap_attach(struct ifnet *ifp);
+static void bdg_netmap_attach(struct netmap_adapter *);
 static int bdg_netmap_reg(struct ifnet *ifp, int onoff);
+static int kern_netmap_regif(struct nmreq *nmr);
+
 /* per-tx-queue entry */
 struct nm_bdg_fwd {	/* forwarding entry for a bridge */
 	void *buf;
-	uint64_t dst;	/* dst mask */
-	uint32_t src;	/* src index ? */
-	uint16_t len;	/* src len */
+	uint32_t ft_dst;	/* dst port */
+	uint16_t ft_len;	/* src len */
+	uint16_t ft_next;	/* next packet to same destination */
+};
+
+/* We need to build a list of buffers going to each destination.
+ * Each buffer is in one entry of struct nm_bdg_fwd, we use ft_next
+ * to build the list, and struct nm_bdg_q below for the queue.
+ * The structure should be compact because potentially we have a lot
+ * of destinations.
+ */
+struct nm_bdg_q {
+	uint16_t bq_head;
+	uint16_t bq_tail;
 };
 
 struct nm_hash_ent {
@@ -198,26 +219,78 @@ struct nm_hash_ent {
  * The bridge is non blocking on the transmit ports.
  *
  * bdg_lock protects accesses to the bdg_ports array.
+ * This is a rw lock (or equivalent).
  */
 struct nm_bridge {
-	struct ifnet *bdg_ports[NM_BDG_MAXPORTS];
-	int n_ports;
-	uint64_t act_ports;
-	int freelist;	/* first buffer index */
-	NM_SELINFO_T si;	/* poll/select wait queue */
-	NM_LOCK_T bdg_lock;	/* protect the selinfo ? */
+	int namelen;	/* 0 means free */
 
-	/* the forwarding table, MAC+ports */
-	struct nm_hash_ent ht[NM_BDG_HASH];
+	/* XXX what is the proper alignment/layout ? */
+	NM_RWLOCK_T bdg_lock;	/* protects bdg_ports */
+	struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS];
 
-	int namelen;	/* 0 means free */
 	char basename[IFNAMSIZ];
+	/*
+	 * The function to decide the destination port.
+	 * It returns either of an index of the destination port,
+	 * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
+	 * forward this packet.  ring_nr is the source ring index, and the
+	 * function may overwrite this value to forward this packet to a
+	 * different ring index.
+	 * This function must be set by netmap_bdgctl().
+	 */
+	bdg_lookup_fn_t nm_bdg_lookup;
+
+	/* the forwarding table, MAC+ports */
+	struct nm_hash_ent ht[NM_BDG_HASH];
 };
 
 struct nm_bridge nm_bridges[NM_BRIDGES];
+NM_LOCK_T	netmap_bridge_mutex;
 
-#define BDG_LOCK(b)	mtx_lock(&(b)->bdg_lock)
-#define BDG_UNLOCK(b)	mtx_unlock(&(b)->bdg_lock)
+/* other OS will have these macros defined in their own glue code. */
+
+#ifdef __FreeBSD__
+#define BDG_LOCK()		mtx_lock(&netmap_bridge_mutex)
+#define BDG_UNLOCK()		mtx_unlock(&netmap_bridge_mutex)
+#define BDG_WLOCK(b)		rw_wlock(&(b)->bdg_lock)
+#define BDG_WUNLOCK(b)		rw_wunlock(&(b)->bdg_lock)
+#define BDG_RLOCK(b)		rw_rlock(&(b)->bdg_lock)
+#define BDG_RUNLOCK(b)		rw_runlock(&(b)->bdg_lock)
+
+/* set/get variables. OS-specific macros may wrap these
+ * assignments into read/write lock or similar
+ */
+#define BDG_SET_VAR(lval, p)	(lval = p)
+#define BDG_GET_VAR(lval)	(lval)
+#define BDG_FREE(p)		free(p, M_DEVBUF)
+#endif /* __FreeBSD__ */
+
+static __inline int
+nma_is_vp(struct netmap_adapter *na)
+{
+	return na->nm_register == bdg_netmap_reg;
+}
+static __inline int
+nma_is_host(struct netmap_adapter *na)
+{
+	return na->nm_register == NULL;
+}
+static __inline int
+nma_is_hw(struct netmap_adapter *na)
+{
+	/* In case of sw adapter, nm_register is NULL */
+	return !nma_is_vp(na) && !nma_is_host(na);
+}
+
+/*
+ * Regarding holding a NIC, if the NIC is owned by the kernel
+ * (i.e., bridge), neither another bridge nor user can use it;
+ * if the NIC is owned by a user, only users can share it.
+ * Evaluation must be done under NMA_LOCK().
+ */
+#define NETMAP_OWNED_BY_KERN(ifp)	(!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg)
+#define NETMAP_OWNED_BY_ANY(ifp) \
+	(NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0))
 
 /*
  * NA(ifp)->bdg_port	port index
@@ -245,15 +318,16 @@ pkt_copy(void *_src, void *_dst, int l)
         }
 }
 
+
 /*
  * locate a bridge among the existing ones.
  * a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
  * We assume that this is called with a name of at least NM_NAME chars.
  */
 static struct nm_bridge *
-nm_find_bridge(const char *name)
+nm_find_bridge(const char *name, int create)
 {
-	int i, l, namelen, e;
+	int i, l, namelen;
 	struct nm_bridge *b = NULL;
 
 	namelen = strlen(NM_NAME);	/* base length */
@@ -268,29 +342,94 @@ nm_find_bridge(const char *name)
 		namelen = IFNAMSIZ;
 	ND("--- prefix is '%.*s' ---", namelen, name);
 
-	/* use the first entry for locking */
-	BDG_LOCK(nm_bridges); // XXX do better
-	for (e = -1, i = 1; i < NM_BRIDGES; i++) {
-		b = nm_bridges + i;
-		if (b->namelen == 0)
-			e = i;	/* record empty slot */
-		else if (strncmp(name, b->basename, namelen) == 0) {
+	BDG_LOCK();
+	/* lookup the name, remember empty slot if there is one */
+	for (i = 0; i < NM_BRIDGES; i++) {
+		struct nm_bridge *x = nm_bridges + i;
+
+		if (x->namelen == 0) {
+			if (create && b == NULL)
+				b = x;	/* record empty slot */
+		} else if (x->namelen != namelen) {
+			continue;
+		} else if (strncmp(name, x->basename, namelen) == 0) {
 			ND("found '%.*s' at %d", namelen, name, i);
+			b = x;
 			break;
 		}
 	}
-	if (i == NM_BRIDGES) { /* all full */
-		if (e == -1) { /* no empty slot */
-			b = NULL;
-		} else {
-			b = nm_bridges + e;
-			strncpy(b->basename, name, namelen);
-			b->namelen = namelen;
-		}
+	if (i == NM_BRIDGES && b) { /* name not found, can create entry */
+		strncpy(b->basename, name, namelen);
+		b->namelen = namelen;
+		/* set the default function */
+		b->nm_bdg_lookup = netmap_bdg_learning;
+		/* reset the MAC address table */
+		bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
 	}
-	BDG_UNLOCK(nm_bridges);
+	BDG_UNLOCK();
 	return b;
 }
+
+
+/*
+ * Free the forwarding tables for rings attached to switch ports.
+ */
+static void
+nm_free_bdgfwd(struct netmap_adapter *na)
+{
+	int nrings, i;
+	struct netmap_kring *kring;
+
+	nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
+	kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
+	for (i = 0; i < nrings; i++) {
+		if (kring[i].nkr_ft) {
+			free(kring[i].nkr_ft, M_DEVBUF);
+			kring[i].nkr_ft = NULL; /* protect from freeing twice */
+		}
+	}
+	if (nma_is_hw(na))
+		nm_free_bdgfwd(SWNA(na->ifp));
+}
+
+
+/*
+ * Allocate the forwarding tables for the rings attached to the bridge ports.
+ */
+static int
+nm_alloc_bdgfwd(struct netmap_adapter *na)
+{
+	int nrings, l, i, num_dstq;
+	struct netmap_kring *kring;
+
+	/* all port:rings + broadcast */
+	num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
+	l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH;
+	l += sizeof(struct nm_bdg_q) * num_dstq;
+	l += sizeof(uint16_t) * NM_BDG_BATCH;
+
+	nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
+	kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
+	for (i = 0; i < nrings; i++) {
+		struct nm_bdg_fwd *ft;
+		struct nm_bdg_q *dstq;
+		int j;
+
+		ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
+		if (!ft) {
+			nm_free_bdgfwd(na);
+			return ENOMEM;
+		}
+		dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH);
+		for (j = 0; j < num_dstq; j++)
+			dstq[j].bq_head = dstq[j].bq_tail = NM_BDG_BATCH;
+		kring[i].nkr_ft = ft;
+	}
+	if (nma_is_hw(na))
+		nm_alloc_bdgfwd(SWNA(na->ifp));
+	return 0;
+}
+
 #endif /* NM_BRIDGE */
 
 
@@ -413,20 +552,11 @@ netmap_dtor_locked(void *data)
 		if (netmap_verbose)
 			D("deleting last instance for %s", ifp->if_xname);
 		/*
-		 * there is a race here with *_netmap_task() and
-		 * netmap_poll(), which don't run under NETMAP_REG_LOCK.
-		 * na->refcount == 0 && na->ifp->if_capenable & IFCAP_NETMAP
-		 * (aka NETMAP_DELETING(na)) are a unique marker that the
-		 * device is dying.
-		 * Before destroying stuff we sleep a bit, and then complete
-		 * the job. NIOCREG should realize the condition and
-		 * loop until they can continue; the other routines
-		 * should check the condition at entry and quit if
-		 * they cannot run.
+		 * (TO CHECK) This function is only called
+		 * when the last reference to this file descriptor goes
+		 * away. This means we cannot have any pending poll()
+		 * or interrupt routine operating on the structure.
 		 */
-		na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
-		tsleep(na, 0, "NIOCUNREG", 4);
-		na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
 		na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */
 		/* Wake up any sleeping threads. netmap_poll will
 		 * then return POLLERR
@@ -437,6 +567,9 @@ netmap_dtor_locked(void *data)
 			selwakeuppri(&na->rx_rings[i].si, PI_NET);
 		selwakeuppri(&na->tx_si, PI_NET);
 		selwakeuppri(&na->rx_si, PI_NET);
+#ifdef NM_BRIDGE
+		nm_free_bdgfwd(na);
+#endif /* NM_BRIDGE */
 		/* release all buffers */
 		for (i = 0; i < na->num_tx_rings + 1; i++) {
 			struct netmap_ring *ring = na->tx_rings[i].ring;
@@ -458,49 +591,81 @@ netmap_dtor_locked(void *data)
 		/* knlist_destroy(&na->tx_si.si_note); */
 		/* knlist_destroy(&na->rx_si.si_note); */
 		netmap_free_rings(na);
-		wakeup(na);
+		if (nma_is_hw(na))
+			SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL;
 	}
 	netmap_if_free(nifp);
 }
 
+
+/* we assume netmap adapter exists */
 static void
 nm_if_rele(struct ifnet *ifp)
 {
 #ifndef NM_BRIDGE
 	if_rele(ifp);
 #else /* NM_BRIDGE */
-	int i, full;
+	int i, full = 0, is_hw;
 	struct nm_bridge *b;
+	struct netmap_adapter *na;
 
-	if (strncmp(ifp->if_xname, NM_NAME, sizeof(NM_NAME) - 1)) {
+	/* I can be called not only for get_ifp()-ed references where netmap's
+	 * capability is guaranteed, but also for non-netmap-capable NICs.
+	 */
+	if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) {
 		if_rele(ifp);
 		return;
 	}
 	if (!DROP_BDG_REF(ifp))
 		return;
-	b = ifp->if_bridge;
-	BDG_LOCK(nm_bridges);
-	BDG_LOCK(b);
+
+	na = NA(ifp);
+	b = na->na_bdg;
+	is_hw = nma_is_hw(na);
+
+	BDG_WLOCK(b);
 	ND("want to disconnect %s from the bridge", ifp->if_xname);
 	full = 0;
+	/* remove the entry from the bridge, also check
+	 * if there are any leftover interfaces
+	 * XXX we should optimize this code, e.g. going directly
+	 * to na->bdg_port, and having a counter of ports that
+	 * are connected. But it is not in a critical path.
+	 * In NIC's case, index of sw na is always higher than hw na
+	 */
 	for (i = 0; i < NM_BDG_MAXPORTS; i++) {
-		if (b->bdg_ports[i] == ifp) {
-			b->bdg_ports[i] = NULL;
-			bzero(ifp, sizeof(*ifp));
-			free(ifp, M_DEVBUF);
-			break;
-		}
-		else if (b->bdg_ports[i] != NULL)
+		struct netmap_adapter *tmp = BDG_GET_VAR(b->bdg_ports[i]);
+
+		if (tmp == na) {
+			/* disconnect from bridge */
+			BDG_SET_VAR(b->bdg_ports[i], NULL);
+			na->na_bdg = NULL;
+			if (is_hw && SWNA(ifp)->na_bdg) {
+				/* disconnect sw adapter too */
+				int j = SWNA(ifp)->bdg_port;
+				BDG_SET_VAR(b->bdg_ports[j], NULL);
+				SWNA(ifp)->na_bdg = NULL;
+			}
+		} else if (tmp != NULL) {
 			full = 1;
+		}
 	}
-	BDG_UNLOCK(b);
+	BDG_WUNLOCK(b);
 	if (full == 0) {
-		ND("freeing bridge %d", b - nm_bridges);
+		ND("marking bridge %d as free", b - nm_bridges);
 		b->namelen = 0;
+		b->nm_bdg_lookup = NULL;
 	}
-	BDG_UNLOCK(nm_bridges);
-	if (i == NM_BDG_MAXPORTS)
+	if (na->na_bdg) { /* still attached to the bridge */
 		D("ouch, cannot find ifp to remove");
+	} else if (is_hw) {
+		if_rele(ifp);
+	} else {
+		bzero(na, sizeof(*na));
+		free(na, M_DEVBUF);
+		bzero(ifp, sizeof(*ifp));
+		free(ifp, M_DEVBUF);
+	}
 #endif /* NM_BRIDGE */
 }
 
@@ -514,9 +679,13 @@ netmap_dtor(void *data)
 	if (ifp) {
 		struct netmap_adapter *na = NA(ifp);
 
+		if (na->na_bdg)
+			BDG_WLOCK(na->na_bdg);
 		na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
 		netmap_dtor_locked(data);
 		na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
+		if (na->na_bdg)
+			BDG_WUNLOCK(na->na_bdg);
 
 		nm_if_rele(ifp); /* might also destroy *na */
 	}
@@ -528,6 +697,7 @@ netmap_dtor(void *data)
 	free(priv, M_DEVBUF);
 }
 
+
 #ifdef __FreeBSD__
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -536,8 +706,16 @@ netmap_dtor(void *data)
 #include <vm/vm_pager.h>
 #include <vm/uma.h>
 
+/*
+ * In order to track whether pages are still mapped, we hook into
+ * the standard cdev_pager and intercept the constructor and
+ * destructor.
+ * XXX but then ? Do we really use the information ?
+ * Need to investigate.
+ */
 static struct cdev_pager_ops saved_cdev_pager_ops;
 
+
 static int
 netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
     vm_ooffset_t foff, struct ucred *cred, u_short *color)
@@ -548,6 +726,7 @@ netmap_dev_pager_ctor(void *handle, vm_o
 			size, prot, foff, cred, color);
 }
 
+
 static void
 netmap_dev_pager_dtor(void *handle)
 {
@@ -562,6 +741,8 @@ static struct cdev_pager_ops netmap_cdev
         .cdev_pg_fault = NULL,
 };
 
+
+// XXX check whether we need netmap_mmap_single _and_ netmap_mmap
 static int
 netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
 	vm_size_t objsize,  vm_object_t *objp, int prot)
@@ -630,6 +811,7 @@ netmap_mmap(__unused struct cdev *dev,
 	return (*paddr ? 0 : ENOMEM);
 }
 
+
 static int
 netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
 {
@@ -639,6 +821,7 @@ netmap_close(struct cdev *dev, int fflag
 	return 0;
 }
 
+
 static int
 netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
 {
@@ -677,6 +860,7 @@ netmap_open(struct cdev *dev, int oflags
  * might take a while before releasing the buffer.
  */
 
+
 /*
  * pass a chain of buffers to the host stack as coming from 'dst'
  */
@@ -701,6 +885,7 @@ struct mbq {
 	int count;
 };
 
+
 /*
  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
  * Run from hwcur to cur - reserved
@@ -745,6 +930,7 @@ netmap_grab_packets(struct netmap_kring 
 	q->tail = tail;
 }
 
+
 /*
  * called under main lock to send packets from the host to the NIC
  * The host ring has packets from nr_hwcur to (cur - reserved)
@@ -794,6 +980,7 @@ netmap_sw_to_nic(struct netmap_adapter *
 	}
 }
 
+
 /*
  * netmap_sync_to_host() passes packets up. We are called from a
  * system call in user process context, and the only contention
@@ -827,6 +1014,18 @@ netmap_sync_to_host(struct netmap_adapte
 	netmap_send_up(na->ifp, q.head);
 }
 
+
+/* SWNA(ifp)->tx_rings[0] is always NA(ifp)->tx_rings[NA(ifp)->num_tx_rings] */
+static int
+netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int do_lock)
+{
+	(void)ring_nr;
+	(void)do_lock;
+	netmap_sync_to_host(NA(ifp));
+	return 0;
+}
+
+
 /*
  * rxsync backend for packets coming from the host stack.
  * They have been put in the queue by netmap_start() so we
@@ -881,38 +1080,60 @@ netmap_sync_from_host(struct netmap_adap
  * Return ENXIO if the interface does not exist, EINVAL if netmap
  * is not supported by the interface.
  * If successful, hold a reference.
+ *
+ * While the NIC is attached to a bridge, its reference is managed
+ * through na->na_bdg_refcount using ADD/DROP_BDG_REF(), just as for
+ * virtual ports.  Hence, on the final DROP_BDG_REF(), the NIC
+ * is detached from the bridge, then ifp's refcount is dropped (this
+ * is equivalent to destroying the ifp in the case of virtual ports).
+ *
+ * This function uses if_rele() when we want to prevent the NIC from
+ * being detached from the bridge in error handling.  But once refcount
+ * is acquired by this function, it must be released using nm_if_rele().
  */
 static int
-get_ifp(const char *name, struct ifnet **ifp)
+get_ifp(struct nmreq *nmr, struct ifnet **ifp)
 {
+	const char *name = nmr->nr_name;
+	int namelen = strlen(name);
 #ifdef NM_BRIDGE
 	struct ifnet *iter = NULL;
+	int no_prefix = 0;
 
 	do {
 		struct nm_bridge *b;
-		int i, l, cand = -1;
+		struct netmap_adapter *na;
+		int i, cand = -1, cand2 = -1;
 
-		if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1))
+		if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
+			no_prefix = 1;
 			break;
-		b = nm_find_bridge(name);
+		}
+		b = nm_find_bridge(name, 1 /* create a new one if no exist */ );
 		if (b == NULL) {
 			D("no bridges available for '%s'", name);
 			return (ENXIO);
 		}
-		/* XXX locking */
-		BDG_LOCK(b);
+		/* Now we are sure that name starts with the bridge's name */
+		BDG_WLOCK(b);
 		/* lookup in the local list of ports */
 		for (i = 0; i < NM_BDG_MAXPORTS; i++) {
-			iter = b->bdg_ports[i];
-			if (iter == NULL) {
+			na = BDG_GET_VAR(b->bdg_ports[i]);
+			if (na == NULL) {
 				if (cand == -1)
 					cand = i; /* potential insert point */
+				else if (cand2 == -1)
+					cand2 = i; /* for host stack */
 				continue;
 			}
-			if (!strcmp(iter->if_xname, name)) {
+			iter = na->ifp;
+			/* XXX make sure the name only contains one : */
+			if (!strcmp(iter->if_xname, name) /* virtual port */ ||
+			    (namelen > b->namelen && !strcmp(iter->if_xname,
+			    name + b->namelen + 1)) /* NIC */) {
 				ADD_BDG_REF(iter);
 				ND("found existing interface");
-				BDG_UNLOCK(b);
+				BDG_WUNLOCK(b);
 				break;
 			}
 		}
@@ -921,23 +1142,73 @@ get_ifp(const char *name, struct ifnet *
 		if (cand == -1) {
 			D("bridge full, cannot create new port");
 no_port:
-			BDG_UNLOCK(b);
+			BDG_WUNLOCK(b);
 			*ifp = NULL;
 			return EINVAL;
 		}
 		ND("create new bridge port %s", name);
-		/* space for forwarding list after the ifnet */
-		l = sizeof(*iter) +
-			 sizeof(struct nm_bdg_fwd)*NM_BDG_BATCH ;
-		iter = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
-		if (!iter)
-			goto no_port;
-		strcpy(iter->if_xname, name);
-		bdg_netmap_attach(iter);
-		b->bdg_ports[cand] = iter;
-		iter->if_bridge = b;
+		/*
+		 * create a struct ifnet for the new port.
+		 * The forwarding table is attached to the kring(s).
+		 */
+		/*
+		 * try see if there is a matching NIC with this name
+		 * (after the bridge's name)
+		 */
+		iter = ifunit_ref(name + b->namelen + 1);
+		if (!iter) { /* this is a virtual port */
+			/* Create a temporary NA with arguments, then
+			 * bdg_netmap_attach() will allocate the real one
+			 * and attach it to the ifp
+			 */
+			struct netmap_adapter tmp_na;
+
+			if (nmr->nr_cmd) /* nr_cmd must be for a NIC */
+				goto no_port;
+			bzero(&tmp_na, sizeof(tmp_na));
+			/* bound checking */
+			if (nmr->nr_tx_rings < 1)
+				nmr->nr_tx_rings = 1;
+			if (nmr->nr_tx_rings > NM_BDG_MAXRINGS)
+				nmr->nr_tx_rings = NM_BDG_MAXRINGS;
+			tmp_na.num_tx_rings = nmr->nr_tx_rings;
+			if (nmr->nr_rx_rings < 1)
+				nmr->nr_rx_rings = 1;
+			if (nmr->nr_rx_rings > NM_BDG_MAXRINGS)
+				nmr->nr_rx_rings = NM_BDG_MAXRINGS;
+			tmp_na.num_rx_rings = nmr->nr_rx_rings;
+
+			iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO);
+			if (!iter)
+				goto no_port;
+			strcpy(iter->if_xname, name);
+			tmp_na.ifp = iter;
+			/* bdg_netmap_attach creates a struct netmap_adapter */
+			bdg_netmap_attach(&tmp_na);
+		} else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */
+			/* cannot attach the NIC that any user or another
+			 * bridge already holds.
+			 */
+			if (NETMAP_OWNED_BY_ANY(iter) || cand2 == -1) {
+ifunit_rele:
+				if_rele(iter); /* don't detach from bridge */
+				goto no_port;
+			}
+			/* bind the host stack to the bridge */
+			if (nmr->nr_arg1 == NETMAP_BDG_HOST) {
+				BDG_SET_VAR(b->bdg_ports[cand2], SWNA(iter));
+				SWNA(iter)->bdg_port = cand2;
+				SWNA(iter)->na_bdg = b;
+			}
+		} else /* not a netmap-capable NIC */
+			goto ifunit_rele;
+		na = NA(iter);
+		na->bdg_port = cand;
+		/* bind the port to the bridge (virtual ports are not active) */
+		BDG_SET_VAR(b->bdg_ports[cand], na);
+		na->na_bdg = b;
 		ADD_BDG_REF(iter);
-		BDG_UNLOCK(b);
+		BDG_WUNLOCK(b);
 		ND("attaching virtual bridge %p", b);
 	} while (0);
 	*ifp = iter;
@@ -949,8 +1220,16 @@ no_port:
 	/* can do this if the capability exists and if_pspare[0]
 	 * points to the netmap descriptor.
 	 */
-	if (NETMAP_CAPABLE(*ifp))
+	if (NETMAP_CAPABLE(*ifp)) {
+#ifdef NM_BRIDGE
+		/* Users cannot use the NIC attached to a bridge directly */
+		if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) {
+			if_rele(*ifp); /* don't detach from bridge */
+			return EINVAL;
+		} else
+#endif /* NM_BRIDGE */
 		return 0;	/* valid pointer, we hold the refcount */
+	}
 	nm_if_rele(*ifp);
 	return EINVAL;	// not NETMAP capable
 }
@@ -1059,6 +1338,296 @@ netmap_set_ringid(struct netmap_priv_d *
 	return 0;
 }
 
+
+/*
+ * possibly move the interface to netmap-mode.
+ * If success it returns a pointer to netmap_if, otherwise NULL.
+ * This must be called with NMA_LOCK held.
+ */
+static struct netmap_if *
+netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp,
+	uint16_t ringid, int *err)
+{
+	struct netmap_adapter *na = NA(ifp);
+	struct netmap_if *nifp = NULL;
+	int i, error;
+
+	if (na->na_bdg)
+		BDG_WLOCK(na->na_bdg);
+	na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
+
+	/* ring configuration may have changed, fetch from the card */
+	netmap_update_config(na);
+	priv->np_ifp = ifp;     /* store the reference */
+	error = netmap_set_ringid(priv, ringid);
+	if (error)
+		goto out;
+	nifp = netmap_if_new(ifp->if_xname, na);
+	if (nifp == NULL) { /* allocation failed */
+		error = ENOMEM;
+	} else if (ifp->if_capenable & IFCAP_NETMAP) {
+		/* was already set */
+	} else {
+		/* Otherwise set the card in netmap mode
+		 * and make it use the shared buffers.
+		 */
+		for (i = 0 ; i < na->num_tx_rings + 1; i++)
+			mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock",
+			    MTX_NETWORK_LOCK, MTX_DEF);
+		for (i = 0 ; i < na->num_rx_rings + 1; i++) {
+			mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock",
+			    MTX_NETWORK_LOCK, MTX_DEF);
+		}
+		if (nma_is_hw(na)) {
+			SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings];
+			SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings];
+		}
+		error = na->nm_register(ifp, 1); /* mode on */
+#ifdef NM_BRIDGE
+		if (!error)
+			error = nm_alloc_bdgfwd(na);
+#endif /* NM_BRIDGE */
+		if (error) {
+			netmap_dtor_locked(priv);
+			/* nifp is not yet in priv, so free it separately */
+			netmap_if_free(nifp);
+			nifp = NULL;
+		}
+
+	}
+out:
+	*err = error;
+	na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
+	if (na->na_bdg)
+		BDG_WUNLOCK(na->na_bdg);
+	return nifp;
+}
+
+
+/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
+static int
+kern_netmap_regif(struct nmreq *nmr)
+{
+	struct ifnet *ifp;
+	struct netmap_if *nifp;
+	struct netmap_priv_d *npriv;
+	int error;
+
+	npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
+	if (npriv == NULL)
+		return ENOMEM;
+	error = netmap_get_memory(npriv);
+	if (error) {
+free_exit:
+		bzero(npriv, sizeof(*npriv));
+		free(npriv, M_DEVBUF);
+		return error;
+	}
+
+	NMA_LOCK();
+	error = get_ifp(nmr, &ifp);
+	if (error) { /* no device, or another bridge or user owns the device */
+		NMA_UNLOCK();
+		goto free_exit;
+	} else if (!NETMAP_OWNED_BY_KERN(ifp)) {
+		/* got reference to a virtual port or direct access to a NIC.
+		 * perhaps specified no bridge's prefix or wrong NIC's name
+		 */
+		error = EINVAL;
+unref_exit:
+		nm_if_rele(ifp);
+		NMA_UNLOCK();
+		goto free_exit;
+	}
+
+	if (nmr->nr_cmd == NETMAP_BDG_DETACH) {
+		if (NA(ifp)->refcount == 0) { /* not registered */
+			error = EINVAL;
+			goto unref_exit;
+		}
+		NMA_UNLOCK();
+
+		netmap_dtor(NA(ifp)->na_kpriv); /* unregister */
+		NA(ifp)->na_kpriv = NULL;
+		nm_if_rele(ifp); /* detach from the bridge */
+		goto free_exit;
+	} else if (NA(ifp)->refcount > 0) { /* already registered */
+		error = EINVAL;
+		goto unref_exit;
+	}
+
+	nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error);
+	if (!nifp)
+		goto unref_exit;
+	wmb(); // XXX do we need it ?
+	npriv->np_nifp = nifp;
+	NA(ifp)->na_kpriv = npriv;
+	NMA_UNLOCK();
+	D("registered %s to netmap-mode", ifp->if_xname);
+	return 0;
+}
+
+
+/* CORE_LOCK is not necessary */
+static void
+netmap_swlock_wrapper(struct ifnet *dev, int what, u_int queueid)
+{
+	struct netmap_adapter *na = SWNA(dev);
+
+	switch (what) {
+	case NETMAP_TX_LOCK:
+		mtx_lock(&na->tx_rings[queueid].q_lock);
+		break;
+
+	case NETMAP_TX_UNLOCK:
+		mtx_unlock(&na->tx_rings[queueid].q_lock);
+		break;
+
+	case NETMAP_RX_LOCK:
+		mtx_lock(&na->rx_rings[queueid].q_lock);
+		break;
+
+	case NETMAP_RX_UNLOCK:
+		mtx_unlock(&na->rx_rings[queueid].q_lock);
+		break;
+	}
+}
+
+
+/* Initialize the necessary fields of the sw adapter located right after the
+ * hw one.  The sw adapter attaches a pair of sw rings of the netmap-mode NIC.
+ * It is always activated and deactivated at the same time as the hw one.
+ * Thus we don't need refcounting on the sw adapter.
+ * Regardless of NIC's feature we use separate lock so that anybody can lock
+ * me independently from the hw adapter.
+ * Make sure nm_register is NULL to be handled as FALSE in nma_is_hw
+ */
+static void
+netmap_attach_sw(struct ifnet *ifp)
+{
+	struct netmap_adapter *hw_na = NA(ifp);
+	struct netmap_adapter *na = SWNA(ifp);
+
+	na->ifp = ifp;
+	na->separate_locks = 1;
+	na->nm_lock = netmap_swlock_wrapper;
+	na->num_rx_rings = na->num_tx_rings = 1;
+	na->num_tx_desc = hw_na->num_tx_desc;
+	na->num_rx_desc = hw_na->num_rx_desc;
+	na->nm_txsync = netmap_bdg_to_host;
+}
+
+
+/* exported to kernel callers */
+int
+netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
+{
+	struct nm_bridge *b;
+	struct netmap_adapter *na;
+	struct ifnet *iter;
+	char *name = nmr->nr_name;
+	int cmd = nmr->nr_cmd, namelen = strlen(name);
+	int error = 0, i, j;
+
+	switch (cmd) {
+	case NETMAP_BDG_ATTACH:
+	case NETMAP_BDG_DETACH:
+		error = kern_netmap_regif(nmr);
+		break;
+
+	case NETMAP_BDG_LIST:
+		/* this is used to enumerate bridges and ports */
+		if (namelen) { /* look up indexes of bridge and port */
+			if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
+				error = EINVAL;
+				break;
+			}
+			b = nm_find_bridge(name, 0 /* don't create */);
+			if (!b) {
+				error = ENOENT;
+				break;
+			}
+
+			BDG_RLOCK(b);
+			error = ENOENT;
+			for (i = 0; i < NM_BDG_MAXPORTS; i++) {
+				na = BDG_GET_VAR(b->bdg_ports[i]);
+				if (na == NULL)
+					continue;
+				iter = na->ifp;
+				/* the former and the latter identify a
+				 * virtual port and a NIC, respectively
+				 */
+				if (!strcmp(iter->if_xname, name) ||
+				    (namelen > b->namelen &&
+				    !strcmp(iter->if_xname,
+				    name + b->namelen + 1))) {
+					/* bridge index */
+					nmr->nr_arg1 = b - nm_bridges;
+					nmr->nr_arg2 = i; /* port index */
+					error = 0;
+					break;
+				}
+			}
+			BDG_RUNLOCK(b);
+		} else {
+			/* return the first non-empty entry starting from
+			 * bridge nr_arg1 and port nr_arg2.
+			 *
+			 * Users can detect the end of the same bridge by
+			 * seeing the new and old value of nr_arg1, and can
+			 * detect the end of all the bridge by error != 0
+			 */
+			i = nmr->nr_arg1;
+			j = nmr->nr_arg2;
+
+			for (error = ENOENT; error && i < NM_BRIDGES; i++) {
+				b = nm_bridges + i;
+				BDG_RLOCK(b);
+				for (; j < NM_BDG_MAXPORTS; j++) {
+					na = BDG_GET_VAR(b->bdg_ports[j]);
+					if (na == NULL)
+						continue;
+					iter = na->ifp;
+					nmr->nr_arg1 = i;
+					nmr->nr_arg2 = j;
+					strncpy(name, iter->if_xname, IFNAMSIZ);
+					error = 0;
+					break;
+				}
+				BDG_RUNLOCK(b);
+				j = 0; /* following bridges scan from 0 */
+			}

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list