svn commit: r251139 - in head: sys/dev/netmap sys/net tools/tools/netmap
Luigi Rizzo
luigi at FreeBSD.org
Thu May 30 14:07:14 UTC 2013
Author: luigi
Date: Thu May 30 14:07:14 2013
New Revision: 251139
URL: http://svnweb.freebsd.org/changeset/base/251139
Log:
Bring in a number of new features, mostly implemented by Michio Honda:
- the VALE switch now support up to 254 destinations per switch,
unicast or broadcast (multicast goes to all ports).
- we can attach hw interfaces and the host stack to a VALE switch,
which means we will be able to use it more or less as a native bridge
(minor tweaks still necessary).
A 'vale-ctl' program is supplied in tools/tools/netmap
to attach/detach ports to the switch, and to list the current configuration.
- the lookup function in the VALE switch can be reassigned to
something else, similar to the pf hooks. This will enable
attaching the firewall, or other processing functions (e.g. in-kernel
openvswitch) directly on the netmap port.
The internal API used by device drivers does not change.
Userspace applications should be recompiled because we
bump NETMAP_API as we now use some fields in the struct nmreq
that were previously ignored -- otherwise, data structures
are the same.
Manpages will be committed separately.
Added:
head/tools/tools/netmap/vale-ctl.c (contents, props changed)
Modified:
head/sys/dev/netmap/netmap.c
head/sys/dev/netmap/netmap_kern.h
head/sys/net/netmap.h
head/tools/tools/netmap/Makefile
Modified: head/sys/dev/netmap/netmap.c
==============================================================================
--- head/sys/dev/netmap/netmap.c Thu May 30 13:41:19 2013 (r251138)
+++ head/sys/dev/netmap/netmap.c Thu May 30 14:07:14 2013 (r251139)
@@ -119,6 +119,9 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, mitiga
int netmap_no_pendintr = 1;
SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
+int netmap_txsync_retry = 2;
+SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
+ &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
int netmap_drop = 0; /* debugging */
int netmap_flags = 0; /* debug flags */
@@ -128,25 +131,30 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, drop,
SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
-#ifdef NM_BRIDGE /* support for netmap bridge */
+#ifdef NM_BRIDGE /* support for netmap virtual switch, called VALE */
/*
- * system parameters.
+ * system parameters (most of them in netmap_kern.h)
+ * NM_NAME prefix for switch port names, default "vale"
+ * NM_MAXPORTS number of ports
+ * NM_BRIDGES max number of switches in the system.
+ * XXX should become a sysctl or tunable
*
- * All switched ports have prefix NM_NAME.
- * The switch has a max of NM_BDG_MAXPORTS ports (often stored in a bitmap,
- * so a practical upper bound is 64).
- * Each tx ring is read-write, whereas rx rings are readonly (XXX not done yet).
+ * Switch ports are named valeX:Y where X is the switch name and Y
+ * is the port. If Y matches a physical interface name, the port is
+ * connected to a physical device.
+ *
+ * Unlike physical interfaces, switch ports use their own memory region
+ * for rings and buffers.
* The virtual interfaces use per-queue lock instead of core lock.
* In the tx loop, we aggregate traffic in batches to make all operations
* faster. The batch size is NM_BDG_BATCH
*/
-#define NM_NAME "vale" /* prefix for the interface */
-#define NM_BDG_MAXPORTS 16 /* up to 64 ? */
+#define NM_BDG_MAXRINGS 16 /* XXX unclear how many. */
#define NM_BRIDGE_RINGSIZE 1024 /* in the device */
#define NM_BDG_HASH 1024 /* forwarding table entries */
#define NM_BDG_BATCH 1024 /* entries in the forwarding buffer */
-#define NM_BRIDGES 4 /* number of bridges */
+#define NM_BRIDGES 8 /* number of bridges */
int netmap_bridge = NM_BDG_BATCH; /* bridge batch size */
@@ -174,14 +182,27 @@ SYSCTL_INT(_dev_netmap, OID_AUTO, bridge
#define ADD_BDG_REF(ifp) refcount_acquire(&NA(ifp)->na_bdg_refcount)
#define DROP_BDG_REF(ifp) refcount_release(&NA(ifp)->na_bdg_refcount)
-static void bdg_netmap_attach(struct ifnet *ifp);
+static void bdg_netmap_attach(struct netmap_adapter *);
static int bdg_netmap_reg(struct ifnet *ifp, int onoff);
+static int kern_netmap_regif(struct nmreq *nmr);
+
/* per-tx-queue entry */
struct nm_bdg_fwd { /* forwarding entry for a bridge */
void *buf;
- uint64_t dst; /* dst mask */
- uint32_t src; /* src index ? */
- uint16_t len; /* src len */
+ uint32_t ft_dst; /* dst port */
+ uint16_t ft_len; /* src len */
+ uint16_t ft_next; /* next packet to same destination */
+};
+
+/* We need to build a list of buffers going to each destination.
+ * Each buffer is in one entry of struct nm_bdg_fwd, we use ft_next
+ * to build the list, and struct nm_bdg_q below for the queue.
+ * The structure should compact because potentially we have a lot
+ * of destinations.
+ */
+struct nm_bdg_q {
+ uint16_t bq_head;
+ uint16_t bq_tail;
};
struct nm_hash_ent {
@@ -198,26 +219,78 @@ struct nm_hash_ent {
* The bridge is non blocking on the transmit ports.
*
* bdg_lock protects accesses to the bdg_ports array.
+ * This is a rw lock (or equivalent).
*/
struct nm_bridge {
- struct ifnet *bdg_ports[NM_BDG_MAXPORTS];
- int n_ports;
- uint64_t act_ports;
- int freelist; /* first buffer index */
- NM_SELINFO_T si; /* poll/select wait queue */
- NM_LOCK_T bdg_lock; /* protect the selinfo ? */
+ int namelen; /* 0 means free */
- /* the forwarding table, MAC+ports */
- struct nm_hash_ent ht[NM_BDG_HASH];
+ /* XXX what is the proper alignment/layout ? */
+ NM_RWLOCK_T bdg_lock; /* protects bdg_ports */
+ struct netmap_adapter *bdg_ports[NM_BDG_MAXPORTS];
- int namelen; /* 0 means free */
char basename[IFNAMSIZ];
+ /*
+ * The function to decide the destination port.
+ * It returns either of an index of the destination port,
+ * NM_BDG_BROADCAST to broadcast this packet, or NM_BDG_NOPORT not to
+ * forward this packet. ring_nr is the source ring index, and the
+ * function may overwrite this value to forward this packet to a
+ * different ring index.
+ * This function must be set by netmap_bdgctl().
+ */
+ bdg_lookup_fn_t nm_bdg_lookup;
+
+ /* the forwarding table, MAC+ports */
+ struct nm_hash_ent ht[NM_BDG_HASH];
};
struct nm_bridge nm_bridges[NM_BRIDGES];
+NM_LOCK_T netmap_bridge_mutex;
-#define BDG_LOCK(b) mtx_lock(&(b)->bdg_lock)
-#define BDG_UNLOCK(b) mtx_unlock(&(b)->bdg_lock)
+/* other OS will have these macros defined in their own glue code. */
+
+#ifdef __FreeBSD__
+#define BDG_LOCK() mtx_lock(&netmap_bridge_mutex)
+#define BDG_UNLOCK() mtx_unlock(&netmap_bridge_mutex)
+#define BDG_WLOCK(b) rw_wlock(&(b)->bdg_lock)
+#define BDG_WUNLOCK(b) rw_wunlock(&(b)->bdg_lock)
+#define BDG_RLOCK(b) rw_rlock(&(b)->bdg_lock)
+#define BDG_RUNLOCK(b) rw_runlock(&(b)->bdg_lock)
+
+/* set/get variables. OS-specific macros may wrap these
+ * assignments into read/write lock or similar
+ */
+#define BDG_SET_VAR(lval, p) (lval = p)
+#define BDG_GET_VAR(lval) (lval)
+#define BDG_FREE(p) free(p, M_DEVBUF)
+#endif /* __FreeBSD__ */
+
+static __inline int
+nma_is_vp(struct netmap_adapter *na)
+{
+ return na->nm_register == bdg_netmap_reg;
+}
+static __inline int
+nma_is_host(struct netmap_adapter *na)
+{
+ return na->nm_register == NULL;
+}
+static __inline int
+nma_is_hw(struct netmap_adapter *na)
+{
+ /* In case of sw adapter, nm_register is NULL */
+ return !nma_is_vp(na) && !nma_is_host(na);
+}
+
+/*
+ * Regarding holding a NIC, if the NIC is owned by the kernel
+ * (i.e., bridge), neither another bridge nor user can use it;
+ * if the NIC is owned by a user, only users can share it.
+ * Evaluation must be done under NMA_LOCK().
+ */
+#define NETMAP_OWNED_BY_KERN(ifp) (!nma_is_vp(NA(ifp)) && NA(ifp)->na_bdg)
+#define NETMAP_OWNED_BY_ANY(ifp) \
+ (NETMAP_OWNED_BY_KERN(ifp) || (NA(ifp)->refcount > 0))
/*
* NA(ifp)->bdg_port port index
@@ -245,15 +318,16 @@ pkt_copy(void *_src, void *_dst, int l)
}
}
+
/*
* locate a bridge among the existing ones.
* a ':' in the name terminates the bridge name. Otherwise, just NM_NAME.
* We assume that this is called with a name of at least NM_NAME chars.
*/
static struct nm_bridge *
-nm_find_bridge(const char *name)
+nm_find_bridge(const char *name, int create)
{
- int i, l, namelen, e;
+ int i, l, namelen;
struct nm_bridge *b = NULL;
namelen = strlen(NM_NAME); /* base length */
@@ -268,29 +342,94 @@ nm_find_bridge(const char *name)
namelen = IFNAMSIZ;
ND("--- prefix is '%.*s' ---", namelen, name);
- /* use the first entry for locking */
- BDG_LOCK(nm_bridges); // XXX do better
- for (e = -1, i = 1; i < NM_BRIDGES; i++) {
- b = nm_bridges + i;
- if (b->namelen == 0)
- e = i; /* record empty slot */
- else if (strncmp(name, b->basename, namelen) == 0) {
+ BDG_LOCK();
+ /* lookup the name, remember empty slot if there is one */
+ for (i = 0; i < NM_BRIDGES; i++) {
+ struct nm_bridge *x = nm_bridges + i;
+
+ if (x->namelen == 0) {
+ if (create && b == NULL)
+ b = x; /* record empty slot */
+ } else if (x->namelen != namelen) {
+ continue;
+ } else if (strncmp(name, x->basename, namelen) == 0) {
ND("found '%.*s' at %d", namelen, name, i);
+ b = x;
break;
}
}
- if (i == NM_BRIDGES) { /* all full */
- if (e == -1) { /* no empty slot */
- b = NULL;
- } else {
- b = nm_bridges + e;
- strncpy(b->basename, name, namelen);
- b->namelen = namelen;
- }
+ if (i == NM_BRIDGES && b) { /* name not found, can create entry */
+ strncpy(b->basename, name, namelen);
+ b->namelen = namelen;
+ /* set the default function */
+ b->nm_bdg_lookup = netmap_bdg_learning;
+ /* reset the MAC address table */
+ bzero(b->ht, sizeof(struct nm_hash_ent) * NM_BDG_HASH);
}
- BDG_UNLOCK(nm_bridges);
+ BDG_UNLOCK();
return b;
}
+
+
+/*
+ * Free the forwarding tables for rings attached to switch ports.
+ */
+static void
+nm_free_bdgfwd(struct netmap_adapter *na)
+{
+ int nrings, i;
+ struct netmap_kring *kring;
+
+ nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
+ kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
+ for (i = 0; i < nrings; i++) {
+ if (kring[i].nkr_ft) {
+ free(kring[i].nkr_ft, M_DEVBUF);
+ kring[i].nkr_ft = NULL; /* protect from freeing twice */
+ }
+ }
+ if (nma_is_hw(na))
+ nm_free_bdgfwd(SWNA(na->ifp));
+}
+
+
+/*
+ * Allocate the forwarding tables for the rings attached to the bridge ports.
+ */
+static int
+nm_alloc_bdgfwd(struct netmap_adapter *na)
+{
+ int nrings, l, i, num_dstq;
+ struct netmap_kring *kring;
+
+ /* all port:rings + broadcast */
+ num_dstq = NM_BDG_MAXPORTS * NM_BDG_MAXRINGS + 1;
+ l = sizeof(struct nm_bdg_fwd) * NM_BDG_BATCH;
+ l += sizeof(struct nm_bdg_q) * num_dstq;
+ l += sizeof(uint16_t) * NM_BDG_BATCH;
+
+ nrings = nma_is_vp(na) ? na->num_tx_rings : na->num_rx_rings;
+ kring = nma_is_vp(na) ? na->tx_rings : na->rx_rings;
+ for (i = 0; i < nrings; i++) {
+ struct nm_bdg_fwd *ft;
+ struct nm_bdg_q *dstq;
+ int j;
+
+ ft = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!ft) {
+ nm_free_bdgfwd(na);
+ return ENOMEM;
+ }
+ dstq = (struct nm_bdg_q *)(ft + NM_BDG_BATCH);
+ for (j = 0; j < num_dstq; j++)
+ dstq[j].bq_head = dstq[j].bq_tail = NM_BDG_BATCH;
+ kring[i].nkr_ft = ft;
+ }
+ if (nma_is_hw(na))
+ nm_alloc_bdgfwd(SWNA(na->ifp));
+ return 0;
+}
+
#endif /* NM_BRIDGE */
@@ -413,20 +552,11 @@ netmap_dtor_locked(void *data)
if (netmap_verbose)
D("deleting last instance for %s", ifp->if_xname);
/*
- * there is a race here with *_netmap_task() and
- * netmap_poll(), which don't run under NETMAP_REG_LOCK.
- * na->refcount == 0 && na->ifp->if_capenable & IFCAP_NETMAP
- * (aka NETMAP_DELETING(na)) are a unique marker that the
- * device is dying.
- * Before destroying stuff we sleep a bit, and then complete
- * the job. NIOCREG should realize the condition and
- * loop until they can continue; the other routines
- * should check the condition at entry and quit if
- * they cannot run.
+ * (TO CHECK) This function is only called
+ * when the last reference to this file descriptor goes
+ * away. This means we cannot have any pending poll()
+ * or interrupt routine operating on the structure.
*/
- na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
- tsleep(na, 0, "NIOCUNREG", 4);
- na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
na->nm_register(ifp, 0); /* off, clear IFCAP_NETMAP */
/* Wake up any sleeping threads. netmap_poll will
* then return POLLERR
@@ -437,6 +567,9 @@ netmap_dtor_locked(void *data)
selwakeuppri(&na->rx_rings[i].si, PI_NET);
selwakeuppri(&na->tx_si, PI_NET);
selwakeuppri(&na->rx_si, PI_NET);
+#ifdef NM_BRIDGE
+ nm_free_bdgfwd(na);
+#endif /* NM_BRIDGE */
/* release all buffers */
for (i = 0; i < na->num_tx_rings + 1; i++) {
struct netmap_ring *ring = na->tx_rings[i].ring;
@@ -458,49 +591,81 @@ netmap_dtor_locked(void *data)
/* knlist_destroy(&na->tx_si.si_note); */
/* knlist_destroy(&na->rx_si.si_note); */
netmap_free_rings(na);
- wakeup(na);
+ if (nma_is_hw(na))
+ SWNA(ifp)->tx_rings = SWNA(ifp)->rx_rings = NULL;
}
netmap_if_free(nifp);
}
+
+/* we assume netmap adapter exists */
static void
nm_if_rele(struct ifnet *ifp)
{
#ifndef NM_BRIDGE
if_rele(ifp);
#else /* NM_BRIDGE */
- int i, full;
+ int i, full = 0, is_hw;
struct nm_bridge *b;
+ struct netmap_adapter *na;
- if (strncmp(ifp->if_xname, NM_NAME, sizeof(NM_NAME) - 1)) {
+ /* I can be called not only for get_ifp()-ed references where netmap's
+ * capability is guaranteed, but also for non-netmap-capable NICs.
+ */
+ if (!NETMAP_CAPABLE(ifp) || !NA(ifp)->na_bdg) {
if_rele(ifp);
return;
}
if (!DROP_BDG_REF(ifp))
return;
- b = ifp->if_bridge;
- BDG_LOCK(nm_bridges);
- BDG_LOCK(b);
+
+ na = NA(ifp);
+ b = na->na_bdg;
+ is_hw = nma_is_hw(na);
+
+ BDG_WLOCK(b);
ND("want to disconnect %s from the bridge", ifp->if_xname);
full = 0;
+ /* remove the entry from the bridge, also check
+ * if there are any leftover interfaces
+ * XXX we should optimize this code, e.g. going directly
+ * to na->bdg_port, and having a counter of ports that
+ * are connected. But it is not in a critical path.
+ * In NIC's case, index of sw na is always higher than hw na
+ */
for (i = 0; i < NM_BDG_MAXPORTS; i++) {
- if (b->bdg_ports[i] == ifp) {
- b->bdg_ports[i] = NULL;
- bzero(ifp, sizeof(*ifp));
- free(ifp, M_DEVBUF);
- break;
- }
- else if (b->bdg_ports[i] != NULL)
+ struct netmap_adapter *tmp = BDG_GET_VAR(b->bdg_ports[i]);
+
+ if (tmp == na) {
+ /* disconnect from bridge */
+ BDG_SET_VAR(b->bdg_ports[i], NULL);
+ na->na_bdg = NULL;
+ if (is_hw && SWNA(ifp)->na_bdg) {
+ /* disconnect sw adapter too */
+ int j = SWNA(ifp)->bdg_port;
+ BDG_SET_VAR(b->bdg_ports[j], NULL);
+ SWNA(ifp)->na_bdg = NULL;
+ }
+ } else if (tmp != NULL) {
full = 1;
+ }
}
- BDG_UNLOCK(b);
+ BDG_WUNLOCK(b);
if (full == 0) {
- ND("freeing bridge %d", b - nm_bridges);
+ ND("marking bridge %d as free", b - nm_bridges);
b->namelen = 0;
+ b->nm_bdg_lookup = NULL;
}
- BDG_UNLOCK(nm_bridges);
- if (i == NM_BDG_MAXPORTS)
+ if (na->na_bdg) { /* still attached to the bridge */
D("ouch, cannot find ifp to remove");
+ } else if (is_hw) {
+ if_rele(ifp);
+ } else {
+ bzero(na, sizeof(*na));
+ free(na, M_DEVBUF);
+ bzero(ifp, sizeof(*ifp));
+ free(ifp, M_DEVBUF);
+ }
#endif /* NM_BRIDGE */
}
@@ -514,9 +679,13 @@ netmap_dtor(void *data)
if (ifp) {
struct netmap_adapter *na = NA(ifp);
+ if (na->na_bdg)
+ BDG_WLOCK(na->na_bdg);
na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
netmap_dtor_locked(data);
na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
+ if (na->na_bdg)
+ BDG_WUNLOCK(na->na_bdg);
nm_if_rele(ifp); /* might also destroy *na */
}
@@ -528,6 +697,7 @@ netmap_dtor(void *data)
free(priv, M_DEVBUF);
}
+
#ifdef __FreeBSD__
#include <vm/vm.h>
#include <vm/vm_param.h>
@@ -536,8 +706,16 @@ netmap_dtor(void *data)
#include <vm/vm_pager.h>
#include <vm/uma.h>
+/*
+ * In order to track whether pages are still mapped, we hook into
+ * the standard cdev_pager and intercept the constructor and
+ * destructor.
+ * XXX but then ? Do we really use the information ?
+ * Need to investigate.
+ */
static struct cdev_pager_ops saved_cdev_pager_ops;
+
static int
netmap_dev_pager_ctor(void *handle, vm_ooffset_t size, vm_prot_t prot,
vm_ooffset_t foff, struct ucred *cred, u_short *color)
@@ -548,6 +726,7 @@ netmap_dev_pager_ctor(void *handle, vm_o
size, prot, foff, cred, color);
}
+
static void
netmap_dev_pager_dtor(void *handle)
{
@@ -562,6 +741,8 @@ static struct cdev_pager_ops netmap_cdev
.cdev_pg_fault = NULL,
};
+
+// XXX check whether we need netmap_mmap_single _and_ netmap_mmap
static int
netmap_mmap_single(struct cdev *cdev, vm_ooffset_t *foff,
vm_size_t objsize, vm_object_t *objp, int prot)
@@ -630,6 +811,7 @@ netmap_mmap(__unused struct cdev *dev,
return (*paddr ? 0 : ENOMEM);
}
+
static int
netmap_close(struct cdev *dev, int fflag, int devtype, struct thread *td)
{
@@ -639,6 +821,7 @@ netmap_close(struct cdev *dev, int fflag
return 0;
}
+
static int
netmap_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
{
@@ -677,6 +860,7 @@ netmap_open(struct cdev *dev, int oflags
* might take a while before releasing the buffer.
*/
+
/*
* pass a chain of buffers to the host stack as coming from 'dst'
*/
@@ -701,6 +885,7 @@ struct mbq {
int count;
};
+
/*
* put a copy of the buffers marked NS_FORWARD into an mbuf chain.
* Run from hwcur to cur - reserved
@@ -745,6 +930,7 @@ netmap_grab_packets(struct netmap_kring
q->tail = tail;
}
+
/*
* called under main lock to send packets from the host to the NIC
* The host ring has packets from nr_hwcur to (cur - reserved)
@@ -794,6 +980,7 @@ netmap_sw_to_nic(struct netmap_adapter *
}
}
+
/*
* netmap_sync_to_host() passes packets up. We are called from a
* system call in user process context, and the only contention
@@ -827,6 +1014,18 @@ netmap_sync_to_host(struct netmap_adapte
netmap_send_up(na->ifp, q.head);
}
+
+/* SWNA(ifp)->txrings[0] is always NA(ifp)->txrings[NA(ifp)->num_txrings] */
+static int
+netmap_bdg_to_host(struct ifnet *ifp, u_int ring_nr, int do_lock)
+{
+ (void)ring_nr;
+ (void)do_lock;
+ netmap_sync_to_host(NA(ifp));
+ return 0;
+}
+
+
/*
* rxsync backend for packets coming from the host stack.
* They have been put in the queue by netmap_start() so we
@@ -881,38 +1080,60 @@ netmap_sync_from_host(struct netmap_adap
* Return ENXIO if the interface does not exist, EINVAL if netmap
* is not supported by the interface.
* If successful, hold a reference.
+ *
+ * During the NIC is attached to a bridge, reference is managed
+ * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as
+ * virtual ports. Hence, on the final DROP_BDG_REF(), the NIC
+ * is detached from the bridge, then ifp's refcount is dropped (this
+ * is equivalent to that ifp is destroyed in case of virtual ports.
+ *
+ * This function uses if_rele() when we want to prevent the NIC from
+ * being detached from the bridge in error handling. But once refcount
+ * is acquired by this function, it must be released using nm_if_rele().
*/
static int
-get_ifp(const char *name, struct ifnet **ifp)
+get_ifp(struct nmreq *nmr, struct ifnet **ifp)
{
+ const char *name = nmr->nr_name;
+ int namelen = strlen(name);
#ifdef NM_BRIDGE
struct ifnet *iter = NULL;
+ int no_prefix = 0;
do {
struct nm_bridge *b;
- int i, l, cand = -1;
+ struct netmap_adapter *na;
+ int i, cand = -1, cand2 = -1;
- if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1))
+ if (strncmp(name, NM_NAME, sizeof(NM_NAME) - 1)) {
+ no_prefix = 1;
break;
- b = nm_find_bridge(name);
+ }
+ b = nm_find_bridge(name, 1 /* create a new one if no exist */ );
if (b == NULL) {
D("no bridges available for '%s'", name);
return (ENXIO);
}
- /* XXX locking */
- BDG_LOCK(b);
+ /* Now we are sure that name starts with the bridge's name */
+ BDG_WLOCK(b);
/* lookup in the local list of ports */
for (i = 0; i < NM_BDG_MAXPORTS; i++) {
- iter = b->bdg_ports[i];
- if (iter == NULL) {
+ na = BDG_GET_VAR(b->bdg_ports[i]);
+ if (na == NULL) {
if (cand == -1)
cand = i; /* potential insert point */
+ else if (cand2 == -1)
+ cand2 = i; /* for host stack */
continue;
}
- if (!strcmp(iter->if_xname, name)) {
+ iter = na->ifp;
+ /* XXX make sure the name only contains one : */
+ if (!strcmp(iter->if_xname, name) /* virtual port */ ||
+ (namelen > b->namelen && !strcmp(iter->if_xname,
+ name + b->namelen + 1)) /* NIC */) {
ADD_BDG_REF(iter);
ND("found existing interface");
- BDG_UNLOCK(b);
+ BDG_WUNLOCK(b);
break;
}
}
@@ -921,23 +1142,73 @@ get_ifp(const char *name, struct ifnet *
if (cand == -1) {
D("bridge full, cannot create new port");
no_port:
- BDG_UNLOCK(b);
+ BDG_WUNLOCK(b);
*ifp = NULL;
return EINVAL;
}
ND("create new bridge port %s", name);
- /* space for forwarding list after the ifnet */
- l = sizeof(*iter) +
- sizeof(struct nm_bdg_fwd)*NM_BDG_BATCH ;
- iter = malloc(l, M_DEVBUF, M_NOWAIT | M_ZERO);
- if (!iter)
- goto no_port;
- strcpy(iter->if_xname, name);
- bdg_netmap_attach(iter);
- b->bdg_ports[cand] = iter;
- iter->if_bridge = b;
+ /*
+ * create a struct ifnet for the new port.
+ * The forwarding table is attached to the kring(s).
+ */
+ /*
+ * try see if there is a matching NIC with this name
+ * (after the bridge's name)
+ */
+ iter = ifunit_ref(name + b->namelen + 1);
+ if (!iter) { /* this is a virtual port */
+ /* Create a temporary NA with arguments, then
+ * bdg_netmap_attach() will allocate the real one
+ * and attach it to the ifp
+ */
+ struct netmap_adapter tmp_na;
+
+ if (nmr->nr_cmd) /* nr_cmd must be for a NIC */
+ goto no_port;
+ bzero(&tmp_na, sizeof(tmp_na));
+ /* bound checking */
+ if (nmr->nr_tx_rings < 1)
+ nmr->nr_tx_rings = 1;
+ if (nmr->nr_tx_rings > NM_BDG_MAXRINGS)
+ nmr->nr_tx_rings = NM_BDG_MAXRINGS;
+ tmp_na.num_tx_rings = nmr->nr_tx_rings;
+ if (nmr->nr_rx_rings < 1)
+ nmr->nr_rx_rings = 1;
+ if (nmr->nr_rx_rings > NM_BDG_MAXRINGS)
+ nmr->nr_rx_rings = NM_BDG_MAXRINGS;
+ tmp_na.num_rx_rings = nmr->nr_rx_rings;
+
+ iter = malloc(sizeof(*iter), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (!iter)
+ goto no_port;
+ strcpy(iter->if_xname, name);
+ tmp_na.ifp = iter;
+ /* bdg_netmap_attach creates a struct netmap_adapter */
+ bdg_netmap_attach(&tmp_na);
+ } else if (NETMAP_CAPABLE(iter)) { /* this is a NIC */
+ /* cannot attach the NIC that any user or another
+ * bridge already holds.
+ */
+ if (NETMAP_OWNED_BY_ANY(iter) || cand2 == -1) {
+ifunit_rele:
+ if_rele(iter); /* don't detach from bridge */
+ goto no_port;
+ }
+ /* bind the host stack to the bridge */
+ if (nmr->nr_arg1 == NETMAP_BDG_HOST) {
+ BDG_SET_VAR(b->bdg_ports[cand2], SWNA(iter));
+ SWNA(iter)->bdg_port = cand2;
+ SWNA(iter)->na_bdg = b;
+ }
+ } else /* not a netmap-capable NIC */
+ goto ifunit_rele;
+ na = NA(iter);
+ na->bdg_port = cand;
+ /* bind the port to the bridge (virtual ports are not active) */
+ BDG_SET_VAR(b->bdg_ports[cand], na);
+ na->na_bdg = b;
ADD_BDG_REF(iter);
- BDG_UNLOCK(b);
+ BDG_WUNLOCK(b);
ND("attaching virtual bridge %p", b);
} while (0);
*ifp = iter;
@@ -949,8 +1220,16 @@ no_port:
/* can do this if the capability exists and if_pspare[0]
* points to the netmap descriptor.
*/
- if (NETMAP_CAPABLE(*ifp))
+ if (NETMAP_CAPABLE(*ifp)) {
+#ifdef NM_BRIDGE
+ /* Users cannot use the NIC attached to a bridge directly */
+ if (no_prefix && NETMAP_OWNED_BY_KERN(*ifp)) {
+ if_rele(*ifp); /* don't detach from bridge */
+ return EINVAL;
+ } else
+#endif /* NM_BRIDGE */
return 0; /* valid pointer, we hold the refcount */
+ }
nm_if_rele(*ifp);
return EINVAL; // not NETMAP capable
}
@@ -1059,6 +1338,296 @@ netmap_set_ringid(struct netmap_priv_d *
return 0;
}
+
+/*
+ * possibly move the interface to netmap-mode.
+ * If success it returns a pointer to netmap_if, otherwise NULL.
+ * This must be called with NMA_LOCK held.
+ */
+static struct netmap_if *
+netmap_do_regif(struct netmap_priv_d *priv, struct ifnet *ifp,
+ uint16_t ringid, int *err)
+{
+ struct netmap_adapter *na = NA(ifp);
+ struct netmap_if *nifp = NULL;
+ int i, error;
+
+ if (na->na_bdg)
+ BDG_WLOCK(na->na_bdg);
+ na->nm_lock(ifp, NETMAP_REG_LOCK, 0);
+
+ /* ring configuration may have changed, fetch from the card */
+ netmap_update_config(na);
+ priv->np_ifp = ifp; /* store the reference */
+ error = netmap_set_ringid(priv, ringid);
+ if (error)
+ goto out;
+ nifp = netmap_if_new(ifp->if_xname, na);
+ if (nifp == NULL) { /* allocation failed */
+ error = ENOMEM;
+ } else if (ifp->if_capenable & IFCAP_NETMAP) {
+ /* was already set */
+ } else {
+ /* Otherwise set the card in netmap mode
+ * and make it use the shared buffers.
+ */
+ for (i = 0 ; i < na->num_tx_rings + 1; i++)
+ mtx_init(&na->tx_rings[i].q_lock, "nm_txq_lock",
+ MTX_NETWORK_LOCK, MTX_DEF);
+ for (i = 0 ; i < na->num_rx_rings + 1; i++) {
+ mtx_init(&na->rx_rings[i].q_lock, "nm_rxq_lock",
+ MTX_NETWORK_LOCK, MTX_DEF);
+ }
+ if (nma_is_hw(na)) {
+ SWNA(ifp)->tx_rings = &na->tx_rings[na->num_tx_rings];
+ SWNA(ifp)->rx_rings = &na->rx_rings[na->num_rx_rings];
+ }
+ error = na->nm_register(ifp, 1); /* mode on */
+#ifdef NM_BRIDGE
+ if (!error)
+ error = nm_alloc_bdgfwd(na);
+#endif /* NM_BRIDGE */
+ if (error) {
+ netmap_dtor_locked(priv);
+ /* nifp is not yet in priv, so free it separately */
+ netmap_if_free(nifp);
+ nifp = NULL;
+ }
+
+ }
+out:
+ *err = error;
+ na->nm_lock(ifp, NETMAP_REG_UNLOCK, 0);
+ if (na->na_bdg)
+ BDG_WUNLOCK(na->na_bdg);
+ return nifp;
+}
+
+
+/* Process NETMAP_BDG_ATTACH and NETMAP_BDG_DETACH */
+static int
+kern_netmap_regif(struct nmreq *nmr)
+{
+ struct ifnet *ifp;
+ struct netmap_if *nifp;
+ struct netmap_priv_d *npriv;
+ int error;
+
+ npriv = malloc(sizeof(*npriv), M_DEVBUF, M_NOWAIT|M_ZERO);
+ if (npriv == NULL)
+ return ENOMEM;
+ error = netmap_get_memory(npriv);
+ if (error) {
+free_exit:
+ bzero(npriv, sizeof(*npriv));
+ free(npriv, M_DEVBUF);
+ return error;
+ }
+
+ NMA_LOCK();
+ error = get_ifp(nmr, &ifp);
+ if (error) { /* no device, or another bridge or user owns the device */
+ NMA_UNLOCK();
+ goto free_exit;
+ } else if (!NETMAP_OWNED_BY_KERN(ifp)) {
+ /* got reference to a virtual port or direct access to a NIC.
+ * perhaps specified no bridge's prefix or wrong NIC's name
+ */
+ error = EINVAL;
+unref_exit:
+ nm_if_rele(ifp);
+ NMA_UNLOCK();
+ goto free_exit;
+ }
+
+ if (nmr->nr_cmd == NETMAP_BDG_DETACH) {
+ if (NA(ifp)->refcount == 0) { /* not registered */
+ error = EINVAL;
+ goto unref_exit;
+ }
+ NMA_UNLOCK();
+
+ netmap_dtor(NA(ifp)->na_kpriv); /* unregister */
+ NA(ifp)->na_kpriv = NULL;
+ nm_if_rele(ifp); /* detach from the bridge */
+ goto free_exit;
+ } else if (NA(ifp)->refcount > 0) { /* already registered */
+ error = EINVAL;
+ goto unref_exit;
+ }
+
+ nifp = netmap_do_regif(npriv, ifp, nmr->nr_ringid, &error);
+ if (!nifp)
+ goto unref_exit;
+ wmb(); // XXX do we need it ?
+ npriv->np_nifp = nifp;
+ NA(ifp)->na_kpriv = npriv;
+ NMA_UNLOCK();
+ D("registered %s to netmap-mode", ifp->if_xname);
+ return 0;
+}
+
+
+/* CORE_LOCK is not necessary */
+static void
+netmap_swlock_wrapper(struct ifnet *dev, int what, u_int queueid)
+{
+ struct netmap_adapter *na = SWNA(dev);
+
+ switch (what) {
+ case NETMAP_TX_LOCK:
+ mtx_lock(&na->tx_rings[queueid].q_lock);
+ break;
+
+ case NETMAP_TX_UNLOCK:
+ mtx_unlock(&na->tx_rings[queueid].q_lock);
+ break;
+
+ case NETMAP_RX_LOCK:
+ mtx_lock(&na->rx_rings[queueid].q_lock);
+ break;
+
+ case NETMAP_RX_UNLOCK:
+ mtx_unlock(&na->rx_rings[queueid].q_lock);
+ break;
+ }
+}
+
+
+/* Initialize necessary fields of sw adapter located in right after hw's
+ * one. sw adapter attaches a pair of sw rings of the netmap-mode NIC.
+ * It is always activated and deactivated at the same tie with the hw's one.
+ * Thus we don't need refcounting on the sw adapter.
+ * Regardless of NIC's feature we use separate lock so that anybody can lock
+ * me independently from the hw adapter.
+ * Make sure nm_register is NULL to be handled as FALSE in nma_is_hw
+ */
+static void
+netmap_attach_sw(struct ifnet *ifp)
+{
+ struct netmap_adapter *hw_na = NA(ifp);
+ struct netmap_adapter *na = SWNA(ifp);
+
+ na->ifp = ifp;
+ na->separate_locks = 1;
+ na->nm_lock = netmap_swlock_wrapper;
+ na->num_rx_rings = na->num_tx_rings = 1;
+ na->num_tx_desc = hw_na->num_tx_desc;
+ na->num_rx_desc = hw_na->num_rx_desc;
+ na->nm_txsync = netmap_bdg_to_host;
+}
+
+
+/* exported to kernel callers */
+int
+netmap_bdg_ctl(struct nmreq *nmr, bdg_lookup_fn_t func)
+{
+ struct nm_bridge *b;
+ struct netmap_adapter *na;
+ struct ifnet *iter;
+ char *name = nmr->nr_name;
+ int cmd = nmr->nr_cmd, namelen = strlen(name);
+ int error = 0, i, j;
+
+ switch (cmd) {
+ case NETMAP_BDG_ATTACH:
+ case NETMAP_BDG_DETACH:
+ error = kern_netmap_regif(nmr);
+ break;
+
+ case NETMAP_BDG_LIST:
+ /* this is used to enumerate bridges and ports */
+ if (namelen) { /* look up indexes of bridge and port */
+ if (strncmp(name, NM_NAME, strlen(NM_NAME))) {
+ error = EINVAL;
+ break;
+ }
+ b = nm_find_bridge(name, 0 /* don't create */);
+ if (!b) {
+ error = ENOENT;
+ break;
+ }
+
+ BDG_RLOCK(b);
+ error = ENOENT;
+ for (i = 0; i < NM_BDG_MAXPORTS; i++) {
+ na = BDG_GET_VAR(b->bdg_ports[i]);
+ if (na == NULL)
+ continue;
+ iter = na->ifp;
+ /* the former and the latter identify a
+ * virtual port and a NIC, respectively
+ */
+ if (!strcmp(iter->if_xname, name) ||
+ (namelen > b->namelen &&
+ !strcmp(iter->if_xname,
+ name + b->namelen + 1))) {
+ /* bridge index */
+ nmr->nr_arg1 = b - nm_bridges;
+ nmr->nr_arg2 = i; /* port index */
+ error = 0;
+ break;
+ }
+ }
+ BDG_RUNLOCK(b);
+ } else {
+ /* return the first non-empty entry starting from
+ * bridge nr_arg1 and port nr_arg2.
+ *
+ * Users can detect the end of the same bridge by
+ * seeing the new and old value of nr_arg1, and can
+ * detect the end of all the bridge by error != 0
+ */
+ i = nmr->nr_arg1;
+ j = nmr->nr_arg2;
+
+ for (error = ENOENT; error && i < NM_BRIDGES; i++) {
+ b = nm_bridges + i;
+ BDG_RLOCK(b);
+ for (; j < NM_BDG_MAXPORTS; j++) {
+ na = BDG_GET_VAR(b->bdg_ports[j]);
+ if (na == NULL)
+ continue;
+ iter = na->ifp;
+ nmr->nr_arg1 = i;
+ nmr->nr_arg2 = j;
+ strncpy(name, iter->if_xname, IFNAMSIZ);
+ error = 0;
+ break;
+ }
+ BDG_RUNLOCK(b);
+ j = 0; /* following bridges scan from 0 */
+ }
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-all
mailing list