PERFORCE change 113714 for review

Robert Watson rwatson at FreeBSD.org
Tue Jan 30 18:48:36 UTC 2007


http://perforce.freebsd.org/chv.cgi?CH=113714

Change 113714 by rwatson at rwatson_cinnamon on 2007/01/30 18:48:29

	First experimentation with zero-copy (i.e., one-copy) BPF:
	
	- Break out the current BPF buffering model into bpf_buffer.c.
	
	- Add a new bpf_zerocopy.c that implements an alternative buffering
	  model based on pages of memory "donated" by a user process.
	
	The basic model is that the BPF consumer will select a buffering
	mode before attaching to an interface; if none is selected, then
	the default is the current behavior providing complete
	compatibility.  If zero-copy buffering is selected, an ioctl() is
	used to identify two page-aligned, integer multiple of page-sized
	buffers that will be wired by the kernel and used in place of
	malloc(9)-allocated buffers.  These buffers will be written to directly by
	the BPF tap routines, and are laid out identically to buffers
	read from user space.  For now, an explicit acknowledgement via an
	ioctl is required to rotate the buffers when user space has
	completed reading the current finished buffer (hbuf) in similar
	style to what occurs when bpfread() finishes currently.  It would
	be desirable to trigger rotation via the shared memory also,
	perhaps via a head with flags indicating if user space is ready for
	the buffer to be re-used.
	
	sf_bufs are used to map the user pages into kernel, which requires
	a fair amount of bookkeeping as they must be individually managed
	and are not contiguously mapped.  However, this is significantly
	more functional than trying to do the same with an mmaping of the
	BPF device due to serious limitations in the device pager.
	
	I have not yet updated any applications (and especially libpcap)
	or performed any performance benchmarking or serious debugging.
	This is simply a code snapshot as a starting point for discussion.

Affected files ...

.. //depot/projects/zcopybpf/src/sys/conf/files#3 edit
.. //depot/projects/zcopybpf/src/sys/conf/options#2 edit
.. //depot/projects/zcopybpf/src/sys/net/bpf.c#2 edit
.. //depot/projects/zcopybpf/src/sys/net/bpf.h#2 edit
.. //depot/projects/zcopybpf/src/sys/net/bpf_buffer.c#3 edit
.. //depot/projects/zcopybpf/src/sys/net/bpf_buffer.h#1 add
.. //depot/projects/zcopybpf/src/sys/net/bpf_zerocopy.c#1 add
.. //depot/projects/zcopybpf/src/sys/net/bpf_zerocopy.h#1 add
.. //depot/projects/zcopybpf/src/sys/net/bpfdesc.h#3 edit

Differences ...

==== //depot/projects/zcopybpf/src/sys/conf/files#3 (text+ko) ====

@@ -1510,8 +1510,10 @@
 libkern/strtouq.c		standard
 libkern/strvalid.c		standard
 net/bpf.c			standard
+net/bpf_buffer.c		optional bpf
 net/bpf_jitter.c		optional bpf_jitter
 net/bpf_filter.c		optional bpf | netgraph_bpf
+net/bpf_zerocopy.c		optional bpf_zerocopy
 net/bridgestp.c			optional if_bridge
 net/bsd_comp.c			optional ppp_bsdcomp
 net/if.c			standard

==== //depot/projects/zcopybpf/src/sys/conf/options#2 (text+ko) ====

@@ -492,6 +492,7 @@
 # DRM options
 DRM_DEBUG		opt_drm.h
 
+BPF_ZEROCOPY		opt_bpf.h
 ZERO_COPY_SOCKETS	opt_zero.h
 TI_PRIVATE_JUMBOS	opt_ti.h
 TI_JUMBO_HDRSPLIT	opt_ti.h

==== //depot/projects/zcopybpf/src/sys/net/bpf.c#2 (text+ko) ====

@@ -65,9 +65,13 @@
 
 #include <net/if.h>
 #include <net/bpf.h>
+#include <net/bpf_buffer.h>
 #ifdef BPF_JITTER
 #include <net/bpf_jitter.h>
 #endif
+#ifdef BPF_ZEROCOPY
+#include <net/bpf_zerocopy.h>
+#endif
 #include <net/bpfdesc.h>
 
 #include <netinet/in.h>
@@ -79,7 +83,7 @@
 
 #include <security/mac/mac_framework.h>
 
-static MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
+MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 
 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 
@@ -95,19 +99,17 @@
 static struct mtx	bpf_mtx;		/* bpf global lock */
 static int		bpf_bpfd_cnt;
 
-static void	bpf_allocbufs(struct bpf_d *);
 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
 static void	bpf_detachd(struct bpf_d *);
 static void	bpf_freed(struct bpf_d *);
-static void	bpf_mcopy(const void *, void *, size_t);
 static int	bpf_movein(struct uio *, int, int,
 		    struct mbuf **, struct sockaddr *, struct bpf_insn *);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
 static void	bpf_timed_out(void *);
 static __inline void
 		bpf_wakeup(struct bpf_d *);
-static void	catchpacket(struct bpf_d *, u_char *, u_int,
-		    u_int, void (*)(const void *, void *, size_t),
+static void	catchpacket(struct bpf_d *, u_char *, u_int, u_int,
+		    void (*)(struct bpf_d *, caddr_t, u_int, void *, u_int),
 		    struct timeval *);
 static void	reset_d(struct bpf_d *);
 static int	 bpf_setf(struct bpf_d *, struct bpf_program *, u_long cmd);
@@ -123,12 +125,6 @@
  * The default read buffer size is patchable.
  */
 SYSCTL_NODE(_net, OID_AUTO, bpf, CTLFLAG_RW, 0, "bpf sysctl");
-static int bpf_bufsize = 4096;
-SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
-    &bpf_bufsize, 0, "");
-static int bpf_maxbufsize = BPF_MAXBUFSIZE;
-SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
-    &bpf_maxbufsize, 0, "");
 static int bpf_maxinsns = BPF_MAXINSNS;
 SYSCTL_INT(_net_bpf, OID_AUTO, maxinsns, CTLFLAG_RW,
     &bpf_maxinsns, 0, "Maximum bpf program instructions");
@@ -159,7 +155,163 @@
 static struct filterops bpfread_filtops =
 	{ 1, NULL, filt_bpfdetach, filt_bpfread };
 
+/*
+ * Wrapper functions for various buffering methods.  If the set of buffer
+ * modes expands, we will probably want to introduce a switch data structure
+ * similar to protosw, etc.
+ */
+static void
+bpf_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+    u_int len)
+{
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_append_bytes(d, buf, offset, src, len));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		return (bpf_zerocopy_append_bytes(d, buf, offset, src, len));
+#endif
+
+	default:
+		panic("bpf_buf_append_bytes");
+	}
+}
+
+static void
+bpf_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+    u_int len)
+{
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_append_mbuf(d, buf, offset, src, len));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		return (bpf_zerocopy_append_mbuf(d, buf, offset, src, len));
+#endif
+
+	default:
+		panic("bpf_buf_append_mbuf");
+	}
+}
+
+static void
+bpf_free(struct bpf_d *d)
+{
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_free(d));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		return (bpf_zerocopy_free(d));
+#endif
+
+	default:
+		panic("bpf_buf_free");
+	}
+}
+
+static int
+bpf_uiomove(struct bpf_d *d, caddr_t buf, u_int len, struct uio *uio)
+{
+
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		return (bpf_buffer_uiomove(d, buf, len, uio));
+
+#ifdef BPF_ZEROCOPY
+	case BPF_BUFMODE_ZBUF:
+		return (bpf_zerocopy_uiomove(d, buf, len, uio));
+#endif
+
+	default:
+		panic("bpf_buf_uiomove");
+	}
+}
+
 static int
+bpf_ioctl_sblen(struct bpf_d *d, u_int *i)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_BUFFER)
+		return (EOPNOTSUPP);
+	return (bpf_buffer_ioctl_sblen(d, i));
+}
+
+static int
+bpf_ioctl_ackzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_ackzbuf(td, d, bz));
+#else
+	panic("bpf_ioctl_getznext");
+#endif
+}
+
+static int
+bpf_ioctl_getzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_getzbuf(td, d, bz));
+#else
+	panic("bpf_ioctl_getznext");
+#endif
+}
+
+static int
+bpf_ioctl_getznext(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_getznext(td, d, bz));
+#else
+	panic("bpf_ioctl_getznext");
+#endif
+}
+
+static int
+bpf_ioctl_getzmax(struct thread *td, struct bpf_d *d, u_int *i)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_getzmax(td, d, i));
+#else
+	return (ENOTTY);
+#endif
+}
+
+static int
+bpf_ioctl_setzbuf(struct thread *td, struct bpf_d *d, struct bpf_zbuf *bz)
+{
+
+	if (d->bd_bufmode != BPF_BUFMODE_ZBUF)
+		return (EOPNOTSUPP);
+#ifdef BPF_ZEROCOPY
+	return (bpf_zerocopy_ioctl_setzbuf(td, d, bz));
+#else
+	return (ENOTTY);
+#endif
+}
+
+/*
+ * General BPF functions.
+ */
+static int
 bpf_movein(struct uio *uio, int linktype, int mtu, struct mbuf **mp,
     struct sockaddr *sockp, struct bpf_insn *wfilter)
 {
@@ -404,7 +556,14 @@
 		    "bpf%d", dev2unit(dev));
 	MALLOC(d, struct bpf_d *, sizeof(*d), M_BPF, M_WAITOK | M_ZERO);
 	dev->si_drv1 = d;
-	d->bd_bufsize = bpf_bufsize;
+
+	/*
+	 * XXXRW: For historical reasons, perform a one-time initialization
+	 * call to the buffer routines, even though we're not yet committed
+	 * to a particular buffer method.
+	 */
+	bpf_buffer_init(d);
+	d->bd_bufmode = BPF_BUFMODE_DEFAULT;
 	d->bd_sig = SIGIO;
 	d->bd_seesent = 1;
 	d->bd_pid = td->td_proc->p_pid;
@@ -451,18 +610,19 @@
 	return (0);
 }
 
-
 /*
- * Rotate the packet buffers in descriptor d.  Move the store buffer
- * into the hold slot, and the free buffer into the store slot.
- * Zero the length of the new store buffer.
+ * Rotate the packet buffers in descriptor d.  Move the store buffer into
+ * the hold slot, and the free buffer into the store slot.  Zero the length of
+ * the new store buffer.  Descriptor lock should be held.
  */
-#define ROTATE_BUFFERS(d) \
-	(d)->bd_hbuf = (d)->bd_sbuf; \
-	(d)->bd_hlen = (d)->bd_slen; \
-	(d)->bd_sbuf = (d)->bd_fbuf; \
-	(d)->bd_slen = 0; \
-	(d)->bd_fbuf = NULL;
+#define	ROTATE_BUFFERS(d)	do {					\
+	(d)->bd_hbuf = (d)->bd_sbuf;					\
+	(d)->bd_hlen = (d)->bd_slen;					\
+	(d)->bd_sbuf = (d)->bd_fbuf;					\
+	(d)->bd_slen = 0;						\
+	(d)->bd_fbuf = NULL;						\
+} while (0)
+
 /*
  *  bpfread - read next chunk of packets from buffers
  */
@@ -553,8 +713,12 @@
 	 * Move data from hold buffer into user space.
 	 * We know the entire buffer is transferred since
 	 * we checked above that the read buffer is bpf_bufsize bytes.
+	 *
+	 * XXXRW: More synchronization needed here: what if a second thread
+	 * issues a read on the same fd at the same time?  Don't want this
+	 * getting invalidated.
 	 */
-	error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
+	error = bpf_uiomove(d, d->bd_hbuf, d->bd_hlen, uio);
 
 	BPFD_LOCK(d);
 	d->bd_fbuf = d->bd_hbuf;
@@ -565,7 +729,6 @@
 	return (error);
 }
 
-
 /*
  * If there are processes sleeping on this descriptor, wake them up.
  */
@@ -685,6 +848,10 @@
  *  BIOCGSEESENT	Get "see packets sent" flag
  *  BIOCSSEESENT	Set "see packets sent" flag
  *  BIOCLOCK		Set "locked" flag
+ *  BIOCGETZBUF		Query current zero-copy buffer locations.
+ *  BIOCSETZBUF		Set current zero-copy buffer locations.
+ *  BIOCACKZBUF		Acknowledge reading zero-copy buffers.
+ *  BIOCGETZMAX		Get maximum zero-copy buffer size.
  */
 /* ARGSUSED */
 static	int
@@ -721,6 +888,8 @@
 		case BIOCSRTIMEOUT:
 		case BIOCIMMEDIATE:
 		case TIOCGPGRP:
+		case BIOCACKZBUF:
+		case BIOCGETZBUF:
 			break;
 		default:
 			return (EPERM);
@@ -773,17 +942,7 @@
 	 * Set buffer length.
 	 */
 	case BIOCSBLEN:
-		if (d->bd_bif != NULL)
-			error = EINVAL;
-		else {
-			u_int size = *(u_int *)addr;
-
-			if (size > bpf_maxbufsize)
-				*(u_int *)addr = size = bpf_maxbufsize;
-			else if (size < BPF_MINBUFSIZE)
-				*(u_int *)addr = size = BPF_MINBUFSIZE;
-			d->bd_bufsize = size;
-		}
+		error = bpf_ioctl_sblen(d, (u_int *)addr);
 		break;
 
 	/*
@@ -1002,6 +1161,59 @@
 	case BIOCGRSIG:
 		*(u_int *)addr = d->bd_sig;
 		break;
+
+	case BIOCGETBUFMODE:
+		*(u_int *)addr = d->bd_bufmode;
+		break;
+
+	case BIOCSETBUFMODE:
+		/*
+		 * Allow the buffering mode to be changed as long as we
+		 * haven't yet committed to a particular mode.  Our
+		 * definition of commitment, for now, is whether or not a
+		 * buffer has been allocated or an interface attached, since
+		 * that's the point where things get tricky.
+		 *
+		 * XXXRW: This will need some refinement.  Is checking both
+		 * for buffers and interface binding redundant?
+		 */
+		switch (*(u_int *)addr) {
+		case BPF_BUFMODE_BUFFER:
+			break;
+
+#ifdef BPF_ZEROCOPY
+		case BPF_BUFMODE_ZBUF:
+			break;
+#endif
+
+		default:
+			return (EINVAL);
+		}
+
+		BPFD_LOCK(d);
+		if (d->bd_sbuf != NULL || d->bd_hbuf != NULL ||
+		    d->bd_fbuf != NULL || d->bd_bif != NULL) {
+			BPFD_UNLOCK(d);
+			return (EBUSY);
+		}
+		d->bd_bufmode = *(u_int *)addr;
+		BPFD_UNLOCK(d);
+		break;
+
+	case BIOCACKZBUF:
+		return (bpf_ioctl_ackzbuf(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCGETZBUF:
+		return (bpf_ioctl_getzbuf(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCGETZMAX:
+		return (bpf_ioctl_getzmax(td, d, (u_int *)addr));
+
+	case BIOCGETZNEXT:
+		return (bpf_ioctl_getznext(td, d, (struct bpf_zbuf *)addr));
+
+	case BIOCSETZBUF:
+		return (bpf_ioctl_setzbuf(td, d, (struct bpf_zbuf *)addr));
 	}
 	return (error);
 }
@@ -1102,13 +1314,30 @@
 		return (ENXIO);
 
 	bp = theywant->if_bpf;
+
 	/*
-	 * Allocate the packet buffers if we need to.
-	 * If we're already attached to requested interface,
-	 * just flush the buffer.
+	 * Behavior here depends on the buffering model.  If we're using
+	 * kernel memory buffers, then we can allocate them here.  If we're
+	 * using zero-copy, then the user process must have registered
+	 * buffers by the time we get here.  If not, return an error.
+	 *
+	 * XXXRW: Could this be better abstracted?
 	 */
-	if (d->bd_sbuf == NULL)
-		bpf_allocbufs(d);
+	switch (d->bd_bufmode) {
+	case BPF_BUFMODE_BUFFER:
+		if (d->bd_sbuf == NULL)
+			bpf_buffer_alloc(d);
+		KASSERT(d->bd_sbuf != NULL, ("bpf_setif: bd_sbuf NULL"));
+		break;
+
+	case BPF_BUFMODE_ZBUF:
+		if (d->bd_sbuf == NULL)
+			return (EINVAL);
+		break;
+
+	default:
+		panic("bpf_setif: bufmode %d", d->bd_bufmode);
+	}
 	if (bp != d->bd_bif) {
 		if (d->bd_bif)
 			/*
@@ -1252,7 +1481,8 @@
 #ifdef MAC
 			if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
-				catchpacket(d, pkt, pktlen, slen, bcopy, &tv);
+				catchpacket(d, pkt, pktlen, slen,
+				    bpf_append_bytes, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
@@ -1260,30 +1490,6 @@
 }
 
 /*
- * Copy data from an mbuf chain into a buffer.  This code is derived
- * from m_copydata in sys/uipc_mbuf.c.
- */
-static void
-bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
-{
-	const struct mbuf *m;
-	u_int count;
-	u_char *dst;
-
-	m = src_arg;
-	dst = dst_arg;
-	while (len > 0) {
-		if (m == NULL)
-			panic("bpf_mcopy");
-		count = min(m->m_len, len);
-		bcopy(mtod(m, void *), dst, count);
-		m = m->m_next;
-		dst += count;
-		len -= count;
-	}
-}
-
-/*
  * Incoming linkage from device drivers, when packet is in an mbuf chain.
  */
 void
@@ -1323,7 +1529,7 @@
 			if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)m, pktlen, slen,
-				    bpf_mcopy, &tv);
+				    bpf_append_mbuf, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
@@ -1373,7 +1579,7 @@
 			if (mac_check_bpfdesc_receive(d, bp->bif_ifp) == 0)
 #endif
 				catchpacket(d, (u_char *)&mb, pktlen, slen,
-				    bpf_mcopy, &tv);
+				    bpf_append_mbuf, &tv);
 		}
 		BPFD_UNLOCK(d);
 	}
@@ -1384,14 +1590,15 @@
  * Move the packet data from interface memory (pkt) into the
  * store buffer.  "cpfn" is the routine called to do the actual data
  * transfer.  bcopy is passed in to copy contiguous chunks, while
- * bpf_mcopy is passed in to copy mbuf chains.  In the latter case,
+ * bpf_append_mbuf is passed in to copy mbuf chains.  In the latter case,
  * pkt is really an mbuf.
  */
 static void
 catchpacket(struct bpf_d *d, u_char *pkt, u_int pktlen, u_int snaplen,
-    void (*cpfn)(const void *, void *, size_t), struct timeval *tv)
+    void (*cpfn)(struct bpf_d *, caddr_t, u_int, void *, u_int),
+    struct timeval *tv)
 {
-	struct bpf_hdr *hp;
+	struct bpf_hdr hdr;
 	int totlen, curlen;
 	int hdrlen = d->bd_bif->bif_hdrlen;
 	int do_wakeup = 0;
@@ -1438,16 +1645,20 @@
 		do_wakeup = 1;
 
 	/*
-	 * Append the bpf header.
+	 * Append the bpf header.  Note we append the actual header size, but
+	 * move forward the length of the header plus padding.
 	 */
-	hp = (struct bpf_hdr *)(d->bd_sbuf + curlen);
-	hp->bh_tstamp = *tv;
-	hp->bh_datalen = pktlen;
-	hp->bh_hdrlen = hdrlen;
+	bzero(&hdr, sizeof(hdr));
+	hdr.bh_tstamp = *tv;
+	hdr.bh_datalen = pktlen;
+	hdr.bh_hdrlen = hdrlen;
+	hdr.bh_caplen = totlen - hdrlen;
+	bpf_append_bytes(d, d->bd_sbuf, curlen, &hdr, sizeof(hdr));
+
 	/*
 	 * Copy the packet data into the store buffer and update its length.
 	 */
-	(*cpfn)(pkt, (u_char *)hp + hdrlen, (hp->bh_caplen = totlen - hdrlen));
+	(*cpfn)(d, d->bd_sbuf, curlen + hdrlen, pkt, hdr.bh_caplen);
 	d->bd_slen = curlen + totlen;
 
 	if (do_wakeup)
@@ -1455,41 +1666,19 @@
 }
 
 /*
- * Initialize all nonzero fields of a descriptor.
- */
-static void
-bpf_allocbufs(struct bpf_d *d)
-{
-
-	KASSERT(d->bd_fbuf == NULL, ("bpf_allocbufs: bd_fbuf != NULL"));
-	KASSERT(d->bd_sbuf == NULL, ("bpf_allocbufs: bd_sbuf != NULL"));
-	KASSERT(d->bd_hbuf == NULL, ("bpf_allocbufs: bd_hbuf != NULL"));
-
-	d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
-	d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
-	d->bd_slen = 0;
-	d->bd_hlen = 0;
-}
-
-/*
  * Free buffers currently in use by a descriptor.
  * Called on close.
  */
 static void
 bpf_freed(struct bpf_d *d)
 {
+
 	/*
 	 * We don't need to lock out interrupts since this descriptor has
 	 * been detached from its interface and it yet hasn't been marked
 	 * free.
 	 */
-	if (d->bd_sbuf != NULL) {
-		free(d->bd_sbuf, M_BPF);
-		if (d->bd_hbuf != NULL)
-			free(d->bd_hbuf, M_BPF);
-		if (d->bd_fbuf != NULL)
-			free(d->bd_fbuf, M_BPF);
-	}
+	bpf_free(d);
 	if (d->bd_rfilter) {
 		free((caddr_t)d->bd_rfilter, M_BPF);
 #ifdef BPF_JITTER

==== //depot/projects/zcopybpf/src/sys/net/bpf.h#2 (text+ko) ====

@@ -92,6 +92,38 @@
 #define BPF_MAJOR_VERSION 1
 #define BPF_MINOR_VERSION 1
 
+/*
+ * Historically, BPF has supported a single buffering model, first using mbuf
+ * clusters in kernel, and later using malloc(9) buffers in kernel.  We now
+ * support multiple buffering modes, which may be queried and set using
+ * BIOCGETBUFMODE and BIOCSETBUFMODE.  So as to avoid handling the complexity
+ * of changing modes while sniffing packets, the mode becomes fixed once an
+ * interface has been attached to the BPF descriptor.
+ */
+#define	BPF_BUFMODE_BUFFER	1	/* Kernel buffers with read(). */
+#define	BPF_BUFMODE_ZBUF	2	/* Zero-copy buffers. */
+
+#define	BPF_BUFMODE_DEFAULT	BPF_BUFMODE_BUFFER	/* Default. */
+
+/*
+ * Struct used by BIOCACKZBUF, BIOCGETZNEXT, BIOCGETZBUF, BIOCSETZBUF:
+ * describes up to two zero-copy buffers as used by BPF.
+ *
+ * BIOCACKZBUF      Acknowledge read of stored zero-copy buffer (rotate).
+ * BIOCGETZBUF      Query current zero-copy buffer locations.
+ * BIOCGETZNEXT     Query next stored buffer, if available.
+ * BIOCSETZBUF      Set current zero-copy buffer locations (once only).
+ *
+ * Pointers may be set to NULL to indicate a buffer is not configured, should
+ * be freed, or is not being acknowledged.
+ */
+struct bpf_zbuf {
+	void	*bz_bufa;	/* Location of 'a' zero-copy buffer. */
+	size_t	 bz_bufalen;	/* Size of 'a' zero-copy buffer. */
+	void	*bz_bufb;	/* Location of 'b' zero-copy buffer. */
+	size_t	 bz_bufblen;	/* Size of 'b' zero-copy buffer. */
+};
+
 #define	BIOCGBLEN	_IOR('B',102, u_int)
 #define	BIOCSBLEN	_IOWR('B',102, u_int)
 #define	BIOCSETF	_IOW('B',103, struct bpf_program)
@@ -115,6 +147,13 @@
 #define	BIOCGDLTLIST	_IOWR('B',121, struct bpf_dltlist)
 #define	BIOCLOCK	_IO('B', 122)
 #define	BIOCSETWF	_IOW('B',123, struct bpf_program)
+#define	BIOCGETBUFMODE	_IOR('B', 124, u_int)
+#define	BIOCSETBUFMODE	_IOW('B', 125, u_int)
+#define	BIOCACKZBUF	_IOW('B', 126, struct bpf_zbuf)
+#define	BIOCGETZBUF	_IOR('B', 127, struct bpf_zbuf)
+#define	BIOCGETZMAX	_IOR('B', 128, u_int)
+#define	BIOCGETZNEXT	_IOR('B', 129, struct bpf_zbuf)
+#define	BIOCSETZBUF	_IOW('B', 130, struct bpf_zbuf)
 
 /*
  * Structure prepended to each packet.
@@ -615,6 +654,15 @@
 };
 
 #ifdef _KERNEL
+
+#ifdef MALLOC_DECLARE
+MALLOC_DECLARE(M_BPF);
+#endif
+
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_net_bpf);
+#endif
+
 /*
  * Descriptor associated with each attached hardware interface.
  */

==== //depot/projects/zcopybpf/src/sys/net/bpf_buffer.c#3 (text+ko) ====

@@ -1,4 +1,31 @@
 /*-
+ * Copyright (c) 2007 Seccuris Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson under contract to
+ * Seccuris Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
  * Copyright (c) 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
  *
@@ -37,180 +64,81 @@
  */
 
 #include "opt_bpf.h"
-#include "opt_mac.h"
-#include "opt_netgraph.h"
 
-#include <sys/types.h>
 #include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/conf.h>
-#include <sys/fcntl.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
-#include <sys/time.h>
-#include <sys/priv.h>
-#include <sys/proc.h>
-#include <sys/signalvar.h>
-#include <sys/filio.h>
-#include <sys/sockio.h>
-#include <sys/ttycom.h>
+#include <sys/socket.h>
 #include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
 
-#include <sys/event.h>
-#include <sys/file.h>
-#include <sys/poll.h>
-#include <sys/proc.h>
-
-#include <sys/socket.h>
-
 #include <net/if.h>
 #include <net/bpf.h>
-#ifdef BPF_JITTER
-#include <net/bpf_jitter.h>
-#endif
+#include <net/bpf_buffer.h>
 #include <net/bpfdesc.h>
 
-#include <netinet/in.h>
-#include <netinet/if_ether.h>
-#include <sys/kernel.h>
-#include <sys/sysctl.h>
+/*
+ * Implement historical kernel memory buffering model for BPF: two malloc(9)
+ * kernel buffers are hung off of the descriptor.  The size is fixed prior to
+ * attaching to an ifnet, and cannot be changed after that.  read(2) simply
+ * copies the data to user space using uiomove(9).
+ */
 
-#include <net80211/ieee80211_freebsd.h>
+static int bpf_bufsize = 4096;
+SYSCTL_INT(_net_bpf, OID_AUTO, bufsize, CTLFLAG_RW,
+    &bpf_bufsize, 0, "");
+static int bpf_maxbufsize = BPF_MAXBUFSIZE;
+SYSCTL_INT(_net_bpf, OID_AUTO, maxbufsize, CTLFLAG_RW,
+    &bpf_maxbufsize, 0, "");
 
-#include <security/mac/mac_framework.h>
+void
+bpf_buffer_alloc(struct bpf_d *d)
+{
 
-static MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
+	KASSERT(d->bd_fbuf == NULL, ("bpf_buffer_alloc: bd_fbuf != NULL"));
+	KASSERT(d->bd_sbuf == NULL, ("bpf_buffer_alloc: bd_sbuf != NULL"));
+	KASSERT(d->bd_hbuf == NULL, ("bpf_buffer_alloc: bd_hbuf != NULL"));
 
-#if defined(DEV_BPF) || defined(NETGRAPH_BPF)
+	// printf("bpf_buffer_alloc size %d\n", d->bd_bufsize);
 
-#define PRINET  26			/* interruptible */
+	d->bd_fbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
+	d->bd_sbuf = (caddr_t)malloc(d->bd_bufsize, M_BPF, M_WAITOK);
+	d->bd_hbuf = NULL;
+	d->bd_slen = 0;
+	d->bd_hlen = 0;
+}
 
 /*
- * Rotate the packet buffers in descriptor d.  Move the store buffer
- * into the hold slot, and the free buffer into the store slot.
- * Zero the length of the new store buffer.
+ * Simple data copy to the current kernel buffer.
  */
-#define ROTATE_BUFFERS(d) \
-	(d)->bd_hbuf = (d)->bd_sbuf; \
-	(d)->bd_hlen = (d)->bd_slen; \
-	(d)->bd_sbuf = (d)->bd_fbuf; \
-	(d)->bd_slen = 0; \
-	(d)->bd_fbuf = NULL;
-/*
- *  bpfread - read next chunk of packets from buffers
- */
-static	int
-bpfread(struct cdev *dev, struct uio *uio, int ioflag)
+void
+bpf_buffer_append_bytes(struct bpf_d *d, caddr_t buf, u_int offset,
+    void *src, u_int len)
 {
-	struct bpf_d *d = dev->si_drv1;
-	int timed_out;
-	int error;
+	u_char *src_bytes;
 
-	/*
-	 * Restrict application to use a buffer the same size as
-	 * as kernel buffers.
-	 */
-	if (uio->uio_resid != d->bd_bufsize)
-		return (EINVAL);
+	// printf("bpf_buffer_append_bytes size %d\n", len);
 
-	BPFD_LOCK(d);
-	if (d->bd_state == BPF_WAITING)
-		callout_stop(&d->bd_callout);
-	timed_out = (d->bd_state == BPF_TIMED_OUT);
-	d->bd_state = BPF_IDLE;
-	/*
-	 * If the hold buffer is empty, then do a timed sleep, which
-	 * ends when the timeout expires or when enough packets
-	 * have arrived to fill the store buffer.
-	 */
-	while (d->bd_hbuf == NULL) {
-		if ((d->bd_immediate || timed_out) && d->bd_slen != 0) {
-			/*
-			 * A packet(s) either arrived since the previous
-			 * read or arrived while we were asleep.
-			 * Rotate the buffers and return what's here.
-			 */
-			ROTATE_BUFFERS(d);
-			break;
-		}
-
-		/*
-		 * No data is available, check to see if the bpf device
-		 * is still pointed at a real interface.  If not, return
-		 * ENXIO so that the userland process knows to rebind
-		 * it before using it again.
-		 */
-		if (d->bd_bif == NULL) {
-			BPFD_UNLOCK(d);
-			return (ENXIO);
-		}
-
-		if (ioflag & O_NONBLOCK) {
-			BPFD_UNLOCK(d);
-			return (EWOULDBLOCK);
-		}
-		error = msleep(d, &d->bd_mtx, PRINET|PCATCH,
-		     "bpf", d->bd_rtout);
-		if (error == EINTR || error == ERESTART) {
-			BPFD_UNLOCK(d);
-			return (error);
-		}
-		if (error == EWOULDBLOCK) {
-			/*
-			 * On a timeout, return what's in the buffer,
-			 * which may be nothing.  If there is something
-			 * in the store buffer, we can rotate the buffers.
-			 */
-			if (d->bd_hbuf)
-				/*
-				 * We filled up the buffer in between
-				 * getting the timeout and arriving
-				 * here, so we don't need to rotate.
-				 */
-				break;
-
-			if (d->bd_slen == 0) {
-				BPFD_UNLOCK(d);
-				return (0);
-			}
-			ROTATE_BUFFERS(d);
-			break;
-		}
-	}
-	/*
-	 * At this point, we know we have something in the hold slot.
-	 */
-	BPFD_UNLOCK(d);
-
-	/*
-	 * Move data from hold buffer into user space.
-	 * We know the entire buffer is transferred since
-	 * we checked above that the read buffer is bpf_bufsize bytes.
-	 */
-	error = uiomove(d->bd_hbuf, d->bd_hlen, uio);
-
-	BPFD_LOCK(d);
-	d->bd_fbuf = d->bd_hbuf;
-	d->bd_hbuf = NULL;
-	d->bd_hlen = 0;
-	BPFD_UNLOCK(d);
-
-	return (error);
+	src_bytes = (u_char *)src;
+	bcopy(src_bytes, buf + offset, len);
 }
 
 /*
- * Copy data from an mbuf chain into a buffer.  This code is derived
- * from m_copydata in sys/uipc_mbuf.c.
+ * Scatter-gather data copy from an mbuf chain to the current kernel buffer.
  */
-static void
-bpf_mcopy(const void *src_arg, void *dst_arg, size_t len)
+void
+bpf_buffer_append_mbuf(struct bpf_d *d, caddr_t buf, u_int offset, void *src,
+    u_int len)
 {
 	const struct mbuf *m;
+	u_char *dst;
 	u_int count;
-	u_char *dst;
+
+	// printf("bpf_buffer_append_mbuf size %d\n", len);
 
-	m = src_arg;
-	dst = dst_arg;
+	m = (struct mbuf *)src;
+	dst = (u_char *)buf + offset;
 	while (len > 0) {
 		if (m == NULL)
 			panic("bpf_mcopy");
@@ -223,18 +151,76 @@
 }
 
 /*
- * Initialize all nonzero fields of a descriptor.
+ * Free BPF kernel buffers on device close.
+ */
+void
+bpf_buffer_free(struct bpf_d *d)
+{
+
+	// printf("bpf_buffer_free(sbuf: %p, hbuf: %p, fbuf: %p)\n",
+	//    d->bd_sbuf, d->bd_hbuf, d->bd_fbuf);
+
+	if (d->bd_sbuf != NULL)
+		free(d->bd_sbuf, M_BPF);

>>> TRUNCATED FOR MAIL (1000 lines) <<<


More information about the p4-projects mailing list