svn commit: r347526 - head/sys/net

Andrey V. Elsukov ae at FreeBSD.org
Mon May 13 13:45:30 UTC 2019


Author: ae
Date: Mon May 13 13:45:28 2019
New Revision: 347526
URL: https://svnweb.freebsd.org/changeset/base/347526

Log:
  Rework locking in BPF code to remove rwlock from fast path.
  
  At high packet rates, contention on the rwlock in the bpf_*tap*()
  functions can lead to packet drops. To avoid this, migrate this code
  to the epoch(9) KPI and ConcurrencyKit's lists (see the sketch after
  the change list below).
  
  * all lists changed to use CK_LIST;
  * reference counting added to bpf_if and bpf_d;
  * now bpf_if references ifnet and releases this reference on destroy;
  * each bpf_d descriptor references bpf_if when it is attached;
  * new struct bpf_program_buffer introduced to keep BPF filter programs;
  * bpf_program_buffer, bpf_d and bpf_if structures are freed by
    epoch_call();
  * bpf_freelist and ifnet_departure event are no longer needed, thus
    both are removed;
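  
  For reference, the read side of the new scheme reduces to the sketch
  below. This is an illustration of the idiom only, not code from this
  commit: the "obj" structure and "objlist" are hypothetical stand-ins
  for bpf_if and bif_dlist.
  
  	#include <sys/param.h>
  	#include <sys/ck.h>
  	#include <sys/epoch.h>
  
  	struct obj {
  		CK_LIST_ENTRY(obj)	link;
  		volatile u_int		refcnt;	/* managed with refcount(9) */
  		struct epoch_context	epoch_ctx;
  	};
  	/* Insertions/removals are serialized by an sx lock. */
  	static CK_LIST_HEAD(, obj) objlist = CK_LIST_HEAD_INITIALIZER();
  
  	static void
  	obj_scan(void)
  	{
  		struct epoch_tracker et;
  		struct obj *o;
  
  		/* Lock-free fast path: no per-packet rwlock acquisition. */
  		NET_EPOCH_ENTER(et);
  		CK_LIST_FOREACH(o, &objlist, link) {
  			/* o cannot be freed while we are in the section. */
  		}
  		NET_EPOCH_EXIT(et);
  	}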
  
  Reviewed by:	melifaro
  Sponsored by:	Yandex LLC
  Differential Revision:	https://reviews.freebsd.org/D20224

Modified:
  head/sys/net/bpf.c
  head/sys/net/bpf.h
  head/sys/net/bpfdesc.h

Modified: head/sys/net/bpf.c
==============================================================================
--- head/sys/net/bpf.c	Mon May 13 13:30:34 2019	(r347525)
+++ head/sys/net/bpf.c	Mon May 13 13:45:28 2019	(r347526)
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2019 Andrey V. Elsukov <ae at FreeBSD.org>
  *
  * This code is derived from the Stanford/CMU enet packet filter,
  * (net/enet.c) distributed as part of 4.3BSD, and code contributed
@@ -46,7 +47,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/lock.h>
-#include <sys/rwlock.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/fcntl.h>
@@ -99,7 +99,7 @@ __FBSDID("$FreeBSD$");
 MALLOC_DEFINE(M_BPF, "BPF", "BPF data");
 
 static struct bpf_if_ext dead_bpf_if = {
-	.bif_dlist = LIST_HEAD_INITIALIZER()
+	.bif_dlist = CK_LIST_HEAD_INITIALIZER()
 };
 
 struct bpf_if {
@@ -108,19 +108,22 @@ struct bpf_if {
 	struct bpf_if_ext bif_ext;	/* public members */
 	u_int		bif_dlt;	/* link layer type */
 	u_int		bif_hdrlen;	/* length of link header */
+	struct bpfd_list bif_wlist;	/* writer-only list */
 	struct ifnet	*bif_ifp;	/* corresponding interface */
-	struct rwlock	bif_lock;	/* interface lock */
-	LIST_HEAD(, bpf_d) bif_wlist;	/* writer-only list */
-	int		bif_flags;	/* Interface flags */
 	struct bpf_if	**bif_bpf;	/* Pointer to pointer to us */
+	volatile u_int	bif_refcnt;
+	struct epoch_context epoch_ctx;
 };
 
 CTASSERT(offsetof(struct bpf_if, bif_ext) == 0);
 
-#define BPFIF_RLOCK(bif)	rw_rlock(&(bif)->bif_lock)
-#define BPFIF_RUNLOCK(bif)	rw_runlock(&(bif)->bif_lock)
-#define BPFIF_WLOCK(bif)	rw_wlock(&(bif)->bif_lock)
-#define BPFIF_WUNLOCK(bif)	rw_wunlock(&(bif)->bif_lock)
+struct bpf_program_buffer {
+	struct epoch_context	epoch_ctx;
+#ifdef BPF_JITTER
+	bpf_jit_filter		*func;
+#endif
+	void			*buffer[0];
+};
 
 #if defined(DEV_BPF) || defined(NETGRAPH_BPF)
 
@@ -173,18 +176,24 @@ struct bpf_dltlist32 {
 #define BPF_LOCK_ASSERT()	sx_assert(&bpf_sx, SA_XLOCKED)
 /*
  * bpf_iflist is a list of BPF interface structures, each corresponding to a
- * specific DLT.  The same network interface might have several BPF interface
+ * specific DLT. The same network interface might have several BPF interface
  * structures registered by different layers in the stack (i.e., 802.11
  * frames, ethernet frames, etc).
  */
-static LIST_HEAD(, bpf_if)	bpf_iflist, bpf_freelist;
+CK_LIST_HEAD(bpf_iflist, bpf_if);
+static struct bpf_iflist bpf_iflist;
 static struct sx	bpf_sx;		/* bpf global lock */
 static int		bpf_bpfd_cnt;
 
+static void	bpfif_ref(struct bpf_if *);
+static void	bpfif_rele(struct bpf_if *);
+
+static void	bpfd_ref(struct bpf_d *);
+static void	bpfd_rele(struct bpf_d *);
 static void	bpf_attachd(struct bpf_d *, struct bpf_if *);
 static void	bpf_detachd(struct bpf_d *);
-static void	bpf_detachd_locked(struct bpf_d *);
-static void	bpf_freed(struct bpf_d *);
+static void	bpf_detachd_locked(struct bpf_d *, bool);
+static void	bpfd_free(epoch_context_t);
 static int	bpf_movein(struct uio *, int, struct ifnet *, struct mbuf **,
 		    struct sockaddr *, int *, struct bpf_d *);
 static int	bpf_setif(struct bpf_d *, struct ifreq *);
@@ -243,37 +252,106 @@ static struct filterops bpfread_filtops = {
 	.f_event = filt_bpfread,
 };
 
-eventhandler_tag	bpf_ifdetach_cookie = NULL;
-
 /*
- * LOCKING MODEL USED BY BPF:
+ * LOCKING MODEL USED BY BPF
+ *
  * Locks:
- * 1) global lock (BPF_LOCK). Mutex, used to protect interface addition/removal,
- * some global counters and every bpf_if reference.
- * 2) Interface lock. Rwlock, used to protect list of BPF descriptors and their filters.
- * 3) Descriptor lock. Mutex, used to protect BPF buffers and various structure fields
- *   used by bpf_mtap code.
+ * 1) global lock (BPF_LOCK). An sx lock; protects some global counters
+ * and all bpf_iflist changes, and serializes ioctl access to descriptors.
+ * 2) Descriptor lock. A mutex, used to protect BPF buffers and various
+ * structure fields used by the bpf_*tap* code.
  *
- * Lock order:
+ * Lock order: global lock, then descriptor lock.
  *
- * Global lock, interface lock, descriptor lock
+ * There are several possible consumers:
  *
- * We have to acquire interface lock before descriptor main lock due to BPF_MTAP[2]
- * working model. In many places (like bpf_detachd) we start with BPF descriptor
- * (and we need to at least rlock it to get reliable interface pointer). This
- * gives us potential LOR. As a result, we use global lock to protect from bpf_if
- * change in every such place.
+ * 1. The kernel registers an interface pointer with bpfattach().
+ * Each call allocates a new bpf_if structure, references the ifnet
+ * pointer and links the bpf_if into the bpf_iflist chain. This is
+ * protected by the global lock.
  *
- * Changing d->bd_bif is protected by 1) global lock, 2) interface lock and
- * 3) descriptor main wlock.
- * Reading bd_bif can be protected by any of these locks, typically global lock.
+ * 2. A userland application uses ioctl() calls on a bpf_d descriptor.
+ * All such calls are serialized with the global lock. BPF filters can
+ * be changed, but the pointer to the old filter is freed using
+ * epoch_call(). Thus it is safe for bpf_tap/bpf_mtap* code to access
+ * filter pointers, even if a change happens during bpf_tap execution.
+ * A bpf_d descriptor is likewise destroyed using epoch_call().
  *
- * Changing read/write BPF filter is protected by the same three locks,
- * the same applies for reading.
+ * 3. A userland application can write packets to a bpf_d descriptor.
+ * Here we must be sure that the ifnet won't disappear during bpfwrite().
  *
- * Sleeping in global lock is not allowed due to bpfdetach() using it.
+ * 4. The kernel invokes the bpf_tap/bpf_mtap* functions. Access to
+ * bif_dlist is protected by a net_epoch_preempt section, so it is
+ * safe to access bpf_d descriptors inside the section.
+ *
+ * 5. The kernel invokes bpfdetach() when an interface is destroyed.
+ * All lists are modified with the global lock held and the actual
+ * free() is done using epoch_call().
  */
 
+static void
+bpfif_free(epoch_context_t ctx)
+{
+	struct bpf_if *bp;
+
+	bp = __containerof(ctx, struct bpf_if, epoch_ctx);
+	if_rele(bp->bif_ifp);
+	free(bp, M_BPF);
+}
+
+static void
+bpfif_ref(struct bpf_if *bp)
+{
+
+	refcount_acquire(&bp->bif_refcnt);
+}
+
+static void
+bpfif_rele(struct bpf_if *bp)
+{
+
+	if (!refcount_release(&bp->bif_refcnt))
+		return;
+	epoch_call(net_epoch_preempt, &bp->epoch_ctx, bpfif_free);
+}
+
+static void
+bpfd_ref(struct bpf_d *d)
+{
+
+	refcount_acquire(&d->bd_refcnt);
+}
+
+static void
+bpfd_rele(struct bpf_d *d)
+{
+
+	if (!refcount_release(&d->bd_refcnt))
+		return;
+	epoch_call(net_epoch_preempt, &d->epoch_ctx, bpfd_free);
+}
+
+static struct bpf_program_buffer*
+bpf_program_buffer_alloc(size_t size, int flags)
+{
+
+	return (malloc(sizeof(struct bpf_program_buffer) + size,
+	    M_BPF, flags));
+}
+
+static void
+bpf_program_buffer_free(epoch_context_t ctx)
+{
+	struct bpf_program_buffer *ptr;
+
+	ptr = __containerof(ctx, struct bpf_program_buffer, epoch_ctx);
+#ifdef BPF_JITTER
+	if (ptr->func != NULL)
+		bpf_destroy_jit_filter(ptr->func);
+#endif
+	free(ptr, M_BPF);
+}
+
 /*
  * Wrapper functions for various buffering methods.  If the set of buffer
  * modes expands, we will probably want to introduce a switch data structure
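
The bpfif_rele()/bpfd_rele() helpers above pair reference counting with
deferred reclamation: the final release does not free the object
directly but schedules the free via epoch_call(), which runs the
callback only after every thread currently inside a net_epoch_preempt
section has left it. A writer-side removal therefore needs no reader
lock at all; roughly (a sketch, not verbatim from the commit):

	BPF_LOCK();
	CK_LIST_REMOVE(bp, bif_next);	/* in-flight readers may still see bp */
	BPF_UNLOCK();
	bpfif_rele(bp);		/* free is deferred until readers drain */
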
@@ -627,7 +705,8 @@ bad:
 }
 
 /*
- * Attach file to the bpf interface, i.e. make d listen on bp.
+ * Attach descriptor to the bpf interface, i.e. make d listen on bp,
+ * then reset its buffers and counters with reset_d().
  */
 static void
 bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
@@ -643,7 +722,7 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 	op_w = V_bpf_optimize_writers || d->bd_writer;
 
 	if (d->bd_bif != NULL)
-		bpf_detachd_locked(d);
+		bpf_detachd_locked(d, false);
 	/*
 	 * Point d at bp, and add d to the interface's list.
 	 * Since there are many applications using BPF for
@@ -652,26 +731,27 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
 	 * some filter is configured.
 	 */
 
-	BPFIF_WLOCK(bp);
 	BPFD_LOCK(d);
-
+	/*
+	 * Hold reference to bpf_if while descriptor uses this interface.
+	 */
+	bpfif_ref(bp);
 	d->bd_bif = bp;
-
 	if (op_w != 0) {
 		/* Add to writers-only list */
-		LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
+		CK_LIST_INSERT_HEAD(&bp->bif_wlist, d, bd_next);
 		/*
 		 * We decrement bd_writer on every filter set operation.
 		 * First BIOCSETF is done by pcap_open_live() to set up
-		 * snap length. After that appliation usually sets its own filter
+		 * snap length. After that the application usually sets
+		 * its own filter.
 		 */
 		d->bd_writer = 2;
 	} else
-		LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
+		CK_LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
 
+	reset_d(d);
 	BPFD_UNLOCK(d);
-	BPFIF_WUNLOCK(bp);
-
 	bpf_bpfd_cnt++;
 
 	CTR3(KTR_NET, "%s: bpf_attach called by pid %d, adding to %s list",
@@ -685,7 +765,8 @@ bpf_attachd(struct bpf_d *d, struct bpf_if *bp)
  * Check if we need to upgrade our descriptor @d from write-only mode.
  */
 static int
-bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode, int flen)
+bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct bpf_insn *fcode,
+    int flen)
 {
 	int is_snap, need_upgrade;
 
@@ -705,7 +786,8 @@ bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct 
 	 * we'd prefer to treat k=0 (deny ALL) case the same way: e.g.
 	 * do not consider upgrading immediately
 	 */
-	if (cmd == BIOCSETF && flen == 1 && fcode[0].code == (BPF_RET | BPF_K))
+	if (cmd == BIOCSETF && flen == 1 &&
+	    fcode[0].code == (BPF_RET | BPF_K))
 		is_snap = 1;
 	else
 		is_snap = 0;
@@ -743,88 +825,45 @@ bpf_check_upgrade(u_long cmd, struct bpf_d *d, struct 
 }
 
 /*
- * Add d to the list of active bp filters.
- * Requires bpf_attachd() to be called before.
- */
-static void
-bpf_upgraded(struct bpf_d *d)
-{
-	struct bpf_if *bp;
-
-	BPF_LOCK_ASSERT();
-
-	bp = d->bd_bif;
-
-	/*
-	 * Filter can be set several times without specifying interface.
-	 * Mark d as reader and exit.
-	 */
-	if (bp == NULL) {
-		BPFD_LOCK(d);
-		d->bd_writer = 0;
-		BPFD_UNLOCK(d);
-		return;
-	}
-
-	BPFIF_WLOCK(bp);
-	BPFD_LOCK(d);
-
-	/* Remove from writers-only list */
-	LIST_REMOVE(d, bd_next);
-	LIST_INSERT_HEAD(&bp->bif_dlist, d, bd_next);
-	/* Mark d as reader */
-	d->bd_writer = 0;
-
-	BPFD_UNLOCK(d);
-	BPFIF_WUNLOCK(bp);
-
-	CTR2(KTR_NET, "%s: upgrade required by pid %d", __func__, d->bd_pid);
-
-	EVENTHANDLER_INVOKE(bpf_track, bp->bif_ifp, bp->bif_dlt, 1);
-}
-
-/*
  * Detach a file from its interface.
  */
 static void
 bpf_detachd(struct bpf_d *d)
 {
 	BPF_LOCK();
-	bpf_detachd_locked(d);
+	bpf_detachd_locked(d, false);
 	BPF_UNLOCK();
 }
 
 static void
-bpf_detachd_locked(struct bpf_d *d)
+bpf_detachd_locked(struct bpf_d *d, bool detached_ifp)
 {
-	int error;
 	struct bpf_if *bp;
 	struct ifnet *ifp;
+	int error;
 
+	BPF_LOCK_ASSERT();
 	CTR2(KTR_NET, "%s: detach required by pid %d", __func__, d->bd_pid);
 
-	BPF_LOCK_ASSERT();
-
 	/* Check if descriptor is attached */
 	if ((bp = d->bd_bif) == NULL)
 		return;
 
-	BPFIF_WLOCK(bp);
 	BPFD_LOCK(d);
-
+	/* Remove d from the interface's descriptor list. */
+	CK_LIST_REMOVE(d, bd_next);
 	/* Save bd_writer value */
 	error = d->bd_writer;
-
-	/*
-	 * Remove d from the interface's descriptor list.
-	 */
-	LIST_REMOVE(d, bd_next);
-
 	ifp = bp->bif_ifp;
 	d->bd_bif = NULL;
+	if (detached_ifp) {
+		/*
+		 * Notify the descriptor that it has been detached, so
+		 * that any sleepers wake up and get ENXIO.
+		 */
+		bpf_wakeup(d);
+	}
 	BPFD_UNLOCK(d);
-	BPFIF_WUNLOCK(bp);
-
 	bpf_bpfd_cnt--;
 
 	/* Call event handler iff d is attached */
@@ -833,9 +872,9 @@ bpf_detachd_locked(struct bpf_d *d)
 
 	/*
 	 * Check if this descriptor had requested promiscuous mode.
-	 * If so, turn it off.
+	 * If so and ifnet is not detached, turn it off.
 	 */
-	if (d->bd_promisc) {
+	if (d->bd_promisc && !detached_ifp) {
 		d->bd_promisc = 0;
 		CURVNET_SET(ifp->if_vnet);
 		error = ifpromisc(ifp, 0);
@@ -851,6 +890,7 @@ bpf_detachd_locked(struct bpf_d *d)
 				"bpf_detach: ifpromisc failed (%d)\n", error);
 		}
 	}
+	bpfif_rele(bp);
 }
 
 /*
@@ -875,8 +915,7 @@ bpf_dtor(void *data)
 	seldrain(&d->bd_sel);
 	knlist_destroy(&d->bd_sel.si_note);
 	callout_drain(&d->bd_callout);
-	bpf_freed(d);
-	free(d, M_BPF);
+	bpfd_rele(d);
 }
 
 /*
@@ -918,6 +957,7 @@ bpfopen(struct cdev *dev, int flags, int fmt, struct t
 	d->bd_bufmode = BPF_BUFMODE_BUFFER;
 	d->bd_sig = SIGIO;
 	d->bd_direction = BPF_D_INOUT;
+	d->bd_refcnt = 1;
 	BPF_PID_REFRESH(d, td);
 #ifdef MAC
 	mac_bpfdesc_init(d);
@@ -1093,7 +1133,8 @@ bpf_timed_out(void *arg)
 
 	BPFD_LOCK_ASSERT(d);
 
-	if (callout_pending(&d->bd_callout) || !callout_active(&d->bd_callout))
+	if (callout_pending(&d->bd_callout) ||
+	    !callout_active(&d->bd_callout))
 		return;
 	if (d->bd_state == BPF_WAITING) {
 		d->bd_state = BPF_TIMED_OUT;
@@ -1119,47 +1160,71 @@ bpf_ready(struct bpf_d *d)
 static int
 bpfwrite(struct cdev *dev, struct uio *uio, int ioflag)
 {
+	struct route ro;
+	struct sockaddr dst;
+	struct epoch_tracker et;
+	struct bpf_if *bp;
 	struct bpf_d *d;
 	struct ifnet *ifp;
 	struct mbuf *m, *mc;
-	struct sockaddr dst;
-	struct route ro;
 	int error, hlen;
 
 	error = devfs_get_cdevpriv((void **)&d);
 	if (error != 0)
 		return (error);
 
+	NET_EPOCH_ENTER(et);
+	BPFD_LOCK(d);
 	BPF_PID_REFRESH_CUR(d);
 	counter_u64_add(d->bd_wcount, 1);
-	/* XXX: locking required */
-	if (d->bd_bif == NULL) {
-		counter_u64_add(d->bd_wdcount, 1);
-		return (ENXIO);
+	if ((bp = d->bd_bif) == NULL) {
+		error = ENXIO;
+		goto out_locked;
 	}
 
-	ifp = d->bd_bif->bif_ifp;
-
+	ifp = bp->bif_ifp;
 	if ((ifp->if_flags & IFF_UP) == 0) {
-		counter_u64_add(d->bd_wdcount, 1);
-		return (ENETDOWN);
+		error = ENETDOWN;
+		goto out_locked;
 	}
 
-	if (uio->uio_resid == 0) {
-		counter_u64_add(d->bd_wdcount, 1);
-		return (0);
-	}
+	if (uio->uio_resid == 0)
+		goto out_locked;
 
 	bzero(&dst, sizeof(dst));
 	m = NULL;
 	hlen = 0;
-	/* XXX: bpf_movein() can sleep */
-	error = bpf_movein(uio, (int)d->bd_bif->bif_dlt, ifp,
+
+	/*
+	 * Take an extra reference, exit the epoch section and unlock
+	 * d, since bpf_movein() can sleep.
+	 */
+	bpfd_ref(d);
+	NET_EPOCH_EXIT(et);
+	BPFD_UNLOCK(d);
+
+	error = bpf_movein(uio, (int)bp->bif_dlt, ifp,
 	    &m, &dst, &hlen, d);
-	if (error) {
+
+	if (error != 0) {
 		counter_u64_add(d->bd_wdcount, 1);
+		bpfd_rele(d);
 		return (error);
 	}
+
+	BPFD_LOCK(d);
+	/*
+	 * Check that descriptor is still attached to the interface.
+	 * It can be detached by bpfdetach(). To avoid access to the
+	 * detached ifnet, free the mbuf and return ENXIO.
+	 */
+	if (d->bd_bif == NULL) {
+		counter_u64_add(d->bd_wdcount, 1);
+		BPFD_UNLOCK(d);
+		bpfd_rele(d);
+		m_freem(m);
+		return (ENXIO);
+	}
 	counter_u64_add(d->bd_wfcount, 1);
 	if (d->bd_hdrcmplt)
 		dst.sa_family = pseudo_AF_HDRCMPLT;
@@ -1180,11 +1245,9 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag
 
 	CURVNET_SET(ifp->if_vnet);
 #ifdef MAC
-	BPFD_LOCK(d);
 	mac_bpfdesc_create_mbuf(d, m);
 	if (mc != NULL)
 		mac_bpfdesc_create_mbuf(d, mc);
-	BPFD_UNLOCK(d);
 #endif
 
 	bzero(&ro, sizeof(ro));
@@ -1205,7 +1268,14 @@ bpfwrite(struct cdev *dev, struct uio *uio, int ioflag
 			m_freem(mc);
 	}
 	CURVNET_RESTORE();
+	BPFD_UNLOCK(d);
+	bpfd_rele(d);
+	return (error);
 
+out_locked:
+	counter_u64_add(d->bd_wdcount, 1);
+	NET_EPOCH_EXIT(et);
+	BPFD_UNLOCK(d);
 	return (error);
 }
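
Distilled, the reworked bpfwrite() follows a "sleep outside the epoch
section" idiom: bpf_movein() may sleep (it copies user memory), and
sleeping is not allowed under NET_EPOCH_ENTER(), so the descriptor is
pinned with an extra reference instead and its state is re-validated
after the sleep. The skeleton, with counters and most error handling
omitted and declarations as in bpfwrite() (a sketch, not verbatim from
the commit):

	NET_EPOCH_ENTER(et);
	BPFD_LOCK(d);
	/* ... look up bp and ifp, check IFF_UP ... */
	bpfd_ref(d);			/* keep d alive across the sleep */
	NET_EPOCH_EXIT(et);
	BPFD_UNLOCK(d);

	error = bpf_movein(uio, (int)bp->bif_dlt, ifp, &m, &dst, &hlen, d);

	BPFD_LOCK(d);
	if (d->bd_bif == NULL) {	/* lost a race with bpfdetach() */
		BPFD_UNLOCK(d);
		bpfd_rele(d);
		m_freem(m);
		return (ENXIO);
	}
	/* ... transmit m ... */
	BPFD_UNLOCK(d);
	bpfd_rele(d);
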
 
@@ -1830,16 +1900,11 @@ bpfioctl(struct cdev *dev, u_long cmd, caddr_t addr, i
 }
 
 /*
- * Set d's packet filter program to fp.  If this file already has a filter,
- * free it and replace it.  Returns EINVAL for bogus requests.
+ * Set d's packet filter program to fp. If this file already has a filter,
+ * free it and replace it. Returns EINVAL for bogus requests.
  *
- * Note we need global lock here to serialize bpf_setf() and bpf_setif() calls
- * since reading d->bd_bif can't be protected by d or interface lock due to
- * lock order.
- *
- * Additionally, we have to acquire interface write lock due to bpf_mtap() uses
- * interface read lock to read all filers.
- *
+ * Note we use global lock here to serialize bpf_setf() and bpf_setif()
+ * calls.
  */
 static int
 bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_long cmd)
@@ -1848,13 +1913,14 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_lo
 	struct bpf_program fp_swab;
 	struct bpf_program32 *fp32;
 #endif
-	struct bpf_insn *fcode, *old;
+	struct bpf_program_buffer *fcode;
+	struct bpf_insn *filter;
 #ifdef BPF_JITTER
-	bpf_jit_filter *jfunc, *ofunc;
+	bpf_jit_filter *jfunc;
 #endif
 	size_t size;
 	u_int flen;
-	int need_upgrade;
+	bool track_event;
 
 #ifdef COMPAT_FREEBSD32
 	switch (cmd) {
@@ -1863,7 +1929,8 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_lo
 	case BIOCSETFNR32:
 		fp32 = (struct bpf_program32 *)fp;
 		fp_swab.bf_len = fp32->bf_len;
-		fp_swab.bf_insns = (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
+		fp_swab.bf_insns =
+		    (struct bpf_insn *)(uintptr_t)fp32->bf_insns;
 		fp = &fp_swab;
 		switch (cmd) {
 		case BIOCSETF32:
@@ -1877,12 +1944,10 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_lo
 	}
 #endif
 
-	fcode = NULL;
+	filter = NULL;
 #ifdef BPF_JITTER
-	jfunc = ofunc = NULL;
+	jfunc = NULL;
 #endif
-	need_upgrade = 0;
-
 	/*
 	 * Check new filter validness before acquiring any locks.
 	 * Allocate memory for new filter, if needed.
@@ -1892,10 +1957,11 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_lo
 		return (EINVAL);
 	size = flen * sizeof(*fp->bf_insns);
 	if (size > 0) {
-		/* We're setting up new filter.  Copy and check actual data. */
-		fcode = malloc(size, M_BPF, M_WAITOK);
-		if (copyin(fp->bf_insns, fcode, size) != 0 ||
-		    !bpf_validate(fcode, flen)) {
+		/* We're setting up new filter. Copy and check actual data. */
+		fcode = bpf_program_buffer_alloc(size, M_WAITOK);
+		filter = (struct bpf_insn *)fcode->buffer;
+		if (copyin(fp->bf_insns, filter, size) != 0 ||
+		    !bpf_validate(filter, flen)) {
 			free(fcode, M_BPF);
 			return (EINVAL);
 		}
@@ -1905,50 +1971,73 @@ bpf_setf(struct bpf_d *d, struct bpf_program *fp, u_lo
 			 * Filter is copied inside fcode and is
 			 * perfectly valid.
 			 */
-			jfunc = bpf_jitter(fcode, flen);
+			jfunc = bpf_jitter(filter, flen);
 		}
 #endif
 	}
 
-	BPF_LOCK();
+	track_event = false;
+	fcode = NULL;
 
-	/*
-	 * Set up new filter.
-	 * Protect filter change by interface lock.
-	 * Additionally, we are protected by global lock here.
-	 */
-	if (d->bd_bif != NULL)
-		BPFIF_WLOCK(d->bd_bif);
+	BPF_LOCK();
 	BPFD_LOCK(d);
+	/* Set up new filter. */
 	if (cmd == BIOCSETWF) {
-		old = d->bd_wfilter;
-		d->bd_wfilter = fcode;
+		if (d->bd_wfilter != NULL) {
+			fcode = __containerof((void *)d->bd_wfilter,
+			    struct bpf_program_buffer, buffer);
+#ifdef BPF_JITTER
+			fcode->func = NULL;
+#endif
+		}
+		d->bd_wfilter = filter;
 	} else {
-		old = d->bd_rfilter;
-		d->bd_rfilter = fcode;
+		if (d->bd_rfilter != NULL) {
+			fcode = __containerof((void *)d->bd_rfilter,
+			    struct bpf_program_buffer, buffer);
 #ifdef BPF_JITTER
-		ofunc = d->bd_bfilter;
+			fcode->func = d->bd_bfilter;
+#endif
+		}
+		d->bd_rfilter = filter;
+#ifdef BPF_JITTER
 		d->bd_bfilter = jfunc;
 #endif
 		if (cmd == BIOCSETF)
 			reset_d(d);
 
-		need_upgrade = bpf_check_upgrade(cmd, d, fcode, flen);
+		if (bpf_check_upgrade(cmd, d, filter, flen) != 0) {
+			/*
+			 * Filter can be set several times without
+			 * specifying interface. In this case just mark d
+			 * as reader.
+			 */
+			d->bd_writer = 0;
+			if (d->bd_bif != NULL) {
+				/*
+				 * Remove descriptor from writers-only list
+				 * and add it to active readers list.
+				 */
+				CK_LIST_REMOVE(d, bd_next);
+				CK_LIST_INSERT_HEAD(&d->bd_bif->bif_dlist,
+				    d, bd_next);
+				CTR2(KTR_NET,
+				    "%s: upgrade required by pid %d",
+				    __func__, d->bd_pid);
+				track_event = true;
+			}
+		}
 	}
 	BPFD_UNLOCK(d);
-	if (d->bd_bif != NULL)
-		BPFIF_WUNLOCK(d->bd_bif);
-	if (old != NULL)
-		free(old, M_BPF);
-#ifdef BPF_JITTER
-	if (ofunc != NULL)
-		bpf_destroy_jit_filter(ofunc);
-#endif
 
-	/* Move d to active readers list. */
-	if (need_upgrade != 0)
-		bpf_upgraded(d);
+	if (fcode != NULL)
+		epoch_call(net_epoch_preempt, &fcode->epoch_ctx,
+		    bpf_program_buffer_free);
 
+	if (track_event)
+		EVENTHANDLER_INVOKE(bpf_track,
+		    d->bd_bif->bif_ifp, d->bd_bif->bif_dlt, 1);
+
 	BPF_UNLOCK();
 	return (0);
 }
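
The filter bookkeeping above depends on the layout of struct
bpf_program_buffer: the instructions live in the flexible "buffer"
member immediately after the header, so a bare filter pointer such as
d->bd_rfilter can always be mapped back to its header with
__containerof(), and one epoch_call() frees the header, the program
and any JIT state together. Roughly, using the variables from
bpf_setf() (a sketch, not verbatim from the commit):

	size = flen * sizeof(struct bpf_insn);
	fcode = bpf_program_buffer_alloc(size, M_WAITOK);
	filter = (struct bpf_insn *)fcode->buffer; /* storage follows header */
	/* ... copyin/validate, then install filter as d->bd_rfilter ... */

	/* Later, recover the header from the installed pointer: */
	fcode = __containerof((void *)d->bd_rfilter,
	    struct bpf_program_buffer, buffer);
	epoch_call(net_epoch_preempt, &fcode->epoch_ctx,
	    bpf_program_buffer_free);
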
@@ -1971,15 +2060,6 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr)
 		return (ENXIO);
 
 	bp = theywant->if_bpf;
-
-	/* Check if interface is not being detached from BPF */
-	BPFIF_RLOCK(bp);
-	if (bp->bif_flags & BPFIF_FLAG_DYING) {
-		BPFIF_RUNLOCK(bp);
-		return (ENXIO);
-	}
-	BPFIF_RUNLOCK(bp);
-
 	/*
 	 * At this point, we expect the buffer is already allocated.  If not,
 	 * return an error.
@@ -1996,9 +2076,11 @@ bpf_setif(struct bpf_d *d, struct ifreq *ifr)
 	}
 	if (bp != d->bd_bif)
 		bpf_attachd(d, bp);
-	BPFD_LOCK(d);
-	reset_d(d);
-	BPFD_UNLOCK(d);
+	else {
+		BPFD_LOCK(d);
+		reset_d(d);
+		BPFD_UNLOCK(d);
+	}
 	return (0);
 }
 
@@ -2150,6 +2232,7 @@ bpf_gettime(struct bintime *bt, int tstype, struct mbu
 void
 bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 {
+	struct epoch_tracker et;
 	struct bintime bt;
 	struct bpf_d *d;
 #ifdef BPF_JITTER
@@ -2159,24 +2242,14 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 	int gottime;
 
 	gottime = BPF_TSTAMP_NONE;
-
-	BPFIF_RLOCK(bp);
-
-	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
-		/*
-		 * We are not using any locks for d here because:
-		 * 1) any filter change is protected by interface
-		 * write lock
-		 * 2) destroying/detaching d is protected by interface
-		 * write lock, too
-		 */
-
+	NET_EPOCH_ENTER(et);
+	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		counter_u64_add(d->bd_rcount, 1);
 		/*
-		 * NB: We dont call BPF_CHECK_DIRECTION() here since there is no
-		 * way for the caller to indiciate to us whether this packet
-		 * is inbound or outbound.  In the bpf_mtap() routines, we use
-		 * the interface pointers on the mbuf to figure it out.
+		 * NB: We don't call BPF_CHECK_DIRECTION() here since there
+		 * is no way for the caller to indicate to us whether this
+		 * packet is inbound or outbound. In the bpf_mtap() routines,
+		 * we use the interface pointers on the mbuf to figure it out.
 		 */
 #ifdef BPF_JITTER
 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
@@ -2190,10 +2263,10 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 			 * Filter matches. Let's acquire the write lock.
 			 */
 			BPFD_LOCK(d);
-
 			counter_u64_add(d->bd_fcount, 1);
 			if (gottime < bpf_ts_quality(d->bd_tstamp))
-				gottime = bpf_gettime(&bt, d->bd_tstamp, NULL);
+				gottime = bpf_gettime(&bt, d->bd_tstamp,
+				    NULL);
 #ifdef MAC
 			if (mac_bpfdesc_check_receive(d, bp->bif_ifp) == 0)
 #endif
@@ -2202,7 +2275,7 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 			BPFD_UNLOCK(d);
 		}
 	}
-	BPFIF_RUNLOCK(bp);
+	NET_EPOCH_EXIT(et);
 }
 
 #define	BPF_CHECK_DIRECTION(d, r, i)				\
@@ -2216,6 +2289,7 @@ bpf_tap(struct bpf_if *bp, u_char *pkt, u_int pktlen)
 void
 bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 {
+	struct epoch_tracker et;
 	struct bintime bt;
 	struct bpf_d *d;
 #ifdef BPF_JITTER
@@ -2233,9 +2307,8 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 	pktlen = m_length(m, NULL);
 	gottime = BPF_TSTAMP_NONE;
 
-	BPFIF_RLOCK(bp);
-
-	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
+	NET_EPOCH_ENTER(et);
+	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
 			continue;
 		counter_u64_add(d->bd_rcount, 1);
@@ -2243,7 +2316,8 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 		bf = bpf_jitter_enable != 0 ? d->bd_bfilter : NULL;
 		/* XXX We cannot handle multiple mbufs. */
 		if (bf != NULL && m->m_next == NULL)
-			slen = (*(bf->func))(mtod(m, u_char *), pktlen, pktlen);
+			slen = (*(bf->func))(mtod(m, u_char *), pktlen,
+			    pktlen);
 		else
 #endif
 		slen = bpf_filter(d->bd_rfilter, (u_char *)m, pktlen, 0);
@@ -2261,7 +2335,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 			BPFD_UNLOCK(d);
 		}
 	}
-	BPFIF_RUNLOCK(bp);
+	NET_EPOCH_EXIT(et);
 }
 
 /*
@@ -2271,6 +2345,7 @@ bpf_mtap(struct bpf_if *bp, struct mbuf *m)
 void
 bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, struct mbuf *m)
 {
+	struct epoch_tracker et;
 	struct bintime bt;
 	struct mbuf mb;
 	struct bpf_d *d;
@@ -2296,9 +2371,8 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, s
 
 	gottime = BPF_TSTAMP_NONE;
 
-	BPFIF_RLOCK(bp);
-
-	LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
+	NET_EPOCH_ENTER(et);
+	CK_LIST_FOREACH(d, &bp->bif_dlist, bd_next) {
 		if (BPF_CHECK_DIRECTION(d, m->m_pkthdr.rcvif, bp->bif_ifp))
 			continue;
 		counter_u64_add(d->bd_rcount, 1);
@@ -2317,11 +2391,10 @@ bpf_mtap2(struct bpf_if *bp, void *data, u_int dlen, s
 			BPFD_UNLOCK(d);
 		}
 	}
-	BPFIF_RUNLOCK(bp);
+	NET_EPOCH_EXIT(et);
 }
 
 #undef	BPF_CHECK_DIRECTION
-
 #undef	BPF_TSTAMP_NONE
 #undef	BPF_TSTAMP_FAST
 #undef	BPF_TSTAMP_NORMAL
@@ -2540,26 +2613,30 @@ copy:
  * Called on close.
  */
 static void
-bpf_freed(struct bpf_d *d)
+bpfd_free(epoch_context_t ctx)
 {
+	struct bpf_d *d;
+	struct bpf_program_buffer *p;
 
 	/*
 	 * We don't need to lock out interrupts since this descriptor has
 	 * been detached from its interface and it hasn't yet been marked
 	 * free.
 	 */
+	d = __containerof(ctx, struct bpf_d, epoch_ctx);
 	bpf_free(d);
 	if (d->bd_rfilter != NULL) {
-		free((caddr_t)d->bd_rfilter, M_BPF);
-#ifdef BPF_JITTER
-		if (d->bd_bfilter != NULL)
-			bpf_destroy_jit_filter(d->bd_bfilter);
-#endif
+		p = __containerof((void *)d->bd_rfilter,
+		    struct bpf_program_buffer, buffer);
+		bpf_program_buffer_free(&p->epoch_ctx);
 	}
-	if (d->bd_wfilter != NULL)
-		free((caddr_t)d->bd_wfilter, M_BPF);
-	mtx_destroy(&d->bd_lock);
+	if (d->bd_wfilter != NULL) {
+		p = __containerof((void *)d->bd_wfilter,
+		    struct bpf_program_buffer, buffer);
+		bpf_program_buffer_free(&p->epoch_ctx);
+	}
 
+	mtx_destroy(&d->bd_lock);
 	counter_u64_free(d->bd_rcount);
 	counter_u64_free(d->bd_dcount);
 	counter_u64_free(d->bd_fcount);
@@ -2567,7 +2644,7 @@ bpf_freed(struct bpf_d *d)
 	counter_u64_free(d->bd_wfcount);
 	counter_u64_free(d->bd_wdcount);
 	counter_u64_free(d->bd_zcopy);
-
+	free(d, M_BPF);
 }
 
 /*
@@ -2588,25 +2665,31 @@ bpfattach(struct ifnet *ifp, u_int dlt, u_int hdrlen)
 * headers are not yet supported).
  */
 void
-bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen, struct bpf_if **driverp)
+bpfattach2(struct ifnet *ifp, u_int dlt, u_int hdrlen,
+    struct bpf_if **driverp)
 {
 	struct bpf_if *bp;
 
-	KASSERT(*driverp == NULL, ("bpfattach2: driverp already initialized"));
+	KASSERT(*driverp == NULL,
+	    ("bpfattach2: driverp already initialized"));
 
 	bp = malloc(sizeof(*bp), M_BPF, M_WAITOK | M_ZERO);
 
-	rw_init(&bp->bif_lock, "bpf interface lock");
-	LIST_INIT(&bp->bif_dlist);
-	LIST_INIT(&bp->bif_wlist);
+	CK_LIST_INIT(&bp->bif_dlist);
+	CK_LIST_INIT(&bp->bif_wlist);
 	bp->bif_ifp = ifp;
 	bp->bif_dlt = dlt;
 	bp->bif_hdrlen = hdrlen;
 	bp->bif_bpf = driverp;
+	bp->bif_refcnt = 1;
 	*driverp = bp;
-
+	/*
+	 * Reference the ifnet pointer, so it won't be freed until
+	 * we release it.
+	 */
+	if_ref(ifp);
 	BPF_LOCK();
-	LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
+	CK_LIST_INSERT_HEAD(&bpf_iflist, bp, bif_next);
 	BPF_UNLOCK();
 
 	if (bootverbose && IS_DEFAULT_VNET(curvnet))
@@ -2647,103 +2730,37 @@ bpf_get_bp_params(struct bpf_if *bp, u_int *bif_dlt, u
 void
 bpfdetach(struct ifnet *ifp)
 {
-	struct bpf_if	*bp, *bp_temp;
-	struct bpf_d	*d;
-	int ndetached;
+	struct bpf_if *bp, *bp_temp;
+	struct bpf_d *d;
 
-	ndetached = 0;
-
 	BPF_LOCK();

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

