git: 787845c0e8e8 - main - Revert "ip_mroute: refactor bw_meter API"

Wojciech Macek wma at FreeBSD.org
Thu May 20 10:15:11 UTC 2021


The branch main has been updated by wma:

URL: https://cgit.FreeBSD.org/src/commit/?id=787845c0e8e831bf5b2d000950241cb23c16ca45

commit 787845c0e8e831bf5b2d000950241cb23c16ca45
Author:     Wojciech Macek <wma at FreeBSD.org>
AuthorDate: 2021-05-20 10:14:58 +0000
Commit:     Wojciech Macek <wma at FreeBSD.org>
CommitDate: 2021-05-20 10:14:58 +0000

    Revert "ip_mroute: refactor bw_meter API"
    
    This reverts commit d1cd99b147411b331a9bff659533780ef297ef58.
---
 sys/netinet/ip_mroute.c | 610 +++++++++++++++++++++++++++++++-----------------
 sys/netinet/ip_mroute.h |  10 +-
 2 files changed, 400 insertions(+), 220 deletions(-)

diff --git a/sys/netinet/ip_mroute.c b/sys/netinet/ip_mroute.c
index 0cb980b7247e..b8e677ba9af1 100644
--- a/sys/netinet/ip_mroute.c
+++ b/sys/netinet/ip_mroute.c
@@ -49,7 +49,6 @@
  * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
  * Modified by Hitoshi Asaeda, WIDE, August 2000
  * Modified by Pavlin Radoslavov, ICSI, October 2002
- * Modified by Wojciech Macek, Semihalf, May 2021
  *
  * MROUTING Revision: 3.5
  * and PIM-SMv2 and PIM-DM support, advanced API support,
@@ -203,6 +202,16 @@ VNET_DEFINE_STATIC(struct callout, expire_upcalls_ch);
  * Bandwidth meter variables and constants
  */
 static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");
+/*
+ * Pending timeouts are stored in a hash table, the key being the
+ * expiration time. Periodically, the entries are analysed and processed.
+ */
+#define	BW_METER_BUCKETS	1024
+VNET_DEFINE_STATIC(struct bw_meter **, bw_meter_timers);
+#define	V_bw_meter_timers	VNET(bw_meter_timers)
+VNET_DEFINE_STATIC(struct callout, bw_meter_ch);
+#define	V_bw_meter_ch		VNET(bw_meter_ch)
+#define	BW_METER_PERIOD (hz)		/* periodical handling of bw meters */
 
 /*
  * Pending upcalls are stored in a vector which is flushed when
@@ -311,13 +320,14 @@ static int	add_mfc(struct mfcctl2 *);
 static int	add_vif(struct vifctl *);
 static void	bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
 static void	bw_meter_process(void);
-static void	bw_meter_geq_receive_packet(struct bw_meter *, int,
+static void	bw_meter_receive_packet(struct bw_meter *, int,
 		    struct timeval *);
 static void	bw_upcalls_send(void);
 static int	del_bw_upcall(struct bw_upcall *);
 static int	del_mfc(struct mfcctl2 *);
 static int	del_vif(vifi_t);
 static int	del_vif_locked(vifi_t);
+static void	expire_bw_meter_process(void *);
 static void	expire_bw_upcalls_send(void *);
 static void	expire_mfc(struct mfc *);
 static void	expire_upcalls(void *);
@@ -675,6 +685,8 @@ ip_mrouter_init(struct socket *so, int version)
 	curvnet);
     callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
 	curvnet);
+    callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
+	curvnet);
 
     V_ip_mrouter = so;
     ip_mrouter_cnt++;
@@ -733,6 +745,7 @@ X_ip_mrouter_done(void)
 
     callout_stop(&V_expire_upcalls_ch);
     callout_stop(&V_bw_upcalls_ch);
+    callout_stop(&V_bw_meter_ch);
 
     MFC_LOCK();
 
@@ -753,6 +766,7 @@ X_ip_mrouter_done(void)
     bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize);
 
     V_bw_upcalls_n = 0;
+    bzero(V_bw_meter_timers, BW_METER_BUCKETS * sizeof(*V_bw_meter_timers));
 
     MFC_UNLOCK();
 
@@ -1022,8 +1036,7 @@ expire_mfc(struct mfc *rt)
 
 	MFC_LOCK_ASSERT();
 
-	free_bw_list(rt->mfc_bw_meter_leq);
-	free_bw_list(rt->mfc_bw_meter_geq);
+	free_bw_list(rt->mfc_bw_meter);
 
 	TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) {
 		m_freem(rte->m);
@@ -1126,8 +1139,7 @@ add_mfc(struct mfcctl2 *mfccp)
 	    rt->mfc_nstall = 0;
 
 	    rt->mfc_expire     = 0;
-	    rt->mfc_bw_meter_leq = NULL;
-	    rt->mfc_bw_meter_geq = NULL;
+	    rt->mfc_bw_meter = NULL;
 
 	    /* insert new entry at head of hash chain */
 	    LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
@@ -1167,10 +1179,8 @@ del_mfc(struct mfcctl2 *mfccp)
     /*
      * free the bw_meter entries
      */
-    free_bw_list(rt->mfc_bw_meter_leq);
-    rt->mfc_bw_meter_leq = NULL;
-    free_bw_list(rt->mfc_bw_meter_geq);
-    rt->mfc_bw_meter_geq = NULL;
+    free_bw_list(rt->mfc_bw_meter);
+    rt->mfc_bw_meter = NULL;
 
     LIST_REMOVE(rt, mfc_hash);
     free(rt, M_MRTABLE);
@@ -1383,8 +1393,7 @@ fail:
 
 	    /* clear the RP address */
 	    rt->mfc_rp.s_addr = INADDR_ANY;
-	    rt->mfc_bw_meter_leq = NULL;
-	    rt->mfc_bw_meter_geq = NULL;
+	    rt->mfc_bw_meter = NULL;
 
 	    /* initialize pkt counters per src-grp */
 	    rt->mfc_pkt_cnt = 0;
@@ -1450,6 +1459,16 @@ expire_upcalls(void *arg)
 		if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
 			continue;
 
+		/*
+		 * free the bw_meter entries
+		 */
+		while (rt->mfc_bw_meter != NULL) {
+		    struct bw_meter *x = rt->mfc_bw_meter;
+
+		    rt->mfc_bw_meter = x->bm_mfc_next;
+		    free(x, M_BWMETER);
+		}
+
 		MRTSTAT_INC(mrts_cache_cleanups);
 		CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__,
 		    (u_long)ntohl(rt->mfc_origin.s_addr),
@@ -1583,22 +1602,14 @@ ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
     /*
      * Perform upcall-related bw measuring.
      */
-    if ((rt->mfc_bw_meter_geq != NULL) || (rt->mfc_bw_meter_leq != NULL)) {
+    if (rt->mfc_bw_meter != NULL) {
 	struct bw_meter *x;
 	struct timeval now;
 
 	microtime(&now);
 	MFC_LOCK_ASSERT();
-	/* Process meters for Greater-or-EQual case */
-	for (x = rt->mfc_bw_meter_geq; x != NULL; x = x->bm_mfc_next)
-		bw_meter_geq_receive_packet(x, plen, &now);
-
-	/* Process meters for Lower-or-EQual case */
-	for (x = rt->mfc_bw_meter_leq; x != NULL; x = x->bm_mfc_next) {
-		/* Record that a packet is received */
-		x->bm_measured.b_packets++;
-		x->bm_measured.b_bytes += plen;
-	}
+	for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next)
+	    bw_meter_receive_packet(x, plen, &now);
     }
 
     return 0;
@@ -1748,139 +1759,84 @@ compute_bw_meter_flags(struct bw_upcall *req)
     return flags;
 }
 
-static void
-expire_bw_meter_leq(void *arg)
-{
-	struct bw_meter *x = arg;
-	struct timeval now;
-	/*
-	 * INFO:
-	 * callout is always executed with MFC_LOCK taken
-	 */
-
-	CURVNET_SET((struct vnet *)x->arg);
-
-	microtime(&now);
-
-	/*
-	 * Test if we should deliver an upcall
-	 */
-	if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
-	    (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
-	    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
-	    (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
-		/* Prepare an upcall for delivery */
-		bw_meter_prepare_upcall(x, &now);
-	}
-
-	/* Send all upcalls that are pending delivery */
-	bw_upcalls_send();
-
-	/* Reset counters */
-	x->bm_start_time = now;
-	x->bm_measured.b_bytes = 0;
-	x->bm_measured.b_packets = 0;
-
-	callout_schedule(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time));
-
-	CURVNET_RESTORE();
-}
-
 /*
  * Add a bw_meter entry
  */
 static int
 add_bw_upcall(struct bw_upcall *req)
 {
-	struct mfc *mfc;
-	struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
-	BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
-	struct timeval now;
-	struct bw_meter *x, **bwm_ptr;
-	uint32_t flags;
-
-	if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
-		return EOPNOTSUPP;
-
-	/* Test if the flags are valid */
-	if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
-		return EINVAL;
-	if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
-		return EINVAL;
-	if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
-			== (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
-		return EINVAL;
-
-	/* Test if the threshold time interval is valid */
-	if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
-		return EINVAL;
+    struct mfc *mfc;
+    struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
+		BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
+    struct timeval now;
+    struct bw_meter *x;
+    uint32_t flags;
 
-	flags = compute_bw_meter_flags(req);
+    if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
+	return EOPNOTSUPP;
 
-	/*
-	 * Find if we have already same bw_meter entry
-	 */
-	MFC_LOCK();
-	mfc = mfc_find(&req->bu_src, &req->bu_dst);
-	if (mfc == NULL) {
-		MFC_UNLOCK();
-		return EADDRNOTAVAIL;
-	}
+    /* Test if the flags are valid */
+    if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
+	return EINVAL;
+    if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
+	return EINVAL;
+    if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
+	    == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
+	return EINVAL;
 
-	/* Choose an appropriate bw_meter list */
-	if (req->bu_flags & BW_UPCALL_GEQ)
-		bwm_ptr = &mfc->mfc_bw_meter_geq;
-	else
-		bwm_ptr = &mfc->mfc_bw_meter_leq;
-
-	for (x = *bwm_ptr; x != NULL; x = x->bm_mfc_next) {
-		if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
-		    &req->bu_threshold.b_time, ==))
-		    && (x->bm_threshold.b_packets
-		    == req->bu_threshold.b_packets)
-		    && (x->bm_threshold.b_bytes
-		    == req->bu_threshold.b_bytes)
-		    && (x->bm_flags & BW_METER_USER_FLAGS)
-		    == flags) {
-			MFC_UNLOCK();
-			return 0; /* XXX Already installed */
-		}
-	}
+    /* Test if the threshold time interval is valid */
+    if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
+	return EINVAL;
 
-	/* Allocate the new bw_meter entry */
-	x = (struct bw_meter*) malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
-	if (x == NULL) {
-		MFC_UNLOCK();
-		return ENOBUFS;
-	}
+    flags = compute_bw_meter_flags(req);
 
-	/* Set the new bw_meter entry */
-	x->bm_threshold.b_time = req->bu_threshold.b_time;
-	microtime(&now);
-	x->bm_start_time = now;
-	x->bm_threshold.b_packets = req->bu_threshold.b_packets;
-	x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
-	x->bm_measured.b_packets = 0;
-	x->bm_measured.b_bytes = 0;
-	x->bm_flags = flags;
-	x->bm_time_next = NULL;
-	x->bm_mfc = mfc;
-	x->arg = curvnet;
-
-	/* For LEQ case create periodic callout */
-	if (req->bu_flags & BW_UPCALL_LEQ) {
-		callout_init_mtx(&x->bm_meter_callout, &mfc_mtx,0);
-		callout_reset(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time),
-		    expire_bw_meter_leq, x);
+    /*
+     * Find if we have already same bw_meter entry
+     */
+    MFC_LOCK();
+    mfc = mfc_find(&req->bu_src, &req->bu_dst);
+    if (mfc == NULL) {
+	MFC_UNLOCK();
+	return EADDRNOTAVAIL;
+    }
+    for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) {
+	if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
+			   &req->bu_threshold.b_time, ==)) &&
+	    (x->bm_threshold.b_packets == req->bu_threshold.b_packets) &&
+	    (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) &&
+	    (x->bm_flags & BW_METER_USER_FLAGS) == flags)  {
+	    MFC_UNLOCK();
+	    return 0;		/* XXX Already installed */
 	}
+    }
 
-	/* Add the new bw_meter entry to the front of entries for this MFC */
-	x->bm_mfc_next = *bwm_ptr;
-	*bwm_ptr = x;
-
+    /* Allocate the new bw_meter entry */
+    x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT);
+    if (x == NULL) {
 	MFC_UNLOCK();
+	return ENOBUFS;
+    }
 
-	return 0;
+    /* Set the new bw_meter entry */
+    x->bm_threshold.b_time = req->bu_threshold.b_time;
+    microtime(&now);
+    x->bm_start_time = now;
+    x->bm_threshold.b_packets = req->bu_threshold.b_packets;
+    x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
+    x->bm_measured.b_packets = 0;
+    x->bm_measured.b_bytes = 0;
+    x->bm_flags = flags;
+    x->bm_time_next = NULL;
+    x->bm_time_hash = BW_METER_BUCKETS;
+
+    /* Add the new bw_meter entry to the front of entries for this MFC */
+    x->bm_mfc = mfc;
+    x->bm_mfc_next = mfc->mfc_bw_meter;
+    mfc->mfc_bw_meter = x;
+    schedule_bw_meter(x, &now);
+    MFC_UNLOCK();
+
+    return 0;
 }
 
 static void
@@ -1889,11 +1845,8 @@ free_bw_list(struct bw_meter *list)
     while (list != NULL) {
 	struct bw_meter *x = list;
 
-	/* MFC_LOCK must be held here */
-	if (x->bm_flags & BW_METER_LEQ)
-		callout_drain(&x->bm_meter_callout);
-
 	list = list->bm_mfc_next;
+	unschedule_bw_meter(x);
 	free(x, M_BWMETER);
     }
 }
@@ -1905,7 +1858,7 @@ static int
 del_bw_upcall(struct bw_upcall *req)
 {
     struct mfc *mfc;
-    struct bw_meter *x, **bwm_ptr;
+    struct bw_meter *x;
 
     if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
 	return EOPNOTSUPP;
@@ -1923,14 +1876,8 @@ del_bw_upcall(struct bw_upcall *req)
 	 */
 	struct bw_meter *list;
 
-	/* Free LEQ list */
-	list = mfc->mfc_bw_meter_leq;
-	mfc->mfc_bw_meter_leq = NULL;
-	free_bw_list(list);
-
-	/* Free GEQ list */
-	list = mfc->mfc_bw_meter_geq;
-	mfc->mfc_bw_meter_geq = NULL;
+	list = mfc->mfc_bw_meter;
+	mfc->mfc_bw_meter = NULL;
 	free_bw_list(list);
 	MFC_UNLOCK();
 	return 0;
@@ -1940,14 +1887,8 @@ del_bw_upcall(struct bw_upcall *req)
 
 	flags = compute_bw_meter_flags(req);
 
-	/* Choose an appropriate bw_meter list */
-	if (req->bu_flags & BW_UPCALL_GEQ)
-		bwm_ptr = &mfc->mfc_bw_meter_geq;
-	else
-		bwm_ptr = &mfc->mfc_bw_meter_leq;
-
 	/* Find the bw_meter entry to delete */
-	for (prev = NULL, x = *bwm_ptr; x != NULL;
+	for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL;
 	     prev = x, x = x->bm_mfc_next) {
 	    if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
 			       &req->bu_threshold.b_time, ==)) &&
@@ -1960,11 +1901,9 @@ del_bw_upcall(struct bw_upcall *req)
 	    if (prev != NULL)
 		prev->bm_mfc_next = x->bm_mfc_next;	/* remove from middle*/
 	    else
-		*bwm_ptr = x->bm_mfc_next;/* new head of list */
-
-	    if (req->bu_flags & BW_UPCALL_LEQ)
-		    callout_stop(&x->bm_meter_callout);
+		x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */
 
+	    unschedule_bw_meter(x);
 	    MFC_UNLOCK();
 	    /* Free the bw_meter entry */
 	    free(x, M_BWMETER);
@@ -1981,15 +1920,16 @@ del_bw_upcall(struct bw_upcall *req)
  * Perform bandwidth measurement processing that may result in an upcall
  */
 static void
-bw_meter_geq_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
+bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 {
-	struct timeval delta;
+    struct timeval delta;
 
-	MFC_LOCK_ASSERT();
+    MFC_LOCK_ASSERT();
 
-	delta = *nowp;
-	BW_TIMEVALDECR(&delta, &x->bm_start_time);
+    delta = *nowp;
+    BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
+    if (x->bm_flags & BW_METER_GEQ) {
 	/*
 	 * Processing for ">=" type of bw_meter entry
 	 */
@@ -2009,15 +1949,63 @@ bw_meter_geq_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 	 * Test if we should deliver an upcall
 	 */
 	if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
-		if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
-		    (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
-		    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
-		    (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
-			/* Prepare an upcall for delivery */
-			bw_meter_prepare_upcall(x, nowp);
-			x->bm_flags |= BW_METER_UPCALL_DELIVERED;
-		}
+	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
+		 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
+		((x->bm_flags & BW_METER_UNIT_BYTES) &&
+		 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
+		/* Prepare an upcall for delivery */
+		bw_meter_prepare_upcall(x, nowp);
+		x->bm_flags |= BW_METER_UPCALL_DELIVERED;
+	    }
+	}
+    } else if (x->bm_flags & BW_METER_LEQ) {
+	/*
+	 * Processing for "<=" type of bw_meter entry
+	 */
+	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
+	    /*
+	     * We are behind time with the multicast forwarding table
+	     * scanning for "<=" type of bw_meter entries, so test now
+	     * if we should deliver an upcall.
+	     */
+	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
+		 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
+		((x->bm_flags & BW_METER_UNIT_BYTES) &&
+		 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
+		/* Prepare an upcall for delivery */
+		bw_meter_prepare_upcall(x, nowp);
+	    }
+	    /* Reschedule the bw_meter entry */
+	    unschedule_bw_meter(x);
+	    schedule_bw_meter(x, nowp);
+	}
+
+	/* Record that a packet is received */
+	x->bm_measured.b_packets++;
+	x->bm_measured.b_bytes += plen;
+
+	/*
+	 * Test if we should restart the measuring interval
+	 */
+	if ((x->bm_flags & BW_METER_UNIT_PACKETS &&
+	     x->bm_measured.b_packets <= x->bm_threshold.b_packets) ||
+	    (x->bm_flags & BW_METER_UNIT_BYTES &&
+	     x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) {
+	    /* Don't restart the measuring interval */
+	} else {
+	    /* Do restart the measuring interval */
+	    /*
+	     * XXX: note that we don't unschedule and schedule, because this
+	     * might be too much overhead per packet. Instead, when we process
+	     * all entries for a given timer hash bin, we check whether it is
+	     * really a timeout. If not, we reschedule at that time.
+	     */
+	    x->bm_start_time = *nowp;
+	    x->bm_measured.b_packets = 0;
+	    x->bm_measured.b_bytes = 0;
+	    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
 	}
+    }
 }
 
 /*
@@ -2026,44 +2014,44 @@ bw_meter_geq_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
 static void
 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
 {
-	struct timeval delta;
-	struct bw_upcall *u;
+    struct timeval delta;
+    struct bw_upcall *u;
 
-	MFC_LOCK_ASSERT();
+    MFC_LOCK_ASSERT();
 
-	/*
-	 * Compute the measured time interval
-	 */
-	delta = *nowp;
-	BW_TIMEVALDECR(&delta, &x->bm_start_time);
+    /*
+     * Compute the measured time interval
+     */
+    delta = *nowp;
+    BW_TIMEVALDECR(&delta, &x->bm_start_time);
 
-	/*
-	 * If there are too many pending upcalls, deliver them now
-	 */
-	if (V_bw_upcalls_n >= BW_UPCALLS_MAX)
-		bw_upcalls_send();
+    /*
+     * If there are too many pending upcalls, deliver them now
+     */
+    if (V_bw_upcalls_n >= BW_UPCALLS_MAX)
+	bw_upcalls_send();
 
-	/*
-	 * Set the bw_upcall entry
-	 */
-	u = &V_bw_upcalls[V_bw_upcalls_n++];
-	u->bu_src = x->bm_mfc->mfc_origin;
-	u->bu_dst = x->bm_mfc->mfc_mcastgrp;
-	u->bu_threshold.b_time = x->bm_threshold.b_time;
-	u->bu_threshold.b_packets = x->bm_threshold.b_packets;
-	u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
-	u->bu_measured.b_time = delta;
-	u->bu_measured.b_packets = x->bm_measured.b_packets;
-	u->bu_measured.b_bytes = x->bm_measured.b_bytes;
-	u->bu_flags = 0;
-	if (x->bm_flags & BW_METER_UNIT_PACKETS)
-		u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
-	if (x->bm_flags & BW_METER_UNIT_BYTES)
-		u->bu_flags |= BW_UPCALL_UNIT_BYTES;
-	if (x->bm_flags & BW_METER_GEQ)
-		u->bu_flags |= BW_UPCALL_GEQ;
-	if (x->bm_flags & BW_METER_LEQ)
-		u->bu_flags |= BW_UPCALL_LEQ;
+    /*
+     * Set the bw_upcall entry
+     */
+    u = &V_bw_upcalls[V_bw_upcalls_n++];
+    u->bu_src = x->bm_mfc->mfc_origin;
+    u->bu_dst = x->bm_mfc->mfc_mcastgrp;
+    u->bu_threshold.b_time = x->bm_threshold.b_time;
+    u->bu_threshold.b_packets = x->bm_threshold.b_packets;
+    u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
+    u->bu_measured.b_time = delta;
+    u->bu_measured.b_packets = x->bm_measured.b_packets;
+    u->bu_measured.b_bytes = x->bm_measured.b_bytes;
+    u->bu_flags = 0;
+    if (x->bm_flags & BW_METER_UNIT_PACKETS)
+	u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
+    if (x->bm_flags & BW_METER_UNIT_BYTES)
+	u->bu_flags |= BW_UPCALL_UNIT_BYTES;
+    if (x->bm_flags & BW_METER_GEQ)
+	u->bu_flags |= BW_UPCALL_GEQ;
+    if (x->bm_flags & BW_METER_LEQ)
+	u->bu_flags |= BW_UPCALL_LEQ;
 }
 
 /*
@@ -2115,6 +2103,183 @@ bw_upcalls_send(void)
     }
 }
 
+/*
+ * Compute the timeout hash value for the bw_meter entries
+ */
+#define	BW_METER_TIMEHASH(bw_meter, hash)				\
+    do {								\
+	struct timeval next_timeval = (bw_meter)->bm_start_time;	\
+									\
+	BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \
+	(hash) = next_timeval.tv_sec;					\
+	if (next_timeval.tv_usec)					\
+	    (hash)++; /* XXX: make sure we don't timeout early */	\
+	(hash) %= BW_METER_BUCKETS;					\
+    } while (0)
+
+/*
+ * Schedule a timer to process periodically bw_meter entry of type "<="
+ * by linking the entry in the proper hash bucket.
+ */
+static void
+schedule_bw_meter(struct bw_meter *x, struct timeval *nowp)
+{
+    int time_hash;
+
+    MFC_LOCK_ASSERT();
+
+    if (!(x->bm_flags & BW_METER_LEQ))
+	return;		/* XXX: we schedule timers only for "<=" entries */
+
+    /*
+     * Reset the bw_meter entry
+     */
+    x->bm_start_time = *nowp;
+    x->bm_measured.b_packets = 0;
+    x->bm_measured.b_bytes = 0;
+    x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
+
+    /*
+     * Compute the timeout hash value and insert the entry
+     */
+    BW_METER_TIMEHASH(x, time_hash);
+    x->bm_time_next = V_bw_meter_timers[time_hash];
+    V_bw_meter_timers[time_hash] = x;
+    x->bm_time_hash = time_hash;
+}
+
+/*
+ * Unschedule the periodic timer that processes bw_meter entry of type "<="
+ * by removing the entry from the proper hash bucket.
+ */
+static void
+unschedule_bw_meter(struct bw_meter *x)
+{
+    int time_hash;
+    struct bw_meter *prev, *tmp;
+
+    MFC_LOCK_ASSERT();
+
+    if (!(x->bm_flags & BW_METER_LEQ))
+	return;		/* XXX: we schedule timers only for "<=" entries */
+
+    /*
+     * Compute the timeout hash value and delete the entry
+     */
+    time_hash = x->bm_time_hash;
+    if (time_hash >= BW_METER_BUCKETS)
+	return;		/* Entry was not scheduled */
+
+    for (prev = NULL, tmp = V_bw_meter_timers[time_hash];
+	     tmp != NULL; prev = tmp, tmp = tmp->bm_time_next)
+	if (tmp == x)
+	    break;
+
+    if (tmp == NULL)
+	panic("unschedule_bw_meter: bw_meter entry not found");
+
+    if (prev != NULL)
+	prev->bm_time_next = x->bm_time_next;
+    else
+	V_bw_meter_timers[time_hash] = x->bm_time_next;
+
+    x->bm_time_next = NULL;
+    x->bm_time_hash = BW_METER_BUCKETS;
+}
+
+/*
+ * Process all "<=" type of bw_meter that should be processed now,
+ * and for each entry prepare an upcall if necessary. Each processed
+ * entry is rescheduled again for the (periodic) processing.
+ *
+ * This is run periodically (once per second normally). On each round,
+ * all the potentially matching entries are in the hash slot that we are
+ * looking at.
+ */
+static void
+bw_meter_process()
+{
+    uint32_t loops;
+    int i;
+    struct timeval now, process_endtime;
+
+    microtime(&now);
+    if (V_last_tv_sec == now.tv_sec)
+	return;		/* nothing to do */
+
+    loops = now.tv_sec - V_last_tv_sec;
+    V_last_tv_sec = now.tv_sec;
+    if (loops > BW_METER_BUCKETS)
+	loops = BW_METER_BUCKETS;
+
+    MFC_LOCK();
+    /*
+     * Process all bins of bw_meter entries from the one after the last
+     * processed to the current one. On entry, i points to the last bucket
+     * visited, so we need to increment i at the beginning of the loop.
+     */
+    for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) {
+	struct bw_meter *x, *tmp_list;
+
+	if (++i >= BW_METER_BUCKETS)
+	    i = 0;
+
+	/* Disconnect the list of bw_meter entries from the bin */
+	tmp_list = V_bw_meter_timers[i];
+	V_bw_meter_timers[i] = NULL;
+
+	/* Process the list of bw_meter entries */
+	while (tmp_list != NULL) {
+	    x = tmp_list;
+	    tmp_list = tmp_list->bm_time_next;
+
+	    /* Test if the time interval is over */
+	    process_endtime = x->bm_start_time;
+	    BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time);
+	    if (BW_TIMEVALCMP(&process_endtime, &now, >)) {
+		/* Not yet: reschedule, but don't reset */
+		int time_hash;
+
+		BW_METER_TIMEHASH(x, time_hash);
+		if (time_hash == i && process_endtime.tv_sec == now.tv_sec) {
+		    /*
+		     * XXX: somehow the bin processing is a bit ahead of time.
+		     * Put the entry in the next bin.
+		     */
+		    if (++time_hash >= BW_METER_BUCKETS)
+			time_hash = 0;
+		}
+		x->bm_time_next = V_bw_meter_timers[time_hash];
+		V_bw_meter_timers[time_hash] = x;
+		x->bm_time_hash = time_hash;
+
+		continue;
+	    }
+
+	    /*
+	     * Test if we should deliver an upcall
+	     */
+	    if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
+		 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
+		((x->bm_flags & BW_METER_UNIT_BYTES) &&
+		 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
+		/* Prepare an upcall for delivery */
+		bw_meter_prepare_upcall(x, &now);
+	    }
+
+	    /*
+	     * Reschedule for next processing
+	     */
+	    schedule_bw_meter(x, &now);
+	}
+    }
+
+    /* Send all upcalls that are pending delivery */
+    bw_upcalls_send();
+
+    MFC_UNLOCK();
+}
+
 /*
  * A periodic function for sending all upcalls that are pending delivery
  */
@@ -2132,6 +2297,23 @@ expire_bw_upcalls_send(void *arg)
     CURVNET_RESTORE();
 }
 
+/*
+ * A periodic function for periodic scanning of the multicast forwarding
+ * table for processing all "<=" bw_meter entries.
+ */
+static void
+expire_bw_meter_process(void *arg)
+{
+    CURVNET_SET((struct vnet *) arg);
+
+    if (V_mrt_api_config & MRT_MFC_BW_UPCALL)
+	bw_meter_process();
+
+    callout_reset(&V_bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process,
+	curvnet);
+    CURVNET_RESTORE();
+}
+
 /*
  * End of bandwidth monitoring code
  */
@@ -2653,11 +2835,14 @@ vnet_mroute_init(const void *unused __unused)
 
 	V_viftable = mallocarray(MAXVIFS, sizeof(*V_viftable),
 	    M_MRTABLE, M_WAITOK|M_ZERO);
+	V_bw_meter_timers = mallocarray(BW_METER_BUCKETS,
+	    sizeof(*V_bw_meter_timers), M_MRTABLE, M_WAITOK|M_ZERO);
 	V_bw_upcalls = mallocarray(BW_UPCALLS_MAX, sizeof(*V_bw_upcalls),
 	    M_MRTABLE, M_WAITOK|M_ZERO);
 
 	callout_init(&V_expire_upcalls_ch, 1);
 	callout_init(&V_bw_upcalls_ch, 1);
+	callout_init(&V_bw_meter_ch, 1);
 }
 
 VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init,
@@ -2668,6 +2853,7 @@ vnet_mroute_uninit(const void *unused __unused)
 {
 
 	free(V_bw_upcalls, M_MRTABLE);
+	free(V_bw_meter_timers, M_MRTABLE);
 	free(V_viftable, M_MRTABLE);
 	free(V_nexpire, M_MRTABLE);
 	V_nexpire = NULL;
diff --git a/sys/netinet/ip_mroute.h b/sys/netinet/ip_mroute.h
index 07d77065de33..6ef99c0172f3 100644
--- a/sys/netinet/ip_mroute.h
+++ b/sys/netinet/ip_mroute.h
@@ -283,10 +283,7 @@ struct mfc {
 	struct timeval	mfc_last_assert;	/* last time I sent an assert*/
 	uint8_t		mfc_flags[MAXVIFS];	/* the MRT_MFC_FLAGS_* flags */
 	struct in_addr	mfc_rp;			/* the RP address	     */
-	struct bw_meter	*mfc_bw_meter_leq;	/* list of bandwidth meters
-						   for Lower-or-EQual case   */
-	struct bw_meter *mfc_bw_meter_geq;	/* list of bandwidth meters
-						   for Greater-or-EQual case */
+	struct bw_meter	*mfc_bw_meter;		/* list of bandwidth meters  */
 	u_long		mfc_nstall;		/* # of packets awaiting mfc */
 	TAILQ_HEAD(, rtdetq) mfc_stall;		/* q of packets awaiting mfc */
 };
@@ -330,6 +327,7 @@ struct rtdetq {
 struct bw_meter {
 	struct bw_meter	*bm_mfc_next;		/* next bw meter (same mfc)  */
 	struct bw_meter	*bm_time_next;		/* next bw meter (same time) */
+	uint32_t	bm_time_hash;		/* the time hash value       */
 	struct mfc	*bm_mfc;		/* the corresponding mfc     */
 	uint32_t	bm_flags;		/* misc flags (see below)    */
 #define BW_METER_UNIT_PACKETS	(1 << 0)	/* threshold (in packets)    */
@@ -346,10 +344,6 @@ struct bw_meter {
 	struct bw_data	bm_threshold;		/* the upcall threshold	     */
 	struct bw_data	bm_measured;		/* the measured bw	     */
 	struct timeval	bm_start_time;		/* abs. time		     */
-#ifdef _KERNEL
-	struct callout	bm_meter_callout;	/* Periodic callout          */
-	void*		arg;			/* custom argument           */
-#endif
 };
 
 #ifdef _KERNEL

