Routing enhancement - reduce routing table locking

Ingo Flaschberger if at freebsd.org
Tue Apr 5 01:47:32 UTC 2011


Hi,

I have written a patch to:
*) reduce locking of routing table to achieve the same speed as with
    flowtables, which do not scale with many routes:
 	use of a copy of the route
 	use rm_lock(9)
 	(idea of Andre Oppermann)
*) implement some multipath changes to use a direct attached
    interface route and a real route, used some OpenBSD code
*) icmp rate-limiting in forwarding (old code from FreeBSD page)

The patch applies to FreeBSD 8.2-STABLE.
Comments are welcome, especially on whether I can reuse "dst" (which seems to 
work):
 	rtl.rt_gateway = (struct sockaddr *)dst;

but I'm not sure if I should take the same caution with bcopy regarding 
sa_len:
 	rtlookup_fib(struct sockaddr *dst, u_int fibnum, struct rtlookup *rtl,

Kind regards,
 	Ingo Flaschberger

Geschaeftsleitung
____________________________________
crossip communications gmbh
A-1020 Wien, Sebastian Kneipp Gasse 1/3

Sitz der Gesellschaft: 1020 Wien, Oesterreich
Firmenbuchgericht: Handelsgericht Wien, FN 269698 s,
Umsatzsteueridentifikationsnummer (UID): ATU62080367

Haftungsausschluss / Disclaimer <http://www.xip.at/content/view/278/>
-------------- next part --------------
diff -u -r ../src_org_8.2_20110329/contrib/ipfilter/radix.c ./contrib/ipfilter/radix.c
--- ../src_org_8.2_20110329/contrib/ipfilter/radix.c	2009-08-03 08:13:06.000000000 +0000
+++ ./contrib/ipfilter/radix.c	2011-04-03 16:08:28.000000000 +0000
@@ -759,9 +759,10 @@
 }
 
 struct radix_node *
-rn_delete(v_arg, netmask_arg, head)
+rn_delete(v_arg, netmask_arg, head, rn)
 	void *v_arg, *netmask_arg;
 	struct radix_node_head *head;
+	struct radix_node *rn;
 {
 	struct radix_node *t, *p, *x, *tt;
 	struct radix_mask *m, *saved_m, **mp;
@@ -1069,7 +1070,7 @@
 	struct radix_node_head *rnh = p;
 	struct radix_node *d;
 
-	d = rnh->rnh_deladdr(n->rn_key, NULL, rnh);
+	d = rnh->rnh_deladdr(n->rn_key, NULL, rnh, NULL);
 	if (d != NULL) {
 		FreeS(d, max_keylen + 2 * sizeof (*d));
 	}
diff -u -r ../src_org_8.2_20110329/contrib/ipfilter/radix_ipf.h ./contrib/ipfilter/radix_ipf.h
--- ../src_org_8.2_20110329/contrib/ipfilter/radix_ipf.h	2009-08-03 08:13:06.000000000 +0000
+++ ./contrib/ipfilter/radix_ipf.h	2011-04-03 16:08:28.000000000 +0000
@@ -130,7 +130,7 @@
 		__P((void *v, void *mask,
 		     struct radix_node_head *head, struct radix_node nodes[]));
 	struct	radix_node *(*rnh_deladdr)	/* remove based on sockaddr */
-		__P((void *v, void *mask, struct radix_node_head *head));
+		__P((void *v, void *mask, struct radix_node_head *head, struct radix_node *rn));
 	struct	radix_node *(*rnh_delpkt)	/* remove based on packet hdr */
 		__P((void *v, void *mask, struct radix_node_head *head));
 	struct	radix_node *(*rnh_matchaddr)	/* locate based on sockaddr */
@@ -202,7 +202,7 @@
 	 *rn_addmask __P((void *, int, int)),
 	 *rn_addroute __P((void *, void *, struct radix_node_head *,
 			struct radix_node [2])),
-	 *rn_delete __P((void *, void *, struct radix_node_head *)),
+	 *rn_delete __P((void *, void *, struct radix_node_head *, struct radix_node *)),
 	 *rn_insert __P((void *, struct radix_node_head *, int *,
 			struct radix_node [2])),
 	 *rn_lookup __P((void *, void *, struct radix_node_head *)),
diff -u -r ../src_org_8.2_20110329/sbin/routed/radix.c ./sbin/routed/radix.c
--- ../src_org_8.2_20110329/sbin/routed/radix.c	2009-08-03 08:13:06.000000000 +0000
+++ ./sbin/routed/radix.c	2011-04-03 16:08:07.000000000 +0000
@@ -662,7 +662,8 @@
 static struct radix_node *
 rn_delete(void *v_arg,
 	  void *netmask_arg,
-	  struct radix_node_head *head)
+	  struct radix_node_head *head,
+	  struct radix_node *rn)
 {
 	struct radix_node *t, *p, *x, *tt;
 	struct radix_mask *m, *saved_m, **mp;
@@ -670,6 +671,8 @@
 	caddr_t v, netmask;
 	int b, head_off, vlen;
 
+	rn = NULL; /* XXX make compiler happy */
+
 	v = v_arg;
 	netmask = netmask_arg;
 	x = head->rnh_treetop;
diff -u -r ../src_org_8.2_20110329/sbin/routed/radix.h ./sbin/routed/radix.h
--- ../src_org_8.2_20110329/sbin/routed/radix.h	2009-08-03 08:13:06.000000000 +0000
+++ ./sbin/routed/radix.h	2011-04-03 16:08:07.000000000 +0000
@@ -115,7 +115,7 @@
 		(void *v, void *mask,
 		     struct radix_node_head *head, struct radix_node nodes[]);
 	struct	radix_node *(*rnh_deladdr)	/* remove based on sockaddr */
-		(void *v, void *mask, struct radix_node_head *head);
+		(void *v, void *mask, struct radix_node_head *head, struct radix_node *rn);
 	struct	radix_node *(*rnh_delpkt)	/* remove based on packet hdr */
 		(void *v, void *mask, struct radix_node_head *head);
 	struct	radix_node *(*rnh_matchaddr)	/* locate based on sockaddr */
diff -u -r ../src_org_8.2_20110329/sbin/routed/table.c ./sbin/routed/table.c
--- ../src_org_8.2_20110329/sbin/routed/table.c	2009-08-03 08:13:06.000000000 +0000
+++ ./sbin/routed/table.c	2011-04-03 16:08:07.000000000 +0000
@@ -1865,7 +1865,7 @@
 	mask_sock.sin_addr.s_addr = htonl(rt->rt_mask);
 	masktrim(&mask_sock);
 	if (rt != (struct rt_entry *)rhead->rnh_deladdr(&dst_sock, &mask_sock,
-							rhead)) {
+							rhead, NULL)) {
 		msglog("rnh_deladdr() failed");
 	} else {
 		free(rt);
diff -u -r ../src_org_8.2_20110329/sys/contrib/ipfilter/netinet/ip_pool.c ./sys/contrib/ipfilter/netinet/ip_pool.c
--- ../src_org_8.2_20110329/sys/contrib/ipfilter/netinet/ip_pool.c	2007-10-18 21:42:38.000000000 +0000
+++ ./sys/contrib/ipfilter/netinet/ip_pool.c	2011-04-03 16:07:46.000000000 +0000
@@ -67,6 +67,7 @@
 #include "netinet/ip_compat.h"
 #include "netinet/ip_fil.h"
 #include "netinet/ip_pool.h"
+#include <sys/rmlock.h>
 
 #if defined(IPFILTER_LOOKUP) && defined(_KERNEL) && \
       ((BSD >= 198911) && !defined(__osf__) && \
@@ -620,7 +621,7 @@
 
 	RADIX_NODE_HEAD_LOCK(ipo->ipo_head);
 	ipo->ipo_head->rnh_deladdr(&ipe->ipn_addr, &ipe->ipn_mask,
-				   ipo->ipo_head);
+				   ipo->ipo_head, NULL);
 	RADIX_NODE_HEAD_UNLOCK(ipo->ipo_head);
 
 	ip_pool_node_deref(ipe);
@@ -751,7 +752,7 @@
 	RADIX_NODE_HEAD_LOCK(ipo->ipo_head);
 	while ((n = ipo->ipo_list) != NULL) {
 		ipo->ipo_head->rnh_deladdr(&n->ipn_addr, &n->ipn_mask,
-					   ipo->ipo_head);
+					   ipo->ipo_head, NULL);
 
 		*n->ipn_pnext = n->ipn_next;
 		if (n->ipn_next)
@@ -963,7 +964,7 @@
 	struct radix_node_head *rnh = p;
 	struct radix_node *d;
 
-	d = rnh->rnh_deladdr(n->rn_key, NULL, rnh);
+	d = rnh->rnh_deladdr(n->rn_key, NULL, rnh, NULL);
 	if (d != NULL) {
 		FreeS(d, max_keylen + 2 * sizeof (*d));
 	}
diff -u -r ../src_org_8.2_20110329/sys/contrib/pf/net/pf.c ./sys/contrib/pf/net/pf.c
--- ../src_org_8.2_20110329/sys/contrib/pf/net/pf.c	2010-09-20 17:03:10.000000000 +0000
+++ ./sys/contrib/pf/net/pf.c	2011-04-03 16:07:46.000000000 +0000
@@ -99,9 +99,7 @@
 #include <net/if_types.h>
 #include <net/bpf.h>
 #include <net/route.h>
-#ifndef __FreeBSD__
 #include <net/radix_mpath.h>
-#endif
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
@@ -6166,9 +6164,9 @@
 			if (kif->pfik_ifp == ifp)
 				ret = 1;
 #ifdef __FreeBSD__ /* MULTIPATH_ROUTING */
-			rn = NULL;
-#else
 			rn = rn_mpath_next(rn);
+#else
+			rn = rn_mpath_next(rn, 0);
 #endif
 		} while (check_mpath == 1 && rn != NULL && ret == 0);
 	} else
diff -u -r ../src_org_8.2_20110329/sys/contrib/pf/net/pf_table.c ./sys/contrib/pf/net/pf_table.c
--- ../src_org_8.2_20110329/sys/contrib/pf/net/pf_table.c	2009-08-03 08:13:06.000000000 +0000
+++ ./sys/contrib/pf/net/pf_table.c	2011-04-03 16:07:46.000000000 +0000
@@ -44,7 +44,7 @@
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/rwlock.h>
+#include <sys/rmlock.h>
 #ifdef __FreeBSD__
 #include <sys/malloc.h>
 #endif
@@ -1114,17 +1114,9 @@
 #endif
 	if (KENTRY_NETWORK(ke)) {
 		pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net);
-#ifdef __FreeBSD__
-		rn = rn_delete(&ke->pfrke_sa, &mask, head);
-#else
 		rn = rn_delete(&ke->pfrke_sa, &mask, head, NULL);
-#endif
 	} else
-#ifdef __FreeBSD__
-		rn = rn_delete(&ke->pfrke_sa, NULL, head);
-#else
 		rn = rn_delete(&ke->pfrke_sa, NULL, head, NULL);
-#endif
 	splx(s);
 
 	if (rn == NULL) {
diff -u -r ../src_org_8.2_20110329/sys/kern/subr_witness.c ./sys/kern/subr_witness.c
--- ../src_org_8.2_20110329/sys/kern/subr_witness.c	2011-03-28 15:26:48.000000000 +0000
+++ ./sys/kern/subr_witness.c	2011-04-03 16:07:54.000000000 +0000
@@ -508,7 +508,7 @@
 	 * Routing
 	 */
 	{ "so_rcv", &lock_class_mtx_sleep },
-	{ "radix node head", &lock_class_rw },
+	{ "radix node head", &lock_class_rm },
 	{ "rtentry", &lock_class_mtx_sleep },
 	{ "ifaddr", &lock_class_mtx_sleep },
 	{ NULL, NULL },
diff -u -r ../src_org_8.2_20110329/sys/kern/vfs_export.c ./sys/kern/vfs_export.c
--- ../src_org_8.2_20110329/sys/kern/vfs_export.c	2009-10-01 13:11:45.000000000 +0000
+++ ./sys/kern/vfs_export.c	2011-04-03 16:07:54.000000000 +0000
@@ -43,6 +43,7 @@
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/rmlock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
@@ -228,7 +229,7 @@
 	struct radix_node_head *rnh = (struct radix_node_head *) w;
 	struct ucred *cred;
 
-	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
+	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh, NULL);
 	cred = ((struct netcred *)rn)->netc_anon;
 	if (cred != NULL)
 		crfree(cred);
@@ -427,6 +428,7 @@
 	register struct netcred *np;
 	register struct radix_node_head *rnh;
 	struct sockaddr *saddr;
+	struct rm_priotracker tracker;
 
 	nep = mp->mnt_export;
 	if (nep == NULL)
@@ -440,10 +442,10 @@
 			saddr = nam;
 			rnh = nep->ne_rtable[saddr->sa_family];
 			if (rnh != NULL) {
-				RADIX_NODE_HEAD_RLOCK(rnh);
+				RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
 				np = (struct netcred *)
 				    (*rnh->rnh_matchaddr)(saddr, rnh);
-				RADIX_NODE_HEAD_RUNLOCK(rnh);
+				RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
 					np = NULL;
 			}
diff -u -r ../src_org_8.2_20110329/sys/net/if.c ./sys/net/if.c
--- ../src_org_8.2_20110329/sys/net/if.c	2011-03-28 15:26:51.000000000 +0000
+++ ./sys/net/if.c	2011-04-03 16:07:57.000000000 +0000
@@ -49,6 +49,7 @@
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/rmlock.h>
 #include <sys/refcount.h>
 #include <sys/module.h>
 #include <sys/rwlock.h>
diff -u -r ../src_org_8.2_20110329/sys/net/pfil.c ./sys/net/pfil.c
--- ../src_org_8.2_20110329/sys/net/pfil.c	2010-02-07 09:00:22.000000000 +0000
+++ ./sys/net/pfil.c	2011-04-03 16:07:57.000000000 +0000
@@ -39,7 +39,6 @@
 #include <sys/socketvar.h>
 #include <sys/systm.h>
 #include <sys/condvar.h>
-#include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
diff -u -r ../src_org_8.2_20110329/sys/net/radix.c ./sys/net/radix.c
--- ../src_org_8.2_20110329/sys/net/radix.c	2010-04-02 05:02:50.000000000 +0000
+++ ./sys/net/radix.c	2011-04-03 16:07:57.000000000 +0000
@@ -41,6 +41,7 @@
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/syslog.h>
+#include <sys/rmlock.h>
 #include <net/radix.h>
 #include "opt_mpath.h"
 #ifdef RADIX_MPATH
@@ -614,7 +615,7 @@
 	struct radix_node treenodes[2];
 {
 	caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg;
-	register struct radix_node *t, *x = 0, *tt;
+	register struct radix_node *t, *x = 0, *xx = 0, *tt;
 	struct radix_node *saved_tt, *top = head->rnh_treetop;
 	short b = 0, b_leaf = 0;
 	int keyduplicated;
@@ -723,12 +724,19 @@
 		x = t->rn_right;
 	/* Promote general routes from below */
 	if (x->rn_bit < 0) {
-	    for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
+            for (mp = &t->rn_mklist; x; xx = x, x = x->rn_dupedkey) {
+                if (xx && xx->rn_mklist && xx->rn_mask == x->rn_mask &&
+				x->rn_mklist == 0) {
+			/* multipath route, bump refcount on first mklist */
+			x->rn_mklist = xx->rn_mklist;
+			x->rn_mklist->rm_refs++;
+		}
 		if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) {
 			*mp = m = rn_new_radix_mask(x, 0);
 			if (m)
 				mp = &m->rm_mklist;
 		}
+	    }
 	} else if (x->rn_mklist) {
 		/*
 		 * Skip over masks whose index is > that of new node
@@ -760,11 +768,30 @@
 			break;
 		if (m->rm_flags & RNF_NORMAL) {
 			mmask = m->rm_leaf->rn_mask;
-			if (tt->rn_flags & RNF_NORMAL) {
-#if !defined(RADIX_MPATH)
-			    log(LOG_ERR,
-			        "Non-unique normal route, mask not entered\n");
+			if (keyduplicated) {
+				if (m->rm_leaf->rn_parent == tt)
+					/* new route is better */
+					m->rm_leaf = tt;
+#ifdef DIAGNOSTIC
+				else {
+					for (t = m->rm_leaf; t;
+						t = t->rn_dupedkey) {
+						break;
+					}
+					if (t == NULL) {
+						log(LOG_ERR, "Non-unique "
+							"normal route on dupedkey, "
+							"mask not entered\n");
+						return tt;
+					}
+				}
 #endif
+				m->rm_refs++;
+				tt->rn_mklist = m;
+				return tt;
+			} else if (tt->rn_flags & RNF_NORMAL) {
+				log(LOG_ERR, "Non-unique normal route,"
+					" mask not entered\n");
 				return tt;
 			}
 		} else
@@ -783,9 +810,10 @@
 }
 
 struct radix_node *
-rn_delete(v_arg, netmask_arg, head)
+rn_delete(v_arg, netmask_arg, head, rn)
 	void *v_arg, *netmask_arg;
 	struct radix_node_head *head;
+	struct radix_node *rn;
 {
 	register struct radix_node *t, *p, *x, *tt;
 	struct radix_mask *m, *saved_m, **mp;
@@ -815,13 +843,37 @@
 			if ((tt = tt->rn_dupedkey) == 0)
 				return (0);
 	}
+#ifdef RADIX_MPATH
+	if (rn) {
+		while (tt != rn)
+			if ((tt = tt->rn_dupedkey) == 0)
+				return (0);
+	}
+#endif
 	if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0)
 		goto on1;
 	if (tt->rn_flags & RNF_NORMAL) {
-		if (m->rm_leaf != tt || m->rm_refs > 0) {
+		if (m->rm_leaf != tt && m->rm_refs == 0) {
 			log(LOG_ERR, "rn_delete: inconsistent annotation\n");
 			return 0;  /* dangling ref could cause disaster */
 		}
+		if (m->rm_leaf != tt) {
+			if (--m->rm_refs >= 0)
+				goto on1;
+		}
+		/* tt is currently the head of the possible multipath chain */
+		if (m->rm_refs > 0) {
+			if (tt->rn_dupedkey == NULL ||
+				tt->rn_dupedkey->rn_mklist != m) {
+					log(LOG_ERR, "rn_delete: inconsistent "
+						"dupedkey list\n");
+					return (0);
+			}
+			m->rm_leaf = tt->rn_dupedkey;
+			--m->rm_refs;
+			goto on1;
+		}
+		/* else tt is last and only route */
 	} else {
 		if (m->rm_mask != tt->rn_mask) {
 			log(LOG_ERR, "rn_delete: inconsistent annotation\n");
@@ -875,15 +927,10 @@
 			else
 				t->rn_right = x;
 		} else {
-			/* find node in front of tt on the chain */
-			for (x = p = saved_tt; p && p->rn_dupedkey != tt;)
-				p = p->rn_dupedkey;
-			if (p) {
-				p->rn_dupedkey = tt->rn_dupedkey;
-				if (tt->rn_dupedkey)		/* parent */
-					tt->rn_dupedkey->rn_parent = p;
-								/* parent */
-			} else log(LOG_ERR, "rn_delete: couldn't find us\n");
+			x = saved_tt;
+			t->rn_dupedkey = tt->rn_dupedkey;
+			if (tt->rn_dupedkey)
+				tt->rn_dupedkey->rn_parent = t;
 		}
 		t = tt + 1;
 		if  (t->rn_flags & RNF_ACTIVE) {
@@ -931,8 +978,16 @@
 				if (m == x->rn_mklist) {
 					struct radix_mask *mm = m->rm_mklist;
 					x->rn_mklist = 0;
-					if (--(m->rm_refs) < 0)
+					if (--(m->rm_refs) < 0) {
 						MKFree(m);
+					} else if (m->rm_flags & RNF_NORMAL) {
+						/*
+						* don't progress because this is
+						* a multipath route. Next
+						* route will use the same m.
+						*/
+						mm = m;
+					}
 					m = mm;
 				}
 			if (m)
diff -u -r ../src_org_8.2_20110329/sys/net/radix.h ./sys/net/radix.h
--- ../src_org_8.2_20110329/sys/net/radix.h	2010-03-23 09:58:59.000000000 +0000
+++ ./sys/net/radix.h	2011-04-03 16:07:57.000000000 +0000
@@ -36,7 +36,7 @@
 #ifdef _KERNEL
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
-#include <sys/_rwlock.h>
+#include <sys/_rmlock.h>
 #endif
 
 #ifdef MALLOC_DECLARE
@@ -114,7 +114,7 @@
 		(void *v, void *mask,
 		     struct radix_node_head *head, struct radix_node nodes[]);
 	struct	radix_node *(*rnh_deladdr)	/* remove based on sockaddr */
-		(void *v, void *mask, struct radix_node_head *head);
+		(void *v, void *mask, struct radix_node_head *head, struct radix_node *rn);
 	struct	radix_node *(*rnh_delpkt)	/* remove based on packet hdr */
 		(void *v, void *mask, struct radix_node_head *head);
 	struct	radix_node *(*rnh_matchaddr)	/* locate based on sockaddr */
@@ -133,7 +133,7 @@
 	struct	radix_node rnh_nodes[3];	/* empty tree for common case */
 	int	rnh_multipath;			/* multipath capable ? */
 #ifdef _KERNEL
-	struct	rwlock rnh_lock;		/* locks entire radix tree */
+	struct	rmlock rnh_lock;		/* locks entire radix tree */
 #endif
 };
 
@@ -147,17 +147,15 @@
 #define Free(p) free((caddr_t)p, M_RTABLE);
 
 #define	RADIX_NODE_HEAD_LOCK_INIT(rnh)	\
-    rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0)
-#define	RADIX_NODE_HEAD_LOCK(rnh)	rw_wlock(&(rnh)->rnh_lock)
-#define	RADIX_NODE_HEAD_UNLOCK(rnh)	rw_wunlock(&(rnh)->rnh_lock)
-#define	RADIX_NODE_HEAD_RLOCK(rnh)	rw_rlock(&(rnh)->rnh_lock)
-#define	RADIX_NODE_HEAD_RUNLOCK(rnh)	rw_runlock(&(rnh)->rnh_lock)
-#define	RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh)	rw_try_upgrade(&(rnh)->rnh_lock)
-
-
-#define	RADIX_NODE_HEAD_DESTROY(rnh)	rw_destroy(&(rnh)->rnh_lock)
-#define	RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED)
-#define	RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED)
+	rm_init_flags(&(rnh)->rnh_lock, "radix node head", 0)
+#define        RADIX_NODE_HEAD_LOCK(rnh)       rm_wlock(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_UNLOCK(rnh)     rm_wunlock(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_RLOCK(rnh, tracker)     rm_rlock(&(rnh)->rnh_lock, (tracker))
+#define        RADIX_NODE_HEAD_RUNLOCK(rnh, tracker)   rm_runlock(&(rnh)->rnh_lock, (tracker))
+
+#define        RADIX_NODE_HEAD_DESTROY(rnh)    rm_destroy(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_LOCK_ASSERT(rnh)        rm_wowned(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_WLOCK_ASSERT(rnh)       rm_wowned(&(rnh)->rnh_lock)
 #endif /* _KERNEL */
 
 void	 rn_init(int);
@@ -168,7 +166,7 @@
 	 *rn_addmask(void *, int, int),
 	 *rn_addroute (void *, void *, struct radix_node_head *,
 			struct radix_node [2]),
-	 *rn_delete(void *, void *, struct radix_node_head *),
+	 *rn_delete(void *, void *, struct radix_node_head *, struct radix_node *),
 	 *rn_lookup (void *v_arg, void *m_arg,
 		        struct radix_node_head *head),
 	 *rn_match(void *, struct radix_node_head *);
diff -u -r ../src_org_8.2_20110329/sys/net/radix_mpath.c ./sys/net/radix_mpath.c
--- ../src_org_8.2_20110329/sys/net/radix_mpath.c	2010-04-02 05:02:50.000000000 +0000
+++ ./sys/net/radix_mpath.c	2011-04-04 19:33:16.000000000 +0000
@@ -45,6 +45,8 @@
 #include <sys/socket.h>
 #include <sys/domain.h>
 #include <sys/syslog.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
 #include <net/radix.h>
 #include <net/radix_mpath.h>
 #include <net/route.h>
@@ -54,7 +56,7 @@
 /*
  * give some jitter to hash, to avoid synchronization between routers
  */
-static uint32_t hashjitter;
+uint32_t hashjitter;
 
 int
 rn_mpath_capable(struct radix_node_head *rnh)
@@ -77,10 +79,11 @@
 		return NULL;
 }
 
-uint32_t
+//uint32_t
+int64_t
 rn_mpath_count(struct radix_node *rn)
 {
-	uint32_t i = 0;
+	int64_t i = 0;
 	struct rtentry *rt;
 	
 	while (rn != NULL) {
@@ -112,46 +115,14 @@
 		 * we need to compare the interface address because
 		 * rt_gateway is a special sockadd_dl structure
 		 */
-		if (rt->rt_gateway->sa_family == AF_LINK) {
-			if (!memcmp(rt->rt_ifa->ifa_addr, gate, gate->sa_len))
+		if (rt->rt_gateway->sa_len == gate->sa_len &&
+			!memcmp(rt->rt_gateway, gate, gate->sa_len))
 				break;
-		} else {
-			if (rt->rt_gateway->sa_len == gate->sa_len &&
-			    !memcmp(rt->rt_gateway, gate, gate->sa_len))
-				break;
-		}
 	} while ((rn = rn_mpath_next(rn)) != NULL);
 
 	return (struct rtentry *)rn;
 }
 
-/* 
- * go through the chain and unlink "rt" from the list
- * the caller will free "rt"
- */
-int
-rt_mpath_deldup(struct rtentry *headrt, struct rtentry *rt)
-{
-        struct radix_node *t, *tt;
-
-        if (!headrt || !rt)
-            return (0);
-        t = (struct radix_node *)headrt;
-        tt = rn_mpath_next(t);
-        while (tt) {
-            if (tt == (struct radix_node *)rt) {
-                t->rn_dupedkey = tt->rn_dupedkey;
-                tt->rn_dupedkey = NULL;
-    	        tt->rn_flags &= ~RNF_ACTIVE;
-	        tt[1].rn_flags &= ~RNF_ACTIVE;
-                return (1);
-            }
-            t = tt;
-            tt = rn_mpath_next((struct radix_node *)t);
-        }
-        return (0);
-}
-
 /*
  * check if we have the same key/mask/gateway on the table already.
  */
@@ -262,9 +233,10 @@
 rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum)
 {
 	struct radix_node *rn0, *rn;
-	u_int32_t n;
+	u_int32_t n = 0;
 	struct rtentry *rt;
 	int64_t weight;
+	int64_t lowest_weight;
 
 	/*
 	 * XXX we don't attempt to lookup cached route again; what should
@@ -285,20 +257,32 @@
 
 	/* beyond here, we use rn as the master copy */
 	rn0 = rn = (struct radix_node *)ro->ro_rt;
-	n = rn_mpath_count(rn0);
+
+	/* find lowest weight route */
+	for ( rt = (struct rtentry *)rn, weight = rt->rt_rmx.rmx_weight; rn != NULL; rn = rn_mpath_next( rn)) {
+		rt = (struct rtentry *)rn;
+		if(rt->rt_flags & RTF_UP) {
+			if (weight > rt->rt_rmx.rmx_weight) {
+				weight = rt->rt_rmx.rmx_weight;
+				n = 1;
+			} else if (weight == rt->rt_rmx.rmx_weight)
+				n++;
+		}
+	}
+	lowest_weight = weight;
 
 	/* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */
 	hash += hashjitter;
 	hash %= n;
-	for (weight = abs((int32_t)hash), rt = ro->ro_rt;
-	     weight >= rt->rt_rmx.rmx_weight && rn; 
-	     weight -= rt->rt_rmx.rmx_weight) {
-		
-		/* stay within the multipath routes */
-		if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask)
-			break;
-		rn = rn->rn_dupedkey;
+	for ( rn = rn0, n = 0; rn != NULL; rn = rn_mpath_next( rn)) {
 		rt = (struct rtentry *)rn;
+		if(rt->rt_flags & RTF_UP) {
+			if ( rt->rt_rmx.rmx_weight == lowest_weight) {
+				if (n == hash)
+					break;
+				n++;
+			}
+		}
 	}
 	/* XXX try filling rt_gwroute and avoid unreachable gw  */
 
diff -u -r ../src_org_8.2_20110329/sys/net/radix_mpath.h ./sys/net/radix_mpath.h
--- ../src_org_8.2_20110329/sys/net/radix_mpath.h	2009-08-03 08:13:06.000000000 +0000
+++ ./sys/net/radix_mpath.h	2011-04-04 19:48:09.000000000 +0000
@@ -44,9 +44,10 @@
 struct route;
 struct rtentry;
 struct sockaddr;
+extern uint32_t hashjitter;
 int	rn_mpath_capable(struct radix_node_head *);
 struct radix_node *rn_mpath_next(struct radix_node *);
-u_int32_t rn_mpath_count(struct radix_node *);
+int64_t rn_mpath_count(struct radix_node *);
 struct rtentry *rt_mpath_matchgate(struct rtentry *, struct sockaddr *);
 int rt_mpath_conflict(struct radix_node_head *, struct rtentry *,
     struct sockaddr *);
diff -u -r ../src_org_8.2_20110329/sys/net/route.c ./sys/net/route.c
--- ../src_org_8.2_20110329/sys/net/route.c	2011-03-28 15:26:51.000000000 +0000
+++ ./sys/net/route.c	2011-04-04 23:01:17.000000000 +0000
@@ -51,6 +51,8 @@
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
@@ -342,6 +344,7 @@
 	struct radix_node *rn;
 	struct rtentry *newrt;
 	struct rt_addrinfo info;
+	struct rm_priotracker tracker;
 	int err = 0, msgtype = RTM_MISS;
 	int needlock;
 
@@ -358,24 +361,26 @@
 		goto miss;
 	}
 	needlock = !(ignflags & RTF_RNH_LOCKED);
-	if (needlock)
-		RADIX_NODE_HEAD_RLOCK(rnh);
-#ifdef INVARIANTS	
+	if (needlock) /* XXX we always need the lock for now! */
+		RADIX_NODE_HEAD_LOCK(rnh);
 	else
-		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
-#endif
+		RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
 	rn = rnh->rnh_matchaddr(dst, rnh);
 	if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		newrt = rt = RNTORT(rn);
 		RT_LOCK(newrt);
 		RT_ADDREF(newrt);
-		if (needlock)
-			RADIX_NODE_HEAD_RUNLOCK(rnh);
+		if (needlock) /* XXX we always need the lock for now! */
+			RADIX_NODE_HEAD_UNLOCK(rnh);
+		else
+			RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 		goto done;
+	}
+	if (needlock) /* XXX we always need the lock for now! */
+		RADIX_NODE_HEAD_UNLOCK(rnh);
+	else
+		RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 
-	} else if (needlock)
-		RADIX_NODE_HEAD_RUNLOCK(rnh);
-	
 	/*
 	 * Either we hit the root or couldn't find any match,
 	 * Which basically means
@@ -400,6 +405,157 @@
 }
 
 /*
+ * Lookup a destination in the routing table and
+ * report the next hop, interface and interface address
+ * in a new structure.
+ * Only read lock access on the routing table is required,
+ * individual routes are not locked.
+ * Returns 1 for entry found, 0 for not found.
+ */
+int
+rtlookup_fib(struct sockaddr *dst, u_int fibnum, struct rtlookup *rtl,
+       int flags)
+{
+        struct radix_node_head *rnh;
+       struct radix_node *rn;
+       struct rtentry *rt;
+       int ret = 0;
+       struct rm_priotracker tracker;
+
+       KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
+       if (dst->sa_family != AF_INET)  /* Only INET supports > 1 fib now */
+               fibnum = 0;
+       rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
+
+       /* Look up the address in the table for that Address Family. */
+       if (rnh == NULL) {
+               V_rtstat.rts_unreach++;
+               return (0);
+       }
+
+       RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
+       rn = rnh->rnh_matchaddr(dst, rnh);
+       if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+               rt = RNTORT(rn);
+
+               int rt_len = SA_SIZE( rt->rt_gateway);
+               int rtl_len = SA_SIZE( rtl->rt_gateway);
+               if( rt_len > rtl_len) {
+                       bcopy( &rt->rt_gateway, &rtl->rt_gateway, rtl_len);
+               } else {
+                       bcopy( &rt->rt_gateway, &rtl->rt_gateway, rt_len);
+               }
+               rtl->rt_ifp = rt->rt_ifp;
+               rtl->rt_ifa = rt->rt_ifa;
+               rtl->rt_rmx.rmx_mtu = rt->rt_rmx.rmx_mtu;
+               rtl->rt_rmx.rmx_expire = rt->rt_rmx.rmx_expire;
+               rtl->rt_flags = rt->rt_flags;
+               if (flags & RTL_PKSENT)
+                       rt->rt_rmx.rmx_pksent++;                /* racy but ok - XXX WHY?*/
+               ret = 1;
+       }
+       RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
+       return (ret);
+}
+
+#ifdef RADIX_MPATH
+/*
+ * Lookup a mpath destination in the routing table and
+ * report the next hop, interface and interface address
+ * in a new structure.
+ * Only read lock access on the routing table is required,
+ * individual routes are not locked.
+ * Returns 1 for entry found, 0 for not found.
+ */
+int
+rtlookup_mpath_fib(struct sockaddr *dst, u_int32_t hash, u_int fibnum,
+        struct rtlookup *rtl, int flags)
+{
+       struct radix_node_head *rnh;
+       struct radix_node *rn, *rn0;
+       struct rtentry *rt;
+       int ret = 0;
+       struct rm_priotracker tracker;
+       int64_t weight;
+       int64_t lowest_weight;
+       u_int32_t n = 0;
+
+       KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
+       if (dst->sa_family != AF_INET)  /* Only INET supports > 1 fib now */
+               fibnum = 0;
+       rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
+
+       /* Look up the address in the table for that Address Family. */
+       if (rnh == NULL) {
+               V_rtstat.rts_unreach++;
+               return (0);
+       }
+
+       RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
+       rn = rnh->rnh_matchaddr(dst, rnh);
+       if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+                /* we have a route - now do the mpath selection */
+               if (rn_mpath_next( rn) != NULL) { /* multipath */
+                       rn0 = rn;
+
+                       /* find lowest weight route */
+                       for ( rt = (struct rtentry *)rn, weight = rt->rt_rmx.rmx_weight;
+                                       rn != NULL; rn = rn_mpath_next( rn)) {
+                               rt = (struct rtentry *)rn;
+                               if(rt->rt_flags & RTF_UP) {
+                                       if (weight > rt->rt_rmx.rmx_weight) {
+                                               weight = rt->rt_rmx.rmx_weight;
+                                               n = 1;
+                                       } else if (weight == rt->rt_rmx.rmx_weight)
+                                               n++;
+                               }
+                       }
+                       lowest_weight = weight;
+
+                       /* select now one of the lowest weight routes */
+                       /* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */
+                       hash += hashjitter;
+                       hash %= n;
+                       for ( rn = rn0, n = 0; rn != NULL; rn = rn_mpath_next( rn)) {
+                               rt = (struct rtentry *)rn;
+                               if(rt->rt_flags & RTF_UP) {
+                                       if ( rt->rt_rmx.rmx_weight == lowest_weight) {
+                                               if (n == hash)
+                                                       break;
+                                               n++;
+                                       }
+                               }
+                       }
+
+                       /* gw selection has failed - there must be only zero weight routes */                   
+                       if (!rn)
+                               goto end;
+               } else
+                       rt = (struct rtentry *)rn;
+
+               int rt_len = SA_SIZE( rt->rt_gateway);
+               int rtl_len = SA_SIZE( rtl->rt_gateway);
+               if( rt_len > rtl_len) {
+                       bcopy( &rt->rt_gateway, &rtl->rt_gateway, rtl_len);
+               } else {
+                       bcopy( &rt->rt_gateway, &rtl->rt_gateway, rt_len);
+               }
+               rtl->rt_ifp = rt->rt_ifp;
+               rtl->rt_ifa = rt->rt_ifa;
+               rtl->rt_rmx.rmx_mtu = rt->rt_rmx.rmx_mtu;
+               rtl->rt_rmx.rmx_expire = rt->rt_rmx.rmx_expire;
+               rtl->rt_flags = rt->rt_flags;
+               if (flags & RTL_PKSENT)
+                       rt->rt_rmx.rmx_pksent++;                /* racy but ok - XXX WHY?*/
+               ret = 1;
+       }
+end:
+       RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
+       return (ret);
+}
+#endif
+
+/*
  * Remove a reference count from an rtentry.
  * If the count gets low enough, take it out of the routing table
  */
@@ -875,7 +1031,7 @@
 	 * Remove the item from the tree; it should be there,
 	 * but when callers invoke us blindly it may not (sigh).
 	 */
-	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
+	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh, NULL);
 	if (rn == NULL) {
 		error = ESRCH;
 		goto bad;
@@ -913,112 +1069,6 @@
 	return (error);
 }
 
-#ifdef RADIX_MPATH
-static int
-rn_mpath_update(int req, struct rt_addrinfo *info,
-    struct radix_node_head *rnh, struct rtentry **ret_nrt)
-{
-	/*
-	 * if we got multipath routes, we require users to specify
-	 * a matching RTAX_GATEWAY.
-	 */
-	struct rtentry *rt, *rto = NULL;
-	register struct radix_node *rn;
-	int error = 0;
-
-	rn = rnh->rnh_matchaddr(dst, rnh);
-	if (rn == NULL)
-		return (ESRCH);
-	rto = rt = RNTORT(rn);
-	rt = rt_mpath_matchgate(rt, gateway);
-	if (rt == NULL)
-		return (ESRCH);
-	/*
-	 * this is the first entry in the chain
-	 */
-	if (rto == rt) {
-		rn = rn_mpath_next((struct radix_node *)rt);
-		/*
-		 * there is another entry, now it's active
-		 */
-		if (rn) {
-			rto = RNTORT(rn);
-			RT_LOCK(rto);
-			rto->rt_flags |= RTF_UP;
-			RT_UNLOCK(rto);
-		} else if (rt->rt_flags & RTF_GATEWAY) {
-			/*
-			 * For gateway routes, we need to 
-			 * make sure that we we are deleting
-			 * the correct gateway. 
-			 * rt_mpath_matchgate() does not 
-			 * check the case when there is only
-			 * one route in the chain.  
-			 */
-			if (gateway &&
-			    (rt->rt_gateway->sa_len != gateway->sa_len ||
-				memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
-				error = ESRCH;
-			else {
-				/*
-				 * remove from tree before returning it
-				 * to the caller
-				 */
-				rn = rnh->rnh_deladdr(dst, netmask, rnh);
-				KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
-				goto gwdelete;
-			}
-			
-		}
-		/*
-		 * use the normal delete code to remove
-		 * the first entry
-		 */
-		if (req != RTM_DELETE) 
-			goto nondelete;
-
-		error = ENOENT;
-		goto done;
-	}
-		
-	/*
-	 * if the entry is 2nd and on up
-	 */
-	if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
-		panic ("rtrequest1: rt_mpath_deldup");
-gwdelete:
-	RT_LOCK(rt);
-	RT_ADDREF(rt);
-	if (req == RTM_DELETE) {
-		rt->rt_flags &= ~RTF_UP;
-		/*
-		 * One more rtentry floating around that is not
-		 * linked to the routing table. rttrash will be decremented
-		 * when RTFREE(rt) is eventually called.
-		 */
-		V_rttrash++;
-	}
-	
-nondelete:
-	if (req != RTM_DELETE)
-		panic("unrecognized request %d", req);
-	
-
-	/*
-	 * If the caller wants it, then it can have it,
-	 * but it's up to it to free the rtentry as we won't be
-	 * doing it.
-	 */
-	if (ret_nrt) {
-		*ret_nrt = rt;
-		RT_UNLOCK(rt);
-	} else
-		RTFREE_LOCKED(rt);
-done:
-	return (error);
-}
-#endif
-
 int
 rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
 				u_int fibnum)
@@ -1032,6 +1082,7 @@
 	register struct radix_node_head *rnh;
 	struct ifaddr *ifa;
 	struct sockaddr *ndst;
+	struct rm_priotracker tracker;
 #define senderr(x) { error = x ; goto bad; }
 
 	KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
@@ -1048,7 +1099,7 @@
 	if (needlock)
 		RADIX_NODE_HEAD_LOCK(rnh);
 	else
-		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
+		RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
 	/*
 	 * If we are adding a host route then we don't want to put
 	 * a netmask in the tree, nor do we want to clone it.
@@ -1058,28 +1109,30 @@
 
 	switch (req) {
 	case RTM_DELETE:
+		if ((rn = rnh->rnh_lookup(dst, netmask, rnh)) == NULL)
+			senderr(ESRCH);
+		rt = RNTORT(rn);
 #ifdef RADIX_MPATH
+		/*
+		 * if we got multipath routes, we require users to specify
+		 * a matching RTAX_GATEWAY.
+		 */
 		if (rn_mpath_capable(rnh)) {
-			error = rn_mpath_update(req, info, rnh, ret_nrt);
-			/*
-			 * "bad" holds true for the success case
-			 * as well
-			 */
-			if (error != ENOENT)
-				goto bad;
-			error = 0;
+			rt = rt_mpath_matchgate( rt, gateway);
+			rn = (struct radix_node *)rt;
+			if (!rt)
+				senderr(ESRCH);
 		}
 #endif
 		/*
 		 * Remove the item from the tree and return it.
 		 * Complain if it is not there and do no more processing.
 		 */
-		rn = rnh->rnh_deladdr(dst, netmask, rnh);
+		rn = rnh->rnh_deladdr(dst, netmask, rnh, rn);
 		if (rn == NULL)
 			senderr(ESRCH);
 		if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
 			panic ("rtrequest delete");
-		rt = RNTORT(rn);
 		RT_LOCK(rt);
 		RT_ADDREF(rt);
 		rt->rt_flags &= ~RTF_UP;
@@ -1285,6 +1338,8 @@
 bad:
 	if (needlock)
 		RADIX_NODE_HEAD_UNLOCK(rnh);
+	else
+		RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 	return (error);
 #undef senderr
 }
@@ -1308,7 +1363,9 @@
 #endif
 
 	RT_LOCK_ASSERT(rt);
+#ifdef INVARIANTS
 	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
+#endif
 	
 	/*
 	 * Prepare to store the gateway in rt->rt_gateway.
diff -u -r ../src_org_8.2_20110329/sys/net/route.h ./sys/net/route.h
--- ../src_org_8.2_20110329/sys/net/route.h	2010-04-02 05:12:46.000000000 +0000
+++ ./sys/net/route.h	2011-04-03 16:07:57.000000000 +0000
@@ -79,6 +79,39 @@
 };
 
 /*
+ * Metrics copied out of a route by a pure routing table lookup into a
+ * caller-provided structure (fast MTU access).
+ * Fakes struct rt_metrics_lite.
+ */
+struct rtlookup_metrics {
+       u_long  rmx_mtu;        /* MTU for this path */
+       u_long  rmx_expire;     /* XXX rearrange rt_metrics_lite */
+       u_long  rmx_pksent;     /* XXX faster than extra if? - remove? */
+};
+
+/*
+ * Caller-provided copy of the route fields needed after a pure routing
+ * table lookup.  Callers cast it to struct rtentry *, so the member
+ * layout has to match the start of struct rtentry.
+ */
+#ifndef RNF_NORMAL
+#include <net/radix.h>
+#ifdef RADIX_MPATH
+#include <net/radix_mpath.h>
+#endif
+#endif
+struct rtlookup {
+       struct  radix_node rt_nodes[2];         /* XXX rearrange rtentry and remove */
+       struct  sockaddr *rt_gateway;
+       int     rt_flags;
+       int     rt_refcnt;                      /* XXX rearrange rtentry and remove */
+       struct  ifnet *rt_ifp;
+       struct  ifaddr *rt_ifa;
+       struct  rtlookup_metrics rt_rmx;
+};
+#define        RTL_PKSENT      0x0001  /* increment packet sent counter */
+
+/*
  * rmx_rtt and rmx_rttvar are stored as microseconds;
  * RTTTOPRHZ(rtt) converts to a value suitable for use
  * by a protocol slowtimo counter.
@@ -123,12 +156,6 @@
  * gateways are marked so that the output routines know to address the
  * gateway rather than the ultimate destination.
  */
-#ifndef RNF_NORMAL
-#include <net/radix.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-#endif
 struct rtentry {
 	struct	radix_node rt_nodes[2];	/* tree glue, and other values */
 	/*
@@ -430,6 +457,10 @@
 void	 rtalloc_fib(struct route *ro, u_int fibnum);
 struct rtentry *rtalloc1_fib(struct sockaddr *, int, u_long, u_int);
 int	 rtioctl_fib(u_long, caddr_t, u_int);
+int    rtlookup_fib(struct sockaddr *, u_int, struct rtlookup *, int);
+#ifdef RADIX_MPATH
+int    rtlookup_mpath_fib(struct sockaddr *, u_int32_t, u_int, struct rtlookup *, int);
+#endif
 void	 rtredirect_fib(struct sockaddr *, struct sockaddr *,
 	    struct sockaddr *, int, struct sockaddr *, u_int);
 int	 rtrequest_fib(int, struct sockaddr *,
diff -u -r ../src_org_8.2_20110329/sys/net/rtsock.c ./sys/net/rtsock.c
--- ../src_org_8.2_20110329/sys/net/rtsock.c	2010-10-30 11:54:55.000000000 +0000
+++ ./sys/net/rtsock.c	2011-04-03 16:07:57.000000000 +0000
@@ -51,6 +51,7 @@
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
@@ -513,6 +514,7 @@
 	int len, error = 0;
 	struct ifnet *ifp = NULL;
 	union sockaddr_union saun;
+	struct rm_priotracker tracker;
 
 #define senderr(e) { error = e; goto flush;}
 	if (m == NULL || ((m->m_len < sizeof(long)) &&
@@ -643,11 +645,11 @@
 		    info.rti_info[RTAX_DST]->sa_family);
 		if (rnh == NULL)
 			senderr(EAFNOSUPPORT);
-		RADIX_NODE_HEAD_RLOCK(rnh);
+		RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
 		rt = (struct rtentry *) rnh->rnh_lookup(info.rti_info[RTAX_DST],
 			info.rti_info[RTAX_NETMASK], rnh);
 		if (rt == NULL) {	/* XXX looks bogus */
-			RADIX_NODE_HEAD_RUNLOCK(rnh);
+			RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 			senderr(ESRCH);
 		}
 #ifdef RADIX_MPATH
@@ -663,7 +665,7 @@
 		    (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) {
 			rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]);
 			if (!rt) {
-				RADIX_NODE_HEAD_RUNLOCK(rnh);
+				RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 				senderr(ESRCH);
 			}
 		}
@@ -695,13 +697,13 @@
 			 */
 			rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, rnh);
 			if (rt == NULL) {
-				RADIX_NODE_HEAD_RUNLOCK(rnh);
+				RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 				senderr(ESRCH);
 			}
 		} 
 		RT_LOCK(rt);
 		RT_ADDREF(rt);
-		RADIX_NODE_HEAD_RUNLOCK(rnh);
+		RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 
 		/* 
 		 * Fix for PR: 82974
diff -u -r ../src_org_8.2_20110329/sys/netinet/icmp_var.h ./sys/netinet/icmp_var.h
--- ../src_org_8.2_20110329/sys/netinet/icmp_var.h	2009-08-03 08:13:06.000000000 +0000
+++ ./sys/netinet/icmp_var.h	2011-04-03 16:07:57.000000000 +0000
@@ -102,7 +102,11 @@
 #define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */
 #define BANDLIM_RST_OPENPORT 4   /* No connection, listener */
 #define BANDLIM_ICMP6_UNREACH 5
-#define BANDLIM_MAX 5
+#define BANDLIM_ICMP_FWD_UNREACH 6 /* forwarding: limit unreachable */
+#define BANDLIM_ICMP_FWD_TIMXCEED 7 /* forwarding: limit time-exceeded */
+#define BANDLIM_ICMP_FWD_NEEDFRAG 8 /* forwarding: limit need-frag */
+#define BANDLIM_ICMP_FWD_FILTER 9 /* forwarding: limit admin-prohib */
+#define BANDLIM_MAX 9
 #endif
 
 #endif
diff -u -r ../src_org_8.2_20110329/sys/netinet/in.c ./sys/netinet/in.c
--- ../src_org_8.2_20110329/sys/netinet/in.c	2011-01-12 20:44:11.000000000 +0000
+++ ./sys/netinet/in.c	2011-04-03 16:07:57.000000000 +0000
@@ -1392,12 +1392,42 @@
 in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr)
 {
 	struct rtentry *rt;
+#ifdef RADIX_MPATH
+	int64_t weight;
+	struct rtentry *rt0;
+	int32_t found = 0;
+#endif
 
 	KASSERT(l3addr->sa_family == AF_INET,
 	    ("sin_family %d", l3addr->sa_family));
 
 	/* XXX rtalloc1 should take a const param */
 	rt = rtalloc1(__DECONST(struct sockaddr *, l3addr), 0, 0);
+#ifdef RADIX_MPATH
+	rt0 = rt;
+	if ((rt != NULL) && ( rn_mpath_next((struct radix_node *)rt) != NULL)) {
+		/* check if there are other, matching routes */
+		/* find lowest weight route */
+		for ( weight = rt->rt_rmx.rmx_weight; rt != NULL; rt = (struct rtentry *)rn_mpath_next( (struct radix_node *)rt)) {
+			if(rt->rt_flags & RTF_UP) {
+				if (weight > rt->rt_rmx.rmx_weight)
+					weight = rt->rt_rmx.rmx_weight;
+			}
+		}
+
+		/* find now one non gateway route with lowest weight */
+		for ( rt = rt0; rt != NULL; rt = (struct rtentry *)rn_mpath_next( (struct radix_node *)rt)) {
+			if(rt->rt_flags & RTF_UP) {
+				if ((weight == rt->rt_rmx.rmx_weight) && !(rt->rt_flags & RTF_GATEWAY)) {
+					found = 1;
+					 break;
+				}
+			}
+		}
+		if (found == 0)
+			rt = NULL;
+	}
+#endif
 	if (rt == NULL || (!(flags & LLE_PUB) &&
 			   ((rt->rt_flags & RTF_GATEWAY) || 
 			    (rt->rt_ifp != ifp)))) {
@@ -1405,11 +1435,20 @@
 		log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n",
 		    inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr));
 #endif
+#ifdef RADIX_MPATH
+		if (rt0 != NULL)
+			RTFREE_LOCKED(rt0);
+#else
 		if (rt != NULL)
 			RTFREE_LOCKED(rt);
+#endif
 		return (EINVAL);
 	}
+#ifdef RADIX_MPATH
+	RTFREE_LOCKED(rt0);
+#else
 	RTFREE_LOCKED(rt);
+#endif
 	return 0;
 }
 
diff -u -r ../src_org_8.2_20110329/sys/netinet/in_rmx.c ./sys/netinet/in_rmx.c
--- ../src_org_8.2_20110329/sys/netinet/in_rmx.c	2010-10-11 11:25:37.000000000 +0000
+++ ./sys/netinet/in_rmx.c	2011-04-03 16:07:57.000000000 +0000
@@ -51,6 +51,8 @@
 #include <sys/mbuf.h>
 #include <sys/syslog.h>
 #include <sys/callout.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/route.h>
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_fastfwd.c ./sys/netinet/ip_fastfwd.c
--- ../src_org_8.2_20110329/sys/netinet/ip_fastfwd.c	2010-12-10 14:06:50.000000000 +0000
+++ ./sys/netinet/ip_fastfwd.c	2011-04-05 01:13:41.000000000 +0000
@@ -94,6 +94,9 @@
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
+#ifdef RADIX_MPATH
+#include <net/radix_mpath.h>
+#endif
 #include <net/vnet.h>
 
 #include <netinet/in.h>
@@ -102,6 +105,7 @@
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
+#include <netinet/icmp_var.h>
 #include <netinet/ip_options.h>
 
 #include <machine/in_cksum.h>
@@ -113,7 +117,11 @@
     &VNET_NAME(ipfastforward_active), 0, "Enable fast IP forwarding");
 
 static struct sockaddr_in *
-ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m)
+#ifdef RADIX_MPATH
+ip_findroute(struct route *ro, uint32_t hash, struct in_addr dest, struct mbuf *m, struct rtlookup *rtl)
+#else
+ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m, struct rtlookup *rtl)
+#endif
 {
 	struct sockaddr_in *dst;
 	struct rtentry *rt;
@@ -126,7 +134,17 @@
 	dst->sin_family = AF_INET;
 	dst->sin_len = sizeof(*dst);
 	dst->sin_addr.s_addr = dest.s_addr;
-	in_rtalloc_ign(ro, 0, M_GETFIB(m));
+
+	rtl->rt_gateway = (struct sockaddr *)dst;
+#ifdef RADIX_MPATH
+	if (!rtlookup_mpath_fib((struct sockaddr *)dst,
+			hash, M_GETFIB(m),  rtl, RTL_PKSENT))
+#else
+	if (!rtlookup_fib( (struct sockaddr *)dst, M_GETFIB(m), rtl, RTL_PKSENT))
+#endif
+		ro->ro_rt = NULL;
+	else
+		ro->ro_rt = (struct rtentry *)rtl;
 
 	/*
 	 * Route there and interface still up?
@@ -140,9 +158,10 @@
 	} else {
 		IPSTAT_INC(ips_noroute);
 		IPSTAT_INC(ips_cantforward);
-		if (rt)
-			RTFREE(rt);
-		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+		if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+			m_freem(m);
+		else
+			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		return NULL;
 	}
 	return dst;
@@ -167,6 +186,7 @@
 	u_short sum, ip_len;
 	int error = 0;
 	int hlen, mtu;
+	struct rtlookup rtl;
 #ifdef IPFIREWALL_FORWARD
 	struct m_tag *fwd_tag;
 #endif
@@ -299,8 +319,11 @@
 		if (ip_doopts == 1)
 			return m;
 		else if (ip_doopts == 2) {
-			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB,
-				0, 0);
+			if (badport_bandlim(BANDLIM_ICMP_FWD_FILTER) < 0)
+				m_freem(m);
+			else
+				icmp_error(m, ICMP_UNREACH,
+					ICMP_UNREACH_FILTER_PROHIB, 0, 0);
 			return NULL;	/* mbuf already free'd */
 		}
 		/* else ignore IP options and continue */
@@ -399,7 +422,11 @@
 	if (!V_ipstealth) {
 #endif
 	if (ip->ip_ttl <= IPTTLDEC) {
-		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
+		if (badport_bandlim(BANDLIM_ICMP_FWD_TIMXCEED) < 0)
+			m_freem(m);
+		else
+			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
+				0, 0);
 		return NULL;	/* mbuf already free'd */
 	}
 
@@ -420,7 +447,13 @@
 	/*
 	 * Find route to destination.
 	 */
-	if ((dst = ip_findroute(&ro, dest, m)) == NULL)
+#ifdef RADIX_MPATH
+	if ((dst = ip_findroute(&ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
+			dest, m, &rtl)) == NULL)
+#else
+	if ((dst = ip_findroute(&ro,
+			dest, m, &rtl)) == NULL)
+#endif
 		return NULL;	/* icmp unreach already sent */
 	ifp = ro.ro_rt->rt_ifp;
 
@@ -476,8 +509,6 @@
 			 * "ours"-label.
 			 */
 			m->m_flags |= M_FASTFWD_OURS;
-			if (ro.ro_rt)
-				RTFREE(ro.ro_rt);
 			return m;
 		}
 		/*
@@ -490,8 +521,7 @@
 			m_tag_delete(m, fwd_tag);
 		}
 #endif /* IPFIREWALL_FORWARD */
-		RTFREE(ro.ro_rt);
-		if ((dst = ip_findroute(&ro, dest, m)) == NULL)
+		if ((dst = ip_findroute(&ro, dest, m, &rtl)) == NULL)
 			return NULL;	/* icmp unreach already sent */
 		ifp = ro.ro_rt->rt_ifp;
 	}
@@ -507,6 +537,8 @@
 	if ((ro.ro_rt->rt_flags & RTF_REJECT) &&
 	    (ro.ro_rt->rt_rmx.rmx_expire == 0 ||
 	    time_uptime < ro.ro_rt->rt_rmx.rmx_expire)) {
+		if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+			goto drop;
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		goto consumed;
 	}
@@ -527,6 +559,8 @@
 	 * Check if media link state of interface is not down
 	 */
 	if (ifp->if_link_state == LINK_STATE_DOWN) {
+		if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+			goto drop;
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		goto consumed;
 	}
@@ -557,8 +591,9 @@
 		 */
 		if (ip->ip_off & IP_DF) {
 			IPSTAT_INC(ips_cantfrag);
-			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
-				0, mtu);
+			if (badport_bandlim(BANDLIM_ICMP_FWD_NEEDFRAG) < 0)
+				goto drop;
+			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu);
 			goto consumed;
 		} else {
 			/*
@@ -606,12 +641,9 @@
 		IPSTAT_INC(ips_fastforward);
 	}
 consumed:
-	RTFREE(ro.ro_rt);
 	return NULL;
 drop:
 	if (m)
 		m_freem(m);
-	if (ro.ro_rt)
-		RTFREE(ro.ro_rt);
 	return NULL;
 }
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_icmp.c ./sys/netinet/ip_icmp.c
--- ../src_org_8.2_20110329/sys/netinet/ip_icmp.c	2010-09-21 22:33:30.000000000 +0000
+++ ./sys/netinet/ip_icmp.c	2011-04-04 23:01:57.000000000 +0000
@@ -958,7 +958,11 @@
 		{ "icmp tstamp response" },
 		{ "closed port RST response" },
 		{ "open port RST response" },
-		{ "icmp6 unreach response" }
+		{ "icmp6 unreach response" },
+		{ "forwarding: limit unreachable" },
+		{ "forwarding: limit time-exceeded" },
+		{ "forwarding: limit need-frag" },
+		{ "forwarding: limit admin-prohib" }
 	};
 
 	/*
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_input.c ./sys/netinet/ip_input.c
--- ../src_org_8.2_20110329/sys/netinet/ip_input.c	2011-03-28 15:26:52.000000000 +0000
+++ ./sys/netinet/ip_input.c	2011-04-05 01:14:00.000000000 +0000
@@ -71,6 +71,7 @@
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_icmp.h>
+#include <netinet/icmp_var.h>
 #include <netinet/ip_options.h>
 #include <machine/in_cksum.h>
 #include <netinet/ip_carp.h>
@@ -1348,20 +1349,22 @@
 	struct route sro;
 	struct sockaddr_in *sin;
 	struct in_ifaddr *ia;
+	struct sockaddr_in *lu_dst;
+	struct rtlookup rtl;
 
 	bzero(&sro, sizeof(sro));
 	sin = (struct sockaddr_in *)&sro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = dst;
-	in_rtalloc_ign(&sro, 0, fibnum);
-
-	if (sro.ro_rt == NULL)
+	lu_dst = (struct sockaddr_in *)&sro.ro_dst;
+	rtl.rt_gateway = (struct sockaddr *)lu_dst;
+	if (!rtlookup_fib( (struct sockaddr *)lu_dst,
+			fibnum, &rtl, 0))
 		return (NULL);
 
-	ia = ifatoia(sro.ro_rt->rt_ifa);
+	ia = ifatoia(rtl.rt_ifa);
 	ifa_ref(&ia->ia_ifa);
-	RTFREE(sro.ro_rt);
 	return (ia);
 }
 
@@ -1397,6 +1400,9 @@
 	struct in_addr dest;
 	struct route ro;
 	int error, type = 0, code = 0, mtu = 0;
+	struct rtlookup rtl;
+	struct sockaddr_in *dst;
+	int icmp_send = 0;
 
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
 		IPSTAT_INC(ips_cantforward);
@@ -1407,8 +1413,11 @@
 	if (!V_ipstealth) {
 #endif
 		if (ip->ip_ttl <= IPTTLDEC) {
-			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
-			    0, 0);
+			if (badport_bandlim(BANDLIM_ICMP_FWD_TIMXCEED) < 0)
+				m_freem(m);
+			else
+				icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
+					0, 0);
 			return;
 		}
 #ifdef IPSTEALTH
@@ -1423,7 +1432,10 @@
 	 * ip_output in case of outgoing IPsec policy.
 	 */
 	if (!srcrt && ia == NULL) {
-		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+		if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+			m_freem(m);
+		else
+			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		return;
 	}
 #endif
@@ -1488,7 +1500,13 @@
 		sin->sin_family = AF_INET;
 		sin->sin_len = sizeof(*sin);
 		sin->sin_addr = ip->ip_dst;
-		in_rtalloc_ign(&ro, 0, M_GETFIB(m));
+		dst = (struct sockaddr_in *)&ro.ro_dst;
+		rtl.rt_gateway = (struct sockaddr *)dst;
+		if (!rtlookup_fib( (struct sockaddr *)dst,
+				M_GETFIB(m), &rtl, 0))
+			ro.ro_rt = NULL;
+		else
+			ro.ro_rt = (struct rtentry *)&rtl;
 
 		rt = ro.ro_rt;
 
@@ -1508,8 +1526,6 @@
 				code = ICMP_REDIRECT_HOST;
 			}
 		}
-		if (rt)
-			RTFREE(rt);
 	}
 
 	/*
@@ -1522,8 +1538,6 @@
 
 	if (error == EMSGSIZE && ro.ro_rt)
 		mtu = ro.ro_rt->rt_rmx.rmx_mtu;
-	if (ro.ro_rt)
-		RTFREE(ro.ro_rt);
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);
@@ -1558,11 +1572,13 @@
 	default:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_HOST;
+		icmp_send = badport_bandlim( BANDLIM_ICMP_FWD_UNREACH);
 		break;
 
 	case EMSGSIZE:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_NEEDFRAG;
+		icmp_send = badport_bandlim( BANDLIM_ICMP_FWD_NEEDFRAG);
 
 #ifdef IPSEC
 		/* 
@@ -1618,7 +1634,10 @@
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
-	icmp_error(mcopy, type, code, dest.s_addr, mtu);
+	if (icmp_send < 0)
+		m_freem(mcopy);	/* rate-limited: drop the ICMP copy */
+	else
+		icmp_error(mcopy, type, code, dest.s_addr, mtu);
 }
 
 void
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_output.c ./sys/netinet/ip_output.c
--- ../src_org_8.2_20110329/sys/netinet/ip_output.c	2010-10-25 13:16:11.000000000 +0000
+++ ./sys/netinet/ip_output.c	2011-04-05 01:15:32.000000000 +0000
@@ -128,6 +128,7 @@
 	struct in_ifaddr *ia = NULL;
 	int isbroadcast, sw_csum;
 	struct route iproute;
+	struct rtlookup rtl;
 	struct rtentry *rte;	/* cache for ro->ro_rt */
 	struct in_addr odst;
 #ifdef IPFIREWALL_FORWARD
@@ -271,16 +272,24 @@
 		 * operation (as it is for ARP).
 		 */
 		if (rte == NULL) {
+			rtl.rt_gateway = (struct sockaddr *)dst;
 #ifdef RADIX_MPATH
-			rtalloc_mpath_fib(ro,
-			    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
-			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+			if (!rtlookup_mpath_fib((struct sockaddr *)dst,
+					ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
+					inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m),
+					&rtl, RTL_PKSENT))
 #else
-			in_rtalloc_ign(ro, 0,
-			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+			if (!rtlookup_fib( (struct sockaddr *)dst,
+					inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m),
+					&rtl, RTL_PKSENT))
 #endif
-			rte = ro->ro_rt;
+				ro->ro_rt = NULL;
+			else {
+				nortfree = 1;
+				ro->ro_rt = (struct rtentry *)&rtl;
+			}
 		}
+		rte = ro->ro_rt;
 		if (rte == NULL ||
 		    rte->rt_ifp == NULL ||
 		    !RT_LINK_IS_UP(rte->rt_ifp)) {
diff -u -r ../src_org_8.2_20110329/sys/netinet/ipfw/ip_fw_table.c ./sys/netinet/ipfw/ip_fw_table.c
--- ../src_org_8.2_20110329/sys/netinet/ipfw/ip_fw_table.c	2010-03-23 09:58:59.000000000 +0000
+++ ./sys/netinet/ipfw/ip_fw_table.c	2011-04-03 16:07:57.000000000 +0000
@@ -137,7 +137,7 @@
 	mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
 	sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
 	IPFW_WLOCK(ch);
-	ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
+	ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh, NULL);
 	if (ent == NULL) {
 		IPFW_WUNLOCK(ch);
 		return (ESRCH);
@@ -154,7 +154,7 @@
 	struct table_entry *ent;
 
 	ent = (struct table_entry *)
-	    rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
+	    rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh, NULL);
 	if (ent != NULL)
 		free(ent, M_IPFW_TBL);
 	return (0);
diff -u -r ../src_org_8.2_20110329/sys/netinet/raw_ip.c ./sys/netinet/raw_ip.c
--- ../src_org_8.2_20110329/sys/netinet/raw_ip.c	2011-04-02 14:45:13.000000000 +0000
+++ ./sys/netinet/raw_ip.c	2011-04-03 16:07:57.000000000 +0000
@@ -755,6 +755,8 @@
 		if (err == 0)
 			ia->ia_flags |= IFA_ROUTE;
 		err = ifa_add_loopback_route((struct ifaddr *)ia, sa);
+		if (err == 0)
+		        ia->ia_flags |= IFA_RTSELF;
 		ifa_free(&ia->ia_ifa);
 		break;
 	}
diff -u -r ../src_org_8.2_20110329/sys/netinet6/in6_ifattach.c ./sys/netinet6/in6_ifattach.c
--- ../src_org_8.2_20110329/sys/netinet6/in6_ifattach.c	2010-05-06 06:44:19.000000000 +0000
+++ ./sys/netinet6/in6_ifattach.c	2011-04-03 16:07:57.000000000 +0000
@@ -42,6 +42,8 @@
 #include <sys/proc.h>
 #include <sys/syslog.h>
 #include <sys/md5.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
diff -u -r ../src_org_8.2_20110329/sys/netinet6/in6_rmx.c ./sys/netinet6/in6_rmx.c
--- ../src_org_8.2_20110329/sys/netinet6/in6_rmx.c	2010-10-11 11:25:37.000000000 +0000
+++ ./sys/netinet6/in6_rmx.c	2011-04-03 16:07:57.000000000 +0000
@@ -87,6 +87,7 @@
 #include <sys/rwlock.h>
 #include <sys/syslog.h>
 #include <sys/callout.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/route.h>
diff -u -r ../src_org_8.2_20110329/sys/netinet6/in6_src.c ./sys/netinet6/in6_src.c
--- ../src_org_8.2_20110329/sys/netinet6/in6_src.c	2010-05-06 06:44:19.000000000 +0000
+++ ./sys/netinet6/in6_src.c	2011-04-05 01:14:24.000000000 +0000
@@ -796,15 +796,253 @@
 
 /*
  * clone - meaningful only for bsdi and freebsd
+ * XXX remove and do the lookup directly in ip6_output
  */
 int
 in6_selectroute(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
-    struct ip6_moptions *mopts, struct route_in6 *ro,
+    struct ip6_moptions *mopts, struct route_in6 *ro, struct rtlookup *rtl,
     struct ifnet **retifp, struct rtentry **retrt)
 {
+       int error = 0;
+       struct ifnet *ifp = NULL;
+       struct rtentry *rt = NULL;
+       struct sockaddr_in6 *sin6_next;
+       struct in6_pktinfo *pi = NULL;
+       struct in6_addr *dst = &dstsock->sin6_addr;
+       struct sockaddr_in6 *lu_dst;
+       int norouteok = 0;
+#if 0
+       char ip6buf[INET6_ADDRSTRLEN];
+
+       if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
+           dstsock->sin6_addr.s6_addr32[1] == 0 &&
+           !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
+               printf("in6_selectroute: strange destination %s\n",
+                      ip6_sprintf(ip6buf, &dstsock->sin6_addr));
+       } else {
+               printf("in6_selectroute: destination = %s%%%d\n",
+                      ip6_sprintf(ip6buf, &dstsock->sin6_addr),
+                      dstsock->sin6_scope_id); /* for debug */
+       }
+#endif
+
+       /* If the caller specify the outgoing interface explicitly, use it. */
+       if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
+               /* XXX boundary check is assumed to be already done. */
+               ifp = ifnet_byindex(pi->ipi6_ifindex);
+               if (ifp != NULL &&
+                   (norouteok || retrt == NULL ||
+                   IN6_IS_ADDR_MULTICAST(dst))) {
+                       /*
+                        * we do not have to check or get the route for
+                        * multicast.
+                        */
+                       goto done;
+               } else
+                       goto getroute;
+       }
+
+       /*
+        * If the destination address is a multicast address and the outgoing
+        * interface for the address is specified by the caller, use it.
+        */
+       if (IN6_IS_ADDR_MULTICAST(dst) &&
+           mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) {
+               goto done; /* we do not need a route for multicast. */
+       }
+
+  getroute:
+       /*
+        * If the next hop address for the packet is specified by the caller,
+        * use it as the gateway.
+        */
+       if (opts && opts->ip6po_nexthop) {
+               struct route_in6 *ron;
+               struct llentry *la;
+           
+               sin6_next = satosin6(opts->ip6po_nexthop);
+               
+               /* at this moment, we only support AF_INET6 next hops */
+               if (sin6_next->sin6_family != AF_INET6) {
+                       error = EAFNOSUPPORT; /* or should we proceed? */
+                       goto done;
+               }
+
+               /*
+                * If the next hop is an IPv6 address, then the node identified
+                * by that address must be a neighbor of the sending host.
+                */
+               ron = &opts->ip6po_nextroute;
+               /*
+                * XXX what do we do here?
+                * PLZ to be fixing
+                */
+
+               if (ron->ro_rt == NULL) {
+                       lu_dst = (struct sockaddr_in6 *)&ron->ro_dst; 
+                       rtl->rt_gateway = (struct sockaddr *)lu_dst;
+                       if (!rtlookup_fib( (struct sockaddr *)lu_dst, 0U,
+                                       rtl, RTL_PKSENT)) {
+                               ron->ro_rt = NULL;
+                               error = EHOSTUNREACH;
+                               goto done;
+                       } else
+                               ron->ro_rt = (struct rtentry *) rtl;
+               }
+
+               rt = ron->ro_rt;
+               ifp = rt->rt_ifp;
+               IF_AFDATA_LOCK(ifp);
+               la = lla_lookup(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6_next->sin6_addr);
+               IF_AFDATA_UNLOCK(ifp);
+               if (la != NULL) 
+                       LLE_RUNLOCK(la);
+               else {
+                       error = EHOSTUNREACH;
+                       goto done;
+               }
+#if 0
+               if ((ron->ro_rt &&
+                   (ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) !=
+                   (RTF_UP | RTF_LLINFO)) ||
+                   !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr,
+                   &sin6_next->sin6_addr)) {
+                       if (ron->ro_rt)
+                               ron->ro_rt = NULL;
+                       *satosin6(&ron->ro_dst) = *sin6_next;
+               }
+               if (ron->ro_rt == NULL) {
+                       lu_dst = (struct sockaddr_in6 *)&ron->ro_dst;
+                       rtl->rt_gateway = (struct sockaddr *)lu_dst;
+                       if (!rtlookup_fib( (struct sockaddr *)lu_dst, 0U,
+                                       rtl, RTL_PKSENT)) {
+                               ron->ro_rt = NULL;
+                               error = EHOSTUNREACH;
+                               goto done;
+                       } else {
+                               ron->ro_rt = (struct rtentry *) rtl;
+                               if (!(ron->ro_rt->rt_flags & RTF_LLINFO)) {
+                                       ron->ro_rt = NULL;
+                                       error = EHOSTUNREACH;
+                                       goto done;
+                                }
+                       }
+               }
+#endif
+
+               /*
+                * When cloning is required, try to allocate a route to the
+                * destination so that the caller can store path MTU
+                * information.
+                */
+               goto done;
+       }
+
+       /*
+        * Use a cached route if it exists and is valid, else try to allocate
+        * a new one.  Note that we should check the address family of the
+        * cached destination, in case of sharing the cache with IPv4.
+        */
+       if (ro) {
+               if (ro->ro_rt &&
+                   (!(ro->ro_rt->rt_flags & RTF_UP) ||
+                    ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 ||
+                    !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr,
+                    dst)))
+                       ro->ro_rt = (struct rtentry *)NULL;
+               if (ro->ro_rt == (struct rtentry *)NULL) {
+                       struct sockaddr_in6 *sa6;
+
+                       /* No route yet, so try to acquire one */
+                       bzero(&ro->ro_dst, sizeof(struct sockaddr_in6));
+                       sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
+                       *sa6 = *dstsock;
+                       sa6->sin6_scope_id = 0;
+
+                       lu_dst = (struct sockaddr_in6 *)&ro->ro_dst; 
+                       rtl->rt_gateway = (struct sockaddr *)lu_dst;
+#ifdef RADIX_MPATH
+                       if (!rtlookup_mpath_fib((struct sockaddr *)lu_dst,
+                                       ntohl(sa6->sin6_addr.s6_addr32[3]),
+                                       0U, rtl, RTL_PKSENT))
+#else
+                       if (!rtlookup_fib((struct sockaddr *)lu_dst, 0U,
+                                       rtl, RTL_PKSENT))
+#endif
+                               ro->ro_rt = NULL;
+                       else
+                               ro->ro_rt = (struct rtentry *) rtl;
+               }
+                               
+               /*
+                * do not care about the result if we have the nexthop
+                * explicitly specified.
+                */
+               if (opts && opts->ip6po_nexthop)
+                       goto done;
+
+               if (ro->ro_rt) {
+                       ifp = ro->ro_rt->rt_ifp;
+
+                       if (ifp == NULL) { /* can this really happen? */
+                               ro->ro_rt = NULL;
+                       }
+               }
+               if (ro->ro_rt == NULL)
+                       error = EHOSTUNREACH;
+               rt = ro->ro_rt;
+
+               /*
+                * Check if the outgoing interface conflicts with
+                * the interface specified by ipi6_ifindex (if specified).
+                * Note that loopback interface is always okay.
+                * (this may happen when we are sending a packet to one of
+                *  our own addresses.)
+                */
+               if (ifp && opts && opts->ip6po_pktinfo &&
+                   opts->ip6po_pktinfo->ipi6_ifindex) {
+                       if (!(ifp->if_flags & IFF_LOOPBACK) &&
+                           ifp->if_index !=
+                           opts->ip6po_pktinfo->ipi6_ifindex) {
+                               error = EHOSTUNREACH;
+                               goto done;
+                       }
+               }
+       }
+
+  done:
+       if (ifp == NULL && rt == NULL) {
+               /*
+                * This can happen if the caller did not pass a cached route
+                * nor any other hints.  We treat this case an error.
+                */
+               error = EHOSTUNREACH;
+       }
+       if (error == EHOSTUNREACH)
+               V_ip6stat.ip6s_noroute++;
+
+       if (retifp != NULL) {
+               *retifp = ifp;
+
+               /*
+                * Adjust the "outgoing" interface.  If we're going to loop 
+                * the packet back to ourselves, the ifp would be the loopback 
+                * interface. However, we'd rather know the interface associated 
+                * to the destination address (which should probably be one of 
+                * our own addresses.)
+                */
+               if (rt) {
+                       if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) &&
+                           (rt->rt_gateway->sa_family == AF_LINK))
+                               *retifp = 
+                                       ifnet_byindex(((struct sockaddr_dl *)
+                                                      rt->rt_gateway)->sdl_index);
+               }
+       }
+       if (retrt != NULL)
+               *retrt = rt;    /* rt may be NULL */
 
-	return (selectroute(dstsock, opts, mopts, ro, retifp,
-	    retrt, 0));
+       return (error);
 }
 
 /*
diff -u -r ../src_org_8.2_20110329/sys/netinet6/ip6_forward.c ./sys/netinet6/ip6_forward.c
--- ../src_org_8.2_20110329/sys/netinet6/ip6_forward.c	2010-02-07 09:00:22.000000000 +0000
+++ ./sys/netinet6/ip6_forward.c	2011-04-05 01:14:58.000000000 +0000
@@ -99,6 +99,7 @@
 	struct ifnet *origifp;	/* maybe unnecessary */
 	u_int32_t inzone, outzone;
 	struct in6_addr src_in6, dst_in6;
+	struct rtlookup rtl;
 #ifdef IPSEC
 	struct secpolicy *sp = NULL;
 	int ipsecrt = 0;
@@ -352,18 +353,27 @@
 	dst->sin6_family = AF_INET6;
 	dst->sin6_addr = ip6->ip6_dst;
 
-	rin6.ro_rt = rtalloc1((struct sockaddr *)dst, 0, 0);
-	if (rin6.ro_rt != NULL)
-		RT_UNLOCK(rin6.ro_rt);
-	else {
+	rtl.rt_gateway = (struct sockaddr *)dst;
+#ifdef RADIX_MPATH
+	src_in6 = ip6->ip6_src;
+	dst_in6 = ip6->ip6_dst;
+	if (!rtlookup_mpath_fib((struct sockaddr *)dst,
+			ntohl(src_in6->sin6_addr.s6_addr32[3] ^ dst_in6->sin6_addr.s6_addr32[3]),
+			0U, &rtl, RTL_PKSENT)) {
+#else
+	if (!rtlookup_fib( (struct sockaddr *)dst, 0U, &rtl,
+			RTL_PKSENT)) {
+#endif
+		rin6.ro_rt = NULL;
 		V_ip6stat.ip6s_noroute++;
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute);
 		if (mcopy) {
 			icmp6_error(mcopy, ICMP6_DST_UNREACH,
-			ICMP6_DST_UNREACH_NOROUTE, 0);
+				ICMP6_DST_UNREACH_NOROUTE, 0);
 		}
 		goto bad;
-	}
+	} else
+		rin6.ro_rt = (struct rtentry *) &rtl;
 	rt = rin6.ro_rt;
 #ifdef IPSEC
 skip_routing:
@@ -580,12 +590,12 @@
 
 senderr:
 	if (mcopy == NULL)
-		goto out;
+		return;
 	switch (error) {
 	case 0:
 		if (type == ND_REDIRECT) {
 			icmp6_redirect_output(mcopy, rt);
-			goto out;
+			return;
 		}
 		goto freecopy;
 
diff -u -r ../src_org_8.2_20110329/sys/netinet6/ip6_output.c ./sys/netinet6/ip6_output.c
--- ../src_org_8.2_20110329/sys/netinet6/ip6_output.c	2010-10-25 13:16:11.000000000 +0000
+++ ./sys/netinet6/ip6_output.c	2011-04-03 16:07:57.000000000 +0000
@@ -200,7 +200,7 @@
 	int hlen, tlen, len, off;
 	struct route_in6 ip6route;
 	struct rtentry *rt = NULL;
-	struct sockaddr_in6 *dst, src_sa, dst_sa;
+	struct sockaddr_in6 *dst, src_sa, dst_sa, dst_lookup;
 	struct in6_addr odst;
 	int error = 0;
 	struct in6_ifaddr *ia = NULL;
@@ -213,6 +213,7 @@
 	struct route_in6 *ro_pmtu = NULL;
 	int hdrsplit = 0;
 	int needipsec = 0;
+	struct rtlookup rtl;
 #ifdef SCTP
 	int sw_csum;
 #endif
@@ -572,11 +573,11 @@
 	/* adjust pointer */
 	ip6 = mtod(m, struct ip6_hdr *);
 
-	bzero(&dst_sa, sizeof(dst_sa));
-	dst_sa.sin6_family = AF_INET6;
-	dst_sa.sin6_len = sizeof(dst_sa);
-	dst_sa.sin6_addr = ip6->ip6_dst;
-	if ((error = in6_selectroute(&dst_sa, opt, im6o, ro,
+	bzero(&dst_lookup, sizeof(dst_lookup));
+	dst_lookup.sin6_family = AF_INET6;
+	dst_lookup.sin6_len = sizeof(dst_lookup);
+	dst_lookup.sin6_addr = ip6->ip6_dst;
+	if ((error = in6_selectroute(&dst_lookup, opt, im6o, ro, &rtl,
 	    &ifp, &rt)) != 0) {
 		switch (error) {
 		case EHOSTUNREACH:
@@ -595,7 +596,7 @@
 		 * If in6_selectroute() does not return a route entry,
 		 * dst may not have been updated.
 		 */
-		*dst = dst_sa;	/* XXX */
+		*dst = dst_lookup;      /* XXX */
 	}
 
 	/*
@@ -1071,11 +1072,6 @@
 		V_ip6stat.ip6s_fragmented++;
 
 done:
-	if (ro == &ip6route && ro->ro_rt) { /* brace necessary for RTFREE */
-		RTFREE(ro->ro_rt);
-	} else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) {
-		RTFREE(ro_pmtu->ro_rt);
-	}
 #ifdef IPSEC
 	if (sp != NULL)
 		KEY_FREESP(&sp);
diff -u -r ../src_org_8.2_20110329/sys/netinet6/ip6_var.h ./sys/netinet6/ip6_var.h
--- ../src_org_8.2_20110329/sys/netinet6/ip6_var.h	2010-09-09 06:43:18.000000000 +0000
+++ ./sys/netinet6/ip6_var.h	2011-04-03 16:07:57.000000000 +0000
@@ -431,12 +431,13 @@
 int	dest6_input __P((struct mbuf **, int *, int));
 int	none_input __P((struct mbuf **, int *, int));
 
+#include <net/route.h>
 int	in6_selectsrc(struct sockaddr_in6 *, struct ip6_pktopts *,
 	struct inpcb *inp, struct route_in6 *, struct ucred *cred,
 	struct ifnet **, struct in6_addr *);
 int in6_selectroute __P((struct sockaddr_in6 *, struct ip6_pktopts *,
-	struct ip6_moptions *, struct route_in6 *, struct ifnet **,
-	struct rtentry **));
+	struct ip6_moptions *, struct route_in6 *, struct rtlookup *,
+	struct ifnet **, struct rtentry **));
 u_int32_t ip6_randomid __P((void));
 u_int32_t ip6_randomflowlabel __P((void));
 #endif /* _KERNEL */
diff -u -r ../src_org_8.2_20110329/sys/netinet6/nd6_rtr.c ./sys/netinet6/nd6_rtr.c
--- ../src_org_8.2_20110329/sys/netinet6/nd6_rtr.c	2010-05-06 06:44:19.000000000 +0000
+++ ./sys/netinet6/nd6_rtr.c	2011-04-03 16:07:57.000000000 +0000
@@ -48,6 +48,8 @@
 #include <sys/rwlock.h>
 #include <sys/syslog.h>
 #include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_types.h>


More information about the freebsd-net mailing list