Routing enhancement - reduce routing table locking

Ingo Flaschberger if at freebsd.org
Sun Apr 17 23:00:24 UTC 2011


Attached is a new version of this patch with some improvements and bug fixes.
Test reports are welcome.

Kind regards,
 	Ingo Flaschberger
-------------- next part --------------
diff -u -r ../src_org_8.2_20110329/contrib/ipfilter/radix.c ./contrib/ipfilter/radix.c
--- ../src_org_8.2_20110329/contrib/ipfilter/radix.c	2009-08-03 08:13:06.000000000 +0000
+++ ./contrib/ipfilter/radix.c	2011-04-03 16:08:28.000000000 +0000
@@ -759,9 +759,10 @@
 }
 
 struct radix_node *
-rn_delete(v_arg, netmask_arg, head)
+rn_delete(v_arg, netmask_arg, head, rn)
 	void *v_arg, *netmask_arg;
 	struct radix_node_head *head;
+	struct radix_node *rn;
 {
 	struct radix_node *t, *p, *x, *tt;
 	struct radix_mask *m, *saved_m, **mp;
@@ -1069,7 +1070,7 @@
 	struct radix_node_head *rnh = p;
 	struct radix_node *d;
 
-	d = rnh->rnh_deladdr(n->rn_key, NULL, rnh);
+	d = rnh->rnh_deladdr(n->rn_key, NULL, rnh, NULL);
 	if (d != NULL) {
 		FreeS(d, max_keylen + 2 * sizeof (*d));
 	}
diff -u -r ../src_org_8.2_20110329/contrib/ipfilter/radix_ipf.h ./contrib/ipfilter/radix_ipf.h
--- ../src_org_8.2_20110329/contrib/ipfilter/radix_ipf.h	2009-08-03 08:13:06.000000000 +0000
+++ ./contrib/ipfilter/radix_ipf.h	2011-04-12 16:27:31.000000000 +0000
@@ -130,7 +130,8 @@
 		__P((void *v, void *mask,
 		     struct radix_node_head *head, struct radix_node nodes[]));
 	struct	radix_node *(*rnh_deladdr)	/* remove based on sockaddr */
-		__P((void *v, void *mask, struct radix_node_head *head));
+		__P((void *v, void *mask, struct radix_node_head *head,
+		    struct radix_node *rn));
 	struct	radix_node *(*rnh_delpkt)	/* remove based on packet hdr */
 		__P((void *v, void *mask, struct radix_node_head *head));
 	struct	radix_node *(*rnh_matchaddr)	/* locate based on sockaddr */
@@ -202,7 +203,8 @@
 	 *rn_addmask __P((void *, int, int)),
 	 *rn_addroute __P((void *, void *, struct radix_node_head *,
 			struct radix_node [2])),
-	 *rn_delete __P((void *, void *, struct radix_node_head *)),
+	 *rn_delete __P((void *, void *, struct radix_node_head *,
+	      struct radix_node *)),
 	 *rn_insert __P((void *, struct radix_node_head *, int *,
 			struct radix_node [2])),
 	 *rn_lookup __P((void *, void *, struct radix_node_head *)),
diff -u -r ../src_org_8.2_20110329/sbin/routed/radix.c ./sbin/routed/radix.c
--- ../src_org_8.2_20110329/sbin/routed/radix.c	2009-08-03 08:13:06.000000000 +0000
+++ ./sbin/routed/radix.c	2011-04-03 16:08:07.000000000 +0000
@@ -662,7 +662,8 @@
 static struct radix_node *
 rn_delete(void *v_arg,
 	  void *netmask_arg,
-	  struct radix_node_head *head)
+	  struct radix_node_head *head,
+	  struct radix_node *rn)
 {
 	struct radix_node *t, *p, *x, *tt;
 	struct radix_mask *m, *saved_m, **mp;
@@ -670,6 +671,8 @@
 	caddr_t v, netmask;
 	int b, head_off, vlen;
 
+	rn = NULL; /* XXX make compiler happy */
+
 	v = v_arg;
 	netmask = netmask_arg;
 	x = head->rnh_treetop;
diff -u -r ../src_org_8.2_20110329/sbin/routed/radix.h ./sbin/routed/radix.h
--- ../src_org_8.2_20110329/sbin/routed/radix.h	2009-08-03 08:13:06.000000000 +0000
+++ ./sbin/routed/radix.h	2011-04-12 16:28:04.000000000 +0000
@@ -115,7 +115,8 @@
 		(void *v, void *mask,
 		     struct radix_node_head *head, struct radix_node nodes[]);
 	struct	radix_node *(*rnh_deladdr)	/* remove based on sockaddr */
-		(void *v, void *mask, struct radix_node_head *head);
+		(void *v, void *mask, struct radix_node_head *head,
+		     struct radix_node *rn);
 	struct	radix_node *(*rnh_delpkt)	/* remove based on packet hdr */
 		(void *v, void *mask, struct radix_node_head *head);
 	struct	radix_node *(*rnh_matchaddr)	/* locate based on sockaddr */
diff -u -r ../src_org_8.2_20110329/sbin/routed/table.c ./sbin/routed/table.c
--- ../src_org_8.2_20110329/sbin/routed/table.c	2009-08-03 08:13:06.000000000 +0000
+++ ./sbin/routed/table.c	2011-04-03 16:08:07.000000000 +0000
@@ -1865,7 +1865,7 @@
 	mask_sock.sin_addr.s_addr = htonl(rt->rt_mask);
 	masktrim(&mask_sock);
 	if (rt != (struct rt_entry *)rhead->rnh_deladdr(&dst_sock, &mask_sock,
-							rhead)) {
+							rhead, NULL)) {
 		msglog("rnh_deladdr() failed");
 	} else {
 		free(rt);
diff -u -r ../src_org_8.2_20110329/sys/contrib/ipfilter/netinet/ip_pool.c ./sys/contrib/ipfilter/netinet/ip_pool.c
--- ../src_org_8.2_20110329/sys/contrib/ipfilter/netinet/ip_pool.c	2007-10-18 21:42:38.000000000 +0000
+++ ./sys/contrib/ipfilter/netinet/ip_pool.c	2011-04-03 16:07:46.000000000 +0000
@@ -67,6 +67,7 @@
 #include "netinet/ip_compat.h"
 #include "netinet/ip_fil.h"
 #include "netinet/ip_pool.h"
+#include <sys/rmlock.h>
 
 #if defined(IPFILTER_LOOKUP) && defined(_KERNEL) && \
       ((BSD >= 198911) && !defined(__osf__) && \
@@ -620,7 +621,7 @@
 
 	RADIX_NODE_HEAD_LOCK(ipo->ipo_head);
 	ipo->ipo_head->rnh_deladdr(&ipe->ipn_addr, &ipe->ipn_mask,
-				   ipo->ipo_head);
+				   ipo->ipo_head, NULL);
 	RADIX_NODE_HEAD_UNLOCK(ipo->ipo_head);
 
 	ip_pool_node_deref(ipe);
@@ -751,7 +752,7 @@
 	RADIX_NODE_HEAD_LOCK(ipo->ipo_head);
 	while ((n = ipo->ipo_list) != NULL) {
 		ipo->ipo_head->rnh_deladdr(&n->ipn_addr, &n->ipn_mask,
-					   ipo->ipo_head);
+					   ipo->ipo_head, NULL);
 
 		*n->ipn_pnext = n->ipn_next;
 		if (n->ipn_next)
@@ -963,7 +964,7 @@
 	struct radix_node_head *rnh = p;
 	struct radix_node *d;
 
-	d = rnh->rnh_deladdr(n->rn_key, NULL, rnh);
+	d = rnh->rnh_deladdr(n->rn_key, NULL, rnh, NULL);
 	if (d != NULL) {
 		FreeS(d, max_keylen + 2 * sizeof (*d));
 	}
diff -u -r ../src_org_8.2_20110329/sys/contrib/pf/net/pf.c ./sys/contrib/pf/net/pf.c
--- ../src_org_8.2_20110329/sys/contrib/pf/net/pf.c	2010-09-20 17:03:10.000000000 +0000
+++ ./sys/contrib/pf/net/pf.c	2011-04-03 16:07:46.000000000 +0000
@@ -99,9 +99,7 @@
 #include <net/if_types.h>
 #include <net/bpf.h>
 #include <net/route.h>
-#ifndef __FreeBSD__
 #include <net/radix_mpath.h>
-#endif
 
 #include <netinet/in.h>
 #include <netinet/in_var.h>
@@ -6166,9 +6164,9 @@
 			if (kif->pfik_ifp == ifp)
 				ret = 1;
 #ifdef __FreeBSD__ /* MULTIPATH_ROUTING */
-			rn = NULL;
-#else
 			rn = rn_mpath_next(rn);
+#else
+			rn = rn_mpath_next(rn, 0);
 #endif
 		} while (check_mpath == 1 && rn != NULL && ret == 0);
 	} else
diff -u -r ../src_org_8.2_20110329/sys/contrib/pf/net/pf_table.c ./sys/contrib/pf/net/pf_table.c
--- ../src_org_8.2_20110329/sys/contrib/pf/net/pf_table.c	2009-08-03 08:13:06.000000000 +0000
+++ ./sys/contrib/pf/net/pf_table.c	2011-04-03 16:07:46.000000000 +0000
@@ -44,7 +44,7 @@
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/rwlock.h>
+#include <sys/rmlock.h>
 #ifdef __FreeBSD__
 #include <sys/malloc.h>
 #endif
@@ -1114,17 +1114,9 @@
 #endif
 	if (KENTRY_NETWORK(ke)) {
 		pfr_prepare_network(&mask, ke->pfrke_af, ke->pfrke_net);
-#ifdef __FreeBSD__
-		rn = rn_delete(&ke->pfrke_sa, &mask, head);
-#else
 		rn = rn_delete(&ke->pfrke_sa, &mask, head, NULL);
-#endif
 	} else
-#ifdef __FreeBSD__
-		rn = rn_delete(&ke->pfrke_sa, NULL, head);
-#else
 		rn = rn_delete(&ke->pfrke_sa, NULL, head, NULL);
-#endif
 	splx(s);
 
 	if (rn == NULL) {
diff -u -r ../src_org_8.2_20110329/sys/kern/subr_witness.c ./sys/kern/subr_witness.c
--- ../src_org_8.2_20110329/sys/kern/subr_witness.c	2011-03-28 15:26:48.000000000 +0000
+++ ./sys/kern/subr_witness.c	2011-04-03 16:07:54.000000000 +0000
@@ -508,7 +508,7 @@
 	 * Routing
 	 */
 	{ "so_rcv", &lock_class_mtx_sleep },
-	{ "radix node head", &lock_class_rw },
+	{ "radix node head", &lock_class_rm },
 	{ "rtentry", &lock_class_mtx_sleep },
 	{ "ifaddr", &lock_class_mtx_sleep },
 	{ NULL, NULL },
diff -u -r ../src_org_8.2_20110329/sys/kern/vfs_export.c ./sys/kern/vfs_export.c
--- ../src_org_8.2_20110329/sys/kern/vfs_export.c	2009-10-01 13:11:45.000000000 +0000
+++ ./sys/kern/vfs_export.c	2011-04-03 16:07:54.000000000 +0000
@@ -43,6 +43,7 @@
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/rmlock.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
@@ -228,7 +229,7 @@
 	struct radix_node_head *rnh = (struct radix_node_head *) w;
 	struct ucred *cred;
 
-	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
+	(*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh, NULL);
 	cred = ((struct netcred *)rn)->netc_anon;
 	if (cred != NULL)
 		crfree(cred);
@@ -427,6 +428,7 @@
 	register struct netcred *np;
 	register struct radix_node_head *rnh;
 	struct sockaddr *saddr;
+	struct rm_priotracker tracker;
 
 	nep = mp->mnt_export;
 	if (nep == NULL)
@@ -440,10 +442,10 @@
 			saddr = nam;
 			rnh = nep->ne_rtable[saddr->sa_family];
 			if (rnh != NULL) {
-				RADIX_NODE_HEAD_RLOCK(rnh);
+				RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
 				np = (struct netcred *)
 				    (*rnh->rnh_matchaddr)(saddr, rnh);
-				RADIX_NODE_HEAD_RUNLOCK(rnh);
+				RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 				if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
 					np = NULL;
 			}
diff -u -r ../src_org_8.2_20110329/sys/net/if.c ./sys/net/if.c
--- ../src_org_8.2_20110329/sys/net/if.c	2011-04-06 18:03:49.000000000 +0000
+++ ./sys/net/if.c	2011-04-12 16:21:05.000000000 +0000
@@ -49,6 +49,7 @@
 #include <sys/protosw.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/rmlock.h>
 #include <sys/refcount.h>
 #include <sys/module.h>
 #include <sys/rwlock.h>
diff -u -r ../src_org_8.2_20110329/sys/net/pfil.c ./sys/net/pfil.c
--- ../src_org_8.2_20110329/sys/net/pfil.c	2010-02-07 09:00:22.000000000 +0000
+++ ./sys/net/pfil.c	2011-04-03 16:07:57.000000000 +0000
@@ -39,7 +39,6 @@
 #include <sys/socketvar.h>
 #include <sys/systm.h>
 #include <sys/condvar.h>
-#include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
diff -u -r ../src_org_8.2_20110329/sys/net/radix.c ./sys/net/radix.c
--- ../src_org_8.2_20110329/sys/net/radix.c	2010-04-02 05:02:50.000000000 +0000
+++ ./sys/net/radix.c	2011-04-03 16:07:57.000000000 +0000
@@ -41,6 +41,7 @@
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/syslog.h>
+#include <sys/rmlock.h>
 #include <net/radix.h>
 #include "opt_mpath.h"
 #ifdef RADIX_MPATH
@@ -614,7 +615,7 @@
 	struct radix_node treenodes[2];
 {
 	caddr_t v = (caddr_t)v_arg, netmask = (caddr_t)n_arg;
-	register struct radix_node *t, *x = 0, *tt;
+	register struct radix_node *t, *x = 0, *xx = 0, *tt;
 	struct radix_node *saved_tt, *top = head->rnh_treetop;
 	short b = 0, b_leaf = 0;
 	int keyduplicated;
@@ -723,12 +724,19 @@
 		x = t->rn_right;
 	/* Promote general routes from below */
 	if (x->rn_bit < 0) {
-	    for (mp = &t->rn_mklist; x; x = x->rn_dupedkey)
+            for (mp = &t->rn_mklist; x; xx = x, x = x->rn_dupedkey) {
+                if (xx && xx->rn_mklist && xx->rn_mask == x->rn_mask &&
+				x->rn_mklist == 0) {
+			/* multipath route, bump refcount on first mklist */
+			x->rn_mklist = xx->rn_mklist;
+			x->rn_mklist->rm_refs++;
+		}
 		if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) {
 			*mp = m = rn_new_radix_mask(x, 0);
 			if (m)
 				mp = &m->rm_mklist;
 		}
+	    }
 	} else if (x->rn_mklist) {
 		/*
 		 * Skip over masks whose index is > that of new node
@@ -760,11 +768,30 @@
 			break;
 		if (m->rm_flags & RNF_NORMAL) {
 			mmask = m->rm_leaf->rn_mask;
-			if (tt->rn_flags & RNF_NORMAL) {
-#if !defined(RADIX_MPATH)
-			    log(LOG_ERR,
-			        "Non-unique normal route, mask not entered\n");
+			if (keyduplicated) {
+				if (m->rm_leaf->rn_parent == tt)
+					/* new route is better */
+					m->rm_leaf = tt;
+#ifdef DIAGNOSTIC
+				else {
+					for (t = m->rm_leaf; t;
+						t = t->rn_dupedkey) {
+						break;
+					}
+					if (t == NULL) {
+						log(LOG_ERR, "Non-unique "
+							"normal route on dupedkey, "
+							"mask not entered\n");
+						return tt;
+					}
+				}
 #endif
+				m->rm_refs++;
+				tt->rn_mklist = m;
+				return tt;
+			} else if (tt->rn_flags & RNF_NORMAL) {
+				log(LOG_ERR, "Non-unique normal route,"
+					" mask not entered\n");
 				return tt;
 			}
 		} else
@@ -783,9 +810,10 @@
 }
 
 struct radix_node *
-rn_delete(v_arg, netmask_arg, head)
+rn_delete(v_arg, netmask_arg, head, rn)
 	void *v_arg, *netmask_arg;
 	struct radix_node_head *head;
+	struct radix_node *rn;
 {
 	register struct radix_node *t, *p, *x, *tt;
 	struct radix_mask *m, *saved_m, **mp;
@@ -815,13 +843,37 @@
 			if ((tt = tt->rn_dupedkey) == 0)
 				return (0);
 	}
+#ifdef RADIX_MPATH
+	if (rn) {
+		while (tt != rn)
+			if ((tt = tt->rn_dupedkey) == 0)
+				return (0);
+	}
+#endif
 	if (tt->rn_mask == 0 || (saved_m = m = tt->rn_mklist) == 0)
 		goto on1;
 	if (tt->rn_flags & RNF_NORMAL) {
-		if (m->rm_leaf != tt || m->rm_refs > 0) {
+		if (m->rm_leaf != tt && m->rm_refs == 0) {
 			log(LOG_ERR, "rn_delete: inconsistent annotation\n");
 			return 0;  /* dangling ref could cause disaster */
 		}
+		if (m->rm_leaf != tt) {
+			if (--m->rm_refs >= 0)
+				goto on1;
+		}
+		/* tt is currently the head of the possible multipath chain */
+		if (m->rm_refs > 0) {
+			if (tt->rn_dupedkey == NULL ||
+				tt->rn_dupedkey->rn_mklist != m) {
+					log(LOG_ERR, "rn_delete: inconsistent "
+						"dupedkey list\n");
+					return (0);
+			}
+			m->rm_leaf = tt->rn_dupedkey;
+			--m->rm_refs;
+			goto on1;
+		}
+		/* else tt is last and only route */
 	} else {
 		if (m->rm_mask != tt->rn_mask) {
 			log(LOG_ERR, "rn_delete: inconsistent annotation\n");
@@ -875,15 +927,10 @@
 			else
 				t->rn_right = x;
 		} else {
-			/* find node in front of tt on the chain */
-			for (x = p = saved_tt; p && p->rn_dupedkey != tt;)
-				p = p->rn_dupedkey;
-			if (p) {
-				p->rn_dupedkey = tt->rn_dupedkey;
-				if (tt->rn_dupedkey)		/* parent */
-					tt->rn_dupedkey->rn_parent = p;
-								/* parent */
-			} else log(LOG_ERR, "rn_delete: couldn't find us\n");
+			x = saved_tt;
+			t->rn_dupedkey = tt->rn_dupedkey;
+			if (tt->rn_dupedkey)
+				tt->rn_dupedkey->rn_parent = t;
 		}
 		t = tt + 1;
 		if  (t->rn_flags & RNF_ACTIVE) {
@@ -931,8 +978,16 @@
 				if (m == x->rn_mklist) {
 					struct radix_mask *mm = m->rm_mklist;
 					x->rn_mklist = 0;
-					if (--(m->rm_refs) < 0)
+					if (--(m->rm_refs) < 0) {
 						MKFree(m);
+					} else if (m->rm_flags & RNF_NORMAL) {
+						/*
+						* don't progress because this is
+						* a multipath route. Next
+						* route will use the same m.
+						*/
+						mm = m;
+					}
 					m = mm;
 				}
 			if (m)
diff -u -r ../src_org_8.2_20110329/sys/net/radix.h ./sys/net/radix.h
--- ../src_org_8.2_20110329/sys/net/radix.h	2010-03-23 09:58:59.000000000 +0000
+++ ./sys/net/radix.h	2011-04-12 16:29:47.000000000 +0000
@@ -36,7 +36,7 @@
 #ifdef _KERNEL
 #include <sys/_lock.h>
 #include <sys/_mutex.h>
-#include <sys/_rwlock.h>
+#include <sys/_rmlock.h>
 #endif
 
 #ifdef MALLOC_DECLARE
@@ -114,7 +114,8 @@
 		(void *v, void *mask,
 		     struct radix_node_head *head, struct radix_node nodes[]);
 	struct	radix_node *(*rnh_deladdr)	/* remove based on sockaddr */
-		(void *v, void *mask, struct radix_node_head *head);
+		(void *v, void *mask, struct radix_node_head *head,
+		     struct radix_node *rn);
 	struct	radix_node *(*rnh_delpkt)	/* remove based on packet hdr */
 		(void *v, void *mask, struct radix_node_head *head);
 	struct	radix_node *(*rnh_matchaddr)	/* locate based on sockaddr */
@@ -133,7 +134,7 @@
 	struct	radix_node rnh_nodes[3];	/* empty tree for common case */
 	int	rnh_multipath;			/* multipath capable ? */
 #ifdef _KERNEL
-	struct	rwlock rnh_lock;		/* locks entire radix tree */
+	struct	rmlock rnh_lock;		/* locks entire radix tree */
 #endif
 };
 
@@ -147,17 +148,15 @@
 #define Free(p) free((caddr_t)p, M_RTABLE);
 
 #define	RADIX_NODE_HEAD_LOCK_INIT(rnh)	\
-    rw_init_flags(&(rnh)->rnh_lock, "radix node head", 0)
-#define	RADIX_NODE_HEAD_LOCK(rnh)	rw_wlock(&(rnh)->rnh_lock)
-#define	RADIX_NODE_HEAD_UNLOCK(rnh)	rw_wunlock(&(rnh)->rnh_lock)
-#define	RADIX_NODE_HEAD_RLOCK(rnh)	rw_rlock(&(rnh)->rnh_lock)
-#define	RADIX_NODE_HEAD_RUNLOCK(rnh)	rw_runlock(&(rnh)->rnh_lock)
-#define	RADIX_NODE_HEAD_LOCK_TRY_UPGRADE(rnh)	rw_try_upgrade(&(rnh)->rnh_lock)
-
-
-#define	RADIX_NODE_HEAD_DESTROY(rnh)	rw_destroy(&(rnh)->rnh_lock)
-#define	RADIX_NODE_HEAD_LOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_LOCKED)
-#define	RADIX_NODE_HEAD_WLOCK_ASSERT(rnh) rw_assert(&(rnh)->rnh_lock, RA_WLOCKED)
+	rm_init_flags(&(rnh)->rnh_lock, "radix node head", 0)
+#define        RADIX_NODE_HEAD_LOCK(rnh)       rm_wlock(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_UNLOCK(rnh)     rm_wunlock(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_RLOCK(rnh, tracker)     rm_rlock(&(rnh)->rnh_lock, (tracker))
+#define        RADIX_NODE_HEAD_RUNLOCK(rnh, tracker)   rm_runlock(&(rnh)->rnh_lock, (tracker))
+
+#define        RADIX_NODE_HEAD_DESTROY(rnh)    rm_destroy(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_LOCK_ASSERT(rnh)        rm_wowned(&(rnh)->rnh_lock)
+#define        RADIX_NODE_HEAD_WLOCK_ASSERT(rnh)       rm_wowned(&(rnh)->rnh_lock)
 #endif /* _KERNEL */
 
 void	 rn_init(int);
@@ -168,7 +167,7 @@
 	 *rn_addmask(void *, int, int),
 	 *rn_addroute (void *, void *, struct radix_node_head *,
 			struct radix_node [2]),
-	 *rn_delete(void *, void *, struct radix_node_head *),
+	 *rn_delete(void *, void *, struct radix_node_head *, struct radix_node *),
 	 *rn_lookup (void *v_arg, void *m_arg,
 		        struct radix_node_head *head),
 	 *rn_match(void *, struct radix_node_head *);
diff -u -r ../src_org_8.2_20110329/sys/net/radix_mpath.c ./sys/net/radix_mpath.c
--- ../src_org_8.2_20110329/sys/net/radix_mpath.c	2011-04-12 21:31:08.000000000 +0000
+++ ./sys/net/radix_mpath.c	2011-04-15 17:40:43.000000000 +0000
@@ -45,6 +45,8 @@
 #include <sys/socket.h>
 #include <sys/domain.h>
 #include <sys/syslog.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
 #include <net/radix.h>
 #include <net/radix_mpath.h>
 #include <net/route.h>
@@ -54,7 +56,7 @@
 /*
  * give some jitter to hash, to avoid synchronization between routers
  */
-static uint32_t hashjitter;
+uint32_t hashjitter;
 
 int
 rn_mpath_capable(struct radix_node_head *rnh)
@@ -77,20 +79,6 @@
 		return NULL;
 }
 
-uint32_t
-rn_mpath_count(struct radix_node *rn)
-{
-	uint32_t i = 0;
-	struct rtentry *rt;
-	
-	while (rn != NULL) {
-		rt = (struct rtentry *)rn;
-		i += rt->rt_rmx.rmx_weight;
-		rn = rn_mpath_next(rn);
-	}
-	return (i);
-}
-
 struct rtentry *
 rt_mpath_matchgate(struct rtentry *rt, struct sockaddr *gate)
 {
@@ -125,33 +113,6 @@
 	return (struct rtentry *)rn;
 }
 
-/* 
- * go through the chain and unlink "rt" from the list
- * the caller will free "rt"
- */
-int
-rt_mpath_deldup(struct rtentry *headrt, struct rtentry *rt)
-{
-        struct radix_node *t, *tt;
-
-        if (!headrt || !rt)
-            return (0);
-        t = (struct radix_node *)headrt;
-        tt = rn_mpath_next(t);
-        while (tt) {
-            if (tt == (struct radix_node *)rt) {
-                t->rn_dupedkey = tt->rn_dupedkey;
-                tt->rn_dupedkey = NULL;
-    	        tt->rn_flags &= ~RNF_ACTIVE;
-	        tt[1].rn_flags &= ~RNF_ACTIVE;
-                return (1);
-            }
-            t = tt;
-            tt = rn_mpath_next((struct radix_node *)t);
-        }
-        return (0);
-}
-
 /*
  * check if we have the same key/mask/gateway on the table already.
  */
@@ -261,10 +222,21 @@
 void
 rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum)
 {
+	rtalloc_mpath_fib_flags( ro, hash, fibnum, 0);
+}
+
+/*
+ * When RTF_GATEWAY is passed in flags, only interface routes are
+ * returned; only one interface route is possible.
+ */
+void
+rtalloc_mpath_fib_flags(struct route *ro, uint32_t hash, u_int fibnum, int flags)
+{
 	struct radix_node *rn0, *rn;
-	u_int32_t n;
+	u_int32_t n = 1;
 	struct rtentry *rt;
 	int64_t weight;
+	int64_t lowest_weight;
 
 	/*
 	 * XXX we don't attempt to lookup cached route again; what should
@@ -285,20 +257,35 @@
 
 	/* beyond here, we use rn as the master copy */
 	rn0 = rn = (struct radix_node *)ro->ro_rt;
-	n = rn_mpath_count(rn0);
+
+	/* find lowest weight route */
+	for ( rt = (struct rtentry *)rn, weight = rt->rt_rmx.rmx_weight;
+	    rn != NULL; rn = rn_mpath_next( rn)) {
+		rt = (struct rtentry *)rn;
+		if ((rt->rt_flags & RTF_UP) && !(rt->rt_flags & flags)) {
+			if (flags & RTF_GATEWAY) /* shortcut */
+				goto end;	/* only 1 interface route possible! */
+			if (weight > rt->rt_rmx.rmx_weight) {
+				weight = rt->rt_rmx.rmx_weight;
+				n = 1;
+			} else if (weight == rt->rt_rmx.rmx_weight)
+				n++;
+		}
+	}
+	lowest_weight = weight;
 
 	/* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */
 	hash += hashjitter;
 	hash %= n;
-	for (weight = abs((int32_t)hash), rt = ro->ro_rt;
-	     weight >= rt->rt_rmx.rmx_weight && rn; 
-	     weight -= rt->rt_rmx.rmx_weight) {
-		
-		/* stay within the multipath routes */
-		if (rn->rn_dupedkey && rn->rn_mask != rn->rn_dupedkey->rn_mask)
-			break;
-		rn = rn->rn_dupedkey;
+	for ( rn = rn0, n = 0; rn != NULL; rn = rn_mpath_next( rn)) {
 		rt = (struct rtentry *)rn;
+		if (rt->rt_flags & RTF_UP) {
+			if ( rt->rt_rmx.rmx_weight == lowest_weight) {
+				if (n == hash)
+					break;
+				n++;
+			}
+		}
 	}
 	/* XXX try filling rt_gwroute and avoid unreachable gw  */
 
@@ -308,6 +295,7 @@
 		ro->ro_rt = NULL;
 		return;
 	}
+end:
 	if (ro->ro_rt != rt) {
 		RTFREE_LOCKED(ro->ro_rt);
 		ro->ro_rt = (struct rtentry *)rn;
diff -u -r ../src_org_8.2_20110329/sys/net/radix_mpath.h ./sys/net/radix_mpath.h
--- ../src_org_8.2_20110329/sys/net/radix_mpath.h	2009-08-03 08:13:06.000000000 +0000
+++ ./sys/net/radix_mpath.h	2011-04-15 17:39:56.000000000 +0000
@@ -44,14 +44,15 @@
 struct route;
 struct rtentry;
 struct sockaddr;
+extern uint32_t hashjitter;
 int	rn_mpath_capable(struct radix_node_head *);
 struct radix_node *rn_mpath_next(struct radix_node *);
-u_int32_t rn_mpath_count(struct radix_node *);
 struct rtentry *rt_mpath_matchgate(struct rtentry *, struct sockaddr *);
 int rt_mpath_conflict(struct radix_node_head *, struct rtentry *,
     struct sockaddr *);
+void rtalloc_mpath_fib_flags(struct route *, u_int32_t, u_int, int);
 void rtalloc_mpath_fib(struct route *, u_int32_t, u_int);
-#define rtalloc_mpath(_route, _hash) rtalloc_mpath_fib((_route), (_hash), 0)
+#define rtalloc_mpath(_route, _hash) rtalloc_mpath_fib_flags((_route), (_hash), 0, 0)
 struct radix_node *rn_mpath_lookup(void *, void *,
     struct radix_node_head *);
 int rt_mpath_deldup(struct rtentry *, struct rtentry *);
diff -u -r ../src_org_8.2_20110329/sys/net/route.c ./sys/net/route.c
--- ../src_org_8.2_20110329/sys/net/route.c	2011-04-14 16:09:40.000000000 +0000
+++ ./sys/net/route.c	2011-04-14 16:14:18.000000000 +0000
@@ -51,6 +51,8 @@
 #include <sys/proc.h>
 #include <sys/domain.h>
 #include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
@@ -342,6 +344,7 @@
 	struct radix_node *rn;
 	struct rtentry *newrt;
 	struct rt_addrinfo info;
+	struct rm_priotracker tracker;
 	int err = 0, msgtype = RTM_MISS;
 	int needlock;
 
@@ -358,24 +361,26 @@
 		goto miss;
 	}
 	needlock = !(ignflags & RTF_RNH_LOCKED);
-	if (needlock)
-		RADIX_NODE_HEAD_RLOCK(rnh);
-#ifdef INVARIANTS	
+	if (needlock) /* XXX we always need the lock for now! */
+		RADIX_NODE_HEAD_LOCK(rnh);
 	else
-		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
-#endif
+		RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
 	rn = rnh->rnh_matchaddr(dst, rnh);
 	if (rn && ((rn->rn_flags & RNF_ROOT) == 0)) {
 		newrt = rt = RNTORT(rn);
 		RT_LOCK(newrt);
 		RT_ADDREF(newrt);
-		if (needlock)
-			RADIX_NODE_HEAD_RUNLOCK(rnh);
+		if (needlock) /* XXX we always need the lock for now! */
+			RADIX_NODE_HEAD_UNLOCK(rnh);
+		else
+			RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 		goto done;
+	}
+	if (needlock) /* XXX we always need the lock for now! */
+		RADIX_NODE_HEAD_UNLOCK(rnh);
+	else
+		RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 
-	} else if (needlock)
-		RADIX_NODE_HEAD_RUNLOCK(rnh);
-	
 	/*
 	 * Either we hit the root or couldn't find any match,
 	 * Which basically means
@@ -400,6 +405,162 @@
 }
 
 /*
+ * Lookup a destination in the routing table and
+ * report the next hop, interface and interface address
+ * in a new structure.
+ * Only read lock access on the routing table is required,
+ * individual routes are not locked.
+ * Returns 1 for entry found, 0 for not found.
+ */
+int
+rtlookup_fib(struct sockaddr *dst, u_int fibnum, struct rtlookup *rtl,
+    int flags)
+{
+	struct radix_node_head *rnh;
+	struct radix_node *rn;
+	struct rtentry *rt;
+	int ret = 0;
+	struct rm_priotracker tracker;
+
+	KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
+	if (dst->sa_family != AF_INET)  /* Only INET supports > 1 fib now */
+		fibnum = 0;
+	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
+
+	/* Look up the address in the table for that Address Family. */
+	if (rnh == NULL) {
+		V_rtstat.rts_unreach++;
+		return (0);
+	}
+
+	RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
+	rn = rnh->rnh_matchaddr(dst, rnh);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+		rt = RNTORT(rn);
+
+		if( rt->rt_gateway->sa_len > rtl->rt_gateway->sa_len) {
+			unsigned char sa_len = rtl->rt_gateway->sa_len;
+			bcopy( rt->rt_gateway, rtl->rt_gateway, sa_len);
+			rtl->rt_gateway->sa_len = sa_len;
+		} else {
+			unsigned char sa_len = rt->rt_gateway->sa_len;
+			bcopy( rt->rt_gateway, rtl->rt_gateway, sa_len);
+			rtl->rt_gateway->sa_len = sa_len;
+		}
+		rtl->rt_ifp = rt->rt_ifp;
+		rtl->rt_ifa = rt->rt_ifa;
+		rtl->rt_rmx.rmx_mtu = rt->rt_rmx.rmx_mtu;
+		rtl->rt_rmx.rmx_expire = rt->rt_rmx.rmx_expire;
+		rtl->rt_flags = rt->rt_flags;
+		if (flags & RTL_PKSENT)
+                	rt->rt_rmx.rmx_pksent++; /* racy but ok */
+		ret = 1;
+	}
+	RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
+	return (ret);
+}
+
+#ifdef RADIX_MPATH
+/*
+ * Lookup a mpath destination in the routing table and
+ * report the next hop, interface and interface address
+ * in a new structure.
+ * Only read lock access on the routing table is required,
+ * individual routes are not locked.
+ * Returns 1 for entry found, 0 for not found.
+ */
+int
+rtlookup_mpath_fib(struct sockaddr *dst, u_int32_t hash, u_int fibnum,
+    struct rtlookup *rtl, int flags)
+{
+	struct radix_node_head *rnh;
+	struct radix_node *rn, *rn0;
+	struct rtentry *rt;
+	int ret = 0;
+	struct rm_priotracker tracker;
+	int64_t weight;
+	int64_t lowest_weight;
+	u_int32_t n = 0;
+
+	KASSERT((fibnum < rt_numfibs), ("rtalloc1_fib: bad fibnum"));
+	if (dst->sa_family != AF_INET)  /* Only INET supports > 1 fib now */
+		fibnum = 0;
+	rnh = rt_tables_get_rnh(fibnum, dst->sa_family);
+
+	/* Look up the address in the table for that Address Family. */
+	if (rnh == NULL) {
+		V_rtstat.rts_unreach++;
+		return (0);
+	}
+
+	RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
+	rn = rnh->rnh_matchaddr(dst, rnh);
+	if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
+		/* we have a route - now do the mpath selection */
+		if (rn_mpath_next( rn) != NULL) { /* multipath */
+			rn0 = rn;
+
+			/* find lowest weight route */
+			for ( rt = (struct rtentry *)rn,
+			    weight = rt->rt_rmx.rmx_weight;
+			    rn != NULL; rn = rn_mpath_next( rn)) {
+				rt = (struct rtentry *)rn;
+				if(rt->rt_flags & RTF_UP) {
+					if (weight > rt->rt_rmx.rmx_weight) {
+						weight = rt->rt_rmx.rmx_weight;
+						n = 1;
+					} else if (weight == rt->rt_rmx.rmx_weight)
+						n++;
+				}
+			}
+			lowest_weight = weight;
+
+			/* select now one of the lowest weight routes */
+			/* gw selection by Modulo-N Hash (RFC2991) XXX need improvement? */
+			hash += hashjitter;
+			hash %= n;
+			for ( rn = rn0, n = 0; rn != NULL; rn = rn_mpath_next( rn)) {
+				rt = (struct rtentry *)rn;
+				if(rt->rt_flags & RTF_UP) {
+					if ( rt->rt_rmx.rmx_weight == lowest_weight) {
+						if (n == hash)
+							break;
+						n++;
+					}
+				}
+			}
+
+			/* gw selection has failed - there must be only zero weight routes */                   
+			if (!rn)
+				goto end;
+		} else
+			rt = (struct rtentry *)rn;
+
+		if( rt->rt_gateway->sa_len > rtl->rt_gateway->sa_len) {
+			unsigned char sa_len = rtl->rt_gateway->sa_len;
+			bcopy( rt->rt_gateway, rtl->rt_gateway, sa_len);
+			rtl->rt_gateway->sa_len = sa_len;
+		} else {
+			unsigned char sa_len = rt->rt_gateway->sa_len;
+			bcopy( rt->rt_gateway, rtl->rt_gateway, sa_len);
+			rtl->rt_gateway->sa_len = sa_len;
+		}
+		rtl->rt_ifp = rt->rt_ifp;
+		rtl->rt_ifa = rt->rt_ifa;
+		rtl->rt_rmx.rmx_mtu = rt->rt_rmx.rmx_mtu;
+		rtl->rt_rmx.rmx_expire = rt->rt_rmx.rmx_expire;
+		rtl->rt_flags = rt->rt_flags;
+		if (flags & RTL_PKSENT)
+			rt->rt_rmx.rmx_pksent++;	/* racy but ok */
+		ret = 1;
+	}
+end:
+	RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
+	return (ret);
+}
+#endif
+
+/*
  * Remove a reference count from an rtentry.
  * If the count gets low enough, take it out of the routing table
  */
@@ -875,7 +1036,7 @@
 	 * Remove the item from the tree; it should be there,
 	 * but when callers invoke us blindly it may not (sigh).
 	 */
-	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh);
+	rn = rnh->rnh_deladdr(rt_key(rt), rt_mask(rt), rnh, NULL);
 	if (rn == NULL) {
 		error = ESRCH;
 		goto bad;
@@ -913,112 +1074,6 @@
 	return (error);
 }
 
-#ifdef RADIX_MPATH
-static int
-rn_mpath_update(int req, struct rt_addrinfo *info,
-    struct radix_node_head *rnh, struct rtentry **ret_nrt)
-{
-	/*
-	 * if we got multipath routes, we require users to specify
-	 * a matching RTAX_GATEWAY.
-	 */
-	struct rtentry *rt, *rto = NULL;
-	register struct radix_node *rn;
-	int error = 0;
-
-	rn = rnh->rnh_matchaddr(dst, rnh);
-	if (rn == NULL)
-		return (ESRCH);
-	rto = rt = RNTORT(rn);
-	rt = rt_mpath_matchgate(rt, gateway);
-	if (rt == NULL)
-		return (ESRCH);
-	/*
-	 * this is the first entry in the chain
-	 */
-	if (rto == rt) {
-		rn = rn_mpath_next((struct radix_node *)rt);
-		/*
-		 * there is another entry, now it's active
-		 */
-		if (rn) {
-			rto = RNTORT(rn);
-			RT_LOCK(rto);
-			rto->rt_flags |= RTF_UP;
-			RT_UNLOCK(rto);
-		} else if (rt->rt_flags & RTF_GATEWAY) {
-			/*
-			 * For gateway routes, we need to 
-			 * make sure that we we are deleting
-			 * the correct gateway. 
-			 * rt_mpath_matchgate() does not 
-			 * check the case when there is only
-			 * one route in the chain.  
-			 */
-			if (gateway &&
-			    (rt->rt_gateway->sa_len != gateway->sa_len ||
-				memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
-				error = ESRCH;
-			else {
-				/*
-				 * remove from tree before returning it
-				 * to the caller
-				 */
-				rn = rnh->rnh_deladdr(dst, netmask, rnh);
-				KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
-				goto gwdelete;
-			}
-			
-		}
-		/*
-		 * use the normal delete code to remove
-		 * the first entry
-		 */
-		if (req != RTM_DELETE) 
-			goto nondelete;
-
-		error = ENOENT;
-		goto done;
-	}
-		
-	/*
-	 * if the entry is 2nd and on up
-	 */
-	if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
-		panic ("rtrequest1: rt_mpath_deldup");
-gwdelete:
-	RT_LOCK(rt);
-	RT_ADDREF(rt);
-	if (req == RTM_DELETE) {
-		rt->rt_flags &= ~RTF_UP;
-		/*
-		 * One more rtentry floating around that is not
-		 * linked to the routing table. rttrash will be decremented
-		 * when RTFREE(rt) is eventually called.
-		 */
-		V_rttrash++;
-	}
-	
-nondelete:
-	if (req != RTM_DELETE)
-		panic("unrecognized request %d", req);
-	
-
-	/*
-	 * If the caller wants it, then it can have it,
-	 * but it's up to it to free the rtentry as we won't be
-	 * doing it.
-	 */
-	if (ret_nrt) {
-		*ret_nrt = rt;
-		RT_UNLOCK(rt);
-	} else
-		RTFREE_LOCKED(rt);
-done:
-	return (error);
-}
-#endif
-
 int
 rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
 				u_int fibnum)
@@ -1032,6 +1087,7 @@
 	register struct radix_node_head *rnh;
 	struct ifaddr *ifa;
 	struct sockaddr *ndst;
+	struct rm_priotracker tracker;
 #define senderr(x) { error = x ; goto bad; }
 
 	KASSERT((fibnum < rt_numfibs), ("rtrequest1_fib: bad fibnum"));
@@ -1048,7 +1104,7 @@
 	if (needlock)
 		RADIX_NODE_HEAD_LOCK(rnh);
 	else
-		RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
+		RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
 	/*
 	 * If we are adding a host route then we don't want to put
 	 * a netmask in the tree, nor do we want to clone it.
@@ -1058,28 +1114,30 @@
 
 	switch (req) {
 	case RTM_DELETE:
+		if ((rn = rnh->rnh_lookup(dst, netmask, rnh)) == NULL)
+			senderr(ESRCH);
+		rt = RNTORT(rn);
 #ifdef RADIX_MPATH
+		/*
+		 * if we got multipath routes, we require users to specify
+		 * a matching RTAX_GATEWAY.
+		 */
 		if (rn_mpath_capable(rnh)) {
-			error = rn_mpath_update(req, info, rnh, ret_nrt);
-			/*
-			 * "bad" holds true for the success case
-			 * as well
-			 */
-			if (error != ENOENT)
-				goto bad;
-			error = 0;
+			rt = rt_mpath_matchgate( rt, gateway);
+			rn = (struct radix_node *)rt;
+			if (!rt)
+				senderr(ESRCH);
 		}
 #endif
 		/*
 		 * Remove the item from the tree and return it.
 		 * Complain if it is not there and do no more processing.
 		 */
-		rn = rnh->rnh_deladdr(dst, netmask, rnh);
+		rn = rnh->rnh_deladdr(dst, netmask, rnh, rn);
 		if (rn == NULL)
 			senderr(ESRCH);
 		if (rn->rn_flags & (RNF_ACTIVE | RNF_ROOT))
 			panic ("rtrequest delete");
-		rt = RNTORT(rn);
 		RT_LOCK(rt);
 		RT_ADDREF(rt);
 		rt->rt_flags &= ~RTF_UP;
@@ -1285,6 +1343,8 @@
 bad:
 	if (needlock)
 		RADIX_NODE_HEAD_UNLOCK(rnh);
+	else
+		RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 	return (error);
 #undef senderr
 }
@@ -1308,7 +1368,9 @@
 #endif
 
 	RT_LOCK_ASSERT(rt);
+#ifdef INVARIANTS
 	RADIX_NODE_HEAD_LOCK_ASSERT(rnh);
+#endif
 	
 	/*
 	 * Prepare to store the gateway in rt->rt_gateway.
@@ -1464,6 +1526,7 @@
 					    ifa->ifa_addr);
 					if (!rt)
 						error = ESRCH;
+					rn = (struct radix_node *)rt;
 				}
 			}
 			else
diff -u -r ../src_org_8.2_20110329/sys/net/route.h ./sys/net/route.h
--- ../src_org_8.2_20110329/sys/net/route.h	2010-04-02 05:12:46.000000000 +0000
+++ ./sys/net/route.h	2011-04-06 20:37:48.000000000 +0000
@@ -79,6 +79,39 @@
 };
 
 /*
+ * Pointers to structures on the stack for pure routing
+ * table lookups / fast mtu access.
+ * Fakes struct rt_metrics_lite
+ */
+struct rtlookup_metrics {
+	u_long  rmx_mtu;        /* MTU for this path */
+	u_long  rmx_expire;     /* XXX rearrange rt_metrics_lite */
+	u_long  rmx_pksent;     /* XXX faster than extra if? - remove? */
+};
+
+/*
+ * Pointers to structures on the stack for pure routing
+ * table lookups. 
+ * Fakes struct rtentry
+ */
+#ifndef RNF_NORMAL
+#include <net/radix.h>
+#ifdef RADIX_MPATH
+#include <net/radix_mpath.h>
+#endif
+#endif
+struct rtlookup {
+       struct  radix_node rt_nodes[2];         /* XXX rearrange rtentry and remove */
+       struct  sockaddr *rt_gateway;
+       int     rt_flags;
+       int     rt_refcnt;                      /* XXX rearrange rtentry and remove */
+       struct  ifnet *rt_ifp;
+       struct  ifaddr *rt_ifa;
+       struct  rtlookup_metrics rt_rmx;
+};
+#define        RTL_PKSENT      0x0001  /* increment packet sent counter */
+
+/*
  * rmx_rtt and rmx_rttvar are stored as microseconds;
  * RTTTOPRHZ(rtt) converts to a value suitable for use
  * by a protocol slowtimo counter.
@@ -123,12 +156,6 @@
  * gateways are marked so that the output routines know to address the
  * gateway rather than the ultimate destination.
  */
-#ifndef RNF_NORMAL
-#include <net/radix.h>
-#ifdef RADIX_MPATH
-#include <net/radix_mpath.h>
-#endif
-#endif
 struct rtentry {
 	struct	radix_node rt_nodes[2];	/* tree glue, and other values */
 	/*
@@ -430,6 +457,10 @@
 void	 rtalloc_fib(struct route *ro, u_int fibnum);
 struct rtentry *rtalloc1_fib(struct sockaddr *, int, u_long, u_int);
 int	 rtioctl_fib(u_long, caddr_t, u_int);
+int    rtlookup_fib(struct sockaddr *, u_int, struct rtlookup *, int);
+#ifdef RADIX_MPATH
+int    rtlookup_mpath_fib(struct sockaddr *, u_int32_t, u_int, struct rtlookup *, int);
+#endif
 void	 rtredirect_fib(struct sockaddr *, struct sockaddr *,
 	    struct sockaddr *, int, struct sockaddr *, u_int);
 int	 rtrequest_fib(int, struct sockaddr *,
diff -u -r ../src_org_8.2_20110329/sys/net/rtsock.c ./sys/net/rtsock.c
--- ../src_org_8.2_20110329/sys/net/rtsock.c	2010-10-30 11:54:55.000000000 +0000
+++ ./sys/net/rtsock.c	2011-04-03 16:07:57.000000000 +0000
@@ -51,6 +51,7 @@
 #include <sys/socketvar.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
@@ -513,6 +514,7 @@
 	int len, error = 0;
 	struct ifnet *ifp = NULL;
 	union sockaddr_union saun;
+	struct rm_priotracker tracker;
 
 #define senderr(e) { error = e; goto flush;}
 	if (m == NULL || ((m->m_len < sizeof(long)) &&
@@ -643,11 +645,11 @@
 		    info.rti_info[RTAX_DST]->sa_family);
 		if (rnh == NULL)
 			senderr(EAFNOSUPPORT);
-		RADIX_NODE_HEAD_RLOCK(rnh);
+		RADIX_NODE_HEAD_RLOCK(rnh, &tracker);
 		rt = (struct rtentry *) rnh->rnh_lookup(info.rti_info[RTAX_DST],
 			info.rti_info[RTAX_NETMASK], rnh);
 		if (rt == NULL) {	/* XXX looks bogus */
-			RADIX_NODE_HEAD_RUNLOCK(rnh);
+			RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 			senderr(ESRCH);
 		}
 #ifdef RADIX_MPATH
@@ -663,7 +665,7 @@
 		    (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) {
 			rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]);
 			if (!rt) {
-				RADIX_NODE_HEAD_RUNLOCK(rnh);
+				RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 				senderr(ESRCH);
 			}
 		}
@@ -695,13 +697,13 @@
 			 */
 			rt = (struct rtentry *)rnh->rnh_matchaddr(&laddr, rnh);
 			if (rt == NULL) {
-				RADIX_NODE_HEAD_RUNLOCK(rnh);
+				RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 				senderr(ESRCH);
 			}
 		} 
 		RT_LOCK(rt);
 		RT_ADDREF(rt);
-		RADIX_NODE_HEAD_RUNLOCK(rnh);
+		RADIX_NODE_HEAD_RUNLOCK(rnh, &tracker);
 
 		/* 
 		 * Fix for PR: 82974
diff -u -r ../src_org_8.2_20110329/sys/netinet/icmp_var.h ./sys/netinet/icmp_var.h
--- ../src_org_8.2_20110329/sys/netinet/icmp_var.h	2009-08-03 08:13:06.000000000 +0000
+++ ./sys/netinet/icmp_var.h	2011-04-03 16:07:57.000000000 +0000
@@ -102,7 +102,11 @@
 #define BANDLIM_RST_CLOSEDPORT 3 /* No connection, and no listeners */
 #define BANDLIM_RST_OPENPORT 4   /* No connection, listener */
 #define BANDLIM_ICMP6_UNREACH 5
-#define BANDLIM_MAX 5
+#define BANDLIM_ICMP_FWD_UNREACH 6 /* forwarding: limit unreachable */
+#define BANDLIM_ICMP_FWD_TIMXCEED 7 /* forwarding: limit time-exceeded */
+#define BANDLIM_ICMP_FWD_NEEDFRAG 8 /* forwarding: limit need-frag */
+#define BANDLIM_ICMP_FWD_FILTER 9 /* forwarding: limit admin-prohib */
+#define BANDLIM_MAX 9
 #endif
 
 #endif
diff -u -r ../src_org_8.2_20110329/sys/netinet/in.c ./sys/netinet/in.c
--- ../src_org_8.2_20110329/sys/netinet/in.c	2011-04-12 13:02:02.000000000 +0000
+++ ./sys/netinet/in.c	2011-04-14 16:22:41.000000000 +0000
@@ -1393,12 +1393,22 @@
 in_lltable_rtcheck(struct ifnet *ifp, u_int flags, const struct sockaddr *l3addr)
 {
 	struct rtentry *rt;
+#ifdef RADIX_MPATH
+	struct route ro;
+#endif
 
 	KASSERT(l3addr->sa_family == AF_INET,
 	    ("sin_family %d", l3addr->sa_family));
 
+#ifdef RADIX_MPATH
+	bzero( &ro, sizeof(ro));
+	bcopy( __DECONST(struct sockaddr *, l3addr), &ro.ro_dst, sizeof(struct sockaddr));
+	rtalloc_mpath_fib_flags( (struct route *)&ro, RTF_ANNOUNCE, 0, RTF_GATEWAY);
+	rt = ro.ro_rt;
+#else
 	/* XXX rtalloc1 should take a const param */
 	rt = rtalloc1(__DECONST(struct sockaddr *, l3addr), 0, 0);
+#endif
 	if (rt == NULL || (!(flags & LLE_PUB) &&
 			   ((rt->rt_flags & RTF_GATEWAY) || 
 			    (rt->rt_ifp != ifp)))) {
@@ -1406,11 +1416,20 @@
 		log(LOG_INFO, "IPv4 address: \"%s\" is not on the network\n",
 		    inet_ntoa(((const struct sockaddr_in *)l3addr)->sin_addr));
 #endif
+#ifdef RADIX_MPATH
+		if (rt != NULL)
+			RTFREE(rt);
+#else
 		if (rt != NULL)
 			RTFREE_LOCKED(rt);
+#endif
 		return (EINVAL);
 	}
+#ifdef RADIX_MPATH
+	RTFREE(rt);
+#else
 	RTFREE_LOCKED(rt);
+#endif
 	return 0;
 }
 
diff -u -r ../src_org_8.2_20110329/sys/netinet/in_rmx.c ./sys/netinet/in_rmx.c
--- ../src_org_8.2_20110329/sys/netinet/in_rmx.c	2010-10-11 11:25:37.000000000 +0000
+++ ./sys/netinet/in_rmx.c	2011-04-14 16:23:13.000000000 +0000
@@ -51,6 +51,8 @@
 #include <sys/mbuf.h>
 #include <sys/syslog.h>
 #include <sys/callout.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/route.h>
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_fastfwd.c ./sys/netinet/ip_fastfwd.c
--- ../src_org_8.2_20110329/sys/netinet/ip_fastfwd.c	2010-12-10 14:06:50.000000000 +0000
+++ ./sys/netinet/ip_fastfwd.c	2011-04-07 02:24:01.000000000 +0000
@@ -94,6 +94,9 @@
 #include <net/if_var.h>
 #include <net/if_dl.h>
 #include <net/route.h>
+#ifdef RADIX_MPATH
+#include <net/radix_mpath.h>
+#endif
 #include <net/vnet.h>
 
 #include <netinet/in.h>
@@ -102,6 +105,7 @@
 #include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_icmp.h>
+#include <netinet/icmp_var.h>
 #include <netinet/ip_options.h>
 
 #include <machine/in_cksum.h>
@@ -113,7 +117,11 @@
     &VNET_NAME(ipfastforward_active), 0, "Enable fast IP forwarding");
 
 static struct sockaddr_in *
-ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m)
+#ifdef RADIX_MPATH
+ip_findroute(struct route *ro, uint32_t hash, struct in_addr dest, struct mbuf *m, struct rtlookup *rtl)
+#else
+ip_findroute(struct route *ro, struct in_addr dest, struct mbuf *m, struct rtlookup *rtl)
+#endif
 {
 	struct sockaddr_in *dst;
 	struct rtentry *rt;
@@ -126,7 +134,16 @@
 	dst->sin_family = AF_INET;
 	dst->sin_len = sizeof(*dst);
 	dst->sin_addr.s_addr = dest.s_addr;
-	in_rtalloc_ign(ro, 0, M_GETFIB(m));
+
+#ifdef RADIX_MPATH
+	if (!rtlookup_mpath_fib((struct sockaddr *)dst,
+			hash, M_GETFIB(m),  rtl, RTL_PKSENT))
+#else
+	if (!rtlookup_fib( (struct sockaddr *)dst, M_GETFIB(m), rtl, RTL_PKSENT))
+#endif
+		ro->ro_rt = NULL;
+	else
+		ro->ro_rt = (struct rtentry *)rtl;
 
 	/*
 	 * Route there and interface still up?
@@ -140,9 +157,10 @@
 	} else {
 		IPSTAT_INC(ips_noroute);
 		IPSTAT_INC(ips_cantforward);
-		if (rt)
-			RTFREE(rt);
-		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+		if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+			m_freem(m);
+		else
+			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		return NULL;
 	}
 	return dst;
@@ -167,6 +185,8 @@
 	u_short sum, ip_len;
 	int error = 0;
 	int hlen, mtu;
+	struct rtlookup rtl;
+	struct sockaddr_dl gateway;
 #ifdef IPFIREWALL_FORWARD
 	struct m_tag *fwd_tag;
 #endif
@@ -299,8 +319,11 @@
 		if (ip_doopts == 1)
 			return m;
 		else if (ip_doopts == 2) {
-			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_FILTER_PROHIB,
-				0, 0);
+			if (badport_bandlim(BANDLIM_ICMP_FWD_FILTER) < 0)
+				m_freem(m);
+			else
+				icmp_error(m, ICMP_UNREACH,
+					ICMP_UNREACH_FILTER_PROHIB, 0, 0);
 			return NULL;	/* mbuf already free'd */
 		}
 		/* else ignore IP options and continue */
@@ -399,7 +422,11 @@
 	if (!V_ipstealth) {
 #endif
 	if (ip->ip_ttl <= IPTTLDEC) {
-		icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS, 0, 0);
+		if (badport_bandlim(BANDLIM_ICMP_FWD_TIMXCEED) < 0)
+			m_freem(m);
+		else
+			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
+				0, 0);
 		return NULL;	/* mbuf already free'd */
 	}
 
@@ -420,7 +447,16 @@
 	/*
 	 * Find route to destination.
 	 */
-	if ((dst = ip_findroute(&ro, dest, m)) == NULL)
+	bzero( &gateway, sizeof(gateway));
+	gateway.sdl_len = sizeof(gateway);
+	rtl.rt_gateway = (struct sockaddr *)&gateway;
+#ifdef RADIX_MPATH
+	if ((dst = ip_findroute(&ro, ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
+			dest, m, &rtl)) == NULL)
+#else
+	if ((dst = ip_findroute(&ro,
+			dest, m, &rtl)) == NULL)
+#endif
 		return NULL;	/* icmp unreach already sent */
 	ifp = ro.ro_rt->rt_ifp;
 
@@ -476,8 +512,6 @@
 			 * "ours"-label.
 			 */
 			m->m_flags |= M_FASTFWD_OURS;
-			if (ro.ro_rt)
-				RTFREE(ro.ro_rt);
 			return m;
 		}
 		/*
@@ -490,8 +524,20 @@
 			m_tag_delete(m, fwd_tag);
 		}
 #endif /* IPFIREWALL_FORWARD */
-		RTFREE(ro.ro_rt);
-		if ((dst = ip_findroute(&ro, dest, m)) == NULL)
+		bzero( &gateway, sizeof(gateway));
+		gateway.sdl_len = sizeof(gateway);
+		rtl.rt_gateway = (struct sockaddr *)&gateway;
+		/*
+		 * ip_findroute() takes an extra flow-hash argument when
+		 * RADIX_MPATH is defined; compute it as for the first lookup.
+		 * NOTE(review): assumes ip still points at the current header
+		 * here - confirm against the surrounding code.
+		 */
+#ifdef RADIX_MPATH
+		if ((dst = ip_findroute(&ro,
+				ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
+				dest, m, &rtl)) == NULL)
+#else
+		if ((dst = ip_findroute(&ro, dest, m, &rtl)) == NULL)
+#endif
 			return NULL;	/* icmp unreach already sent */
 		ifp = ro.ro_rt->rt_ifp;
 	}
@@ -507,6 +543,8 @@
 	if ((ro.ro_rt->rt_flags & RTF_REJECT) &&
 	    (ro.ro_rt->rt_rmx.rmx_expire == 0 ||
 	    time_uptime < ro.ro_rt->rt_rmx.rmx_expire)) {
+		if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+			goto drop;
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		goto consumed;
 	}
@@ -527,6 +565,8 @@
 	 * Check if media link state of interface is not down
 	 */
 	if (ifp->if_link_state == LINK_STATE_DOWN) {
+		if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+			goto drop;
 		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		goto consumed;
 	}
@@ -557,8 +597,9 @@
 		 */
 		if (ip->ip_off & IP_DF) {
 			IPSTAT_INC(ips_cantfrag);
-			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG,
-				0, mtu);
+			if (badport_bandlim(BANDLIM_ICMP_FWD_NEEDFRAG) < 0)
+				goto drop;
+			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_NEEDFRAG, 0, mtu);
 			goto consumed;
 		} else {
 			/*
@@ -606,12 +647,9 @@
 		IPSTAT_INC(ips_fastforward);
 	}
 consumed:
-	RTFREE(ro.ro_rt);
 	return NULL;
 drop:
 	if (m)
 		m_freem(m);
-	if (ro.ro_rt)
-		RTFREE(ro.ro_rt);
 	return NULL;
 }
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_icmp.c ./sys/netinet/ip_icmp.c
--- ../src_org_8.2_20110329/sys/netinet/ip_icmp.c	2010-09-21 22:33:30.000000000 +0000
+++ ./sys/netinet/ip_icmp.c	2011-04-04 23:01:57.000000000 +0000
@@ -958,7 +958,11 @@
 		{ "icmp tstamp response" },
 		{ "closed port RST response" },
 		{ "open port RST response" },
-		{ "icmp6 unreach response" }
+		{ "icmp6 unreach response" },
+		{ "forwarding: limit unreachable" },
+		{ "forwarding: limit time-exceeded" },
+		{ "forwarding: limit need-frag" },
+		{ "forwarding: limit admin-prohib" }
 	};
 
 	/*
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_input.c ./sys/netinet/ip_input.c
--- ../src_org_8.2_20110329/sys/netinet/ip_input.c	2011-03-28 15:26:52.000000000 +0000
+++ ./sys/netinet/ip_input.c	2011-04-14 16:28:22.000000000 +0000
@@ -71,6 +71,7 @@
 #include <netinet/ip_var.h>
 #include <netinet/ip_fw.h>
 #include <netinet/ip_icmp.h>
+#include <netinet/icmp_var.h>
 #include <netinet/ip_options.h>
 #include <machine/in_cksum.h>
 #include <netinet/ip_carp.h>
@@ -1348,20 +1349,24 @@
 	struct route sro;
 	struct sockaddr_in *sin;
 	struct in_ifaddr *ia;
+	struct rtlookup rtl;
+	struct sockaddr_dl gateway;
 
 	bzero(&sro, sizeof(sro));
 	sin = (struct sockaddr_in *)&sro.ro_dst;
 	sin->sin_family = AF_INET;
 	sin->sin_len = sizeof(*sin);
 	sin->sin_addr = dst;
-	in_rtalloc_ign(&sro, 0, fibnum);
-
-	if (sro.ro_rt == NULL)
+	
+	bzero( &gateway, sizeof(gateway));
+	gateway.sdl_len = sizeof(gateway);
+	rtl.rt_gateway = (struct sockaddr *)&gateway;
+	if (!rtlookup_fib( (struct sockaddr *)&sro.ro_dst,
+			fibnum, &rtl, 0))
 		return (NULL);
 
-	ia = ifatoia(sro.ro_rt->rt_ifa);
+	ia = ifatoia(rtl.rt_ifa);
 	ifa_ref(&ia->ia_ifa);
-	RTFREE(sro.ro_rt);
 	return (ia);
 }
 
@@ -1397,6 +1402,8 @@
 	struct in_addr dest;
 	struct route ro;
 	int error, type = 0, code = 0, mtu = 0;
+	int icmp_send = 0;
+	struct rtlookup rtl;
 
 	if (m->m_flags & (M_BCAST|M_MCAST) || in_canforward(ip->ip_dst) == 0) {
 		IPSTAT_INC(ips_cantforward);
@@ -1407,8 +1414,11 @@
 	if (!V_ipstealth) {
 #endif
 		if (ip->ip_ttl <= IPTTLDEC) {
-			icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
-			    0, 0);
+			if (badport_bandlim(BANDLIM_ICMP_FWD_TIMXCEED) < 0)
+				m_freem(m);
+			else
+				icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
+					0, 0);
 			return;
 		}
 #ifdef IPSTEALTH
@@ -1423,7 +1433,10 @@
 	 * ip_output in case of outgoing IPsec policy.
 	 */
 	if (!srcrt && ia == NULL) {
-		icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
+		if (badport_bandlim(BANDLIM_ICMP_FWD_UNREACH) < 0)
+			m_freem(m);
+		else
+			icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, 0, 0);
 		return;
 	}
 #endif
@@ -1476,6 +1489,8 @@
 	 * and if packet was not source routed (or has any options).
 	 * Also, don't send redirect if forwarding using a default route
 	 * or a route modified by a redirect.
+	 * This part of the code also does not use rtlookup_fib(), because
+	 * rt_nodes is not supported.
 	 */
 	dest.s_addr = 0;
 	if (!srcrt && V_ipsendredirects &&
@@ -1517,13 +1532,13 @@
 	 * the ICMP_UNREACH_NEEDFRAG "Next-Hop MTU" field described in RFC1191.
 	 */
 	bzero(&ro, sizeof(ro));
+	bzero(&rtl, sizeof(rtl));
+	ro.ro_rt = (struct rtentry *)&rtl;
 
-	error = ip_output(m, NULL, &ro, IP_FORWARDING, NULL, NULL);
+	error = ip_output(m, NULL, &ro, IP_FORWARDING | IP_NORTFREE, NULL, NULL);
 
 	if (error == EMSGSIZE && ro.ro_rt)
 		mtu = ro.ro_rt->rt_rmx.rmx_mtu;
-	if (ro.ro_rt)
-		RTFREE(ro.ro_rt);
 
 	if (error)
 		IPSTAT_INC(ips_cantforward);
@@ -1558,11 +1573,13 @@
 	default:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_HOST;
+		icmp_send = badport_bandlim( BANDLIM_ICMP_FWD_UNREACH);
 		break;
 
 	case EMSGSIZE:
 		type = ICMP_UNREACH;
 		code = ICMP_UNREACH_NEEDFRAG;
+		icmp_send = badport_bandlim( BANDLIM_ICMP_FWD_NEEDFRAG);
 
 #ifdef IPSEC
 		/* 
@@ -1618,7 +1635,15 @@
 	}
 	if (ia != NULL)
 		ifa_free(&ia->ia_ifa);
-	icmp_error(mcopy, type, code, dest.s_addr, mtu);
+	/*
+	 * The original mbuf was handed to ip_output() above; only the
+	 * copy kept for the ICMP error is still ours to release when
+	 * the rate limiter suppresses the reply.
+	 */
+	if (icmp_send < 0)
+		m_freem(mcopy);
+	else
+		icmp_error(mcopy, type, code, dest.s_addr, mtu);
 }
 
 void
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_output.c ./sys/netinet/ip_output.c
--- ../src_org_8.2_20110329/sys/netinet/ip_output.c	2010-10-25 13:16:11.000000000 +0000
+++ ./sys/netinet/ip_output.c	2011-04-14 16:27:30.000000000 +0000
@@ -54,6 +54,7 @@
 
 #include <net/if.h>
 #include <net/if_llatbl.h>
+#include <net/if_dl.h>
 #include <net/netisr.h>
 #include <net/pfil.h>
 #include <net/route.h>
@@ -128,8 +129,13 @@
 	struct in_ifaddr *ia = NULL;
 	int isbroadcast, sw_csum;
 	struct route iproute;
+	struct rtlookup rtlnew;
+	struct rtlookup *rtl;
 	struct rtentry *rte;	/* cache for ro->ro_rt */
 	struct in_addr odst;
+	struct sockaddr_dl gateway;
+	int ro_provided = 1;
+	
 #ifdef IPFIREWALL_FORWARD
 	struct m_tag *fwd_tag = NULL;
 #endif
@@ -148,7 +154,9 @@
 	}
 
 	if (ro == NULL) {
+		ro_provided = 0;
 		ro = &iproute;
+		rtl = &rtlnew;
 		bzero(ro, sizeof (*ro));
 
 #ifdef FLOWTABLE
@@ -167,7 +175,15 @@
 			}
 		}
 #endif
-	}
+	} else {
+		if (ro->ro_rt && (flags & IP_NORTFREE)) {
+		        nortfree = 1;
+			/* extract rtl from provided ro, clear ro->ro_rt afterwards */
+	        	rtl = (struct rtlookup *)ro->ro_rt;
+			ro->ro_rt = NULL;
+		} else
+			rtl = &rtlnew;
+        }
 
 	if (opt) {
 		len = 0;
@@ -271,16 +287,39 @@
 		 * operation (as it is for ARP).
 		 */
 		if (rte == NULL) {
+			/* caller frees ro_rt itself: allocate a real rtentry */
+			if ((ro_provided) && ((flags & IP_NORTFREE) == 0)) {
+#ifdef RADIX_MPATH
+				rtalloc_mpath_fib(ro,
+				    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
+				    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+#else
+				in_rtalloc_ign(ro, 0,
+				    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+#endif
+				rte = ro->ro_rt;
+			} else { /* alright - use the fast lookup code */
+			        bzero( &gateway, sizeof(gateway));
+			        gateway.sdl_len = sizeof(gateway);
+				rtl->rt_gateway = (struct sockaddr *)&gateway;
 #ifdef RADIX_MPATH
-			rtalloc_mpath_fib(ro,
-			    ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
-			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+				if (!rtlookup_mpath_fib((struct sockaddr *)dst,
+						ntohl(ip->ip_src.s_addr ^ ip->ip_dst.s_addr),
+						inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m),
+						rtl, RTL_PKSENT))
 #else
-			in_rtalloc_ign(ro, 0,
-			    inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m));
+				if (!rtlookup_fib( (struct sockaddr *)dst,
+						inp ? inp->inp_inc.inc_fibnum : M_GETFIB(m),
+						rtl, RTL_PKSENT))
 #endif
-			rte = ro->ro_rt;
+				        ro->ro_rt = NULL;
+				else {
+					nortfree = 1;
+					ro->ro_rt = (struct rtentry *)rtl;
+				}
+			}
 		}
+		rte = ro->ro_rt;
 		if (rte == NULL ||
 		    rte->rt_ifp == NULL ||
 		    !RT_LINK_IS_UP(rte->rt_ifp)) {
diff -u -r ../src_org_8.2_20110329/sys/netinet/ip_var.h ./sys/netinet/ip_var.h
--- ../src_org_8.2_20110329/sys/netinet/ip_var.h	2010-09-18 01:54:28.000000000 +0000
+++ ./sys/netinet/ip_var.h	2011-04-07 02:22:05.000000000 +0000
@@ -157,6 +157,7 @@
 #define	IP_SENDTOIF		0x8		/* send on specific ifnet */
 #define IP_ROUTETOIF		SO_DONTROUTE	/* 0x10 bypass routing tables */
 #define IP_ALLOWBROADCAST	SO_BROADCAST	/* 0x20 can send broadcast packets */
+#define IP_NORTFREE		0x40		/* caller does not free ro_rt via RTFREE */
 
 /*
  * mbuf flag used by ip_fastfwd
diff -u -r ../src_org_8.2_20110329/sys/netinet/ipfw/ip_fw_table.c ./sys/netinet/ipfw/ip_fw_table.c
--- ../src_org_8.2_20110329/sys/netinet/ipfw/ip_fw_table.c	2010-03-23 09:58:59.000000000 +0000
+++ ./sys/netinet/ipfw/ip_fw_table.c	2011-04-03 16:07:57.000000000 +0000
@@ -137,7 +137,7 @@
 	mask.sin_addr.s_addr = htonl(mlen ? ~((1 << (32 - mlen)) - 1) : 0);
 	sa.sin_addr.s_addr = addr & mask.sin_addr.s_addr;
 	IPFW_WLOCK(ch);
-	ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh);
+	ent = (struct table_entry *)rnh->rnh_deladdr(&sa, &mask, rnh, NULL);
 	if (ent == NULL) {
 		IPFW_WUNLOCK(ch);
 		return (ESRCH);
@@ -154,7 +154,7 @@
 	struct table_entry *ent;
 
 	ent = (struct table_entry *)
-	    rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh);
+	    rnh->rnh_deladdr(rn->rn_key, rn->rn_mask, rnh, NULL);
 	if (ent != NULL)
 		free(ent, M_IPFW_TBL);
 	return (0);
diff -u -r ../src_org_8.2_20110329/sys/netinet6/in6_ifattach.c ./sys/netinet6/in6_ifattach.c
--- ../src_org_8.2_20110329/sys/netinet6/in6_ifattach.c	2010-05-06 06:44:19.000000000 +0000
+++ ./sys/netinet6/in6_ifattach.c	2011-04-03 16:07:57.000000000 +0000
@@ -42,6 +42,8 @@
 #include <sys/proc.h>
 #include <sys/syslog.h>
 #include <sys/md5.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_dl.h>
diff -u -r ../src_org_8.2_20110329/sys/netinet6/in6_rmx.c ./sys/netinet6/in6_rmx.c
--- ../src_org_8.2_20110329/sys/netinet6/in6_rmx.c	2010-10-11 11:25:37.000000000 +0000
+++ ./sys/netinet6/in6_rmx.c	2011-04-03 16:07:57.000000000 +0000
@@ -87,6 +87,7 @@
 #include <sys/rwlock.h>
 #include <sys/syslog.h>
 #include <sys/callout.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/route.h>
diff -u -r ../src_org_8.2_20110329/sys/netinet6/in6_src.c ./sys/netinet6/in6_src.c
--- ../src_org_8.2_20110329/sys/netinet6/in6_src.c	2011-04-12 13:02:04.000000000 +0000
+++ ./sys/netinet6/in6_src.c	2011-04-14 16:30:21.000000000 +0000
@@ -518,10 +518,10 @@
 	if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
 	    dstsock->sin6_addr.s6_addr32[1] == 0 &&
 	    !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
-		printf("in6_selectroute: strange destination %s\n",
+		printf("selectroute: strange destination %s\n",
 		       ip6_sprintf(ip6buf, &dstsock->sin6_addr));
 	} else {
-		printf("in6_selectroute: destination = %s%%%d\n",
+		printf("selectroute: destination = %s%%%d\n",
 		       ip6_sprintf(ip6buf, &dstsock->sin6_addr),
 		       dstsock->sin6_scope_id); /* for debug */
 	}
@@ -802,9 +802,259 @@
     struct ip6_moptions *mopts, struct route_in6 *ro,
     struct ifnet **retifp, struct rtentry **retrt)
 {
+        return (selectroute(dstsock, opts, mopts, ro, retifp,
+            retrt, 0));
+}
+ 
+/* Provides fast but minimal access to the routing table.
+ * Based on selectroute().
+ * XXX remove and do the lookup directly in ip6_output.
+ */
+int
+in6_lookup(struct sockaddr_in6 *dstsock, struct ip6_pktopts *opts,
+    struct ip6_moptions *mopts, struct route_in6 *ro, struct rtlookup *rtl,
+    struct ifnet **retifp, struct rtentry **retrt)
+{
+       int error = 0;
+       struct ifnet *ifp = NULL;
+       struct rtentry *rt = NULL;
+       struct sockaddr_in6 *sin6_next;
+       struct in6_pktinfo *pi = NULL;
+       struct in6_addr *dst = &dstsock->sin6_addr;
+       int norouteok = 0;
+       unsigned char sa_len = rtl->rt_gateway->sa_len;
+#if 0
+       char ip6buf[INET6_ADDRSTRLEN];
+
+       if (dstsock->sin6_addr.s6_addr32[0] == 0 &&
+           dstsock->sin6_addr.s6_addr32[1] == 0 &&
+           !IN6_IS_ADDR_LOOPBACK(&dstsock->sin6_addr)) {
+               printf("in6_lookup: strange destination %s\n",
+                      ip6_sprintf(ip6buf, &dstsock->sin6_addr));
+       } else {
+               printf("in6_lookup: destination = %s%%%d\n",
+                      ip6_sprintf(ip6buf, &dstsock->sin6_addr),
+                      dstsock->sin6_scope_id); /* for debug */
+       }
+#endif
+
+       /* If the caller specify the outgoing interface explicitly, use it. */
+       if (opts && (pi = opts->ip6po_pktinfo) != NULL && pi->ipi6_ifindex) {
+               /* XXX boundary check is assumed to be already done. */
+               ifp = ifnet_byindex(pi->ipi6_ifindex);
+               if (ifp != NULL &&
+                   (norouteok || retrt == NULL ||
+                   IN6_IS_ADDR_MULTICAST(dst))) {
+                       /*
+                        * we do not have to check or get the route for
+                        * multicast.
+                        */
+                       goto done;
+               } else
+                       goto getroute;
+       }
+
+       /*
+        * If the destination address is a multicast address and the outgoing
+        * interface for the address is specified by the caller, use it.
+        */
+       if (IN6_IS_ADDR_MULTICAST(dst) &&
+           mopts != NULL && (ifp = mopts->im6o_multicast_ifp) != NULL) {
+               goto done; /* we do not need a route for multicast. */
+       }
+
+  getroute:
+       /*
+        * If the next hop address for the packet is specified by the caller,
+        * use it as the gateway.
+        */
+       if (opts && opts->ip6po_nexthop) {
+               struct route_in6 *ron;
+               struct llentry *la;
+           
+               sin6_next = satosin6(opts->ip6po_nexthop);
+               
+               /* at this moment, we only support AF_INET6 next hops */
+               if (sin6_next->sin6_family != AF_INET6) {
+                       error = EAFNOSUPPORT; /* or should we proceed? */
+                       goto done;
+               }
+
+               /*
+                * If the next hop is an IPv6 address, then the node identified
+                * by that address must be a neighbor of the sending host.
+                */
+               ron = &opts->ip6po_nextroute;
+               /*
+                * XXX what do we do here?
+                * PLZ to be fixing
+                */
+
+               if (ron->ro_rt == NULL) {
+                       bzero( rtl->rt_gateway, sa_len);
+                       rtl->rt_gateway->sa_len = sa_len;
+                       if (!rtlookup_fib( (struct sockaddr *)&ron->ro_dst, 0U, /* multi path case? */
+                                       rtl, RTL_PKSENT)) {
+                               ron->ro_rt = NULL;
+                               error = EHOSTUNREACH;
+                               goto done;
+                       } else
+                               ron->ro_rt = (struct rtentry *) rtl;
+               }
+
+               rt = ron->ro_rt;
+               ifp = rt->rt_ifp;
+               IF_AFDATA_LOCK(ifp);
+               la = lla_lookup(LLTABLE6(ifp), 0, (struct sockaddr *)&sin6_next->sin6_addr);
+               IF_AFDATA_UNLOCK(ifp);
+               if (la != NULL) 
+                       LLE_RUNLOCK(la);
+               else {
+                       error = EHOSTUNREACH;
+                       goto done;
+               }
+#if 0
+               if ((ron->ro_rt &&
+                   (ron->ro_rt->rt_flags & (RTF_UP | RTF_LLINFO)) !=
+                   (RTF_UP | RTF_LLINFO)) ||
+                   !IN6_ARE_ADDR_EQUAL(&satosin6(&ron->ro_dst)->sin6_addr,
+                   &sin6_next->sin6_addr)) {
+                       if (ron->ro_rt)
+                               ron->ro_rt = NULL;
+                       *satosin6(&ron->ro_dst) = *sin6_next;
+               }
+               if (ron->ro_rt == NULL) {
+                       bzero( rtl->rt_gateway, sa_len);
+                       rtl->rt_gateway->sa_len = sa_len;
+                       if (!rtlookup_fib( (struct sockaddr *)&ron->ro_dst, 0U,
+                                       rtl, RTL_PKSENT)) {
+                               ron->ro_rt = NULL;
+                               error = EHOSTUNREACH;
+                               goto done;
+                       } else {
+                               ron->ro_rt = (struct rtentry *) rtl;
+                               if (!(ron->ro_rt->rt_flags & RTF_LLINFO)) {
+                                       ron->ro_rt = NULL;
+                                       error = EHOSTUNREACH;
+                                       goto done;
+                                }
+                       }
+               }
+#endif
+
+               /*
+                * When cloning is required, try to allocate a route to the
+                * destination so that the caller can store path MTU
+                * information.
+                */
+               goto done;
+       }
+
+       /*
+        * Use a cached route if it exists and is valid, else try to allocate
+        * a new one.  Note that we should check the address family of the
+        * cached destination, in case of sharing the cache with IPv4.
+        */
+       if (ro) {
+               if (ro->ro_rt &&
+                   (!(ro->ro_rt->rt_flags & RTF_UP) ||
+                    ((struct sockaddr *)(&ro->ro_dst))->sa_family != AF_INET6 ||
+                    !IN6_ARE_ADDR_EQUAL(&satosin6(&ro->ro_dst)->sin6_addr,
+                    dst)))
+                       ro->ro_rt = (struct rtentry *)NULL;
+               if (ro->ro_rt == (struct rtentry *)NULL) {
+                       struct sockaddr_in6 *sa6;
+
+                       /* No route yet, so try to acquire one */
+                       bzero(&ro->ro_dst, sizeof(struct sockaddr_in6));
+                       sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
+                       *sa6 = *dstsock;
+                       sa6->sin6_scope_id = 0;
+
+                       bzero( rtl->rt_gateway, sa_len);
+                       rtl->rt_gateway->sa_len = sa_len;
+#ifdef RADIX_MPATH
+                       if (!rtlookup_mpath_fib((struct sockaddr *)&ro->ro_dst,
+                                       ntohl(sa6->sin6_addr.s6_addr32[3]),
+                                       0U, rtl, RTL_PKSENT))
+#else
+                       if (!rtlookup_fib((struct sockaddr *)&ro->ro_dst, 0U,
+                                       rtl, RTL_PKSENT))
+#endif
+                               ro->ro_rt = NULL;
+                       else
+                               ro->ro_rt = (struct rtentry *) rtl;
+               }
+                               
+               /*
+                * do not care about the result if we have the nexthop
+                * explicitly specified.
+                */
+               if (opts && opts->ip6po_nexthop)
+                       goto done;
+
+               if (ro->ro_rt) {
+                       ifp = ro->ro_rt->rt_ifp;
+
+                       if (ifp == NULL) { /* can this really happen? */
+                               ro->ro_rt = NULL;
+                       }
+               }
+               if (ro->ro_rt == NULL)
+                       error = EHOSTUNREACH;
+               rt = ro->ro_rt;
+
+               /*
+                * Check if the outgoing interface conflicts with
+                * the interface specified by ipi6_ifindex (if specified).
+                * Note that loopback interface is always okay.
+                * (this may happen when we are sending a packet to one of
+                *  our own addresses.)
+                */
+               if (ifp && opts && opts->ip6po_pktinfo &&
+                   opts->ip6po_pktinfo->ipi6_ifindex) {
+                       if (!(ifp->if_flags & IFF_LOOPBACK) &&
+                           ifp->if_index !=
+                           opts->ip6po_pktinfo->ipi6_ifindex) {
+                               error = EHOSTUNREACH;
+                               goto done;
+                       }
+               }
+       }
+
+  done:
+       if (ifp == NULL && rt == NULL) {
+               /*
+                * This can happen if the caller did not pass a cached route
+                * nor any other hints.  We treat this case an error.
+                */
+               error = EHOSTUNREACH;
+       }
+       if (error == EHOSTUNREACH)
+               V_ip6stat.ip6s_noroute++;
+
+       if (retifp != NULL) {
+               *retifp = ifp;
+
+               /*
+                * Adjust the "outgoing" interface.  If we're going to loop 
+                * the packet back to ourselves, the ifp would be the loopback 
+                * interface. However, we'd rather know the interface associated 
+                * to the destination address (which should probably be one of 
+                * our own addresses.)
+                */
+               if (rt) {
+                       if ((rt->rt_ifp->if_flags & IFF_LOOPBACK) &&
+                           (rt->rt_gateway->sa_family == AF_LINK))
+                               *retifp = 
+                                       ifnet_byindex(((struct sockaddr_dl *)
+                                                      rt->rt_gateway)->sdl_index);
+               }
+       }
+       if (retrt != NULL)
+               *retrt = rt;    /* rt may be NULL */
 
-	return (selectroute(dstsock, opts, mopts, ro, retifp,
-	    retrt, 0));
+       return (error);
 }
 
 /*
diff -u -r ../src_org_8.2_20110329/sys/netinet6/ip6_forward.c ./sys/netinet6/ip6_forward.c
--- ../src_org_8.2_20110329/sys/netinet6/ip6_forward.c	2010-02-07 09:00:22.000000000 +0000
+++ ./sys/netinet6/ip6_forward.c	2011-04-14 16:31:13.000000000 +0000
@@ -50,6 +50,7 @@
 #include <sys/syslog.h>
 
 #include <net/if.h>
+#include <net/if_dl.h>
 #include <net/route.h>
 #include <net/pfil.h>
 
@@ -99,9 +100,12 @@
 	struct ifnet *origifp;	/* maybe unnecessary */
 	u_int32_t inzone, outzone;
 	struct in6_addr src_in6, dst_in6;
+	struct rtlookup rtl;
+	struct sockaddr_dl gateway;
 #ifdef IPSEC
 	struct secpolicy *sp = NULL;
 	int ipsecrt = 0;
+	int nortfree = 0;
 #endif
 	char ip6bufs[INET6_ADDRSTRLEN], ip6bufd[INET6_ADDRSTRLEN];
 
@@ -283,7 +287,7 @@
 	state.ro = NULL;	/* update at ipsec6_output_tunnel() */
 	state.dst = NULL;	/* update at ipsec6_output_tunnel() */
 
-	error = ipsec6_output_tunnel(&state, sp, 0);
+	error = ipsec6_output_tunnel(&state, sp, 0, &rtl, (struct sockaddr *)&gateway, &nortfree);
 
 	m = state.m;
 	KEY_FREESP(&sp);
@@ -352,18 +356,29 @@
 	dst->sin6_family = AF_INET6;
 	dst->sin6_addr = ip6->ip6_dst;
 
-	rin6.ro_rt = rtalloc1((struct sockaddr *)dst, 0, 0);
-	if (rin6.ro_rt != NULL)
-		RT_UNLOCK(rin6.ro_rt);
-	else {
+	bzero( &gateway, sizeof(gateway));
+	gateway.sdl_len = sizeof(gateway);
+	rtl.rt_gateway = (struct sockaddr *)&gateway;
+#ifdef RADIX_MPATH
+	src_in6 = ip6->ip6_src;
+	dst_in6 = ip6->ip6_dst;
+	if (!rtlookup_mpath_fib((struct sockaddr *)&rin6.ro_dst,
+			ntohl(src_in6->sin6_addr.s6_addr32[3] ^ dst_in6->sin6_addr.s6_addr32[3]),
+			0U, &rtl, RTL_PKSENT)) {
+#else
+	if (!rtlookup_fib( (struct sockaddr *)&rin6.ro_dst, 0U, &rtl,
+			RTL_PKSENT)) {
+#endif
+		rin6.ro_rt = NULL;
 		V_ip6stat.ip6s_noroute++;
 		in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_noroute);
 		if (mcopy) {
 			icmp6_error(mcopy, ICMP6_DST_UNREACH,
-			ICMP6_DST_UNREACH_NOROUTE, 0);
+				ICMP6_DST_UNREACH_NOROUTE, 0);
 		}
 		goto bad;
-	}
+	} else
+		rin6.ro_rt = (struct rtentry *) &rtl;
 	rt = rin6.ro_rt;
 #ifdef IPSEC
 skip_routing:
@@ -580,12 +595,12 @@
 
 senderr:
 	if (mcopy == NULL)
-		goto out;
+		return;
 	switch (error) {
 	case 0:
 		if (type == ND_REDIRECT) {
 			icmp6_redirect_output(mcopy, rt);
-			goto out;
+			return;
 		}
 		goto freecopy;
 
@@ -607,18 +622,11 @@
 		break;
 	}
 	icmp6_error(mcopy, type, code, 0);
-	goto out;
+	return;
 
  freecopy:
 	m_freem(mcopy);
-	goto out;
+	return;
 bad:
 	m_freem(m);
-out:
-	if (rt != NULL
-#ifdef IPSEC
-	    && !ipsecrt
-#endif
-	    )
-		RTFREE(rt);
 }
diff -u -r ../src_org_8.2_20110329/sys/netinet6/ip6_input.c ./sys/netinet6/ip6_input.c
--- ../src_org_8.2_20110329/sys/netinet6/ip6_input.c	2010-09-18 01:54:28.000000000 +0000
+++ ./sys/netinet6/ip6_input.c	2011-04-07 02:14:28.000000000 +0000
@@ -313,6 +313,8 @@
 	int srcrt = 0;
 	struct llentry *lle = NULL;
 	struct sockaddr_in6 dst6, *dst;
+	struct rtlookup rtl;
+	struct sockaddr_dl gateway;
 
 	bzero(&rin6, sizeof(struct route_in6));
 #ifdef IPSEC
@@ -603,11 +605,15 @@
 	dst->sin6_len = sizeof(struct sockaddr_in6);
 	dst->sin6_family = AF_INET6;
 	dst->sin6_addr = ip6->ip6_dst;
-	rin6.ro_rt = rtalloc1((struct sockaddr *)dst, 0, 0);
-	if (rin6.ro_rt)
-		RT_UNLOCK(rin6.ro_rt);
 
-#define rt6_key(r) ((struct sockaddr_in6 *)((r)->rt_nodes->rn_key))
+	bzero( &gateway, sizeof(gateway));
+	gateway.sdl_len = sizeof(gateway);
+	rtl.rt_gateway = (struct sockaddr *)&gateway;
+	if (!rtlookup_fib( (struct sockaddr *)dst,
+	    0U, &rtl, 0))
+		rin6.ro_rt = NULL;
+	else
+		rin6.ro_rt = (struct rtentry *)&rtl;
 
 	/*
 	 * Accept the packet if the forwarding interface to the destination
@@ -638,15 +644,6 @@
 #ifdef RTF_CLONED
 	    !(rin6.ro_rt->rt_flags & RTF_CLONED) &&
 #endif
-#if 0
-	    /*
-	     * The check below is redundant since the comparison of
-	     * the destination and the key of the rtentry has
-	     * already done through looking up the routing table.
-	     */
-	    IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
-	    &rt6_key(rin6.ro_rt)->sin6_addr)
-#endif
 	    rin6.ro_rt->rt_ifp->if_type == IFT_LOOP) {
 		int free_ia6 = 0;
 		struct in6_ifaddr *ia6;
@@ -763,7 +760,7 @@
 #if 0	/*touches NULL pointer*/
 			in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_discard);
 #endif
-			goto out;	/* m have already been freed */
+			return;	/* m has already been freed */
 		}
 
 		/* adjust pointer */
@@ -786,7 +783,7 @@
 			icmp6_error(m, ICMP6_PARAM_PROB,
 				    ICMP6_PARAMPROB_HEADER,
 				    (caddr_t)&ip6->ip6_plen - (caddr_t)ip6);
-			goto out;
+			return;
 		}
 #ifndef PULLDOWN_TEST
 		/* ip6_hopopts_input() ensures that mbuf is contiguous */
@@ -796,7 +793,7 @@
 			sizeof(struct ip6_hbh));
 		if (hbh == NULL) {
 			V_ip6stat.ip6s_tooshort++;
-			goto out;
+			return;
 		}
 #endif
 		nxt = hbh->ip6h_nxt;
@@ -868,7 +865,7 @@
 		}
 	} else if (!ours) {
 		ip6_forward(m, srcrt);
-		goto out;
+		return;
 	}
 
 	ip6 = mtod(m, struct ip6_hdr *);
@@ -931,12 +928,9 @@
 
 		nxt = (*inet6sw[ip6_protox[nxt]].pr_input)(&m, &off, nxt);
 	}
-	goto out;
+	return;
 bad:
 	m_freem(m);
-out:
-	if (rin6.ro_rt)
-		RTFREE(rin6.ro_rt);
 }
 
 /*
diff -u -r ../src_org_8.2_20110329/sys/netinet6/ip6_output.c ./sys/netinet6/ip6_output.c
--- ../src_org_8.2_20110329/sys/netinet6/ip6_output.c	2010-10-25 13:16:11.000000000 +0000
+++ ./sys/netinet6/ip6_output.c	2011-04-14 16:33:04.000000000 +0000
@@ -82,6 +82,7 @@
 #include <sys/ucred.h>
 
 #include <net/if.h>
+#include <net/if_dl.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/pfil.h>
@@ -135,7 +136,8 @@
 static int ip6_insert_jumboopt(struct ip6_exthdrs *, u_int32_t);
 static int ip6_splithdr(struct mbuf *, struct ip6_exthdrs *);
 static int ip6_getpmtu __P((struct route_in6 *, struct route_in6 *,
-	struct ifnet *, struct in6_addr *, u_long *, int *));
+	struct ifnet *, struct in6_addr *, u_long *, int *, int *, 
+	struct rtlookup *, struct sockaddr_dl *));
 static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
 
 
@@ -213,6 +215,10 @@
 	struct route_in6 *ro_pmtu = NULL;
 	int hdrsplit = 0;
 	int needipsec = 0;
+	struct rtlookup rtl;
+	struct sockaddr_dl gateway;
+	int ro_provided = 1;
+	int nortfree = 0;
 #ifdef SCTP
 	int sw_csum;
 #endif
@@ -463,6 +469,7 @@
 	 * Route packet.
 	 */
 	if (ro == 0) {
+	        ro_provided = 0;
 		ro = &ip6route;
 		bzero((caddr_t)ro, sizeof(*ro));
 	}
@@ -521,7 +528,15 @@
 		state.ro = (struct route *)ro;
 		state.dst = (struct sockaddr *)dst;
 
-		error = ipsec6_output_tunnel(&state, sp, flags);
+		/* Use rtlookup_fib() (through rtl) only when the caller passed
+		 * no route: a caller-supplied ro may later be RTFREE'd by it.
+		 */
+		if (ro_provided)
+			error = ipsec6_output_tunnel(&state, sp, flags, NULL, 
+			    NULL, &nortfree);
+		else
+			error = ipsec6_output_tunnel(&state, sp, flags, &rtl,
+			    (struct sockaddr *)&gateway, &nortfree);
 
 		m = state.m;
 		ro = (struct route_in6 *)state.ro;
@@ -576,15 +591,26 @@
 	dst_sa.sin6_family = AF_INET6;
 	dst_sa.sin6_len = sizeof(dst_sa);
 	dst_sa.sin6_addr = ip6->ip6_dst;
-	if ((error = in6_selectroute(&dst_sa, opt, im6o, ro,
-	    &ifp, &rt)) != 0) {
+	/* A caller-supplied ro may be freed by the caller, so it keeps using
+	 * in6_selectroute(); otherwise look up into the local rtl.
+	 */
+	if (ro_provided) {
+		error = in6_selectroute(&dst_sa, opt, im6o, ro, &ifp, &rt);
+	} else {
+		bzero( &gateway, sizeof(gateway));
+		gateway.sdl_len = sizeof(gateway);
+		rtl.rt_gateway = (struct sockaddr *)&gateway;
+		error = in6_lookup(&dst_sa, opt, im6o, ro, &rtl, &ifp, &rt);
+		nortfree = 1;
+	}
+	if (error != 0) {
 		switch (error) {
-		case EHOSTUNREACH:
-			V_ip6stat.ip6s_noroute++;
-			break;
-		case EADDRNOTAVAIL:
-		default:
-			break; /* XXX statistics? */
+			case EHOSTUNREACH:
+				V_ip6stat.ip6s_noroute++;
+				break;
+			case EADDRNOTAVAIL:
+			default:
+				break; /* XXX statistics? */
 		}
 		if (ifp != NULL)
 			in6_ifstat_inc(ifp, ifs6_out_discard);
@@ -592,7 +618,7 @@
 	}
 	if (rt == NULL) {
 		/*
-		 * If in6_selectroute() does not return a route entry,
+		 * If in6_selectroute()/in6_lookup() does not return a route entry,
 		 * dst may not have been updated.
 		 */
 		*dst = dst_sa;	/* XXX */
@@ -743,11 +769,20 @@
 	if (ifpp)
 		*ifpp = ifp;
 
-	/* Determine path MTU. */
-	if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu,
-	    &alwaysfrag)) != 0)
-		goto bad;
-
+	/* Determine path MTU.
+	 * Pass rtl/gateway (rtlookup_fib() path) only when the caller gave
+	 * no route, since the caller may RTFREE a route it supplied.
+	 */
+	if (ro_provided) {
+		if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu,
+                    &alwaysfrag, &nortfree, NULL, NULL)) != 0)
+			goto bad;
+	} else {
+		if ((error = ip6_getpmtu(ro_pmtu, ro, ifp, &finaldst, &mtu,
+                    &alwaysfrag, &nortfree, &rtl, &gateway)) != 0)
+			goto bad;
+	}
+			
 	/*
 	 * The caller of this function may specify to use the minimum MTU
 	 * in some cases.
@@ -1071,9 +1106,9 @@
 		V_ip6stat.ip6s_fragmented++;
 
 done:
-	if (ro == &ip6route && ro->ro_rt) { /* brace necessary for RTFREE */
+	if (ro == &ip6route && ro->ro_rt && !nortfree) { /* brace necessary for RTFREE */
 		RTFREE(ro->ro_rt);
-	} else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt) {
+	} else if (ro_pmtu == &ip6route && ro_pmtu->ro_rt && !nortfree) {
 		RTFREE(ro_pmtu->ro_rt);
 	}
 #ifdef IPSEC
@@ -1264,7 +1299,7 @@
 static int
 ip6_getpmtu(struct route_in6 *ro_pmtu, struct route_in6 *ro,
     struct ifnet *ifp, struct in6_addr *dst, u_long *mtup,
-    int *alwaysfragp)
+    int *alwaysfragp, int *nortfree, struct rtlookup *rtl, struct sockaddr_dl *gateway)
 {
 	u_int32_t mtu = 0;
 	int alwaysfrag = 0;
@@ -1277,7 +1312,8 @@
 		if (ro_pmtu->ro_rt &&
 		    ((ro_pmtu->ro_rt->rt_flags & RTF_UP) == 0 ||
 		     !IN6_ARE_ADDR_EQUAL(&sa6_dst->sin6_addr, dst))) {
-			RTFREE(ro_pmtu->ro_rt);
+			if (rtl == NULL)
+				RTFREE(ro_pmtu->ro_rt);
 			ro_pmtu->ro_rt = (struct rtentry *)NULL;
 		}
 		if (ro_pmtu->ro_rt == NULL) {
@@ -1286,7 +1322,19 @@
 			sa6_dst->sin6_len = sizeof(struct sockaddr_in6);
 			sa6_dst->sin6_addr = *dst;
 
-			rtalloc((struct route *)ro_pmtu);
+			if (rtl == NULL)
+				rtalloc((struct route *)ro_pmtu);
+			else {
+                                bzero( gateway, sizeof( struct sockaddr_dl));
+                                gateway->sdl_len = sizeof(struct sockaddr_dl);
+                                rtl->rt_gateway = (struct sockaddr *)gateway;
+                                if (!rtlookup_fib( (struct sockaddr *)sa6_dst, 0, rtl, 0))
+					ro_pmtu->ro_rt = NULL;
+                                else {
+					ro_pmtu->ro_rt = (struct rtentry *)rtl;
+                                        *nortfree = 1;
+                                }
+			}
 		}
 	}
 	if (ro_pmtu->ro_rt) {
@@ -1887,7 +1935,7 @@
 				 * the outgoing interface.
 				 */
 				error = ip6_getpmtu(&sro, NULL, NULL,
-				    &in6p->in6p_faddr, &pmtu, NULL);
+				    &in6p->in6p_faddr, &pmtu, NULL, NULL, NULL, NULL);
 				if (sro.ro_rt)
 					RTFREE(sro.ro_rt);
 				if (error)
diff -u -r ../src_org_8.2_20110329/sys/netinet6/ip6_var.h ./sys/netinet6/ip6_var.h
--- ../src_org_8.2_20110329/sys/netinet6/ip6_var.h	2010-09-09 06:43:18.000000000 +0000
+++ ./sys/netinet6/ip6_var.h	2011-04-07 11:54:16.000000000 +0000
@@ -431,12 +431,16 @@
 int	dest6_input __P((struct mbuf **, int *, int));
 int	none_input __P((struct mbuf **, int *, int));
 
+#include <net/route.h>
 int	in6_selectsrc(struct sockaddr_in6 *, struct ip6_pktopts *,
 	struct inpcb *inp, struct route_in6 *, struct ucred *cred,
 	struct ifnet **, struct in6_addr *);
 int in6_selectroute __P((struct sockaddr_in6 *, struct ip6_pktopts *,
 	struct ip6_moptions *, struct route_in6 *, struct ifnet **,
 	struct rtentry **));
+int in6_lookup __P((struct sockaddr_in6 *, struct ip6_pktopts *,
+	struct ip6_moptions *, struct route_in6 *, struct rtlookup *,
+	struct ifnet **, struct rtentry **));
 u_int32_t ip6_randomid __P((void));
 u_int32_t ip6_randomflowlabel __P((void));
 #endif /* _KERNEL */
diff -u -r ../src_org_8.2_20110329/sys/netinet6/nd6_nbr.c ./sys/netinet6/nd6_nbr.c
--- ../src_org_8.2_20110329/sys/netinet6/nd6_nbr.c	2011-04-12 13:02:04.000000000 +0000
+++ ./sys/netinet6/nd6_nbr.c	2011-04-12 16:06:26.000000000 +0000
@@ -256,7 +256,11 @@
 			 */
 			if (need_proxy)
 				proxydl = *SDL(rt->rt_gateway);
+#ifdef RADIX_MPATH
+			RTFREE(rt);     /* rtalloc_mpath does not return a locked route */
+#else
 			RTFREE_LOCKED(rt);
+#endif
 		}
 		if (need_proxy) {
 			/*
diff -u -r ../src_org_8.2_20110329/sys/netinet6/nd6_rtr.c ./sys/netinet6/nd6_rtr.c
--- ../src_org_8.2_20110329/sys/netinet6/nd6_rtr.c	2010-05-06 06:44:19.000000000 +0000
+++ ./sys/netinet6/nd6_rtr.c	2011-04-03 16:07:57.000000000 +0000
@@ -48,6 +48,8 @@
 #include <sys/rwlock.h>
 #include <sys/syslog.h>
 #include <sys/queue.h>
+#include <sys/lock.h>
+#include <sys/rmlock.h>
 
 #include <net/if.h>
 #include <net/if_types.h>
diff -u -r ../src_org_8.2_20110329/sys/netipsec/ipsec6.h ./sys/netipsec/ipsec6.h
--- ../src_org_8.2_20110329/sys/netipsec/ipsec6.h	2010-09-18 01:54:28.000000000 +0000
+++ ./sys/netipsec/ipsec6.h	2011-04-08 12:57:45.000000000 +0000
@@ -72,7 +72,8 @@
 extern int ipsec6_output_trans __P((struct ipsec_output_state *, u_char *,
 	struct mbuf *, struct secpolicy *, int, int *));
 extern int ipsec6_output_tunnel __P((struct ipsec_output_state *,
-	struct secpolicy *, int));
+	struct secpolicy *, int, struct rtlookup *, struct sockaddr *,
+	int *));
 #endif /*_KERNEL*/
 
 #endif /*_NETIPSEC_IPSEC6_H_*/
diff -u -r ../src_org_8.2_20110329/sys/netipsec/ipsec_output.c ./sys/netipsec/ipsec_output.c
--- ../src_org_8.2_20110329/sys/netipsec/ipsec_output.c	2011-02-22 19:39:08.000000000 +0000
+++ ./sys/netipsec/ipsec_output.c	2011-04-08 12:56:38.000000000 +0000
@@ -44,6 +44,7 @@
 #include <sys/syslog.h>
 
 #include <net/if.h>
+#include <net/if_dl.h>
 #include <net/pfil.h>
 #include <net/route.h>
 #include <net/vnet.h>
@@ -752,7 +753,8 @@
  * IPsec output logic for IPv6, tunnel mode.
  */
 int
-ipsec6_output_tunnel(struct ipsec_output_state *state, struct secpolicy *sp, int flags)
+ipsec6_output_tunnel(struct ipsec_output_state *state, struct secpolicy *sp,
+    int flags, struct rtlookup *rtl, struct sockaddr *gateway, int *nortfree)
 {
 	struct ip6_hdr *ip6;
 	struct ipsecrequest *isr;
@@ -760,7 +762,7 @@
 	int error;
 	struct sockaddr_in6 *dst6;
 	struct mbuf *m;
-
+	
 	IPSEC_ASSERT(state != NULL, ("null state"));
 	IPSEC_ASSERT(state->m != NULL, ("null m"));
 	IPSEC_ASSERT(sp != NULL, ("null sp"));
@@ -836,7 +838,8 @@
 		if (state->ro->ro_rt
 		 && ((state->ro->ro_rt->rt_flags & RTF_UP) == 0
 		  || !IN6_ARE_ADDR_EQUAL(&dst6->sin6_addr, &ip6->ip6_dst))) {
-			RTFREE(state->ro->ro_rt);
+			if (rtl == NULL)
+				RTFREE(state->ro->ro_rt);
 			state->ro->ro_rt = NULL;
 		}
 		if (state->ro->ro_rt == NULL) {
@@ -844,7 +847,20 @@
 			dst6->sin6_family = AF_INET6;
 			dst6->sin6_len = sizeof(*dst6);
 			dst6->sin6_addr = ip6->ip6_dst;
-			rtalloc(state->ro);
+
+			if (rtl == NULL)
+				rtalloc(state->ro);
+			else {
+				bzero( gateway, sizeof( struct sockaddr_dl));
+				gateway->sa_len = sizeof(struct sockaddr_dl);
+				rtl->rt_gateway = (struct sockaddr *)gateway;
+				if (!rtlookup_fib( (struct sockaddr *)dst6, 0, rtl, 0))
+					state->ro->ro_rt = NULL;
+				else {
+					state->ro->ro_rt = (struct rtentry *)rtl;
+					*nortfree = 1;
+				}
+			}
 		}
 		if (state->ro->ro_rt == NULL) {
 			V_ip6stat.ip6s_noroute++;


More information about the freebsd-net mailing list