misc/143703: Patch: ECMP Phase 1 fixes for FreeBSD 7.2
Balaji G
balajig81 at gmail.com
Tue Feb 9 17:50:01 UTC 2010
>Number: 143703
>Category: misc
>Synopsis: Patch: ECMP Phase 1 fixes for FreeBSD 7.2
>Confidential: no
>Severity: non-critical
>Priority: medium
>Responsible: freebsd-bugs
>State: open
>Quarter:
>Keywords:
>Date-Required:
>Class: update
>Submitter-Id: current-users
>Arrival-Date: Tue Feb 09 17:50:01 UTC 2010
>Closed-Date:
>Last-Modified:
>Originator: Balaji G
>Release: FreeBSD 7.2
>Organization:
Home
>Environment:
FreeBSD 7.2-RELEASE FreeBSD 7.2-RELEASE #7: Sun Feb 7 13:19:58 IST 2010 root@:/usr/obj/usr/home/balaji/7.2.0/sys/MYKERNEL i386
>Description:
The patch contains the ECMP (equal-cost multipath) Phase 1 fixes backported to the 7.2 release. I am working on bringing in the remaining changes too. This patch installs ECMP routes in the RIB (routing table).
>How-To-Repeat:
Create two static routes with the same destination and different gateways.
>Fix:
Patch attached with submission follows:
diff -u -r /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/radix.c net/radix.c
--- /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/radix.c 2010-02-04 22:40:29.000000000 +0530
+++ net/radix.c 2010-02-09 21:10:19.731903885 +0530
@@ -48,6 +48,8 @@
#include <net/radix.h>
#endif
+#include <net/radix_mpath.h>
+
static int rn_walktree_from(struct radix_node_head *h, void *a, void *m,
walktree_f_t *f, void *w);
static int rn_walktree(struct radix_node_head *, walktree_f_t *, void *);
@@ -630,6 +632,21 @@
saved_tt = tt = rn_insert(v, head, &keyduplicated, treenodes);
if (keyduplicated) {
for (t = tt; tt; t = tt, tt = tt->rn_dupedkey) {
+
+ /* permit multipath, if enabled for the family */
+ if (rn_mpath_capable(head) && netmask == tt->rn_mask) {
+ /*
+ * go down to the end of multipaths, so that
+ * new entry goes into the end of rn_dupedkey
+ * chain.
+ */
+ do {
+ t = tt;
+ tt = tt->rn_dupedkey;
+ } while (tt && t->rn_mask == tt->rn_mask);
+ break;
+ }
+
if (tt->rn_mask == netmask)
return (0);
if (netmask == 0 ||
diff -u -r /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/radix.h net/radix.h
--- /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/radix.h 2010-02-05 08:38:18.000000000 +0530
+++ net/radix.h 2010-02-09 21:17:10.373883032 +0530
@@ -130,9 +130,9 @@
void (*rnh_close) /* do something when the last ref drops */
(struct radix_node *rn, struct radix_node_head *head);
struct radix_node rnh_nodes[3]; /* empty tree for common case */
- /* ECMP Changes Begin */
+
int rnh_multipath; /* multipath capable ? */
- /* ECMP Changes End */
+
#ifdef _KERNEL
struct mtx rnh_mtx; /* locks entire radix tree */
#endif
diff -u -r /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/radix_mpath.c net/radix_mpath.c
--- /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/radix_mpath.c 2010-02-05 08:30:52.000000000 +0530
+++ net/radix_mpath.c 2010-02-09 21:12:14.681638068 +0530
@@ -54,7 +54,7 @@
/*
* give some jitter to hash, to avoid synchronization between routers
*/
-static uint32_t hashjitter;
+static uint32_t hashjitter;
int
rn_mpath_capable(struct radix_node_head *rnh)
@@ -258,7 +258,6 @@
return 0;
}
-#if 0
void
rtalloc_mpath_fib(struct route *ro, uint32_t hash, u_int fibnum)
{
@@ -317,10 +316,9 @@
}
RT_UNLOCK(ro->ro_rt);
}
-#endif
extern int in6_inithead(void **head, int off);
-extern int in_inithead(void **head, int off);
+extern int in_inthead(void **head, int off);
#ifdef INET
int
@@ -352,5 +350,5 @@
} else
return 0;
}
-
#endif
+
Only in net: radix_mpath.h
diff -u -r /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/route.c net/route.c
--- /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/route.c 2010-02-04 22:40:29.000000000 +0530
+++ net/route.c 2010-02-09 21:16:21.754622762 +0530
@@ -886,6 +886,111 @@
return (rtrequest1_fib(req, info, ret_nrt, 0));
}
+static int
+rn_mpath_update(int req, struct rt_addrinfo *info,
+ struct radix_node_head *rnh, struct rtentry **ret_nrt)
+{
+ /*
+ * if we got multipath routes, we require users to specify
+ * a matching RTAX_GATEWAY.
+ */
+ struct rtentry *rt, *rto = NULL;
+ register struct radix_node *rn;
+ int error = 0;
+
+ rn = rnh->rnh_matchaddr(dst, rnh);
+ if (rn == NULL)
+ return (ESRCH);
+ rto = rt = RNTORT(rn);
+ rt = rt_mpath_matchgate(rt, gateway);
+ if (rt == NULL)
+ return (ESRCH);
+ /*
+ * this is the first entry in the chain
+ */
+ if (rto == rt) {
+ rn = rn_mpath_next((struct radix_node *)rt);
+ /*
+ * there is another entry, now it's active
+ */
+ if (rn) {
+ rto = RNTORT(rn);
+ RT_LOCK(rto);
+ rto->rt_flags |= RTF_UP;
+ RT_UNLOCK(rto);
+ } else if (rt->rt_flags & RTF_GATEWAY) {
+ /*
+ * For gateway routes, we need to
+ * make sure that we are deleting
+ * the correct gateway.
+ * rt_mpath_matchgate() does not
+ * check the case when there is only
+ * one route in the chain.
+ */
+ if (gateway &&
+ (rt->rt_gateway->sa_len != gateway->sa_len ||
+ memcmp(rt->rt_gateway, gateway, gateway->sa_len)))
+ error = ESRCH;
+ else {
+ /*
+ * remove from tree before returning it
+ * to the caller
+ */
+ rn = rnh->rnh_deladdr(dst, netmask, rnh);
+ KASSERT(rt == RNTORT(rn), ("radix node disappeared"));
+ goto gwdelete;
+ }
+
+ }
+ /*
+ * use the normal delete code to remove
+ * the first entry
+ */
+ if (req != RTM_DELETE)
+ goto nondelete;
+
+ error = ENOENT;
+ goto done;
+ }
+
+ /*
+ * if the entry is 2nd and on up
+ */
+ if ((req == RTM_DELETE) && !rt_mpath_deldup(rto, rt))
+ panic ("rtrequest1: rt_mpath_deldup");
+gwdelete:
+ RT_LOCK(rt);
+ RT_ADDREF(rt);
+ if (req == RTM_DELETE) {
+ rt->rt_flags &= ~RTF_UP;
+ /*
+ * One more rtentry floating around that is not
+ * linked to the routing table. rttrash will be decremented
+ * when RTFREE(rt) is eventually called.
+ */
+ rttrash++;
+ }
+
+nondelete:
+ if (req != RTM_DELETE)
+ panic("unrecognized request %d", req);
+
+
+ /*
+ * If the caller wants it, then it can have it,
+ * but it's up to it to free the rtentry as we won't be
+ * doing it.
+ */
+ if (ret_nrt) {
+ *ret_nrt = rt;
+ RT_UNLOCK(rt);
+ } else
+ RTFREE_LOCKED(rt);
+done:
+ return (error);
+}
+
+
int
rtrequest1_fib(int req, struct rt_addrinfo *info, struct rtentry **ret_nrt,
u_int fibnum)
@@ -923,6 +1028,17 @@
}
switch (req) {
case RTM_DELETE:
+
+ if (rn_mpath_capable(rnh)) {
+ error = rn_mpath_update(req, info, rnh, ret_nrt);
+ /*
+ * "bad" holds true for the success case
+ * as well
+ */
+ if (error != ENOENT)
+ goto bad;
+ }
+
/*
* Remove the item from the tree and return it.
* Complain if it is not there and do no more processing.
@@ -1046,6 +1162,18 @@
rt->rt_ifa = ifa;
rt->rt_ifp = ifa->ifa_ifp;
+ /* do not permit exactly the same dst/mask/gw pair */
+ if (rn_mpath_capable(rnh) &&
+ rt_mpath_conflict(rnh, rt, netmask)) {
+ if (rt->rt_ifa) {
+ IFAFREE(rt->rt_ifa);
+ }
+ Free(rt_key(rt));
+ RT_LOCK_DESTROY(rt);
+ uma_zfree(rtzone, rt);
+ senderr(EEXIST);
+ }
+
/* XXX mtu manipulation will be done in rnh_addaddr -- itojun */
rn = rnh->rnh_addaddr(ndst, netmask, rnh, rt->rt_nodes);
if (rn == NULL) {
@@ -1456,6 +1584,27 @@
/* this table doesn't exist but others might */
continue;
RADIX_NODE_HEAD_LOCK(rnh);
+ if (rn_mpath_capable(rnh)) {
+
+ rn = rnh->rnh_matchaddr(dst, rnh);
+ if (rn == NULL)
+ error = ESRCH;
+ else {
+ rt = RNTORT(rn);
+ /*
+ * for interface route the
+ * rt->rt_gateway is sockaddr_intf
+ * for cloning ARP entries, so
+ * rt_mpath_matchgate must use the
+ * interface address
+ */
+ rt = rt_mpath_matchgate(rt,
+ ifa->ifa_addr);
+ if (!rt)
+ error = ESRCH;
+ }
+ }
+ else
rn = rnh->rnh_lookup(dst, netmask, rnh);
error = (rn == NULL ||
(rn->rn_flags & RNF_ROOT) ||
@@ -1482,6 +1631,20 @@
* notify any listening routing agents of the change
*/
RT_LOCK(rt);
+ /*
+ * in case address alias finds the first address
+ * e.g. ifconfig bge0 192.103.54.246/24
+ * e.g. ifconfig bge0 192.103.54.247/24
+ * the address set in the route is 192.103.54.246
+ * so we need to replace it with 192.103.54.247
+ */
+ if (memcmp(rt->rt_ifa->ifa_addr,
+ ifa->ifa_addr, ifa->ifa_addr->sa_len)) {
+ IFAFREE(rt->rt_ifa);
+ IFAREF(ifa);
+ rt->rt_ifp = ifa->ifa_ifp;
+ rt->rt_ifa = ifa;
+ }
rt_newaddrmsg(cmd, ifa, error, rt);
if (cmd == RTM_DELETE) {
/*
diff -u -r /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/route.h net/route.h
--- /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/route.h 2010-02-04 22:40:29.000000000 +0530
+++ net/route.h 2010-02-09 21:18:10.257871360 +0530
@@ -58,6 +58,7 @@
u_long rmx_mtu; /* MTU for this path */
u_long rmx_expire; /* lifetime for route, e.g. redirect */
u_long rmx_pksent; /* packets sent using this route */
+ u_long rmx_weight;
};
struct rt_metrics {
@@ -101,6 +102,9 @@
#ifndef RNF_NORMAL
#include <net/radix.h>
#endif
+
+#include <net/radix_mpath.h>
+
struct rtentry {
struct radix_node rt_nodes[2]; /* tree glue, and other values */
/*
diff -u -r /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/rtsock.c net/rtsock.c
--- /home/balaji/Codes/FreeBSD/7.2.0/7.2.0_unmodified/7.2.0/sys/net/rtsock.c 2010-02-04 22:40:29.000000000 +0530
+++ net/rtsock.c 2010-02-09 21:18:37.274871425 +0530
@@ -536,6 +536,24 @@
RADIX_NODE_HEAD_UNLOCK(rnh);
senderr(ESRCH);
}
+
+ /*
+ * for RTM_CHANGE/LOCK, if we got multipath routes,
+ * we require users to specify a matching RTAX_GATEWAY.
+ *
+ * for RTM_GET, gate is optional even with multipath.
+ * if gate == NULL the first match is returned.
+ * (no need to call rt_mpath_matchgate if gate == NULL)
+ */
+ if (rn_mpath_capable(rnh) &&
+ (rtm->rtm_type != RTM_GET || info.rti_info[RTAX_GATEWAY])) {
+ rt = rt_mpath_matchgate(rt, info.rti_info[RTAX_GATEWAY]);
+ if (!rt) {
+ RADIX_NODE_HEAD_UNLOCK(rnh);
+ senderr(ESRCH);
+ }
+ }
+
RT_LOCK(rt);
RT_ADDREF(rt);
RADIX_NODE_HEAD_UNLOCK(rnh);
>Release-Note:
>Audit-Trail:
>Unformatted:
More information about the freebsd-bugs
mailing list