rewrite of rt_check() (now rt_check_fib())

Julian Elischer julian at elischer.org
Fri Sep 5 22:49:18 UTC 2008


Julian Elischer wrote:
> In tryin gto understand rt_check_fib() (wsa rt_check()) I ended up 
> rewriting it to do what I thought it was trying to do..
> this stops the panics some people have seen, but allows the system to 
> stay up long enough to see some other problem..
> anyhow this si the patch:
> 
> comments?
> 

I was thinking about this a bit.

rt_check_fib() drops the lock on the given rtentry in order to be able 
to get a lock on another rtentry that MIGHT be the same rtentry.

while it is doing this, another processor could free the original 
rtentry. The only safw way to get around this is to hold an additional 
reference on the first rtentry (we don't already have one) while it is 
unlocked so that we can be sure that it is not actually freed if this 
happens.

to do this safely I'd have to add a couple of new items into route.h:

#define RT_TEMP_UNLOCK(_rt) do {                                \
         RT_ADDREF(_rt);                                         \
         RT_UNLOCK(_rt);                                         \
} while (0)

#define RT_RELOCK(_rt) do {                                     \
         RT_LOCK(_rt)                                            \
         if ((_rt)->rt_refcnt <= 1)                              \
                 rtfree(_rt);                                    \
                 _rt = 0; /*  signal that it went away */        \
         else {                                                  \
                 RT_REMREF(_rt);                                 \
                 RT_UNLOCK(_rt);                                 \
                 /* note that _rt is still valid */              \
         }                                                       \
} while (0)


the new version of rt_check is attached.....
-------------- next part --------------
/*
 * rt_check() is invoked on each layer 2 output path, prior to
 * encapsulating outbound packets.
 *
 * The function is mostly used to find a routing entry for the gateway,
 * which in some protocol families could also point to the link-level
 * address for the gateway itself (the side effect of revalidating the
 * route to the destination is rather pointless at this stage, we did it
 * already a moment before in the pr_output() routine to locate the ifp
 * and gateway to use).
 *
 * When we remove the layer-3 to layer-2 mapping tables from the
 * routing table, this function can be removed.
 *
 * === On input ===
 *   *dst is the address of the NEXT HOP (which coincides with the
 *	final destination if directly reachable);
 *   *lrt0 points to the cached route to the final destination;
 *   *lrt is not meaningful;
 *    fibnum is the index to the correct network fib for this packet
 *	(*lrt0 has not ref held on it so REMREF is not needed )
 *
 * === Operation ===
 * If the route is marked down try to find a new route.  If the route
 * to the gateway is gone, try to setup a new route.  Otherwise,
 * if the route is marked for packets to be rejected, enforce that.
 * Note that rtalloc returns an rtentry with an extra REF that we need to lose.
 *
 * === On return ===
 *   *dst is unchanged;
 *   *lrt0 points to the (possibly new) route to the final destination
 *   *lrt points to the route to the next hop   [LOCKED]
 *
 * Their values are meaningful ONLY if no error is returned.
 *
 * To follow this you have to remember that:
 * RT_REMREF reduces the reference count by 1 but doesn't check it for 0 (!)
 * RTFREE_LOCKED includes an RT_REMREF (or an rtfree if refs == 1)
 *    and an RT_UNLOCK
 * RTFREE does an RT_LOCK and an RTFREE_LOCKED
 * The gwroute pointer counts as a reference on the rtentry to which it points.
 * so when we add it we use the ref that rtalloc gives us and when we lose it
 * we need to remove the reference.
 */
int
rt_check(struct rtentry **lrt, struct rtentry **lrt0, struct sockaddr *dst)
{
	return (rt_check_fib(lrt, lrt0, dst, 0));
}

int
rt_check_fib(struct rtentry **lrt, struct rtentry **lrt0, struct sockaddr *dst,
		u_int fibnum)
{
	struct rtentry *rt;
	struct rtentry *rt0;
	int error;

	KASSERT(*lrt0 != NULL, ("rt_check"));
	rt0 = *lrt0;
	rt = NULL;

	/* NB: the locking here is tortuous... */
	RT_LOCK(rt0);
retry:
	if (rt0 && (rt0->rt_flags & RTF_UP) == 0) {
		/* Current rt0 is useless, try get a replacement. */
		RT_UNLOCK(rt0);
		rt0 = NULL;
	}
	if (rt0 == NULL) {
		rt0 = rtalloc1_fib(dst, 1, 0UL, fibnum);
		if ((rt0 != NULL)/* && (rt0->rt_flags & RTF_UP) */) {
			RT_REMREF(rt0); /* don't need the reference. */
			/* XXX what about interface change? */
		} else {
			return (EHOSTUNREACH);
		}
	}

	/* XXX BSD/OS checks dst->sa_family != AF_NS */
	if (rt0->rt_flags & RTF_GATEWAY) {
		if ((rt = rt0->rt_gwroute) != NULL) {
			RT_LOCK(rt);		/* NB: gwroute */
			if ((rt->rt_flags & RTF_UP) == 0) {
				/* gw route is dud. ignore/lose it */
				RTFREE_LOCKED(rt); /* unref (&unlock) gwroute */
				rt = rt0->rt_gwroute = NULL;
			}
		}
		
		if (rt == NULL) {  /* NOT AN ELSE CLAUSE */
			RT_TEMP_UNLOCK(rt0); /* MUST return to undo this */
			/* XXX MRT link level looked up in table 0 */
			rt = rtalloc1_fib(rt0->rt_gateway, 1, 0UL, fibnum);
			if ((rt == rt0) || (rt == NULL)) {
				/* the best we can do is not good enough */
				if (rt) {
					RT_REMREF(rt); /* assumes ref > 0 */
					RT_UNLOCK(rt);
				}
				RT_RELOCK(rt0); /* lose the refcount */
				if (rt0)
					RT_UNLOCK(rt0);
				return (ENETUNREACH);
			}
			/*
			 * Relock it and lose the added reference.
			 * All sorts of things could have happenned while we
			 * had no lock on it, so check for them.
			 */
			RT_RELOCK(rt0);
			if (rt0 == NULL || ((rt0->rt_flags & RTF_UP) == 0))
				/* Ru-roh.. what we had is no longer any good */
				goto retry;
			/* 
			 * While we were away, someone replaced the gateway.
			 * Since a reference count is involved we can't just
			 * overwrite it.
			 */
			if (rt0->rt_gwroute) {
				if (rt0->rt_gwroute != rt) {
					RT_FREE_LOCKED(rt);
					goto retry;
				}
			} else {
				rt0->rt_gwroute = rt;
			}
		} else {
			RT_UNLOCK(rt0);
		}
	} else {
		/* think of rt as having the lock from now on.. */
		rt = rt0;
	}
	/* XXX why are we inspecting rmx_expire? */
	if ((rt->rt_flags & RTF_REJECT) &&
	    (rt->rt_rmx.rmx_expire == 0 ||
	    time_uptime < rt->rt_rmx.rmx_expire)) {
		RT_UNLOCK(rt);
		return (rt == rt0 ? EHOSTDOWN : EHOSTUNREACH);
	}

	*lrt = rt;
	*lrt0 = rt0;
	return (0);
}


More information about the freebsd-net mailing list