svn commit: r334671 - in head/sys: net netinet netinet6 netipsec

Andrey V. Elsukov ae at FreeBSD.org
Tue Jun 5 20:51:04 UTC 2018


Author: ae
Date: Tue Jun  5 20:51:01 2018
New Revision: 334671
URL: https://svnweb.freebsd.org/changeset/base/334671

Log:
  Rework IP encapsulation handling code.
  
  Currently it has several disadvantages:
  - it uses a single mutex to protect internal structures. It is used by
    both the data and control paths, thus there is no parallelism at all.
  - it uses a single list to keep encap handlers for both INET and INET6
    families.
  - struct encaptab keeps unneeded information (src, dst, masks, protosw),
    that isn't used by code in the source tree.
  - matches are prioritized and when many tunneling interfaces are
    registered, encapcheck handler of each interface is invoked for each
    packet. The search takes O(n) for n interfaces. All this work is done
    with exclusive lock held.
  
  What this patch includes:
  - the datapath is converted to be lockless using epoch(9) KPI.
  - struct encaptab now linked using CK_LIST.
  - all unused fields removed from struct encaptab. Several new fields
    added: min_length is the minimum packet length that an encapsulation
    handler expects to see; exact_match is the maximum number of bits that
    an encapsulation handler can return when it wants to consume a packet.
  - IPv6 and IPv4 handlers are stored in separate lists;
  - added new "encap_lookup_t" method, that will be used later. It is
    intended to speed up lookup of the needed interface when gif(4)/gre(4)
    have many interfaces.
  - the need to use protosw structure is eliminated. Only the pr_input
    method was used from this structure, so I don't see the need to keep
    using it.
  - encap_input_t method changed to avoid using mbuf tags to store softc
    pointer. Now it is passed directly through encap_input_t method.
    encap_getarg() function is removed.
  - all sockaddr structures and code that uses them removed. We don't have
    any code in the tree that uses them. All consumers use encap_attach_func()
    method, that relies on invoking of encapcheck() to determine the needed
    handler.
  - introduced struct encap_config, it contains parameters of encap handler
    that is going to be registered by encap_attach() function.
  - encap handlers are stored in lists ordered by exact_match value, thus
    handlers that need more bits to match will be checked first, and if
    encapcheck method returns exact_match value, the search will be stopped.
  - all current consumers changed to use new KPI.
  
  Reviewed by:	mmacy
  Sponsored by:	Yandex LLC
  Differential Revision:	https://reviews.freebsd.org/D15617

Modified:
  head/sys/net/if_gif.c
  head/sys/net/if_gre.c
  head/sys/net/if_gre.h
  head/sys/net/if_me.c
  head/sys/net/if_stf.c
  head/sys/netinet/in_gif.c
  head/sys/netinet/ip_encap.c
  head/sys/netinet/ip_encap.h
  head/sys/netinet/ip_gre.c
  head/sys/netinet/ip_mroute.c
  head/sys/netinet/pim_var.h
  head/sys/netinet6/in6_gif.c
  head/sys/netinet6/ip6_gre.c
  head/sys/netinet6/ip6_mroute.c
  head/sys/netinet6/pim6_var.h
  head/sys/netipsec/xform_ipcomp.c

Modified: head/sys/net/if_gif.c
==============================================================================
--- head/sys/net/if_gif.c	Tue Jun  5 20:41:06 2018	(r334670)
+++ head/sys/net/if_gif.c	Tue Jun  5 20:51:01 2018	(r334671)
@@ -923,12 +923,24 @@ bad:
 }
 
 static void
-gif_detach(struct gif_softc *sc)
+gif_detach(struct gif_softc *sc, int family)
 {
 
 	sx_assert(&gif_ioctl_sx, SA_XLOCKED);
-	if (sc->gif_ecookie != NULL)
-		encap_detach(sc->gif_ecookie);
+	if (sc->gif_ecookie != NULL) {
+		switch (family) {
+#ifdef INET
+		case AF_INET:
+			ip_encap_detach(sc->gif_ecookie);
+			break;
+#endif
+#ifdef INET6
+		case AF_INET6:
+			ip6_encap_detach(sc->gif_ecookie);
+			break;
+#endif
+		}
+	}
 	sc->gif_ecookie = NULL;
 }
 
@@ -1020,7 +1032,7 @@ gif_set_tunnel(struct ifnet *ifp, struct sockaddr *src
 	}
 
 	if (sc->gif_family != src->sa_family)
-		gif_detach(sc);
+		gif_detach(sc, sc->gif_family);
 	if (sc->gif_family == 0 ||
 	    sc->gif_family != src->sa_family)
 		error = gif_attach(sc, src->sa_family);
@@ -1058,7 +1070,7 @@ gif_delete_tunnel(struct ifnet *ifp)
 	sc->gif_family = 0;
 	GIF_WUNLOCK(sc);
 	if (family != 0) {
-		gif_detach(sc);
+		gif_detach(sc, family);
 		free(sc->gif_hdr, M_GIF);
 	}
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;

Modified: head/sys/net/if_gre.c
==============================================================================
--- head/sys/net/if_gre.c	Tue Jun  5 20:41:06 2018	(r334670)
+++ head/sys/net/if_gre.c	Tue Jun  5 20:51:01 2018	(r334671)
@@ -551,12 +551,24 @@ gre_updatehdr(struct gre_softc *sc)
 }
 
 static void
-gre_detach(struct gre_softc *sc)
+gre_detach(struct gre_softc *sc, int family)
 {
 
 	sx_assert(&gre_ioctl_sx, SA_XLOCKED);
-	if (sc->gre_ecookie != NULL)
-		encap_detach(sc->gre_ecookie);
+	if (sc->gre_ecookie != NULL) {
+		switch (family) {
+#ifdef INET
+		case AF_INET:
+			ip_encap_detach(sc->gre_ecookie);
+			break;
+#endif
+#ifdef INET6
+		case AF_INET6:
+			ip6_encap_detach(sc->gre_ecookie);
+			break;
+#endif
+		}
+	}
 	sc->gre_ecookie = NULL;
 }
 
@@ -624,7 +636,7 @@ gre_set_tunnel(struct ifnet *ifp, struct sockaddr *src
 		return (EAFNOSUPPORT);
 	}
 	if (sc->gre_family != 0)
-		gre_detach(sc);
+		gre_detach(sc, sc->gre_family);
 	GRE_WLOCK(sc);
 	if (sc->gre_family != 0)
 		free(sc->gre_hdr, M_GRE);
@@ -666,7 +678,7 @@ gre_delete_tunnel(struct ifnet *ifp)
 	sc->gre_family = 0;
 	GRE_WUNLOCK(sc);
 	if (family != 0) {
-		gre_detach(sc);
+		gre_detach(sc, family);
 		free(sc->gre_hdr, M_GRE);
 	}
 	ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
@@ -674,12 +686,11 @@ gre_delete_tunnel(struct ifnet *ifp)
 }
 
 int
-gre_input(struct mbuf **mp, int *offp, int proto)
+gre_input(struct mbuf *m, int off, int proto, void *arg)
 {
-	struct gre_softc *sc;
+	struct gre_softc *sc = arg;
 	struct grehdr *gh;
 	struct ifnet *ifp;
-	struct mbuf *m;
 	uint32_t *opts;
 #ifdef notyet
 	uint32_t key;
@@ -687,12 +698,8 @@ gre_input(struct mbuf **mp, int *offp, int proto)
 	uint16_t flags;
 	int hlen, isr, af;
 
-	m = *mp;
-	sc = encap_getarg(m);
-	KASSERT(sc != NULL, ("encap_getarg returned NULL"));
-
 	ifp = GRE2IFP(sc);
-	hlen = *offp + sizeof(struct grehdr) + 4 * sizeof(uint32_t);
+	hlen = off + sizeof(struct grehdr) + 4 * sizeof(uint32_t);
 	if (m->m_pkthdr.len < hlen)
 		goto drop;
 	if (m->m_len < hlen) {
@@ -700,7 +707,7 @@ gre_input(struct mbuf **mp, int *offp, int proto)
 		if (m == NULL)
 			goto drop;
 	}
-	gh = (struct grehdr *)mtodo(m, *offp);
+	gh = (struct grehdr *)mtodo(m, off);
 	flags = ntohs(gh->gre_flags);
 	if (flags & ~GRE_FLAGS_MASK)
 		goto drop;
@@ -710,7 +717,7 @@ gre_input(struct mbuf **mp, int *offp, int proto)
 		/* reserved1 field must be zero */
 		if (((uint16_t *)opts)[1] != 0)
 			goto drop;
-		if (in_cksum_skip(m, m->m_pkthdr.len, *offp) != 0)
+		if (in_cksum_skip(m, m->m_pkthdr.len, off) != 0)
 			goto drop;
 		hlen += 2 * sizeof(uint16_t);
 		opts++;
@@ -760,7 +767,7 @@ gre_input(struct mbuf **mp, int *offp, int proto)
 	default:
 		goto drop;
 	}
-	m_adj(m, *offp + hlen);
+	m_adj(m, off + hlen);
 	m_clrprotoflags(m);
 	m->m_pkthdr.rcvif = ifp;
 	M_SETFIB(m, ifp->if_fib);

Modified: head/sys/net/if_gre.h
==============================================================================
--- head/sys/net/if_gre.h	Tue Jun  5 20:41:06 2018	(r334670)
+++ head/sys/net/if_gre.h	Tue Jun  5 20:51:01 2018	(r334671)
@@ -101,7 +101,7 @@ struct gre_softc {
 #define	gre_oip			gre_gihdr->gi_ip
 #define	gre_oip6		gre_gi6hdr->gi6_ip6
 
-int	gre_input(struct mbuf **, int *, int);
+int	gre_input(struct mbuf *, int, int, void *);
 #ifdef INET
 int	in_gre_attach(struct gre_softc *);
 int	in_gre_output(struct mbuf *, int, int);

Modified: head/sys/net/if_me.c
==============================================================================
--- head/sys/net/if_me.c	Tue Jun  5 20:41:06 2018	(r334670)
+++ head/sys/net/if_me.c	Tue Jun  5 20:51:01 2018	(r334671)
@@ -37,7 +37,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/mbuf.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
-#include <sys/protosw.h>
 #include <sys/rmlock.h>
 #include <sys/socket.h>
 #include <sys/sockio.h>
@@ -122,12 +121,23 @@ static int	me_transmit(struct ifnet *, struct mbuf *);
 static int	me_ioctl(struct ifnet *, u_long, caddr_t);
 static int	me_output(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *);
-static int	me_input(struct mbuf **, int *, int);
+static int	me_input(struct mbuf *, int, int, void *);
 
 static int	me_set_tunnel(struct ifnet *, struct sockaddr_in *,
     struct sockaddr_in *);
 static void	me_delete_tunnel(struct ifnet *);
+static int	me_encapcheck(const struct mbuf *, int, int, void *);
 
+#define	ME_MINLEN	(sizeof(struct ip) + sizeof(struct mobhdr) -\
+    sizeof(in_addr_t))
+static const struct encap_config ipv4_encap_cfg = {
+	.proto = IPPROTO_MOBILE,
+	.min_length = ME_MINLEN,
+	.exact_match = (sizeof(in_addr_t) << 4) + 8,
+	.check = me_encapcheck,
+	.input = me_input
+};
+
 SYSCTL_DECL(_net_link);
 static SYSCTL_NODE(_net_link, IFT_TUNNEL, me, CTLFLAG_RW, 0,
     "Minimal Encapsulation for IP (RFC 2004)");
@@ -140,19 +150,6 @@ static VNET_DEFINE(int, max_me_nesting) = MAX_ME_NEST;
 SYSCTL_INT(_net_link_me, OID_AUTO, max_nesting, CTLFLAG_RW | CTLFLAG_VNET,
     &VNET_NAME(max_me_nesting), 0, "Max nested tunnels");
 
-extern struct domain inetdomain;
-static const struct protosw in_mobile_protosw = {
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_MOBILE,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_input =		me_input,
-	.pr_output =		rip_output,
-	.pr_ctlinput =		rip_ctlinput,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-};
-
 static void
 vnet_me_init(const void *unused __unused)
 {
@@ -334,17 +331,13 @@ me_encapcheck(const struct mbuf *m, int off, int proto
 
 	M_ASSERTPKTHDR(m);
 
-	if (m->m_pkthdr.len < sizeof(struct ip) + sizeof(struct mobhdr) -
-	    sizeof(struct in_addr))
-		return (0);
-
 	ret = 0;
 	ME_RLOCK(sc);
 	if (ME_READY(sc)) {
 		ip = mtod(m, struct ip *);
 		if (sc->me_src.s_addr == ip->ip_dst.s_addr &&
 		    sc->me_dst.s_addr == ip->ip_src.s_addr)
-			ret = 32 * 2;
+			ret = 32 * 2 + 8;
 	}
 	ME_RUNLOCK(sc);
 	return (ret);
@@ -376,8 +369,8 @@ me_set_tunnel(struct ifnet *ifp, struct sockaddr_in *s
 	ME_WUNLOCK(sc);
 
 	if (sc->me_ecookie == NULL)
-		sc->me_ecookie = encap_attach_func(AF_INET, IPPROTO_MOBILE,
-		    me_encapcheck, &in_mobile_protosw, sc);
+		sc->me_ecookie = ip_encap_attach(&ipv4_encap_cfg,
+		    sc, M_WAITOK);
 	if (sc->me_ecookie != NULL) {
 		ifp->if_drv_flags |= IFF_DRV_RUNNING;
 		if_link_state_change(ifp, LINK_STATE_UP);
@@ -392,7 +385,7 @@ me_delete_tunnel(struct ifnet *ifp)
 
 	sx_assert(&me_ioctl_sx, SA_XLOCKED);
 	if (sc->me_ecookie != NULL)
-		encap_detach(sc->me_ecookie);
+		ip_encap_detach(sc->me_ecookie);
 	sc->me_ecookie = NULL;
 	ME_WLOCK(sc);
 	sc->me_src.s_addr = 0;
@@ -414,19 +407,14 @@ me_in_cksum(uint16_t *p, int nwords)
 	return (~sum);
 }
 
-int
-me_input(struct mbuf **mp, int *offp, int proto)
+static int
+me_input(struct mbuf *m, int off, int proto, void *arg)
 {
-	struct me_softc *sc;
+	struct me_softc *sc = arg;
 	struct mobhdr *mh;
 	struct ifnet *ifp;
-	struct mbuf *m;
 	struct ip *ip;
 	int hlen;
-
-	m = *mp;
-	sc = encap_getarg(m);
-	KASSERT(sc != NULL, ("encap_getarg returned NULL"));
 
 	ifp = ME2IFP(sc);
 	/* checks for short packets */

Modified: head/sys/net/if_stf.c
==============================================================================
--- head/sys/net/if_stf.c	Tue Jun  5 20:41:06 2018	(r334670)
+++ head/sys/net/if_stf.c	Tue Jun  5 20:51:01 2018	(r334671)
@@ -85,7 +85,6 @@
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/module.h>
-#include <sys/protosw.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/rmlock.h>
@@ -151,19 +150,7 @@ static const char stfname[] = "stf";
 static MALLOC_DEFINE(M_STF, stfname, "6to4 Tunnel Interface");
 static const int ip_stf_ttl = 40;
 
-extern  struct domain inetdomain;
-static int in_stf_input(struct mbuf **, int *, int);
-static struct protosw in_stf_protosw = {
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		IPPROTO_IPV6,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_input =		in_stf_input,
-	.pr_output =		rip_output,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-};
-
+static int in_stf_input(struct mbuf *, int, int, void *);
 static char *stfnames[] = {"stf0", "stf", "6to4", NULL};
 
 static int stfmodevent(module_t, int, void *);
@@ -183,6 +170,14 @@ static int stf_clone_create(struct if_clone *, char *,
 static int stf_clone_destroy(struct if_clone *, struct ifnet *);
 static struct if_clone *stf_cloner;
 
+static const struct encap_config ipv4_encap_cfg = {
+	.proto = IPPROTO_IPV6,
+	.min_length = sizeof(struct ip),
+	.exact_match = (sizeof(in_addr_t) << 3) + 8,
+	.check = stf_encapcheck,
+	.input = in_stf_input
+};
+
 static int
 stf_clone_match(struct if_clone *ifc, const char *name)
 {
@@ -250,8 +245,7 @@ stf_clone_create(struct if_clone *ifc, char *name, siz
 	ifp->if_dname = stfname;
 	ifp->if_dunit = IF_DUNIT_NONE;
 
-	sc->encap_cookie = encap_attach_func(AF_INET, IPPROTO_IPV6,
-	    stf_encapcheck, &in_stf_protosw, sc);
+	sc->encap_cookie = ip_encap_attach(&ipv4_encap_cfg, sc, M_WAITOK);
 	if (sc->encap_cookie == NULL) {
 		if_printf(ifp, "attach failed\n");
 		free(sc, M_STF);
@@ -274,7 +268,7 @@ stf_clone_destroy(struct if_clone *ifc, struct ifnet *
 	struct stf_softc *sc = ifp->if_softc;
 	int err __unused;
 
-	err = encap_detach(sc->encap_cookie);
+	err = ip_encap_detach(sc->encap_cookie);
 	KASSERT(err == 0, ("Unexpected error detaching encap_cookie"));
 	bpfdetach(ifp);
 	if_detach(ifp);
@@ -608,28 +602,20 @@ stf_checkaddr6(struct stf_softc *sc, struct in6_addr *
 }
 
 static int
-in_stf_input(struct mbuf **mp, int *offp, int proto)
+in_stf_input(struct mbuf *m, int off, int proto, void *arg)
 {
-	struct stf_softc *sc;
+	struct stf_softc *sc = arg;
 	struct ip *ip;
 	struct ip6_hdr *ip6;
-	struct mbuf *m;
 	u_int8_t otos, itos;
 	struct ifnet *ifp;
-	int off;
 
-	m = *mp;
-	off = *offp;
-
 	if (proto != IPPROTO_IPV6) {
 		m_freem(m);
 		return (IPPROTO_DONE);
 	}
 
 	ip = mtod(m, struct ip *);
-
-	sc = (struct stf_softc *)encap_getarg(m);
-
 	if (sc == NULL || (STF2IFP(sc)->if_flags & IFF_UP) == 0) {
 		m_freem(m);
 		return (IPPROTO_DONE);
@@ -680,7 +666,7 @@ in_stf_input(struct mbuf **mp, int *offp, int proto)
 	ip6->ip6_flow |= htonl((u_int32_t)itos << 20);
 
 	m->m_pkthdr.rcvif = ifp;
-	
+
 	if (bpf_peers_present(ifp->if_bpf)) {
 		/*
 		 * We need to prepend the address family as

Modified: head/sys/netinet/in_gif.c
==============================================================================
--- head/sys/netinet/in_gif.c	Tue Jun  5 20:41:06 2018	(r334670)
+++ head/sys/netinet/in_gif.c	Tue Jun  5 20:51:01 2018	(r334671)
@@ -47,7 +47,6 @@ __FBSDID("$FreeBSD$");
 #include <sys/errno.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
-#include <sys/protosw.h>
 #include <sys/malloc.h>
 
 #include <net/if.h>
@@ -70,25 +69,11 @@ __FBSDID("$FreeBSD$");
 
 #include <net/if_gif.h>
 
-static int in_gif_input(struct mbuf **, int *, int);
-
-extern  struct domain inetdomain;
-static struct protosw in_gif_protosw = {
-	.pr_type =		SOCK_RAW,
-	.pr_domain =		&inetdomain,
-	.pr_protocol =		0/* IPPROTO_IPV[46] */,
-	.pr_flags =		PR_ATOMIC|PR_ADDR,
-	.pr_input =		in_gif_input,
-	.pr_output =		rip_output,
-	.pr_ctloutput =		rip_ctloutput,
-	.pr_usrreqs =		&rip_usrreqs
-};
-
 #define GIF_TTL		30
 static VNET_DEFINE(int, ip_gif_ttl) = GIF_TTL;
 #define	V_ip_gif_ttl		VNET(ip_gif_ttl)
 SYSCTL_INT(_net_inet_ip, IPCTL_GIF_TTL, gifttl, CTLFLAG_VNET | CTLFLAG_RW,
-	&VNET_NAME(ip_gif_ttl), 0, "");
+    &VNET_NAME(ip_gif_ttl), 0, "Default TTL value for encapsulated packets");
 
 int
 in_gif_output(struct ifnet *ifp, struct mbuf *m, int proto, uint8_t ecn)
@@ -136,15 +121,13 @@ in_gif_output(struct ifnet *ifp, struct mbuf *m, int p
 }
 
 static int
-in_gif_input(struct mbuf **mp, int *offp, int proto)
+in_gif_input(struct mbuf *m, int off, int proto, void *arg)
 {
-	struct mbuf *m = *mp;
-	struct gif_softc *sc;
+	struct gif_softc *sc = arg;
 	struct ifnet *gifp;
 	struct ip *ip;
 	uint8_t ecn;
 
-	sc = encap_getarg(m);
 	if (sc == NULL) {
 		m_freem(m);
 		KMOD_IPSTAT_INC(ips_nogif);
@@ -154,7 +137,7 @@ in_gif_input(struct mbuf **mp, int *offp, int proto)
 	if ((gifp->if_flags & IFF_UP) != 0) {
 		ip = mtod(m, struct ip *);
 		ecn = ip->ip_tos;
-		m_adj(m, *offp);
+		m_adj(m, off);
 		gif_input(m, gifp, proto, ecn);
 	} else {
 		m_freem(m);
@@ -182,7 +165,7 @@ in_gif_encapcheck(const struct mbuf *m, int off, int p
 	ip = mtod(m, const struct ip *);
 	if (sc->gif_iphdr->ip_src.s_addr != ip->ip_dst.s_addr)
 		return (0);
-	ret = 32;
+	ret = 32 + 8; /* src + proto */
 	if (sc->gif_iphdr->ip_dst.s_addr != ip->ip_src.s_addr) {
 		if ((sc->gif_options & GIF_IGNORE_SOURCE) == 0)
 			return (0);
@@ -205,14 +188,19 @@ in_gif_encapcheck(const struct mbuf *m, int off, int p
 	return (ret);
 }
 
+static const struct encap_config ipv4_encap_cfg = {
+	.proto = -1,
+	.min_length = sizeof(struct ip),
+	.exact_match = (sizeof(in_addr_t) << 4) + 8,
+	.check = gif_encapcheck,
+	.input = in_gif_input
+};
+
 int
 in_gif_attach(struct gif_softc *sc)
 {
 
 	KASSERT(sc->gif_ecookie == NULL, ("gif_ecookie isn't NULL"));
-	sc->gif_ecookie = encap_attach_func(AF_INET, -1, gif_encapcheck,
-	    &in_gif_protosw, sc);
-	if (sc->gif_ecookie == NULL)
-		return (EEXIST);
+	sc->gif_ecookie = ip_encap_attach(&ipv4_encap_cfg, sc, M_WAITOK);
 	return (0);
 }

Modified: head/sys/netinet/ip_encap.c
==============================================================================
--- head/sys/netinet/ip_encap.c	Tue Jun  5 20:41:06 2018	(r334670)
+++ head/sys/netinet/ip_encap.c	Tue Jun  5 20:51:01 2018	(r334671)
@@ -4,6 +4,7 @@
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
+ * Copyright (c) 2018 Andrey V. Elsukov <ae at FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -56,417 +57,214 @@
  * So, clearly good old protosw does not work for protocol #4 and #41.
  * The code will let you match protocol via src/dst address pair.
  */
-/* XXX is M_NETADDR correct? */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include "opt_mrouting.h"
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/kernel.h>
 #include <sys/lock.h>
+#include <sys/malloc.h>
 #include <sys/mutex.h>
-#include <sys/socket.h>
-#include <sys/sockio.h>
 #include <sys/mbuf.h>
 #include <sys/errno.h>
-#include <sys/protosw.h>
-#include <sys/queue.h>
+#include <sys/socket.h>
 
 #include <net/if.h>
-#include <net/route.h>
+#include <net/if_var.h>
 
 #include <netinet/in.h>
-#include <netinet/in_systm.h>
-#include <netinet/ip.h>
 #include <netinet/ip_var.h>
 #include <netinet/ip_encap.h>
 
 #ifdef INET6
-#include <netinet/ip6.h>
 #include <netinet6/ip6_var.h>
 #endif
 
-#include <machine/stdarg.h>
+static MALLOC_DEFINE(M_NETADDR, "encap_export_host",
+    "Export host address structure");
 
-#include <sys/kernel.h>
-#include <sys/malloc.h>
-static MALLOC_DEFINE(M_NETADDR, "encap_export_host", "Export host address structure");
+struct encaptab {
+	CK_LIST_ENTRY(encaptab) chain;
+	int		proto;
+	int		min_length;
+	int		exact_match;
+	void		*arg;
 
-static void encap_add(struct encaptab *);
-static int mask_match(const struct encaptab *, const struct sockaddr *,
-		const struct sockaddr *);
-static void encap_fillarg(struct mbuf *, void *);
+	encap_lookup_t	lookup;
+	encap_check_t	check;
+	encap_input_t	input;
+};
 
-/*
- * All global variables in ip_encap.c are locked using encapmtx.
- */
+CK_LIST_HEAD(encaptab_head, encaptab);
+#ifdef INET
+static struct encaptab_head ipv4_encaptab = CK_LIST_HEAD_INITIALIZER();
+#endif
+#ifdef INET6
+static struct encaptab_head ipv6_encaptab = CK_LIST_HEAD_INITIALIZER();
+#endif
+
 static struct mtx encapmtx;
 MTX_SYSINIT(encapmtx, &encapmtx, "encapmtx", MTX_DEF);
-static LIST_HEAD(, encaptab) encaptab = LIST_HEAD_INITIALIZER(encaptab);
+#define	ENCAP_WLOCK()		mtx_lock(&encapmtx)
+#define	ENCAP_WUNLOCK()		mtx_unlock(&encapmtx)
+#define	ENCAP_RLOCK()		epoch_enter_preempt(net_epoch_preempt)
+#define	ENCAP_RUNLOCK()		epoch_exit_preempt(net_epoch_preempt)
+#define	ENCAP_WAIT()		epoch_wait_preempt(net_epoch_preempt)
 
-#ifdef INET
-int
-encap4_input(struct mbuf **mp, int *offp, int proto)
+static struct encaptab *
+encap_attach(struct encaptab_head *head, const struct encap_config *cfg,
+    void *arg, int mflags)
 {
-	struct ip *ip;
-	struct mbuf *m;
-	struct sockaddr_in s, d;
-	const struct protosw *psw;
-	struct encaptab *ep, *match;
-	void *arg;
-	int matchprio, off, prio;
+	struct encaptab *ep, *tmp;
 
-	m = *mp;
-	off = *offp;
-	ip = mtod(m, struct ip *);
+	if (cfg == NULL || cfg->input == NULL ||
+	    (cfg->check == NULL && cfg->lookup == NULL) ||
+	    (cfg->lookup != NULL && cfg->exact_match != ENCAP_DRV_LOOKUP) ||
+	    (cfg->exact_match == ENCAP_DRV_LOOKUP && cfg->lookup == NULL))
+		return (NULL);
 
-	bzero(&s, sizeof(s));
-	s.sin_family = AF_INET;
-	s.sin_len = sizeof(struct sockaddr_in);
-	s.sin_addr = ip->ip_src;
-	bzero(&d, sizeof(d));
-	d.sin_family = AF_INET;
-	d.sin_len = sizeof(struct sockaddr_in);
-	d.sin_addr = ip->ip_dst;
+	ep = malloc(sizeof(*ep), M_NETADDR, mflags);
+	if (ep == NULL)
+		return (NULL);
 
-	arg = NULL;
-	psw = NULL;
-	match = NULL;
-	matchprio = 0;
-	mtx_lock(&encapmtx);
-	LIST_FOREACH(ep, &encaptab, chain) {
-		if (ep->af != AF_INET)
-			continue;
-		if (ep->proto >= 0 && ep->proto != proto)
-			continue;
-		if (ep->func)
-			prio = (*ep->func)(m, off, proto, ep->arg);
-		else {
-			/*
-			 * it's inbound traffic, we need to match in reverse
-			 * order
-			 */
-			prio = mask_match(ep, (struct sockaddr *)&d,
-			    (struct sockaddr *)&s);
-		}
+	ep->proto = cfg->proto;
+	ep->min_length = cfg->min_length;
+	ep->exact_match = cfg->exact_match;
+	ep->arg = arg;
+	ep->lookup = cfg->exact_match == ENCAP_DRV_LOOKUP ? cfg->lookup: NULL;
+	ep->check = cfg->exact_match != ENCAP_DRV_LOOKUP ? cfg->check: NULL;
+	ep->input = cfg->input;
 
-		/*
-		 * We prioritize the matches by using bit length of the
-		 * matches.  mask_match() and user-supplied matching function
-		 * should return the bit length of the matches (for example,
-		 * if both src/dst are matched for IPv4, 64 should be returned).
-		 * 0 or negative return value means "it did not match".
-		 *
-		 * The question is, since we have two "mask" portion, we
-		 * cannot really define total order between entries.
-		 * For example, which of these should be preferred?
-		 * mask_match() returns 48 (32 + 16) for both of them.
-		 *	src=3ffe::/16, dst=3ffe:501::/32
-		 *	src=3ffe:501::/32, dst=3ffe::/16
-		 *
-		 * We need to loop through all the possible candidates
-		 * to get the best match - the search takes O(n) for
-		 * n attachments (i.e. interfaces).
-		 */
-		if (prio <= 0)
-			continue;
-		if (prio > matchprio) {
-			matchprio = prio;
-			match = ep;
-		}
+	ENCAP_WLOCK();
+	CK_LIST_FOREACH(tmp, head, chain) {
+		if (tmp->exact_match <= ep->exact_match)
+			break;
 	}
-	if (match != NULL) {
-		psw = match->psw;
-		arg = match->arg;
-	}
-	mtx_unlock(&encapmtx);
+	if (tmp == NULL)
+		CK_LIST_INSERT_HEAD(head, ep, chain);
+	else
+		CK_LIST_INSERT_BEFORE(tmp, ep, chain);
+	ENCAP_WUNLOCK();
+	return (ep);
+}
 
-	if (match != NULL) {
-		/* found a match, "match" has the best one */
-		if (psw != NULL && psw->pr_input != NULL) {
-			encap_fillarg(m, arg);
-			(*psw->pr_input)(mp, offp, proto);
-		} else
-			m_freem(m);
-		return (IPPROTO_DONE);
-	}
+static int
+encap_detach(struct encaptab_head *head, const struct encaptab *cookie)
+{
+	struct encaptab *ep;
 
-	/* last resort: inject to raw socket */
-	return (rip_input(mp, offp, proto));
+	ENCAP_WLOCK();
+	CK_LIST_FOREACH(ep, head, chain) {
+		if (ep == cookie) {
+			CK_LIST_REMOVE(ep, chain);
+			ENCAP_WUNLOCK();
+			ENCAP_WAIT();
+			free(ep, M_NETADDR);
+			return (0);
+		}
+	}
+	ENCAP_WUNLOCK();
+	return (EINVAL);
 }
-#endif
 
-#ifdef INET6
-int
-encap6_input(struct mbuf **mp, int *offp, int proto)
+static int
+encap_input(struct encaptab_head *head, struct mbuf *m, int off, int proto)
 {
-	struct mbuf *m = *mp;
-	struct ip6_hdr *ip6;
-	struct sockaddr_in6 s, d;
-	const struct protosw *psw;
 	struct encaptab *ep, *match;
 	void *arg;
-	int prio, matchprio;
+	int matchprio, ret;
 
-	ip6 = mtod(m, struct ip6_hdr *);
-
-	bzero(&s, sizeof(s));
-	s.sin6_family = AF_INET6;
-	s.sin6_len = sizeof(struct sockaddr_in6);
-	s.sin6_addr = ip6->ip6_src;
-	bzero(&d, sizeof(d));
-	d.sin6_family = AF_INET6;
-	d.sin6_len = sizeof(struct sockaddr_in6);
-	d.sin6_addr = ip6->ip6_dst;
-
-	arg = NULL;
-	psw = NULL;
 	match = NULL;
 	matchprio = 0;
-	mtx_lock(&encapmtx);
-	LIST_FOREACH(ep, &encaptab, chain) {
-		if (ep->af != AF_INET6)
-			continue;
+
+	ENCAP_RLOCK();
+	CK_LIST_FOREACH(ep, head, chain) {
 		if (ep->proto >= 0 && ep->proto != proto)
 			continue;
-		if (ep->func)
-			prio = (*ep->func)(m, *offp, proto, ep->arg);
-		else {
+		if (ep->min_length > m->m_pkthdr.len)
+			continue;
+		if (ep->exact_match == ENCAP_DRV_LOOKUP)
+			ret = (*ep->lookup)(m, off, proto, &arg);
+		else
+			ret = (*ep->check)(m, off, proto, ep->arg);
+		if (ret <= 0)
+			continue;
+		if (ret > matchprio) {
+			match = ep;
+			if (ep->exact_match != ENCAP_DRV_LOOKUP)
+				arg = ep->arg;
 			/*
-			 * it's inbound traffic, we need to match in reverse
-			 * order
+			 * No need to continue the search, we got the
+			 * exact match.
 			 */
-			prio = mask_match(ep, (struct sockaddr *)&d,
-			    (struct sockaddr *)&s);
+			if (ret >= ep->exact_match)
+				break;
+			matchprio = ret;
 		}
-
-		/* see encap4_input() for issues here */
-		if (prio <= 0)
-			continue;
-		if (prio > matchprio) {
-			matchprio = prio;
-			match = ep;
-		}
 	}
-	if (match != NULL) {
-		psw = match->psw;
-		arg = match->arg;
-	}
-	mtx_unlock(&encapmtx);
 
 	if (match != NULL) {
-		/* found a match */
-		if (psw != NULL && psw->pr_input != NULL) {
-			encap_fillarg(m, arg);
-			return (*psw->pr_input)(mp, offp, proto);
-		} else {
-			m_freem(m);
-			return (IPPROTO_DONE);
-		}
+		/* found a match, "match" has the best one */
+		ret = (*match->input)(m, off, proto, arg);
+		ENCAP_RUNLOCK();
+		MPASS(ret == IPPROTO_DONE);
+		return (IPPROTO_DONE);
 	}
-
-	/* last resort: inject to raw socket */
-	return rip6_input(mp, offp, proto);
+	ENCAP_RUNLOCK();
+	return (0);
 }
-#endif
 
-/*lint -sem(encap_add, custodial(1)) */
-static void
-encap_add(struct encaptab *ep)
-{
-
-	mtx_assert(&encapmtx, MA_OWNED);
-	LIST_INSERT_HEAD(&encaptab, ep, chain);
-}
-
-/*
- * sp (src ptr) is always my side, and dp (dst ptr) is always remote side.
- * length of mask (sm and dm) is assumed to be same as sp/dp.
- * Return value will be necessary as input (cookie) for encap_detach().
- */
+#ifdef INET
 const struct encaptab *
-encap_attach(int af, int proto, const struct sockaddr *sp,
-    const struct sockaddr *sm, const struct sockaddr *dp,
-    const struct sockaddr *dm, const struct protosw *psw, void *arg)
+ip_encap_attach(const struct encap_config *cfg, void *arg, int mflags)
 {
-	struct encaptab *ep;
 
-	/* sanity check on args */
-	if (sp->sa_len > sizeof(ep->src) || dp->sa_len > sizeof(ep->dst))
-		return (NULL);
-	if (sp->sa_len != dp->sa_len)
-		return (NULL);
-	if (af != sp->sa_family || af != dp->sa_family)
-		return (NULL);
-
-	/* check if anyone have already attached with exactly same config */
-	mtx_lock(&encapmtx);
-	LIST_FOREACH(ep, &encaptab, chain) {
-		if (ep->af != af)
-			continue;
-		if (ep->proto != proto)
-			continue;
-		if (ep->src.ss_len != sp->sa_len ||
-		    bcmp(&ep->src, sp, sp->sa_len) != 0 ||
-		    bcmp(&ep->srcmask, sm, sp->sa_len) != 0)
-			continue;
-		if (ep->dst.ss_len != dp->sa_len ||
-		    bcmp(&ep->dst, dp, dp->sa_len) != 0 ||
-		    bcmp(&ep->dstmask, dm, dp->sa_len) != 0)
-			continue;
-
-		mtx_unlock(&encapmtx);
-		return (NULL);
-	}
-
-	ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT);	/*XXX*/
-	if (ep == NULL) {
-		mtx_unlock(&encapmtx);
-		return (NULL);
-	}
-	bzero(ep, sizeof(*ep));
-
-	ep->af = af;
-	ep->proto = proto;
-	bcopy(sp, &ep->src, sp->sa_len);
-	bcopy(sm, &ep->srcmask, sp->sa_len);
-	bcopy(dp, &ep->dst, dp->sa_len);
-	bcopy(dm, &ep->dstmask, dp->sa_len);
-	ep->psw = psw;
-	ep->arg = arg;
-
-	encap_add(ep);
-	mtx_unlock(&encapmtx);
-	return (ep);
+	return (encap_attach(&ipv4_encaptab, cfg, arg, mflags));
 }
 
-const struct encaptab *
-encap_attach_func(int af, int proto,
-    int (*func)(const struct mbuf *, int, int, void *),
-    const struct protosw *psw, void *arg)
+int
+ip_encap_detach(const struct encaptab *cookie)
 {
-	struct encaptab *ep;
 
-	/* sanity check on args */
-	if (!func)
-		return (NULL);
-
-	ep = malloc(sizeof(*ep), M_NETADDR, M_NOWAIT);	/*XXX*/
-	if (ep == NULL)
-		return (NULL);
-	bzero(ep, sizeof(*ep));
-
-	ep->af = af;
-	ep->proto = proto;
-	ep->func = func;
-	ep->psw = psw;
-	ep->arg = arg;
-
-	mtx_lock(&encapmtx);
-	encap_add(ep);
-	mtx_unlock(&encapmtx);
-	return (ep);
+	return (encap_detach(&ipv4_encaptab, cookie));
 }
 
 int
-encap_detach(const struct encaptab *cookie)
+encap4_input(struct mbuf **mp, int *offp, int proto)
 {
-	const struct encaptab *ep = cookie;
-	struct encaptab *p;
 
-	mtx_lock(&encapmtx);
-	LIST_FOREACH(p, &encaptab, chain) {
-		if (p == ep) {
-			LIST_REMOVE(p, chain);
-			mtx_unlock(&encapmtx);
-			free(p, M_NETADDR);	/*XXX*/
-			return 0;
-		}
-	}
-	mtx_unlock(&encapmtx);
-
-	return EINVAL;
+	if (encap_input(&ipv4_encaptab, *mp, *offp, proto) != IPPROTO_DONE)
+		return (rip_input(mp, offp, proto));
+	return (IPPROTO_DONE);
 }
+#endif /* INET */
 
-static int
-mask_match(const struct encaptab *ep, const struct sockaddr *sp,
-    const struct sockaddr *dp)
+#ifdef INET6
+const struct encaptab *
+ip6_encap_attach(const struct encap_config *cfg, void *arg, int mflags)
 {
-	struct sockaddr_storage s;
-	struct sockaddr_storage d;
-	int i;
-	const u_int8_t *p, *q;
-	u_int8_t *r;
-	int matchlen;
 
-	if (sp->sa_len > sizeof(s) || dp->sa_len > sizeof(d))
-		return 0;
-	if (sp->sa_family != ep->af || dp->sa_family != ep->af)
-		return 0;
-	if (sp->sa_len != ep->src.ss_len || dp->sa_len != ep->dst.ss_len)
-		return 0;
-
-	matchlen = 0;
-
-	p = (const u_int8_t *)sp;
-	q = (const u_int8_t *)&ep->srcmask;
-	r = (u_int8_t *)&s;
-	for (i = 0 ; i < sp->sa_len; i++) {
-		r[i] = p[i] & q[i];
-		/* XXX estimate */
-		matchlen += (q[i] ? 8 : 0);
-	}
-
-	p = (const u_int8_t *)dp;
-	q = (const u_int8_t *)&ep->dstmask;
-	r = (u_int8_t *)&d;
-	for (i = 0 ; i < dp->sa_len; i++) {
-		r[i] = p[i] & q[i];
-		/* XXX rough estimate */
-		matchlen += (q[i] ? 8 : 0);
-	}
-
-	/* need to overwrite len/family portion as we don't compare them */
-	s.ss_len = sp->sa_len;
-	s.ss_family = sp->sa_family;
-	d.ss_len = dp->sa_len;
-	d.ss_family = dp->sa_family;
-
-	if (bcmp(&s, &ep->src, ep->src.ss_len) == 0 &&
-	    bcmp(&d, &ep->dst, ep->dst.ss_len) == 0) {
-		return matchlen;
-	} else
-		return 0;

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list