git: 530c2c30b0c7 - main - ip6_output: Reduce cache misses on pktopts

From: Andrew Gallatin <gallatin_at_FreeBSD.org>
Date: Wed, 20 Mar 2024 19:51:13 UTC
The branch main has been updated by gallatin:

URL: https://cgit.FreeBSD.org/src/commit/?id=530c2c30b0c75f1a71df637ae1e09b352f8256cb

commit 530c2c30b0c75f1a71df637ae1e09b352f8256cb
Author:     Andrew Gallatin <gallatin@FreeBSD.org>
AuthorDate: 2024-03-20 19:46:01 +0000
Commit:     Andrew Gallatin <gallatin@FreeBSD.org>
CommitDate: 2024-03-20 19:50:57 +0000

    ip6_output: Reduce cache misses on pktopts
    
    When profiling an IP6 heavy workload, I noticed that we were
    getting a lot of cache misses in ip6_output() around
    ip6_pktopts. This was happening because the TCP stack passes
    inp->in6p_outputopts even if all options are unused. So in the
    common case of no options present, pkt_opts is not null, and is
    checked repeatedly for different options. Since ip6_pktopts is
    large (4 cachelines), and every field is checked, we take 4
    cache misses (2 of which tend to be hidden by the adjacent line
    prefetcher).
    
    To fix this common case, I introduced a new flag in ip6_pktopts
    (ip6po_valid) which tracks which options have been set. In the
    common case where nothing is set, this causes just a single
    cache miss to load. It also eliminates a test for some options
    (if (opt != NULL && opt->val >= const) vs if ((optvalid & flag) !=0 )
    
    To keep the struct the same size in 64-bit kernels, and to keep
    the integer values (like ip6po_hlim, ip6po_tclass, etc) on the
    same cacheline, I moved them to the top.
    
    As suggested by zlei, the null check in MAKE_EXTHDR() becomes
    redundant, and can be removed.
    
    For our web server workload (with the ip6po_tclass option set),
    this drops the CPI from 2.9 to 2.4 for ip6_output
    
    Differential Revision: https://reviews.freebsd.org/D44204
    Reviewed by: bz, glebius, zlei
    No Objection from: melifaro
    Sponsored by: Netflix Inc.
---
 sys/netinet6/ip6_output.c | 67 ++++++++++++++++++++++++++++++++---------------
 sys/netinet6/ip6_var.h    | 56 +++++++++++++++++++++++++--------------
 2 files changed, 83 insertions(+), 40 deletions(-)

diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c
index a2c3efad749b..530f86c36689 100644
--- a/sys/netinet6/ip6_output.c
+++ b/sys/netinet6/ip6_output.c
@@ -159,14 +159,12 @@ static int copypktopts(struct ip6_pktopts *, struct ip6_pktopts *, int);
  */
 #define	MAKE_EXTHDR(hp, mp, _ol)					\
     do {								\
-	if (hp) {							\
-		struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
-		error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
-		    ((eh)->ip6e_len + 1) << 3);				\
-		if (error)						\
-			goto freehdrs;					\
-		(_ol) += (*(mp))->m_len;				\
-	}								\
+	struct ip6_ext *eh = (struct ip6_ext *)(hp);		\
+	error = ip6_copyexthdr((mp), (caddr_t)(hp),		\
+	    ((eh)->ip6e_len + 1) << 3);				\
+	if (error)						\
+		goto freehdrs;					\
+	(_ol) += (*(mp))->m_len;				\
     } while (/*CONSTCOND*/ 0)
 
 /*
@@ -431,6 +429,7 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
 	uint32_t fibnum;
 	struct m_tag *fwd_tag = NULL;
 	uint32_t id;
+	uint32_t optvalid;
 
 	NET_EPOCH_ASSERT();
 
@@ -491,14 +490,17 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
 	 * Keep the length of the unfragmentable part for fragmentation.
 	 */
 	bzero(&exthdrs, sizeof(exthdrs));
-	optlen = 0;
+	optlen = optvalid = 0;
 	unfragpartlen = sizeof(struct ip6_hdr);
 	if (opt) {
+		optvalid = opt->ip6po_valid;
+
 		/* Hop-by-Hop options header. */
-		MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh, optlen);
+		if ((optvalid & IP6PO_VALID_HBH) != 0)
+			MAKE_EXTHDR(opt->ip6po_hbh, &exthdrs.ip6e_hbh, optlen);
 
 		/* Destination options header (1st part). */
-		if (opt->ip6po_rthdr) {
+		if ((optvalid & IP6PO_VALID_RHINFO) != 0) {
 #ifndef RTHDR_SUPPORT_IMPLEMENTED
 			/*
 			 * If there is a routing header, discard the packet
@@ -524,11 +526,13 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
 			 * options, which might automatically be inserted in
 			 * the kernel.
 			 */
-			MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1,
-			    optlen);
+			if ((optvalid & IP6PO_VALID_DEST1) != 0)
+				MAKE_EXTHDR(opt->ip6po_dest1, &exthdrs.ip6e_dest1,
+				    optlen);
 		}
 		/* Routing header. */
-		MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr, optlen);
+		if ((optvalid & IP6PO_VALID_RHINFO) != 0)
+			MAKE_EXTHDR(opt->ip6po_rthdr, &exthdrs.ip6e_rthdr, optlen);
 
 		unfragpartlen += optlen;
 
@@ -538,7 +542,8 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
 		 */
 
 		/* Destination options header (2nd part). */
-		MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2, optlen);
+		if ((optvalid & IP6PO_VALID_DEST2) != 0)
+			MAKE_EXTHDR(opt->ip6po_dest2, &exthdrs.ip6e_dest2, optlen);
 	}
 
 	/*
@@ -627,7 +632,7 @@ ip6_output(struct mbuf *m0, struct ip6_pktopts *opt,
 
 	/* Route packet. */
 	ro_pmtu = ro;
-	if (opt && opt->ip6po_rthdr)
+	if ((optvalid & IP6PO_VALID_RHINFO) != 0)
 		ro = &opt->ip6po_route;
 	if (ro != NULL)
 		dst = (struct sockaddr_in6 *)&ro->ro_dst;
@@ -641,7 +646,7 @@ again:
 	 * Do not override if a non-zero value is already set.
 	 * We check the diffserv field and the ECN field separately.
 	 */
-	if (opt && opt->ip6po_tclass >= 0) {
+	if ((optvalid & IP6PO_VALID_TC) != 0){
 		int mask = 0;
 
 		if (IPV6_DSCP(ip6) == 0)
@@ -653,7 +658,7 @@ again:
 	}
 
 	/* Fill in or override the hop limit field, if necessary. */
-	if (opt && opt->ip6po_hlim != -1)
+	if ((optvalid & IP6PO_VALID_HLIM) != 0)
 		ip6->ip6_hlim = opt->ip6po_hlim & 0xff;
 	else if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
 		if (im6o != NULL)
@@ -855,7 +860,7 @@ nonh6lookup:
 	/* All scope ID checks are successful. */
 
 	if (nh && !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
-		if (opt && opt->ip6po_nextroute.ro_nh) {
+		if ((optvalid & IP6PO_VALID_NHINFO) != 0) {
 			/*
 			 * The nexthop is explicitly specified by the
 			 * application.  We assume the next hop is an IPv6
@@ -2648,10 +2653,14 @@ ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
 			free(pktopt->ip6po_pktinfo, M_IP6OPT);
 		pktopt->ip6po_pktinfo = NULL;
 	}
-	if (optname == -1 || optname == IPV6_HOPLIMIT)
+	if (optname == -1 || optname == IPV6_HOPLIMIT) {
 		pktopt->ip6po_hlim = -1;
-	if (optname == -1 || optname == IPV6_TCLASS)
+		pktopt->ip6po_valid &= ~IP6PO_VALID_HLIM;
+	}
+	if (optname == -1 || optname == IPV6_TCLASS) {
 		pktopt->ip6po_tclass = -1;
+		pktopt->ip6po_valid &= ~IP6PO_VALID_TC;
+	}
 	if (optname == -1 || optname == IPV6_NEXTHOP) {
 		if (pktopt->ip6po_nextroute.ro_nh) {
 			NH_FREE(pktopt->ip6po_nextroute.ro_nh);
@@ -2660,16 +2669,19 @@ ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
 		if (pktopt->ip6po_nexthop)
 			free(pktopt->ip6po_nexthop, M_IP6OPT);
 		pktopt->ip6po_nexthop = NULL;
+		pktopt->ip6po_valid &= ~IP6PO_VALID_NHINFO;
 	}
 	if (optname == -1 || optname == IPV6_HOPOPTS) {
 		if (pktopt->ip6po_hbh)
 			free(pktopt->ip6po_hbh, M_IP6OPT);
 		pktopt->ip6po_hbh = NULL;
+		pktopt->ip6po_valid &= ~IP6PO_VALID_HBH;
 	}
 	if (optname == -1 || optname == IPV6_RTHDRDSTOPTS) {
 		if (pktopt->ip6po_dest1)
 			free(pktopt->ip6po_dest1, M_IP6OPT);
 		pktopt->ip6po_dest1 = NULL;
+		pktopt->ip6po_valid &= ~IP6PO_VALID_DEST1;
 	}
 	if (optname == -1 || optname == IPV6_RTHDR) {
 		if (pktopt->ip6po_rhinfo.ip6po_rhi_rthdr)
@@ -2679,11 +2691,13 @@ ip6_clearpktopts(struct ip6_pktopts *pktopt, int optname)
 			NH_FREE(pktopt->ip6po_route.ro_nh);
 			pktopt->ip6po_route.ro_nh = NULL;
 		}
+		pktopt->ip6po_valid &= ~IP6PO_VALID_RHINFO;
 	}
 	if (optname == -1 || optname == IPV6_DSTOPTS) {
 		if (pktopt->ip6po_dest2)
 			free(pktopt->ip6po_dest2, M_IP6OPT);
 		pktopt->ip6po_dest2 = NULL;
+		pktopt->ip6po_valid &= ~IP6PO_VALID_DEST2;
 	}
 }
 
@@ -2730,6 +2744,7 @@ copypktopts(struct ip6_pktopts *dst, struct ip6_pktopts *src, int canwait)
 	PKTOPT_EXTHDRCPY(ip6po_dest1);
 	PKTOPT_EXTHDRCPY(ip6po_dest2);
 	PKTOPT_EXTHDRCPY(ip6po_rthdr); /* not copy the cached route */
+	dst->ip6po_valid = src->ip6po_valid;
 	return (0);
 
   bad:
@@ -2959,6 +2974,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
 				return (ENOBUFS);
 		}
 		bcopy(pktinfo, opt->ip6po_pktinfo, sizeof(*pktinfo));
+		opt->ip6po_valid |= IP6PO_VALID_PKTINFO;
 		break;
 	}
 
@@ -2981,6 +2997,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
 			return (EINVAL);
 
 		opt->ip6po_hlim = *hlimp;
+		opt->ip6po_valid |= IP6PO_VALID_HLIM;
 		break;
 	}
 
@@ -2995,6 +3012,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
 			return (EINVAL);
 
 		opt->ip6po_tclass = tclass;
+		opt->ip6po_valid |= IP6PO_VALID_TC;
 		break;
 	}
 
@@ -3045,6 +3063,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
 		if (opt->ip6po_nexthop == NULL)
 			return (ENOBUFS);
 		bcopy(buf, opt->ip6po_nexthop, *buf);
+		opt->ip6po_valid |= IP6PO_VALID_NHINFO;
 		break;
 
 	case IPV6_2292HOPOPTS:
@@ -3083,6 +3102,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
 		if (opt->ip6po_hbh == NULL)
 			return (ENOBUFS);
 		bcopy(hbh, opt->ip6po_hbh, hbhlen);
+		opt->ip6po_valid |= IP6PO_VALID_HBH;
 
 		break;
 	}
@@ -3150,6 +3170,10 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
 		if (*newdest == NULL)
 			return (ENOBUFS);
 		bcopy(dest, *newdest, destlen);
+		if (newdest == &opt->ip6po_dest1)
+			opt->ip6po_valid |= IP6PO_VALID_DEST1;
+		else
+			opt->ip6po_valid |= IP6PO_VALID_DEST2;
 
 		break;
 	}
@@ -3192,6 +3216,7 @@ ip6_setpktopt(int optname, u_char *buf, int len, struct ip6_pktopts *opt,
 		if (opt->ip6po_rthdr == NULL)
 			return (ENOBUFS);
 		bcopy(rth, opt->ip6po_rthdr, rthlen);
+		opt->ip6po_valid |= IP6PO_VALID_RHINFO;
 
 		break;
 	}
diff --git a/sys/netinet6/ip6_var.h b/sys/netinet6/ip6_var.h
index aeeda14f281f..f3ac13a9caad 100644
--- a/sys/netinet6/ip6_var.h
+++ b/sys/netinet6/ip6_var.h
@@ -130,27 +130,26 @@ struct	ip6po_nhinfo {
 #define ip6po_nexthop	ip6po_nhinfo.ip6po_nhi_nexthop
 #define ip6po_nextroute	ip6po_nhinfo.ip6po_nhi_route
 
+/*
+ * Note that fields with valid data must be flagged in ip6po_valid.
+ * This is done to reduce cache misses in ip6_output().  Before
+ * ip6po_valid, ip6_output needed to check all the individual fields
+ * of ip6_pktopts needed to be checked themselves, and they are spread
+ * across 4 cachelines. ip6_output() is currently the only consumer of
+ * these flags, as it is in the critical path of every packet sent.
+ */
 struct	ip6_pktopts {
-	struct	mbuf *ip6po_m;	/* Pointer to mbuf storing the data */
-	int	ip6po_hlim;	/* Hoplimit for outgoing packets */
-
-	/* Outgoing IF/address information */
-	struct	in6_pktinfo *ip6po_pktinfo;
-
-	/* Next-hop address information */
-	struct	ip6po_nhinfo ip6po_nhinfo;
-
-	struct	ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */
-
-	/* Destination options header (before a routing header) */
-	struct	ip6_dest *ip6po_dest1;
-
-	/* Routing header related info. */
-	struct	ip6po_rhinfo ip6po_rhinfo;
-
-	/* Destination options header (after a routing header) */
-	struct	ip6_dest *ip6po_dest2;
+	uint32_t ip6po_valid;
+#define IP6PO_VALID_HLIM	0x0001
+#define IP6PO_VALID_PKTINFO	0x0002
+#define IP6PO_VALID_NHINFO	0x0004
+#define IP6PO_VALID_HBH		0x0008
+#define IP6PO_VALID_DEST1	0x0010
+#define IP6PO_VALID_RHINFO	0x0020
+#define IP6PO_VALID_DEST2	0x0040
+#define IP6PO_VALID_TC		0x0080
 
+	int	ip6po_hlim;	/* Hoplimit for outgoing packets */
 	int	ip6po_tclass;	/* traffic class */
 
 	int	ip6po_minmtu;  /* fragment vs PMTU discovery policy */
@@ -171,6 +170,25 @@ struct	ip6_pktopts {
 #endif
 #define IP6PO_DONTFRAG	0x04	/* disable fragmentation (IPV6_DONTFRAG) */
 #define IP6PO_USECOA	0x08	/* use care of address */
+
+	struct	mbuf *ip6po_m;	/* Pointer to mbuf storing the data */
+
+	/* Outgoing IF/address information */
+	struct	in6_pktinfo *ip6po_pktinfo;
+
+	/* Next-hop address information */
+	struct	ip6po_nhinfo ip6po_nhinfo;
+
+	struct	ip6_hbh *ip6po_hbh; /* Hop-by-Hop options header */
+
+	/* Destination options header (before a routing header) */
+	struct	ip6_dest *ip6po_dest1;
+
+	/* Routing header related info. */
+	struct	ip6po_rhinfo ip6po_rhinfo;
+
+	/* Destination options header (after a routing header) */
+	struct	ip6_dest *ip6po_dest2;
 };
 
 /*