svn commit: r332894 - in head: cddl/lib/libdtrace sys/kern sys/netinet sys/netinet6 sys/sys

Sean Bruno sbruno at FreeBSD.org
Mon Apr 23 19:51:02 UTC 2018


Author: sbruno
Date: Mon Apr 23 19:51:00 2018
New Revision: 332894
URL: https://svnweb.freebsd.org/changeset/base/332894

Log:
  Load balance sockets with new SO_REUSEPORT_LB option
  
  This patch adds a new socket option, SO_REUSEPORT_LB, which allows multiple
  programs or threads to bind to the same port; incoming connections are then
  load balanced among them using a hash function.
  
  Most of the code was copied from a similar patch for DragonflyBSD.
  
  However, in DragonflyBSD, load balancing is a global on/off setting and can not
  be set per socket. This patch allows for simultaneous use of both the current
  SO_REUSEPORT and the new SO_REUSEPORT_LB options on the same system.
  
  Required changes to structures
  Globally change so_options from 16 to 32 bit value to allow for more options.
  Add hashtable in pcbinfo to hold all SO_REUSEPORT_LB sockets.
  
  Limitations
  As in DragonflyBSD, a load balance group is limited to 256 pcbs
  (256 programs or threads sharing the same socket).
  
  Submitted by:	Johannes Lundberg <johanlun0 at gmail.com>
  Sponsored by:	Limelight Networks
  Differential Revision:	https://reviews.freebsd.org/D11003

Modified:
  head/cddl/lib/libdtrace/tcp.d
  head/sys/kern/uipc_debug.c
  head/sys/kern/uipc_socket.c
  head/sys/netinet/in_pcb.c
  head/sys/netinet/in_pcb.h
  head/sys/netinet/ip_output.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/udp_usrreq.c
  head/sys/netinet6/in6_pcb.c
  head/sys/netinet6/in6_src.c
  head/sys/netinet6/ip6_output.c
  head/sys/netinet6/udp6_usrreq.c
  head/sys/sys/socket.h
  head/sys/sys/socketvar.h

Modified: head/cddl/lib/libdtrace/tcp.d
==============================================================================
--- head/cddl/lib/libdtrace/tcp.d	Mon Apr 23 18:33:26 2018	(r332893)
+++ head/cddl/lib/libdtrace/tcp.d	Mon Apr 23 19:51:00 2018	(r332894)
@@ -192,12 +192,12 @@ translator tcpsinfo_t < struct tcpcb *p > {
 	tcps_rport =		p == NULL ? 0 : ntohs(p->t_inpcb->inp_inc.inc_ie.ie_fport);
 	tcps_laddr =		p == NULL ? 0 :
 	    p->t_inpcb->inp_vflag == INP_IPV4 ?
-	    inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.ie46_local.ia46_addr4.s_addr) :
-	    inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.ie6_local);
+	    inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id46_addr.ia46_addr4.s_addr) :
+	    inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependladdr.id6_addr);
 	tcps_raddr =		p == NULL ? 0 :
 	    p->t_inpcb->inp_vflag == INP_IPV4 ?
-	    inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.ie46_foreign.ia46_addr4.s_addr) :
-	    inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.ie6_foreign);
+	    inet_ntoa(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id46_addr.ia46_addr4.s_addr) :
+	    inet_ntoa6(&p->t_inpcb->inp_inc.inc_ie.ie_dependfaddr.id6_addr);
 	tcps_state =		p == NULL ? -1 : p->t_state;
 	tcps_iss =		p == NULL ? 0  : p->iss;
 	tcps_irs =		p == NULL ? 0  : p->irs;

Modified: head/sys/kern/uipc_debug.c
==============================================================================
--- head/sys/kern/uipc_debug.c	Mon Apr 23 18:33:26 2018	(r332893)
+++ head/sys/kern/uipc_debug.c	Mon Apr 23 19:51:00 2018	(r332894)
@@ -77,7 +77,7 @@ db_print_sotype(short so_type)
 }
 
 static void
-db_print_sooptions(short so_options)
+db_print_sooptions(int so_options)
 {
 	int comma;
 
@@ -120,6 +120,10 @@ db_print_sooptions(short so_options)
 	}
 	if (so_options & SO_REUSEPORT) {
 		db_printf("%sSO_REUSEPORT", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_REUSEPORT_LB) {
+		db_printf("%sSO_REUSEPORT_LB", comma ? ", " : "");
 		comma = 1;
 	}
 	if (so_options & SO_TIMESTAMP) {

Modified: head/sys/kern/uipc_socket.c
==============================================================================
--- head/sys/kern/uipc_socket.c	Mon Apr 23 18:33:26 2018	(r332893)
+++ head/sys/kern/uipc_socket.c	Mon Apr 23 19:51:00 2018	(r332894)
@@ -1057,6 +1057,100 @@ sofree(struct socket *so)
 }
 
 /*
+ * Let socket in same load balance group (same port and address)
+ * inherit pending sockets of the closing socket.
+ *
+ * "so_inh" will inherit sockets from "so"
+ */
+void
+soinherit(struct socket *so, struct socket *so_inh)
+{
+	TAILQ_HEAD(, socket) comp, incomp;
+	struct socket *sp, *head, *head_inh;
+	int qlen, incqlen;
+
+	KASSERT(so->so_options & SO_ACCEPTCONN,
+	    ("so does not accept connection"));
+	KASSERT(so_inh->so_options & SO_ACCEPTCONN,
+	    ("so_inh does not accept connection"));
+
+
+restart:
+	SOCK_LOCK(so);
+	if ((head = so->so_listen) != NULL &&
+	    __predict_false(SOLISTEN_TRYLOCK(head) == 0)) {
+		SOCK_UNLOCK(so);
+		goto restart;
+	}
+
+restart_inh:
+	SOCK_LOCK(so_inh);
+	if ((head_inh = so_inh->so_listen) != NULL &&
+	    __predict_false(SOLISTEN_TRYLOCK(head_inh) == 0)) {
+		SOCK_UNLOCK(so_inh);
+		goto restart_inh;
+	}
+
+	TAILQ_INIT(&comp);
+	TAILQ_INIT(&incomp);
+
+	/*
+	 * Save completed queue and incompleted queue
+	 */
+	TAILQ_CONCAT(&comp, &so->sol_comp, so_list);
+	qlen = so->sol_qlen;
+	so->sol_qlen = 0;
+
+	TAILQ_CONCAT(&incomp, &so->sol_incomp, so_list);
+	incqlen = so->sol_incqlen;
+	so->sol_incqlen = 0;
+
+	/*
+	 * Append the saved completed queue and incompleted
+	 * queue to the socket inherits them.
+	 *
+	 * XXX
+	 * This may temporarily break the inheriting socket's
+	 * so_qlimit.
+	 */
+	TAILQ_FOREACH(sp, &comp, so_list) {
+		refcount_acquire(&so_inh->so_count);
+		sp->so_listen = so_inh;
+		crfree(sp->so_cred);
+		sp->so_cred = crhold(so_inh->so_cred);
+	}
+
+	TAILQ_FOREACH(sp, &incomp, so_list) {
+		refcount_acquire(&so_inh->so_count);
+		sp->so_listen = so_inh;
+		crfree(sp->so_cred);
+		sp->so_cred = crhold(so_inh->so_cred);
+	}
+
+	TAILQ_CONCAT(&so_inh->sol_comp, &comp, so_list);
+	so_inh->sol_qlen += qlen;
+
+	TAILQ_CONCAT(&so_inh->sol_incomp, &incomp, so_list);
+	so_inh->sol_incqlen += incqlen;
+
+	SOCK_UNLOCK(so);
+	if(head != NULL)
+		SOLISTEN_UNLOCK(head);
+
+	SOCK_UNLOCK(so_inh);
+	if(head_inh != NULL) {
+		if(qlen > 0) {
+			/*
+			 * "New" connections have arrived
+			 */
+			solisten_wakeup(head_inh);
+		} else {
+			SOLISTEN_UNLOCK(head_inh);
+		}
+	}
+}
+
+/*
  * Close a socket on last file table reference removal.  Initiate disconnect
  * if connected.  Free socket when disconnect complete.
  *
@@ -2776,6 +2870,7 @@ sosetopt(struct socket *so, struct sockopt *sopt)
 		case SO_BROADCAST:
 		case SO_REUSEADDR:
 		case SO_REUSEPORT:
+		case SO_REUSEPORT_LB:
 		case SO_OOBINLINE:
 		case SO_TIMESTAMP:
 		case SO_BINTIME:
@@ -2994,6 +3089,7 @@ sogetopt(struct socket *so, struct sockopt *sopt)
 		case SO_KEEPALIVE:
 		case SO_REUSEADDR:
 		case SO_REUSEPORT:
+		case SO_REUSEPORT_LB:
 		case SO_BROADCAST:
 		case SO_OOBINLINE:
 		case SO_ACCEPTCONN:

Modified: head/sys/netinet/in_pcb.c
==============================================================================
--- head/sys/netinet/in_pcb.c	Mon Apr 23 18:33:26 2018	(r332893)
+++ head/sys/netinet/in_pcb.c	Mon Apr 23 19:51:00 2018	(r332894)
@@ -108,6 +108,9 @@ __FBSDID("$FreeBSD$");
 
 #include <security/mac/mac_framework.h>
 
+#define INPCBLBGROUP_SIZMIN	8
+#define INPCBLBGROUP_SIZMAX	256
+
 static struct callout	ipport_tick_callout;
 
 /*
@@ -217,7 +220,186 @@ SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtim
  * functions often modify hash chains or addresses in pcbs.
  */
 
+static struct inpcblbgroup *
+in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
+    uint16_t port, const union in_dependaddr *addr, int size)
+{
+	struct inpcblbgroup *grp;
+
+	size_t bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
+	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
+	if(!grp)
+		return NULL;
+	grp->il_vflag = vflag;
+	grp->il_lport = port;
+	grp->il_dependladdr = *addr;
+	grp->il_inpsiz = size;
+	LIST_INSERT_HEAD(hdr, grp, il_list);
+
+	return grp;
+}
+
+static void
+in_pcblbgroup_free(struct inpcblbgroup *grp)
+{
+	LIST_REMOVE(grp, il_list);	/* unlink from its hash bucket first */
+	free(grp, M_PCB);		/* same malloc type in_pcblbgroup_alloc() used */
+}
+
+static struct inpcblbgroup *
+in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
+    struct inpcblbgroup *old_grp, int size)
+{
+	struct inpcblbgroup *grp;
+	int i;
+
+	grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
+	    old_grp->il_lport, &old_grp->il_dependladdr, size);
+	if(!grp)
+		return NULL;
+
+	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
+	    ("invalid new local group size %d and old local group count %d",
+	     grp->il_inpsiz, old_grp->il_inpcnt));
+	for (i = 0; i < old_grp->il_inpcnt; ++i)
+		grp->il_inp[i] = old_grp->il_inp[i];
+	grp->il_inpcnt = old_grp->il_inpcnt;
+
+	in_pcblbgroup_free(old_grp);
+
+	return grp;
+}
+
 /*
+ * Add PCB to lb group (load balance used by SO_REUSEPORT_LB)
+ */
+static int
+in_pcbinslbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+	struct inpcblbgrouphead *hdr;
+	struct inpcblbgroup *grp;
+	struct ucred *cred;
+
+	/*
+	 * Returns 0 when the pcb was added or deliberately left out of any
+	 * group, ENOBUFS when a group could not be allocated or grown.
+	 *
+	 * Nothing is derived from the hash table until it is known to
+	 * exist; the bucket is looked up once, further down.
+	 */
+	if (pcbinfo->ipi_lbgrouphashbase == NULL)
+		return 0;
+
+	/*
+	 * don't allow jailed socket to join local group
+	 */
+	if (inp->inp_socket != NULL)
+		cred = inp->inp_socket->so_cred;
+	else
+		cred = NULL;
+	if (cred != NULL && jailed(cred))
+		return 0;
+
+#ifdef INET6
+	/*
+	 * don't allow IPv4 mapped INET6 wild socket
+	 */
+	if ((inp->inp_vflag & INP_IPV4) &&
+	    inp->inp_laddr.s_addr == INADDR_ANY &&
+	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
+		return 0;
+	}
+#endif
+
+	hdr = &pcbinfo->ipi_lbgrouphashbase[
+	    INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+
+	LIST_FOREACH(grp, hdr, il_list) {
+		if (grp->il_vflag == inp->inp_vflag &&
+		    grp->il_lport == inp->inp_lport &&
+		    memcmp(&grp->il_dependladdr,
+		        &inp->inp_inc.inc_ie.ie_dependladdr,
+		        sizeof(grp->il_dependladdr)) == 0) {
+			break;
+		}
+	}
+	if (grp == NULL) {
+		/* Create new load balance group */
+		grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
+		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
+		    INPCBLBGROUP_SIZMIN);
+		if(!grp)
+			return (ENOBUFS);
+	} else if (grp->il_inpcnt == grp->il_inpsiz) {
+		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
+			static int limit_logged = 0;
+
+			if (!limit_logged) {
+				limit_logged = 1;
+				printf("lb group port %d, "
+					   "limit reached\n", ntohs(grp->il_lport));
+			}
+			return 0;
+		}
+
+		/* Expand this local group */
+		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
+		if(!grp)
+			return (ENOBUFS);
+	}
+
+	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
+			("invalid local group size %d and count %d",
+			 grp->il_inpsiz, grp->il_inpcnt));
+
+	grp->il_inp[grp->il_inpcnt] = inp;
+	grp->il_inpcnt++;
+	return 0;
+}
+
+static void
+in_pcbremlbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo)
+{
+	struct inpcblbgrouphead *hdr;
+	struct inpcblbgroup *grp;
+
+	if (pcbinfo->ipi_lbgrouphashbase == NULL)
+		return;
+
+	hdr = &pcbinfo->ipi_lbgrouphashbase[
+	    INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+
+	LIST_FOREACH(grp, hdr, il_list) {
+		int i;
+
+		for (i = 0; i < grp->il_inpcnt; ++i) {
+			if (grp->il_inp[i] != inp)
+				continue;
+
+			if (grp->il_inpcnt == 1) {
+				/* Free this local group */
+				in_pcblbgroup_free(grp);
+			} else {
+				/* Pull up inpcbs */
+				for (; i + 1 < grp->il_inpcnt; ++i)
+					grp->il_inp[i] = grp->il_inp[i + 1];
+				grp->il_inpcnt--;
+
+				if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
+				    grp->il_inpcnt <= (grp->il_inpsiz / 4)) {
+					/* Shrink this local group */
+					struct inpcblbgroup *new_grp =
+						in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
+					if(new_grp)
+						grp = new_grp;
+				}
+			}
+			return;
+		}
+	}
+}
+
+/*
  * Different protocols initialize their inpcbs differently - giving
  * different name to the lock.  But they all are disposed the same.
  */
@@ -252,6 +434,8 @@ in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char 
 	    &pcbinfo->ipi_hashmask);
 	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
 	    &pcbinfo->ipi_porthashmask);
+	pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB,
+	    &pcbinfo->ipi_lbgrouphashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_init(pcbinfo, hashfields, hash_nelements);
 #endif
@@ -275,6 +459,8 @@ in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
 	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
 	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
 	    pcbinfo->ipi_porthashmask);
+	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
+	    pcbinfo->ipi_lbgrouphashmask);
 #ifdef PCBGROUP
 	in_pcbgroup_destroy(pcbinfo);
 #endif
@@ -513,18 +699,20 @@ in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp
 /*
  * Return cached socket options.
  */
-short
+int
 inp_so_options(const struct inpcb *inp)
 {
-   short so_options;
+	int so_options;
 
-   so_options = 0;
+	so_options = 0;
 
-   if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
-	   so_options |= SO_REUSEPORT;
-   if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
-	   so_options |= SO_REUSEADDR;
-   return (so_options);
+	if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
+		so_options |= SO_REUSEPORT_LB;
+	if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
+		so_options |= SO_REUSEPORT;
+	if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
+		so_options |= SO_REUSEADDR;
+	return (so_options);
 }
 #endif /* INET || INET6 */
 
@@ -581,6 +769,12 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
 	int error;
 
 	/*
+	 * XXX Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
+	 * so that we don't have to add to the (already messy) code below
+	 */
+	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
+
+	/*
 	 * No state changes, so read locks are sufficient here.
 	 */
 	INP_LOCK_ASSERT(inp);
@@ -591,7 +785,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
 	laddr.s_addr = *laddrp;
 	if (nam != NULL && laddr.s_addr != INADDR_ANY)
 		return (EINVAL);
-	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip4(cred, &laddr)) != 0)
@@ -628,16 +822,20 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
+			// XXX: How to deal with SO_REUSEPORT_LB here?
+			// Added equivalent treatment as SO_REUSEPORT here for now
+			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
+				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
 		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
 			sin->sin_port = 0;		/* yech... */
 			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
 			/*
-			 * Is the address a local IP address? 
+			 * Is the address a local IP address?
 			 * If INP_BINDANY is set, then the socket may be bound
 			 * to any endpoint address, local or not.
 			 */
 			if ((inp->inp_flags & INP_BINDANY) == 0 &&
-			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) 
+			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
 				return (EADDRNOTAVAIL);
 		}
 		laddr = sin->sin_addr;
@@ -667,7 +865,8 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
 				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
 				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
 				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
-				     (t->inp_flags2 & INP_REUSEPORT) == 0) &&
+				     (t->inp_flags2 & INP_REUSEPORT) ||
+				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
@@ -692,11 +891,14 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
 				 */
 				tw = intotw(t);
 				if (tw == NULL ||
-				    (reuseport & tw->tw_so_options) == 0)
+				    ((reuseport & tw->tw_so_options) == 0 &&
+					(reuseport_lb & tw->tw_so_options) == 0)) {
 					return (EADDRINUSE);
+				}
 			} else if (t &&
-			    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
-			    (reuseport & inp_so_options(t)) == 0) {
+				   ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
+				   (reuseport & inp_so_options(t)) == 0 &&
+				   (reuseport_lb & inp_so_options(t)) == 0) {
 #ifdef INET6
 				if (ntohl(sin->sin_addr.s_addr) !=
 				    INADDR_ANY ||
@@ -705,7 +907,7 @@ in_pcbbind_setup(struct inpcb *inp, struct sockaddr *n
 				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
 				    (t->inp_vflag & INP_IPV6PROTO) == 0)
 #endif
-				return (EADDRINUSE);
+						return (EADDRINUSE);
 				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
 					return (EADDRINUSE);
 			}
@@ -1409,6 +1611,7 @@ in_pcbdrop(struct inpcb *inp)
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(inp->inp_pcbinfo);
+		in_pcbremlbgrouphash(inp, inp->inp_pcbinfo);
 		LIST_REMOVE(inp, inp_hash);
 		LIST_REMOVE(inp, inp_portlist);
 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
@@ -1669,6 +1872,98 @@ in_pcblookup_local(struct inpcbinfo *pcbinfo, struct i
 }
 #undef INP_LOOKUP_MAPPED_PCB_COST
 
+struct inpcb *
+in_pcblookup_lbgroup_last(const struct inpcb *inp)
+{
+	const struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
+	const struct inpcblbgrouphead *hdr;
+	const struct inpcblbgroup *grp;
+	int i;
+
+	if (pcbinfo->ipi_lbgrouphashbase == NULL)
+		return NULL;
+
+	hdr = &pcbinfo->ipi_lbgrouphashbase[
+	    INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+
+	LIST_FOREACH(grp, hdr, il_list) {
+		if (grp->il_vflag == inp->inp_vflag &&
+		    grp->il_lport == inp->inp_lport &&
+		    memcmp(&grp->il_dependladdr,
+			&inp->inp_inc.inc_ie.ie_dependladdr,
+			sizeof(grp->il_dependladdr)) == 0) {
+			break;
+		}
+	}
+	if (grp == NULL || grp->il_inpcnt == 1)
+		return NULL;
+
+	KASSERT(grp->il_inpcnt >= 2,
+	    ("invalid lbgroup inp count %d", grp->il_inpcnt));
+	for (i = 0; i < grp->il_inpcnt; ++i) {
+		if (grp->il_inp[i] == inp) {
+			int last = grp->il_inpcnt - 1;
+
+			if (i == last)
+				last = grp->il_inpcnt - 2;
+			return grp->il_inp[last];
+		}
+	}
+	return NULL;
+}
+
+static struct inpcb *
+in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
+  const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
+  uint16_t fport, int lookupflags)
+{
+	struct inpcb *local_wild = NULL;
+	const struct inpcblbgrouphead *hdr;
+	struct inpcblbgroup *grp;
+
+	hdr = &pcbinfo->ipi_lbgrouphashbase[
+		  INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
+
+	/*
+	 * Order of socket selection:
+	 * 1. non-wild.
+	 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
+	 *
+	 * NOTE:
+	 * - Load balanced group does not contain jailed sockets
+	 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
+	 */
+	LIST_FOREACH(grp, hdr, il_list) {
+#ifdef INET6
+		if (!(grp->il_vflag & INP_IPV4))
+			continue;
+#endif
+
+		if (grp->il_lport == lport) {
+
+			/* The packet hash picks one member of the group. */
+			uint32_t idx = 0;
+			uint32_t pkt_hash = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport);
+
+			idx = pkt_hash % grp->il_inpcnt;
+
+			if (grp->il_laddr.s_addr == laddr->s_addr) {
+				return grp->il_inp[idx];
+			} else {
+				/* Remember a wildcard candidate, keep scanning. */
+				if (grp->il_laddr.s_addr == INADDR_ANY &&
+					(lookupflags & INPLOOKUP_WILDCARD)) {
+					local_wild = grp->il_inp[idx];
+				}
+			}
+		}
+	}
+	if (local_wild != NULL) {
+		return local_wild;
+	}
+	return NULL;
+}
+
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
@@ -1948,6 +2243,18 @@ in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, st
 		return (tmpinp);
 
 	/*
+	 * Then look in lb group (for wildcard match)
+	 */
+	if (pcbinfo->ipi_lbgrouphashbase != NULL &&
+		(lookupflags & INPLOOKUP_WILDCARD)) {
+		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, fport,
+								   lookupflags);
+		if (inp != NULL) {
+			return inp;
+		}
+	}
+
+	/*
 	 * Then look for a wildcard match, if requested.
 	 */
 	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
@@ -2164,6 +2471,7 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgr
 	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
 	struct inpcbport *phd;
 	u_int32_t hashkey_faddr;
+	int so_options;
 
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
@@ -2184,7 +2492,21 @@ in_pcbinshash_internal(struct inpcb *inp, int do_pcbgr
 	pcbporthash = &pcbinfo->ipi_porthashbase[
 	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];
 
+
 	/*
+	 * Add entry in lb group
+	 * Only do this if SO_REUSEPORT_LB is set
+	 */
+	so_options = inp_so_options(inp);
+	if(so_options & SO_REUSEPORT_LB) {
+		int ret = in_pcbinslbgrouphash(inp, pcbinfo);
+		if(ret) {
+			// pcb lb group malloc fail (ret=ENOBUFS)
+			return ret;
+		}
+	}
+
+	/*
 	 * Go through port list and look for a head for this lport.
 	 */
 	LIST_FOREACH(phd, pcbporthash, phd_hash) {
@@ -2310,6 +2632,10 @@ in_pcbremlists(struct inpcb *inp)
 		struct inpcbport *phd = inp->inp_phd;
 
 		INP_HASH_WLOCK(pcbinfo);
+
+		// XXX Only do if SO_REUSEPORT_LB set?
+		in_pcbremlbgrouphash(inp, pcbinfo);
+
 		LIST_REMOVE(inp, inp_hash);
 		LIST_REMOVE(inp, inp_portlist);
 		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {

Modified: head/sys/netinet/in_pcb.h
==============================================================================
--- head/sys/netinet/in_pcb.h	Mon Apr 23 18:33:26 2018	(r332893)
+++ head/sys/netinet/in_pcb.h	Mon Apr 23 19:51:00 2018	(r332894)
@@ -78,6 +78,11 @@ struct in_addr_4in6 {
 	struct	in_addr	ia46_addr4;
 };
 
+union in_dependaddr {
+	struct in_addr_4in6 id46_addr;
+	struct in6_addr	id6_addr;
+};
+
 /*
  * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553.  in_conninfo has
  * some extra padding to accomplish this.
@@ -88,22 +93,14 @@ struct in_endpoints {
 	u_int16_t	ie_fport;		/* foreign port */
 	u_int16_t	ie_lport;		/* local port */
 	/* protocol dependent part, local and foreign addr */
-	union {
-		/* foreign host table entry */
-		struct	in_addr_4in6 ie46_foreign;
-		struct	in6_addr ie6_foreign;
-	} ie_dependfaddr;
-	union {
-		/* local host table entry */
-		struct	in_addr_4in6 ie46_local;
-		struct	in6_addr ie6_local;
-	} ie_dependladdr;
+	union in_dependaddr ie_dependfaddr;	/* foreign host table entry */
+	union in_dependaddr ie_dependladdr;	/* local host table entry */
+#define	ie_faddr	ie_dependfaddr.id46_addr.ia46_addr4
+#define	ie_laddr	ie_dependladdr.id46_addr.ia46_addr4
+#define	ie6_faddr	ie_dependfaddr.id6_addr
+#define	ie6_laddr	ie_dependladdr.id6_addr
 	u_int32_t	ie6_zoneid;		/* scope zone id */
 };
-#define	ie_faddr	ie_dependfaddr.ie46_foreign.ia46_addr4
-#define	ie_laddr	ie_dependladdr.ie46_local.ia46_addr4
-#define	ie6_faddr	ie_dependfaddr.ie6_foreign
-#define	ie6_laddr	ie_dependladdr.ie6_local
 
 /*
  * XXX The defines for inc_* are hacks and should be changed to direct
@@ -407,6 +404,21 @@ struct inpcbport {
 	u_short phd_port;
 };
 
+struct inpcblbgroup {
+	LIST_ENTRY(inpcblbgroup) il_list;
+	uint16_t	il_lport;
+	u_char		il_vflag;
+	u_char		il_pad;
+	uint32_t	il_pad2;
+	union in_dependaddr il_dependladdr;
+#define il_laddr	il_dependladdr.id46_addr.ia46_addr4
+#define il6_laddr	il_dependladdr.id6_addr
+	uint32_t	il_inpsiz; /* size of il_inp[] */
+	uint32_t	il_inpcnt; /* # of elem in il_inp[] */
+	struct inpcb	*il_inp[];
+};
+LIST_HEAD(inpcblbgrouphead, inpcblbgroup);
+
 /*-
  * Global data structure for each high-level protocol (UDP, TCP, ...) in both
  * IPv4 and IPv6.  Holds inpcb lists and information for managing them.
@@ -500,6 +512,13 @@ struct inpcbinfo {
 	u_long			 ipi_wildmask;		/* (p) */
 
 	/*
+	 * Load balanced group used by the SO_REUSEPORT_LB option,
+	 * hashed by local address and local port.
+	 */
+	struct	inpcblbgrouphead *ipi_lbgrouphashbase;
+	u_long	ipi_lbgrouphashmask;
+
+	/*
 	 * Pointer to network stack instance
 	 */
 	struct vnet		*ipi_vnet;		/* (c) */
@@ -585,7 +604,7 @@ struct tcpcb *
 	inp_inpcbtotcpcb(struct inpcb *inp);
 void 	inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp,
 		uint32_t *faddr, uint16_t *fp);
-short	inp_so_options(const struct inpcb *inp);
+int		inp_so_options(const struct inpcb *inp);
 
 #endif /* _KERNEL */
 
@@ -648,6 +667,10 @@ short	inp_so_options(const struct inpcb *inp);
 	(((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask))
 #define INP_PCBPORTHASH(lport, mask) \
 	(ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PORTHASH(lport, mask) \
+	(ntohs((lport)) & (mask))
+#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \
+	((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport)))
 #define	INP6_PCBHASHKEY(faddr)	((faddr)->s6_addr32[3])
 
 /*
@@ -716,6 +739,7 @@ short	inp_so_options(const struct inpcb *inp);
 #define	INP_RATE_LIMIT_CHANGED	0x00000400 /* rate limit needs attention */
 #define	INP_ORIGDSTADDR		0x00000800 /* receive IP dst address/port */
 #define INP_CANNOT_DO_ECN	0x00001000 /* The stack does not do ECN */
+#define	INP_REUSEPORT_LB	0x00002000 /* SO_REUSEPORT_LB option is set */
 
 /*
  * Flags passed to in_pcblookup*() functions.
@@ -818,6 +842,8 @@ struct inpcb *
 struct inpcb *
 	in_pcblookup(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *);
+struct inpcb *
+	in_pcblookup_lbgroup_last(const struct inpcb *inp);
 struct inpcb *
 	in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int,
 	    struct in_addr, u_int, int, struct ifnet *, struct mbuf *);

Modified: head/sys/netinet/ip_output.c
==============================================================================
--- head/sys/netinet/ip_output.c	Mon Apr 23 18:33:26 2018	(r332893)
+++ head/sys/netinet/ip_output.c	Mon Apr 23 19:51:00 2018	(r332894)
@@ -986,6 +986,15 @@ ip_ctloutput(struct socket *so, struct sockopt *sopt)
 				INP_WUNLOCK(inp);
 				error = 0;
 				break;
+			case SO_REUSEPORT_LB:
+				INP_WLOCK(inp);
+				if ((so->so_options & SO_REUSEPORT_LB) != 0)
+					inp->inp_flags2 |= INP_REUSEPORT_LB;
+				else
+					inp->inp_flags2 &= ~INP_REUSEPORT_LB;
+				INP_WUNLOCK(inp);
+				error = 0;
+				break;
 			case SO_SETFIB:
 				INP_WLOCK(inp);
 				inp->inp_inc.inc_fibnum = so->so_fibnum;

Modified: head/sys/netinet/tcp_subr.c
==============================================================================
--- head/sys/netinet/tcp_subr.c	Mon Apr 23 18:33:26 2018	(r332893)
+++ head/sys/netinet/tcp_subr.c	Mon Apr 23 19:51:00 2018	(r332894)
@@ -1956,10 +1956,28 @@ tcp_close(struct tcpcb *tp)
 {
 	struct inpcb *inp = tp->t_inpcb;
 	struct socket *so;
+	struct inpcb *inp_inh = NULL;
+	int listen = tp->t_state & TCPS_LISTEN;
 
 	INP_INFO_LOCK_ASSERT(&V_tcbinfo);
 	INP_WLOCK_ASSERT(inp);
 
+	if (listen) {
+		/*
+		 * Pending socket/syncache inheritance
+		 *
+		 * If this is a listen(2) socket, find another listen(2)
+		 * socket in the same local group, which could inherit
+		 * the syncache and sockets pending on the completion
+		 * and incompletion queues.
+		 *
+		 * NOTE:
+		 * Currently the inheritance could only happen on the
+		 * listen(2) sockets with SO_REUSEPORT_LB set.
+		 */
+		inp_inh = in_pcblookup_lbgroup_last(inp);
+	}
+
 #ifdef TCP_OFFLOAD
 	if (tp->t_state == TCPS_LISTEN)
 		tcp_offload_listen_stop(tp);
@@ -1979,7 +1997,16 @@ tcp_close(struct tcpcb *tp)
 		tcp_state_change(tp, TCPS_CLOSED);
 	KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
 	so = inp->inp_socket;
+
 	soisdisconnected(so);
+
+	if(listen)
+	{
+		if(inp_inh != NULL && inp_inh->inp_socket != NULL) {
+			soinherit(so, inp_inh->inp_socket);
+		}
+	}
+
 	if (inp->inp_flags & INP_SOCKREF) {
 		KASSERT(so->so_state & SS_PROTOREF,
 		    ("tcp_close: !SS_PROTOREF"));

Modified: head/sys/netinet/udp_usrreq.c
==============================================================================
--- head/sys/netinet/udp_usrreq.c	Mon Apr 23 18:33:26 2018	(r332893)
+++ head/sys/netinet/udp_usrreq.c	Mon Apr 23 19:51:00 2018	(r332894)
@@ -612,7 +612,7 @@ udp_input(struct mbuf **mp, int *offp, int proto)
 			 * will never clear these options after setting them.
 			 */
 			if ((last->inp_socket->so_options &
-			    (SO_REUSEPORT|SO_REUSEADDR)) == 0)
+			    (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0)
 				break;
 		}
 

Modified: head/sys/netinet6/in6_pcb.c
==============================================================================
--- head/sys/netinet6/in6_pcb.c	Mon Apr 23 18:33:26 2018	(r332893)
+++ head/sys/netinet6/in6_pcb.c	Mon Apr 23 19:51:00 2018	(r332894)
@@ -125,6 +125,12 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
 	int error, lookupflags = 0;
 	int reuseport = (so->so_options & SO_REUSEPORT);
 
+	/*
+	 * XXX Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
+	 * so that we don't have to add to the (already messy) code below
+	 */
+	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);
+
 	INP_WLOCK_ASSERT(inp);
 	INP_HASH_WLOCK_ASSERT(pcbinfo);
 
@@ -132,7 +138,7 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
 		return (EADDRNOTAVAIL);
 	if (inp->inp_lport || !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
 		return (EINVAL);
-	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0)
+	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
 		lookupflags = INPLOOKUP_WILDCARD;
 	if (nam == NULL) {
 		if ((error = prison_local_ip6(cred, &inp->in6p_laddr,
@@ -166,6 +172,10 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
 			 */
 			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
 				reuseport = SO_REUSEADDR|SO_REUSEPORT;
+			// XXX: How to deal with SO_REUSEPORT_LB here?
+			// Added equivalent treatment as SO_REUSEPORT here for now
+			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
+				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
 		} else if (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 			struct ifaddr *ifa;
 
@@ -214,7 +224,8 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
 				     IN6_IS_ADDR_UNSPECIFIED(&t->in6p_faddr)) &&
 				    (!IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) ||
 				     !IN6_IS_ADDR_UNSPECIFIED(&t->in6p_laddr) ||
-				     (t->inp_flags2 & INP_REUSEPORT) == 0) &&
+				     (t->inp_flags2 & INP_REUSEPORT) ||
+				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
 				    (inp->inp_cred->cr_uid !=
 				     t->inp_cred->cr_uid))
 					return (EADDRINUSE);
@@ -264,34 +275,39 @@ in6_pcbbind(struct inpcb *inp, struct sockaddr *nam,
 				 */
 				tw = intotw(t);
 				if (tw == NULL ||
-				    (reuseport & tw->tw_so_options) == 0)
+				    ((reuseport & tw->tw_so_options) == 0 &&
+					 (reuseport_lb & tw->tw_so_options) == 0))
 					return (EADDRINUSE);
-			} else if (t && (reuseport & inp_so_options(t)) == 0) {
+			} else if (t && (reuseport & inp_so_options(t)) == 0 &&
+					   (reuseport_lb & inp_so_options(t)) == 0) {
 				return (EADDRINUSE);
 			}
 #ifdef INET
 			if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0 &&
-			    IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
+				IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
 				struct sockaddr_in sin;
 
 				in6_sin6_2_sin(&sin, sin6);
 				t = in_pcblookup_local(pcbinfo, sin.sin_addr,
-				    lport, lookupflags, cred);
+									   lport, lookupflags, cred);
 				if (t && t->inp_flags & INP_TIMEWAIT) {
 					tw = intotw(t);
 					if (tw == NULL)
 						return (EADDRINUSE);
 					if ((reuseport & tw->tw_so_options) == 0
-					    && (ntohl(t->inp_laddr.s_addr) !=
-					     INADDR_ANY || ((inp->inp_vflag &
-					     INP_IPV6PROTO) ==
-					     (t->inp_vflag & INP_IPV6PROTO))))
+						&& (reuseport_lb & tw->tw_so_options) == 0
+						&& (ntohl(t->inp_laddr.s_addr) !=
+							INADDR_ANY || ((inp->inp_vflag &
+											INP_IPV6PROTO) ==
+										   (t->inp_vflag & INP_IPV6PROTO))))
 						return (EADDRINUSE);
 				} else if (t &&
-				    (reuseport & inp_so_options(t)) == 0 &&
-				    (ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
-				    (t->inp_vflag & INP_IPV6PROTO) != 0))
+						   (reuseport & inp_so_options(t)) == 0 &&
+						   (reuseport_lb & inp_so_options(t)) == 0 &&
+						   (ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
+							(t->inp_vflag & INP_IPV6PROTO) != 0)) {
 					return (EADDRINUSE);
+				}
 			}
 #endif
 		}
@@ -856,6 +872,54 @@ in6_rtchange(struct inpcb *inp, int errno)
 	return inp;
 }
 
+static struct inpcb *
+in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
+  const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr,
+  uint16_t fport, int lookupflags)
+{
+	struct inpcb *local_wild = NULL;
+	const struct inpcblbgrouphead *hdr;
+	struct inpcblbgroup *grp;
+	struct inpcblbgroup *grp_local_wild;
+
+	hdr = &pcbinfo->ipi_lbgrouphashbase[
+		  INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
+
+	/*
+	 * Order of socket selection:
+	 * 1. non-wild.
+	 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
+	 *
+	 * NOTE:
+	 * - Load balanced group does not contain jailed sockets
+	 * - Load balanced does not contain IPv4 mapped INET6 wild sockets
+	 */
+	LIST_FOREACH(grp, hdr, il_list) {
+
+		if (grp->il_lport == lport) {
+			uint32_t idx = 0;
+			int pkt_hash = INP_PCBLBGROUP_PKTHASH(
+						       INP6_PCBHASHKEY(faddr), lport, fport);
+
+			idx = pkt_hash % grp->il_inpcnt;
+
+			if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) {
+				return grp->il_inp[idx];
+			} else {
+				if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) &&
+					(lookupflags & INPLOOKUP_WILDCARD)) {
+					local_wild = grp->il_inp[idx];
+					grp_local_wild = grp;
+				}
+			}
+		}
+	}
+	if (local_wild != NULL) {
+		return local_wild;
+	}
+	return NULL;
+}
+
 #ifdef PCBGROUP
 /*
  * Lookup PCB in hash list, using pcbgroup tables.
@@ -1057,6 +1121,8 @@ found:

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-head mailing list