Changes to PCBPORTHASH wrt TCP, review needed

Mike Silbersack silby at silby.com
Mon Oct 27 00:16:20 PST 2003


The attached patch is rather short, but makes a substantial change in how
ports are bound, so I'd like a quick review of the code and the concept if
possible.

In short, what this patch does is remove established TCP connections from
the PCBPORTHASH table, thereby allowing their ephemeral ports to be
concurrently used by other tcp connections to other hosts/ports.

To do this, I made the following changes:

- in_pcbinshash was modified so that the "porthash" parameter determines
whether or not the inp in question is added to the pcbporthash.

- in_pcbrehash was modified so that the "remove" parameter determines
whether the inp in question will be removed from the pcbportlist when it
is rehashed in the pcbhash

- in_pcbremlists was modified to account for pcbs not necessarily being in
both hashes

- syncache_socket uses the porthash parameter to ensure that incoming tcp
connections are never added to the pcbporthash.

- tcp_connect and tcp6_connect use ins_rehash to remove the inp in
question from the pcbporthash list after the final ip/port has been
determined.  Note that the pcbhash is still consulted, so an attempt to
create a tcp socket which conflicts with an already existing connection
will be detected correctly.

One easy way to test this patch is to install http_load, set your
ephemeral port range to something in the range of 30, and have it start
testing a host.  It will quickly create TIME_WAIT sockets filling all
ephemeral ports.  Without this patch, you will be unable to create
outgoing connections; with this patch, other outgoing connections will be
fine.

Note that since the port chosen is not released from the pcbporthash table
until tcp_connect, repeated calls to bind can still reserve all ephemeral
ports; I don't believe that is going to be a common case.

So far, the only problem I can see is that in_pcbbind, being unaware of
the state of pcbhash, may return ports which collide with in-use
connections for the destination host while non-colliding ports are still
available.  I should be able to work around this potential issue with a
simple retry loop, so it's not a big concern.

Otherwise, can anyone see any problems that this change would cause?  I
believe that it would change behavior so that SO_REUSEADDR and
SO_REUSEPORT would no longer be required for daemons which restart and
attempt to listen on the same port, but I believe this to be a positive
change.  Ports used by tcp listen sockets and udp sockets should be
protected as before, so that should be ok as well.  Am I missing something
subtle?

Thanks,

Mike "Silby" Silbersack
-------------- next part --------------
diff -u -r /usr/src/sys.old/netinet/in_pcb.c /usr/src/sys/netinet/in_pcb.c
--- /usr/src/sys.old/netinet/in_pcb.c	Fri Oct 24 12:05:00 2003
+++ /usr/src/sys/netinet/in_pcb.c	Sun Oct 26 21:14:36 2003
@@ -212,7 +212,7 @@
 	    &inp->inp_lport, td);
 	if (error)
 		return (error);
-	if (in_pcbinshash(inp) != 0) {
+	if (in_pcbinshash(inp, 1) != 0) {
 		inp->inp_laddr.s_addr = INADDR_ANY;
 		inp->inp_lport = 0;
 		return (EAGAIN);
@@ -460,7 +460,7 @@
 	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
 		inp->inp_lport = lport;
 		inp->inp_laddr.s_addr = laddr;
-		if (in_pcbinshash(inp) != 0) {
+		if (in_pcbinshash(inp, 1) != 0) {
 			inp->inp_laddr.s_addr = INADDR_ANY;
 			inp->inp_lport = 0;
 			return (EAGAIN);
@@ -472,7 +472,7 @@
 	inp->inp_laddr.s_addr = laddr;
 	inp->inp_faddr.s_addr = faddr;
 	inp->inp_fport = fport;
-	in_pcbrehash(inp);
+	in_pcbrehash(inp, 0);
 	if (anonport)
 		inp->inp_flags |= INP_ANONPORT;
 	return (0);
@@ -652,7 +652,7 @@
 
 	inp->inp_faddr.s_addr = INADDR_ANY;
 	inp->inp_fport = 0;
-	in_pcbrehash(inp);
+	in_pcbrehash(inp, 0);
 	if (inp->inp_socket->so_state & SS_NOFDREF)
 		in_pcbdetach(inp);
 }
@@ -1075,8 +1075,9 @@
  * Insert PCB onto various hash lists.
  */
 int
-in_pcbinshash(inp)
+in_pcbinshash(inp, porthash)
 	struct inpcb *inp;
+	int porthash;
 {
 	struct inpcbhead *pcbhash;
 	struct inpcbporthead *pcbporthash;
@@ -1094,30 +1095,34 @@
 	pcbhash = &pcbinfo->hashbase[INP_PCBHASH(hashkey_faddr,
 		 inp->inp_lport, inp->inp_fport, pcbinfo->hashmask)];
 
-	pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport,
-	    pcbinfo->porthashmask)];
+	if (porthash) {
+		pcbporthash = &pcbinfo->porthashbase[INP_PCBPORTHASH(inp->inp_lport,
+		    pcbinfo->porthashmask)];
 
-	/*
-	 * Go through port list and look for a head for this lport.
-	 */
-	LIST_FOREACH(phd, pcbporthash, phd_hash) {
-		if (phd->phd_port == inp->inp_lport)
-			break;
-	}
-	/*
-	 * If none exists, malloc one and tack it on.
-	 */
-	if (phd == NULL) {
-		MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT);
+		/*
+		 * Go through port list and look for a head for this lport.
+		 */
+		LIST_FOREACH(phd, pcbporthash, phd_hash) {
+			if (phd->phd_port == inp->inp_lport)
+				break;
+		}
+		/*
+		 * If none exists, malloc one and tack it on.
+		 */
 		if (phd == NULL) {
-			return (ENOBUFS); /* XXX */
+			MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT);
+			if (phd == NULL) {
+				return (ENOBUFS); /* XXX */
+			}
+			phd->phd_port = inp->inp_lport;
+			LIST_INIT(&phd->phd_pcblist);
+			LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
 		}
-		phd->phd_port = inp->inp_lport;
-		LIST_INIT(&phd->phd_pcblist);
-		LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
+		inp->inp_phd = phd;
+		LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
+	} else {
+		inp->inp_phd = NULL;
 	}
-	inp->inp_phd = phd;
-	LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
 	LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
 	return (0);
 }
@@ -1129,11 +1134,13 @@
  * not change after in_pcbinshash() has been called.
  */
 void
-in_pcbrehash(inp)
+in_pcbrehash(inp, remove)
 	struct inpcb *inp;
+	int remove;
 {
 	struct inpcbhead *head;
 	u_int32_t hashkey_faddr;
+	struct inpcbport *phd = inp->inp_phd;
 
 #ifdef INET6
 	if (inp->inp_vflag & INP_IPV6)
@@ -1147,6 +1154,15 @@
 
 	LIST_REMOVE(inp, inp_hash);
 	LIST_INSERT_HEAD(head, inp, inp_hash);
+
+	if (remove && phd) {
+		LIST_REMOVE(inp, inp_portlist);
+		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
+			LIST_REMOVE(phd, phd_hash);
+			free(phd, M_PCB);
+		}
+		inp->inp_phd = NULL;
+	}
 }
 
 /*
@@ -1161,10 +1177,12 @@
 		struct inpcbport *phd = inp->inp_phd;
 
 		LIST_REMOVE(inp, inp_hash);
-		LIST_REMOVE(inp, inp_portlist);
-		if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
-			LIST_REMOVE(phd, phd_hash);
-			free(phd, M_PCB);
+		if (phd) {
+			LIST_REMOVE(inp, inp_portlist);
+			if (LIST_FIRST(&phd->phd_pcblist) == NULL) {
+				LIST_REMOVE(phd, phd_hash);
+				free(phd, M_PCB);
+			}
 		}
 	}
 	LIST_REMOVE(inp, inp_list);
diff -u -r /usr/src/sys.old/netinet/in_pcb.h /usr/src/sys/netinet/in_pcb.h
--- /usr/src/sys.old/netinet/in_pcb.h	Fri Oct 24 12:05:00 2003
+++ /usr/src/sys/netinet/in_pcb.h	Sun Oct 26 21:03:15 2003
@@ -340,7 +340,7 @@
 	    struct thread *);
 void	in_pcbdetach(struct inpcb *);
 void	in_pcbdisconnect(struct inpcb *);
-int	in_pcbinshash(struct inpcb *);
+int	in_pcbinshash(struct inpcb *, int);
 struct inpcb *
 	in_pcblookup_local(struct inpcbinfo *,
 	    struct in_addr, u_int, int);
@@ -349,7 +349,7 @@
 	    struct in_addr, u_int, int, struct ifnet *);
 void	in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr,
 	    int, struct inpcb *(*)(struct inpcb *, int));
-void	in_pcbrehash(struct inpcb *);
+void	in_pcbrehash(struct inpcb *, int);
 int	in_setpeeraddr(struct socket *so, struct sockaddr **nam, struct inpcbinfo *pcbinfo);
 int	in_setsockaddr(struct socket *so, struct sockaddr **nam, struct inpcbinfo *pcbinfo);;
 struct sockaddr *
diff -u -r /usr/src/sys.old/netinet/tcp_syncache.c /usr/src/sys/netinet/tcp_syncache.c
--- /usr/src/sys.old/netinet/tcp_syncache.c	Fri Oct 24 12:05:00 2003
+++ /usr/src/sys/netinet/tcp_syncache.c	Sun Oct 26 02:26:41 2003
@@ -605,7 +605,7 @@
 	}
 #endif
 	inp->inp_lport = sc->sc_inc.inc_lport;
-	if (in_pcbinshash(inp) != 0) {
+	if (in_pcbinshash(inp, 0) != 0) {
 		/*
 		 * Undo the assignments above if we failed to
 		 * put the PCB on the hash lists.
diff -u -r /usr/src/sys.old/netinet/tcp_usrreq.c /usr/src/sys/netinet/tcp_usrreq.c
--- /usr/src/sys.old/netinet/tcp_usrreq.c	Fri Oct 24 12:05:00 2003
+++ /usr/src/sys/netinet/tcp_usrreq.c	Mon Oct 27 01:43:50 2003
@@ -883,7 +883,7 @@
 			return EADDRINUSE;
 	}
 	inp->inp_laddr = laddr;
-	in_pcbrehash(inp);
+	in_pcbrehash(inp, 1);
 
 	/* Compute window scaling to request.  */
 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
@@ -972,7 +972,7 @@
 	inp->inp_fport = sin6->sin6_port;
 	if ((sin6->sin6_flowinfo & IPV6_FLOWINFO_MASK) != 0)
 		inp->in6p_flowinfo = sin6->sin6_flowinfo;
-	in_pcbrehash(inp);
+	in_pcbrehash(inp, 1);
 
 	/* Compute window scaling to request.  */
 	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
diff -u -r /usr/src/sys.old/netinet/udp_usrreq.c /usr/src/sys/netinet/udp_usrreq.c
--- /usr/src/sys.old/netinet/udp_usrreq.c	Fri Oct 24 12:05:00 2003
+++ /usr/src/sys/netinet/udp_usrreq.c	Sun Oct 26 02:18:41 2003
@@ -784,7 +784,7 @@
 		if (inp->inp_laddr.s_addr == INADDR_ANY &&
 		    inp->inp_lport == 0) {
 			inp->inp_lport = lport;
-			if (in_pcbinshash(inp) != 0) {
+			if (in_pcbinshash(inp, 1) != 0) {
 				inp->inp_lport = 0;
 				error = EAGAIN;
 				goto release;
diff -u -r /usr/src/sys.old/netinet6/in6_pcb.c /usr/src/sys/netinet6/in6_pcb.c
--- /usr/src/sys.old/netinet6/in6_pcb.c	Fri Oct 24 12:04:58 2003
+++ /usr/src/sys/netinet6/in6_pcb.c	Sun Oct 26 21:04:56 2003
@@ -280,7 +280,7 @@
 	}
 	else {
 		inp->inp_lport = lport;
-		if (in_pcbinshash(inp) != 0) {
+		if (in_pcbinshash(inp, 1) != 0) {
 			inp->in6p_laddr = in6addr_any;
 			inp->inp_lport = 0;
 			return (EAGAIN);
@@ -409,7 +409,7 @@
 		    (htonl(ip6_flow_seq++) & IPV6_FLOWLABEL_MASK);
 #endif
 
-	in_pcbrehash(inp);
+	in_pcbrehash(inp, 0);
 	return (0);
 }
 
@@ -421,7 +421,7 @@
 	inp->inp_fport = 0;
 	/* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
 	inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
-	in_pcbrehash(inp);
+	in_pcbrehash(inp, 0);
 	if (inp->inp_socket->so_state & SS_NOFDREF)
 		in6_pcbdetach(inp);
 }
diff -u -r /usr/src/sys.old/netinet6/in6_src.c /usr/src/sys/netinet6/in6_src.c
--- /usr/src/sys.old/netinet6/in6_src.c	Fri Oct 24 12:04:59 2003
+++ /usr/src/sys/netinet6/in6_src.c	Sun Oct 26 02:20:13 2003
@@ -394,7 +394,7 @@
 	}
 
 	inp->inp_lport = lport;
-	if (in_pcbinshash(inp) != 0) {
+	if (in_pcbinshash(inp, 1) != 0) {
 		inp->in6p_laddr = in6addr_any;
 		inp->inp_lport = 0;
 		return (EAGAIN);


More information about the freebsd-net mailing list