git: 5f610cf1ec62 - stable/14 - inpcb: Close some SO_REUSEPORT_LB races
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Tue, 14 Jan 2025 14:43:32 UTC
The branch stable/14 has been updated by markj:
URL: https://cgit.FreeBSD.org/src/commit/?id=5f610cf1ec62ee7c9843a2eee5010901f3ebdbe6
commit 5f610cf1ec62ee7c9843a2eee5010901f3ebdbe6
Author: Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2024-12-12 14:02:12 +0000
Commit: Mark Johnston <markj@FreeBSD.org>
CommitDate: 2025-01-14 14:14:24 +0000
inpcb: Close some SO_REUSEPORT_LB races
For a long time, the inpcb lookup path has been lockless in the common
case: we use net_epoch to synchronize lookups. However, the routines
which update lbgroups were not careful to synchronize with unlocked
lookups. I believe that in the worst case this can result in spurious
connection aborts (I have a regression test case to exercise this), but
it's hard to be certain.
Modify in_pcblbgroup* routines to synchronize with unlocked lookup:
- When removing inpcbs from an lbgroup, do not shrink the array.
The maximum number of lbgroup entries is INPCBLBGROUP_SIZMAX (256),
and it doesn't seem worth the complexity to shrink the array when a
socket is removed.
- When resizing an lbgroup, do not insert it into the hash table until
it is fully initialized; otherwise lookups may observe a partially
constructed lbgroup.
- When adding an inpcb to the group, increment the counter after adding
the array entry, using a release store. Otherwise it's possible for
lookups to observe a null array slot.
- When looking up an entry, use a corresponding acquire load.
Reviewed by: ae, glebius
MFC after: 1 month
Sponsored by: Klara, Inc.
Sponsored by: Stormshield
Differential Revision: https://reviews.freebsd.org/D48020
(cherry picked from commit a600aabe9b04f0906069a8fb1f8d696ad186080f)
---
sys/netinet/in_pcb.c | 94 ++++++++++++++++++++++++++++----------------------
sys/netinet6/in6_pcb.c | 13 +++++--
2 files changed, 63 insertions(+), 44 deletions(-)
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 331804545bee..15729bcd1ce3 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -255,9 +255,8 @@ static void in_pcbremhash(struct inpcb *);
*/
static struct inpcblbgroup *
-in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred,
- u_char vflag, uint16_t port, const union in_dependaddr *addr, int size,
- uint8_t numa_domain)
+in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
+ const union in_dependaddr *addr, int size, uint8_t numa_domain)
{
struct inpcblbgroup *grp;
size_t bytes;
@@ -272,7 +271,6 @@ in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred,
grp->il_numa_domain = numa_domain;
grp->il_dependladdr = *addr;
grp->il_inpsiz = size;
- CK_LIST_INSERT_HEAD(hdr, grp, il_list);
return (grp);
}
@@ -294,6 +292,24 @@ in_pcblbgroup_free(struct inpcblbgroup *grp)
NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}
+static void
+in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
+{
+ KASSERT(grp->il_inpcnt < grp->il_inpsiz,
+ ("invalid local group size %d and count %d", grp->il_inpsiz,
+ grp->il_inpcnt));
+ INP_WLOCK_ASSERT(inp);
+
+ inp->inp_flags |= INP_INLBGROUP;
+ grp->il_inp[grp->il_inpcnt] = inp;
+
+ /*
+ * Synchronize with in_pcblookup_lbgroup(): make sure that we don't
+ * expose a null slot to the lookup path.
+ */
+ atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
+}
+
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
struct inpcblbgroup *old_grp, int size)
@@ -301,7 +317,7 @@ in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
struct inpcblbgroup *grp;
int i;
- grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag,
+ grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
old_grp->il_lport, &old_grp->il_dependladdr, size,
old_grp->il_numa_domain);
if (grp == NULL)
@@ -314,34 +330,11 @@ in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
for (i = 0; i < old_grp->il_inpcnt; ++i)
grp->il_inp[i] = old_grp->il_inp[i];
grp->il_inpcnt = old_grp->il_inpcnt;
+ CK_LIST_INSERT_HEAD(hdr, grp, il_list);
in_pcblbgroup_free(old_grp);
return (grp);
}
-/*
- * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
- * and shrink group if possible.
- */
-static void
-in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
- int i)
-{
- struct inpcblbgroup *grp, *new_grp;
-
- grp = *grpp;
- for (; i + 1 < grp->il_inpcnt; ++i)
- grp->il_inp[i] = grp->il_inp[i + 1];
- grp->il_inpcnt--;
-
- if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
- grp->il_inpcnt <= grp->il_inpsiz / 4) {
- /* Shrink this group. */
- new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
- if (new_grp != NULL)
- *grpp = new_grp;
- }
-}
-
/*
* Add PCB to load balance group for SO_REUSEPORT_LB option.
*/
@@ -386,11 +379,13 @@ in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
}
if (grp == NULL) {
/* Create new load balance group. */
- grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag,
+ grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
INPCBLBGROUP_SIZMIN, numa_domain);
if (grp == NULL)
return (ENOBUFS);
+ in_pcblbgroup_insert(grp, inp);
+ CK_LIST_INSERT_HEAD(hdr, grp, il_list);
} else if (grp->il_inpcnt == grp->il_inpsiz) {
if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
if (ratecheck(&lastprint, &interval))
@@ -403,15 +398,10 @@ in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
if (grp == NULL)
return (ENOBUFS);
+ in_pcblbgroup_insert(grp, inp);
+ } else {
+ in_pcblbgroup_insert(grp, inp);
}
-
- KASSERT(grp->il_inpcnt < grp->il_inpsiz,
- ("invalid local group size %d and count %d", grp->il_inpsiz,
- grp->il_inpcnt));
-
- grp->il_inp[grp->il_inpcnt] = inp;
- grp->il_inpcnt++;
- inp->inp_flags |= INP_INLBGROUP;
return (0);
}
@@ -443,8 +433,17 @@ in_pcbremlbgrouphash(struct inpcb *inp)
/* We are the last, free this local group. */
in_pcblbgroup_free(grp);
} else {
- /* Pull up inpcbs, shrink group if possible. */
- in_pcblbgroup_reorder(hdr, &grp, i);
+ KASSERT(grp->il_inpcnt >= 2,
+ ("invalid local group count %d",
+ grp->il_inpcnt));
+ grp->il_inp[i] =
+ grp->il_inp[grp->il_inpcnt - 1];
+
+ /*
+ * Synchronize with in_pcblookup_lbgroup().
+ */
+ atomic_store_rel_int(&grp->il_inpcnt,
+ grp->il_inpcnt - 1);
}
inp->inp_flags &= ~INP_INLBGROUP;
return;
@@ -2112,8 +2111,11 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
const struct inpcblbgrouphead *hdr;
struct inpcblbgroup *grp;
struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
+ struct inpcb *inp;
+ u_int count;
INP_HASH_LOCK_ASSERT(pcbinfo);
+ NET_EPOCH_ASSERT();
hdr = &pcbinfo->ipi_lbgrouphashbase[
INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];
@@ -2172,9 +2174,17 @@ in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
grp = local_wild;
if (grp == NULL)
return (NULL);
+
out:
- return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
- grp->il_inpcnt]);
+ /*
+ * Synchronize with in_pcblbgroup_insert().
+ */
+ count = atomic_load_acq_int(&grp->il_inpcnt);
+ if (count == 0)
+ return (NULL);
+ inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
+ KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
+ return (inp);
}
static bool
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 04b87819d629..b3b2f03451aa 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -922,6 +922,8 @@ in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
const struct inpcblbgrouphead *hdr;
struct inpcblbgroup *grp;
struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
+ struct inpcb *inp;
+ u_int count;
INP_HASH_LOCK_ASSERT(pcbinfo);
@@ -983,8 +985,15 @@ in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
if (grp == NULL)
return (NULL);
out:
- return (grp->il_inp[INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
- grp->il_inpcnt]);
+ /*
+ * Synchronize with in_pcblbgroup_insert().
+ */
+ count = atomic_load_acq_int(&grp->il_inpcnt);
+ if (count == 0)
+ return (NULL);
+ inp = grp->il_inp[INP6_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
+ KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
+ return (inp);
}
static bool