git: 69f61cee2efb - main - unix/stream: fix a race with MSG_PEEK on SOCK_SEQPACKET with MSG_EOR

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Fri, 14 Nov 2025 02:40:26 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=69f61cee2efb1eec0640ca7de9b2d51599569a5d

commit 69f61cee2efb1eec0640ca7de9b2d51599569a5d
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2025-11-14 02:39:48 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2025-11-14 02:39:48 +0000

    unix/stream: fix a race with MSG_PEEK on SOCK_SEQPACKET with MSG_EOR
    
    The pr_soreceive method first scans the buffer holding both the I/O sx(9)
    and socket buffer mutex(9) and after figuring out how much needs to be
    copied out drops the mutex.  Since the other side may only append to the
    buffer, it is safe to continue the operation holding the sx(9) only.
    However, the code had a bug: it used a pointer in the very last mbuf as a
    marker of the place where to stop.  This worked both in the case when we
    drain the buffer completely (the marker points at NULL) and in the case
    when we wanted to stop at MSG_EOR (the marker points at the next mbuf
    after MSG_EOR).
    However, this pointer is not consistent after we dropped the socket buffer
    mutex.
    
    Rewrite the logic to use the data length as bounds for the copyout cycle.
    
    Provide a test case that reproduces the race.  Note that the race is very
    hard to hit, thus the test will pass on an unmodified kernel as well.  In a
    virtual machine I needed to add tsleep(9) for 10 nanoseconds into the
    middle of the function to be able to reproduce it.
    
    PR:                     290658
    Reviewed by:            markj
    Differential Revision:  https://reviews.freebsd.org/D53632
    Fixes:                  d15792780760ef94647af9b377b5f0a80e1826bc
---
 sys/kern/uipc_usrreq.c               | 87 +++++++++++++++++-------------------
 tests/sys/kern/unix_seqpacket_test.c | 62 +++++++++++++++++++++++++
 2 files changed, 103 insertions(+), 46 deletions(-)

diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 86e1694fb869..b1cb6de98b5b 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -1372,8 +1372,8 @@ uipc_soreceive_stream_or_seqpacket(struct socket *so, struct sockaddr **psa,
     struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	struct sockbuf *sb = &so->so_rcv;
-	struct mbuf *control, *m, *first, *last, *next;
-	u_int ctl, space, datalen, mbcnt, lastlen;
+	struct mbuf *control, *m, *first, *part, *next;
+	u_int ctl, space, datalen, mbcnt, partlen;
 	int error, flags;
 	bool nonblock, waitall, peek;
 
@@ -1457,22 +1457,16 @@ restart:
 		control = NULL;
 
 	/*
-	 * Find split point for the next copyout.  On exit from the loop:
-	 * last == NULL - socket to be flushed
-	 * last != NULL
-	 *   lastlen > last->m_len - uio to be filled, last to be adjusted
-	 *   lastlen == 0          - MT_CONTROL, M_EOR or M_NOTREADY encountered
+	 * Find split point for the next copyout.  On exit from the loop,
+	 * 'next' points to the new head of the buffer STAILQ and 'datalen'
+	 * contains the amount of data we will copy out at the end.  The
+	 * copyout is protected by the I/O lock only, as writers can only
+	 * append to the buffer.  We need to record the socket buffer state
+	 * and do all length adjustments before dropping the socket buffer lock.
 	 */
-	space = uio->uio_resid;
-	datalen = 0;
-	for (m = first, last = sb->uxst_fnrdy, lastlen = 0;
-	     m != sb->uxst_fnrdy;
+	for (space = uio->uio_resid, m = next = first, part = NULL, datalen = 0;
+	     space > 0 && m != sb->uxst_fnrdy && m->m_type == MT_DATA;
 	     m = STAILQ_NEXT(m, m_stailq)) {
-		if (m->m_type != MT_DATA) {
-			last = m;
-			lastlen = 0;
-			break;
-		}
 		if (space >= m->m_len) {
 			space -= m->m_len;
 			datalen += m->m_len;
@@ -1480,29 +1474,29 @@ restart:
 			if (m->m_flags & M_EXT)
 				mbcnt += m->m_ext.ext_size;
 			if (m->m_flags & M_EOR) {
-				last = STAILQ_NEXT(m, m_stailq);
-				lastlen = 0;
 				flags |= MSG_EOR;
+				next = STAILQ_NEXT(m, m_stailq);
 				break;
 			}
 		} else {
 			datalen += space;
-			last = m;
-			lastlen = space;
+			partlen = space;
+			if (!peek) {
+				m->m_len -= partlen;
+				m->m_data += partlen;
+			}
+			next = part = m;
 			break;
 		}
+		next = STAILQ_NEXT(m, m_stailq);
 	}
 
-	UIPC_STREAM_SBCHECK(sb);
 	if (!peek) {
-		if (last == NULL)
+		STAILQ_FIRST(&sb->uxst_mbq) = next;
+#ifdef INVARIANTS
+		if (next == NULL)
 			STAILQ_INIT(&sb->uxst_mbq);
-		else {
-			STAILQ_FIRST(&sb->uxst_mbq) = last;
-			MPASS(last->m_len > lastlen);
-			last->m_len -= lastlen;
-			last->m_data += lastlen;
-		}
+#endif
 		MPASS(sb->sb_acc >= datalen);
 		sb->sb_acc -= datalen;
 		sb->sb_ccc -= datalen;
@@ -1629,33 +1623,34 @@ restart:
 		}
 	}
 
-	for (m = first; m != last; m = next) {
+	for (m = first; datalen > 0; m = next) {
+		void *data;
+		u_int len;
+
 		next = STAILQ_NEXT(m, m_stailq);
-		error = uiomove(mtod(m, char *), m->m_len, uio);
+		if (m == part) {
+			data = peek ?
+			    mtod(m, char *) : mtod(m, char *) - partlen;
+			len = partlen;
+		} else {
+			data = mtod(m, char *);
+			len = m->m_len;
+		}
+		error = uiomove(data, len, uio);
 		if (__predict_false(error)) {
-			SOCK_IO_RECV_UNLOCK(so);
 			if (!peek)
-				for (; m != last; m = next) {
+				for (; m != part && datalen > 0; m = next) {
 					next = STAILQ_NEXT(m, m_stailq);
+					MPASS(datalen >= m->m_len);
+					datalen -= m->m_len;
 					m_free(m);
 				}
-			return (error);
-		}
-		if (!peek)
-			m_free(m);
-	}
-	if (last != NULL && lastlen > 0) {
-		if (!peek) {
-			MPASS(!(m->m_flags & M_PKTHDR));
-			MPASS(last->m_data - M_START(last) >= lastlen);
-			error = uiomove(mtod(last, char *) - lastlen,
-			    lastlen, uio);
-		} else
-			error = uiomove(mtod(last, char *), lastlen, uio);
-		if (__predict_false(error)) {
 			SOCK_IO_RECV_UNLOCK(so);
 			return (error);
 		}
+		datalen -= len;
+		if (!peek && m != part)
+			m_free(m);
 	}
 	if (waitall && !(flags & MSG_EOR) && uio->uio_resid > 0)
 		goto restart;
diff --git a/tests/sys/kern/unix_seqpacket_test.c b/tests/sys/kern/unix_seqpacket_test.c
index b9a6be015241..27bd430430b4 100644
--- a/tests/sys/kern/unix_seqpacket_test.c
+++ b/tests/sys/kern/unix_seqpacket_test.c
@@ -1314,6 +1314,67 @@ ATF_TC_BODY(random_eor_and_waitall, tc)
 	free(params.records);
 }
 
+/* See bug 290658. */
+#define	PEEK_RACE_SIZE	10
+#define	PEEK_RACE_TRIES	10000
+static void *
+peek_race_writer(void *args)
+{
+	struct timespec ts = {};
+	u_short seed[3];
+	char buf[PEEK_RACE_SIZE];
+	int fd = *(int *)args;
+
+	arc4random_buf(seed, sizeof(seed));
+	for (u_int i = 0; i < PEEK_RACE_TRIES; i++) {
+		ATF_REQUIRE_EQ(PEEK_RACE_SIZE,
+		    send(fd, buf, sizeof(buf), MSG_EOR));
+		ts.tv_nsec = nrand48(seed) % 20;
+		(void)clock_nanosleep(CLOCK_MONOTONIC_FAST, 0, &ts, NULL);
+	}
+
+	return (NULL);
+}
+
+static void *
+peek_race_peeker(void *args)
+{
+	char buf[PEEK_RACE_SIZE * 10];
+	int fd = *(int *)args;
+
+	for (u_int i = 0; i < PEEK_RACE_TRIES; i++) {
+		ssize_t rcvd;
+
+		while ((rcvd = recv(fd, buf, sizeof(buf),
+		    MSG_PEEK | MSG_DONTWAIT)) == -1)
+			ATF_REQUIRE(errno == EAGAIN);
+		ATF_REQUIRE(rcvd == PEEK_RACE_SIZE);
+
+		ATF_REQUIRE_EQ(PEEK_RACE_SIZE,
+		    recv(fd, buf, sizeof(buf), 0));
+	}
+
+	return (NULL);
+}
+
+ATF_TC_WITHOUT_HEAD(peek_race);
+ATF_TC_BODY(peek_race, tc)
+{
+	pthread_t peeker, writer;
+	int sv[2];
+
+	do_socketpair(sv);
+
+	ATF_REQUIRE_EQ(0, pthread_create(&writer, NULL, peek_race_writer,
+	    &sv[0]));
+	ATF_REQUIRE_EQ(0, pthread_create(&peeker, NULL, peek_race_peeker,
+	    &sv[1]));
+	ATF_REQUIRE_EQ(0, pthread_join(writer, NULL));
+	ATF_REQUIRE_EQ(0, pthread_join(peeker, NULL));
+	close(sv[0]);
+	close(sv[1]);
+}
+
 /*
  * Main.
  */
@@ -1370,6 +1431,7 @@ ATF_TP_ADD_TCS(tp)
 	ATF_TP_ADD_TC(tp, pipe_128k_8k);
 	ATF_TP_ADD_TC(tp, pipe_128k_128k);
 	ATF_TP_ADD_TC(tp, random_eor_and_waitall);
+	ATF_TP_ADD_TC(tp, peek_race);
 
 	return atf_no_error();
 }