git: a35f24c9055d - main - sendfile: factor out socket send buffer space sensing into a method

From: Gleb Smirnoff <glebius_at_FreeBSD.org>
Date: Wed, 30 Apr 2025 21:18:12 UTC
The branch main has been updated by glebius:

URL: https://cgit.FreeBSD.org/src/commit/?id=a35f24c9055d90b409eede861b7dc6defa01a118

commit a35f24c9055d90b409eede861b7dc6defa01a118
Author:     Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2025-04-30 21:17:35 +0000
Commit:     Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2025-04-30 21:17:35 +0000

    sendfile: factor out socket send buffer space sensing into a method
    
    Move a block of code that works with the socket send buffer from the main
    sendfile loop into a separate function.  Make it a protocol method, so
    that protocols may provide a different one.
    
    While here, provide a long comment explaining why we modify sb_lowat and
    why we can't just remove that hack.
    
    No functional change intended.
    
    Reviewed by:            markj
    Differential Revision:  https://reviews.freebsd.org/D48918
---
 sys/kern/kern_sendfile.c | 155 ++++++++++++++++++++++++-----------------------
 sys/kern/uipc_domain.c   |   7 +++
 sys/kern/uipc_usrreq.c   |   1 +
 sys/netinet/tcp_usrreq.c |   2 +
 sys/sys/protosw.h        |   6 +-
 sys/sys/socketvar.h      |   1 +
 6 files changed, 95 insertions(+), 77 deletions(-)

diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index d409f41824ad..1fd1828c37c7 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -667,6 +667,77 @@ sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
 	return (0);
 }
 
+/*
+ * Check socket state and wait (or EAGAIN) for needed amount of space.
+ */
+int
+sendfile_wait_generic(struct socket *so, off_t need, int *space)
+{
+	int error;
+
+	MPASS(need > 0);
+	MPASS(space != NULL);
+
+	/*
+	 * XXXGL: the hack with sb_lowat originates from d99b0dd2c5297.  To
+	 * achieve high performance sending with sendfile(2) a non-blocking
+	 * socket needs a fairly high low watermark.  Otherwise, the socket
+	 * will be reported as writable too early, and sendfile(2) will send
+	 * just a few bytes each time.  It is important to understand that
+	 * we are changing sb_lowat not for the current invocation of the
+	 * syscall, but for the *next* syscall. So there is no way to
+	 * workaround the problem, e.g. provide a special version of sbspace().
+	 * Since this hack has been in the kernel for a long time, we
+	 * anticipate that there is a lot of software that will suffer if we
+	 * remove it.  See also b21104487324.
+	 */
+	error = 0;
+	SOCK_SENDBUF_LOCK(so);
+	if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
+		so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
+	if (so->so_snd.sb_lowat < PAGE_SIZE && so->so_snd.sb_hiwat >= PAGE_SIZE)
+		so->so_snd.sb_lowat = PAGE_SIZE;
+retry_space:
+	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+		error = EPIPE;
+		goto done;
+	} else if (so->so_error) {
+		error = so->so_error;
+		so->so_error = 0;
+		goto done;
+	}
+	if ((so->so_state & SS_ISCONNECTED) == 0) {
+		error = ENOTCONN;
+		goto done;
+	}
+
+	*space = sbspace(&so->so_snd);
+	if (*space < need && (*space <= 0 || *space < so->so_snd.sb_lowat)) {
+		if (so->so_state & SS_NBIO) {
+			error = EAGAIN;
+			goto done;
+		}
+		/*
+		 * sbwait() drops the lock while sleeping.  When we loop back
+		 * to retry_space the state may have changed and we retest
+		 * for it.
+		 */
+		error = sbwait(so, SO_SND);
+		/*
+		 * An error from sbwait() usually indicates that we've been
+		 * interrupted by a signal.  If we've sent anything then return
+		 * bytes sent, otherwise return the error.
+		 */
+		if (error != 0)
+			goto done;
+		goto retry_space;
+	}
+done:
+	SOCK_SENDBUF_UNLOCK(so);
+
+	return (error);
+}
+
 int
 vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
     struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
@@ -677,6 +748,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
 	struct vm_object *obj;
 	vm_page_t pga;
 	struct socket *so;
+	const struct protosw *pr;
 #ifdef KERN_TLS
 	struct ktls_session *tls;
 #endif
@@ -710,6 +782,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
 	error = sendfile_getsock(td, sockfd, &sock_fp, &so);
 	if (error != 0)
 		goto out;
+	pr = so->so_proto;
 
 #ifdef MAC
 	error = mac_socket_check_send(td->td_ucred, so);
@@ -760,74 +833,8 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
 		int nios, space, npages, rhpages;
 
 		mtail = NULL;
-		/*
-		 * Check the socket state for ongoing connection,
-		 * no errors and space in socket buffer.
-		 * If space is low allow for the remainder of the
-		 * file to be processed if it fits the socket buffer.
-		 * Otherwise block in waiting for sufficient space
-		 * to proceed, or if the socket is nonblocking, return
-		 * to userland with EAGAIN while reporting how far
-		 * we've come.
-		 * We wait until the socket buffer has significant free
-		 * space to do bulk sends.  This makes good use of file
-		 * system read ahead and allows packet segmentation
-		 * offloading hardware to take over lots of work.  If
-		 * we were not careful here we would send off only one
-		 * sfbuf at a time.
-		 */
-		SOCKBUF_LOCK(&so->so_snd);
-		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
-			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
-		if (so->so_snd.sb_lowat < PAGE_SIZE &&
-		    so->so_snd.sb_hiwat >= PAGE_SIZE)
-			so->so_snd.sb_lowat = PAGE_SIZE;
-retry_space:
-		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
-			error = EPIPE;
-			SOCKBUF_UNLOCK(&so->so_snd);
-			goto done;
-		} else if (so->so_error) {
-			error = so->so_error;
-			so->so_error = 0;
-			SOCKBUF_UNLOCK(&so->so_snd);
-			goto done;
-		}
-		if ((so->so_state & SS_ISCONNECTED) == 0) {
-			SOCKBUF_UNLOCK(&so->so_snd);
-			error = ENOTCONN;
+		if ((error = pr->pr_sendfile_wait(so, rem, &space)) != 0)
 			goto done;
-		}
-
-		space = sbspace(&so->so_snd);
-		if (space < rem &&
-		    (space <= 0 ||
-		     space < so->so_snd.sb_lowat)) {
-			if (so->so_state & SS_NBIO) {
-				SOCKBUF_UNLOCK(&so->so_snd);
-				error = EAGAIN;
-				goto done;
-			}
-			/*
-			 * sbwait drops the lock while sleeping.
-			 * When we loop back to retry_space the
-			 * state may have changed and we retest
-			 * for it.
-			 */
-			error = sbwait(so, SO_SND);
-			/*
-			 * An error from sbwait usually indicates that we've
-			 * been interrupted by a signal. If we've sent anything
-			 * then return bytes sent, otherwise return the error.
-			 */
-			if (error != 0) {
-				SOCKBUF_UNLOCK(&so->so_snd);
-				goto done;
-			}
-			goto retry_space;
-		}
-		SOCKBUF_UNLOCK(&so->so_snd);
-
 		/*
 		 * At the beginning of the first loop check if any headers
 		 * are specified and copy them into mbufs.  Reduce space in
@@ -966,8 +973,7 @@ retry_space:
 		 *
 		 * TLS frames always require unmapped mbufs.
 		 */
-		if ((mb_use_ext_pgs &&
-		    so->so_proto->pr_protocol == IPPROTO_TCP)
+		if ((mb_use_ext_pgs && pr->pr_protocol == IPPROTO_TCP)
 #ifdef KERN_TLS
 		    || tls != NULL
 #endif
@@ -1172,8 +1178,8 @@ prepend_header:
 				sendfile_iodone(sfio, NULL, 0, 0);
 #ifdef KERN_TLS
 			if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
-				error = so->so_proto->pr_send(so,
-				    PRUS_NOTREADY, m, NULL, NULL, td);
+				error = pr->pr_send(so, PRUS_NOTREADY, m, NULL,
+				    NULL, td);
 				if (error != 0) {
 					m_freem(m);
 				} else {
@@ -1182,14 +1188,13 @@ prepend_header:
 				}
 			} else
 #endif
-				error = so->so_proto->pr_send(so, 0, m, NULL,
-				    NULL, td);
+				error = pr->pr_send(so, 0, m, NULL, NULL, td);
 		} else {
 			sfio->so = so;
 			sfio->m = m0;
 			soref(so);
-			error = so->so_proto->pr_send(so, PRUS_NOTREADY, m,
-			    NULL, NULL, td);
+			error = pr->pr_send(so, PRUS_NOTREADY, m, NULL, NULL,
+			    td);
 			sendfile_iodone(sfio, NULL, 0, error);
 		}
 #ifdef TCP_REQUEST_TRK
diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c
index 3f31f8ba421c..ebcf041790b2 100644
--- a/sys/kern/uipc_domain.c
+++ b/sys/kern/uipc_domain.c
@@ -144,6 +144,12 @@ pr_send_notsupp(struct socket *so, int flags, struct mbuf *m,
 	return (EOPNOTSUPP);
 }
 
+static int
+pr_sendfile_wait_notsupp(struct socket *so, off_t need, int *space)
+{
+	return (EOPNOTSUPP);
+}
+
 static int
 pr_ready_notsupp(struct socket *so, struct mbuf *m, int count)
 {
@@ -208,6 +214,7 @@ pr_init(struct domain *dom, struct protosw *pr)
 	NOTSUPP(pr_rcvd);
 	NOTSUPP(pr_rcvoob);
 	NOTSUPP(pr_send);
+	NOTSUPP(pr_sendfile_wait);
 	NOTSUPP(pr_shutdown);
 	NOTSUPP(pr_sockaddr);
 	NOTSUPP(pr_sosend);
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index a67e105a1447..79e4da5c8698 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -3373,6 +3373,7 @@ static struct protosw streamproto = {
 	.pr_peeraddr =		uipc_peeraddr,
 	.pr_rcvd =		uipc_rcvd,
 	.pr_send =		uipc_send,
+	.pr_sendfile_wait =	sendfile_wait_generic,
 	.pr_ready =		uipc_ready,
 	.pr_sense =		uipc_sense,
 	.pr_shutdown =		uipc_shutdown,
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index fbc204097b25..c64d25f0fa87 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1419,6 +1419,7 @@ struct protosw tcp_protosw = {
 	.pr_rcvd =		tcp_usr_rcvd,
 	.pr_rcvoob =		tcp_usr_rcvoob,
 	.pr_send =		tcp_usr_send,
+	.pr_sendfile_wait =	sendfile_wait_generic,
 	.pr_ready =		tcp_usr_ready,
 	.pr_shutdown =		tcp_usr_shutdown,
 	.pr_sockaddr =		in_getsockaddr,
@@ -1447,6 +1448,7 @@ struct protosw tcp6_protosw = {
 	.pr_rcvd =		tcp_usr_rcvd,
 	.pr_rcvoob =		tcp_usr_rcvoob,
 	.pr_send =		tcp_usr_send,
+	.pr_sendfile_wait =	sendfile_wait_generic,
 	.pr_ready =		tcp_usr_ready,
 	.pr_shutdown =		tcp_usr_shutdown,
 	.pr_sockaddr =		in6_mapped_sockaddr,
diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h
index 4808f136cabf..7fb8dfdc7208 100644
--- a/sys/sys/protosw.h
+++ b/sys/sys/protosw.h
@@ -81,6 +81,7 @@ typedef enum {
 } pr_send_flags_t;
 typedef int	pr_send_t(struct socket *, int, struct mbuf *,
 		    struct sockaddr *, struct mbuf *, struct thread *);
+typedef	int	pr_sendfile_wait_t(struct socket *, off_t, int *);
 typedef int	pr_ready_t(struct socket *, struct mbuf *, int);
 typedef int	pr_sense_t(struct socket *, struct stat *);
 typedef int	pr_shutdown_t(struct socket *, enum shutdown_how);
@@ -109,9 +110,9 @@ struct protosw {
 	struct	domain	*pr_domain;	/* domain protocol a member of */
 
 	pr_soreceive_t	*pr_soreceive;	/* recv(2) */
-	pr_rcvd_t	*pr_rcvd;	/* soreceive_generic() if PR_WANTRCVD */
 	pr_sosend_t	*pr_sosend;	/* send(2) */
 	pr_send_t	*pr_send;	/* send(2) via sosend_generic() */
+	pr_sendfile_wait_t  *pr_sendfile_wait;	/* sendfile helper */
 	pr_ready_t	*pr_ready;	/* sendfile/ktls readyness */
 	pr_sopoll_t	*pr_sopoll;	/* poll(2) */
 /* Cache line #2 */
@@ -121,7 +122,7 @@ struct protosw {
 	pr_disconnect_t	*pr_disconnect;	/* sodisconnect() */
 	pr_close_t	*pr_close;	/* close(2) */
 	pr_shutdown_t	*pr_shutdown;	/* shutdown(2) */
-	pr_abort_t	*pr_abort;	/* abrupt tear down: soabort() */
+	pr_rcvd_t	*pr_rcvd;	/* soreceive_generic() if PR_WANTRCVD */
 	pr_aio_queue_t	*pr_aio_queue;	/* aio(9) */
 /* Cache line #3 */
 	pr_bind_t	*pr_bind;	/* bind(2) */
@@ -133,6 +134,7 @@ struct protosw {
 	pr_control_t	*pr_control;	/* ioctl(2) */
 	pr_rcvoob_t	*pr_rcvoob;	/* soreceive_rcvoob() */
 /* Cache line #4 */
+	pr_abort_t	*pr_abort;	/* abrupt tear down: soabort() */
 	pr_ctloutput_t	*pr_ctloutput;	/* control output (from above) */
 	pr_peeraddr_t	*pr_peeraddr;	/* getpeername(2) */
 	pr_sockaddr_t	*pr_sockaddr;	/* getsockname(2) */
diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h
index 8e70ada24259..9e3a59433f2b 100644
--- a/sys/sys/socketvar.h
+++ b/sys/sys/socketvar.h
@@ -555,6 +555,7 @@ int	sosend_dgram(struct socket *so, struct sockaddr *addr,
 int	sosend_generic(struct socket *so, struct sockaddr *addr,
 	    struct uio *uio, struct mbuf *top, struct mbuf *control,
 	    int flags, struct thread *td);
+int	sendfile_wait_generic(struct socket *so, off_t need, int *space);
 int	sosetfib(struct socket *so, int fibnum);
 int	soshutdown(struct socket *so, enum shutdown_how);
 void	soupcall_clear(struct socket *, sb_which);