git: a35f24c9055d - main - sendfile: factor out socket send buffer space sensing into a method
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Wed, 30 Apr 2025 21:18:12 UTC
The branch main has been updated by glebius:
URL: https://cgit.FreeBSD.org/src/commit/?id=a35f24c9055d90b409eede861b7dc6defa01a118
commit a35f24c9055d90b409eede861b7dc6defa01a118
Author: Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2025-04-30 21:17:35 +0000
Commit: Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2025-04-30 21:17:35 +0000
sendfile: factor out socket send buffer space sensing into a method
Move a block of code that works with the socket send buffer from the main
sendfile loop into a separate function. Make it a protocol method, so
that protocols may provide a different one.
While here, provide a long comment explaining why we modify sb_lowat and
why we can't just remove that hack.
No functional change intended.
Reviewed by: markj
Differential Revision: https://reviews.freebsd.org/D48918
---
sys/kern/kern_sendfile.c | 155 ++++++++++++++++++++++++-----------------------
sys/kern/uipc_domain.c | 7 +++
sys/kern/uipc_usrreq.c | 1 +
sys/netinet/tcp_usrreq.c | 2 +
sys/sys/protosw.h | 6 +-
sys/sys/socketvar.h | 1 +
6 files changed, 95 insertions(+), 77 deletions(-)
diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c
index d409f41824ad..1fd1828c37c7 100644
--- a/sys/kern/kern_sendfile.c
+++ b/sys/kern/kern_sendfile.c
@@ -667,6 +667,77 @@ sendfile_getsock(struct thread *td, int s, struct file **sock_fp,
return (0);
}
+/*
+ * Check socket state and wait (or EAGAIN) for needed amount of space.
+ */
+int
+sendfile_wait_generic(struct socket *so, off_t need, int *space)
+{
+ int error;
+
+ MPASS(need > 0);
+ MPASS(space != NULL);
+
+ /*
+ * XXXGL: the hack with sb_lowat originates from d99b0dd2c5297. To
+ * achieve high performance sending with sendfile(2) a non-blocking
+ * socket needs a fairly high low watermark. Otherwise, the socket
+ * will be reported as writable too early, and sendfile(2) will send
+ * just a few bytes each time. It is important to understand that
+ * we are changing sb_lowat not for the current invocation of the
+ * syscall, but for the *next* syscall. So there is no way to
+ * workaround the problem, e.g. provide a special version of sbspace().
+ * Since this hack has been in the kernel for a long time, we
+ * anticipate that there is a lot of software that will suffer if we
+ * remove it. See also b21104487324.
+ */
+ error = 0;
+ SOCK_SENDBUF_LOCK(so);
+ if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
+ if (so->so_snd.sb_lowat < PAGE_SIZE && so->so_snd.sb_hiwat >= PAGE_SIZE)
+ so->so_snd.sb_lowat = PAGE_SIZE;
+retry_space:
+ if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ error = EPIPE;
+ goto done;
+ } else if (so->so_error) {
+ error = so->so_error;
+ so->so_error = 0;
+ goto done;
+ }
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ error = ENOTCONN;
+ goto done;
+ }
+
+ *space = sbspace(&so->so_snd);
+ if (*space < need && (*space <= 0 || *space < so->so_snd.sb_lowat)) {
+ if (so->so_state & SS_NBIO) {
+ error = EAGAIN;
+ goto done;
+ }
+ /*
+ * sbwait() drops the lock while sleeping. When we loop back
+ * to retry_space the state may have changed and we retest
+ * for it.
+ */
+ error = sbwait(so, SO_SND);
+ /*
+ * An error from sbwait() usually indicates that we've been
+ * interrupted by a signal. If we've sent anything then return
+ * bytes sent, otherwise return the error.
+ */
+ if (error != 0)
+ goto done;
+ goto retry_space;
+ }
+done:
+ SOCK_SENDBUF_UNLOCK(so);
+
+ return (error);
+}
+
int
vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
@@ -677,6 +748,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
struct vm_object *obj;
vm_page_t pga;
struct socket *so;
+ const struct protosw *pr;
#ifdef KERN_TLS
struct ktls_session *tls;
#endif
@@ -710,6 +782,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
error = sendfile_getsock(td, sockfd, &sock_fp, &so);
if (error != 0)
goto out;
+ pr = so->so_proto;
#ifdef MAC
error = mac_socket_check_send(td->td_ucred, so);
@@ -760,74 +833,8 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
int nios, space, npages, rhpages;
mtail = NULL;
- /*
- * Check the socket state for ongoing connection,
- * no errors and space in socket buffer.
- * If space is low allow for the remainder of the
- * file to be processed if it fits the socket buffer.
- * Otherwise block in waiting for sufficient space
- * to proceed, or if the socket is nonblocking, return
- * to userland with EAGAIN while reporting how far
- * we've come.
- * We wait until the socket buffer has significant free
- * space to do bulk sends. This makes good use of file
- * system read ahead and allows packet segmentation
- * offloading hardware to take over lots of work. If
- * we were not careful here we would send off only one
- * sfbuf at a time.
- */
- SOCKBUF_LOCK(&so->so_snd);
- if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
- so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
- if (so->so_snd.sb_lowat < PAGE_SIZE &&
- so->so_snd.sb_hiwat >= PAGE_SIZE)
- so->so_snd.sb_lowat = PAGE_SIZE;
-retry_space:
- if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
- error = EPIPE;
- SOCKBUF_UNLOCK(&so->so_snd);
- goto done;
- } else if (so->so_error) {
- error = so->so_error;
- so->so_error = 0;
- SOCKBUF_UNLOCK(&so->so_snd);
- goto done;
- }
- if ((so->so_state & SS_ISCONNECTED) == 0) {
- SOCKBUF_UNLOCK(&so->so_snd);
- error = ENOTCONN;
+ if ((error = pr->pr_sendfile_wait(so, rem, &space)) != 0)
goto done;
- }
-
- space = sbspace(&so->so_snd);
- if (space < rem &&
- (space <= 0 ||
- space < so->so_snd.sb_lowat)) {
- if (so->so_state & SS_NBIO) {
- SOCKBUF_UNLOCK(&so->so_snd);
- error = EAGAIN;
- goto done;
- }
- /*
- * sbwait drops the lock while sleeping.
- * When we loop back to retry_space the
- * state may have changed and we retest
- * for it.
- */
- error = sbwait(so, SO_SND);
- /*
- * An error from sbwait usually indicates that we've
- * been interrupted by a signal. If we've sent anything
- * then return bytes sent, otherwise return the error.
- */
- if (error != 0) {
- SOCKBUF_UNLOCK(&so->so_snd);
- goto done;
- }
- goto retry_space;
- }
- SOCKBUF_UNLOCK(&so->so_snd);
-
/*
* At the beginning of the first loop check if any headers
* are specified and copy them into mbufs. Reduce space in
@@ -966,8 +973,7 @@ retry_space:
*
* TLS frames always require unmapped mbufs.
*/
- if ((mb_use_ext_pgs &&
- so->so_proto->pr_protocol == IPPROTO_TCP)
+ if ((mb_use_ext_pgs && pr->pr_protocol == IPPROTO_TCP)
#ifdef KERN_TLS
|| tls != NULL
#endif
@@ -1172,8 +1178,8 @@ prepend_header:
sendfile_iodone(sfio, NULL, 0, 0);
#ifdef KERN_TLS
if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) {
- error = so->so_proto->pr_send(so,
- PRUS_NOTREADY, m, NULL, NULL, td);
+ error = pr->pr_send(so, PRUS_NOTREADY, m, NULL,
+ NULL, td);
if (error != 0) {
m_freem(m);
} else {
@@ -1182,14 +1188,13 @@ prepend_header:
}
} else
#endif
- error = so->so_proto->pr_send(so, 0, m, NULL,
- NULL, td);
+ error = pr->pr_send(so, 0, m, NULL, NULL, td);
} else {
sfio->so = so;
sfio->m = m0;
soref(so);
- error = so->so_proto->pr_send(so, PRUS_NOTREADY, m,
- NULL, NULL, td);
+ error = pr->pr_send(so, PRUS_NOTREADY, m, NULL, NULL,
+ td);
sendfile_iodone(sfio, NULL, 0, error);
}
#ifdef TCP_REQUEST_TRK
diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c
index 3f31f8ba421c..ebcf041790b2 100644
--- a/sys/kern/uipc_domain.c
+++ b/sys/kern/uipc_domain.c
@@ -144,6 +144,12 @@ pr_send_notsupp(struct socket *so, int flags, struct mbuf *m,
return (EOPNOTSUPP);
}
+static int
+pr_sendfile_wait_notsupp(struct socket *so, off_t need, int *space)
+{
+ return (EOPNOTSUPP);
+}
+
static int
pr_ready_notsupp(struct socket *so, struct mbuf *m, int count)
{
@@ -208,6 +214,7 @@ pr_init(struct domain *dom, struct protosw *pr)
NOTSUPP(pr_rcvd);
NOTSUPP(pr_rcvoob);
NOTSUPP(pr_send);
+ NOTSUPP(pr_sendfile_wait);
NOTSUPP(pr_shutdown);
NOTSUPP(pr_sockaddr);
NOTSUPP(pr_sosend);
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index a67e105a1447..79e4da5c8698 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -3373,6 +3373,7 @@ static struct protosw streamproto = {
.pr_peeraddr = uipc_peeraddr,
.pr_rcvd = uipc_rcvd,
.pr_send = uipc_send,
+ .pr_sendfile_wait = sendfile_wait_generic,
.pr_ready = uipc_ready,
.pr_sense = uipc_sense,
.pr_shutdown = uipc_shutdown,
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index fbc204097b25..c64d25f0fa87 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1419,6 +1419,7 @@ struct protosw tcp_protosw = {
.pr_rcvd = tcp_usr_rcvd,
.pr_rcvoob = tcp_usr_rcvoob,
.pr_send = tcp_usr_send,
+ .pr_sendfile_wait = sendfile_wait_generic,
.pr_ready = tcp_usr_ready,
.pr_shutdown = tcp_usr_shutdown,
.pr_sockaddr = in_getsockaddr,
@@ -1447,6 +1448,7 @@ struct protosw tcp6_protosw = {
.pr_rcvd = tcp_usr_rcvd,
.pr_rcvoob = tcp_usr_rcvoob,
.pr_send = tcp_usr_send,
+ .pr_sendfile_wait = sendfile_wait_generic,
.pr_ready = tcp_usr_ready,
.pr_shutdown = tcp_usr_shutdown,
.pr_sockaddr = in6_mapped_sockaddr,
diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h
index 4808f136cabf..7fb8dfdc7208 100644
--- a/sys/sys/protosw.h
+++ b/sys/sys/protosw.h
@@ -81,6 +81,7 @@ typedef enum {
} pr_send_flags_t;
typedef int pr_send_t(struct socket *, int, struct mbuf *,
struct sockaddr *, struct mbuf *, struct thread *);
+typedef int pr_sendfile_wait_t(struct socket *, off_t, int *);
typedef int pr_ready_t(struct socket *, struct mbuf *, int);
typedef int pr_sense_t(struct socket *, struct stat *);
typedef int pr_shutdown_t(struct socket *, enum shutdown_how);
@@ -109,9 +110,9 @@ struct protosw {
struct domain *pr_domain; /* domain protocol a member of */
pr_soreceive_t *pr_soreceive; /* recv(2) */
- pr_rcvd_t *pr_rcvd; /* soreceive_generic() if PR_WANTRCVD */
pr_sosend_t *pr_sosend; /* send(2) */
pr_send_t *pr_send; /* send(2) via sosend_generic() */
+ pr_sendfile_wait_t *pr_sendfile_wait; /* sendfile helper */
pr_ready_t *pr_ready; /* sendfile/ktls readyness */
pr_sopoll_t *pr_sopoll; /* poll(2) */
/* Cache line #2 */
@@ -121,7 +122,7 @@ struct protosw {
pr_disconnect_t *pr_disconnect; /* sodisconnect() */
pr_close_t *pr_close; /* close(2) */
pr_shutdown_t *pr_shutdown; /* shutdown(2) */
- pr_abort_t *pr_abort; /* abrupt tear down: soabort() */
+ pr_rcvd_t *pr_rcvd; /* soreceive_generic() if PR_WANTRCVD */
pr_aio_queue_t *pr_aio_queue; /* aio(9) */
/* Cache line #3 */
pr_bind_t *pr_bind; /* bind(2) */
@@ -133,6 +134,7 @@ struct protosw {
pr_control_t *pr_control; /* ioctl(2) */
pr_rcvoob_t *pr_rcvoob; /* soreceive_rcvoob() */
/* Cache line #4 */
+ pr_abort_t *pr_abort; /* abrupt tear down: soabort() */
pr_ctloutput_t *pr_ctloutput; /* control output (from above) */
pr_peeraddr_t *pr_peeraddr; /* getpeername(2) */
pr_sockaddr_t *pr_sockaddr; /* getsockname(2) */
diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h
index 8e70ada24259..9e3a59433f2b 100644
--- a/sys/sys/socketvar.h
+++ b/sys/sys/socketvar.h
@@ -555,6 +555,7 @@ int sosend_dgram(struct socket *so, struct sockaddr *addr,
int sosend_generic(struct socket *so, struct sockaddr *addr,
struct uio *uio, struct mbuf *top, struct mbuf *control,
int flags, struct thread *td);
+int sendfile_wait_generic(struct socket *so, off_t need, int *space);
int sosetfib(struct socket *so, int fibnum);
int soshutdown(struct socket *so, enum shutdown_how);
void soupcall_clear(struct socket *, sb_which);