git: a35f24c9055d - main - sendfile: factor out socket send buffer space sensing into a method
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Wed, 30 Apr 2025 21:18:12 UTC
The branch main has been updated by glebius: URL: https://cgit.FreeBSD.org/src/commit/?id=a35f24c9055d90b409eede861b7dc6defa01a118 commit a35f24c9055d90b409eede861b7dc6defa01a118 Author: Gleb Smirnoff <glebius@FreeBSD.org> AuthorDate: 2025-04-30 21:17:35 +0000 Commit: Gleb Smirnoff <glebius@FreeBSD.org> CommitDate: 2025-04-30 21:17:35 +0000 sendfile: factor out socket send buffer space sensing into a method Move a block of code that works with the socket send buffer from the main sendfile loop into a separate function. Make it a protocol method, so that protocols may provide a different one. While here, provide a long comment explaining why we modify sb_lowat and why we can't just remove that hack. No functional change intended. Reviewed by: markj Differential Revision: https://reviews.freebsd.org/D48918 --- sys/kern/kern_sendfile.c | 155 ++++++++++++++++++++++++----------------------- sys/kern/uipc_domain.c | 7 +++ sys/kern/uipc_usrreq.c | 1 + sys/netinet/tcp_usrreq.c | 2 + sys/sys/protosw.h | 6 +- sys/sys/socketvar.h | 1 + 6 files changed, 95 insertions(+), 77 deletions(-) diff --git a/sys/kern/kern_sendfile.c b/sys/kern/kern_sendfile.c index d409f41824ad..1fd1828c37c7 100644 --- a/sys/kern/kern_sendfile.c +++ b/sys/kern/kern_sendfile.c @@ -667,6 +667,77 @@ sendfile_getsock(struct thread *td, int s, struct file **sock_fp, return (0); } +/* + * Check socket state and wait (or EAGAIN) for needed amount of space. + */ +int +sendfile_wait_generic(struct socket *so, off_t need, int *space) +{ + int error; + + MPASS(need > 0); + MPASS(space != NULL); + + /* + * XXXGL: the hack with sb_lowat originates from d99b0dd2c5297. To + * achieve high performance sending with sendfile(2) a non-blocking + * socket needs a fairly high low watermark. Otherwise, the socket + * will be reported as writable too early, and sendfile(2) will send + * just a few bytes each time. It is important to understand that + * we are changing sb_lowat not for the current invocation of the + * syscall, but for the *next* syscall. So there is no way to + * workaround the problem, e.g. provide a special version of sbspace(). + * Since this hack has been in the kernel for a long time, we + * anticipate that there is a lot of software that will suffer if we + * remove it. See also b21104487324. + */ + error = 0; + SOCK_SENDBUF_LOCK(so); + if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; + if (so->so_snd.sb_lowat < PAGE_SIZE && so->so_snd.sb_hiwat >= PAGE_SIZE) + so->so_snd.sb_lowat = PAGE_SIZE; +retry_space: + if (so->so_snd.sb_state & SBS_CANTSENDMORE) { + error = EPIPE; + goto done; + } else if (so->so_error) { + error = so->so_error; + so->so_error = 0; + goto done; + } + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + goto done; + } + + *space = sbspace(&so->so_snd); + if (*space < need && (*space <= 0 || *space < so->so_snd.sb_lowat)) { + if (so->so_state & SS_NBIO) { + error = EAGAIN; + goto done; + } + /* + * sbwait() drops the lock while sleeping. When we loop back + * to retry_space the state may have changed and we retest + * for it. + */ + error = sbwait(so, SO_SND); + /* + * An error from sbwait() usually indicates that we've been + * interrupted by a signal. If we've sent anything then return + * bytes sent, otherwise return the error. + */ + if (error != 0) + goto done; + goto retry_space; + } +done: + SOCK_SENDBUF_UNLOCK(so); + + return (error); +} + int vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags, @@ -677,6 +748,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, struct vm_object *obj; vm_page_t pga; struct socket *so; + const struct protosw *pr; #ifdef KERN_TLS struct ktls_session *tls; #endif @@ -710,6 +782,7 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, error = sendfile_getsock(td, sockfd, &sock_fp, &so); if (error != 0) goto out; + pr = so->so_proto; #ifdef MAC error = mac_socket_check_send(td->td_ucred, so); @@ -760,74 +833,8 @@ vn_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio, int nios, space, npages, rhpages; mtail = NULL; - /* - * Check the socket state for ongoing connection, - * no errors and space in socket buffer. - * If space is low allow for the remainder of the - * file to be processed if it fits the socket buffer. - * Otherwise block in waiting for sufficient space - * to proceed, or if the socket is nonblocking, return - * to userland with EAGAIN while reporting how far - * we've come. - * We wait until the socket buffer has significant free - * space to do bulk sends. This makes good use of file - * system read ahead and allows packet segmentation - * offloading hardware to take over lots of work. If - * we were not careful here we would send off only one - * sfbuf at a time. - */ - SOCKBUF_LOCK(&so->so_snd); - if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2) - so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2; - if (so->so_snd.sb_lowat < PAGE_SIZE && - so->so_snd.sb_hiwat >= PAGE_SIZE) - so->so_snd.sb_lowat = PAGE_SIZE; -retry_space: - if (so->so_snd.sb_state & SBS_CANTSENDMORE) { - error = EPIPE; - SOCKBUF_UNLOCK(&so->so_snd); - goto done; - } else if (so->so_error) { - error = so->so_error; - so->so_error = 0; - SOCKBUF_UNLOCK(&so->so_snd); - goto done; - } - if ((so->so_state & SS_ISCONNECTED) == 0) { - SOCKBUF_UNLOCK(&so->so_snd); - error = ENOTCONN; + if ((error = pr->pr_sendfile_wait(so, rem, &space)) != 0) goto done; - } - - space = sbspace(&so->so_snd); - if (space < rem && - (space <= 0 || - space < so->so_snd.sb_lowat)) { - if (so->so_state & SS_NBIO) { - SOCKBUF_UNLOCK(&so->so_snd); - error = EAGAIN; - goto done; - } - /* - * sbwait drops the lock while sleeping. - * When we loop back to retry_space the - * state may have changed and we retest - * for it. - */ - error = sbwait(so, SO_SND); - /* - * An error from sbwait usually indicates that we've - * been interrupted by a signal. If we've sent anything - * then return bytes sent, otherwise return the error. - */ - if (error != 0) { - SOCKBUF_UNLOCK(&so->so_snd); - goto done; - } - goto retry_space; - } - SOCKBUF_UNLOCK(&so->so_snd); - /* * At the beginning of the first loop check if any headers * are specified and copy them into mbufs. Reduce space in @@ -966,8 +973,7 @@ retry_space: * * TLS frames always require unmapped mbufs. */ - if ((mb_use_ext_pgs && - so->so_proto->pr_protocol == IPPROTO_TCP) + if ((mb_use_ext_pgs && pr->pr_protocol == IPPROTO_TCP) #ifdef KERN_TLS || tls != NULL #endif @@ -1172,8 +1178,8 @@ prepend_header: sendfile_iodone(sfio, NULL, 0, 0); #ifdef KERN_TLS if (tls != NULL && tls->mode == TCP_TLS_MODE_SW) { - error = so->so_proto->pr_send(so, - PRUS_NOTREADY, m, NULL, NULL, td); + error = pr->pr_send(so, PRUS_NOTREADY, m, NULL, + NULL, td); if (error != 0) { m_freem(m); } else { @@ -1182,14 +1188,13 @@ prepend_header: } } else #endif - error = so->so_proto->pr_send(so, 0, m, NULL, - NULL, td); + error = pr->pr_send(so, 0, m, NULL, NULL, td); } else { sfio->so = so; sfio->m = m0; soref(so); - error = so->so_proto->pr_send(so, PRUS_NOTREADY, m, - NULL, NULL, td); + error = pr->pr_send(so, PRUS_NOTREADY, m, NULL, NULL, + td); sendfile_iodone(sfio, NULL, 0, error); } #ifdef TCP_REQUEST_TRK diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c index 3f31f8ba421c..ebcf041790b2 100644 --- a/sys/kern/uipc_domain.c +++ b/sys/kern/uipc_domain.c @@ -144,6 +144,12 @@ pr_send_notsupp(struct socket *so, int flags, struct mbuf *m, return (EOPNOTSUPP); } +static int +pr_sendfile_wait_notsupp(struct socket *so, off_t need, int *space) +{ + return (EOPNOTSUPP); +} + static int pr_ready_notsupp(struct socket *so, struct mbuf *m, int count) { @@ -208,6 +214,7 @@ pr_init(struct domain *dom, struct protosw *pr) NOTSUPP(pr_rcvd); NOTSUPP(pr_rcvoob); NOTSUPP(pr_send); + NOTSUPP(pr_sendfile_wait); NOTSUPP(pr_shutdown); NOTSUPP(pr_sockaddr); NOTSUPP(pr_sosend); diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index a67e105a1447..79e4da5c8698 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -3373,6 +3373,7 @@ static struct protosw streamproto = { .pr_peeraddr = uipc_peeraddr, .pr_rcvd = uipc_rcvd, .pr_send = uipc_send, + .pr_sendfile_wait = sendfile_wait_generic, .pr_ready = uipc_ready, .pr_sense = uipc_sense, .pr_shutdown = uipc_shutdown, diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index fbc204097b25..c64d25f0fa87 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1419,6 +1419,7 @@ struct protosw tcp_protosw = { .pr_rcvd = tcp_usr_rcvd, .pr_rcvoob = tcp_usr_rcvoob, .pr_send = tcp_usr_send, + .pr_sendfile_wait = sendfile_wait_generic, .pr_ready = tcp_usr_ready, .pr_shutdown = tcp_usr_shutdown, .pr_sockaddr = in_getsockaddr, @@ -1447,6 +1448,7 @@ struct protosw tcp6_protosw = { .pr_rcvd = tcp_usr_rcvd, .pr_rcvoob = tcp_usr_rcvoob, .pr_send = tcp_usr_send, + .pr_sendfile_wait = sendfile_wait_generic, .pr_ready = tcp_usr_ready, .pr_shutdown = tcp_usr_shutdown, .pr_sockaddr = in6_mapped_sockaddr, diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h index 4808f136cabf..7fb8dfdc7208 100644 --- a/sys/sys/protosw.h +++ b/sys/sys/protosw.h @@ -81,6 +81,7 @@ typedef enum { } pr_send_flags_t; typedef int pr_send_t(struct socket *, int, struct mbuf *, struct sockaddr *, struct mbuf *, struct thread *); +typedef int pr_sendfile_wait_t(struct socket *, off_t, int *); typedef int pr_ready_t(struct socket *, struct mbuf *, int); typedef int pr_sense_t(struct socket *, struct stat *); typedef int pr_shutdown_t(struct socket *, enum shutdown_how); @@ -109,9 +110,9 @@ struct protosw { struct domain *pr_domain; /* domain protocol a member of */ pr_soreceive_t *pr_soreceive; /* recv(2) */ - pr_rcvd_t *pr_rcvd; /* soreceive_generic() if PR_WANTRCVD */ pr_sosend_t *pr_sosend; /* send(2) */ pr_send_t *pr_send; /* send(2) via sosend_generic() */ + pr_sendfile_wait_t *pr_sendfile_wait; /* sendfile helper */ pr_ready_t *pr_ready; /* sendfile/ktls readyness */ pr_sopoll_t *pr_sopoll; /* poll(2) */ /* Cache line #2 */ @@ -121,7 +122,7 @@ struct protosw { pr_disconnect_t *pr_disconnect; /* sodisconnect() */ pr_close_t *pr_close; /* close(2) */ pr_shutdown_t *pr_shutdown; /* shutdown(2) */ - pr_abort_t *pr_abort; /* abrupt tear down: soabort() */ + pr_rcvd_t *pr_rcvd; /* soreceive_generic() if PR_WANTRCVD */ pr_aio_queue_t *pr_aio_queue; /* aio(9) */ /* Cache line #3 */ pr_bind_t *pr_bind; /* bind(2) */ @@ -133,6 +134,7 @@ struct protosw { pr_control_t *pr_control; /* ioctl(2) */ pr_rcvoob_t *pr_rcvoob; /* soreceive_rcvoob() */ /* Cache line #4 */ + pr_abort_t *pr_abort; /* abrupt tear down: soabort() */ pr_ctloutput_t *pr_ctloutput; /* control output (from above) */ pr_peeraddr_t *pr_peeraddr; /* getpeername(2) */ pr_sockaddr_t *pr_sockaddr; /* getsockname(2) */ diff --git a/sys/sys/socketvar.h b/sys/sys/socketvar.h index 8e70ada24259..9e3a59433f2b 100644 --- a/sys/sys/socketvar.h +++ b/sys/sys/socketvar.h @@ -555,6 +555,7 @@ int sosend_dgram(struct socket *so, struct sockaddr *addr, int sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td); +int sendfile_wait_generic(struct socket *so, off_t need, int *space); int sosetfib(struct socket *so, int fibnum); int soshutdown(struct socket *so, enum shutdown_how); void soupcall_clear(struct socket *, sb_which);