git: 5716d902ae1d - main - Revert "unix: new implementation of unix/stream & unix/seqpacket"
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Tue, 09 Apr 2024 20:19:34 UTC
The branch main has been updated by glebius:
URL: https://cgit.FreeBSD.org/src/commit/?id=5716d902ae1d2dbd396f7808c12d7b0d285f2c9d
commit 5716d902ae1d2dbd396f7808c12d7b0d285f2c9d
Author: Gleb Smirnoff <glebius@FreeBSD.org>
AuthorDate: 2024-04-09 20:15:16 +0000
Commit: Gleb Smirnoff <glebius@FreeBSD.org>
CommitDate: 2024-04-09 20:15:47 +0000
Revert "unix: new implementation of unix/stream & unix/seqpacket"
The regressions in aio(4) and kernel RPC aren't a 5 minute problem.
This reverts commit d80a97def9a1db6f07f5d2e68f7ad62b27918947.
This reverts commit d1cbb17a873c787a527316bbb27551e97d5ad30c.
This reverts commit fb8a8333b481cc4256d0b3f0b5b4feaa4594e01f.
---
sys/kern/uipc_usrreq.c | 959 +++++++++++++++++--------------------------------
sys/sys/sockbuf.h | 7 -
2 files changed, 320 insertions(+), 646 deletions(-)
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 4a1c480c43fa..6e83e2be6f05 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -5,7 +5,7 @@
* The Regents of the University of California. All Rights Reserved.
* Copyright (c) 2004-2009 Robert N. M. Watson All Rights Reserved.
* Copyright (c) 2018 Matthew Macy
- * Copyright (c) 2022-2024 Gleb Smirnoff <glebius@FreeBSD.org>
+ * Copyright (c) 2022 Gleb Smirnoff <glebius@FreeBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -141,14 +141,11 @@ static struct timeout_task unp_gc_task;
static struct task unp_defer_task;
/*
- * SOCK_STREAM and SOCK_SEQPACKET unix(4) sockets fully bypass the send buffer,
- * however the notion of send buffer still makes sense with them. Its size is
- * the amount of space that a send(2) syscall may copyin(9) before checking
- * with the receive buffer of a peer. Although not linked anywhere yet,
- * pointed to by a stack variable, effectively it is a buffer that needs to be
- * sized.
+ * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
+ * stream sockets, although the total for sender and receiver is actually
+ * only PIPSIZ.
*
- * SOCK_DGRAM sockets really use the sendspace as the maximum datagram size,
+ * Datagram sockets really use the sendspace as the maximum datagram size,
* and don't really want to reserve the sendspace. Their recvspace should be
* large enough for at least one max-size datagram plus address.
*/
@@ -159,7 +156,7 @@ static u_long unpst_sendspace = PIPSIZ;
static u_long unpst_recvspace = PIPSIZ;
static u_long unpdg_maxdgram = 8*1024; /* support 8KB syslog msgs */
static u_long unpdg_recvspace = 16*1024;
-static u_long unpsp_sendspace = PIPSIZ;
+static u_long unpsp_sendspace = PIPSIZ; /* really max datagram size */
static u_long unpsp_recvspace = PIPSIZ;
static SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
@@ -303,12 +300,13 @@ static void unp_gc(__unused void *, int);
static void unp_scan(struct mbuf *, void (*)(struct filedescent **, int));
static void unp_discard(struct file *);
static void unp_freerights(struct filedescent **, int);
-static int unp_internalize(struct mbuf *, struct mchain *,
- struct thread *);
+static int unp_internalize(struct mbuf **, struct thread *,
+ struct mbuf **, u_int *, u_int *);
static void unp_internalize_fp(struct file *);
static int unp_externalize(struct mbuf *, struct mbuf **, int);
static int unp_externalize_fp(struct file *);
-static void unp_addsockcred(struct thread *, struct mchain *, int);
+static struct mbuf *unp_addsockcred(struct thread *, struct mbuf *,
+ int, struct mbuf **, u_int *, u_int *);
static void unp_process_defers(void * __unused, int);
static void
@@ -451,7 +449,6 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
case SOCK_STREAM:
sendspace = unpst_sendspace;
recvspace = unpst_recvspace;
- STAILQ_INIT(&so->so_rcv.sb_mbq);
break;
case SOCK_DGRAM:
@@ -469,7 +466,6 @@ uipc_attach(struct socket *so, int proto, struct thread *td)
case SOCK_SEQPACKET:
sendspace = unpsp_sendspace;
recvspace = unpsp_recvspace;
- STAILQ_INIT(&so->so_rcv.sb_mbq);
break;
default:
@@ -801,10 +797,6 @@ uipc_detach(struct socket *so)
taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
switch (so->so_type) {
- case SOCK_STREAM:
- case SOCK_SEQPACKET:
- MPASS(STAILQ_EMPTY(&so->so_rcv.sb_mbq));
- break;
case SOCK_DGRAM:
/*
* Everything should have been unlinked/freed by unp_dispose()
@@ -860,10 +852,6 @@ uipc_listen(struct socket *so, int backlog, struct thread *td)
error = solisten_proto_check(so);
if (error == 0) {
cru2xt(td, &unp->unp_peercred);
- (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
- 0, RLIM_INFINITY);
- (void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_rcv.sb_hiwat,
- 0, RLIM_INFINITY);
solisten_proto(so, backlog);
}
SOCK_UNLOCK(so);
@@ -897,562 +885,187 @@ uipc_peeraddr(struct socket *so, struct sockaddr *ret)
return (0);
}
-/*
- * pr_sosend() called with mbuf instead of uio is a kernel thread. NFS,
- * netgraph(4) and other subsystems can call into socket code. The
- * function will condition the mbuf so that it can be safely put onto socket
- * buffer and calculate its char count and mbuf count.
- *
- * Note: we don't support receiving control data from a kernel thread. Our
- * pr_sosend methods have MPASS() to check that. This may change.
- */
-static void
-uipc_reset_kernel_mbuf(struct mbuf *m, struct mchain *mc)
-{
-
- M_ASSERTPKTHDR(m);
-
- m_clrprotoflags(m);
- m_tag_delete_chain(m, NULL);
- m->m_pkthdr.rcvif = NULL;
- m->m_pkthdr.flowid = 0;
- m->m_pkthdr.csum_flags = 0;
- m->m_pkthdr.fibnum = 0;
- m->m_pkthdr.rsstype = 0;
-
- mc_init_m(mc, m);
- MPASS(m->m_pkthdr.len == mc->mc_len);
-}
-
-#ifdef SOCKBUF_DEBUG
-static inline void
-uipc_stream_sbcheck(struct sockbuf *sb)
+static int
+uipc_rcvd(struct socket *so, int flags)
{
- struct mbuf *d;
- u_int dcc, dctl, dmbcnt;
-
- dcc = dctl = dmbcnt = 0;
- STAILQ_FOREACH(d, &sb->sb_mbq, m_stailq) {
- if (d->m_type == MT_CONTROL)
- dctl += d->m_len;
- else if (d->m_type == MT_DATA)
- dcc += d->m_len;
- else
- MPASS(0);
- dmbcnt += MSIZE;
- if (d->m_flags & M_EXT)
- dmbcnt += d->m_ext.ext_size;
- if (d->m_stailq.stqe_next == NULL)
- MPASS(sb->sb_mbq.stqh_last == &d->m_stailq.stqe_next);
- }
- MPASS(dcc == sb->sb_acc);
- MPASS(dcc == sb->sb_ccc);
- MPASS(dctl == sb->sb_ctl);
- MPASS(dmbcnt == sb->sb_mbcnt);
-}
-#define UIPC_STREAM_SBCHECK(sb) uipc_stream_sbcheck(sb)
-#else
-#define UIPC_STREAM_SBCHECK(sb) do {} while (0)
-#endif
+ struct unpcb *unp, *unp2;
+ struct socket *so2;
+ u_int mbcnt, sbcc;
-/*
- * uipc_stream_sbspace() returns how much a writer can send, limited by char
- * count or mbuf memory use, whatever ends first.
- *
- * XXXGL: sb_mbcnt may overcommit sb_mbmax in case if previous write observed
- * 'space < mbspace', but mchain allocated to hold 'space' bytes of data ended
- * up with 'mc_mlen > mbspace'. A typical scenario would be a full buffer with
- * writer trying to push in a large write, and a slow reader, that reads just
- * a few bytes at a time. In that case writer will keep creating new mbufs
- * with mc_split(). These mbufs will carry little chars, but will all point at
- * the same cluster, thus each adding cluster size to sb_mbcnt. This means we
- * will count same cluster many times potentially underutilizing socket buffer.
- * We aren't optimizing towards ineffective readers. Classic socket buffer had
- * the same "feature".
- */
-static inline u_int
-uipc_stream_sbspace(struct sockbuf *sb)
-{
- u_int space, mbspace;
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
+ KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
+ ("%s: socktype %d", __func__, so->so_type));
- MPASS(sb->sb_hiwat >= sb->sb_ccc + sb->sb_ctl);
- space = sb->sb_hiwat - sb->sb_ccc - sb->sb_ctl;
- if (__predict_true(sb->sb_mbmax >= sb->sb_mbcnt))
- mbspace = sb->sb_mbmax - sb->sb_mbcnt;
- else
+ /*
+ * Adjust backpressure on sender and wakeup any waiting to write.
+ *
+ * The unp lock is acquired to maintain the validity of the unp_conn
+ * pointer; no lock on unp2 is required as unp2->unp_socket will be
+ * static as long as we don't permit unp2 to disconnect from unp,
+ * which is prevented by the lock on unp. We cache values from
+ * so_rcv to avoid holding the so_rcv lock over the entire
+ * transaction on the remote so_snd.
+ */
+ SOCKBUF_LOCK(&so->so_rcv);
+ mbcnt = so->so_rcv.sb_mbcnt;
+ sbcc = sbavail(&so->so_rcv);
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ /*
+ * There is a benign race condition at this point. If we're planning to
+ * clear SB_STOP, but uipc_send is called on the connected socket at
+ * this instant, it might add data to the sockbuf and set SB_STOP. Then
+ * we would erroneously clear SB_STOP below, even though the sockbuf is
+ * full. The race is benign because the only ill effect is to allow the
+ * sockbuf to exceed its size limit, and the size limits are not
+ * strictly guaranteed anyway.
+ */
+ UNP_PCB_LOCK(unp);
+ unp2 = unp->unp_conn;
+ if (unp2 == NULL) {
+ UNP_PCB_UNLOCK(unp);
return (0);
-
- return (min(space, mbspace));
+ }
+ so2 = unp2->unp_socket;
+ SOCKBUF_LOCK(&so2->so_snd);
+ if (sbcc < so2->so_snd.sb_hiwat && mbcnt < so2->so_snd.sb_mbmax)
+ so2->so_snd.sb_flags &= ~SB_STOP;
+ sowwakeup_locked(so2);
+ UNP_PCB_UNLOCK(unp);
+ return (0);
}
static int
-uipc_sosend_stream_or_seqpacket(struct socket *so, struct sockaddr *addr,
- struct uio *uio, struct mbuf *m, struct mbuf *c, int flags,
- struct thread *td)
+uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
+ struct mbuf *control, struct thread *td)
{
struct unpcb *unp, *unp2;
struct socket *so2;
- struct sockbuf *sb;
- struct mchain mc, cmc;
- ssize_t resid, sent;
- bool nonblock, eor;
+ u_int mbcnt, sbcc;
int error;
- MPASS((uio != NULL && m == NULL) || (m != NULL && uio == NULL));
- MPASS(m == NULL || c == NULL);
-
- if (__predict_false(flags & MSG_OOB))
- return (EOPNOTSUPP);
-
- nonblock = (so->so_state & SS_NBIO) ||
- (flags & (MSG_DONTWAIT | MSG_NBIO));
- eor = flags & MSG_EOR;
+ unp = sotounpcb(so);
+ KASSERT(unp != NULL, ("%s: unp == NULL", __func__));
+ KASSERT(so->so_type == SOCK_STREAM || so->so_type == SOCK_SEQPACKET,
+ ("%s: socktype %d", __func__, so->so_type));
- mc = MCHAIN_INITIALIZER(&mc);
- cmc = MCHAIN_INITIALIZER(&cmc);
- sent = 0;
+ error = 0;
+ if (flags & PRUS_OOB) {
+ error = EOPNOTSUPP;
+ goto release;
+ }
+ if (control != NULL &&
+ (error = unp_internalize(&control, td, NULL, NULL, NULL)))
+ goto release;
- if (m == NULL) {
- if (c != NULL && (error = unp_internalize(c, &cmc, td)))
+ unp2 = NULL;
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ if (nam != NULL) {
+ if ((error = unp_connect(so, nam, td)) != 0)
+ goto out;
+ } else {
+ error = ENOTCONN;
goto out;
- /*
- * Optimization for a case when our send fits into the receive
- * buffer - do the copyin before taking any locks, sized to our
- * send buffer. Later copyins will also take into account
- * space in the peer's receive buffer.
- */
- resid = uio->uio_resid;
- error = mc_uiotomc(&mc, uio, so->so_snd.sb_hiwat, 0, M_WAITOK,
- eor ? M_EOR : 0);
- if (__predict_false(error))
- goto out2;
- } else
- uipc_reset_kernel_mbuf(m, &mc);
-
- error = SOCK_IO_SEND_LOCK(so, SBLOCKWAIT(flags));
- if (error)
- goto out2;
+ }
+ }
- unp = sotounpcb(so);
UNP_PCB_LOCK(unp);
- unp2 = unp_pcb_lock_peer(unp);
- if (__predict_false(so->so_error != 0)) {
- error = so->so_error;
- so->so_error = 0;
- UNP_PCB_UNLOCK(unp);
- if (unp2 != NULL)
- UNP_PCB_UNLOCK(unp2);
- goto out3;
- }
- if (__predict_false(unp2 == NULL)) {
- /*
- * Different error code for a previously connected socket and
- * a never connected one. The SS_ISDISCONNECTED is set in the
- * unp_soisdisconnected() and is synchronized by the pcb lock.
- */
- error = so->so_state & SS_ISDISCONNECTED ? EPIPE : ENOTCONN;
+ if ((unp2 = unp_pcb_lock_peer(unp)) == NULL) {
UNP_PCB_UNLOCK(unp);
- goto out3;
+ error = ENOTCONN;
+ goto out;
+ } else if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+ unp_pcb_unlock_pair(unp, unp2);
+ error = EPIPE;
+ goto out;
}
UNP_PCB_UNLOCK(unp);
-
+ if ((so2 = unp2->unp_socket) == NULL) {
+ UNP_PCB_UNLOCK(unp2);
+ error = ENOTCONN;
+ goto out;
+ }
+ SOCKBUF_LOCK(&so2->so_rcv);
if (unp2->unp_flags & UNP_WANTCRED_MASK) {
/*
* Credentials are passed only once on SOCK_STREAM and
* SOCK_SEQPACKET (LOCAL_CREDS => WANTCRED_ONESHOT), or
* forever (LOCAL_CREDS_PERSISTENT => WANTCRED_ALWAYS).
*/
- unp_addsockcred(td, &cmc, unp2->unp_flags);
+ control = unp_addsockcred(td, control, unp2->unp_flags, NULL,
+ NULL, NULL);
unp2->unp_flags &= ~UNP_WANTCRED_ONESHOT;
}
/*
- * Cycle through the data to send and available space in the peer's
- * receive buffer. Put a reference on the peer socket, so that it
- * doesn't get freed while we sbwait(). If peer goes away, we will
- * observe the SBS_CANTRCVMORE and our sorele() will finalize peer's
- * socket destruction.
+ * Send to paired receive port and wake up readers. Don't
+ * check for space available in the receive buffer if we're
+ * attaching ancillary data; Unix domain sockets only check
+ * for space in the sending sockbuf, and that check is
+ * performed one level up the stack. At that level we cannot
+ * precisely account for the amount of buffer space used
+ * (e.g., because control messages are not yet internalized).
*/
- so2 = unp2->unp_socket;
- soref(so2);
- UNP_PCB_UNLOCK(unp2);
- sb = &so2->so_rcv;
- while (mc.mc_len + cmc.mc_len > 0) {
- struct mchain mcnext = MCHAIN_INITIALIZER(&mcnext);
- u_int space;
+ switch (so->so_type) {
+ case SOCK_STREAM:
+ if (control != NULL) {
+ sbappendcontrol_locked(&so2->so_rcv,
+ m->m_len > 0 ? m : NULL, control, flags);
+ control = NULL;
+ } else
+ sbappend_locked(&so2->so_rcv, m, flags);
+ break;
- SOCK_RECVBUF_LOCK(so2);
-restart:
- UIPC_STREAM_SBCHECK(sb);
- if (__predict_false(cmc.mc_len > sb->sb_hiwat)) {
- SOCK_RECVBUF_UNLOCK(so2);
- error = EMSGSIZE;
- goto out4;
- }
- if (__predict_false(sb->sb_state & SBS_CANTRCVMORE)) {
- SOCK_RECVBUF_UNLOCK(so2);
- error = EPIPE;
- goto out4;
- }
- /*
- * Wait on the peer socket receive buffer until we have enough
- * space to put at least control. The data is a stream and can
- * be put partially, but control is really a datagram.
- */
- space = uipc_stream_sbspace(sb);
- if (space < sb->sb_lowat || space < cmc.mc_len) {
- if (nonblock) {
- SOCK_RECVBUF_UNLOCK(so2);
- error = EWOULDBLOCK;
- goto out4;
- }
- if ((error = sbwait(so2, SO_RCV)) != 0) {
- SOCK_RECVBUF_UNLOCK(so2);
- goto out4;
- } else
- goto restart;
- }
- MPASS(space >= cmc.mc_len);
- space -= cmc.mc_len;
- if (space == 0) {
- /* There is space only to send control. */
- MPASS(!STAILQ_EMPTY(&cmc.mc_q));
- mcnext = mc;
- mc = MCHAIN_INITIALIZER(&mc);
- } else if (space < mc.mc_len) {
- /* Not enough space. */
- if (__predict_false(mc_split(&mc, &mcnext, space,
- M_NOWAIT) == ENOMEM)) {
- /*
- * If allocation failed use M_WAITOK and merge
- * the chain back. Next time mc_split() will
- * easily split at the same place. Only if we
- * race with setsockopt(SO_RCVBUF) shrinking
- * sb_hiwat can this happen more than once.
- */
- SOCK_RECVBUF_UNLOCK(so2);
- (void)mc_split(&mc, &mcnext, space, M_WAITOK);
- mc_concat(&mc, &mcnext);
- SOCK_RECVBUF_LOCK(so2);
- goto restart;
- }
- MPASS(mc.mc_len == space);
- }
- if (!STAILQ_EMPTY(&cmc.mc_q)) {
- STAILQ_CONCAT(&sb->sb_mbq, &cmc.mc_q);
- sb->sb_ctl += cmc.mc_len;
- sb->sb_mbcnt += cmc.mc_mlen;
- cmc.mc_len = 0;
- }
- sent += mc.mc_len;
- sb->sb_acc += mc.mc_len;
- sb->sb_ccc += mc.mc_len;
- sb->sb_mbcnt += mc.mc_mlen;
- STAILQ_CONCAT(&sb->sb_mbq, &mc.mc_q);
- UIPC_STREAM_SBCHECK(sb);
- space = uipc_stream_sbspace(sb);
- sorwakeup_locked(so2);
- mc = mcnext;
- if (STAILQ_EMPTY(&mc.mc_q) &&
- uio != NULL && uio->uio_resid > 0) {
- /*
- * Copyin sum of peer's receive buffer space and our
- * sb_hiwat, which is our virtual send buffer size.
- * See comment above unpst_sendspace declaration.
- * We are reading sb_hiwat locklessly, cause a) we
- * don't care about an application that does send(2)
- * and setsockopt(2) racing internally, and for an
- * application that does this in sequence we will see
- * the correct value cause sbsetopt() uses buffer lock
- * and we also have already acquired it at least once.
- */
- error = mc_uiotomc(&mc, uio, space +
- atomic_load_int(&so->so_snd.sb_hiwat), 0, M_WAITOK,
- eor ? M_EOR : 0);
- if (__predict_false(error))
- goto out4;
- }
+ case SOCK_SEQPACKET:
+ if (sbappendaddr_nospacecheck_locked(&so2->so_rcv,
+ &sun_noname, m, control))
+ control = NULL;
+ break;
}
- MPASS(STAILQ_EMPTY(&mc.mc_q));
-
- td->td_ru.ru_msgsnd++;
-out4:
- sorele(so2);
-out3:
- SOCK_IO_SEND_UNLOCK(so);
-out2:
- if (!mc_empty(&cmc))
- unp_scan(mc_first(&cmc), unp_freerights);
-out:
- mc_freem(&mc);
- mc_freem(&cmc);
-
- if (uio != NULL)
- uio->uio_resid = resid - sent;
-
- return (error);
-}
-
-static int
-uipc_soreceive_stream_or_seqpacket(struct socket *so, struct sockaddr **psa,
- struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
-{
- struct sockbuf *sb = &so->so_rcv;
- struct mbuf *control, *m, *first, *last, *next;
- u_int ctl, space, datalen, mbcnt, lastlen;
- int error, flags;
- bool nonblock, waitall, peek;
-
- MPASS(mp0 == NULL);
-
- if (psa != NULL)
- *psa = NULL;
- if (controlp != NULL)
- *controlp = NULL;
-
- flags = flagsp != NULL ? *flagsp : 0;
- nonblock = (so->so_state & SS_NBIO) ||
- (flags & (MSG_DONTWAIT | MSG_NBIO));
- peek = flags & MSG_PEEK;
- waitall = (flags & MSG_WAITALL) && !peek;
+ mbcnt = so2->so_rcv.sb_mbcnt;
+ sbcc = sbavail(&so2->so_rcv);
+ if (sbcc)
+ sorwakeup_locked(so2);
+ else
+ SOCKBUF_UNLOCK(&so2->so_rcv);
/*
- * This check may fail only on a socket that never went through
- * connect(2). We can check this locklessly, cause: a) for a new born
- * socket we don't care about applications that may race internally
- * between connect(2) and recv(2), and b) for a dying socket if we
- * miss update by unp_sosidisconnected(), we would still get the check
- * correct. For dying socket we would observe SBS_CANTRCVMORE later.
+ * The PCB lock on unp2 protects the SB_STOP flag. Without it,
+ * it would be possible for uipc_rcvd to be called at this
+ * point, drain the receiving sockbuf, clear SB_STOP, and then
+ * we would set SB_STOP below. That could lead to an empty
+ * sockbuf having SB_STOP set
*/
- if (__predict_false((atomic_load_short(&so->so_state) &
- (SS_ISCONNECTED|SS_ISDISCONNECTED)) == 0))
- return (ENOTCONN);
-
- error = SOCK_IO_RECV_LOCK(so, SBLOCKWAIT(flags));
- if (__predict_false(error))
- return (error);
-
-restart:
- SOCK_RECVBUF_LOCK(so);
- UIPC_STREAM_SBCHECK(sb);
- while (sb->sb_acc < sb->sb_lowat &&
- (sb->sb_ctl == 0 || controlp == NULL)) {
- if (so->so_error) {
- error = so->so_error;
- if (!peek)
- so->so_error = 0;
- SOCK_RECVBUF_UNLOCK(so);
- SOCK_IO_RECV_UNLOCK(so);
- return (error);
- }
- if (sb->sb_state & SBS_CANTRCVMORE) {
- SOCK_RECVBUF_UNLOCK(so);
- SOCK_IO_RECV_UNLOCK(so);
- return (0);
- }
- if (nonblock) {
- SOCK_RECVBUF_UNLOCK(so);
- SOCK_IO_RECV_UNLOCK(so);
- return (EWOULDBLOCK);
- }
- error = sbwait(so, SO_RCV);
- if (error) {
- SOCK_RECVBUF_UNLOCK(so);
- SOCK_IO_RECV_UNLOCK(so);
- return (error);
- }
- }
-
- MPASS(STAILQ_FIRST(&sb->sb_mbq));
- MPASS(sb->sb_acc > 0 || sb->sb_ctl > 0);
-
- mbcnt = 0;
- ctl = 0;
- first = STAILQ_FIRST(&sb->sb_mbq);
- if (first->m_type == MT_CONTROL) {
- control = first;
- STAILQ_FOREACH_FROM(first, &sb->sb_mbq, m_stailq) {
- if (first->m_type != MT_CONTROL)
- break;
- ctl += first->m_len;
- mbcnt += MSIZE;
- if (first->m_flags & M_EXT)
- mbcnt += first->m_ext.ext_size;
- }
- } else
- control = NULL;
-
+ SOCKBUF_LOCK(&so->so_snd);
+ if (sbcc >= so->so_snd.sb_hiwat || mbcnt >= so->so_snd.sb_mbmax)
+ so->so_snd.sb_flags |= SB_STOP;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ UNP_PCB_UNLOCK(unp2);
+ m = NULL;
+out:
/*
- * Find split point for the next copyout. On exit from the loop:
- * last == NULL - socket to be flushed
- * last != NULL
- * lastlen > last->m_len - uio to be filled, last to be adjusted
- * lastlen == 0 - MT_CONTROL or M_EOR encountered
+ * PRUS_EOF is equivalent to pr_send followed by pr_shutdown.
*/
- space = uio->uio_resid;
- datalen = 0;
- for (m = first, last = NULL; m != NULL; m = STAILQ_NEXT(m, m_stailq)) {
- if (m->m_type != MT_DATA) {
- last = m;
- lastlen = 0;
- break;
- }
- if (space >= m->m_len) {
- space -= m->m_len;
- datalen += m->m_len;
- mbcnt += MSIZE;
- if (m->m_flags & M_EXT)
- mbcnt += m->m_ext.ext_size;
- if (m->m_flags & M_EOR) {
- last = STAILQ_NEXT(m, m_stailq);
- lastlen = 0;
- flags |= MSG_EOR;
- break;
- }
- } else {
- datalen += space;
- last = m;
- lastlen = space;
- break;
- }
- }
-
- UIPC_STREAM_SBCHECK(sb);
- if (!peek) {
- if (last == NULL)
- STAILQ_INIT(&sb->sb_mbq);
- else {
- STAILQ_FIRST(&sb->sb_mbq) = last;
- MPASS(last->m_len > lastlen);
- last->m_len -= lastlen;
- last->m_data += lastlen;
- }
- MPASS(sb->sb_acc >= datalen);
- sb->sb_acc -= datalen;
- sb->sb_ccc -= datalen;
- MPASS(sb->sb_ctl >= ctl);
- sb->sb_ctl -= ctl;
- MPASS(sb->sb_mbcnt >= mbcnt);
- sb->sb_mbcnt -= mbcnt;
- UIPC_STREAM_SBCHECK(sb);
- /* Mind the name. We are waking writer here, not reader. */
- sorwakeup_locked(so);
- } else
- SOCK_RECVBUF_UNLOCK(so);
-
- while (control != NULL && control->m_type == MT_CONTROL) {
- if (!peek) {
- struct mbuf *c;
-
- /*
- * unp_externalize() failure must abort entire read(2).
- * Such failure should also free the problematic
- * control, so that socket is not left in a state
- * where it can't progress forward with reading.
- * Probability of such a failure is really low, so it
- * is fine that we need to perform pretty complex
- * operation here to reconstruct the buffer.
- * XXXGL: unp_externalize() used to be
- * dom_externalize() KBI and it frees whole chain, so
- * we need to feed it with mbufs one by one.
- */
- c = control;
- control = STAILQ_NEXT(c, m_stailq);
- STAILQ_NEXT(c, m_stailq) = NULL;
- error = unp_externalize(c, controlp, flags);
- if (__predict_false(error)) {
- SOCK_RECVBUF_LOCK(so);
- UIPC_STREAM_SBCHECK(sb);
- MPASS(!(sb->sb_state & SBS_CANTRCVMORE));
- /* XXXGL: STAILQ_PREPEND */
- if (STAILQ_EMPTY(&sb->sb_mbq) &&
- control != NULL)
- STAILQ_INSERT_HEAD(&sb->sb_mbq,
- control, m_stailq);
- else
- STAILQ_FIRST(&sb->sb_mbq) = control;
- sb->sb_ctl = sb->sb_acc = sb->sb_ccc =
- sb->sb_mbcnt = 0;
- STAILQ_FOREACH(m, &sb->sb_mbq, m_stailq) {
- if (m->m_type == MT_DATA) {
- sb->sb_acc += m->m_len;
- sb->sb_ccc += m->m_len;
- } else {
- sb->sb_ctl += m->m_len;
- }
- sb->sb_mbcnt += MSIZE;
- if (m->m_flags & M_EXT)
- sb->sb_mbcnt +=
- m->m_ext.ext_size;
- }
- UIPC_STREAM_SBCHECK(sb);
- SOCK_RECVBUF_UNLOCK(so);
- SOCK_IO_RECV_UNLOCK(so);
- return (error);
- }
- if (controlp != NULL) {
- while (*controlp != NULL)
- controlp = &(*controlp)->m_next;
- }
- } else {
- /*
- * XXXGL
- *
- * In MSG_PEEK case control is not externalized. This
- * means we are leaking some kernel pointers to the
- * userland. They are useless to a law-abiding
- * application, but may be useful to a malware. This
- * is what the historical implementation in the
- * soreceive_generic() did. To be improved?
- */
- if (controlp != NULL) {
- *controlp = m_copym(control, 0, control->m_len,
- M_WAITOK);
- controlp = &(*controlp)->m_next;
- }
- control = STAILQ_NEXT(control, m_stailq);
- }
+ if (flags & PRUS_EOF) {
+ UNP_PCB_LOCK(unp);
+ socantsendmore(so);
+ unp_shutdown(unp);
+ UNP_PCB_UNLOCK(unp);
}
+ if (control != NULL && error != 0)
+ unp_scan(control, unp_freerights);
- for (m = first; m != last; m = next) {
- next = STAILQ_NEXT(m, m_stailq);
- error = uiomove(mtod(m, char *), m->m_len, uio);
- if (__predict_false(error)) {
- SOCK_IO_RECV_UNLOCK(so);
- if (!peek)
- for (; m != last; m = next) {
- next = STAILQ_NEXT(m, m_stailq);
- m_free(m);
- }
- return (error);
- }
- if (!peek)
- m_free(m);
- }
- if (last != NULL && lastlen > 0) {
- if (!peek) {
- MPASS(!(m->m_flags & M_PKTHDR));
- MPASS(last->m_data - M_START(last) >= lastlen);
- error = uiomove(mtod(last, char *) - lastlen,
- lastlen, uio);
- } else
- error = uiomove(mtod(last, char *), lastlen, uio);
- if (__predict_false(error)) {
- SOCK_IO_RECV_UNLOCK(so);
- return (error);
- }
- }
- if (waitall && !(flags & MSG_EOR) && uio->uio_resid > 0)
- goto restart;
- SOCK_IO_RECV_UNLOCK(so);
-
- if (flagsp != NULL)
- *flagsp |= flags;
-
- uio->uio_td->td_ru.ru_msgrcv++;
-
- return (0);
+release:
+ if (control != NULL)
+ m_freem(control);
+ /*
+ * In case of PRUS_NOTREADY, uipc_ready() is responsible
+ * for freeing memory.
+ */
+ if (m != NULL && (flags & PRUS_NOTREADY) == 0)
+ m_freem(m);
+ return (error);
}
/* PF_UNIX/SOCK_DGRAM version of sbspace() */
@@ -1498,8 +1111,7 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
const struct sockaddr *from;
struct socket *so2;
struct sockbuf *sb;
- struct mchain cmc = MCHAIN_INITIALIZER(&cmc);
- struct mbuf *f;
+ struct mbuf *f, *clast;
u_int cc, ctl, mbcnt;
u_int dcc __diagused, dctl __diagused, dmbcnt __diagused;
int error;
@@ -1508,6 +1120,7 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
error = 0;
f = NULL;
+ ctl = 0;
if (__predict_false(flags & MSG_OOB)) {
error = EOPNOTSUPP;
@@ -1526,14 +1139,16 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
f = m_gethdr(M_WAITOK, MT_SONAME);
cc = m->m_pkthdr.len;
mbcnt = MSIZE + m->m_pkthdr.memlen;
- if (c != NULL && (error = unp_internalize(c, &cmc, td)))
+ if (c != NULL &&
+ (error = unp_internalize(&c, td, &clast, &ctl, &mbcnt)))
goto out;
} else {
- struct mchain mc;
+ /* pr_sosend() with mbuf usually is a kernel thread. */
+
+ M_ASSERTPKTHDR(m);
+ if (__predict_false(c != NULL))
+ panic("%s: control from a kernel thread", __func__);
- uipc_reset_kernel_mbuf(m, &mc);
- cc = mc.mc_len;
- mbcnt = mc.mc_mlen;
if (__predict_false(m->m_pkthdr.len > unpdg_maxdgram)) {
error = EMSGSIZE;
goto out;
@@ -1542,6 +1157,22 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
error = ENOBUFS;
goto out;
}
+ /* Condition the foreign mbuf to our standards. */
+ m_clrprotoflags(m);
+ m_tag_delete_chain(m, NULL);
+ m->m_pkthdr.rcvif = NULL;
+ m->m_pkthdr.flowid = 0;
+ m->m_pkthdr.csum_flags = 0;
+ m->m_pkthdr.fibnum = 0;
+ m->m_pkthdr.rsstype = 0;
+
+ cc = m->m_pkthdr.len;
+ mbcnt = MSIZE;
+ for (struct mbuf *mb = m; mb != NULL; mb = mb->m_next) {
+ mbcnt += MSIZE;
+ if (mb->m_flags & M_EXT)
+ mbcnt += mb->m_ext.ext_size;
+ }
}
unp = sotounpcb(so);
@@ -1593,7 +1224,8 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
}
if (unp2->unp_flags & UNP_WANTCRED_MASK)
- unp_addsockcred(td, &cmc, unp2->unp_flags);
+ c = unp_addsockcred(td, c, unp2->unp_flags, &clast, &ctl,
+ &mbcnt);
if (unp->unp_addr != NULL)
from = (struct sockaddr *)unp->unp_addr;
else
@@ -1601,21 +1233,25 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
f->m_len = from->sa_len;
MPASS(from->sa_len <= MLEN);
bcopy(from, mtod(f, void *), from->sa_len);
+ ctl += f->m_len;
/*
* Concatenate mbufs: from -> control -> data.
* Save overall cc and mbcnt in "from" mbuf.
*/
- if (!STAILQ_EMPTY(&cmc.mc_q)) {
- f->m_next = mc_first(&cmc);
- mc_last(&cmc)->m_next = m;
- /* XXXGL: This is dirty as well as rollback after ENOBUFS. */
- STAILQ_INIT(&cmc.mc_q);
+ if (c != NULL) {
+#ifdef INVARIANTS
+ struct mbuf *mc;
+
+ for (mc = c; mc->m_next != NULL; mc = mc->m_next);
+ MPASS(mc == clast);
+#endif
+ f->m_next = c;
+ clast->m_next = m;
+ c = NULL;
} else
f->m_next = m;
m = NULL;
- ctl = f->m_len + cmc.mc_len;
- mbcnt += cmc.mc_mlen;
#ifdef INVARIANTS
dcc = dctl = dmbcnt = 0;
for (struct mbuf *mb = f; mb != NULL; mb = mb->m_next) {
@@ -1681,7 +1317,7 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
soroverflow_locked(so2);
error = ENOBUFS;
if (f->m_next->m_type == MT_CONTROL) {
- STAILQ_FIRST(&cmc.mc_q) = f->m_next;
+ c = f->m_next;
f->m_next = NULL;
}
}
@@ -1696,12 +1332,13 @@ uipc_sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
out3:
SOCK_IO_SEND_UNLOCK(so);
out2:
- if (!mc_empty(&cmc))
- unp_scan(mc_first(&cmc), unp_freerights);
+ if (c)
+ unp_scan(c, unp_freerights);
out:
if (f)
m_freem(f);
- mc_freem(&cmc);
+ if (c)
+ m_freem(c);
if (m)
m_freem(m);
@@ -1942,7 +1579,6 @@ uipc_soreceive_dgram(struct socket *so, struct sockaddr **psa, struct uio *uio,
return (0);
}
-#if 0 /* No sendfile support. */
static bool
uipc_ready_scan(struct socket *so, struct mbuf *m, int count, int *errorp)
{
@@ -2022,7 +1658,6 @@ uipc_ready(struct socket *so, struct mbuf *m, int count)
}
return (error);
}
-#endif
static int
uipc_sense(struct socket *so, struct stat *sb)
@@ -2461,19 +2096,6 @@ unp_connect2(struct socket *so, struct socket *so2)
}
}
-static void
-unp_soisdisconnected(struct socket *so)
-{
- SOCK_LOCK(so);
- MPASS(!SOLISTENING(so));
- so->so_state |= SS_ISDISCONNECTED;
- so->so_state &= ~SS_ISCONNECTED;
- SOCK_RECVBUF_LOCK(so);
- socantrcvmore_locked(so);
- SOCK_UNLOCK(so);
- wakeup(&so->so_timeo); /* XXXGL: is this needed? */
-}
-
*** 398 LINES SKIPPED ***