svn commit: r319722 - in head: sys/cam/ctl sys/dev/iscsi sys/kern sys/netgraph sys/netgraph/bluetooth/socket sys/netinet sys/ofed/drivers/infiniband/core sys/ofed/drivers/infiniband/ulp/sdp sys/rpc...
Gleb Smirnoff
glebius at FreeBSD.org
Thu Jun 8 21:30:36 UTC 2017
Author: glebius
Date: Thu Jun 8 21:30:34 2017
New Revision: 319722
URL: https://svnweb.freebsd.org/changeset/base/319722
Log:
Listening sockets improvements.
o Separate fields of struct socket that belong to listening from
fields that belong to normal dataflow, and unionize them. This
shrinks the structure a bit.
- Take the selinfos out of the socket buffers and into the socket. The
first reason is to support the braindamaged scenario when a socket is
added to kevent(2) and then listen(2) is called on it. The second
reason is that there is a future plan to make socket buffers pluggable,
so that for a dataflow socket a socket buffer can be changed, and
in this case we also want to keep the same selinfos through the lifetime
of a socket.
- Remove struct so_accf. Since the listening stuff no longer
affects struct socket size, just move its fields into the listening part
of the union.
- Provide sol_upcall field and enforce that so_upcall_set() may be called
only on a dataflow socket, which has buffers, and for listening sockets
provide solisten_upcall_set().
o Remove ACCEPT_LOCK() global.
- Add a mutex to socket, to be used instead of socket buffer lock to lock
fields of struct socket that don't belong to a socket buffer.
- Allow acquiring two socket locks at a time, but the first one must
belong to a listening socket.
- Make soref()/sorele() use atomic(9). This allows, in some situations,
doing soref() without owning the socket lock. There is room for improvement
here: it is possible to make sorele() also lock optionally.
- Most protocols aren't touched by this change, except UNIX local sockets.
See below for more information.
o Reduce copy-and-paste in kernel modules that accept connections from
listening sockets: provide function solisten_dequeue(), and use it in
the following modules: ctl(4), iscsi(4), ng_btsocket(4), ng_ksocket(4),
infiniband, rpc.
o UNIX local sockets.
- Removal of ACCEPT_LOCK() global uncovered several races in the UNIX
local sockets. Most races exist around spawning a new socket, when we
are connecting to a local listening socket. To cover them, we need to
hold locks on both PCBs when spawning a third one. This means holding
them across sonewconn(). This creates a LOR between pcb locks and
unp_list_lock.
- To fix the new LOR, abandon the global unp_list_lock in favor of global
unp_link_lock. Indeed, separating these two locks didn't provide us any
extra parallelism in the UNIX sockets.
- Now a call into uipc_attach() may happen with unp_link_lock held, if we
are accepting, or without unp_link_lock, if we are just creating
a socket.
- Another problem in UNIX sockets is that uipc_close() basically did nothing
for a listening socket. The vnode remained open for connections. This
is fixed by removing the vnode in uipc_close(). Maybe the right way would be
to do it for all sockets (not only listening), simply move the vnode
teardown from uipc_detach() to uipc_close()?
Sponsored by: Netflix
Differential Revision: https://reviews.freebsd.org/D9770
Modified:
head/sys/cam/ctl/ctl_ha.c
head/sys/dev/iscsi/icl_soft_proxy.c
head/sys/kern/sys_socket.c
head/sys/kern/uipc_accf.c
head/sys/kern/uipc_debug.c
head/sys/kern/uipc_sockbuf.c
head/sys/kern/uipc_socket.c
head/sys/kern/uipc_syscalls.c
head/sys/kern/uipc_usrreq.c
head/sys/netgraph/bluetooth/socket/ng_btsocket_l2cap.c
head/sys/netgraph/bluetooth/socket/ng_btsocket_rfcomm.c
head/sys/netgraph/bluetooth/socket/ng_btsocket_sco.c
head/sys/netgraph/ng_ksocket.c
head/sys/netinet/sctp_input.c
head/sys/netinet/sctp_syscalls.c
head/sys/netinet/sctp_sysctl.c
head/sys/netinet/sctp_usrreq.c
head/sys/netinet/tcp_subr.c
head/sys/netinet/tcp_syncache.c
head/sys/netinet/tcp_timewait.c
head/sys/ofed/drivers/infiniband/core/iwcm.c
head/sys/ofed/drivers/infiniband/ulp/sdp/sdp_main.c
head/sys/rpc/svc_vc.c
head/sys/sys/sockbuf.h
head/sys/sys/socket.h
head/sys/sys/socketvar.h
head/usr.bin/netstat/inet.c
Modified: head/sys/cam/ctl/ctl_ha.c
==============================================================================
--- head/sys/cam/ctl/ctl_ha.c Thu Jun 8 21:06:18 2017 (r319721)
+++ head/sys/cam/ctl/ctl_ha.c Thu Jun 8 21:30:34 2017 (r319722)
@@ -458,45 +458,20 @@ out:
static int
ctl_ha_accept(struct ha_softc *softc)
{
- struct socket *so;
+ struct socket *lso, *so;
struct sockaddr *sap;
int error;
- ACCEPT_LOCK();
- if (softc->ha_lso->so_rcv.sb_state & SBS_CANTRCVMORE)
- softc->ha_lso->so_error = ECONNABORTED;
- if (softc->ha_lso->so_error) {
- error = softc->ha_lso->so_error;
- softc->ha_lso->so_error = 0;
- ACCEPT_UNLOCK();
+ lso = softc->ha_lso;
+ SOLISTEN_LOCK(lso);
+ error = solisten_dequeue(lso, &so, 0);
+ if (error == EWOULDBLOCK)
+ return (error);
+ if (error) {
printf("%s: socket error %d\n", __func__, error);
goto out;
}
- so = TAILQ_FIRST(&softc->ha_lso->so_comp);
- if (so == NULL) {
- ACCEPT_UNLOCK();
- return (EWOULDBLOCK);
- }
- KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
- KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
- /*
- * Before changing the flags on the socket, we have to bump the
- * reference count. Otherwise, if the protocol calls sofree(),
- * the socket will be released due to a zero refcount.
- */
- SOCK_LOCK(so); /* soref() and so_state update */
- soref(so); /* file descriptor reference */
-
- TAILQ_REMOVE(&softc->ha_lso->so_comp, so, so_list);
- softc->ha_lso->so_qlen--;
- so->so_state |= SS_NBIO;
- so->so_qstate &= ~SQ_COMP;
- so->so_head = NULL;
-
- SOCK_UNLOCK(so);
- ACCEPT_UNLOCK();
-
sap = NULL;
error = soaccept(so, &sap);
if (error != 0) {
@@ -556,9 +531,6 @@ ctl_ha_listen(struct ha_softc *softc)
printf("%s: REUSEPORT setting failed %d\n",
__func__, error);
}
- SOCKBUF_LOCK(&softc->ha_lso->so_rcv);
- soupcall_set(softc->ha_lso, SO_RCV, ctl_ha_lupcall, softc);
- SOCKBUF_UNLOCK(&softc->ha_lso->so_rcv);
}
memcpy(&sa, &softc->ha_peer_in, sizeof(sa));
@@ -572,6 +544,10 @@ ctl_ha_listen(struct ha_softc *softc)
printf("%s: solisten() error %d\n", __func__, error);
goto out;
}
+ SOLISTEN_LOCK(softc->ha_lso);
+ softc->ha_lso->so_state |= SS_NBIO;
+ solisten_upcall_set(softc->ha_lso, ctl_ha_lupcall, softc);
+ SOLISTEN_UNLOCK(softc->ha_lso);
return (0);
out:
Modified: head/sys/dev/iscsi/icl_soft_proxy.c
==============================================================================
--- head/sys/dev/iscsi/icl_soft_proxy.c Thu Jun 8 21:06:18 2017 (r319721)
+++ head/sys/dev/iscsi/icl_soft_proxy.c Thu Jun 8 21:30:34 2017 (r319722)
@@ -92,7 +92,6 @@ struct icl_listen_sock {
struct icl_listen *ils_listen;
struct socket *ils_socket;
bool ils_running;
- bool ils_disconnecting;
int ils_id;
};
@@ -184,7 +183,9 @@ icl_listen_free(struct icl_listen *il)
while (ils->ils_running) {
ICL_DEBUG("waiting for accept thread to terminate");
sx_xunlock(&il->il_lock);
- ils->ils_disconnecting = true;
+ SOLISTEN_LOCK(ils->ils_socket);
+ ils->ils_socket->so_error = ENOTCONN;
+ SOLISTEN_UNLOCK(ils->ils_socket);
wakeup(&ils->ils_socket->so_timeo);
pause("icl_unlisten", 1 * hz);
sx_xlock(&il->il_lock);
@@ -200,9 +201,9 @@ icl_listen_free(struct icl_listen *il)
}
/*
- * XXX: Doing accept in a separate thread in each socket might not be the best way
- * to do stuff, but it's pretty clean and debuggable - and you probably won't
- * have hundreds of listening sockets anyway.
+ * XXX: Doing accept in a separate thread in each socket might not be the
+ * best way to do stuff, but it's pretty clean and debuggable - and you
+ * probably won't have hundreds of listening sockets anyway.
*/
static void
icl_accept_thread(void *arg)
@@ -218,55 +219,22 @@ icl_accept_thread(void *arg)
ils->ils_running = true;
for (;;) {
- ACCEPT_LOCK();
- while (TAILQ_EMPTY(&head->so_comp) && head->so_error == 0 && ils->ils_disconnecting == false) {
- if (head->so_rcv.sb_state & SBS_CANTRCVMORE) {
- head->so_error = ECONNABORTED;
- break;
- }
- error = msleep(&head->so_timeo, &accept_mtx, PSOCK | PCATCH,
- "accept", 0);
- if (error) {
- ACCEPT_UNLOCK();
- ICL_WARN("msleep failed with error %d", error);
- continue;
- }
- if (ils->ils_disconnecting) {
- ACCEPT_UNLOCK();
- ICL_DEBUG("terminating");
- ils->ils_running = false;
- kthread_exit();
- return;
- }
+ SOLISTEN_LOCK(head);
+ error = solisten_dequeue(head, &so, 0);
+ if (error == ENOTCONN) {
+ /*
+ * XXXGL: ENOTCONN is our mark from icl_listen_free().
+ * Neither socket code, nor msleep(9) may return it.
+ */
+ ICL_DEBUG("terminating");
+ ils->ils_running = false;
+ kthread_exit();
+ return;
}
- if (head->so_error) {
- error = head->so_error;
- head->so_error = 0;
- ACCEPT_UNLOCK();
- ICL_WARN("socket error %d", error);
+ if (error) {
+ ICL_WARN("solisten_dequeue error %d", error);
continue;
}
- so = TAILQ_FIRST(&head->so_comp);
- KASSERT(so != NULL, ("NULL so"));
- KASSERT(!(so->so_qstate & SQ_INCOMP), ("accept1: so SQ_INCOMP"));
- KASSERT(so->so_qstate & SQ_COMP, ("accept1: so not SQ_COMP"));
-
- /*
- * Before changing the flags on the socket, we have to bump the
- * reference count. Otherwise, if the protocol calls sofree(),
- * the socket will be released due to a zero refcount.
- */
- SOCK_LOCK(so); /* soref() and so_state update */
- soref(so); /* file descriptor reference */
-
- TAILQ_REMOVE(&head->so_comp, so, so_list);
- head->so_qlen--;
- so->so_state |= (head->so_state & SS_NBIO);
- so->so_qstate &= ~SQ_COMP;
- so->so_head = NULL;
-
- SOCK_UNLOCK(so);
- ACCEPT_UNLOCK();
sa = NULL;
error = soaccept(so, &sa);
Modified: head/sys/kern/sys_socket.c
==============================================================================
--- head/sys/kern/sys_socket.c Thu Jun 8 21:06:18 2017 (r319721)
+++ head/sys/kern/sys_socket.c Thu Jun 8 21:30:34 2017 (r319722)
@@ -170,32 +170,36 @@ soo_ioctl(struct file *fp, u_long cmd, void *data, str
break;
case FIOASYNC:
- /*
- * XXXRW: This code separately acquires SOCK_LOCK(so) and
- * SOCKBUF_LOCK(&so->so_rcv) even though they are the same
- * mutex to avoid introducing the assumption that they are
- * the same.
- */
if (*(int *)data) {
SOCK_LOCK(so);
so->so_state |= SS_ASYNC;
+ if (SOLISTENING(so)) {
+ so->sol_sbrcv_flags |= SB_ASYNC;
+ so->sol_sbsnd_flags |= SB_ASYNC;
+ } else {
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_flags |= SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_flags |= SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ }
SOCK_UNLOCK(so);
- SOCKBUF_LOCK(&so->so_rcv);
- so->so_rcv.sb_flags |= SB_ASYNC;
- SOCKBUF_UNLOCK(&so->so_rcv);
- SOCKBUF_LOCK(&so->so_snd);
- so->so_snd.sb_flags |= SB_ASYNC;
- SOCKBUF_UNLOCK(&so->so_snd);
} else {
SOCK_LOCK(so);
so->so_state &= ~SS_ASYNC;
+ if (SOLISTENING(so)) {
+ so->sol_sbrcv_flags &= ~SB_ASYNC;
+ so->sol_sbsnd_flags &= ~SB_ASYNC;
+ } else {
+ SOCKBUF_LOCK(&so->so_rcv);
+ so->so_rcv.sb_flags &= ~SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_rcv);
+ SOCKBUF_LOCK(&so->so_snd);
+ so->so_snd.sb_flags &= ~SB_ASYNC;
+ SOCKBUF_UNLOCK(&so->so_snd);
+ }
SOCK_UNLOCK(so);
- SOCKBUF_LOCK(&so->so_rcv);
- so->so_rcv.sb_flags &= ~SB_ASYNC;
- SOCKBUF_UNLOCK(&so->so_rcv);
- SOCKBUF_LOCK(&so->so_snd);
- so->so_snd.sb_flags &= ~SB_ASYNC;
- SOCKBUF_UNLOCK(&so->so_snd);
}
break;
@@ -706,7 +710,6 @@ soaio_process_sb(struct socket *so, struct sockbuf *sb
sb->sb_flags &= ~SB_AIO_RUNNING;
SOCKBUF_UNLOCK(sb);
- ACCEPT_LOCK();
SOCK_LOCK(so);
sorele(so);
}
Modified: head/sys/kern/uipc_accf.c
==============================================================================
--- head/sys/kern/uipc_accf.c Thu Jun 8 21:06:18 2017 (r319721)
+++ head/sys/kern/uipc_accf.c Thu Jun 8 21:30:34 2017 (r319722)
@@ -173,13 +173,13 @@ accept_filt_getopt(struct socket *so, struct sockopt *
error = EINVAL;
goto out;
}
- if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+ if (so->sol_accept_filter == NULL) {
error = EINVAL;
goto out;
}
- strcpy(afap->af_name, so->so_accf->so_accept_filter->accf_name);
- if (so->so_accf->so_accept_filter_str != NULL)
- strcpy(afap->af_arg, so->so_accf->so_accept_filter_str);
+ strcpy(afap->af_name, so->sol_accept_filter->accf_name);
+ if (so->sol_accept_filter_str != NULL)
+ strcpy(afap->af_arg, so->sol_accept_filter_str);
out:
SOCK_UNLOCK(so);
if (error == 0)
@@ -193,31 +193,57 @@ accept_filt_setopt(struct socket *so, struct sockopt *
{
struct accept_filter_arg *afap;
struct accept_filter *afp;
- struct so_accf *newaf;
- int error = 0;
+ char *accept_filter_str = NULL;
+ void *accept_filter_arg = NULL;
+ int error;
/*
* Handle the simple delete case first.
*/
if (sopt == NULL || sopt->sopt_val == NULL) {
+ struct socket *sp, *sp1;
+ int wakeup;
+
SOCK_LOCK(so);
if ((so->so_options & SO_ACCEPTCONN) == 0) {
SOCK_UNLOCK(so);
return (EINVAL);
}
- if (so->so_accf != NULL) {
- struct so_accf *af = so->so_accf;
- if (af->so_accept_filter != NULL &&
- af->so_accept_filter->accf_destroy != NULL) {
- af->so_accept_filter->accf_destroy(so);
- }
- if (af->so_accept_filter_str != NULL)
- free(af->so_accept_filter_str, M_ACCF);
- free(af, M_ACCF);
- so->so_accf = NULL;
+ if (so->sol_accept_filter == NULL) {
+ SOCK_UNLOCK(so);
+ return (0);
}
+ if (so->sol_accept_filter->accf_destroy != NULL)
+ so->sol_accept_filter->accf_destroy(so);
+ if (so->sol_accept_filter_str != NULL)
+ free(so->sol_accept_filter_str, M_ACCF);
+ so->sol_accept_filter = NULL;
+ so->sol_accept_filter_arg = NULL;
+ so->sol_accept_filter_str = NULL;
so->so_options &= ~SO_ACCEPTFILTER;
- SOCK_UNLOCK(so);
+
+ /*
+ * Move from incomplete queue to complete only those
+ * connections, that are blocked by us.
+ */
+ wakeup = 0;
+ TAILQ_FOREACH_SAFE(sp, &so->sol_incomp, so_list, sp1) {
+ SOCK_LOCK(sp);
+ if (sp->so_options & SO_ACCEPTFILTER) {
+ TAILQ_REMOVE(&so->sol_incomp, sp, so_list);
+ TAILQ_INSERT_TAIL(&so->sol_comp, sp, so_list);
+ sp->so_qstate = SQ_COMP;
+ sp->so_options &= ~SO_ACCEPTFILTER;
+ so->sol_incqlen--;
+ so->sol_qlen++;
+ wakeup = 1;
+ }
+ SOCK_UNLOCK(sp);
+ }
+ if (wakeup)
+ solisten_wakeup(so); /* unlocks */
+ else
+ SOLISTEN_UNLOCK(so);
return (0);
}
@@ -238,17 +264,10 @@ accept_filt_setopt(struct socket *so, struct sockopt *
free(afap, M_TEMP);
return (ENOENT);
}
- /*
- * Allocate the new accept filter instance storage. We may
- * have to free it again later if we fail to attach it. If
- * attached properly, 'newaf' is NULLed to avoid a free()
- * while in use.
- */
- newaf = malloc(sizeof(*newaf), M_ACCF, M_WAITOK | M_ZERO);
if (afp->accf_create != NULL && afap->af_name[0] != '\0') {
size_t len = strlen(afap->af_name) + 1;
- newaf->so_accept_filter_str = malloc(len, M_ACCF, M_WAITOK);
- strcpy(newaf->so_accept_filter_str, afap->af_name);
+ accept_filter_str = malloc(len, M_ACCF, M_WAITOK);
+ strcpy(accept_filter_str, afap->af_name);
}
/*
@@ -256,8 +275,8 @@ accept_filt_setopt(struct socket *so, struct sockopt *
* without first removing it.
*/
SOCK_LOCK(so);
- if (((so->so_options & SO_ACCEPTCONN) == 0) ||
- (so->so_accf != NULL)) {
+ if ((so->so_options & SO_ACCEPTCONN) == 0 ||
+ so->sol_accept_filter != NULL) {
error = EINVAL;
goto out;
}
@@ -268,25 +287,20 @@ accept_filt_setopt(struct socket *so, struct sockopt *
* can't block.
*/
if (afp->accf_create != NULL) {
- newaf->so_accept_filter_arg =
- afp->accf_create(so, afap->af_arg);
- if (newaf->so_accept_filter_arg == NULL) {
+ accept_filter_arg = afp->accf_create(so, afap->af_arg);
+ if (accept_filter_arg == NULL) {
error = EINVAL;
goto out;
}
}
- newaf->so_accept_filter = afp;
- so->so_accf = newaf;
+ so->sol_accept_filter = afp;
+ so->sol_accept_filter_arg = accept_filter_arg;
+ so->sol_accept_filter_str = accept_filter_str;
so->so_options |= SO_ACCEPTFILTER;
- newaf = NULL;
out:
SOCK_UNLOCK(so);
- if (newaf != NULL) {
- if (newaf->so_accept_filter_str != NULL)
- free(newaf->so_accept_filter_str, M_ACCF);
- free(newaf, M_ACCF);
- }
- if (afap != NULL)
- free(afap, M_TEMP);
+ if (accept_filter_str != NULL)
+ free(accept_filter_str, M_ACCF);
+ free(afap, M_TEMP);
return (error);
}
Modified: head/sys/kern/uipc_debug.c
==============================================================================
--- head/sys/kern/uipc_debug.c Thu Jun 8 21:06:18 2017 (r319721)
+++ head/sys/kern/uipc_debug.c Thu Jun 8 21:30:34 2017 (r319722)
@@ -448,8 +448,6 @@ db_print_socket(struct socket *so, const char *socketn
db_printf(")\n");
db_print_indent(indent);
- db_printf("so_qstate: 0x%x (", so->so_qstate);
- db_print_soqstate(so->so_qstate);
db_printf(") ");
db_printf("so_pcb: %p ", so->so_pcb);
db_printf("so_proto: %p\n", so->so_proto);
@@ -458,24 +456,28 @@ db_print_socket(struct socket *so, const char *socketn
db_print_protosw(so->so_proto, "so_proto", indent);
db_print_indent(indent);
- db_printf("so_head: %p ", so->so_head);
- db_printf("so_incomp first: %p ", TAILQ_FIRST(&so->so_incomp));
- db_printf("so_comp first: %p\n", TAILQ_FIRST(&so->so_comp));
+ if (so->so_options & SO_ACCEPTCONN) {
+ db_printf("sol_incomp first: %p ",
+ TAILQ_FIRST(&so->sol_incomp));
+ db_printf("sol_comp first: %p\n", TAILQ_FIRST(&so->sol_comp));
+ db_printf("sol_qlen: %d ", so->sol_qlen);
+ db_printf("sol_incqlen: %d ", so->sol_incqlen);
+ db_printf("sol_qlimit: %d ", so->sol_qlimit);
+ } else {
+ db_printf("so_qstate: 0x%x (", so->so_qstate);
+ db_print_soqstate(so->so_qstate);
+ db_printf("so_listen: %p ", so->so_listen);
+ /* so_list skipped */
+ db_printf("so_timeo: %d ", so->so_timeo);
+ db_printf("so_error: %d\n", so->so_error);
- db_print_indent(indent);
- /* so_list skipped */
- db_printf("so_qlen: %u ", so->so_qlen);
- db_printf("so_incqlen: %u ", so->so_incqlen);
- db_printf("so_qlimit: %u ", so->so_qlimit);
- db_printf("so_timeo: %d ", so->so_timeo);
- db_printf("so_error: %d\n", so->so_error);
+ db_print_indent(indent);
+ db_printf("so_sigio: %p ", so->so_sigio);
+ db_printf("so_oobmark: %lu ", so->so_oobmark);
- db_print_indent(indent);
- db_printf("so_sigio: %p ", so->so_sigio);
- db_printf("so_oobmark: %lu ", so->so_oobmark);
-
- db_print_sockbuf(&so->so_rcv, "so_rcv", indent);
- db_print_sockbuf(&so->so_snd, "so_snd", indent);
+ db_print_sockbuf(&so->so_rcv, "so_rcv", indent);
+ db_print_sockbuf(&so->so_snd, "so_snd", indent);
+ }
}
DB_SHOW_COMMAND(socket, db_show_socket)
Modified: head/sys/kern/uipc_sockbuf.c
==============================================================================
--- head/sys/kern/uipc_sockbuf.c Thu Jun 8 21:06:18 2017 (r319721)
+++ head/sys/kern/uipc_sockbuf.c Thu Jun 8 21:30:34 2017 (r319722)
@@ -314,14 +314,14 @@ sowakeup(struct socket *so, struct sockbuf *sb)
SOCKBUF_LOCK_ASSERT(sb);
- selwakeuppri(&sb->sb_sel, PSOCK);
- if (!SEL_WAITING(&sb->sb_sel))
+ selwakeuppri(sb->sb_sel, PSOCK);
+ if (!SEL_WAITING(sb->sb_sel))
sb->sb_flags &= ~SB_SEL;
if (sb->sb_flags & SB_WAIT) {
sb->sb_flags &= ~SB_WAIT;
wakeup(&sb->sb_acc);
}
- KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
+ KNOTE_LOCKED(&sb->sb_sel->si_note, 0);
if (sb->sb_upcall != NULL && !(so->so_state & SS_ISDISCONNECTED)) {
ret = sb->sb_upcall(so, sb->sb_upcallarg, M_NOWAIT);
if (ret == SU_ISCONNECTED) {
Modified: head/sys/kern/uipc_socket.c
==============================================================================
--- head/sys/kern/uipc_socket.c Thu Jun 8 21:06:18 2017 (r319721)
+++ head/sys/kern/uipc_socket.c Thu Jun 8 21:30:34 2017 (r319722)
@@ -106,6 +106,7 @@ __FBSDID("$FreeBSD$");
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_compat.h"
+#include "opt_sctp.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -154,13 +155,21 @@ __FBSDID("$FreeBSD$");
static int soreceive_rcvoob(struct socket *so, struct uio *uio,
int flags);
+static void so_rdknl_lock(void *);
+static void so_rdknl_unlock(void *);
+static void so_rdknl_assert_locked(void *);
+static void so_rdknl_assert_unlocked(void *);
+static void so_wrknl_lock(void *);
+static void so_wrknl_unlock(void *);
+static void so_wrknl_assert_locked(void *);
+static void so_wrknl_assert_unlocked(void *);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
-static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
static int filt_soempty(struct knote *kn, long hint);
+static int inline hhook_run_socket(struct socket *so, void *hctx, int32_t h_id);
fo_kqfilter_t soo_kqfilter;
static struct filterops soread_filtops = {
@@ -393,8 +402,16 @@ soalloc(struct vnet *vnet)
return (NULL);
}
+ /*
+ * The socket locking protocol allows to lock 2 sockets at a time,
+ * however, the first one must be a listening socket. WITNESS lacks
+ * a feature to change class of an existing lock, so we use DUPOK.
+ */
+ mtx_init(&so->so_lock, "socket", NULL, MTX_DEF | MTX_DUPOK);
SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
+ so->so_rcv.sb_sel = &so->so_rdsel;
+ so->so_snd.sb_sel = &so->so_wrsel;
sx_init(&so->so_snd.sb_sx, "so_snd_sx");
sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
TAILQ_INIT(&so->so_snd.sb_aiojobq);
@@ -450,9 +467,6 @@ sodealloc(struct socket *so)
if (so->so_snd.sb_hiwat)
(void)chgsbsize(so->so_cred->cr_uidinfo,
&so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
- /* remove accept filter if one is present. */
- if (so->so_accf != NULL)
- accept_filt_setopt(so, NULL);
#ifdef MAC
mac_socket_destroy(so);
#endif
@@ -460,10 +474,16 @@ sodealloc(struct socket *so)
crfree(so->so_cred);
khelp_destroy_osd(&so->osd);
- sx_destroy(&so->so_snd.sb_sx);
- sx_destroy(&so->so_rcv.sb_sx);
- SOCKBUF_LOCK_DESTROY(&so->so_snd);
- SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+ if (SOLISTENING(so)) {
+ if (so->sol_accept_filter != NULL)
+ accept_filt_setopt(so, NULL);
+ } else {
+ sx_destroy(&so->so_snd.sb_sx);
+ sx_destroy(&so->so_rcv.sb_sx);
+ SOCKBUF_LOCK_DESTROY(&so->so_snd);
+ SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+ }
+ mtx_destroy(&so->so_lock);
uma_zfree(socket_zone, so);
}
@@ -506,8 +526,6 @@ socreate(int dom, struct socket **aso, int type, int p
if (so == NULL)
return (ENOBUFS);
- TAILQ_INIT(&so->so_incomp);
- TAILQ_INIT(&so->so_comp);
so->so_type = type;
so->so_cred = crhold(cred);
if ((prp->pr_domain->dom_family == PF_INET) ||
@@ -520,9 +538,10 @@ socreate(int dom, struct socket **aso, int type, int p
#ifdef MAC
mac_socket_create(cred, so);
#endif
- knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
- knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
- so->so_count = 1;
+ knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+ so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+ knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+ so_wrknl_assert_locked, so_wrknl_assert_unlocked);
/*
* Auto-sizing of socket buffers is managed by the protocols and
* the appropriate flags must be set in the pru_attach function.
@@ -531,12 +550,10 @@ socreate(int dom, struct socket **aso, int type, int p
error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
CURVNET_RESTORE();
if (error) {
- KASSERT(so->so_count == 1, ("socreate: so_count %d",
- so->so_count));
- so->so_count = 0;
sodealloc(so);
return (error);
}
+ soref(so);
*aso = so;
return (0);
}
@@ -564,11 +581,11 @@ sonewconn(struct socket *head, int connstatus)
static int overcount;
struct socket *so;
- int over;
+ u_int over;
- ACCEPT_LOCK();
- over = (head->so_qlen > 3 * head->so_qlimit / 2);
- ACCEPT_UNLOCK();
+ SOLISTEN_LOCK(head);
+ over = (head->sol_qlen > 3 * head->sol_qlimit / 2);
+ SOLISTEN_UNLOCK(head);
#ifdef REGRESSION
if (regression_sonewconn_earlytest && over) {
#else
@@ -580,15 +597,15 @@ sonewconn(struct socket *head, int connstatus)
log(LOG_DEBUG, "%s: pcb %p: Listen queue overflow: "
"%i already in queue awaiting acceptance "
"(%d occurrences)\n",
- __func__, head->so_pcb, head->so_qlen, overcount);
+ __func__, head->so_pcb, head->sol_qlen, overcount);
overcount = 0;
}
return (NULL);
}
- VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
- __func__, __LINE__, head));
+ VNET_ASSERT(head->so_vnet != NULL, ("%s: so %p vnet is NULL",
+ __func__, head));
so = soalloc(head->so_vnet);
if (so == NULL) {
log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
@@ -596,11 +613,8 @@ sonewconn(struct socket *head, int connstatus)
__func__, head->so_pcb);
return (NULL);
}
- if ((head->so_options & SO_ACCEPTFILTER) != 0)
- connstatus = 0;
- so->so_head = head;
+ so->so_listen = head;
so->so_type = head->so_type;
- so->so_options = head->so_options &~ SO_ACCEPTCONN;
so->so_linger = head->so_linger;
so->so_state = head->so_state | SS_NOFDREF;
so->so_fibnum = head->so_fibnum;
@@ -609,10 +623,12 @@ sonewconn(struct socket *head, int connstatus)
#ifdef MAC
mac_socket_newconn(head, so);
#endif
- knlist_init_mtx(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv));
- knlist_init_mtx(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd));
+ knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+ so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+ knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+ so_wrknl_assert_locked, so_wrknl_assert_unlocked);
VNET_SO_ASSERT(head);
- if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+ if (soreserve(so, head->sol_sbsnd_hiwat, head->sol_sbrcv_hiwat)) {
sodealloc(so);
log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
__func__, head->so_pcb);
@@ -624,32 +640,24 @@ sonewconn(struct socket *head, int connstatus)
__func__, head->so_pcb);
return (NULL);
}
- so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
- so->so_snd.sb_lowat = head->so_snd.sb_lowat;
- so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
- so->so_snd.sb_timeo = head->so_snd.sb_timeo;
- so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
- so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+ so->so_rcv.sb_lowat = head->sol_sbrcv_lowat;
+ so->so_snd.sb_lowat = head->sol_sbsnd_lowat;
+ so->so_rcv.sb_timeo = head->sol_sbrcv_timeo;
+ so->so_snd.sb_timeo = head->sol_sbsnd_timeo;
+ so->so_rcv.sb_flags |= head->sol_sbrcv_flags & SB_AUTOSIZE;
+ so->so_snd.sb_flags |= head->sol_sbsnd_flags & SB_AUTOSIZE;
+
+ SOLISTEN_LOCK(head);
+ if (head->sol_accept_filter != NULL)
+ connstatus = 0;
so->so_state |= connstatus;
- ACCEPT_LOCK();
- /*
- * The accept socket may be tearing down but we just
- * won a race on the ACCEPT_LOCK.
- * However, if sctp_peeloff() is called on a 1-to-many
- * style socket, the SO_ACCEPTCONN doesn't need to be set.
- */
- if (!(head->so_options & SO_ACCEPTCONN) &&
- ((head->so_proto->pr_protocol != IPPROTO_SCTP) ||
- (head->so_type != SOCK_SEQPACKET))) {
- SOCK_LOCK(so);
- so->so_head = NULL;
- sofree(so); /* NB: returns ACCEPT_UNLOCK'ed. */
- return (NULL);
- }
+ so->so_options = head->so_options & ~SO_ACCEPTCONN;
+ soref(head); /* A socket on (in)complete queue refs head. */
if (connstatus) {
- TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
- so->so_qstate |= SQ_COMP;
- head->so_qlen++;
+ TAILQ_INSERT_TAIL(&head->sol_comp, so, so_list);
+ so->so_qstate = SQ_COMP;
+ head->sol_qlen++;
+ solisten_wakeup(head); /* unlocks */
} else {
/*
* Keep removing sockets from the head until there's room for
@@ -658,28 +666,86 @@ sonewconn(struct socket *head, int connstatus)
* threads and soabort() requires dropping locks, we must
* loop waiting for the condition to be true.
*/
- while (head->so_incqlen > head->so_qlimit) {
+ while (head->sol_incqlen > head->sol_qlimit) {
struct socket *sp;
- sp = TAILQ_FIRST(&head->so_incomp);
- TAILQ_REMOVE(&head->so_incomp, sp, so_list);
- head->so_incqlen--;
- sp->so_qstate &= ~SQ_INCOMP;
- sp->so_head = NULL;
- ACCEPT_UNLOCK();
+
+ sp = TAILQ_FIRST(&head->sol_incomp);
+ TAILQ_REMOVE(&head->sol_incomp, sp, so_list);
+ head->sol_incqlen--;
+ SOCK_LOCK(sp);
+ sp->so_qstate = SQ_NONE;
+ sp->so_listen = NULL;
+ SOCK_UNLOCK(sp);
+ sorele(head); /* does SOLISTEN_UNLOCK, head stays */
soabort(sp);
- ACCEPT_LOCK();
+ SOLISTEN_LOCK(head);
}
- TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
- so->so_qstate |= SQ_INCOMP;
- head->so_incqlen++;
+ TAILQ_INSERT_TAIL(&head->sol_incomp, so, so_list);
+ so->so_qstate = SQ_INCOMP;
+ head->sol_incqlen++;
+ SOLISTEN_UNLOCK(head);
}
- ACCEPT_UNLOCK();
- if (connstatus) {
- sorwakeup(head);
- wakeup_one(&head->so_timeo);
+ return (so);
+}
+
+#ifdef SCTP
+/*
+ * Socket part of sctp_peeloff(). Detach a new socket from an
+ * association. The new socket is returned with a reference.
+ */
+struct socket *
+sopeeloff(struct socket *head)
+{
+ struct socket *so;
+
+ VNET_ASSERT(head->so_vnet != NULL, ("%s:%d so_vnet is NULL, head=%p",
+ __func__, __LINE__, head));
+ so = soalloc(head->so_vnet);
+ if (so == NULL) {
+ log(LOG_DEBUG, "%s: pcb %p: New socket allocation failure: "
+ "limit reached or out of memory\n",
+ __func__, head->so_pcb);
+ return (NULL);
}
+ so->so_type = head->so_type;
+ so->so_options = head->so_options;
+ so->so_linger = head->so_linger;
+ so->so_state = (head->so_state & SS_NBIO) | SS_ISCONNECTED;
+ so->so_fibnum = head->so_fibnum;
+ so->so_proto = head->so_proto;
+ so->so_cred = crhold(head->so_cred);
+#ifdef MAC
+ mac_socket_newconn(head, so);
+#endif
+ knlist_init(&so->so_rdsel.si_note, so, so_rdknl_lock, so_rdknl_unlock,
+ so_rdknl_assert_locked, so_rdknl_assert_unlocked);
+ knlist_init(&so->so_wrsel.si_note, so, so_wrknl_lock, so_wrknl_unlock,
+ so_wrknl_assert_locked, so_wrknl_assert_unlocked);
+ VNET_SO_ASSERT(head);
+ if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
+ sodealloc(so);
+ log(LOG_DEBUG, "%s: pcb %p: soreserve() failed\n",
+ __func__, head->so_pcb);
+ return (NULL);
+ }
+ if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+ sodealloc(so);
+ log(LOG_DEBUG, "%s: pcb %p: pru_attach() failed\n",
+ __func__, head->so_pcb);
+ return (NULL);
+ }
+ so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
+ so->so_snd.sb_lowat = head->so_snd.sb_lowat;
+ so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
+ so->so_snd.sb_timeo = head->so_snd.sb_timeo;
+ so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
+ so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+
+ soref(so);
+
return (so);
}
+#endif /* SCTP */
int
sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
@@ -741,16 +807,140 @@ solisten_proto_check(struct socket *so)
void
solisten_proto(struct socket *so, int backlog)
{
+ int sbrcv_lowat, sbsnd_lowat;
+ u_int sbrcv_hiwat, sbsnd_hiwat;
+ short sbrcv_flags, sbsnd_flags;
+ sbintime_t sbrcv_timeo, sbsnd_timeo;
SOCK_LOCK_ASSERT(so);
+ if (SOLISTENING(so))
+ goto listening;
+
+ /*
+ * Change this socket to listening state.
+ */
+ sbrcv_lowat = so->so_rcv.sb_lowat;
+ sbsnd_lowat = so->so_snd.sb_lowat;
+ sbrcv_hiwat = so->so_rcv.sb_hiwat;
+ sbsnd_hiwat = so->so_snd.sb_hiwat;
+ sbrcv_flags = so->so_rcv.sb_flags;
+ sbsnd_flags = so->so_snd.sb_flags;
+ sbrcv_timeo = so->so_rcv.sb_timeo;
+ sbsnd_timeo = so->so_snd.sb_timeo;
+
+ sbdestroy(&so->so_snd, so);
+ sbdestroy(&so->so_rcv, so);
+ sx_destroy(&so->so_snd.sb_sx);
+ sx_destroy(&so->so_rcv.sb_sx);
+ SOCKBUF_LOCK_DESTROY(&so->so_snd);
+ SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+
+#ifdef INVARIANTS
+ bzero(&so->so_rcv,
+ sizeof(struct socket) - offsetof(struct socket, so_rcv));
+#endif
+
+ so->sol_sbrcv_lowat = sbrcv_lowat;
+ so->sol_sbsnd_lowat = sbsnd_lowat;
+ so->sol_sbrcv_hiwat = sbrcv_hiwat;
+ so->sol_sbsnd_hiwat = sbsnd_hiwat;
+ so->sol_sbrcv_flags = sbrcv_flags;
+ so->sol_sbsnd_flags = sbsnd_flags;
+ so->sol_sbrcv_timeo = sbrcv_timeo;
+ so->sol_sbsnd_timeo = sbsnd_timeo;
+
+ so->sol_qlen = so->sol_incqlen = 0;
+ TAILQ_INIT(&so->sol_incomp);
+ TAILQ_INIT(&so->sol_comp);
+
+ so->sol_accept_filter = NULL;
+ so->sol_accept_filter_arg = NULL;
+ so->sol_accept_filter_str = NULL;
+
+ so->so_options |= SO_ACCEPTCONN;
+
+listening:
if (backlog < 0 || backlog > somaxconn)
backlog = somaxconn;
- so->so_qlimit = backlog;
- so->so_options |= SO_ACCEPTCONN;
+ so->sol_qlimit = backlog;
}
/*
+ * Wakeup listeners/subsystems once we have a complete connection.
+ * Enters with lock, returns unlocked.
+ */
+void
+solisten_wakeup(struct socket *sol)
+{
+
+ if (sol->sol_upcall != NULL)
+ (void )sol->sol_upcall(sol, sol->sol_upcallarg, M_NOWAIT);
+ else {
+ selwakeuppri(&sol->so_rdsel, PSOCK);
+ KNOTE_LOCKED(&sol->so_rdsel.si_note, 0);
+ }
+ SOLISTEN_UNLOCK(sol);
+ wakeup_one(&sol->sol_comp);
+}
+
+/*
+ * Return single connection off a listening socket queue. Main consumer of
+ * the function is kern_accept4(). Some modules, that do their own accept
+ * management also use the function.
+ *
+ * Listening socket must be locked on entry and is returned unlocked on
+ * return.
+ * The flags argument is set of accept4(2) flags and ACCEPT4_INHERIT.
+ */
+int
+solisten_dequeue(struct socket *head, struct socket **ret, int flags)
+{
+ struct socket *so;
+ int error;
+
+ SOLISTEN_LOCK_ASSERT(head);
+
+ while (!(head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp) &&
+ head->so_error == 0) {
+ error = msleep(&head->sol_comp, &head->so_lock, PSOCK | PCATCH,
+ "accept", 0);
+ if (error != 0) {
+ SOLISTEN_UNLOCK(head);
+ return (error);
+ }
+ }
+ if (head->so_error) {
+ error = head->so_error;
+ head->so_error = 0;
+ SOLISTEN_UNLOCK(head);
+ return (error);
+ }
+ if ((head->so_state & SS_NBIO) && TAILQ_EMPTY(&head->sol_comp)) {
+ SOLISTEN_UNLOCK(head);
+ return (EWOULDBLOCK);
+ }
+ so = TAILQ_FIRST(&head->sol_comp);
+ SOCK_LOCK(so);
+ KASSERT(so->so_qstate == SQ_COMP,
+ ("%s: so %p not SQ_COMP", __func__, so));
+ soref(so);
+ head->sol_qlen--;
+ so->so_qstate = SQ_NONE;
+ so->so_listen = NULL;
+ TAILQ_REMOVE(&head->sol_comp, so, so_list);
+ if (flags & ACCEPT4_INHERIT)
+ so->so_state |= (head->so_state & SS_NBIO);
+ else
+ so->so_state |= (flags & SOCK_NONBLOCK) ? SS_NBIO : 0;
+ SOCK_UNLOCK(so);
+ sorele(head);
+
+ *ret = so;
+ return (0);
+}
+
+/*
* Evaluate the reference count and named references on a socket; if no
* references remain, free it. This should be called whenever a reference is
* released, such as in sorele(), but also when named reference flags are
@@ -774,44 +964,62 @@ void
sofree(struct socket *so)
{
struct protosw *pr = so->so_proto;
- struct socket *head;
- ACCEPT_LOCK_ASSERT();
SOCK_LOCK_ASSERT(so);
if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
- (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
+ (so->so_state & SS_PROTOREF) || (so->so_qstate == SQ_COMP)) {
SOCK_UNLOCK(so);
- ACCEPT_UNLOCK();
return;
}
- head = so->so_head;
- if (head != NULL) {
- KASSERT((so->so_qstate & SQ_COMP) != 0 ||
- (so->so_qstate & SQ_INCOMP) != 0,
- ("sofree: so_head != NULL, but neither SQ_COMP nor "
- "SQ_INCOMP"));
- KASSERT((so->so_qstate & SQ_COMP) == 0 ||
- (so->so_qstate & SQ_INCOMP) == 0,
- ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
- TAILQ_REMOVE(&head->so_incomp, so, so_list);
- head->so_incqlen--;
- so->so_qstate &= ~SQ_INCOMP;
- so->so_head = NULL;
+ if (!SOLISTENING(so) && so->so_qstate == SQ_INCOMP) {
+ struct socket *sol;
+
+ sol = so->so_listen;
+ KASSERT(sol, ("%s: so %p on incomp of NULL", __func__, so));
+
+ /*
+ * To solve race between close of a listening socket and
+ * a socket on its incomplete queue, we need to lock both.
+ * The order is first listening socket, then regular.
+ * Since we don't have SS_NOFDREF neither SS_PROTOREF, this
+ * function and the listening socket are the only pointers
+ * to so. To preserve so and sol, we reference both and then
+ * relock.
+ * After relock the socket may not move to so_comp since it
+ * doesn't have PCB already, but it may be removed from
+ * so_incomp. If that happens, we share responsiblity on
+ * freeing the socket, but soclose() has already removed
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-head
mailing list