Does FreeBSD have sendmmsg or recvmmsg system calls?

Boris Astardzhiev boris.astardzhiev at gmail.com
Tue Jan 12 14:53:28 UTC 2016


Hello again,

In my spare time I did the following simple libc-only implementation of the
syscalls.
I did some tests in a VM adapting these experiments:
https://blog.cloudflare.com/how-to-receive-a-million-packets/

Any comments about the diff are greatly appreciated.

Best regards,
Boris Astardzhiev

On Fri, Jan 8, 2016 at 7:02 PM, Adrian Chadd <adrian.chadd at gmail.com> wrote:

> On 8 January 2016 at 03:02, Bruce Evans <brde at optusnet.com.au> wrote:
> > On Fri, 8 Jan 2016, Adrian Chadd wrote:
> >
> >> On 7 January 2016 at 23:58, Mark Delany <c2h at romeo.emu.st> wrote:
> >>>
> >>> On 08Jan16, Bruce Evans allegedly wrote:
> >>>>
> >>>> If the NIC can't reach line rate
> >>>
> >>>
> >>>> Network stack overheads are also enormous.
> >>>
> >>>
> >>> Bruce makes some excellent points.
> >>>
> >>> I challenge anyone to get line rate UDP out of FBSD (or Linux) for a
> >>> 1G NIC yet alone a 10G NIC listening to a single port. It was exactly
> >>> my frustration with UDP performance that led me down the path of
> >>> *mmsg() and netmap.
> >>>
> >>> Frankly this is an opportunity for FBSD as UDP performance appears to
> >>> be a neglected area.
> >>
> >>
> >> I'm there, on 16 threads.
> >>
> >> I'd rather we do it on two or three, as a lot of time is wasted in
> >> producer/consumer locking. but yeah, 500k tx/rx should be doable per
> >> CPU with only locking changes.
>
> .. and I did mean "kernel producer/consumer locking changes."
>
> >
> > Line rate for 1 Gbps is about 1500 kpps (small packets).
> >
> > With I218V2 (em), I see enormous lock contention above 3 or 4 (user)
> > threads, and 8 are slightly slower than 1.  1 doesn't saturate the NIC,
> > and 2 is optimal.
> >
>
> The RSS support in -HEAD lets you get away with parallelising UDP
> streams very nicely.
>
> The framework is pretty simple (!):
>
> * drivers ask the RSS code for the RSS config and RSS hash to use, and
> configure the hardware appropriately;
> * the netisr input paths check the existence of the RSS hash and will
> calculte it in software if reqiured;
> * v4/v6 reassembly is done (at the IP level, /not/ at the protocol
> level) and if it needs a new RSS hash / netisr reinjection, that'll
> happen;
> * the PCB lookup code for listen sockets now allows one listen socket
> per RSS bucket - as the RSS / PCBGROUPS code already extended the PCB
> to have one PCB table per RSS bucket (as well as a global one);
>
> So:
>
> * userland code queries RSS for the CPU and RSS bucket setup;
> * you then create one listen socket per RSS bucket, bind it to the
> local thread (if you want) and tell it "you're in RSS bucket X";
> * .. and then in the UDP case for local-bound sockets, the
> transmit/receive path does not require modifying the global PCB state,
> so the locking is kept per-RSS bucket, and scales linearly with the
> number of CPUs you have (until you hit the NIC queue limits.)
>
> https://github.com/erikarn/freebsd-rss/
>
> and:
>
>
> http://adrianchadd.blogspot.com/2014/06/hacking-on-receive-side-scaling-rss-on.html
>
> http://adrianchadd.blogspot.com/2014/07/application-awareness-of-receive-side.html
>
> http://adrianchadd.blogspot.com/2014/08/receive-side-scaling-figuring-out-how.html
>
> http://adrianchadd.blogspot.com/2014/09/receive-side-scaling-testing-udp.html
>
> http://adrianchadd.blogspot.com/2014/10/more-rss-udp-tests-this-time-on-dell.html
>
>
>
> -adrian
> _______________________________________________
> freebsd-net at freebsd.org mailing list
> https://lists.freebsd.org/mailman/listinfo/freebsd-net
> To unsubscribe, send any mail to "freebsd-net-unsubscribe at freebsd.org"
>
-------------- next part --------------
diff --git a/lib/libc/include/libc_private.h b/lib/libc/include/libc_private.h
index 5caf9a3..9a0d6cf 100644
--- a/lib/libc/include/libc_private.h
+++ b/lib/libc/include/libc_private.h
@@ -224,6 +224,8 @@ enum {
 	INTERPOS_kevent,
 	INTERPOS_wait6,
 	INTERPOS_ppoll,
+	INTERPOS_sendmmsg,
+	INTERPOS_recvmmsg,
 	INTERPOS_MAX
 };
 
diff --git a/lib/libc/include/namespace.h b/lib/libc/include/namespace.h
index 739d7b1..c95829e 100644
--- a/lib/libc/include/namespace.h
+++ b/lib/libc/include/namespace.h
@@ -208,6 +208,7 @@
 #define		readv				_readv
 #define		recvfrom			_recvfrom
 #define		recvmsg				_recvmsg
+#define		recvmmsg			_recvmmsg
 #define		select				_select
 #define		sem_close			_sem_close
 #define		sem_destroy			_sem_destroy
@@ -220,6 +221,7 @@
 #define		sem_unlink			_sem_unlink
 #define		sem_wait			_sem_wait
 #define		sendmsg				_sendmsg
+#define		sendmmsg			_sendmmsg
 #define		sendto				_sendto
 #define		setsockopt			_setsockopt
 /*#define		sigaction			_sigaction*/
diff --git a/lib/libc/include/un-namespace.h b/lib/libc/include/un-namespace.h
index f31fa7a..0233348 100644
--- a/lib/libc/include/un-namespace.h
+++ b/lib/libc/include/un-namespace.h
@@ -189,6 +189,7 @@
 #undef		readv
 #undef		recvfrom
 #undef		recvmsg
+#undef		recvmmsg
 #undef		select
 #undef		sem_close
 #undef		sem_destroy
@@ -201,6 +202,7 @@
 #undef		sem_unlink
 #undef		sem_wait
 #undef		sendmsg
+#undef		sendmmsg
 #undef		sendto
 #undef		setsockopt
 #undef		sigaction
diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc
index e4fe1b2..ecb366a 100644
--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -28,6 +28,10 @@ SRCS+= futimens.c utimensat.c
 NOASM+= futimens.o utimensat.o
 PSEUDO+= _futimens.o _utimensat.o
 
+SRCS+= recvmmsg.c sendmmsg.c
+NOASM+= recvmmsg.o sendmmsg.o
+PSEUDO+= _recvmmsg.o _sendmmsg.o
+
 INTERPOSED = \
 	accept \
 	accept4 \
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index 7b3257c..724e1b4 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -399,6 +399,8 @@ FBSD_1.4 {
 	utimensat;
 	numa_setaffinity;
 	numa_getaffinity;
+	sendmmsg;
+	recvmmsg;
 };
 
 FBSDprivate_1.0 {
@@ -1051,4 +1053,6 @@ FBSDprivate_1.0 {
 	gssd_syscall;
 	__libc_interposing_slot;
 	__libc_sigwait;
+	_sendmmsg;
+	_recvmmsg;
 };
diff --git a/lib/libc/sys/recvmmsg.c b/lib/libc/sys/recvmmsg.c
new file mode 100644
index 0000000..03ab379
--- /dev/null
+++ b/lib/libc/sys/recvmmsg.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016 Boris Astardzhiev, Smartcom-Bulgaria AD
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice(s), this list of conditions and the following disclaimer as
+ *    the first lines of this file unmodified other than the possible
+ *    addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice(s), this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/socket.h>
+#include "libc_private.h"
+
+#define VLEN_MAX 1024
+
+int
+recvmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+	int i, ret, rcvd;
+
+	if (vlen > VLEN_MAX)
+		vlen = VLEN_MAX;
+
+	rcvd = 0;
+	for (i = 0; i < vlen; i++) {
+		errno = 0;
+		ret = (((int (*)(int, const struct msghdr *, int))
+		    __libc_interposing[INTERPOS_recvmsg])(s,
+		        &msgvec[i].msg_hdr, flags));
+		if (ret < 0 || errno != 0) {
+			if (rcvd) {
+				/* We've received messages. Let caller know. */
+				errno = 0;
+				return (rcvd);
+			}
+			return (-1);
+		}
+
+		/* Save received bytes */
+		msgvec[i].msg_len = ret;
+
+		rcvd++;
+	}
+
+	return (rcvd);
+}
+
+#undef VLEN_MAX
diff --git a/lib/libc/sys/sendmmsg.c b/lib/libc/sys/sendmmsg.c
new file mode 100644
index 0000000..3387fdc
--- /dev/null
+++ b/lib/libc/sys/sendmmsg.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016 Boris Astardzhiev, Smartcom-Bulgaria AD
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice(s), this list of conditions and the following disclaimer as
+ *    the first lines of this file unmodified other than the possible
+ *    addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice(s), this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/socket.h>
+#include "libc_private.h"
+
+#define VLEN_MAX 1024
+
+int
+sendmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+	int i, ret, sent;
+
+	if (vlen > VLEN_MAX)
+		vlen = VLEN_MAX;
+
+	sent = 0;
+	for (i = 0; i < vlen; i++) {
+		errno = 0;
+		ret = (((int (*)(int, const struct msghdr *, int))
+		    __libc_interposing[INTERPOS_sendmsg])(s,
+		        &msgvec[i].msg_hdr, flags));
+		if (ret < 0 || errno != 0) {
+			if (sent) {
+				/* We have sent messages. Let caller know. */
+				errno = 0;
+				return (sent);
+			}
+			return (-1);
+		}
+
+		/* Save sent bytes */
+		msgvec[i].msg_len = ret;
+
+		sent++;
+	}
+
+	return (sent);
+}
+
+#undef VLEN_MAX
diff --git a/lib/libthr/thread/thr_syscalls.c b/lib/libthr/thread/thr_syscalls.c
index 7c05697..7b5458d 100644
--- a/lib/libthr/thread/thr_syscalls.c
+++ b/lib/libthr/thread/thr_syscalls.c
@@ -606,6 +606,84 @@ __thr_writev(int fd, const struct iovec *iov, int iovcnt)
 	return (ret);
 }
 
+#define VLEN_MAX 1024
+
+static int
+__thr_sendmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+	struct pthread *curthread;
+	int i, ret, sent;
+
+	curthread = _get_curthread();
+	_thr_cancel_enter(curthread);
+
+	if (vlen > VLEN_MAX)
+		vlen = VLEN_MAX;
+
+	sent = 0;
+	for (i = 0; i < (int)vlen; i++) {
+		errno = 0;
+		ret = __sys_sendmsg(s, &msgvec[i].msg_hdr, flags);
+		if (ret < 0 || errno != 0) {
+			if (sent) {
+				/* We have sent messages. Let caller know. */
+				errno = 0;
+				_thr_cancel_leave(curthread, ret <= 0);
+				return (sent);
+			}
+			return (-1);
+		}
+
+		/* Save sent bytes */
+		msgvec[i].msg_len = ret;
+
+		sent++;
+	}
+
+	_thr_cancel_leave(curthread, ret <= 0);
+
+	return (sent);
+}
+
+static int
+__thr_recvmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+	struct pthread *curthread;
+	int i, ret, rcvd;
+
+	curthread = _get_curthread();
+	_thr_cancel_enter(curthread);
+
+	if (vlen > VLEN_MAX)
+		vlen = VLEN_MAX;
+
+	rcvd = 0;
+	for (i = 0; i < (int)vlen; i++) {
+		errno = 0;
+		ret = __sys_recvmsg(s, &msgvec[i].msg_hdr, flags);
+		if (ret < 0 || errno != 0) {
+			if (rcvd) {
+				/* We've received messages. Let caller know. */
+				errno = 0;
+				_thr_cancel_leave(curthread, ret == -1);
+				return (rcvd);
+			}
+			return (-1);
+		}
+
+		/* Save received bytes */
+		msgvec[i].msg_len = ret;
+
+		rcvd++;
+	}
+
+	_thr_cancel_leave(curthread, ret == -1);
+
+	return (rcvd);
+}
+
+#undef VLEN_MAX
+
 void
 __thr_interpose_libc(void)
 {
@@ -652,6 +730,8 @@ __thr_interpose_libc(void)
 	SLOT(kevent);
 	SLOT(wait6);
 	SLOT(ppoll);
+	SLOT(sendmmsg);
+	SLOT(recvmmsg);
 #undef SLOT
 	*(__libc_interposing_slot(
 	    INTERPOS__pthread_mutex_init_calloc_cb)) =
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index 18e2de1..504313e 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -595,6 +595,18 @@ struct sf_hdtr {
 #endif /* _KERNEL */
 #endif /* __BSD_VISIBLE */
 
+#ifndef _KERNEL
+#ifdef __BSD_VISIBLE
+/*
+ * Send/recvmmsg specific structure(s)
+ */
+struct mmsghdr {
+	struct msghdr	msg_hdr;		/* message header */
+	unsigned int	msg_len;		/* message length  */
+};
+#endif /* __BSD_VISIBLE */
+#endif /* !_KERNEL */
+
 #ifndef	_KERNEL
 
 #include <sys/cdefs.h>
@@ -615,11 +627,17 @@ int	listen(int, int);
 ssize_t	recv(int, void *, size_t, int);
 ssize_t	recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict);
 ssize_t	recvmsg(int, struct msghdr *, int);
+#if __BSD_VISIBLE
+int	recvmmsg(int, struct mmsghdr *, unsigned int, int);
+#endif
 ssize_t	send(int, const void *, size_t, int);
 ssize_t	sendto(int, const void *,
 	    size_t, int, const struct sockaddr *, socklen_t);
 ssize_t	sendmsg(int, const struct msghdr *, int);
 #if __BSD_VISIBLE
+int	sendmmsg(int, struct mmsghdr *, unsigned int, int);
+#endif
+#if __BSD_VISIBLE
 int	sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int);
 int	setfib(int);
 #endif


More information about the freebsd-net mailing list