svn commit: r358438 - stable/12/usr.sbin/bhyve

Vincenzo Maffione vmaffione at FreeBSD.org
Fri Feb 28 17:41:48 UTC 2020


Author: vmaffione
Date: Fri Feb 28 17:41:46 2020
New Revision: 358438
URL: https://svnweb.freebsd.org/changeset/base/358438

Log:
  MFC r358180
  
  bhyve: enable virtio-net mergeable rx buffers for tap(4)
  
  This patch adds a new netbe_peek_recvlen() function to the net
  backend API. The new function allows the virtio-net receive code
  to know in advance how many virtio descriptors chains will be
  needed to receive the next packet. As a result, the implementation
  of the virtio-net mergeable rx buffers feature becomes efficient,
  so that we can enable it also with the tap(4) backend. For the
  tap(4) backend, a bounce buffer is introduced to implement the
  peek_recvlen() callback, which implies an additional packet copy
  on the receive datapath. In the future, it should be possible to
  remove the bounce buffer (and so the additional copy), by
  obtaining the length of the next packet from kevent data.
  
  Reviewed by:    grehan, aleksandr.fedorov at itglobal.com
  Differential Revision:  https://reviews.freebsd.org/D23472

Modified:
  stable/12/usr.sbin/bhyve/iov.c
  stable/12/usr.sbin/bhyve/iov.h
  stable/12/usr.sbin/bhyve/net_backends.c
  stable/12/usr.sbin/bhyve/net_backends.h
  stable/12/usr.sbin/bhyve/pci_virtio_net.c
Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/usr.sbin/bhyve/iov.c
==============================================================================
--- stable/12/usr.sbin/bhyve/iov.c	Fri Feb 28 17:38:31 2020	(r358437)
+++ stable/12/usr.sbin/bhyve/iov.c	Fri Feb 28 17:41:46 2020	(r358438)
@@ -119,24 +119,25 @@ iov_to_buf(const struct iovec *iov, int niov, void **b
 }
 
 ssize_t
-buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
+buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov, int niov,
     size_t seek)
 {
 	struct iovec *diov;
-	int ndiov, i;
 	size_t off = 0, len;
+	int  i;
 
 	if (seek > 0) {
+		int ndiov;
+
 		diov = malloc(sizeof(struct iovec) * niov);
 		seek_iov(iov, niov, diov, &ndiov, seek);
-	} else {
-		diov = iov;
-		ndiov = niov;
+		iov = diov;
+		niov = ndiov;
 	}
 
-	for (i = 0; i < ndiov && off < buflen; i++) {
-		len = MIN(diov[i].iov_len, buflen - off);
-		memcpy(diov[i].iov_base, buf + off, len);
+	for (i = 0; i < niov && off < buflen; i++) {
+		len = MIN(iov[i].iov_len, buflen - off);
+		memcpy(iov[i].iov_base, buf + off, len);
 		off += len;
 	}
 

Modified: stable/12/usr.sbin/bhyve/iov.h
==============================================================================
--- stable/12/usr.sbin/bhyve/iov.h	Fri Feb 28 17:38:31 2020	(r358437)
+++ stable/12/usr.sbin/bhyve/iov.h	Fri Feb 28 17:41:46 2020	(r358438)
@@ -38,7 +38,7 @@ void seek_iov(const struct iovec *iov1, int niov1, str
 void truncate_iov(struct iovec *iov, int *niov, size_t length);
 size_t count_iov(const struct iovec *iov, int niov);
 ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf);
-ssize_t buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov,
-    size_t seek);
+ssize_t buf_to_iov(const void *buf, size_t buflen, const struct iovec *iov,
+    int niov, size_t seek);
 
 #endif	/* _IOV_H_ */

Modified: stable/12/usr.sbin/bhyve/net_backends.c
==============================================================================
--- stable/12/usr.sbin/bhyve/net_backends.c	Fri Feb 28 17:38:31 2020	(r358437)
+++ stable/12/usr.sbin/bhyve/net_backends.c	Fri Feb 28 17:41:46 2020	(r358438)
@@ -103,6 +103,13 @@ struct net_backend {
 	    int iovcnt);
 
 	/*
+	 * Get the length of the next packet that can be received from
+	 * the backend. If no packets are currently available, this
+	 * function returns 0.
+	 */
+	ssize_t (*peek_recvlen)(struct net_backend *be);
+
+	/*
 	 * Called to receive a packet from the backend. When the function
 	 * returns a positive value 'len', the scatter-gather vector
 	 * provided by the caller contains a packet with such length.
@@ -167,6 +174,13 @@ SET_DECLARE(net_backend_set, struct net_backend);
 
 struct tap_priv {
 	struct mevent *mevp;
+	/*
+	 * A bounce buffer that allows us to implement the peek_recvlen
+	 * callback. In the future we may get the same information from
+	 * the kevent data.
+	 */
+	char bbuf[1 << 16];
+	ssize_t bbuflen;
 };
 
 static void
@@ -223,6 +237,9 @@ tap_init(struct net_backend *be, const char *devname,
 		errx(EX_OSERR, "Unable to apply rights for sandbox");
 #endif
 
+	memset(priv->bbuf, 0, sizeof(priv->bbuf));
+	priv->bbuflen = 0;
+
 	priv->mevp = mevent_add_disabled(be->fd, EVF_READ, cb, param);
 	if (priv->mevp == NULL) {
 		WPRINTF(("Could not register event"));
@@ -246,15 +263,56 @@ tap_send(struct net_backend *be, const struct iovec *i
 }
 
 static ssize_t
+tap_peek_recvlen(struct net_backend *be)
+{
+	struct tap_priv *priv = (struct tap_priv *)be->opaque;
+	ssize_t ret;
+
+	if (priv->bbuflen > 0) {
+		/*
+		 * We already have a packet in the bounce buffer.
+		 * Just return its length.
+		 */
+		return priv->bbuflen;
+	}
+
+	/*
+	 * Read the next packet (if any) into the bounce buffer, so
+	 * that we get to know its length and we can return that
+	 * to the caller.
+	 */
+	ret = read(be->fd, priv->bbuf, sizeof(priv->bbuf));
+	if (ret < 0 && errno == EWOULDBLOCK) {
+		return (0);
+	}
+
+	if (ret > 0)
+		priv->bbuflen = ret;
+
+	return (ret);
+}
+
+static ssize_t
 tap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
 {
+	struct tap_priv *priv = (struct tap_priv *)be->opaque;
 	ssize_t ret;
 
-	/* Should never be called without a valid tap fd */
-	assert(be->fd != -1);
+	if (priv->bbuflen > 0) {
+		/*
+		 * A packet is available in the bounce buffer, so
+		 * we read it from there.
+		 */
+		ret = buf_to_iov(priv->bbuf, priv->bbuflen,
+		    iov, iovcnt, 0);
 
-	ret = readv(be->fd, iov, iovcnt);
+		/* Mark the bounce buffer as empty. */
+		priv->bbuflen = 0;
 
+		return (ret);
+	}
+
+	ret = readv(be->fd, iov, iovcnt);
 	if (ret < 0 && errno == EWOULDBLOCK) {
 		return (0);
 	}
@@ -299,6 +357,7 @@ static struct net_backend tap_backend = {
 	.init = tap_init,
 	.cleanup = tap_cleanup,
 	.send = tap_send,
+	.peek_recvlen = tap_peek_recvlen,
 	.recv = tap_recv,
 	.recv_enable = tap_recv_enable,
 	.recv_disable = tap_recv_disable,
@@ -313,6 +372,7 @@ static struct net_backend vmnet_backend = {
 	.init = tap_init,
 	.cleanup = tap_cleanup,
 	.send = tap_send,
+	.peek_recvlen = tap_peek_recvlen,
 	.recv = tap_recv,
 	.recv_enable = tap_recv_enable,
 	.recv_disable = tap_recv_disable,
@@ -331,8 +391,7 @@ DATA_SET(net_backend_set, vmnet_backend);
 #define NETMAP_FEATURES (VIRTIO_NET_F_CSUM | VIRTIO_NET_F_HOST_TSO4 | \
 		VIRTIO_NET_F_HOST_TSO6 | VIRTIO_NET_F_HOST_UFO | \
 		VIRTIO_NET_F_GUEST_CSUM | VIRTIO_NET_F_GUEST_TSO4 | \
-		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO | \
-		VIRTIO_NET_F_MRG_RXBUF)
+		VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_UFO)
 
 struct netmap_priv {
 	char ifname[IFNAMSIZ];
@@ -540,6 +599,26 @@ txsync:
 }
 
 static ssize_t
+netmap_peek_recvlen(struct net_backend *be)
+{
+	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
+	struct netmap_ring *ring = priv->rx;
+	uint32_t head = ring->head;
+	ssize_t totlen = 0;
+
+	while (head != ring->tail) {
+		struct netmap_slot *slot = ring->slot + head;
+
+		totlen += slot->len;
+		if ((slot->flags & NS_MOREFRAG) == 0)
+			break;
+		head = nm_ring_next(ring, head);
+	}
+
+	return (totlen);
+}
+
+static ssize_t
 netmap_recv(struct net_backend *be, const struct iovec *iov, int iovcnt)
 {
 	struct netmap_priv *priv = (struct netmap_priv *)be->opaque;
@@ -628,6 +707,7 @@ static struct net_backend netmap_backend = {
 	.init = netmap_init,
 	.cleanup = netmap_cleanup,
 	.send = netmap_send,
+	.peek_recvlen = netmap_peek_recvlen,
 	.recv = netmap_recv,
 	.recv_enable = netmap_recv_enable,
 	.recv_disable = netmap_recv_disable,
@@ -642,6 +722,7 @@ static struct net_backend vale_backend = {
 	.init = netmap_init,
 	.cleanup = netmap_cleanup,
 	.send = netmap_send,
+	.peek_recvlen = netmap_peek_recvlen,
 	.recv = netmap_recv,
 	.recv_enable = netmap_recv_enable,
 	.recv_disable = netmap_recv_disable,
@@ -756,6 +837,13 @@ netbe_send(struct net_backend *be, const struct iovec 
 {
 
 	return (be->send(be, iov, iovcnt));
+}
+
+ssize_t
+netbe_peek_recvlen(struct net_backend *be)
+{
+
+	return (be->peek_recvlen(be));
 }
 
 /*

Modified: stable/12/usr.sbin/bhyve/net_backends.h
==============================================================================
--- stable/12/usr.sbin/bhyve/net_backends.h	Fri Feb 28 17:38:31 2020	(r358437)
+++ stable/12/usr.sbin/bhyve/net_backends.h	Fri Feb 28 17:41:46 2020	(r358438)
@@ -45,6 +45,7 @@ int	 netbe_set_cap(net_backend_t *be, uint64_t cap,
              unsigned vnet_hdr_len);
 size_t	netbe_get_vnet_hdr_len(net_backend_t *be);
 ssize_t	netbe_send(net_backend_t *be, const struct iovec *iov, int iovcnt);
+ssize_t	netbe_peek_recvlen(net_backend_t *be);
 ssize_t	netbe_recv(net_backend_t *be, const struct iovec *iov, int iovcnt);
 ssize_t	netbe_rx_discard(net_backend_t *be);
 void	netbe_rx_disable(net_backend_t *be);

Modified: stable/12/usr.sbin/bhyve/pci_virtio_net.c
==============================================================================
--- stable/12/usr.sbin/bhyve/pci_virtio_net.c	Fri Feb 28 17:38:31 2020	(r358437)
+++ stable/12/usr.sbin/bhyve/pci_virtio_net.c	Fri Feb 28 17:41:46 2020	(r358438)
@@ -228,22 +228,34 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
 	struct virtio_mrg_rxbuf_info info[VTNET_MAXSEGS];
 	struct iovec iov[VTNET_MAXSEGS + 1];
 	struct vqueue_info *vq;
-	uint32_t riov_bytes;
-	struct iovec *riov;
-	int riov_len;
-	uint32_t ulen;
-	int n_chains;
-	int len;
 
 	vq = &sc->vsc_queues[VTNET_RXQ];
 	for (;;) {
 		struct virtio_net_rxhdr *hdr;
+		uint32_t riov_bytes;
+		struct iovec *riov;
+		uint32_t ulen;
+		int riov_len;
+		int n_chains;
+		ssize_t rlen;
+		ssize_t plen;
 
+		plen = netbe_peek_recvlen(sc->vsc_be);
+		if (plen <= 0) {
+			/*
+			 * No more packets (plen == 0), or backend errored
+			 * (plen < 0). Interrupt if needed and stop.
+			 */
+			vq_endchains(vq, /*used_all_avail=*/0);
+			return;
+		}
+		plen += prepend_hdr_len;
+
 		/*
 		 * Get a descriptor chain to store the next ingress
 		 * packet. In case of mergeable rx buffers, get as
 		 * many chains as necessary in order to make room
-		 * for a maximum sized LRO packet.
+		 * for plen bytes.
 		 */
 		riov_bytes = 0;
 		riov_len = 0;
@@ -287,8 +299,7 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
 			riov_bytes += info[n_chains].len;
 			riov += n;
 			n_chains++;
-		} while (riov_bytes < VTNET_MAX_PKT_LEN &&
-			    riov_len < VTNET_MAXSEGS);
+		} while (riov_bytes < plen && riov_len < VTNET_MAXSEGS);
 
 		riov = iov;
 		hdr = riov[0].iov_base;
@@ -312,21 +323,20 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
 			memset(hdr, 0, prepend_hdr_len);
 		}
 
-		len = netbe_recv(sc->vsc_be, riov, riov_len);
-
-		if (len <= 0) {
+		rlen = netbe_recv(sc->vsc_be, riov, riov_len);
+		if (rlen != plen - prepend_hdr_len) {
 			/*
-			 * No more packets (len == 0), or backend errored
-			 * (err < 0). Return unused available buffers
-			 * and stop.
+			 * If this happens it means there is something
+			 * wrong with the backend (e.g., some other
+			 * process is stealing our packets).
 			 */
+			WPRINTF(("netbe_recv: expected %zd bytes, "
+				"got %zd", plen - prepend_hdr_len, rlen));
 			vq_retchains(vq, n_chains);
-			/* Interrupt if needed/appropriate and stop. */
-			vq_endchains(vq, /*used_all_avail=*/0);
-			return;
+			continue;
 		}
 
-		ulen = (uint32_t)(len + prepend_hdr_len);
+		ulen = (uint32_t)plen;
 
 		/*
 		 * Publish the used buffers to the guest, reporting the
@@ -346,12 +356,11 @@ pci_vtnet_rx(struct pci_vtnet_softc *sc)
 				vq_relchain_prepare(vq, info[i].idx, iolen);
 				ulen -= iolen;
 				i++;
-				assert(i <= n_chains);
 			} while (ulen > 0);
 
 			hdr->vrh_bufs = i;
 			vq_relchain_publish(vq);
-			vq_retchains(vq, n_chains - i);
+			assert(i == n_chains);
 		}
 	}
 
@@ -592,7 +601,8 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *
 			free(sc);
 			return (err);
 		}
-		sc->vsc_consts.vc_hv_caps |= netbe_get_cap(sc->vsc_be);
+		sc->vsc_consts.vc_hv_caps |= VIRTIO_NET_F_MRG_RXBUF |
+		    netbe_get_cap(sc->vsc_be);
 	}
 
 	if (!mac_provided) {


More information about the svn-src-all mailing list