git: b22b31b380e8 - stable/13 - bhyve: Snapshot impovements for 'blockif' backend

From: John Baldwin <jhb_at_FreeBSD.org>
Date: Thu, 26 Jan 2023 19:47:39 UTC
The branch stable/13 has been updated by jhb:

URL: https://cgit.FreeBSD.org/src/commit/?id=b22b31b380e8810f59ada8e60fb87dcddc585054

commit b22b31b380e8810f59ada8e60fb87dcddc585054
Author:     Vitaliy Gusev <gusev.vitaliy@gmail.com>
AuthorDate: 2022-06-23 18:46:06 +0000
Commit:     John Baldwin <jhb@FreeBSD.org>
CommitDate: 2023-01-26 19:29:17 +0000

    bhyve: Snapshot impovements for 'blockif' backend
    
    When pausing a block I/O device model as part of suspending a VM, wait
    for all active block I/O requests to finish before saving snapshot
    data.  This avoids having to save information about in-flight requests
    both in the block_if layer and in storage device models.
    
    For the AHCI device model, the queues are now guaranteed to be idle
    when taking a snapshot, so remove the code to save queue state and
    rely on the initial state in a resumed VM having all queues already
    idle.
    
    This will also simplify adding NVMe snapshot support in the future.
    
    Reviewed by:    jhb
    Sponsored by:   vStack
    Differential Revision: https://reviews.freebsd.org/D26267
    
    (cherry picked from commit cd9618bdb274375139080ee4e33ccbdc980513f3)
---
 usr.sbin/bhyve/block_if.c | 105 ++++------------------------------
 usr.sbin/bhyve/block_if.h |   4 --
 usr.sbin/bhyve/pci_ahci.c | 142 +---------------------------------------------
 3 files changed, 15 insertions(+), 236 deletions(-)

diff --git a/usr.sbin/bhyve/block_if.c b/usr.sbin/bhyve/block_if.c
index 8ed95300a13c..2ca6da0469ac 100644
--- a/usr.sbin/bhyve/block_if.c
+++ b/usr.sbin/bhyve/block_if.c
@@ -109,11 +109,9 @@ struct blockif_ctxt {
 	int			bc_psectoff;
 	int			bc_closing;
 	int			bc_paused;
-	int			bc_work_count;
 	pthread_t		bc_btid[BLOCKIF_NUMTHR];
 	pthread_mutex_t		bc_mtx;
 	pthread_cond_t		bc_cond;
-	pthread_cond_t		bc_paused_cond;
 	pthread_cond_t		bc_work_done_cond;
 	blockif_resize_cb	*bc_resize_cb;
 	void			*bc_resize_cb_arg;
@@ -350,6 +348,12 @@ blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
 	(*br->br_callback)(br, err);
 }
 
+static inline bool
+blockif_empty(const struct blockif_ctxt *bc)
+{
+	return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
+}
+
 static void *
 blockif_thr(void *arg)
 {
@@ -367,30 +371,21 @@ blockif_thr(void *arg)
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	for (;;) {
-		bc->bc_work_count++;
-
-		/* We cannot process work if the interface is paused */
-		while (!bc->bc_paused && blockif_dequeue(bc, t, &be)) {
+		while (blockif_dequeue(bc, t, &be)) {
 			pthread_mutex_unlock(&bc->bc_mtx);
 			blockif_proc(bc, be, buf);
 			pthread_mutex_lock(&bc->bc_mtx);
 			blockif_complete(bc, be);
 		}
 
-		bc->bc_work_count--;
-
-		/* If none of the workers are busy, notify the main thread */
-		if (bc->bc_work_count == 0)
+		/* If none to work, notify the main thread */
+		if (blockif_empty(bc))
 			pthread_cond_broadcast(&bc->bc_work_done_cond);
 
 		/* Check ctxt status here to see if exit requested */
 		if (bc->bc_closing)
 			break;
 
-		/* Make all worker threads wait here if the device is paused */
-		while (bc->bc_paused)
-			pthread_cond_wait(&bc->bc_paused_cond, &bc->bc_mtx);
-
 		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
 	}
 	pthread_mutex_unlock(&bc->bc_mtx);
@@ -623,8 +618,6 @@ blockif_open(nvlist_t *nvl, const char *ident)
 	pthread_mutex_init(&bc->bc_mtx, NULL);
 	pthread_cond_init(&bc->bc_cond, NULL);
 	bc->bc_paused = 0;
-	bc->bc_work_count = 0;
-	pthread_cond_init(&bc->bc_paused_cond, NULL);
 	pthread_cond_init(&bc->bc_work_done_cond, NULL);
 	TAILQ_INIT(&bc->bc_freeq);
 	TAILQ_INIT(&bc->bc_pendq);
@@ -724,6 +717,7 @@ blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
 	err = 0;
 
 	pthread_mutex_lock(&bc->bc_mtx);
+	assert(!bc->bc_paused);
 	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
 		/*
 		 * Enqueue and inform the block i/o thread
@@ -748,7 +742,6 @@ blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
 int
 blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_READ));
 }
@@ -756,7 +749,6 @@ blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
 int
 blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_WRITE));
 }
@@ -764,7 +756,6 @@ blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
 int
 blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_FLUSH));
 }
@@ -772,7 +763,6 @@ blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
 int
 blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (blockif_request(bc, breq, BOP_DELETE));
 }
@@ -942,7 +932,6 @@ blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
 off_t
 blockif_size(struct blockif_ctxt *bc)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (bc->bc_size);
 }
@@ -950,7 +939,6 @@ blockif_size(struct blockif_ctxt *bc)
 int
 blockif_sectsz(struct blockif_ctxt *bc)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (bc->bc_sectsz);
 }
@@ -958,7 +946,6 @@ blockif_sectsz(struct blockif_ctxt *bc)
 void
 blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	*size = bc->bc_psectsz;
 	*off = bc->bc_psectoff;
@@ -967,7 +954,6 @@ blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
 int
 blockif_queuesz(struct blockif_ctxt *bc)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (BLOCKIF_MAXREQ - 1);
 }
@@ -975,7 +961,6 @@ blockif_queuesz(struct blockif_ctxt *bc)
 int
 blockif_is_ro(struct blockif_ctxt *bc)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (bc->bc_rdonly);
 }
@@ -983,7 +968,6 @@ blockif_is_ro(struct blockif_ctxt *bc)
 int
 blockif_candelete(struct blockif_ctxt *bc)
 {
-
 	assert(bc->bc_magic == BLOCKIF_SIG);
 	return (bc->bc_candelete);
 }
@@ -999,7 +983,7 @@ blockif_pause(struct blockif_ctxt *bc)
 	bc->bc_paused = 1;
 
 	/* The interface is paused. Wait for workers to finish their work */
-	while (bc->bc_work_count)
+	while (!blockif_empty(bc))
 		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
 	pthread_mutex_unlock(&bc->bc_mtx);
 
@@ -1016,71 +1000,6 @@ blockif_resume(struct blockif_ctxt *bc)
 
 	pthread_mutex_lock(&bc->bc_mtx);
 	bc->bc_paused = 0;
-	/* resume the threads waiting for paused */
-	pthread_cond_broadcast(&bc->bc_paused_cond);
-	/* kick the threads after restore */
-	pthread_cond_broadcast(&bc->bc_cond);
-	pthread_mutex_unlock(&bc->bc_mtx);
-}
-
-int
-blockif_snapshot_req(struct blockif_req *br, struct vm_snapshot_meta *meta)
-{
-	int i;
-	struct iovec *iov;
-	int ret;
-
-	SNAPSHOT_VAR_OR_LEAVE(br->br_iovcnt, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(br->br_offset, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(br->br_resid, meta, ret, done);
-
-	/*
-	 * XXX: The callback and parameter must be filled by the virtualized
-	 * device that uses the interface, during its init; we're not touching
-	 * them here.
-	 */
-
-	/* Snapshot the iovecs. */
-	for (i = 0; i < br->br_iovcnt; i++) {
-		iov = &br->br_iov[i];
-
-		SNAPSHOT_VAR_OR_LEAVE(iov->iov_len, meta, ret, done);
-
-		/* We assume the iov is a guest-mapped address. */
-		SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(iov->iov_base, iov->iov_len,
-			false, meta, ret, done);
-	}
-
-done:
-	return (ret);
-}
-
-int
-blockif_snapshot(struct blockif_ctxt *bc, struct vm_snapshot_meta *meta)
-{
-	int ret;
-
-	if (bc->bc_paused == 0) {
-		fprintf(stderr, "%s: Snapshot failed: "
-			"interface not paused.\r\n", __func__);
-		return (ENXIO);
-	}
-
-	pthread_mutex_lock(&bc->bc_mtx);
-
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_magic, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_ischr, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_isgeom, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_candelete, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_rdonly, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_size, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_sectsz, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectsz, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_psectoff, meta, ret, done);
-	SNAPSHOT_VAR_OR_LEAVE(bc->bc_closing, meta, ret, done);
-
-done:
 	pthread_mutex_unlock(&bc->bc_mtx);
-	return (ret);
 }
-#endif
+#endif	/* BHYVE_SNAPSHOT */
diff --git a/usr.sbin/bhyve/block_if.h b/usr.sbin/bhyve/block_if.h
index 0407ff43cf94..cf3acc05c98f 100644
--- a/usr.sbin/bhyve/block_if.h
+++ b/usr.sbin/bhyve/block_if.h
@@ -87,10 +87,6 @@ int	blockif_close(struct blockif_ctxt *bc);
 #ifdef BHYVE_SNAPSHOT
 void	blockif_pause(struct blockif_ctxt *bc);
 void	blockif_resume(struct blockif_ctxt *bc);
-int	blockif_snapshot_req(struct blockif_req *br,
-    struct vm_snapshot_meta *meta);
-int	blockif_snapshot(struct blockif_ctxt *bc,
-    struct vm_snapshot_meta *meta);
 #endif
 
 #endif /* _BLOCK_IF_H_ */
diff --git a/usr.sbin/bhyve/pci_ahci.c b/usr.sbin/bhyve/pci_ahci.c
index f63ebd842757..94bf553d7590 100644
--- a/usr.sbin/bhyve/pci_ahci.c
+++ b/usr.sbin/bhyve/pci_ahci.c
@@ -2560,114 +2560,14 @@ open_fail:
 }
 
 #ifdef BHYVE_SNAPSHOT
-static int
-pci_ahci_snapshot_save_queues(struct ahci_port *port,
-			      struct vm_snapshot_meta *meta)
-{
-	int ret;
-	int idx;
-	struct ahci_ioreq *ioreq;
-
-	STAILQ_FOREACH(ioreq, &port->iofhd, io_flist) {
-		idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq);
-		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-	}
-
-	idx = -1;
-	SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-
-	TAILQ_FOREACH(ioreq, &port->iobhd, io_blist) {
-		idx = ((void *) ioreq - (void *) port->ioreq) / sizeof(*ioreq);
-		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-
-		/*
-		 * Snapshot only the busy requests; other requests are
-		 * not valid.
-		 */
-		ret = blockif_snapshot_req(&ioreq->io_req, meta);
-		if (ret != 0) {
-			fprintf(stderr, "%s: failed to snapshot req\r\n",
-				__func__);
-			goto done;
-		}
-	}
-
-	idx = -1;
-	SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-
-done:
-	return (ret);
-}
-
-static int
-pci_ahci_snapshot_restore_queues(struct ahci_port *port,
-				 struct vm_snapshot_meta *meta)
-{
-	int ret;
-	int idx;
-	struct ahci_ioreq *ioreq;
-
-	/* Empty the free queue before restoring. */
-	while (!STAILQ_EMPTY(&port->iofhd))
-		STAILQ_REMOVE_HEAD(&port->iofhd, io_flist);
-
-	/* Restore the free queue. */
-	while (1) {
-		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-		if (idx == -1)
-			break;
-
-		STAILQ_INSERT_TAIL(&port->iofhd, &port->ioreq[idx], io_flist);
-	}
-
-	/* Restore the busy queue. */
-	while (1) {
-		SNAPSHOT_VAR_OR_LEAVE(idx, meta, ret, done);
-		if (idx == -1)
-			break;
-
-		ioreq = &port->ioreq[idx];
-		TAILQ_INSERT_TAIL(&port->iobhd, ioreq, io_blist);
-
-		/*
-		 * Restore only the busy requests; other requests are
-		 * not valid.
-		 */
-		ret = blockif_snapshot_req(&ioreq->io_req, meta);
-		if (ret != 0) {
-			fprintf(stderr, "%s: failed to restore request\r\n",
-				__func__);
-			goto done;
-		}
-
-		/* Re-enqueue the requests in the block interface. */
-		if (ioreq->readop)
-			ret = blockif_read(port->bctx, &ioreq->io_req);
-		else
-			ret = blockif_write(port->bctx, &ioreq->io_req);
-
-		if (ret != 0) {
-			fprintf(stderr,
-				"%s: failed to re-enqueue request\r\n",
-				__func__);
-			goto done;
-		}
-	}
-
-done:
-	return (ret);
-}
-
 static int
 pci_ahci_snapshot(struct vm_snapshot_meta *meta)
 {
-	int i, j, ret;
+	int i, ret;
 	void *bctx;
 	struct pci_devinst *pi;
 	struct pci_ahci_softc *sc;
 	struct ahci_port *port;
-	struct ahci_cmd_hdr *hdr;
-	struct ahci_ioreq *ioreq;
 
 	pi = meta->dev_data;
 	sc = pi->pi_arg;
@@ -2751,43 +2651,7 @@ pci_ahci_snapshot(struct vm_snapshot_meta *meta)
 		SNAPSHOT_VAR_OR_LEAVE(port->fbs, meta, ret, done);
 		SNAPSHOT_VAR_OR_LEAVE(port->ioqsz, meta, ret, done);
 
-		for (j = 0; j < port->ioqsz; j++) {
-			ioreq = &port->ioreq[j];
-
-			/* blockif_req snapshot done only for busy requests. */
-			hdr = (struct ahci_cmd_hdr *)(port->cmd_lst +
-				ioreq->slot * AHCI_CL_SIZE);
-			SNAPSHOT_GUEST2HOST_ADDR_OR_LEAVE(ioreq->cfis,
-				0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry),
-				false, meta, ret, done);
-
-			SNAPSHOT_VAR_OR_LEAVE(ioreq->len, meta, ret, done);
-			SNAPSHOT_VAR_OR_LEAVE(ioreq->done, meta, ret, done);
-			SNAPSHOT_VAR_OR_LEAVE(ioreq->slot, meta, ret, done);
-			SNAPSHOT_VAR_OR_LEAVE(ioreq->more, meta, ret, done);
-			SNAPSHOT_VAR_OR_LEAVE(ioreq->readop, meta, ret, done);
-		}
-
-		/* Perform save / restore specific operations. */
-		if (meta->op == VM_SNAPSHOT_SAVE) {
-			ret = pci_ahci_snapshot_save_queues(port, meta);
-			if (ret != 0)
-				goto done;
-		} else if (meta->op == VM_SNAPSHOT_RESTORE) {
-			ret = pci_ahci_snapshot_restore_queues(port, meta);
-			if (ret != 0)
-				goto done;
-		} else {
-			ret = EINVAL;
-			goto done;
-		}
-
-		ret = blockif_snapshot(port->bctx, meta);
-		if (ret != 0) {
-			fprintf(stderr, "%s: failed to restore blockif\r\n",
-				__func__);
-			goto done;
-		}
+		assert(TAILQ_EMPTY(&port->iobhd));
 	}
 
 done:
@@ -2833,7 +2697,7 @@ pci_ahci_resume(struct vmctx *ctx, struct pci_devinst *pi)
 
 	return (0);
 }
-#endif
+#endif	/* BHYVE_SNAPSHOT */
 
 /*
  * Use separate emulation names to distinguish drive and atapi devices