svn commit: r365614 - in stable/12: sys/dev/virtio/block usr.sbin/bhyve

Allan Jude allanjude at FreeBSD.org
Thu Sep 10 21:01:23 UTC 2020


Author: allanjude
Date: Thu Sep 10 21:01:22 2020
New Revision: 365614
URL: https://svnweb.freebsd.org/changeset/base/365614

Log:
  MFC r360229, r363255
  
  r360229:
  Add VIRTIO_BLK_T_DISCARD (TRIM) support to the bhyve virtio-blk backend
  
  This will advertise support for TRIM to the guest virtio-blk driver and
  perform the DIOCGDELETE ioctl on the backing storage if it supports it.
  
  Thanks to Jason King and others at Joyent and illumos for expanding on
  my original patch, adding improvements including better error handling
  and making sure to follow the virtio spec.
  
  r363255:
  Add VIRTIO_BLK_T_DISCARD support to the virtio-blk driver
  
  If the hypervisor advertises support for the DISCARD command then the
  guest can perform TRIM commands, freeing space on the backing store.
  
  If VIRTIO_BLK_F_DISCARD is enabled, advertise DISKFLAG_CANDELETE
  
  Tested with FreeBSD guests on bhyve and KVM
  
  Relnotes:	yes
  Sponsored by:	Klara Inc.

Modified:
  stable/12/sys/dev/virtio/block/virtio_blk.c
  stable/12/sys/dev/virtio/block/virtio_blk.h
  stable/12/usr.sbin/bhyve/block_if.c
  stable/12/usr.sbin/bhyve/pci_virtio_block.c
Directory Properties:
  stable/12/   (props changed)

Modified: stable/12/sys/dev/virtio/block/virtio_blk.c
==============================================================================
--- stable/12/sys/dev/virtio/block/virtio_blk.c	Thu Sep 10 20:54:44 2020	(r365613)
+++ stable/12/sys/dev/virtio/block/virtio_blk.c	Thu Sep 10 21:01:22 2020	(r365614)
@@ -81,6 +81,7 @@ struct vtblk_softc {
 #define VTBLK_FLAG_SUSPEND	0x0008
 #define VTBLK_FLAG_BARRIER	0x0010
 #define VTBLK_FLAG_WC_CONFIG	0x0020
+#define VTBLK_FLAG_DISCARD	0x0040
 
 	struct virtqueue	*vtblk_vq;
 	struct sglist		*vtblk_sglist;
@@ -112,6 +113,7 @@ static struct virtio_feature_desc vtblk_feature_desc[]
 	{ VIRTIO_BLK_F_WCE,		"WriteCache"	},
 	{ VIRTIO_BLK_F_TOPOLOGY,	"Topology"	},
 	{ VIRTIO_BLK_F_CONFIG_WCE,	"ConfigWCE"	},
+	{ VIRTIO_BLK_F_DISCARD,		"Discard"	},
 
 	{ 0, NULL }
 };
@@ -210,6 +212,7 @@ TUNABLE_INT("hw.vtblk.writecache_mode", &vtblk_writeca
      VIRTIO_BLK_F_WCE			| \
      VIRTIO_BLK_F_TOPOLOGY		| \
      VIRTIO_BLK_F_CONFIG_WCE		| \
+     VIRTIO_BLK_F_DISCARD		| \
      VIRTIO_RING_F_INDIRECT_DESC)
 
 #define VTBLK_MTX(_sc)		&(_sc)->vtblk_mtx
@@ -461,7 +464,7 @@ vtblk_config_change(device_t dev)
 	vtblk_read_config(sc, &blkcfg);
 
 	/* Capacity is always in 512-byte units. */
-	capacity = blkcfg.capacity * 512;
+	capacity = blkcfg.capacity * VTBLK_BSIZE;
 
 	if (sc->vtblk_disk->d_mediasize != capacity)
 		vtblk_resize_disk(sc, capacity);
@@ -546,11 +549,18 @@ vtblk_strategy(struct bio *bp)
 	 * be a better way to report our readonly'ness to GEOM above.
 	 */
 	if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
-	    (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) {
+	    (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH ||
+	    bp->bio_cmd == BIO_DELETE)) {
 		vtblk_bio_done(sc, bp, EROFS);
 		return;
 	}
 
+	if ((bp->bio_cmd != BIO_READ) && (bp->bio_cmd != BIO_WRITE) &&
+	    (bp->bio_cmd != BIO_FLUSH) && (bp->bio_cmd != BIO_DELETE)) {
+		vtblk_bio_done(sc, bp, EOPNOTSUPP);
+		return;
+	}
+
 	VTBLK_LOCK(sc);
 
 	if (sc->vtblk_flags & VTBLK_FLAG_DETACH) {
@@ -559,6 +569,13 @@ vtblk_strategy(struct bio *bp)
 		return;
 	}
 
+	if ((bp->bio_cmd == BIO_DELETE) &&
+	    !(sc->vtblk_flags & VTBLK_FLAG_DISCARD)) {
+		VTBLK_UNLOCK(sc);
+		vtblk_bio_done(sc, bp, EOPNOTSUPP);
+		return;
+	}
+
 	bioq_insert_tail(&sc->vtblk_bioq, bp);
 	vtblk_startio(sc);
 
@@ -594,6 +611,8 @@ vtblk_setup_features(struct vtblk_softc *sc)
 		sc->vtblk_flags |= VTBLK_FLAG_BARRIER;
 	if (virtio_with_feature(dev, VIRTIO_BLK_F_CONFIG_WCE))
 		sc->vtblk_flags |= VTBLK_FLAG_WC_CONFIG;
+	if (virtio_with_feature(dev, VIRTIO_BLK_F_DISCARD))
+		sc->vtblk_flags |= VTBLK_FLAG_DISCARD;
 }
 
 static int
@@ -683,12 +702,12 @@ vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio
 		dp->d_dump = vtblk_dump;
 
 	/* Capacity is always in 512-byte units. */
-	dp->d_mediasize = blkcfg->capacity * 512;
+	dp->d_mediasize = blkcfg->capacity * VTBLK_BSIZE;
 
 	if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
 		dp->d_sectorsize = blkcfg->blk_size;
 	else
-		dp->d_sectorsize = 512;
+		dp->d_sectorsize = VTBLK_BSIZE;
 
 	/*
 	 * The VirtIO maximum I/O size is given in terms of segments.
@@ -722,6 +741,11 @@ vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio
 		    dp->d_stripesize;
 	}
 
+	if (virtio_with_feature(dev, VIRTIO_BLK_F_DISCARD)) {
+		dp->d_flags |= DISKFLAG_CANDELETE;
+		dp->d_delmaxsize = blkcfg->max_discard_sectors * VTBLK_BSIZE;
+	}
+
 	if (vtblk_write_cache_enabled(sc, blkcfg) != 0)
 		sc->vtblk_write_cache = VTBLK_CACHE_WRITEBACK;
 	else
@@ -872,12 +896,16 @@ vtblk_request_bio(struct vtblk_softc *sc)
 		break;
 	case BIO_READ:
 		req->vbr_hdr.type = VIRTIO_BLK_T_IN;
-		req->vbr_hdr.sector = bp->bio_offset / 512;
+		req->vbr_hdr.sector = bp->bio_offset / VTBLK_BSIZE;
 		break;
 	case BIO_WRITE:
 		req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
-		req->vbr_hdr.sector = bp->bio_offset / 512;
+		req->vbr_hdr.sector = bp->bio_offset / VTBLK_BSIZE;
 		break;
+	case BIO_DELETE:
+		req->vbr_hdr.type = VIRTIO_BLK_T_DISCARD;
+		req->vbr_hdr.sector = bp->bio_offset / VTBLK_BSIZE;
+		break;
 	default:
 		panic("%s: bio with unhandled cmd: %d", __func__, bp->bio_cmd);
 	}
@@ -931,6 +959,20 @@ vtblk_request_execute(struct vtblk_softc *sc, struct v
 		/* BIO_READ means the host writes into our buffer. */
 		if (bp->bio_cmd == BIO_READ)
 			writable = sg->sg_nseg - 1;
+	} else if (bp->bio_cmd == BIO_DELETE) {
+		struct virtio_blk_discard_write_zeroes *discard;
+
+		discard = malloc(sizeof(*discard), M_DEVBUF, M_NOWAIT | M_ZERO);
+		if (discard == NULL)
+			return (ENOMEM);
+		discard->sector = bp->bio_offset / VTBLK_BSIZE;
+		discard->num_sectors = bp->bio_bcount / VTBLK_BSIZE;
+		bp->bio_driver1 = discard;
+		error = sglist_append(sg, discard, sizeof(*discard));
+		if (error || sg->sg_nseg == sg->sg_maxseg) {
+			panic("%s: bio %p data buffer too big %d",
+			    __func__, bp, error);
+		}
 	}
 
 	writable++;
@@ -1091,6 +1133,11 @@ vtblk_bio_done(struct vtblk_softc *sc, struct bio *bp,
 		bp->bio_flags |= BIO_ERROR;
 	}
 
+	if (bp->bio_driver1 != NULL) {
+		free(bp->bio_driver1, M_DEVBUF);
+		bp->bio_driver1 = NULL;
+	}
+
 	biodone(bp);
 }
 
@@ -1120,7 +1167,12 @@ vtblk_read_config(struct vtblk_softc *sc, struct virti
 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_GEOMETRY, geometry, blkcfg);
 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_BLK_SIZE, blk_size, blkcfg);
 	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_TOPOLOGY, topology, blkcfg);
-	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_CONFIG_WCE, writeback, blkcfg);
+	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_CONFIG_WCE, wce, blkcfg);
+	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_sectors,
+	    blkcfg);
+	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, max_discard_seg, blkcfg);
+	VTBLK_GET_CONFIG(dev, VIRTIO_BLK_F_DISCARD, discard_sector_alignment,
+	    blkcfg);
 }
 
 #undef VTBLK_GET_CONFIG
@@ -1278,7 +1330,7 @@ vtblk_dump_write(struct vtblk_softc *sc, void *virtual
 	req->vbr_ack = -1;
 	req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
 	req->vbr_hdr.ioprio = 1;
-	req->vbr_hdr.sector = offset / 512;
+	req->vbr_hdr.sector = offset / VTBLK_BSIZE;
 
 	req->vbr_bp = &buf;
 	g_reset_bio(&buf);
@@ -1327,7 +1379,7 @@ vtblk_set_write_cache(struct vtblk_softc *sc, int wc)
 
 	/* Set either writeback (1) or writethrough (0) mode. */
 	virtio_write_dev_config_1(sc->vtblk_dev,
-	    offsetof(struct virtio_blk_config, writeback), wc);
+	    offsetof(struct virtio_blk_config, wce), wc);
 }
 
 static int
@@ -1342,7 +1394,7 @@ vtblk_write_cache_enabled(struct vtblk_softc *sc,
 		if (wc >= 0 && wc < VTBLK_CACHE_MAX)
 			vtblk_set_write_cache(sc, wc);
 		else
-			wc = blkcfg->writeback;
+			wc = blkcfg->wce;
 	} else
 		wc = virtio_with_feature(sc->vtblk_dev, VIRTIO_BLK_F_WCE);
 

Modified: stable/12/sys/dev/virtio/block/virtio_blk.h
==============================================================================
--- stable/12/sys/dev/virtio/block/virtio_blk.h	Thu Sep 10 20:54:44 2020	(r365613)
+++ stable/12/sys/dev/virtio/block/virtio_blk.h	Thu Sep 10 21:01:22 2020	(r365614)
@@ -33,20 +33,27 @@
 #ifndef _VIRTIO_BLK_H
 #define _VIRTIO_BLK_H
 
+#define	VTBLK_BSIZE	512
+
 /* Feature bits */
-#define VIRTIO_BLK_F_BARRIER	0x0001	/* Does host support barriers? */
-#define VIRTIO_BLK_F_SIZE_MAX	0x0002	/* Indicates maximum segment size */
-#define VIRTIO_BLK_F_SEG_MAX	0x0004	/* Indicates maximum # of segments */
-#define VIRTIO_BLK_F_GEOMETRY	0x0010	/* Legacy geometry available  */
-#define VIRTIO_BLK_F_RO		0x0020	/* Disk is read-only */
-#define VIRTIO_BLK_F_BLK_SIZE	0x0040	/* Block size of disk is available*/
-#define VIRTIO_BLK_F_SCSI	0x0080	/* Supports scsi command passthru */
-#define VIRTIO_BLK_F_WCE	0x0200	/* Writeback mode enabled after reset */
-#define VIRTIO_BLK_F_TOPOLOGY	0x0400	/* Topology information is available */
-#define VIRTIO_BLK_F_CONFIG_WCE 0x0800	/* Writeback mode available in config */
 
-#define VIRTIO_BLK_ID_BYTES	20	/* ID string length */
+#define VIRTIO_BLK_F_BARRIER		0x0001	/* Does host support barriers? */
+#define VIRTIO_BLK_F_SIZE_MAX		0x0002	/* Indicates maximum segment size */
+#define VIRTIO_BLK_F_SEG_MAX		0x0004	/* Indicates maximum # of segments */
+#define VIRTIO_BLK_F_GEOMETRY		0x0010	/* Legacy geometry available  */
+#define VIRTIO_BLK_F_RO			0x0020	/* Disk is read-only */
+#define VIRTIO_BLK_F_BLK_SIZE		0x0040	/* Block size of disk is available*/
+#define VIRTIO_BLK_F_SCSI		0x0080	/* Supports scsi command passthru */
+#define VIRTIO_BLK_F_FLUSH		0x0200	/* Flush command supported */
+#define VIRTIO_BLK_F_WCE		0x0200	/* Legacy alias for FLUSH */
+#define VIRTIO_BLK_F_TOPOLOGY		0x0400	/* Topology information is available */
+#define VIRTIO_BLK_F_CONFIG_WCE		0x0800	/* Writeback mode available in config */
+#define VIRTIO_BLK_F_MQ			0x1000	/* Support more than one vq */
+#define VIRTIO_BLK_F_DISCARD		0x2000	/* Trim blocks */
+#define VIRTIO_BLK_F_WRITE_ZEROES	0x4000	/* Write zeros */
 
+#define VIRTIO_BLK_ID_BYTES		20	/* ID string length */
+
 struct virtio_blk_config {
 	/* The capacity (in 512-byte sectors). */
 	uint64_t capacity;
@@ -66,15 +73,29 @@ struct virtio_blk_config {
 
 	/* Topology of the device (if VIRTIO_BLK_F_TOPOLOGY) */
 	struct virtio_blk_topology {
+		/* Exponent for physical block per logical block. */
 		uint8_t physical_block_exp;
+		/* Alignment offset in logical blocks. */
 		uint8_t alignment_offset;
+		/* Minimum I/O size without performance penalty in logical
+		 * blocks. */
 		uint16_t min_io_size;
+		/* Optimal sustained I/O size in logical blocks. */
 		uint32_t opt_io_size;
 	} topology;
 
 	/* Writeback mode (if VIRTIO_BLK_F_CONFIG_WCE) */
-	uint8_t writeback;
-
+	uint8_t wce;
+	uint8_t unused;
+	/* Number of vqs, only available when VIRTIO_BLK_F_MQ is set */
+	uint16_t num_queues;
+	uint32_t max_discard_sectors;
+	uint32_t max_discard_seg;
+	uint32_t discard_sector_alignment;
+	uint32_t max_write_zeroes_sectors;
+	uint32_t max_write_zeroes_seg;
+	uint8_t write_zeroes_may_unmap;
+	uint8_t unused1[3];
 } __packed;
 
 /*
@@ -89,24 +110,35 @@ struct virtio_blk_config {
  */
 
 /* These two define direction. */
-#define VIRTIO_BLK_T_IN		0
-#define VIRTIO_BLK_T_OUT	1
+#define VIRTIO_BLK_T_IN			0
+#define VIRTIO_BLK_T_OUT		1
 
 /* This bit says it's a scsi command, not an actual read or write. */
-#define VIRTIO_BLK_T_SCSI_CMD	2
+#define VIRTIO_BLK_T_SCSI_CMD		2
+#define VIRTIO_BLK_T_SCSI_CMD_OUT	3
 
 /* Cache flush command */
-#define VIRTIO_BLK_T_FLUSH	4
+#define VIRTIO_BLK_T_FLUSH		4
+#define VIRTIO_BLK_T_FLUSH_OUT		5
 
 /* Get device ID command */
-#define VIRTIO_BLK_T_GET_ID	8
+#define VIRTIO_BLK_T_GET_ID		8
 
+/* Discard command */
+#define VIRTIO_BLK_T_DISCARD		11
+
+/* Write zeros command */
+#define VIRTIO_BLK_T_WRITE_ZEROES	13
+
 /* Barrier before this op. */
-#define VIRTIO_BLK_T_BARRIER	0x80000000
+#define VIRTIO_BLK_T_BARRIER		0x80000000
 
 /* ID string length */
-#define VIRTIO_BLK_ID_BYTES	20
+#define VIRTIO_BLK_ID_BYTES		20
 
+/* Unmap this range (only valid for write zeroes command) */
+#define VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP	0x00000001
+
 /* This is the first element of the read scatter-gather list. */
 struct virtio_blk_outhdr {
 	/* VIRTIO_BLK_T* */
@@ -115,6 +147,15 @@ struct virtio_blk_outhdr {
 	uint32_t ioprio;
 	/* Sector (ie. 512 byte offset) */
 	uint64_t sector;
+};
+
+struct virtio_blk_discard_write_zeroes {
+	uint64_t sector;
+	uint32_t num_sectors;
+	struct {
+		uint32_t unmap:1;
+		uint32_t reserved:31;
+	} flags;
 };
 
 struct virtio_scsi_inhdr {

Modified: stable/12/usr.sbin/bhyve/block_if.c
==============================================================================
--- stable/12/usr.sbin/bhyve/block_if.c	Thu Sep 10 20:54:44 2020	(r365613)
+++ stable/12/usr.sbin/bhyve/block_if.c	Thu Sep 10 21:01:22 2020	(r365614)
@@ -3,6 +3,7 @@
  *
  * Copyright (c) 2013  Peter Grehan <grehan at freebsd.org>
  * All rights reserved.
+ * Copyright 2020 Joyent, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -410,6 +411,8 @@ blockif_open(const char *optstr, const char *ident)
 	off_t size, psectsz, psectoff;
 	int extra, fd, i, sectsz;
 	int nocache, sync, ro, candelete, geom, ssopt, pssopt;
+	int nodelete;
+
 #ifndef WITHOUT_CAPSICUM
 	cap_rights_t rights;
 	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE };
@@ -422,6 +425,7 @@ blockif_open(const char *optstr, const char *ident)
 	nocache = 0;
 	sync = 0;
 	ro = 0;
+	nodelete = 0;
 
 	/*
 	 * The first element in the optstring is always a pathname.
@@ -434,6 +438,8 @@ blockif_open(const char *optstr, const char *ident)
 			continue;
 		else if (!strcmp(cp, "nocache"))
 			nocache = 1;
+		else if (!strcmp(cp, "nodelete"))
+			nodelete = 1;
 		else if (!strcmp(cp, "sync") || !strcmp(cp, "direct"))
 			sync = 1;
 		else if (!strcmp(cp, "ro"))
@@ -500,7 +506,7 @@ blockif_open(const char *optstr, const char *ident)
 			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
 		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
 		arg.len = sizeof(arg.value.i);
-		if (ioctl(fd, DIOCGATTR, &arg) == 0)
+		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
 			candelete = arg.value.i;
 		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
 			geom = 1;

Modified: stable/12/usr.sbin/bhyve/pci_virtio_block.c
==============================================================================
--- stable/12/usr.sbin/bhyve/pci_virtio_block.c	Thu Sep 10 20:54:44 2020	(r365613)
+++ stable/12/usr.sbin/bhyve/pci_virtio_block.c	Thu Sep 10 21:01:22 2020	(r365614)
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
- * Copyright (c) 2019 Joyent, Inc.
+ * Copyright 2020 Joyent, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -57,26 +57,37 @@ __FBSDID("$FreeBSD$");
 #include "virtio.h"
 #include "block_if.h"
 
-#define VTBLK_RINGSZ	128
+#define	VTBLK_BSIZE	512
+#define	VTBLK_RINGSZ	128
 
 _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request");
 
-#define VTBLK_S_OK	0
-#define VTBLK_S_IOERR	1
+#define	VTBLK_S_OK	0
+#define	VTBLK_S_IOERR	1
 #define	VTBLK_S_UNSUPP	2
 
 #define	VTBLK_BLK_ID_BYTES	20 + 1
 
 /* Capability bits */
-#define	VTBLK_F_SEG_MAX		(1 << 2)	/* Maximum request segments */
-#define	VTBLK_F_BLK_SIZE	(1 << 6)	/* cfg block size valid */
-#define	VTBLK_F_FLUSH		(1 << 9)	/* Cache flush support */
-#define	VTBLK_F_TOPOLOGY	(1 << 10)	/* Optimal I/O alignment */
+#define	VTBLK_F_BARRIER		(1 << 0)	/* Does host support barriers? */
+#define	VTBLK_F_SIZE_MAX	(1 << 1)	/* Indicates maximum segment size */
+#define	VTBLK_F_SEG_MAX		(1 << 2)	/* Indicates maximum # of segments */
+#define	VTBLK_F_GEOMETRY	(1 << 4)	/* Legacy geometry available  */
+#define	VTBLK_F_RO		(1 << 5)	/* Disk is read-only */
+#define	VTBLK_F_BLK_SIZE	(1 << 6)	/* Block size of disk is available*/
+#define	VTBLK_F_SCSI		(1 << 7)	/* Supports scsi command passthru */
+#define	VTBLK_F_FLUSH		(1 << 9)	/* Flush command supported */
+#define	VTBLK_F_WCE		(1 << 9)	/* Legacy alias for FLUSH */
+#define	VTBLK_F_TOPOLOGY	(1 << 10)	/* Topology information is available */
+#define	VTBLK_F_CONFIG_WCE	(1 << 11)	/* Writeback mode available in config */
+#define	VTBLK_F_MQ		(1 << 12)	/* Multi-Queue */
+#define	VTBLK_F_DISCARD		(1 << 13)	/* Trim blocks */
+#define	VTBLK_F_WRITE_ZEROES	(1 << 14)	/* Write zeros */
 
 /*
  * Host capabilities
  */
-#define VTBLK_S_HOSTCAPS      \
+#define	VTBLK_S_HOSTCAPS      \
   ( VTBLK_F_SEG_MAX  |						    \
     VTBLK_F_BLK_SIZE |						    \
     VTBLK_F_FLUSH    |						    \
@@ -84,6 +95,18 @@ _Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each
     VIRTIO_RING_F_INDIRECT_DESC )	/* indirect descriptors */
 
 /*
+ * The current blockif_delete() interface only allows a single delete
+ * request at a time.
+ */
+#define	VTBLK_MAX_DISCARD_SEG	1
+
+/*
+ * An arbitrary limit to prevent excessive latency due to large
+ * delete requests.
+ */
+#define	VTBLK_MAX_DISCARD_SECT	((16 << 20) / VTBLK_BSIZE)	/* 16 MiB */
+
+/*
  * Config space "registers"
  */
 struct vtblk_config {
@@ -103,6 +126,15 @@ struct vtblk_config {
 		uint32_t opt_io_size;
 	} vbc_topology;
 	uint8_t		vbc_writeback;
+	uint8_t		unused0[1];
+	uint16_t	num_queues;
+	uint32_t	max_discard_sectors;
+	uint32_t	max_discard_seg;
+	uint32_t	discard_sector_alignment;
+	uint32_t	max_write_zeroes_sectors;
+	uint32_t	max_write_zeroes_seg;
+	uint8_t		write_zeroes_may_unmap;
+	uint8_t		unused1[3];
 } __packed;
 
 /*
@@ -111,9 +143,14 @@ struct vtblk_config {
 struct virtio_blk_hdr {
 #define	VBH_OP_READ		0
 #define	VBH_OP_WRITE		1
+#define	VBH_OP_SCSI_CMD		2
+#define	VBH_OP_SCSI_CMD_OUT	3
 #define	VBH_OP_FLUSH		4
 #define	VBH_OP_FLUSH_OUT	5
 #define	VBH_OP_IDENT		8
+#define	VBH_OP_DISCARD		11
+#define	VBH_OP_WRITE_ZEROES	13
+
 #define	VBH_FLAG_BARRIER	0x80000000	/* OR'ed into vbh_type */
 	uint32_t	vbh_type;
 	uint32_t	vbh_ioprio;
@@ -124,8 +161,8 @@ struct virtio_blk_hdr {
  * Debug printf
  */
 static int pci_vtblk_debug;
-#define DPRINTF(params) if (pci_vtblk_debug) PRINTLN params
-#define WPRINTF(params) PRINTLN params
+#define	DPRINTF(params) if (pci_vtblk_debug) PRINTLN params
+#define	WPRINTF(params) PRINTLN params
 
 struct pci_vtblk_ioreq {
 	struct blockif_req		io_req;
@@ -134,6 +171,15 @@ struct pci_vtblk_ioreq {
 	uint16_t			io_idx;
 };
 
+struct virtio_blk_discard_write_zeroes {
+	uint64_t	sector;
+	uint32_t	num_sectors;
+	struct {
+		uint32_t unmap:1;
+		uint32_t reserved:31;
+	} flags;
+};
+
 /*
  * Per-device softc
  */
@@ -142,6 +188,7 @@ struct pci_vtblk_softc {
 	pthread_mutex_t vsc_mtx;
 	struct vqueue_info vbsc_vq;
 	struct vtblk_config vbsc_cfg;
+	struct virtio_consts vbsc_consts;
 	struct blockif_ctxt *bc;
 	char vbsc_ident[VTBLK_BLK_ID_BYTES];
 	struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ];
@@ -174,9 +221,8 @@ pci_vtblk_reset(void *vsc)
 }
 
 static void
-pci_vtblk_done(struct blockif_req *br, int err)
+pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err)
 {
-	struct pci_vtblk_ioreq *io = br->br_param;
 	struct pci_vtblk_softc *sc = io->io_sc;
 
 	/* convert errno into a virtio block error return */
@@ -191,9 +237,18 @@ pci_vtblk_done(struct blockif_req *br, int err)
 	 * Return the descriptor back to the host.
 	 * We wrote 1 byte (our status) to host.
 	 */
-	pthread_mutex_lock(&sc->vsc_mtx);
 	vq_relchain(&sc->vbsc_vq, io->io_idx, 1);
 	vq_endchains(&sc->vbsc_vq, 0);
+}
+
+static void
+pci_vtblk_done(struct blockif_req *br, int err)
+{
+	struct pci_vtblk_ioreq *io = br->br_param;
+	struct pci_vtblk_softc *sc = io->io_sc;
+
+	pthread_mutex_lock(&sc->vsc_mtx);
+	pci_vtblk_done_locked(io, err);
 	pthread_mutex_unlock(&sc->vsc_mtx);
 }
 
@@ -208,6 +263,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vque
 	int writeop, type;
 	struct iovec iov[BLOCKIF_IOV_MAX + 2];
 	uint16_t idx, flags[BLOCKIF_IOV_MAX + 2];
+	struct virtio_blk_discard_write_zeroes *discard;
 
 	n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags);
 
@@ -224,11 +280,11 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vque
 	io = &sc->vbsc_ios[idx];
 	assert((flags[0] & VRING_DESC_F_WRITE) == 0);
 	assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
-	vbh = iov[0].iov_base;
+	vbh = (struct virtio_blk_hdr *)iov[0].iov_base;
 	memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2));
 	io->io_req.br_iovcnt = n - 2;
-	io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE;
-	io->io_status = iov[--n].iov_base;
+	io->io_req.br_offset = vbh->vbh_sector * VTBLK_BSIZE;
+	io->io_status = (uint8_t *)iov[--n].iov_base;
 	assert(iov[n].iov_len == 1);
 	assert(flags[n] & VRING_DESC_F_WRITE);
 
@@ -238,7 +294,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vque
 	 * we don't advertise the capability.
 	 */
 	type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
-	writeop = (type == VBH_OP_WRITE);
+	writeop = (type == VBH_OP_WRITE || type == VBH_OP_DISCARD);
 
 	iolen = 0;
 	for (i = 1; i < n; i++) {
@@ -254,7 +310,7 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vque
 	io->io_req.br_resid = iolen;
 
 	DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld",
-		 writeop ? "write" : "read/ident", iolen, i - 1,
+		 writeop ? "write/discard" : "read/ident", iolen, i - 1,
 		 io->io_req.br_offset));
 
 	switch (type) {
@@ -264,6 +320,46 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vque
 	case VBH_OP_WRITE:
 		err = blockif_write(sc->bc, &io->io_req);
 		break;
+	case VBH_OP_DISCARD:
+		/*
+		 * We currently only support a single request, if the guest
+		 * has submitted a request that doesn't conform to the
+		 * requirements, we return an error.
+		 */
+		if (iov[1].iov_len != sizeof (*discard)) {
+			pci_vtblk_done_locked(io, EINVAL);
+			return;
+		}
+
+		/* The segments to discard are provided rather than data */
+		discard = (struct virtio_blk_discard_write_zeroes *)
+		    iov[1].iov_base;
+
+		/*
+		 * virtio v1.1 5.2.6.2:
+		 * The device MUST set the status byte to VIRTIO_BLK_S_UNSUPP
+		 * for discard and write zeroes commands if any unknown flag is
+		 * set. Furthermore, the device MUST set the status byte to
+		 * VIRTIO_BLK_S_UNSUPP for discard commands if the unmap flag
+		 * is set.
+		 *
+		 * Currently there are no known flags for a DISCARD request.
+		 */
+		if (discard->flags.unmap != 0 || discard->flags.reserved != 0) {
+			pci_vtblk_done_locked(io, ENOTSUP);
+			return;
+		}
+
+		/* Make sure the request doesn't exceed our size limit */
+		if (discard->num_sectors > VTBLK_MAX_DISCARD_SECT) {
+			pci_vtblk_done_locked(io, EINVAL);
+			return;
+		}
+
+		io->io_req.br_offset = discard->sector * VTBLK_BSIZE;
+		io->io_req.br_resid = discard->num_sectors * VTBLK_BSIZE;
+		err = blockif_delete(sc->bc, &io->io_req);
+		break;
 	case VBH_OP_FLUSH:
 	case VBH_OP_FLUSH_OUT:
 		err = blockif_flush(sc->bc, &io->io_req);
@@ -274,10 +370,10 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vque
 		memset(iov[1].iov_base, 0, iov[1].iov_len);
 		strncpy(iov[1].iov_base, sc->vbsc_ident,
 		    MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
-		pci_vtblk_done(&io->io_req, 0);
+		pci_vtblk_done_locked(io, 0);
 		return;
 	default:
-		pci_vtblk_done(&io->io_req, EOPNOTSUPP);
+		pci_vtblk_done_locked(io, EOPNOTSUPP);
 		return;
 	}
 	assert(err == 0);
@@ -332,10 +428,14 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *
 		io->io_idx = i;
 	}
 
+	bcopy(&vtblk_vi_consts, &sc->vbsc_consts, sizeof (vtblk_vi_consts));
+	if (blockif_candelete(sc->bc))
+		sc->vbsc_consts.vc_hv_caps |= VTBLK_F_DISCARD;
+
 	pthread_mutex_init(&sc->vsc_mtx, NULL);
 
 	/* init virtio softc and virtqueues */
-	vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq);
+	vi_softc_linkup(&sc->vbsc_vs, &sc->vbsc_consts, sc, pi, &sc->vbsc_vq);
 	sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
 
 	sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
@@ -353,7 +453,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *
 	    digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);
 
 	/* setup virtio block config space */
-	sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */
+	sc->vbsc_cfg.vbc_capacity = size / VTBLK_BSIZE; /* 512-byte units */
 	sc->vbsc_cfg.vbc_size_max = 0;	/* not negotiated */
 
 	/*
@@ -375,6 +475,9 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *
 	sc->vbsc_cfg.vbc_topology.min_io_size = 0;
 	sc->vbsc_cfg.vbc_topology.opt_io_size = 0;
 	sc->vbsc_cfg.vbc_writeback = 0;
+	sc->vbsc_cfg.max_discard_sectors = VTBLK_MAX_DISCARD_SECT;
+	sc->vbsc_cfg.max_discard_seg = VTBLK_MAX_DISCARD_SEG;
+	sc->vbsc_cfg.discard_sector_alignment = sectsz / VTBLK_BSIZE;
 
 	/*
 	 * Should we move some of this into virtio.c?  Could


More information about the svn-src-all mailing list