git: 9db44a8e5da9 - main - zfs: merge OpenZFS master-9305ff2ed
Martin Matuska
mm at FreeBSD.org
Sun Mar 14 01:38:30 UTC 2021
The branch main has been updated by mm:
URL: https://cgit.FreeBSD.org/src/commit/?id=9db44a8e5da9bf1ce6dd1c0f1468ddafed6d6c91
commit 9db44a8e5da9bf1ce6dd1c0f1468ddafed6d6c91
Merge: a9275d996c22 9162a1ce3ae9
Author: Martin Matuska <mm at FreeBSD.org>
AuthorDate: 2021-03-14 01:23:51 +0000
Commit: Martin Matuska <mm at FreeBSD.org>
CommitDate: 2021-03-14 01:32:14 +0000
zfs: merge OpenZFS master-9305ff2ed
Notable upstream pull request merges:
#11153 Scalable teardown lock for FreeBSD
#11651 Don't bomb out when using keylocation=file://
#11667 zvol: call zil_replaying() during replay
#11683 abd_get_offset_struct() may allocate new abd
#11693 Intentionally allow ZFS_READONLY in zfs_write
#11716 zpool import cachefile improvements
#11720 FreeBSD: Clean up zfsdev_close to match Linux
#11730 FreeBSD: bring back possibility to rewind the
checkpoint from bootloader
Obtained from: OpenZFS
MFC after: 2 weeks
.../openzfs/.github/workflows/checkstyle.yaml | 2 +-
sys/contrib/openzfs/cmd/vdev_id/vdev_id | 9 +-
sys/contrib/openzfs/cmd/zpool/zpool_main.c | 307 +++++++++++++--------
sys/contrib/openzfs/cmd/zstream/zstream_redup.c | 1 +
sys/contrib/openzfs/config/zfs-build.m4 | 36 +++
sys/contrib/openzfs/configure.ac | 1 +
.../openzfs/include/os/freebsd/spl/sys/Makefile.am | 3 +
.../openzfs/include/os/freebsd/spl/sys/debug.h | 80 +++---
.../include/os/freebsd/zfs/sys/zfs_vfsops_os.h | 2 +-
.../openzfs/include/os/linux/spl/sys/debug.h | 78 +++---
.../include/os/linux/zfs/sys/zfs_vfsops_os.h | 33 +++
.../include/os/linux/zfs/sys/zfs_znode_impl.h | 4 +-
sys/contrib/openzfs/include/sys/dmu_redact.h | 2 +
sys/contrib/openzfs/include/sys/zfs_ioctl.h | 1 -
sys/contrib/openzfs/lib/libzfs/libzfs_crypto.c | 10 +-
sys/contrib/openzfs/lib/libzfs/libzfs_mount.c | 25 +-
.../openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c | 5 +-
.../openzfs/lib/libzfs/os/linux/libzfs_mount_os.c | 6 +-
sys/contrib/openzfs/lib/libzutil/zutil_import.c | 177 +++++++++---
sys/contrib/openzfs/man/man8/zfs-receive.8 | 10 +
sys/contrib/openzfs/man/man8/zfs-send.8 | 7 +-
sys/contrib/openzfs/module/Makefile.in | 5 +
.../openzfs/module/os/freebsd/zfs/kmod_core.c | 18 +-
.../openzfs/module/os/freebsd/zfs/zfs_dir.c | 2 -
.../openzfs/module/os/freebsd/zfs/zvol_os.c | 9 +-
sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c | 26 +-
.../openzfs/module/os/linux/zfs/zio_crypt.c | 1 +
sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c | 102 +++++--
sys/contrib/openzfs/module/zcommon/zfs_prop.c | 2 +-
sys/contrib/openzfs/module/zfs/abd.c | 6 +-
sys/contrib/openzfs/module/zfs/spa_misc.c | 4 +-
sys/contrib/openzfs/module/zfs/zfs_vnops.c | 6 +-
sys/contrib/openzfs/module/zfs/zvol.c | 15 +-
sys/contrib/openzfs/tests/runfiles/common.run | 1 +
.../openzfs/tests/zfs-tests/include/commands.cfg | 2 +-
.../openzfs/tests/zfs-tests/include/libtest.shlib | 19 +-
.../openzfs/tests/zfs-tests/include/tunables.cfg | 8 +-
.../functional/cli_root/zpool/zpool_002_pos.ksh | 37 ++-
.../functional/cli_root/zpool/zpool_003_pos.ksh | 39 ++-
.../functional/cli_root/zpool_import/Makefile.am | 1 +
.../import_cachefile_paths_changed.ksh | 117 ++++++++
.../tests/functional/events/events_002_pos.ksh | 7 +-
.../tests/functional/xattr/xattr_003_neg.ksh | 44 +--
sys/modules/zfs/zfs_config.h | 4 +-
44 files changed, 896 insertions(+), 378 deletions(-)
diff --cc sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index 2389b1a06355,000000000000..ba315f104738
mode 100644,000000..100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@@ -1,1525 -1,0 +1,1532 @@@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Portions Copyright 2010 Robert Milkowski
+ *
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2011 Martin Matuska <mm at FreeBSD.org> */
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ * /dev/zvol/<pool_name>/<dataset_name>
+ *
+ * Volumes are persistent through reboot. No user command needs to be
+ * run before opening and using a device.
+ *
+ * On FreeBSD, ZVOLs are simply GEOM providers like any other storage device
+ * in the system, except when they're simply character devices (volmode=dev).
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/errno.h>
+#include <sys/uio.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/cmn_err.h>
+#include <sys/stat.h>
+#include <sys/proc.h>
+#include <sys/zap.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/disk.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dnode.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_dir.h>
+#include <sys/byteorder.h>
+#include <sys/sunddi.h>
+#include <sys/dirent.h>
+#include <sys/policy.h>
+#include <sys/queue.h>
+#include <sys/fs/zfs.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/zil.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_rlock.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
+#include <sys/zvol.h>
+#include <sys/zil_impl.h>
+#include <sys/dataset_kstats.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
+#include <sys/zil_impl.h>
+#include <sys/filio.h>
+
+#include <geom/geom.h>
+#include <sys/zvol.h>
+#include <sys/zvol_impl.h>
+
+#include "zfs_namecheck.h"
+
+#define ZVOL_DUMPSIZE "dumpsize"
+
+#ifdef ZVOL_LOCK_DEBUG
+#define ZVOL_RW_READER RW_WRITER
+#define ZVOL_RW_READ_HELD RW_WRITE_HELD
+#else
+#define ZVOL_RW_READER RW_READER
+#define ZVOL_RW_READ_HELD RW_READ_HELD
+#endif
+
+enum zvol_geom_state {
+ ZVOL_GEOM_UNINIT,
+ ZVOL_GEOM_STOPPED,
+ ZVOL_GEOM_RUNNING,
+};
+
+struct zvol_state_os {
+#define zso_dev _zso_state._zso_dev
+#define zso_geom _zso_state._zso_geom
+ union {
+ /* volmode=dev */
+ struct zvol_state_dev {
+ struct cdev *zsd_cdev;
+ uint64_t zsd_sync_cnt;
+ } _zso_dev;
+
+ /* volmode=geom */
+ struct zvol_state_geom {
+ struct g_provider *zsg_provider;
+ struct bio_queue_head zsg_queue;
+ struct mtx zsg_queue_mtx;
+ enum zvol_geom_state zsg_state;
+ } _zso_geom;
+ } _zso_state;
+ int zso_dying;
+};
+
+static uint32_t zvol_minors;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
+ "Expose as GEOM providers (1), device files (2) or neither");
+static boolean_t zpool_on_zvol = B_FALSE;
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
+ "Allow zpools to use zvols as vdevs (DANGEROUS)");
+
+/*
+ * Toggle unmap functionality.
+ */
+boolean_t zvol_unmap_enabled = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
+ &zvol_unmap_enabled, 0, "Enable UNMAP functionality");
+
+/*
+ * zvol maximum transfer in one DMU tx.
+ */
+int zvol_maxphys = DMU_MAX_ACCESS / 2;
+
+static void zvol_ensure_zilog(zvol_state_t *zv);
+
+static d_open_t zvol_cdev_open;
+static d_close_t zvol_cdev_close;
+static d_ioctl_t zvol_cdev_ioctl;
+static d_read_t zvol_cdev_read;
+static d_write_t zvol_cdev_write;
+static d_strategy_t zvol_geom_bio_strategy;
+
+static struct cdevsw zvol_cdevsw = {
+ .d_name = "zvol",
+ .d_version = D_VERSION,
+ .d_flags = D_DISK | D_TRACKCLOSE,
+ .d_open = zvol_cdev_open,
+ .d_close = zvol_cdev_close,
+ .d_ioctl = zvol_cdev_ioctl,
+ .d_read = zvol_cdev_read,
+ .d_write = zvol_cdev_write,
+ .d_strategy = zvol_geom_bio_strategy,
+};
+
+extern uint_t zfs_geom_probe_vdev_key;
+
+struct g_class zfs_zvol_class = {
+ .name = "ZFS::ZVOL",
+ .version = G_VERSION,
+};
+
+DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+
+static int zvol_geom_open(struct g_provider *pp, int flag, int count);
+static int zvol_geom_close(struct g_provider *pp, int flag, int count);
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_worker(void *arg);
+static void zvol_geom_bio_start(struct bio *bp);
+static int zvol_geom_bio_getattr(struct bio *bp);
+/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */
+
+/*
+ * GEOM mode implementation
+ */
+
+/*ARGSUSED*/
+static int
+zvol_geom_open(struct g_provider *pp, int flag, int count)
+{
+ zvol_state_t *zv;
+ int err = 0;
+ boolean_t drop_suspend = B_FALSE;
+ boolean_t drop_namespace = B_FALSE;
+
+ if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
+ /*
+ * If zfs_geom_probe_vdev_key is set, that means that zfs is
+ * attempting to probe geom providers while looking for a
+ * replacement for a missing VDEV. In this case, the
+ * spa_namespace_lock will not be held, but it is still illegal
+ * to use a zvol as a vdev. Deadlocks can result if another
+ * thread has spa_namespace_lock.
+ */
+ return (SET_ERROR(EOPNOTSUPP));
+ }
+
+retry:
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = pp->private;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
+ }
+
+ if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+ /*
+ * We need to guarantee that the namespace lock is held
+ * to avoid spurious failures in zvol_first_open.
+ */
+ drop_namespace = B_TRUE;
+ if (!mutex_tryenter(&spa_namespace_lock)) {
+ rw_exit(&zvol_state_lock);
+ mutex_enter(&spa_namespace_lock);
+ goto retry;
+ }
+ }
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_zso->zso_dying) {
+ rw_exit(&zvol_state_lock);
+ err = SET_ERROR(ENXIO);
+ goto out_zv_locked;
+ }
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ drop_suspend = B_TRUE;
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ err = zvol_first_open(zv, !(flag & FWRITE));
+ if (err)
+ goto out_zv_locked;
+ pp->mediasize = zv->zv_volsize;
+ pp->stripeoffset = 0;
+ pp->stripesize = zv->zv_volblocksize;
+ }
+
+ /*
+ * Check for a bad on-disk format version now since we
+ * lied about owning the dataset readonly before.
+ */
+ if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
+ dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
+ err = SET_ERROR(EROFS);
+ goto out_opened;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
+ }
+#ifdef FEXCL
+ if (flag & FEXCL) {
+ if (zv->zv_open_count != 0) {
+ err = SET_ERROR(EBUSY);
+ goto out_opened;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
+ }
+#endif
+
+ zv->zv_open_count += count;
+out_opened:
+ if (zv->zv_open_count == 0) {
+ zvol_last_close(zv);
+ wakeup(zv);
+ }
+out_zv_locked:
+ mutex_exit(&zv->zv_state_lock);
+out_locked:
+ if (drop_namespace)
+ mutex_exit(&spa_namespace_lock);
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (err);
+}
+
+/*ARGSUSED*/
+static int
+zvol_geom_close(struct g_provider *pp, int flag, int count)
+{
+ zvol_state_t *zv;
+ boolean_t drop_suspend = B_TRUE;
+ int new_open_count;
+
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = pp->private;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ mutex_enter(&zv->zv_state_lock);
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT3U(zv->zv_open_count, ==, 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+ ASSERT3U(zv->zv_open_count, >, 0);
+
+ /*
+ * make sure zvol is not suspended during last close
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ new_open_count = zv->zv_open_count - count;
+ if (new_open_count == 0) {
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ new_open_count = zv->zv_open_count - count;
+ if (new_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ } else {
+ drop_suspend = B_FALSE;
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+ zv->zv_open_count = new_open_count;
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ zvol_last_close(zv);
+ wakeup(zv);
+ }
+
+ mutex_exit(&zv->zv_state_lock);
+
+ if (drop_suspend)
+ rw_exit(&zv->zv_suspend_lock);
+ return (0);
+}
+
+static void
+zvol_geom_run(zvol_state_t *zv)
+{
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ g_error_provider(pp, 0);
+
+ kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
+ "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
+}
+
+static void
+zvol_geom_destroy(zvol_state_t *zv)
+{
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct g_provider *pp = zsg->zsg_provider;
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ g_topology_assert();
+
+ mutex_enter(&zv->zv_state_lock);
+ VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
+ mutex_exit(&zv->zv_state_lock);
+ zsg->zsg_provider = NULL;
+ g_wither_geom(pp->geom, ENXIO);
+}
+
+void
+zvol_wait_close(zvol_state_t *zv)
+{
+
+ if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
+ return;
+ mutex_enter(&zv->zv_state_lock);
+ zv->zv_zso->zso_dying = B_TRUE;
+
+ if (zv->zv_open_count)
+ msleep(zv, &zv->zv_state_lock,
+ PRIBIO, "zvol:dying", 10*hz);
+ mutex_exit(&zv->zv_state_lock);
+}
+
+
+static int
+zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
+{
+ int count, error, flags;
+
+ g_topology_assert();
+
+ /*
+ * To make it easier we expect either open or close, but not both
+ * at the same time.
+ */
+ KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
+ (acr <= 0 && acw <= 0 && ace <= 0),
+ ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
+ pp->name, acr, acw, ace));
+
+ if (pp->private == NULL) {
+ if (acr <= 0 && acw <= 0 && ace <= 0)
+ return (0);
+ return (pp->error);
+ }
+
+ /*
+ * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
+ * ace != 0, because GEOM already handles that and handles it a bit
+ * differently. GEOM allows for multiple read/exclusive consumers and
+ * ZFS allows only one exclusive consumer, no matter if it is reader or
+ * writer. I prefer the way GEOM works, so I'll leave it to GEOM
+ * to decide what to do.
+ */
+
+ count = acr + acw + ace;
+ if (count == 0)
+ return (0);
+
+ flags = 0;
+ if (acr != 0 || ace != 0)
+ flags |= FREAD;
+ if (acw != 0)
+ flags |= FWRITE;
+
+ g_topology_unlock();
+ if (count > 0)
+ error = zvol_geom_open(pp, flags, count);
+ else
+ error = zvol_geom_close(pp, flags, -count);
+ g_topology_lock();
+ return (error);
+}
+
+static void
+zvol_geom_worker(void *arg)
+{
+ zvol_state_t *zv = arg;
+ struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+ struct bio *bp;
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
+
+ thread_lock(curthread);
+ sched_prio(curthread, PRIBIO);
+ thread_unlock(curthread);
+
+ for (;;) {
+ mtx_lock(&zsg->zsg_queue_mtx);
+ bp = bioq_takefirst(&zsg->zsg_queue);
+ if (bp == NULL) {
+ if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
+ zsg->zsg_state = ZVOL_GEOM_RUNNING;
+ wakeup(&zsg->zsg_state);
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ kthread_exit();
+ }
+ msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
+ PRIBIO | PDROP, "zvol:io", 0);
+ continue;
+ }
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ zvol_geom_bio_strategy(bp);
+ }
+}
+
+static void
+zvol_geom_bio_start(struct bio *bp)
+{
+ zvol_state_t *zv = bp->bio_to->private;
+ struct zvol_state_geom *zsg;
+ boolean_t first;
+
+ if (zv == NULL) {
+ g_io_deliver(bp, ENXIO);
+ return;
+ }
+ if (bp->bio_cmd == BIO_GETATTR) {
+ if (zvol_geom_bio_getattr(bp))
+ g_io_deliver(bp, EOPNOTSUPP);
+ return;
+ }
+
+ if (!THREAD_CAN_SLEEP()) {
+ zsg = &zv->zv_zso->zso_geom;
+ mtx_lock(&zsg->zsg_queue_mtx);
+ first = (bioq_first(&zsg->zsg_queue) == NULL);
+ bioq_insert_tail(&zsg->zsg_queue, bp);
+ mtx_unlock(&zsg->zsg_queue_mtx);
+ if (first)
+ wakeup_one(&zsg->zsg_queue);
+ return;
+ }
+
+ zvol_geom_bio_strategy(bp);
+}
+
+static int
+zvol_geom_bio_getattr(struct bio *bp)
+{
+ zvol_state_t *zv;
+
+ zv = bp->bio_to->private;
+ ASSERT3P(zv, !=, NULL);
+
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+ uint64_t refd, avail, usedobjs, availobjs;
+
+ if (g_handleattr_int(bp, "GEOM::candelete", 1))
+ return (0);
+ if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
+ avail = metaslab_class_get_space(spa_normal_class(spa));
+ avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksavail",
+ avail / DEV_BSIZE))
+ return (0);
+ } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
+ refd = metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
+ return (0);
+ }
+ return (1);
+}
+
+static void
+zvol_geom_bio_strategy(struct bio *bp)
+{
+ zvol_state_t *zv;
+ uint64_t off, volsize;
+ size_t resid;
+ char *addr;
+ objset_t *os;
+ zfs_locked_range_t *lr;
+ int error = 0;
+ boolean_t doread = B_FALSE;
+ boolean_t is_dumpified;
+ boolean_t sync;
+
+ if (bp->bio_to)
+ zv = bp->bio_to->private;
+ else
+ zv = bp->bio_dev->si_drv2;
+
+ if (zv == NULL) {
+ error = SET_ERROR(ENXIO);
+ goto out;
+ }
+
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+
+ switch (bp->bio_cmd) {
+ case BIO_READ:
+ doread = B_TRUE;
+ break;
+ case BIO_WRITE:
+ case BIO_FLUSH:
+ case BIO_DELETE:
+ if (zv->zv_flags & ZVOL_RDONLY) {
+ error = SET_ERROR(EROFS);
+ goto resume;
+ }
+ zvol_ensure_zilog(zv);
+ if (bp->bio_cmd == BIO_FLUSH)
+ goto sync;
+ break;
+ default:
+ error = SET_ERROR(EOPNOTSUPP);
+ goto resume;
+ }
+
+ off = bp->bio_offset;
+ volsize = zv->zv_volsize;
+
+ os = zv->zv_objset;
+ ASSERT3P(os, !=, NULL);
+
+ addr = bp->bio_data;
+ resid = bp->bio_length;
+
+ if (resid > 0 && off >= volsize) {
+ error = SET_ERROR(EIO);
+ goto resume;
+ }
+
+ is_dumpified = B_FALSE;
+ sync = !doread && !is_dumpified &&
+ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+
+ /*
+ * There must be no buffer changes when doing a dmu_sync() because
+ * we can't change the data whilst calculating the checksum.
+ */
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
+ doread ? RL_READER : RL_WRITER);
+
+ if (bp->bio_cmd == BIO_DELETE) {
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ zvol_log_truncate(zv, tx, off, resid, sync);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+ off, resid);
+ resid = 0;
+ }
+ goto unlock;
+ }
+ while (resid != 0 && off < volsize) {
+ size_t size = MIN(resid, zvol_maxphys);
+ if (doread) {
+ error = dmu_read(os, ZVOL_OBJ, off, size, addr,
+ DMU_READ_PREFETCH);
+ } else {
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ } else {
+ dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
+ zvol_log_write(zv, tx, off, size, sync);
+ dmu_tx_commit(tx);
+ }
+ }
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ off += size;
+ addr += size;
+ resid -= size;
+ }
+unlock:
+ zfs_rangelock_exit(lr);
+
+ bp->bio_completed = bp->bio_length - resid;
+ if (bp->bio_completed < bp->bio_length && off > volsize)
+ error = SET_ERROR(EINVAL);
+
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ break;
+ case BIO_READ:
+ dataset_kstats_update_read_kstats(&zv->zv_kstat,
+ bp->bio_completed);
+ break;
+ case BIO_WRITE:
+ dataset_kstats_update_write_kstats(&zv->zv_kstat,
+ bp->bio_completed);
+ break;
+ case BIO_DELETE:
+ break;
+ default:
+ break;
+ }
+
+ if (sync) {
+sync:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ }
+resume:
+ rw_exit(&zv->zv_suspend_lock);
+out:
+ if (bp->bio_to)
+ g_io_deliver(bp, error);
+ else
+ biofinish(bp, NULL, error);
+}
+
+/*
+ * Character device mode implementation
+ */
+
+static int
+zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
+{
+ zvol_state_t *zv;
+ uint64_t volsize;
+ zfs_locked_range_t *lr;
+ int error = 0;
+ zfs_uio_t uio;
+
+ zfs_uio_init(&uio, uio_s);
+
+ zv = dev->si_drv2;
+
+ volsize = zv->zv_volsize;
+ /*
+ * uio_loffset == volsize isn't an error as
+ * it's required for EOF processing.
+ */
+ if (zfs_uio_resid(&uio) > 0 &&
+ (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
+ return (SET_ERROR(EIO));
+
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
+ zfs_uio_resid(&uio), RL_READER);
+ while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
+ uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
+
+ /* don't read past the end */
+ if (bytes > volsize - zfs_uio_offset(&uio))
+ bytes = volsize - zfs_uio_offset(&uio);
+
+ error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
+ if (error) {
+ /* convert checksum errors into IO errors */
+ if (error == ECKSUM)
+ error = SET_ERROR(EIO);
+ break;
+ }
+ }
+ zfs_rangelock_exit(lr);
+
+ return (error);
+}
+
+static int
+zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
+{
+ zvol_state_t *zv;
+ uint64_t volsize;
+ zfs_locked_range_t *lr;
+ int error = 0;
+ boolean_t sync;
+ zfs_uio_t uio;
+
+ zv = dev->si_drv2;
+
+ volsize = zv->zv_volsize;
+
+ zfs_uio_init(&uio, uio_s);
+
+ if (zfs_uio_resid(&uio) > 0 &&
+ (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
+ return (SET_ERROR(EIO));
+
+ sync = (ioflag & IO_SYNC) ||
+ (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ zvol_ensure_zilog(zv);
+
+ lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
+ zfs_uio_resid(&uio), RL_WRITER);
+ while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
+ uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
+ uint64_t off = zfs_uio_offset(&uio);
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+
+ if (bytes > volsize - off) /* don't write past the end */
+ bytes = volsize - off;
+
+ dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
+ dmu_tx_abort(tx);
+ break;
+ }
+ error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
+ if (error == 0)
+ zvol_log_write(zv, tx, off, bytes, sync);
+ dmu_tx_commit(tx);
+
+ if (error)
+ break;
+ }
+ zfs_rangelock_exit(lr);
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ rw_exit(&zv->zv_suspend_lock);
+ return (error);
+}
+
+static int
+zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ zvol_state_t *zv;
+ struct zvol_state_dev *zsd;
+ int err = 0;
+ boolean_t drop_suspend = B_FALSE;
+ boolean_t drop_namespace = B_FALSE;
+
+retry:
+ rw_enter(&zvol_state_lock, ZVOL_RW_READER);
+ zv = dev->si_drv2;
+ if (zv == NULL) {
+ rw_exit(&zvol_state_lock);
+ err = SET_ERROR(ENXIO);
+ goto out_locked;
+ }
+
+ if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
+ /*
+ * We need to guarantee that the namespace lock is held
+ * to avoid spurious failures in zvol_first_open.
+ */
+ drop_namespace = B_TRUE;
+ if (!mutex_tryenter(&spa_namespace_lock)) {
+ rw_exit(&zvol_state_lock);
+ mutex_enter(&spa_namespace_lock);
+ goto retry;
+ }
+ }
+ mutex_enter(&zv->zv_state_lock);
+
+ ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
+
+ /*
+ * make sure zvol is not suspended during first open
+ * (hold zv_suspend_lock) and respect proper lock acquisition
+ * ordering - zv_suspend_lock before zv_state_lock
+ */
+ if (zv->zv_open_count == 0) {
+ drop_suspend = B_TRUE;
+ if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
+ mutex_exit(&zv->zv_state_lock);
+ rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
+ mutex_enter(&zv->zv_state_lock);
+ /* check to see if zv_suspend_lock is needed */
+ if (zv->zv_open_count != 0) {
+ rw_exit(&zv->zv_suspend_lock);
+ drop_suspend = B_FALSE;
+ }
+ }
+ }
+ rw_exit(&zvol_state_lock);
+
+ ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+ if (zv->zv_open_count == 0) {
+ ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
+ err = zvol_first_open(zv, !(flags & FWRITE));
+ if (err)
*** 1412 LINES SKIPPED ***
More information about the dev-commits-src-all
mailing list