git: 9db44a8e5da9 - main - zfs: merge OpenZFS master-9305ff2ed

Martin Matuska mm at FreeBSD.org
Sun Mar 14 01:38:30 UTC 2021


The branch main has been updated by mm:

URL: https://cgit.FreeBSD.org/src/commit/?id=9db44a8e5da9bf1ce6dd1c0f1468ddafed6d6c91

commit 9db44a8e5da9bf1ce6dd1c0f1468ddafed6d6c91
Merge: a9275d996c22 9162a1ce3ae9
Author:     Martin Matuska <mm at FreeBSD.org>
AuthorDate: 2021-03-14 01:23:51 +0000
Commit:     Martin Matuska <mm at FreeBSD.org>
CommitDate: 2021-03-14 01:32:14 +0000

    zfs: merge OpenZFS master-9305ff2ed
    
    Notable upstream pull request merges:
      #11153 Scalable teardown lock for FreeBSD
      #11651 Don't bomb out when using keylocation=file://
      #11667 zvol: call zil_replaying() during replay
      #11683 abd_get_offset_struct() may allocate new abd
      #11693 Intentionally allow ZFS_READONLY in zfs_write
      #11716 zpool import cachefile improvements
      #11720 FreeBSD: Clean up zfsdev_close to match Linux
      #11730 FreeBSD: bring back possibility to rewind the
             checkpoint from bootloader
    
    Obtained from:  OpenZFS
    MFC after:      2 weeks

 .../openzfs/.github/workflows/checkstyle.yaml      |   2 +-
 sys/contrib/openzfs/cmd/vdev_id/vdev_id            |   9 +-
 sys/contrib/openzfs/cmd/zpool/zpool_main.c         | 307 +++++++++++++--------
 sys/contrib/openzfs/cmd/zstream/zstream_redup.c    |   1 +
 sys/contrib/openzfs/config/zfs-build.m4            |  36 +++
 sys/contrib/openzfs/configure.ac                   |   1 +
 .../openzfs/include/os/freebsd/spl/sys/Makefile.am |   3 +
 .../openzfs/include/os/freebsd/spl/sys/debug.h     |  80 +++---
 .../include/os/freebsd/zfs/sys/zfs_vfsops_os.h     |   2 +-
 .../openzfs/include/os/linux/spl/sys/debug.h       |  78 +++---
 .../include/os/linux/zfs/sys/zfs_vfsops_os.h       |  33 +++
 .../include/os/linux/zfs/sys/zfs_znode_impl.h      |   4 +-
 sys/contrib/openzfs/include/sys/dmu_redact.h       |   2 +
 sys/contrib/openzfs/include/sys/zfs_ioctl.h        |   1 -
 sys/contrib/openzfs/lib/libzfs/libzfs_crypto.c     |  10 +-
 sys/contrib/openzfs/lib/libzfs/libzfs_mount.c      |  25 +-
 .../openzfs/lib/libzfs/os/freebsd/libzfs_zmount.c  |   5 +-
 .../openzfs/lib/libzfs/os/linux/libzfs_mount_os.c  |   6 +-
 sys/contrib/openzfs/lib/libzutil/zutil_import.c    | 177 +++++++++---
 sys/contrib/openzfs/man/man8/zfs-receive.8         |  10 +
 sys/contrib/openzfs/man/man8/zfs-send.8            |   7 +-
 sys/contrib/openzfs/module/Makefile.in             |   5 +
 .../openzfs/module/os/freebsd/zfs/kmod_core.c      |  18 +-
 .../openzfs/module/os/freebsd/zfs/zfs_dir.c        |   2 -
 .../openzfs/module/os/freebsd/zfs/zvol_os.c        |   9 +-
 sys/contrib/openzfs/module/os/linux/zfs/zfs_acl.c  |  26 +-
 .../openzfs/module/os/linux/zfs/zio_crypt.c        |   1 +
 sys/contrib/openzfs/module/os/linux/zfs/zvol_os.c  | 102 +++++--
 sys/contrib/openzfs/module/zcommon/zfs_prop.c      |   2 +-
 sys/contrib/openzfs/module/zfs/abd.c               |   6 +-
 sys/contrib/openzfs/module/zfs/spa_misc.c          |   4 +-
 sys/contrib/openzfs/module/zfs/zfs_vnops.c         |   6 +-
 sys/contrib/openzfs/module/zfs/zvol.c              |  15 +-
 sys/contrib/openzfs/tests/runfiles/common.run      |   1 +
 .../openzfs/tests/zfs-tests/include/commands.cfg   |   2 +-
 .../openzfs/tests/zfs-tests/include/libtest.shlib  |  19 +-
 .../openzfs/tests/zfs-tests/include/tunables.cfg   |   8 +-
 .../functional/cli_root/zpool/zpool_002_pos.ksh    |  37 ++-
 .../functional/cli_root/zpool/zpool_003_pos.ksh    |  39 ++-
 .../functional/cli_root/zpool_import/Makefile.am   |   1 +
 .../import_cachefile_paths_changed.ksh             | 117 ++++++++
 .../tests/functional/events/events_002_pos.ksh     |   7 +-
 .../tests/functional/xattr/xattr_003_neg.ksh       |  44 +--
 sys/modules/zfs/zfs_config.h                       |   4 +-
 44 files changed, 896 insertions(+), 378 deletions(-)

diff --cc sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
index 2389b1a06355,000000000000..ba315f104738
mode 100644,000000..100644
--- a/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
+++ b/sys/contrib/openzfs/module/os/freebsd/zfs/zvol_os.c
@@@ -1,1525 -1,0 +1,1532 @@@
 +/*
 + * CDDL HEADER START
 + *
 + * The contents of this file are subject to the terms of the
 + * Common Development and Distribution License (the "License").
 + * You may not use this file except in compliance with the License.
 + *
 + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 + * or http://www.opensolaris.org/os/licensing.
 + * See the License for the specific language governing permissions
 + * and limitations under the License.
 + *
 + * When distributing Covered Code, include this CDDL HEADER in each
 + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 + * If applicable, add the following below this CDDL HEADER, with the
 + * fields enclosed by brackets "[]" replaced with your own identifying
 + * information: Portions Copyright [yyyy] [name of copyright owner]
 + *
 + * CDDL HEADER END
 + */
 +/*
 + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 + *
 + * Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd at FreeBSD.org>
 + * All rights reserved.
 + *
 + * Portions Copyright 2010 Robert Milkowski
 + *
 + * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
 + * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 + * Copyright (c) 2013, Joyent, Inc. All rights reserved.
 + * Copyright (c) 2014 Integros [integros.com]
 + */
 +
 +/* Portions Copyright 2011 Martin Matuska <mm at FreeBSD.org> */
 +
 +/*
 + * ZFS volume emulation driver.
 + *
 + * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
 + * Volumes are accessed through the symbolic links named:
 + *
 + * /dev/zvol/<pool_name>/<dataset_name>
 + *
 + * Volumes are persistent through reboot.  No user command needs to be
 + * run before opening and using a device.
 + *
 + * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
 + * in the system. Except when they're simply character devices (volmode=dev).
 + */
 +
 +#include <sys/types.h>
 +#include <sys/param.h>
 +#include <sys/kernel.h>
 +#include <sys/errno.h>
 +#include <sys/uio.h>
 +#include <sys/bio.h>
 +#include <sys/buf.h>
 +#include <sys/kmem.h>
 +#include <sys/conf.h>
 +#include <sys/cmn_err.h>
 +#include <sys/stat.h>
 +#include <sys/proc.h>
 +#include <sys/zap.h>
 +#include <sys/spa.h>
 +#include <sys/spa_impl.h>
 +#include <sys/zio.h>
 +#include <sys/disk.h>
 +#include <sys/dmu_traverse.h>
 +#include <sys/dnode.h>
 +#include <sys/dsl_dataset.h>
 +#include <sys/dsl_prop.h>
 +#include <sys/dsl_dir.h>
 +#include <sys/byteorder.h>
 +#include <sys/sunddi.h>
 +#include <sys/dirent.h>
 +#include <sys/policy.h>
 +#include <sys/queue.h>
 +#include <sys/fs/zfs.h>
 +#include <sys/zfs_ioctl.h>
 +#include <sys/zil.h>
 +#include <sys/zfs_znode.h>
 +#include <sys/zfs_rlock.h>
 +#include <sys/vdev_impl.h>
 +#include <sys/vdev_raidz.h>
 +#include <sys/zvol.h>
 +#include <sys/zil_impl.h>
 +#include <sys/dataset_kstats.h>
 +#include <sys/dbuf.h>
 +#include <sys/dmu_tx.h>
 +#include <sys/zfeature.h>
 +#include <sys/zio_checksum.h>
 +#include <sys/zil_impl.h>
 +#include <sys/filio.h>
 +
 +#include <geom/geom.h>
 +#include <sys/zvol.h>
 +#include <sys/zvol_impl.h>
 +
 +#include "zfs_namecheck.h"
 +
#define	ZVOL_DUMPSIZE		"dumpsize"

/*
 * With ZVOL_LOCK_DEBUG defined, all "reader" acquisitions of
 * zv_suspend_lock/zvol_state_lock are promoted to writer acquisitions so
 * lock-order problems surface deterministically during testing.
 */
#ifdef ZVOL_LOCK_DEBUG
#define	ZVOL_RW_READER		RW_WRITER
#define	ZVOL_RW_READ_HELD	RW_WRITE_HELD
#else
#define	ZVOL_RW_READER		RW_READER
#define	ZVOL_RW_READ_HELD	RW_READ_HELD
#endif

/*
 * Lifecycle of the per-zvol GEOM worker thread (see zvol_geom_worker()):
 * UNINIT before the thread is created, STOPPED to request exit,
 * RUNNING while the thread services the bio queue.
 */
enum zvol_geom_state {
	ZVOL_GEOM_UNINIT,
	ZVOL_GEOM_STOPPED,
	ZVOL_GEOM_RUNNING,
};

/*
 * FreeBSD-specific per-zvol state.  The union is discriminated by the
 * volume's volmode property: exactly one of the two members is active
 * for a given zvol.
 */
struct zvol_state_os {
#define	zso_dev		_zso_state._zso_dev
#define	zso_geom	_zso_state._zso_geom
	union {
		/* volmode=dev: plain character device */
		struct zvol_state_dev {
			struct cdev *zsd_cdev;
			uint64_t zsd_sync_cnt;
		} _zso_dev;

		/* volmode=geom: GEOM provider with a private bio queue */
		struct zvol_state_geom {
			struct g_provider *zsg_provider;
			struct bio_queue_head zsg_queue;
			struct mtx zsg_queue_mtx;
			enum zvol_geom_state zsg_state;
		} _zso_geom;
	} _zso_state;
	/* Nonzero once teardown has begun; new opens fail with ENXIO. */
	int zso_dying;
};
 +
/* Count of currently instantiated zvol minors (module-private). */
static uint32_t zvol_minors;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
/* vfs.zfs.vol.mode: backs the zvol_volmode tunable declared elsewhere. */
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0,
	"Expose as GEOM providers (1), device files (2) or neither");
static boolean_t zpool_on_zvol = B_FALSE;
/*
 * vfs.zfs.vol.recursive: permits zvol-backed vdevs.  Dangerous because a
 * probe while spa_namespace_lock is held elsewhere can deadlock (see
 * zvol_geom_open()).
 */
SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0,
	"Allow zpools to use zvols as vdevs (DANGEROUS)");

/*
 * Toggle unmap functionality.
 */
boolean_t zvol_unmap_enabled = B_TRUE;

SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
	&zvol_unmap_enabled, 0, "Enable UNMAP functionality");

/*
 * zvol maximum transfer in one DMU tx.
 */
int zvol_maxphys = DMU_MAX_ACCESS / 2;
 +
static void zvol_ensure_zilog(zvol_state_t *zv);

/* Character-device (volmode=dev) entry points. */
static d_open_t		zvol_cdev_open;
static d_close_t	zvol_cdev_close;
static d_ioctl_t	zvol_cdev_ioctl;
static d_read_t		zvol_cdev_read;
static d_write_t	zvol_cdev_write;
/* Shared with GEOM mode: services the actual bio. */
static d_strategy_t	zvol_geom_bio_strategy;

/*
 * cdevsw for volmode=dev zvols.  D_TRACKCLOSE ensures every close(2) is
 * delivered so zv_open_count stays balanced.
 */
static struct cdevsw zvol_cdevsw = {
	.d_name =	"zvol",
	.d_version =	D_VERSION,
	.d_flags =	D_DISK | D_TRACKCLOSE,
	.d_open =	zvol_cdev_open,
	.d_close =	zvol_cdev_close,
	.d_ioctl =	zvol_cdev_ioctl,
	.d_read =	zvol_cdev_read,
	.d_write =	zvol_cdev_write,
	.d_strategy =	zvol_geom_bio_strategy,
};

/* TSD key set while ZFS probes GEOM providers for a missing vdev. */
extern uint_t zfs_geom_probe_vdev_key;

struct g_class zfs_zvol_class = {
	.name = "ZFS::ZVOL",
	.version = G_VERSION,
};

DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);

/* GEOM (volmode=geom) entry points and helpers. */
static int zvol_geom_open(struct g_provider *pp, int flag, int count);
static int zvol_geom_close(struct g_provider *pp, int flag, int count);
static void zvol_geom_run(zvol_state_t *zv);
static void zvol_geom_destroy(zvol_state_t *zv);
static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
static void zvol_geom_worker(void *arg);
static void zvol_geom_bio_start(struct bio *bp);
static int zvol_geom_bio_getattr(struct bio *bp);
/* static d_strategy_t	zvol_geom_bio_strategy; (declared elsewhere) */
 +
 +/*
 + * GEOM mode implementation
 + */
 +
/*ARGSUSED*/
/*
 * GEOM open handler for a zvol provider.  Adds "count" references to
 * zv_open_count, performing first-open dataset setup when the count was
 * previously zero.  Lock order throughout is:
 * spa_namespace_lock -> zv_suspend_lock -> zv_state_lock, with
 * zvol_state_lock protecting the pp->private lookup; trylock-and-retry
 * is used wherever the order would otherwise be violated.
 * Returns 0 or a SET_ERROR()-wrapped errno.
 */
static int
zvol_geom_open(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	int err = 0;
	boolean_t drop_suspend = B_FALSE;
	boolean_t drop_namespace = B_FALSE;

	if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) {
		/*
		 * if zfs_geom_probe_vdev_key is set, that means that zfs is
		 * attempting to probe geom providers while looking for a
		 * replacement for a missing VDEV.  In this case, the
		 * spa_namespace_lock will not be held, but it is still illegal
		 * to use a zvol as a vdev.  Deadlocks can result if another
		 * thread has spa_namespace_lock
		 */
		return (SET_ERROR(EOPNOTSUPP));
	}

retry:
	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_locked;
	}

	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
		/*
		 * We need to guarantee that the namespace lock is held
		 * to avoid spurious failures in zvol_first_open.
		 */
		drop_namespace = B_TRUE;
		if (!mutex_tryenter(&spa_namespace_lock)) {
			/*
			 * Blocking on the namespace lock while holding
			 * zvol_state_lock would invert the lock order, so
			 * drop everything, take it blocking, and re-lookup.
			 */
			rw_exit(&zvol_state_lock);
			mutex_enter(&spa_namespace_lock);
			goto retry;
		}
	}
	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_zso->zso_dying) {
		/* Volume is being torn down; refuse new opens. */
		rw_exit(&zvol_state_lock);
		err = SET_ERROR(ENXIO);
		goto out_zv_locked;
	}
	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * make sure zvol is not suspended during first open
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	if (zv->zv_open_count == 0) {
		drop_suspend = B_TRUE;
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			if (zv->zv_open_count != 0) {
				/* Someone else completed the first open. */
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		err = zvol_first_open(zv, !(flag & FWRITE));
		if (err)
			goto out_zv_locked;
		/* Publish geometry to GEOM consumers. */
		pp->mediasize = zv->zv_volsize;
		pp->stripeoffset = 0;
		pp->stripesize = zv->zv_volblocksize;
	}

	/*
	 * Check for a bad on-disk format version now since we
	 * lied about owning the dataset readonly before.
	 */
	if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) ||
	    dmu_objset_incompatible_encryption_version(zv->zv_objset))) {
		err = SET_ERROR(EROFS);
		goto out_opened;
	}
	if (zv->zv_flags & ZVOL_EXCL) {
		err = SET_ERROR(EBUSY);
		goto out_opened;
	}
#ifdef FEXCL
	if (flag & FEXCL) {
		if (zv->zv_open_count != 0) {
			err = SET_ERROR(EBUSY);
			goto out_opened;
		}
		zv->zv_flags |= ZVOL_EXCL;
	}
#endif

	zv->zv_open_count += count;
out_opened:
	/*
	 * On an error path after a successful first open the count is
	 * still zero: undo the first open and wake zvol_wait_close().
	 */
	if (zv->zv_open_count == 0) {
		zvol_last_close(zv);
		wakeup(zv);
	}
out_zv_locked:
	mutex_exit(&zv->zv_state_lock);
out_locked:
	if (drop_namespace)
		mutex_exit(&spa_namespace_lock);
	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (err);
}
 +
/*ARGSUSED*/
/*
 * GEOM close handler: drops "count" references from zv_open_count and,
 * on the last close, takes zv_suspend_lock (respecting the
 * zv_suspend_lock -> zv_state_lock order via trylock-and-reverify),
 * releases the dataset and wakes any zvol_wait_close() sleeper.
 * Always returns 0 once the zvol has been found.
 */
static int
zvol_geom_close(struct g_provider *pp, int flag, int count)
{
	zvol_state_t *zv;
	boolean_t drop_suspend = B_TRUE;
	int new_open_count;

	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
	zv = pp->private;
	if (zv == NULL) {
		rw_exit(&zvol_state_lock);
		return (SET_ERROR(ENXIO));
	}

	mutex_enter(&zv->zv_state_lock);
	if (zv->zv_flags & ZVOL_EXCL) {
		/* An exclusive opener is by definition the only opener. */
		ASSERT3U(zv->zv_open_count, ==, 1);
		zv->zv_flags &= ~ZVOL_EXCL;
	}

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/*
	 * If the open count is zero, this is a spurious close.
	 * That indicates a bug in the kernel / DDI framework.
	 */
	ASSERT3U(zv->zv_open_count, >, 0);

	/*
	 * make sure zvol is not suspended during last close
	 * (hold zv_suspend_lock) and respect proper lock acquisition
	 * ordering - zv_suspend_lock before zv_state_lock
	 */
	new_open_count = zv->zv_open_count - count;
	if (new_open_count == 0) {
		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
			mutex_exit(&zv->zv_state_lock);
			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
			mutex_enter(&zv->zv_state_lock);
			/* check to see if zv_suspend_lock is needed */
			new_open_count = zv->zv_open_count - count;
			if (new_open_count != 0) {
				/* No longer the last close; don't hold it. */
				rw_exit(&zv->zv_suspend_lock);
				drop_suspend = B_FALSE;
			}
		}
	} else {
		drop_suspend = B_FALSE;
	}
	rw_exit(&zvol_state_lock);

	ASSERT(MUTEX_HELD(&zv->zv_state_lock));

	/*
	 * You may get multiple opens, but only one close.
	 */
	zv->zv_open_count = new_open_count;
	if (zv->zv_open_count == 0) {
		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
		zvol_last_close(zv);
		/* Wake zvol_wait_close() if teardown is waiting on us. */
		wakeup(zv);
	}

	mutex_exit(&zv->zv_state_lock);

	if (drop_suspend)
		rw_exit(&zv->zv_suspend_lock);
	return (0);
}
 +
 +static void
 +zvol_geom_run(zvol_state_t *zv)
 +{
 +	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
 +	struct g_provider *pp = zsg->zsg_provider;
 +
 +	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
 +
 +	g_error_provider(pp, 0);
 +
 +	kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0,
 +	    "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER));
 +}
 +
 +static void
 +zvol_geom_destroy(zvol_state_t *zv)
 +{
 +	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
 +	struct g_provider *pp = zsg->zsg_provider;
 +
 +	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);
 +
 +	g_topology_assert();
 +
 +	mutex_enter(&zv->zv_state_lock);
 +	VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING);
 +	mutex_exit(&zv->zv_state_lock);
 +	zsg->zsg_provider = NULL;
 +	g_wither_geom(pp->geom, ENXIO);
 +}
 +
 +void
 +zvol_wait_close(zvol_state_t *zv)
 +{
 +
 +	if (zv->zv_volmode != ZFS_VOLMODE_GEOM)
 +		return;
 +	mutex_enter(&zv->zv_state_lock);
 +	zv->zv_zso->zso_dying = B_TRUE;
 +
 +	if (zv->zv_open_count)
 +		msleep(zv, &zv->zv_state_lock,
 +		    PRIBIO, "zvol:dying", 10*hz);
 +	mutex_exit(&zv->zv_state_lock);
 +}
 +
 +
 +static int
 +zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace)
 +{
 +	int count, error, flags;
 +
 +	g_topology_assert();
 +
 +	/*
 +	 * To make it easier we expect either open or close, but not both
 +	 * at the same time.
 +	 */
 +	KASSERT((acr >= 0 && acw >= 0 && ace >= 0) ||
 +	    (acr <= 0 && acw <= 0 && ace <= 0),
 +	    ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).",
 +	    pp->name, acr, acw, ace));
 +
 +	if (pp->private == NULL) {
 +		if (acr <= 0 && acw <= 0 && ace <= 0)
 +			return (0);
 +		return (pp->error);
 +	}
 +
 +	/*
 +	 * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if
 +	 * ace != 0, because GEOM already handles that and handles it a bit
 +	 * differently. GEOM allows for multiple read/exclusive consumers and
 +	 * ZFS allows only one exclusive consumer, no matter if it is reader or
 +	 * writer. I like better the way GEOM works so I'll leave it for GEOM
 +	 * to decide what to do.
 +	 */
 +
 +	count = acr + acw + ace;
 +	if (count == 0)
 +		return (0);
 +
 +	flags = 0;
 +	if (acr != 0 || ace != 0)
 +		flags |= FREAD;
 +	if (acw != 0)
 +		flags |= FWRITE;
 +
 +	g_topology_unlock();
 +	if (count > 0)
 +		error = zvol_geom_open(pp, flags, count);
 +	else
 +		error = zvol_geom_close(pp, flags, -count);
 +	g_topology_lock();
 +	return (error);
 +}
 +
/*
 * Per-zvol worker thread: drains bios that zvol_geom_bio_start() queued
 * because the originating context could not sleep.  Exits when teardown
 * sets zsg_state to ZVOL_GEOM_STOPPED; it flips the state back to
 * ZVOL_GEOM_RUNNING and wakes the waiter as its exit acknowledgment.
 */
static void
zvol_geom_worker(void *arg)
{
	zvol_state_t *zv = arg;
	struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
	struct bio *bp;

	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM);

	/* Run at block-I/O priority. */
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);

	for (;;) {
		mtx_lock(&zsg->zsg_queue_mtx);
		bp = bioq_takefirst(&zsg->zsg_queue);
		if (bp == NULL) {
			if (zsg->zsg_state == ZVOL_GEOM_STOPPED) {
				/* Acknowledge the stop request and exit. */
				zsg->zsg_state = ZVOL_GEOM_RUNNING;
				wakeup(&zsg->zsg_state);
				mtx_unlock(&zsg->zsg_queue_mtx);
				kthread_exit();
			}
			/* PDROP releases zsg_queue_mtx while sleeping. */
			msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx,
			    PRIBIO | PDROP, "zvol:io", 0);
			continue;
		}
		mtx_unlock(&zsg->zsg_queue_mtx);
		/* Service the bio outside the queue lock; it may sleep. */
		zvol_geom_bio_strategy(bp);
	}
}
 +
 +static void
 +zvol_geom_bio_start(struct bio *bp)
 +{
 +	zvol_state_t *zv = bp->bio_to->private;
 +	struct zvol_state_geom *zsg;
 +	boolean_t first;
 +
 +	if (zv == NULL) {
 +		g_io_deliver(bp, ENXIO);
 +		return;
 +	}
 +	if (bp->bio_cmd == BIO_GETATTR) {
 +		if (zvol_geom_bio_getattr(bp))
 +			g_io_deliver(bp, EOPNOTSUPP);
 +		return;
 +	}
 +
 +	if (!THREAD_CAN_SLEEP()) {
 +		zsg = &zv->zv_zso->zso_geom;
 +		mtx_lock(&zsg->zsg_queue_mtx);
 +		first = (bioq_first(&zsg->zsg_queue) == NULL);
 +		bioq_insert_tail(&zsg->zsg_queue, bp);
 +		mtx_unlock(&zsg->zsg_queue_mtx);
 +		if (first)
 +			wakeup_one(&zsg->zsg_queue);
 +		return;
 +	}
 +
 +	zvol_geom_bio_strategy(bp);
 +}
 +
 +static int
 +zvol_geom_bio_getattr(struct bio *bp)
 +{
 +	zvol_state_t *zv;
 +
 +	zv = bp->bio_to->private;
 +	ASSERT3P(zv, !=, NULL);
 +
 +	spa_t *spa = dmu_objset_spa(zv->zv_objset);
 +	uint64_t refd, avail, usedobjs, availobjs;
 +
 +	if (g_handleattr_int(bp, "GEOM::candelete", 1))
 +		return (0);
 +	if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
 +		dmu_objset_space(zv->zv_objset, &refd, &avail,
 +		    &usedobjs, &availobjs);
 +		if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE))
 +			return (0);
 +	} else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
 +		dmu_objset_space(zv->zv_objset, &refd, &avail,
 +		    &usedobjs, &availobjs);
 +		if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE))
 +			return (0);
 +	} else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
 +		avail = metaslab_class_get_space(spa_normal_class(spa));
 +		avail -= metaslab_class_get_alloc(spa_normal_class(spa));
 +		if (g_handleattr_off_t(bp, "poolblocksavail",
 +		    avail / DEV_BSIZE))
 +			return (0);
 +	} else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
 +		refd = metaslab_class_get_alloc(spa_normal_class(spa));
 +		if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE))
 +			return (0);
 +	}
 +	return (1);
 +}
 +
/*
 * Execute one bio against the zvol: BIO_READ, BIO_WRITE, BIO_DELETE or
 * BIO_FLUSH.  Used both as the GEOM strategy path (bio_to set) and as
 * the cdev d_strategy path (bio_dev set).  Holds zv_suspend_lock as
 * reader for the duration and a rangelock over the affected byte range.
 */
static void
zvol_geom_bio_strategy(struct bio *bp)
{
	zvol_state_t *zv;
	uint64_t off, volsize;
	size_t resid;
	char *addr;
	objset_t *os;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t doread = B_FALSE;
	boolean_t is_dumpified;
	boolean_t sync;

	/* Resolve the zvol from whichever attachment point is in use. */
	if (bp->bio_to)
		zv = bp->bio_to->private;
	else
		zv = bp->bio_dev->si_drv2;

	if (zv == NULL) {
		error = SET_ERROR(ENXIO);
		goto out;
	}

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);

	switch (bp->bio_cmd) {
	case BIO_READ:
		doread = B_TRUE;
		break;
	case BIO_WRITE:
	case BIO_FLUSH:
	case BIO_DELETE:
		if (zv->zv_flags & ZVOL_RDONLY) {
			error = SET_ERROR(EROFS);
			goto resume;
		}
		zvol_ensure_zilog(zv);
		if (bp->bio_cmd == BIO_FLUSH)
			/* A flush is just a zil_commit; skip the I/O loop. */
			goto sync;
		break;
	default:
		error = SET_ERROR(EOPNOTSUPP);
		goto resume;
	}

	off = bp->bio_offset;
	volsize = zv->zv_volsize;

	os = zv->zv_objset;
	ASSERT3P(os, !=, NULL);

	addr = bp->bio_data;
	resid = bp->bio_length;

	if (resid > 0 && off >= volsize) {
		error = SET_ERROR(EIO);
		goto resume;
	}

	/* Dump support is not implemented on FreeBSD; always false. */
	is_dumpified = B_FALSE;
	sync = !doread && !is_dumpified &&
	    zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;

	/*
	 * There must be no buffer changes when doing a dmu_sync() because
	 * we can't change the data whilst calculating the checksum.
	 */
	lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid,
	    doread ? RL_READER : RL_WRITER);

	if (bp->bio_cmd == BIO_DELETE) {
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error != 0) {
			dmu_tx_abort(tx);
		} else {
			/* Log the truncate, then free the backing range. */
			zvol_log_truncate(zv, tx, off, resid, sync);
			dmu_tx_commit(tx);
			error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
			    off, resid);
			resid = 0;
		}
		goto unlock;
	}
	/* Read/write loop in chunks of at most zvol_maxphys bytes. */
	while (resid != 0 && off < volsize) {
		size_t size = MIN(resid, zvol_maxphys);
		if (doread) {
			error = dmu_read(os, ZVOL_OBJ, off, size, addr,
			    DMU_READ_PREFETCH);
		} else {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size);
			error = dmu_tx_assign(tx, TXG_WAIT);
			if (error) {
				dmu_tx_abort(tx);
			} else {
				dmu_write(os, ZVOL_OBJ, off, size, addr, tx);
				zvol_log_write(zv, tx, off, size, sync);
				dmu_tx_commit(tx);
			}
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}
		off += size;
		addr += size;
		resid -= size;
	}
unlock:
	zfs_rangelock_exit(lr);

	bp->bio_completed = bp->bio_length - resid;
	if (bp->bio_completed < bp->bio_length && off > volsize)
		error = SET_ERROR(EINVAL);

	switch (bp->bio_cmd) {
	case BIO_FLUSH:
		break;
	case BIO_READ:
		dataset_kstats_update_read_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_WRITE:
		dataset_kstats_update_write_kstats(&zv->zv_kstat,
		    bp->bio_completed);
		break;
	case BIO_DELETE:
		break;
	default:
		break;
	}

	/*
	 * The sync: label lives inside the if so that the BIO_FLUSH
	 * goto above commits the ZIL unconditionally, while normal
	 * writes commit only when "sync" was computed true.
	 */
	if (sync) {
sync:
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	}
resume:
	rw_exit(&zv->zv_suspend_lock);
out:
	/* Complete the bio through whichever framework issued it. */
	if (bp->bio_to)
		g_io_deliver(bp, error);
	else
		biofinish(bp, NULL, error);
}
 +
 +/*
 + * Character device mode implementation
 + */
 +
 +static int
 +zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag)
 +{
 +	zvol_state_t *zv;
 +	uint64_t volsize;
 +	zfs_locked_range_t *lr;
 +	int error = 0;
 +	zfs_uio_t uio;
 +
 +	zfs_uio_init(&uio, uio_s);
 +
 +	zv = dev->si_drv2;
 +
 +	volsize = zv->zv_volsize;
 +	/*
 +	 * uio_loffset == volsize isn't an error as
 +	 * its required for EOF processing.
 +	 */
 +	if (zfs_uio_resid(&uio) > 0 &&
 +	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
 +		return (SET_ERROR(EIO));
 +
 +	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
 +	    zfs_uio_resid(&uio), RL_READER);
 +	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
 +		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
 +
 +		/* don't read past the end */
 +		if (bytes > volsize - zfs_uio_offset(&uio))
 +			bytes = volsize - zfs_uio_offset(&uio);
 +
 +		error =  dmu_read_uio_dnode(zv->zv_dn, &uio, bytes);
 +		if (error) {
 +			/* convert checksum errors into IO errors */
 +			if (error == ECKSUM)
 +				error = SET_ERROR(EIO);
 +			break;
 +		}
 +	}
 +	zfs_rangelock_exit(lr);
 +
 +	return (error);
 +}
 +
/*
 * d_write handler for volmode=dev zvols.  Writes the caller's uio into
 * the DMU under a writer rangelock, logging each chunk to the ZIL and
 * committing it when IO_SYNC was requested or sync=always is set.
 * zv_suspend_lock is held (reader) across the whole operation, and
 * zvol_ensure_zilog() is called under it before any ZIL use.
 */
static int
zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag)
{
	zvol_state_t *zv;
	uint64_t volsize;
	zfs_locked_range_t *lr;
	int error = 0;
	boolean_t sync;
	zfs_uio_t uio;

	zv = dev->si_drv2;

	volsize = zv->zv_volsize;

	zfs_uio_init(&uio, uio_s);

	/* Offset == volsize is allowed here too (zero-length write). */
	if (zfs_uio_resid(&uio) > 0 &&
	    (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize))
		return (SET_ERROR(EIO));

	sync = (ioflag & IO_SYNC) ||
	    (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);

	rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
	zvol_ensure_zilog(zv);

	lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio),
	    zfs_uio_resid(&uio), RL_WRITER);
	while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) {
		uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1);
		uint64_t off = zfs_uio_offset(&uio);
		dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);

		if (bytes > volsize - off)	/* don't write past the end */
			bytes = volsize - off;

		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			break;
		}
		error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx);
		if (error == 0)
			zvol_log_write(zv, tx, off, bytes, sync);
		/* Commit even on a partial-write error; tx was assigned. */
		dmu_tx_commit(tx);

		if (error)
			break;
	}
	zfs_rangelock_exit(lr);
	if (sync)
		zil_commit(zv->zv_zilog, ZVOL_OBJ);
	rw_exit(&zv->zv_suspend_lock);
	return (error);
}
 +
 +static int
 +zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td)
 +{
 +	zvol_state_t *zv;
 +	struct zvol_state_dev *zsd;
 +	int err = 0;
 +	boolean_t drop_suspend = B_FALSE;
 +	boolean_t drop_namespace = B_FALSE;
 +
 +retry:
 +	rw_enter(&zvol_state_lock, ZVOL_RW_READER);
 +	zv = dev->si_drv2;
 +	if (zv == NULL) {
 +		rw_exit(&zvol_state_lock);
 +		err = SET_ERROR(ENXIO);
 +		goto out_locked;
 +	}
 +
 +	if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
 +		/*
 +		 * We need to guarantee that the namespace lock is held
 +		 * to avoid spurious failures in zvol_first_open.
 +		 */
 +		drop_namespace = B_TRUE;
 +		if (!mutex_tryenter(&spa_namespace_lock)) {
 +			rw_exit(&zvol_state_lock);
 +			mutex_enter(&spa_namespace_lock);
 +			goto retry;
 +		}
 +	}
 +	mutex_enter(&zv->zv_state_lock);
 +
 +	ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV);
 +
 +	/*
 +	 * make sure zvol is not suspended during first open
 +	 * (hold zv_suspend_lock) and respect proper lock acquisition
 +	 * ordering - zv_suspend_lock before zv_state_lock
 +	 */
 +	if (zv->zv_open_count == 0) {
 +		drop_suspend = B_TRUE;
 +		if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) {
 +			mutex_exit(&zv->zv_state_lock);
 +			rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER);
 +			mutex_enter(&zv->zv_state_lock);
 +			/* check to see if zv_suspend_lock is needed */
 +			if (zv->zv_open_count != 0) {
 +				rw_exit(&zv->zv_suspend_lock);
 +				drop_suspend = B_FALSE;
 +			}
 +		}
 +	}
 +	rw_exit(&zvol_state_lock);
 +
 +	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
 +
 +	if (zv->zv_open_count == 0) {
 +		ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock));
 +		err = zvol_first_open(zv, !(flags & FWRITE));
 +		if (err)
*** 1412 LINES SKIPPED ***


More information about the dev-commits-src-all mailing list