git: a13ddd6210f3 - main - zfsd: don't try to fix an OFFLINE condition

From: Alan Somers <asomers_at_FreeBSD.org>
Date: Mon, 04 Aug 2025 18:37:02 UTC
The branch main has been updated by asomers:

URL: https://cgit.FreeBSD.org/src/commit/?id=a13ddd6210f349dec341eba0506b5f2395a7a2d6

commit a13ddd6210f349dec341eba0506b5f2395a7a2d6
Author:     Alan Somers <asomers@FreeBSD.org>
AuthorDate: 2025-05-02 12:45:32 +0000
Commit:     Alan Somers <asomers@FreeBSD.org>
CommitDate: 2025-08-04 16:00:52 +0000

    zfsd: don't try to fix an OFFLINE condition
    
    If the system administrator does "zpool offline", he's doing it for a
    reason.  zfsd shouldn't consider an offline disk to be an event that
    requires automatic healing.  Don't online it in response to a GEOM
    event, and don't try to activate a hotspare to take over from it.
    
    MFC after:      2 weeks
    Sponsored by:   ConnectWise
---
 cddl/usr.sbin/zfsd/case_file.cc                    | 18 +++++-
 cddl/usr.sbin/zfsd/zfsd_event.cc                   |  7 +++
 tests/sys/cddl/zfs/tests/zfsd/Makefile             |  2 +
 .../cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh   | 64 +++++++++++++++++++++
 .../cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh   | 66 ++++++++++++++++++++++
 tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh         | 60 ++++++++++++++++++++
 6 files changed, 215 insertions(+), 2 deletions(-)

diff --git a/cddl/usr.sbin/zfsd/case_file.cc b/cddl/usr.sbin/zfsd/case_file.cc
index 7adfb08b75c6..852767aeb227 100644
--- a/cddl/usr.sbin/zfsd/case_file.cc
+++ b/cddl/usr.sbin/zfsd/case_file.cc
@@ -299,6 +299,15 @@ CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
 		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
 		return (/*consumed*/false);
 	}
+	if (VdevState() == VDEV_STATE_OFFLINE) {
+		/*
+		 * OFFLINE is an administrative decision.  No need for zfsd to
+		 * do anything.
+		 */
+		syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
+		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
+		return (/*consumed*/false);
+	}
 
 	if (vdev != NULL
 	 && ( vdev->PoolGUID() == m_poolGUID
@@ -401,7 +410,8 @@ CaseFile::ReEvaluate(const ZfsEvent &event)
 		return (/*consumed*/true);
 	} else if (event.Value("type") == "sysevent.fs.zfs.config_sync") {
 		RefreshVdevState();
-		if (VdevState() < VDEV_STATE_HEALTHY)
+		if (VdevState() < VDEV_STATE_HEALTHY &&
+		    VdevState() != VDEV_STATE_OFFLINE)
 			consumed = ActivateSpare();
 	}
 
@@ -694,6 +704,11 @@ CaseFile::CloseIfSolved()
 		switch (VdevState()) {
 		case VDEV_STATE_HEALTHY:
 			/* No need to keep cases for healthy vdevs */
+		case VDEV_STATE_OFFLINE:
+			/*
+			 * Offline is a deliberate administrative action.  zfsd
+			 * doesn't need to do anything for this state.
+			 */
 			Close();
 			return (true);
 		case VDEV_STATE_REMOVED:
@@ -710,7 +725,6 @@ CaseFile::CloseIfSolved()
 			 */
 		case VDEV_STATE_UNKNOWN:
 		case VDEV_STATE_CLOSED:
-		case VDEV_STATE_OFFLINE:
 			/*
 			 * Keep open?  This may not be the correct behavior,
 			 * but it's what we've always done
diff --git a/cddl/usr.sbin/zfsd/zfsd_event.cc b/cddl/usr.sbin/zfsd/zfsd_event.cc
index 7a19b95abeed..afdabd99a8c3 100644
--- a/cddl/usr.sbin/zfsd/zfsd_event.cc
+++ b/cddl/usr.sbin/zfsd/zfsd_event.cc
@@ -355,6 +355,13 @@ ZfsEvent::Process() const
 
 	Vdev vdev(zpl.front(), vdevConfig);
 	caseFile = &CaseFile::Create(vdev);
+	if (caseFile->VdevState() == VDEV_STATE_OFFLINE) {
+		/*
+		 * An administrator did this deliberately.  It's not considered
+		 * an error that zfsd must fix.
+		 */
+		return (false);
+	}
 	if (caseFile->ReEvaluate(*this) == false) {
 		stringstream msg;
 		int priority = LOG_INFO;
diff --git a/tests/sys/cddl/zfs/tests/zfsd/Makefile b/tests/sys/cddl/zfs/tests/zfsd/Makefile
index e34e24b40906..588ca6e6c145 100644
--- a/tests/sys/cddl/zfs/tests/zfsd/Makefile
+++ b/tests/sys/cddl/zfs/tests/zfsd/Makefile
@@ -30,6 +30,8 @@ ${PACKAGE}FILES+=	zfsd_hotspare_006_pos.ksh
 ${PACKAGE}FILES+=	zfsd_hotspare_007_pos.ksh
 ${PACKAGE}FILES+=	zfsd_hotspare_008_neg.ksh
 ${PACKAGE}FILES+=	zfsd_import_001_pos.ksh
+${PACKAGE}FILES+=	zfsd_offline_001_neg.ksh
+${PACKAGE}FILES+=	zfsd_offline_002_neg.ksh
 ${PACKAGE}FILES+=	zfsd_replace_001_pos.ksh
 ${PACKAGE}FILES+=	zfsd_replace_002_pos.ksh
 ${PACKAGE}FILES+=	zfsd_replace_003_pos.ksh
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh
new file mode 100644
index 000000000000..de7996976504
--- /dev/null
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh
@@ -0,0 +1,64 @@
+#!/usr/local/bin/ksh93 -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2025 ConnectWise.  All rights reserved.
+# Use is subject to license terms.
+
+. $STF_SUITE/tests/hotspare/hotspare.kshlib
+
+verify_runnable "global"
+
+function cleanup
+{
+	$ZPOOL status $TESTPOOL
+	if poolexists $TESTPOOL ; then
+		destroy_pool $TESTPOOL
+	fi
+
+	partition_cleanup
+}
+
+function verify_assertion
+{
+	log_must $ZPOOL offline $TESTPOOL $FAULT_DISK
+
+	# Wait a few seconds before verifying the state
+	$SLEEP 10
+	log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE"
+}
+
+log_onexit cleanup
+
+log_assert "ZFSD will not automatically reactivate a disk which has been administratively offlined"
+
+ensure_zfsd_running
+
+typeset FAULT_DISK=$DISK0
+typeset POOLDEVS="$DISK0 $DISK1 $DISK2"
+set -A MY_KEYWORDS mirror raidz1
+for keyword in "${MY_KEYWORDS[@]}" ; do
+	log_must create_pool $TESTPOOL $keyword $POOLDEVS
+	verify_assertion
+
+	destroy_pool "$TESTPOOL"
+done
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh
new file mode 100644
index 000000000000..7d8dfc62d365
--- /dev/null
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh
@@ -0,0 +1,66 @@
+#!/usr/local/bin/ksh93 -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2025 ConnectWise.  All rights reserved.
+# Use is subject to license terms.
+
+. $STF_SUITE/tests/hotspare/hotspare.kshlib
+
+verify_runnable "global"
+
+function cleanup
+{
+	$ZPOOL status $TESTPOOL
+	if poolexists $TESTPOOL ; then
+		destroy_pool $TESTPOOL
+	fi
+
+	partition_cleanup
+}
+
+function verify_assertion
+{
+	log_must $ZPOOL offline $TESTPOOL $FAULT_DISK
+
+	# Wait a few seconds before verifying the state
+	$SLEEP 10
+	log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE"
+	log_must check_state $TESTPOOL "$SPARE_DISK" "AVAIL"
+}
+
+log_onexit cleanup
+
+log_assert "ZFSD will not automatically activate a spare when a disk has been administratively offlined"
+
+ensure_zfsd_running
+
+typeset FAULT_DISK=$DISK0
+typeset SPARE_DISK=$DISK3
+typeset POOLDEVS="$DISK0 $DISK1 $DISK2"
+set -A MY_KEYWORDS mirror raidz1
+for keyword in "${MY_KEYWORDS[@]}" ; do
+	log_must create_pool $TESTPOOL $keyword $POOLDEVS spare $SPARE_DISK
+	verify_assertion
+
+	destroy_pool "$TESTPOOL"
+done
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
index fe4ac4866ed3..b9924500a298 100755
--- a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
@@ -483,6 +483,64 @@ zfsd_autoreplace_003_pos_cleanup()
 	ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
 }
 
+atf_test_case zfsd_offline_001_neg cleanup
+zfsd_offline_001_neg_head()
+{
+	atf_set "descr" "ZFSD will not automatically reactivate a disk which has been administratively offlined"
+	atf_set "require.progs" "ksh93 zpool zfs"
+}
+zfsd_offline_001_neg_body()
+{
+	. $(atf_get_srcdir)/../../include/default.cfg
+	. $(atf_get_srcdir)/../hotspare/hotspare.cfg
+	. $(atf_get_srcdir)/zfsd.cfg
+
+	verify_disk_count "$DISKS" 3
+	verify_zfsd_running
+	ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"
+	ksh93 $(atf_get_srcdir)/zfsd_offline_001_neg.ksh
+	if [[ $? != 0 ]]; then
+		save_artifacts
+		atf_fail "Testcase failed"
+	fi
+}
+zfsd_offline_001_neg_cleanup()
+{
+	. $(atf_get_srcdir)/../../include/default.cfg
+	. $(atf_get_srcdir)/zfsd.cfg
+
+	ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
+}
+
+atf_test_case zfsd_offline_002_neg cleanup
+zfsd_offline_002_neg_head()
+{
+	atf_set "descr" "ZFSD will not automatically activate a spare when a disk has been administratively offlined"
+	atf_set "require.progs" "ksh93 zpool zfs"
+}
+zfsd_offline_002_neg_body()
+{
+	. $(atf_get_srcdir)/../../include/default.cfg
+	. $(atf_get_srcdir)/../hotspare/hotspare.cfg
+	. $(atf_get_srcdir)/zfsd.cfg
+
+	verify_disk_count "$DISKS" 4
+	verify_zfsd_running
+	ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"
+	ksh93 $(atf_get_srcdir)/zfsd_offline_002_neg.ksh
+	if [[ $? != 0 ]]; then
+		save_artifacts
+		atf_fail "Testcase failed"
+	fi
+}
+zfsd_offline_002_neg_cleanup()
+{
+	. $(atf_get_srcdir)/../../include/default.cfg
+	. $(atf_get_srcdir)/zfsd.cfg
+
+	ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
+}
+
 atf_test_case zfsd_replace_001_pos cleanup
 zfsd_replace_001_pos_head()
 {
@@ -676,6 +734,8 @@ atf_init_test_cases()
 	atf_add_test_case zfsd_autoreplace_001_neg
 	atf_add_test_case zfsd_autoreplace_002_pos
 	atf_add_test_case zfsd_autoreplace_003_pos
+	atf_add_test_case zfsd_offline_001_neg
+	atf_add_test_case zfsd_offline_002_neg
 	atf_add_test_case zfsd_replace_001_pos
 	atf_add_test_case zfsd_replace_002_pos
 	atf_add_test_case zfsd_replace_003_pos