git: 4171bf5905a9 - stable/14 - zfsd: don't try to fix an OFFLINE condition
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Sat, 27 Sep 2025 14:27:48 UTC
The branch stable/14 has been updated by asomers:
URL: https://cgit.FreeBSD.org/src/commit/?id=4171bf5905a95b8fcfd3bb6bb6ede2fe543db092
commit 4171bf5905a95b8fcfd3bb6bb6ede2fe543db092
Author: Alan Somers <asomers@FreeBSD.org>
AuthorDate: 2025-05-02 12:45:32 +0000
Commit: Alan Somers <asomers@FreeBSD.org>
CommitDate: 2025-09-27 14:25:04 +0000
zfsd: don't try to fix an OFFLINE condition
If the system administrator does "zpool offline", he's doing it for a
reason. zfsd shouldn't consider an offline disk to be an event that
requires automatic healing. Don't online it in response to a GEOM
event, and don't try to activate a hotspare to take over from it.
Sponsored by: ConnectWise
(cherry picked from commit a13ddd6210f349dec341eba0506b5f2395a7a2d6)
---
cddl/usr.sbin/zfsd/case_file.cc | 18 +++++-
cddl/usr.sbin/zfsd/zfsd_event.cc | 7 +++
tests/sys/cddl/zfs/tests/zfsd/Makefile | 2 +
.../cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh | 64 +++++++++++++++++++++
.../cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh | 66 ++++++++++++++++++++++
tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh | 60 ++++++++++++++++++++
6 files changed, 215 insertions(+), 2 deletions(-)
diff --git a/cddl/usr.sbin/zfsd/case_file.cc b/cddl/usr.sbin/zfsd/case_file.cc
index 7adfb08b75c6..852767aeb227 100644
--- a/cddl/usr.sbin/zfsd/case_file.cc
+++ b/cddl/usr.sbin/zfsd/case_file.cc
@@ -299,6 +299,15 @@ CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
PoolGUIDString().c_str(), VdevGUIDString().c_str());
return (/*consumed*/false);
}
+ if (VdevState() == VDEV_STATE_OFFLINE) {
+ /*
+ * OFFLINE is an administrative decision. No need for zfsd to
+ * do anything.
+ */
+ syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
+ PoolGUIDString().c_str(), VdevGUIDString().c_str());
+ return (/*consumed*/false);
+ }
if (vdev != NULL
&& ( vdev->PoolGUID() == m_poolGUID
@@ -401,7 +410,8 @@ CaseFile::ReEvaluate(const ZfsEvent &event)
return (/*consumed*/true);
} else if (event.Value("type") == "sysevent.fs.zfs.config_sync") {
RefreshVdevState();
- if (VdevState() < VDEV_STATE_HEALTHY)
+ if (VdevState() < VDEV_STATE_HEALTHY &&
+ VdevState() != VDEV_STATE_OFFLINE)
consumed = ActivateSpare();
}
@@ -694,6 +704,11 @@ CaseFile::CloseIfSolved()
switch (VdevState()) {
case VDEV_STATE_HEALTHY:
/* No need to keep cases for healthy vdevs */
+ case VDEV_STATE_OFFLINE:
+ /*
+ * Offline is a deliberate administrative action. zfsd
+ * doesn't need to do anything for this state.
+ */
Close();
return (true);
case VDEV_STATE_REMOVED:
@@ -710,7 +725,6 @@ CaseFile::CloseIfSolved()
*/
case VDEV_STATE_UNKNOWN:
case VDEV_STATE_CLOSED:
- case VDEV_STATE_OFFLINE:
/*
* Keep open? This may not be the correct behavior,
* but it's what we've always done
diff --git a/cddl/usr.sbin/zfsd/zfsd_event.cc b/cddl/usr.sbin/zfsd/zfsd_event.cc
index 7a19b95abeed..afdabd99a8c3 100644
--- a/cddl/usr.sbin/zfsd/zfsd_event.cc
+++ b/cddl/usr.sbin/zfsd/zfsd_event.cc
@@ -355,6 +355,13 @@ ZfsEvent::Process() const
Vdev vdev(zpl.front(), vdevConfig);
caseFile = &CaseFile::Create(vdev);
+ if (caseFile->VdevState() == VDEV_STATE_OFFLINE) {
+ /*
+ * An administrator did this deliberately. It's not considered
+ * an error that zfsd must fix.
+ */
+ return (false);
+ }
if (caseFile->ReEvaluate(*this) == false) {
stringstream msg;
int priority = LOG_INFO;
diff --git a/tests/sys/cddl/zfs/tests/zfsd/Makefile b/tests/sys/cddl/zfs/tests/zfsd/Makefile
index 7d3f29a7359e..b9ba49b60214 100644
--- a/tests/sys/cddl/zfs/tests/zfsd/Makefile
+++ b/tests/sys/cddl/zfs/tests/zfsd/Makefile
@@ -31,6 +31,8 @@ ${PACKAGE}FILES+= zfsd_hotspare_006_pos.ksh
${PACKAGE}FILES+= zfsd_hotspare_007_pos.ksh
${PACKAGE}FILES+= zfsd_hotspare_008_neg.ksh
${PACKAGE}FILES+= zfsd_import_001_pos.ksh
+${PACKAGE}FILES+= zfsd_offline_001_neg.ksh
+${PACKAGE}FILES+= zfsd_offline_002_neg.ksh
${PACKAGE}FILES+= zfsd_replace_001_pos.ksh
${PACKAGE}FILES+= zfsd_replace_002_pos.ksh
${PACKAGE}FILES+= zfsd_replace_003_pos.ksh
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh
new file mode 100644
index 000000000000..de7996976504
--- /dev/null
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh
@@ -0,0 +1,64 @@
+#!/usr/local/bin/ksh93 -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2025 ConnectWise. All rights reserved.
+# Use is subject to license terms.
+
+. $STF_SUITE/tests/hotspare/hotspare.kshlib
+
+verify_runnable "global"
+
+function cleanup
+{
+ $ZPOOL status $TESTPOOL
+ if poolexists $TESTPOOL ; then
+ destroy_pool $TESTPOOL
+ fi
+
+ partition_cleanup
+}
+
+function verify_assertion
+{
+ log_must $ZPOOL offline $TESTPOOL $FAULT_DISK
+
+ # Wait a few seconds before verifying the state
+ $SLEEP 10
+ log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE"
+}
+
+log_onexit cleanup
+
+log_assert "ZFSD will not automatically reactivate a disk which has been administratively offlined"
+
+ensure_zfsd_running
+
+typeset FAULT_DISK=$DISK0
+typeset POOLDEVS="$DISK0 $DISK1 $DISK2"
+set -A MY_KEYWORDS mirror raidz1
+for keyword in "${MY_KEYWORDS[@]}" ; do
+ log_must create_pool $TESTPOOL $keyword $POOLDEVS
+ verify_assertion
+
+ destroy_pool "$TESTPOOL"
+done
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh
new file mode 100644
index 000000000000..7d8dfc62d365
--- /dev/null
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh
@@ -0,0 +1,66 @@
+#!/usr/local/bin/ksh93 -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2025 ConnectWise. All rights reserved.
+# Use is subject to license terms.
+
+. $STF_SUITE/tests/hotspare/hotspare.kshlib
+
+verify_runnable "global"
+
+function cleanup
+{
+ $ZPOOL status $TESTPOOL
+ if poolexists $TESTPOOL ; then
+ destroy_pool $TESTPOOL
+ fi
+
+ partition_cleanup
+}
+
+function verify_assertion
+{
+ log_must $ZPOOL offline $TESTPOOL $FAULT_DISK
+
+ # Wait a few seconds before verifying the state
+ $SLEEP 10
+ log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE"
+ log_must check_state $TESTPOOL "$SPARE_DISK" "AVAIL"
+}
+
+log_onexit cleanup
+
+log_assert "ZFSD will not automatically activate a spare when a disk has been administratively offlined"
+
+ensure_zfsd_running
+
+typeset FAULT_DISK=$DISK0
+typeset SPARE_DISK=$DISK3
+typeset POOLDEVS="$DISK0 $DISK1 $DISK2"
+set -A MY_KEYWORDS mirror raidz1
+for keyword in "${MY_KEYWORDS[@]}" ; do
+ log_must create_pool $TESTPOOL $keyword $POOLDEVS spare $SPARE_DISK
+ verify_assertion
+
+ destroy_pool "$TESTPOOL"
+done
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
index fe4ac4866ed3..b9924500a298 100755
--- a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
@@ -483,6 +483,64 @@ zfsd_autoreplace_003_pos_cleanup()
ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
}
+atf_test_case zfsd_offline_001_neg cleanup
+zfsd_offline_001_neg_head()
+{
+ atf_set "descr" "ZFSD will not automatically reactivate a disk which has been administratively offlined"
+ atf_set "require.progs" "ksh93 zpool zfs"
+}
+zfsd_offline_001_neg_body()
+{
+ . $(atf_get_srcdir)/../../include/default.cfg
+ . $(atf_get_srcdir)/../hotspare/hotspare.cfg
+ . $(atf_get_srcdir)/zfsd.cfg
+
+ verify_disk_count "$DISKS" 3
+ verify_zfsd_running
+ ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"
+ ksh93 $(atf_get_srcdir)/zfsd_offline_001_neg.ksh
+ if [[ $? != 0 ]]; then
+ save_artifacts
+ atf_fail "Testcase failed"
+ fi
+}
+zfsd_offline_001_neg_cleanup()
+{
+ . $(atf_get_srcdir)/../../include/default.cfg
+ . $(atf_get_srcdir)/zfsd.cfg
+
+ ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
+}
+
+atf_test_case zfsd_offline_002_neg cleanup
+zfsd_offline_002_neg_head()
+{
+ atf_set "descr" "ZFSD will not automatically activate a spare when a disk has been administratively offlined"
+ atf_set "require.progs" "ksh93 zpool zfs"
+}
+zfsd_offline_002_neg_body()
+{
+ . $(atf_get_srcdir)/../../include/default.cfg
+ . $(atf_get_srcdir)/../hotspare/hotspare.cfg
+ . $(atf_get_srcdir)/zfsd.cfg
+
+ verify_disk_count "$DISKS" 4
+ verify_zfsd_running
+ ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"
+ ksh93 $(atf_get_srcdir)/zfsd_offline_002_neg.ksh
+ if [[ $? != 0 ]]; then
+ save_artifacts
+ atf_fail "Testcase failed"
+ fi
+}
+zfsd_offline_002_neg_cleanup()
+{
+ . $(atf_get_srcdir)/../../include/default.cfg
+ . $(atf_get_srcdir)/zfsd.cfg
+
+ ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
+}
+
atf_test_case zfsd_replace_001_pos cleanup
zfsd_replace_001_pos_head()
{
@@ -676,6 +734,8 @@ atf_init_test_cases()
atf_add_test_case zfsd_autoreplace_001_neg
atf_add_test_case zfsd_autoreplace_002_pos
atf_add_test_case zfsd_autoreplace_003_pos
+ atf_add_test_case zfsd_offline_001_neg
+ atf_add_test_case zfsd_offline_002_neg
atf_add_test_case zfsd_replace_001_pos
atf_add_test_case zfsd_replace_002_pos
atf_add_test_case zfsd_replace_003_pos