git: d565784a7eba - main - zfsd: fault disks that generate too many I/O delay events
Date: Wed, 29 Nov 2023 14:51:15 UTC
The branch main has been updated by asomers:

URL: https://cgit.FreeBSD.org/src/commit/?id=d565784a7ebaa59e26febdcfd4a60329786ea5f5

commit d565784a7ebaa59e26febdcfd4a60329786ea5f5
Author:     Alan Somers <asomers@FreeBSD.org>
AuthorDate: 2023-07-12 20:46:27 +0000
Commit:     Alan Somers <asomers@FreeBSD.org>
CommitDate: 2023-11-29 14:51:11 +0000

    zfsd: fault disks that generate too many I/O delay events

    If ZFS reports that a disk had at least 8 I/O operations over 60s that
    were each delayed by at least 30s (implying a queue depth > 4 or I/O
    aggregation, obviously), fault that disk.  Disks that respond this
    slowly can degrade the entire system's performance.

    MFC after:      2 weeks
    Sponsored by:   Axcient
    Reviewed by:    delphij
    Differential Revision: https://reviews.freebsd.org/D42825
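As a back-of-the-envelope check on those thresholds (using the 30 s
zio_slow_io_ms default that the new test below refers to):

    8 slow I/Os x 30 s minimum delay each  = 240 s of cumulative in-flight delay
    240 s of delay / 60 s reporting window = 4 I/Os outstanding, on average

which is where the "queue depth > 4 or I/O aggregation" remark in the commit
message comes from.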
---
 cddl/usr.sbin/zfsd/case_file.cc                    | 20 ++++-
 cddl/usr.sbin/zfsd/case_file.h                     |  6 +-
 cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc          | 66 +++++++++++++++
 tests/sys/cddl/zfs/tests/zfsd/Makefile             |  1 +
 .../sys/cddl/zfs/tests/zfsd/zfsd_fault_002_pos.ksh | 99 ++++++++++++++++++++++
 tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh         | 32 +++++++
 6 files changed, 220 insertions(+), 4 deletions(-)

diff --git a/cddl/usr.sbin/zfsd/case_file.cc b/cddl/usr.sbin/zfsd/case_file.cc
index 3728913b646b..39f89fbbf7c8 100644
--- a/cddl/usr.sbin/zfsd/case_file.cc
+++ b/cddl/usr.sbin/zfsd/case_file.cc
@@ -470,7 +470,8 @@ CaseFile::ReEvaluate(const ZfsEvent &event)
 		consumed = true;
 	} else if (event.Value("class") == "ereport.fs.zfs.io" ||
-	    event.Value("class") == "ereport.fs.zfs.checksum") {
+	    event.Value("class") == "ereport.fs.zfs.checksum" ||
+	    event.Value("class") == "ereport.fs.zfs.delay") {
 
 		m_tentativeEvents.push_front(event.DeepCopy());
 		RegisterCallout(event);
@@ -1171,6 +1172,13 @@ IsIOEvent(const Event* const event)
 	return ("ereport.fs.zfs.io" == event->Value("type"));
 }
 
+/* Does the argument event refer to an IO delay? */
+static bool
+IsDelayEvent(const Event* const event)
+{
+	return ("ereport.fs.zfs.delay" == event->Value("type"));
+}
+
 bool
 CaseFile::ShouldDegrade() const
 {
@@ -1181,8 +1189,14 @@ CaseFile::ShouldDegrade() const
 bool
 CaseFile::ShouldFault() const
 {
-	return (std::count_if(m_events.begin(), m_events.end(),
-	    IsIOEvent) > ZFS_DEGRADE_IO_COUNT);
+	bool should_fault_for_io, should_fault_for_delay;
+
+	should_fault_for_io = std::count_if(m_events.begin(), m_events.end(),
+	    IsIOEvent) > ZFS_DEGRADE_IO_COUNT;
+	should_fault_for_delay = std::count_if(m_events.begin(), m_events.end(),
+	    IsDelayEvent) > ZFS_FAULT_DELAY_COUNT;
+
+	return (should_fault_for_io || should_fault_for_delay);
 }
 
 nvlist_t *
diff --git a/cddl/usr.sbin/zfsd/case_file.h b/cddl/usr.sbin/zfsd/case_file.h
index e6c7010af234..9566b1586ef5 100644
--- a/cddl/usr.sbin/zfsd/case_file.h
+++ b/cddl/usr.sbin/zfsd/case_file.h
@@ -242,7 +242,11 @@ protected:
 	 * to transition a vdev from healthy to degraded
 	 * status.
 	 */
-	ZFS_DEGRADE_IO_COUNT = 50
+	ZFS_DEGRADE_IO_COUNT = 50,
+	/**
+	 * The number of delay errors on a vdev required to fault it
+	 */
+	ZFS_FAULT_DELAY_COUNT = 8,
 };
 
 static CalloutFunc_t OnGracePeriodEnded;
diff --git a/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc b/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc
index caeb077a3de8..d76abb54c9ed 100644
--- a/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc
+++ b/cddl/usr.sbin/zfsd/tests/zfsd_unittest.cc
@@ -519,6 +519,72 @@ TEST_F(CaseFileTest, PoolDestroy)
 	EXPECT_TRUE(m_caseFile->ReEvaluate(*zfs_event));
 }
 
+/*
+ * A Vdev with a very large number of Delay errors should fault
+ * For performance reasons, RefreshVdevState should be called at most once
+ */
+TEST_F(CaseFileTest, VeryManyDelayErrors)
+{
+	EXPECT_CALL(*m_caseFile, RefreshVdevState())
+	    .Times(::testing::AtMost(1))
+	    .WillRepeatedly(::testing::Return(true));
+
+	for(int i=0; i<100; i++) {
+		stringstream evStringStream;
+		evStringStream <<
+		"!system=ZFS "
+		"class=ereport.fs.zfs.delay "
+		"ena=12091638756982918145 "
+		"parent_guid=13237004955564865395 "
+		"parent_type=raidz "
+		"pool=testpool.4415 "
+		"pool_context=0 "
+		"pool_failmode=wait "
+		"pool_guid=456 "
+		"pool_state= 0"
+		"subsystem=ZFS "
+		"time=";
+		evStringStream << i << "0000000000000000 ";
+		evStringStream << "timestamp=" << i << " ";
+		evStringStream <<
+		"type=ereport.fs.zfs.delay "
+		"vdev_ashift=12 "
+		"vdev_cksum_errors=0 "
+		"vdev_complete_ts=948336226469 "
+		"vdev_delays=77 "
+		"vdev_delta_ts=123998485899 "
+		"vdev_guid=123 "
+		"vdev_path=/dev/da400 "
+		"vdev_read_errors=0 "
+		"vdev_spare_guids= "
+		"vdev_type=disk "
+		"vdev_write_errors=0 "
+		"zio_blkid=622 "
+		"zio_delay=31000041101 "
+		"zio_delta=123998485899 "
+		"zio_err=0 "
+		"zio_flags=1572992 "
+		"zio_level=-2 "
+		"zio_object=0 "
+		"zio_objset=37 "
+		"zio_offset=25598976 "
+		"zio_pipeline=48234496 "
+		"zio_priority=3 "
+		"zio_size=1024"
+		"zio_stage=33554432 "
+		"zio_timestamp=824337740570 ";
+		Event *event(Event::CreateEvent(*m_eventFactory,
+		    evStringStream.str()));
+		ZfsEvent *zfs_event = static_cast<ZfsEvent*>(event);
+		EXPECT_TRUE(m_caseFile->ReEvaluate(*zfs_event));
+		delete event;
+	}
+
+	m_caseFile->SpliceEvents();
+	EXPECT_FALSE(m_caseFile->ShouldDegrade());
+	EXPECT_TRUE(m_caseFile->ShouldFault());
+}
+
 /*
  * A Vdev with a very large number of IO errors should fault
  * For performance reasons, RefreshVdevState should be called at most once
diff --git a/tests/sys/cddl/zfs/tests/zfsd/Makefile b/tests/sys/cddl/zfs/tests/zfsd/Makefile
index a1863a03196c..7d3f29a7359e 100644
--- a/tests/sys/cddl/zfs/tests/zfsd/Makefile
+++ b/tests/sys/cddl/zfs/tests/zfsd/Makefile
@@ -21,6 +21,7 @@ ${PACKAGE}FILES+= zfsd_autoreplace_003_pos.ksh
 ${PACKAGE}FILES+= zfsd_degrade_001_pos.ksh
 ${PACKAGE}FILES+= zfsd_degrade_002_pos.ksh
 ${PACKAGE}FILES+= zfsd_fault_001_pos.ksh
+${PACKAGE}FILES+= zfsd_fault_002_pos.ksh
 ${PACKAGE}FILES+= zfsd_hotspare_001_pos.ksh
 ${PACKAGE}FILES+= zfsd_hotspare_002_pos.ksh
 ${PACKAGE}FILES+= zfsd_hotspare_003_pos.ksh
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_fault_002_pos.ksh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_fault_002_pos.ksh
new file mode 100644
index 000000000000..d8de1ceaff4b
--- /dev/null
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_fault_002_pos.ksh
@@ -0,0 +1,99 @@
+#!/usr/local/bin/ksh93 -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2023 Axcient.  All rights reserved.
+# Use is subject to license terms.
+#
+# $FreeBSD$
+
+. $STF_SUITE/include/libtest.kshlib
+. $STF_SUITE/include/libgnop.kshlib
+
+################################################################################
+#
+# __stc_assertion_start
+#
+# ID: zfsd_fault_002_pos
+#
+# DESCRIPTION:
+#   If a vdev experiences delayed I/O, it will become faulted.
+#
+#
+# STRATEGY:
+#   1. Create a storage pool.  Use gnop vdevs so we can inject I/O delays.
+#   2. Inject IO delays while doing IO to the pool.
+#   3. Verify that the vdev becomes FAULTED.
+#   4. ONLINE it and verify that it resilvers and joins the pool.
+#
+# TESTABILITY: explicit
+#
+# TEST_AUTOMATION_LEVEL: automated
+#
+# __stc_assertion_end
+#
+###############################################################################
+
+verify_runnable "global"
+
+log_assert "ZFS will fault a vdev that experiences delayed I/O"
+
+ensure_zfsd_running
+
+DISK0_NOP=${DISK0}.nop
+DISK1_NOP=${DISK1}.nop
+
+log_must create_gnops $DISK0 $DISK1
+
+for type in "raidz" "mirror"; do
+	log_note "Testing raid type $type"
+
+	# Create a pool on the supplied disks
+	create_pool $TESTPOOL $type "$DISK0_NOP" "$DISK1_NOP"
+	log_must $ZFS create $TESTPOOL/$TESTFS
+
+	# Cause some IO delays writing to the pool
+	while true; do
+		# ZFS currently considers an I/O to be "slow" if it's delayed
+		# for 30 seconds (zio_slow_io_ms).
+		log_must gnop configure -d 31000 -x 100 "$DISK1_NOP"
+		$DD if=/dev/zero bs=128k count=1 >> \
+			/$TESTPOOL/$TESTFS/$TESTFILE 2> /dev/null
+		$FSYNC /$TESTPOOL/$TESTFS/$TESTFILE
+		# Check to see if the pool is faulted yet
+		$ZPOOL status $TESTPOOL | grep -q 'state: DEGRADED'
+		if [ $? == 0 ]
+		then
+			log_note "$TESTPOOL got degraded"
+			$ZPOOL status -s $TESTPOOL
+			break
+		fi
+	done
+
+	log_must check_state $TESTPOOL $TMPDISK "FAULTED"
+
+	log_must gnop configure -x 0 "$DISK1_NOP"
+	destroy_pool $TESTPOOL
+	log_must $RM -rf /$TESTPOOL
+done
+
+log_pass
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
index de9bd7bc94ed..fe4ac4866ed3 100755
--- a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
@@ -56,6 +56,37 @@ zfsd_fault_001_pos_cleanup()
 }
 
+atf_test_case zfsd_fault_002_pos cleanup
+zfsd_fault_002_pos_head()
+{
+	atf_set "descr" "ZFS will fault a vdev that experiences delayed I/O"
+	atf_set "require.progs" "ksh93 gnop zfs zpool zfsd"
+	atf_set "timeout" 300
+}
+zfsd_fault_002_pos_body()
+{
+	. $(atf_get_srcdir)/../../include/default.cfg
+	. $(atf_get_srcdir)/../hotspare/hotspare.kshlib
+	. $(atf_get_srcdir)/../hotspare/hotspare.cfg
+	. $(atf_get_srcdir)/zfsd.cfg
+
+	verify_disk_count "$DISKS" 2
+	verify_zfsd_running
+	ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"
+	ksh93 $(atf_get_srcdir)/zfsd_fault_002_pos.ksh
+	if [[ $? != 0 ]]; then
+		save_artifacts
+		atf_fail "Testcase failed"
+	fi
+}
+zfsd_fault_002_pos_cleanup()
+{
+	. $(atf_get_srcdir)/../../include/default.cfg
+	. $(atf_get_srcdir)/zfsd.cfg
+
+	ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
+}
+
 atf_test_case zfsd_degrade_001_pos cleanup
 zfsd_degrade_001_pos_head()
 {
@@ -631,6 +662,7 @@ zfsd_import_001_pos_cleanup()
 atf_init_test_cases()
 {
 	atf_add_test_case zfsd_fault_001_pos
+	atf_add_test_case zfsd_fault_002_pos
 	atf_add_test_case zfsd_degrade_001_pos
 	atf_add_test_case zfsd_degrade_002_pos
 	atf_add_test_case zfsd_hotspare_001_pos