svn commit: r339034 - in stable/11: cddl/contrib/opensolaris/cmd/zdb cddl/contrib/opensolaris/cmd/zpool cddl/contrib/opensolaris/cmd/ztest cddl/contrib/opensolaris/lib/libzfs/common cddl/contrib/op...
Sean Eric Fagan
sef at FreeBSD.org
Mon Oct 1 04:08:51 UTC 2018
Author: sef
Date: Mon Oct 1 04:08:47 2018
New Revision: 339034
URL: https://svnweb.freebsd.org/changeset/base/339034
Log:
MFC r334844, r336180, r336458
r334844
This originated from ZFS On Linux, as
https://github.com/zfsonlinux/zfs/commit/d4a72f23863382bdf6d0ae33196f5b5decbc48fd
During scans (scrubs or resilvers), it sorts the blocks in each transaction
group by block offset; the result can be a significant improvement. (On my
test system just now, where I have put some effort into introducing fragmentation
into the pool since I set it up yesterday, a scrub went from 1h2m to 33.5m with the
changes.) I've seen similar ratios on production systems.
r336180
Fix up some missed and mis-merges from the sequential scan code
(r334844). Most of the changes involve moving some code around to
reduce conflicts with future merges. One of the missing changes
included a notification on scrub cancellation.
r336458
Fix a couple of typos in r334844 noticed by Richard Kojedzinszky
Approved by: mav
Sponsored by: iXsystems, Inc
Modified:
stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c
stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c
stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
stable/11/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
stable/11/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
stable/11/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_indirect.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
stable/11/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
Directory Properties:
stable/11/ (props changed)
Modified: stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/cddl/contrib/opensolaris/cmd/zdb/zdb.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -2281,14 +2281,14 @@ dump_dir(objset_t *os)
object_count++;
}
- ASSERT3U(object_count, ==, usedobjs);
-
(void) printf("\n");
if (error != ESRCH) {
(void) fprintf(stderr, "dmu_object_next() = %d\n", error);
abort();
}
+
+ ASSERT3U(object_count, ==, usedobjs);
}
static void
@@ -2788,6 +2788,7 @@ zdb_blkptr_done(zio_t *zio)
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_inflight--;
+ spa->spa_load_verify_ios--;
cv_broadcast(&spa->spa_scrub_io_cv);
if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@@ -2859,9 +2860,10 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr
flags |= ZIO_FLAG_SPECULATIVE;
mutex_enter(&spa->spa_scrub_lock);
- while (spa->spa_scrub_inflight > max_inflight)
+ while (spa->spa_load_verify_ios > max_inflight)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
spa->spa_scrub_inflight++;
+ spa->spa_load_verify_ios++;
mutex_exit(&spa->spa_scrub_lock);
zio_nowait(zio_read(NULL, spa, bp, abd, size,
Modified: stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c
==============================================================================
--- stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -1643,7 +1643,7 @@ print_status_config(zpool_handle_t *zhp, const char *n
(void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
(uint64_t **)&ps, &c);
- if (ps && ps->pss_state == DSS_SCANNING &&
+ if (ps != NULL && ps->pss_state == DSS_SCANNING &&
vs->vs_scan_processed != 0 && children == 0) {
(void) printf(gettext(" (%s)"),
(ps->pss_func == POOL_SCAN_RESILVER) ?
@@ -4254,11 +4254,13 @@ static void
print_scan_status(pool_scan_stat_t *ps)
{
time_t start, end, pause;
- uint64_t elapsed, mins_left, hours_left;
- uint64_t pass_exam, examined, total;
- uint_t rate;
+ uint64_t total_secs_left;
+ uint64_t elapsed, secs_left, mins_left, hours_left, days_left;
+ uint64_t pass_scanned, scanned, pass_issued, issued, total;
+ uint_t scan_rate, issue_rate;
double fraction_done;
- char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
+ char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
+ char srate_buf[7], irate_buf[7];
(void) printf(gettext(" scan: "));
@@ -4272,30 +4274,37 @@ print_scan_status(pool_scan_stat_t *ps)
start = ps->pss_start_time;
end = ps->pss_end_time;
pause = ps->pss_pass_scrub_pause;
+
zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf));
assert(ps->pss_func == POOL_SCAN_SCRUB ||
ps->pss_func == POOL_SCAN_RESILVER);
- /*
- * Scan is finished or canceled.
- */
- if (ps->pss_state == DSS_FINISHED) {
- uint64_t minutes_taken = (end - start) / 60;
- char *fmt = NULL;
+ /* Scan is finished or canceled. */
+ if (ps->pss_state == DSS_FINISHED) {
+ total_secs_left = end - start;
+ days_left = total_secs_left / 60 / 60 / 24;
+ hours_left = (total_secs_left / 60 / 60) % 24;
+ mins_left = (total_secs_left / 60) % 60;
+ secs_left = (total_secs_left % 60);
+
if (ps->pss_func == POOL_SCAN_SCRUB) {
- fmt = gettext("scrub repaired %s in %lluh%um with "
- "%llu errors on %s");
+ (void) printf(gettext("scrub repaired %s "
+ "in %llu days %02llu:%02llu:%02llu "
+ "with %llu errors on %s"), processed_buf,
+ (u_longlong_t)days_left, (u_longlong_t)hours_left,
+ (u_longlong_t)mins_left, (u_longlong_t)secs_left,
+ (u_longlong_t)ps->pss_errors, ctime(&end));
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
- fmt = gettext("resilvered %s in %lluh%um with "
- "%llu errors on %s");
+ (void) printf(gettext("resilvered %s "
+ "in %llu days %02llu:%02llu:%02llu "
+ "with %llu errors on %s"), processed_buf,
+ (u_longlong_t)days_left, (u_longlong_t)hours_left,
+ (u_longlong_t)mins_left, (u_longlong_t)secs_left,
+ (u_longlong_t)ps->pss_errors, ctime(&end));
+
}
- /* LINTED */
- (void) printf(fmt, processed_buf,
- (u_longlong_t)(minutes_taken / 60),
- (uint_t)(minutes_taken % 60),
- (u_longlong_t)ps->pss_errors,
- ctime((time_t *)&end));
+
return;
} else if (ps->pss_state == DSS_CANCELED) {
if (ps->pss_func == POOL_SCAN_SCRUB) {
@@ -4310,19 +4319,15 @@ print_scan_status(pool_scan_stat_t *ps)
assert(ps->pss_state == DSS_SCANNING);
- /*
- * Scan is in progress.
- */
+ /* Scan is in progress. Resilvers can't be paused. */
if (ps->pss_func == POOL_SCAN_SCRUB) {
if (pause == 0) {
(void) printf(gettext("scrub in progress since %s"),
ctime(&start));
} else {
- char buf[32];
- struct tm *p = localtime(&pause);
- (void) strftime(buf, sizeof (buf), "%a %b %e %T %Y", p);
- (void) printf(gettext("scrub paused since %s\n"), buf);
- (void) printf(gettext("\tscrub started on %s"),
+ (void) printf(gettext("scrub paused since %s"),
+ ctime(&pause));
+ (void) printf(gettext("\tscrub started on %s"),
ctime(&start));
}
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
@@ -4330,49 +4335,67 @@ print_scan_status(pool_scan_stat_t *ps)
ctime(&start));
}
- examined = ps->pss_examined ? ps->pss_examined : 1;
+ scanned = ps->pss_examined;
+ pass_scanned = ps->pss_pass_exam;
+ issued = ps->pss_issued;
+ pass_issued = ps->pss_pass_issued;
total = ps->pss_to_examine;
- fraction_done = (double)examined / total;
- /* elapsed time for this pass */
+ /* we are only done with a block once we have issued the IO for it */
+ fraction_done = (double)issued / total;
+
+ /* elapsed time for this pass, rounding up to 1 if it's 0 */
elapsed = time(NULL) - ps->pss_pass_start;
elapsed -= ps->pss_pass_scrub_spent_paused;
- elapsed = elapsed ? elapsed : 1;
- pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
- rate = pass_exam / elapsed;
- rate = rate ? rate : 1;
- mins_left = ((total - examined) / rate) / 60;
- hours_left = mins_left / 60;
+ elapsed = (elapsed != 0) ? elapsed : 1;
- zfs_nicenum(examined, examined_buf, sizeof (examined_buf));
+ scan_rate = pass_scanned / elapsed;
+ issue_rate = pass_issued / elapsed;
+ total_secs_left = (issue_rate != 0) ?
+ ((total - issued) / issue_rate) : UINT64_MAX;
+
+ days_left = total_secs_left / 60 / 60 / 24;
+ hours_left = (total_secs_left / 60 / 60) % 24;
+ mins_left = (total_secs_left / 60) % 60;
+ secs_left = (total_secs_left % 60);
+
+ /* format all of the numbers we will be reporting */
+ zfs_nicenum(scanned, scanned_buf, sizeof (scanned_buf));
+ zfs_nicenum(issued, issued_buf, sizeof (issued_buf));
zfs_nicenum(total, total_buf, sizeof (total_buf));
+ zfs_nicenum(scan_rate, srate_buf, sizeof (srate_buf));
+ zfs_nicenum(issue_rate, irate_buf, sizeof (irate_buf));
- /*
- * do not print estimated time if hours_left is more than 30 days
- * or we have a paused scrub
- */
+ /* doo not print estimated time if we have a paused scrub */
if (pause == 0) {
- zfs_nicenum(rate, rate_buf, sizeof (rate_buf));
- (void) printf(gettext("\t%s scanned out of %s at %s/s"),
- examined_buf, total_buf, rate_buf);
- if (hours_left < (30 * 24)) {
- (void) printf(gettext(", %lluh%um to go\n"),
- (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
- } else {
- (void) printf(gettext(
- ", (scan is slow, no estimated time)\n"));
- }
+ (void) printf(gettext("\t%s scanned at %s/s, "
+ "%s issued at %s/s, %s total\n"),
+ scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
} else {
- (void) printf(gettext("\t%s scanned out of %s\n"),
- examined_buf, total_buf);
+ (void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
+ scanned_buf, issued_buf, total_buf);
}
if (ps->pss_func == POOL_SCAN_RESILVER) {
- (void) printf(gettext(" %s resilvered, %.2f%% done\n"),
+ (void) printf(gettext("\t%s resilvered, %.2f%% done"),
processed_buf, 100 * fraction_done);
} else if (ps->pss_func == POOL_SCAN_SCRUB) {
- (void) printf(gettext(" %s repaired, %.2f%% done\n"),
+ (void) printf(gettext("\t%s repaired, %.2f%% done"),
processed_buf, 100 * fraction_done);
+ }
+
+ if (pause == 0) {
+ if (issue_rate >= 10 * 1024 * 1024) {
+ (void) printf(gettext(", %llu days "
+ "%02llu:%02llu:%02llu to go\n"),
+ (u_longlong_t)days_left, (u_longlong_t)hours_left,
+ (u_longlong_t)mins_left, (u_longlong_t)secs_left);
+ } else {
+ (void) printf(gettext(", no estimated "
+ "completion time\n"));
+ }
+ } else {
+ (void) printf(gettext("\n"));
}
}
Modified: stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c
==============================================================================
--- stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/cddl/contrib/opensolaris/cmd/ztest/ztest.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -374,15 +374,15 @@ ztest_info_t ztest_info[] = {
{ ztest_fzap, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes },
{ ztest_spa_create_destroy, 1, &zopt_sometimes },
- { ztest_fault_inject, 1, &zopt_sometimes },
+ { ztest_fault_inject, 1, &zopt_incessant },
{ ztest_ddt_repair, 1, &zopt_sometimes },
{ ztest_dmu_snapshot_hold, 1, &zopt_sometimes },
{ ztest_reguid, 1, &zopt_rarely },
{ ztest_spa_rename, 1, &zopt_rarely },
- { ztest_scrub, 1, &zopt_rarely },
+ { ztest_scrub, 1, &zopt_often },
{ ztest_spa_upgrade, 1, &zopt_rarely },
{ ztest_dsl_dataset_promote_busy, 1, &zopt_rarely },
- { ztest_vdev_attach_detach, 1, &zopt_sometimes },
+ { ztest_vdev_attach_detach, 1, &zopt_incessant },
{ ztest_vdev_LUN_growth, 1, &zopt_rarely },
{ ztest_vdev_add_remove, 1,
&ztest_opts.zo_vdevtime },
Modified: stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c
==============================================================================
--- stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -219,7 +219,7 @@ check_status(nvlist_t *config, boolean_t isimport)
*/
(void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
(uint64_t **)&ps, &psc);
- if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
+ if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER &&
ps->pss_state == DSS_SCANNING)
return (ZPOOL_STATUS_RESILVERING);
Modified: stable/11/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h
==============================================================================
--- stable/11/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h Mon Oct 1 04:08:47 2018 (r339034)
@@ -408,6 +408,7 @@ typedef struct taskq_ent {
#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */
#define TQ_FRONT 0x08 /* Queue in front */
+#define TASKQID_INVALID ((taskqid_t)0)
extern taskq_t *system_taskq;
@@ -421,6 +422,7 @@ extern void taskq_dispatch_ent(taskq_t *, task_func_t,
taskq_ent_t *);
extern void taskq_destroy(taskq_t *);
extern void taskq_wait(taskq_t *);
+extern void taskq_wait_id(taskq_t *, taskqid_t);
extern int taskq_member(taskq_t *, void *);
extern void system_taskq_init(void);
extern void system_taskq_fini(void);
Modified: stable/11/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c
==============================================================================
--- stable/11/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/cddl/contrib/opensolaris/lib/libzpool/common/taskq.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -187,6 +187,12 @@ taskq_wait(taskq_t *tq)
mutex_exit(&tq->tq_lock);
}
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+ taskq_wait(tq);
+}
+
static void *
taskq_thread(void *arg)
{
Modified: stable/11/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c
==============================================================================
--- stable/11/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -173,3 +173,9 @@ taskq_wait(taskq_t *tq)
{
taskqueue_drain_all(tq->tq_queue);
}
+
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+ taskq_wait(tq);
+}
Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -339,7 +339,8 @@ int arc_no_grow_shift = 5;
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
-static int arc_min_prefetch_lifespan;
+static int zfs_arc_min_prefetch_ms = 1;
+static int zfs_arc_min_prescient_prefetch_ms = 6;
/*
* If this percent of memory is free, don't throttle.
@@ -783,8 +784,9 @@ typedef struct arc_stats {
kstat_named_t arcstat_meta_limit;
kstat_named_t arcstat_meta_max;
kstat_named_t arcstat_meta_min;
- kstat_named_t arcstat_sync_wait_for_async;
+ kstat_named_t arcstat_async_upgrade_sync;
kstat_named_t arcstat_demand_hit_predictive_prefetch;
+ kstat_named_t arcstat_demand_hit_prescient_prefetch;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -881,8 +883,9 @@ static arc_stats_t arc_stats = {
{ "arc_meta_limit", KSTAT_DATA_UINT64 },
{ "arc_meta_max", KSTAT_DATA_UINT64 },
{ "arc_meta_min", KSTAT_DATA_UINT64 },
- { "sync_wait_for_async", KSTAT_DATA_UINT64 },
+ { "async_upgrade_sync", KSTAT_DATA_UINT64 },
{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+ { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -978,22 +981,23 @@ typedef struct arc_callback arc_callback_t;
struct arc_callback {
void *acb_private;
- arc_done_func_t *acb_done;
+ arc_read_done_func_t *acb_done;
arc_buf_t *acb_buf;
boolean_t acb_compressed;
zio_t *acb_zio_dummy;
+ zio_t *acb_zio_head;
arc_callback_t *acb_next;
};
typedef struct arc_write_callback arc_write_callback_t;
struct arc_write_callback {
- void *awcb_private;
- arc_done_func_t *awcb_ready;
- arc_done_func_t *awcb_children_ready;
- arc_done_func_t *awcb_physdone;
- arc_done_func_t *awcb_done;
- arc_buf_t *awcb_buf;
+ void *awcb_private;
+ arc_write_done_func_t *awcb_ready;
+ arc_write_done_func_t *awcb_children_ready;
+ arc_write_done_func_t *awcb_physdone;
+ arc_write_done_func_t *awcb_done;
+ arc_buf_t *awcb_buf;
};
/*
@@ -1233,6 +1237,8 @@ sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define HDR_PRESCIENT_PREFETCH(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
#define HDR_COMPRESSION_ENABLED(hdr) \
((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
@@ -1396,6 +1402,11 @@ SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize,
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
&ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prefetch_ms, CTLFLAG_RW,
+ &zfs_arc_min_prefetch_ms, 0, "Min life of prefetch block in ms");
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, arc_min_prescient_prefetch_ms, CTLFLAG_RW,
+ &zfs_arc_min_prescient_prefetch_ms, 0, "Min life of prescient prefetched block in ms");
+
/*
* L2ARC Internals
*/
@@ -3548,6 +3559,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
arc_state_t *evicted_state, *state;
int64_t bytes_evicted = 0;
+ int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+ zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
ASSERT(MUTEX_HELD(hash_lock));
ASSERT(HDR_HAS_L1HDR(hdr));
@@ -3600,8 +3613,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
/* prefetch buffers have a minimum lifespan */
if (HDR_IO_IN_PROGRESS(hdr) ||
((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
- ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
- arc_min_prefetch_lifespan)) {
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
ARCSTAT_BUMP(arcstat_evict_skip);
return (bytes_evicted);
}
@@ -4997,13 +5009,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* - move the buffer to the head of the list if this is
* another prefetch (to make it less likely to be evicted).
*/
- if (HDR_PREFETCH(hdr)) {
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
/* link protected by hash lock */
ASSERT(multilist_link_active(
&hdr->b_l1hdr.b_arc_node));
} else {
- arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
ARCSTAT_BUMP(arcstat_mru_hits);
}
hdr->b_l1hdr.b_arc_access = now;
@@ -5034,10 +5048,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* MFU state.
*/
- if (HDR_PREFETCH(hdr)) {
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
new_state = arc_mru;
- if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
- arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ }
DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
} else {
new_state = arc_mfu;
@@ -5058,11 +5075,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* If it was a prefetch, we will explicitly move it to
* the head of the list now.
*/
- if ((HDR_PREFETCH(hdr)) != 0) {
- ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
- /* link protected by hash_lock */
- ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
- }
+
ARCSTAT_BUMP(arcstat_mfu_hits);
hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
@@ -5073,12 +5086,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
* MFU state.
*/
- if (HDR_PREFETCH(hdr)) {
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
/*
* This is a prefetch access...
* move this block back to the MRU state.
*/
- ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
new_state = arc_mru;
}
@@ -5145,23 +5157,28 @@ arc_buf_access(arc_buf_t *buf)
demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
}
-/* a generic arc_done_func_t which you can use */
+/* a generic arc_read_done_func_t which you can use */
/* ARGSUSED */
void
-arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
+arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
{
- if (zio == NULL || zio->io_error == 0)
- bcopy(buf->b_data, arg, arc_buf_size(buf));
+ if (buf == NULL)
+ return;
+
+ bcopy(buf->b_data, arg, arc_buf_size(buf));
arc_buf_destroy(buf, arg);
}
-/* a generic arc_done_func_t */
+/* a generic arc_read_done_func_t */
+/* ARGSUSED */
void
-arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
+arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *arg)
{
arc_buf_t **bufp = arg;
- if (zio && zio->io_error) {
- arc_buf_destroy(buf, arg);
+
+ if (buf == NULL) {
*bufp = NULL;
} else {
*bufp = buf;
@@ -5193,7 +5210,6 @@ arc_read_done(zio_t *zio)
arc_callback_t *callback_list;
arc_callback_t *acb;
boolean_t freeable = B_FALSE;
- boolean_t no_zio_error = (zio->io_error == 0);
/*
* The hdr was inserted into hash-table and removed from lists
@@ -5219,7 +5235,7 @@ arc_read_done(zio_t *zio)
ASSERT3P(hash_lock, !=, NULL);
}
- if (no_zio_error) {
+ if (zio->io_error == 0) {
/* byteswap if necessary */
if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
if (BP_GET_LEVEL(zio->io_bp) > 0) {
@@ -5240,7 +5256,8 @@ arc_read_done(zio_t *zio)
callback_list = hdr->b_l1hdr.b_acb;
ASSERT3P(callback_list, !=, NULL);
- if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
+ if (hash_lock && zio->io_error == 0 &&
+ hdr->b_l1hdr.b_state == arc_anon) {
/*
* Only call arc_access on anonymous buffers. This is because
* if we've issued an I/O for an evicted buffer, we've already
@@ -5261,14 +5278,21 @@ arc_read_done(zio_t *zio)
if (!acb->acb_done)
continue;
- /* This is a demand read since prefetches don't use callbacks */
callback_cnt++;
+ if (zio->io_error != 0)
+ continue;
+
int error = arc_buf_alloc_impl(hdr, acb->acb_private,
- acb->acb_compressed, no_zio_error, &acb->acb_buf);
- if (no_zio_error) {
- zio->io_error = error;
+ acb->acb_compressed,
+ B_TRUE, &acb->acb_buf);
+ if (error != 0) {
+ arc_buf_destroy(acb->acb_buf, acb->acb_private);
+ acb->acb_buf = NULL;
}
+
+ if (zio->io_error == 0)
+ zio->io_error = error;
}
hdr->b_l1hdr.b_acb = NULL;
arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
@@ -5281,7 +5305,7 @@ arc_read_done(zio_t *zio)
ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
callback_list != NULL);
- if (no_zio_error) {
+ if (zio->io_error == 0) {
arc_hdr_verify(hdr, zio->io_bp);
} else {
arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@@ -5314,8 +5338,10 @@ arc_read_done(zio_t *zio)
/* execute each callback and free its structure */
while ((acb = callback_list) != NULL) {
- if (acb->acb_done)
- acb->acb_done(zio, acb->acb_buf, acb->acb_private);
+ if (acb->acb_done) {
+ acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
+ acb->acb_buf, acb->acb_private);
+ }
if (acb->acb_zio_dummy != NULL) {
acb->acb_zio_dummy->io_error = zio->io_error;
@@ -5349,7 +5375,7 @@ arc_read_done(zio_t *zio)
* for readers of this block.
*/
int
-arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
void *private, zio_priority_t priority, int zio_flags,
arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
{
@@ -5358,7 +5384,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, a
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0;
-
+ int rc = 0;
+
ASSERT(!BP_IS_EMBEDDED(bp) ||
BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
@@ -5376,32 +5403,20 @@ top:
*arc_flags |= ARC_FLAG_CACHED;
if (HDR_IO_IN_PROGRESS(hdr)) {
+ zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
+ ASSERT3P(head_zio, !=, NULL);
if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
priority == ZIO_PRIORITY_SYNC_READ) {
/*
- * This sync read must wait for an
- * in-progress async read (e.g. a predictive
- * prefetch). Async reads are queued
- * separately at the vdev_queue layer, so
- * this is a form of priority inversion.
- * Ideally, we would "inherit" the demand
- * i/o's priority by moving the i/o from
- * the async queue to the synchronous queue,
- * but there is currently no mechanism to do
- * so. Track this so that we can evaluate
- * the magnitude of this potential performance
- * problem.
- *
- * Note that if the prefetch i/o is already
- * active (has been issued to the device),
- * the prefetch improved performance, because
- * we issued it sooner than we would have
- * without the prefetch.
+ * This is a sync read that needs to wait for
+ * an in-flight async read. Request that the
+ * zio have its priority upgraded.
*/
- DTRACE_PROBE1(arc__sync__wait__for__async,
+ zio_change_priority(head_zio, priority);
+ DTRACE_PROBE1(arc__async__upgrade__sync,
arc_buf_hdr_t *, hdr);
- ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+ ARCSTAT_BUMP(arcstat_async_upgrade_sync);
}
if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
arc_hdr_clear_flags(hdr,
@@ -5428,6 +5443,7 @@ top:
spa, NULL, NULL, NULL, zio_flags);
ASSERT3P(acb->acb_done, !=, NULL);
+ acb->acb_zio_head = head_zio;
acb->acb_next = hdr->b_l1hdr.b_acb;
hdr->b_l1hdr.b_acb = acb;
mutex_exit(hash_lock);
@@ -5455,17 +5471,32 @@ top:
arc_hdr_clear_flags(hdr,
ARC_FLAG_PREDICTIVE_PREFETCH);
}
- ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
+ if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_prescient_prefetch);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ }
+
+ ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
/* Get a buf with the desired data in it. */
- VERIFY0(arc_buf_alloc_impl(hdr, private,
- compressed_read, B_TRUE, &buf));
+ rc = arc_buf_alloc_impl(hdr, private,
+ compressed_read, B_TRUE, &buf);
+ if (rc != 0) {
+ arc_buf_destroy(buf, private);
+ buf = NULL;
+ }
+ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
+ rc == 0 || rc != ENOENT);
} else if (*arc_flags & ARC_FLAG_PREFETCH &&
refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
if (*arc_flags & ARC_FLAG_L2CACHE)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
mutex_exit(hash_lock);
@@ -5475,7 +5506,7 @@ top:
data, metadata, hits);
if (done)
- done(NULL, buf, private);
+ done(NULL, zb, bp, buf, private);
} else {
uint64_t lsize = BP_GET_LSIZE(bp);
uint64_t psize = BP_GET_PSIZE(bp);
@@ -5549,6 +5580,9 @@ top:
if (*arc_flags & ARC_FLAG_PREFETCH)
arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+
if (*arc_flags & ARC_FLAG_L2CACHE)
arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
if (BP_GET_LEVEL(bp) > 0)
@@ -5578,14 +5612,17 @@ top:
vd = NULL;
}
- if (priority == ZIO_PRIORITY_ASYNC_READ)
+ /*
+ * We count both async reads and scrub IOs as asynchronous so
+ * that both can be upgraded in the event of a cache hit while
+ * the read IO is still in-flight.
+ */
+ if (priority == ZIO_PRIORITY_ASYNC_READ ||
+ priority == ZIO_PRIORITY_SCRUB)
arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
else
arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
- if (hash_lock != NULL)
- mutex_exit(hash_lock);
-
/*
* At this point, we have a level 1 cache miss. Try again in
* L2ARC if possible.
@@ -5666,6 +5703,11 @@ top:
ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY, B_FALSE);
+ acb->acb_zio_head = rzio;
+
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
ARCSTAT_INCR(arcstat_l2_read_bytes, size);
@@ -5680,6 +5722,8 @@ top:
return (0);
/* l2arc read error; goto zio_read() */
+ if (hash_lock != NULL)
+ mutex_enter(hash_lock);
} else {
DTRACE_PROBE1(l2arc__miss,
arc_buf_hdr_t *, hdr);
@@ -5700,7 +5744,11 @@ top:
rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size,
arc_read_done, hdr, priority, zio_flags, zb);
+ acb->acb_zio_head = rzio;
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
if (*arc_flags & ARC_FLAG_WAIT)
return (zio_wait(rzio));
@@ -6191,9 +6239,9 @@ arc_write_done(zio_t *zio)
zio_t *
arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
- boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready,
- arc_done_func_t *children_ready, arc_done_func_t *physdone,
- arc_done_func_t *done, void *private, zio_priority_t priority,
+ boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
+ arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
+ arc_write_done_func_t *done, void *private, zio_priority_t priority,
int zio_flags, const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
@@ -6620,9 +6668,6 @@ arc_init(void)
mutex_init(&arc_dnlc_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&arc_dnlc_evicts_cv, NULL, CV_DEFAULT, NULL);
-
- /* Convert seconds to clock ticks */
- arc_min_prefetch_lifespan = 1 * hz;
/* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
arc_c_min = MAX(allmem / 32, arc_abs_min);
Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -902,7 +902,8 @@ dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t o
}
static void
-dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+ arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
@@ -916,19 +917,22 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
ASSERT(db->db.db_data == NULL);
if (db->db_level == 0 && db->db_freed_in_flight) {
/* we were freed in flight; disregard any error */
+ if (buf == NULL) {
+ buf = arc_alloc_buf(db->db_objset->os_spa,
+ db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
+ }
arc_release(buf, db);
bzero(buf->b_data, db->db.db_size);
arc_buf_freeze(buf);
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
- } else if (zio == NULL || zio->io_error == 0) {
+ } else if (buf != NULL) {
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT3P(db->db_buf, ==, NULL);
- arc_buf_destroy(buf, db);
db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
@@ -2326,7 +2330,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, bl
* prefetch if the next block down is our target.
*/
static void
-dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
+dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *iobp, arc_buf_t *abuf, void *private)
{
dbuf_prefetch_arg_t *dpa = private;
@@ -2365,13 +2370,18 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abu
dbuf_rele(db, FTAG);
}
+ if (abuf == NULL) {
+ kmem_free(dpa, sizeof(*dpa));
+ return;
+ }
+
dpa->dpa_curlevel--;
uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
- if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+ if (BP_IS_HOLE(bp)) {
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
@@ -3746,7 +3756,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, d
* ready callback so that we can properly handle an indirect
* block that only contains holes.
*/
- arc_done_func_t *children_ready_cb = NULL;
+ arc_write_done_func_t *children_ready_cb = NULL;
if (db->db_level != 0)
children_ready_cb = dbuf_write_children_ready;
Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -1112,14 +1112,26 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
void
ddt_sync(spa_t *spa, uint64_t txg)
{
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
dmu_tx_t *tx;
- zio_t *rio = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
+ zio_t *rio;
ASSERT(spa_syncing_txg(spa) == txg);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+ rio = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL);
+
+ /*
+ * This function may cause an immediate scan of ddt blocks (see
+ * the comment above dsl_scan_ddt() for details). We set the
+ * scan's root zio here so that we can wait for any scan IOs in
+ * addition to the regular ddt IOs.
+ */
+ ASSERT3P(scn->scn_zio_root, ==, NULL);
+ scn->scn_zio_root = rio;
+
for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];
if (ddt == NULL)
@@ -1129,6 +1141,7 @@ ddt_sync(spa_t *spa, uint64_t txg)
}
(void) zio_wait(rio);
+ scn->scn_zio_root = NULL;
dmu_tx_commit(tx);
}
Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -349,6 +349,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl
ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
+#if 0
/*
* The $ORIGIN dataset (if it exists) doesn't have an associated
* objset, so there's no reason to open it. The $ORIGIN dataset
@@ -359,6 +360,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, bl
ASSERT3P(ds->ds_dir, !=,
spa_get_dsl(spa)->dp_origin_snap->ds_dir);
}
+#endif
os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
os->os_dsl_dataset = ds;
Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -499,8 +499,9 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const
const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
prefetch_data_t *pfd = arg;
- arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
-
+ arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH;
+
ASSERT(pfd->pd_bytes_fetched >= 0);
if (bp == NULL)
return (0);
Modified: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
==============================================================================
--- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c Mon Oct 1 04:02:00 2018 (r339033)
+++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c Mon Oct 1 04:08:47 2018 (r339034)
@@ -51,28 +51,136 @@
#include <sys/sa_impl.h>
#include <sys/zfeature.h>
#include <sys/abd.h>
+#include <sys/range_tree.h>
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
+/*
+ * Grand theory statement on scan queue sorting
+ *
+ * Scanning is implemented by recursively traversing all indirection levels
+ * in an object and reading all blocks referenced from said objects. This
+ * results in us approximately traversing the object from lowest logical
+ * offset to the highest. For best performance, we would want the logical
+ * blocks to be physically contiguous. However, this is frequently not the
+ * case with pools given the allocation patterns of copy-on-write filesystems.
+ * So instead, we put the I/Os into a reordering queue and issue them in a
+ * way that will most benefit physical disks (LBA-order).
+ *
+ * Queue management:
+ *
+ * Ideally, we would want to scan all metadata and queue up all block I/O
+ * prior to starting to issue it, because that allows us to do an optimal
+ * sorting job. This can however consume large amounts of memory. Therefore
+ * we continuously monitor the size of the queues and constrain them to 5%
+ * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
+ * limit, we clear out a few of the largest extents at the head of the queues
+ * to make room for more scanning. Hopefully, these extents will be fairly
+ * large and contiguous, allowing us to approach sequential I/O throughput
+ * even without a fully sorted tree.
+ *
+ * Metadata scanning takes place in dsl_scan_visit(), which is called from
+ * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
+ * metadata on the pool, or we need to make room in memory because our
+ * queues are too large, dsl_scan_visit() is postponed and
+ * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
+ * that metadata scanning and queued I/O issuing are mutually exclusive. This
+ * allows us to provide maximum sequential I/O throughput for the majority of
+ * I/Os issued since sequential I/O performance is significantly negatively
+ * impacted if it is interleaved with random I/O.
+ *
+ * Implementation Notes
+ *
+ * One side effect of the queued scanning algorithm is that the scanning code
+ * needs to be notified whenever a block is freed. This is needed to allow
+ * the scanning code to remove these I/Os from the issuing queue. Additionally,
+ * we do not attempt to queue gang blocks to be issued sequentially since this
+ * is very hard to do and would have an extremely limited performance benefit.
+ * Instead, we simply issue gang I/Os as soon as we find them using the legacy
+ * algorithm.
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-stable
mailing list