Scrub incredibly slow with 13.0-RC3 (as well as RC1 & 2)
Mathieu Chouquet-Stringer
me+freebsd at mathieu.digital
Tue Mar 30 13:43:11 UTC 2021
Hello,
TL;DR: we need kvmclock support in FreeBSD
On Sat, Mar 27, 2021 at 05:06:43PM +0100, Mathieu Chouquet-Stringer wrote:
> TL;DR: read_max makes no difference, don't use hpet with KVM, use
> scan_legacy.
I ended up resurrecting a set of patches created by Bryan [1] to add support
for kvmclock because this whole slow zfs thing is incredibly dependent on the
timecounter infrastructure. I inlined below the patch I'm using at the moment.
And results are mind blowing.
I can scrub my zroot pool in 10s instead of 1m35s...
And I started scrubbing my raidz2 pool 30 minutes ago and it's telling
me I have about 15 hours left. The best time I got before that was 23:04:17
(and that was with scan_legacy=1 which was around 3 hours quicker than the same
scrub withscan_legacy=0)...
scan: scrub in progress since Tue Mar 30 15:00:03 2021
962G scanned at 541M/s, 810G issued at 456M/s, 24.4T total
0B repaired, 3.24% done, 15:05:39 to go
And that's with vfs.zfs.scan_legacy == 0 (the default). So it seems the scan
process uses timing information a lot more than the legacy one. And
that's what I saw in the FlameGraph when I stupidly added hpet support in KVM:
I was seeing a lot of calls to hpet_get_timecount.
So the question is: what is needed to merge that kvmclock thing in the kernel?
Regards,
[1] https://people.freebsd.org/~bryanv/patches/kvm_clock-1.patch
---
sys/conf/files.x86 | 1 +
sys/x86/include/kvm.h | 49 +++++++++++++++++++++++
sys/x86/x86/kvm_clock.c | 132 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 182 insertions(+)
diff --git a/sys/conf/files.x86 b/sys/conf/files.x86
index f51392d0614..bcf53fcf1f6 100644
--- a/sys/conf/files.x86
+++ b/sys/conf/files.x86
@@ -335,6 +335,7 @@ x86/x86/dump_machdep.c standard
x86/x86/fdt_machdep.c optional fdt
x86/x86/identcpu.c standard
x86/x86/intr_machdep.c standard
+x86/x86/kvm_clock.c standard
x86/x86/legacy.c standard
x86/x86/mca.c standard
x86/x86/x86_mem.c optional mem
diff --git a/sys/x86/include/kvm.h b/sys/x86/include/kvm.h
new file mode 100644
index 00000000000..dc26822ed1c
--- /dev/null
+++ b/sys/x86/include/kvm.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2014 Bryan Venteicher <bryanv at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _X86_KVM_H_
+#define _X86_KVM_H_
+
+#define KVM_CPUID_FEATURES_LEAF 0x40000001
+
+#define KVM_FEATURE_CLOCKSOURCE 0x00000001
+#define KVM_FEATURE_CLOCKSOURCE2 0x00000008
+
+/* Deprecated: for the CLOCKSOURCE feature. */
+#define KVM_MSR_WALL_CLOCK 0x11
+#define KVM_MSR_SYSTEM_TIME 0x12
+
+#define KVM_MSR_WALL_CLOCK_NEW 0x4b564d00
+#define KVM_MSR_SYSTEM_TIME_NEW 0x4b564d01
+
+int kvm_paravirt_supported(void);
+uint32_t kvm_get_features(void);
+
+uint64_t kvm_clock_tsc_freq(void);
+
+#endif /* !_X86_KVM_H_ */
diff --git a/sys/x86/x86/kvm_clock.c b/sys/x86/x86/kvm_clock.c
new file mode 100644
index 00000000000..f16aa88c695
--- /dev/null
+++ b/sys/x86/x86/kvm_clock.c
@@ -0,0 +1,132 @@
+/*-
+ * Copyright (c) 2014 Bryan Venteicher <bryanv at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/limits.h>
+#include <sys/systm.h>
+#include <sys/pcpu.h>
+#include <sys/smp.h>
+#include <sys/timetc.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/pvclock.h>
+#include <x86/kvm.h>
+#include <x86/x86_var.h>
+
+static u_int kvm_clock_get_timecounter(struct timecounter *);
+static void kvm_clock_pcpu_system_time(void *);
+
+DPCPU_DEFINE(struct pvclock_vcpu_time_info, kvm_clock_vcpu_time_info);
+
+static struct timecounter kvm_clock_timecounter = {
+ kvm_clock_get_timecounter,
+ NULL,
+ ~0u,
+ 1000000000ULL,
+ "KVMCLOCK",
+ 1000,
+};
+
+static int kvm_clock_registered;
+static uint32_t kvm_clock_wall_clock_msr;
+static uint32_t kvm_clock_system_time_msr;
+
+uint64_t
+kvm_clock_tsc_freq(void)
+{
+ struct pvclock_vcpu_time_info *ti;
+ uint64_t freq;
+
+ critical_enter();
+ ti = DPCPU_PTR(kvm_clock_vcpu_time_info);
+ freq = pvclock_tsc_freq(ti);
+ critical_exit();
+
+ return (freq);
+}
+
+static u_int
+kvm_clock_get_timecounter(struct timecounter *tc)
+{
+ struct pvclock_vcpu_time_info *ti;
+ uint64_t time;
+
+ critical_enter();
+ ti = DPCPU_PTR(kvm_clock_vcpu_time_info);
+ time = pvclock_get_timecount(ti);
+ critical_exit();
+
+ return (time & UINT_MAX);
+}
+
+static void
+kvm_clock_pcpu_system_time(void *arg)
+{
+ uint64_t data;
+ int enable;
+
+ enable = *(int *) arg;
+
+ if (enable != 0)
+ data = vtophys(DPCPU_PTR(kvm_clock_vcpu_time_info)) | 1;
+ else
+ data = 0;
+
+ wrmsr(kvm_clock_system_time_msr, data);
+}
+
+static void
+kvm_clock_init(void)
+{
+ if (vm_guest != VM_GUEST_KVM)
+ return;
+
+ if (hv_base > 0 == 0)
+ return;
+
+ if (hv_high & KVM_FEATURE_CLOCKSOURCE2) {
+ kvm_clock_wall_clock_msr = KVM_MSR_WALL_CLOCK_NEW;
+ kvm_clock_system_time_msr = KVM_MSR_SYSTEM_TIME_NEW;
+ } else if (hv_high & KVM_FEATURE_CLOCKSOURCE) {
+ kvm_clock_wall_clock_msr = KVM_MSR_WALL_CLOCK;
+ kvm_clock_system_time_msr = KVM_MSR_SYSTEM_TIME;
+ } else
+ return;
+
+ kvm_clock_registered = 1;
+ smp_rendezvous(smp_no_rendezvous_barrier, kvm_clock_pcpu_system_time,
+ smp_no_rendezvous_barrier, &kvm_clock_registered);
+
+ tc_init(&kvm_clock_timecounter);
+}
+
+SYSINIT(kvm_clock, SI_SUB_SMP, SI_ORDER_ANY, kvm_clock_init, NULL);
--
Mathieu Chouquet-Stringer
The sun itself sees not till heaven clears.
-- William Shakespeare --
More information about the freebsd-fs
mailing list