git: 12e81504ef7a - stable/13 - vfs: prefix regular vnlru with a special case for free vnodes
Date: Fri, 13 Oct 2023 23:53:55 UTC
The branch stable/13 has been updated by mjg:
URL: https://cgit.FreeBSD.org/src/commit/?id=12e81504ef7aa1802c8ede384f07ae65fec48bdc
commit 12e81504ef7aa1802c8ede384f07ae65fec48bdc
Author: Mateusz Guzik <mjg@FreeBSD.org>
AuthorDate: 2023-09-14 19:08:40 +0000
Commit: Mateusz Guzik <mjg@FreeBSD.org>
CommitDate: 2023-10-13 23:48:13 +0000
vfs: prefix regular vnlru with a special case for free vnodes
Works around severe performance problems in certain corner cases; see
the commentary added.

Modifying vnlru logic has proven rather error prone in the past, and a
release is near, so take the easy way out and fix it without digging
into the current machinery.
(cherry picked from commit 90a008e94bb205e5b8f3c41d57e155b59a6be95d)
---
sys/kern/vfs_subr.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 113 insertions(+), 4 deletions(-)
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 5ddbff77a221..e4b9ff8e1ba6 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1417,6 +1417,14 @@ vnlru_free_locked(int count)
 	return (ret);
 }
 
+static int
+vnlru_free(int count)
+{
+
+	mtx_lock(&vnode_list_mtx);
+	return (vnlru_free_locked(count));
+}
+
void
vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
{
@@ -1611,6 +1619,106 @@ vnlru_kick_cond(void)
 	mtx_unlock(&vnode_list_mtx);
 }
 
+static void
+vnlru_proc_sleep(void)
+{
+
+	if (vnlruproc_sig) {
+		vnlruproc_sig = 0;
+		wakeup(&vnlruproc_sig);
+	}
+	msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz);
+}
+
+/*
+ * A lighter version of the machinery below.
+ *
+ * Tries to reach goals only by recycling free vnodes and does not invoke
+ * uma_reclaim(UMA_RECLAIM_DRAIN).
+ *
+ * This works around pathological behavior in vnlru in the presence of tons of
+ * free vnodes, but without having to rewrite the machinery at this time. Said
+ * behavior boils down to continuously trying to reclaim all kinds of vnodes
+ * (cycling through all levels of "force") when the count is transiently above
+ * the limit. This happens a lot when all vnodes are used up and vn_alloc
+ * speculatively increments the counter.
+ *
+ * Sample testcase: vnode limit 8388608, 20 separate directory trees, each
+ * with 1 million files, and 20 find(1) processes stating them in parallel
+ * (one per tree).
+ *
+ * On a kernel with only stock machinery this needs anywhere between 60 and 120
+ * seconds to execute (time varies *wildly* between runs). With the workaround
+ * it consistently stays around 20 seconds.
+ *
+ * That is to say, the entire thing needs a fundamental redesign (most notably
+ * to accommodate faster recycling); the above only tries to get it out of the way.
+ *
+ * Return values are:
+ * -1 -- fallback to regular vnlru loop
+ * 0 -- do nothing, go to sleep
+ * >0 -- recycle this many vnodes
+ */
+static long
+vnlru_proc_light_pick(void)
+{
+	u_long rnumvnodes, rfreevnodes;
+
+	if (vstir || vnlruproc_sig == 1)
+		return (-1);
+
+	rnumvnodes = atomic_load_long(&numvnodes);
+	rfreevnodes = vnlru_read_freevnodes();
+
+	/*
+	 * vnode limit might have changed and now we may be at a significant
+	 * excess. Bail if we can't sort it out with free vnodes.
+	 */
+	if (rnumvnodes > desiredvnodes) {
+		if (rnumvnodes - rfreevnodes >= desiredvnodes ||
+		    rfreevnodes <= wantfreevnodes) {
+			return (-1);
+		}
+
+		return (rnumvnodes - desiredvnodes);
+	}
+
+	/*
+	 * Don't try to reach wantfreevnodes target if there are too few vnodes
+	 * to begin with.
+	 */
+	if (rnumvnodes < wantfreevnodes) {
+		return (0);
+	}
+
+	if (rfreevnodes < wantfreevnodes) {
+		return (-1);
+	}
+
+	return (0);
+}
+
+static bool
+vnlru_proc_light(void)
+{
+	long freecount;
+
+	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
+
+	freecount = vnlru_proc_light_pick();
+	if (freecount == -1)
+		return (false);
+
+	if (freecount != 0) {
+		vnlru_free(freecount);
+	}
+
+	mtx_lock(&vnode_list_mtx);
+	vnlru_proc_sleep();
+	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
+	return (true);
+}
+
static void
vnlru_proc(void)
{
@@ -1626,6 +1734,10 @@ vnlru_proc(void)
 	want_reread = false;
 	for (;;) {
 		kproc_suspend_check(vnlruproc);
+
+		if (force == 0 && vnlru_proc_light())
+			continue;
+
 		mtx_lock(&vnode_list_mtx);
 		rnumvnodes = atomic_load_long(&numvnodes);
@@ -1656,10 +1768,7 @@ vnlru_proc(void)
 			vstir = false;
 		}
 		if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
-			vnlruproc_sig = 0;
-			wakeup(&vnlruproc_sig);
-			msleep(vnlruproc, &vnode_list_mtx,
-			    PVFS|PDROP, "vlruwt", hz);
+			vnlru_proc_sleep();
 			continue;
 		}
 		rfreevnodes = vnlru_read_freevnodes();
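
To poke at the decision table documented above without building a kernel,
here is a minimal userspace sketch of the same -1 / 0 / >0 contract. It is
an illustration, not the committed code: light_pick, PICK_FALLBACK and
PICK_SLEEP are invented names, the vstir/vnlruproc_sig early bail-out is
omitted, and wantfreevnodes is assumed to sit at the stock desiredvnodes / 4
ratio.

/*
 * Userspace illustration of the vnlru_proc_light_pick() decision table
 * from the diff above.  Kernel globals are modeled as plain variables;
 * the vstir/vnlruproc_sig bail-out is left out for brevity.
 */
#include <stdio.h>

#define	PICK_FALLBACK	(-1)	/* fall back to the regular vnlru loop */
#define	PICK_SLEEP	0	/* nothing to do, go to sleep */

static long desiredvnodes = 8388608;		/* limit from the testcase */
static long wantfreevnodes = 8388608 / 4;	/* assumed stock 1/4 ratio */

static long
light_pick(long rnumvnodes, long rfreevnodes)
{

	/*
	 * Over the limit: only usable if the excess can be covered by
	 * recycling free vnodes alone.
	 */
	if (rnumvnodes > desiredvnodes) {
		if (rnumvnodes - rfreevnodes >= desiredvnodes ||
		    rfreevnodes <= wantfreevnodes)
			return (PICK_FALLBACK);
		return (rnumvnodes - desiredvnodes);
	}

	/* Too few vnodes overall to chase the wantfreevnodes target. */
	if (rnumvnodes < wantfreevnodes)
		return (PICK_SLEEP);

	/* Under the limit but short on free vnodes: needs the full loop. */
	if (rfreevnodes < wantfreevnodes)
		return (PICK_FALLBACK);

	return (PICK_SLEEP);
}

int
main(void)
{
	struct { long num, free; } cases[] = {
		{ 8388608 + 1000, 3000000 },	/* transient excess, plenty free */
		{ 8388608 + 1000, 1000 },	/* excess, almost nothing free */
		{ 1000, 500 },			/* tiny namespace: just sleep */
		{ 8000000, 1000 },		/* busy but under the limit */
	};

	for (size_t i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("num=%ld free=%ld -> %ld\n", cases[i].num,
		    cases[i].free, light_pick(cases[i].num, cases[i].free));
	return (0);
}

Compiled with cc and run, the four cases print 1000, -1, 0 and -1: only the
first can be resolved by recycling free vnodes alone, which is exactly the
transient-excess situation the new fast path is meant to catch.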