git: cfbc3927613a - stable/14 - vfs: prefix regular vnlru with a special case for free vnodes

From: Mateusz Guzik <mjg_at_FreeBSD.org>
Date: Fri, 13 Oct 2023 23:43:12 UTC
The branch stable/14 has been updated by mjg:

URL: https://cgit.FreeBSD.org/src/commit/?id=cfbc3927613a8455498db5ff3e67046a4dab15a1

commit cfbc3927613a8455498db5ff3e67046a4dab15a1
Author:     Mateusz Guzik <mjg@FreeBSD.org>
AuthorDate: 2023-09-14 19:08:40 +0000
Commit:     Mateusz Guzik <mjg@FreeBSD.org>
CommitDate: 2023-10-13 23:41:47 +0000

    vfs: prefix regular vnlru with a special case for free vnodes
    
    Works around severe performance problems in certain corner cases; see
    the commentary added.
    
    Modifying vnlru logic has proven rather error-prone in the past, and a
    release is near; thus take the easy way out and fix it without having
    to dig into the current machinery.
    
    (cherry picked from commit 90a008e94bb205e5b8f3c41d57e155b59a6be95d)
---
 sys/kern/vfs_subr.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 113 insertions(+), 4 deletions(-)
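
For context, the testcase described in the comment added below can be
approximated from userspace. A minimal sketch, assuming the 20 trees are
pre-created as tree0 through tree19 and using fts(3) in place of find(1)
(the tree paths and layout are illustrative assumptions, not part of the
commit):

/*
 * Hypothetical driver: NWORKERS processes, each walking and stat'ing its
 * own pre-created directory tree, mirroring the parallel find(1) workload.
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <err.h>
#include <fts.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define	NWORKERS	20

static void
walk_tree(char *root)
{
	char *argv[] = { root, NULL };
	FTS *fts;

	/* With FTS_PHYSICAL, fts_read() lstat(2)s every entry it visits. */
	fts = fts_open(argv, FTS_PHYSICAL, NULL);
	if (fts == NULL)
		err(1, "fts_open");
	while (fts_read(fts) != NULL)
		continue;
	fts_close(fts);
}

int
main(void)
{
	char path[32];
	int i;

	for (i = 0; i < NWORKERS; i++) {
		snprintf(path, sizeof(path), "tree%d", i); /* assumed layout */
		switch (fork()) {
		case -1:
			err(1, "fork");
		case 0:
			walk_tree(path);
			_exit(0);
		}
	}
	for (i = 0; i < NWORKERS; i++)
		wait(NULL);
	return (0);
}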

diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index 5e39a149ef36..5834feff080c 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -1427,6 +1427,14 @@ vnlru_free_locked(int count)
 	return (ret);
 }
 
+static int
+vnlru_free(int count)
+{
+
+	mtx_lock(&vnode_list_mtx);
+	return (vnlru_free_locked(count));
+}
+
 void
 vnlru_free_vfsops(int count, struct vfsops *mnt_op, struct vnode *mvp)
 {
@@ -1594,6 +1602,106 @@ vnlru_kick_cond(void)
 	mtx_unlock(&vnode_list_mtx);
 }
 
+static void
+vnlru_proc_sleep(void)
+{
+
+	if (vnlruproc_sig) {
+		vnlruproc_sig = 0;
+		wakeup(&vnlruproc_sig);
+	}
+	msleep(vnlruproc, &vnode_list_mtx, PVFS|PDROP, "vlruwt", hz);
+}
+
+/*
+ * A lighter version of the machinery below.
+ *
+ * Tries to reach goals only by recycling free vnodes and does not invoke
+ * uma_reclaim(UMA_RECLAIM_DRAIN).
+ *
+ * This works around pathological behavior in vnlru in the presence of tons of free
+ * vnodes, but without having to rewrite the machinery at this time. Said
+ * behavior boils down to continuously trying to reclaim all kinds of vnodes
+ * (cycling through all levels of "force") when the count is transiently above
+ * limit. This happens a lot when all vnodes are used up and vn_alloc
+ * speculatively increments the counter.
+ *
+ * Sample testcase: vnode limit 8388608, 20 separate directory trees each with
+ * 1 million files in total and 20 find(1) processes stating them in parallel
+ * (one per tree).
+ *
+ * On a kernel with only stock machinery this needs anywhere between 60 and 120
+ * seconds to execute (time varies *wildly* between runs). With the workaround
+ * it consistently stays around 20 seconds.
+ *
+ * That is to say, the entire thing needs a fundamental redesign (most notably
+ * to accommodate faster recycling); the above only tries to get it out of the way.
+ *
+ * Return values are:
+ * -1 -- fallback to regular vnlru loop
+ *  0 -- do nothing, go to sleep
+ * >0 -- recycle this many vnodes
+ */
+static long
+vnlru_proc_light_pick(void)
+{
+	u_long rnumvnodes, rfreevnodes;
+
+	if (vstir || vnlruproc_sig == 1)
+		return (-1);
+
+	rnumvnodes = atomic_load_long(&numvnodes);
+	rfreevnodes = vnlru_read_freevnodes();
+
+	/*
+	 * The vnode limit might have changed and now we may be at a significant
+	 * excess. Bail if we can't sort it out with free vnodes.
+	 */
+	if (rnumvnodes > desiredvnodes) {
+		if (rnumvnodes - rfreevnodes >= desiredvnodes ||
+		    rfreevnodes <= wantfreevnodes) {
+			return (-1);
+		}
+
+		return (rnumvnodes - desiredvnodes);
+	}
+
+	/*
+	 * Don't try to reach the wantfreevnodes target if there are too few vnodes
+	 * to begin with.
+	 */
+	if (rnumvnodes < wantfreevnodes) {
+		return (0);
+	}
+
+	if (rfreevnodes < wantfreevnodes) {
+		return (-1);
+	}
+
+	return (0);
+}
+
+static bool
+vnlru_proc_light(void)
+{
+	long freecount;
+
+	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
+
+	freecount = vnlru_proc_light_pick();
+	if (freecount == -1)
+		return (false);
+
+	if (freecount != 0) {
+		vnlru_free(freecount);
+	}
+
+	mtx_lock(&vnode_list_mtx);
+	vnlru_proc_sleep();
+	mtx_assert(&vnode_list_mtx, MA_NOTOWNED);
+	return (true);
+}
+
 static void
 vnlru_proc(void)
 {
@@ -1609,6 +1717,10 @@ vnlru_proc(void)
 	want_reread = false;
 	for (;;) {
 		kproc_suspend_check(vnlruproc);
+
+		if (force == 0 && vnlru_proc_light())
+			continue;
+
 		mtx_lock(&vnode_list_mtx);
 		rnumvnodes = atomic_load_long(&numvnodes);
 
@@ -1639,10 +1751,7 @@ vnlru_proc(void)
 			vstir = false;
 		}
 		if (force == 0 && !vnlru_under(rnumvnodes, vlowat)) {
-			vnlruproc_sig = 0;
-			wakeup(&vnlruproc_sig);
-			msleep(vnlruproc, &vnode_list_mtx,
-			    PVFS|PDROP, "vlruwt", hz);
+			vnlru_proc_sleep();
 			continue;
 		}
 		rfreevnodes = vnlru_read_freevnodes();
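
To make the -1 / 0 / >0 contract of vnlru_proc_light_pick() concrete, here
is a small userspace sketch of the same decision table. The globals mirror
the kernel tunables, but the concrete values (including the assumed
wantfreevnodes default of one quarter of the limit) are for illustration
only, and the vstir/vnlruproc_sig bail-outs are omitted:

/*
 * Userspace sketch of the vnlru_proc_light_pick() decision table from the
 * diff above. Values are illustrative, not taken from a running kernel.
 */
#include <stdio.h>

static long desiredvnodes = 8388608;		/* limit from the testcase */
static long wantfreevnodes = 8388608 / 4;	/* assumed 1/4-of-limit default */

static long
light_pick(long numvnodes, long freevnodes)
{
	if (numvnodes > desiredvnodes) {
		/*
		 * Over the limit: only handle it here if recycling free
		 * vnodes alone can bring the count back under the limit.
		 */
		if (numvnodes - freevnodes >= desiredvnodes ||
		    freevnodes <= wantfreevnodes)
			return (-1);	/* fall back to the full vnlru loop */
		return (numvnodes - desiredvnodes);	/* recycle the excess */
	}
	if (numvnodes < wantfreevnodes)
		return (0);	/* too few vnodes to chase the free target */
	if (freevnodes < wantfreevnodes)
		return (-1);	/* short on free vnodes; need the heavy path */
	return (0);		/* nothing to do, go to sleep */
}

int
main(void)
{
	/* Over the limit, free vnodes can absorb the excess: recycle 1000. */
	printf("%ld\n", light_pick(desiredvnodes + 1000, 3000000));
	/* Over the limit, almost nothing free: fall back (-1). */
	printf("%ld\n", light_pick(desiredvnodes + 1000, 100));
	/* Under the limit but short on free vnodes: fall back (-1). */
	printf("%ld\n", light_pick(desiredvnodes / 2, 1000));
	/* Under the limit with enough free vnodes: sleep (0). */
	printf("%ld\n", light_pick(desiredvnodes / 2, 2500000));
	return (0);
}

The key property is that the light path only ever requests free-vnode
recycling (the >0 case) and defers to the full vnlru loop whenever free
vnodes alone cannot resolve the excess.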