svn commit: r366785 - head/sys/kern
Mateusz Guzik
mjg at FreeBSD.org
Sat Oct 17 08:48:59 UTC 2020
Author: mjg
Date: Sat Oct 17 08:48:58 2020
New Revision: 366785
URL: https://svnweb.freebsd.org/changeset/base/366785
Log:
cache: rework parts of negative entry management
- declutter sysctl vfs.cache by moving relevant entries into
vfs.cache.neg
- add a little more parallelism to eviction by replacing the
global lock with an atomically modified counter
- track more statistics
The code needs further effort.
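The heart of the change is candidate selection for eviction: instead of
serializing every shrink attempt behind the global ncneg_shrink_lock,
each caller now advances a shared counter with an atomic fetch-add to
pick a list round-robin, then trylocks that list's dedicated eviction
lock and simply skips on contention. A minimal userspace sketch of the
scheme, using illustrative names (pick_list, NLISTS, lists) rather than
the kernel's:

#include <pthread.h>
#include <stdatomic.h>

#define NLISTS	4

struct neglist_sketch {
	pthread_mutex_t evict_lock;
	/* hot/cold entry lists would live here */
};

static struct neglist_sketch lists[NLISTS];
static atomic_uint cycle;

static void
lists_init(void)
{
	for (int i = 0; i < NLISTS; i++)
		pthread_mutex_init(&lists[i].evict_lock, NULL);
}

/* Returns the chosen list locked, or NULL if another thread holds it. */
static struct neglist_sketch *
pick_list(void)
{
	unsigned int c;
	struct neglist_sketch *nl;

	c = atomic_fetch_add(&cycle, 1) + 1;
	nl = &lists[c % NLISTS];
	if (pthread_mutex_trylock(&nl->evict_lock) != 0)
		return (NULL);	/* contended: skip this round */
	return (nl);
}

Compared to the old single lock, contending threads spread over
numneglists eviction locks and back off instead of queueing, at the
cost of sometimes skipping an eviction; the new
vfs.cache.neg.evict_skipped_contended counter tracks how often.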
Modified:
head/sys/kern/vfs_cache.c
Modified: head/sys/kern/vfs_cache.c
==============================================================================
--- head/sys/kern/vfs_cache.c Sat Oct 17 08:48:32 2020 (r366784)
+++ head/sys/kern/vfs_cache.c Sat Oct 17 08:48:58 2020 (r366785)
@@ -113,7 +113,7 @@ SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct v
"struct vnode *");
SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
"char *");
-SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, done, "struct vnode *",
+SDT_PROBE_DEFINE2(vfs, namecache, evict_negative, done, "struct vnode *",
"char *");
SDT_PROBE_DEFINE3(vfs, fplookup, lookup, done, "struct nameidata", "int", "bool");
@@ -251,9 +251,6 @@ cache_ncp_canuse(struct namecache *ncp)
* bucketlock mtx for access to given set of hash buckets
* neglist mtx negative entry LRU management
*
- * Additionally, ncneg_shrink_lock mtx is used to have at most one thread
- * shrinking the LRU list.
- *
* It is legal to take multiple vnodelock and bucketlock locks. The locking
* order is lower address first. Both are recursive.
*
@@ -305,13 +302,14 @@ static bool __read_frequently cache_fast_revlookup = t
SYSCTL_BOOL(_vfs, OID_AUTO, cache_fast_revlookup, CTLFLAG_RW,
&cache_fast_revlookup, 0, "");
-static struct mtx __exclusive_cache_line ncneg_shrink_lock;
+static u_int __exclusive_cache_line neg_cycle;
#define ncneghash 3
#define numneglists (ncneghash + 1)
struct neglist {
- struct mtx nl_lock;
+ struct mtx nl_evict_lock;
+ struct mtx nl_lock __aligned(CACHE_LINE_SIZE);
TAILQ_HEAD(, namecache) nl_list;
TAILQ_HEAD(, namecache) nl_hotlist;
u_long nl_hotnum;
@@ -473,10 +471,6 @@ static long zap_and_exit_bucket_fail2; STATNODE_ULONG(
static long cache_lock_vnodes_cel_3_failures;
STATNODE_ULONG(cache_lock_vnodes_cel_3_failures,
"Number of times 3-way vnode locking failed");
-STATNODE_COUNTER(numneg_evicted,
- "Number of negative entries evicted when adding a new entry");
-STATNODE_COUNTER(shrinking_skipped,
- "Number of times shrinking was already in progress");
static void cache_zap_locked(struct namecache *ncp);
static int vn_fullpath_hardlink(struct nameidata *ndp, char **retbuf,
@@ -683,21 +677,6 @@ SYSCTL_PROC(_vfs_cache, OID_AUTO, nchstats, CTLTYPE_OP
CTLFLAG_MPSAFE, 0, 0, sysctl_nchstats, "LU",
"VFS cache effectiveness statistics");
-static int
-sysctl_hotnum(SYSCTL_HANDLER_ARGS)
-{
- int i, out;
-
- out = 0;
- for (i = 0; i < numneglists; i++)
- out += neglists[i].nl_hotnum;
-
- return (SYSCTL_OUT(req, &out, sizeof(out)));
-}
-SYSCTL_PROC(_vfs_cache, OID_AUTO, hotnum, CTLTYPE_INT | CTLFLAG_RD |
- CTLFLAG_MPSAFE, 0, 0, sysctl_hotnum, "I",
- "Number of hot negative entries");
-
#ifdef DIAGNOSTIC
/*
* Grab an atomic snapshot of the name cache hash chain lengths
@@ -792,27 +771,77 @@ SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE
/*
* Negative entries management
*
- * A variation of LRU scheme is used. New entries are hashed into one of
- * numneglists cold lists. Entries get promoted to the hot list on first hit.
+ * Various workloads create plenty of negative entries and barely use them
+ * afterwards. Moreover malicious users can keep performing bogus lookups
+ * adding even more entries. For example "make tinderbox" as of writing this
+ * comment ends up with 2.6M namecache entries in total, 1.2M of which are
+ * negative.
*
- * The shrinker will demote hot list head and evict from the cold list in a
- * round-robin manner.
+ * As such, a rather aggressive eviction method is needed. The currently
+ * employed method is a placeholder.
+ *
+ * Entries are split over numneglists separate lists, each of which is further
+ * split into hot and cold entries. Entries get promoted after getting a hit.
+ * Eviction happens on addition of new entry.
*/
+static SYSCTL_NODE(_vfs_cache, OID_AUTO, neg, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "Name cache negative entry statistics");
+
+SYSCTL_ULONG(_vfs_cache_neg, OID_AUTO, count, CTLFLAG_RD, &numneg, 0,
+ "Number of negative cache entries");
+
+static COUNTER_U64_DEFINE_EARLY(neg_created);
+SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, created, CTLFLAG_RD, &neg_created,
+ "Number of created negative entries");
+
+static COUNTER_U64_DEFINE_EARLY(neg_evicted);
+SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evicted, CTLFLAG_RD, &neg_evicted,
+ "Number of evicted negative entries");
+
+static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_empty);
+SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_empty, CTLFLAG_RD,
+ &neg_evict_skipped_empty,
+ "Number of times evicting failed due to lack of entries");
+
+static COUNTER_U64_DEFINE_EARLY(neg_evict_skipped_contended);
+SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, evict_skipped_contended, CTLFLAG_RD,
+ &neg_evict_skipped_contended,
+ "Number of times evicting failed due to contention");
+
+SYSCTL_COUNTER_U64(_vfs_cache_neg, OID_AUTO, hits, CTLFLAG_RD, &numneghits,
+ "Number of cache hits (negative)");
+
+static int
+sysctl_neg_hot(SYSCTL_HANDLER_ARGS)
+{
+ int i, out;
+
+ out = 0;
+ for (i = 0; i < numneglists; i++)
+ out += neglists[i].nl_hotnum;
+
+ return (SYSCTL_OUT(req, &out, sizeof(out)));
+}
+SYSCTL_PROC(_vfs_cache_neg, OID_AUTO, hot, CTLTYPE_INT | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, 0, 0, sysctl_neg_hot, "I",
+ "Number of hot negative entries");
+
static void
-cache_negative_init(struct namecache *ncp)
+cache_neg_init(struct namecache *ncp)
{
struct negstate *ns;
ncp->nc_flag |= NCF_NEGATIVE;
ns = NCP2NEGSTATE(ncp);
ns->neg_flag = 0;
+ counter_u64_add(neg_created, 1);
}
/*
* Move a negative entry to the hot list.
*/
static void
-cache_negative_promote(struct namecache *ncp)
+cache_neg_promote(struct namecache *ncp)
{
struct neglist *nl;
struct negstate *ns;
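To make the new comment concrete: each of the numneglists lists keeps a
cold queue (where new entries are inserted) and a hot queue (where
entries move once they take a hit), with nl_hotnum counting the hot
tier. A promotion sketch using the same sys/queue.h macros; the names
(entry, neg_list_sketch, promote) are illustrative, not the kernel's,
and the queues are assumed to have been TAILQ_INIT'ed:

#include <sys/queue.h>

struct entry {
	TAILQ_ENTRY(entry) link;
	int hot;			/* stands in for the NEG_HOT flag */
};

struct neg_list_sketch {
	TAILQ_HEAD(, entry) cold;	/* new entries land here */
	TAILQ_HEAD(, entry) hot;	/* entries that took a hit */
	unsigned long hotnum;
};

/* Caller holds the list lock; move a cold entry to the hot tier. */
static void
promote(struct neg_list_sketch *nl, struct entry *e)
{
	if (e->hot)
		return;
	TAILQ_REMOVE(&nl->cold, e, link);
	TAILQ_INSERT_TAIL(&nl->hot, e, link);
	e->hot = 1;
	nl->hotnum++;
}

Eviction runs the other way: as the later hunks show, cache_neg_evict
first demotes the head of the hot list back to the cold list, then zaps
the cold head, so a hot entry survives at least one extra pass.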
@@ -838,7 +867,7 @@ cache_negative_promote(struct namecache *ncp)
* up again.
*/
static bool
-cache_negative_promote_cond(struct vnode *dvp, struct componentname *cnp,
+cache_neg_promote_cond(struct vnode *dvp, struct componentname *cnp,
struct namecache *oncp, uint32_t hash)
{
struct namecache *ncp;
@@ -896,7 +925,7 @@ cache_negative_promote_cond(struct vnode *dvp, struct
goto out_abort;
}
- cache_negative_promote(ncp);
+ cache_neg_promote(ncp);
SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
counter_u64_add(numneghits, 1);
@@ -910,7 +939,7 @@ out_abort:
}
static void
-cache_negative_hit(struct namecache *ncp)
+cache_neg_hit(struct namecache *ncp)
{
struct neglist *nl;
struct negstate *ns;
@@ -920,12 +949,12 @@ cache_negative_hit(struct namecache *ncp)
return;
nl = NCP2NEGLIST(ncp);
mtx_lock(&nl->nl_lock);
- cache_negative_promote(ncp);
+ cache_neg_promote(ncp);
mtx_unlock(&nl->nl_lock);
}
static void
-cache_negative_insert(struct namecache *ncp)
+cache_neg_insert(struct namecache *ncp)
{
struct neglist *nl;
@@ -939,7 +968,7 @@ cache_negative_insert(struct namecache *ncp)
}
static void
-cache_negative_remove(struct namecache *ncp)
+cache_neg_remove(struct namecache *ncp)
{
struct neglist *nl;
struct negstate *ns;
@@ -959,30 +988,22 @@ cache_negative_remove(struct namecache *ncp)
}
static struct neglist *
-cache_negative_shrink_select(void)
+cache_neg_evict_select(void)
{
struct neglist *nl;
- static u_int cycle;
- u_int i;
+ u_int c;
- cycle++;
- for (i = 0; i < numneglists; i++) {
- nl = &neglists[(cycle + i) % numneglists];
- if (TAILQ_FIRST(&nl->nl_list) == NULL &&
- TAILQ_FIRST(&nl->nl_hotlist) == NULL)
- continue;
- mtx_lock(&nl->nl_lock);
- if (TAILQ_FIRST(&nl->nl_list) != NULL ||
- TAILQ_FIRST(&nl->nl_hotlist) != NULL)
- return (nl);
- mtx_unlock(&nl->nl_lock);
+ c = atomic_fetchadd_int(&neg_cycle, 1) + 1;
+ nl = &neglists[c % numneglists];
+ if (!mtx_trylock(&nl->nl_evict_lock)) {
+ counter_u64_add(neg_evict_skipped_contended, 1);
+ return (NULL);
}
-
- return (NULL);
+ return (nl);
}
static void
-cache_negative_zap_one(void)
+cache_neg_evict(void)
{
struct namecache *ncp, *ncp2;
struct neglist *nl;
@@ -990,18 +1011,12 @@ cache_negative_zap_one(void)
struct mtx *dvlp;
struct mtx *blp;
- if (mtx_owner(&ncneg_shrink_lock) != NULL ||
- !mtx_trylock(&ncneg_shrink_lock)) {
- counter_u64_add(shrinking_skipped, 1);
- return;
- }
-
- nl = cache_negative_shrink_select();
- mtx_unlock(&ncneg_shrink_lock);
+ nl = cache_neg_evict_select();
if (nl == NULL) {
return;
}
+ mtx_lock(&nl->nl_lock);
ncp = TAILQ_FIRST(&nl->nl_hotlist);
if (ncp != NULL) {
ns = NCP2NEGSTATE(ncp);
@@ -1011,11 +1026,17 @@ cache_negative_zap_one(void)
ns->neg_flag &= ~NEG_HOT;
}
ncp = TAILQ_FIRST(&nl->nl_list);
- MPASS(ncp != NULL);
+ if (ncp == NULL) {
+ counter_u64_add(neg_evict_skipped_empty, 1);
+ mtx_unlock(&nl->nl_lock);
+ mtx_unlock(&nl->nl_evict_lock);
+ return;
+ }
ns = NCP2NEGSTATE(ncp);
dvlp = VP2VNODELOCK(ncp->nc_dvp);
blp = NCP2BUCKETLOCK(ncp);
mtx_unlock(&nl->nl_lock);
+ mtx_unlock(&nl->nl_evict_lock);
mtx_lock(dvlp);
mtx_lock(blp);
/*
@@ -1031,10 +1052,10 @@ cache_negative_zap_one(void)
ncp = NULL;
} else {
vfs_smr_exit();
- SDT_PROBE2(vfs, namecache, shrink_negative, done, ncp->nc_dvp,
+ SDT_PROBE2(vfs, namecache, evict_negative, done, ncp->nc_dvp,
ncp->nc_name);
cache_zap_locked(ncp);
- counter_u64_add(numneg_evicted, 1);
+ counter_u64_add(neg_evicted, 1);
}
mtx_unlock(blp);
mtx_unlock(dvlp);
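Note the ordering dance above: the victim is picked under nl_lock, but
zapping it requires its vnode and bucket locks, so both list locks are
dropped first and the entry is revalidated before cache_zap_locked()
runs (the visible vfs_smr_exit() pairs with an enter and a validity
check in the context elided from this hunk), since a concurrent lookup
or zap may have raced in the window. A self-contained sketch of the
pattern, with made-up names and the simplifying assumption that the
victim's memory stays type-stable across the window (the kernel gets
that guarantee from SMR):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct victim {
	pthread_mutex_t *dvlp;		/* vnode lock it must die under */
	pthread_mutex_t *blp;		/* hash bucket lock */
	bool valid;			/* cleared once somebody zaps it */
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

static void
evict_one(struct victim *(*pick)(void), void (*zap)(struct victim *))
{
	struct victim *v;
	pthread_mutex_t *dvlp, *blp;

	pthread_mutex_lock(&list_lock);
	v = pick();			/* say, head of the cold list */
	if (v == NULL) {
		pthread_mutex_unlock(&list_lock);
		return;
	}
	dvlp = v->dvlp;
	blp = v->blp;
	pthread_mutex_unlock(&list_lock);	/* race window opens */

	pthread_mutex_lock(dvlp);
	pthread_mutex_lock(blp);
	if (v->valid)			/* revalidate after the window */
		zap(v);
	pthread_mutex_unlock(blp);
	pthread_mutex_unlock(dvlp);
}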
@@ -1074,7 +1095,7 @@ cache_zap_locked(struct namecache *ncp)
} else {
SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
ncp->nc_name);
- cache_negative_remove(ncp);
+ cache_neg_remove(ncp);
}
if (ncp->nc_flag & NCF_ISDOTDOT) {
if (ncp == ncp->nc_dvp->v_cache_dd) {
@@ -1414,7 +1435,7 @@ negative_success:
cache_out_ts(ncp, tsp, ticksp);
counter_u64_add(numneghits, 1);
whiteout = (ncp->nc_flag & NCF_WHITE);
- cache_negative_hit(ncp);
+ cache_neg_hit(ncp);
mtx_unlock(dvlp);
if (whiteout)
cnp->cn_flags |= ISWHITEOUT;
@@ -1525,7 +1546,7 @@ negative_success:
cache_out_ts(ncp, tsp, ticksp);
counter_u64_add(numneghits, 1);
whiteout = (ncp->nc_flag & NCF_WHITE);
- cache_negative_hit(ncp);
+ cache_neg_hit(ncp);
mtx_unlock(blp);
if (whiteout)
cnp->cn_flags |= ISWHITEOUT;
@@ -1628,7 +1649,7 @@ negative_success:
}
if (!neg_hot) {
vfs_smr_exit();
- if (!cache_negative_promote_cond(dvp, cnp, ncp, hash))
+ if (!cache_neg_promote_cond(dvp, cnp, ncp, hash))
goto out_fallback;
} else {
SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp, ncp->nc_name);
@@ -1927,15 +1948,15 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp,
* Bugs:
* 1. filesystems may end up trying to add an already existing entry
* (for example this can happen after a cache miss during concurrent
- * lookup), in which case we will call cache_negative_zap_one despite
- * not adding anything.
+ * lookup), in which case we will call cache_neg_evict despite not
+ * adding anything.
* 2. the routine may fail to free anything and no provisions are made
* to make it try harder (see the inside for failure modes)
* 3. it only ever looks at negative entries.
*/
lnumcache = atomic_fetchadd_long(&numcache, 1) + 1;
if (numneg * ncnegfactor > lnumcache) {
- cache_negative_zap_one();
+ cache_neg_evict();
lnumcache = atomic_load_long(&numcache);
}
if (__predict_false(lnumcache >= ncsize)) {
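A worked number for the trigger above: eviction is attempted once
negative entries exceed 1/ncnegfactor of the whole cache. Assuming
ncnegfactor sits at its default of 5 (an assumption worth checking via
the vfs.ncnegfactor sysctl on a given system), a cache holding
1,000,000 entries begins evicting once numneg exceeds 200,000, since
only then does numneg * ncnegfactor > lnumcache hold.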
@@ -1956,7 +1977,7 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp,
ncp->nc_flag = flag | NCF_WIP;
ncp->nc_vp = vp;
if (vp == NULL)
- cache_negative_init(ncp);
+ cache_neg_init(ncp);
ncp->nc_dvp = dvp;
if (tsp != NULL) {
ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
@@ -2081,7 +2102,7 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp,
} else {
if (cnp->cn_flags & ISWHITEOUT)
ncp->nc_flag |= NCF_WHITE;
- cache_negative_insert(ncp);
+ cache_neg_insert(ncp);
SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
ncp->nc_name);
}
@@ -2183,12 +2204,11 @@ nchinit(void *dummy __unused)
mtx_init(&vnodelocks[i], "ncvn", NULL, MTX_DUPOK | MTX_RECURSE);
for (i = 0; i < numneglists; i++) {
+ mtx_init(&neglists[i].nl_evict_lock, "ncnege", NULL, MTX_DEF);
mtx_init(&neglists[i].nl_lock, "ncnegl", NULL, MTX_DEF);
TAILQ_INIT(&neglists[i].nl_list);
TAILQ_INIT(&neglists[i].nl_hotlist);
}
-
- mtx_init(&ncneg_shrink_lock, "ncnegs", NULL, MTX_DEF);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
@@ -3485,7 +3505,7 @@ cache_fplookup_negative_promote(struct cache_fpl *fpl,
dvp = fpl->dvp;
cache_fpl_smr_exit(fpl);
- if (cache_negative_promote_cond(dvp, cnp, oncp, hash))
+ if (cache_neg_promote_cond(dvp, cnp, oncp, hash))
return (cache_fpl_handled(fpl, ENOENT));
else
return (cache_fpl_aborted(fpl));