svn commit: r362828 - head/sys/kern

Mateusz Guzik mjg at FreeBSD.org
Wed Jul 1 05:59:09 UTC 2020


Author: mjg
Date: Wed Jul  1 05:59:08 2020
New Revision: 362828
URL: https://svnweb.freebsd.org/changeset/base/362828

Log:
  cache: lockless forward lookup with smr
  
  This eliminates the need to take bucket locks in the common case.
  
  Concurrent lookup utilizing the same vnodes is still bottlenecked on referencing
  and locking path components; this will be taken care of separately.
  
  Reviewed by:	kib
  Tested by:	pho
  Differential Revision:	https://reviews.freebsd.org/D23913
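
In outline, the lockless path works like this: cache_lookup() enters an SMR
read section instead of taking the bucket rwlock, walks the hash chain (now a
CK_LIST so concurrent readers see a consistent list), and on a hit checks that
the entry was not concurrently invalidated before acquiring a reference with
vget_prep_smr(); if any of these steps fails it leaves the read section and
retries, this time falling back to the bucket lock. CREATE lookups and
negative-entry promotion still go through the locked path. A condensed sketch
of the hit path, using the names from the patch but omitting the negative-entry
and dotdot handling:

	vfs_smr_enter();
	CK_LIST_FOREACH(ncp, NCHHASH(hash), nc_hash) {
		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
			break;
	}
	if (ncp == NULL) {
		/* Miss: leave the read section, the caller falls through to the fs. */
		vfs_smr_exit();
		return (0);
	}
	*vpp = ncp->nc_vp;
	if (cache_ncp_invalid(ncp)) {
		/* The entry was concurrently removed; start over. */
		vfs_smr_exit();
		*vpp = NULL;
		goto retry;
	}
	vs = vget_prep_smr(*vpp);
	vfs_smr_exit();
	if (vs == VGET_NONE) {
		/* The vnode is being reclaimed; start over. */
		*vpp = NULL;
		goto retry;
	}
	error = vget_finish(*vpp, cnp->cn_lkflags, vs);

Since try_smr is cleared after the first pass, a retry caused by a lost race
takes the conventional locked path, so forward progress does not depend on
winning the race.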

Modified:
  head/sys/kern/vfs_cache.c

Modified: head/sys/kern/vfs_cache.c
==============================================================================
--- head/sys/kern/vfs_cache.c	Wed Jul  1 05:56:29 2020	(r362827)
+++ head/sys/kern/vfs_cache.c	Wed Jul  1 05:59:08 2020	(r362828)
@@ -56,11 +56,13 @@ __FBSDID("$FreeBSD$");
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sdt.h>
+#include <sys/smr.h>
 #include <sys/smp.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/vnode.h>
+#include <ck_queue.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
@@ -104,7 +106,7 @@ SDT_PROBE_DEFINE2(vfs, namecache, shrink_negative, don
  */
 
 struct	namecache {
-	LIST_ENTRY(namecache) nc_hash;	/* hash chain */
+	CK_LIST_ENTRY(namecache) nc_hash;/* hash chain */
 	LIST_ENTRY(namecache) nc_src;	/* source vnode list */
 	TAILQ_ENTRY(namecache) nc_dst;	/* destination vnode list */
 	struct	vnode *nc_dvp;		/* vnode of parent of name */
@@ -143,7 +145,26 @@ struct	namecache_ts {
 #define	NCF_DVDROP	0x10
 #define	NCF_NEGATIVE	0x20
 #define	NCF_HOTNEGATIVE	0x40
+#define NCF_INVALID	0x80
 
+static bool
+cache_ncp_invalid(struct namecache *ncp)
+{
+
+	atomic_thread_fence_acq();
+	return ((ncp->nc_flag & NCF_INVALID) != 0);
+}
+
+static void
+cache_ncp_invalidate(struct namecache *ncp)
+{
+
+	atomic_thread_fence_rel();
+	KASSERT((ncp->nc_flag & NCF_INVALID) == 0,
+	    ("%s: entry %p already invalid", __func__, ncp));
+	ncp->nc_flag |= NCF_INVALID;
+}
+
 /*
  * Name caching works as follows:
  *
@@ -192,12 +213,14 @@ struct	namecache_ts {
  * the first node, locking everything in order and revalidating the state.
  */
 
+VFS_SMR_DECLARE;
+
 /*
  * Structures associated with name caching.
  */
 #define NCHHASH(hash) \
 	(&nchashtbl[(hash) & nchash])
-static __read_mostly LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
+static __read_mostly CK_LIST_HEAD(nchashhead, namecache) *nchashtbl;/* Hash Table */
 static u_long __read_mostly	nchash;			/* size of hash table */
 SYSCTL_ULONG(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0,
     "Size of namecache hash table");
@@ -275,15 +298,15 @@ cache_alloc(int len, int ts)
 
 	if (__predict_false(ts)) {
 		if (len <= CACHE_PATH_CUTOFF)
-			ncp_ts = uma_zalloc(cache_zone_small_ts, M_WAITOK);
+			ncp_ts = uma_zalloc_smr(cache_zone_small_ts, M_WAITOK);
 		else
-			ncp_ts = uma_zalloc(cache_zone_large_ts, M_WAITOK);
+			ncp_ts = uma_zalloc_smr(cache_zone_large_ts, M_WAITOK);
 		ncp = &ncp_ts->nc_nc;
 	} else {
 		if (len <= CACHE_PATH_CUTOFF)
-			ncp = uma_zalloc(cache_zone_small, M_WAITOK);
+			ncp = uma_zalloc_smr(cache_zone_small, M_WAITOK);
 		else
-			ncp = uma_zalloc(cache_zone_large, M_WAITOK);
+			ncp = uma_zalloc_smr(cache_zone_large, M_WAITOK);
 	}
 	return (ncp);
 }
@@ -300,14 +323,14 @@ cache_free(struct namecache *ncp)
 	if (__predict_false(ncp->nc_flag & NCF_TS)) {
 		ncp_ts = __containerof(ncp, struct namecache_ts, nc_nc);
 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
-			uma_zfree(cache_zone_small_ts, ncp_ts);
+			uma_zfree_smr(cache_zone_small_ts, ncp_ts);
 		else
-			uma_zfree(cache_zone_large_ts, ncp_ts);
+			uma_zfree_smr(cache_zone_large_ts, ncp_ts);
 	} else {
 		if (ncp->nc_nlen <= CACHE_PATH_CUTOFF)
-			uma_zfree(cache_zone_small, ncp);
+			uma_zfree_smr(cache_zone_small, ncp);
 		else
-			uma_zfree(cache_zone_large, ncp);
+			uma_zfree_smr(cache_zone_large, ncp);
 	}
 }
 
@@ -606,7 +629,7 @@ retry:
 	}
 	/* Scan hash tables counting entries */
 	for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
-		LIST_FOREACH(ncp, ncpp, nc_hash)
+		CK_LIST_FOREACH(ncp, ncpp, nc_hash)
 			cntbuf[i]++;
 	cache_unlock_all_buckets();
 	for (error = 0, i = 0; i < n_nchash; i++)
@@ -639,7 +662,7 @@ sysctl_debug_hashstat_nchash(SYSCTL_HANDLER_ARGS)
 	/* Scan hash tables for applicable entries */
 	for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
 		count = 0;
-		LIST_FOREACH(ncp, ncpp, nc_hash) {
+		CK_LIST_FOREACH(ncp, ncpp, nc_hash) {
 			count++;
 		}
 		if (count)
@@ -860,7 +883,10 @@ cache_zap_locked(struct namecache *ncp, bool neg_locke
 
 	CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp,
 	    (ncp->nc_flag & NCF_NEGATIVE) ? NULL : ncp->nc_vp);
-	LIST_REMOVE(ncp, nc_hash);
+
+	cache_ncp_invalidate(ncp);
+
+	CK_LIST_REMOVE(ncp, nc_hash);
 	if (!(ncp->nc_flag & NCF_NEGATIVE)) {
 		SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
 		    ncp->nc_name, ncp->nc_vp);
@@ -1013,7 +1039,7 @@ cache_zap_unlocked_bucket(struct namecache *ncp, struc
 	cache_sort_vnodes(&dvlp, &vlp);
 	cache_lock_vnodes(dvlp, vlp);
 	rw_wlock(blp);
-	LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
+	CK_LIST_FOREACH(rncp, (NCHHASH(hash)), nc_hash) {
 		if (rncp == ncp && rncp->nc_dvp == dvp &&
 		    rncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(rncp->nc_name, cnp->cn_nameptr, rncp->nc_nlen))
@@ -1225,12 +1251,12 @@ retry_dotdot:
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
 	blp = HASH2BUCKETLOCK(hash);
 retry:
-	if (LIST_EMPTY(NCHHASH(hash)))
+	if (CK_LIST_EMPTY(NCHHASH(hash)))
 		goto out_no_entry;
 
 	rw_wlock(blp);
 
-	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		counter_u64_add(numchecks, 1);
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
@@ -1305,6 +1331,7 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, st
 	uint32_t hash;
 	enum vgetstate vs;
 	int error, ltype;
+	bool try_smr, doing_smr;
 
 #ifdef DEBUG_CACHE
 	if (__predict_false(!doingcache)) {
@@ -1321,7 +1348,11 @@ cache_lookup(struct vnode *dvp, struct vnode **vpp, st
 	if ((cnp->cn_flags & MAKEENTRY) == 0)
 		return (cache_lookup_nomakeentry(dvp, vpp, cnp, tsp, ticksp));
 
+	try_smr = true;
+	if (cnp->cn_nameiop == CREATE)
+		try_smr = false;
 retry:
+	doing_smr = false;
 	blp = NULL;
 	dvlp = NULL;
 	error = 0;
@@ -1361,10 +1392,17 @@ retry:
 	}
 
 	hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
-	blp = HASH2BUCKETLOCK(hash);
-	rw_rlock(blp);
+retry_hashed:
+	if (try_smr) {
+		vfs_smr_enter();
+		doing_smr = true;
+		try_smr = false;
+	} else {
+		blp = HASH2BUCKETLOCK(hash);
+		rw_rlock(blp);
+	}
 
-	LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
+	CK_LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
 		counter_u64_add(numchecks, 1);
 		if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen))
@@ -1373,7 +1411,10 @@ retry:
 
 	/* We failed to find an entry */
 	if (__predict_false(ncp == NULL)) {
-		rw_runlock(blp);
+		if (doing_smr)
+			vfs_smr_exit();
+		else
+			rw_runlock(blp);
 		SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
 		    NULL);
 		counter_u64_add(nummiss, 1);
@@ -1402,8 +1443,22 @@ success:
 		ltype = VOP_ISLOCKED(dvp);
 		VOP_UNLOCK(dvp);
 	}
-	vs = vget_prep(*vpp);
-	cache_lookup_unlock(blp, dvlp);
+	if (doing_smr) {
+		if (cache_ncp_invalid(ncp)) {
+			vfs_smr_exit();
+			*vpp = NULL;
+			goto retry;
+		}
+		vs = vget_prep_smr(*vpp);
+		vfs_smr_exit();
+		if (vs == VGET_NONE) {
+			*vpp = NULL;
+			goto retry;
+		}
+	} else {
+		vs = vget_prep(*vpp);
+		cache_lookup_unlock(blp, dvlp);
+	}
 	error = vget_finish(*vpp, cnp->cn_lkflags, vs);
 	if (cnp->cn_flags & ISDOTDOT) {
 		vn_lock(dvp, ltype | LK_RETRY);
@@ -1427,21 +1482,37 @@ success:
 negative_success:
 	/* We found a negative match, and want to create it, so purge */
 	if (cnp->cn_nameiop == CREATE) {
+		MPASS(!doing_smr);
 		counter_u64_add(numnegzaps, 1);
 		goto zap_and_exit;
 	}
 
+	if (doing_smr) {
+		if ((ncp->nc_flag & NCF_HOTNEGATIVE) == 0) {
+			/*
+			 * We need to take locks to promote the entry.
+			 */
+			vfs_smr_exit();
+			doing_smr = false;
+			goto retry_hashed;
+		}
+	} else {
+		cache_negative_hit(ncp);
+	}
 	counter_u64_add(numneghits, 1);
-	cache_negative_hit(ncp);
 	if (ncp->nc_flag & NCF_WHITE)
 		cnp->cn_flags |= ISWHITEOUT;
 	SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
 	    ncp->nc_name);
 	cache_out_ts(ncp, tsp, ticksp);
-	cache_lookup_unlock(blp, dvlp);
+	if (doing_smr)
+		vfs_smr_exit();
+	else
+		cache_lookup_unlock(blp, dvlp);
 	return (ENOENT);
 
 zap_and_exit:
+	MPASS(!doing_smr);
 	if (blp != NULL)
 		error = cache_zap_rlocked_bucket(ncp, cnp, hash, blp);
 	else
@@ -1775,7 +1846,7 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, 
 	 * the same path name.
 	 */
 	ncpp = NCHHASH(hash);
-	LIST_FOREACH(n2, ncpp, nc_hash) {
+	CK_LIST_FOREACH(n2, ncpp, nc_hash) {
 		if (n2->nc_dvp == dvp &&
 		    n2->nc_nlen == cnp->cn_namelen &&
 		    !bcmp(n2->nc_name, cnp->cn_nameptr, n2->nc_nlen)) {
@@ -1840,12 +1911,6 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, 
 	}
 
 	/*
-	 * Insert the new namecache entry into the appropriate chain
-	 * within the cache entries table.
-	 */
-	LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
-
-	/*
 	 * If the entry is "negative", we place it into the
 	 * "negative" cache queue, otherwise, we place it into the
 	 * destination vnode's cache entries queue.
@@ -1861,6 +1926,14 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, 
 		SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
 		    ncp->nc_name);
 	}
+
+	atomic_thread_fence_rel();
+	/*
+	 * Insert the new namecache entry into the appropriate chain
+	 * within the cache entries table.
+	 */
+	CK_LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
 	cache_enter_unlock(&cel);
 	if (numneg * ncnegfactor > lnumcache)
 		cache_negative_zap_one();
@@ -1908,6 +1981,11 @@ nchinit(void *dummy __unused)
 	    NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
 	    UMA_ZONE_ZINIT);
 
+	VFS_SMR_ZONE_SET(cache_zone_small);
+	VFS_SMR_ZONE_SET(cache_zone_small_ts);
+	VFS_SMR_ZONE_SET(cache_zone_large);
+	VFS_SMR_ZONE_SET(cache_zone_large_ts);
+
 	ncsize = desiredvnodes * ncsizefactor;
 	nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
 	ncbuckethash = cache_roundup_2(mp_ncpus * mp_ncpus) - 1;
@@ -1973,11 +2051,11 @@ cache_changesize(u_long newmaxvnodes)
 	nchashtbl = new_nchashtbl;
 	nchash = new_nchash;
 	for (i = 0; i <= old_nchash; i++) {
-		while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
+		while ((ncp = CK_LIST_FIRST(&old_nchashtbl[i])) != NULL) {
 			hash = cache_get_hash(ncp->nc_name, ncp->nc_nlen,
 			    ncp->nc_dvp);
-			LIST_REMOVE(ncp, nc_hash);
-			LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
+			CK_LIST_REMOVE(ncp, nc_hash);
+			CK_LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
 		}
 	}
 	ncsize = newncsize;
@@ -2091,7 +2169,7 @@ cache_purgevfs(struct mount *mp, bool force)
 		for (j = i; j < n_nchash; j += numbucketlocks) {
 retry:
 			bucket = &nchashtbl[j];
-			LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
+			CK_LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
 				cache_assert_bucket_locked(ncp, RA_WLOCKED);
 				if (ncp->nc_dvp->v_mount != mp)
 					continue;

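Correctness of the lockless walk hinges on memory ordering between writers and
SMR readers: cache_enter_time() fully initializes an entry and issues
atomic_thread_fence_rel() before linking it into the hash chain, and
cache_zap_locked() calls cache_ncp_invalidate() (setting NCF_INVALID around a
release fence) before unlinking, while cache_ncp_invalid() issues an acquire
fence before testing the flag. The sketch below is a self-contained userspace
illustration of the same publish-with-release / read-with-acquire idea using
C11 atomics; the names and types are invented for the example and this is not
the kernel code.

/*
 * Userspace illustration only: publish a fully initialized node with a
 * release fence, observe it with an acquire fence.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	int payload;			/* stands in for nc_name, nc_vp, ... */
	struct entry *next;		/* stands in for the CK_LIST linkage */
};

static _Atomic(struct entry *) head;

/* Writer side: initialize everything, fence, then link (cf. cache_enter_time()). */
static void
publish(int value)
{
	struct entry *e;

	e = malloc(sizeof(*e));
	e->payload = value;
	e->next = atomic_load_explicit(&head, memory_order_relaxed);
	atomic_thread_fence(memory_order_release);
	atomic_store_explicit(&head, e, memory_order_relaxed);
}

/* Reader side: load the link, fence, then the fields are safe to inspect. */
static int
lookup_first(void)
{
	struct entry *e;

	e = atomic_load_explicit(&head, memory_order_relaxed);
	if (e == NULL)
		return (-1);
	atomic_thread_fence(memory_order_acquire);
	return (e->payload);
}

int
main(void)
{
	publish(42);
	printf("first entry: %d\n", lookup_first());
	return (0);
}

In the kernel the reader side runs between vfs_smr_enter() and vfs_smr_exit(),
and the entries come from SMR-enabled UMA zones (uma_zalloc_smr() /
uma_zfree_smr()), which is what guarantees that memory observed by a reader is
not reused until the reader has left the section.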
