svn commit: r305688 - in user/alc/PQ_LAUNDRY: sys/amd64/amd64 sys/arm/arm sys/arm64/arm64 sys/cddl/compat/opensolaris/sys sys/i386/i386 sys/kern sys/powerpc/booke sys/powerpc/conf sys/riscv/riscv s...
Alan Cox
alc at FreeBSD.org
Sat Sep 10 17:15:00 UTC 2016
Author: alc
Date: Sat Sep 10 17:14:57 2016
New Revision: 305688
URL: https://svnweb.freebsd.org/changeset/base/305688
Log:
MFH r305685
Added:
user/alc/PQ_LAUNDRY/tests/sys/kern/waitpid_nohang.c
- copied unchanged from r305685, head/tests/sys/kern/waitpid_nohang.c
Modified:
user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c
user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c
user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c
user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h
user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c
user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c
user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c
user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c
user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c
user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c
user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX
user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c
user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c
user/alc/PQ_LAUNDRY/sys/vm/pmap.h
user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile
Directory Properties:
user/alc/PQ_LAUNDRY/ (props changed)
Modified: user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/amd64/amd64/pmap.c Sat Sep 10 17:14:57 2016 (r305688)
@@ -5816,8 +5816,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
return (FALSE);
}
-#define PMAP_TS_REFERENCED_MAX 5
-
/*
* pmap_ts_referenced:
*
@@ -5826,10 +5824,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
* is necessary that 0 only be returned when there are truly no
* reference bits set.
*
- * XXX: The exact number of bits to check and clear is a matter that
- * should be tested and standardized at some point in the future for
- * optimal aging of shared pages.
- *
* As an optimization, update the page's dirty field if a modified bit is
* found while counting reference bits. This opportunistic update can be
* performed at low cost and can eliminate the need for some future calls
@@ -5898,7 +5892,7 @@ retry:
*/
vm_page_dirty(m);
}
- if ((*pde & PG_A) != 0) {
+ if ((oldpde & PG_A) != 0) {
/*
* Since this reference bit is shared by 512 4KB
* pages, it should not be cleared every time it is
@@ -5919,7 +5913,7 @@ retry:
*/
if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
(uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
- (*pde & PG_W) == 0) {
+ (oldpde & PG_W) == 0) {
if (safe_to_clear_referenced(pmap, oldpde)) {
atomic_clear_long(pde, PG_A);
pmap_invalidate_page(pmap, pv->pv_va);
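
The amd64 hunks above do two things: they retire the per-architecture PMAP_TS_REFERENCED_MAX definition in favor of the shared one added to vm/pmap.h below, and they test the previously loaded oldpde instead of re-reading *pde, so the accessed- and wired-bit checks operate on a single snapshot of the entry rather than on whatever *pde holds at the moment of each test. A minimal user-space sketch of that snapshot pattern follows; every name in it (entry_t, ENTRY_DIRTY, ENTRY_ACCESSED, count_reference) is a hypothetical stand-in for illustration, not the kernel's pd_entry_t, PG_M, PG_A, or vm_page_dirty().

    #include <stdbool.h>
    #include <stdint.h>

    /*
     * Sketch of the "snapshot the entry once" pattern used in the
     * pmap_ts_referenced() hunks in this commit.  Illustrative names only.
     */
    typedef uint64_t entry_t;

    #define ENTRY_ACCESSED  0x0020
    #define ENTRY_DIRTY     0x0040

    static int
    count_reference(volatile entry_t *ep, bool *page_dirty)
    {
            entry_t e;

            e = *ep;                        /* one snapshot of the entry */
            if ((e & ENTRY_DIRTY) != 0)
                    *page_dirty = true;     /* opportunistic dirty update */
            /* Test the same snapshot, not a fresh read of *ep. */
            return ((e & ENTRY_ACCESSED) != 0);
    }

The same pattern shows up in the riscv hunk further down, where pmap_load(l3) is captured once into old_l3 and both the dirty and the accessed checks use that value.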
Modified: user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/arm/arm/pmap-v6.c Sat Sep 10 17:14:57 2016 (r305688)
@@ -5161,8 +5161,6 @@ pmap_is_referenced(vm_page_t m)
return (rv);
}
-#define PMAP_TS_REFERENCED_MAX 5
-
/*
* pmap_ts_referenced:
*
@@ -5171,10 +5169,6 @@ pmap_is_referenced(vm_page_t m)
* is necessary that 0 only be returned when there are truly no
* reference bits set.
*
- * XXX: The exact number of bits to check and clear is a matter that
- * should be tested and standardized at some point in the future for
- * optimal aging of shared pages.
- *
* As an optimization, update the page's dirty field if a modified bit is
* found while counting reference bits. This opportunistic update can be
* performed at low cost and can eliminate the need for some future calls
Modified: user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/arm64/arm64/pmap.c Sat Sep 10 17:14:57 2016 (r305688)
@@ -3880,8 +3880,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
return (FALSE);
}
-#define PMAP_TS_REFERENCED_MAX 5
-
/*
* pmap_ts_referenced:
*
@@ -3890,9 +3888,13 @@ safe_to_clear_referenced(pmap_t pmap, pt
* is necessary that 0 only be returned when there are truly no
* reference bits set.
*
- * XXX: The exact number of bits to check and clear is a matter that
- * should be tested and standardized at some point in the future for
- * optimal aging of shared pages.
+ * As an optimization, update the page's dirty field if a modified bit is
+ * found while counting reference bits. This opportunistic update can be
+ * performed at low cost and can eliminate the need for some future calls
+ * to pmap_is_modified(). However, since this function stops after
+ * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
+ * dirty pages. Those dirty pages will only be detected by a future call
+ * to pmap_is_modified().
*/
int
pmap_ts_referenced(vm_page_t m)
@@ -3947,6 +3949,14 @@ retry:
("pmap_ts_referenced: found an invalid l1 table"));
pte = pmap_l1_to_l2(pde, pv->pv_va);
tpte = pmap_load(pte);
+ if (pmap_page_dirty(tpte)) {
+ /*
+ * Although "tpte" is mapping a 2MB page, because
+ * this function is called at a 4KB page granularity,
+ * we only update the 4KB page under test.
+ */
+ vm_page_dirty(m);
+ }
if ((tpte & ATTR_AF) != 0) {
/*
* Since this reference bit is shared by 512 4KB
@@ -4043,6 +4053,8 @@ small_mappings:
("pmap_ts_referenced: found an invalid l2 table"));
pte = pmap_l2_to_l3(pde, pv->pv_va);
tpte = pmap_load(pte);
+ if (pmap_page_dirty(tpte))
+ vm_page_dirty(m);
if ((tpte & ATTR_AF) != 0) {
if (safe_to_clear_referenced(pmap, tpte)) {
/*
Modified: user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/cddl/compat/opensolaris/sys/random.h Sat Sep 10 17:14:57 2016 (r305688)
@@ -32,6 +32,6 @@
#include_next <sys/random.h>
#define random_get_bytes(p, s) read_random((p), (int)(s))
-#define random_get_pseudo_bytes(p, s) read_random((p), (int)(s))
+#define random_get_pseudo_bytes(p, s) arc4rand((p), (int)(s), 0)
#endif /* !_OPENSOLARIS_SYS_RANDOM_H_ */
Modified: user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/i386/i386/pmap.c Sat Sep 10 17:14:57 2016 (r305688)
@@ -4765,8 +4765,6 @@ retry:
rw_wunlock(&pvh_global_lock);
}
-#define PMAP_TS_REFERENCED_MAX 5
-
/*
* pmap_ts_referenced:
*
@@ -4775,10 +4773,6 @@ retry:
* is necessary that 0 only be returned when there are truly no
* reference bits set.
*
- * XXX: The exact number of bits to check and clear is a matter that
- * should be tested and standardized at some point in the future for
- * optimal aging of shared pages.
- *
* As an optimization, update the page's dirty field if a modified bit is
* found while counting reference bits. This opportunistic update can be
* performed at low cost and can eliminate the need for some future calls
Modified: user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/kern_exit.c Sat Sep 10 17:14:57 2016 (r305688)
@@ -723,9 +723,9 @@ sys_wait4(struct thread *td, struct wait
else
rup = NULL;
error = kern_wait(td, uap->pid, &status, uap->options, rup);
- if (uap->status != NULL && error == 0)
+ if (uap->status != NULL && error == 0 && td->td_retval[0] != 0)
error = copyout(&status, uap->status, sizeof(status));
- if (uap->rusage != NULL && error == 0)
+ if (uap->rusage != NULL && error == 0 && td->td_retval[0] != 0)
error = copyout(&ru, uap->rusage, sizeof(struct rusage));
return (error);
}
@@ -759,9 +759,9 @@ sys_wait6(struct thread *td, struct wait
*/
error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip);
- if (uap->status != NULL && error == 0)
+ if (uap->status != NULL && error == 0 && td->td_retval[0] != 0)
error = copyout(&status, uap->status, sizeof(status));
- if (uap->wrusage != NULL && error == 0)
+ if (uap->wrusage != NULL && error == 0 && td->td_retval[0] != 0)
error = copyout(&wru, uap->wrusage, sizeof(wru));
if (uap->info != NULL && error == 0)
error = copyout(&si, uap->info, sizeof(si));
Modified: user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/subr_witness.c Sat Sep 10 17:14:57 2016 (r305688)
@@ -623,6 +623,14 @@ static struct witness_order_list_entry o
{ "vnode interlock", &lock_class_mtx_sleep },
{ NULL, NULL },
/*
+ * VFS namecache
+ */
+ { "ncglobal", &lock_class_rw },
+ { "ncbuc", &lock_class_rw },
+ { "vnode interlock", &lock_class_mtx_sleep },
+ { "ncneg", &lock_class_mtx_sleep },
+ { NULL, NULL },
+ /*
* ZFS locking
*/
{ "dn->dn_mtx", &lock_class_sx },
Modified: user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/uipc_syscalls.c Sat Sep 10 17:14:57 2016 (r305688)
@@ -1115,7 +1115,7 @@ orecvfrom(struct thread *td, struct recv
#ifdef COMPAT_OLDSOCK
int
-orecv(struct thread *td, struct orecv_args)
+orecv(struct thread *td, struct orecv_args *uap)
{
struct msghdr msg;
struct iovec aiov;
Modified: user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/kern/vfs_cache.c Sat Sep 10 17:14:57 2016 (r305688)
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sdt.h>
+#include <sys/smp.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysproto.h>
@@ -148,6 +149,23 @@ struct namecache_ts {
* Upon reaching the last segment of a path, if the reference
* is for DELETE, or NOCACHE is set (rewrite), and the
* name is located in the cache, it will be dropped.
+ *
+ * These locks are used (in the order in which they can be taken):
+ * NAME TYPE ROLE
+ * cache_lock rwlock global, needed for all modifications
+ * bucketlock rwlock for access to given hash bucket
+ * ncneg_mtx mtx negative entry LRU management
+ *
+ * A name -> vnode lookup can be safely performed by either locking cache_lock
+ * or the relevant hash bucket.
+ *
+ * ".." and vnode -> name lookups require cache_lock.
+ *
+ * Modifications require both cache_lock and relevant bucketlock taken for
+ * writing.
+ *
+ * Negative entry LRU management requires ncneg_mtx taken on top of either
+ * cache_lock or bucketlock.
*/
/*
@@ -179,8 +197,9 @@ SYSCTL_UINT(_vfs, OID_AUTO, ncsizefactor
struct nchstats nchstats; /* cache effectiveness statistics */
static struct rwlock cache_lock;
-RW_SYSINIT(vfscache, &cache_lock, "Name Cache");
+RW_SYSINIT(vfscache, &cache_lock, "ncglobal");
+#define CACHE_TRY_WLOCK() rw_try_wlock(&cache_lock)
#define CACHE_UPGRADE_LOCK() rw_try_upgrade(&cache_lock)
#define CACHE_RLOCK() rw_rlock(&cache_lock)
#define CACHE_RUNLOCK() rw_runlock(&cache_lock)
@@ -188,7 +207,12 @@ RW_SYSINIT(vfscache, &cache_lock, "Name
#define CACHE_WUNLOCK() rw_wunlock(&cache_lock)
static struct mtx_padalign ncneg_mtx;
-MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "Name Cache neg", MTX_DEF);
+MTX_SYSINIT(vfscache_neg, &ncneg_mtx, "ncneg", MTX_DEF);
+
+static u_int numbucketlocks;
+static struct rwlock_padalign *bucketlocks;
+#define HASH2BUCKETLOCK(hash) \
+ ((struct rwlock *)(&bucketlocks[((hash) % numbucketlocks)]))
/*
* UMA zones for the VFS cache.
@@ -307,6 +331,8 @@ STATNODE_COUNTER(numfullpathfail4, "Numb
STATNODE_COUNTER(numfullpathfound, "Number of successful fullpath calls");
static long numupgrades; STATNODE_ULONG(numupgrades,
"Number of updates of the cache after lookup (write lock + retry)");
+static long zap_and_exit_bucket_fail; STATNODE_ULONG(zap_and_exit_bucket_fail,
+ "Number of times bucketlocked zap_and_exit case failed to writelock");
static void cache_zap(struct namecache *ncp);
static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
@@ -326,6 +352,39 @@ cache_get_hash(char *name, u_char len, s
return (hash);
}
+#ifdef INVARIANTS
+static void
+cache_assert_bucket_locked(struct namecache *ncp, int mode)
+{
+ struct rwlock *bucketlock;
+ uint32_t hash;
+
+ hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+ bucketlock = HASH2BUCKETLOCK(hash);
+ rw_assert(bucketlock, mode);
+}
+#else
+#define cache_assert_bucket_locked(x, y) do { } while (0)
+#endif
+
+static void
+cache_lock_all_buckets(void)
+{
+ u_int i;
+
+ for (i = 0; i < numbucketlocks; i++)
+ rw_wlock(&bucketlocks[i]);
+}
+
+static void
+cache_unlock_all_buckets(void)
+{
+ u_int i;
+
+ for (i = 0; i < numbucketlocks; i++)
+ rw_wunlock(&bucketlocks[i]);
+}
+
static int
sysctl_nchstats(SYSCTL_HANDLER_ARGS)
{
@@ -442,21 +501,13 @@ SYSCTL_PROC(_debug_hashstat, OID_AUTO, n
* Negative entries management
*/
static void
-cache_negative_hit(struct namecache *ncp, int wlocked)
+cache_negative_hit(struct namecache *ncp)
{
- if (!wlocked) {
- rw_assert(&cache_lock, RA_RLOCKED);
- mtx_lock(&ncneg_mtx);
- } else {
- rw_assert(&cache_lock, RA_WLOCKED);
- }
-
+ mtx_lock(&ncneg_mtx);
TAILQ_REMOVE(&ncneg, ncp, nc_dst);
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
-
- if (!wlocked)
- mtx_unlock(&ncneg_mtx);
+ mtx_unlock(&ncneg_mtx);
}
static void
@@ -464,9 +515,12 @@ cache_negative_insert(struct namecache *
{
rw_assert(&cache_lock, RA_WLOCKED);
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
MPASS(ncp->nc_vp == NULL);
+ mtx_lock(&ncneg_mtx);
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
numneg++;
+ mtx_unlock(&ncneg_mtx);
}
static void
@@ -474,9 +528,12 @@ cache_negative_remove(struct namecache *
{
rw_assert(&cache_lock, RA_WLOCKED);
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
MPASS(ncp->nc_vp == NULL);
+ mtx_lock(&ncneg_mtx);
TAILQ_REMOVE(&ncneg, ncp, nc_dst);
numneg--;
+ mtx_unlock(&ncneg_mtx);
}
static struct namecache *
@@ -499,10 +556,11 @@ cache_negative_zap_one(void)
* pointer to a vnode or if it is just a negative cache entry.
*/
static void
-cache_zap(struct namecache *ncp)
+cache_zap_locked(struct namecache *ncp)
{
rw_assert(&cache_lock, RA_WLOCKED);
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
if (ncp->nc_vp != NULL) {
SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
@@ -532,6 +590,21 @@ cache_zap(struct namecache *ncp)
numcache--;
}
+static void
+cache_zap(struct namecache *ncp)
+{
+ struct rwlock *bucketlock;
+ uint32_t hash;
+
+ rw_assert(&cache_lock, RA_WLOCKED);
+
+ hash = cache_get_hash(nc_get_name(ncp), ncp->nc_nlen, ncp->nc_dvp);
+ bucketlock = HASH2BUCKETLOCK(hash);
+ rw_wlock(bucketlock);
+ cache_zap_locked(ncp);
+ rw_wunlock(bucketlock);
+}
+
/*
* Lookup an entry in the cache
*
@@ -549,22 +622,42 @@ cache_zap(struct namecache *ncp)
* not recursively acquired.
*/
+enum { UNLOCKED, WLOCKED, RLOCKED };
+
+static void
+cache_unlock(int cache_locked)
+{
+
+ switch (cache_locked) {
+ case UNLOCKED:
+ break;
+ case WLOCKED:
+ CACHE_WUNLOCK();
+ break;
+ case RLOCKED:
+ CACHE_RUNLOCK();
+ break;
+ }
+}
+
int
cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
struct timespec *tsp, int *ticksp)
{
+ struct rwlock *bucketlock;
struct namecache *ncp;
uint32_t hash;
- int error, ltype, wlocked;
+ int error, ltype, cache_locked;
if (!doingcache) {
cnp->cn_flags &= ~MAKEENTRY;
return (0);
}
retry:
- wlocked = 0;
- counter_u64_add(numcalls, 1);
+ bucketlock = NULL;
+ cache_locked = UNLOCKED;
error = 0;
+ counter_u64_add(numcalls, 1);
retry_wlocked:
if (cnp->cn_nameptr[0] == '.') {
@@ -598,17 +691,21 @@ retry_wlocked:
}
return (-1);
}
- if (!wlocked)
- CACHE_RLOCK();
if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
counter_u64_add(dotdothits, 1);
+ if (cache_locked == UNLOCKED) {
+ CACHE_RLOCK();
+ cache_locked = RLOCKED;
+ }
+
if (dvp->v_cache_dd == NULL) {
SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
"..", NULL);
goto unlock;
}
if ((cnp->cn_flags & MAKEENTRY) == 0) {
- if (!wlocked && !CACHE_UPGRADE_LOCK())
+ if (cache_locked != WLOCKED &&
+ !CACHE_UPGRADE_LOCK())
goto wlock;
ncp = NULL;
if (dvp->v_cache_dd->nc_flag & NCF_ISDOTDOT) {
@@ -639,10 +736,14 @@ retry_wlocked:
nc_dotdottime;
goto success;
}
- } else if (!wlocked)
- CACHE_RLOCK();
+ }
hash = cache_get_hash(cnp->cn_nameptr, cnp->cn_namelen, dvp);
+ if (cache_locked == UNLOCKED) {
+ bucketlock = HASH2BUCKETLOCK(hash);
+ rw_rlock(bucketlock);
+ }
+
LIST_FOREACH(ncp, (NCHHASH(hash)), nc_hash) {
counter_u64_add(numchecks, 1);
if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen &&
@@ -665,12 +766,7 @@ retry_wlocked:
/* We don't want to have an entry, so dump it */
if ((cnp->cn_flags & MAKEENTRY) == 0) {
counter_u64_add(numposzaps, 1);
- if (!wlocked && !CACHE_UPGRADE_LOCK())
- goto wlock;
- cache_zap(ncp);
- CACHE_WUNLOCK();
- cache_free(ncp);
- return (0);
+ goto zap_and_exit;
}
/* We found a "positive" match, return the vnode */
@@ -689,25 +785,20 @@ negative_success:
/* We found a negative match, and want to create it, so purge */
if (cnp->cn_nameiop == CREATE) {
counter_u64_add(numnegzaps, 1);
- if (!wlocked && !CACHE_UPGRADE_LOCK())
- goto wlock;
- cache_zap(ncp);
- CACHE_WUNLOCK();
- cache_free(ncp);
- return (0);
+ goto zap_and_exit;
}
counter_u64_add(numneghits, 1);
- cache_negative_hit(ncp, wlocked);
+ cache_negative_hit(ncp);
if (ncp->nc_flag & NCF_WHITE)
cnp->cn_flags |= ISWHITEOUT;
SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
nc_get_name(ncp));
cache_out_ts(ncp, tsp, ticksp);
- if (wlocked)
- CACHE_WUNLOCK();
- else
- CACHE_RUNLOCK();
+ MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+ if (bucketlock != NULL)
+ rw_runlock(bucketlock);
+ cache_unlock(cache_locked);
return (ENOENT);
wlock:
@@ -716,9 +807,10 @@ wlock:
* a write lock and retry the operation.
*/
CACHE_RUNLOCK();
+wlock_unlocked:
CACHE_WLOCK();
numupgrades++;
- wlocked = 1;
+ cache_locked = WLOCKED;
goto retry_wlocked;
success:
@@ -733,10 +825,10 @@ success:
VOP_UNLOCK(dvp, 0);
}
vhold(*vpp);
- if (wlocked)
- CACHE_WUNLOCK();
- else
- CACHE_RUNLOCK();
+ MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+ if (bucketlock != NULL)
+ rw_runlock(bucketlock);
+ cache_unlock(cache_locked);
error = vget(*vpp, cnp->cn_lkflags | LK_VNHELD, cnp->cn_thread);
if (cnp->cn_flags & ISDOTDOT) {
vn_lock(dvp, ltype | LK_RETRY);
@@ -758,10 +850,29 @@ success:
return (-1);
unlock:
- if (wlocked)
- CACHE_WUNLOCK();
- else
- CACHE_RUNLOCK();
+ MPASS(bucketlock != NULL || cache_locked != UNLOCKED);
+ if (bucketlock != NULL)
+ rw_runlock(bucketlock);
+ cache_unlock(cache_locked);
+ return (0);
+
+zap_and_exit:
+ if (bucketlock != NULL) {
+ rw_assert(&cache_lock, RA_UNLOCKED);
+ if (!CACHE_TRY_WLOCK()) {
+ rw_runlock(bucketlock);
+ bucketlock = NULL;
+ zap_and_exit_bucket_fail++;
+ goto wlock_unlocked;
+ }
+ cache_locked = WLOCKED;
+ rw_runlock(bucketlock);
+ bucketlock = NULL;
+ } else if (cache_locked != WLOCKED && !CACHE_UPGRADE_LOCK())
+ goto wlock;
+ cache_zap(ncp);
+ CACHE_WUNLOCK();
+ cache_free(ncp);
return (0);
}
@@ -772,6 +883,7 @@ void
cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
struct timespec *tsp, struct timespec *dtsp)
{
+ struct rwlock *bucketlock;
struct namecache *ncp, *n2, *ndd, *nneg;
struct namecache_ts *n3;
struct nchashhead *ncpp;
@@ -924,11 +1036,6 @@ cache_enter_time(struct vnode *dvp, stru
}
}
- /*
- * Insert the new namecache entry into the appropriate chain
- * within the cache entries table.
- */
- LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
if (flag != NCF_ISDOTDOT) {
if (LIST_EMPTY(&dvp->v_cache_src)) {
vhold(dvp);
@@ -937,6 +1044,15 @@ cache_enter_time(struct vnode *dvp, stru
LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src);
}
+ bucketlock = HASH2BUCKETLOCK(hash);
+ rw_wlock(bucketlock);
+
+ /*
+ * Insert the new namecache entry into the appropriate chain
+ * within the cache entries table.
+ */
+ LIST_INSERT_HEAD(ncpp, ncp, nc_hash);
+
/*
* If the entry is "negative", we place it into the
* "negative" cache queue, otherwise, we place it into the
@@ -953,6 +1069,7 @@ cache_enter_time(struct vnode *dvp, stru
SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
nc_get_name(ncp));
}
+ rw_wunlock(bucketlock);
if (numneg * ncnegfactor > numcache)
nneg = cache_negative_zap_one();
CACHE_WUNLOCK();
@@ -960,12 +1077,24 @@ cache_enter_time(struct vnode *dvp, stru
cache_free(nneg);
}
+static u_int
+cache_roundup_2(u_int val)
+{
+ u_int res;
+
+ for (res = 1; res <= val; res <<= 1)
+ continue;
+
+ return (res);
+}
+
/*
* Name cache initialization, from vfs_init() when we are booting
*/
static void
nchinit(void *dummy __unused)
{
+ u_int i;
TAILQ_INIT(&ncneg);
@@ -983,6 +1112,13 @@ nchinit(void *dummy __unused)
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
+ numbucketlocks = cache_roundup_2(mp_ncpus * 16);
+ if (numbucketlocks > nchash)
+ numbucketlocks = nchash;
+ bucketlocks = malloc(sizeof(*bucketlocks) * numbucketlocks, M_VFSCACHE,
+ M_WAITOK | M_ZERO);
+ for (i = 0; i < numbucketlocks; i++)
+ rw_init_flags(&bucketlocks[i], "ncbuc", RW_DUPOK);
numcalls = counter_u64_alloc(M_WAITOK);
dothits = counter_u64_alloc(M_WAITOK);
@@ -1023,6 +1159,7 @@ cache_changesize(int newmaxvnodes)
* because to do so, they have to be removed from the hash table.
*/
CACHE_WLOCK();
+ cache_lock_all_buckets();
old_nchashtbl = nchashtbl;
old_nchash = nchash;
nchashtbl = new_nchashtbl;
@@ -1035,6 +1172,7 @@ cache_changesize(int newmaxvnodes)
LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
}
}
+ cache_unlock_all_buckets();
CACHE_WUNLOCK();
free(old_nchashtbl, M_VFSCACHE);
}
@@ -1108,20 +1246,30 @@ void
cache_purgevfs(struct mount *mp)
{
TAILQ_HEAD(, namecache) ncps;
- struct nchashhead *ncpp;
+ struct rwlock *bucketlock;
+ struct nchashhead *bucket;
struct namecache *ncp, *nnp;
+ u_long i, j, n_nchash;
/* Scan hash tables for applicable entries */
SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
TAILQ_INIT(&ncps);
CACHE_WLOCK();
- for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
- LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
- if (ncp->nc_dvp->v_mount != mp)
- continue;
- cache_zap(ncp);
- TAILQ_INSERT_TAIL(&ncps, ncp, nc_dst);
+ n_nchash = nchash + 1;
+ for (i = 0; i < numbucketlocks; i++) {
+ bucketlock = (struct rwlock *)&bucketlocks[i];
+ rw_wlock(bucketlock);
+ for (j = i; j < n_nchash; j += numbucketlocks) {
+ bucket = &nchashtbl[j];
+ LIST_FOREACH_SAFE(ncp, bucket, nc_hash, nnp) {
+ cache_assert_bucket_locked(ncp, RA_WLOCKED);
+ if (ncp->nc_dvp->v_mount != mp)
+ continue;
+ cache_zap_locked(ncp);
+ TAILQ_INSERT_HEAD(&ncps, ncp, nc_dst);
+ }
}
+ rw_wunlock(bucketlock);
}
CACHE_WUNLOCK();
TAILQ_FOREACH_SAFE(ncp, &ncps, nc_dst, nnp) {
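
The vfs_cache.c changes above split the name cache's single rwlock into a global lock ("ncglobal") plus an array of per-bucket rwlocks ("ncbuc"), as described in the new locking comment: a lookup may run under either the global lock or the relevant bucket lock alone, while modifications take both, global first. A rough user-space sketch of that arrangement, using pthread rwlocks and made-up names rather than the kernel primitives, may help when reading the hunks:

    #include <pthread.h>
    #include <stdint.h>

    /* Illustrative only: fixed bucket count, FNV-1a hash, pthread locks. */
    #define NBUCKETLOCKS 64

    static pthread_rwlock_t global_lock = PTHREAD_RWLOCK_INITIALIZER;
    static pthread_rwlock_t bucketlocks[NBUCKETLOCKS];

    #define HASH2BUCKETLOCK(h) (&bucketlocks[(h) % NBUCKETLOCKS])

    static void
    bucketlocks_init(void)
    {
            int i;

            for (i = 0; i < NBUCKETLOCKS; i++)
                    pthread_rwlock_init(&bucketlocks[i], NULL);
    }

    static uint32_t
    name_hash(const char *name)
    {
            uint32_t h = 2166136261u;

            while (*name != '\0')
                    h = (h ^ (unsigned char)*name++) * 16777619u;
            return (h);
    }

    static void
    lookup(const char *name)
    {
            pthread_rwlock_t *bl = HASH2BUCKETLOCK(name_hash(name));

            pthread_rwlock_rdlock(bl);      /* bucket lock alone suffices */
            /* walk this bucket's chain */
            pthread_rwlock_unlock(bl);
    }

    static void
    modify(const char *name)
    {
            pthread_rwlock_t *bl = HASH2BUCKETLOCK(name_hash(name));

            pthread_rwlock_wrlock(&global_lock);    /* lock order: global first */
            pthread_rwlock_wrlock(bl);              /* then the bucket */
            /* insert or remove an entry */
            pthread_rwlock_unlock(bl);
            pthread_rwlock_unlock(&global_lock);
    }

In the kernel the array is not a fixed 64 entries: nchinit() sizes it to cache_roundup_2(mp_ncpus * 16), capped at the hash table size, and cache_changesize()/cache_purgevfs() take every bucket lock when they need to touch all chains.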
Modified: user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/powerpc/booke/pmap.c Sat Sep 10 17:14:57 2016 (r305688)
@@ -2499,9 +2499,13 @@ mmu_booke_clear_modify(mmu_t mmu, vm_pag
* is necessary that 0 only be returned when there are truly no
* reference bits set.
*
- * XXX: The exact number of bits to check and clear is a matter that
- * should be tested and standardized at some point in the future for
- * optimal aging of shared pages.
+ * As an optimization, update the page's dirty field if a modified bit is
+ * found while counting reference bits. This opportunistic update can be
+ * performed at low cost and can eliminate the need for some future calls
+ * to pmap_is_modified(). However, since this function stops after
+ * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
+ * dirty pages. Those dirty pages will only be detected by a future call
+ * to pmap_is_modified().
*/
static int
mmu_booke_ts_referenced(mmu_t mmu, vm_page_t m)
@@ -2518,6 +2522,8 @@ mmu_booke_ts_referenced(mmu_t mmu, vm_pa
PMAP_LOCK(pv->pv_pmap);
if ((pte = pte_find(mmu, pv->pv_pmap, pv->pv_va)) != NULL &&
PTE_ISVALID(pte)) {
+ if (PTE_ISMODIFIED(pte))
+ vm_page_dirty(m);
if (PTE_ISREFERENCED(pte)) {
mtx_lock_spin(&tlbivax_mutex);
tlb_miss_lock();
@@ -2528,7 +2534,7 @@ mmu_booke_ts_referenced(mmu_t mmu, vm_pa
tlb_miss_unlock();
mtx_unlock_spin(&tlbivax_mutex);
- if (++count > 4) {
+ if (++count >= PMAP_TS_REFERENCED_MAX) {
PMAP_UNLOCK(pv->pv_pmap);
break;
}
Modified: user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/powerpc/conf/MPC85XX Sat Sep 10 17:14:57 2016 (r305688)
@@ -89,6 +89,7 @@ device tun
device uart
options USB_DEBUG # enable debug msgs
#device uhci
+device ehci
device umass
device usb
device vlan
Modified: user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/riscv/riscv/pmap.c Sat Sep 10 17:14:57 2016 (r305688)
@@ -2991,8 +2991,6 @@ safe_to_clear_referenced(pmap_t pmap, pt
return (FALSE);
}
-#define PMAP_TS_REFERENCED_MAX 5
-
/*
* pmap_ts_referenced:
*
@@ -3001,9 +2999,13 @@ safe_to_clear_referenced(pmap_t pmap, pt
* is necessary that 0 only be returned when there are truly no
* reference bits set.
*
- * XXX: The exact number of bits to check and clear is a matter that
- * should be tested and standardized at some point in the future for
- * optimal aging of shared pages.
+ * As an optimization, update the page's dirty field if a modified bit is
+ * found while counting reference bits. This opportunistic update can be
+ * performed at low cost and can eliminate the need for some future calls
+ * to pmap_is_modified(). However, since this function stops after
+ * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
+ * dirty pages. Those dirty pages will only be detected by a future call
+ * to pmap_is_modified().
*/
int
pmap_ts_referenced(vm_page_t m)
@@ -3012,7 +3014,7 @@ pmap_ts_referenced(vm_page_t m)
pmap_t pmap;
struct rwlock *lock;
pd_entry_t *l2;
- pt_entry_t *l3;
+ pt_entry_t *l3, old_l3;
vm_paddr_t pa;
int cleared, md_gen, not_cleared;
struct spglist free;
@@ -3050,15 +3052,18 @@ retry:
("pmap_ts_referenced: found an invalid l2 table"));
l3 = pmap_l2_to_l3(l2, pv->pv_va);
- if ((pmap_load(l3) & PTE_A) != 0) {
- if (safe_to_clear_referenced(pmap, pmap_load(l3))) {
+ old_l3 = pmap_load(l3);
+ if (pmap_page_dirty(old_l3))
+ vm_page_dirty(m);
+ if ((old_l3 & PTE_A) != 0) {
+ if (safe_to_clear_referenced(pmap, old_l3)) {
/*
* TODO: We don't handle the access flag
* at all. We need to be able to set it in
* the exception handler.
*/
panic("RISCVTODO: safe_to_clear_referenced\n");
- } else if ((pmap_load(l3) & PTE_SW_WIRED) == 0) {
+ } else if ((old_l3 & PTE_SW_WIRED) == 0) {
/*
* Wired pages cannot be paged out so
* doing accessed bit emulation for
Modified: user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/sparc64/sparc64/pmap.c Sat Sep 10 17:14:57 2016 (r305688)
@@ -2073,18 +2073,12 @@ pmap_page_is_mapped(vm_page_t m)
return (rv);
}
-#define PMAP_TS_REFERENCED_MAX 5
-
/*
* Return a count of reference bits for a page, clearing those bits.
* It is not necessary for every reference bit to be cleared, but it
* is necessary that 0 only be returned when there are truly no
* reference bits set.
*
- * XXX: The exact number of bits to check and clear is a matter that
- * should be tested and standardized at some point in the future for
- * optimal aging of shared pages.
- *
* As an optimization, update the page's dirty field if a modified bit is
* found while counting reference bits. This opportunistic update can be
* performed at low cost and can eliminate the need for some future calls
Modified: user/alc/PQ_LAUNDRY/sys/vm/pmap.h
==============================================================================
--- user/alc/PQ_LAUNDRY/sys/vm/pmap.h Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/sys/vm/pmap.h Sat Sep 10 17:14:57 2016 (r305688)
@@ -104,6 +104,16 @@ extern vm_offset_t kernel_vm_end;
#define PMAP_ENTER_NOSLEEP 0x0100
#define PMAP_ENTER_WIRED 0x0200
+/*
+ * Define the maximum number of machine-dependent reference bits that are
+ * cleared by a call to pmap_ts_referenced(). This limit serves two purposes.
+ * First, it bounds the cost of reference bit maintenance on widely shared
+ * pages. Second, it prevents numeric overflow during maintenance of a
+ * widely shared page's "act_count" field. An overflow could result in the
+ * premature deactivation of the page.
+ */
+#define PMAP_TS_REFERENCED_MAX 5
+
void pmap_activate(struct thread *td);
void pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
int advice);
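
The new comment above gives two reasons for capping the count at 5. The overflow concern can be made concrete with a rough illustration: assume, as the comment implies, that a page's act_count lives in an 8-bit counter and that the value returned by pmap_ts_referenced() is added into it. An unbounded return from a page shared by hundreds of mappings could then wrap the counter to a smaller value, so a hot page would look idle and be deactivated prematurely. The counter width and the numbers below are assumptions made for the example, not taken from the kernel sources.

    #include <stdio.h>

    /* Rough illustration of 8-bit counter wraparound; not kernel code. */
    int
    main(void)
    {
            unsigned char act_count;

            act_count = 60;
            act_count += 220;       /* unbounded reference count: wraps */
            printf("unbounded: %d\n", act_count);   /* 280 % 256 = 24 */

            act_count = 60;
            act_count += 5;         /* capped at PMAP_TS_REFERENCED_MAX */
            printf("capped:    %d\n", act_count);   /* 65 */
            return (0);
    }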
Modified: user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile
==============================================================================
--- user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile Sat Sep 10 17:00:08 2016 (r305687)
+++ user/alc/PQ_LAUNDRY/tests/sys/kern/Makefile Sat Sep 10 17:14:57 2016 (r305688)
@@ -12,6 +12,7 @@ PLAIN_TESTS_C+= subr_unit_test
ATF_TESTS_C+= unix_seqpacket_test
ATF_TESTS_C+= unix_passfd_test
TEST_METADATA.unix_seqpacket_test+= timeout="15"
+ATF_TESTS_C+= waitpid_nohang
LIBADD.ptrace_test+= pthread
LIBADD.unix_seqpacket_test+= pthread
Copied: user/alc/PQ_LAUNDRY/tests/sys/kern/waitpid_nohang.c (from r305685, head/tests/sys/kern/waitpid_nohang.c)
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/alc/PQ_LAUNDRY/tests/sys/kern/waitpid_nohang.c Sat Sep 10 17:14:57 2016 (r305688, copy of r305685, head/tests/sys/kern/waitpid_nohang.c)
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2016 Jilles Tjoelker
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/wait.h>
+
+#include <atf-c.h>
+#include <signal.h>
+#include <unistd.h>
+
+ATF_TC_WITHOUT_HEAD(waitpid_nohang);
+ATF_TC_BODY(waitpid_nohang, tc)
+{
+ pid_t child, pid;
+ int status, r;
+
+ child = fork();
+ ATF_REQUIRE(child != -1);
+ if (child == 0) {
+ sleep(10);
+ _exit(1);
+ }
+
+ status = 42;
+ pid = waitpid(child, &status, WNOHANG);
+ ATF_REQUIRE(pid == 0);
+ ATF_CHECK(status == 42);
+
+ r = kill(child, SIGTERM);
+ ATF_REQUIRE(r == 0);
+ r = waitid(P_PID, child, NULL, WEXITED | WNOWAIT);
+ ATF_REQUIRE(r == 0);
+
+ status = -1;
+ pid = waitpid(child, &status, WNOHANG);
+ ATF_REQUIRE(pid == child);
+ ATF_CHECK(WIFSIGNALED(status) && WTERMSIG(status) == SIGTERM);
+}
+
+ATF_TP_ADD_TCS(tp)
+{
+
+ ATF_TP_ADD_TC(tp, waitpid_nohang);
+ return (atf_no_error());
+}