svn commit: r323526 - in vendor-sys/illumos/dist/uts/common: fs fs/zfs sys

Andriy Gapon avg at FreeBSD.org
Wed Sep 13 10:25:45 UTC 2017


Author: avg
Date: Wed Sep 13 10:25:44 2017
New Revision: 323526
URL: https://svnweb.freebsd.org/changeset/base/323526

Log:
  8376 cached v_path should be kept fresh
  
  illumos/illumos-gate at e2fc3408efa6cdfc5e33c73c3567efc8c7592707
  https://github.com/illumos/illumos-gate/commit/e2fc3408efa6cdfc5e33c73c3567efc8c7592707
  
  https://www.illumos.org/issues/8376
    The logic for generating and maintaining the cached v_path value on vnodes
    could stand to be improved. If vnodes were purely ephemeral, then freshly
    calculating v_path at the time of lookup() would result in correct values (at a
    performance cost). When they persist, being referenced by other structures
    (such as open files, process cwd, dnlc entries, etc.), the opportunity for the
    v_path to become stale arises. This is exacerbated by the current behavior
    which, when v_path is found to be invalid (during a vnodetopath operation),
    strives to recalculate it but does not preserve the result. The overall
    situation leads to both performance and correctness (due to lack of results)
    problems relating to v_path.
    This has been addressed in SmartOS through a series of changes. Firstly, to do
    proper invalidation of v_path when it's found to be stale:
    - OS-3891 stale v_path slows vfs lookups
    OS-3891 revealed that some logic made assumptions about v_path never
    transitioning from non-NULL to NULL. It was addressed here:
    - OS-4317 v_path accesses can race
    While the pathological stale v_path behavior had been addressed, there are
    still cases where the absence of valid v_path information was causing problems.
    The largest patch in this series addressed it by performing v_path checking and
    updates during vnode lookups/updates, when it is most convenient:
    - OS-5167 cached v_path should be kept fresh
    Two smaller updates are included too, to prevent erroneous behavior introduced
    by the prior changes:
    - OS-5846 procfs should follow VFS rules
    - OS-6134 vn_reinit balks on zeroed vnodes
  
  Reviewed by: Jerry Jelinek <jerry.jelinek at joyent.com>
  Reviewed by: Robert Mustacchi <rm at joyent.com>
  Approved by: Gordon Ross <gwr at nexenta.com>
  Author: Patrick Mooney <pmooney at pfmooney.com>

Modified:
  vendor-sys/illumos/dist/uts/common/fs/vnode.c
  vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_dir.c
  vendor-sys/illumos/dist/uts/common/sys/vnode.h

Modified: vendor-sys/illumos/dist/uts/common/fs/vnode.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/vnode.c	Wed Sep 13 10:23:55 2017	(r323525)
+++ vendor-sys/illumos/dist/uts/common/fs/vnode.c	Wed Sep 13 10:25:44 2017	(r323526)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2017, Joyent, Inc.
  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  */
@@ -68,6 +68,8 @@
 #include <fs/fs_subr.h>
 #include <sys/taskq.h>
 #include <fs/fs_reparse.h>
+#include <sys/time.h>
+#include <sys/sdt.h>
 
 /* Determine if this vnode is a file that is read-only */
 #define	ISROFILE(vp)	\
@@ -104,6 +106,9 @@ kmutex_t	vskstat_tree_lock;
 /* Global variable which enables/disables the vopstats collection */
 int vopstats_enabled = 1;
 
+/* Global used for empty/invalid v_path */
+char *vn_vpath_empty = "";
+
 /*
  * forward declarations for internal vnode specific data (vsd)
  */
@@ -2286,7 +2291,8 @@ vn_cache_constructor(void *buf, void *cdrarg, int kmfl
 	cv_init(&vp->v_cv, NULL, CV_DEFAULT, NULL);
 	rw_init(&vp->v_nbllock, NULL, RW_DEFAULT, NULL);
 	vp->v_femhead = NULL;	/* Must be done before vn_reinit() */
-	vp->v_path = NULL;
+	vp->v_path = vn_vpath_empty;
+	vp->v_path_stamp = 0;
 	vp->v_mpssdata = NULL;
 	vp->v_vsd = NULL;
 	vp->v_fopdata = NULL;
@@ -2333,6 +2339,7 @@ void
 vn_recycle(vnode_t *vp)
 {
 	ASSERT(vp->v_pages == NULL);
+	VERIFY(vp->v_path != NULL);
 
 	/*
 	 * XXX - This really belongs in vn_reinit(), but we have some issues
@@ -2355,10 +2362,11 @@ vn_recycle(vnode_t *vp)
 		kmem_free(vp->v_femhead, sizeof (*(vp->v_femhead)));
 		vp->v_femhead = NULL;
 	}
-	if (vp->v_path) {
+	if (vp->v_path != vn_vpath_empty) {
 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
-		vp->v_path = NULL;
+		vp->v_path = vn_vpath_empty;
 	}
+	vp->v_path_stamp = 0;
 
 	if (vp->v_fopdata != NULL) {
 		free_fopdata(vp);
@@ -2396,6 +2404,15 @@ vn_reinit(vnode_t *vp)
 	vp->v_locality = NULL;
 	vp->v_xattrdir = NULL;
 
+	/*
+	 * In a few specific instances, vn_reinit() is used to initialize
+	 * locally defined vnode_t instances.  Lacking the construction offered
+	 * by vn_alloc(), these vnodes require v_path initialization.
+	 */
+	if (vp->v_path == NULL) {
+		vp->v_path = vn_vpath_empty;
+	}
+
 	/* Handles v_femhead, v_path, and the r/w/map counts */
 	vn_recycle(vp);
 }
@@ -2429,9 +2446,10 @@ vn_free(vnode_t *vp)
 	 */
 	ASSERT((vp->v_count == 0) || (vp->v_count == 1));
 	ASSERT(vp->v_count_dnlc == 0);
-	if (vp->v_path != NULL) {
+	VERIFY(vp->v_path != NULL);
+	if (vp->v_path != vn_vpath_empty) {
 		kmem_free(vp->v_path, strlen(vp->v_path) + 1);
-		vp->v_path = NULL;
+		vp->v_path = vn_vpath_empty;
 	}
 
 	/* If FEM was in use, make sure everything gets cleaned up */
@@ -2954,125 +2972,256 @@ fs_new_caller_id()
 }
 
 /*
- * Given a starting vnode and a path, updates the path in the target vnode in
- * a safe manner.  If the vnode already has path information embedded, then the
- * cached path is left untouched.
+ * The value stored in v_path is relative to rootdir, located in the global
+ * zone.  Zones or chroot environments which reside deeper inside the VFS
+ * hierarchy will have a relative view of MAXPATHLEN since they are unaware of
+ * what lies below their perceived root.  In order to keep v_path usable for
+ * these child environments, its allocations are allowed to exceed MAXPATHLEN.
+ *
+ * An upper bound of max_vnode_path is placed upon v_path allocations to
+ * prevent the system from going too wild at the behest of pathological
+ * behavior from the operator.
  */
-
 size_t max_vnode_path = 4 * MAXPATHLEN;
 
+
 void
-vn_setpath(vnode_t *rootvp, struct vnode *startvp, struct vnode *vp,
-    const char *path, size_t plen)
+vn_clearpath(vnode_t *vp, hrtime_t compare_stamp)
 {
-	char	*rpath;
-	vnode_t	*base;
-	size_t	rpathlen, rpathalloc;
-	int	doslash = 1;
+	char *buf;
 
-	if (*path == '/') {
-		base = rootvp;
-		path++;
-		plen--;
-	} else {
-		base = startvp;
-	}
-
+	mutex_enter(&vp->v_lock);
 	/*
-	 * We cannot grab base->v_lock while we hold vp->v_lock because of
-	 * the potential for deadlock.
+	 * If the snapshot of v_path_stamp passed in via compare_stamp does not
+	 * match the present value on the vnode, it indicates that subsequent
+	 * changes have occurred.  The v_path value is not cleared in this case
+	 * since the new value may be valid.
 	 */
-	mutex_enter(&base->v_lock);
-	if (base->v_path == NULL) {
-		mutex_exit(&base->v_lock);
+	if (compare_stamp != 0 && vp->v_path_stamp != compare_stamp) {
+		mutex_exit(&vp->v_lock);
 		return;
 	}
+	buf = vp->v_path;
+	vp->v_path = vn_vpath_empty;
+	vp->v_path_stamp = 0;
+	mutex_exit(&vp->v_lock);
+	if (buf != vn_vpath_empty) {
+		kmem_free(buf, strlen(buf) + 1);
+	}
+}
 
-	rpathlen = strlen(base->v_path);
-	rpathalloc = rpathlen + plen + 1;
-	/* Avoid adding a slash if there's already one there */
-	if (base->v_path[rpathlen-1] == '/')
-		doslash = 0;
-	else
-		rpathalloc++;
+static void
+vn_setpath_common(vnode_t *pvp, vnode_t *vp, const char *name, size_t len,
+    boolean_t is_rename)
+{
+	char *buf, *oldbuf;
+	hrtime_t pstamp;
+	size_t baselen, buflen = 0;
 
-	/*
-	 * We don't want to call kmem_alloc(KM_SLEEP) with kernel locks held,
-	 * so we must do this dance.  If, by chance, something changes the path,
-	 * just give up since there is no real harm.
-	 */
-	mutex_exit(&base->v_lock);
+	/* Handle the vn_setpath_str case. */
+	if (pvp == NULL) {
+		if (len + 1 > max_vnode_path) {
+			DTRACE_PROBE4(vn__setpath__too__long, vnode_t *, pvp,
+			    vnode_t *, vp, char *, name, size_t, len + 1);
+			return;
+		}
+		buf = kmem_alloc(len + 1, KM_SLEEP);
+		bcopy(name, buf, len);
+		buf[len] = '\0';
 
-	/* Paths should stay within reason */
-	if (rpathalloc > max_vnode_path)
+		mutex_enter(&vp->v_lock);
+		oldbuf = vp->v_path;
+		vp->v_path = buf;
+		vp->v_path_stamp = gethrtime();
+		mutex_exit(&vp->v_lock);
+		if (oldbuf != vn_vpath_empty) {
+			kmem_free(oldbuf, strlen(oldbuf) + 1);
+		}
 		return;
+	}
 
-	rpath = kmem_alloc(rpathalloc, KM_SLEEP);
+	/* Take snapshot of parent dir */
+	mutex_enter(&pvp->v_lock);
 
-	mutex_enter(&base->v_lock);
-	if (base->v_path == NULL || strlen(base->v_path) != rpathlen) {
-		mutex_exit(&base->v_lock);
-		kmem_free(rpath, rpathalloc);
+	if ((pvp->v_flag & VTRAVERSE) != 0) {
+		/*
+		 * When the parent vnode has VTRAVERSE set in its flags, normal
+		 * assumptions about v_path calculation no longer apply.  The
+		 * primary situation where this occurs is via the VFS tricks
+		 * which procfs plays in order to allow /proc/PID/(root|cwd) to
+		 * yield meaningful results.
+		 *
+		 * When this flag is set, v_path on the child must not be
+		 * updated since the calculated value is likely to be
+		 * incorrect, given the current context.
+		 */
+		mutex_exit(&pvp->v_lock);
 		return;
 	}
-	bcopy(base->v_path, rpath, rpathlen);
-	mutex_exit(&base->v_lock);
 
-	if (doslash)
-		rpath[rpathlen++] = '/';
-	bcopy(path, rpath + rpathlen, plen);
-	rpath[rpathlen + plen] = '\0';
+retrybuf:
+	if (pvp->v_path == vn_vpath_empty) {
+		/*
+		 * Without v_path from the parent directory, generating a child
+		 * path from the name is impossible.
+		 */
+		if (len > 0) {
+			pstamp = pvp->v_path_stamp;
+			mutex_exit(&pvp->v_lock);
+			vn_clearpath(vp, pstamp);
+			return;
+		}
 
+		/*
+		 * The only feasible case here is where a NUL lookup is being
+		 * performed on rootdir prior to its v_path being populated.
+		 */
+		ASSERT(pvp->v_path_stamp == 0);
+		baselen = 0;
+		pstamp = 0;
+	} else {
+		pstamp = pvp->v_path_stamp;
+		baselen = strlen(pvp->v_path);
+		/* ignore a trailing slash if present */
+		if (pvp->v_path[baselen - 1] == '/') {
+			/* This should only be the case for rootdir */
+			ASSERT(baselen == 1 && pvp == rootdir);
+			baselen--;
+		}
+	}
+	mutex_exit(&pvp->v_lock);
+
+	if (buflen != 0) {
+		/* Free the existing (mis-sized) buffer in case of retry */
+		kmem_free(buf, buflen);
+	}
+	/* base, '/', name and trailing NUL */
+	buflen = baselen + len + 2;
+	if (buflen > max_vnode_path) {
+		DTRACE_PROBE4(vn__setpath_too__long, vnode_t *, pvp,
+		    vnode_t *, vp, char *, name, size_t, buflen);
+		return;
+	}
+	buf = kmem_alloc(buflen, KM_SLEEP);
+
+	mutex_enter(&pvp->v_lock);
+	if (pvp->v_path_stamp != pstamp) {
+		size_t vlen;
+
+		/*
+		 * Since v_path_stamp changed on the parent, it is likely that
+		 * v_path has been altered as well.  If the length does not
+		 * exactly match what was previously measured, the buffer
+		 * allocation must be repeated for proper sizing.
+		 */
+		if (pvp->v_path == vn_vpath_empty) {
+			/* Give up if parent lacks v_path */
+			mutex_exit(&pvp->v_lock);
+			kmem_free(buf, buflen);
+			return;
+		}
+		vlen = strlen(pvp->v_path);
+		if (pvp->v_path[vlen - 1] == '/') {
+			vlen--;
+		}
+		if (vlen != baselen) {
+			goto retrybuf;
+		}
+	}
+	bcopy(pvp->v_path, buf, baselen);
+	mutex_exit(&pvp->v_lock);
+
+	buf[baselen] = '/';
+	baselen++;
+	bcopy(name, &buf[baselen], len + 1);
+
 	mutex_enter(&vp->v_lock);
-	if (vp->v_path != NULL) {
+	if (vp->v_path_stamp == 0) {
+		/* never-visited vnode can inherit stamp from parent */
+		ASSERT(vp->v_path == vn_vpath_empty);
+		vp->v_path_stamp = pstamp;
+		vp->v_path = buf;
 		mutex_exit(&vp->v_lock);
-		kmem_free(rpath, rpathalloc);
+	} else if (vp->v_path_stamp < pstamp || is_rename) {
+		/*
+		 * Install the updated path and stamp, ensuring that the v_path
+		 * pointer is valid at all times for dtrace.
+		 */
+		oldbuf = vp->v_path;
+		vp->v_path = buf;
+		vp->v_path_stamp = gethrtime();
+		mutex_exit(&vp->v_lock);
+		kmem_free(oldbuf, strlen(oldbuf) + 1);
 	} else {
-		vp->v_path = rpath;
+		/*
+		 * If the timestamp matches or is greater, it means another
+		 * thread performed the update first while locks were dropped
+		 * here to make the allocation.  We defer to the newer value.
+		 */
 		mutex_exit(&vp->v_lock);
+		kmem_free(buf, buflen);
 	}
+	ASSERT(MUTEX_NOT_HELD(&vp->v_lock));
 }
 
-/*
- * Sets the path to the vnode to be the given string, regardless of current
- * context.  The string must be a complete path from rootdir.  This is only used
- * by fsop_root() for setting the path based on the mountpoint.
- */
 void
-vn_setpath_str(struct vnode *vp, const char *str, size_t len)
+vn_updatepath(vnode_t *pvp, vnode_t *vp, const char *name)
 {
-	char *buf = kmem_alloc(len + 1, KM_SLEEP);
+	size_t len;
 
-	mutex_enter(&vp->v_lock);
-	if (vp->v_path != NULL) {
-		mutex_exit(&vp->v_lock);
-		kmem_free(buf, len + 1);
+	/*
+	 * If the parent is older or empty, there's nothing further to do.
+	 */
+	if (pvp->v_path == vn_vpath_empty ||
+	    pvp->v_path_stamp <= vp->v_path_stamp) {
 		return;
 	}
 
-	vp->v_path = buf;
-	bcopy(str, vp->v_path, len);
-	vp->v_path[len] = '\0';
+	/*
+	 * Given the lack of appropriate context, meaningful updates to v_path
+	 * cannot be made during lookups for the '.' or '..' entries.
+	 */
+	len = strlen(name);
+	if (len == 0 || (len == 1 && name[0] == '.') ||
+	    (len == 2 && name[0] == '.' && name[1] == '.')) {
+		return;
+	}
 
-	mutex_exit(&vp->v_lock);
+	vn_setpath_common(pvp, vp, name, len, B_FALSE);
 }
 
 /*
+ * Given a starting vnode and a path, updates the path in the target vnode in
+ * a safe manner.  If the vnode already has path information embedded, then the
+ * cached path is left untouched.
+ */
+/* ARGSUSED */
+void
+vn_setpath(vnode_t *rootvp, vnode_t *pvp, vnode_t *vp, const char *name,
+    size_t len)
+{
+	vn_setpath_common(pvp, vp, name, len, B_FALSE);
+}
+
+/*
+ * Sets the path to the vnode to be the given string, regardless of current
+ * context.  The string must be a complete path from rootdir.  This is only used
+ * by fsop_root() for setting the path based on the mountpoint.
+ */
+void
+vn_setpath_str(vnode_t *vp, const char *str, size_t len)
+{
+	vn_setpath_common(NULL, vp, str, len, B_FALSE);
+}
+
+/*
  * Called from within filesystem's vop_rename() to handle renames once the
  * target vnode is available.
  */
 void
-vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len)
+vn_renamepath(vnode_t *pvp, vnode_t *vp, const char *name, size_t len)
 {
-	char *tmp;
-
-	mutex_enter(&vp->v_lock);
-	tmp = vp->v_path;
-	vp->v_path = NULL;
-	mutex_exit(&vp->v_lock);
-	vn_setpath(rootdir, dvp, vp, nm, len);
-	if (tmp != NULL)
-		kmem_free(tmp, strlen(tmp) + 1);
+	vn_setpath_common(pvp, vp, name, len, B_TRUE);
 }
 
 /*
@@ -3083,37 +3232,42 @@ void
 vn_copypath(struct vnode *src, struct vnode *dst)
 {
 	char *buf;
-	int alloc;
+	hrtime_t stamp;
+	size_t buflen;
 
 	mutex_enter(&src->v_lock);
-	if (src->v_path == NULL) {
+	if (src->v_path == vn_vpath_empty) {
 		mutex_exit(&src->v_lock);
 		return;
 	}
-	alloc = strlen(src->v_path) + 1;
-
-	/* avoid kmem_alloc() with lock held */
+	buflen = strlen(src->v_path) + 1;
 	mutex_exit(&src->v_lock);
-	buf = kmem_alloc(alloc, KM_SLEEP);
+
+	buf = kmem_alloc(buflen, KM_SLEEP);
+
 	mutex_enter(&src->v_lock);
-	if (src->v_path == NULL || strlen(src->v_path) + 1 != alloc) {
+	if (src->v_path == vn_vpath_empty ||
+	    strlen(src->v_path) + 1 != buflen) {
 		mutex_exit(&src->v_lock);
-		kmem_free(buf, alloc);
+		kmem_free(buf, buflen);
 		return;
 	}
-	bcopy(src->v_path, buf, alloc);
+	bcopy(src->v_path, buf, buflen);
+	stamp = src->v_path_stamp;
 	mutex_exit(&src->v_lock);
 
 	mutex_enter(&dst->v_lock);
-	if (dst->v_path != NULL) {
+	if (dst->v_path != vn_vpath_empty) {
 		mutex_exit(&dst->v_lock);
-		kmem_free(buf, alloc);
+		kmem_free(buf, buflen);
 		return;
 	}
 	dst->v_path = buf;
+	dst->v_path_stamp = stamp;
 	mutex_exit(&dst->v_lock);
 }
 
+
 /*
  * XXX Private interface for segvn routines that handle vnode
  * large page segments.
@@ -3453,9 +3607,7 @@ fop_lookup(
 	}
 	if (ret == 0 && *vpp) {
 		VOPSTATS_UPDATE(*vpp, lookup);
-		if ((*vpp)->v_path == NULL) {
-			vn_setpath(rootdir, dvp, *vpp, nm, strlen(nm));
-		}
+		vn_updatepath(dvp, *vpp, nm);
 	}
 
 	return (ret);
@@ -3495,9 +3647,7 @@ fop_create(
 	    (dvp, name, vap, excl, mode, vpp, cr, flags, ct, vsecp);
 	if (ret == 0 && *vpp) {
 		VOPSTATS_UPDATE(*vpp, create);
-		if ((*vpp)->v_path == NULL) {
-			vn_setpath(rootdir, dvp, *vpp, name, strlen(name));
-		}
+		vn_updatepath(dvp, *vpp, name);
 	}
 
 	return (ret);
@@ -3617,10 +3767,7 @@ fop_mkdir(
 	    (dvp, dirname, vap, vpp, cr, ct, flags, vsecp);
 	if (ret == 0 && *vpp) {
 		VOPSTATS_UPDATE(*vpp, mkdir);
-		if ((*vpp)->v_path == NULL) {
-			vn_setpath(rootdir, dvp, *vpp, dirname,
-			    strlen(dirname));
-		}
+		vn_updatepath(dvp, *vpp, dirname);
 	}
 
 	return (ret);

Modified: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_dir.c
==============================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_dir.c	Wed Sep 13 10:23:55 2017	(r323525)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_dir.c	Wed Sep 13 10:25:44 2017	(r323526)
@@ -23,6 +23,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2015, Joyent, Inc.
  */
 
 #include <sys/types.h>
@@ -893,9 +894,9 @@ zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_t
 		if (zp->z_links <= zp_is_dir) {
 			zfs_panic_recover("zfs: link count on %s is %u, "
 			    "should be at least %u",
-			    zp->z_vnode->v_path ? zp->z_vnode->v_path :
-			    "<unknown>", (int)zp->z_links,
-			    zp_is_dir + 1);
+			    zp->z_vnode->v_path != vn_vpath_empty ?
+			    zp->z_vnode->v_path : "<unknown>",
+			    (int)zp->z_links, zp_is_dir + 1);
 			zp->z_links = zp_is_dir + 1;
 		}
 		if (--zp->z_links == zp_is_dir) {

Modified: vendor-sys/illumos/dist/uts/common/sys/vnode.h
==============================================================================
--- vendor-sys/illumos/dist/uts/common/sys/vnode.h	Wed Sep 13 10:23:55 2017	(r323525)
+++ vendor-sys/illumos/dist/uts/common/sys/vnode.h	Wed Sep 13 10:25:44 2017	(r323526)
@@ -21,7 +21,7 @@
 
 /*
  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2017, Joyent, Inc.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  */
 
@@ -223,6 +223,59 @@ struct vsd_node {
  * In particular, file systems should not access other fields; they may
  * change or even be removed.  The functionality which was once provided
  * by these fields is available through vn_* functions.
+ *
+ * VNODE PATH THEORY:
+ * In each vnode, the v_path field holds a cached version of the canonical
+ * filesystem path which that node represents.  Because vnodes lack contextual
+ * information about their own name or position in the VFS hierarchy, this path
+ * must be calculated when the vnode is instantiated by operations such as
+ * fop_create, fop_lookup, or fop_mkdir.  During said operations, both the
+ * parent vnode (and its cached v_path) and future name are known, so the
+ * v_path of the resulting object can easily be set.
+ *
+ * The caching nature of v_path is complicated in the face of directory
+ * renames.  Filesystem drivers are responsible for calling vn_renamepath when
+ * a fop_rename operation succeeds.  While the v_path on the renamed vnode will
+ * be updated, existing children of the directory (direct, or at deeper levels)
+ * will now possess v_path caches which are stale.
+ *
+ * It is expensive (and for non-directories, impossible) to recalculate stale
+ * v_path entries during operations such as vnodetopath.  The best time during
+ * which to correct such wrongs is the same as when v_path is first
+ * initialized: during fop_create/fop_lookup/fop_mkdir/etc, where adequate
+ * context is available to generate the current path.
+ *
+ * In order to quickly detect stale v_path entries (without full lookup
+ * verification) to trigger a v_path update, the v_path_stamp field has been
+ * added to vnode_t.  As part of successful fop_create/fop_lookup/fop_mkdir
+ * operations, where the name and parent vnode are available, the following
+ * rules are used to determine updates to the child:
+ *
+ * 1. If the parent lacks a v_path, clear any existing v_path and v_path_stamp
+ *    on the child.  Until the parent v_path is refreshed to a valid state, the
+ *    child v_path must be considered invalid too.
+ *
+ * 2. If the child lacks a v_path (implying v_path_stamp == 0), it inherits the
+ *    v_path_stamp value from its parent and its v_path is updated.
+ *
+ * 3. If the child v_path_stamp is less than v_path_stamp in the parent, it is
+ *    an indication that the child v_path is stale.  The v_path is updated and
+ *    v_path_stamp in the child is set to the current gethrtime() value.
+ *
+ *    It does _not_ inherit the parent v_path_stamp in order to propagate the
+ *    time of v_path invalidation through the directory structure.  This
+ *    prevents concurrent invalidations (operating with a now-incorrect v_path)
+ *    at deeper levels in the tree from persisting.
+ *
+ * 4. If the child v_path_stamp is greater than or equal to that of the parent,
+ *    no action needs to be taken.
+ *
+ * Note that fop_rename operations do not follow this ruleset.  They perform an
+ * explicit update of v_path and v_path_stamp (setting it to the current time).
+ *
+ * With these constraints in place, v_path invalidations and updates should
+ * proceed in a timely manner as vnodes are accessed.  While there still are
+ * limited cases where vnodetopath operations will fail, the risk is minimized.
  */
 
 struct fem_head;	/* from fem.h */
@@ -249,6 +302,7 @@ typedef struct vnode {
 	void		*v_locality;	/* hook for locality info */
 	struct fem_head	*v_femhead;	/* fs monitoring */
 	char		*v_path;	/* cached path */
+	hrtime_t	v_path_stamp;	/* timestamp for cached path */
 	uint_t		v_rdcnt;	/* open for read count  (VREG only) */
 	uint_t		v_wrcnt;	/* open for write count (VREG only) */
 	u_longlong_t	v_mmap_read;	/* mmap read count */
@@ -350,6 +404,14 @@ typedef struct vn_vfslocks_entry {
 #define	V_SYSATTR	0x40000	/* vnode is a GFS system attribute */
 
 /*
+ * Indication that VOP_LOOKUP operations on this vnode may yield results from a
+ * different VFS instance.  The main use of this is to suppress v_path
+ * calculation logic when filesystems such as procfs emit results which defy
+ * expectations about normal VFS behavior.
+ */
+#define	VTRAVERSE	0x80000
+
+/*
  * Vnode attributes.  A bit-mask is supplied as part of the
  * structure to indicate the attributes the caller wants to
  * set (setattr) or extract (getattr).
@@ -1293,6 +1355,11 @@ void vn_setpath(vnode_t *rootvp, struct vnode *startvp
     const char *path, size_t plen);
 void vn_renamepath(vnode_t *dvp, vnode_t *vp, const char *nm, size_t len);
 
+/* Private vnode manipulation functions */
+void vn_clearpath(vnode_t *, hrtime_t);
+void vn_updatepath(vnode_t *, vnode_t *, const char *);
+
+
 /* Vnode event notification */
 void	vnevent_rename_src(vnode_t *, vnode_t *, char *, caller_context_t *);
 void	vnevent_rename_dest(vnode_t *, vnode_t *, char *, caller_context_t *);
@@ -1338,6 +1405,9 @@ void reparse_point_init(void);
 u_longlong_t	fs_new_caller_id();
 
 int	vn_vmpss_usepageio(vnode_t *);
+
+/* Empty v_path placeholder */
+extern char *vn_vpath_empty;
 
 /*
  * Needed for use of IS_VMODSORT() in kernel.


More information about the svn-src-all mailing list