svn commit: r245115 - head/sys/fs/tmpfs

Gleb Kurtsou gleb at FreeBSD.org
Sun Jan 6 22:15:45 UTC 2013


Author: gleb
Date: Sun Jan  6 22:15:44 2013
New Revision: 245115
URL: http://svnweb.freebsd.org/changeset/base/245115

Log:
  tmpfs: Replace directory entry linked list with RB-Tree.
  
  Use file name hash as a tree key, handle duplicate keys.  Both VOP_LOOKUP
  and VOP_READDIR operations utilize same tree for search.  Directory
  entry offset (cookie) is either file name hash or incremental id in case
  of hash collisions (duplicate-cookies).  Keep sorted per directory list
  of duplicate-cookie entries to facilitate cookie number allocation.
  
  Don't fail if previous VOP_READDIR() offset is no longer valid, start
  with next dirent instead.  Other file system handle it similarly.
  
  Workaround race prone tn_readdir_last[pn] fields update.
  
  Add tmpfs_dir_destroy() to free all dirents.
  
  Set NFS cookies in tmpfs_dir_getdents(). Return EJUSTRETURN from
  tmpfs_dir_getdents() instead of hard coded -1.
  
  Mark directory traversal routines static as they are no longer
  used outside of tmpfs_subr.c

Modified:
  head/sys/fs/tmpfs/tmpfs.h
  head/sys/fs/tmpfs/tmpfs_subr.c
  head/sys/fs/tmpfs/tmpfs_vfsops.c
  head/sys/fs/tmpfs/tmpfs_vnops.c

Modified: head/sys/fs/tmpfs/tmpfs.h
==============================================================================
--- head/sys/fs/tmpfs/tmpfs.h	Sun Jan  6 21:56:58 2013	(r245114)
+++ head/sys/fs/tmpfs/tmpfs.h	Sun Jan  6 22:15:44 2013	(r245115)
@@ -49,6 +49,7 @@
 /* --------------------------------------------------------------------- */
 #include <sys/malloc.h>
 #include <sys/systm.h>
+#include <sys/tree.h>
 #include <sys/vmmeter.h>
 #include <vm/swap_pager.h>
 
@@ -60,104 +61,81 @@ MALLOC_DECLARE(M_TMPFSNAME);
 /*
  * Internal representation of a tmpfs directory entry.
  */
+
+LIST_HEAD(tmpfs_dir_duphead, tmpfs_dirent);
+
 struct tmpfs_dirent {
-	TAILQ_ENTRY(tmpfs_dirent)	td_entries;
+	/*
+	 * Depending on td_cookie flag entry can be of 3 types:
+	 * - regular -- no hash collisions, stored in RB-Tree
+	 * - duphead -- synthetic linked list head for dup entries
+	 * - dup -- stored in linked list instead of RB-Tree
+	 */
+	union {
+		/* regular and duphead entry types */
+		RB_ENTRY(tmpfs_dirent)		td_entries;
 
-	/* Length of the name stored in this directory entry.  This avoids
-	 * the need to recalculate it every time the name is used. */
-	uint16_t			td_namelen;
-
-	/* The name of the entry, allocated from a string pool.  This
-	* string is not required to be zero-terminated; therefore, the
-	* td_namelen field must always be used when accessing its value. */
-	char *				td_name;
+		/* dup entry type */
+		struct {
+			LIST_ENTRY(tmpfs_dirent) entries;
+			LIST_ENTRY(tmpfs_dirent) index_entries;
+		} td_dup;
+	} uh;
+
+	uint32_t			td_cookie;
+	uint32_t			td_hash;
+	u_int				td_namelen;
 
 	/* Pointer to the node this entry refers to.  In case this field
 	 * is NULL, the node is a whiteout. */
 	struct tmpfs_node *		td_node;
+
+	union {
+		/*
+		 * The name of the entry, allocated from a string pool.  This
+		 * string is not required to be zero-terminated.
+		 */
+		char *			td_name;	/* regular, dup */
+		struct tmpfs_dir_duphead td_duphead;	/* duphead */
+	} ud;
 };
 
-/* A directory in tmpfs holds a sorted list of directory entries, which in
+/* A directory in tmpfs holds a list of directory entries, which in
  * turn point to other files (which can be directories themselves).
  *
- * In tmpfs, this list is managed by a tail queue, whose head is defined by
+ * In tmpfs, this list is managed by a RB-Tree, whose head is defined by
  * the struct tmpfs_dir type.
  *
- * It is imporant to notice that directories do not have entries for . and
+ * It is important to notice that directories do not have entries for . and
  * .. as other file systems do.  These can be generated when requested
  * based on information available by other means, such as the pointer to
  * the node itself in the former case or the pointer to the parent directory
  * in the latter case.  This is done to simplify tmpfs's code and, more
  * importantly, to remove redundancy. */
-TAILQ_HEAD(tmpfs_dir, tmpfs_dirent);
+RB_HEAD(tmpfs_dir, tmpfs_dirent);
 
 /* Each entry in a directory has a cookie that identifies it.  Cookies
  * supersede offsets within directories because, given how tmpfs stores
- * directories in memory, there is no such thing as an offset.  (Emulating
- * a real offset could be very difficult.)
- * 
+ * directories in memory, there is no such thing as an offset.
+ *
  * The '.', '..' and the end of directory markers have fixed cookies which
  * cannot collide with the cookies generated by other entries.  The cookies
- * fot the other entries are generated based on the memory address on which
- * stores their information is stored.
- *
- * Ideally, using the entry's memory pointer as the cookie would be enough
- * to represent it and it wouldn't cause collisions in any system.
- * Unfortunately, this results in "offsets" with very large values which
- * later raise problems in the Linux compatibility layer (and maybe in other
- * places) as described in PR kern/32034.  Hence we need to workaround this
- * with a rather ugly hack.
- *
- * Linux 32-bit binaries, unless built with _FILE_OFFSET_BITS=64, have off_t
- * set to 'long', which is a 32-bit *signed* long integer.  Regardless of
- * the macro value, GLIBC (2.3 at least) always uses the getdents64
- * system call (when calling readdir) which internally returns off64_t
- * offsets.  In order to make 32-bit binaries work, *GLIBC* converts the
- * 64-bit values returned by the kernel to 32-bit ones and aborts with
- * EOVERFLOW if the conversion results in values that won't fit in 32-bit
- * integers (which it assumes is because the directory is extremely large).
- * This wouldn't cause problems if we were dealing with unsigned integers,
- * but as we have signed integers, this check fails due to sign expansion.
+ * for the other entries are generated based on the file name hash value or
+ * unique number in case of name hash collision.
  *
- * For example, consider that the kernel returns the 0xc1234567 cookie to
- * userspace in a off64_t integer.  Later on, GLIBC casts this value to
- * off_t (remember, signed) with code similar to:
- *     system call returns the offset in kernel_value;
- *     off_t casted_value = kernel_value;
- *     if (sizeof(off_t) != sizeof(off64_t) &&
- *         kernel_value != casted_value)
- *             error!
- * In this case, casted_value still has 0xc1234567, but when it is compared
- * for equality against kernel_value, it is promoted to a 64-bit integer and
- * becomes 0xffffffffc1234567, which is different than 0x00000000c1234567.
- * Then, GLIBC assumes this is because the directory is very large.
- *
- * Given that all the above happens in user-space, we have no control over
- * it; therefore we must workaround the issue here.  We do this by
- * truncating the pointer value to a 32-bit integer and hope that there
- * won't be collisions.  In fact, this will not cause any problems in
- * 32-bit platforms but some might arise in 64-bit machines (I'm not sure
- * if they can happen at all in practice).
- *
- * XXX A nicer solution shall be attempted. */
-#ifdef _KERNEL
-#define	TMPFS_DIRCOOKIE_DOT	0
-#define	TMPFS_DIRCOOKIE_DOTDOT	1
-#define	TMPFS_DIRCOOKIE_EOF	2
-static __inline
-off_t
-tmpfs_dircookie(struct tmpfs_dirent *de)
-{
-	off_t cookie;
-
-	cookie = ((off_t)(uintptr_t)de >> 1) & 0x7FFFFFFF;
-	MPASS(cookie != TMPFS_DIRCOOKIE_DOT);
-	MPASS(cookie != TMPFS_DIRCOOKIE_DOTDOT);
-	MPASS(cookie != TMPFS_DIRCOOKIE_EOF);
+ * To preserve compatibility cookies are limited to 31 bits.
+ */
 
-	return cookie;
-}
-#endif
+#define	TMPFS_DIRCOOKIE_DOT		0
+#define	TMPFS_DIRCOOKIE_DOTDOT		1
+#define	TMPFS_DIRCOOKIE_EOF		2
+#define	TMPFS_DIRCOOKIE_MASK		((off_t)0x3fffffffU)
+#define	TMPFS_DIRCOOKIE_MIN		((off_t)0x00000004U)
+#define	TMPFS_DIRCOOKIE_DUP		((off_t)0x40000000U)
+#define	TMPFS_DIRCOOKIE_DUPHEAD		((off_t)0x80000000U)
+#define	TMPFS_DIRCOOKIE_DUP_MIN		TMPFS_DIRCOOKIE_DUP
+#define	TMPFS_DIRCOOKIE_DUP_MAX		\
+	(TMPFS_DIRCOOKIE_DUP | TMPFS_DIRCOOKIE_MASK)
 
 /* --------------------------------------------------------------------- */
 
@@ -243,29 +221,31 @@ struct tmpfs_node {
 		dev_t			tn_rdev;
 
 		/* Valid when tn_type == VDIR. */
-		struct tn_dir{
+		struct tn_dir {
 			/* Pointer to the parent directory.  The root
 			 * directory has a pointer to itself in this field;
 			 * this property identifies the root node. */
 			struct tmpfs_node *	tn_parent;
 
-			/* Head of a tail-queue that links the contents of
-			 * the directory together.  See above for a
-			 * description of its contents. */
+			/* Head of a tree that links the contents of
+			 * the directory together. */
 			struct tmpfs_dir	tn_dirhead;
 
+			/* Head of a list the contains fake directory entries
+			 * heads, i.e. entries with TMPFS_DIRCOOKIE_DUPHEAD
+			 * flag. */
+			struct tmpfs_dir_duphead tn_dupindex;
+
 			/* Number and pointer of the first directory entry
 			 * returned by the readdir operation if it were
 			 * called again to continue reading data from the
 			 * same directory as before.  This is used to speed
 			 * up reads of long directories, assuming that no
 			 * more than one read is in progress at a given time.
-			 * Otherwise, these values are discarded and a linear
-			 * scan is performed from the beginning up to the
-			 * point where readdir starts returning values. */
+			 * Otherwise, these values are discarded. */
 			off_t			tn_readdir_lastn;
 			struct tmpfs_dirent *	tn_readdir_lastp;
-		}tn_dir;
+		} tn_dir;
 
 		/* Valid when tn_type == VLNK. */
 		/* The link's target, allocated from a string pool. */
@@ -419,9 +399,9 @@ int	tmpfs_alloc_node(struct tmpfs_mount 
 	    char *, dev_t, struct tmpfs_node **);
 void	tmpfs_free_node(struct tmpfs_mount *, struct tmpfs_node *);
 int	tmpfs_alloc_dirent(struct tmpfs_mount *, struct tmpfs_node *,
-	    const char *, uint16_t, struct tmpfs_dirent **);
-void	tmpfs_free_dirent(struct tmpfs_mount *, struct tmpfs_dirent *,
-	    boolean_t);
+	    const char *, u_int, struct tmpfs_dirent **);
+void	tmpfs_free_dirent(struct tmpfs_mount *, struct tmpfs_dirent *);
+void	tmpfs_dirent_init(struct tmpfs_dirent *, const char *, u_int);
 int	tmpfs_alloc_vp(struct mount *, struct tmpfs_node *, int,
 	    struct vnode **);
 void	tmpfs_free_vp(struct vnode *);
@@ -429,13 +409,12 @@ int	tmpfs_alloc_file(struct vnode *, str
 	    struct componentname *, char *);
 void	tmpfs_dir_attach(struct vnode *, struct tmpfs_dirent *);
 void	tmpfs_dir_detach(struct vnode *, struct tmpfs_dirent *);
+void	tmpfs_dir_destroy(struct tmpfs_mount *, struct tmpfs_node *);
 struct tmpfs_dirent *	tmpfs_dir_lookup(struct tmpfs_node *node,
 			    struct tmpfs_node *f,
 			    struct componentname *cnp);
-int	tmpfs_dir_getdotdent(struct tmpfs_node *, struct uio *);
-int	tmpfs_dir_getdotdotdent(struct tmpfs_node *, struct uio *);
-struct tmpfs_dirent *	tmpfs_dir_lookupbycookie(struct tmpfs_node *, off_t);
-int	tmpfs_dir_getdents(struct tmpfs_node *, struct uio *, off_t *);
+int	tmpfs_dir_getdents(struct tmpfs_node *, struct uio *, int,
+	    u_long *, int *);
 int	tmpfs_dir_whiteout_add(struct vnode *, struct componentname *);
 void	tmpfs_dir_whiteout_remove(struct vnode *, struct componentname *);
 int	tmpfs_reg_resize(struct vnode *, off_t, boolean_t);
@@ -467,8 +446,8 @@ int	tmpfs_truncate(struct vnode *, off_t
  * with a length of 'len'.
  */
 #define TMPFS_DIRENT_MATCHES(de, name, len) \
-    (de->td_namelen == (uint16_t)len && \
-    bcmp((de)->td_name, (name), (de)->td_namelen) == 0)
+    (de->td_namelen == len && \
+    bcmp((de)->ud.td_name, (name), (de)->td_namelen) == 0)
 
 /* --------------------------------------------------------------------- */
 
@@ -476,11 +455,10 @@ int	tmpfs_truncate(struct vnode *, off_t
  * Ensures that the node pointed by 'node' is a directory and that its
  * contents are consistent with respect to directories.
  */
-#define TMPFS_VALIDATE_DIR(node) \
-    MPASS((node)->tn_type == VDIR); \
-    MPASS((node)->tn_size % sizeof(struct tmpfs_dirent) == 0); \
-    MPASS((node)->tn_dir.tn_readdir_lastp == NULL || \
-	tmpfs_dircookie((node)->tn_dir.tn_readdir_lastp) == (node)->tn_dir.tn_readdir_lastn);
+#define TMPFS_VALIDATE_DIR(node) do { \
+	MPASS((node)->tn_type == VDIR); \
+	MPASS((node)->tn_size % sizeof(struct tmpfs_dirent) == 0); \
+} while (0)
 
 /* --------------------------------------------------------------------- */
 

Modified: head/sys/fs/tmpfs/tmpfs_subr.c
==============================================================================
--- head/sys/fs/tmpfs/tmpfs_subr.c	Sun Jan  6 21:56:58 2013	(r245114)
+++ head/sys/fs/tmpfs/tmpfs_subr.c	Sun Jan  6 22:15:44 2013	(r245115)
@@ -37,6 +37,7 @@
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
+#include <sys/fnv_hash.h>
 #include <sys/namei.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
@@ -58,6 +59,11 @@ __FBSDID("$FreeBSD$");
 #include <fs/tmpfs/tmpfs_fifoops.h>
 #include <fs/tmpfs/tmpfs_vnops.h>
 
+struct tmpfs_dir_cursor {
+	struct tmpfs_dirent	*tdc_current;
+	struct tmpfs_dirent	*tdc_tree;
+};
+
 SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW, 0, "tmpfs file system");
 
 static long tmpfs_pages_reserved = TMPFS_PAGES_MINRESERVED;
@@ -87,6 +93,10 @@ SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, memory
     &tmpfs_pages_reserved, 0, sysctl_mem_reserved, "L",
     "Amount of available memory and swap below which tmpfs growth stops");
 
+static __inline int tmpfs_dirtree_cmp(struct tmpfs_dirent *a,
+    struct tmpfs_dirent *b);
+RB_PROTOTYPE_STATIC(tmpfs_dir, tmpfs_dirent, uh.td_entries, tmpfs_dirtree_cmp);
+
 size_t
 tmpfs_mem_avail(void)
 {
@@ -188,7 +198,8 @@ tmpfs_alloc_node(struct tmpfs_mount *tmp
 		break;
 
 	case VDIR:
-		TAILQ_INIT(&nnode->tn_dir.tn_dirhead);
+		RB_INIT(&nnode->tn_dir.tn_dirhead);
+		LIST_INIT(&nnode->tn_dir.tn_dupindex);
 		MPASS(parent != nnode);
 		MPASS(IMPLIES(parent == NULL, tmp->tm_root == NULL));
 		nnode->tn_dir.tn_parent = (parent == NULL) ? nnode : parent;
@@ -309,6 +320,49 @@ tmpfs_free_node(struct tmpfs_mount *tmp,
 
 /* --------------------------------------------------------------------- */
 
+static __inline uint32_t
+tmpfs_dirent_hash(const char *name, u_int len)
+{
+	uint32_t hash;
+
+	hash = fnv_32_buf(name, len, FNV1_32_INIT + len) & TMPFS_DIRCOOKIE_MASK;
+#ifdef TMPFS_DEBUG_DIRCOOKIE_DUP
+	hash &= 0xf;
+#endif
+	if (hash < TMPFS_DIRCOOKIE_MIN)
+		hash += TMPFS_DIRCOOKIE_MIN;
+
+	return (hash);
+}
+
+static __inline off_t
+tmpfs_dirent_cookie(struct tmpfs_dirent *de)
+{
+	MPASS(de->td_cookie >= TMPFS_DIRCOOKIE_MIN);
+
+	return (de->td_cookie);
+}
+
+static __inline boolean_t
+tmpfs_dirent_dup(struct tmpfs_dirent *de)
+{
+	return ((de->td_cookie & TMPFS_DIRCOOKIE_DUP) != 0);
+}
+
+static __inline boolean_t
+tmpfs_dirent_duphead(struct tmpfs_dirent *de)
+{
+	return ((de->td_cookie & TMPFS_DIRCOOKIE_DUPHEAD) != 0);
+}
+
+void
+tmpfs_dirent_init(struct tmpfs_dirent *de, const char *name, u_int namelen)
+{
+	de->td_hash = de->td_cookie = tmpfs_dirent_hash(name, namelen);
+	memcpy(de->ud.td_name, name, namelen);
+	de->td_namelen = namelen;
+}
+
 /*
  * Allocates a new directory entry for the node node with a name of name.
  * The new directory entry is returned in *de.
@@ -320,17 +374,17 @@ tmpfs_free_node(struct tmpfs_mount *tmp,
  */
 int
 tmpfs_alloc_dirent(struct tmpfs_mount *tmp, struct tmpfs_node *node,
-    const char *name, uint16_t len, struct tmpfs_dirent **de)
+    const char *name, u_int len, struct tmpfs_dirent **de)
 {
 	struct tmpfs_dirent *nde;
 
-	nde = (struct tmpfs_dirent *)uma_zalloc(
-					tmp->tm_dirent_pool, M_WAITOK);
-	nde->td_name = malloc(len, M_TMPFSNAME, M_WAITOK);
-	nde->td_namelen = len;
-	memcpy(nde->td_name, name, len);
-
+	nde = uma_zalloc(tmp->tm_dirent_pool, M_WAITOK);
 	nde->td_node = node;
+	if (name != NULL) {
+		nde->ud.td_name = malloc(len, M_TMPFSNAME, M_WAITOK);
+		tmpfs_dirent_init(nde, name, len);
+	} else
+		nde->td_namelen = 0;
 	if (node != NULL)
 		node->tn_links++;
 
@@ -351,20 +405,17 @@ tmpfs_alloc_dirent(struct tmpfs_mount *t
  * directory entry, as it may already have been released from the outside.
  */
 void
-tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de,
-    boolean_t node_exists)
+tmpfs_free_dirent(struct tmpfs_mount *tmp, struct tmpfs_dirent *de)
 {
-	if (node_exists) {
-		struct tmpfs_node *node;
+	struct tmpfs_node *node;
 
-		node = de->td_node;
-		if (node != NULL) {
-			MPASS(node->tn_links > 0);
-			node->tn_links--;
-		}
+	node = de->td_node;
+	if (node != NULL) {
+		MPASS(node->tn_links > 0);
+		node->tn_links--;
 	}
-
-	free(de->td_name, M_TMPFSNAME);
+	if (!tmpfs_dirent_duphead(de) && de->ud.td_name != NULL)
+		free(de->ud.td_name, M_TMPFSNAME);
 	uma_zfree(tmp->tm_dirent_pool, de);
 }
 
@@ -586,7 +637,7 @@ tmpfs_alloc_file(struct vnode *dvp, stru
 	/* Allocate a vnode for the new file. */
 	error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp);
 	if (error != 0) {
-		tmpfs_free_dirent(tmp, de, TRUE);
+		tmpfs_free_dirent(tmp, de);
 		tmpfs_free_node(tmp, node);
 		goto out;
 	}
@@ -605,6 +656,215 @@ out:
 
 /* --------------------------------------------------------------------- */
 
+static struct tmpfs_dirent *
+tmpfs_dir_first(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc)
+{
+	struct tmpfs_dirent *de;
+
+	de = RB_MIN(tmpfs_dir, &dnode->tn_dir.tn_dirhead);
+	dc->tdc_tree = de;
+	if (de != NULL && tmpfs_dirent_duphead(de))
+		de = LIST_FIRST(&de->ud.td_duphead);
+	dc->tdc_current = de;
+
+	return (dc->tdc_current);
+}
+
+static struct tmpfs_dirent *
+tmpfs_dir_next(struct tmpfs_node *dnode, struct tmpfs_dir_cursor *dc)
+{
+	struct tmpfs_dirent *de;
+
+	MPASS(dc->tdc_tree != NULL);
+	if (tmpfs_dirent_dup(dc->tdc_current)) {
+		dc->tdc_current = LIST_NEXT(dc->tdc_current, uh.td_dup.entries);
+		if (dc->tdc_current != NULL)
+			return (dc->tdc_current);
+	}
+	dc->tdc_tree = dc->tdc_current = RB_NEXT(tmpfs_dir,
+	    &dnode->tn_dir.tn_dirhead, dc->tdc_tree);
+	if ((de = dc->tdc_current) != NULL && tmpfs_dirent_duphead(de)) {
+		dc->tdc_current = LIST_FIRST(&de->ud.td_duphead);
+		MPASS(dc->tdc_current != NULL);
+	}
+
+	return (dc->tdc_current);
+}
+
+/* Lookup directory entry in RB-Tree. Function may return duphead entry. */
+static struct tmpfs_dirent *
+tmpfs_dir_xlookup_hash(struct tmpfs_node *dnode, uint32_t hash)
+{
+	struct tmpfs_dirent *de, dekey;
+
+	dekey.td_hash = hash;
+	de = RB_FIND(tmpfs_dir, &dnode->tn_dir.tn_dirhead, &dekey);
+	return (de);
+}
+
+/* Lookup directory entry by cookie, initialize directory cursor accordingly. */
+static struct tmpfs_dirent *
+tmpfs_dir_lookup_cookie(struct tmpfs_node *node, off_t cookie,
+    struct tmpfs_dir_cursor *dc)
+{
+	struct tmpfs_dir *dirhead = &node->tn_dir.tn_dirhead;
+	struct tmpfs_dirent *de, dekey;
+
+	MPASS(cookie >= TMPFS_DIRCOOKIE_MIN);
+
+	if (cookie == node->tn_dir.tn_readdir_lastn &&
+	    (de = node->tn_dir.tn_readdir_lastp) != NULL) {
+		/* Protect against possible race, tn_readdir_last[pn]
+		 * may be updated with only shared vnode lock held. */
+		if (cookie == tmpfs_dirent_cookie(de))
+			goto out;
+	}
+
+	if ((cookie & TMPFS_DIRCOOKIE_DUP) != 0) {
+		LIST_FOREACH(de, &node->tn_dir.tn_dupindex,
+		    uh.td_dup.index_entries) {
+			MPASS(tmpfs_dirent_dup(de));
+			if (de->td_cookie == cookie)
+				goto out;
+			/* dupindex list is sorted. */
+			if (de->td_cookie < cookie) {
+				de = NULL;
+				goto out;
+			}
+		}
+		MPASS(de == NULL);
+		goto out;
+	}
+
+	MPASS((cookie & TMPFS_DIRCOOKIE_MASK) == cookie);
+	dekey.td_hash = cookie;
+	/* Recover if direntry for cookie was removed */
+	de = RB_NFIND(tmpfs_dir, dirhead, &dekey);
+	dc->tdc_tree = de;
+	dc->tdc_current = de;
+	if (de != NULL && tmpfs_dirent_duphead(de)) {
+		dc->tdc_current = LIST_FIRST(&de->ud.td_duphead);
+		MPASS(dc->tdc_current != NULL);
+	}
+	return (dc->tdc_current);
+
+out:
+	dc->tdc_tree = de;
+	dc->tdc_current = de;
+	if (de != NULL && tmpfs_dirent_dup(de))
+		dc->tdc_tree = tmpfs_dir_xlookup_hash(node,
+		    de->td_hash);
+	return (dc->tdc_current);
+}
+
+/*
+ * Looks for a directory entry in the directory represented by node.
+ * 'cnp' describes the name of the entry to look for.  Note that the .
+ * and .. components are not allowed as they do not physically exist
+ * within directories.
+ *
+ * Returns a pointer to the entry when found, otherwise NULL.
+ */
+struct tmpfs_dirent *
+tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f,
+    struct componentname *cnp)
+{
+	struct tmpfs_dir_duphead *duphead;
+	struct tmpfs_dirent *de;
+	uint32_t hash;
+
+	MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.'));
+	MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' &&
+	    cnp->cn_nameptr[1] == '.')));
+	TMPFS_VALIDATE_DIR(node);
+
+	hash = tmpfs_dirent_hash(cnp->cn_nameptr, cnp->cn_namelen);
+	de = tmpfs_dir_xlookup_hash(node, hash);
+	if (de != NULL && tmpfs_dirent_duphead(de)) {
+		duphead = &de->ud.td_duphead;
+		LIST_FOREACH(de, duphead, uh.td_dup.entries) {
+			if (TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr,
+			    cnp->cn_namelen))
+				break;
+		}
+	} else if (de != NULL) {
+		if (!TMPFS_DIRENT_MATCHES(de, cnp->cn_nameptr,
+		    cnp->cn_namelen))
+			de = NULL;
+	}
+	if (de != NULL && f != NULL && de->td_node != f)
+		de = NULL;
+
+	return (de);
+}
+
+/*
+ * Attach duplicate-cookie directory entry nde to dnode and insert to dupindex
+ * list, allocate new cookie value.
+ */
+static void
+tmpfs_dir_attach_dup(struct tmpfs_node *dnode,
+    struct tmpfs_dir_duphead *duphead, struct tmpfs_dirent *nde)
+{
+	struct tmpfs_dir_duphead *dupindex;
+	struct tmpfs_dirent *de, *pde;
+
+	dupindex = &dnode->tn_dir.tn_dupindex;
+	de = LIST_FIRST(dupindex);
+	if (de == NULL || de->td_cookie < TMPFS_DIRCOOKIE_DUP_MAX) {
+		if (de == NULL)
+			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN;
+		else
+			nde->td_cookie = de->td_cookie + 1;
+		MPASS(tmpfs_dirent_dup(nde));
+		LIST_INSERT_HEAD(dupindex, nde, uh.td_dup.index_entries);
+		LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
+		return;
+	}
+
+	/*
+	 * Cookie numbers are near exhaustion. Scan dupindex list for unused
+	 * numbers. dupindex list is sorted in descending order. Keep it so
+	 * after inserting nde.
+	 */
+	while (1) {
+		pde = de;
+		de = LIST_NEXT(de, uh.td_dup.index_entries);
+		if (de == NULL && pde->td_cookie != TMPFS_DIRCOOKIE_DUP_MIN) {
+			/*
+			 * Last element of the index doesn't have minimal cookie
+			 * value, use it.
+			 */
+			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MIN;
+			LIST_INSERT_AFTER(pde, nde, uh.td_dup.index_entries);
+			LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
+			return;
+		} else if (de == NULL) {
+			/*
+			 * We are so lucky have 2^30 hash duplicates in single
+			 * directory :) Return largest possible cookie value.
+			 * It should be fine except possible issues with
+			 * VOP_READDIR restart.
+			 */
+			nde->td_cookie = TMPFS_DIRCOOKIE_DUP_MAX;
+			LIST_INSERT_HEAD(dupindex, nde,
+			    uh.td_dup.index_entries);
+			LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
+			return;
+		}
+		if (de->td_cookie + 1 == pde->td_cookie ||
+		    de->td_cookie >= TMPFS_DIRCOOKIE_DUP_MAX)
+			continue;	/* No hole or invalid cookie. */
+		nde->td_cookie = de->td_cookie + 1;
+		MPASS(tmpfs_dirent_dup(nde));
+		MPASS(pde->td_cookie > nde->td_cookie);
+		MPASS(nde->td_cookie > de->td_cookie);
+		LIST_INSERT_BEFORE(de, nde, uh.td_dup.index_entries);
+		LIST_INSERT_HEAD(duphead, nde, uh.td_dup.entries);
+		return;
+	};
+}
+
 /*
  * Attaches the directory entry de to the directory represented by vp.
  * Note that this does not change the link count of the node pointed by
@@ -614,10 +874,38 @@ void
 tmpfs_dir_attach(struct vnode *vp, struct tmpfs_dirent *de)
 {
 	struct tmpfs_node *dnode;
+	struct tmpfs_dirent *xde, *nde;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
+	MPASS(de->td_namelen > 0);
+	MPASS(de->td_hash >= TMPFS_DIRCOOKIE_MIN);
+	MPASS(de->td_cookie == de->td_hash);
+
 	dnode = VP_TO_TMPFS_DIR(vp);
-	TAILQ_INSERT_TAIL(&dnode->tn_dir.tn_dirhead, de, td_entries);
+	dnode->tn_dir.tn_readdir_lastn = 0;
+	dnode->tn_dir.tn_readdir_lastp = NULL;
+
+	MPASS(!tmpfs_dirent_dup(de));
+	xde = RB_INSERT(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de);
+	if (xde != NULL && tmpfs_dirent_duphead(xde))
+		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de);
+	else if (xde != NULL) {
+		/*
+		 * Allocate new duphead. Swap xde with duphead to avoid
+		 * adding/removing elements with the same hash.
+		 */
+		MPASS(!tmpfs_dirent_dup(xde));
+		tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), NULL, NULL, 0,
+		    &nde);
+		/* *nde = *xde; XXX gcc 4.2.1 may generate invalid code. */
+		memcpy(nde, xde, sizeof(*xde));
+		xde->td_cookie |= TMPFS_DIRCOOKIE_DUPHEAD;
+		LIST_INIT(&xde->ud.td_duphead);
+		xde->td_namelen = 0;
+		xde->td_node = NULL;
+		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, nde);
+		tmpfs_dir_attach_dup(dnode, &xde->ud.td_duphead, de);
+	}
 	dnode->tn_size += sizeof(struct tmpfs_dirent);
 	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \
 	    TMPFS_NODE_MODIFIED;
@@ -633,58 +921,61 @@ tmpfs_dir_attach(struct vnode *vp, struc
 void
 tmpfs_dir_detach(struct vnode *vp, struct tmpfs_dirent *de)
 {
+	struct tmpfs_mount *tmp;
+	struct tmpfs_dir *head;
 	struct tmpfs_node *dnode;
+	struct tmpfs_dirent *xde;
 
 	ASSERT_VOP_ELOCKED(vp, __func__);
-	dnode = VP_TO_TMPFS_DIR(vp);
 
-	if (dnode->tn_dir.tn_readdir_lastp == de) {
-		dnode->tn_dir.tn_readdir_lastn = 0;
-		dnode->tn_dir.tn_readdir_lastp = NULL;
-	}
+	dnode = VP_TO_TMPFS_DIR(vp);
+	head = &dnode->tn_dir.tn_dirhead;
+	dnode->tn_dir.tn_readdir_lastn = 0;
+	dnode->tn_dir.tn_readdir_lastp = NULL;
+
+	if (tmpfs_dirent_dup(de)) {
+		/* Remove duphead if de was last entry. */
+		if (LIST_NEXT(de, uh.td_dup.entries) == NULL) {
+			xde = tmpfs_dir_xlookup_hash(dnode, de->td_hash);
+			MPASS(tmpfs_dirent_duphead(xde));
+		} else
+			xde = NULL;
+		LIST_REMOVE(de, uh.td_dup.entries);
+		LIST_REMOVE(de, uh.td_dup.index_entries);
+		if (xde != NULL) {
+			if (LIST_EMPTY(&xde->ud.td_duphead)) {
+				RB_REMOVE(tmpfs_dir, head, xde);
+				tmp = VFS_TO_TMPFS(vp->v_mount);
+				MPASS(xde->td_node == NULL);
+				tmpfs_free_dirent(tmp, xde);
+			}
+		}
+	} else
+		RB_REMOVE(tmpfs_dir, head, de);
 
-	TAILQ_REMOVE(&dnode->tn_dir.tn_dirhead, de, td_entries);
 	dnode->tn_size -= sizeof(struct tmpfs_dirent);
 	dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \
 	    TMPFS_NODE_MODIFIED;
 }
 
-/* --------------------------------------------------------------------- */
-
-/*
- * Looks for a directory entry in the directory represented by node.
- * 'cnp' describes the name of the entry to look for.  Note that the .
- * and .. components are not allowed as they do not physically exist
- * within directories.
- *
- * Returns a pointer to the entry when found, otherwise NULL.
- */
-struct tmpfs_dirent *
-tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f,
-    struct componentname *cnp)
+void
+tmpfs_dir_destroy(struct tmpfs_mount *tmp, struct tmpfs_node *dnode)
 {
-	boolean_t found;
-	struct tmpfs_dirent *de;
-
-	MPASS(IMPLIES(cnp->cn_namelen == 1, cnp->cn_nameptr[0] != '.'));
-	MPASS(IMPLIES(cnp->cn_namelen == 2, !(cnp->cn_nameptr[0] == '.' &&
-	    cnp->cn_nameptr[1] == '.')));
-	TMPFS_VALIDATE_DIR(node);
+	struct tmpfs_dirent *de, *dde, *nde;
 
-	found = 0;
-	TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) {
-		if (f != NULL && de->td_node != f)
-		    continue;
-		MPASS(cnp->cn_namelen < 0xffff);
-		if (de->td_namelen == (uint16_t)cnp->cn_namelen &&
-		    bcmp(de->td_name, cnp->cn_nameptr, de->td_namelen) == 0) {
-			found = 1;
-			break;
+	RB_FOREACH_SAFE(de, tmpfs_dir, &dnode->tn_dir.tn_dirhead, nde) {
+		RB_REMOVE(tmpfs_dir, &dnode->tn_dir.tn_dirhead, de);
+		/* Node may already be destroyed. */
+		de->td_node = NULL;
+		if (tmpfs_dirent_duphead(de)) {
+			while ((dde = LIST_FIRST(&de->ud.td_duphead)) != NULL) {
+				LIST_REMOVE(dde, uh.td_dup.entries);
+				dde->td_node = NULL;
+				tmpfs_free_dirent(tmp, dde);
+			}
 		}
+		tmpfs_free_dirent(tmp, de);
 	}
-	node->tn_status |= TMPFS_NODE_ACCESSED;
-
-	return found ? de : NULL;
 }
 
 /* --------------------------------------------------------------------- */
@@ -696,7 +987,7 @@ tmpfs_dir_lookup(struct tmpfs_node *node
  * hold the directory entry or an appropriate error code if another
  * error happens.
  */
-int
+static int
 tmpfs_dir_getdotdent(struct tmpfs_node *node, struct uio *uio)
 {
 	int error;
@@ -713,12 +1004,9 @@ tmpfs_dir_getdotdent(struct tmpfs_node *
 	dent.d_reclen = GENERIC_DIRSIZ(&dent);
 
 	if (dent.d_reclen > uio->uio_resid)
-		error = -1;
-	else {
+		error = EJUSTRETURN;
+	else
 		error = uiomove(&dent, dent.d_reclen, uio);
-		if (error == 0)
-			uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT;
-	}
 
 	node->tn_status |= TMPFS_NODE_ACCESSED;
 
@@ -734,7 +1022,7 @@ tmpfs_dir_getdotdent(struct tmpfs_node *
  * hold the directory entry or an appropriate error code if another
  * error happens.
  */
-int
+static int
 tmpfs_dir_getdotdotdent(struct tmpfs_node *node, struct uio *uio)
 {
 	int error;
@@ -763,19 +1051,9 @@ tmpfs_dir_getdotdotdent(struct tmpfs_nod
 	dent.d_reclen = GENERIC_DIRSIZ(&dent);
 
 	if (dent.d_reclen > uio->uio_resid)
-		error = -1;
-	else {
+		error = EJUSTRETURN;
+	else
 		error = uiomove(&dent, dent.d_reclen, uio);
-		if (error == 0) {
-			struct tmpfs_dirent *de;
-
-			de = TAILQ_FIRST(&node->tn_dir.tn_dirhead);
-			if (de == NULL)
-				uio->uio_offset = TMPFS_DIRCOOKIE_EOF;
-			else
-				uio->uio_offset = tmpfs_dircookie(de);
-		}
-	}
 
 	node->tn_status |= TMPFS_NODE_ACCESSED;
 
@@ -785,30 +1063,6 @@ tmpfs_dir_getdotdotdent(struct tmpfs_nod
 /* --------------------------------------------------------------------- */
 
 /*
- * Lookup a directory entry by its associated cookie.
- */
-struct tmpfs_dirent *
-tmpfs_dir_lookupbycookie(struct tmpfs_node *node, off_t cookie)
-{
-	struct tmpfs_dirent *de;
-
-	if (cookie == node->tn_dir.tn_readdir_lastn &&
-	    node->tn_dir.tn_readdir_lastp != NULL) {
-		return node->tn_dir.tn_readdir_lastp;
-	}
-
-	TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) {
-		if (tmpfs_dircookie(de) == cookie) {
-			break;
-		}
-	}
-
-	return de;
-}
-
-/* --------------------------------------------------------------------- */
-
-/*
  * Helper function for tmpfs_readdir.  Returns as much directory entries
  * as can fit in the uio space.  The read starts at uio->uio_offset.
  * The function returns 0 on success, -1 if there was not enough space
@@ -816,27 +1070,47 @@ tmpfs_dir_lookupbycookie(struct tmpfs_no
  * error code if another error happens.
  */
 int
-tmpfs_dir_getdents(struct tmpfs_node *node, struct uio *uio, off_t *cntp)
+tmpfs_dir_getdents(struct tmpfs_node *node, struct uio *uio, int cnt,
+    u_long *cookies, int *ncookies)
 {
-	int error;
-	off_t startcookie;
+	struct tmpfs_dir_cursor dc;
 	struct tmpfs_dirent *de;
+	off_t off;
+	int error;
 
 	TMPFS_VALIDATE_DIR(node);
 
-	/* Locate the first directory entry we have to return.  We have cached
-	 * the last readdir in the node, so use those values if appropriate.
-	 * Otherwise do a linear scan to find the requested entry. */
-	startcookie = uio->uio_offset;
-	MPASS(startcookie != TMPFS_DIRCOOKIE_DOT);
-	MPASS(startcookie != TMPFS_DIRCOOKIE_DOTDOT);
-	if (startcookie == TMPFS_DIRCOOKIE_EOF) {
-		return 0;
-	} else {
-		de = tmpfs_dir_lookupbycookie(node, startcookie);
-	}
-	if (de == NULL) {
-		return EINVAL;
+	off = 0;
+	switch (uio->uio_offset) {
+	case TMPFS_DIRCOOKIE_DOT:
+		error = tmpfs_dir_getdotdent(node, uio);
+		if (error != 0)
+			return (error);
+		uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT;
+		if (cnt != 0)
+			cookies[(*ncookies)++] = off = uio->uio_offset;
+	case TMPFS_DIRCOOKIE_DOTDOT:
+		error = tmpfs_dir_getdotdotdent(node, uio);
+		if (error != 0)
+			return (error);
+		de = tmpfs_dir_first(node, &dc);
+		if (de == NULL)
+			uio->uio_offset = TMPFS_DIRCOOKIE_EOF;
+		else
+			uio->uio_offset = tmpfs_dirent_cookie(de);
+		if (cnt != 0)
+			cookies[(*ncookies)++] = off = uio->uio_offset;
+		if (de == NULL)
+			return (0);
+		break;
+	case TMPFS_DIRCOOKIE_EOF:
+		return (0);
+	default:
+		de = tmpfs_dir_lookup_cookie(node, uio->uio_offset, &dc);
+		if (de == NULL)
+			return (EINVAL);
+		if (cnt != 0)
+			off = tmpfs_dirent_cookie(de);
 	}
 
 	/* Read as much entries as possible; i.e., until we reach the end of
@@ -887,14 +1161,14 @@ tmpfs_dir_getdents(struct tmpfs_node *no
 		}
 		d.d_namlen = de->td_namelen;
 		MPASS(de->td_namelen < sizeof(d.d_name));
-		(void)memcpy(d.d_name, de->td_name, de->td_namelen);
+		(void)memcpy(d.d_name, de->ud.td_name, de->td_namelen);
 		d.d_name[de->td_namelen] = '\0';
 		d.d_reclen = GENERIC_DIRSIZ(&d);
 
 		/* Stop reading if the directory entry we are treating is
 		 * bigger than the amount of data that can be returned. */
 		if (d.d_reclen > uio->uio_resid) {
-			error = -1;
+			error = EJUSTRETURN;
 			break;
 		}
 
@@ -902,21 +1176,30 @@ tmpfs_dir_getdents(struct tmpfs_node *no
 		 * advance pointers. */
 		error = uiomove(&d, d.d_reclen, uio);
 		if (error == 0) {
-			(*cntp)++;
-			de = TAILQ_NEXT(de, td_entries);
+			de = tmpfs_dir_next(node, &dc);
+			if (cnt != 0) {
+				if (de == NULL)
+					off = TMPFS_DIRCOOKIE_EOF;
+				else
+					off = tmpfs_dirent_cookie(de);
+				MPASS(*ncookies < cnt);
+				cookies[(*ncookies)++] = off;
+			}
 		}
 	} while (error == 0 && uio->uio_resid > 0 && de != NULL);
 
 	/* Update the offset and cache. */
-	if (de == NULL) {
-		uio->uio_offset = TMPFS_DIRCOOKIE_EOF;
-		node->tn_dir.tn_readdir_lastn = 0;
-		node->tn_dir.tn_readdir_lastp = NULL;
-	} else {
-		node->tn_dir.tn_readdir_lastn = uio->uio_offset = tmpfs_dircookie(de);
-		node->tn_dir.tn_readdir_lastp = de;
+	if (cnt == 0) {
+		if (de == NULL)
+			off = TMPFS_DIRCOOKIE_EOF;
+		else
+			off = tmpfs_dirent_cookie(de);
 	}
 
+	uio->uio_offset = off;
+	node->tn_dir.tn_readdir_lastn = off;
+	node->tn_dir.tn_readdir_lastp = de;
+
 	node->tn_status |= TMPFS_NODE_ACCESSED;
 	return error;
 }
@@ -943,7 +1226,7 @@ tmpfs_dir_whiteout_remove(struct vnode *
 	de = tmpfs_dir_lookup(VP_TO_TMPFS_DIR(dvp), NULL, cnp);
 	MPASS(de != NULL && de->td_node == NULL);
 	tmpfs_dir_detach(dvp, de);
-	tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de, TRUE);
+	tmpfs_free_dirent(VFS_TO_TMPFS(dvp->v_mount), de);
 }
 
 /* --------------------------------------------------------------------- */
@@ -1436,3 +1719,15 @@ out:
 
 	return error;
 }
+
+static __inline int
+tmpfs_dirtree_cmp(struct tmpfs_dirent *a, struct tmpfs_dirent *b)
+{
+	if (a->td_hash > b->td_hash)

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-all mailing list