kern/129231: New UFS mount (norandom) option - mostly useful for building redundant NFS servers

Attila Nagy bra at fsn.hu
Thu Nov 27 05:30:02 PST 2008


>Number:         129231
>Category:       kern
>Synopsis:       New UFS mount (norandom) option - mostly useful for building redundant NFS servers
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          change-request
>Submitter-Id:   current-users
>Arrival-Date:   Thu Nov 27 13:30:01 UTC 2008
>Closed-Date:
>Last-Modified:
>Originator:     Attila Nagy
>Release:        RELENG_7
>Organization:
FSN
>Environment:
>Description:
In our environment we use redundant (and even load balanced) RO NFS servers with CARP.
FreeBSD currently does not offer any help to achieve this, because it lacks shared (cluster) filesystems and any other tools for achieving inode consistency between the nodes.

This patch adds a new mount option, named norandom, which -when enabled- removes arc4random calls in the inode creation path.
When a filesystem is mounted with norandom, and exactly the same modifications happen on two machines, the resulting file system state will be the same.
This means that if you have two (or more) shared-nothing NFS servers with norandom-mounted file systems and you apply the same updates to each (for example, as we do, with svn update), the inodes will match across the nodes. It is therefore possible to build redundant NFS servers with an IP failover mechanism (like CARP) without the danger of getting "Stale NFS file handles" at switchover time. (This is read-only from the client side, of course, but if you can replicate the NFS operations to each node, even RW is possible.)
Matching inodes also make it possible to use load-balanced (also with CARP) NFS servers.

We have several redundant NFS file servers spread across multiple physical locations running with this patch without any ill effects in the last two years.
>How-To-Repeat:

>Fix:
See the attached patch.

Patch attached with submission follows:

Files src/sbin/mount/getmntopts.o and src.new/sbin/mount/getmntopts.o differ
diff -ruN src/sbin/mount/mntopts.h src.new/sbin/mount/mntopts.h
--- src/sbin/mount/mntopts.h	2005-12-02 04:55:02.000000000 +0100
+++ src.new/sbin/mount/mntopts.h	2008-11-26 13:54:58.000000000 +0100
@@ -42,6 +42,7 @@
 #define MOPT_NOATIME		{ "atime",	1, MNT_NOATIME, 0 }
 #define MOPT_NOEXEC		{ "exec",	1, MNT_NOEXEC, 0 }
 #define MOPT_NOSUID		{ "suid",	1, MNT_NOSUID, 0 }
+#define MOPT_NORANDOM		{ "random",	1, MNT_NORANDOM, 0 }
 #define MOPT_NOSYMFOLLOW	{ "symfollow",  1, MNT_NOSYMFOLLOW, 0 }
 #define MOPT_RDONLY		{ "rdonly",	0, MNT_RDONLY, 0 }
 #define MOPT_SYNC		{ "sync",	0, MNT_SYNCHRONOUS, 0 }
@@ -81,6 +82,7 @@
 	MOPT_NOEXEC,							\
 	MOPT_SUIDDIR,		/* must be before MOPT_NOSUID */	\
 	MOPT_NOSUID,							\
+	MOPT_NORANDOM,							\
 	MOPT_NOSYMFOLLOW,						\
 	MOPT_RDONLY,							\
 	MOPT_UNION,							\
Files src/sbin/mount/mount and src.new/sbin/mount/mount differ
diff -ruN src/sbin/mount/mount.8 src.new/sbin/mount/mount.8
--- src/sbin/mount/mount.8	2008-06-06 14:28:33.000000000 +0200
+++ src.new/sbin/mount/mount.8	2008-11-27 13:17:59.000000000 +0100
@@ -214,6 +214,25 @@
 .Xr suidperl 1
 is installed on your system.
 It is set automatically when the user does not have super-user privileges.
+.It Cm norandom
+Do not randomize inode allocation. This option is useful if you want to
+make sure that the same sequence of file system operations results in the
+same file system layout. This behaviour can be important when you would like
+to set up a redundant NFS server in a shared-nothing architecture (eg. not
+with shared disks in a HA failover cluster scenario), where if you do the same
+updates on all servers, you will get consistent inodes with this option.
+This is needed to maintain inode consistency between machines to avoid "Stale
+NFS file handle" errors on the clients when switching between NFS servers.
+.Pp
+This option should be used without soft updates enabled on the file system.
+.Xr gjournal 8
+with async mount is the recommended way to operate norandom file systems,
+because it offers both speed and file system integrity.
+.Pp
+It should be noted that this option is
+.Em highly discouraged
+in security-conscious environments, because it makes inode allocations
+predictable.
 .It Cm nosymfollow
 Do not follow symlinks
 on the mounted file system.
Files src/sbin/mount/mount.8.gz and src.new/sbin/mount/mount.8.gz differ
diff -ruN src/sbin/mount/mount.c src.new/sbin/mount/mount.c
--- src/sbin/mount/mount.c	2008-09-01 10:37:13.000000000 +0200
+++ src.new/sbin/mount/mount.c	2008-11-26 13:56:10.000000000 +0100
@@ -96,6 +96,7 @@
 	{ MNT_NOATIME,		"noatime" },
 	{ MNT_NOEXEC,		"noexec" },
 	{ MNT_NOSUID,		"nosuid" },
+	{ MNT_NORANDOM,		"norandom" },
 	{ MNT_NOSYMFOLLOW,	"nosymfollow" },
 	{ MNT_QUOTA,		"with quotas" },
 	{ MNT_RDONLY,		"read-only" },
@@ -873,6 +874,7 @@
 	if (flags & MNT_NOATIME)	res = catopt(res, "noatime");
 	if (flags & MNT_NOCLUSTERR)	res = catopt(res, "noclusterr");
 	if (flags & MNT_NOCLUSTERW)	res = catopt(res, "noclusterw");
+	if (flags & MNT_NORANDOM)	res = catopt(res, "norandom");
 	if (flags & MNT_NOSYMFOLLOW)	res = catopt(res, "nosymfollow");
 	if (flags & MNT_SUIDDIR)	res = catopt(res, "suiddir");
 	if (flags & MNT_MULTILABEL)	res = catopt(res, "multilabel");
Files src/sbin/mount/mount.o and src.new/sbin/mount/mount.o differ
Files src/sbin/mount/mount_fs.o and src.new/sbin/mount/mount_fs.o differ
Files src/sbin/mount/vfslist.o and src.new/sbin/mount/vfslist.o differ
diff -ruN src/sys/kern/vfs_mount.c src.new/sys/kern/vfs_mount.c
--- src/sys/kern/vfs_mount.c	2008-10-06 18:11:08.000000000 +0200
+++ src.new/sys/kern/vfs_mount.c	2008-11-26 13:50:48.000000000 +0100
@@ -678,6 +678,12 @@
 			free(opt->name, M_MOUNT);
 			opt->name = strdup("nonosuid", M_MOUNT);
 		}
+		else if (strcmp(opt->name, "norandom") == 0)
+			fsflags |= MNT_NORANDOM;
+		else if (strcmp(opt->name, "random") == 0) {
+			free(opt->name, M_MOUNT);
+			opt->name = strdup("nonorandom", M_MOUNT);
+		}
 		else if (strcmp(opt->name, "nosymfollow") == 0)
 			fsflags |= MNT_NOSYMFOLLOW;
 		else if (strcmp(opt->name, "symfollow") == 0) {
diff -ruN src/sys/sys/mount.h src.new/sys/sys/mount.h
--- src/sys/sys/mount.h	2008-10-06 18:11:08.000000000 +0200
+++ src.new/sys/sys/mount.h	2008-11-26 13:47:43.000000000 +0100
@@ -218,6 +218,7 @@
 #define	MNT_SYNCHRONOUS	0x00000002	/* filesystem written synchronously */
 #define	MNT_NOEXEC	0x00000004	/* can't exec from filesystem */
 #define	MNT_NOSUID	0x00000008	/* don't honor setuid bits on fs */
+#define	MNT_NORANDOM	0x00000010	/* don't randomize inode allocation */
 #define	MNT_UNION	0x00000020	/* union with underlying filesystem */
 #define	MNT_ASYNC	0x00000040	/* filesystem written asynchronously */
 #define	MNT_SUIDDIR	0x00100000	/* special handling of SUID on dirs */
@@ -257,7 +258,7 @@
  * but the 'mount' program may need changing to handle this.
  */
 #define	MNT_VISFLAGMASK	(MNT_RDONLY	| MNT_SYNCHRONOUS | MNT_NOEXEC	| \
-			MNT_NOSUID	| MNT_UNION	| \
+			MNT_NOSUID	| MNT_NORANDOM	| MNT_UNION	| \
 			MNT_ASYNC	| MNT_EXRDONLY	| MNT_EXPORTED	| \
 			MNT_DEFEXPORTED	| MNT_EXPORTANON| MNT_EXKERB	| \
 			MNT_LOCAL	| MNT_USER	| MNT_QUOTA	| \
@@ -267,7 +268,7 @@
 			MNT_GJOURNAL	| MNT_MULTILABEL | MNT_ACLS)
 
 /* Mask of flags that can be updated. */
-#define	MNT_UPDATEMASK (MNT_NOSUID	| MNT_NOEXEC	| \
+#define	MNT_UPDATEMASK (MNT_NOSUID	| MNT_NOEXEC	| MNT_NORANDOM	| \
 			MNT_SYNCHRONOUS	| MNT_UNION	| MNT_ASYNC	| \
 			MNT_NOATIME | \
 			MNT_NOSYMFOLLOW	| MNT_IGNORE	| \
diff -ruN src/sys/ufs/ffs/ffs_alloc.c src.new/sys/ufs/ffs/ffs_alloc.c
--- src/sys/ufs/ffs/ffs_alloc.c	2008-02-15 17:43:01.000000000 +0100
+++ src.new/sys/ufs/ffs/ffs_alloc.c	2008-11-26 12:48:58.000000000 +0100
@@ -977,8 +977,13 @@
 	/*
 	 * Set up a new generation number for this inode.
 	 */
-	if (ip->i_gen == 0 || ++ip->i_gen == 0)
-		ip->i_gen = arc4random() / 2 + 1;
+	if (ip->i_gen == 0 || ++ip->i_gen == 0) {
+		if (pvp->v_mount->mnt_flag & MNT_NORANDOM) {
+			ip->i_gen = 1;
+		} else {
+			ip->i_gen = arc4random() / 2 + 1;
+		}
+	}
 	DIP_SET(ip, i_gen, ip->i_gen);
 	if (fs->fs_magic == FS_UFS2_MAGIC) {
 		vfs_timestamp(&ts);
@@ -1039,7 +1044,11 @@
 	 */
 	ASSERT_VOP_LOCKED(ITOV(pip), "ffs_dirpref");
 	if (ITOV(pip)->v_vflag & VV_ROOT) {
-		prefcg = arc4random() % fs->fs_ncg;
+		if (ITOV(pip)->v_mount->mnt_flag & MNT_NORANDOM) {
+			prefcg = fs->fs_ncg;
+		} else {
+			prefcg = arc4random() % fs->fs_ncg;
+		}
 		mincg = prefcg;
 		minndir = fs->fs_ipg;
 		for (cg = prefcg; cg < fs->fs_ncg; cg++)
@@ -1766,7 +1775,11 @@
 		bzero(ibp->b_data, (int)fs->fs_bsize);
 		dp2 = (struct ufs2_dinode *)(ibp->b_data);
 		for (i = 0; i < INOPB(fs); i++) {
-			dp2->di_gen = arc4random() / 2 + 1;
+			if (ITOV(ip)->v_mount->mnt_flag & MNT_NORANDOM) {
+				dp2->di_gen = 1;
+			} else {
+				dp2->di_gen = arc4random() / 2 + 1;
+			}
 			dp2++;
 		}
 		cgp->cg_initediblk += INOPB(fs);
diff -ruN src/sys/ufs/ffs/ffs_vfsops.c src.new/sys/ufs/ffs/ffs_vfsops.c
--- src/sys/ufs/ffs/ffs_vfsops.c	2008-06-27 02:37:08.000000000 +0200
+++ src.new/sys/ufs/ffs/ffs_vfsops.c	2008-11-26 13:59:19.000000000 +0100
@@ -125,7 +125,7 @@
 static const char *ffs_opts[] = { "acls", "async", "noatime", "noclusterr",
     "noclusterw", "noexec", "export", "force", "from", "multilabel", 
     "snapshot", "nosuid", "suiddir", "nosymfollow", "sync",
-    "union", NULL };
+    "union", "norandom", NULL };
 
 static int
 ffs_mount(struct mount *mp, struct thread *td)
@@ -1428,7 +1428,11 @@
 	 * already have one. This should only happen on old filesystems.
 	 */
 	if (ip->i_gen == 0) {
-		ip->i_gen = arc4random() / 2 + 1;
+		if (vp->v_mount->mnt_flag & MNT_NORANDOM) {
+			ip->i_gen = 1;
+		} else {
+			ip->i_gen = arc4random() / 2 + 1;
+		}
 		if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
 			ip->i_flag |= IN_MODIFIED;
 			DIP_SET(ip, i_gen, ip->i_gen);


>Release-Note:
>Audit-Trail:
>Unformatted:


More information about the freebsd-bugs mailing list