svn commit: r248623 - in head: sbin/dumpfs sbin/newfs sbin/tunefs sys/ufs/ffs

Kirk McKusick mckusick at FreeBSD.org
Fri Mar 22 21:45:30 UTC 2013


Author: mckusick
Date: Fri Mar 22 21:45:28 2013
New Revision: 248623
URL: http://svnweb.freebsd.org/changeset/base/248623

Log:
  The purpose of this change to the FFS layout policy is to reduce the
  running time for a full fsck. It also reduces the random access time
  for large files and speeds the traversal time for directory tree walks.
  
  The key idea is to reserve a small area in each cylinder group
  immediately following the inode blocks for the use of metadata,
  specifically indirect blocks and directory contents. The new policy
  is to preferentially place metadata in the metadata area and
  everything else in the blocks that follow the metadata area.
  
  The size of this area can be set when creating a filesystem using
  newfs(8) or changed in an existing filesystem using tunefs(8).
  Both utilities use the `-k held-for-metadata-blocks' option to
  specify the amount of space to be held for metadata blocks in each
  cylinder group. By default, newfs(8) sets this area to half of
  minfree (typically 4% of the data area).
  
  This work was inspired by a paper presented at Usenix's FAST '13:
  www.usenix.org/conference/fast13/ffsck-fast-file-system-checker
  
  Details of this implementation appear in the April 2013 issue of ;login:
  www.usenix.org/publications/login/april-2013-volume-38-number-2.
  A copy of the April 2013 ;login: paper can also be downloaded
  from: www.mckusick.com/publications/faster_fsck.pdf.
  
  Reviewed by: kib
  Tested by:   Peter Holm
  MFC after:   4 weeks

Modified:
  head/sbin/dumpfs/dumpfs.c
  head/sbin/newfs/mkfs.c
  head/sbin/newfs/newfs.8
  head/sbin/newfs/newfs.c
  head/sbin/newfs/newfs.h
  head/sbin/tunefs/tunefs.8
  head/sbin/tunefs/tunefs.c
  head/sys/ufs/ffs/ffs_alloc.c
  head/sys/ufs/ffs/ffs_balloc.c
  head/sys/ufs/ffs/fs.h

Modified: head/sbin/dumpfs/dumpfs.c
==============================================================================
--- head/sbin/dumpfs/dumpfs.c	Fri Mar 22 21:11:17 2013	(r248622)
+++ head/sbin/dumpfs/dumpfs.c	Fri Mar 22 21:45:28 2013	(r248623)
@@ -241,8 +241,8 @@ dumpfs(const char *name)
 	    afs.fs_sblkno, afs.fs_cblkno, afs.fs_iblkno, afs.fs_dblkno);
 	printf("cgrotor\t%d\tfmod\t%d\tronly\t%d\tclean\t%d\n",
 	    afs.fs_cgrotor, afs.fs_fmod, afs.fs_ronly, afs.fs_clean);
-	printf("avgfpdir %d\tavgfilesize %d\n",
-	    afs.fs_avgfpdir, afs.fs_avgfilesize);
+	printf("metaspace %jd\tavgfpdir %d\tavgfilesize %d\n",
+	    afs.fs_metaspace, afs.fs_avgfpdir, afs.fs_avgfilesize);
 	printf("flags\t");
 	if (afs.fs_old_flags & FS_FLAGS_UPDATED)
 		fsflags = afs.fs_flags;

Modified: head/sbin/newfs/mkfs.c
==============================================================================
--- head/sbin/newfs/mkfs.c	Fri Mar 22 21:11:17 2013	(r248622)
+++ head/sbin/newfs/mkfs.c	Fri Mar 22 21:45:28 2013	(r248623)
@@ -444,6 +444,12 @@ restart:
 	if (sblock.fs_sbsize > SBLOCKSIZE)
 		sblock.fs_sbsize = SBLOCKSIZE;
 	sblock.fs_minfree = minfree;
+	if (metaspace > 0 && metaspace < sblock.fs_fpg / 2)
+		sblock.fs_metaspace = blknum(&sblock, metaspace);
+	else if (metaspace != -1)
+		/* reserve half of minfree for metadata blocks */
+		sblock.fs_metaspace = blknum(&sblock,
+		    (sblock.fs_fpg * minfree) / 200);
 	if (maxbpg == 0)
 		sblock.fs_maxbpg = MAXBLKPG(sblock.fs_bsize);
 	else

Modified: head/sbin/newfs/newfs.8
==============================================================================
--- head/sbin/newfs/newfs.8	Fri Mar 22 21:11:17 2013	(r248622)
+++ head/sbin/newfs/newfs.8	Fri Mar 22 21:45:28 2013	(r248623)
@@ -50,6 +50,7 @@
 .Op Fl g Ar avgfilesize
 .Op Fl h Ar avgfpdir
 .Op Fl i Ar bytes
+.Op Fl k Ar held-for-metadata-blocks
 .Op Fl m Ar free-space
 .Op Fl o Ar optimization
 .Op Fl p Ar partition
@@ -163,6 +164,17 @@ This flag is implemented by running the
 .Xr tunefs 8
 utility found in the user's
 .Dv $PATH .
+.It Fl k Ar held-for-metadata-blocks
+Set the amount of space to be held for metadata blocks in each cylinder group.
+When set, the file system preference routines will try to save
+the specified amount of space immediately following the inode blocks
+in each cylinder group for use by metadata blocks.
+Clustering the metadata blocks speeds up random file access
+and decreases the running time of
+.Xr fsck 8 .
+By default
+.Xr newfs 8
+sets it to half of the space reserved to minfree.
 .It Fl l
 Enable multilabel MAC on the new file system.
 .It Fl m Ar free-space

Modified: head/sbin/newfs/newfs.c
==============================================================================
--- head/sbin/newfs/newfs.c	Fri Mar 22 21:11:17 2013	(r248622)
+++ head/sbin/newfs/newfs.c	Fri Mar 22 21:45:28 2013	(r248623)
@@ -102,6 +102,7 @@ int	bsize = 0;		/* block size */
 int	maxbsize = 0;		/* maximum clustering */
 int	maxblkspercg = MAXBLKSPERCG; /* maximum blocks per cylinder group */
 int	minfree = MINFREE;	/* free space threshold */
+int	metaspace;		/* space held for metadata blocks */
 int	opt = DEFAULTOPT;	/* optimization preference (space or time) */
 int	density;		/* number of bytes per inode */
 int	maxcontig = 0;		/* max contiguous blocks to allocate */
@@ -141,7 +142,7 @@ main(int argc, char *argv[])
 	part_name = 'c';
 	reserved = 0;
 	while ((ch = getopt(argc, argv,
-	    "EJL:NO:RS:T:UXa:b:c:d:e:f:g:h:i:jlm:no:p:r:s:t")) != -1)
+	    "EJL:NO:RS:T:UXa:b:c:d:e:f:g:h:i:jk:lm:no:p:r:s:t")) != -1)
 		switch (ch) {
 		case 'E':
 			Eflag = 1;
@@ -248,6 +249,13 @@ main(int argc, char *argv[])
 		case 'l':
 			lflag = 1;
 			break;
+		case 'k':
+			if ((metaspace = atoi(optarg)) < 0)
+				errx(1, "%s: bad metadata space %%", optarg);
+			if (metaspace == 0)
+				/* force to stay zero in mkfs */
+				metaspace = -1;
+			break;
 		case 'm':
 			if ((minfree = atoi(optarg)) < 0 || minfree > 99)
 				errx(1, "%s: bad free space %%", optarg);
@@ -501,6 +509,7 @@ usage()
 	fprintf(stderr, "\t-h average files per directory\n");
 	fprintf(stderr, "\t-i number of bytes per inode\n");
 	fprintf(stderr, "\t-j enable soft updates journaling\n");
+	fprintf(stderr, "\t-k space to hold for metadata blocks\n");
 	fprintf(stderr, "\t-l enable multilabel MAC\n");
 	fprintf(stderr, "\t-n do not create .snap directory\n");
 	fprintf(stderr, "\t-m minimum free space %%\n");

Modified: head/sbin/newfs/newfs.h
==============================================================================
--- head/sbin/newfs/newfs.h	Fri Mar 22 21:11:17 2013	(r248622)
+++ head/sbin/newfs/newfs.h	Fri Mar 22 21:45:28 2013	(r248623)
@@ -96,6 +96,7 @@ extern int	bsize;		/* block size */
 extern int	maxbsize;	/* maximum clustering */
 extern int	maxblkspercg;	/* maximum blocks per cylinder group */
 extern int	minfree;	/* free space threshold */
+extern int	metaspace;	/* space held for metadata blocks */
 extern int	opt;		/* optimization preference (space or time) */
 extern int	density;	/* number of bytes per inode */
 extern int	maxcontig;	/* max contiguous blocks to allocate */

Modified: head/sbin/tunefs/tunefs.8
==============================================================================
--- head/sbin/tunefs/tunefs.8	Fri Mar 22 21:11:17 2013	(r248622)
+++ head/sbin/tunefs/tunefs.8	Fri Mar 22 21:45:28 2013	(r248623)
@@ -42,6 +42,7 @@
 .Op Fl f Ar avgfilesize
 .Op Fl j Cm enable | disable
 .Op Fl J Cm enable | disable
+.Op Fl k Ar held-for-metadata-blocks
 .Op Fl L Ar volname
 .Op Fl l Cm enable | disable
 .Op Fl m Ar minfree
@@ -96,6 +97,19 @@ Specify the expected average file size.
 Turn on/off soft updates journaling.
 .It Fl J Cm enable | disable
 Turn on/off gjournal flag.
+.It Fl k Ar held-for-metadata-blocks
+Set the amount of space to be held for metadata blocks.
+When set, the file system preference routines will try to save
+the specified amount of space immediately following the inode blocks
+in each cylinder group for use by metadata blocks.
+Clustering the metadata blocks speeds up random file access
+and decreases the running time of
+.Xr fsck 8 .
+While this option can be set at any time,
+it is most effective if set before any data is loaded into the file system.
+By default
+.Xr newfs 8
+sets it to half of the space reserved to minfree.
 .It Fl L Ar volname
 Add/modify an optional file system volume label.
 .It Fl l Cm enable | disable

Modified: head/sbin/tunefs/tunefs.c
==============================================================================
--- head/sbin/tunefs/tunefs.c	Fri Mar 22 21:11:17 2013	(r248622)
+++ head/sbin/tunefs/tunefs.c	Fri Mar 22 21:45:28 2013	(r248623)
@@ -89,10 +89,9 @@ main(int argc, char *argv[])
 	const char *special, *on;
 	const char *name;
 	int active;
-	int Aflag, aflag, eflag, evalue, fflag, fvalue, jflag, Jflag, Lflag;
-	int lflag, mflag, mvalue, Nflag, nflag, oflag, ovalue, pflag, sflag;
-	int tflag;
-	int svalue, Svalue;
+	int Aflag, aflag, eflag, evalue, fflag, fvalue, jflag, Jflag, kflag;
+	int kvalue, Lflag, lflag, mflag, mvalue, Nflag, nflag, oflag, ovalue;
+	int pflag, sflag, svalue, Svalue, tflag;
 	int ch, found_arg, i;
 	const char *chg[2];
 	struct ufs_args args;
@@ -100,13 +99,13 @@ main(int argc, char *argv[])
 
 	if (argc < 3)
 		usage();
-	Aflag = aflag = eflag = fflag = jflag = Jflag = Lflag = lflag = 0;
-	mflag = Nflag = nflag = oflag = pflag = sflag = tflag = 0;
+	Aflag = aflag = eflag = fflag = jflag = Jflag = kflag = Lflag = 0;
+	lflag = mflag = Nflag = nflag = oflag = pflag = sflag = tflag = 0;
 	avalue = jvalue = Jvalue = Lvalue = lvalue = Nvalue = nvalue = NULL;
 	evalue = fvalue = mvalue = ovalue = svalue = Svalue = 0;
 	active = 0;
 	found_arg = 0;		/* At least one arg is required. */
-	while ((ch = getopt(argc, argv, "Aa:e:f:j:J:L:l:m:N:n:o:ps:S:t:"))
+	while ((ch = getopt(argc, argv, "Aa:e:f:j:J:k:L:l:m:N:n:o:ps:S:t:"))
 	    != -1)
 		switch (ch) {
 
@@ -171,6 +170,14 @@ main(int argc, char *argv[])
 			Jflag = 1;
 			break;
 
+		case 'k':	/* -k: space to hold for metadata blocks */
+			found_arg = 1;
+			name = "space to hold for metadata blocks";
+			kvalue = atoi(optarg);
+			if (kvalue < 0)	/* validate the -k value, not -m's mvalue */
+				errx(10, "bad %s (%s)", name, optarg);
+			kflag = 1;
+			break;
 
 		case 'L':
 			found_arg = 1;
@@ -404,6 +411,22 @@ main(int argc, char *argv[])
 			}
 		}
 	}
+	if (kflag) {
+		name = "space to hold for metadata blocks";
+		if (sblock.fs_metaspace == kvalue)
+			warnx("%s remains unchanged as %d", name, kvalue);
+		else {
+			kvalue = blknum(&sblock, kvalue);
+			if (kvalue > sblock.fs_fpg / 2) {
+				kvalue = blknum(&sblock, sblock.fs_fpg / 2);
+				warnx("%s cannot exceed half the file system "
+				    "space", name);
+			}
+			warnx("%s changes from %jd to %d",
+				    name, sblock.fs_metaspace, kvalue);
+			sblock.fs_metaspace = kvalue;
+		}
+	}
 	if (lflag) {
 		name = "multilabel";
 		if (strcmp(lvalue, "enable") == 0) {
@@ -1064,7 +1087,7 @@ usage(void)
 {
 	fprintf(stderr, "%s\n%s\n%s\n%s\n%s\n%s\n",
 "usage: tunefs [-A] [-a enable | disable] [-e maxbpg] [-f avgfilesize]",
-"              [-J enable | disable] [-j enable | disable]", 
+"              [-J enable | disable] [-j enable | disable] [-k metaspace]",
 "              [-L volname] [-l enable | disable] [-m minfree]",
 "              [-N enable | disable] [-n enable | disable]",
 "              [-o space | time] [-p] [-s avgfpdir] [-t enable | disable]",
@@ -1097,6 +1120,8 @@ printfs(void)
 	      sblock.fs_avgfpdir);
 	warnx("minimum percentage of free space: (-m)             %d%%",
 	      sblock.fs_minfree);
+	warnx("space to hold for metadata blocks: (-k)            %jd",
+	      sblock.fs_metaspace);
 	warnx("optimization preference: (-o)                      %s",
 	      sblock.fs_optim == FS_OPTSPACE ? "space" : "time");
 	if (sblock.fs_minfree >= MINFREE &&

Modified: head/sys/ufs/ffs/ffs_alloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_alloc.c	Fri Mar 22 21:11:17 2013	(r248622)
+++ head/sys/ufs/ffs/ffs_alloc.c	Fri Mar 22 21:45:28 2013	(r248623)
@@ -817,15 +817,6 @@ ffs_reallocblks_ufs2(ap)
 	UFS_LOCK(ump);
 	pref = ffs_blkpref_ufs2(ip, start_lbn, soff, sbap);
 	/*
-	 * Skip a block for the first indirect block.  Indirect blocks are
-	 * usually initially laid out in a good position between the data
-	 * blocks, but block reallocation would usually destroy locality by
-	 * moving them out of the way to make room for data blocks if we
-	 * didn't compensate here.
-	 */
-	if (start_lbn == NDADDR)
-		pref += fs->fs_frag;
-	/*
 	 * Search the block map looking for an allocation of the desired size.
 	 */
 	if ((newblk = ffs_hashalloc(ip, dtog(fs, pref), pref,
@@ -1090,7 +1081,7 @@ ffs_dirpref(pip)
 	struct inode *pip;
 {
 	struct fs *fs;
-	u_int cg, prefcg, dirsize, cgsize;
+	int cg, prefcg, dirsize, cgsize;
 	u_int avgifree, avgbfree, avgndir, curdirsize;
 	u_int minifree, minbfree, maxndir;
 	u_int mincg, minndir;
@@ -1158,6 +1149,22 @@ ffs_dirpref(pip)
 	 * Limit number of dirs in one cg and reserve space for 
 	 * regular files, but only if we have no deficit in
 	 * inodes or space.
+	 *
+	 * We are trying to find a suitable cylinder group nearby
+	 * our preferred cylinder group to place a new directory.
+	 * We scan from our preferred cylinder group forward looking
+	 * for a cylinder group that meets our criterion. If we get
+	 * to the final cylinder group and do not find anything,
+	 * we start scanning backwards from our preferred cylinder
+	 * group. The ideal would be to alternate looking forward
+	 * and backward, but that is just too complex to code for
+	 * the gain it would get. The most likely place where the
+	 * backward scan would take effect is when we start near
+	 * the end of the filesystem and do not find anything from
+	 * where we are to the end. In that case, scanning backward
+	 * will likely find us a suitable cylinder group much closer
+	 * to our desired location than if we were to start scanning
+	 * forward from the beginning of the filesystem.
 	 */
 	prefcg = ino_to_cg(fs, pip->i_number);
 	for (cg = prefcg; cg < fs->fs_ncg; cg++)
@@ -1167,7 +1174,7 @@ ffs_dirpref(pip)
 			if (fs->fs_contigdirs[cg] < maxcontigdirs)
 				return ((ino_t)(fs->fs_ipg * cg));
 		}
-	for (cg = 0; cg < prefcg; cg++)
+	for (cg = prefcg - 1; cg >= 0; cg--)
 		if (fs->fs_cs(fs, cg).cs_ndir < maxndir &&
 		    fs->fs_cs(fs, cg).cs_nifree >= minifree &&
 	    	    fs->fs_cs(fs, cg).cs_nbfree >= minbfree) {
@@ -1180,7 +1187,7 @@ ffs_dirpref(pip)
 	for (cg = prefcg; cg < fs->fs_ncg; cg++)
 		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
 			return ((ino_t)(fs->fs_ipg * cg));
-	for (cg = 0; cg < prefcg; cg++)
+	for (cg = prefcg - 1; cg >= 0; cg--)
 		if (fs->fs_cs(fs, cg).cs_nifree >= avgifree)
 			break;
 	return ((ino_t)(fs->fs_ipg * cg));
@@ -1193,9 +1200,15 @@ ffs_dirpref(pip)
  *
  * If no blocks have been allocated in the first section, the policy is to
  * request a block in the same cylinder group as the inode that describes
- * the file. If no blocks have been allocated in any other section, the
- * policy is to place the section in a cylinder group with a greater than
- * average number of free blocks.  An appropriate cylinder group is found
+ * the file. The first indirect is allocated immediately following the last
+ * direct block and the data blocks for the first indirect immediately
+ * follow it.
+ *
+ * If no blocks have been allocated in any other section, the indirect 
+ * block(s) are allocated in the same cylinder group as its inode in an
+ * area reserved immediately following the inode blocks. The policy for
+ * the data blocks is to place them in a cylinder group with a greater than
+ * average number of free blocks. An appropriate cylinder group is found
  * by using a rotor that sweeps the cylinder groups. When a new group of
  * blocks is needed, the sweep begins in the cylinder group following the
  * cylinder group from which the previous allocation was made. The sweep
@@ -1218,39 +1231,78 @@ ffs_blkpref_ufs1(ip, lbn, indx, bap)
 	ufs1_daddr_t *bap;
 {
 	struct fs *fs;
-	u_int cg;
+	u_int cg, inocg;
 	u_int avgbfree, startcg;
 	ufs2_daddr_t pref;
 
+	KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
 	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
 	fs = ip->i_fs;
 	/*
-	 * If we are allocating the first indirect block, try to place it
-	 * immediately following the last direct block.
-	 *
+	 * Allocation of indirect blocks is indicated by passing negative
+	 * values in indx: -1 for single indirect, -2 for double indirect,
+	 * -3 for triple indirect. As noted below, we attempt to allocate
+	 * the first indirect inline with the file data. For all later
+	 * indirect blocks, the data is often allocated in other cylinder
+	 * groups. However to speed random file access and to speed up
+	 * fsck, the filesystem reserves the first fs_metaspace blocks
+	 * (typically half of fs_minfree) of the data area of each cylinder
+	 * group to hold these later indirect blocks.
+	 */
+	inocg = ino_to_cg(fs, ip->i_number);
+	if (indx < 0) {
+		/*
+		 * Our preference for indirect blocks is the zone at the
+		 * beginning of the inode's cylinder group data area that
+		 * we try to reserve for indirect blocks.
+		 */
+		pref = cgmeta(fs, inocg);
+		/*
+		 * If we are allocating the first indirect block, try to
+		 * place it immediately following the last direct block.
+		 */
+		if (indx == -1 && lbn < NDADDR + NINDIR(fs) &&
+		    ip->i_din1->di_db[NDADDR - 1] != 0)
+			pref = ip->i_din1->di_db[NDADDR - 1] + fs->fs_frag;
+		return (pref);
+	}
+	/*
 	 * If we are allocating the first data block in the first indirect
-	 * block, try to place it immediately following the indirect block.
+	 * block and the indirect has been allocated in the data block area,
+	 * try to place it immediately following the indirect block.
 	 */
 	if (lbn == NDADDR) {
-		pref = ip->i_din1->di_db[NDADDR - 1];
-		if (bap == NULL && pref != 0)
-			return (pref + fs->fs_frag);
 		pref = ip->i_din1->di_ib[0];
-		if (pref != 0)
+		if (pref != 0 && pref >= cgdata(fs, inocg) &&
+		    pref < cgbase(fs, inocg + 1))
 			return (pref + fs->fs_frag);
 	}
+	/*
+	 * If we are at the beginning of a file, or we have already allocated
+	 * the maximum number of blocks per cylinder group, or we do not
+	 * have a block allocated immediately preceding us, then we need
+	 * to decide where to start allocating new blocks.
+	 */
 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
-		if (lbn < NDADDR + NINDIR(fs)) {
-			cg = ino_to_cg(fs, ip->i_number);
-			return (cgbase(fs, cg) + fs->fs_frag);
-		}
+		/*
+		 * If we are allocating a directory data block, we want
+		 * to place it in the metadata area.
+		 */
+		if ((ip->i_mode & IFMT) == IFDIR)
+			return (cgmeta(fs, inocg));
+		/*
+		 * Until we fill all the direct and all the first indirect's
+		 * blocks, we try to allocate in the data area of the inode's
+		 * cylinder group.
+		 */
+		if (lbn < NDADDR + NINDIR(fs))
+			return (cgdata(fs, inocg));
 		/*
 		 * Find a cylinder with greater than average number of
 		 * unused data blocks.
 		 */
 		if (indx == 0 || bap[indx - 1] == 0)
-			startcg =
-			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
+			startcg = inocg + lbn / fs->fs_maxbpg;
 		else
 			startcg = dtog(fs, bap[indx - 1]) + 1;
 		startcg %= fs->fs_ncg;
@@ -1258,17 +1310,17 @@ ffs_blkpref_ufs1(ip, lbn, indx, bap)
 		for (cg = startcg; cg < fs->fs_ncg; cg++)
 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 				fs->fs_cgrotor = cg;
-				return (cgbase(fs, cg) + fs->fs_frag);
+				return (cgdata(fs, cg));
 			}
 		for (cg = 0; cg <= startcg; cg++)
 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 				fs->fs_cgrotor = cg;
-				return (cgbase(fs, cg) + fs->fs_frag);
+				return (cgdata(fs, cg));
 			}
 		return (0);
 	}
 	/*
-	 * We just always try to lay things out contiguously.
+	 * Otherwise, we just always try to lay things out contiguously.
 	 */
 	return (bap[indx - 1] + fs->fs_frag);
 }
@@ -1284,39 +1336,78 @@ ffs_blkpref_ufs2(ip, lbn, indx, bap)
 	ufs2_daddr_t *bap;
 {
 	struct fs *fs;
-	u_int cg;
+	u_int cg, inocg;
 	u_int avgbfree, startcg;
 	ufs2_daddr_t pref;
 
+	KASSERT(indx <= 0 || bap != NULL, ("need non-NULL bap"));
 	mtx_assert(UFS_MTX(ip->i_ump), MA_OWNED);
 	fs = ip->i_fs;
 	/*
-	 * If we are allocating the first indirect block, try to place it
-	 * immediately following the last direct block.
-	 *
+	 * Allocation of indirect blocks is indicated by passing negative
+	 * values in indx: -1 for single indirect, -2 for double indirect,
+	 * -3 for triple indirect. As noted below, we attempt to allocate
+	 * the first indirect inline with the file data. For all later
+	 * indirect blocks, the data is often allocated in other cylinder
+	 * groups. However to speed random file access and to speed up
+	 * fsck, the filesystem reserves the first fs_metaspace blocks
+	 * (typically half of fs_minfree) of the data area of each cylinder
+	 * group to hold these later indirect blocks.
+	 */
+	inocg = ino_to_cg(fs, ip->i_number);
+	if (indx < 0) {
+		/*
+		 * Our preference for indirect blocks is the zone at the
+		 * beginning of the inode's cylinder group data area that
+		 * we try to reserve for indirect blocks.
+		 */
+		pref = cgmeta(fs, inocg);
+		/*
+		 * If we are allocating the first indirect block, try to
+		 * place it immediately following the last direct block.
+		 */
+		if (indx == -1 && lbn < NDADDR + NINDIR(fs) &&
+		    ip->i_din2->di_db[NDADDR - 1] != 0)
+			pref = ip->i_din2->di_db[NDADDR - 1] + fs->fs_frag;
+		return (pref);
+	}
+	/*
 	 * If we are allocating the first data block in the first indirect
-	 * block, try to place it immediately following the indirect block.
+	 * block and the indirect has been allocated in the data block area,
+	 * try to place it immediately following the indirect block.
 	 */
 	if (lbn == NDADDR) {
-		pref = ip->i_din1->di_db[NDADDR - 1];
-		if (bap == NULL && pref != 0)
-			return (pref + fs->fs_frag);
-		pref = ip->i_din1->di_ib[0];
-		if (pref != 0)
+		pref = ip->i_din2->di_ib[0];
+		if (pref != 0 && pref >= cgdata(fs, inocg) &&
+		    pref < cgbase(fs, inocg + 1))
 			return (pref + fs->fs_frag);
 	}
+	/*
+	 * If we are at the beginning of a file, or we have already allocated
+	 * the maximum number of blocks per cylinder group, or we do not
+	 * have a block allocated immediately preceding us, then we need
+	 * to decide where to start allocating new blocks.
+	 */
 	if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
-		if (lbn < NDADDR + NINDIR(fs)) {
-			cg = ino_to_cg(fs, ip->i_number);
-			return (cgbase(fs, cg) + fs->fs_frag);
-		}
+		/*
+		 * If we are allocating a directory data block, we want
+		 * to place it in the metadata area.
+		 */
+		if ((ip->i_mode & IFMT) == IFDIR)
+			return (cgmeta(fs, inocg));
+		/*
+		 * Until we fill all the direct and all the first indirect's
+		 * blocks, we try to allocate in the data area of the inode's
+		 * cylinder group.
+		 */
+		if (lbn < NDADDR + NINDIR(fs))
+			return (cgdata(fs, inocg));
 		/*
 		 * Find a cylinder with greater than average number of
 		 * unused data blocks.
 		 */
 		if (indx == 0 || bap[indx - 1] == 0)
-			startcg =
-			    ino_to_cg(fs, ip->i_number) + lbn / fs->fs_maxbpg;
+			startcg = inocg + lbn / fs->fs_maxbpg;
 		else
 			startcg = dtog(fs, bap[indx - 1]) + 1;
 		startcg %= fs->fs_ncg;
@@ -1324,17 +1415,17 @@ ffs_blkpref_ufs2(ip, lbn, indx, bap)
 		for (cg = startcg; cg < fs->fs_ncg; cg++)
 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 				fs->fs_cgrotor = cg;
-				return (cgbase(fs, cg) + fs->fs_frag);
+				return (cgdata(fs, cg));
 			}
 		for (cg = 0; cg <= startcg; cg++)
 			if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
 				fs->fs_cgrotor = cg;
-				return (cgbase(fs, cg) + fs->fs_frag);
+				return (cgdata(fs, cg));
 			}
 		return (0);
 	}
 	/*
-	 * We just always try to lay things out contiguously.
+	 * Otherwise, we just always try to lay things out contiguously.
 	 */
 	return (bap[indx - 1] + fs->fs_frag);
 }
@@ -1611,31 +1702,37 @@ ffs_alloccgblk(ip, bp, bpref, size)
 	ufs1_daddr_t bno;
 	ufs2_daddr_t blkno;
 	u_int8_t *blksfree;
-	int i;
+	int i, cgbpref;
 
 	fs = ip->i_fs;
 	ump = ip->i_ump;
 	mtx_assert(UFS_MTX(ump), MA_OWNED);
 	cgp = (struct cg *)bp->b_data;
 	blksfree = cg_blksfree(cgp);
-	if (bpref == 0 || dtog(fs, bpref) != cgp->cg_cgx) {
+	if (bpref == 0) {
 		bpref = cgp->cg_rotor;
-	} else {
-		bpref = blknum(fs, bpref);
-		bno = dtogd(fs, bpref);
-		/*
-		 * if the requested block is available, use it
-		 */
-		if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
-			goto gotit;
+	} else if ((cgbpref = dtog(fs, bpref)) != cgp->cg_cgx) {
+		/* map bpref to correct zone in this cg */
+		if (bpref < cgdata(fs, cgbpref))
+			bpref = cgmeta(fs, cgp->cg_cgx);
+		else
+			bpref = cgdata(fs, cgp->cg_cgx);
 	}
 	/*
+	 * if the requested block is available, use it
+	 */
+	bno = dtogd(fs, blknum(fs, bpref));
+	if (ffs_isblock(fs, blksfree, fragstoblks(fs, bno)))
+		goto gotit;
+	/*
 	 * Take the next available block in this cylinder group.
 	 */
 	bno = ffs_mapsearch(fs, cgp, bpref, (int)fs->fs_frag);
 	if (bno < 0)
 		return (0);
-	cgp->cg_rotor = bno;
+	/* Update cg_rotor only if allocated from the data zone */
+	if (bno >= dtogd(fs, cgdata(fs, cgp->cg_cgx)))
+		cgp->cg_rotor = bno;
 gotit:
 	blkno = fragstoblks(fs, bno);
 	ffs_clrblock(fs, blksfree, (long)blkno);
@@ -1742,9 +1839,10 @@ ffs_clusteralloc(ip, cg, bpref, len, unu
 	 * be recalled to try an allocation in the next cylinder group.
 	 */
 	if (dtog(fs, bpref) != cg)
-		bpref = 0;
+		bpref = cgdata(fs, cg);
 	else
-		bpref = fragstoblks(fs, dtogd(fs, blknum(fs, bpref)));
+		bpref = blknum(fs, bpref);
+	bpref = fragstoblks(fs, dtogd(fs, bpref));
 	mapp = &cg_clustersfree(cgp)[bpref / NBBY];
 	map = *mapp++;
 	bit = 1 << (bpref % NBBY);

Modified: head/sys/ufs/ffs/ffs_balloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_balloc.c	Fri Mar 22 21:11:17 2013	(r248622)
+++ head/sys/ufs/ffs/ffs_balloc.c	Fri Mar 22 21:45:28 2013	(r248623)
@@ -246,7 +246,8 @@ ffs_balloc_ufs1(struct vnode *vp, off_t 
 	lbns_remfree = lbns;
 	if (nb == 0) {
 		UFS_LOCK(ump);
-		pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
+		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
+		    (ufs1_daddr_t *)0);
 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
 		    flags, cred, &newb)) != 0) {
 			curthread_pflags_restore(saved_inbdflush);
@@ -299,7 +300,8 @@ retry:
 		}
 		UFS_LOCK(ump);
 		if (pref == 0)
-			pref = ffs_blkpref_ufs1(ip, lbn, 0, (ufs1_daddr_t *)0);
+			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
+			    (ufs1_daddr_t *)0);
 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
 		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
 			brelse(bp);
@@ -794,7 +796,8 @@ ffs_balloc_ufs2(struct vnode *vp, off_t 
 	lbns_remfree = lbns;
 	if (nb == 0) {
 		UFS_LOCK(ump);
-		pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
+		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
+		    (ufs2_daddr_t *)0);
 	        if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
 		    flags, cred, &newb)) != 0) {
 			curthread_pflags_restore(saved_inbdflush);
@@ -848,7 +851,8 @@ retry:
 		}
 		UFS_LOCK(ump);
 		if (pref == 0)
-			pref = ffs_blkpref_ufs2(ip, lbn, 0, (ufs2_daddr_t *)0);
+			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
+			    (ufs2_daddr_t *)0);
 		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
 		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
 			brelse(bp);

Modified: head/sys/ufs/ffs/fs.h
==============================================================================
--- head/sys/ufs/ffs/fs.h	Fri Mar 22 21:11:17 2013	(r248622)
+++ head/sys/ufs/ffs/fs.h	Fri Mar 22 21:45:28 2013	(r248623)
@@ -333,7 +333,8 @@ struct fs {
 	int32_t	 fs_maxbsize;		/* maximum blocking factor permitted */
 	int64_t	 fs_unrefs;		/* number of unreferenced inodes */
 	int64_t  fs_providersize;	/* size of underlying GEOM provider */
-	int64_t	 fs_sparecon64[15];	/* old rotation block list head */
+	int64_t	 fs_metaspace;		/* size of area reserved for metadata */
+	int64_t	 fs_sparecon64[14];	/* old rotation block list head */
 	int64_t	 fs_sblockloc;		/* byte offset of standard superblock */
 	struct	csum_total fs_cstotal;	/* (u) cylinder summary information */
 	ufs_time_t fs_time;		/* last time written */
@@ -525,6 +526,8 @@ struct cg {
  * They calc filesystem addresses of cylinder group data structures.
  */
 #define	cgbase(fs, c)	(((ufs2_daddr_t)(fs)->fs_fpg) * (c))
+#define	cgdata(fs, c)	(cgdmin(fs, c) + (fs)->fs_metaspace)	/* data zone */
+#define	cgmeta(fs, c)	(cgdmin(fs, c))				/* meta data */
 #define	cgdmin(fs, c)	(cgstart(fs, c) + (fs)->fs_dblkno)	/* 1st data */
 #define	cgimin(fs, c)	(cgstart(fs, c) + (fs)->fs_iblkno)	/* inode blk */
 #define	cgsblock(fs, c)	(cgstart(fs, c) + (fs)->fs_sblkno)	/* super blk */


More information about the svn-src-all mailing list