svn commit: r202990 - in projects/suj/head/sbin: fsck_ffs fsdb

Jeff Roberson jeff at FreeBSD.org
Mon Jan 25 23:27:22 UTC 2010


Author: jeff
Date: Mon Jan 25 23:27:21 2010
New Revision: 202990
URL: http://svn.freebsd.org/changeset/base/202990

Log:
   - Implement partial truncation in the checker.  The file is truncated to
     the least of the original size, the last allocated block, and the
     intended truncation size, depending on how far the original truncation
     got.
   - Improve recovery performance by doing duplication and reference move
     operations for each inode all at once.  The algorithm is still N^2 but
     doing it as records were discovered ensured that the rest were out of
     cache when we searched.  This reduced fsck time by 80%.
   - Verify that .. matches the expected parent before deciding to skip
     a parent's record.
   - Add some more information to fsdb.

Modified:
  projects/suj/head/sbin/fsck_ffs/fsck.h
  projects/suj/head/sbin/fsck_ffs/main.c
  projects/suj/head/sbin/fsck_ffs/suj.c
  projects/suj/head/sbin/fsdb/fsdb.c

Modified: projects/suj/head/sbin/fsck_ffs/fsck.h
==============================================================================
--- projects/suj/head/sbin/fsck_ffs/fsck.h	Mon Jan 25 23:24:25 2010	(r202989)
+++ projects/suj/head/sbin/fsck_ffs/fsck.h	Mon Jan 25 23:27:21 2010	(r202990)
@@ -388,4 +388,4 @@ void		sblock_init(void);
 void		setinodebuf(ino_t);
 int		setup(char *dev);
 void		gjournal_check(const char *filesys);
-void		suj_check(const char *filesys);
+int		suj_check(const char *filesys);

Modified: projects/suj/head/sbin/fsck_ffs/main.c
==============================================================================
--- projects/suj/head/sbin/fsck_ffs/main.c	Mon Jan 25 23:24:25 2010	(r202989)
+++ projects/suj/head/sbin/fsck_ffs/main.c	Mon Jan 25 23:27:21 2010	(r202990)
@@ -242,8 +242,9 @@ checkfilesys(char *filesys)
 		if ((fsreadfd = open(filesys, O_RDONLY)) < 0 || readsb(0) == 0)
 			exit(3);	/* Cannot read superblock */
 		close(fsreadfd);
-		if (sblock.fs_flags & FS_NEEDSFSCK)
-			exit(4);	/* Earlier background failed */
+		/* Earlier background failed or journaled */
+		if (sblock.fs_flags & (FS_NEEDSFSCK | FS_SUJ))
+			exit(4);
 		if ((sblock.fs_flags & FS_DOSOFTDEP) == 0)
 			exit(5);	/* Not running soft updates */
 		size = MIBSIZE;
@@ -256,7 +257,7 @@ checkfilesys(char *filesys)
 	}
 	if (ckclean && skipclean) {
 		/*
-		 * If file system is gjournaled or su+j, check it here.
+		 * If file system is gjournaled, check it here.
 		 */
 		if ((fsreadfd = open(filesys, O_RDONLY)) < 0 || readsb(0) == 0)
 			exit(3);	/* Cannot read superblock */
@@ -278,18 +279,6 @@ checkfilesys(char *filesys)
 				    "CANNOT RUN FAST FSCK\n");
 			}
 		}
-#if 0
-		if ((sblock.fs_flags & FS_SUJ) != 0) {
-			if (sblock.fs_clean == 1) {
-				pwarn("FILE SYSTEM CLEAN; SKIPPING CHECKS\n");
-				exit(0);
-			}
-			suj_check(filesys);
-			if (chkdoreload(mntp) == 0)
-				exit(0);
-			exit(4);
-		}
-#endif
 	}
 	/*
 	 * If we are to do a background check:
@@ -396,6 +385,26 @@ checkfilesys(char *filesys)
 		    sblock.fs_cstotal.cs_nffree * 100.0 / sblock.fs_dsize);
 		return (0);
 	}
+	/*
+	 * Determine if we can and should do journal recovery.
+	 */
+	if ((sblock.fs_flags & (FS_SUJ | FS_NEEDSFSCK)) == FS_SUJ) {
+		if (preen || reply("USE JOURNAL?")) {
+			if (suj_check(filesys) == 0) {
+				if (chkdoreload(mntp) == 0)
+					exit(0);
+				exit(4);
+			}
+			/* suj_check failed, fall through. */
+		}
+		printf("** Skipping journal, falling through to full fsck\n");
+		/*
+		 * Write the superblock so we don't try to recover the
+		 * journal on another pass.
+		 */
+		sblock.fs_mtime = time(NULL);
+		sbdirty();
+	}
 	
 	/*
 	 * Cleared if any questions answered no. Used to decide if
@@ -493,7 +502,6 @@ checkfilesys(char *filesys)
 	inocleanup();
 	if (fsmodified) {
 		sblock.fs_time = time(NULL);
-		sblock.fs_mtime = time(NULL);
 		sbdirty();
 	}
 	if (cvtlevel && sblk.b_dirty) {

Modified: projects/suj/head/sbin/fsck_ffs/suj.c
==============================================================================
--- projects/suj/head/sbin/fsck_ffs/suj.c	Mon Jan 25 23:24:25 2010	(r202989)
+++ projects/suj/head/sbin/fsck_ffs/suj.c	Mon Jan 25 23:27:21 2010	(r202990)
@@ -49,7 +49,8 @@ __FBSDID("$FreeBSD$");
 
 static void	ino_decr(ino_t);
 
-#define	SUJ_HASHSIZE	128
+#define	DOTDOT_OFFSET	DIRECTSIZ(1)
+#define	SUJ_HASHSIZE	2048
 #define	SUJ_HASHMASK	(SUJ_HASHSIZE - 1)
 #define	SUJ_HASH(x)	((x * 2654435761) & SUJ_HASHMASK)
 
@@ -68,7 +69,9 @@ TAILQ_HEAD(srechd, suj_rec);
 struct suj_ino {
 	LIST_ENTRY(suj_ino)	si_next;
 	struct srechd		si_recs;
+	struct srechd		si_newrecs;
 	struct srechd		si_movs;
+	struct jtrncrec		*si_trunc;
 	ino_t			si_ino;
 	int			si_nlinkadj;
 	int			si_skipparent;
@@ -90,6 +93,7 @@ struct data_blk {
 	uint8_t			*db_buf;
 	ufs2_daddr_t		db_blk;
 	int			db_size;
+	int			db_dirty;
 };
 
 struct ino_blk {
@@ -106,6 +110,8 @@ struct suj_cg {
 	struct inohd		sc_inohash[SUJ_HASHSIZE];
 	struct iblkhd		sc_iblkhash[SUJ_HASHSIZE];
 	struct ino_blk		*sc_lastiblk;
+	struct suj_ino		*sc_lastino;
+	struct suj_blk		*sc_lastblk;
 	uint8_t			*sc_cgbuf;
 	struct cg		*sc_cgp;
 	int			sc_dirty;
@@ -114,6 +120,8 @@ struct suj_cg {
 
 LIST_HEAD(cghd, suj_cg) cghash[SUJ_HASHSIZE];
 LIST_HEAD(dblkhd, data_blk) dbhash[SUJ_HASHSIZE];
+struct suj_cg *lastcg;
+struct data_blk *lastblk;
 
 TAILQ_HEAD(seghd, suj_seg) allsegs;
 uint64_t oldseq;
@@ -131,6 +139,8 @@ uint64_t jbytes;
 uint64_t jrecs;
 
 typedef void (*ino_visitor)(ino_t, ufs_lbn_t, ufs2_daddr_t, int);
+static void ino_trunc(ino_t ino, off_t size);
+static void ino_build(struct suj_ino *sino);
 
 static void *
 errmalloc(size_t n)
@@ -159,12 +169,6 @@ opendisk(const char *devnam)
 		    disk->d_error);
 	}
 	fs = &disk->d_fs;
-	/*
-	 * Setup a few things so reply() can work.
-	 */
-	bcopy(fs, &sblock, sizeof(sblock));
-	fsreadfd = disk->d_fd;
-	fswritefd = disk->d_fd;
 }
 
 /*
@@ -198,8 +202,6 @@ closedisk(const char *devnam)
 	free(disk);
 	disk = NULL;
 	fs = NULL;
-	fsreadfd = -1;
-	fswritefd = -1;
 }
 
 /*
@@ -216,10 +218,14 @@ cg_lookup(int cgx)
 		abort();
 		errx(1, "Bad cg number %d", cgx);
 	}
+	if (lastcg && lastcg->sc_cgx == cgx)
+		return (lastcg);
 	hd = &cghash[SUJ_HASH(cgx)];
 	LIST_FOREACH(sc, hd, sc_next)
-		if (sc->sc_cgx == cgx)
+		if (sc->sc_cgx == cgx) {
+			lastcg = sc;
 			return (sc);
+		}
 	sc = errmalloc(sizeof(*sc));
 	bzero(sc, sizeof(*sc));
 	sc->sc_cgbuf = errmalloc(fs->fs_bsize);
@@ -245,6 +251,8 @@ ino_lookup(ino_t ino, int creat)
 	struct suj_cg *sc;
 
 	sc = cg_lookup(ino_to_cg(fs, ino));
+	if (sc->sc_lastino && sc->sc_lastino->si_ino == ino)
+		return (sc->sc_lastino);
 	hd = &sc->sc_inohash[SUJ_HASH(ino)];
 	LIST_FOREACH(sino, hd, si_next)
 		if (sino->si_ino == ino)
@@ -256,6 +264,7 @@ ino_lookup(ino_t ino, int creat)
 	sino->si_ino = ino;
 	sino->si_nlinkadj = 0;
 	TAILQ_INIT(&sino->si_recs);
+	TAILQ_INIT(&sino->si_newrecs);
 	TAILQ_INIT(&sino->si_movs);
 	LIST_INSERT_HEAD(hd, sino, si_next);
 
@@ -274,7 +283,9 @@ blk_lookup(ufs2_daddr_t blk, int creat)
 	struct blkhd *hd;
 
 	sc = cg_lookup(dtog(fs, blk));
-	hd = &sc->sc_blkhash[SUJ_HASH(blk)];
+	if (sc->sc_lastblk && sc->sc_lastblk->sb_blk == blk)
+		return (sc->sc_lastblk);
+	hd = &sc->sc_blkhash[SUJ_HASH(fragstoblks(fs, blk))];
 	LIST_FOREACH(sblk, hd, sb_next)
 		if (sblk->sb_blk == blk)
 			return (sblk);
@@ -289,16 +300,18 @@ blk_lookup(ufs2_daddr_t blk, int creat)
 	return (sblk);
 }
 
-static uint8_t *
-dblk_read(ufs2_daddr_t blk, int size)
+static struct data_blk *
+dblk_lookup(ufs2_daddr_t blk)
 {
 	struct data_blk *dblk;
 	struct dblkhd *hd;
 
-	hd = &dbhash[SUJ_HASH(blk)];
+	hd = &dbhash[SUJ_HASH(fragstoblks(fs, blk))];
+	if (lastblk && lastblk->db_blk == blk)
+		return (lastblk);
 	LIST_FOREACH(dblk, hd, db_next)
 		if (dblk->db_blk == blk)
-			goto found;
+			return (dblk);
 	/*
 	 * The inode block wasn't located, allocate a new one.
 	 */
@@ -306,7 +319,15 @@ dblk_read(ufs2_daddr_t blk, int size)
 	bzero(dblk, sizeof(*dblk));
 	LIST_INSERT_HEAD(hd, dblk, db_next);
 	dblk->db_blk = blk;
-found:
+	return (dblk);
+}
+
+static uint8_t *
+dblk_read(ufs2_daddr_t blk, int size)
+{
+	struct data_blk *dblk;
+
+	dblk = dblk_lookup(blk);
 	/*
 	 * I doubt size mismatches can happen in practice but it is trivial
 	 * to handle.
@@ -322,6 +343,33 @@ found:
 	return (dblk->db_buf);
 }
 
+static void
+dblk_dirty(ufs2_daddr_t blk)
+{
+	struct data_blk *dblk;
+
+	dblk = dblk_lookup(blk);
+	dblk->db_dirty = 1;
+}
+
+static void
+dblk_write(void)
+{
+	struct data_blk *dblk;
+	int i;
+
+	for (i = 0; i < SUJ_HASHSIZE; i++) {
+		LIST_FOREACH(dblk, &dbhash[i], db_next) {
+			if (dblk->db_dirty == 0 || dblk->db_size == 0)
+				continue;
+			if (bwrite(disk, fsbtodb(fs, dblk->db_blk),
+			    dblk->db_buf, dblk->db_size) == -1)
+				err(1, "Unable to write block %jd",
+				    dblk->db_blk);
+		}
+	}
+}
+
 static union dinode *
 ino_read(ino_t ino)
 {
@@ -333,7 +381,10 @@ ino_read(ino_t ino)
 
 	blk = ino_to_fsba(fs, ino);
 	sc = cg_lookup(ino_to_cg(fs, ino));
-	hd = &sc->sc_iblkhash[SUJ_HASH(blk)];
+	iblk = sc->sc_lastiblk;
+	if (iblk && iblk->ib_blk == blk)
+		goto found;
+	hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))];
 	LIST_FOREACH(iblk, hd, ib_next)
 		if (iblk->ib_blk == blk)
 			goto found;
@@ -371,7 +422,7 @@ ino_dirty(ino_t ino)
 		iblk->ib_dirty = 1;
 		return;
 	}
-	hd = &sc->sc_iblkhash[SUJ_HASH(blk)];
+	hd = &sc->sc_iblkhash[SUJ_HASH(fragstoblks(fs, blk))];
 	LIST_FOREACH(iblk, hd, ib_next) {
 		if (iblk->ib_blk == blk) {
 			iblk->ib_dirty = 1;
@@ -612,22 +663,22 @@ blk_free(ufs2_daddr_t bno, int mask, int
  * to fetch a specific block.
  */
 static ufs2_daddr_t
-indir_blkatoff(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t cur, ufs_lbn_t lbn, int level)
+indir_blkatoff(ufs2_daddr_t blk, ino_t ino, ufs_lbn_t cur, ufs_lbn_t lbn)
 {
 	ufs2_daddr_t *bap2;
 	ufs2_daddr_t *bap1;
 	ufs_lbn_t lbnadd;
 	ufs_lbn_t base;
+	int level;
 	int i;
 
 	if (blk == 0)
 		return (0);
-	if (cur == lbn)
-		return (blk);
-	if (level == 0 && lbn < 0) {
-		abort();
+	level = lbn_level(cur);
+	if (level == -1)
+		errx(1, "Invalid indir lbn %jd", lbn);
+	if (level == 0 && lbn < 0)
 		errx(1, "Invalid lbn %jd", lbn);
-	}
 	bap2 = (void *)dblk_read(blk, fs->fs_bsize);
 	bap1 = (void *)bap2;
 	lbnadd = 1;
@@ -638,11 +689,9 @@ indir_blkatoff(ufs2_daddr_t blk, ino_t i
 		i = (lbn - base) / lbnadd;
 	else
 		i = (-lbn - base) / lbnadd;
-	if (i < 0 || i >= NINDIR(fs)) {
-		abort();
+	if (i < 0 || i >= NINDIR(fs))
 		errx(1, "Invalid indirect index %d produced by lbn %jd",
 		    i, lbn);
-	}
 	if (level == 0)
 		cur = base + (i * lbnadd);
 	else
@@ -657,7 +706,7 @@ indir_blkatoff(ufs2_daddr_t blk, ino_t i
 		abort();
 		errx(1, "Invalid lbn %jd at level 0", lbn);
 	}
-	return indir_blkatoff(blk, ino, cur, lbn, level - 1);
+	return indir_blkatoff(blk, ino, cur, lbn);
 }
 
 /*
@@ -685,14 +734,10 @@ ino_blkatoff(union dinode *ip, ino_t ino
 		return (ip->dp2.di_extb[lbn]);
 	}
 	/*
-	 * And now direct and indirect.  Verify that the lbn does not
-	 * exceed the size required to store the file by asking for
-	 * the lbn of the last byte.  These blocks should be 0 anyway
-	 * so this simply saves the traversal.
+	 * Now direct and indirect.
 	 */
-	if (lbn > 0 && lbn > lblkno(fs, DIP(ip, di_size) - 1))
-		return (0);
-	if (lbn < 0 && -lbn > lblkno(fs, DIP(ip, di_size) - 1))
+	if (DIP(ip, di_mode) == IFLNK &&
+	    DIP(ip, di_size) < fs->fs_maxsymlinklen)
 		return (0);
 	if (lbn >= 0 && lbn < NDADDR) {
 		*frags = numfrags(fs, sblksize(fs, DIP(ip, di_size), lbn));
@@ -703,7 +748,7 @@ ino_blkatoff(union dinode *ip, ino_t ino
 	for (i = 0, tmpval = NINDIR(fs), cur = NDADDR; i < NIADDR; i++,
 	    tmpval *= NINDIR(fs), cur = next) {
 		next = cur + tmpval;
-		if (lbn == -cur)
+		if (lbn == -cur - i)
 			return (DIP(ip, di_ib[i]));
 		/*
 		 * Determine whether the lbn in question is within this tree.
@@ -712,8 +757,7 @@ ino_blkatoff(union dinode *ip, ino_t ino
 			continue;
 		if (lbn > 0 && lbn >= next)
 			continue;
-
-		return indir_blkatoff(DIP(ip, di_ib[i]), ino, -cur - i, lbn, i);
+		return indir_blkatoff(DIP(ip, di_ib[i]), ino, -cur - i, lbn);
 	}
 	errx(1, "lbn %jd not in ino", lbn);
 }
@@ -760,7 +804,10 @@ ino_isat(ino_t parent, off_t diroff, ino
 	*mode = DIP(dip, di_mode);
 	if ((*mode & IFMT) != IFDIR) {
 		if (debug) {
-			/* This can happen if the parent inode was reallocated. */
+			/*
+			 * This can happen if the parent inode
+			 * was reallocated.
+			 */
 			if (*mode != 0)
 				printf("Directory %d has bad mode %o\n",
 				    parent, *mode);
@@ -791,7 +838,7 @@ ino_isat(ino_t parent, off_t diroff, ino
 	 * certain we hit a valid record and not some junk in the middle
 	 * of a file name.  Stop when we reach or pass the expected offset.
 	 */
-	dpoff = 0;
+	dpoff = (doff / DIRBLKSIZ) * DIRBLKSIZ;
 	do {
 		dp = (struct direct *)&block[dpoff];
 		if (dpoff == doff)
@@ -801,7 +848,7 @@ ino_isat(ino_t parent, off_t diroff, ino
 		dpoff += dp->d_reclen;
 	} while (dpoff <= doff);
 	if (dpoff > fs->fs_bsize)
-		errx(1, "Corrupt directory block in dir inode %d", parent);
+		errx(1, "Corrupt directory block in dir ino %d", parent);
 	/* Not found. */
 	if (dpoff != doff) {
 		if (debug)
@@ -830,6 +877,7 @@ ino_isat(ino_t parent, off_t diroff, ino
 
 #define	VISIT_INDIR	0x0001
 #define	VISIT_EXT	0x0002
+#define	VISIT_ROOT	0x0004	/* Operation came via root & valid pointers. */
 
 /*
  * Read an indirect level which may or may not be linked into an inode.
@@ -854,16 +902,14 @@ indir_visit(ino_t ino, ufs_lbn_t lbn, uf
 	 */
 	if (blk == 0)
 		return;
-	if (blk_isindir(blk, ino, lbn) == 0) {
-		if (debug)
-			printf("blk %jd ino %d lbn %jd is not indir.\n",
-			    blk, ino, lbn);
-		goto out;
-	}
 	level = lbn_level(lbn);
-	if (level == -1) {
-		abort();
+	if (level == -1)
 		errx(1, "Invalid level for lbn %jd", lbn);
+	if ((flags & VISIT_ROOT) == 0 && blk_isindir(blk, ino, lbn) == 0) {
+		if (debug)
+			printf("blk %jd ino %d lbn %jd(%d) is not indir.\n",
+			    blk, ino, lbn, level);
+		goto out;
 	}
 	lbnadd = 1;
 	for (i = level; i > 0; i--)
@@ -903,6 +949,7 @@ out:
 static uint64_t
 ino_visit(union dinode *ip, ino_t ino, ino_visitor visitor, int flags)
 {
+	ufs_lbn_t nextlbn;
 	ufs_lbn_t tmpval;
 	ufs_lbn_t lbn;
 	uint64_t size;
@@ -937,8 +984,15 @@ ino_visit(union dinode *ip, ino_t ino, i
 		fragcnt += frags;
 		visitor(ino, i, DIP(ip, di_db[i]), frags);
 	}
+	/*
+	 * We know the following indirects are real as we're following
+	 * real pointers to them.
+	 */
+	flags |= VISIT_ROOT;
 	for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++,
-	    tmpval *= NINDIR(fs), lbn += tmpval) {
+	    lbn = nextlbn) {
+		nextlbn = lbn + tmpval;
+		tmpval *= NINDIR(fs);
 		if (DIP(ip, di_ib[i]) == 0)
 			continue;
 		indir_visit(ino, -lbn - i, DIP(ip, di_ib[i]), &fragcnt, visitor,
@@ -948,11 +1002,15 @@ ino_visit(union dinode *ip, ino_t ino, i
 }
 
 /*
- * Null visitor function used when we just want to count blocks.
+ * Null visitor function used when we just want to count blocks and
+ * record the lbn.
  */
+ufs_lbn_t visitlbn;
 static void
 null_visit(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
 {
+	if (lbn > 0)
+		visitlbn = lbn;
 }
 
 /*
@@ -962,23 +1020,45 @@ null_visit(ino_t ino, ufs_lbn_t lbn, ufs
  * reachable at the time the inode was written.
  */
 static void
-ino_adjblks(ino_t ino)
+ino_adjblks(struct suj_ino *sino)
 {
-	struct suj_ino *sino;
 	union dinode *ip;
 	uint64_t blocks;
 	uint64_t frags;
+	off_t isize;
+	off_t size;
+	ino_t ino;
 
-	sino = ino_lookup(ino, 1);
-	if (sino->si_blkadj)
-		return;
-	sino->si_blkadj = 1;
+	ino = sino->si_ino;
 	ip = ino_read(ino);
 	/* No need to adjust zero'd inodes. */
 	if (DIP(ip, di_mode) == 0)
 		return;
+	/*
+	 * Visit all blocks and count them as well as recording the last
+	 * valid lbn in the file.  If the file size doesn't agree with the
+	 * last lbn we need to truncate to fix it.  Otherwise just adjust
+	 * the blocks count.
+	 */
+	visitlbn = 0;
 	frags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT);
 	blocks = fsbtodb(fs, frags);
+	/*
+	 * We assume the size and direct block list is kept coherent by
+	 * softdep.  For files that have extended into indirects we truncate
+	 * to the size in the inode or the maximum size permitted by
+	 * populated indirects.
+	 */
+	if (visitlbn >= NDADDR) {
+		isize = DIP(ip, di_size);
+		size = lblktosize(fs, visitlbn + 1);
+		printf("ino %d isize %jd size %jd\n", ino, isize, size);
+		if (isize > size)
+			isize = size;
+		/* Always truncate to free any unpopulated indirects. */
+		ino_trunc(sino->si_ino, isize);
+		return;
+	}
 	if (blocks == DIP(ip, di_blocks))
 		return;
 	if (debug)
@@ -1021,6 +1101,16 @@ blk_free_lbn(ufs2_daddr_t blk, ino_t ino
 }
 
 static void
+ino_setskip(struct suj_ino *sino, ino_t parent)
+{
+	int isdot;
+	int mode;
+
+	if (ino_isat(sino->si_ino, DOTDOT_OFFSET, parent, &mode, &isdot))
+		sino->si_skipparent = 1;
+}
+
+static void
 ino_free_children(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, int frags)
 {
 	struct suj_ino *sino;
@@ -1053,7 +1143,7 @@ ino_free_children(ino_t ino, ufs_lbn_t l
 		if (isparent && skipparent == 1)
 			continue;
 		if (debug)
-			printf("Directory %d removing inode %d name %s\n",
+			printf("Directory %d removing ino %d name %s\n",
 			    ino, dp->d_ino, dp->d_name);
 		/*
 		 * Lookup this inode to see if we have a record for it.
@@ -1070,7 +1160,7 @@ ino_free_children(ino_t ino, ufs_lbn_t l
 		 * parent.  Don't try to adjust our link down again.
 		 */
 		if (isparent == 0)
-			sino->si_skipparent = 1;
+			ino_setskip(sino, ino);
 		/*
 		 * If we haven't yet processed this inode we need to make
 		 * sure we will successfully discover the lost path.  If not
@@ -1084,16 +1174,16 @@ ino_free_children(ino_t ino, ufs_lbn_t l
 				break;
 		}
 		if (srec == NULL)
-			sino->si_nlinkadj--;
+			sino->si_nlinkadj++;
 	}
 }
 
 /*
- * Truncate an inode, freeing all blocks and decrementing all children's
+ * Reclaim an inode, freeing all blocks and decrementing all children's
  * link counts.  Free the inode back to the cg.
  */
 static void
-ino_truncate(union dinode *ip, ino_t ino, int mode)
+ino_reclaim(union dinode *ip, ino_t ino, int mode)
 {
 	uint32_t gen;
 
@@ -1147,7 +1237,7 @@ ino_decr(ino_t ino)
 		if (debug)
 			printf("ino %d not enough links to live %d < %d\n",
 			    ino, nlink, reqlink);
-		ino_truncate(ip, ino, mode);
+		ino_reclaim(ip, ino, mode);
 		return;
 	}
 	DIP_SET(ip, di_nlink, nlink);
@@ -1192,7 +1282,7 @@ ino_adjust(ino_t ino, int lastmode, nlin
 		if (debug)
 			printf("ino %d not enough links to live %d < %d\n",
 			    ino, nlink, reqlink);
-		ino_truncate(ip, ino, mode);
+		ino_reclaim(ip, ino, mode);
 		return;
 	}
 	/* If required write the updated link count. */
@@ -1205,13 +1295,194 @@ ino_adjust(ino_t ino, int lastmode, nlin
 	ino_dirty(ino);
 }
 
-#define	DOTDOT_OFFSET	DIRECTSIZ(1)
+/*
+ * Truncate some or all blocks in an indirect, freeing any that are required
+ * and zeroing the indirect.
+ */
+static void
+indir_trunc(ino_t ino, ufs_lbn_t lbn, ufs2_daddr_t blk, ufs_lbn_t lastlbn)
+{
+	ufs2_daddr_t *bap2;
+	ufs1_daddr_t *bap1;
+	ufs_lbn_t lbnadd;
+	ufs2_daddr_t nblk;
+	ufs_lbn_t next;
+	ufs_lbn_t nlbn;
+	int dirty;
+	int level;
+	int i;
+
+	if (blk == 0)
+		return;
+	dirty = 0;
+	level = lbn_level(lbn);
+	if (level == -1)
+		errx(1, "Invalid level for lbn %jd", lbn);
+	lbnadd = 1;
+	for (i = level; i > 0; i--)
+		lbnadd *= NINDIR(fs);
+	bap1 = (void *)dblk_read(blk, fs->fs_bsize);
+	bap2 = (void *)bap1;
+	for (i = 0; i < NINDIR(fs); i++) {
+		if (fs->fs_magic == FS_UFS1_MAGIC)
+			nblk = *bap1++;
+		else
+			nblk = *bap2++;
+		if (nblk == 0)
+			continue;
+		if (level != 0) {
+			nlbn = (lbn + 1) - (i * lbnadd);
+			/*
+			 * Calculate the lbn of the next indirect to
+			 * determine if any of this indirect must be
+			 * reclaimed.
+			 */
+			next = -(lbn + level) + ((i+1) * lbnadd);
+			if (next <= lastlbn)
+				continue;
+			indir_trunc(ino, nlbn, nblk, lastlbn);
+			/* If all of this indirect was reclaimed, free it. */
+			nlbn = next - lbnadd;
+			if (nlbn < lastlbn)
+				continue;
+		} else {
+			nlbn = -lbn + i * lbnadd;
+			if (nlbn < lastlbn)
+				continue;
+		}
+		dirty = 1;
+		blk_free(nblk, 0, fs->fs_frag);
+		if (fs->fs_magic == FS_UFS1_MAGIC)
+			*(bap1 - 1) = 0;
+		else
+			*(bap2 - 1) = 0;
+	}
+	if (dirty)
+		dblk_dirty(blk);
+}
+
+/*
+ * Truncate an inode to the minimum of the given size or the last populated
+ * block once any blocks beyond that size are freed.  The kernel would allocate
+ * the last block in the file but fsck does not and neither do we.  This
+ * code never extends files, only shrinks them.
+ */
+static void
+ino_trunc(ino_t ino, off_t size)
+{
+	union dinode *ip;
+	ufs2_daddr_t bn;
+	uint64_t totalfrags;
+	ufs_lbn_t nextlbn;
+	ufs_lbn_t lastlbn;
+	ufs_lbn_t tmpval;
+	ufs_lbn_t lbn;
+	ufs_lbn_t i;
+	int frags;
+	off_t cursize;
+	off_t off;
+	int mode;
+
+	ip = ino_read(ino);
+	mode = DIP(ip, di_mode) & IFMT;
+	cursize = DIP(ip, di_size);
+	if (debug)
+		printf("Truncating ino %d, mode %o to size %jd from size %jd\n",
+		    ino, mode, size, cursize);
+
+	/* Skip datablocks for short links and devices. */
+	if (mode == 0 || mode == IFBLK || mode == IFCHR ||
+	    (mode == IFLNK && cursize < fs->fs_maxsymlinklen))
+		return;
+	/* Don't extend. */
+	if (size > cursize)
+		size = cursize;
+	lastlbn = lblkno(fs, blkroundup(fs, size));
+	for (i = lastlbn; i < NDADDR; i++) {
+		if (DIP(ip, di_db[i]) == 0)
+			continue;
+		frags = sblksize(fs, cursize, i);
+		frags = numfrags(fs, frags);
+		blk_free(DIP(ip, di_db[i]), 0, frags);
+		DIP_SET(ip, di_db[i], 0);
+	}
+	/*
+	 * Follow indirect blocks, freeing anything required.
+	 */
+	for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR; i++,
+	    lbn = nextlbn) {
+		nextlbn = lbn + tmpval;
+		tmpval *= NINDIR(fs);
+		/* If we're not freeing any in this indirect range skip it. */
+		if (lastlbn >= nextlbn)
+			continue;
+		if (DIP(ip, di_ib[i]) == 0)
+			continue;
+		indir_trunc(ino, -lbn - i, DIP(ip, di_ib[i]), lastlbn);
+		/* All freed: release the indir itself, always a full block. */
+		if (lastlbn > lbn)
+			continue;
+		blk_free(DIP(ip, di_ib[i]), 0, fs->fs_frag);
+		DIP_SET(ip, di_ib[i], 0);
+	}
+	ino_dirty(ino);
+	/*
+	 * Now that we've freed any whole blocks that exceed the desired
+	 * truncation size, figure out how many blocks remain and what the
+	 * last populated lbn is.  We will set the size to this last lbn
+	 * rather than worrying about allocating the final lbn as the kernel
+	 * would've done.  This is consistent with normal fsck behavior.
+	 */
+	visitlbn = 0;
+	totalfrags = ino_visit(ip, ino, null_visit, VISIT_INDIR | VISIT_EXT);
+	if (size > lblktosize(fs, visitlbn + 1))
+		size = lblktosize(fs, visitlbn + 1);
+	/*
+	 * If we're truncating direct blocks we have to adjust frags
+	 * accordingly.
+	 */
+	if (visitlbn < NDADDR) {
+		long oldspace, newspace;
+
+		bn = DIP(ip, di_db[visitlbn]);
+		oldspace = sblksize(fs, cursize, visitlbn);
+		newspace = sblksize(fs, size, visitlbn);
+		if (oldspace != newspace) {
+			bn += numfrags(fs, newspace);
+			frags = numfrags(fs, oldspace - newspace);
+			blk_free(bn, 0, frags);
+			totalfrags -= frags;
+		}
+	}
+	DIP_SET(ip, di_blocks, fsbtodb(fs, totalfrags));
+	DIP_SET(ip, di_size, size);
+	/*
+	 * If we've truncated into the middle of a block or frag we have
+	 * to zero it here.  Otherwise the file could extend into
+	 * uninitialized space later.
+	 */
+	off = blkoff(fs, size);
+	if (off) {
+		uint8_t *buf;
+		long clrsize;
+
+		bn = ino_blkatoff(ip, ino, visitlbn, &frags);
+		if (bn == 0)
+			errx(1, "Block missing from ino %d at lbn %jd",
+			    ino, visitlbn);
+		clrsize = frags * fs->fs_fsize;
+		buf = dblk_read(bn, clrsize);
+		clrsize -= off;
+		buf += off;
+		bzero(buf, clrsize);
+		dblk_dirty(bn);
+	}
+	return;
+}
 
 /*
  * Process records available for one inode and determine whether the
  * link count is correct or needs adjusting.
- *
- * XXX Failed to fix zero length directory.  Shouldn't .. have been mising?
  */
 static void
 ino_check(struct suj_ino *sino)
@@ -1228,6 +1499,15 @@ ino_check(struct suj_ino *sino)
 	int isat;
 	int mode;
 
+	/*
+	 * Handle truncations that were not complete.  We don't have
+	 * to worry about truncating directory entries as they must have
+	 * been removed for truncate to succeed.
+	 */
+	if (sino->si_trunc) {
+		ino_trunc(ino, sino->si_trunc->jt_size);
+		sino->si_trunc = NULL;
+	}
 	if (sino->si_hasrecs == 0)
 		return;
 	ino = sino->si_ino;
@@ -1239,9 +1519,9 @@ ino_check(struct suj_ino *sino)
 		return;
 	rrec = (struct jrefrec *)TAILQ_FIRST(&sino->si_recs)->sr_rec;
 	nlink = rrec->jr_nlink;
-	newlinks = sino->si_nlinkadj;
+	newlinks = 0;
 	dotlinks = 0;
-	removes = 0;
+	removes = sino->si_nlinkadj;
 	TAILQ_FOREACH(srec, &sino->si_recs, sr_next) {
 		rrec = (struct jrefrec *)srec->sr_rec;
 		isat = ino_isat(rrec->jr_parent, rrec->jr_diroff, 
@@ -1286,7 +1566,7 @@ ino_check(struct suj_ino *sino)
 			if (rrec->jr_diroff == DOTDOT_OFFSET) {
 				stmp = ino_lookup(rrec->jr_parent, 0);
 				if (stmp)
-					stmp->si_skipparent = 1;
+					ino_setskip(stmp, ino);
 			}
 		}
 	}
@@ -1304,6 +1584,7 @@ blk_check(struct suj_blk *sblk)
 {
 	struct suj_rec *srec;
 	struct jblkrec *brec;
+	struct suj_ino *sino;
 	ufs2_daddr_t blk;
 	int mask;
 	int frags;
@@ -1318,6 +1599,10 @@ blk_check(struct suj_blk *sblk)
 		frags = brec->jb_frags;
 		blk = brec->jb_blkno + brec->jb_oldfrags;
 		isat = blk_isat(brec->jb_ino, brec->jb_lbn, blk, &frags);
+		if (sino == NULL || sino->si_ino != brec->jb_ino) {
+			sino = ino_lookup(brec->jb_ino, 1);
+			sino->si_blkadj = 1;
+		}
 		if (debug)
 			printf("op %d blk %jd ino %d lbn %jd frags %d isat %d (%d)\n",
 			    brec->jb_op, blk, brec->jb_ino, brec->jb_lbn,
@@ -1336,7 +1621,6 @@ blk_check(struct suj_blk *sblk)
 			blk += frags;
 			frags = brec->jb_frags - frags;
 			blk_free(blk, mask, frags);
-			ino_adjblks(brec->jb_ino);
 			continue;
 		}
 		/*
@@ -1349,19 +1633,31 @@ blk_check(struct suj_blk *sblk)
 		 */
 		blk_free_lbn(blk, brec->jb_ino, brec->jb_lbn, brec->jb_frags,
 		    brec->jb_op == JOP_FREEBLK);
-		ino_adjblks(brec->jb_ino);
 	}
 }
 
 /*
+ * Walk the list of inode records for this cg and resolve moved and duplicate
+ * inode references now that we have a complete picture.
+ */
+static void
+cg_build(struct suj_cg *sc)
+{
+	struct suj_ino *sino;
+	int i;
+
+	for (i = 0; i < SUJ_HASHSIZE; i++)
+		LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
+			ino_build(sino);
+}
+
+/*
  * Walk the list of inode and block records for this cg, recovering any
  * changes which were not complete at the time of crash.
  */
 static void
 cg_check(struct suj_cg *sc)
 {
-	struct suj_blk *nextb;
-	struct suj_ino *nexti;
 	struct suj_ino *sino;
 	struct suj_blk *sblk;
 	int i;
@@ -1370,32 +1666,43 @@ cg_check(struct suj_cg *sc)
 		printf("Recovering cg %d\n", sc->sc_cgx);
 
 	for (i = 0; i < SUJ_HASHSIZE; i++)
-		LIST_FOREACH_SAFE(sino, &sc->sc_inohash[i], si_next, nexti)
+		LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
 			ino_check(sino);
 
 	for (i = 0; i < SUJ_HASHSIZE; i++)
-		LIST_FOREACH_SAFE(sblk, &sc->sc_blkhash[i], sb_next, nextb)
+		LIST_FOREACH(sblk, &sc->sc_blkhash[i], sb_next)
 			blk_check(sblk);
 }
 
 /*
- * Write a potentially dirty cg.  All inodes must be written before the
- * cg maps are so that an allocated inode is never marked free, even if
- * we crash during fsck.
+ * Now that we've freed blocks which are not referenced we make a second
+ * pass over all inodes to adjust their block counts.
+ */
+static void
+cg_check2(struct suj_cg *sc)
+{
+	struct suj_ino *sino;
+	int i;
+
+	for (i = 0; i < SUJ_HASHSIZE; i++)
+		LIST_FOREACH(sino, &sc->sc_inohash[i], si_next)
+			if (sino->si_blkadj)
+				ino_adjblks(sino);
+}
+
+/*
+ * Write a potentially dirty cg.  Recalculate the summary information and
+ * update the superblock summary.
  */
 static void
 cg_write(struct suj_cg *sc)
 {
-	struct ino_blk *iblk;
 	ufs1_daddr_t fragno, cgbno, maxbno;
 	u_int8_t *blksfree;
 	struct cg *cgp;
 	int blk;
 	int i;
 
-	for (i = 0; i < SUJ_HASHSIZE; i++)
-		LIST_FOREACH(iblk, &sc->sc_iblkhash[i], ib_next)
-			iblk_write(iblk);
 	if (sc->sc_dirty == 0)
 		return;
 	/*
@@ -1437,6 +1744,21 @@ cg_write(struct suj_cg *sc)
 		err(1, "Unable to write cylinder group %d", sc->sc_cgx);
 }
 
+/*
+ * Write out any modified inodes.
+ */
+static void
+cg_write_inos(struct suj_cg *sc)
+{
+	struct ino_blk *iblk;
+	int i;
+
+	for (i = 0; i < SUJ_HASHSIZE; i++)
+		LIST_FOREACH(iblk, &sc->sc_iblkhash[i], ib_next)
+			if (iblk->ib_dirty)
+				iblk_write(iblk);
+}
+
 static void
 cg_apply(void (*apply)(struct suj_cg *))
 {
@@ -1473,7 +1795,7 @@ ino_unlinked(void)
 			if (debug)
 				printf("Freeing unlinked ino %d mode %o\n",
 				    ino, mode);
-			ino_truncate(ip, ino, mode);
+			ino_reclaim(ip, ino, mode);
 		} else if (debug)
 			printf("Skipping ino %d mode %o with link %d\n",
 			    ino, mode, DIP(ip, di_nlink));

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-projects mailing list