what is fsck's "slowdown"?

Matthew Dillon dillon at apollo.backplane.com
Sat Sep 4 11:38:30 PDT 2004


:Increasing MAXBUFSPACE looks like it would make the problem worse
:because getdatablk() does a linear search.

    I've researched it a bit more and the default buffer cache is only
    40KB... only 5 buffers or so (and the minimum is 5), so it seems unlikely
    that the buffer cache is responsible for the cpu use.

    I went ahead and hashed the buffer cache anyway, for DFly.  The patch
    (for DFly) is included below if anyone wants to adapt it.  It is NOT 
    well tested yet so use at your own risk.

    I also enhanced the disk I/O / cache statistics reporting for ^T.

    I think the best way to track down the cpu usage is for Mark to actually
    ^\ (core dump) his fsck while it is eating all that cpu and then use gdb
    to see which routine is burning the cpu.  Do that a couple of times.  It
    may or may not be possible to fix the problem but I can't imagine why
    fsck would burn so much cpu with such a small buffer cache short of there
    being a bug in fsck.

					-Matt
					Matthew Dillon 
					<dillon at backplane.com>


	[Patch relative to DragonFly, would have to be adapted to FreeBSD-5]

Index: fsck.h
===================================================================
RCS file: /cvs/src/sbin/fsck/fsck.h,v
retrieving revision 1.4
diff -u -r1.4 fsck.h
--- fsck.h	3 Nov 2003 19:51:04 -0000	1.4
+++ fsck.h	4 Sep 2004 18:26:31 -0000
@@ -41,8 +41,8 @@
 
 #define	MAXDUP		10	/* limit on dup blks (per inode) */
 #define	MAXBAD		10	/* limit on bad blks (per inode) */
-#define	MAXBUFSPACE	40*1024	/* maximum space to allocate to buffers */
-#define	INOBUFSIZE	56*1024	/* size of buffer to read inodes in pass1 */
+#define	MAXBUFSPACE	(4*1024*1024) /* maximum space to allocate to buffers */
+#define	INOBUFSIZE	(256*1024) /* size of buffer to read inodes in pass1 */
 
 /*
  * Each inode on the filesystem is described by the following structure.
@@ -94,10 +94,6 @@
 	char b_dirty;
 };
 
-#define	B_INUSE 1
-
-#define	MINBUFS		5	/* minimum number of buffers required */
-struct bufarea bufhead;		/* head of list of other blks in filesys */
 struct bufarea sblk;		/* file system superblock */
 struct bufarea cgblk;		/* cylinder group blocks */
 struct bufarea *pdirbp;		/* current directory contents */
Index: fsutil.h
===================================================================
RCS file: /cvs/src/sbin/fsck/fsutil.h,v
retrieving revision 1.3
diff -u -r1.3 fsutil.h
--- fsutil.h	1 Nov 2003 17:15:58 -0000	1.3
+++ fsutil.h	4 Sep 2004 18:00:29 -0000
@@ -54,6 +54,7 @@
 void *emalloc(size_t);
 void *erealloc(void *, size_t);
 char *estrdup(const char *);
+void show_cacheinfo(void);
 
 #define	CHECK_PREEN	0x0001
 #define	CHECK_VERBOSE	0x0002
Index: pass1.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass1.c,v
retrieving revision 1.5
diff -u -r1.5 pass1.c
--- pass1.c	1 Nov 2003 17:15:58 -0000	1.5
+++ pass1.c	4 Sep 2004 18:00:02 -0000
@@ -92,6 +92,7 @@
 			printf("%s: phase 1: cyl group %d of %d (%d%%)\n",
 			    cdevname, c, sblock.fs_ncg,
 			    c * 100 / sblock.fs_ncg);
+			show_cacheinfo();
 			got_siginfo = 0;
 		}
 		/*
Index: pass1b.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass1b.c,v
retrieving revision 1.5
diff -u -r1.5 pass1b.c
--- pass1b.c	1 Nov 2003 17:15:58 -0000	1.5
+++ pass1b.c	4 Sep 2004 18:00:04 -0000
@@ -65,6 +65,7 @@
 			printf("%s: phase 1b: cyl group %d of %d (%d%%)\n",
 			    cdevname, c, sblock.fs_ncg,
 			    c * 100 / sblock.fs_ncg);
+			show_cacheinfo();
 			got_siginfo = 0;
 		}
 		for (i = 0; i < sblock.fs_ipg; i++, inumber++) {
Index: pass2.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass2.c,v
retrieving revision 1.6
diff -u -r1.6 pass2.c
--- pass2.c	4 Feb 2004 17:39:59 -0000	1.6
+++ pass2.c	4 Sep 2004 18:00:08 -0000
@@ -134,6 +134,7 @@
 			printf("%s: phase 2: dir %d of %ld (%d%%)\n", cdevname,
 			    inpp - inpsort, inplast, (int)((inpp - inpsort) * 100 /
 			    inplast));
+			show_cacheinfo();
 			got_siginfo = 0;
 		}
 		inp = *inpp;
Index: pass3.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass3.c,v
retrieving revision 1.5
diff -u -r1.5 pass3.c
--- pass3.c	4 Feb 2004 17:39:59 -0000	1.5
+++ pass3.c	4 Sep 2004 18:00:11 -0000
@@ -59,6 +59,7 @@
 			printf("%s: phase 3: dir %ld of %ld (%d%%)\n", cdevname,
 			    inplast - inpindex - 1, inplast,
 			    (int)((inplast - inpindex - 1) * 100 / inplast));
+			show_cacheinfo();
 			got_siginfo = 0;
 		}
 		inp = inpsort[inpindex];
Index: pass4.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass4.c,v
retrieving revision 1.4
diff -u -r1.4 pass4.c
--- pass4.c	28 Sep 2003 14:39:17 -0000	1.4
+++ pass4.c	4 Sep 2004 18:00:13 -0000
@@ -62,6 +62,7 @@
 			printf("%s: phase 4: cyl group %d of %d (%d%%)\n",
 			    cdevname, cg, sblock.fs_ncg,
 			    cg * 100 / sblock.fs_ncg);
+			show_cacheinfo();
 			got_siginfo = 0;
 		}
 		inumber = cg * sblock.fs_ipg;
Index: pass5.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass5.c,v
retrieving revision 1.4
diff -u -r1.4 pass5.c
--- pass5.c	28 Sep 2003 14:39:17 -0000	1.4
+++ pass5.c	4 Sep 2004 18:00:17 -0000
@@ -169,6 +169,7 @@
 			printf("%s: phase 5: cyl group %d of %d (%d%%)\n",
 			    cdevname, c, sblock.fs_ncg,
 			    c * 100 / sblock.fs_ncg);
+			show_cacheinfo();
 			got_siginfo = 0;
 		}
 		getblk(&cgblk, cgtod(fs, c), fs->fs_cgsize);
Index: utilities.c
===================================================================
RCS file: /cvs/src/sbin/fsck/utilities.c,v
retrieving revision 1.6
diff -u -r1.6 utilities.c
--- utilities.c	4 Feb 2004 17:39:59 -0000	1.6
+++ utilities.c	4 Sep 2004 18:29:13 -0000
@@ -46,10 +46,42 @@
 
 #include "fsck.h"
 
-long	diskreads, totalreads;	/* Disk cache statistics */
+struct bufarea *bufhead;
+struct bufarea buffree;
+int	bufhash_size;
+int	bufhash_reap;
+int	bufhash_mask;
+int	nbuffers;
+long	totaliterations;	/* Disk cache statistics */
+long	totalreads;
+long	diskreads;
+long	bdiskreads;
+
+#define BUFHASH(blkno)	((blkno + (blkno >> 8) + (blkno >> 15)) & bufhash_mask)
+#define BUFSPERBUCKET	5	/* nominal hash table sizing factor */
+#define MINBUFS         16      /* minimum number of buffers required */
+
+#define B_INUSE		0x0001
+
 
 static void rwerror(char *mesg, ufs_daddr_t blk);
 
+void
+show_cacheinfo(void)
+{
+	printf("bufcache %ld.%02d%% (%ld/%ld) %ld uncached_rd "
+	       "%d/%d used %ld scans\n",
+		(totalreads - bdiskreads) * 100 / totalreads,
+		(totalreads - bdiskreads) * 10000 / totalreads % 100,
+		totalreads - bdiskreads,
+		totalreads,
+		diskreads - bdiskreads,
+		nbuffers - (int)buffree.b_size,
+		nbuffers,
+		totaliterations
+	);
+}
+
 int
 ftypeok(struct dinode *dp)
 {
@@ -133,7 +165,7 @@
 void
 bufinit(void)
 {
-	register struct bufarea *bp;
+	struct bufarea *bp;
 	long bufcnt, i;
 	char *bufp;
 
@@ -143,10 +175,32 @@
 		errx(EEXIT, "cannot allocate buffer pool");
 	cgblk.b_un.b_buf = bufp;
 	initbarea(&cgblk);
-	bufhead.b_next = bufhead.b_prev = &bufhead;
+
+	/*
+	 * How many buffers should we have?
+	 */
 	bufcnt = MAXBUFSPACE / sblock.fs_bsize;
 	if (bufcnt < MINBUFS)
 		bufcnt = MINBUFS;
+
+	/*
+	 * Create a hash table large enough to hold approximately 10
+	 * buffers per entry and initialize the list heads.
+	 */
+	bufhash_size = 1; 
+	while (bufhash_size < bufcnt / BUFSPERBUCKET)
+	    bufhash_size <<= 1;
+	bufhash_mask = bufhash_size - 1;
+	bufhead = calloc(sizeof(struct bufarea), bufhash_size);
+	for (i = 0; i < bufhash_size; ++i)
+	    bufhead[i].b_next = bufhead[i].b_prev = &bufhead[i];
+	buffree.b_next = &buffree;
+	buffree.b_prev = &buffree;
+
+	/*
+	 * Allocate memory for our buffers and place them on the free
+	 * list.
+	 */
 	for (i = 0; i < bufcnt; i++) {
 		bp = (struct bufarea *)malloc(sizeof(struct bufarea));
 		bufp = malloc((unsigned int)sblock.fs_bsize);
@@ -156,13 +210,14 @@
 			errx(EEXIT, "cannot allocate buffer pool");
 		}
 		bp->b_un.b_buf = bufp;
-		bp->b_prev = &bufhead;
-		bp->b_next = bufhead.b_next;
-		bufhead.b_next->b_prev = bp;
-		bufhead.b_next = bp;
+		bp->b_prev = &buffree;
+		bp->b_next = buffree.b_next;
+		buffree.b_next->b_prev = bp;
+		buffree.b_next = bp;
+		++buffree.b_size;
 		initbarea(bp);
 	}
-	bufhead.b_size = i;	/* save number of buffers */
+	nbuffers = bufcnt;
 }
 
 /*
@@ -171,26 +226,56 @@
 struct bufarea *
 getdatablk(ufs_daddr_t blkno, long size)
 {
-	register struct bufarea *bp;
+	struct bufarea *bp;
+	struct bufarea *bh_old;
+	struct bufarea *bh_new;
+	int i;
 
-	for (bp = bufhead.b_next; bp != &bufhead; bp = bp->b_next)
+	bh_old = bh_new = &bufhead[BUFHASH(blkno)];
+
+	for (bp = bh_new->b_next; bp != bh_new; bp = bp->b_next) {
 		if (bp->b_bno == fsbtodb(&sblock, blkno))
 			goto foundit;
-	for (bp = bufhead.b_prev; bp != &bufhead; bp = bp->b_prev)
-		if ((bp->b_flags & B_INUSE) == 0)
-			break;
-	if (bp == &bufhead)
-		errx(EEXIT, "deadlocked buffer pool");
+		++totaliterations;
+	}
+	if (bh_new->b_size > BUFSPERBUCKET) {
+		for (bp = bh_new->b_prev; bp != bh_new; bp = bp->b_prev) {
+			if ((bp->b_flags & B_INUSE) == 0)
+				goto getit;
+			++totaliterations;
+		}
+	}
+	if (buffree.b_next != &buffree) {
+		bh_old = &buffree;
+		bp = buffree.b_next;
+		goto getit;
+	}
+	for (i = bufhash_size; i; --i) {
+		bh_old = &bufhead[bufhash_reap];
+		bufhash_reap = (bufhash_reap - 1) & bufhash_mask;
+		if (bh_old->b_size < BUFSPERBUCKET / 4)
+			continue;
+		for (bp = bh_old->b_prev; bp != bh_old; bp = bp->b_prev) {
+			if ((bp->b_flags & B_INUSE) == 0)
+				goto getit;
+			++totaliterations;
+		}
+	}
+	errx(EEXIT, "deadlocked buffer pool");
+getit:
+	++bdiskreads;
 	getblk(bp, blkno, size);
 	/* fall through */
 foundit:
-	totalreads++;
-	bp->b_prev->b_next = bp->b_next;
+	totalreads++;				/* accounting */
+	++bh_new->b_size;
+	--bh_old->b_size;
+	bp->b_prev->b_next = bp->b_next;	/* unlink */
 	bp->b_next->b_prev = bp->b_prev;
-	bp->b_prev = &bufhead;
-	bp->b_next = bufhead.b_next;
-	bufhead.b_next->b_prev = bp;
-	bufhead.b_next = bp;
+	bp->b_prev = bh_new;			/* relink */
+	bp->b_next = bh_new->b_next;
+	bh_new->b_next->b_prev = bp;
+	bh_new->b_next = bp;
 	bp->b_flags |= B_INUSE;
 	return (bp);
 }
@@ -203,7 +288,6 @@
 	dblk = fsbtodb(&sblock, blk);
 	if (bp->b_bno != dblk) {
 		flush(fswritefd, bp);
-		diskreads++;
 		bp->b_errs = bread(fsreadfd, bp->b_un.b_buf, dblk, size);
 		bp->b_bno = dblk;
 		bp->b_size = size;
@@ -248,8 +332,9 @@
 void
 ckfini(int markclean)
 {
-	register struct bufarea *bp, *nbp;
+	struct bufarea *bh, *bp, *nbp;
 	int ofsmodified, cnt = 0;
+	int i;
 
 	if (fswritefd < 0) {
 		(void)close(fsreadfd);
@@ -264,15 +349,24 @@
 	}
 	flush(fswritefd, &cgblk);
 	free(cgblk.b_un.b_buf);
-	for (bp = bufhead.b_prev; bp && bp != &bufhead; bp = nbp) {
-		cnt++;
-		flush(fswritefd, bp);
-		nbp = bp->b_prev;
-		free(bp->b_un.b_buf);
-		free((char *)bp);
+
+	for (i = 0; i < bufhash_size; ++i) {
+		bh = &bufhead[i];
+
+		for (bp = bh->b_prev; bp && bp != bh; bp = nbp) {
+			cnt++;
+			flush(fswritefd, bp);
+			nbp = bp->b_prev;
+			free(bp->b_un.b_buf);
+			free((char *)bp);
+		}
+		bh->b_next = bh;
+		bh->b_prev = bh;
+	}
+	if (nbuffers != cnt + buffree.b_size) {
+		errx(EEXIT, "panic: lost %d buffers", 
+		    nbuffers - cnt + buffree.b_size);
 	}
-	if (bufhead.b_size != cnt)
-		errx(EEXIT, "panic: lost %d buffers", bufhead.b_size - cnt);
 	pbp = pdirbp = (struct bufarea *)0;
 	if (sblock.fs_clean != markclean) {
 		sblock.fs_clean = markclean;
@@ -291,8 +385,7 @@
 		rerun = 1;
 	}
 	if (debug)
-		printf("cache missed %ld of %ld (%d%%)\n", diskreads,
-		    totalreads, (int)(diskreads * 100 / totalreads));
+		show_cacheinfo();
 	(void)close(fsreadfd);
 	(void)close(fswritefd);
 }
@@ -304,6 +397,7 @@
 	int i, errs;
 	off_t offset;
 
+	diskreads++;
 	offset = blk;
 	offset *= dev_bsize;
 	if (lseek(fd, offset, 0) < 0)


More information about the freebsd-current mailing list