what is fsck's "slowdown"?
Matthew Dillon
dillon at apollo.backplane.com
Sat Sep 4 11:38:30 PDT 2004
:Increasing MAXBUFSPACE looks like it would make the problem worse
:because getdatablk() does a linear search.
I've researched it a bit more and the default buffer cache is only
40KB... only 5 buffers or so (and the minimum is 5), so it seems unlikely
that the buffer cache is responsible for the cpu use.
I went ahead and hashed the buffer cache anyway, for DFly. The patch
(for DFly) is included below if anyone wants to adapt it. It is NOT
well tested yet so use at your own risk.
I also enhanced the disk I/O / cache statistics reporting for ^T.
I think the best way to track down the cpu usage is for Mark to actually
^\ (core dump) his fsck while it is eating all that cpu and then use gdb
to see which routine is burning the cpu. Do that a couple of times. It
may or may not be possible to fix the problem but I can't imagine why
fsck would burn so much cpu with such a small buffer cache short of there
being a bug in fsck.
-Matt
Matthew Dillon
<dillon at backplane.com>
[Patch relative to DragonFly, would have to be adapted to FreeBSD-5]
Index: fsck.h
===================================================================
RCS file: /cvs/src/sbin/fsck/fsck.h,v
retrieving revision 1.4
diff -u -r1.4 fsck.h
--- fsck.h 3 Nov 2003 19:51:04 -0000 1.4
+++ fsck.h 4 Sep 2004 18:26:31 -0000
@@ -41,8 +41,8 @@
#define MAXDUP 10 /* limit on dup blks (per inode) */
#define MAXBAD 10 /* limit on bad blks (per inode) */
-#define MAXBUFSPACE 40*1024 /* maximum space to allocate to buffers */
-#define INOBUFSIZE 56*1024 /* size of buffer to read inodes in pass1 */
+#define MAXBUFSPACE (4*1024*1024) /* maximum space to allocate to buffers */
+#define INOBUFSIZE (256*1024) /* size of buffer to read inodes in pass1 */
/*
* Each inode on the filesystem is described by the following structure.
@@ -94,10 +94,6 @@
char b_dirty;
};
-#define B_INUSE 1
-
-#define MINBUFS 5 /* minimum number of buffers required */
-struct bufarea bufhead; /* head of list of other blks in filesys */
struct bufarea sblk; /* file system superblock */
struct bufarea cgblk; /* cylinder group blocks */
struct bufarea *pdirbp; /* current directory contents */
Index: fsutil.h
===================================================================
RCS file: /cvs/src/sbin/fsck/fsutil.h,v
retrieving revision 1.3
diff -u -r1.3 fsutil.h
--- fsutil.h 1 Nov 2003 17:15:58 -0000 1.3
+++ fsutil.h 4 Sep 2004 18:00:29 -0000
@@ -54,6 +54,7 @@
void *emalloc(size_t);
void *erealloc(void *, size_t);
char *estrdup(const char *);
+void show_cacheinfo(void);
#define CHECK_PREEN 0x0001
#define CHECK_VERBOSE 0x0002
Index: pass1.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass1.c,v
retrieving revision 1.5
diff -u -r1.5 pass1.c
--- pass1.c 1 Nov 2003 17:15:58 -0000 1.5
+++ pass1.c 4 Sep 2004 18:00:02 -0000
@@ -92,6 +92,7 @@
printf("%s: phase 1: cyl group %d of %d (%d%%)\n",
cdevname, c, sblock.fs_ncg,
c * 100 / sblock.fs_ncg);
+ show_cacheinfo();
got_siginfo = 0;
}
/*
Index: pass1b.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass1b.c,v
retrieving revision 1.5
diff -u -r1.5 pass1b.c
--- pass1b.c 1 Nov 2003 17:15:58 -0000 1.5
+++ pass1b.c 4 Sep 2004 18:00:04 -0000
@@ -65,6 +65,7 @@
printf("%s: phase 1b: cyl group %d of %d (%d%%)\n",
cdevname, c, sblock.fs_ncg,
c * 100 / sblock.fs_ncg);
+ show_cacheinfo();
got_siginfo = 0;
}
for (i = 0; i < sblock.fs_ipg; i++, inumber++) {
Index: pass2.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass2.c,v
retrieving revision 1.6
diff -u -r1.6 pass2.c
--- pass2.c 4 Feb 2004 17:39:59 -0000 1.6
+++ pass2.c 4 Sep 2004 18:00:08 -0000
@@ -134,6 +134,7 @@
printf("%s: phase 2: dir %d of %ld (%d%%)\n", cdevname,
inpp - inpsort, inplast, (int)((inpp - inpsort) * 100 /
inplast));
+ show_cacheinfo();
got_siginfo = 0;
}
inp = *inpp;
Index: pass3.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass3.c,v
retrieving revision 1.5
diff -u -r1.5 pass3.c
--- pass3.c 4 Feb 2004 17:39:59 -0000 1.5
+++ pass3.c 4 Sep 2004 18:00:11 -0000
@@ -59,6 +59,7 @@
printf("%s: phase 3: dir %ld of %ld (%d%%)\n", cdevname,
inplast - inpindex - 1, inplast,
(int)((inplast - inpindex - 1) * 100 / inplast));
+ show_cacheinfo();
got_siginfo = 0;
}
inp = inpsort[inpindex];
Index: pass4.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass4.c,v
retrieving revision 1.4
diff -u -r1.4 pass4.c
--- pass4.c 28 Sep 2003 14:39:17 -0000 1.4
+++ pass4.c 4 Sep 2004 18:00:13 -0000
@@ -62,6 +62,7 @@
printf("%s: phase 4: cyl group %d of %d (%d%%)\n",
cdevname, cg, sblock.fs_ncg,
cg * 100 / sblock.fs_ncg);
+ show_cacheinfo();
got_siginfo = 0;
}
inumber = cg * sblock.fs_ipg;
Index: pass5.c
===================================================================
RCS file: /cvs/src/sbin/fsck/pass5.c,v
retrieving revision 1.4
diff -u -r1.4 pass5.c
--- pass5.c 28 Sep 2003 14:39:17 -0000 1.4
+++ pass5.c 4 Sep 2004 18:00:17 -0000
@@ -169,6 +169,7 @@
printf("%s: phase 5: cyl group %d of %d (%d%%)\n",
cdevname, c, sblock.fs_ncg,
c * 100 / sblock.fs_ncg);
+ show_cacheinfo();
got_siginfo = 0;
}
getblk(&cgblk, cgtod(fs, c), fs->fs_cgsize);
Index: utilities.c
===================================================================
RCS file: /cvs/src/sbin/fsck/utilities.c,v
retrieving revision 1.6
diff -u -r1.6 utilities.c
--- utilities.c 4 Feb 2004 17:39:59 -0000 1.6
+++ utilities.c 4 Sep 2004 18:29:13 -0000
@@ -46,10 +46,42 @@
#include "fsck.h"
-long diskreads, totalreads; /* Disk cache statistics */
+struct bufarea *bufhead;
+struct bufarea buffree;
+int bufhash_size;
+int bufhash_reap;
+int bufhash_mask;
+int nbuffers;
+long totaliterations; /* Disk cache statistics */
+long totalreads;
+long diskreads;
+long bdiskreads;
+
+#define BUFHASH(blkno) ((blkno + (blkno >> 8) + (blkno >> 15)) & bufhash_mask)
+#define BUFSPERBUCKET 5 /* nominal hash table sizing factor */
+#define MINBUFS 16 /* minimum number of buffers required */
+
+#define B_INUSE 0x0001
+
static void rwerror(char *mesg, ufs_daddr_t blk);
+void
+show_cacheinfo(void)
+{
+ printf("bufcache %ld.%02d%% (%ld/%ld) %ld uncached_rd "
+ "%d/%d used %ld scans\n",
+ (totalreads - bdiskreads) * 100 / totalreads,
+ (totalreads - bdiskreads) * 10000 / totalreads % 100,
+ totalreads - bdiskreads,
+ totalreads,
+ diskreads - bdiskreads,
+ nbuffers - (int)buffree.b_size,
+ nbuffers,
+ totaliterations
+ );
+}
+
int
ftypeok(struct dinode *dp)
{
@@ -133,7 +165,7 @@
void
bufinit(void)
{
- register struct bufarea *bp;
+ struct bufarea *bp;
long bufcnt, i;
char *bufp;
@@ -143,10 +175,32 @@
errx(EEXIT, "cannot allocate buffer pool");
cgblk.b_un.b_buf = bufp;
initbarea(&cgblk);
- bufhead.b_next = bufhead.b_prev = &bufhead;
+
+ /*
+ * How many buffers should we have?
+ */
bufcnt = MAXBUFSPACE / sblock.fs_bsize;
if (bufcnt < MINBUFS)
bufcnt = MINBUFS;
+
+ /*
+ * Create a hash table large enough to hold approximately 10
+ * buffers per entry and initialize the list heads.
+ */
+ bufhash_size = 1;
+ while (bufhash_size < bufcnt / BUFSPERBUCKET)
+ bufhash_size <<= 1;
+ bufhash_mask = bufhash_size - 1;
+ bufhead = calloc(sizeof(struct bufarea), bufhash_size);
+ for (i = 0; i < bufhash_size; ++i)
+ bufhead[i].b_next = bufhead[i].b_prev = &bufhead[i];
+ buffree.b_next = &buffree;
+ buffree.b_prev = &buffree;
+
+ /*
+ * Allocate memory for our buffers and place them on the free
+ * list.
+ */
for (i = 0; i < bufcnt; i++) {
bp = (struct bufarea *)malloc(sizeof(struct bufarea));
bufp = malloc((unsigned int)sblock.fs_bsize);
@@ -156,13 +210,14 @@
errx(EEXIT, "cannot allocate buffer pool");
}
bp->b_un.b_buf = bufp;
- bp->b_prev = &bufhead;
- bp->b_next = bufhead.b_next;
- bufhead.b_next->b_prev = bp;
- bufhead.b_next = bp;
+ bp->b_prev = &buffree;
+ bp->b_next = buffree.b_next;
+ buffree.b_next->b_prev = bp;
+ buffree.b_next = bp;
+ ++buffree.b_size;
initbarea(bp);
}
- bufhead.b_size = i; /* save number of buffers */
+ nbuffers = bufcnt;
}
/*
@@ -171,26 +226,56 @@
struct bufarea *
getdatablk(ufs_daddr_t blkno, long size)
{
- register struct bufarea *bp;
+ struct bufarea *bp;
+ struct bufarea *bh_old;
+ struct bufarea *bh_new;
+ int i;
- for (bp = bufhead.b_next; bp != &bufhead; bp = bp->b_next)
+ bh_old = bh_new = &bufhead[BUFHASH(blkno)];
+
+ for (bp = bh_new->b_next; bp != bh_new; bp = bp->b_next) {
if (bp->b_bno == fsbtodb(&sblock, blkno))
goto foundit;
- for (bp = bufhead.b_prev; bp != &bufhead; bp = bp->b_prev)
- if ((bp->b_flags & B_INUSE) == 0)
- break;
- if (bp == &bufhead)
- errx(EEXIT, "deadlocked buffer pool");
+ ++totaliterations;
+ }
+ if (bh_new->b_size > BUFSPERBUCKET) {
+ for (bp = bh_new->b_prev; bp != bh_new; bp = bp->b_prev) {
+ if ((bp->b_flags & B_INUSE) == 0)
+ goto getit;
+ ++totaliterations;
+ }
+ }
+ if (buffree.b_next != &buffree) {
+ bh_old = &buffree;
+ bp = buffree.b_next;
+ goto getit;
+ }
+ for (i = bufhash_size; i; --i) {
+ bh_old = &bufhead[bufhash_reap];
+ bufhash_reap = (bufhash_reap - 1) & bufhash_mask;
+ if (bh_old->b_size < BUFSPERBUCKET / 4)
+ continue;
+ for (bp = bh_old->b_prev; bp != bh_old; bp = bp->b_prev) {
+ if ((bp->b_flags & B_INUSE) == 0)
+ goto getit;
+ ++totaliterations;
+ }
+ }
+ errx(EEXIT, "deadlocked buffer pool");
+getit:
+ ++bdiskreads;
getblk(bp, blkno, size);
/* fall through */
foundit:
- totalreads++;
- bp->b_prev->b_next = bp->b_next;
+ totalreads++; /* accounting */
+ ++bh_new->b_size;
+ --bh_old->b_size;
+ bp->b_prev->b_next = bp->b_next; /* unlink */
bp->b_next->b_prev = bp->b_prev;
- bp->b_prev = &bufhead;
- bp->b_next = bufhead.b_next;
- bufhead.b_next->b_prev = bp;
- bufhead.b_next = bp;
+ bp->b_prev = bh_new; /* relink */
+ bp->b_next = bh_new->b_next;
+ bh_new->b_next->b_prev = bp;
+ bh_new->b_next = bp;
bp->b_flags |= B_INUSE;
return (bp);
}
@@ -203,7 +288,6 @@
dblk = fsbtodb(&sblock, blk);
if (bp->b_bno != dblk) {
flush(fswritefd, bp);
- diskreads++;
bp->b_errs = bread(fsreadfd, bp->b_un.b_buf, dblk, size);
bp->b_bno = dblk;
bp->b_size = size;
@@ -248,8 +332,9 @@
void
ckfini(int markclean)
{
- register struct bufarea *bp, *nbp;
+ struct bufarea *bh, *bp, *nbp;
int ofsmodified, cnt = 0;
+ int i;
if (fswritefd < 0) {
(void)close(fsreadfd);
@@ -264,15 +349,24 @@
}
flush(fswritefd, &cgblk);
free(cgblk.b_un.b_buf);
- for (bp = bufhead.b_prev; bp && bp != &bufhead; bp = nbp) {
- cnt++;
- flush(fswritefd, bp);
- nbp = bp->b_prev;
- free(bp->b_un.b_buf);
- free((char *)bp);
+
+ for (i = 0; i < bufhash_size; ++i) {
+ bh = &bufhead[i];
+
+ for (bp = bh->b_prev; bp && bp != bh; bp = nbp) {
+ cnt++;
+ flush(fswritefd, bp);
+ nbp = bp->b_prev;
+ free(bp->b_un.b_buf);
+ free((char *)bp);
+ }
+ bh->b_next = bh;
+ bh->b_prev = bh;
+ }
+ if (nbuffers != cnt + buffree.b_size) {
+ errx(EEXIT, "panic: lost %d buffers",
+ nbuffers - cnt + buffree.b_size);
}
- if (bufhead.b_size != cnt)
- errx(EEXIT, "panic: lost %d buffers", bufhead.b_size - cnt);
pbp = pdirbp = (struct bufarea *)0;
if (sblock.fs_clean != markclean) {
sblock.fs_clean = markclean;
@@ -291,8 +385,7 @@
rerun = 1;
}
if (debug)
- printf("cache missed %ld of %ld (%d%%)\n", diskreads,
- totalreads, (int)(diskreads * 100 / totalreads));
+ show_cacheinfo();
(void)close(fsreadfd);
(void)close(fswritefd);
}
@@ -304,6 +397,7 @@
int i, errs;
off_t offset;
+ diskreads++;
offset = blk;
offset *= dev_bsize;
if (lseek(fd, offset, 0) < 0)
More information about the freebsd-current
mailing list