fsck bug in replaying partial frag truncate journal on UFS SU+J?
takehara.mikihito at jp.panasonic.com
takehara.mikihito at jp.panasonic.com
Thu Jun 19 02:55:13 UTC 2014
Hello,
I kept analysis of the problem which I reported previously and found how to fix this problem.
(patch in my previou report is wrong.).
My understanding is there is a rule that blkno of JOP_FREEBLK or JOP_NEWBLK must be the first
position in UFS block for each inode, for fsck_ffs's suj.c seems to support only such a case. But
there is a case that kernel does create a JOP_FREEBLK journal which is not in this rule. The case is
small file's partial frag truncation.
Bellow I attached a patch to fix this problem.
Adding new argument frags_offset for newfreework() to adjust for calling newjfreeblk(). New argument
frags_offset is used by only softdep_journal_freeblocks()
diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
index 4d0442b..2f2f063 100644
--- a/sys/ufs/ffs/ffs_softdep.c
+++ b/sys/ufs/ffs/ffs_softdep.c
@@ -1012,7 +1012,7 @@ static void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
static struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
ufs2_daddr_t, long, ufs_lbn_t);
static struct freework *newfreework(struct ufsmount *, struct freeblks *,
- struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
+ struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int, int);
static int jwait(struct worklist *, int);
static struct inodedep *inodedep_lookup_ip(struct inode *);
static int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
@@ -3996,7 +3996,7 @@ free_freedep(freedep)
* is visible outside of softdep_setup_freeblocks().
*/
static struct freework *
-newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
+newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal, frags_offset)
struct ufsmount *ump;
struct freeblks *freeblks;
struct freework *parent;
@@ -4005,6 +4005,7 @@ newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
int frags;
int off;
int journal;
+ int frags_offset;
{
struct freework *freework;
@@ -4022,7 +4023,7 @@ newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
? 0 : NINDIR(ump->um_fs) + 1;
freework->fw_start = freework->fw_off = off;
if (journal)
- newjfreeblk(freeblks, lbn, nb, frags);
+ newjfreeblk(freeblks, lbn, nb - frags_offset, frags + frags_offset);
if (parent == NULL) {
ACQUIRE_LOCK(&lk);
WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
@@ -5958,7 +5959,7 @@ setup_freedirect(freeblks, ip, i, needj)
DIP_SET(ip, i_db[i], 0);
frags = sblksize(ip->i_fs, ip->i_size, i);
frags = numfrags(ip->i_fs, frags);
- newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
+ newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj, 0);
}
static inline void
@@ -5977,7 +5978,7 @@ setup_freeext(freeblks, ip, i, needj)
ip->i_din2->di_extb[i] = 0;
frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
frags = numfrags(ip->i_fs, frags);
- newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
+ newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj, 0);
}
static inline void
@@ -5995,7 +5996,7 @@ setup_freeindir(freeblks, ip, i, lbn, needj)
return;
DIP_SET(ip, i_ib[i], 0);
newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
- 0, needj);
+ 0, needj, 0);
}
static inline struct freeblks *
@@ -6111,7 +6112,7 @@ setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
if (off + 1 == NINDIR(ip->i_fs))
goto nowork;
freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
- 0);
+ 0, 0);
/*
* Link the freework into the indirdep. This will prevent any new
* allocations from proceeding until we are finished with the
@@ -6437,7 +6438,8 @@ softdep_journal_freeblocks(ip, cred, length, flags)
oldfrags = numfrags(ip->i_fs, oldfrags);
blkno += numfrags(ip->i_fs, frags);
newfreework(ip->i_ump, freeblks, NULL, lastlbn,
- blkno, oldfrags, 0, needj);
+ blkno, oldfrags, 0, needj,
+ numfrags(ip->i_fs, frags));
} else if (blkno == 0)
allocblock = 1;
}
@@ -7737,7 +7739,7 @@ handle_workitem_freeblocks(freeblks, flags)
FREE_LOCK(&lk);
freework = newfreework(ump, freeblks, NULL,
aip->ai_lbn, aip->ai_newblkno,
- ump->um_fs->fs_frag, 0, 0);
+ ump->um_fs->fs_frag, 0, 0, 0);
ACQUIRE_LOCK(&lk);
}
newblk = WK_NEWBLK(wk);
@@ -8014,7 +8016,7 @@ indir_trunc(freework, dbn, lbn)
nlbn = (lbn + 1) - (i * lbnadd);
if (needj != 0) {
nfreework = newfreework(ump, freeblks, freework,
- nlbn, nb, fs->fs_frag, 0, 0);
+ nlbn, nb, fs->fs_frag, 0, 0, 0);
freedeps++;
}
indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
> Hello,
>
> I think fsck_ffs has a bug in replaying partial frag truncate journal on UFS SU+J.
> Bellow I tested about the issue.
>
> I tested blocksize==4KB, fragsize=512byte UFS SU+J. But I think these parameters are not related to
> this issue.
> 1) Preparing 4096byte(1block) test file.
> 2) Use "truncate" to shorten filesize to 3584byte(7frags).
> 3) Shutdown without unmount after journal is written and before inode size ufs2_dinode is written.
> 4) Run fsck_ffs with journal.
> 5) Mount again and remove test file. Then I face panic.
> "panic: ffs_blkfree_cg: freeing free block"
>
> This seems to be caused by fsck_ffs to replay JOP_FREEBLK whose "blkno" is not block-aligned.
> The above case 2), JOP_FREEBLK journal is like this:
> FREEBLK ino=5, blkno=1727, lbn=0, frags=1, oldfrags=0 <---(a)
> fsck_ffs handles JOP_NEWBLK almost same as JOP_FREEBLK. But my understanding is that in case of
> JOP_NEWBLK kernel always create block-aligned blkno. So this issue is only in case of JOP_FREEBLK.
>
> To analyze this issue, I also tested that using the above test file 3584byte(7frags) and write
> 512byte with append to largen to 4096byte(1block). In this case JOP_NEWBLK journal is like this:
> JOP_NEWBLK ino=5, blkno=1720, lbn=0, frags=8, oldfrags=7 <---(b)
>
> I think the bug is in fsck_ffs suj.c's arround blk_check(). My understanding is that blk_check()
> cannot handle non-block-aligned blkno so there is a little trick in blk_build() bellow.
> ----------------------------------------------------------------------------------------
> /*
> * Rewrite the record using oldfrags to indicate the offset into
> * the block. Leave jb_frags as the actual allocated count.
> */
> blkrec->jb_blkno -= frag;
> blkrec->jb_oldfrags = frag;
> ----------------------------------------------------------------------------------------
> By this trick, (a) is modified like:
> JOP_FREEBLK ino=5, blkno=1720, lbn=0, frags=1, oldfrags=7 <---(a')
> and (b) is modified like:
> JOP_NEWBLK ino=5, blkno=1720, lbn=0, frags=8, oldfrags=0 <---(b')
> But blk_check() cannot handle the case "oldfrags!=0". If "oldfrags!=0", blk_check()'s "isat" comes
> to be 0 even though the blk is present. So (a) should be modified same as (b')
> The following is my patch to fix like this. According to my test, this patch works fine.
> =======================================================================
> diff --git a/sbin/fsck_ffs/suj.c b/sbin/fsck_ffs/suj.c
> index e21ffc6..2512522 100644
> --- a/sbin/fsck_ffs/suj.c
> +++ b/sbin/fsck_ffs/suj.c
> @@ -2503,11 +2503,11 @@ blk_build(struct jblkrec *blkrec)
> frag = fragnum(fs, blkrec->jb_blkno);
> sblk = blk_lookup(blk, 1);
> /*
> - * Rewrite the record using oldfrags to indicate the offset into
> - * the block. Leave jb_frags as the actual allocated count.
> + * Rewrite the record to indicate the offset into the block.
> */
> blkrec->jb_blkno -= frag;
> - blkrec->jb_oldfrags = frag;
> + blkrec->jb_frags += frag;
> + blkrec->jb_oldfrags = 0;
> if (blkrec->jb_oldfrags + blkrec->jb_frags > fs->fs_frag)
> err_suj("Invalid fragment count %d oldfrags %d\n",
> blkrec->jb_frags, frag);
> =======================================================================
>
> I think we can fix this by changing kernel code not to create non-block-aligned JOP_FREEBLK.
> But I also think it is better to change fsck by considering compatibility.
>
>
> _______________________________________________
> freebsd-current at freebsd.org mailing list
> http://lists.freebsd.org/mailman/listinfo/freebsd-current
> To unsubscribe, send any mail to "freebsd-current-unsubscribe at freebsd.org"
More information about the freebsd-current
mailing list