ZFS: data sometimes not getting correctly flushed to disk (possible mmap/write issue)?

Richard Todd rmtodd at ichotolot.servalan.com
Mon Feb 25 02:15:16 UTC 2008


Further notes on the "data not getting correctly flushed to disk" issue with 
mmap and ZFS:

I've managed to construct a simpler test case, one that doesn't require
py-mutagen or any other ports installed.  See the attached shar file for
details; it contains a C++ program "gen4.cpp" and a shell script "test4.sh".
gen4.cpp creates a file, writes a little bit of data to it, and does some
mmap()ing to shuffle data about (see the "insert_bytes" and "delete_bytes"
functions, which are basically C reimplementations of the Python functions
of the same name from the py-mutagen package.)  Test4.sh is a shell script
which basically does as my shell script in my previous posting did:
      1) Run the "gen4" test program, creating a file on a ZFS partition
      2) MD5 that file
      3) Copy that file somewhere on a UFS filesystem, call the copy "before".
      4) Unmount the ZFS filesystem and remount it.
      5) Re-MD5 the test file and copy that file to the UFS filesystem as a file "after"
      6) Compare the "before" and "after" files and list which bytes differ.

The script has 4 configurable parameters which you can edit if you want to 
try and run this test yourself:
 ZFSFS name of your ZFS filesystem to test (in my case, "u1")
 ZFSMP mount point of your ZFS filesystem to test (in my case, "/u1")
 ZFSDIR a writable dir somewhere under ZFSMP (in my case, "/u1/tmp")
 UFSDIR a writable dir somewhere on a UFS filesystem (in my case, "/var/tmp").

The script takes one parameter, which I call S1, which is the number
of bytes the gen4 program tries to insert in the test file with the
"insert_bytes" function.  Only certain values of this parameter
trigger the bug, basically values of the form

    n*4096+3074+i (n integer, i in [0,510])

Also note that when the mmap bug hits, the data that doesn't get correctly
to disk is *always* the last part of the file, and always is at most 1 sector's
worth (512 bytes) and always starts at a page boundary (multiple of 4K).

Script started on Sun Feb 24 19:26:23 2008
1 blo-rakane ~/bug[ 7:26PM] Z% ./test4.sh 3073
MD5 (/u1/tmp/foo1) = fe750b17b666653e22e2a9bb240e6bd7
-rw-r--r--  1 rmtodd  wheel  4096 Feb 24 19:26 /u1/tmp/foo1
MD5 (/u1/tmp/foo1) = fe750b17b666653e22e2a9bb240e6bd7
-rw-r--r--  1 rmtodd  wheel  4096 Feb 24 19:26 /u1/tmp/foo1
2 blo-rakane ~/bug[ 7:26PM] Z% ./test4.sh 3074
MD5 (/u1/tmp/foo1) = 7ab56c42278ddcffd0269523fe7667a0
-rw-r--r--  1 rmtodd  wheel  4097 Feb 24 19:26 /u1/tmp/foo1
MD5 (/u1/tmp/foo1) = ad5e2ab0f485db6c3737b5b1bbc7c9a0
-rw-r--r--  1 rmtodd  wheel  4097 Feb 24 19:26 /u1/tmp/foo1
  4097 377   0
3 blo-rakane ~/bug[ 7:26PM] Z% ./test4.sh 3075
MD5 (/u1/tmp/foo1) = 6dc1c3b940601a4af75780fda4ab1395
-rw-r--r--  1 rmtodd  wheel  4098 Feb 24 19:26 /u1/tmp/foo1
MD5 (/u1/tmp/foo1) = ebf119d60033e99ff911d29faceef6f4
-rw-r--r--  1 rmtodd  wheel  4098 Feb 24 19:26 /u1/tmp/foo1
  4097 376   0
  4098 377   0
4 blo-rakane ~/bug[ 7:26PM] Z% ./test4.sh 3584
MD5 (/u1/tmp/foo1) = 6348eaa9cdb8e7ed99a6e81b378a5f2a
-rw-r--r--  1 rmtodd  wheel  4607 Feb 24 19:26 /u1/tmp/foo1
MD5 (/u1/tmp/foo1) = c1a464f57d68c903c6bacb0f8652c35c
-rw-r--r--  1 rmtodd  wheel  4607 Feb 24 19:26 /u1/tmp/foo1
  4097   1   0
  4098   2   0
  4099   3   0
[... intermediate lines deleted, you see the pattern ...]
  4607 377   0
5 blo-rakane ~/bug[ 7:26PM] Z% ./test4.sh 4 3585
MD5 (/u1/tmp/foo1) = 53ece827167e437de736eceb10106cdb
-rw-r--r--  1 rmtodd  wheel  4608 Feb 24 19:26 /u1/tmp/foo1
MD5 (/u1/tmp/foo1) = 53ece827167e437de736eceb10106cdb
-rw-r--r--  1 rmtodd  wheel  4608 Feb 24 19:27 /u1/tmp/foo1
6 blo-rakane ~/bug[ 7:27PM] Z% ./test4.sh 7170
MD5 (/u1/tmp/foo1) = 9c18df094aab90b5c0bb2fa89de0dc54
-rw-r--r--  1 rmtodd  wheel  8193 Feb 24 19:27 /u1/tmp/foo1
MD5 (/u1/tmp/foo1) = d353963ef7edf672fd6a75159fa513b9
-rw-r--r--  1 rmtodd  wheel  8193 Feb 24 19:27 /u1/tmp/foo1
  8193 377   0
7 blo-rakane ~/bug[ 7:27PM] Z% ./test4.sh 7171
MD5 (/u1/tmp/foo1) = 928b747af654170711e24c6eeb042cd5
-rw-r--r--  1 rmtodd  wheel  8194 Feb 24 19:27 /u1/tmp/foo1
MD5 (/u1/tmp/foo1) = 845114ca58a19f850a3e7b9797e4c637
-rw-r--r--  1 rmtodd  wheel  8194 Feb 24 19:27 /u1/tmp/foo1
  8193 376   0
  8194 377   0
8 blo-rakane ~/bug[ 7:27PM] Z% exit
Script done on Sun Feb 24 19:27:45 2008

Also, the ZFS panic (with possible FS corruption) I mentioned last
time doesn't actually seem to be connected with this mmap() bug, but
is apparently a different (unrelated) bug.  I say this because, while
I can trigger the panic by repeatedly calling the "test4.sh" script a
few hundred times, repeatedly creating files and unmounting and
remounting the fs, the panic happens even if I pick a value of the
parameter that *doesn't* trigger the mmap consistency issue.  So it 
looks like the panic issue is unrelated, and only came up accidentally
when developing my test case for the mmap issue.  (Must be my lucky day.)

So anyway, that's what I've got so far.  I may try to put some debugging
printfs to see if I can figure out what's going on with the mmap
issue.  (I'm not sure I follow this code at all, but it seems to me that 
mappedwrite() in zfs_vnops.c is a likely place to start looking.)

# This is a shell archive.  Save it in a file, remove anything before
# this line, and then unpack it by entering "sh file".  Note, it may
# create directories; files and directories will be owned by you and
# have default permissions.
#
# This archive contains:
#
#	gen4.cpp
#	test4.sh
#
echo x - gen4.cpp
sed 's/^X//' >gen4.cpp << 'END-of-gen4.cpp'
X/*
X** Program to create a file of data and do some mmap()ing writes to it. 
X*/
X
X#include <stdio.h>
X#include <stdlib.h>
X#include <stdint.h>
X#include <unistd.h>
X#include <string.h>
X#include <fcntl.h>
X#include <sys/types.h>
X#include <sys/mman.h>
X
X/* Insert size bytes (zeros) into file at offset */
Xvoid
Xinsert_bytes(int fd, unsigned int size, unsigned int offset)
X{
X    unsigned int filesize = lseek(fd, (off_t)0, SEEK_END);
X    unsigned int movesize = filesize - offset;
X    
X    char *buf = new char[size];
X    for (unsigned int i = 0 ; i < size ; ++i) {
X	buf[i] = '\0';
X    }
X    write(fd, buf, size);
X
X    char *addr = (char *) mmap((void *)NULL, filesize+size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, offset);
X    memmove(addr+size, addr, movesize);
X    if (munmap(addr, filesize+size)) {
X	perror("munmap");
X    }
X}
X
X/* Delete size bytes (zeros) from file at offset */
Xvoid
Xdelete_bytes(int fd, unsigned int size, unsigned int offset)
X{
X    unsigned int filesize = lseek(fd, (off_t)0, SEEK_END);
X    unsigned int movesize = filesize - offset - size;
X
X    char *addr = (char *) mmap((void *)NULL, filesize, PROT_READ|PROT_WRITE, MAP_SHARED, fd, offset);
X    memmove(addr, addr+size, movesize);
X    if (munmap(addr, filesize)) {
X	perror("munmap");
X    }
X    ftruncate(fd, filesize - size);
X}
X
X    
X
X/* Usage: gen1 filename number-of-seconds */
Xint
Xmain(int argc, char **argv)
X{
X    char *fname = argv[1];
X    FILE *f = fopen(fname, "w");
X
X    unsigned int size = 1024;
X
X    for (unsigned int i = 0 ; i < size ; ++i) {
X	char c = i & 0xff;
X	fwrite((void *)&c, 1, 1, f);
X    }
X
X    fclose(f);
X
X    int fd = open(fname, O_RDWR);
X
X    unsigned int S1 = atoi(argv[2]);
X#define O1 0
X    insert_bytes(fd, S1, O1);
X    
X    (void) lseek(fd, (off_t) O1, SEEK_SET);
X
X#define S2 1
X#define O2 (O1+S1)
X
X    delete_bytes(fd, S2, O2);
X    
X    exit(0);
X}
END-of-gen4.cpp
echo x - test4.sh
sed 's/^X//' >test4.sh << 'END-of-test4.sh'
X#!/bin/sh
X# Set the following variables appropriately
X# ZFSFS name of your ZFS filesystem to test
X# ZFSMP mount point of your ZFS filesystem to test
X# ZFSDIR a writable dir somewhere under ZFSMP
X# UFSDIR a writable dir somewhere on a UFS filesystem
X# Script takes one argument, S1, a number of bytes.  
X
XZFSFS=u1
XZFSMP=/u1
XZFSDIR=/u1/tmp
XUFSDIR=/var/tmp
X
XNAME=$ZFSDIR/foo1
XS1=$1
Xrm $NAME
X./gen4 $NAME $S1
Xmd5 $NAME
Xcp $NAME $UFSDIR/before
Xls -l $NAME
Xsudo umount $ZFSMP
Xsudo mount -t zfs $ZFSFS $ZFSMP
Xmd5 $NAME
Xls -l $NAME
Xcp $NAME $UFSDIR/after
Xcmp -l $UFSDIR/before $UFSDIR/after
END-of-test4.sh
exit




More information about the freebsd-current mailing list