kern/86944: [PATCH] When I use FreeBSD with NFS client,
close(2) or a fsync(2) system call gives back ENOLCK.
Kouji Ito
kouji at cty-net.ne.jp
Wed Oct 5 06:20:18 PDT 2005
>Number: 86944
>Category: kern
>Synopsis: [PATCH] When I use FreeBSD with NFS client, close(2) or a fsync(2) system call gives back ENOLCK.
>Confidential: no
>Severity: serious
>Priority: medium
>Responsible: freebsd-bugs
>State: open
>Quarter:
>Keywords:
>Date-Required:
>Class: sw-bug
>Submitter-Id: current-users
>Arrival-Date: Wed Oct 05 13:20:17 GMT 2005
>Closed-Date:
>Last-Modified:
>Originator: Kouji Ito <kouji at cty-net.ne.jp>
>Release: 5.1-RELEASE, 5.2.1-RELEASE, 5.4-RELEASE
>Organization:
<organization of PR author (multiple lines)>
>Environment:
NFS client : FreeBSD(i386) 5.1-RELEASE ,5.2.1-RELEASE, 5.4-RELEASE
NFS server : NetBSD/i386 1.6.1
>Description:
When I use FreeBSD as an NFS client, the close(2) or fsync(2) system call returns ENOLCK.
However, the data itself appears to be read and written correctly.
This could have a fatal effect when PostgreSQL is run on an NFS client machine.
I think it is caused by the "error" variable not being cleared properly.
>How-To-Repeat:
Please refer to FreeBSD-5.2.1-RELEASE.log and FreeBSD-5.4-RELEASE.log.
I have not included a log for FreeBSD 5.1-RELEASE, but the result there is
the same as in FreeBSD-5.2.1-RELEASE.log.
>Fix:
--- nfs_vnops.1.220.2.1.patch begins here ---
*** nfs_vnops.c Wed Oct 5 13:49:49 2005
--- nfs_vnops.c.new Wed Oct 5 13:48:46 2005
***************
*** 2823,2830 ****
splx(s);
if (error == 0)
panic("nfs_fsync: inconsistent lock");
! if (error == ENOLCK)
goto loop;
error = nfs_sigintr(nmp, NULL, td);
if (error)
goto done;
--- 2823,2832 ----
splx(s);
if (error == 0)
panic("nfs_fsync: inconsistent lock");
! if (error == ENOLCK) {
! error = 0; /* clear error */
goto loop;
+ }
error = nfs_sigintr(nmp, NULL, td);
if (error)
goto done;
--- nfs_vnops.1.220.2.1.patch ends here ---
--- FreeBSD-5.2.1-RELEASE.log begins here ---
TEST1 FreeBSD XXX 5.2.1-RELEASE FreeBSD 5.2.1-RELEASE
------------------------------------------------------------------------------------------------
(1) Prepare a kernel in which nfs_vnops.c has been changed as shown below (optional).
(2) Prepare a NetBSD/i386(1.6.1) machine as a NFS server.
(3) Do a NFS mount of a NetBSD machine from a FreeBSD machine.
(mount_nfs -T -3)
(4) Run a "write_data" program on the file system that did a NFS mount.
(please make "data" directory under current directory.)
# ./write_data
file name = ./data/30034
(5) Run a "open_read_close" program on the file system that did a NFS mount.
The close(2) system call returns -1 (an error), and errno is set to ENOLCK.
The data that the program read is correct.
The following messages are output if you have changed nfs_vnops.c.
# ./open_read_close ./data/30034
ncount = 0
close() return -1 errno=77
00000-0000h :00 00 00 00
(6) Run a "write_data" program on the file system that did a NFS mount.
(please make "data" directory under current directory.)
# ./write_data
file name = ./data/30980
(7) Run a "open_write_close" program on the file system that did a NFS mount.
Wait for several minutes.
The close(2) system call returns -1 (an error), and errno is set to ENOLCK.
The data that the program wrote is correct.
The following messages are output if you have changed nfs_vnops.c.
# ./open_write_close ./data/30980
ncount = 0
:
ncount = 3997
ncount = 3998
close() return -1 errno=77
00000-0000h :9e 9e 9e 9e
dellgx50# hexdump ./data/30980
0000000 9e9e 9e9e 0000 0000 0000 0000 0000 0000
0000010 0000 0000 0000 0000 0000 0000 0000 0000
*
0014000
------------------------------------------------------------------------------------------------
/var/log/messages
Oct 5 13:15:57 YYY kernel: ENOLCK pid=29491, name=open_read_close
Oct 5 13:15:57 YYY kernel: PASS1: error = 77 , passone=0 pid=29491, name=open_read_close
Oct 5 13:15:57 YYY kernel: waitfor=1 MNT_WAIT=1 vp->v_numoutput=0 pid=29491, name=open_read_close
Oct 5 13:15:57 YYY kernel: PASS2: error = 77 pid=29491, name=open_read_close
Oct 5 14:01:11 YYY kernel: ENOLCK pid=30649, name=open_write_close
Oct 5 14:01:11 YYY kernel: PASS1: error = 77 , passone=0 pid=30649, name=open_write_close
Oct 5 14:01:11 YYY kernel: waitfor=1 MNT_WAIT=1 vp->v_numoutput=0 pid=30649, name=open_write_close
Oct 5 14:01:11 YYY kernel: PASS2: error = 77 pid=30649, name=open_write_close
------------------------------------------------------------------------------------------------
YYY# cat write_data.c
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <syslog.h>
#define IO_SIZE (0x2000)
#define PARA 1
/*
 * Reproduction helper (write_data).
 *
 * Creates ./data/<pid> below the current directory, fills it with
 * IO_SIZE * 10 zero bytes, and then rewrites the IO_SIZE bytes at offset
 * IO_SIZE * 5 followed by fsync() in an endless loop.
 *
 * The loop ends only when lseek(), write() or fsync() fails.  Every failure
 * is reported through syslog (LOG_LOCAL3|LOG_INFO) together with errno, and
 * the file is then closed.  The program calls exit(-1) when the file cannot
 * be created and exit(0) otherwise.  argc/argv are not used.
 */
int main(int argc, char *argv[])
{
    int nrc;
    int nfd;
    int nname = 0;
    char aname[64];
    int n;
    int m = 0;                  /* only echoed in the log messages; was read uninitialised */
    static char abuff[IO_SIZE]; /* static storage, so the payload is all 0x00 */
    unsigned int nnn = 0;

    nname = getpid();
    syslog(LOG_LOCAL3|LOG_INFO, "nnn = %u PID = %d\n", nnn, nname);

    /*
     * create file.
     * filename = ./data/PID
     * file size = IO_SIZE * 10
     * data = all 0x00
     */
    snprintf(aname, sizeof(aname), "./data/%05d", nname);
    nfd = open(aname, O_CREAT|O_RDWR|O_TRUNC, 0666);
    if (-1 == nfd) {
        syslog(LOG_LOCAL3|LOG_INFO, "open(%s) error = %d\n", aname, errno);
        exit(-1);
    }
    printf("file name = %s\n", aname);
    for (n = 0; n < 10; ++n) {
        nrc = write(nfd, abuff, IO_SIZE);
        if (IO_SIZE != nrc) {
            /* A short or failed write here used to go unnoticed. */
            syslog(LOG_LOCAL3|LOG_INFO, "write(%s) n=%d m=%d error = %d\n",
                aname, n, m, errno);
            goto GOGO;
        }
    }

    /*
     * Rewrite the 0x00 block covering [IO_SIZE * 5, IO_SIZE * 6) and
     * fsync() it, forever.
     */
    while (1) {
        nrc = lseek(nfd, IO_SIZE * 5, SEEK_SET);
        if (-1 == nrc) {
            syslog(LOG_LOCAL3|LOG_INFO, "lseek(%s) n=%d m=%d error = %d\n",
                aname, n, m, errno);
            goto GOGO;
        }
        nrc = write(nfd, abuff, IO_SIZE);
        if (IO_SIZE != nrc) {
            syslog(LOG_LOCAL3|LOG_INFO, "write(%s) n=%d m=%d error = %d\n",
                aname, n, m, errno);
            goto GOGO;
        }
        nrc = fsync(nfd);
        if (-1 == nrc) {
            syslog(LOG_LOCAL3|LOG_INFO, "fsync(%s) n=%d m=%d error = %d\n",
                aname, n, m, errno);
            goto GOGO;
        }
    }

GOGO:
    nrc = close(nfd);
    if (-1 == nrc) {
        syslog(LOG_LOCAL3|LOG_INFO, "close(%s) n=%d m=%d error = %d\n",
            aname, n, m, errno);
    }
    exit(0);
}
------------------------------------------------------------------------------------------------
YYY# cat open_read_close.c
#include <stdio.h>
#include <fcntl.h>
#include <syslog.h>
#include <errno.h>
#include <sys/types.h>
extern int errno;
/*
 * Hex-dump print_size bytes starting at p to stdout, 16 bytes per row.
 *
 * Each row starts with "<decimal offset>-<hex offset>h :"; every byte is
 * printed as two hex digits followed by a space, except the 8th byte of a
 * row, which is followed by '-'.  When a new row starts, the printable
 * rendering of the previous row (non-printable bytes shown as '.') is
 * emitted first.  The rendering of the final row is not printed; the dump
 * simply ends with a newline.
 *
 * Returns 0.  (The function is declared int but previously fell off the end
 * without returning a value.)
 */
#ifdef __STDC__
int printmem(void *p,int print_size)
#else
int printmem(p,print_size)
void *p;
int print_size;
#endif
{
    int num;
    unsigned char *up = p;
    /* Printable copy of the current row; buff[16] is never written and stays '\0'. */
    static unsigned char buff[17];

    for (num = 0; num < print_size; ++num) {
        int col = num % 16;

        if (col == 0) {
            if (num == 0)
                fprintf(stdout, "%05d-%04xh :", num, num);
            else
                fprintf(stdout, "%s\n%05d-%04xh :", buff, num, num);
        }

        buff[col] = up[num];
        if (col == 7)
            fprintf(stdout, "%02x-", buff[col]);
        else
            fprintf(stdout, "%02x ", buff[col]);

        /* Keep only printable ASCII for the text column. */
        if (buff[col] < ' ' || buff[col] > '~')
            buff[col] = '.';
    }
    fprintf(stdout, "\n");
    return 0;
}
/*
 * Reproduction helper (open_read_close).
 *
 * Usage: open_read_close <file>
 *
 * Endlessly opens <file> read/write, reads its first sizeof(abuff) bytes and
 * closes it again, printing the iteration counter before each round.  The
 * loop stops on the first failure:
 *   exit(-1)  no file argument given, or open() failed
 *   exit(-2)  read() did not return the full buffer
 *   exit(-3)  close() failed; errno and the bytes just read are printed
 */
int main(int argc, char *argv[])
{
    int nfd;
    char abuff[4];
    int nrc;
    int ncount = 0;

    /* Without this check argv[1] would be NULL and passed straight to open(). */
    if (argc < 2) {
        printf("usage: open_read_close file\n");
        exit(-1);
    }

    while (1) {
        printf("ncount = %d\n", ncount);
        nfd = open(argv[1], O_RDWR);
        if (-1 == nfd) {
            printf("open error\n");
            exit(-1);
        }

        /* Pre-fill with 0xff so stale buffer contents are distinguishable from file data. */
        memset(abuff, 0xff, sizeof(abuff));
        nrc = read(nfd, abuff, sizeof(abuff));
        if (nrc != (int)sizeof(abuff)) {
            printf("read() return %d errno=%d\n", nrc, errno);
            exit(-2);
        }

        nrc = close(nfd);
        if (-1 == nrc) {
            printf("close() return %d errno=%d\n", nrc, errno);
            printmem(abuff, sizeof(abuff));
            exit(-3);
        }
        ++ncount;
    }
}
------------------------------------------------------------------------------------------------
YYY# cat open_write_close.c
#include <stdio.h>
#include <fcntl.h>
#include <syslog.h>
#include <errno.h>
#include <sys/types.h>
extern int errno;
/*
 * Hex-dump print_size bytes starting at p to stdout, 16 bytes per row.
 *
 * Each row starts with "<decimal offset>-<hex offset>h :"; every byte is
 * printed as two hex digits followed by a space, except the 8th byte of a
 * row, which is followed by '-'.  When a new row starts, the printable
 * rendering of the previous row (non-printable bytes shown as '.') is
 * emitted first.  The rendering of the final row is not printed; the dump
 * simply ends with a newline.
 *
 * Returns 0.  (The function is declared int but previously fell off the end
 * without returning a value.)
 */
#ifdef __STDC__
int printmem(void *p,int print_size)
#else
int printmem(p,print_size)
void *p;
int print_size;
#endif
{
    int num;
    unsigned char *up = p;
    /* Printable copy of the current row; buff[16] is never written and stays '\0'. */
    static unsigned char buff[17];

    for (num = 0; num < print_size; ++num) {
        int col = num % 16;

        if (col == 0) {
            if (num == 0)
                fprintf(stdout, "%05d-%04xh :", num, num);
            else
                fprintf(stdout, "%s\n%05d-%04xh :", buff, num, num);
        }

        buff[col] = up[num];
        if (col == 7)
            fprintf(stdout, "%02x-", buff[col]);
        else
            fprintf(stdout, "%02x ", buff[col]);

        /* Keep only printable ASCII for the text column. */
        if (buff[col] < ' ' || buff[col] > '~')
            buff[col] = '.';
    }
    fprintf(stdout, "\n");
    return 0;
}
/*
 * Reproduction helper (open_write_close).
 *
 * Usage: open_write_close <file>
 *
 * Endlessly opens <file> read/write, overwrites its first sizeof(abuff)
 * bytes with the low byte of the iteration counter and closes it again,
 * printing the counter before each round.  The loop stops on the first
 * failure:
 *   exit(-1)  no file argument given, or open() failed
 *   exit(-2)  write() did not write the full buffer
 *   exit(-3)  close() failed; errno and the bytes just written are printed
 */
int main(int argc, char *argv[])
{
    int nfd;
    char abuff[4];
    int nrc;
    int ncount = 0;

    /* Without this check argv[1] would be NULL and passed straight to open(). */
    if (argc < 2) {
        printf("usage: open_write_close file\n");
        exit(-1);
    }

    while (1) {
        printf("ncount = %d\n", ncount);
        nfd = open(argv[1], O_RDWR);
        if (-1 == nfd) {
            printf("open error\n");
            exit(-1);
        }

        /* memset() keeps only the low 8 bits of ncount as the fill byte. */
        memset(abuff, ncount, sizeof(abuff));
        nrc = write(nfd, abuff, sizeof(abuff));
        if (nrc != (int)sizeof(abuff)) {
            /* The failing call is write(); the message used to say read(). */
            printf("write() return %d errno=%d\n", nrc, errno);
            exit(-2);
        }

        nrc = close(nfd);
        if (-1 == nrc) {
            printf("close() return %d errno=%d\n", nrc, errno);
            printmem(abuff, sizeof(abuff));
            exit(-3);
        }
        ++ncount;
    }
}
------------------------------------------------------------------------------------------------
This is the source with the debugging statements added (nfs_vnops.c -> nfs_flush()).
#define NFS_ENOLCK
__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vnops.c,v 1.215 2003/11/14 20:54:09 alfred Exp $");
/*
* Flush all the blocks associated with a vnode.
* Walk through the buffer pool and push any dirty pages
* associated with the vnode.
*
* Parameters, as used by the code below:
*   vp      - vnode whose dirty buffer list (vp->v_dirtyblkhd) is walked.
*   cred    - not referenced anywhere in this function body.
*   waitfor - when equal to MNT_WAIT, the write loop sleeps on buffers it
*             cannot lock immediately and the function waits until
*             vp->v_numoutput reaches zero.
*   td      - thread handed to nfs_commit() and nfs_sigintr().
*   commit  - when zero, passone is cleared and the commit pass is skipped.
*
* Returns the final value of the local "error": EINTR when nfs_sigintr()
* returns non-zero, np->n_error when NWRITEERR is set on the nfsnode, and
* otherwise whatever the last BUF_TIMELOCK()/msleep() call left behind
* (0 if neither was ever called).
*
* The blocks guarded by NFS_ENOLCK are debugging printf()s only.
*/
static int
nfs_flush(struct vnode *vp, struct ucred *cred, int waitfor, struct thread *td,
int commit)
{
struct nfsnode *np = VTONFS(vp);
struct buf *bp;
int i;
struct buf *nbp;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
int passone = 1;
u_quad_t off, endoff, toff;
struct ucred* wcred = NULL;
struct buf **bvec = NULL;
#ifndef NFS_COMMITBVECSIZ
#define NFS_COMMITBVECSIZ 20
#endif
struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
int bvecsize = 0, bveccount;
/* Sleeps are made interruptible only when the mount has NFSMNT_INT set. */
if (nmp->nm_flag & NFSMNT_INT)
slpflag = PCATCH;
if (!commit)
passone = 0;
/*
* A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
* server, but has not been committed to stable storage on the server
* yet. On the first pass, the byte range is worked out and the commit
* rpc is done. On the second pass, nfs_writebp() is called to do the
* job.
*/
again:
/* Reset the commit range; off starts at the maximum so any toff is smaller. */
off = (u_quad_t)-1;
endoff = 0;
bvecpos = 0;
if (NFS_ISV3(vp) && commit) {
s = splbio();
/* Drop a heap vector left over from a previous trip through "again". */
if (bvec != NULL && bvec != bvec_on_stack)
free(bvec, M_TEMP);
/*
* Count up how many buffers waiting for a commit.
*/
bveccount = 0;
VI_LOCK(vp);
for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
nbp = TAILQ_NEXT(bp, b_vnbufs);
if (BUF_REFCNT(bp) == 0 &&
(bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
== (B_DELWRI | B_NEEDCOMMIT))
bveccount++;
}
/*
* Allocate space to remember the list of bufs to commit. It is
* important to use M_NOWAIT here to avoid a race with nfs_write.
* If we can't get memory (for whatever reason), we will end up
* committing the buffers one-by-one in the loop below.
*/
if (bveccount > NFS_COMMITBVECSIZ) {
/*
* Release the vnode interlock to avoid a lock
* order reversal.
*/
VI_UNLOCK(vp);
bvec = (struct buf **)
malloc(bveccount * sizeof(struct buf *),
M_TEMP, M_NOWAIT);
VI_LOCK(vp);
if (bvec == NULL) {
bvec = bvec_on_stack;
bvecsize = NFS_COMMITBVECSIZ;
} else
bvecsize = bveccount;
} else {
bvec = bvec_on_stack;
bvecsize = NFS_COMMITBVECSIZ;
}
/*
* Collect up to bvecsize buffers that can be locked without sleeping
* and that are both B_DELWRI and B_NEEDCOMMIT.
*/
for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
if (bvecpos >= bvecsize)
break;
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
nbp = TAILQ_NEXT(bp, b_vnbufs);
continue;
}
if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
(B_DELWRI | B_NEEDCOMMIT)) {
BUF_UNLOCK(bp);
nbp = TAILQ_NEXT(bp, b_vnbufs);
continue;
}
VI_UNLOCK(vp);
bremfree(bp);
/*
* Work out if all buffers are using the same cred
* so we can deal with them all with one commit.
*
* NOTE: we are not clearing B_DONE here, so we have
* to do it later on in this routine if we intend to
* initiate I/O on the bp.
*
* Note: to avoid loopback deadlocks, we do not
* assign b_runningbufspace.
*/
if (wcred == NULL)
wcred = bp->b_wcred;
else if (wcred != bp->b_wcred)
wcred = NOCRED;
bp->b_flags |= B_WRITEINPROG;
vfs_busy_pages(bp, 1);
VI_LOCK(vp);
/*
* bp is protected by being locked, but nbp is not
* and vfs_busy_pages() may sleep. We have to
* recalculate nbp.
*/
nbp = TAILQ_NEXT(bp, b_vnbufs);
/*
* A list of these buffers is kept so that the
* second loop knows which buffers have actually
* been committed. This is necessary, since there
* may be a race between the commit rpc and new
* uncommitted writes on the file.
*/
bvec[bvecpos++] = bp;
/* Widen [off, endoff) so that it covers this buffer's dirty range. */
toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
bp->b_dirtyoff;
if (toff < off)
off = toff;
toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
if (toff > endoff)
endoff = toff;
}
splx(s);
VI_UNLOCK(vp);
}
if (bvecpos > 0) {
/*
* Commit data on the server, as required.
* If all bufs are using the same wcred, then use that with
* one call for all of them, otherwise commit each one
* separately.
*/
if (wcred != NOCRED)
retv = nfs_commit(vp, off, (int)(endoff - off),
wcred, td);
else {
retv = 0;
for (i = 0; i < bvecpos; i++) {
/* These locals shadow the function-level off for the per-buffer commit. */
off_t off, size;
bp = bvec[i];
off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
bp->b_dirtyoff;
size = (u_quad_t)(bp->b_dirtyend
- bp->b_dirtyoff);
retv = nfs_commit(vp, off, (int)size,
bp->b_wcred, td);
if (retv) break;
}
}
if (retv == NFSERR_STALEWRITEVERF)
nfs_clearcommit(vp->v_mount);
/*
* Now, either mark the blocks I/O done or mark the
* blocks dirty, depending on whether the commit
* succeeded.
*/
for (i = 0; i < bvecpos; i++) {
bp = bvec[i];
bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK);
if (retv) {
/*
* Error, leave B_DELWRI intact
*/
vfs_unbusy_pages(bp);
brelse(bp);
} else {
/*
* Success, remove B_DELWRI ( bundirty() ).
*
* b_dirtyoff/b_dirtyend seem to be NFS
* specific. We should probably move that
* into bundirty(). XXX
*/
s = splbio();
VI_LOCK(vp);
vp->v_numoutput++;
VI_UNLOCK(vp);
bp->b_flags |= B_ASYNC;
bundirty(bp);
bp->b_flags &= ~B_DONE;
bp->b_ioflags &= ~BIO_ERROR;
bp->b_dirtyoff = bp->b_dirtyend = 0;
splx(s);
bufdone(bp);
}
}
}
/*
* Start/do any write(s) that are required.
*/
loop:
s = splbio();
VI_LOCK(vp);
for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
nbp = TAILQ_NEXT(bp, b_vnbufs);
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
/* Busy buffer: skip it unless this is a MNT_WAIT flush past pass one. */
if (waitfor != MNT_WAIT || passone)
continue;
/*
* Sleep for the buffer lock.  LK_SLEEPFAIL is requested, and the
* code below treats a zero return as impossible, so "error" is
* always non-zero after this call.
*/
error = BUF_TIMELOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
VI_MTX(vp), "nfsfsync", slpflag, slptimeo);
splx(s);
if (error == 0)
panic("nfs_fsync: inconsistent lock");
/*
* NOTE(review): "error" still holds ENOLCK when control goes back
* to "loop", and nothing on the retry path resets it.  If the
* rescan finds no buffer it has to sleep on, vp->v_numoutput is 0
* and NWRITEERR is clear, that stale ENOLCK is what the function
* returns.  The non-ENOLCK fall-through to "goto loop" further
* down keeps its non-zero value in the same way.
*/
#ifdef NFS_ENOLCK
if (error == ENOLCK) {
printf("ENOLCK pid=%d, name=%s\n",
td->td_proc->p_pid, td->td_proc->p_comm);
goto loop;
}
#else
if (error == ENOLCK)
goto loop;
#endif
if (nfs_sigintr(nmp, NULL, td)) {
error = EINTR;
goto done;
}
/* After the first failed interruptible sleep, use 2 * hz timed sleeps instead. */
if (slpflag == PCATCH) {
slpflag = 0;
slptimeo = 2 * hz;
}
goto loop;
}
if ((bp->b_flags & B_DELWRI) == 0)
panic("nfs_fsync: not dirty");
/* Buffers still awaiting a commit are skipped on pass one or when !commit. */
if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
BUF_UNLOCK(bp);
continue;
}
VI_UNLOCK(vp);
bremfree(bp);
if (passone || !commit)
bp->b_flags |= B_ASYNC;
else
bp->b_flags |= B_ASYNC | B_WRITEINPROG;
splx(s);
BUF_WRITE(bp);
/* The interlock was dropped above, so restart the scan from the list head. */
goto loop;
}
splx(s);
#ifdef NFS_ENOLCK
/* Debug: report a non-zero error left over from the write loop. */
if (0 != error) {
printf("PASS1: error = %d , passone=%d pid=%d, name=%s\n",
error, passone, td->td_proc->p_pid, td->td_proc->p_comm);
}
#endif
if (passone) {
passone = 0;
VI_UNLOCK(vp);
goto again;
}
#ifdef NFS_ENOLCK
if (0 != error) {
printf("waitfor=%d MNT_WAIT=%d vp->v_numoutput=%d pid=%d, name=%s\n",
waitfor, MNT_WAIT, (int)vp->v_numoutput, td->td_proc->p_pid, td->td_proc->p_comm);
}
#endif
if (waitfor == MNT_WAIT) {
/*
* Wait for outstanding writes to drain.  "error" is reassigned only
* if msleep() is actually called, i.e. only while v_numoutput != 0.
*/
while (vp->v_numoutput) {
vp->v_iflag |= VI_BWAIT;
error = msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
slpflag | (PRIBIO + 1), "nfsfsync", slptimeo);
#ifdef NFS_ENOLCK
printf("msleep() return %d pid=%d, name=%s\n",
error, td->td_proc->p_pid, td->td_proc->p_comm);
#endif
if (error) {
VI_UNLOCK(vp);
if (nfs_sigintr(nmp, NULL, td)) {
error = EINTR;
goto done;
}
if (slpflag == PCATCH) {
slpflag = 0;
slptimeo = 2 * hz;
}
VI_LOCK(vp);
}
}
/* Buffers may have been dirtied while we slept; go round again. */
if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) && commit) {
VI_UNLOCK(vp);
goto loop;
}
}
VI_UNLOCK(vp);
/* A write error recorded on the nfsnode overrides error and is cleared once reported. */
if (np->n_flag & NWRITEERR) {
error = np->n_error;
np->n_flag &= ~NWRITEERR;
}
done:
if (bvec != NULL && bvec != bvec_on_stack)
free(bvec, M_TEMP);
#ifdef NFS_ENOLCK
/* Debug: the value about to be returned to the caller. */
if (0 != error) {
printf("PASS2: error = %d pid=%d, name=%s\n",
error, td->td_proc->p_pid, td->td_proc->p_comm);
}
#endif
return (error);
}
--- FreeBSD-5.2.1-RELEASE.log ends here ---
--- FreeBSD-5.4-RELEASE.log begins here ---
TEST2(It is slightly complicated.) FreeBSD XXX 5.4-RELEASE FreeBSD 5.4-RELEASE
---------------------------------------------------------------------------------------
(1) Prepare the kernel which changed nfs_vnops.c as follows.
(2) Install PostgreSQL in FreeBSD.
(3) Install a "pgbench" program in FreeBSD.
A pgbench program is included in PostgreSQL.
(4) Prepare a NetBSD/i386(1.6.1) machine as a NFS server.
(5) Do a NFS mount of a NetBSD machine from a FreeBSD machine.
(mount_nfs -T -3)
(6) Run PostgreSQL in FreeBSD side.
PostgreSQL's data directory is placed on a disk of the NFS server.
(7) Run the script below.
The following messages are output when you keep it running for
several hours (/var/log/messages).
#!/bin/csh
while 1
pgbench -i
end
---------------------------------------------------------------------------------------
/var/log/messages
Sep 28 19:17:38 XXX kernel: ENOLCK
Sep 28 19:17:38 XXX kernel: CHECK1 error = 77
Sep 28 19:17:38 XXX kernel: waitfor=1 MNT_WAIT=1 vp->v_numoutput=0
Sep 28 19:17:38 XXX kernel: CHECK2 error = 77
---------------------------------------------------------------------------------------
This is the source with the debugging statements added (nfs_vnops.c -> nfs_flush()).
#define NFS_ENOLCK
__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vnops.c,v 1.220.2.1 2005/01/31 23:26:46 imp Exp $");
/*
* Flush all the blocks associated with a vnode.
* Walk through the buffer pool and push any dirty pages
* associated with the vnode.
*
* Parameters, as used by the code below:
*   vp      - vnode whose dirty buffer list (vp->v_dirtyblkhd) is walked.
*   cred    - not referenced anywhere in this function body.
*   waitfor - when equal to MNT_WAIT, the write loop sleeps on buffers it
*             cannot lock immediately and the function waits until
*             vp->v_numoutput reaches zero.
*   td      - thread handed to nfs_commit() and nfs_sigintr().
*   commit  - when zero, passone is cleared and the commit pass is skipped.
*
* Returns the final value of the local "error": a non-zero nfs_sigintr()
* result, np->n_error when NWRITEERR is set on the nfsnode, and otherwise
* whatever value "error" last held (0 if it was never assigned).
*
* The blocks guarded by NFS_ENOLCK are debugging printf()s only.
*/
static int
nfs_flush(struct vnode *vp, struct ucred *cred, int waitfor, struct thread *td,
int commit)
{
struct nfsnode *np = VTONFS(vp);
struct buf *bp;
int i;
struct buf *nbp;
struct nfsmount *nmp = VFSTONFS(vp->v_mount);
int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
int passone = 1;
u_quad_t off, endoff, toff;
struct ucred* wcred = NULL;
struct buf **bvec = NULL;
#ifndef NFS_COMMITBVECSIZ
#define NFS_COMMITBVECSIZ 20
#endif
struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
int bvecsize = 0, bveccount;
/* Sleeps are made interruptible only when the mount has NFSMNT_INT set. */
if (nmp->nm_flag & NFSMNT_INT)
slpflag = PCATCH;
if (!commit)
passone = 0;
/*
* A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
* server, but has not been committed to stable storage on the server
* yet. On the first pass, the byte range is worked out and the commit
* rpc is done. On the second pass, nfs_writebp() is called to do the
* job.
*/
again:
/* Reset the commit range; off starts at the maximum so any toff is smaller. */
off = (u_quad_t)-1;
endoff = 0;
bvecpos = 0;
if (NFS_ISV3(vp) && commit) {
s = splbio();
/* Drop a heap vector left over from a previous trip through "again". */
if (bvec != NULL && bvec != bvec_on_stack)
free(bvec, M_TEMP);
/*
* Count up how many buffers waiting for a commit.
*/
bveccount = 0;
VI_LOCK(vp);
for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
nbp = TAILQ_NEXT(bp, b_vnbufs);
if (BUF_REFCNT(bp) == 0 &&
(bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
== (B_DELWRI | B_NEEDCOMMIT))
bveccount++;
}
/*
* Allocate space to remember the list of bufs to commit. It is
* important to use M_NOWAIT here to avoid a race with nfs_write.
* If we can't get memory (for whatever reason), we will end up
* committing the buffers one-by-one in the loop below.
*/
if (bveccount > NFS_COMMITBVECSIZ) {
/*
* Release the vnode interlock to avoid a lock
* order reversal.
*/
VI_UNLOCK(vp);
bvec = (struct buf **)
malloc(bveccount * sizeof(struct buf *),
M_TEMP, M_NOWAIT);
VI_LOCK(vp);
if (bvec == NULL) {
bvec = bvec_on_stack;
bvecsize = NFS_COMMITBVECSIZ;
} else
bvecsize = bveccount;
} else {
bvec = bvec_on_stack;
bvecsize = NFS_COMMITBVECSIZ;
}
/*
* Collect up to bvecsize buffers that can be locked without sleeping
* and that are both B_DELWRI and B_NEEDCOMMIT.
*/
for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
if (bvecpos >= bvecsize)
break;
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
nbp = TAILQ_NEXT(bp, b_vnbufs);
continue;
}
if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
(B_DELWRI | B_NEEDCOMMIT)) {
BUF_UNLOCK(bp);
nbp = TAILQ_NEXT(bp, b_vnbufs);
continue;
}
VI_UNLOCK(vp);
bremfree(bp);
/*
* Work out if all buffers are using the same cred
* so we can deal with them all with one commit.
*
* NOTE: we are not clearing B_DONE here, so we have
* to do it later on in this routine if we intend to
* initiate I/O on the bp.
*
* Note: to avoid loopback deadlocks, we do not
* assign b_runningbufspace.
*/
if (wcred == NULL)
wcred = bp->b_wcred;
else if (wcred != bp->b_wcred)
wcred = NOCRED;
bp->b_flags |= B_WRITEINPROG;
vfs_busy_pages(bp, 1);
VI_LOCK(vp);
/*
* bp is protected by being locked, but nbp is not
* and vfs_busy_pages() may sleep. We have to
* recalculate nbp.
*/
nbp = TAILQ_NEXT(bp, b_vnbufs);
/*
* A list of these buffers is kept so that the
* second loop knows which buffers have actually
* been committed. This is necessary, since there
* may be a race between the commit rpc and new
* uncommitted writes on the file.
*/
bvec[bvecpos++] = bp;
/* Widen [off, endoff) so that it covers this buffer's dirty range. */
toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
bp->b_dirtyoff;
if (toff < off)
off = toff;
toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
if (toff > endoff)
endoff = toff;
}
splx(s);
VI_UNLOCK(vp);
}
if (bvecpos > 0) {
/*
* Commit data on the server, as required.
* If all bufs are using the same wcred, then use that with
* one call for all of them, otherwise commit each one
* separately.
*/
if (wcred != NOCRED)
retv = nfs_commit(vp, off, (int)(endoff - off),
wcred, td);
else {
retv = 0;
for (i = 0; i < bvecpos; i++) {
/* These locals shadow the function-level off for the per-buffer commit. */
off_t off, size;
bp = bvec[i];
off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
bp->b_dirtyoff;
size = (u_quad_t)(bp->b_dirtyend
- bp->b_dirtyoff);
retv = nfs_commit(vp, off, (int)size,
bp->b_wcred, td);
if (retv) break;
}
}
if (retv == NFSERR_STALEWRITEVERF)
nfs_clearcommit(vp->v_mount);
/*
* Now, either mark the blocks I/O done or mark the
* blocks dirty, depending on whether the commit
* succeeded.
*/
for (i = 0; i < bvecpos; i++) {
bp = bvec[i];
bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK);
if (retv) {
/*
* Error, leave B_DELWRI intact
*/
vfs_unbusy_pages(bp);
brelse(bp);
} else {
/*
* Success, remove B_DELWRI ( bundirty() ).
*
* b_dirtyoff/b_dirtyend seem to be NFS
* specific. We should probably move that
* into bundirty(). XXX
*/
s = splbio();
VI_LOCK(vp);
vp->v_numoutput++;
VI_UNLOCK(vp);
bp->b_flags |= B_ASYNC;
bundirty(bp);
bp->b_flags &= ~B_DONE;
bp->b_ioflags &= ~BIO_ERROR;
bp->b_dirtyoff = bp->b_dirtyend = 0;
splx(s);
bufdone(bp);
}
}
}
/*
* Start/do any write(s) that are required.
*/
loop:
s = splbio();
VI_LOCK(vp);
for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
nbp = TAILQ_NEXT(bp, b_vnbufs);
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
/* Busy buffer: skip it unless this is a MNT_WAIT flush past pass one. */
if (waitfor != MNT_WAIT || passone)
continue;
/*
* Sleep for the buffer lock.  LK_SLEEPFAIL is requested, and the
* code below treats a zero return as impossible, so "error" is
* always non-zero after this call.
*/
error = BUF_TIMELOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
VI_MTX(vp), "nfsfsync", slpflag, slptimeo);
splx(s);
if (error == 0)
panic("nfs_fsync: inconsistent lock");
/*
* NOTE(review): "error" still holds ENOLCK when control goes back
* to "loop", and nothing on the retry path resets it.  If the
* rescan finds no buffer it has to sleep on, vp->v_numoutput is 0
* and NWRITEERR is clear, that stale ENOLCK is what the function
* returns.  The non-ENOLCK path below does not have this problem,
* because it overwrites "error" with the nfs_sigintr() result.
*/
#ifdef NFS_ENOLCK
if (error == ENOLCK) {
printf("ENOLCK\n");
goto loop;
}
#else
if (error == ENOLCK)
goto loop;
#endif
error = nfs_sigintr(nmp, NULL, td);
if (error)
goto done;
/* After the first failed interruptible sleep, use 2 * hz timed sleeps instead. */
if (slpflag == PCATCH) {
slpflag = 0;
slptimeo = 2 * hz;
}
goto loop;
}
if ((bp->b_flags & B_DELWRI) == 0)
panic("nfs_fsync: not dirty");
/* Buffers still awaiting a commit are skipped on pass one or when !commit. */
if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
BUF_UNLOCK(bp);
continue;
}
VI_UNLOCK(vp);
bremfree(bp);
if (passone || !commit)
bp->b_flags |= B_ASYNC;
else
bp->b_flags |= B_ASYNC | B_WRITEINPROG;
splx(s);
bwrite(bp);
/* The interlock was dropped above, so restart the scan from the list head. */
goto loop;
}
splx(s);
#ifdef NFS_ENOLCK
/* Debug: report a non-zero error left over from the write loop. */
if (0 != error) {
printf("CHECK1 error = %d\n", error);
}
#endif
if (passone) {
passone = 0;
VI_UNLOCK(vp);
goto again;
}
#ifdef NFS_ENOLCK
if (0 != error) {
printf("waitfor=%d MNT_WAIT=%d vp->v_numoutput=%d\n",
waitfor, MNT_WAIT, (int)vp->v_numoutput);
}
#endif
if (waitfor == MNT_WAIT) {
/*
* Wait for outstanding writes to drain.  "error" is reassigned only
* if msleep() is actually called, i.e. only while v_numoutput != 0.
*/
while (vp->v_numoutput) {
vp->v_iflag |= VI_BWAIT;
error = msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
slpflag | (PRIBIO + 1), "nfsfsync", slptimeo);
#ifdef NFS_ENOLCK
printf("msleep() retun error = %d\n", error);
#endif
if (error) {
VI_UNLOCK(vp);
/* A failed sleep is fatal only if nfs_sigintr() returns non-zero. */
error = nfs_sigintr(nmp, NULL, td);
if (error)
goto done;
if (slpflag == PCATCH) {
slpflag = 0;
slptimeo = 2 * hz;
}
VI_LOCK(vp);
}
}
/* Buffers may have been dirtied while we slept; go round again. */
if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) && commit) {
VI_UNLOCK(vp);
goto loop;
}
}
VI_UNLOCK(vp);
/* A write error recorded on the nfsnode overrides error and is cleared once reported. */
if (np->n_flag & NWRITEERR) {
/* NOTE(review): unlike the other debug output, this printf is not inside #ifdef NFS_ENOLCK. */
printf("error=%d np->n_error= %d\n", error, np->n_error);
error = np->n_error;
np->n_flag &= ~NWRITEERR;
}
done:
if (bvec != NULL && bvec != bvec_on_stack)
free(bvec, M_TEMP);
#ifdef NFS_ENOLCK
/* Debug: the value about to be returned to the caller. */
if (0 != error) {
printf("CHECK2 error = %d\n", error);
}
#endif
return (error);
}
--- FreeBSD-5.4-RELEASE.log ends here ---
>Release-Note:
>Audit-Trail:
>Unformatted:
More information about the freebsd-bugs
mailing list