kern/86944: [PATCH] When I use FreeBSD with NFS client, close(2) or a fsync(2) system call gives back ENOLCK.

Kouji Ito kouji at cty-net.ne.jp
Wed Oct 5 06:20:18 PDT 2005


>Number:         86944
>Category:       kern
>Synopsis:       [PATCH] When I use FreeBSD with  NFS client, close(2) or a fsync(2) system call gives back ENOLCK.
>Confidential:   no
>Severity:       serious
>Priority:       medium
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Wed Oct 05 13:20:17 GMT 2005
>Closed-Date:
>Last-Modified:
>Originator:     Kouji Ito <kouji at cty-net.ne.jp>
>Release:        5.1-RELEASE, 5.2.1-RELEASE, 5.4-RELEASE
>Organization:
<organization of PR author (multiple lines)>
>Environment:
NFS client : FreeBSD(i386) 5.1-RELEASE ,5.2.1-RELEASE, 5.4-RELEASE
NFS server : NetBSD/i386 1.6.1
>Description:
When FreeBSD is used as an NFS client, the close(2) or fsync(2) system call returns ENOLCK.
However, the data itself appears to be read and written correctly.
This may have a fatal impact when PostgreSQL is run on an NFS client machine.

I think it is caused by the "error" variable not being cleared properly.

>How-To-Repeat:
Please refer to FreeBSD-5.2.1-RELEASE.log and FreeBSD-5.4-RELEASE.log.
I did not keep a log for FreeBSD 5.1-RELEASE, but the result there is
the same as in FreeBSD-5.2.1-RELEASE.log.
>Fix:

--- nfs_vnops.1.220.2.1.patch begins here ---
*** nfs_vnops.c Wed Oct  5 13:49:49 2005
--- nfs_vnops.c.new     Wed Oct  5 13:48:46 2005
***************
*** 2823,2830 ****
                        splx(s);
                        if (error == 0)
                                panic("nfs_fsync: inconsistent lock");
!                       if (error == ENOLCK)
                                goto loop;
                        error = nfs_sigintr(nmp, NULL, td);
                        if (error)
                                goto done;
--- 2823,2832 ----
                        splx(s);
                        if (error == 0)
                                panic("nfs_fsync: inconsistent lock");
!                       if (error == ENOLCK) {
!                               error = 0;      /* clear error */
                                goto loop;
+                         }
                        error = nfs_sigintr(nmp, NULL, td);
                        if (error)
                                goto done;
--- nfs_vnops.1.220.2.1.patch ends here ---

--- FreeBSD-5.2.1-RELEASE.log begins here ---
TEST1 FreeBSD XXX 5.2.1-RELEASE FreeBSD 5.2.1-RELEASE
------------------------------------------------------------------------------------------------
(1) Prepare a kernel with nfs_vnops.c changed as shown below (optional; only needed for the debug messages).

(2) Prepare a NetBSD/i386(1.6.1) machine as a NFS server.

(3) Do a NFS mount of a NetBSD machine from a FreeBSD machine.
    (mount_nfs -T -3)

(4) Run a "write_data" program on the file system that did a NFS mount.
    (please make "data" directory under  current directory.)
    # ./write_data
    file name = ./data/30034

(5) Run a "open_read_close" program on the file system that did a NFS mount.
    close(2) system call returns  -1 (an error), and the error number gives back ENOLCK.
    The data which a program read are right.
    The following messages are output if you change nfs_vnops.c.

    # ./open_read_close ./data/30034
    ncount = 0
    close() return -1 errno=77
    00000-0000h :00 00 00 00

(6) Run a "write_data" program on the file system that did a NFS mount.
    (please make "data" directory under  current directory.)
    # ./write_data
    file name = ./data/30980

(7) Run a "open_write_close" program on the file system that did a NFS mount.
    Wait for several minutes.
    close(2) system call returns  -1 (an error), and the error number gives back ENOLCK.
    The data which a program write are right.
    The following messages are output if you change nfs_vnops.c.
    # ./open_write_close ./data/30980
    ncount = 0
        :
    ncount = 3997
    ncount = 3998
    close() return -1 errno=77
    00000-0000h :9e 9e 9e 9e
    dellgx50# hexdump  ./data/30980
    0000000 9e9e 9e9e 0000 0000 0000 0000 0000 0000
    0000010 0000 0000 0000 0000 0000 0000 0000 0000
    *
    0014000

------------------------------------------------------------------------------------------------
/var/log/messages
Oct  5 13:15:57 YYY kernel: ENOLCK pid=29491, name=open_read_close
Oct  5 13:15:57 YYY kernel: PASS1: error = 77 , passone=0 pid=29491, name=open_read_close
Oct  5 13:15:57 YYY kernel: waitfor=1 MNT_WAIT=1 vp->v_numoutput=0 pid=29491, name=open_read_close
Oct  5 13:15:57 YYY kernel: PASS2: error = 77 pid=29491, name=open_read_close

Oct  5 14:01:11 YYY kernel: ENOLCK pid=30649, name=open_write_close
Oct  5 14:01:11 YYY kernel: PASS1: error = 77 , passone=0 pid=30649, name=open_write_close
Oct  5 14:01:11 YYY kernel: waitfor=1 MNT_WAIT=1 vp->v_numoutput=0 pid=30649, name=open_write_close
Oct  5 14:01:11 YYY kernel: PASS2: error = 77 pid=30649, name=open_write_close
------------------------------------------------------------------------------------------------
YYY# cat write_data.c
#include <stdio.h>
#include <fcntl.h>
#include <errno.h>
#include <syslog.h>
#define IO_SIZE (0x2000)
#define PARA 1
/*
 * write_data: create ./data/<PID> (IO_SIZE * 10 bytes of 0x00) and then
 * rewrite and fsync(2) the sixth IO_SIZE-sized block in an endless loop.
 *
 * Every failure is reported through syslog (LOG_LOCAL3|LOG_INFO).  The
 * rewrite loop only ends when lseek(2), write(2) or fsync(2) fails, after
 * which the file is closed and the program exits with status 0.
 * A failure to create the file exits with status -1.
 */
int main(int argc, char *argv[])
{
   int nrc;
   int nfd;
   int nname;
   char aname[64];
   /*
    * n counts the initial fill writes.  m is only ever logged; it is
    * initialised so the log calls do not read an indeterminate value.
    */
   int n = 0, m = 0;
   static char abuff[IO_SIZE];   /* static storage: all bytes are 0x00 */
   unsigned int nnn = 0;

   nname = getpid();
   syslog(LOG_LOCAL3|LOG_INFO, "nnn = %u PID = %d\n", nnn, nname);

   /*
    * create file.
    * filename = ./data/PID
    * file size = IO_SIZE * 10
    * data = all 0x00
    */
   snprintf(aname, sizeof(aname), "./data/%05d", nname);
   nfd = open(aname, O_CREAT|O_RDWR|O_TRUNC, 0666);
   if (-1 == nfd) {
      syslog(LOG_LOCAL3|LOG_INFO, "open(%s) error = %d\n", aname, errno);
      exit(-1);
   }
   printf("file name = %s\n", aname);
   for (n = 0; n < 10; ++n) {
      nrc = write(nfd, abuff, IO_SIZE);
      if (IO_SIZE != nrc) {
         /* Do not continue with a file that is shorter than expected. */
         syslog(LOG_LOCAL3|LOG_INFO, "write(%s) n=%d m=%d error = %d\n",
                aname, n, m, errno);
         goto GOGO;
      }
   }

   /*
    * write data 0x00 from IO_SIZE * 5 to IO_SIZE * 5 + IO_SIZE,
    * forcing it out with fsync(2) each time.
    */
   while (1) {
      nrc = lseek(nfd, IO_SIZE * 5, SEEK_SET);
      if (-1 == nrc) {
         syslog(LOG_LOCAL3|LOG_INFO, "lseek(%s) n=%d m=%d error = %d\n",
                aname, n, m, errno);
         goto GOGO;
      }
      nrc = write(nfd, abuff, IO_SIZE);
      if (IO_SIZE != nrc) {
         syslog(LOG_LOCAL3|LOG_INFO, "write(%s) n=%d m=%d error = %d\n",
                aname, n, m, errno);
         goto GOGO;
      }
      nrc = fsync(nfd);
      if (-1 == nrc) {
         syslog(LOG_LOCAL3|LOG_INFO, "fsync(%s) n=%d m=%d error = %d\n",
                aname, n, m, errno);
         goto GOGO;
      }
   }
  GOGO:
   nrc = close(nfd);
   if (-1 == nrc) {
      syslog(LOG_LOCAL3|LOG_INFO, "close(%s) n=%d m=%d error = %d\n",
             aname, n, m, errno);
   }
   exit(0);
}
------------------------------------------------------------------------------------------------
YYY# cat open_read_close.c
#include <stdio.h>
#include <fcntl.h>
#include <syslog.h>
#include <errno.h>
#include <sys/types.h>
extern int errno;
/*
 * printmem: hex-dump print_size bytes starting at p to stdout.
 *
 * Each row holds up to 16 bytes and starts with "<decimal offset>-<hex
 * offset>h :".  Bytes are printed as two hex digits followed by a space,
 * except the eighth byte of a row, which is followed by '-'.  When a new
 * row starts, the printable-ASCII rendering of the previous row ('.' for
 * bytes outside ' '..'~') is appended to the previous row first.  The
 * ASCII rendering of the final row is not printed; the dump ends with a
 * newline.
 *
 * Returns the number of bytes dumped (0 when print_size <= 0).
 * The row buffer is static, so the function is not reentrant.
 */
#ifdef __STDC__
int printmem(void *p,int print_size)
#else
int printmem(p,print_size)
void *p;
int print_size;
#endif
{
   /* ASCII rendering of the current row; buff[16] is never written and
    * stays '\0', so the buffer is always a terminated string. */
   static unsigned char buff[17];
   const unsigned char *up = p;
   int num;

   for (num = 0; num < print_size; ++num) {
      int col = num % 16;
      unsigned char byte = up[num];

      if (col == 0) {
         if (num == 0) {
            fprintf(stdout, "%05d-%04xh :", num, (unsigned int)num);
         } else {
            /* finish the previous row with its ASCII column */
            fprintf(stdout, "%s\n%05d-%04xh :", (char *)buff, num,
                    (unsigned int)num);
         }
      }
      if (col == 7) {
         fprintf(stdout, "%02x-", (unsigned int)byte);
      } else {
         fprintf(stdout, "%02x ", (unsigned int)byte);
      }
      buff[col] = (byte < ' ' || byte > '~') ? '.' : byte;
   }
   fprintf(stdout, "\n");
   return num;
}
/*
 * open_read_close: repeatedly open(2) the file named by argv[1], read
 * its first 4 bytes and close(2) it again, printing the iteration count
 * before each round.
 *
 * The loop only ends on failure:
 *   exit(-1)  open failed
 *   exit(-2)  read did not return 4 bytes
 *   exit(-3)  close failed (the bytes just read are dumped first)
 *   exit(-4)  no file name was given
 */
int main(int argc, char *argv[])
{
   int nfd;
   char abuff[4];
   int nrc;
   int ncount = 0;

   /* argv[1] is NULL without an argument; do not hand that to open(2). */
   if (argc < 2) {
      fprintf(stderr, "usage: open_read_close file\n");
      exit(-4);
   }

   while (1) {
      printf("ncount = %d\n", ncount);
      nfd = open(argv[1], O_RDWR);
      if (-1 == nfd) {
         printf("open error\n");
         exit(-1);
      }
      /* Pre-fill so stale data is distinguishable from bytes read. */
      memset(abuff, 0xff, sizeof(abuff));
      nrc = read(nfd, abuff, sizeof(abuff));
      if (nrc != (int)sizeof(abuff)) {
         printf("read() return %d errno=%d\n", nrc, errno);
         exit(-2);
      }

      nrc = close(nfd);
      if (-1 == nrc) {
         printf("close() return %d errno=%d\n", nrc, errno);
         printmem(abuff, sizeof(abuff));
         exit(-3);
      }
      ++ncount;
   }
}
------------------------------------------------------------------------------------------------
YYY# cat open_write_close.c
#include <stdio.h>
#include <fcntl.h>
#include <syslog.h>
#include <errno.h>
#include <sys/types.h>
extern int errno;
/*
 * printmem: hex-dump print_size bytes starting at p to stdout.
 *
 * Each row holds up to 16 bytes and starts with "<decimal offset>-<hex
 * offset>h :".  Bytes are printed as two hex digits followed by a space,
 * except the eighth byte of a row, which is followed by '-'.  When a new
 * row starts, the printable-ASCII rendering of the previous row ('.' for
 * bytes outside ' '..'~') is appended to the previous row first.  The
 * ASCII rendering of the final row is not printed; the dump ends with a
 * newline.
 *
 * Returns the number of bytes dumped (0 when print_size <= 0).
 * The row buffer is static, so the function is not reentrant.
 */
#ifdef __STDC__
int printmem(void *p,int print_size)
#else
int printmem(p,print_size)
void *p;
int print_size;
#endif
{
   /* ASCII rendering of the current row; buff[16] is never written and
    * stays '\0', so the buffer is always a terminated string. */
   static unsigned char buff[17];
   const unsigned char *up = p;
   int num;

   for (num = 0; num < print_size; ++num) {
      int col = num % 16;
      unsigned char byte = up[num];

      if (col == 0) {
         if (num == 0) {
            fprintf(stdout, "%05d-%04xh :", num, (unsigned int)num);
         } else {
            /* finish the previous row with its ASCII column */
            fprintf(stdout, "%s\n%05d-%04xh :", (char *)buff, num,
                    (unsigned int)num);
         }
      }
      if (col == 7) {
         fprintf(stdout, "%02x-", (unsigned int)byte);
      } else {
         fprintf(stdout, "%02x ", (unsigned int)byte);
      }
      buff[col] = (byte < ' ' || byte > '~') ? '.' : byte;
   }
   fprintf(stdout, "\n");
   return num;
}
/*
 * open_write_close: repeatedly open(2) the file named by argv[1], write
 * 4 bytes (each set to the low byte of the iteration count) at offset 0
 * and close(2) it again, printing the iteration count before each round.
 *
 * The loop only ends on failure:
 *   exit(-1)  open failed
 *   exit(-2)  write did not accept 4 bytes
 *   exit(-3)  close failed (the bytes just written are dumped first)
 *   exit(-4)  no file name was given
 */
int main(int argc, char *argv[])
{
   int nfd;
   char abuff[4];
   int nrc;
   int ncount = 0;

   /* argv[1] is NULL without an argument; do not hand that to open(2). */
   if (argc < 2) {
      fprintf(stderr, "usage: open_write_close file\n");
      exit(-4);
   }

   while (1) {
      printf("ncount = %d\n", ncount);
      nfd = open(argv[1], O_RDWR);
      if (-1 == nfd) {
         printf("open error\n");
         exit(-1);
      }
      memset(abuff, ncount, sizeof(abuff));
      nrc = write(nfd, abuff, sizeof(abuff));
      if (nrc != (int)sizeof(abuff)) {
         /* The failing call here is write(2), not read(2). */
         printf("write() return %d errno=%d\n", nrc, errno);
         exit(-2);
      }

      nrc = close(nfd);
      if (-1 == nrc) {
         printf("close() return %d errno=%d\n", nrc, errno);
         printmem(abuff, sizeof(abuff));
         exit(-3);
      }
      ++ncount;
   }
}
------------------------------------------------------------------------------------------------
This is the source with debugging statements added (nfs_vnops.c -> nfs_flush()).
#define NFS_ENOLCK
__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vnops.c,v 1.215 2003/11/14 20:54:09 alfred Exp $");

/*
 * Flush all the blocks associated with a vnode.
 * 	Walk through the buffer pool and push any dirty pages
 *	associated with the vnode.
 *
 * vp      - vnode whose dirty buffer list (v_dirtyblkhd) is flushed.
 * cred    - not referenced in this function body.
 * waitfor - when equal to MNT_WAIT, sleep for buffers that are locked by
 *           someone else and wait until vp->v_numoutput drains to zero.
 * td      - calling thread; handed to nfs_commit() and nfs_sigintr().
 * commit  - when non-zero (and the mount is NFSv3) a first pass collects
 *           the B_DELWRI|B_NEEDCOMMIT buffers and issues commit rpc(s)
 *           before the write pass.
 *
 * Returns 0 on success, EINTR when nfs_sigintr() reports an interrupt
 * while waiting, or np->n_error when NWRITEERR is set on the nfsnode
 * (the flag is cleared in that case).
 *
 * A failed BUF_TIMELOCK()/msleep() that is answered by retrying must not
 * leave its status in `error': if nothing reassigns `error' afterwards,
 * that stale value (ENOLCK, errno 77, in the reported case) is what the
 * caller gets back.
 *
 * The NFS_ENOLCK sections are debugging output only.
 */
static int
nfs_flush(struct vnode *vp, struct ucred *cred, int waitfor, struct thread *td,
    int commit)
{
	struct nfsnode *np = VTONFS(vp);
	struct buf *bp;
	int i;
	struct buf *nbp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
	int passone = 1;
	u_quad_t off, endoff, toff;
	struct ucred* wcred = NULL;
	struct buf **bvec = NULL;
#ifndef NFS_COMMITBVECSIZ
#define NFS_COMMITBVECSIZ	20
#endif
	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
	int bvecsize = 0, bveccount;

	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	if (!commit)
		passone = 0;
	/*
	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
	 * server, but has not been committed to stable storage on the server
	 * yet. On the first pass, the byte range is worked out and the commit
	 * rpc is done. On the second pass, nfs_writebp() is called to do the
	 * job.
	 */
again:
	off = (u_quad_t)-1;
	endoff = 0;
	bvecpos = 0;
	if (NFS_ISV3(vp) && commit) {
		s = splbio();
		if (bvec != NULL && bvec != bvec_on_stack)
			free(bvec, M_TEMP);
		/*
		 * Count up how many buffers waiting for a commit.
		 */
		bveccount = 0;
		VI_LOCK(vp);
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_REFCNT(bp) == 0 &&
			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
				== (B_DELWRI | B_NEEDCOMMIT))
				bveccount++;
		}
		/*
		 * Allocate space to remember the list of bufs to commit.  It is
		 * important to use M_NOWAIT here to avoid a race with nfs_write.
		 * If we can't get memory (for whatever reason), we will end up
		 * committing the buffers one-by-one in the loop below.
		 */
		if (bveccount > NFS_COMMITBVECSIZ) {
			/*
			 * Release the vnode interlock to avoid a lock
			 * order reversal.
			 */
			VI_UNLOCK(vp);
			bvec = (struct buf **)
				malloc(bveccount * sizeof(struct buf *),
				       M_TEMP, M_NOWAIT);
			VI_LOCK(vp);
			if (bvec == NULL) {
				bvec = bvec_on_stack;
				bvecsize = NFS_COMMITBVECSIZ;
			} else
				bvecsize = bveccount;
		} else {
			bvec = bvec_on_stack;
			bvecsize = NFS_COMMITBVECSIZ;
		}
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			if (bvecpos >= bvecsize)
				break;
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
				nbp = TAILQ_NEXT(bp, b_vnbufs);
				continue;
			}
			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
			    (B_DELWRI | B_NEEDCOMMIT)) {
				BUF_UNLOCK(bp);
				nbp = TAILQ_NEXT(bp, b_vnbufs);
				continue;
			}
			VI_UNLOCK(vp);
			bremfree(bp);
			/*
			 * Work out if all buffers are using the same cred
			 * so we can deal with them all with one commit.
			 *
			 * NOTE: we are not clearing B_DONE here, so we have
			 * to do it later on in this routine if we intend to
			 * initiate I/O on the bp.
			 *
			 * Note: to avoid loopback deadlocks, we do not
			 * assign b_runningbufspace.
			 */
			if (wcred == NULL)
				wcred = bp->b_wcred;
			else if (wcred != bp->b_wcred)
				wcred = NOCRED;
			bp->b_flags |= B_WRITEINPROG;
			vfs_busy_pages(bp, 1);

			VI_LOCK(vp);
			/*
			 * bp is protected by being locked, but nbp is not
			 * and vfs_busy_pages() may sleep.  We have to
			 * recalculate nbp.
			 */
			nbp = TAILQ_NEXT(bp, b_vnbufs);

			/*
			 * A list of these buffers is kept so that the
			 * second loop knows which buffers have actually
			 * been committed. This is necessary, since there
			 * may be a race between the commit rpc and new
			 * uncommitted writes on the file.
			 */
			bvec[bvecpos++] = bp;
			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
				bp->b_dirtyoff;
			if (toff < off)
				off = toff;
			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
			if (toff > endoff)
				endoff = toff;
		}
		splx(s);
		VI_UNLOCK(vp);
	}
	if (bvecpos > 0) {
		/*
		 * Commit data on the server, as required.
		 * If all bufs are using the same wcred, then use that with
		 * one call for all of them, otherwise commit each one
		 * separately.
		 */
		if (wcred != NOCRED)
			retv = nfs_commit(vp, off, (int)(endoff - off),
					  wcred, td);
		else {
			retv = 0;
			for (i = 0; i < bvecpos; i++) {
				off_t off, size;
				bp = bvec[i];
				off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
					bp->b_dirtyoff;
				size = (u_quad_t)(bp->b_dirtyend
						  - bp->b_dirtyoff);
				retv = nfs_commit(vp, off, (int)size,
						  bp->b_wcred, td);
				if (retv) break;
			}
		}

		if (retv == NFSERR_STALEWRITEVERF)
			nfs_clearcommit(vp->v_mount);

		/*
		 * Now, either mark the blocks I/O done or mark the
		 * blocks dirty, depending on whether the commit
		 * succeeded.
		 */
		for (i = 0; i < bvecpos; i++) {
			bp = bvec[i];
			bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK);
			if (retv) {
				/*
				 * Error, leave B_DELWRI intact
				 */
				vfs_unbusy_pages(bp);
				brelse(bp);
			} else {
				/*
				 * Success, remove B_DELWRI ( bundirty() ).
				 *
				 * b_dirtyoff/b_dirtyend seem to be NFS
				 * specific.  We should probably move that
				 * into bundirty(). XXX
				 */
				s = splbio();
				VI_LOCK(vp);
				vp->v_numoutput++;
				VI_UNLOCK(vp);
				bp->b_flags |= B_ASYNC;
				bundirty(bp);
				bp->b_flags &= ~B_DONE;
				bp->b_ioflags &= ~BIO_ERROR;
				bp->b_dirtyoff = bp->b_dirtyend = 0;
				splx(s);
				bufdone(bp);
			}
		}
	}

	/*
	 * Start/do any write(s) that are required.
	 */
loop:
	s = splbio();
	VI_LOCK(vp);
	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = TAILQ_NEXT(bp, b_vnbufs);
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
			if (waitfor != MNT_WAIT || passone)
				continue;

			error = BUF_TIMELOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    VI_MTX(vp), "nfsfsync", slpflag, slptimeo);
			splx(s);
			if (error == 0)
				panic("nfs_fsync: inconsistent lock");
			if (error == ENOLCK) {
#ifdef NFS_ENOLCK
				printf("ENOLCK pid=%d, name=%s\n",
				    td->td_proc->p_pid, td->td_proc->p_comm);
#endif
				/*
				 * We are only going to rescan the list, so
				 * this is not a failure of the flush.  Clear
				 * it, or a rescan that never reassigns
				 * `error' returns ENOLCK to the caller.
				 */
				error = 0;
				goto loop;
			}
			if (nfs_sigintr(nmp, NULL, td)) {
				error = EINTR;
				goto done;
			}
			/*
			 * Not interrupted: the lock attempt is simply
			 * retried, so its status must not survive either.
			 */
			error = 0;
			if (slpflag == PCATCH) {
				slpflag = 0;
				slptimeo = 2 * hz;
			}
			goto loop;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("nfs_fsync: not dirty");
		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
			BUF_UNLOCK(bp);
			continue;
		}
		VI_UNLOCK(vp);
		bremfree(bp);
		if (passone || !commit)
		    bp->b_flags |= B_ASYNC;
		else
		    bp->b_flags |= B_ASYNC | B_WRITEINPROG;
		splx(s);
		BUF_WRITE(bp);
		goto loop;
	}
	splx(s);
#ifdef NFS_ENOLCK
	if (0 != error) {
		printf("PASS1: error = %d , passone=%d pid=%d, name=%s\n",
		    error, passone, td->td_proc->p_pid, td->td_proc->p_comm);
	}
#endif
	if (passone) {
		passone = 0;
		VI_UNLOCK(vp);
		goto again;
	}
#ifdef NFS_ENOLCK
	if (0 != error) {
		printf("waitfor=%d MNT_WAIT=%d vp->v_numoutput=%d pid=%d, name=%s\n",
		    waitfor, MNT_WAIT, (int)vp->v_numoutput, td->td_proc->p_pid, td->td_proc->p_comm);
	}
#endif
	if (waitfor == MNT_WAIT) {
		while (vp->v_numoutput) {
			vp->v_iflag |= VI_BWAIT;
			error = msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
				slpflag | (PRIBIO + 1), "nfsfsync", slptimeo);
#ifdef NFS_ENOLCK
			printf("msleep() return %d pid=%d, name=%s\n",
			    error, td->td_proc->p_pid, td->td_proc->p_comm);
#endif
			if (error) {
			    VI_UNLOCK(vp);
			    if (nfs_sigintr(nmp, NULL, td)) {
				error = EINTR;
				goto done;
			    }
			    /*
			     * Not interrupted: keep waiting.  Drop the sleep
			     * status so it cannot be returned if the output
			     * count has reached zero in the meantime.
			     */
			    error = 0;
			    if (slpflag == PCATCH) {
				slpflag = 0;
				slptimeo = 2 * hz;
			    }
			    VI_LOCK(vp);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) && commit) {
			VI_UNLOCK(vp);
			goto loop;
		}
	}
	VI_UNLOCK(vp);
	if (np->n_flag & NWRITEERR) {
		error = np->n_error;
		np->n_flag &= ~NWRITEERR;
	}
done:
	if (bvec != NULL && bvec != bvec_on_stack)
		free(bvec, M_TEMP);
#ifdef NFS_ENOLCK
	if (0 != error) {
		printf("PASS2: error = %d pid=%d, name=%s\n",
		    error, td->td_proc->p_pid, td->td_proc->p_comm);
	}
#endif
	return (error);
}
--- FreeBSD-5.2.1-RELEASE.log ends here ---

--- FreeBSD-5.4-RELEASE.log begins here ---
TEST2(It is slightly complicated.) FreeBSD XXX 5.4-RELEASE FreeBSD 5.4-RELEASE
---------------------------------------------------------------------------------------
(1) Prepare the kernel which changed nfs_vnops.c as follows.

(2) Install PostgreSQL in FreeBSD.

(3) Install a "pgbench" program in FreeBSD.
    A pgbench program is included in PostgreSQL.

(4) Prepare a NetBSD/i386(1.6.1) machine as a NFS server.

(5) Do a NFS mount of a NetBSD machine from a FreeBSD machine.
    (mount_nfs -T -3)

(6) Run PostgreSQL in FreeBSD side.
    Place the PostgreSQL data directory on a disk exported by the NFS server.

(7) Run the script below.
    The following messages are output (to /var/log/messages) after the
    script has been running for several hours.
    #!/bin/csh
    while 1
       pgbench -i
    end
---------------------------------------------------------------------------------------
/var/log/messages
Sep 28 19:17:38 XXX kernel: ENOLCK
Sep 28 19:17:38 XXX kernel: CHECK1 error = 77
Sep 28 19:17:38 XXX kernel: waitfor=1 MNT_WAIT=1 vp->v_numoutput=0
Sep 28 19:17:38 XXX kernel: CHECK2 error = 77
---------------------------------------------------------------------------------------
This is the source with debugging statements added (nfs_vnops.c -> nfs_flush()).
#define NFS_ENOLCK
__FBSDID("$FreeBSD: src/sys/nfsclient/nfs_vnops.c,v 1.220.2.1 2005/01/31 23:26:46 imp Exp $");
/*
 * Flush all the blocks associated with a vnode.
 * 	Walk through the buffer pool and push any dirty pages
 *	associated with the vnode.
 *
 * vp      - vnode whose dirty buffer list (v_dirtyblkhd) is flushed.
 * cred    - not referenced in this function body.
 * waitfor - when equal to MNT_WAIT, sleep for buffers that are locked by
 *           someone else and wait until vp->v_numoutput drains to zero.
 * td      - calling thread; handed to nfs_commit() and nfs_sigintr().
 * commit  - when non-zero (and the mount is NFSv3) a first pass collects
 *           the B_DELWRI|B_NEEDCOMMIT buffers and issues commit rpc(s)
 *           before the write pass.
 *
 * Returns 0 on success, the non-zero result of nfs_sigintr() when a wait
 * is abandoned, or np->n_error when NWRITEERR is set on the nfsnode (the
 * flag is cleared in that case).
 *
 * A BUF_TIMELOCK() failure with ENOLCK is answered by rescanning the
 * dirty list and must not leave ENOLCK in `error': if nothing reassigns
 * `error' afterwards, that stale value (errno 77) is what the caller
 * gets back.
 *
 * The NFS_ENOLCK sections are debugging output only.
 */
static int
nfs_flush(struct vnode *vp, struct ucred *cred, int waitfor, struct thread *td,
    int commit)
{
	struct nfsnode *np = VTONFS(vp);
	struct buf *bp;
	int i;
	struct buf *nbp;
	struct nfsmount *nmp = VFSTONFS(vp->v_mount);
	int s, error = 0, slptimeo = 0, slpflag = 0, retv, bvecpos;
	int passone = 1;
	u_quad_t off, endoff, toff;
	struct ucred* wcred = NULL;
	struct buf **bvec = NULL;
#ifndef NFS_COMMITBVECSIZ
#define NFS_COMMITBVECSIZ	20
#endif
	struct buf *bvec_on_stack[NFS_COMMITBVECSIZ];
	int bvecsize = 0, bveccount;

	if (nmp->nm_flag & NFSMNT_INT)
		slpflag = PCATCH;
	if (!commit)
		passone = 0;
	/*
	 * A b_flags == (B_DELWRI | B_NEEDCOMMIT) block has been written to the
	 * server, but has not been committed to stable storage on the server
	 * yet. On the first pass, the byte range is worked out and the commit
	 * rpc is done. On the second pass, nfs_writebp() is called to do the
	 * job.
	 */
again:
	off = (u_quad_t)-1;
	endoff = 0;
	bvecpos = 0;
	if (NFS_ISV3(vp) && commit) {
		s = splbio();
		if (bvec != NULL && bvec != bvec_on_stack)
			free(bvec, M_TEMP);
		/*
		 * Count up how many buffers waiting for a commit.
		 */
		bveccount = 0;
		VI_LOCK(vp);
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			nbp = TAILQ_NEXT(bp, b_vnbufs);
			if (BUF_REFCNT(bp) == 0 &&
			    (bp->b_flags & (B_DELWRI | B_NEEDCOMMIT))
				== (B_DELWRI | B_NEEDCOMMIT))
				bveccount++;
		}
		/*
		 * Allocate space to remember the list of bufs to commit.  It is
		 * important to use M_NOWAIT here to avoid a race with nfs_write.
		 * If we can't get memory (for whatever reason), we will end up
		 * committing the buffers one-by-one in the loop below.
		 */
		if (bveccount > NFS_COMMITBVECSIZ) {
			/*
			 * Release the vnode interlock to avoid a lock
			 * order reversal.
			 */
			VI_UNLOCK(vp);
			bvec = (struct buf **)
				malloc(bveccount * sizeof(struct buf *),
				       M_TEMP, M_NOWAIT);
			VI_LOCK(vp);
			if (bvec == NULL) {
				bvec = bvec_on_stack;
				bvecsize = NFS_COMMITBVECSIZ;
			} else
				bvecsize = bveccount;
		} else {
			bvec = bvec_on_stack;
			bvecsize = NFS_COMMITBVECSIZ;
		}
		for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
			if (bvecpos >= bvecsize)
				break;
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
				nbp = TAILQ_NEXT(bp, b_vnbufs);
				continue;
			}
			if ((bp->b_flags & (B_DELWRI | B_NEEDCOMMIT)) !=
			    (B_DELWRI | B_NEEDCOMMIT)) {
				BUF_UNLOCK(bp);
				nbp = TAILQ_NEXT(bp, b_vnbufs);
				continue;
			}
			VI_UNLOCK(vp);
			bremfree(bp);
			/*
			 * Work out if all buffers are using the same cred
			 * so we can deal with them all with one commit.
			 *
			 * NOTE: we are not clearing B_DONE here, so we have
			 * to do it later on in this routine if we intend to
			 * initiate I/O on the bp.
			 *
			 * Note: to avoid loopback deadlocks, we do not
			 * assign b_runningbufspace.
			 */
			if (wcred == NULL)
				wcred = bp->b_wcred;
			else if (wcred != bp->b_wcred)
				wcred = NOCRED;
			bp->b_flags |= B_WRITEINPROG;
			vfs_busy_pages(bp, 1);

			VI_LOCK(vp);
			/*
			 * bp is protected by being locked, but nbp is not
			 * and vfs_busy_pages() may sleep.  We have to
			 * recalculate nbp.
			 */
			nbp = TAILQ_NEXT(bp, b_vnbufs);

			/*
			 * A list of these buffers is kept so that the
			 * second loop knows which buffers have actually
			 * been committed. This is necessary, since there
			 * may be a race between the commit rpc and new
			 * uncommitted writes on the file.
			 */
			bvec[bvecpos++] = bp;
			toff = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
				bp->b_dirtyoff;
			if (toff < off)
				off = toff;
			toff += (u_quad_t)(bp->b_dirtyend - bp->b_dirtyoff);
			if (toff > endoff)
				endoff = toff;
		}
		splx(s);
		VI_UNLOCK(vp);
	}
	if (bvecpos > 0) {
		/*
		 * Commit data on the server, as required.
		 * If all bufs are using the same wcred, then use that with
		 * one call for all of them, otherwise commit each one
		 * separately.
		 */
		if (wcred != NOCRED)
			retv = nfs_commit(vp, off, (int)(endoff - off),
					  wcred, td);
		else {
			retv = 0;
			for (i = 0; i < bvecpos; i++) {
				off_t off, size;
				bp = bvec[i];
				off = ((u_quad_t)bp->b_blkno) * DEV_BSIZE +
					bp->b_dirtyoff;
				size = (u_quad_t)(bp->b_dirtyend
						  - bp->b_dirtyoff);
				retv = nfs_commit(vp, off, (int)size,
						  bp->b_wcred, td);
				if (retv) break;
			}
		}

		if (retv == NFSERR_STALEWRITEVERF)
			nfs_clearcommit(vp->v_mount);

		/*
		 * Now, either mark the blocks I/O done or mark the
		 * blocks dirty, depending on whether the commit
		 * succeeded.
		 */
		for (i = 0; i < bvecpos; i++) {
			bp = bvec[i];
			bp->b_flags &= ~(B_NEEDCOMMIT | B_WRITEINPROG | B_CLUSTEROK);
			if (retv) {
				/*
				 * Error, leave B_DELWRI intact
				 */
				vfs_unbusy_pages(bp);
				brelse(bp);
			} else {
				/*
				 * Success, remove B_DELWRI ( bundirty() ).
				 *
				 * b_dirtyoff/b_dirtyend seem to be NFS
				 * specific.  We should probably move that
				 * into bundirty(). XXX
				 */
				s = splbio();
				VI_LOCK(vp);
				vp->v_numoutput++;
				VI_UNLOCK(vp);
				bp->b_flags |= B_ASYNC;
				bundirty(bp);
				bp->b_flags &= ~B_DONE;
				bp->b_ioflags &= ~BIO_ERROR;
				bp->b_dirtyoff = bp->b_dirtyend = 0;
				splx(s);
				bufdone(bp);
			}
		}
	}

	/*
	 * Start/do any write(s) that are required.
	 */
loop:
	s = splbio();
	VI_LOCK(vp);
	for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) {
		nbp = TAILQ_NEXT(bp, b_vnbufs);
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL)) {
			if (waitfor != MNT_WAIT || passone)
				continue;

			error = BUF_TIMELOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    VI_MTX(vp), "nfsfsync", slpflag, slptimeo);
			splx(s);
			if (error == 0)
				panic("nfs_fsync: inconsistent lock");
			if (error == ENOLCK) {
#ifdef NFS_ENOLCK
				printf("ENOLCK\n");
#endif
				/*
				 * We are only going to rescan the list, so
				 * this is not a failure of the flush.  Clear
				 * it, or a rescan that never reassigns
				 * `error' returns ENOLCK to the caller.
				 */
				error = 0;
				goto loop;
			}
			/* Any other failure: give up only if interrupted. */
			error = nfs_sigintr(nmp, NULL, td);
			if (error)
				goto done;
			if (slpflag == PCATCH) {
				slpflag = 0;
				slptimeo = 2 * hz;
			}
			goto loop;
		}
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("nfs_fsync: not dirty");
		if ((passone || !commit) && (bp->b_flags & B_NEEDCOMMIT)) {
			BUF_UNLOCK(bp);
			continue;
		}
		VI_UNLOCK(vp);
		bremfree(bp);
		if (passone || !commit)
		    bp->b_flags |= B_ASYNC;
		else
		    bp->b_flags |= B_ASYNC | B_WRITEINPROG;
		splx(s);
		bwrite(bp);
		goto loop;
	}
	splx(s);
#ifdef NFS_ENOLCK
	if (0 != error) {
		printf("CHECK1 error = %d\n", error);
	}
#endif
	if (passone) {
		passone = 0;
		VI_UNLOCK(vp);
		goto again;
	}
#ifdef NFS_ENOLCK
	if (0 != error) {
		printf("waitfor=%d MNT_WAIT=%d vp->v_numoutput=%d\n",
		    waitfor, MNT_WAIT, (int)vp->v_numoutput);
	}
#endif
	if (waitfor == MNT_WAIT) {
		while (vp->v_numoutput) {
			vp->v_iflag |= VI_BWAIT;
			error = msleep((caddr_t)&vp->v_numoutput, VI_MTX(vp),
				slpflag | (PRIBIO + 1), "nfsfsync", slptimeo);
#ifdef NFS_ENOLCK
			printf("msleep() retun error = %d\n", error);
#endif
			if (error) {
			    VI_UNLOCK(vp);
			    /*
			     * Overwrites the sleep status: zero here means
			     * "not interrupted, keep waiting".
			     */
			    error = nfs_sigintr(nmp, NULL, td);
			    if (error)
				goto done;
			    if (slpflag == PCATCH) {
				slpflag = 0;
				slptimeo = 2 * hz;
			    }
			    VI_LOCK(vp);
			}
		}
		if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) && commit) {
			VI_UNLOCK(vp);
			goto loop;
		}
	}
	VI_UNLOCK(vp);
	if (np->n_flag & NWRITEERR) {
#ifdef NFS_ENOLCK
		/* Debug output belongs under the same switch as the rest. */
		printf("error=%d np->n_error= %d\n", error, np->n_error);
#endif
		error = np->n_error;
		np->n_flag &= ~NWRITEERR;
	}
done:
	if (bvec != NULL && bvec != bvec_on_stack)
		free(bvec, M_TEMP);
#ifdef NFS_ENOLCK
	if (0 != error) {
		printf("CHECK2 error = %d\n", error);
	}
#endif
	return (error);
}
--- FreeBSD-5.4-RELEASE.log ends here ---


>Release-Note:
>Audit-Trail:
>Unformatted:


More information about the freebsd-bugs mailing list