svn commit: r204132 - in user/kib/vm6/sys: conf dev/md kern sys ufs/ffs ufs/ufs vm

Konstantin Belousov kib at FreeBSD.org
Sat Feb 20 16:34:42 UTC 2010


Author: kib
Date: Sat Feb 20 16:34:42 2010
New Revision: 204132
URL: http://svn.freebsd.org/changeset/base/204132

Log:
  Implementation of range locking for I/O and VM I/O.
  
  First vm_readwrite.c implementation by:	jeff
  In collaboration with:	pho

Added:
  user/kib/vm6/sys/kern/kern_rangelock.c   (contents, props changed)
  user/kib/vm6/sys/sys/rangelock.h   (contents, props changed)
  user/kib/vm6/sys/vm/vm_readwrite.c   (contents, props changed)
Modified:
  user/kib/vm6/sys/conf/files
  user/kib/vm6/sys/dev/md/md.c
  user/kib/vm6/sys/kern/vfs_cluster.c
  user/kib/vm6/sys/kern/vfs_default.c
  user/kib/vm6/sys/kern/vfs_subr.c
  user/kib/vm6/sys/kern/vfs_vnops.c
  user/kib/vm6/sys/kern/vnode_if.src
  user/kib/vm6/sys/sys/buf.h
  user/kib/vm6/sys/sys/file.h
  user/kib/vm6/sys/sys/proc.h
  user/kib/vm6/sys/sys/vnode.h
  user/kib/vm6/sys/ufs/ffs/ffs_balloc.c
  user/kib/vm6/sys/ufs/ffs/ffs_softdep.c
  user/kib/vm6/sys/ufs/ffs/ffs_vnops.c
  user/kib/vm6/sys/ufs/ufs/ufs_vnops.c
  user/kib/vm6/sys/vm/vm_extern.h
  user/kib/vm6/sys/vm/vm_fault.c
  user/kib/vm6/sys/vm/vm_page.c
  user/kib/vm6/sys/vm/vm_page.h
  user/kib/vm6/sys/vm/vm_pageout.c
  user/kib/vm6/sys/vm/vm_pageout.h
  user/kib/vm6/sys/vm/vm_phys.c
  user/kib/vm6/sys/vm/vnode_pager.c

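The heart of the change is the new per-vnode byte-range lock. As a usage
sketch (assuming the API exactly as declared in sys/sys/rangelock.h and
used by vn_read() below; vnode setup and error handling elided), a reader
serializes itself against overlapping writers while still running
concurrently with other readers:

	void *cookie;
	int error;

	/* Sleep until no overlapping writer holds the range or precedes us. */
	cookie = rangelock_rlock(vp, offset, len);
	error = vn_read_chunk(vp, uio, active_cred, file_cred, ioflag);
	rangelock_unlock(vp, cookie);
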
Modified: user/kib/vm6/sys/conf/files
==============================================================================
--- user/kib/vm6/sys/conf/files	Sat Feb 20 16:32:33 2010	(r204131)
+++ user/kib/vm6/sys/conf/files	Sat Feb 20 16:34:42 2010	(r204132)
@@ -2077,6 +2077,7 @@ kern/kern_poll.c		optional device_pollin
 kern/kern_priv.c		standard
 kern/kern_proc.c		standard
 kern/kern_prot.c		standard
+kern/kern_rangelock.c		standard
 kern/kern_resource.c		standard
 kern/kern_rmlock.c		standard
 kern/kern_rwlock.c		standard
@@ -2768,6 +2769,7 @@ vm/vm_page.c			standard
 vm/vm_pageout.c			standard
 vm/vm_pager.c			standard
 vm/vm_phys.c			standard
+vm/vm_readwrite.c		standard
 vm/vm_reserv.c			standard
 vm/vm_unix.c			standard
 vm/vm_zeroidle.c		standard

Modified: user/kib/vm6/sys/dev/md/md.c
==============================================================================
--- user/kib/vm6/sys/dev/md/md.c	Sat Feb 20 16:32:33 2010	(r204131)
+++ user/kib/vm6/sys/dev/md/md.c	Sat Feb 20 16:34:42 2010	(r204132)
@@ -85,6 +85,7 @@
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 #include <vm/uma.h>
@@ -587,7 +588,7 @@ mdstart_swap(struct md_s *sc, struct bio
 {
 	struct sf_buf *sf;
 	int rv, offs, len, lastend;
-	vm_pindex_t i, lastp;
+	vm_pindex_t i, firstp, lastp;
 	vm_page_t m;
 	u_char *p;
 
@@ -610,18 +611,26 @@ mdstart_swap(struct md_s *sc, struct bio
 	 * we're operating on complete aligned pages).
 	 */
 	offs = bp->bio_offset % PAGE_SIZE;
+	firstp = bp->bio_offset / PAGE_SIZE;
 	lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
 	lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
 
+	vm_page_t ma[lastp - firstp + 1];
+
 	rv = VM_PAGER_OK;
 	VM_OBJECT_LOCK(sc->object);
 	vm_object_pip_add(sc->object, 1);
-	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
+	for (i = firstp; i <= lastp; i++) {
 		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
 
-		m = vm_page_grab(sc->object, i,
-		    VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
+		/*
+		 * A write cleans the buffer's pages, so give it
+		 * priority.
+		 */
+		m = vm_page_grab(sc->object, i, (bp->bio_cmd == BIO_WRITE ?
+		    VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_RETRY);
 		VM_OBJECT_UNLOCK(sc->object);
+		ma[i - firstp] = m;
 		sched_pin();
 		sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
 		VM_OBJECT_LOCK(sc->object);
@@ -683,6 +692,12 @@ printf("wire_count %d busy %d flags %x h
 	}
 	vm_object_pip_subtract(sc->object, 1);
 	vm_object_set_writeable_dirty(sc->object);
+	if (rv != VM_PAGER_ERROR && bp->bio_cmd == BIO_WRITE &&
+	    vm_page_count_severe()) {
+		vm_page_lock_queues();
+		vm_pageout_flush(ma, lastp - firstp + 1, IO_SYNC);
+		vm_page_unlock_queues();
+	}
 	VM_OBJECT_UNLOCK(sc->object);
 	return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
 }

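A worked example may help with the page-index arithmetic in mdstart_swap()
above (assuming PAGE_SIZE == 4096):

	/*
	 * A bio with bio_offset = 6144 and bio_length = 10240 gives
	 *	offs    = 6144 % 4096                   = 2048
	 *	firstp  = 6144 / 4096                   = 1
	 *	lastp   = (6144 + 10240 - 1) / 4096     = 3
	 *	lastend = (6144 + 10240 - 1) % 4096 + 1 = 4096
	 * so ma[] spans lastp - firstp + 1 = 3 pages; the first page
	 * is used from byte 2048 onward and the last page in full.
	 * For BIO_WRITE under severe page shortage, those same pages
	 * are then laundered synchronously via vm_pageout_flush().
	 */
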
Added: user/kib/vm6/sys/kern/kern_rangelock.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/kib/vm6/sys/kern/kern_rangelock.c	Sat Feb 20 16:34:42 2010	(r204132)
@@ -0,0 +1,186 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov <kib at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/rangelock.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+uma_zone_t rl_entry_zone;
+
+static void
+rangelock_sys_init(void)
+{
+
+	rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, rangelock_sys_init, NULL);
+
+void
+rangelock_init(struct rangelock *lock)
+{
+
+	TAILQ_INIT(&lock->rl_waiters);
+	lock->rl_currdep = NULL;
+}
+
+void
+rangelock_destroy(struct rangelock *lock)
+{
+
+	KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+}
+
+static int
+rangelock_incompatible(const struct rl_q_entry *e1,
+    const struct rl_q_entry *e2)
+{
+
+	if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ &&
+	    (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ)
+		return (0);
+#define	IN_RANGE(a, e) (a >= e->rl_q_start && a < e->rl_q_end)
+	if (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) ||
+	    IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1))
+		return (1);
+#undef	IN_RANGE
+	return (0);
+}
+
+static void
+rangelock_calc_block(struct rangelock *lock)
+{
+	struct rl_q_entry *entry, *entry1, *whead;
+
+	if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) &&
+	    lock->rl_currdep != NULL)
+		lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link);
+	for (entry = lock->rl_currdep; entry;
+	     entry = TAILQ_NEXT(entry, rl_q_link)) {
+		TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) {
+			if (rangelock_incompatible(entry, entry1))
+				goto out;
+			if (entry1 == entry)
+				break;
+		}
+	}
+out:
+	lock->rl_currdep = entry;
+	TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) {
+		if (whead == lock->rl_currdep)
+			break;
+		if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) {
+			whead->rl_q_flags |= RL_LOCK_GRANTED;
+			wakeup(whead);
+		}
+	}
+}
+
+static void
+rangelock_unlock_vp_locked(struct vnode *vp, struct rl_q_entry *entry)
+{
+
+	ASSERT_VI_LOCKED(vp, "rangelock");
+	KASSERT(entry != vp->v_rl.rl_currdep, ("stuck currdep"));
+	TAILQ_REMOVE(&vp->v_rl.rl_waiters, entry, rl_q_link);
+	rangelock_calc_block(&vp->v_rl);
+	VI_UNLOCK(vp);
+	uma_zfree(rl_entry_zone, entry);
+}
+
+void
+rangelock_unlock(struct vnode *vp, void *cookie)
+{
+	struct rl_q_entry *entry;
+
+	entry = cookie;
+	VI_LOCK(vp);
+	rangelock_unlock_vp_locked(vp, entry);
+}
+
+void *
+rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base, size_t len)
+{
+	struct rl_q_entry *entry;
+
+	entry = cookie;
+	VI_LOCK(vp);
+	KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("XXX"));
+	KASSERT(entry->rl_q_start == base, ("XXX"));
+	KASSERT(entry->rl_q_end >= base + len, ("XXX"));
+	if (entry->rl_q_end == base + len) {
+		rangelock_unlock_vp_locked(vp, cookie);
+		return (NULL);
+	}
+	entry->rl_q_end = base + len;
+	rangelock_calc_block(&vp->v_rl);
+	VI_UNLOCK(vp);
+	return (cookie);
+}
+
+static void *
+rangelock_enqueue(struct vnode *vp, struct rl_q_entry *entry)
+{
+
+	VI_LOCK(vp);
+	TAILQ_INSERT_TAIL(&vp->v_rl.rl_waiters, entry, rl_q_link);
+	if (vp->v_rl.rl_currdep == NULL)
+		vp->v_rl.rl_currdep = entry;
+	rangelock_calc_block(&vp->v_rl);
+	while (!(entry->rl_q_flags & RL_LOCK_GRANTED))
+		msleep(entry, &vp->v_interlock, 0, "range", 0);
+	VI_UNLOCK(vp);
+	return (entry);
+}
+
+void *
+rangelock_rlock(struct vnode *vp, off_t base, size_t len)
+{
+	struct rl_q_entry *entry;
+
+	entry = uma_zalloc(rl_entry_zone, M_WAITOK);
+	entry->rl_q_flags = RL_LOCK_READ;
+	entry->rl_q_start = base;
+	entry->rl_q_end = base + len;
+	return (rangelock_enqueue(vp, entry));
+}
+
+void *
+rangelock_wlock(struct vnode *vp, off_t base, size_t len)
+{
+	struct rl_q_entry *entry;
+
+	entry = uma_zalloc(rl_entry_zone, M_WAITOK);
+	entry->rl_q_flags = RL_LOCK_WRITE;
+	entry->rl_q_start = base;
+	entry->rl_q_end = base + len;
+	return (rangelock_enqueue(vp, entry));
+}

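To make the granting rules concrete, here is an illustration that is not
part of the commit. rangelock_incompatible() treats only read/read
overlaps as compatible, and rangelock_calc_block() grants waiters
strictly in queue order, so a blocked writer also holds up requests
queued behind it:

	void *r1, *r2, *w;

	r1 = rangelock_rlock(vp, 0, 8192);	/* [0, 8K): granted */
	r2 = rangelock_rlock(vp, 4096, 8192);	/* [4K, 12K): read vs.
						   read, granted */
	/*
	 * A rangelock_wlock(vp, 0, 2048) issued here would sleep in
	 * rangelock_enqueue() until r1 is dropped, and any request
	 * queued after that writer would wait behind it in turn.
	 */
	rangelock_unlock(vp, r1);
	rangelock_unlock(vp, r2);
	w = rangelock_wlock(vp, 0, 2048);	/* now granted */
	rangelock_unlock(vp, w);
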
Modified: user/kib/vm6/sys/kern/vfs_cluster.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_cluster.c	Sat Feb 20 16:32:33 2010	(r204131)
+++ user/kib/vm6/sys/kern/vfs_cluster.c	Sat Feb 20 16:34:42 2010	(r204132)
@@ -71,8 +71,8 @@ static int write_behind = 1;
 SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
     "Cluster write-behind; 0: disable, 1: enable, 2: backed off");
 
-static int read_max = 8;
-SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
+int vfs_read_max = 8;
+SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &vfs_read_max, 0,
     "Cluster read-ahead max block count");
 
 /* Page expended to mark partially backed buffers */
@@ -109,7 +109,7 @@ cluster_read(vp, filesize, lblkno, size,
 	 */
 	racluster = vp->v_mount->mnt_iosize_max / size;
 	maxra = seqcount;
-	maxra = min(read_max, maxra);
+	maxra = min(vfs_read_max, maxra);
 	maxra = min(nbuf/8, maxra);
 	if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
 		maxra = (filesize / size) - lblkno;
@@ -803,7 +803,9 @@ cluster_wbuild(vp, size, start_lbn, len)
 		  (tbp->b_bcount != tbp->b_bufsize) ||
 		  (tbp->b_bcount != size) ||
 		  (len == 1) ||
-		  ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
+		  ((bp = (vp->v_vflag & VV_MD) ?
+		    trypbuf(&cluster_pbuf_freecnt) :
+		    getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
 			totalwritten += tbp->b_bufsize;
 			bawrite(tbp);
 			++start_lbn;

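A hedged note on the cluster_wbuild() change above: getpbuf() sleeps
until a pbuf is available, while trypbuf() returns NULL instead of
sleeping. For a vnode backed by md(4) (VV_MD), sleeping here is
presumably deadlock-prone because md I/O can itself sit on the path
that recycles pbufs, so the cluster write degrades to a plain
bawrite() of the single buffer:

	bp = (vp->v_vflag & VV_MD) ? trypbuf(&cluster_pbuf_freecnt) :
	    getpbuf(&cluster_pbuf_freecnt);
	if (bp == NULL) {
		/* No pbuf without sleeping; write the buffer as is. */
		bawrite(tbp);
	}
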
Modified: user/kib/vm6/sys/kern/vfs_default.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_default.c	Sat Feb 20 16:32:33 2010	(r204131)
+++ user/kib/vm6/sys/kern/vfs_default.c	Sat Feb 20 16:34:42 2010	(r204132)
@@ -77,6 +77,8 @@ static int	dirent_exists(struct vnode *v
 
 #define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
 
+static int vop_stdextend(struct vop_extend_args *ap);
+
 /*
  * This vnode table stores what we want to do if the filesystem doesn't
  * implement a particular VOP.
@@ -118,6 +120,7 @@ struct vop_vector default_vnodeops = {
 	.vop_unlock =		vop_stdunlock,
 	.vop_vptocnp =		vop_stdvptocnp,
 	.vop_vptofh =		vop_stdvptofh,
+	.vop_extend =		vop_stdextend,
 };
 
 /*
@@ -825,6 +828,23 @@ out:
 	return (error);
 }
 
+static int
+vop_stdextend(struct vop_extend_args *ap)
+{
+	struct vattr vattr, oattr;
+	int error;
+
+
+	error = VOP_GETATTR(ap->a_vp, &oattr, ap->a_cred);
+	if (error != 0)
+		return (error);
+	if (oattr.va_size >= ap->a_size)
+		return (0);
+	VATTR_NULL(&vattr);
+	vattr.va_size = ap->a_size;
+	return (VOP_SETATTR(ap->a_vp, &vattr, ap->a_cred));
+}
+
 /*
  * vfs default ops
  * used to fill the vfs function table to get reasonable default return values.

Modified: user/kib/vm6/sys/kern/vfs_subr.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_subr.c	Sat Feb 20 16:32:33 2010	(r204131)
+++ user/kib/vm6/sys/kern/vfs_subr.c	Sat Feb 20 16:34:42 2010	(r204132)
@@ -861,6 +861,7 @@ vdestroy(struct vnode *vp)
 	/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
 	vp->v_op = NULL;
 #endif
+	rangelock_destroy(&vp->v_rl);
 	lockdestroy(vp->v_vnlock);
 	mtx_destroy(&vp->v_interlock);
 	mtx_destroy(BO_MTX(bo));
@@ -1015,6 +1016,7 @@ alloc:
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
 	}
+	rangelock_init(&vp->v_rl);
 
 	*vpp = vp;
 	return (0);

Modified: user/kib/vm6/sys/kern/vfs_vnops.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_vnops.c	Sat Feb 20 16:32:33 2010	(r204131)
+++ user/kib/vm6/sys/kern/vfs_vnops.c	Sat Feb 20 16:34:42 2010	(r204132)
@@ -37,12 +37,14 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
 #include <sys/kdb.h>
 #include <sys/stat.h>
+#include <sys/sysctl.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
@@ -62,6 +64,13 @@ __FBSDID("$FreeBSD$");
 
 #include <security/mac/mac_framework.h>
 
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+static int vmio_enabled = 1;
+SYSCTL_INT(_vfs, OID_AUTO, vmio_enabled, CTLFLAG_RW, &vmio_enabled, 0,
+    "Use vm pages copyin/out instead of vops for read/write");
+
 static fo_rdwr_t	vn_read;
 static fo_rdwr_t	vn_write;
 static fo_truncate_t	vn_truncate;
@@ -83,6 +92,9 @@ struct 	fileops vnops = {
 	.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
 };
 
+static int vn_write_chunk(struct vnode *, struct uio *, struct ucred *,
+    struct ucred *, int);
+
 int
 vn_open(ndp, flagp, cmode, fp)
 	struct nameidata *ndp;
@@ -275,17 +287,14 @@ vn_writechk(vp)
  * Vnode close call
  */
 int
-vn_close(vp, flags, file_cred, td)
-	register struct vnode *vp;
-	int flags;
-	struct ucred *file_cred;
-	struct thread *td;
+vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
+    struct thread *td)
 {
-	struct mount *mp;
+	struct mount *mp, *mp1;
 	int error, lock_flags;
 
-	if (!(flags & FWRITE) && vp->v_mount != NULL &&
-	    vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
+	if (!(flags & FWRITE) && (mp1 = vp->v_mount) != NULL &&
+	    MNT_SHARED_WRITES(mp1))
 		lock_flags = LK_SHARED;
 	else
 		lock_flags = LK_EXCLUSIVE;
@@ -333,7 +342,7 @@ sequential_heuristic(struct uio *uio, st
 		 * closely related to the best I/O size for real disks than
 		 * to any block size used by software.
 		 */
-		fp->f_seqcount += howmany(uio->uio_resid, 16384);
+		fp->f_seqcount += howmany(uio->uio_resid, FRA_BLOCK_SZ);
 		if (fp->f_seqcount > IO_SEQMAX)
 			fp->f_seqcount = IO_SEQMAX;
 		return (fp->f_seqcount << IO_SEQSHIFT);
@@ -351,76 +360,71 @@ sequential_heuristic(struct uio *uio, st
  * Package up an I/O request on a vnode into a uio and do it.
  */
 int
-vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
-    aresid, td)
-	enum uio_rw rw;
-	struct vnode *vp;
-	void *base;
-	int len;
-	off_t offset;
-	enum uio_seg segflg;
-	int ioflg;
-	struct ucred *active_cred;
-	struct ucred *file_cred;
-	int *aresid;
-	struct thread *td;
+vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
+    enum uio_seg segflg, int ioflg, struct ucred *active_cred,
+    struct ucred *file_cred, int *aresid, struct thread *td)
 {
 	struct uio auio;
 	struct iovec aiov;
 	struct mount *mp;
 	struct ucred *cred;
+	void *rl_cookie;
 	int error, lock_flags;
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	aiov.iov_base = base;
+	aiov.iov_len = len;
+	auio.uio_resid = len;
+	auio.uio_offset = offset;
+	auio.uio_segflg = segflg;
+	auio.uio_rw = rw;
+	auio.uio_td = td;
+	error = 0;
+
 	if ((ioflg & IO_NODELOCKED) == 0) {
+		if (rw == UIO_READ)
+			rl_cookie = rangelock_rlock(vp, offset, len);
+		else
+			rl_cookie = rangelock_wlock(vp, offset, len);
 		mp = NULL;
 		if (rw == UIO_WRITE) { 
 			if (vp->v_type != VCHR &&
 			    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
 			    != 0)
-				return (error);
+				goto out;
 			if (MNT_SHARED_WRITES(mp) ||
-			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
+			    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
 				lock_flags = LK_SHARED;
-			} else {
+			else
 				lock_flags = LK_EXCLUSIVE;
-			}
-			vn_lock(vp, lock_flags | LK_RETRY);
 		} else
-			vn_lock(vp, LK_SHARED | LK_RETRY);
+			lock_flags = LK_SHARED;
+		vn_lock(vp, lock_flags | LK_RETRY);
+	} else
+		rl_cookie = NULL;
 
-	}
 	ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
-	auio.uio_iov = &aiov;
-	auio.uio_iovcnt = 1;
-	aiov.iov_base = base;
-	aiov.iov_len = len;
-	auio.uio_resid = len;
-	auio.uio_offset = offset;
-	auio.uio_segflg = segflg;
-	auio.uio_rw = rw;
-	auio.uio_td = td;
-	error = 0;
 #ifdef MAC
 	if ((ioflg & IO_NOMACCHECK) == 0) {
-		if (rw == UIO_READ)
-			error = mac_vnode_check_read(active_cred, file_cred,
-			    vp);
-		else
+		if (rw == UIO_WRITE)
 			error = mac_vnode_check_write(active_cred, file_cred,
 			    vp);
 	}
 #endif
 	if (error == 0) {
-		if (file_cred)
+		if (file_cred != NULL)
 			cred = file_cred;
 		else
 			cred = active_cred;
 		if (rw == UIO_READ)
-			error = VOP_READ(vp, &auio, ioflg, cred);
+			error = vn_read_chunk(vp, &auio, active_cred, cred,
+			    ioflg | IO_NODELOCKED);
 		else
-			error = VOP_WRITE(vp, &auio, ioflg, cred);
+			error = vn_write_chunk(vp, &auio, active_cred, cred,
+			    ioflg | IO_NODELOCKED);
 	}
 	if (aresid)
 		*aresid = auio.uio_resid;
@@ -428,10 +432,13 @@ vn_rdwr(rw, vp, base, len, offset, segfl
 		if (auio.uio_resid && error == 0)
 			error = EIO;
 	if ((ioflg & IO_NODELOCKED) == 0) {
-		if (rw == UIO_WRITE && vp->v_type != VCHR)
-			vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0);
+		if (mp != NULL)
+			vn_finished_write(mp);
 	}
+ out:
+	if (rl_cookie != NULL)
+		rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 
@@ -493,68 +500,148 @@ vn_rdwr_inchunks(rw, vp, base, len, offs
 	return (error);
 }
 
+static struct mtx *
+vn_lock_foffset(struct file *fp)
+{
+	struct mtx *mtxp;
+
+	mtxp = mtx_pool_find(mtxpool_sleep, fp);
+	mtx_lock(mtxp);
+	while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+		fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+		msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+		    "vnread offlock", 0);
+	}
+	fp->f_vnread_flags |= FOFFSET_LOCKED;
+	mtx_unlock(mtxp);
+	return (mtxp);
+}
+
+static void
+vn_unlock_foffset(struct file *fp, struct mtx *mtxp)
+{
+
+	mtx_lock(mtxp);
+	if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+		wakeup(&fp->f_vnread_flags);
+	fp->f_vnread_flags = 0;
+	mtx_unlock(mtxp);
+}
+
+int
+vn_read_chunk(struct vnode *vp, struct uio *uio, struct ucred *active_cred,
+    struct ucred *fcred, int ioflag)
+{
+	int error, vfslocked;
+
+	error = 0;
+	vfslocked = 0; /* gcc */
+
+	if ((ioflag & IO_NODELOCKED) == 0) {
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		vn_lock(vp, LK_SHARED | LK_RETRY);
+	}
+
+#ifdef MAC
+	if ((ioflag & IO_NOMACCHECK) == 0)
+		error = mac_vnode_check_read(active_cred, fcred, vp);
+#endif
+	if (error == 0) {
+		if (!vmio_enabled ||
+		    (error = vnode_pager_read(vp, uio, ioflag)) == EOPNOTSUPP)
+			error = VOP_READ(vp, uio, ioflag, fcred);
+	}
+	if ((ioflag & IO_NODELOCKED) == 0) {
+		VOP_UNLOCK(vp, 0);
+		VFS_UNLOCK_GIANT(vfslocked);
+	}
+	return (error);
+}
+
 /*
  * File table vnode read routine.
  */
 static int
-vn_read(fp, uio, active_cred, flags, td)
-	struct file *fp;
-	struct uio *uio;
-	struct ucred *active_cred;
-	struct thread *td;
-	int flags;
+vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+    struct thread *td)
 {
 	struct vnode *vp;
-	int error, ioflag;
 	struct mtx *mtxp;
-	int vfslocked;
+	void *rl_cookie;
+	int ioflag;
+	int error;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
-	mtxp = NULL;
-	vp = fp->f_vnode;
 	ioflag = 0;
 	if (fp->f_flag & FNONBLOCK)
 		ioflag |= IO_NDELAY;
 	if (fp->f_flag & O_DIRECT)
 		ioflag |= IO_DIRECT;
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	vp = fp->f_vnode;
+
 	/*
 	 * According to McKusick the vn lock was protecting f_offset here.
 	 * It is now protected by the FOFFSET_LOCKED flag.
 	 */
 	if ((flags & FOF_OFFSET) == 0) {
-		mtxp = mtx_pool_find(mtxpool_sleep, fp);
-		mtx_lock(mtxp);
-		while(fp->f_vnread_flags & FOFFSET_LOCKED) {
-			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
-			msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
-			    "vnread offlock", 0);
-		}
-		fp->f_vnread_flags |= FOFFSET_LOCKED;
-		mtx_unlock(mtxp);
-		vn_lock(vp, LK_SHARED | LK_RETRY);
+		mtxp = vn_lock_foffset(fp);
 		uio->uio_offset = fp->f_offset;
 	} else
-		vn_lock(vp, LK_SHARED | LK_RETRY);
-
+		mtxp = NULL; /* gcc */
+	if (vp->v_type == VREG)
+		rl_cookie = rangelock_rlock(vp, uio->uio_offset,
+		    uio->uio_resid);
+	else
+		rl_cookie = NULL;
 	ioflag |= sequential_heuristic(uio, fp);
+	error = vn_read_chunk(vp, uio, active_cred, fp->f_cred, ioflag);
+	fp->f_nextoff = uio->uio_offset;
+	if (rl_cookie != NULL)
+		rangelock_unlock(vp, rl_cookie);
+	if ((flags & FOF_OFFSET) == 0) {
+		fp->f_offset = uio->uio_offset;
+		vn_unlock_foffset(fp, mtxp);
+	}
+	return (error);
+}
+
+static int
+vn_write_chunk(struct vnode *vp, struct uio *uio, struct ucred *active_cred,
+    struct ucred *fcred, int ioflag)
+{
+	struct mount *mp, *mp1;
+	int error, lock_flags, vfslocked;
+
+	mp = NULL;
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	if (vp->v_type == VREG)
+		bwillwrite();
+	if (vp->v_type != VCHR &&
+	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+		goto unlock;
 
+	if (MNT_SHARED_WRITES(mp) ||
+	    (mp == NULL && (mp1 = vp->v_mount) != NULL &&
+	     MNT_SHARED_WRITES(mp1)))
+		lock_flags = LK_SHARED;
+	else
+		lock_flags = LK_EXCLUSIVE;
+	vn_lock(vp, lock_flags | LK_RETRY);
 #ifdef MAC
-	error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
-	if (error == 0)
+	error = mac_vnode_check_write(active_cred, fcred, vp);
+#else
+	error = 0;
 #endif
-		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
-	if ((flags & FOF_OFFSET) == 0) {
-		fp->f_offset = uio->uio_offset;
-		mtx_lock(mtxp);
-		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
-			wakeup(&fp->f_vnread_flags);
-		fp->f_vnread_flags = 0;
-		mtx_unlock(mtxp);
+	if (error == 0) {
+		if (!vmio_enabled ||
+		    (error = vnode_pager_write(vp, uio, ioflag)) == EOPNOTSUPP)
+			error = VOP_WRITE(vp, uio, ioflag, fcred);
 	}
-	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0);
+	if (vp->v_type != VCHR)
+		vn_finished_write(mp);
+unlock:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
@@ -563,24 +650,17 @@ vn_read(fp, uio, active_cred, flags, td)
  * File table vnode write routine.
  */
 static int
-vn_write(fp, uio, active_cred, flags, td)
-	struct file *fp;
-	struct uio *uio;
-	struct ucred *active_cred;
-	struct thread *td;
-	int flags;
+vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+    struct thread *td)
 {
 	struct vnode *vp;
-	struct mount *mp;
-	int error, ioflag, lock_flags;
-	int vfslocked;
+	struct mtx *mtxp;
+	void *rl_cookie;
+	int error, ioflag;
 
 	KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
 	    uio->uio_td, td));
 	vp = fp->f_vnode;
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-	if (vp->v_type == VREG)
-		bwillwrite();
 	ioflag = IO_UNIT;
 	if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
 		ioflag |= IO_APPEND;
@@ -591,36 +671,32 @@ vn_write(fp, uio, active_cred, flags, td
 	if ((fp->f_flag & O_FSYNC) ||
 	    (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
 		ioflag |= IO_SYNC;
-	mp = NULL;
-	if (vp->v_type != VCHR &&
-	    (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
-		goto unlock;
- 
-	if ((MNT_SHARED_WRITES(mp) ||
-	    ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
-	    (flags & FOF_OFFSET) != 0) {
-		lock_flags = LK_SHARED;
-	} else {
-		lock_flags = LK_EXCLUSIVE;
-	}
-
-	vn_lock(vp, lock_flags | LK_RETRY);
-	if ((flags & FOF_OFFSET) == 0)
+	if ((flags & FOF_OFFSET) == 0) {
+		mtxp = vn_lock_foffset(fp);
 		uio->uio_offset = fp->f_offset;
+	} else
+		mtxp = NULL; /* gcc */
 	ioflag |= sequential_heuristic(uio, fp);
-#ifdef MAC
-	error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
-	if (error == 0)
-#endif
-		error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
-	if ((flags & FOF_OFFSET) == 0)
+	if (vp->v_type == VREG) {
+		if ((ioflag & IO_APPEND) || !(flags & FOF_OFFSET))
+			/*
+			 * For appenders, punt and lock the whole
+			 * range. This also protects f_offset.
+			 */
+			rl_cookie = rangelock_wlock(vp, 0, (size_t)-1);
+		else
+			rl_cookie = rangelock_wlock(vp, uio->uio_offset,
+			    uio->uio_resid);
+	} else
+		rl_cookie = NULL;
+	error = vn_write_chunk(vp, uio, active_cred, fp->f_cred, ioflag);
+	if (rl_cookie != NULL)
+		rangelock_unlock(vp, rl_cookie);
+	if ((flags & FOF_OFFSET) == 0) {
 		fp->f_offset = uio->uio_offset;
+		vn_unlock_foffset(fp, mtxp);
+	}
 	fp->f_nextoff = uio->uio_offset;
-	VOP_UNLOCK(vp, 0);
-	if (vp->v_type != VCHR)
-		vn_finished_write(mp);
-unlock:
-	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -628,25 +704,29 @@ unlock:
  * File table truncate routine.
  */
 static int
-vn_truncate(fp, length, active_cred, td)
-	struct file *fp;
-	off_t length;
-	struct ucred *active_cred;
-	struct thread *td;
+vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+    struct thread *td)
 {
 	struct vattr vattr;
 	struct mount *mp;
 	struct vnode *vp;
+	void *rl_cookie;
 	int vfslocked;
 	int error;
 
 	vp = fp->f_vnode;
+
+	/*
+	 * Lock the range where the shortening takes place. Increasing
+	 * the file size does not need the rangelock, but it is faster
+	 * to lock the range than to call VOP_GETATTR to get the
+	 * current size and deal with races.
+	 */
+	rl_cookie = rangelock_wlock(vp, length, -1);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
-	if (error) {
-		VFS_UNLOCK_GIANT(vfslocked);
-		return (error);
-	}
+	if (error)
+		goto out1;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	if (vp->v_type == VDIR) {
 		error = EISDIR;
@@ -666,7 +746,9 @@ vn_truncate(fp, length, active_cred, td)
 out:
 	VOP_UNLOCK(vp, 0);
 	vn_finished_write(mp);
+out1:
 	VFS_UNLOCK_GIANT(vfslocked);
+	rangelock_unlock(vp, rl_cookie);
 	return (error);
 }
 

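Since vn_read_chunk() is exported from vfs_vnops.c, other kernel code
can call it directly. A hypothetical caller (names assumed, not part of
this commit) reading into a kernel buffer with ioflag 0, so the helper
acquires Giant and the shared vnode lock itself; a caller needing
atomicity against concurrent writers would additionally take a
rangelock, as vn_read() does:

	struct iovec aiov;
	struct uio auio;
	int error;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = off;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;
	auio.uio_rw = UIO_READ;
	auio.uio_td = curthread;
	error = vn_read_chunk(vp, &auio, cred, cred, 0);
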
Modified: user/kib/vm6/sys/kern/vnode_if.src
==============================================================================
--- user/kib/vm6/sys/kern/vnode_if.src	Sat Feb 20 16:32:33 2010	(r204131)
+++ user/kib/vm6/sys/kern/vnode_if.src	Sat Feb 20 16:34:42 2010	(r204132)
@@ -611,3 +611,12 @@ vop_vptocnp {
 	INOUT char *buf;
 	INOUT int *buflen;
 };
+
+%% extend		vp	L L L
+
+vop_extend {
+	IN struct vnode *vp;
+	IN struct ucred *cred;
+	IN u_quad_t size;
+	IN int flags;
+};

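The declaration above makes the vnode_if generator emit VOP_EXTEND()
with the vnode locked across the call (the "L L L" spec). A hedged
caller sketch, relying on the vop_stdextend() fallback added to
vfs_default.c:

	/*
	 * Grow vp to at least 1 MB; with the default implementation
	 * this is VOP_GETATTR() followed by VOP_SETATTR() of va_size,
	 * and a no-op if the file is already at least that large.
	 */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_EXTEND(vp, cred, 1024 * 1024, 0);
	VOP_UNLOCK(vp, 0);
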
Modified: user/kib/vm6/sys/sys/buf.h
==============================================================================
--- user/kib/vm6/sys/sys/buf.h	Sat Feb 20 16:32:33 2010	(r204131)
+++ user/kib/vm6/sys/sys/buf.h	Sat Feb 20 16:34:42 2010	(r204132)
@@ -257,6 +257,8 @@ extern const char *buf_wmesg;		/* Defaul
 #include <sys/proc.h>			/* XXX for curthread */
 #include <sys/mutex.h>
 
+extern int vfs_read_max;
+
 /*
  * Initialize a lock.
  */

Modified: user/kib/vm6/sys/sys/file.h
==============================================================================
--- user/kib/vm6/sys/sys/file.h	Sat Feb 20 16:32:33 2010	(r204131)
+++ user/kib/vm6/sys/sys/file.h	Sat Feb 20 16:34:42 2010	(r204132)
@@ -141,6 +141,8 @@ struct file {
 #define	FOFFSET_LOCKED       0x1
 #define	FOFFSET_LOCK_WAITING 0x2		 
 
+#define	FRA_BLOCK_SZ	     16384
+
 #endif /* _KERNEL || _WANT_FILE */
 
 /*

Modified: user/kib/vm6/sys/sys/proc.h
==============================================================================
--- user/kib/vm6/sys/sys/proc.h	Sat Feb 20 16:32:33 2010	(r204131)
+++ user/kib/vm6/sys/sys/proc.h	Sat Feb 20 16:34:42 2010	(r204132)
@@ -354,7 +354,7 @@ do {									\
 #define	TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */
 #define	TDP_ALTSTACK	0x00000020 /* Have alternate signal stack. */
 #define	TDP_DEADLKTREAT	0x00000040 /* Lock aquisition - deadlock treatment. */
-#define	TDP_UNUSED80	0x00000080 /* available. */
+#define	TDP_VMIO	0x00000080 /* Busied pages for vnode_pager io. */
 #define	TDP_NOSLEEPING	0x00000100 /* Thread is not allowed to sleep on a sq. */
 #define	TDP_OWEUPC	0x00000200 /* Call addupc() at next AST. */
 #define	TDP_ITHREAD	0x00000400 /* Thread is an interrupt thread. */

Added: user/kib/vm6/sys/sys/rangelock.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/kib/vm6/sys/sys/rangelock.h	Sat Feb 20 16:34:42 2010	(r204132)
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov <kib at FreeBSD.org>
+ * All rights reserved.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef	_SYS_RANGELOCK_H
+#define	_SYS_RANGELOCK_H
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/lock.h>
+#include <sys/queue.h>
+#include <sys/sx.h>
+
+#ifdef _KERNEL
+
+struct vnode;
+
+struct rl_q_entry
+{
+	TAILQ_ENTRY(rl_q_entry) rl_q_link;
+	size_t rl_q_start, rl_q_end;
+	int rl_q_flags;
+};
+
+#define	RL_LOCK_READ		0x0001
+#define	RL_LOCK_WRITE		0x0002
+#define	RL_LOCK_TYPE_MASK	0x0003
+#define	RL_LOCK_GRANTED		0x0004
+
+struct rangelock
+{
+	TAILQ_HEAD(, rl_q_entry) rl_waiters;
+	struct rl_q_entry *rl_currdep;
+};
+
+void	rangelock_init(struct rangelock *lock);
+void	rangelock_destroy(struct rangelock *lock);
+void	rangelock_unlock(struct vnode *vp, void *cookie);
+void   *rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base,
+    size_t len);
+void   *rangelock_rlock(struct vnode *vp, off_t base, size_t len);
+void   *rangelock_wlock(struct vnode *vp, off_t base, size_t len);
+#endif
+
+#endif

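rangelock_unlock_range() is the least obvious entry point above: the
held range must start at base, the lock is trimmed down to
[base, base + len), newly compatible waiters are woken up, and NULL is
returned once nothing remains held. A hedged sketch:

	void *cookie;

	cookie = rangelock_wlock(vp, 0, 65536);	/* hold [0, 64K) */
	/* The tail is finished with; keep only [0, 16K) locked. */
	cookie = rangelock_unlock_range(vp, cookie, 0, 16384);
	if (cookie != NULL)
		rangelock_unlock(vp, cookie);	/* drop the remainder */
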
Modified: user/kib/vm6/sys/sys/vnode.h
==============================================================================
--- user/kib/vm6/sys/sys/vnode.h	Sat Feb 20 16:32:33 2010	(r204131)
+++ user/kib/vm6/sys/sys/vnode.h	Sat Feb 20 16:34:42 2010	(r204132)
@@ -38,6 +38,7 @@
 #include <sys/lock.h>
 #include <sys/lockmgr.h>
 #include <sys/mutex.h>
+#include <sys/rangelock.h>
 #include <sys/selinfo.h>
 #include <sys/uio.h>
 #include <sys/acl.h>
@@ -168,7 +169,8 @@ struct vnode {
 	 */
 	struct vpollinfo *v_pollinfo;		/* G Poll events, p for *v_pi */
 	struct label *v_label;			/* MAC label for vnode */
-	struct lockf *v_lockf;			/* Byte-level lock list */
+	struct lockf *v_lockf;			/* Byte-level adv lock list */
+	struct rangelock v_rl;			/* Byte-range lock */
 };
 

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***

