svn commit: r204132 - in user/kib/vm6/sys: conf dev/md kern sys ufs/ffs ufs/ufs vm
Konstantin Belousov
kib@FreeBSD.org
Sat Feb 20 16:34:42 UTC 2010
Author: kib
Date: Sat Feb 20 16:34:42 2010
New Revision: 204132
URL: http://svn.freebsd.org/changeset/base/204132
Log:
Implementation of range locking for i/o and vm i/o.
First vm_readwrite.c implementation by: jeff
In collaboration with: pho
Added:
user/kib/vm6/sys/kern/kern_rangelock.c (contents, props changed)
user/kib/vm6/sys/sys/rangelock.h (contents, props changed)
user/kib/vm6/sys/vm/vm_readwrite.c (contents, props changed)
Modified:
user/kib/vm6/sys/conf/files
user/kib/vm6/sys/dev/md/md.c
user/kib/vm6/sys/kern/vfs_cluster.c
user/kib/vm6/sys/kern/vfs_default.c
user/kib/vm6/sys/kern/vfs_subr.c
user/kib/vm6/sys/kern/vfs_vnops.c
user/kib/vm6/sys/kern/vnode_if.src
user/kib/vm6/sys/sys/buf.h
user/kib/vm6/sys/sys/file.h
user/kib/vm6/sys/sys/proc.h
user/kib/vm6/sys/sys/vnode.h
user/kib/vm6/sys/ufs/ffs/ffs_balloc.c
user/kib/vm6/sys/ufs/ffs/ffs_softdep.c
user/kib/vm6/sys/ufs/ffs/ffs_vnops.c
user/kib/vm6/sys/ufs/ufs/ufs_vnops.c
user/kib/vm6/sys/vm/vm_extern.h
user/kib/vm6/sys/vm/vm_fault.c
user/kib/vm6/sys/vm/vm_page.c
user/kib/vm6/sys/vm/vm_page.h
user/kib/vm6/sys/vm/vm_pageout.c
user/kib/vm6/sys/vm/vm_pageout.h
user/kib/vm6/sys/vm/vm_phys.c
user/kib/vm6/sys/vm/vnode_pager.c
Modified: user/kib/vm6/sys/conf/files
==============================================================================
--- user/kib/vm6/sys/conf/files Sat Feb 20 16:32:33 2010 (r204131)
+++ user/kib/vm6/sys/conf/files Sat Feb 20 16:34:42 2010 (r204132)
@@ -2077,6 +2077,7 @@ kern/kern_poll.c optional device_pollin
kern/kern_priv.c standard
kern/kern_proc.c standard
kern/kern_prot.c standard
+kern/kern_rangelock.c standard
kern/kern_resource.c standard
kern/kern_rmlock.c standard
kern/kern_rwlock.c standard
@@ -2768,6 +2769,7 @@ vm/vm_page.c standard
vm/vm_pageout.c standard
vm/vm_pager.c standard
vm/vm_phys.c standard
+vm/vm_readwrite.c standard
vm/vm_reserv.c standard
vm/vm_unix.c standard
vm/vm_zeroidle.c standard
Modified: user/kib/vm6/sys/dev/md/md.c
==============================================================================
--- user/kib/vm6/sys/dev/md/md.c Sat Feb 20 16:32:33 2010 (r204131)
+++ user/kib/vm6/sys/dev/md/md.c Sat Feb 20 16:34:42 2010 (r204132)
@@ -85,6 +85,7 @@
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>
@@ -587,7 +588,7 @@ mdstart_swap(struct md_s *sc, struct bio
{
struct sf_buf *sf;
int rv, offs, len, lastend;
- vm_pindex_t i, lastp;
+ vm_pindex_t i, firstp, lastp;
vm_page_t m;
u_char *p;
@@ -610,18 +611,26 @@ mdstart_swap(struct md_s *sc, struct bio
* we're operating on complete aligned pages).
*/
offs = bp->bio_offset % PAGE_SIZE;
+ firstp = bp->bio_offset / PAGE_SIZE;
lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
+ vm_page_t ma[lastp - firstp + 1];
+
rv = VM_PAGER_OK;
VM_OBJECT_LOCK(sc->object);
vm_object_pip_add(sc->object, 1);
- for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
+ for (i = firstp; i <= lastp; i++) {
len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
- m = vm_page_grab(sc->object, i,
- VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
+ /*
+ * Write cleans pages of the buffer, give it a
+ * priority.
+ */
+ m = vm_page_grab(sc->object, i, (bp->bio_cmd == BIO_WRITE ?
+ VM_ALLOC_SYSTEM : VM_ALLOC_NORMAL) | VM_ALLOC_RETRY);
VM_OBJECT_UNLOCK(sc->object);
+ ma[i - firstp] = m;
sched_pin();
sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
VM_OBJECT_LOCK(sc->object);
@@ -683,6 +692,12 @@ printf("wire_count %d busy %d flags %x h
}
vm_object_pip_subtract(sc->object, 1);
vm_object_set_writeable_dirty(sc->object);
+ if (rv != VM_PAGER_ERROR && bp->bio_cmd == BIO_WRITE &&
+ vm_page_count_severe()) {
+ vm_page_lock_queues();
+ vm_pageout_flush(ma, lastp - firstp + 1, IO_SYNC);
+ vm_page_unlock_queues();
+ }
VM_OBJECT_UNLOCK(sc->object);
return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
}
Added: user/kib/vm6/sys/kern/kern_rangelock.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/kib/vm6/sys/kern/kern_rangelock.c Sat Feb 20 16:34:42 2010 (r204132)
@@ -0,0 +1,186 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov <kib at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/rangelock.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+uma_zone_t rl_entry_zone;
+
+static void
+rangelock_sys_init(void)
+{
+
+ rl_entry_zone = uma_zcreate("rl_entry", sizeof(struct rl_q_entry),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_ANY, rangelock_sys_init, NULL);
+
+void
+rangelock_init(struct rangelock *lock)
+{
+
+ TAILQ_INIT(&lock->rl_waiters);
+ lock->rl_currdep = NULL;
+}
+
+void
+rangelock_destroy(struct rangelock *lock)
+{
+
+ KASSERT(TAILQ_EMPTY(&lock->rl_waiters), ("Dangling waiters"));
+}
+
+static int
+rangelock_incompatible(const struct rl_q_entry *e1,
+ const struct rl_q_entry *e2)
+{
+
+ if ((e1->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ &&
+ (e2->rl_q_flags & RL_LOCK_TYPE_MASK) == RL_LOCK_READ)
+ return (0);
+#define IN_RANGE(a, e) (a >= e->rl_q_start && a < e->rl_q_end)
+ if (IN_RANGE(e1->rl_q_start, e2) || IN_RANGE(e2->rl_q_start, e1) ||
+ IN_RANGE(e1->rl_q_end, e2) || IN_RANGE(e2->rl_q_end, e1))
+ return (1);
+#undef IN_RANGE
+ return (0);
+}
+
+static void
+rangelock_calc_block(struct rangelock *lock)
+{
+ struct rl_q_entry *entry, *entry1, *whead;
+
+ if (lock->rl_currdep == TAILQ_FIRST(&lock->rl_waiters) &&
+ lock->rl_currdep != NULL)
+ lock->rl_currdep = TAILQ_NEXT(lock->rl_currdep, rl_q_link);
+ for (entry = lock->rl_currdep; entry;
+ entry = TAILQ_NEXT(entry, rl_q_link)) {
+ TAILQ_FOREACH(entry1, &lock->rl_waiters, rl_q_link) {
+ if (rangelock_incompatible(entry, entry1))
+ goto out;
+ if (entry1 == entry)
+ break;
+ }
+ }
+out:
+ lock->rl_currdep = entry;
+ TAILQ_FOREACH(whead, &lock->rl_waiters, rl_q_link) {
+ if (whead == lock->rl_currdep)
+ break;
+ if (!(whead->rl_q_flags & RL_LOCK_GRANTED)) {
+ whead->rl_q_flags |= RL_LOCK_GRANTED;
+ wakeup(whead);
+ }
+ }
+}
+
+static void
+rangelock_unlock_vp_locked(struct vnode *vp, struct rl_q_entry *entry)
+{
+
+ ASSERT_VI_LOCKED(vp, "rangelock");
+ KASSERT(entry != vp->v_rl.rl_currdep, ("stuck currdep"));
+ TAILQ_REMOVE(&vp->v_rl.rl_waiters, entry, rl_q_link);
+ rangelock_calc_block(&vp->v_rl);
+ VI_UNLOCK(vp);
+ uma_zfree(rl_entry_zone, entry);
+}
+
+void
+rangelock_unlock(struct vnode *vp, void *cookie)
+{
+ struct rl_q_entry *entry;
+
+ entry = cookie;
+ VI_LOCK(vp);
+ rangelock_unlock_vp_locked(vp, entry);
+}
+
+void *
+rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base, size_t len)
+{
+ struct rl_q_entry *entry;
+
+ entry = cookie;
+ VI_LOCK(vp);
+ KASSERT(entry->rl_q_flags & RL_LOCK_GRANTED, ("XXX"));
+ KASSERT(entry->rl_q_start == base, ("XXX"));
+ KASSERT(entry->rl_q_end >= base + len, ("XXX"));
+ if (entry->rl_q_end == base + len) {
+ rangelock_unlock_vp_locked(vp, cookie);
+ return (NULL);
+ }
+ entry->rl_q_end = base + len;
+ rangelock_calc_block(&vp->v_rl);
+ VI_UNLOCK(vp);
+ return (cookie);
+}
+
+static void *
+rangelock_enqueue(struct vnode *vp, struct rl_q_entry *entry)
+{
+
+ VI_LOCK(vp);
+ TAILQ_INSERT_TAIL(&vp->v_rl.rl_waiters, entry, rl_q_link);
+ if (vp->v_rl.rl_currdep == NULL)
+ vp->v_rl.rl_currdep = entry;
+ rangelock_calc_block(&vp->v_rl);
+ while (!(entry->rl_q_flags & RL_LOCK_GRANTED))
+ msleep(entry, &vp->v_interlock, 0, "range", 0);
+ VI_UNLOCK(vp);
+ return (entry);
+}
+
+void *
+rangelock_rlock(struct vnode *vp, off_t base, size_t len)
+{
+ struct rl_q_entry *entry;
+
+ entry = uma_zalloc(rl_entry_zone, M_WAITOK);
+ entry->rl_q_flags = RL_LOCK_READ;
+ entry->rl_q_start = base;
+ entry->rl_q_end = base + len;
+ return (rangelock_enqueue(vp, entry));
+}
+
+void *
+rangelock_wlock(struct vnode *vp, off_t base, size_t len)
+{
+ struct rl_q_entry *entry;
+
+ entry = uma_zalloc(rl_entry_zone, M_WAITOK);
+ entry->rl_q_flags = RL_LOCK_WRITE;
+ entry->rl_q_start = base;
+ entry->rl_q_end = base + len;
+ return (rangelock_enqueue(vp, entry));
+}
Modified: user/kib/vm6/sys/kern/vfs_cluster.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_cluster.c Sat Feb 20 16:32:33 2010 (r204131)
+++ user/kib/vm6/sys/kern/vfs_cluster.c Sat Feb 20 16:34:42 2010 (r204132)
@@ -71,8 +71,8 @@ static int write_behind = 1;
SYSCTL_INT(_vfs, OID_AUTO, write_behind, CTLFLAG_RW, &write_behind, 0,
"Cluster write-behind; 0: disable, 1: enable, 2: backed off");
-static int read_max = 8;
-SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &read_max, 0,
+int vfs_read_max = 8;
+SYSCTL_INT(_vfs, OID_AUTO, read_max, CTLFLAG_RW, &vfs_read_max, 0,
"Cluster read-ahead max block count");
/* Page expended to mark partially backed buffers */
@@ -109,7 +109,7 @@ cluster_read(vp, filesize, lblkno, size,
*/
racluster = vp->v_mount->mnt_iosize_max / size;
maxra = seqcount;
- maxra = min(read_max, maxra);
+ maxra = min(vfs_read_max, maxra);
maxra = min(nbuf/8, maxra);
if (((u_quad_t)(lblkno + maxra + 1) * size) > filesize)
maxra = (filesize / size) - lblkno;
@@ -803,7 +803,9 @@ cluster_wbuild(vp, size, start_lbn, len)
(tbp->b_bcount != tbp->b_bufsize) ||
(tbp->b_bcount != size) ||
(len == 1) ||
- ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
+ ((bp = (vp->v_vflag & VV_MD) ?
+ trypbuf(&cluster_pbuf_freecnt) :
+ getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
totalwritten += tbp->b_bufsize;
bawrite(tbp);
++start_lbn;
Modified: user/kib/vm6/sys/kern/vfs_default.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_default.c Sat Feb 20 16:32:33 2010 (r204131)
+++ user/kib/vm6/sys/kern/vfs_default.c Sat Feb 20 16:34:42 2010 (r204132)
@@ -77,6 +77,8 @@ static int dirent_exists(struct vnode *v
#define DIRENT_MINSIZE (sizeof(struct dirent) - (MAXNAMLEN+1) + 4)
+static int vop_stdextend(struct vop_extend_args *ap);
+
/*
* This vnode table stores what we want to do if the filesystem doesn't
* implement a particular VOP.
@@ -118,6 +120,7 @@ struct vop_vector default_vnodeops = {
.vop_unlock = vop_stdunlock,
.vop_vptocnp = vop_stdvptocnp,
.vop_vptofh = vop_stdvptofh,
+ .vop_extend = vop_stdextend,
};
/*
@@ -825,6 +828,23 @@ out:
return (error);
}
+static int
+vop_stdextend(struct vop_extend_args *ap)
+{
+ struct vattr vattr, oattr;
+ int error;
+
+
+ error = VOP_GETATTR(ap->a_vp, &oattr, ap->a_cred);
+ if (error != 0)
+ return (error);
+ if (oattr.va_size >= ap->a_size)
+ return (0);
+ VATTR_NULL(&vattr);
+ vattr.va_size = ap->a_size;
+ return (VOP_SETATTR(ap->a_vp, &vattr, ap->a_cred));
+}
+
/*
* vfs default ops
* used to fill the vfs function table to get reasonable default return values.
Modified: user/kib/vm6/sys/kern/vfs_subr.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_subr.c Sat Feb 20 16:32:33 2010 (r204131)
+++ user/kib/vm6/sys/kern/vfs_subr.c Sat Feb 20 16:34:42 2010 (r204132)
@@ -861,6 +861,7 @@ vdestroy(struct vnode *vp)
/* XXX Elsewhere we can detect an already freed vnode via NULL v_op. */
vp->v_op = NULL;
#endif
+ rangelock_destroy(&vp->v_rl);
lockdestroy(vp->v_vnlock);
mtx_destroy(&vp->v_interlock);
mtx_destroy(BO_MTX(bo));
@@ -1015,6 +1016,7 @@ alloc:
if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
vp->v_vflag |= VV_NOKNOTE;
}
+ rangelock_init(&vp->v_rl);
*vpp = vp;
return (0);
Modified: user/kib/vm6/sys/kern/vfs_vnops.c
==============================================================================
--- user/kib/vm6/sys/kern/vfs_vnops.c Sat Feb 20 16:32:33 2010 (r204131)
+++ user/kib/vm6/sys/kern/vfs_vnops.c Sat Feb 20 16:34:42 2010 (r204132)
@@ -37,12 +37,14 @@
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kdb.h>
#include <sys/stat.h>
+#include <sys/sysctl.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/limits.h>
@@ -62,6 +64,13 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+
+static int vmio_enabled = 1;
+SYSCTL_INT(_vfs, OID_AUTO, vmio_enabled, CTLFLAG_RW, &vmio_enabled, 0,
+ "Use vm pages copyin/out instead of vops for read/write");
+
static fo_rdwr_t vn_read;
static fo_rdwr_t vn_write;
static fo_truncate_t vn_truncate;
@@ -83,6 +92,9 @@ struct fileops vnops = {
.fo_flags = DFLAG_PASSABLE | DFLAG_SEEKABLE
};
+static int vn_write_chunk(struct vnode *, struct uio *, struct ucred *,
+ struct ucred *, int);
+
int
vn_open(ndp, flagp, cmode, fp)
struct nameidata *ndp;
@@ -275,17 +287,14 @@ vn_writechk(vp)
* Vnode close call
*/
int
-vn_close(vp, flags, file_cred, td)
- register struct vnode *vp;
- int flags;
- struct ucred *file_cred;
- struct thread *td;
+vn_close(struct vnode *vp, int flags, struct ucred *file_cred,
+ struct thread *td)
{
- struct mount *mp;
+ struct mount *mp, *mp1;
int error, lock_flags;
- if (!(flags & FWRITE) && vp->v_mount != NULL &&
- vp->v_mount->mnt_kern_flag & MNTK_EXTENDED_SHARED)
+ if (!(flags & FWRITE) && (mp1 = vp->v_mount) != NULL &&
+ MNT_SHARED_WRITES(mp1))
lock_flags = LK_SHARED;
else
lock_flags = LK_EXCLUSIVE;
@@ -333,7 +342,7 @@ sequential_heuristic(struct uio *uio, st
* closely related to the best I/O size for real disks than
* to any block size used by software.
*/
- fp->f_seqcount += howmany(uio->uio_resid, 16384);
+ fp->f_seqcount += howmany(uio->uio_resid, FRA_BLOCK_SZ);
if (fp->f_seqcount > IO_SEQMAX)
fp->f_seqcount = IO_SEQMAX;
return (fp->f_seqcount << IO_SEQSHIFT);
@@ -351,76 +360,71 @@ sequential_heuristic(struct uio *uio, st
* Package up an I/O request on a vnode into a uio and do it.
*/
int
-vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, active_cred, file_cred,
- aresid, td)
- enum uio_rw rw;
- struct vnode *vp;
- void *base;
- int len;
- off_t offset;
- enum uio_seg segflg;
- int ioflg;
- struct ucred *active_cred;
- struct ucred *file_cred;
- int *aresid;
- struct thread *td;
+vn_rdwr(enum uio_rw rw, struct vnode *vp, void *base, int len, off_t offset,
+ enum uio_seg segflg, int ioflg, struct ucred *active_cred,
+ struct ucred *file_cred, int *aresid, struct thread *td)
{
struct uio auio;
struct iovec aiov;
struct mount *mp;
struct ucred *cred;
+ void *rl_cookie;
int error, lock_flags;
VFS_ASSERT_GIANT(vp->v_mount);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ aiov.iov_base = base;
+ aiov.iov_len = len;
+ auio.uio_resid = len;
+ auio.uio_offset = offset;
+ auio.uio_segflg = segflg;
+ auio.uio_rw = rw;
+ auio.uio_td = td;
+ error = 0;
+
if ((ioflg & IO_NODELOCKED) == 0) {
+ if (rw == UIO_READ)
+ rl_cookie = rangelock_rlock(vp, offset, len);
+ else
+ rl_cookie = rangelock_wlock(vp, offset, len);
mp = NULL;
if (rw == UIO_WRITE) {
if (vp->v_type != VCHR &&
(error = vn_start_write(vp, &mp, V_WAIT | PCATCH))
!= 0)
- return (error);
+ goto out;
if (MNT_SHARED_WRITES(mp) ||
- ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
+ ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount)))
lock_flags = LK_SHARED;
- } else {
+ else
lock_flags = LK_EXCLUSIVE;
- }
- vn_lock(vp, lock_flags | LK_RETRY);
} else
- vn_lock(vp, LK_SHARED | LK_RETRY);
+ lock_flags = LK_SHARED;
+ vn_lock(vp, lock_flags | LK_RETRY);
+ } else
+ rl_cookie = NULL;
- }
ASSERT_VOP_LOCKED(vp, "IO_NODELOCKED with no vp lock held");
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- aiov.iov_base = base;
- aiov.iov_len = len;
- auio.uio_resid = len;
- auio.uio_offset = offset;
- auio.uio_segflg = segflg;
- auio.uio_rw = rw;
- auio.uio_td = td;
- error = 0;
#ifdef MAC
if ((ioflg & IO_NOMACCHECK) == 0) {
- if (rw == UIO_READ)
- error = mac_vnode_check_read(active_cred, file_cred,
- vp);
- else
+ if (rw == UIO_WRITE)
error = mac_vnode_check_write(active_cred, file_cred,
vp);
}
#endif
if (error == 0) {
- if (file_cred)
+ if (file_cred != NULL)
cred = file_cred;
else
cred = active_cred;
if (rw == UIO_READ)
- error = VOP_READ(vp, &auio, ioflg, cred);
+ error = vn_read_chunk(vp, &auio, active_cred, cred,
+ ioflg | IO_NODELOCKED);
else
- error = VOP_WRITE(vp, &auio, ioflg, cred);
+ error = vn_write_chunk(vp, &auio, active_cred, cred,
+ ioflg | IO_NODELOCKED);
}
if (aresid)
*aresid = auio.uio_resid;
@@ -428,10 +432,13 @@ vn_rdwr(rw, vp, base, len, offset, segfl
if (auio.uio_resid && error == 0)
error = EIO;
if ((ioflg & IO_NODELOCKED) == 0) {
- if (rw == UIO_WRITE && vp->v_type != VCHR)
- vn_finished_write(mp);
VOP_UNLOCK(vp, 0);
+ if (mp != NULL)
+ vn_finished_write(mp);
}
+ out:
+ if (rl_cookie != NULL)
+ rangelock_unlock(vp, rl_cookie);
return (error);
}
@@ -493,68 +500,148 @@ vn_rdwr_inchunks(rw, vp, base, len, offs
return (error);
}
+static struct mtx *
+vn_lock_foffset(struct file *fp)
+{
+ struct mtx *mtxp;
+
+ mtxp = mtx_pool_find(mtxpool_sleep, fp);
+ mtx_lock(mtxp);
+ while (fp->f_vnread_flags & FOFFSET_LOCKED) {
+ fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+ msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
+ "vnread offlock", 0);
+ }
+ fp->f_vnread_flags |= FOFFSET_LOCKED;
+ mtx_unlock(mtxp);
+ return (mtxp);
+}
+
+static void
+vn_unlock_foffset(struct file *fp, struct mtx *mtxp)
+{
+
+ mtx_lock(mtxp);
+ if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+ wakeup(&fp->f_vnread_flags);
+ fp->f_vnread_flags = 0;
+ mtx_unlock(mtxp);
+}
+
+int
+vn_read_chunk(struct vnode *vp, struct uio *uio, struct ucred *active_cred,
+ struct ucred *fcred, int ioflag)
+{
+ int error, vfslocked;
+
+ error = 0;
+ vfslocked = 0; /* gcc */
+
+ if ((ioflag & IO_NODELOCKED) == 0) {
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ }
+
+#ifdef MAC
+ if ((ioflag & IO_NOMACCHECK) == 0)
+ error = mac_vnode_check_read(active_cred, fcred, vp);
+#endif
+ if (error == 0) {
+ if (!vmio_enabled ||
+ (error = vnode_pager_read(vp, uio, ioflag)) == EOPNOTSUPP)
+ error = VOP_READ(vp, uio, ioflag, fcred);
+ }
+ if ((ioflag & IO_NODELOCKED) == 0) {
+ VOP_UNLOCK(vp, 0);
+ VFS_UNLOCK_GIANT(vfslocked);
+ }
+ return (error);
+}
+
/*
* File table vnode read routine.
*/
static int
-vn_read(fp, uio, active_cred, flags, td)
- struct file *fp;
- struct uio *uio;
- struct ucred *active_cred;
- struct thread *td;
- int flags;
+vn_read(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+ struct thread *td)
{
struct vnode *vp;
- int error, ioflag;
struct mtx *mtxp;
- int vfslocked;
+ void *rl_cookie;
+ int ioflag;
+ int error;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
- mtxp = NULL;
- vp = fp->f_vnode;
ioflag = 0;
if (fp->f_flag & FNONBLOCK)
ioflag |= IO_NDELAY;
if (fp->f_flag & O_DIRECT)
ioflag |= IO_DIRECT;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ vp = fp->f_vnode;
+
/*
* According to McKusick the vn lock was protecting f_offset here.
* It is now protected by the FOFFSET_LOCKED flag.
*/
if ((flags & FOF_OFFSET) == 0) {
- mtxp = mtx_pool_find(mtxpool_sleep, fp);
- mtx_lock(mtxp);
- while(fp->f_vnread_flags & FOFFSET_LOCKED) {
- fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
- msleep(&fp->f_vnread_flags, mtxp, PUSER -1,
- "vnread offlock", 0);
- }
- fp->f_vnread_flags |= FOFFSET_LOCKED;
- mtx_unlock(mtxp);
- vn_lock(vp, LK_SHARED | LK_RETRY);
+ mtxp = vn_lock_foffset(fp);
uio->uio_offset = fp->f_offset;
} else
- vn_lock(vp, LK_SHARED | LK_RETRY);
-
+ mtxp = NULL; /* gcc */
+ if (vp->v_type == VREG)
+ rl_cookie = rangelock_rlock(vp, uio->uio_offset,
+ uio->uio_resid);
+ else
+ rl_cookie = NULL;
ioflag |= sequential_heuristic(uio, fp);
+ error = vn_read_chunk(vp, uio, active_cred, fp->f_cred, ioflag);
+ fp->f_nextoff = uio->uio_offset;
+ if (rl_cookie != NULL)
+ rangelock_unlock(vp, rl_cookie);
+ if ((flags & FOF_OFFSET) == 0) {
+ fp->f_offset = uio->uio_offset;
+ vn_unlock_foffset(fp, mtxp);
+ }
+ return (error);
+}
+
+static int
+vn_write_chunk(struct vnode *vp, struct uio *uio, struct ucred *active_cred,
+ struct ucred *fcred, int ioflag)
+{
+ struct mount *mp, *mp1;
+ int error, lock_flags, vfslocked;
+
+ mp = NULL;
+ vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ if (vp->v_type == VREG)
+ bwillwrite();
+ if (vp->v_type != VCHR &&
+ (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ goto unlock;
+ if (MNT_SHARED_WRITES(mp) ||
+ (mp == NULL && (mp1 = vp->v_mount) != NULL &&
+ MNT_SHARED_WRITES(mp1)))
+ lock_flags = LK_SHARED;
+ else
+ lock_flags = LK_EXCLUSIVE;
+ vn_lock(vp, lock_flags | LK_RETRY);
#ifdef MAC
- error = mac_vnode_check_read(active_cred, fp->f_cred, vp);
- if (error == 0)
+ error = mac_vnode_check_write(active_cred, fcred, vp);
+#else
+ error = 0;
#endif
- error = VOP_READ(vp, uio, ioflag, fp->f_cred);
- if ((flags & FOF_OFFSET) == 0) {
- fp->f_offset = uio->uio_offset;
- mtx_lock(mtxp);
- if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
- wakeup(&fp->f_vnread_flags);
- fp->f_vnread_flags = 0;
- mtx_unlock(mtxp);
+ if (error == 0) {
+ if (!vmio_enabled ||
+ (error = vnode_pager_write(vp, uio, ioflag)) == EOPNOTSUPP)
+ error = VOP_WRITE(vp, uio, ioflag, fcred);
}
- fp->f_nextoff = uio->uio_offset;
VOP_UNLOCK(vp, 0);
+ if (vp->v_type != VCHR)
+ vn_finished_write(mp);
+unlock:
VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -563,24 +650,17 @@ vn_read(fp, uio, active_cred, flags, td)
* File table vnode write routine.
*/
static int
-vn_write(fp, uio, active_cred, flags, td)
- struct file *fp;
- struct uio *uio;
- struct ucred *active_cred;
- struct thread *td;
- int flags;
+vn_write(struct file *fp, struct uio *uio, struct ucred *active_cred, int flags,
+ struct thread *td)
{
struct vnode *vp;
- struct mount *mp;
- int error, ioflag, lock_flags;
- int vfslocked;
+ struct mtx *mtxp;
+ void *rl_cookie;
+ int error, ioflag;
KASSERT(uio->uio_td == td, ("uio_td %p is not td %p",
uio->uio_td, td));
vp = fp->f_vnode;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
- if (vp->v_type == VREG)
- bwillwrite();
ioflag = IO_UNIT;
if (vp->v_type == VREG && (fp->f_flag & O_APPEND))
ioflag |= IO_APPEND;
@@ -591,36 +671,32 @@ vn_write(fp, uio, active_cred, flags, td
if ((fp->f_flag & O_FSYNC) ||
(vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS)))
ioflag |= IO_SYNC;
- mp = NULL;
- if (vp->v_type != VCHR &&
- (error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
- goto unlock;
-
- if ((MNT_SHARED_WRITES(mp) ||
- ((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) &&
- (flags & FOF_OFFSET) != 0) {
- lock_flags = LK_SHARED;
- } else {
- lock_flags = LK_EXCLUSIVE;
- }
-
- vn_lock(vp, lock_flags | LK_RETRY);
- if ((flags & FOF_OFFSET) == 0)
+ if ((flags & FOF_OFFSET) == 0) {
+ mtxp = vn_lock_foffset(fp);
uio->uio_offset = fp->f_offset;
+ } else
+ mtxp = NULL; /* gcc */
ioflag |= sequential_heuristic(uio, fp);
-#ifdef MAC
- error = mac_vnode_check_write(active_cred, fp->f_cred, vp);
- if (error == 0)
-#endif
- error = VOP_WRITE(vp, uio, ioflag, fp->f_cred);
- if ((flags & FOF_OFFSET) == 0)
+ if (vp->v_type == VREG) {
+ if ((ioflag & IO_APPEND) || !(flags & FOF_OFFSET))
+ /*
+ * For appenders, punt and lock the whole
+ * range. It also protects f_offset.
+ */
+ rl_cookie = rangelock_wlock(vp, 0, (size_t)-1);
+ else
+ rl_cookie = rangelock_wlock(vp, uio->uio_offset,
+ uio->uio_resid);
+ } else
+ rl_cookie = NULL;
+ error = vn_write_chunk(vp, uio, active_cred, fp->f_cred, ioflag);
+ if (rl_cookie != NULL)
+ rangelock_unlock(vp, rl_cookie);
+ if ((flags & FOF_OFFSET) == 0) {
fp->f_offset = uio->uio_offset;
+ vn_unlock_foffset(fp, mtxp);
+ }
fp->f_nextoff = uio->uio_offset;
- VOP_UNLOCK(vp, 0);
- if (vp->v_type != VCHR)
- vn_finished_write(mp);
-unlock:
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -628,25 +704,29 @@ unlock:
* File table truncate routine.
*/
static int
-vn_truncate(fp, length, active_cred, td)
- struct file *fp;
- off_t length;
- struct ucred *active_cred;
- struct thread *td;
+vn_truncate(struct file *fp, off_t length, struct ucred *active_cred,
+ struct thread *td)
{
struct vattr vattr;
struct mount *mp;
struct vnode *vp;
+ void *rl_cookie;
int vfslocked;
int error;
vp = fp->f_vnode;
+
+ /*
+ * Lock the range where the shortening take place. Increase of
+ * file size does not need rangelock, but it is faster to lock
+ * the range then call VOP_GETATTR to get the current size and
+ * deal with races.
+ */
+ rl_cookie = rangelock_wlock(vp, length, -1);
vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
- if (error) {
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
- }
+ if (error)
+ goto out1;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_type == VDIR) {
error = EISDIR;
@@ -666,7 +746,9 @@ vn_truncate(fp, length, active_cred, td)
out:
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
+out1:
VFS_UNLOCK_GIANT(vfslocked);
+ rangelock_unlock(vp, rl_cookie);
return (error);
}
Modified: user/kib/vm6/sys/kern/vnode_if.src
==============================================================================
--- user/kib/vm6/sys/kern/vnode_if.src Sat Feb 20 16:32:33 2010 (r204131)
+++ user/kib/vm6/sys/kern/vnode_if.src Sat Feb 20 16:34:42 2010 (r204132)
@@ -611,3 +611,12 @@ vop_vptocnp {
INOUT char *buf;
INOUT int *buflen;
};
+
+%% extend vp L L L
+
+vop_extend {
+ IN struct vnode *vp;
+ IN struct ucred *cred;
+ IN u_quad_t size;
+ IN int flags;
+};
Modified: user/kib/vm6/sys/sys/buf.h
==============================================================================
--- user/kib/vm6/sys/sys/buf.h Sat Feb 20 16:32:33 2010 (r204131)
+++ user/kib/vm6/sys/sys/buf.h Sat Feb 20 16:34:42 2010 (r204132)
@@ -257,6 +257,8 @@ extern const char *buf_wmesg; /* Defaul
#include <sys/proc.h> /* XXX for curthread */
#include <sys/mutex.h>
+extern int vfs_read_max;
+
/*
* Initialize a lock.
*/
Modified: user/kib/vm6/sys/sys/file.h
==============================================================================
--- user/kib/vm6/sys/sys/file.h Sat Feb 20 16:32:33 2010 (r204131)
+++ user/kib/vm6/sys/sys/file.h Sat Feb 20 16:34:42 2010 (r204132)
@@ -141,6 +141,8 @@ struct file {
#define FOFFSET_LOCKED 0x1
#define FOFFSET_LOCK_WAITING 0x2
+#define FRA_BLOCK_SZ 16384
+
#endif /* _KERNEL || _WANT_FILE */
/*
Modified: user/kib/vm6/sys/sys/proc.h
==============================================================================
--- user/kib/vm6/sys/sys/proc.h Sat Feb 20 16:32:33 2010 (r204131)
+++ user/kib/vm6/sys/sys/proc.h Sat Feb 20 16:34:42 2010 (r204132)
@@ -354,7 +354,7 @@ do { \
#define TDP_COWINPROGRESS 0x00000010 /* Snapshot copy-on-write in progress. */
#define TDP_ALTSTACK 0x00000020 /* Have alternate signal stack. */
#define TDP_DEADLKTREAT 0x00000040 /* Lock aquisition - deadlock treatment. */
-#define TDP_UNUSED80 0x00000080 /* available. */
+#define TDP_VMIO 0x00000080 /* Busied pages for vnode_pager io. */
#define TDP_NOSLEEPING 0x00000100 /* Thread is not allowed to sleep on a sq. */
#define TDP_OWEUPC 0x00000200 /* Call addupc() at next AST. */
#define TDP_ITHREAD 0x00000400 /* Thread is an interrupt thread. */
Added: user/kib/vm6/sys/sys/rangelock.h
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/kib/vm6/sys/sys/rangelock.h Sat Feb 20 16:34:42 2010 (r204132)
@@ -0,0 +1,48 @@
+/*-
+ * Copyright (c) 2009 Konstantin Belousov <kib at FreeBSD.org>
+ * All rights reserved.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_RANGELOCK_H
+#define _SYS_RANGELOCK_H
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/lock.h>
+#include <sys/queue.h>
+#include <sys/sx.h>
+
+#ifdef _KERNEL
+
+struct vnode;
+
+struct rl_q_entry
+{
+ TAILQ_ENTRY(rl_q_entry) rl_q_link;
+ size_t rl_q_start, rl_q_end;
+ int rl_q_flags;
+};
+
+#define RL_LOCK_READ 0x0001
+#define RL_LOCK_WRITE 0x0002
+#define RL_LOCK_TYPE_MASK 0x0003
+#define RL_LOCK_GRANTED 0x0004
+
+struct rangelock
+{
+ TAILQ_HEAD(, rl_q_entry) rl_waiters;
+ struct rl_q_entry *rl_currdep;
+};
+
+void rangelock_init(struct rangelock *lock);
+void rangelock_destroy(struct rangelock *lock);
+void rangelock_unlock(struct vnode *vp, void *cookie);
+void *rangelock_unlock_range(struct vnode *vp, void *cookie, off_t base,
+ size_t len);
+void *rangelock_rlock(struct vnode *vp, off_t base, size_t len);
+void *rangelock_wlock(struct vnode *vp, off_t base, size_t len);
+#endif
+
+#endif
Modified: user/kib/vm6/sys/sys/vnode.h
==============================================================================
--- user/kib/vm6/sys/sys/vnode.h Sat Feb 20 16:32:33 2010 (r204131)
+++ user/kib/vm6/sys/sys/vnode.h Sat Feb 20 16:34:42 2010 (r204132)
@@ -38,6 +38,7 @@
#include <sys/lock.h>
#include <sys/lockmgr.h>
#include <sys/mutex.h>
+#include <sys/rangelock.h>
#include <sys/selinfo.h>
#include <sys/uio.h>
#include <sys/acl.h>
@@ -168,7 +169,8 @@ struct vnode {
*/
struct vpollinfo *v_pollinfo; /* G Poll events, p for *v_pi */
struct label *v_label; /* MAC label for vnode */
- struct lockf *v_lockf; /* Byte-level lock list */
+ struct lockf *v_lockf; /* Byte-level adv lock list */
+ struct rangelock v_rl; /* Byte-range lock */
};
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-user
mailing list