PERFORCE change 200123 for review
John Baldwin
jhb at FreeBSD.org
Wed Oct 12 21:29:14 UTC 2011
http://p4web.freebsd.org/@@200123?ac=10
Change 200123 by jhb at jhb_jhbbsd on 2011/10/12 21:28:22
Checkpoint WIP for fadvise(2). Next I need to add the logic in
the vnode f_ops to actually act on the hints during an I/O. Was
hoping to leverage vm_object_madvise(MADV_DONTNEED) to handle
FADV_NOREUSE but that doesn't actually do what I need (it still
leaves the pages on the inactive queue, and I really want them
in the cache queue to avoid the degenerate splay tree case in
pagedaemon.. in fact, what I'd really like is to not have pagedaemon
run at _all_). Perhaps we can add a new internal variant
of MADV_DONTNEED that forces pages into cache instead of the
inactive queue.
Affected files ...
.. //depot/projects/fadvise/sys/compat/freebsd32/freebsd32_misc.c#2 edit
.. //depot/projects/fadvise/sys/compat/freebsd32/syscalls.master#2 edit
.. //depot/projects/fadvise/sys/kern/kern_descrip.c#2 edit
.. //depot/projects/fadvise/sys/kern/syscalls.master#2 edit
.. //depot/projects/fadvise/sys/kern/vfs_syscalls.c#2 edit
.. //depot/projects/fadvise/sys/sys/fcntl.h#2 edit
.. //depot/projects/fadvise/sys/sys/file.h#2 edit
Differences ...
==== //depot/projects/fadvise/sys/compat/freebsd32/freebsd32_misc.c#2 (text+ko) ====
@@ -2815,3 +2815,15 @@
ap.len = (uap->lenlo | ((off_t)uap->lenhi << 32));
return (sys_posix_fallocate(td, &ap));
}
+
+/*
+ * 32-bit compatibility entry point for fadvise(2).  The 64-bit offset and
+ * length arrive as pairs of 32-bit halves; reassemble them and hand the
+ * request to sys_fadvise().
+ */
+int
+freebsd32_fadvise(struct thread *td, struct freebsd32_fadvise_args *uap)
+{
+	struct fadvise_args native;
+	off_t off, len;
+
+	/* Widen the high half before shifting so no bits are lost. */
+	off = (off_t)uap->offsethi << 32;
+	off |= uap->offsetlo;
+	len = (off_t)uap->lenhi << 32;
+	len |= uap->lenlo;
+
+	native.fd = uap->fd;
+	native.offset = off;
+	native.len = len;
+	native.advice = uap->advice;
+	return (sys_fadvise(td, &native));
+}
==== //depot/projects/fadvise/sys/compat/freebsd32/syscalls.master#2 (text+ko) ====
@@ -989,6 +989,9 @@
size_t inbuflen, void *outbufp, \
size_t outbuflen); }
530 AUE_NULL STD { int freebsd32_posix_fallocate(int fd,\
- uint32_t offsetlo, uint32_t offsethi,\
- uint32_t lenlo, uint32_t lenhi); }
-531 AUE_NULL UNIMPL posix_fadvise
+ uint32_t offsetlo, uint32_t offsethi,\
+ uint32_t lenlo, uint32_t lenhi); }
+531 AUE_NULL STD { int freebsd32_fadvise(int fd, \
+ uint32_t offsetlo, uint32_t offsethi,\
+ uint32_t lenlo, uint32_t lenhi, \
+ int advice); }
==== //depot/projects/fadvise/sys/kern/kern_descrip.c#2 (text+ko) ====
@@ -1654,6 +1654,7 @@
fp->f_ops = &badfileops;
fp->f_data = NULL;
fp->f_vnode = NULL;
+ fp->f_advice = FADV_NORMAL;
*resultfp = fp;
return (0);
}
==== //depot/projects/fadvise/sys/kern/syscalls.master#2 (text+ko) ====
@@ -947,6 +947,7 @@
size_t outbuflen); }
530 AUE_NULL STD { int posix_fallocate(int fd, \
off_t offset, off_t len); }
-531 AUE_NULL UNIMPL posix_fadvise
+531 AUE_NULL STD { int fadvise(int fd, off_t offset, \
+ off_t len, int advice); }
; Please copy any additions and changes to the following compatability tables:
; sys/compat/freebsd32/syscalls.master
==== //depot/projects/fadvise/sys/kern/vfs_syscalls.c#2 (text+ko) ====
@@ -4845,3 +4845,124 @@
return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
}
+
+/*
+ * fadvise(2): record or apply a caching hint for a byte range of a file.
+ *
+ * Unlike madvise(2), we do not make a best effort to remember every
+ * possible caching hint.  Instead, we remember the last setting with
+ * the exception that we will allow FADV_NORMAL to adjust the region
+ * of any current setting.
+ *
+ * When adjusting regions, a length of zero is treated as "from the
+ * offset to the end of the file", both for the request and for the
+ * region stored in the file.
+ *
+ * Returns EINVAL for a negative or wrapping range or an unknown advice
+ * value, the error from fget() if the descriptor lookup fails, ESPIPE
+ * for pipes and FIFOs, ENODEV for any other descriptor that is not a
+ * regular file, and 0 otherwise.
+ */
+int
+sys_fadvise(struct thread *td, struct fadvise_args *uap)
+{
+	struct file *fp;
+	struct vnode *vp;
+	off_t newoff, start, end;
+	int error, vfslocked;
+
+	if (uap->offset < 0 || uap->len < 0 ||
+	    uap->offset + uap->len < uap->offset)
+		return (EINVAL);
+	switch (uap->advice) {
+	case FADV_NORMAL:
+	case FADV_SEQUENTIAL:
+	case FADV_RANDOM:
+	case FADV_WILLNEED:
+	case FADV_DONTNEED:
+	case FADV_NOREUSE:
+		break;
+	default:
+		return (EINVAL);
+	}
+	/* XXX: CAP_FADVISE? */
+	error = fget(td, uap->fd, 0, &fp);
+	if (error != 0)
+		return (error);
+
+	/* Hints only make sense for regular files. */
+	switch (fp->f_type) {
+	case DTYPE_VNODE:
+		break;
+	case DTYPE_PIPE:
+	case DTYPE_FIFO:
+		error = ESPIPE;
+		goto out;
+	default:
+		error = ENODEV;
+		goto out;
+	}
+	vp = fp->f_vnode;
+	if (vp->v_type != VREG) {
+		error = ENODEV;
+		goto out;
+	}
+	switch (uap->advice) {
+	case FADV_SEQUENTIAL:
+	case FADV_RANDOM:
+	case FADV_NOREUSE:
+		/* Remember the hint; it replaces any previous one. */
+		mtx_pool_lock(mtxpool_sleep, fp);
+		fp->f_advice = uap->advice;
+		fp->f_adviceoff = uap->offset;
+		fp->f_advicelen = uap->len;
+		mtx_pool_unlock(mtxpool_sleep, fp);
+		break;
+	case FADV_NORMAL:
+		/*
+		 * If the "normal" region overlaps with an existing
+		 * non-standard region, trim or remove the
+		 * non-standard region.  Trimming never leaves a
+		 * length of zero, since that would turn the region
+		 * into one that extends to the end of the file.
+		 */
+		mtx_pool_lock(mtxpool_sleep, fp);
+		if (fp->f_advice != FADV_NORMAL) {
+			if (uap->len == 0 && fp->f_advicelen == 0) {
+				/* Both regions run to the end of the file. */
+				if (uap->offset > fp->f_adviceoff)
+					fp->f_advicelen =
+					    uap->offset - fp->f_adviceoff;
+				else
+					fp->f_advice = FADV_NORMAL;
+			} else if (uap->len == 0) {
+				/* Only the "normal" region is unbounded. */
+				if (uap->offset <= fp->f_adviceoff)
+					fp->f_advice = FADV_NORMAL;
+				else if (fp->f_adviceoff + fp->f_advicelen >
+				    uap->offset)
+					fp->f_advicelen =
+					    uap->offset - fp->f_adviceoff;
+			} else if (fp->f_advicelen == 0) {
+				/*
+				 * Only the stored region is unbounded;
+				 * keep the part that follows the
+				 * "normal" region.
+				 */
+				if (uap->offset + uap->len > fp->f_adviceoff)
+					fp->f_adviceoff =
+					    uap->offset + uap->len;
+			} else if (uap->offset <
+			    fp->f_adviceoff + fp->f_advicelen &&
+			    uap->offset + uap->len > fp->f_adviceoff) {
+				/* Both regions are bounded and overlap. */
+				newoff = uap->offset + uap->len;
+				if (uap->offset > fp->f_adviceoff) {
+					/*
+					 * The "normal" region starts
+					 * inside the stored region;
+					 * keep only the leading part.
+					 */
+					fp->f_advicelen =
+					    uap->offset - fp->f_adviceoff;
+				} else if (newoff <
+				    fp->f_adviceoff + fp->f_advicelen) {
+					/*
+					 * The head of the stored region
+					 * is covered; move its start
+					 * forward and shrink it by the
+					 * same amount.
+					 */
+					fp->f_advicelen -=
+					    newoff - fp->f_adviceoff;
+					fp->f_adviceoff = newoff;
+				} else {
+					/* Entirely covered: drop the hint. */
+					fp->f_advice = FADV_NORMAL;
+				}
+			}
+		}
+		mtx_pool_unlock(mtxpool_sleep, fp);
+		break;
+	case FADV_WILLNEED:
+	case FADV_DONTNEED:
+		/*
+		 * Apply the request to the backing VM object.  Note
+		 * that the FADV_* constants map directly to the same
+		 * madvise(2) constants.
+		 *
+		 * 'end' is the page-aligned exclusive end of the
+		 * request, so that the page holding the last byte is
+		 * included.
+		 *
+		 * XXX: a length of zero is not expanded to the end of
+		 * the file here, so such a request covers at most the
+		 * page containing the offset.
+		 */
+		start = trunc_page(uap->offset);
+		end = round_page(uap->offset + uap->len);
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		vn_lock(vp, LK_SHARED | LK_RETRY);
+		if (vp->v_object != NULL)
+			vm_object_madvise(vp->v_object, OFF_TO_IDX(start),
+			    atop(end - start), uap->advice);
+		VOP_UNLOCK(vp, 0);
+		VFS_UNLOCK_GIANT(vfslocked);
+		break;
+	}
+out:
+	fdrop(fp, td);
+	return (error);
+}
==== //depot/projects/fadvise/sys/sys/fcntl.h#2 (text+ko) ====
@@ -278,6 +278,34 @@
#endif
/*
+ * Advice to fadvise
+ */
+#define _FADV_NORMAL 0 /* no special treatment */
+#define _FADV_RANDOM 1 /* expect random page references */
+#define _FADV_SEQUENTIAL 2 /* expect sequential page references */
+#define _FADV_WILLNEED 3 /* will need these pages */
+#define _FADV_DONTNEED 4 /* do not need these pages */
+#define _FADV_NOREUSE 5 /* access data only once */
+
+#if __BSD_VISIBLE
+#define FADV_NORMAL _FADV_NORMAL
+#define FADV_RANDOM _FADV_RANDOM
+#define FADV_SEQUENTIAL _FADV_SEQUENTIAL
+#define FADV_WILLNEED _FADV_WILLNEED
+#define FADV_DONTNEED _FADV_DONTNEED
+#define FADV_NOREUSE _FADV_NOREUSE
+#endif
+
+#if __POSIX_VISIBLE >= 200112
+#define POSIX_FADV_NORMAL _FADV_NORMAL
+#define POSIX_FADV_RANDOM _FADV_RANDOM
+#define POSIX_FADV_SEQUENTIAL _FADV_SEQUENTIAL
+#define POSIX_FADV_WILLNEED _FADV_WILLNEED
+#define POSIX_FADV_DONTNEED _FADV_DONTNEED
+#define POSIX_FADV_NOREUSE _FADV_NOREUSE
+#endif
+
+/*
* XXX missing posix_fadvise() and POSIX_FADV_* macros.
*/
@@ -289,6 +317,12 @@
#if __BSD_VISIBLE || __POSIX_VISIBLE >= 200809
int openat(int, const char *, int, ...);
#endif
+#if __BSD_VISIBLE
+int fadvise(int, off_t, off_t, int);
+#endif
+#if __POSIX_VISIBLE >= 200112
+int posix_fadvise(int, off_t, off_t, int);
+#endif
#if __BSD_VISIBLE || __POSIX_VISIBLE >= 200112
int posix_fallocate(int, off_t, off_t);
#endif
==== //depot/projects/fadvise/sys/sys/file.h#2 (text+ko) ====
@@ -137,6 +137,9 @@
int f_seqcount; /* Count of sequential accesses. */
off_t f_nextoff; /* next expected read/write offset. */
struct cdev_privdata *f_cdevpriv; /* (d) Private data for the cdev. */
+ int f_advice; /* (f) FADV_* type. */
+ off_t f_adviceoff; /* (f) fadvice region offset. */
+ off_t f_advicelen; /* (f) fadvice region length. */
/*
* DFLAG_SEEKABLE specific fields
*/
More information about the p4-projects
mailing list