NFS server does not cluster writes
Bjorn Gronvall
bg at sics.se
Tue Oct 9 06:27:01 PDT 2007
Hi,
The current NFS server only clusters reads, never writes, which
in turn leads to poor sequential-write performance. The attached patch
makes the following changes:
1/ Rearrange the code so that the same code can be used to detect both
sequential reads and writes.
2/ Merge in updates from vfs_vnops.c::sequential_heuristic.
3/ Use double hashing in order to avoid hash-clustering in the
nfsheur table.
4/ Pack nfsheur table more efficiently.
5/ Tolerate reordered RPCs to some small amount (initially suggested
by Ellard and Seltzer).
6/ Back off from sequential access rather than immediately switching to
random access.
These changes have been tested on a low-performance ATA disk (with
write caching disabled) and sped up large sequential writes by a
factor of four. I would be interested in getting numbers from more
normal server configurations if somebody has the time to try it out.
Cheers,
/b
--
_ _ ,_______________.
Bjorn Gronvall (Björn Grönvall) /_______________/|
Swedish Institute of Computer Science | ||
PO Box 1263, S-164 29 Kista, Sweden | Schroedingers ||
Email: bg at sics.se, Phone +46 -8 633 15 25 | Cat |/
Cellular +46 -70 768 06 35, Fax +46 -8 751 72 30 '---------------'
--- nfs_serv.c.orig 2007-10-09 12:03:00.000000000 +0200
+++ nfs_serv.c 2007-10-09 13:50:02.000000000 +0200
@@ -106,18 +106,98 @@
#define MAX_COMMIT_COUNT (1024 * 1024)
-#define NUM_HEURISTIC 1017
+#define NUM_HEURISTIC 1031 /* Must be prime! */
+#define HASH_MAXSTEP 0x3ff
#define NHUSE_INIT 64
#define NHUSE_INC 16
#define NHUSE_MAX 2048
+CTASSERT(NUM_HEURISTIC > (HASH_MAXSTEP + 1));
static struct nfsheur {
+ off_t nh_nextoff; /* next offset for sequential detection */
struct vnode *nh_vp; /* vp to match (unreferenced pointer) */
- off_t nh_nextr; /* next offset for sequential detection */
- int nh_use; /* use count for selection */
- int nh_seqcount; /* heuristic */
+ uint16_t nh_use; /* use count for selection */
+ uint16_t nh_seqcount; /* in units of BKVASIZE bytes */
} nfsheur[NUM_HEURISTIC];
+/*
+ * Sequential heuristic - detect sequential operation
+ */
+static
+struct nfsheur *
+sequential_heuristic(const struct uio *uio, struct vnode *vp)
+{
+ struct nfsheur *nh;
+ unsigned hi, step; /* Double hashing */
+ int try = 32; /* A bit large? */
+ int nblocks;
+
+ /*
+ * Locate best candidate
+ */
+
+ hi = ((unsigned)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
+ step = ((unsigned)vp / sizeof(struct vnode)) & HASH_MAXSTEP;
+ step++; /* Step must not be zero. */
+ nh = &nfsheur[hi];
+
+ while (try--) {
+ if (nfsheur[hi].nh_vp == vp) {
+ nh = &nfsheur[hi];
+ break;
+ }
+ if (nfsheur[hi].nh_use > 0)
+ --nfsheur[hi].nh_use;
+ hi = hi + step;
+ if (hi >= NUM_HEURISTIC)
+ hi -= NUM_HEURISTIC;
+ if (nfsheur[hi].nh_use < nh->nh_use)
+ nh = &nfsheur[hi];
+ }
+
+ if (nh->nh_vp != vp) {
+ nh->nh_vp = vp;
+ nh->nh_nextoff = uio->uio_offset;
+ nh->nh_use = NHUSE_INIT;
+ if (uio->uio_offset == 0)
+ nh->nh_seqcount = 4;
+ else
+ nh->nh_seqcount = 1;
+ }
+
+ nh->nh_use += NHUSE_INC;
+ if (nh->nh_use > NHUSE_MAX)
+ nh->nh_use = NHUSE_MAX;
+
+ /*
+ * Calculate heuristic
+ */
+
+ /*
+ * XXX we assume that the filesystem block size is
+ * the default. Not true, but still gives us a pretty
+ * good indicator of how sequential the read operations
+ * are.
+ */
+ nblocks = (uio->uio_resid + BKVASIZE - 1) / BKVASIZE;
+ if ((uio->uio_offset == 0 && nh->nh_seqcount > 0) ||
+ uio->uio_offset == nh->nh_nextoff) {
+ nh->nh_seqcount += nblocks;
+ if (nh->nh_seqcount > IO_SEQMAX)
+ nh->nh_seqcount = IO_SEQMAX;
+ } else if (qabs(uio->uio_offset - nh->nh_nextoff) <=
+ 4*imax(BKVASIZE, uio->uio_resid)) {
+ /* Probably reordered RPC, do nothing. */
+ } else {
+ nh->nh_seqcount /= 4;
+ /* RPCs larger than 1 block should cluster IO. */
+ if (nblocks > 1 && nh->nh_seqcount < nblocks)
+ nh->nh_seqcount = nblocks;
+ }
+
+ return (nh);
+}
+
/* Global vars */
int nfsrvw_procrastinate = NFS_GATHERDELAY * 1000;
@@ -855,61 +935,6 @@
else
cnt = reqlen;
- /*
- * Calculate seqcount for heuristic
- */
-
- {
- int hi;
- int try = 32;
-
- /*
- * Locate best candidate
- */
-
- hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
- nh = &nfsheur[hi];
-
- while (try--) {
- if (nfsheur[hi].nh_vp == vp) {
- nh = &nfsheur[hi];
- break;
- }
- if (nfsheur[hi].nh_use > 0)
- --nfsheur[hi].nh_use;
- hi = (hi + 1) % NUM_HEURISTIC;
- if (nfsheur[hi].nh_use < nh->nh_use)
- nh = &nfsheur[hi];
- }
-
- if (nh->nh_vp != vp) {
- nh->nh_vp = vp;
- nh->nh_nextr = off;
- nh->nh_use = NHUSE_INIT;
- if (off == 0)
- nh->nh_seqcount = 4;
- else
- nh->nh_seqcount = 1;
- }
-
- /*
- * Calculate heuristic
- */
-
- if ((off == 0 && nh->nh_seqcount > 0) || off == nh->nh_nextr) {
- if (++nh->nh_seqcount > IO_SEQMAX)
- nh->nh_seqcount = IO_SEQMAX;
- } else if (nh->nh_seqcount > 1) {
- nh->nh_seqcount = 1;
- } else {
- nh->nh_seqcount = 0;
- }
- nh->nh_use += NHUSE_INC;
- if (nh->nh_use > NHUSE_MAX)
- nh->nh_use = NHUSE_MAX;
- ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
- }
-
nfsm_reply(NFSX_POSTOPORFATTR(v3) + 3 * NFSX_UNSIGNED+nfsm_rndup(cnt));
if (v3) {
tl = nfsm_build(u_int32_t *, NFSX_V3FATTR + 4 * NFSX_UNSIGNED);
@@ -967,9 +992,11 @@
uiop->uio_resid = len;
uiop->uio_rw = UIO_READ;
uiop->uio_segflg = UIO_SYSSPACE;
+ nh = sequential_heuristic(uiop, vp);
+ ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred);
off = uiop->uio_offset;
- nh->nh_nextr = off;
+ nh->nh_nextoff = off;
FREE((caddr_t)iv2, M_TEMP);
if (error || (getret = VOP_GETATTR(vp, vap, cred, td))) {
if (!error)
@@ -1037,12 +1064,14 @@
nfsfh_t nfh;
fhandle_t *fhp;
struct uio io, *uiop = &io;
+ struct nfsheur *nh;
off_t off;
struct mount *mntp = NULL;
int tvfslocked;
int vfslocked;
nfsdbprintf(("%s %d\n", __FILE__, __LINE__));
+ bwillwrite();
vfslocked = 0;
if (mrep == NULL) {
*mrq = NULL;
@@ -1175,9 +1204,12 @@
uiop->uio_segflg = UIO_SYSSPACE;
uiop->uio_td = NULL;
uiop->uio_offset = off;
+ nh = sequential_heuristic(uiop, vp);
+ ioflags |= nh->nh_seqcount << IO_SEQSHIFT;
error = VOP_WRITE(vp, uiop, ioflags, cred);
/* XXXRW: unlocked write. */
nfsrvstats.srvvop_writes++;
+ nh->nh_nextoff = uiop->uio_offset;
FREE((caddr_t)iv, M_TEMP);
}
aftat_ret = VOP_GETATTR(vp, vap, cred, td);
More information about the freebsd-fs
mailing list