PATCH: Forcible delaying of UFS (soft)updates

Marko Zec zec at tel.fer.hr
Fri Apr 11 19:01:16 PDT 2003


Here's a patch against the 4.8-RELEASE kernel that allows disk writes on
softupdates-enabled filesystems to be delayed for (theoretically)
arbitrarily long periods of time. The motivation for such an updating
policy is, surprisingly, not purely suicidal - it can allow disks on
laptops to spin down immediately after I/O operations and stay idle for
longer periods of time, thus saving a considerable amount of battery
power.

The patch introduces a new sysctl tunable, vfs.sync_extdelay, which
controls the delay duration in seconds. If the variable is set to 0, the
standard UFS syncing policy is restored. The tunable can be either
modified by hand or controlled by the APM daemon using the attached
rc.syncdelay script.

When enabled, the extended delaying policy introduces some additional
changes:

- fsync() no longer flushes the buffers to disk, but returns immediately
instead;
- invoking sync() causes flushing of softupdates buffers to follow
immediately, which was not the case before;
- if one of the mounted filesystems becomes low on free space, which can
happen if a lot of data is written to the FS but the FS metadata buffers
have not yet been written to disk, flushing of all softupdates buffers is
scheduled automatically;
- if an I/O operation (typically a read request) is performed on an ATA
disk, which is likely to cause the disk to be spun up, the pending buffers
are immediately flushed to the disk, but only if they have been pending
longer than would be the case with the normal updating policy.

As I'm virtually clueless about FS concepts and theory, I'm not sure
whether the above model shakes the foundations of UFS operation, so
I'd appreciate it if more knowledgeable people would comment on the patch.
Nevertheless, my laptop has been running without glitches for the last two
weeks with the extra delaying enabled, while happily achieving 5-10% longer
battery-operated periods, depending on disk utilization patterns.

Cheers,

Marko
-------------- next part --------------
--- /usr/src/sys.org/dev/ata/ata-disk.c	Thu Jan 30 08:19:59 2003
+++ dev/ata/ata-disk.c	Sat Apr 12 00:31:26 2003
@@ -294,6 +294,7 @@ adstrategy(struct buf *bp)
     struct ad_softc *adp = bp->b_dev->si_drv1;
     int s;
 
+    stratcalls++;
     if (adp->device->flags & ATA_D_DETACHING) {
 	bp->b_error = ENXIO;
 	bp->b_flags |= B_ERROR;
--- /usr/src/sys.org/kern/vfs_subr.c	Sun Oct 13 18:19:12 2002
+++ kern/vfs_subr.c	Sat Apr 12 01:56:16 2003
@@ -116,6 +116,10 @@ SYSCTL_INT(_vfs, OID_AUTO, reassignbufme
 static int nameileafonly = 0;
 SYSCTL_INT(_vfs, OID_AUTO, nameileafonly, CTLFLAG_RW, &nameileafonly, 0, "");
 
+int stratcalls = 0;
+int sync_extdelay = 0;
+SYSCTL_INT(_vfs, OID_AUTO, sync_extdelay, CTLFLAG_RW, &sync_extdelay, 0, "");
+
 #ifdef ENABLE_VFS_IOOPT
 int vfs_ioopt = 0;
 SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, "");
@@ -137,7 +141,7 @@ static vm_zone_t vnode_zone;
  * The workitem queue.
  */
 #define SYNCER_MAXDELAY		32
-static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
+int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
 time_t syncdelay = 30;		/* max time to delay syncing data */
 time_t filedelay = 30;		/* time to delay syncing files */
 SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0, "");
@@ -145,7 +149,7 @@ time_t dirdelay = 29;		/* time to delay 
 SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0, "");
 time_t metadelay = 28;		/* time to delay syncing metadata */
 SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0, "");
-static int rushjob;			/* number of slots to run ASAP */
+int rushjob;			/* number of slots to run ASAP */
 static int stat_rush_requests;	/* number of times I/O speeded up */
 SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0, "");
 
@@ -177,6 +181,7 @@ vntblinit()
 {
 
 	desiredvnodes = maxproc + cnt.v_page_count / 4;
+	TUNABLE_INT_FETCH("kern.maxvnodes", &desiredvnodes);
 	minvnodes = desiredvnodes / 4;
 	simple_lock_init(&mntvnode_slock);
 	simple_lock_init(&mntid_slock);
@@ -1119,7 +1124,7 @@ sched_sync(void)
 {
 	struct synclist *slp;
 	struct vnode *vp;
-	long starttime;
+	time_t starttime;
 	int s;
 	struct proc *p = updateproc;
 
@@ -1127,8 +1132,6 @@ sched_sync(void)
 	    SHUTDOWN_PRI_LAST);   
 
 	for (;;) {
-		kproc_suspend_loop(p);
-
 		starttime = time_second;
 
 		/*
@@ -1198,8 +1201,25 @@ sched_sync(void)
 		 * matter as we are just trying to generally pace the
 		 * filesystem activity.
 		 */
-		if (time_second == starttime)
+		if (time_second != starttime)
+			continue;
+
+		if (sync_extdelay >= syncer_maxdelay)
+			while (syncer_delayno == 0 && rushjob == 0 &&
+	    		    abs(time_second - starttime) < sync_extdelay) {
+				stratcalls = 0;
+				tsleep(&lbolt, PPAUSE, "syncer", 0);
+				kproc_suspend_loop(p);
+				if (stratcalls != 0 && syncer_maxdelay <
+				    abs(time_second - starttime)) {
+					rushjob = syncer_maxdelay;
+					break;
+				}
+			}
+		else {
 			tsleep(&lbolt, PPAUSE, "syncer", 0);
+			kproc_suspend_loop(p);
+		}
 	}
 }
 
--- /usr/src/sys.org/kern/vfs_syscalls.c	Thu Jan  2 18:26:18 2003
+++ kern/vfs_syscalls.c	Sat Apr 12 01:55:48 2003
@@ -563,6 +563,9 @@ sync(p, uap)
 	register struct mount *mp, *nmp;
 	int asyncflag;
 
+	/* Notify sched_sync() to try flushing syncer_workitem_pending[*] */
+	rushjob += syncer_maxdelay; 
+
 	simple_lock(&mountlist_slock);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
@@ -2627,6 +2630,10 @@ fsync(p, uap)
 	struct file *fp;
 	vm_object_t obj;
 	int error;
+
+	/* Just return if we are artificially delaying disk syncs */
+	if (sync_extdelay)
+		return (0);
 
 	if ((error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) != 0)
 		return (error);
--- /usr/src/sys.org/ufs/ffs/ffs_alloc.c	Fri Sep 21 21:15:21 2001
+++ ufs/ffs/ffs_alloc.c	Sat Apr 12 00:06:20 2003
@@ -125,6 +125,10 @@ ffs_alloc(ip, lbn, bpref, size, cred, bn
 #endif /* DIAGNOSTIC */
 	if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
 		goto nospace;
+	/* Speed up flushing of syncer_workitem_pending[*] if low on freespace */
+	if (rushjob == 0 &&
+	    freespace(fs, fs->fs_minfree + 2) - numfrags(fs, size) < 0)
+		rushjob = syncer_maxdelay;
 	if (cred->cr_uid != 0 &&
 	    freespace(fs, fs->fs_minfree) - numfrags(fs, size) < 0)
 		goto nospace;
@@ -195,6 +199,10 @@ ffs_realloccg(ip, lbprev, bpref, osize, 
 	if (cred == NOCRED)
 		panic("ffs_realloccg: missing credential");
 #endif /* DIAGNOSTIC */
+	/* Speed up flushing of syncer_workitem_pending[*] if low on freespace */
+	if (rushjob == 0 &&
+	    freespace(fs, fs->fs_minfree + 2) - numfrags(fs, nsize - osize) < 0)
+		rushjob = syncer_maxdelay;
 	if (cred->cr_uid != 0 &&
 	    freespace(fs, fs->fs_minfree) -  numfrags(fs, nsize - osize) < 0)
 		goto nospace;
--- /usr/src/sys.org/sys/buf.h	Sat Jan 25 20:02:23 2003
+++ sys/buf.h	Sat Apr 12 00:30:48 2003
@@ -478,6 +478,7 @@ extern char	*buffers;		/* The buffer con
 extern int	bufpages;		/* Number of memory pages in the buffer pool. */
 extern struct	buf *swbuf;		/* Swap I/O buffer headers. */
 extern int	nswbuf;			/* Number of swap I/O buffer headers. */
+extern int	stratcalls;		/* I/O ops since last buffer sync */
 extern TAILQ_HEAD(swqueue, buf) bswlist;
 extern TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES];
 
--- /usr/src/sys.org/sys/vnode.h	Sun Dec 29 19:19:53 2002
+++ sys/vnode.h	Sat Apr 12 00:06:20 2003
@@ -294,6 +294,9 @@ extern	struct vm_zone *namei_zone;
 extern	int prtactive;			/* nonzero to call vprint() */
 extern	struct vattr va_null;		/* predefined null vattr structure */
 extern	int vfs_ioopt;
+extern	int rushjob;
+extern	int syncer_maxdelay;
+extern	int sync_extdelay;
 
 /*
  * Macro/function to check for client cache inconsistency w.r.t. leasing.



-------------- next part --------------
# apmd Configuration File
#
# $FreeBSD: src/etc/apmd.conf,v 1.2.2.1 2000/12/12 22:48:18 dannyboy Exp $
#

# On a power state change, run /etc/rc.syncdelay so the disk sync delay
# policy is re-evaluated for the new power state.
apm_event POWERSTATECHANGE {
	exec "/etc/rc.syncdelay";
}

# On a suspend request, run the system suspend script.
apm_event SUSPENDREQ {
	exec "/etc/rc.suspend";
}

# On a user-initiated suspend request, issue sync three times and then
# invoke "apm -z".
apm_event USERSUSPENDREQ {
	exec "sync && sync && sync";
	#exec "sleep 1";
	exec "apm -z";
}

# On resume, run the system resume script first, then re-run
# /etc/rc.syncdelay so the sync delay policy is re-evaluated.
apm_event NORMRESUME, STANDBYRESUME {
	exec "/etc/rc.resume";
	exec "/etc/rc.syncdelay";
}

# resume event configuration for serial mouse users by
# reinitializing a moused(8) connected to a serial port.
#
#apm_event NORMRESUME {
#	exec "kill -HUP `cat /var/run/moused.pid`";
#}

# suspend request event configuration for ATA HDD users:
# execute standby instead of suspend.
#
#apm_event SUSPENDREQ {
#	reject;
#	exec "sync && sync && sync";
#	exec "sleep 1";
#	exec "apm -Z";
#}

# Sample entries for battery state monitoring
#apm_battery 5% discharging {
#	exec "logger -p user.emerg battery status critical!";
#	exec "echo T250L8CE-GE-C >/dev/speaker";
#}
#apm_battery 1% discharging {
#	exec "logger -p user.emerg battery low - emergency suspend";
#	exec "echo T250L16B+BA+AG+GF+FED+DC+CC >/dev/speaker";
#	exec "apm -z";
#}
#apm_battery 99% charging {
#	exec "logger -p user.notice battery fully charged";
#}

# apmd Configuration ends here



-------------- next part --------------
#!/bin/sh
#
# Copyright (c) 2003 Marko Zec
#
#include /usr/share/examples/bsd-style-copyright
#

#
# /etc/rc.syncdelay
#
# Adjust the disk syncing policy and delay on battery powered systems by
# setting the vfs.sync_extdelay sysctl.  Invoked automatically by apmd(8)
# when power state change or resume events occur.
#
# Extended delaying is enabled only when apm(8) positively reports that
# the machine is running on battery with enough charge left.  If either
# value cannot be determined, delaying is disabled, so that dirty buffers
# are never held back on a machine whose power state is unknown.
#

AC_DELAY=0	# no delayed syncing
BAT_DELAY=600	# sync every 10 minutes
MIN_BATTERY=3	# battery percentage that must be exceeded to allow delaying
APM_UNKNOWN=255	# apm(8) prints this when the battery level is unavailable;
		# assumed to mean "unknown" for the AC-line status as
		# well -- TODO confirm against apm(8)

# is_known VALUE
# Succeed only if VALUE is a non-empty decimal number other than
# APM_UNKNOWN.  Guards the numeric tests below against empty or
# non-numeric apm(8) output, which would make test(1) fail with a
# syntax error.
is_known()
{
	case "$1" in
	''|*[!0-9]*)
		return 1
		;;
	esac
	[ "$1" -ne "$APM_UNKNOWN" ]
}

# Default to no delayed syncing; only the branches below may change it.
delay=0

ac_line=`apm -a`
if is_known "$ac_line"; then
	if [ "$ac_line" -eq 1 ]; then
		# AC powered mode
		delay=$AC_DELAY
	else
		# Battery powered mode
		# Allow delayed syncing only if enough battery capacity is
		# available
		battery_life=`apm -l`
		if is_known "$battery_life" &&
		    [ "$battery_life" -gt "$MIN_BATTERY" ]; then
			delay=$BAT_DELAY
		fi
	fi
fi

sysctl vfs.sync_extdelay=$delay

exit 0





More information about the freebsd-fs mailing list