kern/113885: improved gmirror balance algorithm

Mykola Zubach zuborg at advancedhosters.com
Wed Jun 20 17:50:03 UTC 2007


>Number:         113885
>Category:       kern
>Synopsis:       improved gmirror balance algorithm
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          sw-bug
>Submitter-Id:   current-users
>Arrival-Date:   Wed Jun 20 17:50:01 GMT 2007
>Closed-Date:
>Last-Modified:
>Originator:     Mykola Zubach
>Release:        6.2-RELEASE
>Organization:
AdvancedHosters
>Environment:
FreeBSD DS021 6.2-RELEASE FreeBSD 6.2-RELEASE #0: Wed Jan 17 18:10:49 UTC 2007     adm at ahb30:/usr/obj/usr/src/sys/Z  i386
>Description:
gmirror has poor read-balance algorithms, which don't allow disk performance to be fully utilized

The 'prefer' algorithm is suitable only for asymmetric environments, such as one local and one network-mounted disk

'load' is broken - the dummy 'round-robin' shows the best results in all benchmarks

'split' is completely senseless - it uses 'round-robin' for small requests and splits big requests across different providers - but it is clearly understood that one hdd will take less time to complete one big request (equal to several small sequential requests) than two (or more) drives will take to complete those several small requests, because even with no other activity, a single disk doesn't spend time moving the actuator and doesn't wait for the platter to rotate.

'round-robin' is the best but is dummy and doesn't try to select the disk which may take less time to complete a request:
a single sequential read thread uses both providers at half speed (total speed is the same as for a single drive)
two simultaneous sequential read threads fight each other and use 10% of the throughput, while each could use a separate provider at full speed
additional threads aggravate the situation
>How-To-Repeat:
it's repeatable on all gmirror installations
>Fix:
I've rewritten the 'load' algorithm

It remembers the bio_offset value of the last operation (read or write) for each provider and assumes that the preferable disk is the one which has the smallest offset distance:
|bio_offset - disk->last_offset|

It also remembers the last read time, and tries to use the disk which was used the longest time ago

From all providers it selects the one which has the smallest value of:
offset_distance / (tunable_parameter + use_delay)

This algorithm is well suited to multithreaded environments (web servers, db servers...); for example, it allows a web server to work at almost full speed while awstats (a log analyzer) is running, whereas using 'round-robin' causes a 50% slowdown (bandwidth drops from 100Mbit/s to 50Mbit/s and remains at that level until awstats finishes its job).

Patch attached with submission follows:

--- /sys/geom/mirror/g_mirror.h	Wed May 10 10:10:03 2006
+++ g_mirror.h	Wed Jun 20 18:24:21 2007
@@ -131,6 +131,7 @@
 	int		 d_state;	/* Disk state. */
 	u_int		 d_priority;	/* Disk priority. */
 	struct bintime	 d_delay;	/* Disk delay. */
+	off_t		 last_offset;	/* LBA of last operation. */
 	struct bintime	 d_last_used;	/* When disk was last used. */
 	uint64_t	 d_flags;	/* Additional flags. */
 	u_int		 d_genid;	/* Disk's generation ID. */
--- /sys/geom/mirror/g_mirror.c	Tue Sep 19 14:16:14 2006
+++ g_mirror.c	Wed Jun 20 19:45:30 2007
@@ -45,7 +45,6 @@
 #include <sys/sched.h>
 #include <geom/mirror/g_mirror.h>
 
-
 static MALLOC_DEFINE(M_MIRROR, "mirror_data", "GEOM_MIRROR Data");
 
 SYSCTL_DECL(_kern_geom);
@@ -71,6 +70,11 @@
 TUNABLE_INT("kern.geom.mirror.sync_requests", &g_mirror_syncreqs);
 SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, sync_requests, CTLFLAG_RDTUN,
     &g_mirror_syncreqs, 0, "Parallel synchronization I/O requests.");
+static u_int g_mirror_plusdelay = 60000;
+TUNABLE_INT("kern.geom.mirror.plusdelay", &g_mirror_plusdelay);
+SYSCTL_UINT(_kern_geom_mirror, OID_AUTO, plusdelay, CTLFLAG_RW,
+   &g_mirror_plusdelay, 0, "Delay in 1/64k s to plus.");
+
 
 #define	MSLEEP(ident, mtx, priority, wmesg, timeout)	do {		\
 	G_MIRROR_DEBUG(4, "%s: Sleeping %p.", __func__, (ident));	\
@@ -453,6 +457,7 @@
 	disk->d_priority = md->md_priority;
 	disk->d_delay.sec = 0;
 	disk->d_delay.frac = 0;
+	disk->last_offset = 0;
 	binuptime(&disk->d_last_used);
 	disk->d_flags = md->md_dflags;
 	if (md->md_provider[0] != '\0')
@@ -861,7 +866,7 @@
 static void
 g_mirror_update_delay(struct g_mirror_disk *disk, struct bio *bp)
 {
-
+	return;
 	if (disk->d_softc->sc_balance != G_MIRROR_BALANCE_LOAD)
 		return;
 	binuptime(&disk->d_delay);
@@ -1423,8 +1428,11 @@
 	struct g_consumer *cp;
 	struct bio *cbp;
 	struct bintime curtime;
+	off_t  bio_offset = bp->bio_offset;
+	long long int	dist, best_dist = -1;
+	long long int	use_delay = 0, best_use_delay = 0;
 
-	binuptime(&curtime);
+	getbinuptime(&curtime);
 	/*
 	 * Find a disk which the smallest load.
 	 */
@@ -1432,16 +1440,23 @@
 	LIST_FOREACH(dp, &sc->sc_disks, d_next) {
 		if (dp->d_state != G_MIRROR_DISK_STATE_ACTIVE)
 			continue;
-		/* If disk wasn't used for more than 2 sec, use it. */
-		if (curtime.sec - dp->d_last_used.sec >= 2) {
-			disk = dp;
-			break;
-		}
-		if (disk == NULL ||
-		    bintime_cmp(&dp->d_delay, &disk->d_delay) < 0) {
+
+		dist = dp->last_offset - bio_offset;
+		if (dist < 0)
+			dist = -dist;
+
+/* TODO: please rewrite this code using '<<' and '>>' */
+		use_delay = g_mirror_plusdelay + (curtime.sec - dp->d_last_used.sec)*65536 + (curtime.frac - dp->d_last_used.frac)/65536/65536/65536;
+
+/* select disk with lesser (dist/(g_mirror_plusdelay + real_use_delay)) ratio - its heads looks to be close to [bio_offset] and disk was being used enough long time ago */
+		if (best_dist == -1 || dist * best_use_delay < best_dist * use_delay) {
 			disk = dp;
+			best_dist = dist;
+			best_use_delay = use_delay;
 		}
 	}
+//G_MIRROR_DEBUG(0, "%li.%03hu %lld %s(%lld) %lld/%lld", (long int)curtime.sec, (unsigned short) (curtime.frac/65536/65536/65536/65.536), bp->bio_offset/512, disk->d_name, disk->last_offset/512, best_dist, use_delay);
+
 	KASSERT(disk != NULL, ("NULL disk for %s.", sc->sc_name));
 	cbp = g_clone_bio(bp);
 	if (cbp == NULL) {
@@ -1456,7 +1471,8 @@
 	cp = disk->d_consumer;
 	cbp->bio_done = g_mirror_done;
 	cbp->bio_to = cp->provider;
-	binuptime(&disk->d_last_used);
+	disk->d_last_used = curtime;
+	disk->last_offset = bio_offset;
 	G_MIRROR_LOGREQ(3, cbp, "Sending request.");
 	KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1,
 	    ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, cp->acr,
@@ -1610,6 +1626,7 @@
 				g_io_deliver(bp, bp->bio_error);
 				return;
 			}
+			disk->last_offset = bp->bio_offset;
 			bioq_insert_tail(&queue, cbp);
 			cbp->bio_done = g_mirror_done;
 			cp = disk->d_consumer;


>Release-Note:
>Audit-Trail:
>Unformatted:


More information about the freebsd-bugs mailing list