PERFORCE change 165689 for review

Mon Jul 6 14:38:03 UTC 2009

http://perforce.freebsd.org/chv.cgi?CH=165689

Change 165689 by fabio at fabio_granpasso on 2009/07/06 14:37:41

	
	Snapshot of private repo: add seekiness and thinktime heuristics,
	improve async writeout accounting, update the bio classification
	code to use the hooks committed in HEAD (when configured).

Affected files ...

.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/g_sched.c#2 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_rr.c#2 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_scheduler.h#2 edit

Differences ...

==== //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/g_sched.c#2 (text+ko) ====

@@ -114,6 +114,7 @@
 #include "gs_scheduler.h"
 #include "g_sched.h"		/* geom hooks */
 
+#define HAVE_BIO_CLASSIFIER
 /*
  * Size of the per-geom hash table storing traffic classes.
  * We may decide to change it at a later time, it has no ABI
@@ -178,14 +179,6 @@
 	.gs_expire_secs = 10,
 };
 
-/*
- * What kind of classifier we want to use ?
- * (not supported yet)
- */
-#define	G_CLASS_PID	0
-
-static const int g_sched_classifier = G_CLASS_PID;
-
 SYSCTL_DECL(_kern_geom);
 SYSCTL_NODE(_kern_geom, OID_AUTO, sched, CTLFLAG_RW, 0,
     "GEOM_SCHED stuff");
@@ -321,31 +314,24 @@
  * so we do not make assumptions on the return value which for
  * us is just an opaque identifier.
  */
-static inline struct thread *
-g_sched_issuer(struct bio *bp)
+#ifndef HAVE_BIO_CLASSIFIER
+static inline u_long
+g_sched_classify(struct bio *bp)
 {
 
 	while (bp->bio_parent != NULL)
 		bp = bp->bio_parent;
 
-	return (bp->bio_caller1);
+	return ((u_long)bp->bio_caller1);
 }
-
-/*
- * Fetch the actual field used for classification, among the
- * ones available in the credentials associated with the bio.
- * Not much to do so far.
- */
-static u_long
-g_sched_classify(struct thread *tp)
+#else
+static inline u_long
+g_sched_classify(struct bio *bp)
 {
 
-	switch (g_sched_classifier) {
-	case G_CLASS_PID:
-	default:
-		return (tp->td_tid);
-	}
+	return ((u_long)bp->bio_classifier1);
 }
+#endif
 
 /* Return the hash chain for the given key. */
 static inline struct g_hash *
@@ -369,12 +355,10 @@
 	struct g_sched_class *gsc;
 	struct g_gsched *gsp;
 	struct g_hash *bucket;
-	struct thread *tp;
 	u_long key;
 
 	sc = gp->softc;
-	tp = g_sched_issuer(bp);
-	key = g_sched_classify(tp);
+	key = g_sched_classify(bp);
 	bucket = g_sched_hash(sc, key);
 	LIST_FOREACH(gsc, bucket, gsc_clist) {
 		if (key == gsc->gsc_key) {
@@ -389,7 +373,7 @@
 	if (!gsc)
 		return (NULL);
 
-	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv, tp)) {
+	if (gsp->gs_init_class(sc->sc_data, gsc->gsc_priv)) {
 		free(gsc, M_GEOM_SCHED);
 		return (NULL);
 	}
@@ -1258,6 +1242,7 @@
  * code in g_ioreq_patch() for the details.
  */
 
+#ifndef HAVE_BIO_CLASSIFIER
 #if defined(__i386__)
 #define	CODE_SIZE	29
 #define	STORE_SIZE	5
@@ -1374,6 +1359,47 @@
 	}
 }
 
+static inline void
+g_classifier_ini(void)
+{
+	/* !HAVE_BIO_CLASSIFIER variant, called from g_sched_init(). */
+	g_ioreq_patch();
+}
+
+static inline void
+g_classifier_fini(void)
+{
+	/* Counterpart of g_classifier_ini(); called from g_sched_fini(). */
+	g_ioreq_restore();
+}
+#else /* HAVE_BIO_CLASSIFIER */
+static int
+g_sched_tag(void *arg, struct bio *bp)
+{
+	/* Classifier hook: store curthread in the bio; arg is unused. */
+	bp->bio_classifier1 = curthread;
+	return (1);	/* TODO confirm: hook API return contract. */
+}
+
+static struct g_classifier_hook g_sched_classifier = {
+	.func =	g_sched_tag,	/* Stores curthread in bio_classifier1. */
+};
+
+static inline void
+g_classifier_ini(void)
+{
+	/* HAVE_BIO_CLASSIFIER variant: register g_sched_tag() as the hook. */
+	g_register_classifier(&g_sched_classifier);
+}
+
+static inline void
+g_classifier_fini(void)
+{
+	/* Unregister the hook registered by g_classifier_ini(). */
+	g_unregister_classifier(&g_sched_classifier);
+}
+#endif
+
 static void
 g_sched_init(struct g_class *mp)
 {
@@ -1384,14 +1410,14 @@
 	    mp, &g_sched_class);
 
 	/* Patch g_io_request to store classification info in the bio. */
-	g_ioreq_patch();
+	g_classifier_ini();
 }
 
 static void
 g_sched_fini(struct g_class *mp)
 {
 
-	g_ioreq_restore();
+	g_classifier_fini();
 
 	G_SCHED_DEBUG(0, "Unloading...");
 

==== //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_rr.c#2 (text+ko) ====

@@ -65,6 +65,11 @@
 	G_QUEUE_IDLING		/* Waiting for a new request. */
 };
 
+/* Possible queue flags, kept in g_rr_queue.q_flags as a bitmask. */
+enum g_rr_flags {
+	G_FLAG_COMPLETED = 1,	/* Completed a req. in the current budget. */
+};
+
 struct g_rr_softc;
 
 /*
@@ -79,6 +84,7 @@
 	enum g_rr_state	q_status;
 	unsigned int	q_service;	/* service received so far */
 	int		q_slice_end;	/* actual slice end in ticks */
+	enum g_rr_flags	q_flags;	/* queue flags */
 	struct bio_queue_head q_bioq;
 
 	/* Scheduling parameters */
@@ -86,6 +92,13 @@
 	unsigned int	q_slice_duration; /* slice size in ticks */
 	unsigned int	q_wait_ticks;	/* wait time for anticipation */
 
+	/* Stats to drive the various heuristics. */
+	struct g_savg	q_thinktime;	/* Thinktime average. */
+	struct g_savg	q_seekdist;	/* Seek distance average. */
+
+	off_t		q_lastoff;	/* Last submitted req. offset. */
+	int		q_lastsub;	/* Last submitted req. time. */
+
 	/* Expiration deadline for an empty queue. */
 	int		q_expire;
 
@@ -289,7 +302,7 @@
 }
 
 static int
-g_rr_init_class(void *data, void *priv, struct thread *tp)
+g_rr_init_class(void *data, void *priv)
 {
 	struct g_rr_softc *sc = data;
 	struct g_rr_queue *qp = priv;
@@ -339,8 +352,33 @@
 g_rr_queue_expired(struct g_rr_queue *qp)
 {
 
-	return (qp->q_service >= qp->q_budget ||
-	    ticks - qp->q_slice_end >= 0);
+	if (qp->q_service >= qp->q_budget)
+		return (1);
+
+	if ((qp->q_flags & G_FLAG_COMPLETED) &&
+	    ticks - qp->q_slice_end >= 0)
+		return (1);
+
+	return (0);
+}
+
+static inline int
+g_rr_should_anticipate(struct g_rr_queue *qp, struct bio *bp)
+{
+	int wait = get_bounded(&me.wait_ms, 2);
+
+	if (!me.w_anticipate && (bp->bio_cmd & BIO_WRITE))
+		return (0);	/* Writes: only if me.w_anticipate is set. */
+	/* A valid thinktime average above the wait bound: no anticipation. */
+	if (g_savg_valid(&qp->q_thinktime) &&
+	    g_savg_read(&qp->q_thinktime) > wait)
+		return (0);
+	/* A valid seek-distance average above 2048: no anticipation. */
+	if (g_savg_valid(&qp->q_seekdist) &&
+	    g_savg_read(&qp->q_seekdist) > 2048)
+		return (0);
+	/* No heuristic objected: return 1, i.e. do anticipate. */
+	return (1);
+}
 
 /*
@@ -389,9 +427,7 @@
 		TAILQ_REMOVE(&sc->sc_rr_tailq, qp, q_tailq);
 		sc->sc_active = qp;
 		qp->q_service = 0;
-		/* in case we want to make the slice adaptive */
-		qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
-		qp->q_slice_end = ticks + qp->q_slice_duration;
+		qp->q_flags &= ~G_FLAG_COMPLETED;
 	}
 
 	bp = gs_bioq_takefirst(&qp->q_bioq);	/* surely not NULL */
@@ -412,7 +448,7 @@
 	 *    on read or writes (e.g., anticipate only on reads).
 	 */
 	expired = g_rr_queue_expired(qp);	/* are we expired ? */
-	next = gs_bioq_first(&qp->q_bioq);		/* do we have one more ? */
+	next = gs_bioq_first(&qp->q_bioq);	/* do we have one more ? */
  	if (expired) {
 		sc->sc_active = NULL;
 		/* Either requeue or release reference. */
@@ -423,7 +459,7 @@
 	} else if (next != NULL) {
 		qp->q_status = G_QUEUE_READY;
 	} else {
-		if (!force && (me.w_anticipate || bp->bio_cmd & BIO_READ)) {
+		if (!force && g_rr_should_anticipate(qp, bp)) {
 			/* anticipate */
 			qp->q_status = G_QUEUE_BUSY;
 		} else {
@@ -439,6 +475,30 @@
 	return (bp);
 }
 
+static inline void
+g_rr_update_thinktime(struct g_rr_queue *qp)
+{
+	int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);
+	/* Sample the ticks elapsed since qp's previous request (capped). */
+	qp->q_lastsub = ticks;
+	delta = (delta > 2 * wait) ? 2 * wait : delta;	/* Cap at 2 * wait. */
+	g_savg_add_sample(&qp->q_thinktime, delta);
+}
+
+static inline void
+g_rr_update_seekdist(struct g_rr_queue *qp, struct bio *bp)
+{
+	off_t dist;
+	/* dist = |bp->bio_offset - qp->q_lastoff|, kept non-negative. */
+	if (qp->q_lastoff > bp->bio_offset)
+		dist = qp->q_lastoff - bp->bio_offset;
+	else
+		dist = bp->bio_offset - qp->q_lastoff;
+	/* Save this req.'s end offset; the first sample is recorded as 0. */
+	qp->q_lastoff = bp->bio_offset + bp->bio_length;
+	g_savg_add_sample(&qp->q_seekdist, qp->q_seekdist.gs_smpl ? dist : 0);
+}
+
 /*
  * Called when a real request for disk I/O arrives.
  * Locate the queue associated with the client.
@@ -476,6 +536,9 @@
 		}
 	}
 
+	g_rr_update_thinktime(qp);
+	g_rr_update_seekdist(qp, bp);
+
 	/* Inherit the reference returned by g_rr_queue_get(). */
 	bp->bio_caller1 = qp;
 	gs_bioq_disksort(&qp->q_bioq, bp);
@@ -559,6 +622,13 @@
 
 	qp = bp->bio_caller1;
 	if (qp == sc->sc_active && qp->q_status == G_QUEUE_BUSY) {
+		if (!(qp->q_flags & G_FLAG_COMPLETED)) {
+			qp->q_flags |= G_FLAG_COMPLETED;
+			/* in case we want to make the slice adaptive */
+			qp->q_slice_duration = get_bounded(&me.quantum_ms, 2);
+			qp->q_slice_end = ticks + qp->q_slice_duration;
+		}
+
 		/* The queue is trying anticipation, start the timer. */
 		qp->q_status = G_QUEUE_IDLING;
 		/* may make this adaptive */

==== //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_scheduler.h#2 (text+ko) ====

@@ -90,7 +90,7 @@
 typedef int gs_start_t (void *data, struct bio *bio);
 typedef void gs_done_t (void *data, struct bio *bio);
 typedef struct bio *gs_next_t (void *data, int force);
-typedef int gs_init_class_t (void *data, void *priv, struct thread *tp);
+typedef int gs_init_class_t (void *data, void *priv);
 typedef void gs_fini_class_t (void *data, void *priv);
 
 struct g_gsched {
@@ -175,6 +175,40 @@
 void g_sched_dispatch(struct g_geom *geom);
 
 /*
+ * Simple gathering of statistical data, used by schedulers to collect
+ * info on process history.  Just keep an exponential average of the
+ * samples, with some extra bits of precision.
+ */
+struct g_savg {
+	uint64_t	gs_avg;		/* Decayed sum of the samples. */
+	unsigned int	gs_smpl;	/* Decayed count of the samples. */
+};
+
+static inline void
+g_savg_add_sample(struct g_savg *ss, uint64_t sample)
+{
+	/* Decay both sum and count by 7/8, then add the new sample. */
+	/* EMA with alpha = 0.125, fixed point, 3 bits of precision. */
+	ss->gs_avg = sample + ss->gs_avg - (ss->gs_avg >> 3);
+	ss->gs_smpl = 1 + ss->gs_smpl - (ss->gs_smpl >> 3);
+}
+
+static inline int
+g_savg_valid(struct g_savg *ss)
+{
+	/* From zero, gs_smpl grows by one per sample and stops at 8. */
+	/* We want at least 8 samples to deem an average as valid. */
+	return (ss->gs_smpl > 7);
+}
+
+static inline uint64_t
+g_savg_read(struct g_savg *ss)
+{
+	/* gs_smpl == 0 would divide by zero; check g_savg_valid() first. */
+	return (ss->gs_avg / ss->gs_smpl);
+}
+
+/*
  * Declaration of a scheduler module.
  */
 int g_gsched_modevent(module_t mod, int cmd, void *arg);