PERFORCE change 167421 for review
Fabio Checconi
fabio at FreeBSD.org
Sun Aug 16 22:06:17 UTC 2009
http://perforce.freebsd.org/chv.cgi?CH=167421
Change 167421 by fabio at fabio_granpasso on 2009/08/16 22:05:59
Resync with the private repo: (hopefully) safer proxy
insertion/deletion, a new proportional share scheduler,
revised heuristics.
Affected files ...
.. //depot/projects/soc2009/fabio_gsched/geom_sched/geom_sched.c#2 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/g_sched.c#3 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_bfq.c#1 add
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_rr.c#3 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_scheduler.h#3 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/modules/geom/geom_sched/Makefile#2 edit
.. //depot/projects/soc2009/fabio_gsched/geom_sched/sys/modules/geom/geom_sched/gsched_bfq/Makefile#1 add
Differences ...
==== //depot/projects/soc2009/fabio_gsched/geom_sched/geom_sched.c#2 (text+ko) ====
@@ -69,7 +69,7 @@
else
reqalgo = algo;
- snprintf(name, sizeof(name), "gsched_%s", algo);
+ snprintf(name, sizeof(name), "gsched_%s", reqalgo);
/*
* Do not complain about errors here, gctl_issue()
* will fail anyway.
==== //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/g_sched.c#3 (text+ko) ====
@@ -410,7 +410,8 @@
}
static void
-g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask)
+g_sched_hash_fini(struct g_geom *gp, struct g_hash *hp, u_long mask,
+ struct g_gsched *gsp, void *data)
{
struct g_sched_class *cp, *cp2;
int i;
@@ -418,6 +419,9 @@
if (!hp)
return;
+ if (data && gsp->gs_hash_unref)
+ gsp->gs_hash_unref(data);
+
for (i = 0; i < G_SCHED_HASH_SIZE; i++) {
LIST_FOREACH_SAFE(cp, &hp[i], gsc_clist, cp2)
g_sched_put_class(gp, cp->gsc_priv);
@@ -468,58 +472,70 @@
*
* Must be called with the gp mutex held and topology locked.
*/
-static void
+static int
g_sched_wait_pending(struct g_geom *gp)
{
struct g_sched_softc *sc = gp->softc;
+ int endticks = ticks + hz;
g_topology_assert();
- while (sc->sc_pending)
+ while (sc->sc_pending && endticks - ticks >= 0)
msleep(gp, &sc->sc_mtx, 0, "sched_wait_pending", hz / 4);
+
+ return (sc->sc_pending ? ETIMEDOUT : 0);
}
-static void
+static int
g_sched_remove_locked(struct g_geom *gp, struct g_gsched *gsp)
{
struct g_sched_softc *sc = gp->softc;
+ int error;
/* Set the flushing flag: new bios will not enter the scheduler. */
sc->sc_flags |= G_SCHED_FLUSHING;
g_sched_forced_dispatch(gp);
- g_sched_wait_pending(gp);
-
+ error = g_sched_wait_pending(gp);
+ if (error)
+ goto failed;
+
/* No more requests pending or in flight from the old gsp. */
- g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask);
+ g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask, gsp, sc->sc_data);
sc->sc_hash = NULL;
/*
- * XXX avoid deadlock here by releasing the gp mutex and
- * reacquiring it once done.
- * It should be safe, since no reconfiguration
- * or destruction can take place due to the geom topology
- * lock; no new request can use the current sc_data since
- * we flagged the geom as being flushed.
+ * Avoid deadlock here by releasing the gp mutex and reacquiring
+ * it once done. It should be safe, since no reconfiguration or
+ * destruction can take place due to the geom topology lock; no
+ * new request can use the current sc_data since we flagged the
+ * geom as being flushed.
*/
g_sched_unlock(gp);
gsp->gs_fini(sc->sc_data);
g_sched_lock(gp);
sc->sc_gsched = NULL;
+ sc->sc_data = NULL;
+ g_gsched_unref(gsp);
+
+failed:
sc->sc_flags &= ~G_SCHED_FLUSHING;
- g_gsched_unref(gsp);
+ return (error);
}
-static void
+static int
g_sched_remove(struct g_geom *gp, struct g_gsched *gsp)
{
+ int error;
g_sched_lock(gp);
- g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
+ error = g_sched_remove_locked(gp, gsp); /* gsp is surely non-null */
g_sched_unlock(gp);
+
+ return (error);
}
/*
@@ -607,6 +623,7 @@
struct g_gsched *gsp = parm->gup_gsp, *cur, *tmp;
struct g_sched_softc *sc;
struct g_geom *gp, *gp_tmp;
+ int error;
parm->gup_error = 0;
@@ -622,8 +639,11 @@
continue; /* Should not happen. */
sc = gp->softc;
- if (sc->sc_gsched == gsp)
- g_sched_remove(gp, gsp);
+ if (sc->sc_gsched == gsp) {
+ error = g_sched_remove(gp, gsp);
+ if (error)
+ goto failed;
+ }
}
LIST_FOREACH_SAFE(cur, &me.gs_scheds, glist, tmp) {
@@ -647,6 +667,7 @@
parm->gup_error = ENOENT;
}
+failed:
mtx_unlock(&me.gs_mtx);
}
@@ -878,6 +899,65 @@
mtx_unlock(&me.gs_mtx);
}
+static void
+g_sched_flush_pending(g_start_t *start)
+{
+ struct bio *bp;
+
+ while ((bp = gs_bioq_takefirst(&me.gs_pending)))
+ start(bp);
+}
+
+static int
+g_insert_proxy(struct g_geom *gp, struct g_provider *newpp,
+ struct g_geom *dstgp, struct g_provider *pp, struct g_consumer *cp)
+{
+ struct g_sched_softc *sc = gp->softc;
+ g_start_t *saved_start, *flush = g_sched_start;
+ int error = 0, endticks = ticks + hz;
+
+ g_cancel_event(newpp); /* prevent taste() */
+ /* copy private fields */
+ newpp->private = pp->private;
+ newpp->index = pp->index;
+
+ /* Queue all the early requests coming for us. */
+ me.gs_npending = 0;
+ saved_start = pp->geom->start;
+ dstgp->start = g_sched_temporary_start;
+
+ while (pp->nstart - pp->nend != me.gs_npending &&
+ endticks - ticks >= 0)
+ tsleep(pp, PRIBIO, "-", hz/10);
+
+ if (pp->nstart - pp->nend != me.gs_npending) {
+ flush = saved_start;
+ error = ETIMEDOUT;
+ goto fail;
+ }
+
+ /* link pp to this geom */
+ LIST_REMOVE(pp, provider);
+ pp->geom = gp;
+ LIST_INSERT_HEAD(&gp->provider, pp, provider);
+
+ /*
+ * replicate the counts from the parent in the
+ * new provider and consumer nodes
+ */
+ cp->acr = newpp->acr = pp->acr;
+ cp->acw = newpp->acw = pp->acw;
+ cp->ace = newpp->ace = pp->ace;
+ sc->sc_flags |= G_SCHED_PROXYING;
+
+fail:
+ dstgp->start = saved_start;
+
+ g_sched_flush_pending(flush);
+
+ return (error);
+}
+
/*
* Create a geom node for the device passed as *pp.
* If successful, add a reference to this gsp.
@@ -886,12 +966,10 @@
g_sched_create(struct gctl_req *req, struct g_class *mp,
struct g_provider *pp, struct g_gsched *gsp, int proxy)
{
- struct g_sched_softc *sc;
- struct g_geom *gp, *dst_gp;
+ struct g_sched_softc *sc = NULL;
+ struct g_geom *gp, *dstgp;
struct g_provider *newpp = NULL;
struct g_consumer *cp = NULL;
- void (*saved_start)(struct bio *bp);
- struct bio *bp;
char name[64];
int error;
@@ -907,7 +985,7 @@
}
gp = g_new_geomf(mp, name);
- dst_gp = proxy ? pp->geom : gp; /* where do we link the provider */
+ dstgp = proxy ? pp->geom : gp; /* where do we link the provider */
if (gp == NULL) {
gctl_error(req, "Cannot create geom %s.", name);
error = ENOMEM;
@@ -937,7 +1015,7 @@
gp->access = g_sched_access;
gp->dumpconf = g_sched_dumpconf;
- newpp = g_new_providerf(dst_gp, gp->name);
+ newpp = g_new_providerf(dstgp, gp->name);
if (newpp == NULL) {
gctl_error(req, "Cannot create provider %s.", name);
error = ENOMEM;
@@ -962,44 +1040,16 @@
goto fail;
}
- g_gsched_ref(gsp);
-
g_error_provider(newpp, 0);
if (proxy) {
- g_cancel_event(newpp); /* prevent taste() */
- /* copy private fields */
- newpp->private = pp->private;
- newpp->index = pp->index;
-
- /* Queue all the early requests coming for us. */
- me.gs_npending = 0;
- saved_start = pp->geom->start;
- dst_gp->start = g_sched_temporary_start;
-
- while (pp->nstart - pp->nend != me.gs_npending)
- tsleep(pp, PRIBIO, "-", hz/10);
-
- /* link pp to this geom */
- LIST_REMOVE(pp, provider);
- pp->geom = gp;
- LIST_INSERT_HEAD(&gp->provider, pp, provider);
-
- dst_gp->start = saved_start;
-
- /*
- * replicate the counts from the parent in the
- * new provider and consumer nodes
- */
- cp->acr = newpp->acr = pp->acr;
- cp->acw = newpp->acw = pp->acw;
- cp->ace = newpp->ace = pp->ace;
- sc->sc_flags |= G_SCHED_PROXYING;
-
- while ((bp = gs_bioq_takefirst(&me.gs_pending)))
- g_sched_start(bp);
+ error = g_insert_proxy(gp, newpp, dstgp, pp, cp);
+ if (error)
+ goto fail;
}
G_SCHED_DEBUG(0, "Device %s created.", gp->name);
+ g_gsched_ref(gsp);
+
return (0);
fail:
@@ -1012,13 +1062,14 @@
if (newpp != NULL)
g_destroy_provider(newpp);
- if (pp->geom == gp) {
- /* Link pp back to the original geom */
- LIST_REMOVE(pp, provider);
- pp->geom = dst_gp;
- LIST_INSERT_HEAD(&pp->geom->provider, pp, provider);
+ if (sc && sc->sc_hash) {
+ g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
+ gsp, sc->sc_data);
}
+ if (sc && sc->sc_data)
+ gsp->gs_fini(sc->sc_data);
+
if (gp != NULL) {
if (gp->softc != NULL)
g_free(gp->softc);
@@ -1043,6 +1094,7 @@
struct g_hash *newh;
void *data;
u_long mask;
+ int error = 0;
gp = pp->geom;
sc = gp->softc;
@@ -1052,14 +1104,18 @@
return (ENOMEM);
newh = g_sched_hash_init(gsp, &mask, HASH_WAITOK);
- if (gsp->gs_priv_size && newh == NULL) {
- gsp->gs_fini(data);
- return (ENOMEM);
+ if (gsp->gs_priv_size && !newh) {
+ error = ENOMEM;
+ goto fail;
}
g_sched_lock(gp);
- if (sc->sc_gsched) /* can be NULL in some cases */
- g_sched_remove_locked(gp, sc->sc_gsched);
+ if (sc->sc_gsched) { /* can be NULL in some cases */
+ error = g_sched_remove_locked(gp, sc->sc_gsched);
+ if (error)
+ goto fail;
+ }
+
g_gsched_ref(gsp);
sc->sc_gsched = gsp;
sc->sc_data = data;
@@ -1069,8 +1125,23 @@
g_sched_unlock(gp);
return (0);
+
+fail:
+ if (newh)
+ g_sched_hash_fini(gp, newh, mask, gsp, data);
+
+ if (data)
+ gsp->gs_fini(data);
+
+ g_sched_unlock(gp);
+
+ return (error);
}
+/*
+ * Stop the request flow directed to the proxy, redirecting the new
+ * requests to the me.gs_pending queue.
+ */
static struct g_provider *
g_detach_proxy(struct g_geom *gp)
{
@@ -1091,13 +1162,6 @@
me.gs_npending = 0;
pp->geom->start = g_sched_temporary_start;
- /* Link provider to the original geom. */
- LIST_REMOVE(pp, provider);
- pp->private = newpp->private;
- pp->index = newpp->index;
- pp->geom = newpp->geom;
- LIST_INSERT_HEAD(&pp->geom->provider, pp, provider);
-
return (pp);
} while (0);
printf("%s error detaching proxy %s\n", __FUNCTION__, gp->name);
@@ -1106,11 +1170,51 @@
}
static void
-g_destroy_proxy(struct g_geom *gp, struct g_provider *old_pp)
+g_sched_blackhole(struct bio *bp)
+{
+
+ g_io_deliver(bp, ENXIO);
+}
+
+static inline void
+g_reparent_provider(struct g_provider *pp, struct g_geom *gp,
+ struct g_provider *newpp)
+{
+
+ LIST_REMOVE(pp, provider);
+ if (newpp) {
+ pp->private = newpp->private;
+ pp->index = newpp->index;
+ }
+ pp->geom = gp;
+ LIST_INSERT_HEAD(&gp->provider, pp, provider);
+}
+
+static inline void
+g_unproxy_provider(struct g_provider *oldpp, struct g_provider *newpp)
+{
+ struct g_geom *gp = oldpp->geom;
+
+ g_reparent_provider(oldpp, newpp->geom, newpp);
+
+ /*
+ * Hackish: let the system destroy the old provider for us, just
+ * in case someone attached a consumer to it, in which case a
+ * direct call to g_destroy_provider() would not work.
+ */
+ g_reparent_provider(newpp, gp, NULL);
+}
+
+/*
+ * Complete the proxy destruction, linking the old provider to its
+ * original geom, and destroying the proxy provider. Also take care
+ * of issuing the pending requests collected in me.gs_pending (if any).
+ */
+static int
+g_destroy_proxy(struct g_geom *gp, struct g_provider *oldpp)
{
struct g_consumer *cp;
struct g_provider *newpp;
- struct bio *bp;
do {
cp = LIST_FIRST(&gp->consumer);
@@ -1119,52 +1223,58 @@
newpp = cp->provider;
if (newpp == NULL)
break;
- /* detach consumer from provider, and destroy provider. */
+
+ /* Relink the provider to its original geom. */
+ g_unproxy_provider(oldpp, newpp);
+
+ /* Detach consumer from provider, and destroy provider. */
cp->acr = newpp->acr = 0;
cp->acw = newpp->acw = 0;
cp->ace = newpp->ace = 0;
g_detach(cp);
- printf("destroying provider %s\n", newpp->name);
- g_destroy_provider(newpp);
- while ((bp = gs_bioq_takefirst(&me.gs_pending)))
- old_pp->geom->start(bp);
+ /* Send the pending bios through the right start function. */
+ g_sched_flush_pending(oldpp->geom->start);
- return;
+ return (0);
} while (0);
printf("%s error destroying proxy %s\n", __FUNCTION__, gp->name);
+
+ /* We cannot send the pending bios anywhere... */
+ g_sched_flush_pending(g_sched_blackhole);
+
+ return (EINVAL);
}
static int
g_sched_destroy(struct g_geom *gp, boolean_t force)
{
- struct g_provider *pp, *old_pp = NULL;
+ struct g_provider *pp, *oldpp = NULL;
struct g_sched_softc *sc;
struct g_gsched *gsp;
+ int error;
g_topology_assert();
sc = gp->softc;
if (sc == NULL)
return (ENXIO);
- if ((sc->sc_flags & G_SCHED_PROXYING)) {
- old_pp = g_detach_proxy(gp);
- goto flush;
- }
- pp = LIST_FIRST(&gp->provider);
- if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
- const char *msg = force ?
- "but we force removal" : "cannot remove";
+ if (!(sc->sc_flags & G_SCHED_PROXYING)) {
+ pp = LIST_FIRST(&gp->provider);
+ if (pp && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
+ const char *msg = force ?
+ "but we force removal" : "cannot remove";
- G_SCHED_DEBUG(!force,
- "Device %s is still open (r%dw%de%d), %s.",
- pp->name, pp->acr, pp->acw, pp->ace, msg);
- if (!force)
- return (EBUSY);
- } else {
- G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
- }
+ G_SCHED_DEBUG(!force,
+ "Device %s is still open (r%dw%de%d), %s.",
+ pp->name, pp->acr, pp->acw, pp->ace, msg);
+ if (!force)
+ return (EBUSY);
+ } else {
+ G_SCHED_DEBUG(0, "Device %s removed.", gp->name);
+ }
+ } else
+ oldpp = g_detach_proxy(gp);
-flush:
gsp = sc->sc_gsched;
if (gsp) {
/*
@@ -1174,26 +1284,69 @@
*/
g_sched_lock(gp);
/*
- * We are dying here, no new request should enter
+ * We are dying here, no new requests should enter
* the scheduler. This is granted by the topology,
- * either in case we were proxying (the provider we
- * stole now returned home) or not (see the access
- * check above).
+ * either in case we were proxying (new bios are
+ * being redirected) or not (see the access check
+ * above).
*/
g_sched_forced_dispatch(gp);
- g_sched_wait_pending(gp);
+ error = g_sched_wait_pending(gp);
+
+ if (error) {
+ /*
+ * Not all the requests came home: this might happen
+ * under heavy load, or if we were waiting for any
+ * bio which is served in the event path (see
+ * geom_slice.c for an example of how this can
+ * happen). Try to restore a working configuration
+ * if we can fail.
+ */
+ if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
+ g_sched_flush_pending(force ?
+ g_sched_blackhole : g_sched_start);
+ }
+
+ /*
+ * In the forced destroy case there is not so much
+ * we can do, we have pending bios that will call
+ * g_sched_done() somehow, and we don't want them
+ * to crash the system using freed memory. We tell
+ * the user that something went wrong, and leak some
+ * memory here.
+ * Note: the callers using force = 1 ignore the
+ * return value.
+ */
+ if (force) {
+ G_SCHED_DEBUG(0, "Pending requests while "
+ " destroying geom, some memory leaked.");
+ }
+
+ return (error);
+ }
+
g_sched_unlock(gp);
-
- g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask);
+ g_sched_hash_fini(gp, sc->sc_hash, sc->sc_mask,
+ gsp, sc->sc_data);
sc->sc_hash = NULL;
-
gsp->gs_fini(sc->sc_data);
-
g_gsched_unref(gsp);
+ sc->sc_gsched = NULL;
}
- if ((sc->sc_flags & G_SCHED_PROXYING) && old_pp)
- g_destroy_proxy(gp, old_pp);
+ if ((sc->sc_flags & G_SCHED_PROXYING) && oldpp) {
+ error = g_destroy_proxy(gp, oldpp);
+
+ if (error) {
+ if (force) {
+ G_SCHED_DEBUG(0, "Unrecoverable error while "
+ "destroying a proxy geom, leaking some "
+ " memory.");
+ }
+
+ return (error);
+ }
+ }
mtx_destroy(&sc->sc_mtx);
@@ -1201,7 +1354,7 @@
gp->softc = NULL;
g_wither_geom(gp, ENXIO);
- return (0);
+ return (error);
}
static int
@@ -1372,7 +1525,7 @@
g_ioreq_restore();
}
-#else /* !HAVE_BIO_CLASSIFIER */
+#else /* HAVE_BIO_CLASSIFIER */
static int
g_sched_tag(void *arg, struct bio *bp)
{
==== //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_rr.c#3 (text+ko) ====
@@ -96,6 +96,8 @@
struct g_savg q_thinktime; /* Thinktime average. */
struct g_savg q_seekdist; /* Seek distance average. */
+ int q_bionum; /* Number of requests. */
+
off_t q_lastoff; /* Last submitted req. offset. */
int q_lastsub; /* Last submitted req. time. */
@@ -182,7 +184,7 @@
.sc_head = LIST_HEAD_INITIALIZER(&me.sc_head),
.w_anticipate = 1,
.queue_depth = { 1, 1, 50 },
- .wait_ms = { 1, 5, 30 },
+ .wait_ms = { 1, 10, 30 },
.quantum_ms = { 1, 100, 500 },
.quantum_kb = { 16, 8192, 65536 },
};
@@ -375,14 +377,14 @@
return (0);
if (g_savg_valid(&qp->q_seekdist) &&
- g_savg_read(&qp->q_seekdist) > 2048)
+ g_savg_read(&qp->q_seekdist) > 8192)
return (0);
return (1);
}
/*
- * called on a request arrival, timeout or completion.
+ * Called on a request arrival, timeout or completion.
* Try to serve a request among those queued.
*/
static struct bio *
@@ -468,7 +470,7 @@
sc->sc_active = NULL;
}
}
- /* if sc_active != NULL, its q_status is always correct */
+ /* If sc_active != NULL, its q_status is always correct. */
sc->sc_in_flight++;
@@ -480,9 +482,13 @@
{
int delta = ticks - qp->q_lastsub, wait = get_bounded(&me.wait_ms, 2);
+ if (qp->q_sc->sc_active != qp)
+ return;
+
qp->q_lastsub = ticks;
delta = (delta > 2 * wait) ? 2 * wait : delta;
- g_savg_add_sample(&qp->q_thinktime, delta);
+ if (qp->q_bionum > 7)
+ g_savg_add_sample(&qp->q_thinktime, delta);
}
static inline void
@@ -495,8 +501,13 @@
else
dist = bp->bio_offset - qp->q_lastoff;
+ if (dist > (8192 * 8))
+ dist = 8192 * 8;
+
qp->q_lastoff = bp->bio_offset + bp->bio_length;
- g_savg_add_sample(&qp->q_seekdist, qp->q_seekdist.gs_smpl ? dist : 0);
+
+ if (qp->q_bionum > 7)
+ g_savg_add_sample(&qp->q_seekdist, dist);
}
/*
@@ -536,6 +547,8 @@
}
}
+ qp->q_bionum = 1 + qp->q_bionum - (qp->q_bionum >> 3);
+
g_rr_update_thinktime(qp);
g_rr_update_seekdist(qp, bp);
==== //depot/projects/soc2009/fabio_gsched/geom_sched/sys/geom/sched/gs_scheduler.h#3 (text+ko) ====
@@ -79,6 +79,10 @@
*
* gs_init_class() is called when a new client (as determined by
* the classifier) starts being used.
+ *
+ * gs_hash_unref() is called right before the class hashtable is
+ * destroyed; after this call, the scheduler is supposed to hold no
+ * more references to the elements in the table.
*/
/* Forward declarations for prototypes. */
@@ -92,6 +96,7 @@
typedef struct bio *gs_next_t (void *data, int force);
typedef int gs_init_class_t (void *data, void *priv);
typedef void gs_fini_class_t (void *data, void *priv);
+typedef void gs_hash_unref_t (void *data);
struct g_gsched {
const char *gs_name;
@@ -107,6 +112,7 @@
gs_init_class_t *gs_init_class;
gs_fini_class_t *gs_fini_class;
+ gs_hash_unref_t *gs_hash_unref;
LIST_ENTRY(g_gsched) glist;
};
==== //depot/projects/soc2009/fabio_gsched/geom_sched/sys/modules/geom/geom_sched/Makefile#2 (text+ko) ====
@@ -1,6 +1,6 @@
# $FreeBSD: $
-SUBDIR= gs_sched gsched_as gsched_rr
+SUBDIR= gs_sched gsched_as gsched_rr gsched_bfq
.if defined(WITH_SSD)
SUBDIR += gsched_ssd
.endif
More information about the p4-projects
mailing list