svn commit: r287876 - in projects/iosched/sys/cam: . ata scsi
Warner Losh
imp at FreeBSD.org
Wed Sep 16 22:15:51 UTC 2015
Author: imp
Date: Wed Sep 16 22:15:50 2015
New Revision: 287876
URL: https://svnweb.freebsd.org/changeset/base/287876
Log:
Commit the post-BSDcan level (and a little more) iosched work.
This work is described in the paper that I presented at BSDcan
http://people.freebsd.org/~imp/bsdcan2015/iosched-v3.pdf
section XII. Recent Changes.
Modified:
projects/iosched/sys/cam/ata/ata_da.c
projects/iosched/sys/cam/cam_iosched.c
projects/iosched/sys/cam/cam_iosched.h
projects/iosched/sys/cam/scsi/scsi_da.c
Modified: projects/iosched/sys/cam/ata/ata_da.c
==============================================================================
--- projects/iosched/sys/cam/ata/ata_da.c Wed Sep 16 21:43:51 2015 (r287875)
+++ projects/iosched/sys/cam/ata/ata_da.c Wed Sep 16 22:15:50 2015 (r287876)
@@ -98,11 +98,13 @@ typedef enum {
typedef enum {
ADA_Q_NONE = 0x00,
ADA_Q_4K = 0x01,
+ ADA_Q_NCQ_TRIM_BROKEN = 0x02,
} ada_quirks;
#define ADA_Q_BIT_STRING \
"\020" \
- "\0014K"
+ "\0014K" \
+ "\002NCQ_TRIM_BROKEN"
typedef enum {
ADA_CCB_RAHEAD = 0x01,
@@ -160,6 +162,8 @@ struct ada_softc {
int trim_max_ranges;
int read_ahead;
int write_cache;
+ int unmappedio;
+ int rotating;
#ifdef ADA_TEST_FAILURE
int force_read_error;
int force_write_error;
@@ -173,6 +177,13 @@ struct ada_softc {
struct sysctl_oid *sysctl_tree;
struct callout sendordered_c;
struct trim_request trim_req;
+#ifdef CAM_IO_STATS
+ struct sysctl_ctx_list sysctl_stats_ctx;
+ struct sysctl_oid *sysctl_stats_tree;
+ u_int timeouts;
+ u_int errors;
+ u_int invalidations;
+#endif
};
struct ada_quirk_entry {
@@ -350,6 +361,38 @@ static struct ada_quirk_entry ada_quirk_
},
{
/*
+ * Crucial M500 SSDs EU07 firmware
+ * NCQ Trim works?
+ */
+ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "EU07" },
+ /*quirks*/0
+ },
+ {
+ /*
+ * Crucial M500 SSDs all other firmware
+ * NCQ Trim doesn't work
+ */
+ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M500*", "*" },
+ /*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+ },
+ {
+ /*
+ * Crucial M550 SSDs
+ * NCQ Trim doesn't work, but only on MU01 firmware
+ */
+ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*M550*", "MU01" },
+ /*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+ },
+ {
+ /*
+ * Crucial MX100 SSDs
+ * NCQ Trim doesn't work, but only on MU01 firmware
+ */
+ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Crucial CT*MX100*", "MU01" },
+ /*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+ },
+ {
+ /*
* Crucial RealSSD C300 SSDs
* 4k optimised
*/
@@ -422,6 +465,30 @@ static struct ada_quirk_entry ada_quirk_
},
{
/*
+ * Micron M500 SSDs firmware EU07
+ * NCQ Trim works?
+ */
+ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "EU07" },
+ /*quirks*/0
+ },
+ {
+ /*
+ * Micron M500 SSDs all other firmware
+ * NCQ Trim doesn't work
+ */
+ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M500*", "*" },
+ /*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+ },
+ {
+ /*
+ * Micron M5[15]0 SSDs
+ * NCQ Trim doesn't work, but only MU01 firmware
+ */
+ { T_DIRECT, SIP_MEDIA_FIXED, "*", "Micron M5[15]0*", "MU01" },
+ /*quirks*/ADA_Q_NCQ_TRIM_BROKEN
+ },
+ {
+ /*
* OCZ Agility 2 SSDs
* 4k optimised & trim only works in 4k requests + 4k aligned
*/
@@ -471,22 +538,22 @@ static struct ada_quirk_entry ada_quirk_
{
/*
* Samsung 830 Series SSDs
- * 4k optimised
+ * 4k optimised, NCQ TRIM broken (normal TRIM fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG SSD 830 Series*", "*" },
- /*quirks*/ADA_Q_4K
+ /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
* Samsung 840 SSDs
- * 4k optimised
+ * 4k optimised, NCQ TRIM broken (normal TRIM fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 840*", "*" },
- /*quirks*/ADA_Q_4K
+ /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
- * Samsung 843T Series SSDs
+ * Samsung PM843T Series SSDs
* 4k optimised
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "SAMSUNG MZ7WD*", "*" },
@@ -495,10 +562,10 @@ static struct ada_quirk_entry ada_quirk_
{
/*
* Samsung 850 SSDs
- * 4k optimised
+ * 4k optimised, NCQ TRIM broken (normal TRIM fine)
*/
{ T_DIRECT, SIP_MEDIA_FIXED, "*", "Samsung SSD 850*", "*" },
- /*quirks*/ADA_Q_4K
+ /*quirks*/ADA_Q_4K | ADA_Q_NCQ_TRIM_BROKEN
},
{
/*
@@ -782,8 +849,6 @@ adastrategy(struct bio *bp)
/*
* Place it in the queue of disk activities for this disk
*/
- if (bp->bio_cmd == BIO_DELETE) {
- }
cam_iosched_queue_work(softc->cam_iosched, bp);
/*
@@ -865,7 +930,7 @@ adadump(void *arg, void *virtual, vm_off
0,
NULL,
0,
- ada_default_timeout*1000);
+ 5*1000);
if (softc->flags & ADA_FLAG_CAN_48BIT)
ata_48bit_cmd(&ccb.ataio, ATA_FLUSHCACHE48, 0, 0, 0);
@@ -939,6 +1004,9 @@ adaoninvalidate(struct cam_periph *perip
* De-register any async callbacks.
*/
xpt_register_async(0, adaasync, periph, periph->path);
+#ifdef CAM_IO_STATS
+ softc->invalidations++;
+#endif
/*
* Return all queued I/O with ENXIO.
@@ -959,12 +1027,20 @@ adacleanup(struct cam_periph *periph)
cam_periph_unlock(periph);
+ cam_iosched_fini(softc->cam_iosched);
+
/*
* If we can't free the sysctl tree, oh well...
*/
- if ((softc->flags & ADA_FLAG_SCTX_INIT) != 0
- && sysctl_ctx_free(&softc->sysctl_ctx) != 0) {
- xpt_print(periph->path, "can't remove sysctl context\n");
+ if ((softc->flags & ADA_FLAG_SCTX_INIT) != 0) {
+#ifdef CAM_IO_STATS
+ if (sysctl_ctx_free(&softc->sysctl_stats_ctx) != 0)
+ xpt_print(periph->path,
+ "can't remove sysctl stats context\n");
+#endif
+ if (sysctl_ctx_free(&softc->sysctl_ctx) != 0)
+ xpt_print(periph->path,
+ "can't remove sysctl context\n");
}
disk_destroy(softc->disk);
@@ -977,16 +1053,9 @@ static void
adasetdeletemethod(struct ada_softc *softc)
{
-#if 0
- /*
- * Don't set NCQ_DSM_TRIM method by default. It is currently
- * a "feature of interest" implicated in some data corruption.
- */
if (softc->flags & ADA_FLAG_CAN_NCQ_TRIM)
softc->delete_method = ADA_DELETE_NCQ_DSM_TRIM;
- else
-#endif
- if (softc->flags & ADA_FLAG_CAN_TRIM)
+ else if (softc->flags & ADA_FLAG_CAN_TRIM)
softc->delete_method = ADA_DELETE_DSM_TRIM;
else if ((softc->flags & ADA_FLAG_CAN_CFA) && !(softc->flags & ADA_FLAG_CAN_48BIT))
softc->delete_method = ADA_DELETE_CFA_ERASE;
@@ -1069,7 +1138,8 @@ adaasync(void *callback_arg, u_int32_t c
* the sim do things properly. Perhaps we should look at log 13
* dword 0 bit 0 and dword 1 bit 0 are set too...
*/
- if ((softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 &&
+ if ((softc->quirks & ADA_Q_NCQ_TRIM_BROKEN) == 0 &&
+ (softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 &&
(cgd.ident_data.satacapabilities2 & ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 &&
(softc->flags & ADA_FLAG_CAN_TRIM) != 0)
softc->flags |= ADA_FLAG_CAN_NCQ_TRIM;
@@ -1165,6 +1235,12 @@ adasysctlinit(void *context, int pending
SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
OID_AUTO, "write_cache", CTLFLAG_RW | CTLFLAG_MPSAFE,
&softc->write_cache, 0, "Enable disk write cache.");
+ SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+ OID_AUTO, "unmapped_io", CTLFLAG_RD | CTLFLAG_MPSAFE,
+ &softc->unmappedio, 0, "Unmapped I/O leaf");
+ SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
+ OID_AUTO, "rotating", CTLFLAG_RD | CTLFLAG_MPSAFE,
+ &softc->rotating, 0, "Rotating media");
#ifdef ADA_TEST_FAILURE
/*
* Add a 'door bell' sysctl which allows one to set it from userland
@@ -1184,6 +1260,28 @@ adasysctlinit(void *context, int pending
&softc->periodic_read_error, 0,
"Force a read error every N reads (don't set too low).");
#endif
+
+#ifdef CAM_IO_STATS
+ softc->sysctl_stats_tree = SYSCTL_ADD_NODE(&softc->sysctl_stats_ctx,
+ SYSCTL_CHILDREN(softc->sysctl_tree), OID_AUTO, "stats",
+ CTLFLAG_RD, 0, "Statistics");
+ SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
+ SYSCTL_CHILDREN(softc->sysctl_stats_tree),
+ OID_AUTO, "timeouts", CTLFLAG_RD | CTLFLAG_MPSAFE,
+ &softc->timeouts, 0,
+ "Device timeouts reported by the SIM");
+ SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
+ SYSCTL_CHILDREN(softc->sysctl_stats_tree),
+ OID_AUTO, "errors", CTLFLAG_RD | CTLFLAG_MPSAFE,
+ &softc->errors, 0,
+ "Transport errors reported by the SIM.");
+ SYSCTL_ADD_INT(&softc->sysctl_stats_ctx,
+ SYSCTL_CHILDREN(softc->sysctl_stats_tree),
+ OID_AUTO, "pack_invalidations", CTLFLAG_RD | CTLFLAG_MPSAFE,
+ &softc->invalidations, 0,
+ "Device pack invalidations.");
+#endif
+
cam_iosched_sysctl_init(softc->cam_iosched, &softc->sysctl_ctx,
softc->sysctl_tree);
@@ -1270,7 +1368,7 @@ adaregister(struct cam_periph *periph, v
return(CAM_REQ_CMP_ERR);
}
- if (cam_iosched_init(&softc->cam_iosched) != 0) {
+ if (cam_iosched_init(&softc->cam_iosched, periph) != 0) {
printf("adaregister: Unable to probe new device. "
"Unable to allocate iosched memory\n");
return(CAM_REQ_CMP_ERR);
@@ -1346,8 +1444,12 @@ adaregister(struct cam_periph *periph, v
"kern.cam.ada.%d.write_cache", periph->unit_number);
TUNABLE_INT_FETCH(announce_buf, &softc->write_cache);
/* Disable queue sorting for non-rotational media by default. */
- cam_iosched_set_sort_queue(softc->cam_iosched,
- cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING);
+ if (cgd->ident_data.media_rotation_rate == ATA_RATE_NON_ROTATING) {
+ softc->rotating = 0;
+ } else {
+ softc->rotating = 1;
+ }
+ cam_iosched_set_sort_queue(softc->cam_iosched, softc->rotating ? -1 : 0);
adagetparams(periph, cgd);
softc->disk = disk_alloc();
softc->disk->d_rotation_rate = cgd->ident_data.media_rotation_rate;
@@ -1390,8 +1492,10 @@ adaregister(struct cam_periph *periph, v
softc->disk->d_delmaxsize = 256 * softc->params.secsize;
} else
softc->disk->d_delmaxsize = maxio;
- if ((cpi.hba_misc & PIM_UNMAPPED) != 0)
+ if ((cpi.hba_misc & PIM_UNMAPPED) != 0) {
softc->disk->d_flags |= DISKFLAG_UNMAPPED_BIO;
+ softc->unmappedio = 1;
+ }
/*
* If we can do RCVSND_FPDMA_QUEUED commands, we may be able to do
* NCQ trims, if we support trims at all. We also need support from
@@ -1400,9 +1504,9 @@ adaregister(struct cam_periph *periph, v
*/
if (cpi.hba_misc & PIM_NCQ_KLUDGE)
softc->flags |= ADA_FLAG_PIM_CAN_NCQ_TRIM;
- if ((softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 &&
- (cgd->ident_data.satacapabilities2 &
- ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 &&
+ if ((softc->quirks & ADA_Q_NCQ_TRIM_BROKEN) == 0 &&
+ (softc->flags & ADA_FLAG_PIM_CAN_NCQ_TRIM) != 0 &&
+ (cgd->ident_data.satacapabilities2 & ATA_SUPPORT_RCVSND_FPDMA_QUEUED) != 0 &&
(softc->flags & ADA_FLAG_CAN_TRIM) != 0)
softc->flags |= ADA_FLAG_CAN_NCQ_TRIM;
strlcpy(softc->disk->d_descr, cgd->ident_data.model,
@@ -1675,8 +1779,7 @@ adastart(struct cam_periph *periph, unio
}
if ((bp->bio_flags & BIO_ORDERED) != 0 ||
- (bp->bio_cmd != BIO_DELETE &&
- (softc->flags & ADA_FLAG_NEED_OTAG) != 0)) {
+ (bp->bio_cmd != BIO_DELETE && (softc->flags & ADA_FLAG_NEED_OTAG) != 0)) {
softc->flags &= ~ADA_FLAG_NEED_OTAG;
softc->flags |= ADA_FLAG_WAS_OTAG;
tag_code = 0;
@@ -1807,7 +1910,10 @@ adastart(struct cam_periph *periph, unio
ada_cfaerase(softc, bp, ataio);
break;
default:
- panic("adastart: BIO_DELETE without method, not possible.");
+ biofinish(bp, NULL, EOPNOTSUPP);
+ xpt_release_ccb(start_ccb);
+ adaschedule(periph);
+ return;
}
start_ccb->ccb_h.ccb_state = ADA_CCB_TRIM;
start_ccb->ccb_h.flags |= CAM_UNLOCKED;
@@ -1893,7 +1999,7 @@ adadone(struct cam_periph *periph, union
case ADA_CCB_TRIM:
{
struct bio *bp;
- int error, need_sched;
+ int error;
cam_periph_lock(periph);
bp = (struct bio *)done_ccb->ccb_h.ccb_bp;
@@ -1945,7 +2051,7 @@ adadone(struct cam_periph *periph, union
if (softc->outstanding_cmds == 0)
softc->flags |= ADA_FLAG_WAS_OTAG;
- need_sched = cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
+ cam_iosched_bio_complete(softc->cam_iosched, bp, done_ccb);
xpt_release_ccb(done_ccb);
if (state == ADA_CCB_TRIM) {
TAILQ_HEAD(, bio) queue;
@@ -1957,9 +2063,9 @@ adadone(struct cam_periph *periph, union
* Normally, the xpt_release_ccb() above would make sure
* that when we have more work to do, that work would
* get kicked off. However, we specifically keep
- * trim running set to 0 before the call above to allow
+ * trim_running set to 0 before the call above to allow
* other I/O to progress when many BIO_DELETE requests
- * are pushed down. We set trim running to 0 and call
+ * are pushed down. We set trim_running to 0 and call
* daschedule again so that we don't stall if there are
* no other I/Os pending apart from BIO_DELETEs.
*/
@@ -1977,8 +2083,7 @@ adadone(struct cam_periph *periph, union
biodone(bp1);
}
} else {
- if (need_sched)
- adaschedule(periph);
+ adaschedule(periph);
cam_periph_unlock(periph);
biodone(bp);
}
@@ -2070,6 +2175,31 @@ out:
static int
adaerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags)
{
+ struct ada_softc *softc;
+ struct cam_periph *periph;
+
+ periph = xpt_path_periph(ccb->ccb_h.path);
+ softc = (struct ada_softc *)periph->softc;
+
+ switch (ccb->ccb_h.status & CAM_STATUS_MASK) {
+ case CAM_CMD_TIMEOUT:
+#ifdef CAM_IO_STATS
+ softc->timeouts++;
+#endif
+ break;
+ case CAM_REQ_ABORTED:
+ case CAM_REQ_CMP_ERR:
+ case CAM_REQ_TERMIO:
+ case CAM_UNREC_HBA_ERROR:
+ case CAM_DATA_RUN_ERR:
+ case CAM_ATA_STATUS_ERROR:
+#ifdef CAM_IO_STATS
+ softc->errors++;
+#endif
+ break;
+ default:
+ break;
+ }
return(cam_periph_error(ccb, cam_flags, sense_flags, NULL));
}
Modified: projects/iosched/sys/cam/cam_iosched.c
==============================================================================
--- projects/iosched/sys/cam/cam_iosched.c Wed Sep 16 21:43:51 2015 (r287875)
+++ projects/iosched/sys/cam/cam_iosched.c Wed Sep 16 22:15:50 2015 (r287876)
@@ -63,9 +63,6 @@ static MALLOC_DEFINE(M_CAMSCHED, "CAM I/
*/
#ifdef CAM_NETFLIX_IOSCHED
-#define IOP_MAX_SKIP 50
-#define IOP_MAX_TRAINING 500
-#define ALPHA_BITS 14 /* ~32k events or about the last minute */
SYSCTL_DECL(_kern_cam);
static int do_netflix_iosched = 1;
@@ -74,20 +71,178 @@ SYSCTL_INT(_kern_cam, OID_AUTO, do_netfl
&do_netflix_iosched, 1,
"Enable Netflix I/O scheduler optimizations.");
+static int alpha_bits = 9;
+TUNABLE_INT("kern.cam.iosched_alpha_bits", &alpha_bits);
+SYSCTL_INT(_kern_cam, OID_AUTO, iosched_alpha_bits, CTLFLAG_RW,
+ &alpha_bits, 1,
+ "Bits in EMA's alpha.");
+
+
+
+struct iop_stats;
+struct cam_iosched_softc;
+
int iosched_debug = 0;
+typedef enum {
+ none = 0, /* No limits */
+ queue_depth, /* Limit how many ops we queue to SIM */
+ iops, /* Limit # of IOPS to the drive */
+ bandwidth, /* Limit bandwidth to the drive */
+ limiter_max
+} io_limiter;
+
+static const char *cam_iosched_limiter_names[] =
+ { "none", "queue_depth", "iops", "bandwidth" };
+
+/*
+ * Called to initialize the bits of the iop_stats structure relevant to the
+ * limiter. Called just after the limiter is set.
+ */
+typedef int l_init_t(struct iop_stats *);
+
+/*
+ * Called every tick.
+ */
+typedef int l_tick_t(struct iop_stats *);
+
+/*
+ * Called to see if the limiter thinks this IOP can be allowed to
+ * proceed. If so, the limiter assumes that the while IOP proceeded
+ * and makes any accounting of it that's needed.
+ */
+typedef int l_iop_t(struct iop_stats *, struct bio *);
+
+/*
+ * Called when an I/O completes so the limiter can update its
+ * accounting. Pending I/Os may complete in any order (even when
+ * sent to the hardware at the same time), so the limiter may not
+ * make any assumptions other than this I/O has completed. If it
+ * returns 1, then xpt_schedule() needs to be called again.
+ */
+typedef int l_iodone_t(struct iop_stats *, struct bio *);
+
+static l_iop_t cam_iosched_qd_iop;
+static l_iop_t cam_iosched_qd_caniop;
+static l_iodone_t cam_iosched_qd_iodone;
+
+static l_init_t cam_iosched_iops_init;
+static l_tick_t cam_iosched_iops_tick;
+static l_iop_t cam_iosched_iops_caniop;
+static l_iop_t cam_iosched_iops_iop;
+
+static l_init_t cam_iosched_bw_init;
+static l_tick_t cam_iosched_bw_tick;
+static l_iop_t cam_iosched_bw_caniop;
+static l_iop_t cam_iosched_bw_iop;
+
+struct limswitch
+{
+ l_init_t *l_init;
+ l_tick_t *l_tick;
+ l_iop_t *l_iop;
+ l_iop_t *l_caniop;
+ l_iodone_t *l_iodone;
+} limsw[] =
+{
+ { /* none */
+ .l_init = NULL,
+ .l_tick = NULL,
+ .l_iop = NULL,
+ .l_iodone= NULL,
+ },
+ { /* queue_depth */
+ .l_init = NULL,
+ .l_tick = NULL,
+ .l_caniop = cam_iosched_qd_caniop,
+ .l_iop = cam_iosched_qd_iop,
+ .l_iodone= cam_iosched_qd_iodone,
+ },
+ { /* iops */
+ .l_init = cam_iosched_iops_init,
+ .l_tick = cam_iosched_iops_tick,
+ .l_caniop = cam_iosched_iops_caniop,
+ .l_iop = cam_iosched_iops_iop,
+ .l_iodone= NULL,
+ },
+ { /* bandwidth */
+ .l_init = cam_iosched_bw_init,
+ .l_tick = cam_iosched_bw_tick,
+ .l_caniop = cam_iosched_bw_caniop,
+ .l_iop = cam_iosched_bw_iop,
+ .l_iodone= NULL,
+ },
+};
+
struct iop_stats
{
- sbintime_t data[IOP_MAX_TRAINING]; /* Data for training period */
- sbintime_t worst; /* estimate of worst case latency */
- int outliers; /* Number of outlier latency I/Os */
- int skipping; /* Skipping I/Os when < IOP_MAX_SKIP */
- int training; /* Training when < IOP_MAX_TRAINING */
+ /*
+ * sysctl state for this subnode.
+ */
+ struct sysctl_ctx_list sysctl_ctx;
+ struct sysctl_oid *sysctl_tree;
+
+ /*
+ * Information about the current rate limiters, if any
+ */
+ io_limiter limiter; /* How are I/Os being limited */
+ int min; /* Low range of limit */
+ int max; /* High range of limit */
+ int current; /* Current rate limiter */
+ int l_value1; /* per-limiter scratch value 1. */
+ int l_value2; /* per-limiter scratch value 2. */
+
+
+ /*
+ * Debug information about counts of I/Os that have gone through the
+ * scheduler.
+ */
+ int pending; /* I/Os pending in the hardware */
+ int queued; /* number currently in the queue */
+ int total; /* Total for all time -- wraps */
+ int in; /* number queued all time -- wraps */
+ int out; /* number completed all time -- wraps */
+
+ /*
+ * Statistics on different bits of the process.
+ */
/* Exp Moving Average, alpha = 1 / (1 << alpha_bits) */
sbintime_t ema;
sbintime_t emss; /* Exp Moving sum of the squares */
sbintime_t sd; /* Last computed sd */
+
+ struct cam_iosched_softc *softc;
};
+
+
+typedef enum {
+ set_max = 0, /* current = max */
+ read_latency, /* Steer read latency by throttling writes */
+ cl_max /* Keep last */
+} control_type;
+
+static const char *cam_iosched_control_type_names[] =
+ { "set_max", "read_latency" };
+
+struct control_loop
+{
+ /*
+ * sysctl state for this subnode.
+ */
+ struct sysctl_ctx_list sysctl_ctx;
+ struct sysctl_oid *sysctl_tree;
+
+ sbintime_t next_steer; /* Time of next steer */
+ sbintime_t steer_interval; /* How often do we steer? */
+ sbintime_t lolat;
+ sbintime_t hilat;
+ int alpha;
+ control_type type; /* What type of control? */
+ int last_count; /* Last I/O count */
+
+ struct cam_iosched_softc *softc;
+};
+
#endif
struct cam_iosched_softc
@@ -98,36 +253,375 @@ struct cam_iosched_softc
uint32_t flags;
int sort_io_queue;
#ifdef CAM_NETFLIX_IOSCHED
- /* Number of pending transactions */
- int pending_reads;
- int pending_writes;
- /* Have at least this many transactions in progress, if possible */
- int min_reads;
- int min_writes;
- /* Maximum number of each type of transaction in progress */
- int max_reads;
- int max_writes;
-
- int trims;
- int reads;
- int writes;
- int queued_reads;
- int queued_writes;
- int in_reads;
- int in_writes;
- int out_reads;
- int out_writes;
-
- int read_bias;
- int current_read_bias;
+ int read_bias; /* Read bias setting */
+ int current_read_bias; /* Current read bias state */
+ int total_ticks;
struct bio_queue_head write_queue;
struct iop_stats read_stats, write_stats, trim_stats;
+ struct sysctl_ctx_list sysctl_ctx;
+ struct sysctl_oid *sysctl_tree;
+
+ int quanta; /* Number of quanta per second */
+ struct callout ticker; /* Callout for our quota system */
+ struct cam_periph *periph; /* cam periph associated with this device */
+ uint32_t this_frac; /* Fraction of a second (1024ths) for this tick */
+ sbintime_t last_time; /* Last time we ticked */
+ struct control_loop cl;
#endif
};
+#ifdef CAM_NETFLIX_IOSCHED
+/*
+ * helper functions to call the limsw functions.
+ */
+static int
+cam_iosched_limiter_init(struct iop_stats *ios)
+{
+ int lim = ios->limiter;
+
+ /* maybe this should be a kassert */
+ if (lim < none || lim >= limiter_max)
+ return EINVAL;
+
+ if (limsw[lim].l_init)
+ return limsw[lim].l_init(ios);
+
+ return 0;
+}
+
+static int
+cam_iosched_limiter_tick(struct iop_stats *ios)
+{
+ int lim = ios->limiter;
+
+ /* maybe this should be a kassert */
+ if (lim < none || lim >= limiter_max)
+ return EINVAL;
+
+ if (limsw[lim].l_tick)
+ return limsw[lim].l_tick(ios);
+
+ return 0;
+}
+
+static int
+cam_iosched_limiter_iop(struct iop_stats *ios, struct bio *bp)
+{
+ int lim = ios->limiter;
+
+ /* maybe this should be a kassert */
+ if (lim < none || lim >= limiter_max)
+ return EINVAL;
+
+ if (limsw[lim].l_iop)
+ return limsw[lim].l_iop(ios, bp);
+
+ return 0;
+}
+
+static int
+cam_iosched_limiter_caniop(struct iop_stats *ios, struct bio *bp)
+{
+ int lim = ios->limiter;
+
+ /* maybe this should be a kassert */
+ if (lim < none || lim >= limiter_max)
+ return EINVAL;
+
+ if (limsw[lim].l_caniop)
+ return limsw[lim].l_caniop(ios, bp);
+
+ return 0;
+}
+
+static int
+cam_iosched_limiter_iodone(struct iop_stats *ios, struct bio *bp)
+{
+ int lim = ios->limiter;
+
+ /* maybe this should be a kassert */
+ if (lim < none || lim >= limiter_max)
+ return 0;
+
+ if (limsw[lim].l_iodone)
+ return limsw[lim].l_iodone(ios, bp);
+
+ return 0;
+}
+
+/*
+ * Functions to implement the different kinds of limiters
+ */
+
+static int
+cam_iosched_qd_iop(struct iop_stats *ios, struct bio *bp)
+{
+
+ if (ios->current <= 0 || ios->pending < ios->current)
+ return 0;
+
+ return EAGAIN;
+}
+
+static int
+cam_iosched_qd_caniop(struct iop_stats *ios, struct bio *bp)
+{
+
+ if (ios->current <= 0 || ios->pending < ios->current)
+ return 0;
+
+ return EAGAIN;
+}
+
+static int
+cam_iosched_qd_iodone(struct iop_stats *ios, struct bio *bp)
+{
+
+ if (ios->current <= 0 || ios->pending != ios->current)
+ return 0;
+
+ return 1;
+}
+
+static int
+cam_iosched_iops_init(struct iop_stats *ios)
+{
+
+ ios->l_value1 = ios->current / ios->softc->quanta;
+ if (ios->l_value1 <= 0)
+ ios->l_value1 = 1;
+
+ return 0;
+}
+
+static int
+cam_iosched_iops_tick(struct iop_stats *ios)
+{
+
+ ios->l_value1 = (int)((ios->current * (uint64_t)ios->softc->this_frac) >> 16);
+ if (ios->l_value1 <= 0)
+ ios->l_value1 = 1;
+
+ return 0;
+}
+
+static int
+cam_iosched_iops_caniop(struct iop_stats *ios, struct bio *bp)
+{
+
+ /*
+ * So if we have any more IOPs left, allow it,
+ * otherwise wait.
+ */
+ if (ios->l_value1 <= 0)
+ return EAGAIN;
+ return 0;
+}
+
+static int
+cam_iosched_iops_iop(struct iop_stats *ios, struct bio *bp)
+{
+ int rv;
+
+ rv = cam_iosched_limiter_caniop(ios, bp);
+ if (rv == 0)
+ ios->l_value1--;
+
+ return rv;
+}
+
+static int
+cam_iosched_bw_init(struct iop_stats *ios)
+{
+
+ /* ios->current is in kB/s, so scale to bytes */
+ ios->l_value1 = ios->current * 1000 / ios->softc->quanta;
+
+ return 0;
+}
+
+static int
+cam_iosched_bw_tick(struct iop_stats *ios)
+{
+ int bw;
+
+ /*
+ * If we're in the hole for available quota from
+ * the last time, then add the quantum for this.
+ * If we have any left over from last quantum,
+ * then too bad, that's lost. Also, ios->current
+ * is in kB/s, so scale.
+ *
+ * We also allow up to 4 quanta of credits to
+ * accumulate to deal with burstiness. 4 is extremely
+ * arbitrary.
+ */
+ bw = (int)((ios->current * 1000ull * (uint64_t)ios->softc->this_frac) >> 16);
+ if (ios->l_value1 < bw * 4)
+ ios->l_value1 += bw;
+
+ return 0;
+}
+
+static int
+cam_iosched_bw_caniop(struct iop_stats *ios, struct bio *bp)
+{
+ /*
+ * So if we have any more bw quota left, allow it,
+ * otherwise wait. Note, we'll go negative and that's
+ * OK. We'll just get a little less next quota.
+ *
+ * Note on going negative: that allows us to process
+ * requests in order better, since we won't allow
+ * shorter reads to get around the long one that we
+ * don't have the quota to do just yet. It also prevents
+ * starvation by being a little more permissive about
+ * what we let through this quantum (to prevent the
+ * starvation), at the cost of getting a little less
+ * next quantum.
+ */
+ if (ios->l_value1 <= 0)
+ return EAGAIN;
+
+
+ return 0;
+}
+
+static int
+cam_iosched_bw_iop(struct iop_stats *ios, struct bio *bp)
+{
+ int rv;
+
+ rv = cam_iosched_limiter_caniop(ios, bp);
+ if (rv == 0)
+ ios->l_value1 -= bp->bio_length;
+
+ return rv;
+}
+
+static void cam_iosched_cl_maybe_steer(struct control_loop *clp);
+
+static void
+cam_iosched_ticker(void *arg)
+{
+ struct cam_iosched_softc *isc = arg;
+ sbintime_t now, delta;
+
+ callout_reset(&isc->ticker, hz / isc->quanta - 1, cam_iosched_ticker, isc);
+
+ now = sbinuptime();
+ delta = now - isc->last_time;
+ isc->this_frac = (uint32_t)delta >> 16; /* Note: discards seconds -- should be 0; harmless if not */
+ isc->last_time = now;
+
+ cam_iosched_cl_maybe_steer(&isc->cl);
+
+ cam_iosched_limiter_tick(&isc->read_stats);
+ cam_iosched_limiter_tick(&isc->write_stats);
+ cam_iosched_limiter_tick(&isc->trim_stats);
+
+ cam_iosched_schedule(isc, isc->periph);
+
+ isc->total_ticks++;
+}
+
+
+static void
+cam_iosched_cl_init(struct control_loop *clp, struct cam_iosched_softc *isc)
+{
+
+ clp->next_steer = sbinuptime();
+ clp->softc = isc;
+ clp->steer_interval = SBT_1S * 5; /* Let's start out steering every 5s */
+ clp->lolat = 5 * SBT_1MS;
+ clp->hilat = 15 * SBT_1MS;
+ clp->alpha = 20; /* Alpha == gain. 20 = .2 */
+ clp->type = set_max;
+}
+
+static void
+cam_iosched_cl_maybe_steer(struct control_loop *clp)
+{
+ struct cam_iosched_softc *isc;
+ sbintime_t now, lat;
+ int old;
+
+ isc = clp->softc;
+ now = isc->last_time;
+ if (now < clp->next_steer)
+ return;
+
+ clp->next_steer = now + clp->steer_interval;
+ switch (clp->type) {
+ case set_max:
+ if (isc->write_stats.current != isc->write_stats.max)
+ printf("Steering write from %d kBps to %d kBps\n",
+ isc->write_stats.current, isc->write_stats.max);
+ isc->read_stats.current = isc->read_stats.max;
+ isc->write_stats.current = isc->write_stats.max;
+ isc->trim_stats.current = isc->trim_stats.max;
+ break;
+ case read_latency:
+ old = isc->write_stats.current;
+ lat = isc->read_stats.ema;
+ /*
+ * Simple PLL-like engine. Since we're steering to a range for
+ * the SP (set point) that makes things a little more
+ * complicated. In addition, we're not directly controlling our
+ * PV (process variable), the read latency, but instead are
+ * manipulating the write bandwidth limit for our MV
+ * (manipulation variable), analysis of this code gets a bit
+ * messy. Also, the MV is a very noisy control surface for read
+ * latency since it is affected by many hidden processes inside
+ * the device which change how responsive read latency will be
+ * in reaction to changes in write bandwidth. Unlike the classic
+ * boiler control PLL. this may result in over-steering while
+ * the SSD takes its time to react to the new, lower load. This
+ * is why we use a relatively low alpha of between .1 and .25 to
+ * compensate for this effect. At .1, it takes ~22 steering
+ * intervals to back off by a factor of 10. At .2 it only takes
+ * ~10. At .25 it only takes ~8. However some preliminary data
+ * from the SSD drives suggests a response time in 10's of
+ * seconds before latency drops regardless of the new write
+ * rate. Careful observation will be required to tune this
+ * effectively.
+ *
+ * Also, when there's no read traffic, we jack up the write
+ * limit too regardless of the last read latency. 10 is
+ * somewhat arbitrary.
+ */
+ if (lat < clp->lolat || isc->read_stats.total - clp->last_count < 10)
+ isc->write_stats.current = isc->write_stats.current *
+ (100 + clp->alpha) / 100; /* Scale up */
+ else if (lat > clp->hilat)
+ isc->write_stats.current = isc->write_stats.current *
+ (100 - clp->alpha) / 100; /* Scale down */
+ clp->last_count = isc->read_stats.total;
+
+ /*
+ * Even if we don't steer, per se, enforce the min/max limits as
+ * those may have changed.
+ */
+ if (isc->write_stats.current < isc->write_stats.min)
+ isc->write_stats.current = isc->write_stats.min;
+ if (isc->write_stats.current > isc->write_stats.max)
+ isc->write_stats.current = isc->write_stats.max;
+ if (old != isc->write_stats.current)
+ printf("Steering write from %d kBps to %d kBps due to latency of %ldus\n",
+ old, isc->write_stats.current,
+ ((uint64_t)1000000 * (uint32_t)lat) >> 32);
+ break;
+ case cl_max:
+ break;
+ }
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-projects
mailing list