svn commit: r248577 - head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs

Steven Hartland smh at FreeBSD.org
Thu Mar 21 11:02:09 UTC 2013


Author: smh
Date: Thu Mar 21 11:02:08 2013
New Revision: 248577
URL: http://svnweb.freebsd.org/changeset/base/248577

Log:
  Optimisation of TRIM processing.
  
  Previously TRIM processing was very bursty. This was made worse by the fact
  that TRIM requests on SSDs are typically much slower than reads or writes.
  This often resulted in stalls while large numbers of TRIMs were processed.
  
  In addition, because the TRIM thread was only woken by writes, deletes
  could stall in the queue for extended periods of time.
  
  This patch adds a number of controls to how often the TRIM thread for each
  SPA processes its outstanding delete requests.
  vfs.zfs.trim.timeout: Delay TRIMs by up to this many seconds
  vfs.zfs.trim.txg_delay: Delay TRIMs by up to this many TXGs (reduced to 32)
  vfs.zfs.vdev.trim_max_bytes: Maximum pending TRIM bytes for a vdev
  vfs.zfs.vdev.trim_max_pending: Maximum pending TRIM segments for a vdev
  vfs.zfs.trim.max_interval: Maximum interval between TRIM queue processing
  (seconds)
  
  Given the most common TRIM implementation is ATA TRIM the current defaults
  are targeted at that.
  
  Reviewed by:	pjd (mentor)
  Approved by:	pjd (mentor)
  MFC after:	2 weeks

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c	Thu Mar 21 10:41:30 2013	(r248576)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c	Thu Mar 21 11:02:08 2013	(r248577)
@@ -40,6 +40,18 @@
 #define	TRIM_ZIO_END(vd, offset, size)	(offset +		\
  	P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift))
 
+#define TRIM_MAP_SINC(tm, size)					\
+	atomic_add_64(&(tm)->tm_bytes, (size))
+
+#define TRIM_MAP_SDEC(tm, size)					\
+	atomic_subtract_64(&(tm)->tm_bytes, (size))
+
+#define TRIM_MAP_QINC(tm)					\
+	atomic_inc_64(&(tm)->tm_pending);			\
+
+#define TRIM_MAP_QDEC(tm)					\
+	atomic_dec_64(&(tm)->tm_pending);
+
 typedef struct trim_map {
 	list_t		tm_head;		/* List of segments sorted by txg. */
 	avl_tree_t	tm_queued_frees;	/* AVL tree of segments waiting for TRIM. */
@@ -47,6 +59,8 @@ typedef struct trim_map {
 	avl_tree_t	tm_inflight_writes;	/* AVL tree of in-flight writes. */
 	list_t		tm_pending_writes;	/* Writes blocked on in-flight frees. */
 	kmutex_t	tm_lock;
+	uint64_t	tm_pending;		/* Count of pending TRIMs. */
+	uint64_t	tm_bytes;		/* Total size in bytes of queued TRIMs. */
 } trim_map_t;
 
 typedef struct trim_seg {
@@ -60,17 +74,41 @@ typedef struct trim_seg {
 
 extern boolean_t zfs_notrim;
 
+static u_int trim_txg_delay = 32;
+static u_int trim_timeout = 30;
+static u_int trim_max_interval = 1;
+/* Limit outstanding TRIMs to 2G (max size for a single TRIM request) */
+static uint64_t trim_vdev_max_bytes = 2147483648;
+/* Limit outstanding TRIMs to 64 (max ranges for a single TRIM request) */	
+static u_int trim_vdev_max_pending = 64;
+
 SYSCTL_DECL(_vfs_zfs);
-/* Delay TRIMs by that many TXGs. */
-static int trim_txg_limit = 64;
-TUNABLE_INT("vfs.zfs.trim_txg_limit", &trim_txg_limit);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_txg_limit, CTLFLAG_RW, &trim_txg_limit, 0,
-    "Delay TRIMs by that many TXGs.");
-
-static int trim_l2arc_limit = 30;
-TUNABLE_INT("vfs.zfs.trim_l2arc_limit", &trim_l2arc_limit);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, trim_l2arc_limit, CTLFLAG_RWTUN, &trim_l2arc_limit, 0,
-    "Delay TRIMs by this many seconds for cache devices.");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM");
+
+TUNABLE_INT("vfs.zfs.trim.txg_delay", &trim_txg_delay);
+SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, txg_delay, CTLFLAG_RWTUN, &trim_txg_delay,
+    0, "Delay TRIMs by up to this many TXGs");
+
+TUNABLE_INT("vfs.zfs.trim.timeout", &trim_timeout);
+SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, timeout, CTLFLAG_RWTUN, &trim_timeout, 0,
+    "Delay TRIMs by up to this many seconds");
+
+TUNABLE_INT("vfs.zfs.trim.max_interval", &trim_max_interval);
+SYSCTL_UINT(_vfs_zfs_trim, OID_AUTO, max_interval, CTLFLAG_RWTUN,
+    &trim_max_interval, 0,
+    "Maximum interval between TRIM queue processing (seconds)");
+
+SYSCTL_DECL(_vfs_zfs_vdev);
+TUNABLE_QUAD("vfs.zfs.vdev.trim_max_bytes", &trim_vdev_max_bytes);
+SYSCTL_QUAD(_vfs_zfs_vdev, OID_AUTO, trim_max_bytes, CTLFLAG_RWTUN,
+    &trim_vdev_max_bytes, 0,
+    "Maximum pending TRIM bytes for a vdev");
+
+TUNABLE_INT("vfs.zfs.vdev.trim_max_pending", &trim_vdev_max_pending);
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
+    &trim_vdev_max_pending, 0,
+    "Maximum pending TRIM segments for a vdev");
+
 
 static void trim_map_vdev_commit_done(spa_t *spa, vdev_t *vd);
 
@@ -164,6 +202,8 @@ trim_map_destroy(vdev_t *vd)
 		avl_remove(&tm->tm_queued_frees, ts);
 		list_remove(&tm->tm_head, ts);
 		kmem_free(ts, sizeof (*ts));
+		TRIM_MAP_SDEC(tm, ts->ts_end - ts->ts_start);
+		TRIM_MAP_QDEC(tm);
 	}
 	mutex_exit(&tm->tm_lock);
 
@@ -204,21 +244,31 @@ trim_map_segment_add(trim_map_t *tm, uin
 	ts_before = avl_nearest(&tm->tm_queued_frees, where, AVL_BEFORE);
 	ts_after = avl_nearest(&tm->tm_queued_frees, where, AVL_AFTER);
 
-	merge_before = (ts_before != NULL && ts_before->ts_end == start &&
-	    ts_before->ts_txg == txg);
-	merge_after = (ts_after != NULL && ts_after->ts_start == end &&
-	    ts_after->ts_txg == txg);
+	merge_before = (ts_before != NULL && ts_before->ts_end == start);
+	merge_after = (ts_after != NULL && ts_after->ts_start == end);
 
 	if (merge_before && merge_after) {
+		TRIM_MAP_SINC(tm, ts_after->ts_start - ts_before->ts_end);
+		TRIM_MAP_QDEC(tm);
 		avl_remove(&tm->tm_queued_frees, ts_before);
 		list_remove(&tm->tm_head, ts_before);
 		ts_after->ts_start = ts_before->ts_start;
+		ts_after->ts_txg = txg;
+		ts_after->ts_time = time;
 		kmem_free(ts_before, sizeof (*ts_before));
 	} else if (merge_before) {
+		TRIM_MAP_SINC(tm, end - ts_before->ts_end);
 		ts_before->ts_end = end;
+		ts_before->ts_txg = txg;
+		ts_before->ts_time = time;
 	} else if (merge_after) {
+		TRIM_MAP_SINC(tm, ts_after->ts_start - start);
 		ts_after->ts_start = start;
+		ts_after->ts_txg = txg;
+		ts_after->ts_time = time;
 	} else {
+		TRIM_MAP_SINC(tm, end - start);
+		TRIM_MAP_QINC(tm);
 		ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
 		ts->ts_start = start;
 		ts->ts_end = end;
@@ -241,6 +291,7 @@ trim_map_segment_remove(trim_map_t *tm, 
 	left_over = (ts->ts_start < start);
 	right_over = (ts->ts_end > end);
 
+	TRIM_MAP_SDEC(tm, end - start);
 	if (left_over && right_over) {
 		nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
 		nts->ts_start = end;
@@ -250,6 +301,7 @@ trim_map_segment_remove(trim_map_t *tm, 
 		ts->ts_end = start;
 		avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
 		list_insert_after(&tm->tm_head, ts, nts);
+		TRIM_MAP_QINC(tm);
 	} else if (left_over) {
 		ts->ts_end = start;
 	} else if (right_over) {
@@ -257,6 +309,7 @@ trim_map_segment_remove(trim_map_t *tm, 
 	} else {
 		avl_remove(&tm->tm_queued_frees, ts);
 		list_remove(&tm->tm_head, ts);
+		TRIM_MAP_QDEC(tm);
 		kmem_free(ts, sizeof (*ts));
 	}
 }
@@ -368,20 +421,25 @@ trim_map_write_done(zio_t *zio)
 }
 
 /*
- * Return the oldest segment (the one with the lowest txg) or false if
- * the list is empty or the first element's txg is greater than txg given
- * as function argument, or the first element's time is greater than time
- * given as function argument
+ * Return the oldest segment (the one with the lowest txg / time) or NULL if:
+ * 1. The list is empty
+ * 2. The first element's txg is greater than txgsafe
+ * 3. The first element's txg and time are both greater than the txg and time
+ *    arguments, and neither the byte nor pending-count limit is exceeded
  */
 static trim_seg_t *
-trim_map_first(trim_map_t *tm, uint64_t txg, hrtime_t time)
+trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time)
 {
 	trim_seg_t *ts;
 
 	ASSERT(MUTEX_HELD(&tm->tm_lock));
+	VERIFY(txgsafe >= txg);
 
 	ts = list_head(&tm->tm_head);
-	if (ts != NULL && ts->ts_txg <= txg && ts->ts_time <= time)
+	if (ts != NULL && ts->ts_txg <= txgsafe &&
+	    (ts->ts_txg <= txg || ts->ts_time <= time ||
+	    tm->tm_bytes > trim_vdev_max_bytes ||
+	    tm->tm_pending > trim_vdev_max_pending))
 		return (ts);
 	return (NULL);
 }
@@ -391,7 +449,7 @@ trim_map_vdev_commit(spa_t *spa, zio_t *
 {
 	trim_map_t *tm = vd->vdev_trimmap;
 	trim_seg_t *ts;
-	uint64_t start, size, txglimit;
+	uint64_t size, txgtarget, txgsafe;
 	hrtime_t timelimit;
 
 	ASSERT(vd->vdev_ops->vdev_op_leaf);
@@ -399,26 +457,29 @@ trim_map_vdev_commit(spa_t *spa, zio_t *
 	if (tm == NULL)
 		return;
 
+	timelimit = gethrtime() - trim_timeout * NANOSEC;
 	if (vd->vdev_isl2cache) {
-		timelimit = gethrtime() - trim_l2arc_limit * NANOSEC;
-		txglimit = UINT64_MAX;
+		txgsafe = UINT64_MAX;
+		txgtarget = UINT64_MAX;
 	} else {
-		timelimit = TIME_MAX;
-		txglimit = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa)) -
-		    trim_txg_limit;
+		txgsafe = MIN(spa_last_synced_txg(spa), spa_freeze_txg(spa));
+		if (txgsafe > trim_txg_delay)
+			txgtarget = txgsafe - trim_txg_delay;
+		else
+			txgtarget = 0;
 	}
 
 	mutex_enter(&tm->tm_lock);
-	/*
-	 * Loop until we send all frees up to the txglimit
-	 * or time limit if this is a cache device.
-	 */
-	while ((ts = trim_map_first(tm, txglimit, timelimit)) != NULL) {
+	/* Loop until we have sent all outstanding free's */
+	while ((ts = trim_map_first(tm, txgtarget, txgsafe, timelimit))
+	    != NULL) {
 		list_remove(&tm->tm_head, ts);
 		avl_remove(&tm->tm_queued_frees, ts);
 		avl_add(&tm->tm_inflight_frees, ts);
-		zio_nowait(zio_trim(zio, spa, vd, ts->ts_start,
-		    ts->ts_end - ts->ts_start));
+		size = ts->ts_end - ts->ts_start;
+		zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, size));
+		TRIM_MAP_SDEC(tm, size);
+		TRIM_MAP_QDEC(tm);
 	}
 	mutex_exit(&tm->tm_lock);
 }
@@ -463,7 +524,7 @@ trim_map_commit(spa_t *spa, zio_t *zio, 
 {
 	int c;
 
-	if (vd == NULL || spa_last_synced_txg(spa) <= trim_txg_limit)
+	if (vd == NULL)
 		return;
 
 	if (vd->vdev_ops->vdev_op_leaf) {
@@ -509,7 +570,9 @@ trim_thread(void *arg)
 			mutex_exit(&spa->spa_trim_lock);
 			thread_exit();
 		}
-		cv_wait(&spa->spa_trim_cv, &spa->spa_trim_lock);
+
+		(void) cv_timedwait(&spa->spa_trim_cv, &spa->spa_trim_lock,
+		    hz * trim_max_interval);
 		mutex_exit(&spa->spa_trim_lock);
 
 		zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);


More information about the svn-src-all mailing list