svn commit: r252140 - in stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs: . sys

Xin LI delphij at FreeBSD.org
Mon Jun 24 05:00:33 UTC 2013


Author: delphij
Date: Mon Jun 24 05:00:31 2013
New Revision: 252140
URL: http://svnweb.freebsd.org/changeset/base/252140

Log:
  MFC r251478: MFV r251474:
  
   * Illumos zfs issue #3137 L2ARC compression
  
  Whether or not to compress buffers entering the L2ARC is
  controlled by "compression" setting on the dataset, when
  compression is not "off", L2ARC compression is enabled.
  
  The compress method is always LZ4 for L2ARC when enabled
  because it works best for the scenario.

Modified:
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
  stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
Directory Properties:
  stable/9/sys/   (props changed)
  stable/9/sys/cddl/contrib/opensolaris/   (props changed)

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Mon Jun 24 02:24:22 2013	(r252139)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c	Mon Jun 24 05:00:31 2013	(r252140)
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 /*
@@ -120,6 +121,7 @@
 
 #include <sys/spa.h>
 #include <sys/zio.h>
+#include <sys/zio_compress.h>
 #include <sys/zfs_context.h>
 #include <sys/arc.h>
 #include <sys/refcount.h>
@@ -322,7 +324,11 @@ typedef struct arc_stats {
 	kstat_named_t arcstat_l2_cksum_bad;
 	kstat_named_t arcstat_l2_io_error;
 	kstat_named_t arcstat_l2_size;
+	kstat_named_t arcstat_l2_asize;
 	kstat_named_t arcstat_l2_hdr_size;
+	kstat_named_t arcstat_l2_compress_successes;
+	kstat_named_t arcstat_l2_compress_zeros;
+	kstat_named_t arcstat_l2_compress_failures;
 	kstat_named_t arcstat_l2_write_trylock_fail;
 	kstat_named_t arcstat_l2_write_passed_headroom;
 	kstat_named_t arcstat_l2_write_spa_mismatch;
@@ -395,7 +401,11 @@ static arc_stats_t arc_stats = {
 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
 	{ "l2_size",			KSTAT_DATA_UINT64 },
+	{ "l2_asize",			KSTAT_DATA_UINT64 },
 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
+	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
+	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
+	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
 	{ "l2_write_trylock_fail",	KSTAT_DATA_UINT64 },
 	{ "l2_write_passed_headroom",	KSTAT_DATA_UINT64 },
 	{ "l2_write_spa_mismatch",	KSTAT_DATA_UINT64 },
@@ -474,6 +484,9 @@ static arc_state_t	*arc_l2c_only;
 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
 
+#define	L2ARC_IS_VALID_COMPRESS(_c_) \
+	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
+
 static int		arc_no_grow;	/* Don't try to grow cache size */
 static uint64_t		arc_tempreserve;
 static uint64_t		arc_loaned_bytes;
@@ -636,7 +649,12 @@ uint64_t zfs_crc64_table[256];
  */
 
 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
-#define	L2ARC_HEADROOM		2		/* num of writes */
+#define	L2ARC_HEADROOM		2			/* num of writes */
+/*
+ * If we discover during ARC scan any buffers to be compressed, we boost
+ * our headroom for the next scanning cycle by this percentage multiple.
+ */
+#define	L2ARC_HEADROOM_BOOST	200
 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
 
@@ -649,6 +667,7 @@ uint64_t zfs_crc64_table[256];
 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
+uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
 boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
@@ -722,8 +741,6 @@ typedef struct l2arc_dev {
 	vdev_t			*l2ad_vdev;	/* vdev */
 	spa_t			*l2ad_spa;	/* spa */
 	uint64_t		l2ad_hand;	/* next write location */
-	uint64_t		l2ad_write;	/* desired write size, bytes */
-	uint64_t		l2ad_boost;	/* warmup write boost, bytes */
 	uint64_t		l2ad_start;	/* first addr on device */
 	uint64_t		l2ad_end;	/* last addr on device */
 	uint64_t		l2ad_evict;	/* last addr eviction reached */
@@ -744,11 +761,12 @@ static kmutex_t l2arc_free_on_write_mtx;
 static uint64_t l2arc_ndev;			/* number of devices */
 
 typedef struct l2arc_read_callback {
-	arc_buf_t	*l2rcb_buf;		/* read buffer */
-	spa_t		*l2rcb_spa;		/* spa */
-	blkptr_t	l2rcb_bp;		/* original blkptr */
-	zbookmark_t	l2rcb_zb;		/* original bookmark */
-	int		l2rcb_flags;		/* original flags */
+	arc_buf_t		*l2rcb_buf;		/* read buffer */
+	spa_t			*l2rcb_spa;		/* spa */
+	blkptr_t		l2rcb_bp;		/* original blkptr */
+	zbookmark_t		l2rcb_zb;		/* original bookmark */
+	int			l2rcb_flags;		/* original flags */
+	enum zio_compress	l2rcb_compress;		/* applied compress */
 } l2arc_read_callback_t;
 
 typedef struct l2arc_write_callback {
@@ -758,8 +776,14 @@ typedef struct l2arc_write_callback {
 
 struct l2arc_buf_hdr {
 	/* protected by arc_buf_hdr  mutex */
-	l2arc_dev_t	*b_dev;			/* L2ARC device */
-	uint64_t	b_daddr;		/* disk address, offset byte */
+	l2arc_dev_t		*b_dev;		/* L2ARC device */
+	uint64_t		b_daddr;	/* disk address, offset byte */
+	/* compression applied to buffer data */
+	enum zio_compress	b_compress;
+	/* real alloc'd buffer size depending on b_compress applied */
+	int			b_asize;
+	/* temporary buffer holder for in-flight compressed data */
+	void			*b_tmp_cdata;
 };
 
 typedef struct l2arc_data_free {
@@ -778,6 +802,11 @@ static void l2arc_read_done(zio_t *zio);
 static void l2arc_hdr_stat_add(void);
 static void l2arc_hdr_stat_remove(void);
 
+static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
+static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
+    enum zio_compress c);
+static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
+
 static uint64_t
 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
 {
@@ -1696,6 +1725,7 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
 			    hdr->b_size, 0);
 			list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
 			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+			ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
 			kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
 			if (hdr->b_state == arc_l2c_only)
 				l2arc_hdr_stat_remove();
@@ -3118,6 +3148,8 @@ top:
 		arc_access(hdr, hash_lock);
 		if (*arc_flags & ARC_L2CACHE)
 			hdr->b_flags |= ARC_L2CACHE;
+		if (*arc_flags & ARC_L2COMPRESS)
+			hdr->b_flags |= ARC_L2COMPRESS;
 		mutex_exit(hash_lock);
 		ARCSTAT_BUMP(arcstat_hits);
 		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
@@ -3158,6 +3190,8 @@ top:
 			}
 			if (*arc_flags & ARC_L2CACHE)
 				hdr->b_flags |= ARC_L2CACHE;
+			if (*arc_flags & ARC_L2COMPRESS)
+				hdr->b_flags |= ARC_L2COMPRESS;
 			if (BP_GET_LEVEL(bp) > 0)
 				hdr->b_flags |= ARC_INDIRECT;
 		} else {
@@ -3174,6 +3208,8 @@ top:
 				add_reference(hdr, hash_lock, private);
 			if (*arc_flags & ARC_L2CACHE)
 				hdr->b_flags |= ARC_L2CACHE;
+			if (*arc_flags & ARC_L2COMPRESS)
+				hdr->b_flags |= ARC_L2COMPRESS;
 			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
 			buf->b_hdr = hdr;
 			buf->b_data = NULL;
@@ -3247,6 +3283,7 @@ top:
 				cb->l2rcb_bp = *bp;
 				cb->l2rcb_zb = *zb;
 				cb->l2rcb_flags = zio_flags;
+				cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
 
 				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
 				    addr + size < vd->vdev_psize -
@@ -3255,16 +3292,31 @@ top:
 				/*
 				 * l2arc read.  The SCL_L2ARC lock will be
 				 * released by l2arc_read_done().
+				 * Issue a null zio if the underlying buffer
+				 * was squashed to zero size by compression.
 				 */
-				rzio = zio_read_phys(pio, vd, addr, size,
-				    buf->b_data, ZIO_CHECKSUM_OFF,
-				    l2arc_read_done, cb, priority, zio_flags |
-				    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
-				    ZIO_FLAG_DONT_PROPAGATE |
-				    ZIO_FLAG_DONT_RETRY, B_FALSE);
+				if (hdr->b_l2hdr->b_compress ==
+				    ZIO_COMPRESS_EMPTY) {
+					rzio = zio_null(pio, spa, vd,
+					    l2arc_read_done, cb,
+					    zio_flags | ZIO_FLAG_DONT_CACHE |
+					    ZIO_FLAG_CANFAIL |
+					    ZIO_FLAG_DONT_PROPAGATE |
+					    ZIO_FLAG_DONT_RETRY);
+				} else {
+					rzio = zio_read_phys(pio, vd, addr,
+					    hdr->b_l2hdr->b_asize,
+					    buf->b_data, ZIO_CHECKSUM_OFF,
+					    l2arc_read_done, cb, priority,
+					    zio_flags | ZIO_FLAG_DONT_CACHE |
+					    ZIO_FLAG_CANFAIL |
+					    ZIO_FLAG_DONT_PROPAGATE |
+					    ZIO_FLAG_DONT_RETRY, B_FALSE);
+				}
 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
 				    zio_t *, rzio);
-				ARCSTAT_INCR(arcstat_l2_read_bytes, size);
+				ARCSTAT_INCR(arcstat_l2_read_bytes,
+				    hdr->b_l2hdr->b_asize);
 
 				if (*arc_flags & ARC_NOWAIT) {
 					zio_nowait(rzio);
@@ -3531,6 +3583,7 @@ arc_release(arc_buf_t *buf, void *tag)
 	buf->b_private = NULL;
 
 	if (l2hdr) {
+		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
 		trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
 		    hdr->b_size, 0);
 		list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
@@ -3682,9 +3735,9 @@ arc_write_done(zio_t *zio)
 
 zio_t *
 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
-    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
-    arc_done_func_t *ready, arc_done_func_t *done, void *private,
-    int priority, int zio_flags, const zbookmark_t *zb)
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
+    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
+    void *private, int priority, int zio_flags, const zbookmark_t *zb)
 {
 	arc_buf_hdr_t *hdr = buf->b_hdr;
 	arc_write_callback_t *callback;
@@ -3697,6 +3750,8 @@ arc_write(zio_t *pio, spa_t *spa, uint64
 	ASSERT(hdr->b_acb == NULL);
 	if (l2arc)
 		hdr->b_flags |= ARC_L2CACHE;
+	if (l2arc_compress)
+		hdr->b_flags |= ARC_L2COMPRESS;
 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
 	callback->awcb_ready = ready;
 	callback->awcb_done = done;
@@ -4147,8 +4202,12 @@ arc_fini(void)
  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
  * It does this by periodically scanning buffers from the eviction-end of
  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
- * not already there.  It scans until a headroom of buffers is satisfied,
- * which itself is a buffer for ARC eviction.  The thread that does this is
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. If a compressible buffer is
+ * found during scanning and selected for writing to an L2ARC device, we
+ * temporarily boost scanning headroom during the next scan cycle to make
+ * sure we adapt to compression effects (which might significantly reduce
+ * the data volume we write to L2ARC). The thread that does this is
  * l2arc_feed_thread(), illustrated below; example sizes are included to
  * provide a better sense of ratio than this diagram:
  *
@@ -4213,6 +4272,11 @@ arc_fini(void)
  *	l2arc_write_boost	extra write bytes during device warmup
  *	l2arc_noprefetch	skip caching prefetched buffers
  *	l2arc_headroom		number of max device writes to precache
+ *	l2arc_headroom_boost	when we find compressed buffers during ARC
+ *				scanning, we multiply headroom by this
+ *				percentage factor for the next scan cycle,
+ *				since more compressed buffers are likely to
+ *				be present
  *	l2arc_feed_secs		seconds between L2ARC writing
  *
  * Tunables may be removed or added as future performance improvements are
@@ -4259,14 +4323,24 @@ l2arc_write_eligible(uint64_t spa_guid, 
 }
 
 static uint64_t
-l2arc_write_size(l2arc_dev_t *dev)
+l2arc_write_size(void)
 {
 	uint64_t size;
 
-	size = dev->l2ad_write;
+	/*
+	 * Make sure our globals have meaningful values in case the user
+	 * altered them.
+	 */
+	size = l2arc_write_max;
+	if (size == 0) {
+		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
+		    "be greater than zero, resetting it to the default (%d)",
+		    L2ARC_WRITE_SIZE);
+		size = l2arc_write_max = L2ARC_WRITE_SIZE;
+	}
 
 	if (arc_warm == B_FALSE)
-		size += dev->l2ad_boost;
+		size += l2arc_write_boost;
 
 	return (size);
 
@@ -4440,12 +4514,20 @@ l2arc_write_done(zio_t *zio)
 			continue;
 		}
 
+		abl2 = ab->b_l2hdr;
+
+		/*
+		 * Release the temporary compressed buffer as soon as possible.
+		 */
+		if (abl2->b_compress != ZIO_COMPRESS_OFF)
+			l2arc_release_cdata_buf(ab);
+
 		if (zio->io_error != 0) {
 			/*
 			 * Error - drop L2ARC entry.
 			 */
 			list_remove(buflist, ab);
-			abl2 = ab->b_l2hdr;
+			ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
 			ab->b_l2hdr = NULL;
 			trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
 			    ab->b_size, 0);
@@ -4500,6 +4582,13 @@ l2arc_read_done(zio_t *zio)
 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
 
 	/*
+	 * If the buffer was compressed, decompress it first.
+	 */
+	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
+		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
+	ASSERT(zio->io_data != NULL);
+
+	/*
 	 * Check this survived the L2ARC journey.
 	 */
 	equal = arc_cksum_equal(buf);
@@ -4695,6 +4784,7 @@ top:
 			 */
 			if (ab->b_l2hdr != NULL) {
 				abl2 = ab->b_l2hdr;
+				ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
 				ab->b_l2hdr = NULL;
 				kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
 				ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
@@ -4720,38 +4810,55 @@ top:
  *
  * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
  * for reading until they have completed writing.
+ * The headroom_boost is an in-out parameter used to maintain headroom boost
+ * state between calls to this function.
+ *
+ * Returns the number of bytes actually written (which may be smaller than
+ * the delta by which the device hand has changed due to alignment).
  */
 static uint64_t
-l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
+    boolean_t *headroom_boost)
 {
 	arc_buf_hdr_t *ab, *ab_prev, *head;
-	l2arc_buf_hdr_t *hdrl2;
 	list_t *list;
-	uint64_t passed_sz, write_sz, buf_sz, headroom;
+	uint64_t write_asize, write_psize, write_sz, headroom,
+	    buf_compress_minsz;
 	void *buf_data;
-	kmutex_t *hash_lock, *list_lock;
-	boolean_t have_lock, full;
+	kmutex_t *list_lock;
+	boolean_t full;
 	l2arc_write_callback_t *cb;
 	zio_t *pio, *wzio;
 	uint64_t guid = spa_load_guid(spa);
+	const boolean_t do_headroom_boost = *headroom_boost;
 	int try;
 
 	ASSERT(dev->l2ad_vdev != NULL);
 
+	/* Lower the flag now, we might want to raise it again later. */
+	*headroom_boost = B_FALSE;
+
 	pio = NULL;
-	write_sz = 0;
+	write_sz = write_asize = write_psize = 0;
 	full = B_FALSE;
 	head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
 	head->b_flags |= ARC_L2_WRITE_HEAD;
 
 	ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
 	/*
+	 * We will want to try to compress buffers that are at least 2x the
+	 * device sector size.
+	 */
+	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
+
+	/*
 	 * Copy buffers for L2ARC writing.
 	 */
 	mutex_enter(&l2arc_buflist_mtx);
 	for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
+		uint64_t passed_sz = 0;
+
 		list = l2arc_list_locked(try, &list_lock);
-		passed_sz = 0;
 		ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
 
 		/*
@@ -4760,7 +4867,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 		 * Until the ARC is warm and starts to evict, read from the
 		 * head of the ARC lists rather than the tail.
 		 */
-		headroom = target_sz * l2arc_headroom;
 		if (arc_warm == B_FALSE)
 			ab = list_head(list);
 		else
@@ -4768,7 +4874,15 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 		if (ab == NULL)
 			ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
 
+		headroom = target_sz * l2arc_headroom;
+		if (do_headroom_boost)
+			headroom = (headroom * l2arc_headroom_boost) / 100;
+
 		for (; ab; ab = ab_prev) {
+			l2arc_buf_hdr_t *l2hdr;
+			kmutex_t *hash_lock;
+			uint64_t buf_sz;
+
 			if (arc_warm == B_FALSE)
 				ab_prev = list_next(list, ab);
 			else
@@ -4776,8 +4890,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 			ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
 
 			hash_lock = HDR_LOCK(ab);
-			have_lock = MUTEX_HELD(hash_lock);
-			if (!have_lock && !mutex_tryenter(hash_lock)) {
+			if (!mutex_tryenter(hash_lock)) {
 				ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
 				/*
 				 * Skip this buffer rather than waiting.
@@ -4827,15 +4940,26 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 			/*
 			 * Create and add a new L2ARC header.
 			 */
-			hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
-			hdrl2->b_dev = dev;
-			hdrl2->b_daddr = dev->l2ad_hand;
-
+			l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+			l2hdr->b_dev = dev;
 			ab->b_flags |= ARC_L2_WRITING;
-			ab->b_l2hdr = hdrl2;
-			list_insert_head(dev->l2ad_buflist, ab);
-			buf_data = ab->b_buf->b_data;
+
+			/*
+			 * Temporarily stash the data buffer in b_tmp_cdata.
+			 * The subsequent write step will pick it up from
+			 * there. This is because can't access ab->b_buf
+			 * without holding the hash_lock, which we in turn
+			 * can't access without holding the ARC list locks
+			 * (which we want to avoid during compression/writing).
+			 */
+			l2hdr->b_compress = ZIO_COMPRESS_OFF;
+			l2hdr->b_asize = ab->b_size;
+			l2hdr->b_tmp_cdata = ab->b_buf->b_data;
+
 			buf_sz = ab->b_size;
+			ab->b_l2hdr = l2hdr;
+
+			list_insert_head(dev->l2ad_buflist, ab);
 
 			/*
 			 * Compute and store the buffer cksum before
@@ -4846,6 +4970,64 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 
 			mutex_exit(hash_lock);
 
+			write_sz += buf_sz;
+		}
+
+		mutex_exit(list_lock);
+
+		if (full == B_TRUE)
+			break;
+	}
+
+	/* No buffers selected for writing? */
+	if (pio == NULL) {
+		ASSERT0(write_sz);
+		mutex_exit(&l2arc_buflist_mtx);
+		kmem_cache_free(hdr_cache, head);
+		return (0);
+	}
+
+	/*
+	 * Now start writing the buffers. We're starting at the write head
+	 * and work backwards, retracing the course of the buffer selector
+	 * loop above.
+	 */
+	for (ab = list_prev(dev->l2ad_buflist, head); ab;
+	    ab = list_prev(dev->l2ad_buflist, ab)) {
+		l2arc_buf_hdr_t *l2hdr;
+		uint64_t buf_sz;
+
+		/*
+		 * We shouldn't need to lock the buffer here, since we flagged
+		 * it as ARC_L2_WRITING in the previous step, but we must take
+		 * care to only access its L2 cache parameters. In particular,
+		 * ab->b_buf may be invalid by now due to ARC eviction.
+		 */
+		l2hdr = ab->b_l2hdr;
+		l2hdr->b_daddr = dev->l2ad_hand;
+
+		if ((ab->b_flags & ARC_L2COMPRESS) &&
+		    l2hdr->b_asize >= buf_compress_minsz) {
+			if (l2arc_compress_buf(l2hdr)) {
+				/*
+				 * If compression succeeded, enable headroom
+				 * boost on the next scan cycle.
+				 */
+				*headroom_boost = B_TRUE;
+			}
+		}
+
+		/*
+		 * Pick up the buffer data we had previously stashed away
+		 * (and now potentially also compressed).
+		 */
+		buf_data = l2hdr->b_tmp_cdata;
+		buf_sz = l2hdr->b_asize;
+
+		/* Compression may have squashed the buffer to zero length. */
+		if (buf_sz != 0) {
+			uint64_t buf_p_sz;
+
 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
 			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
 			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
@@ -4855,33 +5037,24 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 			    zio_t *, wzio);
 			(void) zio_nowait(wzio);
 
+			write_asize += buf_sz;
 			/*
 			 * Keep the clock hand suitably device-aligned.
 			 */
-			buf_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
-
-			write_sz += buf_sz;
-			dev->l2ad_hand += buf_sz;
+			buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
+			write_psize += buf_p_sz;
+			dev->l2ad_hand += buf_p_sz;
 		}
-
-		mutex_exit(list_lock);
-
-		if (full == B_TRUE)
-			break;
 	}
-	mutex_exit(&l2arc_buflist_mtx);
 
-	if (pio == NULL) {
-		ASSERT0(write_sz);
-		kmem_cache_free(hdr_cache, head);
-		return (0);
-	}
+	mutex_exit(&l2arc_buflist_mtx);
 
-	ASSERT3U(write_sz, <=, target_sz);
+	ASSERT3U(write_asize, <=, target_sz);
 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
-	ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz);
+	ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
 	ARCSTAT_INCR(arcstat_l2_size, write_sz);
-	vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0);
+	ARCSTAT_INCR(arcstat_l2_asize, write_asize);
+	vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
 
 	/*
 	 * Bump device hand to the device start if it is approaching the end.
@@ -4899,7 +5072,153 @@ l2arc_write_buffers(spa_t *spa, l2arc_de
 	(void) zio_wait(pio);
 	dev->l2ad_writing = B_FALSE;
 
-	return (write_sz);
+	return (write_asize);
+}
+
+/*
+ * Compresses an L2ARC buffer.
+ * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
+ * size in l2hdr->b_asize. This routine tries to compress the data and
+ * depending on the compression result there are three possible outcomes:
+ * *) The buffer was incompressible. The original l2hdr contents were left
+ *    untouched and are ready for writing to an L2 device.
+ * *) The buffer was all-zeros, so there is no need to write it to an L2
+ *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
+ *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
+ * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
+ *    data buffer which holds the compressed data to be written, and b_asize
+ *    tells us how much data there is. b_compress is set to the appropriate
+ *    compression algorithm. Once writing is done, invoke
+ *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
+ *
+ * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
+ * buffer was incompressible).
+ */
+static boolean_t
+l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
+{
+	void *cdata;
+	size_t csize, len;
+
+	ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
+	ASSERT(l2hdr->b_tmp_cdata != NULL);
+
+	len = l2hdr->b_asize;
+	cdata = zio_data_buf_alloc(len);
+	csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
+	    cdata, l2hdr->b_asize);
+
+	if (csize == 0) {
+		/* zero block, indicate that there's nothing to write */
+		zio_data_buf_free(cdata, len);
+		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
+		l2hdr->b_asize = 0;
+		l2hdr->b_tmp_cdata = NULL;
+		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
+		return (B_TRUE);
+	} else if (csize > 0 && csize < len) {
+		/*
+		 * Compression succeeded, we'll keep the cdata around for
+		 * writing and release it afterwards.
+		 */
+		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
+		l2hdr->b_asize = csize;
+		l2hdr->b_tmp_cdata = cdata;
+		ARCSTAT_BUMP(arcstat_l2_compress_successes);
+		return (B_TRUE);
+	} else {
+		/*
+		 * Compression failed, release the compressed buffer.
+		 * l2hdr will be left unmodified.
+		 */
+		zio_data_buf_free(cdata, len);
+		ARCSTAT_BUMP(arcstat_l2_compress_failures);
+		return (B_FALSE);
+	}
+}
+
+/*
+ * Decompresses a zio read back from an l2arc device. On success, the
+ * underlying zio's io_data buffer is overwritten by the uncompressed
+ * version. On decompression error (corrupt compressed stream), the
+ * zio->io_error value is set to signal an I/O error.
+ *
+ * Please note that the compressed data stream is not checksummed, so
+ * if the underlying device is experiencing data corruption, we may feed
+ * corrupt data to the decompressor, so the decompressor needs to be
+ * able to handle this situation (LZ4 does).
+ */
+static void
+l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
+{
+	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
+
+	if (zio->io_error != 0) {
+		/*
+		 * An io error has occured, just restore the original io
+		 * size in preparation for a main pool read.
+		 */
+		zio->io_orig_size = zio->io_size = hdr->b_size;
+		return;
+	}
+
+	if (c == ZIO_COMPRESS_EMPTY) {
+		/*
+		 * An empty buffer results in a null zio, which means we
+		 * need to fill its io_data after we're done restoring the
+		 * buffer's contents.
+		 */
+		ASSERT(hdr->b_buf != NULL);
+		bzero(hdr->b_buf->b_data, hdr->b_size);
+		zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
+	} else {
+		ASSERT(zio->io_data != NULL);
+		/*
+		 * We copy the compressed data from the start of the arc buffer
+		 * (the zio_read will have pulled in only what we need, the
+		 * rest is garbage which we will overwrite at decompression)
+		 * and then decompress back to the ARC data buffer. This way we
+		 * can minimize copying by simply decompressing back over the
+		 * original compressed data (rather than decompressing to an
+		 * aux buffer and then copying back the uncompressed buffer,
+		 * which is likely to be much larger).
+		 */
+		uint64_t csize;
+		void *cdata;
+
+		csize = zio->io_size;
+		cdata = zio_data_buf_alloc(csize);
+		bcopy(zio->io_data, cdata, csize);
+		if (zio_decompress_data(c, cdata, zio->io_data, csize,
+		    hdr->b_size) != 0)
+			zio->io_error = EIO;
+		zio_data_buf_free(cdata, csize);
+	}
+
+	/* Restore the expected uncompressed IO size. */
+	zio->io_orig_size = zio->io_size = hdr->b_size;
+}
+
+/*
+ * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
+ * This buffer serves as a temporary holder of compressed data while
+ * the buffer entry is being written to an l2arc device. Once that is
+ * done, we can dispose of it.
+ */
+static void
+l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
+{
+	l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
+
+	if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
+		/*
+		 * If the data was compressed, then we've allocated a
+		 * temporary buffer for it, so now we need to release it.
+		 */
+		ASSERT(l2hdr->b_tmp_cdata != NULL);
+		zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
+	}
+	l2hdr->b_tmp_cdata = NULL;
 }
 
 /*
@@ -4914,6 +5233,7 @@ l2arc_feed_thread(void *dummy __unused)
 	spa_t *spa;
 	uint64_t size, wrote;
 	clock_t begin, next = ddi_get_lbolt();
+	boolean_t headroom_boost = B_FALSE;
 
 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
 
@@ -4974,7 +5294,7 @@ l2arc_feed_thread(void *dummy __unused)
 
 		ARCSTAT_BUMP(arcstat_l2_feeds);
 
-		size = l2arc_write_size(dev);
+		size = l2arc_write_size();
 
 		/*
 		 * Evict L2ARC buffers that will be overwritten.
@@ -4984,7 +5304,7 @@ l2arc_feed_thread(void *dummy __unused)
 		/*
 		 * Write ARC buffers.
 		 */
-		wrote = l2arc_write_buffers(spa, dev, size);
+		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
 
 		/*
 		 * Calculate interval between writes.
@@ -5032,15 +5352,12 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd)
 	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
 	adddev->l2ad_spa = spa;
 	adddev->l2ad_vdev = vd;
-	adddev->l2ad_write = l2arc_write_max;
-	adddev->l2ad_boost = l2arc_write_boost;
 	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
 	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
 	adddev->l2ad_hand = adddev->l2ad_start;
 	adddev->l2ad_evict = adddev->l2ad_start;
 	adddev->l2ad_first = B_TRUE;
 	adddev->l2ad_writing = B_FALSE;
-	ASSERT3U(adddev->l2ad_write, >, 0);
 
 	/*
 	 * This is a list of all ARC buffers that are still valid on the

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Mon Jun 24 02:24:22 2013	(r252139)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	Mon Jun 24 05:00:31 2013	(r252140)
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -568,6 +569,8 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t
 
 	if (DBUF_IS_L2CACHEABLE(db))
 		aflags |= ARC_L2CACHE;
+	if (DBUF_IS_L2COMPRESSIBLE(db))
+		aflags |= ARC_L2COMPRESS;
 
 	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
@@ -2710,8 +2713,9 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_
 	} else {
 		ASSERT(arc_released(data));
 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
-		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp,
-		    dbuf_write_ready, dbuf_write_done, db,
-		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
+		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
+		    dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
+		    ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Mon Jun 24 02:24:22 2013	(r252139)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c	Mon Jun 24 05:00:31 2013	(r252140)
@@ -23,6 +23,8 @@
  * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
+/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
+
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
@@ -1514,9 +1516,9 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_s
 	dsa->dsa_tx = NULL;
 
 	zio_nowait(arc_write(pio, os->os_spa, txg,
-	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
-	    dmu_sync_ready, dmu_sync_done, dsa,
-	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+	    bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
+	    DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
+	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 
 	return (0);
 }

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Mon Jun 24 02:24:22 2013	(r252139)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c	Mon Jun 24 05:00:31 2013	(r252140)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -276,6 +277,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dat
 
 		if (DMU_OS_IS_L2CACHEABLE(os))
 			aflags |= ARC_L2CACHE;
+		if (DMU_OS_IS_L2COMPRESSIBLE(os))
+			aflags |= ARC_L2COMPRESS;
 
 		dprintf_bp(os->os_rootbp, "reading %s", "");
 		err = arc_read(NULL, spa, os->os_rootbp,
@@ -991,9 +994,10 @@ dmu_objset_sync(objset_t *os, zio_t *pio
 	dmu_write_policy(os, NULL, 0, 0, &zp);
 
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
-	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp,
-	    dmu_objset_write_ready, dmu_objset_write_done, os,
-	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+	    os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
+	    DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
+	    dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
+	    ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*
 	 * Sync special dnodes - the parent IO for the sync is the root block

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h	Mon Jun 24 02:24:22 2013	(r252139)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h	Mon Jun 24 05:00:31 2013	(r252140)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 #ifndef	_SYS_ARC_H
@@ -67,6 +68,7 @@ typedef enum arc_buf_contents {
 #define	ARC_PREFETCH	(1 << 3)	/* I/O is a prefetch */
 #define	ARC_CACHED	(1 << 4)	/* I/O was already in cache */
 #define	ARC_L2CACHE	(1 << 5)	/* cache in L2ARC */
+#define	ARC_L2COMPRESS	(1 << 6)	/* compress in L2ARC */
 
 /*
  * The following breakdows of arc_size exist for kstat only.
@@ -105,9 +107,9 @@ int arc_read(zio_t *pio, spa_t *spa, con
     arc_done_func_t *done, void *priv, int priority, int flags,
     uint32_t *arc_flags, const zbookmark_t *zb);
 zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
-    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
-    arc_done_func_t *ready, arc_done_func_t *done, void *priv,
-    int priority, int zio_flags, const zbookmark_t *zb);
+    blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
+    const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
+    void *priv, int priority, int zio_flags, const zbookmark_t *zb);
 
 void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv);
 int arc_buf_evict(arc_buf_t *buf);

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	Mon Jun 24 02:24:22 2013	(r252139)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h	Mon Jun 24 05:00:31 2013	(r252140)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 #ifndef	_SYS_DBUF_H
@@ -324,6 +325,9 @@ boolean_t dbuf_is_metadata(dmu_buf_impl_
 	(dbuf_is_metadata(_db) &&					\
 	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 
+#define	DBUF_IS_L2COMPRESSIBLE(_db)					\
+	((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF)
+
 #ifdef ZFS_DEBUG
 
 /*

Modified: stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
==============================================================================
--- stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	Mon Jun 24 02:24:22 2013	(r252139)
+++ stable/9/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h	Mon Jun 24 05:00:31 2013	(r252140)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
@@ -129,6 +130,8 @@ struct objset {
 	((os)->os_secondary_cache == ZFS_CACHE_ALL ||		\
 	(os)->os_secondary_cache == ZFS_CACHE_METADATA)
 
+#define	DMU_OS_IS_L2COMPRESSIBLE(os)	((os)->os_compress != ZIO_COMPRESS_OFF)
+
 /* called from zpl */
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,


More information about the svn-src-all mailing list