svn commit: r308782 - in head: cddl/contrib/opensolaris/cmd/ztest sys/cddl/contrib/opensolaris/uts/common/fs/zfs sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys
Alexander Motin
mav at FreeBSD.org
Thu Nov 17 22:11:48 UTC 2016
It is in the OpenZFS review queue now:
https://github.com/openzfs/openzfs/pull/219 — feel free to comment there to
help speed up the process.
On 17.11.2016 13:43, Steven Hartland wrote:
> Is this something that should be upstreamed?
>
> On 17/11/2016 21:01, Alexander Motin wrote:
>> Author: mav
>> Date: Thu Nov 17 21:01:27 2016
>> New Revision: 308782
>> URL: https://svnweb.freebsd.org/changeset/base/308782
>>
>> Log:
>> After some ZIL changes 6 years ago zil_slog_limit got partially broken
>> due to zl_itx_list_sz not updated when async itx'es upgraded to sync.
>> Actually because of other changes about that time zl_itx_list_sz is not
>> really required to implement the functionality, so this patch removes
>> some unneeded broken code and variables.
>>
>> Original idea of zil_slog_limit was to reduce chance of SLOG abuse by
>> single heavy logger, that increased latency for other (more latency critical)
>> loggers, by pushing heavy log out into the main pool instead of SLOG. Beside
>> huge latency increase for heavy writers, this implementation caused double
>> write of all data, since the log records were explicitly prepared for SLOG.
>> Since we now have I/O scheduler, I've found it can be much more efficient
>> to reduce priority of heavy logger SLOG writes from ZIO_PRIORITY_SYNC_WRITE
>> to ZIO_PRIORITY_ASYNC_WRITE, while still leave them on SLOG.
>>
>> Existing ZIL implementation had problem with space efficiency when it
>> has to write large chunks of data into log blocks of limited size. In some
>> cases efficiency dropped to almost as low as 50%. In case of ZIL stored on
>> spinning rust, that also reduced log write speed in half, since head had to
>> uselessly fly over allocated but not written areas. This change improves
>> the situation by offloading problematic operations from z*_log_write() to
>> zil_lwb_commit(), which knows real situation of log blocks allocation and
>> can split large requests into pieces much more efficiently. Also as side
>> effect it removes one of two data copy operations done by ZIL code in the
>> WR_COPIED case.
>>
>> While there, untangle and unify code of z*_log_write() functions.
>> Also zfs_log_write(), similarly to zvol_log_write(), can now handle writes crossing
>> block boundary, that may also improve efficiency if ZPL is made to do that.
>>
>> Sponsored by: iXsystems, Inc.
>>
>> Modified:
>> head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
>> head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
>> head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
>> head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
>> head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
>> head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
>> head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
>> head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
>>
>> Modified: head/cddl/contrib/opensolaris/cmd/ztest/ztest.c
>> ==============================================================================
>> --- head/cddl/contrib/opensolaris/cmd/ztest/ztest.c Thu Nov 17 20:44:51 2016 (r308781)
>> +++ head/cddl/contrib/opensolaris/cmd/ztest/ztest.c Thu Nov 17 21:01:27 2016 (r308782)
>> @@ -1371,7 +1371,6 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t
>> itx->itx_private = zd;
>> itx->itx_wr_state = write_state;
>> itx->itx_sync = (ztest_random(8) == 0);
>> - itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0);
>>
>> bcopy(&lr->lr_common + 1, &itx->itx_lr + 1,
>> sizeof (*lr) - sizeof (lr_t));
>>
>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
>> ==============================================================================
>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h Thu Nov 17 20:44:51 2016 (r308781)
>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h Thu Nov 17 21:01:27 2016 (r308782)
>> @@ -369,7 +369,6 @@ typedef struct itx {
>> void *itx_private; /* type-specific opaque data */
>> itx_wr_state_t itx_wr_state; /* write state */
>> uint8_t itx_sync; /* synchronous transaction */
>> - uint64_t itx_sod; /* record size on disk */
>> uint64_t itx_oid; /* object id */
>> lr_t itx_lr; /* common part of log record */
>> /* followed by type-specific part of lr_xx_t and its immediate data */
>>
>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
>> ==============================================================================
>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h Thu Nov 17 20:44:51 2016 (r308781)
>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h Thu Nov 17 21:01:27 2016 (r308782)
>> @@ -42,6 +42,7 @@ extern "C" {
>> typedef struct lwb {
>> zilog_t *lwb_zilog; /* back pointer to log struct */
>> blkptr_t lwb_blk; /* on disk address of this log blk */
>> + boolean_t lwb_slog; /* lwb_blk is on SLOG device */
>> int lwb_nused; /* # used bytes in buffer */
>> int lwb_sz; /* size of block and buffer */
>> char *lwb_buf; /* log write buffer */
>> @@ -62,7 +63,6 @@ typedef struct itxs {
>> typedef struct itxg {
>> kmutex_t itxg_lock; /* lock for this structure */
>> uint64_t itxg_txg; /* txg for this chain */
>> - uint64_t itxg_sod; /* total size on disk for this txg */
>> itxs_t *itxg_itxs; /* sync and async itxs */
>> } itxg_t;
>>
>> @@ -120,7 +120,6 @@ struct zilog {
>> kcondvar_t zl_cv_batch[2]; /* batch condition variables */
>> itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
>> list_t zl_itx_commit_list; /* itx list to be committed */
>> - uint64_t zl_itx_list_sz; /* total size of records on list */
>> uint64_t zl_cur_used; /* current commit log size used */
>> list_t zl_lwb_list; /* in-flight log write list */
>> kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */
>> @@ -142,6 +141,8 @@ typedef struct zil_bp_node {
>>
>> #define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
>> sizeof (lr_write_t))
>> +#define ZIL_MAX_COPIED_DATA \
>> + ((SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
>>
>> #ifdef __cplusplus
>> }
>>
>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
>> ==============================================================================
>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h Thu Nov 17 20:44:51 2016 (r308781)
>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h Thu Nov 17 21:01:27 2016 (r308782)
>> @@ -547,7 +547,7 @@ extern zio_t *zio_free_sync(zio_t *pio,
>> const blkptr_t *bp, uint64_t size, enum zio_flag flags);
>>
>> extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
>> - blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
>> + blkptr_t *old_bp, uint64_t size, boolean_t *slog);
>> extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
>> extern void zio_flush(zio_t *zio, vdev_t *vd);
>> extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
>>
>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
>> ==============================================================================
>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c Thu Nov 17 20:44:51 2016 (r308781)
>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c Thu Nov 17 21:01:27 2016 (r308782)
>> @@ -464,20 +464,17 @@ void
>> zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
>> znode_t *zp, offset_t off, ssize_t resid, int ioflag)
>> {
>> + uint32_t blocksize = zp->z_blksz;
>> itx_wr_state_t write_state;
>> - boolean_t slogging;
>> uintptr_t fsync_cnt;
>> - ssize_t immediate_write_sz;
>>
>> if (zil_replaying(zilog, tx) || zp->z_unlinked)
>> return;
>>
>> - immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>> - ? 0 : zfs_immediate_write_sz;
>> -
>> - slogging = spa_has_slogs(zilog->zl_spa) &&
>> - (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
>> - if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
>> + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>> + write_state = WR_INDIRECT;
>> + else if (!spa_has_slogs(zilog->zl_spa) &&
>> + resid >= zfs_immediate_write_sz)
>> write_state = WR_INDIRECT;
>> else if (ioflag & (FSYNC | FDSYNC))
>> write_state = WR_COPIED;
>> @@ -491,30 +488,26 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *
>> while (resid) {
>> itx_t *itx;
>> lr_write_t *lr;
>> - ssize_t len;
>> + itx_wr_state_t wr_state = write_state;
>> + ssize_t len = resid;
>>
>> - /*
>> - * If the write would overflow the largest block then split it.
>> - */
>> - if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
>> - len = SPA_OLD_MAXBLOCKSIZE >> 1;
>> - else
>> - len = resid;
>> + if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
>> + wr_state = WR_NEED_COPY;
>> + else if (wr_state == WR_INDIRECT)
>> + len = MIN(blocksize - P2PHASE(off, blocksize), resid);
>>
>> itx = zil_itx_create(txtype, sizeof (*lr) +
>> - (write_state == WR_COPIED ? len : 0));
>> + (wr_state == WR_COPIED ? len : 0));
>> lr = (lr_write_t *)&itx->itx_lr;
>> - if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
>> + if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
>> zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
>> zil_itx_destroy(itx);
>> itx = zil_itx_create(txtype, sizeof (*lr));
>> lr = (lr_write_t *)&itx->itx_lr;
>> - write_state = WR_NEED_COPY;
>> + wr_state = WR_NEED_COPY;
>> }
>>
>> - itx->itx_wr_state = write_state;
>> - if (write_state == WR_NEED_COPY)
>> - itx->itx_sod += len;
>> + itx->itx_wr_state = wr_state;
>> lr->lr_foid = zp->z_id;
>> lr->lr_offset = off;
>> lr->lr_length = len;
>>
>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
>> ==============================================================================
>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c Thu Nov 17 20:44:51 2016 (r308781)
>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c Thu Nov 17 21:01:27 2016 (r308782)
>> @@ -88,6 +88,15 @@ SYSCTL_DECL(_vfs_zfs_trim);
>> SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
>> "Enable ZFS TRIM");
>>
>> +/*
>> + * Limit SLOG write size per commit executed with synchronous priority.
>> + * Any writes above that executed with lower (asynchronous) priority to
>> + * limit potential SLOG device abuse by single active ZIL writer.
>> + */
>> +uint64_t zil_slog_limit = 768 * 1024;
>> +SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
>> + &zil_slog_limit, 0, "Maximal SLOG commit size with sync priority");
>> +
>> static kmem_cache_t *zil_lwb_cache;
>>
>> #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
>> @@ -447,13 +456,14 @@ zil_free_log_record(zilog_t *zilog, lr_t
>> }
>>
>> static lwb_t *
>> -zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
>> +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
>> {
>> lwb_t *lwb;
>>
>> lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
>> lwb->lwb_zilog = zilog;
>> lwb->lwb_blk = *bp;
>> + lwb->lwb_slog = slog;
>> lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
>> lwb->lwb_max_txg = txg;
>> lwb->lwb_zio = NULL;
>> @@ -516,6 +526,7 @@ zil_create(zilog_t *zilog)
>> dmu_tx_t *tx = NULL;
>> blkptr_t blk;
>> int error = 0;
>> + boolean_t slog = FALSE;
>>
>> /*
>> * Wait for any previous destroy to complete.
>> @@ -544,7 +555,7 @@ zil_create(zilog_t *zilog)
>> }
>>
>> error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
>> - ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
>> + ZIL_MIN_BLKSZ, &slog);
>>
>> if (error == 0)
>> zil_init_log_chain(zilog, &blk);
>> @@ -554,7 +565,7 @@ zil_create(zilog_t *zilog)
>> * Allocate a log write buffer (lwb) for the first log block.
>> */
>> if (error == 0)
>> - lwb = zil_alloc_lwb(zilog, &blk, txg);
>> + lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
>>
>> /*
>> * If we just allocated the first log block, commit our transaction
>> @@ -885,6 +896,7 @@ static void
>> zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
>> {
>> zbookmark_phys_t zb;
>> + zio_priority_t prio;
>>
>> SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
>> ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
>> @@ -895,9 +907,13 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t
>> ZIO_FLAG_CANFAIL);
>> }
>> if (lwb->lwb_zio == NULL) {
>> + if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
>> + prio = ZIO_PRIORITY_SYNC_WRITE;
>> + else
>> + prio = ZIO_PRIORITY_ASYNC_WRITE;
>> lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
>> 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
>> - zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE,
>> + zil_lwb_write_done, lwb, prio,
>> ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
>> }
>> }
>> @@ -917,18 +933,6 @@ uint64_t zil_block_buckets[] = {
>> };
>>
>> /*
>> - * Use the slog as long as the logbias is 'latency' and the current commit size
>> - * is less than the limit or the total list size is less than 2X the limit.
>> - * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
>> - */
>> -uint64_t zil_slog_limit = 1024 * 1024;
>> -SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
>> - &zil_slog_limit, 0, "Maximal commit size to use SLOG");
>> -#define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
>> - (((zilog)->zl_cur_used < zil_slog_limit) || \
>> - ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
>> -
>> -/*
>> * Start a log block write and advance to the next log block.
>> * Calls are serialized.
>> */
>> @@ -943,6 +947,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>> uint64_t txg;
>> uint64_t zil_blksz, wsz;
>> int i, error;
>> + boolean_t slog;
>>
>> if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
>> zilc = (zil_chain_t *)lwb->lwb_buf;
>> @@ -999,8 +1004,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>>
>> BP_ZERO(bp);
>> /* pass the old blkptr in order to spread log blocks across devs */
>> - error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
>> - USE_SLOG(zilog));
>> + error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
>> if (error == 0) {
>> ASSERT3U(bp->blk_birth, ==, txg);
>> bp->blk_cksum = lwb->lwb_blk.blk_cksum;
>> @@ -1009,7 +1013,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>> /*
>> * Allocate a new log write buffer (lwb).
>> */
>> - nlwb = zil_alloc_lwb(zilog, bp, txg);
>> + nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
>>
>> /* Record the block for later vdev flushing */
>> zil_add_block(zilog, &lwb->lwb_blk);
>> @@ -1046,12 +1050,13 @@ zil_lwb_write_start(zilog_t *zilog, lwb_
>> static lwb_t *
>> zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
>> {
>> - lr_t *lrc = &itx->itx_lr; /* common log record */
>> - lr_write_t *lrw = (lr_write_t *)lrc;
>> + lr_t *lrcb, *lrc = &itx->itx_lr; /* common log record */
>> + lr_write_t *lrwb, *lrw = (lr_write_t *)lrc;
>> char *lr_buf;
>> uint64_t txg = lrc->lrc_txg;
>> uint64_t reclen = lrc->lrc_reclen;
>> uint64_t dlen = 0;
>> + uint64_t dnow, lwb_sp;
>>
>> if (lwb == NULL)
>> return (NULL);
>> @@ -1068,25 +1073,30 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
>>
>> zil_lwb_write_init(zilog, lwb);
>>
>> +cont:
>> /*
>> * If this record won't fit in the current log block, start a new one.
>> + * For WR_NEED_COPY optimize layout for minimal number of chunks, but
>> + * try to keep wasted space withing reasonable range (12%).
>> */
>> - if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
>> + lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
>> + if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
>> + lwb_sp < ZIL_MAX_LOG_DATA / 8 && (dlen % ZIL_MAX_LOG_DATA == 0 ||
>> + lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
>> lwb = zil_lwb_write_start(zilog, lwb);
>> if (lwb == NULL)
>> return (NULL);
>> zil_lwb_write_init(zilog, lwb);
>> ASSERT(LWB_EMPTY(lwb));
>> - if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
>> - txg_wait_synced(zilog->zl_dmu_pool, txg);
>> - return (lwb);
>> - }
>> + lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
>> + ASSERT3U(reclen + MIN(dlen, sizeof(uint64_t)), <=, lwb_sp);
>> }
>>
>> + dnow = MIN(dlen, lwb_sp - reclen);
>> lr_buf = lwb->lwb_buf + lwb->lwb_nused;
>> bcopy(lrc, lr_buf, reclen);
>> - lrc = (lr_t *)lr_buf;
>> - lrw = (lr_write_t *)lrc;
>> + lrcb = (lr_t *)lr_buf;
>> + lrwb = (lr_write_t *)lrcb;
>>
>> /*
>> * If it's a write, fetch the data or get its blkptr as appropriate.
>> @@ -1098,16 +1108,19 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
>> char *dbuf;
>> int error;
>>
>> - if (dlen) {
>> - ASSERT(itx->itx_wr_state == WR_NEED_COPY);
>> + if (itx->itx_wr_state == WR_NEED_COPY) {
>> dbuf = lr_buf + reclen;
>> - lrw->lr_common.lrc_reclen += dlen;
>> + lrcb->lrc_reclen += dnow;
>> + if (lrwb->lr_length > dnow)
>> + lrwb->lr_length = dnow;
>> + lrw->lr_offset += dnow;
>> + lrw->lr_length -= dnow;
>> } else {
>> ASSERT(itx->itx_wr_state == WR_INDIRECT);
>> dbuf = NULL;
>> }
>> error = zilog->zl_get_data(
>> - itx->itx_private, lrw, dbuf, lwb->lwb_zio);
>> + itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
>> if (error == EIO) {
>> txg_wait_synced(zilog->zl_dmu_pool, txg);
>> return (lwb);
>> @@ -1126,12 +1139,18 @@ zil_lwb_commit(zilog_t *zilog, itx_t *it
>> * equal to the itx sequence number because not all transactions
>> * are synchronous, and sometimes spa_sync() gets there first.
>> */
>> - lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
>> - lwb->lwb_nused += reclen + dlen;
>> + lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
>> + lwb->lwb_nused += reclen + dnow;
>> lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
>> ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
>> ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
>>
>> + dlen -= dnow;
>> + if (dlen > 0) {
>> + zilog->zl_cur_used += reclen;
>> + goto cont;
>> + }
>> +
>> return (lwb);
>> }
>>
>> @@ -1145,7 +1164,6 @@ zil_itx_create(uint64_t txtype, size_t l
>> itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
>> itx->itx_lr.lrc_txtype = txtype;
>> itx->itx_lr.lrc_reclen = lrsize;
>> - itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
>> itx->itx_lr.lrc_seq = 0; /* defensive */
>> itx->itx_sync = B_TRUE; /* default is synchronous */
>>
>> @@ -1294,11 +1312,8 @@ zil_itx_assign(zilog_t *zilog, itx_t *it
>> * this itxg. Save the itxs for release below.
>> * This should be rare.
>> */
>> - atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
>> - itxg->itxg_sod = 0;
>> clean = itxg->itxg_itxs;
>> }
>> - ASSERT(itxg->itxg_sod == 0);
>> itxg->itxg_txg = txg;
>> itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
>>
>> @@ -1310,8 +1325,6 @@ zil_itx_assign(zilog_t *zilog, itx_t *it
>> }
>> if (itx->itx_sync) {
>> list_insert_tail(&itxs->i_sync_list, itx);
>> - atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
>> - itxg->itxg_sod += itx->itx_sod;
>> } else {
>> avl_tree_t *t = &itxs->i_async_tree;
>> uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
>> @@ -1359,8 +1372,6 @@ zil_clean(zilog_t *zilog, uint64_t synce
>> ASSERT3U(itxg->itxg_txg, <=, synced_txg);
>> ASSERT(itxg->itxg_txg != 0);
>> ASSERT(zilog->zl_clean_taskq != NULL);
>> - atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
>> - itxg->itxg_sod = 0;
>> clean_me = itxg->itxg_itxs;
>> itxg->itxg_itxs = NULL;
>> itxg->itxg_txg = 0;
>> @@ -1384,7 +1395,6 @@ zil_get_commit_list(zilog_t *zilog)
>> {
>> uint64_t otxg, txg;
>> list_t *commit_list = &zilog->zl_itx_commit_list;
>> - uint64_t push_sod = 0;
>>
>> if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
>> otxg = ZILTEST_TXG;
>> @@ -1401,12 +1411,9 @@ zil_get_commit_list(zilog_t *zilog)
>> }
>>
>> list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
>> - push_sod += itxg->itxg_sod;
>> - itxg->itxg_sod = 0;
>>
>> mutex_exit(&itxg->itxg_lock);
>> }
>> - atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
>> }
>>
>> /*
>>
>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
>> ==============================================================================
>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Thu Nov 17 20:44:51 2016 (r308781)
>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c Thu Nov 17 21:01:27 2016 (r308782)
>> @@ -2908,20 +2908,21 @@ zio_dva_unallocate(zio_t *zio, zio_gang_
>> */
>> int
>> zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
>> - uint64_t size, boolean_t use_slog)
>> + uint64_t size, boolean_t *slog)
>> {
>> int error = 1;
>>
>> ASSERT(txg > spa_syncing_txg(spa));
>>
>> - if (use_slog) {
>> - error = metaslab_alloc(spa, spa_log_class(spa), size,
>> - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
>> - }
>> -
>> - if (error) {
>> + error = metaslab_alloc(spa, spa_log_class(spa), size,
>> + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
>> + if (error == 0) {
>> + *slog = TRUE;
>> + } else {
>> error = metaslab_alloc(spa, spa_normal_class(spa), size,
>> new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
>> + if (error == 0)
>> + *slog = FALSE;
>> }
>>
>> if (error == 0) {
>>
>> Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
>> ==============================================================================
>> --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c Thu Nov 17 20:44:51 2016 (r308781)
>> +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c Thu Nov 17 21:01:27 2016 (r308782)
>> @@ -1387,54 +1387,44 @@ zvol_log_write(zvol_state_t *zv, dmu_tx_
>> {
>> uint32_t blocksize = zv->zv_volblocksize;
>> zilog_t *zilog = zv->zv_zilog;
>> - boolean_t slogging;
>> - ssize_t immediate_write_sz;
>> + itx_wr_state_t write_state;
>>
>> if (zil_replaying(zilog, tx))
>> return;
>>
>> - immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>> - ? 0 : zvol_immediate_write_sz;
>> -
>> - slogging = spa_has_slogs(zilog->zl_spa) &&
>> - (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
>> + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
>> + write_state = WR_INDIRECT;
>> + else if (!spa_has_slogs(zilog->zl_spa) &&
>> + resid >= blocksize && blocksize > zvol_immediate_write_sz)
>> + write_state = WR_INDIRECT;
>> + else if (sync)
>> + write_state = WR_COPIED;
>> + else
>> + write_state = WR_NEED_COPY;
>>
>> while (resid) {
>> itx_t *itx;
>> lr_write_t *lr;
>> - ssize_t len;
>> - itx_wr_state_t write_state;
>> + itx_wr_state_t wr_state = write_state;
>> + ssize_t len = resid;
>>
>> - /*
>> - * Unlike zfs_log_write() we can be called with
>> - * upto DMU_MAX_ACCESS/2 (5MB) writes.
>> - */
>> - if (blocksize > immediate_write_sz && !slogging &&
>> - resid >= blocksize && off % blocksize == 0) {
>> - write_state = WR_INDIRECT; /* uses dmu_sync */
>> - len = blocksize;
>> - } else if (sync) {
>> - write_state = WR_COPIED;
>> - len = MIN(ZIL_MAX_LOG_DATA, resid);
>> - } else {
>> - write_state = WR_NEED_COPY;
>> - len = MIN(ZIL_MAX_LOG_DATA, resid);
>> - }
>> + if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
>> + wr_state = WR_NEED_COPY;
>> + else if (wr_state == WR_INDIRECT)
>> + len = MIN(blocksize - P2PHASE(off, blocksize), resid);
>>
>> itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
>> - (write_state == WR_COPIED ? len : 0));
>> + (wr_state == WR_COPIED ? len : 0));
>> lr = (lr_write_t *)&itx->itx_lr;
>> - if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
>> + if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
>> ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
>> zil_itx_destroy(itx);
>> itx = zil_itx_create(TX_WRITE, sizeof (*lr));
>> lr = (lr_write_t *)&itx->itx_lr;
>> - write_state = WR_NEED_COPY;
>> + wr_state = WR_NEED_COPY;
>> }
>>
>> - itx->itx_wr_state = write_state;
>> - if (write_state == WR_NEED_COPY)
>> - itx->itx_sod += len;
>> + itx->itx_wr_state = wr_state;
>> lr->lr_foid = ZVOL_OBJ;
>> lr->lr_offset = off;
>> lr->lr_length = len;
>>
>
--
Alexander Motin
More information about the svn-src-all
mailing list