diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 5ce00bc02583..7a2ba8ea0a05 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -336,14 +336,6 @@ typedef struct dmu_buf_impl { /* The buffer was partially read. More reads may follow. */ uint8_t db_partial_read; - - /* - * This block is being held under a writer rangelock of a Direct I/O - * write that is waiting for previous buffered writes to synced out - * due to mixed buffered and O_DIRECT operations. This is needed to - * check whether to grab the rangelock in zfs_get_data(). - */ - uint8_t db_mixed_io_dio_wait; } dmu_buf_impl_t; #define DBUF_HASH_MUTEX(h, idx) \ @@ -392,7 +384,7 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, uint64_t blkid, uint64_t *hash_out); int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); -void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_clone_or_dio(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail); boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed); @@ -401,9 +393,6 @@ dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx); boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dmu_buf_direct_mixed_io_wait(dmu_buf_impl_t *db, uint64_t txg, - boolean_t read); -void dmu_buf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); blkptr_t *dmu_buf_get_bp_from_dbuf(dmu_buf_impl_t *db); int dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 216d7d28853d..38ce279808f5 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -1088,7 +1088,6 @@ typedef struct zgd { struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct zfs_locked_range *zgd_lr; - boolean_t zgd_grabbed_rangelock; void *zgd_private; } zgd_t; diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index cf66c38f0ce2..77f7664fb22f 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -82,13 +82,6 @@ typedef struct dbuf_stats { */ kstat_named_t cache_levels[DN_MAX_LEVELS]; kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; - /* - * Statistics for Direct I/O. - */ - kstat_named_t direct_mixed_io_read_wait; - kstat_named_t direct_mixed_io_write_wait; - kstat_named_t direct_sync_wait; - kstat_named_t direct_undirty; /* * Statistics about the dbuf hash table. 
*/ @@ -137,10 +130,6 @@ dbuf_stats_t dbuf_stats = { { "cache_total_evicts", KSTAT_DATA_UINT64 }, { { "cache_levels_N", KSTAT_DATA_UINT64 } }, { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, - { "direct_mixed_io_read_wait", KSTAT_DATA_UINT64 }, - { "direct_mixed_io_write_wait", KSTAT_DATA_UINT64 }, - { "direct_sync_wait", KSTAT_DATA_UINT64 }, - { "direct_undirty", KSTAT_DATA_UINT64 }, { "hash_hits", KSTAT_DATA_UINT64 }, { "hash_misses", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -162,10 +151,6 @@ struct { wmsum_t cache_total_evicts; wmsum_t cache_levels[DN_MAX_LEVELS]; wmsum_t cache_levels_bytes[DN_MAX_LEVELS]; - wmsum_t direct_mixed_io_read_wait; - wmsum_t direct_mixed_io_write_wait; - wmsum_t direct_sync_wait; - wmsum_t direct_undirty; wmsum_t hash_hits; wmsum_t hash_misses; wmsum_t hash_collisions; @@ -911,14 +896,6 @@ dbuf_kstat_update(kstat_t *ksp, int rw) ds->cache_levels_bytes[i].value.ui64 = wmsum_value(&dbuf_sums.cache_levels_bytes[i]); } - ds->direct_mixed_io_read_wait.value.ui64 = - wmsum_value(&dbuf_sums.direct_mixed_io_read_wait); - ds->direct_mixed_io_write_wait.value.ui64 = - wmsum_value(&dbuf_sums.direct_mixed_io_write_wait); - ds->direct_sync_wait.value.ui64 = - wmsum_value(&dbuf_sums.direct_sync_wait); - ds->direct_undirty.value.ui64 = - wmsum_value(&dbuf_sums.direct_undirty); ds->hash_hits.value.ui64 = wmsum_value(&dbuf_sums.hash_hits); ds->hash_misses.value.ui64 = @@ -1021,10 +998,6 @@ dbuf_init(void) wmsum_init(&dbuf_sums.cache_levels[i], 0); wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0); } - wmsum_init(&dbuf_sums.direct_mixed_io_read_wait, 0); - wmsum_init(&dbuf_sums.direct_mixed_io_write_wait, 0); - wmsum_init(&dbuf_sums.direct_sync_wait, 0); - wmsum_init(&dbuf_sums.direct_undirty, 0); wmsum_init(&dbuf_sums.hash_hits, 0); wmsum_init(&dbuf_sums.hash_misses, 0); wmsum_init(&dbuf_sums.hash_collisions, 0); @@ -1097,10 +1070,6 @@ dbuf_fini(void) wmsum_fini(&dbuf_sums.cache_levels[i]); wmsum_fini(&dbuf_sums.cache_levels_bytes[i]); } - wmsum_fini(&dbuf_sums.direct_mixed_io_read_wait); - wmsum_fini(&dbuf_sums.direct_mixed_io_write_wait); - wmsum_fini(&dbuf_sums.direct_sync_wait); - wmsum_fini(&dbuf_sums.direct_undirty); wmsum_fini(&dbuf_sums.hash_hits); wmsum_fini(&dbuf_sums.hash_misses); wmsum_fini(&dbuf_sums.hash_collisions); @@ -1271,9 +1240,8 @@ dbuf_clear_data(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); dbuf_evict_user(db); - /* Direct I/O writes may have data */ - if (db->db_buf == NULL) - db->db.db_data = NULL; + ASSERT3P(db->db_buf, ==, NULL); + db->db.db_data = NULL; if (db->db_state != DB_NOFILL) { db->db_state = DB_UNCACHED; DTRACE_SET_STATE(db, "clear data"); @@ -2789,93 +2757,6 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) return (dr != NULL); } -void -dmu_buf_direct_mixed_io_wait(dmu_buf_impl_t *db, uint64_t txg, boolean_t read) -{ - ASSERT(MUTEX_HELD(&db->db_mtx)); - - if (read == B_TRUE) { - /* - * If a buffered read is in process, a Direct I/O read will - * wait for the buffered I/O to complete. - */ - ASSERT3U(txg, ==, 0); - while (db->db_state == DB_READ) { - DBUF_STAT_BUMP(direct_mixed_io_read_wait); - cv_wait(&db->db_changed, &db->db_mtx); - } - } else { - /* - * There must be an ARC buf associated with this Direct I/O - * write otherwise there is no reason to wait for previous - * dirty records to sync out. - * - * The db_state will temporarily be set to DB_CACHED so that - * that any synchronous writes issued through the ZIL will - * still be handled properly. 
In particular, the call to - * dbuf_read() in dmu_sync_late_arrival() must account for the - * data still being in the ARC. After waiting here for previous - * TXGs to sync out, dmu_write_direct_done() will update the - * db_state. - */ - ASSERT3P(db->db_buf, !=, NULL); - ASSERT3U(txg, >, 0); - db->db_mixed_io_dio_wait = TRUE; - db->db_state = DB_CACHED; - while (dbuf_find_dirty_lte(db, txg) != NULL) { - DBUF_STAT_BUMP(direct_mixed_io_write_wait); - cv_wait(&db->db_changed, &db->db_mtx); - } - db->db_mixed_io_dio_wait = FALSE; - } -} - -/* - * Direct I/O writes may need to undirty the open-context dirty record - * associated with it in the event of an I/O error. - */ -void -dmu_buf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) -{ - /* - * Direct I/O writes always happen in open-context. - */ - ASSERT(!dmu_tx_is_syncing(tx)); - ASSERT(MUTEX_HELD(&db->db_mtx)); - ASSERT(db->db_state == DB_NOFILL || db->db_state == DB_UNCACHED); - - - /* - * In the event of an I/O error we will handle the metaslab clean up in - * zio_done(). Also, the dirty record's dr_overridden_by BP is not - * currently set as that is done in dmu_sync_done(). Since the db_state - * is still set to DB_NOFILL, dbuf_unoverride() will not be called in - * dbuf_undirty() and the dirty record's BP will not be added the SPA's - * spa_free_bplist via zio_free(). - * - * This function can also be called in the event that a Direct I/O - * write is overwriting a previous Direct I/O to the same block for - * this TXG. It is important to go ahead and free up the space - * accounting in this case through dbuf_undirty() -> dbuf_unoverride() - * -> zio_free(). This is necessary because the space accounting for - * determining if a write can occur in zfs_write() happens through - * dmu_tx_assign(). This can cause an issue with Direct I/O writes in - * the case of overwrites, because all DVA allocations are being done - * in open-context. Constanstly allowing Direct I/O overwrites to the - * same blocks can exhaust the pools available space leading to ENOSPC - * errors at the DVA allcoation part of the ZIO pipeline, which will - * eventually suspend the pool. By cleaning up space accounting now - * the ENOSPC pool suspend can be avoided. - * - * Since we are undirtying the record for the Direct I/O in - * open-context we must have a hold on the db, so it should never be - * evicted after calling dbuf_undirty(). - */ - VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE); - - DBUF_STAT_BUMP(direct_undirty); -} - /* * Normally the db_blkptr points to the most recent on-disk content for the * dbuf (and anything newer will be cached in the dbuf). However, a recent @@ -2951,7 +2832,7 @@ dmu_buf_untransform_direct(dmu_buf_impl_t *db, spa_t *spa) } void -dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) +dmu_buf_will_clone_or_dio(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT0(db->db_level); @@ -2959,14 +2840,41 @@ dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx) ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); /* - * Block cloning: We are going to clone into this block, so undirty - * modifications done to this block so far in this txg. This includes - * writes and clones into this block. + * Block clones and Direct I/O writes always happen in open-context. 
*/ + ASSERT(!dmu_tx_is_syncing(tx)); + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + mutex_enter(&db->db_mtx); DBUF_VERIFY(db); - VERIFY(!dbuf_undirty(db, tx)); + + /* + * We are going to clone or issue a Direct I/O write on this block, so + * undirty modifications done to this block so far in this txg. This + * includes writes and clones into this block. + * + * If there is a dirty record associated with this txg from a previous + * Direct I/O write, then space accounting cleanup takes place. It is + * important to go ahead and free up the space accounting through + * dbuf_undirty() -> dbuf_unoverride() -> zio_free(). Space accounting + * for determining if a write can occur in zfs_write() happens through + * dmu_tx_assign(). This can cause an issue with Direct I/O writes in + * the case of overwriting the same block, because all DVA allocations + * are being done in open-context. Constantly allowing Direct I/O + * overwrites to the same block can exhaust the pool's available space, + * leading to ENOSPC errors at the DVA allocation part of the ZIO + * pipeline, which will eventually suspend the pool. By cleaning up + * space accounting now, the ENOSPC error can be avoided. + * + * Since we are undirtying the record in open-context, we must have a + * hold on the db, so it should never be evicted after calling + * dbuf_undirty(). + */ + VERIFY3B(dbuf_undirty(db, tx), ==, B_FALSE); ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg)); + if (db->db_buf != NULL) { /* * If there is an associated ARC buffer with this dbuf we can * only destroy it if the previous dirty record does not * reference it. */ if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) arc_buf_destroy(db->db_buf, db); + /* + * Setting the dbuf's data pointers to NULL will force all + * future reads down to the devices to get the most up to date + * version of the data after a Direct I/O write has completed. + */ db->db_buf = NULL; dbuf_clear_data(db); } ASSERT3P(db->db.db_data, ==, NULL); db->db_state = DB_NOFILL; - DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone"); + DTRACE_SET_STATE(db, + "allocating NOFILL buffer for clone or direct I/O write"); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); @@ -3532,7 +3446,6 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db_user_immediate_evict = FALSE; db->db_freed_in_flight = FALSE; db->db_pending_evict = FALSE; - db->db_mixed_io_dio_wait = FALSE; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); @@ -4788,25 +4701,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); - - /* - * It is possible a buffered read has come in after a Direct I/O - * write and is currently transistioning the db_state from DB_READ - * in dbuf_read_impl() to another state in dbuf_read_done(). We - * have to wait in order for the dbuf state to change from DB_READ - * before syncing the dirty record of the Direct I/O write. - */ - if (db->db_state == DB_READ && !dr->dt.dl.dr_brtwrite) { - ASSERT3P(*datap, ==, NULL); - ASSERT3P(db->db_buf, ==, NULL); - ASSERT3P(db->db.db_data, ==, NULL); - ASSERT3U(dr->dt.dl.dr_override_state, ==, DR_OVERRIDDEN); - while (db->db_state == DB_READ) { - DBUF_STAT_BUMP(direct_sync_wait); - cv_wait(&db->db_changed, &db->db_mtx); - } - } - /* * To be synced, we must be dirtied. But we might have been freed * after the dirty.
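A minimal user-space sketch of the sequence the comment above describes for dmu_buf_will_clone_or_dio(): first undirty the open-txg dirty record (which, via dbuf_undirty() -> dbuf_unoverride() -> zio_free(), releases DVAs that an earlier Direct I/O overwrite of the same block already allocated), then drop both the ARC buffer and db.db_data so later reads must go to the vdevs. Every model_* name below is a hypothetical stand-in, not a real ZFS symbol.

/*
 * Sketch only: models the "undirty, then drop cached data" sequence of
 * dmu_buf_will_clone_or_dio() with hypothetical stand-in types.
 */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef enum { MODEL_DB_UNCACHED, MODEL_DB_CACHED, MODEL_DB_NOFILL } model_db_state_t;

typedef struct {
        void *db_buf;           /* stand-in for the ARC buffer */
        void *db_data;          /* stand-in for db->db.db_data */
        bool dirty;             /* stand-in for an open-txg dirty record */
        model_db_state_t state;
} model_dbuf_t;

/*
 * Stand-in for dbuf_undirty(); in ZFS this path also frees DVAs that a
 * previous Direct I/O overwrite of the same block already allocated, which
 * is what avoids the ENOSPC scenario described in the comment above.
 */
static void
model_undirty(model_dbuf_t *db)
{
        db->dirty = false;
}

static void
model_will_clone_or_dio(model_dbuf_t *db)
{
        model_undirty(db);
        /* Clearing both data pointers forces later reads down to the vdevs. */
        db->db_buf = NULL;
        db->db_data = NULL;
        db->state = MODEL_DB_NOFILL;
}

int
main(void)
{
        model_dbuf_t db = { .db_buf = &db, .db_data = &db, .dirty = true,
            .state = MODEL_DB_CACHED };
        model_will_clone_or_dio(&db);
        assert(!db.dirty && db.db_buf == NULL && db.db_data == NULL);
        printf("final state = %d (NOFILL)\n", (int)db.state);
        return (0);
}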
@@ -4819,13 +4713,21 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else if (db->db_state == DB_READ) { /* - * This buffer has a clone we need to write, and an in-flight - * read on the BP we're about to clone. Its safe to issue the - * write here because the read has already been issued and the - * contents won't change. + * This buffer was either cloned or had a Direct I/O write + * occur and has an in-flight read on the BP. It is safe to + * issue the write here, because the read has already been + * issued and the contents won't change. + * + * We can verify the case of both the clone and Direct I/O + * write by making sure the first dirty record for the dbuf + * has no ARC buffer associated with it. */ - ASSERT(dr->dt.dl.dr_brtwrite && - dr->dt.dl.dr_override_state == DR_OVERRIDDEN); + dbuf_dirty_record_t *dr_head = + list_head(&db->db_dirty_records); + ASSERT3P(db->db_buf, ==, NULL); + ASSERT3P(db->db.db_data, ==, NULL); + ASSERT3P(dr_head->dt.dl.dr_data, ==, NULL); + ASSERT3U(dr_head->dt.dl.dr_override_state, ==, DR_OVERRIDDEN); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } @@ -5522,7 +5424,7 @@ EXPORT_SYMBOL(dbuf_dirty); EXPORT_SYMBOL(dmu_buf_set_crypt_params); EXPORT_SYMBOL(dmu_buf_will_dirty); EXPORT_SYMBOL(dmu_buf_is_dirty); -EXPORT_SYMBOL(dmu_buf_will_clone); +EXPORT_SYMBOL(dmu_buf_will_clone_or_dio); EXPORT_SYMBOL(dmu_buf_will_not_fill); EXPORT_SYMBOL(dmu_buf_will_fill); EXPORT_SYMBOL(dmu_buf_fill_done); diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index ba47be9c9e15..db08b1843163 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2676,7 +2676,7 @@ dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, ASSERT(db->db_blkid != DMU_SPILL_BLKID); ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp)); - dmu_buf_will_clone(dbuf, tx); + dmu_buf_will_clone_or_dio(dbuf, tx); mutex_enter(&db->db_mtx); diff --git a/module/zfs/dmu_direct.c b/module/zfs/dmu_direct.c index 0ff3e0e55e40..bf4733530204 100644 --- a/module/zfs/dmu_direct.c +++ b/module/zfs/dmu_direct.c @@ -90,78 +90,35 @@ dmu_write_direct_done(zio_t *zio) dmu_sync_arg_t *dsa = zio->io_private; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; - uint64_t txg = dsa->dsa_tx->tx_txg; abd_free(zio->io_abd); + mutex_enter(&db->db_mtx); + ASSERT3P(db->db_buf, ==, NULL); + ASSERT3P(dr->dt.dl.dr_data, ==, NULL); + ASSERT3P(db->db.db_data, ==, NULL); + db->db_state = DB_UNCACHED; + mutex_exit(&db->db_mtx); - if (zio->io_error == 0) { - /* - * After a successful Direct I/O write any stale contents in - * the ARC must be cleaned up in order to force all future - * reads down to the VDEVs. - * - * If a previous write operation to this dbuf was buffered - * (in the ARC) we have to wait for the previous dirty records - * associated with this dbuf to be synced out if they are in - * the quiesce or sync phase for their TXG. This is done to - * guarantee we are not racing to destroy the ARC buf that - * is associated with the dbuf between this done callback and - * spa_sync(). Outside of using a heavy handed approach of - * locking down the spa_syncing_txg while it is being updated, - * there is no way to synchronize when a dirty record's TXG - * has moved over to the sync phase. - * - * In order to make sure all TXG's are consistent we must - * do this stall if there is an associated ARC buf with this - * dbuf.
It is because of this that a user should not really - * be mixing buffered and Direct I/O writes. If they choose to - * do so, there is an associated performance penalty for that - * as we will not give up consistency with a TXG over - * performance. - */ - if (db->db_buf) { - dmu_buf_direct_mixed_io_wait(db, txg - 1, B_FALSE); - ASSERT3P(db->db_buf, ==, dr->dt.dl.dr_data); - arc_buf_destroy(db->db_buf, db); - db->db_buf = NULL; - dr->dt.dl.dr_data = NULL; - db->db.db_data = NULL; - ASSERT3U(db->db_dirtycnt, ==, 1); - } + dmu_sync_done(zio, NULL, zio->io_private); - /* - * The current contents of the dbuf are now stale. - */ - ASSERT3P(dr->dt.dl.dr_data, ==, NULL); - ASSERT3P(db->db.db_data, ==, NULL); - db->db_state = DB_UNCACHED; - } else { + if (zio->io_error != 0) { if (zio->io_flags & ZIO_FLAG_DIO_CHKSUM_ERR) ASSERT3U(zio->io_error, ==, EAGAIN); /* - * If there is a valid ARC buffer assocatied with this dirty - * record we will stall just like on a successful Direct I/O - * write to make sure all TXG's are consistent. See comment - * above. + * In the event of an I/O error the metaslab cleanup is taken + * care of in zio_done(). + * + * Since we are undirtying the record in open-context, we must + * have a hold on the db, so it should never be evicted after + * calling dbuf_undirty(). */ - if (db->db_buf) { - ASSERT3P(db->db_buf, ==, dr->dt.dl.dr_data); - dmu_buf_direct_mixed_io_wait(db, txg - 1, B_FALSE); - dmu_buf_undirty(db, dsa->dsa_tx); - db->db_state = DB_CACHED; - } else { - ASSERT3P(dr->dt.dl.dr_data, ==, NULL); - dmu_buf_undirty(db, dsa->dsa_tx); - db->db_state = DB_UNCACHED; - } - - ASSERT0(db->db_dirtycnt); + mutex_enter(&db->db_mtx); + VERIFY3B(dbuf_undirty(db, dsa->dsa_tx), ==, B_FALSE); + mutex_exit(&db->db_mtx); } - mutex_exit(&db->db_mtx); - dmu_sync_done(zio, NULL, zio->io_private); kmem_free(zio->io_bp, sizeof (blkptr_t)); zio->io_bp = NULL; } @@ -183,26 +140,11 @@ dmu_write_direct(zio_t *pio, dmu_buf_impl_t *db, abd_t *data, dmu_tx_t *tx) WP_DMU_SYNC | WP_DIRECT_WR, &zp); DB_DNODE_EXIT(db); - /* - * If we going to overwrite a previous Direct I/O write that is part of - * the current TXG, then we can can go ahead and undirty it now. Part - * of it being undirtied will be allowing for previously allocated - * space in the dr_overridden_bp BP's DVAs to be freed. This avoids - * ENOSPC errors from possibly occuring when trying to allocate new - * metaslabs in open-context for Direct I/O writes. - */ - mutex_enter(&db->db_mtx); - dr_head = dbuf_find_dirty_eq(db, dmu_tx_get_txg(tx)); - if (dbuf_dirty_is_direct_write(db, dr_head)) { - dmu_buf_undirty(db, tx); - } - mutex_exit(&db->db_mtx); - /* * Dirty this dbuf with DB_NOFILL since we will not have any data * associated with the dbuf. */ - dmu_buf_will_not_fill(&db->db, tx); + dmu_buf_will_clone_or_dio(&db->db, tx); mutex_enter(&db->db_mtx); @@ -289,7 +231,7 @@ dmu_write_abd(dnode_t *dn, uint64_t offset, uint64_t size, /* * The dbuf must be held until the Direct I/O write has completed in - * the event there was any errors and dmu_buf_undirty() was called. + * the event there was any errors and dbuf_undirty() was called. */ dmu_buf_rele_array(dbp, numbufs, FTAG); @@ -325,10 +267,11 @@ dmu_read_abd(dnode_t *dn, uint64_t offset, uint64_t size, db->db.db_object, db->db_level, db->db_blkid); /* - * If there is another buffered read for this dbuf, we will - * wait for that to complete first. + * If there is another read for this dbuf, we will wait for + * that to complete first before checking the db_state below. 
*/ - dmu_buf_direct_mixed_io_wait(db, 0, B_TRUE); + while (db->db_state == DB_READ) + cv_wait(&db->db_changed, &db->db_mtx); blkptr_t *bp = dmu_buf_get_bp_from_dbuf(db); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index fb6f9475d350..2460805582f2 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1096,32 +1096,6 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, zgd->zgd_lwb = lwb; zgd->zgd_private = zp; - dmu_buf_t *dbp; - error = dmu_buf_hold_noread(os, object, offset, zgd, &dbp); - zgd->zgd_db = dbp; - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; - - if (error) { - zfs_get_done(zgd, error); - return (error); - } - - /* - * If a Direct I/O write is waiting for previous dirty records to sync - * out in dmu_buf_direct_mixed_io_wait(), then the ranglock is already - * held across the entire block by the O_DIRECT write. - * - * The dirty record for this TXG will also be used to identify if this - * log record is associated with a Direct I/O write. - */ - mutex_enter(&db->db_mtx); - boolean_t rangelock_held = db->db_mixed_io_dio_wait; - zgd->zgd_grabbed_rangelock = !(rangelock_held); - dbuf_dirty_record_t *dr = - dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); - boolean_t direct_write = dbuf_dirty_is_direct_write(db, dr); - mutex_exit(&db->db_mtx); - /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the @@ -1130,10 +1104,8 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - if (zgd->zgd_grabbed_rangelock) { - zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, - offset, size, RL_READER); - } + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, + size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); @@ -1150,29 +1122,19 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ - if (zgd->zgd_grabbed_rangelock) { - for (;;) { - uint64_t blkoff; - size = zp->z_blksz; - blkoff = ISP2(size) ? P2PHASE(offset, size) : - offset; - offset -= blkoff; - zgd->zgd_lr = zfs_rangelock_enter( - &zp->z_rangelock, offset, size, RL_READER); - if (zp->z_blksz == size) - break; - offset += blkoff; - zfs_rangelock_exit(zgd->zgd_lr); - } - ASSERT3U(dbp->db_size, ==, size); - ASSERT3U(dbp->db_offset, ==, offset); - } else { - /* - * A Direct I/O write always covers an entire block. - */ - ASSERT3U(dbp->db_size, ==, zp->z_blksz); + for (;;) { + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? P2PHASE(offset, size) : + offset; + offset -= blkoff; + zgd->zgd_lr = zfs_rangelock_enter( + &zp->z_rangelock, offset, size, RL_READER); + if (zp->z_blksz == size) + break; + offset += blkoff; + zfs_rangelock_exit(zgd->zgd_lr); } - /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); @@ -1182,48 +1144,69 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, zil_fault_io = 0; } #endif - if (error) { - zfs_get_done(zgd, error); - return (error); - } - /* - * All Direct I/O writes will have already completed and the - * block pointer can be immediately stored in the log record. 
- */ - if (direct_write) { - lr->lr_blkptr = dr->dt.dl.dr_overridden_by; - zfs_get_done(zgd, 0); - return (0); - } + dmu_buf_t *dbp; + if (error == 0) + error = dmu_buf_hold_noread(os, object, offset, zgd, + &dbp); + + if (error == 0) { + zgd->zgd_db = dbp; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp; + mutex_enter(&db->db_mtx); + dbuf_dirty_record_t *dr = + dbuf_find_dirty_eq(db, lr->lr_common.lrc_txg); + boolean_t direct_write = + dbuf_dirty_is_direct_write(db, dr); + mutex_exit(&db->db_mtx); - blkptr_t *bp = &lr->lr_blkptr; - zgd->zgd_bp = bp; + /* + * All Direct I/O writes will have already completed and + * the block pointer can be immediately stored in the + * log record. + */ + if (direct_write) { + /* + * A Direct I/O write always covers an entire + * block. + */ + ASSERT3U(dbp->db_size, ==, zp->z_blksz); + lr->lr_blkptr = dr->dt.dl.dr_overridden_by; + zfs_get_done(zgd, 0); + return (0); + } - error = dmu_sync(zio, lr->lr_common.lrc_txg, - zfs_get_done, zgd); - ASSERT(error || lr->lr_length <= size); + blkptr_t *bp = &lr->lr_blkptr; + zgd->zgd_bp = bp; - /* - * On success, we need to wait for the write I/O - * initiated by dmu_sync() to complete before we can - * release this dbuf. We will finish everything up - * in the zfs_get_done() callback. - */ - if (error == 0) - return (0); + ASSERT3U(dbp->db_offset, ==, offset); + ASSERT3U(dbp->db_size, ==, size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= size); - if (error == EALREADY) { - lr->lr_common.lrc_txtype = TX_WRITE2; /* - * TX_WRITE2 relies on the data previously - * written by the TX_WRITE that caused - * EALREADY. We zero out the BP because - * it is the old, currently-on-disk BP. + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. */ - zgd->zgd_bp = NULL; - BP_ZERO(bp); - error = 0; + if (error == 0) + return (0); + + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + /* + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. We zero out the BP because + * it is the old, currently-on-disk BP. + */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; + } } } @@ -1232,18 +1215,16 @@ zfs_get_data(void *arg, uint64_t gen, lr_write_t *lr, char *buf, return (error); } - static void zfs_get_done(zgd_t *zgd, int error) { (void) error; znode_t *zp = zgd->zgd_private; - ASSERT3P(zgd->zgd_db, !=, NULL); - dmu_buf_rele(zgd->zgd_db, zgd); + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); - if (zgd->zgd_grabbed_rangelock) - zfs_rangelock_exit(zgd->zgd_lr); + zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the
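A minimal user-space sketch of the reworked zfs_get_data() indirect-write path shown above: the range lock is now always taken by zfs_get_data() itself (zgd_grabbed_rangelock is gone), the dbuf is held only after the lock, and a block that a Direct I/O write has already written short-circuits by copying its block pointer into the log record instead of calling dmu_sync(). All model_* names are hypothetical stand-ins for the real ZFS types.

/*
 * Sketch only: models the ordering of the new indirect-write path in
 * zfs_get_data() with hypothetical stand-in types.
 */
#include <stdbool.h>
#include <stdio.h>

typedef struct { int lr_blkptr; } model_lr_t;
typedef struct {
        bool direct_write_done; /* dbuf_dirty_is_direct_write() in ZFS */
        int overridden_bp;      /* dr->dt.dl.dr_overridden_by in ZFS */
} model_dbuf_t;

/* Returns 0 when the log record is complete, 1 when dmu_sync() is needed. */
static int
model_get_data(model_dbuf_t *db, model_lr_t *lr)
{
        /* 1. zfs_rangelock_enter() would happen here, before the dbuf hold. */
        /* 2. dmu_buf_hold_noread() would happen here. */
        if (db->direct_write_done) {
                /* 3a. Direct I/O already wrote the block; reuse its BP. */
                lr->lr_blkptr = db->overridden_bp;
                return (0);
        }
        /* 3b. Otherwise fall through to dmu_sync() and wait for its zio. */
        return (1);
}

int
main(void)
{
        model_dbuf_t db = { .direct_write_done = true, .overridden_bp = 42 };
        model_lr_t lr = { 0 };
        int need_sync = model_get_data(&db, &lr);
        printf("dmu_sync needed: %d, lr_blkptr: %d\n", need_sync, lr.lr_blkptr);
        return (0);
}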