From 17a66d0db1163179350f8c0ef433575ff3f6f541 Mon Sep 17 00:00:00 2001
From: "Bryant G. Ly"
Date: Thu, 14 Nov 2019 18:41:43 +0000
Subject: [PATCH] Evict refcount=1 entries when DDT large

This patch builds upon Matt Ahrens' initial hackathon patch from:
https://www.youtube.com/watch?v=PYxFDBgxFS8

Signed-off-by: Bryant G. Ly
---
 cmd/zdb/zdb.c                    |  68 +++++++++++++---
 include/sys/ddt.h                |   1 +
 man/man5/zfs-module-parameters.5 |  13 +++
 module/zfs/dbuf.c                |   3 +-
 module/zfs/ddt.c                 | 135 ++++++++++++++++++++++++++++++-
 module/zfs/zio.c                 |  66 +++++++++++++--
 6 files changed, 264 insertions(+), 22 deletions(-)

diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index d5cc4a58113c..88fbd5d84abf 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -1378,16 +1378,17 @@ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
         if (count == 0)
                 return;
 
-        dspace = doi.doi_physical_blocks_512 << 9;
-        mspace = doi.doi_fill_count * doi.doi_data_block_size;
+        dspace = (doi.doi_physical_blocks_512 << 9) / count;
+        mspace = (doi.doi_fill_count * doi.doi_data_block_size) / count;
 
         ddt_object_name(ddt, type, class, name);
 
-        (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+        (void) printf("%s: %llu entries, "
+            "avg size %lluB on disk, %lluB in core\n",
             name,
             (u_longlong_t)count,
-            (u_longlong_t)(dspace / count),
-            (u_longlong_t)(mspace / count));
+            (u_longlong_t)dspace,
+            (u_longlong_t)mspace);
 
         if (dump_opt['D'] < 3)
                 return;
@@ -3751,10 +3752,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
                         refcnt = 0;
                 } else {
                         ddt_phys_t *ddp = ddt_phys_select(dde, bp);
-                        ddt_phys_decref(ddp);
-                        refcnt = ddp->ddp_refcnt;
-                        if (ddt_phys_total_refcnt(dde) == 0)
-                                ddt_remove(ddt, dde);
+                        if (ddp != NULL) {
+                                ddt_phys_decref(ddp);
+                                refcnt = ddp->ddp_refcnt;
+                                if (ddt_phys_total_refcnt(dde) == 0)
+                                        ddt_remove(ddt, dde);
+                        }
                 }
                 ddt_exit(ddt);
         }
@@ -4059,10 +4062,7 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
                 blkptr_t blk;
                 ddt_phys_t *ddp = dde.dde_phys;
 
-                if (ddb.ddb_class == DDT_CLASS_UNIQUE)
-                        return;
-
-                ASSERT(ddt_phys_total_refcnt(&dde) > 1);
+                ASSERT(ddt_phys_total_refcnt(&dde) > 0);
 
                 for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
                         if (ddp->ddp_phys_birth == 0)
@@ -4086,6 +4086,46 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
         ASSERT(error == ENOENT);
 }
 
+static boolean_t
+zdb_ddt_leak_fini(spa_t *spa)
+{
+        boolean_t rv = B_FALSE;
+
+        if (dump_opt['L'])
+                return (rv);
+
+        for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+                ddt_t *ddt = spa->spa_ddt[c];
+
+                if (ddt == NULL)
+                        continue;
+
+                for (ddt_entry_t *dde = avl_first(&ddt->ddt_tree); dde != NULL;
+                    dde = AVL_NEXT(&ddt->ddt_tree, dde)) {
+                        ddt_phys_t *ddp = dde->dde_phys;
+                        rv = B_TRUE;
+                        (void) printf("leaked DDT refcount %llu:\n",
+                            (long long)ddt_phys_total_refcnt(dde));
+
+                        for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+                                if (ddp->ddp_phys_birth == 0)
+                                        continue;
+
+                                blkptr_t bp;
+                                ddt_bp_create(c, &dde->dde_key, ddp, &bp);
+
+                                char blkbuf[BP_SPRINTF_LEN];
+                                snprintf_blkptr(blkbuf, sizeof (blkbuf), &bp);
+
+                                printf(" %s\n", blkbuf);
+                        }
+                }
+        }
+        if (!rv)
+                (void) printf("DDT refcounts verified\n");
+        return (rv);
+}
+
 typedef struct checkpoint_sm_exclude_entry_arg {
         vdev_t *cseea_vd;
         uint64_t cseea_checkpoint_size;
@@ -4513,6 +4553,8 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
                 }
         }
 
+        leaks |= zdb_ddt_leak_fini(spa);
+
         umem_free(zcb->zcb_vd_obsolete_counts,
             rvd->vdev_children * sizeof (uint32_t *));
         zcb->zcb_vd_obsolete_counts = NULL;
diff --git a/include/sys/ddt.h b/include/sys/ddt.h
index 804aab10e30d..4b947cff5b12
--- a/include/sys/ddt.h
+++ b/include/sys/ddt.h
@@ -213,6 +213,7 @@ extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
 extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
 extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
 extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
+extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);
 
 extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
 extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5
index 8ad3ce466ce5..a2947519048c 100644
--- a/man/man5/zfs-module-parameters.5
+++ b/man/man5/zfs-module-parameters.5
@@ -2610,6 +2610,19 @@ Flush dirty data to disk at least every N seconds (maximum txg duration)
 Default value: \fB5\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_unique_ddt_max\fR (ulong)
+.ad
+.RS 12n
+Maximum amount of memory, in bytes, that unique (refcount=1) DDT entries may consume; when the unique class grows beyond this limit, randomly selected entries are evicted at sync time.
+.sp
+This value can be changed dynamically.
+.sp
+Default value: \fB134217728\fR (128MB).
+.RE
+
 .sp
 .ne 2
 .na
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 22c5867b6acc..7d6448a48a37 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -514,7 +514,8 @@ dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
 
         /* Clients must resolve a dbuf before attaching user data. */
         ASSERT(db->db.db_data != NULL);
-        ASSERT3U(db->db_state, ==, DB_CACHED);
+        if (db->db_blkid != DMU_BONUS_BLKID)
+                ASSERT3U(db->db_state, ==, DB_CACHED);
 
         holds = zfs_refcount_count(&db->db_holds);
         if (verify_type == DBVU_EVICTING) {
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
index 46466703a525..bfed0a2cd49b 100644
--- a/module/zfs/ddt.c
+++ b/module/zfs/ddt.c
@@ -46,6 +46,14 @@ static kmem_cache_t *ddt_entry_cache;
  */
 int zfs_dedup_prefetch = 0;
 
+/*
+ * Maximum amount of memory (in bytes) that unique (refcount==1) DDT entries
+ * may consume; above this limit, old (randomly selected) entries are evicted.
+ * Assuming each entry is ~320 bytes, 128MB/320 = 134217728/320 = 419430 entries.
+ */
+#define DDT_UNIQUE_MAX_SIZE     134217728
+unsigned long zfs_unique_ddt_max = DDT_UNIQUE_MAX_SIZE;
+
 static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
         &ddt_zap_ops,
 };
@@ -227,6 +235,9 @@ ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
 {
         ASSERT(ddt_object_exists(ddt, type, class));
 
+        dde->dde_type = type;
+        dde->dde_class = class;
+
         return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
             ddt->ddt_object[type][class], dde, walk));
 }
@@ -436,7 +447,7 @@ ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
                 *d++ += (*s++ ^ neg) - neg;
 }
 
-static void
+void
 ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
 {
         ddt_stat_t dds;
@@ -1133,10 +1144,41 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
 
         if (otype != DDT_TYPES &&
             (otype != ntype || oclass != nclass || total_refcnt == 0)) {
+                /*
+                 * Note: if we could evict entries from the (UNIQUE) DDT
+                 * while there are outstanding changes (ddt_entry_t's in
+                 * ddt_tree), this remove could fail because the entry
+                 * could already have been evicted.
+                 */
                 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
                 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
         }
 
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+        (void) printf("ddt_sync_entry(txg=%llu): writing oclass=%u nclass=%u: ",
+            (long long)txg,
+            oclass, nclass);
+#endif
+        for (int p = 0; p < DDT_PHYS_TYPES; p++) {
+                if (dde->dde_phys[p].ddp_phys_birth == 0)
+                        continue;
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+                (void) printf("phys_type=%u rc=%llu phys_birth=%llu vd0=%llu off0=0x%llx vd1=%llu off1=0x%llx vd2=%llu off2=0x%llx; \n",
+                    p,
+                    (long long)dde->dde_phys[p].ddp_refcnt,
+                    (long long)dde->dde_phys[p].ddp_phys_birth,
+                    (long long)DVA_GET_VDEV(&dde->dde_phys[p].ddp_dva[0]),
+                    (long long)DVA_GET_OFFSET(&dde->dde_phys[p].ddp_dva[0]),
+                    (long long)DVA_GET_VDEV(&dde->dde_phys[p].ddp_dva[1]),
+                    (long long)DVA_GET_OFFSET(&dde->dde_phys[p].ddp_dva[1]),
+                    (long long)DVA_GET_VDEV(&dde->dde_phys[p].ddp_dva[2]),
+                    (long long)DVA_GET_OFFSET(&dde->dde_phys[p].ddp_dva[2]));
+#endif
+        }
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+        (void) printf("\n");
+#endif
+
         if (total_refcnt != 0) {
                 dde->dde_type = ntype;
                 dde->dde_class = nclass;
@@ -1159,12 +1201,40 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
         }
 }
 
+static uint64_t
+ddt_unique_max_size(void)
+{
+        uint64_t size;
+        /*
+         * Make sure our globals have meaningful values in case the user
+         * altered them.
+         */
+        size = zfs_unique_ddt_max;
+        if (size == 0) {
+                cmn_err(CE_NOTE, "Bad value for zfs_unique_ddt_max, value must "
+                    "be greater than zero, resetting it to the default (%d)",
+                    DDT_UNIQUE_MAX_SIZE);
+                size = zfs_unique_ddt_max = DDT_UNIQUE_MAX_SIZE;
+        }
+
+        /*
+         * Convert the limit from an amount of RAM consumed to a number of
+         * entries: take the configured size in bytes (128MB by default)
+         * and divide by sizeof (ddt_entry_t) to get the maximum number of
+         * unique entries to keep in the DDT.
+         */
+        size = size / sizeof (ddt_entry_t);
+
+        return (size);
+}
+
 static void
 ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 {
         spa_t *spa = ddt->ddt_spa;
         ddt_entry_t *dde;
         void *cookie = NULL;
+        uint64_t ddt_unique_max = ddt_unique_max_size();
 
         if (avl_numnodes(&ddt->ddt_tree) == 0)
                 return;
@@ -1182,6 +1252,64 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
                 ddt_free(dde);
         }
 
+        /*
+         * If the dedup table is too big, evict entries.
+         *
+         * The limit is currently based on the value of zfs_unique_ddt_max.
+         * We do not need to tie it to arc_min or c_min, because setting
+         * zfs_unique_ddt_max too high just means that we allow more
+         * refcount=1 entries; once arc_c_min is reached, the ARC will
+         * start shrinking the DDT anyway.  Therefore we don't need to
+         * tune this based on the DDT size or unique entry size in MB.
+         * Users may also want the flexibility to fine-tune exactly how many
+         * unique entries to keep for their own data, depending on what kind
+         * of writes they are doing.
+         */
+        if (ddt_object_exists(ddt, DDT_TYPE_CURRENT, DDT_CLASS_UNIQUE)) {
+                uint64_t count = 0;
+                VERIFY(ddt_object_count(ddt, DDT_TYPE_CURRENT, DDT_CLASS_UNIQUE, &count) == 0);
+                for (int64_t i = 0; i < (int64_t)(count - ddt_unique_max); i++) {
+                        ddt_entry_t rmdde;
+                        bzero(&rmdde, sizeof (ddt_entry_t));
+                        uint64_t walk = spa_get_random(UINT64_MAX);
+                        if (ddt_object_walk(ddt,
+                            DDT_TYPE_CURRENT, DDT_CLASS_UNIQUE,
+                            &walk, &rmdde) == 0) {
+                                enum ddt_phys_type t;
+                                for (t = 0; t < DDT_PHYS_TYPES; t++) {
+                                        if (rmdde.dde_phys[t].ddp_refcnt != 0)
+                                                break;
+                                }
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+                                (void) printf("ddt_sync_table(txg=%llu): evicting %p type=%u vd0=%llu off0=0x%llx vd1=%llu off1=0x%llx vd2=%llu off2=0x%llx\n",
+                                    (long long)txg,
+                                    &rmdde,
+                                    t,
+                                    (long long)DVA_GET_VDEV(&rmdde.dde_phys[t].ddp_dva[0]),
+                                    (long long)DVA_GET_OFFSET(&rmdde.dde_phys[t].ddp_dva[0]),
+                                    (long long)DVA_GET_VDEV(&rmdde.dde_phys[t].ddp_dva[1]),
+                                    (long long)DVA_GET_OFFSET(&rmdde.dde_phys[t].ddp_dva[1]),
+                                    (long long)DVA_GET_VDEV(&rmdde.dde_phys[t].ddp_dva[2]),
+                                    (long long)DVA_GET_OFFSET(&rmdde.dde_phys[t].ddp_dva[2]));
+#endif
+                                ASSERT3U(ddt_phys_total_refcnt(&rmdde), ==, 1);
+                                ASSERT3U(t, >, DDT_PHYS_DITTO);
+                                ASSERT(rmdde.dde_phys[t].ddp_phys_birth != 0);
+                                ASSERT3U(rmdde.dde_phys[t].ddp_refcnt, ==, 1);
+                                for (enum ddt_phys_type u = 0; u < DDT_PHYS_TYPES; u++) {
+                                        if (u != t) {
+                                                ASSERT0(rmdde.dde_phys[u].ddp_phys_birth);
+                                                ASSERT0(rmdde.dde_phys[u].ddp_refcnt);
+                                        }
+                                }
+                                ddt_stat_update(ddt, &rmdde, -1);
+                                VERIFY0(ddt_object_remove(ddt,
+                                    DDT_TYPE_CURRENT, DDT_CLASS_UNIQUE,
+                                    &rmdde, tx));
+                        }
+                }
+        }
+
         for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
                 uint64_t add, count = 0;
                 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
@@ -1254,8 +1382,6 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
                                     ddb->ddb_type, ddb->ddb_class,
                                     &ddb->ddb_cursor, dde);
                         }
-                        dde->dde_type = ddb->ddb_type;
-                        dde->dde_class = ddb->ddb_class;
                         if (error == 0)
                                 return (0);
                         if (error != ENOENT)
@@ -1273,4 +1399,7 @@
 #if defined(_KERNEL)
 module_param(zfs_dedup_prefetch, int, 0644);
 MODULE_PARM_DESC(zfs_dedup_prefetch, "Enable prefetching dedup-ed blks");
+
+module_param(zfs_unique_ddt_max, ulong, 0644);
+MODULE_PARM_DESC(zfs_unique_ddt_max, "Max size in bytes of unique (refcount=1) DDT entries");
 #endif
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 035b3a499d5a..614f45aa84b3 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -3352,16 +3352,72 @@ zio_ddt_free(zio_t *zio)
         ddt_t *ddt = ddt_select(spa, bp);
         ddt_entry_t *dde;
         ddt_phys_t *ddp;
+        boolean_t added = B_FALSE;
 
         ASSERT(BP_GET_DEDUP(bp));
         ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 
         ddt_enter(ddt);
-        freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
-        if (dde) {
-                ddp = ddt_phys_select(dde, bp);
-                if (ddp)
-                        ddt_phys_decref(ddp);
+        dde = ddt_lookup(ddt, bp, B_FALSE);
+        if (dde == NULL) {
+                dde = ddt_lookup(ddt, bp, B_TRUE);
+                added = B_TRUE;
+        }
+        freedde = dde;
+        ddp = ddt_phys_select(dde, bp);
+        if (ddp != NULL) {
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+                (void) printf("zio_ddt_free(vd=%llu off=%llx) found matching entry with rc=%llu\n",
+                    (long long)DVA_GET_VDEV(BP_IDENTITY(bp)),
+                    (long long)DVA_GET_OFFSET(BP_IDENTITY(bp)),
+                    (long long)ddt_phys_total_refcnt(dde));
+#endif
+                ddt_phys_decref(ddp);
+        } else {
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+                (void) printf("zio_ddt_free(vd=%llu off=%llx) non-matching, added=%u rc=%llu\n",
+                    (long long)DVA_GET_VDEV(BP_IDENTITY(bp)),
+                    (long long)DVA_GET_OFFSET(BP_IDENTITY(bp)),
+                    added,
+                    (long long)ddt_phys_total_refcnt(dde));
+#endif
+                /*
+                 * ddt_phys_select() failed because this entry was evicted
+                 * from the DDT (see ddt_sync_table()).  It can fail in two
+                 * cases: the dde is "fresh" (there's no corresponding
+                 * on-disk entry, and the dde's refcount is zero); or the
+                 * dde's phys birth time doesn't match the bp's (after our
+                 * entry was evicted, a block with the same checksum was
+                 * re-added to the DDT).  In either case, we know the
+                 * effective refcount is currently 1 (because we only evict
+                 * from DDT_CLASS_UNIQUE), so it is being effectively
+                 * decremented to 0 and we need to free this block.
+                 */
+                zio->io_pipeline = ZIO_FREE_PIPELINE;
+                if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
+                        zio->io_pipeline |= ZIO_GANG_STAGES;
+                if (added) {
+                        /*
+                         * Our call to ddt_lookup() added the in-memory
+                         * dde.  We need to undo this, because we are not
+                         * actually modifying the on-disk DDT.  This is
+                         * true whether the in-memory dde is "fresh", or it
+                         * corresponds to an on-disk entry in the DDT
+                         * (caused by the evict and re-add case).
+                         */
+                        if (ddt_phys_total_refcnt(dde) > 0)
+                                ddt_stat_update(ddt, dde, 0);
+                        /*
+                         * If another thread called ddt_lookup() on a bp
+                         * with the same checksum, it could be referencing
+                         * this dde (in ddt_lookup(), waiting for us to drop
+                         * the ddt_lock).  When we free the dde, it would
+                         * dereference freed memory.  We could call
+                         * ddt_remove(ddt, dde), but we would risk racing.
+                         * If we do not call it, the dde will live until the
+                         * end of the txg, which is minimal risk.
+                         */
+                }
         }
         ddt_exit(ddt);
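
Editor's note (illustration only, not part of the patch): the sizing math behind ddt_unique_max_size() and the eviction loop in ddt_sync_table() is simply "byte budget divided by per-entry size, evict anything in the UNIQUE class beyond that". The sketch below reproduces that arithmetic as a standalone userland C program. The 320-byte per-entry figure is only the rough estimate from the patch's own comment (the kernel code divides by sizeof (ddt_entry_t)), and the sample entry counts are made up for illustration.

/*
 * Standalone sketch of the zfs_unique_ddt_max sizing math.  Assumptions:
 * DDT_ENTRY_SIZE_EST approximates sizeof (ddt_entry_t) (~320 bytes per the
 * patch comment); counts[] are hypothetical ddt_object_count() results.
 */
#include <stdio.h>
#include <stdint.h>

#define DDT_UNIQUE_MAX_SIZE     134217728ULL   /* 128MB, the patch default */
#define DDT_ENTRY_SIZE_EST      320ULL         /* rough per-entry estimate */

int
main(void)
{
        uint64_t byte_budget = DDT_UNIQUE_MAX_SIZE;
        /* Mirrors ddt_unique_max_size(): bytes -> max number of entries. */
        uint64_t entry_budget = byte_budget / DDT_ENTRY_SIZE_EST;
        /* Hypothetical unique-class entry counts before sync. */
        uint64_t counts[] = { 100000, 419430, 1000000 };

        for (int i = 0; i < 3; i++) {
                uint64_t count = counts[i];
                /*
                 * Mirrors the eviction loop bound: the unsigned difference
                 * is cast to int64_t, so it goes negative (no eviction)
                 * when count is below the budget.
                 */
                int64_t to_evict = (int64_t)(count - entry_budget);

                printf("unique entries=%llu budget=%llu evict=%lld\n",
                    (unsigned long long)count,
                    (unsigned long long)entry_budget,
                    to_evict > 0 ? (long long)to_evict : 0LL);
        }
        return (0);
}

Because the patch registers the tunable with module_param(zfs_unique_ddt_max, ulong, 0644), it should be adjustable at runtime through the usual module-parameter interface (typically /sys/module/zfs/parameters/zfs_unique_ddt_max), so the eviction threshold can be raised or lowered without reloading the module.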