Skip to content

Commit

Permalink
Merge pull request openzfs#7 from bgly/refcount
Browse files Browse the repository at this point in the history
Evict refcount=1 entries when DDT large
  • Loading branch information
Bryant G. Ly authored Nov 15, 2019
2 parents b6f63a1 + 17a66d0 commit 9c1dda1
Show file tree
Hide file tree
Showing 6 changed files with 264 additions and 22 deletions.
68 changes: 55 additions & 13 deletions cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -1378,16 +1378,17 @@ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
if (count == 0)
return;

dspace = doi.doi_physical_blocks_512 << 9;
mspace = doi.doi_fill_count * doi.doi_data_block_size;
dspace = (doi.doi_physical_blocks_512 << 9) / count;
mspace = (doi.doi_fill_count * doi.doi_data_block_size) / count;

ddt_object_name(ddt, type, class, name);

(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
(void) printf("%s: %llu entries, "
"avg size %lluB on disk, %lluB in core\n",
name,
(u_longlong_t)count,
(u_longlong_t)(dspace / count),
(u_longlong_t)(mspace / count));
(u_longlong_t)dspace,
(u_longlong_t)mspace);

if (dump_opt['D'] < 3)
return;
Expand Down Expand Up @@ -3751,10 +3752,12 @@ zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp,
refcnt = 0;
} else {
ddt_phys_t *ddp = ddt_phys_select(dde, bp);
ddt_phys_decref(ddp);
refcnt = ddp->ddp_refcnt;
if (ddt_phys_total_refcnt(dde) == 0)
ddt_remove(ddt, dde);
if (ddp != NULL) {
ddt_phys_decref(ddp);
refcnt = ddp->ddp_refcnt;
if (ddt_phys_total_refcnt(dde) == 0)
ddt_remove(ddt, dde);
}
}
ddt_exit(ddt);
}
Expand Down Expand Up @@ -4059,10 +4062,7 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
blkptr_t blk;
ddt_phys_t *ddp = dde.dde_phys;

if (ddb.ddb_class == DDT_CLASS_UNIQUE)
return;

ASSERT(ddt_phys_total_refcnt(&dde) > 1);
ASSERT(ddt_phys_total_refcnt(&dde) > 0);

for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
Expand All @@ -4086,6 +4086,46 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb)
ASSERT(error == ENOENT);
}

static boolean_t
zdb_ddt_leak_fini(spa_t *spa)
{
boolean_t rv = B_FALSE;

if (dump_opt['L'])
return (rv);

for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
ddt_t *ddt = spa->spa_ddt[c];

if (ddt == NULL)
continue;

for (ddt_entry_t *dde = avl_first(&ddt->ddt_tree); dde != NULL;
dde = AVL_NEXT(&ddt->ddt_tree, dde)) {
ddt_phys_t *ddp = dde->dde_phys;
rv = B_TRUE;
(void) printf("leaked DDT refcount %llu:\n",
(long long)ddt_phys_total_refcnt(dde));

for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0)
continue;

blkptr_t bp;
ddt_bp_create(c, &dde->dde_key, ddp, &bp);

char blkbuf[BP_SPRINTF_LEN];
snprintf_blkptr(blkbuf, sizeof (blkbuf), &bp);

printf(" %s\n", blkbuf);
}
}
}
if (!rv)
(void) printf("DDT refcounts verified\n");
return (rv);
}

typedef struct checkpoint_sm_exclude_entry_arg {
vdev_t *cseea_vd;
uint64_t cseea_checkpoint_size;
Expand Down Expand Up @@ -4513,6 +4553,8 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
}
}

leaks |= zdb_ddt_leak_fini(spa);

umem_free(zcb->zcb_vd_obsolete_counts,
rvd->vdev_children * sizeof (uint32_t *));
zcb->zcb_vd_obsolete_counts = NULL;
Expand Down
1 change: 1 addition & 0 deletions include/sys/ddt.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh);
extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo);
extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh);
extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total);
extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg);

extern uint64_t ddt_get_dedup_dspace(spa_t *spa);
extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa);
Expand Down
13 changes: 13 additions & 0 deletions man/man5/zfs-module-parameters.5
Original file line number Diff line number Diff line change
Expand Up @@ -2610,6 +2610,19 @@ Flush dirty data to disk at least every N seconds (maximum txg duration)
Default value: \fB5\fR.
.RE

.sp
.ne 2
.na
\fBzfs_unique_ddt_max\fR (ulong)
.ad
.RS 12n
Max unique refcount=1 entries in bytes. Default value is 128MB.
.sp
This value can be changed dynamically.
.sp
Default value: \fB134217728\fR.
.RE

.sp
.ne 2
.na
Expand Down
3 changes: 2 additions & 1 deletion module/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -514,7 +514,8 @@ dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)

/* Clients must resolve a dbuf before attaching user data. */
ASSERT(db->db.db_data != NULL);
ASSERT3U(db->db_state, ==, DB_CACHED);
if (db->db_blkid != DMU_BONUS_BLKID)
ASSERT3U(db->db_state, ==, DB_CACHED);

holds = zfs_refcount_count(&db->db_holds);
if (verify_type == DBVU_EVICTING) {
Expand Down
135 changes: 132 additions & 3 deletions module/zfs/ddt.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,14 @@ static kmem_cache_t *ddt_entry_cache;
*/
int zfs_dedup_prefetch = 0;

/*
* Maximum number of unique (refcount==1) entries allowed in the DDT.
* If more entries are added, old (randomly selected) entries will be evicted.
* Assuming each entry is 320bytes, 128MB/320 = 134217728/320 = 419430 entries
*/
#define DDT_UNIQUE_MAX_SIZE 134217728
unsigned long zfs_unique_ddt_max = DDT_UNIQUE_MAX_SIZE;

static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
&ddt_zap_ops,
};
Expand Down Expand Up @@ -227,6 +235,9 @@ ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
{
ASSERT(ddt_object_exists(ddt, type, class));

dde->dde_type = type;
dde->dde_class = class;

return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
ddt->ddt_object[type][class], dde, walk));
}
Expand Down Expand Up @@ -436,7 +447,7 @@ ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
*d++ += (*s++ ^ neg) - neg;
}

static void
void
ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
{
ddt_stat_t dds;
Expand Down Expand Up @@ -1133,10 +1144,41 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)

if (otype != DDT_TYPES &&
(otype != ntype || oclass != nclass || total_refcnt == 0)) {
/*
* Note: if we could evict entries from the (UNIQUE) DDT
* while there are outstanding changes (ddt_entry_t's in
* ddt_tree), this remove could fail because it could have
* been evicted.
*/
VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
}

#if defined(ZFS_DEBUG) && !defined(_KERNEL)
(void) printf("ddt_sync_entry(txg=%llu): writing oclass=%u nclass=%u: ",
(long long)txg,
oclass, nclass);
#endif
for (int p = 0; p < DDT_PHYS_TYPES; p++) {
if (dde->dde_phys[p].ddp_phys_birth == 0)
continue;
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
(void) printf("phys_type=%u rc=%llu phys_birth=%llu vd0=%llu off0=0x%llx vd1=%llu off1=0x%llx vd2=%llu off2=0x%llx; \n",
p,
(long long)dde->dde_phys[p].ddp_refcnt,
(long long)dde->dde_phys[p].ddp_phys_birth,
(long long)DVA_GET_VDEV(&dde->dde_phys[p].ddp_dva[0]),
(long long)DVA_GET_OFFSET(&dde->dde_phys[p].ddp_dva[0]),
(long long)DVA_GET_VDEV(&dde->dde_phys[p].ddp_dva[1]),
(long long)DVA_GET_OFFSET(&dde->dde_phys[p].ddp_dva[1]),
(long long)DVA_GET_VDEV(&dde->dde_phys[p].ddp_dva[2]),
(long long)DVA_GET_OFFSET(&dde->dde_phys[p].ddp_dva[2]));
#endif
}
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
(void) printf("\n");
#endif

if (total_refcnt != 0) {
dde->dde_type = ntype;
dde->dde_class = nclass;
Expand All @@ -1159,12 +1201,40 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
}
}

static uint64_t
ddt_unique_max_size(void)
{
uint64_t size;
/*
* Make sure our globals have meaningful values in case the user
* altered them.
*/
size = zfs_unique_ddt_max;
if (size == 0) {
cmn_err(CE_NOTE, "Bad value for zfs_unique_ddt_max, value must "
"be greater than zero, resetting it to the default (%d)",
DDT_UNIQUE_MAX_SIZE);
size = zfs_unique_ddt_max = DDT_UNIQUE_MAX_SIZE;
}

/*
* Abstract the calculation of conversion from amt of ram consumption
* to the number of entries. Here we take the given set value of the
* set size in bytes default of 1GB and divide by size of ddt_entry_t
* this will give us the amount of entries that should be set.
*/
size = size/sizeof(ddt_entry_t);

return (size);
}

static void
ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
{
spa_t *spa = ddt->ddt_spa;
ddt_entry_t *dde;
void *cookie = NULL;
uint64_t ddt_unique_max = ddt_unique_max_size();

if (avl_numnodes(&ddt->ddt_tree) == 0)
return;
Expand All @@ -1182,6 +1252,64 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
ddt_free(dde);
}

/*
* If the dedup table is too big, evict entries.
*
* This is currently based on the value of zfs_unique_ddt_max.
* We do not need to tie to arc_min or c_min because if we set
* the value of zfs_unique_ddt_max too high, then that just means
* we allow more refcount=1 entries. Once c_min hits then it will
* start shrinking the ddt anyways. Therefore we dont need to
* tune this area based upon DDT size /unique ddt entry size in MB.
* People might also want flexbility with fine tuning the exact
* unique entries they want for their own data, depending on what kind
* of writes they are doing.
*/
if (ddt_object_exists(ddt, DDT_TYPE_CURRENT, DDT_CLASS_UNIQUE)) {
uint64_t count = 0;
VERIFY(ddt_object_count(ddt, DDT_TYPE_CURRENT, DDT_CLASS_UNIQUE, &count) == 0);
for (int64_t i = 0; i < (int64_t)(count - ddt_unique_max); i++) {
ddt_entry_t rmdde;
bzero(&rmdde, sizeof (ddt_entry_t));
uint64_t walk = spa_get_random(UINT64_MAX);
if (ddt_object_walk(ddt,
DDT_TYPE_CURRENT, DDT_CLASS_UNIQUE,
&walk, &rmdde) == 0) {
enum ddt_phys_type t;
for (t = 0; t < DDT_PHYS_TYPES; t++) {
if (rmdde.dde_phys[t].ddp_refcnt != 0)
break;
}
#if defined(ZFS_DEBUG) && !defined(_KERNEL)
(void) printf("ddt_sync_table(txg=%llu): evicting %p type=%u vd0=%llu off0=0x%llx vd1=%llu off1=0x%llx vd2=%llu off2=0x%llx\n",
(long long)txg,
&rmdde,
t,
(long long)DVA_GET_VDEV(&rmdde.dde_phys[t].ddp_dva[0]),
(long long)DVA_GET_OFFSET(&rmdde.dde_phys[t].ddp_dva[0]),
(long long)DVA_GET_VDEV(&rmdde.dde_phys[t].ddp_dva[1]),
(long long)DVA_GET_OFFSET(&rmdde.dde_phys[t].ddp_dva[1]),
(long long)DVA_GET_VDEV(&rmdde.dde_phys[t].ddp_dva[2]),
(long long)DVA_GET_OFFSET(&rmdde.dde_phys[t].ddp_dva[2]));
#endif
ASSERT3U(ddt_phys_total_refcnt(&rmdde), ==, 1);
ASSERT3U(t, >, DDT_PHYS_DITTO);
ASSERT(rmdde.dde_phys[t].ddp_phys_birth != 0);
ASSERT3U(rmdde.dde_phys[t].ddp_refcnt, ==, 1);
for (enum ddt_phys_type u = 0; u < DDT_PHYS_TYPES; u++) {
if (u != t) {
ASSERT0(rmdde.dde_phys[u].ddp_phys_birth);
ASSERT0(rmdde.dde_phys[u].ddp_refcnt);
}
}
ddt_stat_update(ddt, &rmdde, -1);
VERIFY0(ddt_object_remove(ddt,
DDT_TYPE_CURRENT, DDT_CLASS_UNIQUE,
&rmdde, tx));
}
}
}

for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
uint64_t add, count = 0;
for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
Expand Down Expand Up @@ -1254,8 +1382,6 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
ddb->ddb_type, ddb->ddb_class,
&ddb->ddb_cursor, dde);
}
dde->dde_type = ddb->ddb_type;
dde->dde_class = ddb->ddb_class;
if (error == 0)
return (0);
if (error != ENOENT)
Expand All @@ -1273,4 +1399,7 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
#if defined(_KERNEL)
module_param(zfs_dedup_prefetch, int, 0644);
MODULE_PARM_DESC(zfs_dedup_prefetch, "Enable prefetching dedup-ed blks");

module_param(zfs_unique_ddt_max, ulong, 0644);
MODULE_PARM_DESC(zfs_unique_ddt_max, "Max refcount=1 size");
#endif
Loading

0 comments on commit 9c1dda1

Please sign in to comment.