Skip to content

Commit

Permalink
Reduce stack for traverse_visitbp() recursion
Browse files Browse the repository at this point in the history
During pool import stack overflows may still occur due to the
potentially deep recursion of traverse_visitbp().  This is most
likely to occur when additional layers are added to the block
device stack such as DM multipath.  To minimize the stack usage
for this call path the following changes were made:

1) Added the keywork 'noinline' to the vdev_*_map_alloc() functions
   to prevent them from being inlined by gcc.  This reduced the
   stack usage of vdev_raidz_io_start() from 208 to 128 bytes, and
   vdev_mirror_io_start() from 144 to 128 bytes.

2) The 'saved_poolname' charater array in zfsdev_ioctl() was moved
   from the stack to the heap.  This reduced the stack usage of
   zfsdev_ioctl() from 368 to 112 bytes.

3) The major saving came from slimming down traverse_visitbp() from
   from 224 to 144 bytes.  Since this function is called recursively
   the 80 bytes saved per invokation adds up.  The following changes
   were made:

  a) The 'hard' local variable was replaced by a TD_HARD() macro.

  b) The 'pd' local variable was replaced by 'td->td_pfd' references.

  c) The zbookmark_t was moved to the heap.  This does cost us an
     additional memory allocation per recursion by that cost should
     still be minimal.  The cost could be further reduced by adding
     a dedicated zbookmark_t slab cache.

  d) The variable declarations in 'if (BP_GET_LEVEL()) { }' were
     restructured to use the minimum amount of stack.  This includes
     removing the 'cbp' local variable.

Overall for the offending use case roughly 1584 of total stack space
has been saved.  This is enough to avoid overflowing the stack on
stock kernels with 8k stacks.  See #1778 for additional details.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Ned Bass <bass6@llnl.gov>
Closes #1778
  • Loading branch information
behlendorf committed Nov 14, 2013
1 parent 65c67ea commit a168788
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 42 deletions.
77 changes: 41 additions & 36 deletions module/zfs/dmu_traverse.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,8 @@ typedef struct traverse_data {
void *td_arg;
} traverse_data_t;

#define TD_HARD(td) (td->td_flags & TRAVERSE_HARD)

static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
uint64_t objset, uint64_t object);
static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
Expand Down Expand Up @@ -208,11 +210,8 @@ static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
const blkptr_t *bp, const zbookmark_t *zb)
{
zbookmark_t czb;
int err = 0, lasterr = 0;
arc_buf_t *buf = NULL;
prefetch_data_t *pd = td->td_pfd;
boolean_t hard = td->td_flags & TRAVERSE_HARD;
boolean_t pause = B_FALSE;

switch (resume_skip_check(td, dnp, zb)) {
Expand All @@ -234,16 +233,17 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
if (bp->blk_birth <= td->td_min_txg)
return (0);

if (pd && !pd->pd_exited &&
((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
if (td->td_pfd && !td->td_pfd->pd_exited &&
((td->td_pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
mutex_enter(&pd->pd_mtx);
ASSERT(pd->pd_blks_fetched >= 0);
while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
cv_wait(&pd->pd_cv, &pd->pd_mtx);
pd->pd_blks_fetched--;
cv_broadcast(&pd->pd_cv);
mutex_exit(&pd->pd_mtx);
mutex_enter(&td->td_pfd->pd_mtx);
ASSERT(td->td_pfd->pd_blks_fetched >= 0);
while (td->td_pfd->pd_blks_fetched == 0 &&
!td->td_pfd->pd_exited)
cv_wait(&td->td_pfd->pd_cv, &td->td_pfd->pd_mtx);
td->td_pfd->pd_blks_fetched--;
cv_broadcast(&td->td_pfd->pd_cv);
mutex_exit(&td->td_pfd->pd_mtx);
}

if (td->td_flags & TRAVERSE_PRE) {
Expand All @@ -259,39 +259,45 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,

if (BP_GET_LEVEL(bp) > 0) {
uint32_t flags = ARC_WAIT;
int i;
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
int32_t i;
int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
zbookmark_t *czb;

err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
return (err);
cbp = buf->b_data;

czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE);

for (i = 0; i < epb; i++) {
SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
traverse_prefetch_metadata(td, &cbp[i], &czb);
traverse_prefetch_metadata(td,
&((blkptr_t *)buf->b_data)[i], czb);
}

/* recursively visitbp() blocks below this */
for (i = 0; i < epb; i++) {
SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
err = traverse_visitbp(td, dnp, &cbp[i], &czb);
err = traverse_visitbp(td, dnp,
&((blkptr_t *)buf->b_data)[i], czb);
if (err != 0) {
if (!hard)
if (!TD_HARD(td))
break;
lasterr = err;
}
}

kmem_free(czb, sizeof (zbookmark_t));

} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
uint32_t flags = ARC_WAIT;
int i;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
int32_t i;
int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
Expand All @@ -309,7 +315,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = traverse_dnode(td, &dnp[i], zb->zb_objset,
zb->zb_blkid * epb + i);
if (err != 0) {
if (!hard)
if (!TD_HARD(td))
break;
lasterr = err;
}
Expand Down Expand Up @@ -337,7 +343,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,

err = traverse_dnode(td, dnp, zb->zb_objset,
DMU_META_DNODE_OBJECT);
if (err && hard) {
if (err && TD_HARD(td)) {
lasterr = err;
err = 0;
}
Expand All @@ -346,7 +352,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
err = traverse_dnode(td, dnp, zb->zb_objset,
DMU_USERUSED_OBJECT);
}
if (err && hard) {
if (err && TD_HARD(td)) {
lasterr = err;
err = 0;
}
Expand All @@ -369,7 +375,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,

if (pause && td->td_resume != NULL) {
ASSERT3U(err, ==, ERESTART);
ASSERT(!hard);
ASSERT(!TD_HARD(td));
traverse_pause(td, zb);
}

Expand Down Expand Up @@ -400,13 +406,12 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
{
int j, err = 0, lasterr = 0;
zbookmark_t czb;
boolean_t hard = (td->td_flags & TRAVERSE_HARD);

for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
if (err != 0) {
if (!hard)
if (!TD_HARD(td))
break;
lasterr = err;
}
Expand All @@ -416,7 +421,7 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
if (err != 0) {
if (!hard)
if (!TD_HARD(td))
return (err);
lasterr = err;
}
Expand Down Expand Up @@ -498,9 +503,9 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
*/
ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA));

td = kmem_alloc(sizeof(traverse_data_t), KM_PUSHPAGE);
pd = kmem_zalloc(sizeof(prefetch_data_t), KM_PUSHPAGE);
czb = kmem_alloc(sizeof(zbookmark_t), KM_PUSHPAGE);
td = kmem_alloc(sizeof (traverse_data_t), KM_PUSHPAGE);
pd = kmem_zalloc(sizeof (prefetch_data_t), KM_PUSHPAGE);
czb = kmem_alloc(sizeof (zbookmark_t), KM_PUSHPAGE);

td->td_spa = spa;
td->td_objset = objset;
Expand Down Expand Up @@ -554,9 +559,9 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
mutex_destroy(&pd->pd_mtx);
cv_destroy(&pd->pd_cv);

kmem_free(czb, sizeof(zbookmark_t));
kmem_free(pd, sizeof(struct prefetch_data));
kmem_free(td, sizeof(struct traverse_data));
kmem_free(czb, sizeof (zbookmark_t));
kmem_free(pd, sizeof (struct prefetch_data));
kmem_free(td, sizeof (struct traverse_data));

return (err);
}
Expand Down
14 changes: 10 additions & 4 deletions module/zfs/vdev_mirror.c
Original file line number Diff line number Diff line change
Expand Up @@ -89,10 +89,14 @@ static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
static int
vdev_mirror_pending(vdev_t *vd)
{
return avl_numnodes(&vd->vdev_queue.vq_pending_tree);
return (avl_numnodes(&vd->vdev_queue.vq_pending_tree));
}

static mirror_map_t *
/*
* Avoid inlining the function to keep vdev_mirror_io_start(), which
* is this functions only caller, as small as possible on the stack.
*/
noinline static mirror_map_t *
vdev_mirror_map_alloc(zio_t *zio)
{
mirror_map_t *mm = NULL;
Expand All @@ -106,7 +110,8 @@ vdev_mirror_map_alloc(zio_t *zio)

c = BP_GET_NDVAS(zio->io_bp);

mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE);
mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]),
KM_PUSHPAGE);
mm->mm_children = c;
mm->mm_replacing = B_FALSE;
mm->mm_preferred = spa_get_random(c);
Expand Down Expand Up @@ -136,7 +141,8 @@ vdev_mirror_map_alloc(zio_t *zio)

c = vd->vdev_children;

mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE);
mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]),
KM_PUSHPAGE);
mm->mm_children = c;
mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
vd->vdev_ops == &vdev_spare_ops);
Expand Down
5 changes: 4 additions & 1 deletion module/zfs/vdev_raidz.c
Original file line number Diff line number Diff line change
Expand Up @@ -433,8 +433,11 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
/*
* Divides the IO evenly across all child vdevs; usually, dcols is
* the number of children in the target vdev.
*
* Avoid inlining the function to keep vdev_raidz_io_start(), which
* is this functions only caller, as small as possible on the stack.
*/
static raidz_map_t *
noinline static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
uint64_t nparity)
{
Expand Down
4 changes: 3 additions & 1 deletion module/zfs/zfs_ioctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -5590,7 +5590,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
uint_t vecnum;
int error, rc, len, flag = 0;
const zfs_ioc_vec_t *vec;
char saved_poolname[MAXNAMELEN];
char *saved_poolname;
nvlist_t *innvl = NULL;

vecnum = cmd - ZFS_IOC_FIRST;
Expand All @@ -5599,6 +5599,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
vec = &zfs_ioc_vec[vecnum];

zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP | KM_NODEBUG);
saved_poolname = kmem_alloc(MAXNAMELEN, KM_SLEEP);

error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
if (error != 0) {
Expand Down Expand Up @@ -5718,6 +5719,7 @@ zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
(void) tsd_set(zfs_allow_log_key, strdup(saved_poolname));
}

kmem_free(saved_poolname, MAXNAMELEN);
kmem_free(zc, sizeof (zfs_cmd_t));
return (-error);
}
Expand Down

0 comments on commit a168788

Please sign in to comment.