Skip to content

Commit

Permalink
Merge tag 'bcachefs-2024-04-10' of https://evilpiepirate.org/git/bcac…
Browse files Browse the repository at this point in the history
…hefs

Pull more bcachefs fixes from Kent Overstreet:
 "Notable user impacting bugs

   - On multi device filesystems, recovery was looping in
     btree_trans_too_many_iters(). This checks if a transaction has
     touched too many btree paths (because of iteration over many keys),
     and issues a restart to drop unneeded paths.

     But it's now possible for some paths to exceed the previous limit
     without iteration in the interior btree update path, since the
     transaction commit will do alloc updates for every old and new
     btree node, and during journal replay we don't use the btree write
     buffer for locking reasons and thus those updates use btree paths
     when they wouldn't normally.

   - Fix a corner case in rebalance when moving extents on a
     durability=0 device. This wouldn't be hit when a device was
     formatted with durability=0 since in that case we'll only use it as
     a write through cache (only cached extents will live on it), but
     durability can now be changed on an existing device.

   - bch2_get_acl() could rarely forget to handle a transaction restart;
     this manifested as the occasional missing acl that came back after
     dropping caches.

   - Fix a major performance regression on high iops multithreaded write
     workloads (only since 6.9-rc1); a previous fix for a deadlock in
     the interior btree update path to check the journal watermark
     introduced a dependency on the state of btree write buffer flushing
     that we didn't want.

   - Assorted other repair paths and recovery fixes"

* tag 'bcachefs-2024-04-10' of https://evilpiepirate.org/git/bcachefs: (25 commits)
  bcachefs: Fix __bch2_btree_and_journal_iter_init_node_iter()
  bcachefs: Kill read lock dropping in bch2_btree_node_lock_write_nofail()
  bcachefs: Fix a race in btree_update_nodes_written()
  bcachefs: btree_node_scan: Respect member.data_allowed
  bcachefs: Don't scan for btree nodes when we can reconstruct
  bcachefs: Fix check_topology() when using node scan
  bcachefs: fix eytzinger0_find_gt()
  bcachefs: fix bch2_get_acl() transaction restart handling
  bcachefs: fix the count of nr_freed_pcpu after changing bc->freed_nonpcpu list
  bcachefs: Fix gap buffer bug in bch2_journal_key_insert_take()
  bcachefs: Rename struct field swap to prevent macro naming collision
  MAINTAINERS: Add entry for bcachefs documentation
  Documentation: filesystems: Add bcachefs toctree
  bcachefs: JOURNAL_SPACE_LOW
  bcachefs: Disable errors=panic for BCH_IOCTL_FSCK_OFFLINE
  bcachefs: Fix BCH_IOCTL_FSCK_OFFLINE for encrypted filesystems
  bcachefs: fix rand_delete unit test
  bcachefs: fix ! vs ~ typo in __clear_bit_le64()
  bcachefs: Fix rebalance from durability=0 device
  bcachefs: Print shutdown journal sequence number
  ...
  • Loading branch information
torvalds committed Apr 11, 2024
2 parents 346668f + 1189bdd commit e1dc191
Show file tree
Hide file tree
Showing 27 changed files with 372 additions and 238 deletions.
11 changes: 11 additions & 0 deletions Documentation/filesystems/bcachefs/index.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
.. SPDX-License-Identifier: GPL-2.0
======================
bcachefs Documentation
======================

.. toctree::
:maxdepth: 2
:numbered:

errorcodes
1 change: 1 addition & 0 deletions Documentation/filesystems/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ Documentation for filesystem implementations.
afs
autofs
autofs-mount-control
bcachefs/index
befs
bfs
btrfs
Expand Down
1 change: 1 addition & 0 deletions MAINTAINERS
Original file line number Diff line number Diff line change
Expand Up @@ -3573,6 +3573,7 @@ S: Supported
C: irc://irc.oftc.net/bcache
T: git https://evilpiepirate.org/git/bcachefs.git
F: fs/bcachefs/
F: Documentation/filesystems/bcachefs/

BDISP ST MEDIA DRIVER
M: Fabien Dessenne <fabien.dessenne@foss.st.com>
Expand Down
30 changes: 14 additions & 16 deletions fs/bcachefs/acl.c
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,6 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
struct bkey_s_c k;
int ret;
Expand All @@ -290,28 +289,27 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,

ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
&hash, inode_inum(inode), &search, 0);
if (ret) {
if (!bch2_err_matches(ret, ENOENT))
acl = ERR_PTR(ret);
goto out;
}
if (ret)
goto err;

k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret) {
acl = ERR_PTR(ret);
goto out;
}
if (ret)
goto err;

xattr = bkey_s_c_to_xattr(k);
struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
le16_to_cpu(xattr.v->x_val_len));
ret = PTR_ERR_OR_ZERO(acl);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;

if (!IS_ERR(acl))
if (ret)
acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;

if (!IS_ERR_OR_NULL(acl))
set_cached_acl(&inode->v, type, acl);
out:
if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
goto retry;

bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
Expand Down
14 changes: 14 additions & 0 deletions fs/bcachefs/bcachefs_format.h
Original file line number Diff line number Diff line change
Expand Up @@ -1535,6 +1535,20 @@ enum btree_id {
BTREE_ID_NR
};

/*
 * Returns true if @id is one of the alloc-related btrees: allocation info,
 * backpointers, need-discard/freespace tracking, or bucket generations.
 */
static inline bool btree_id_is_alloc(enum btree_id id)
{
	return id == BTREE_ID_alloc		||
	       id == BTREE_ID_backpointers	||
	       id == BTREE_ID_need_discard	||
	       id == BTREE_ID_freespace		||
	       id == BTREE_ID_bucket_gens;
}

#define BTREE_MAX_DEPTH 4U

/* Btree nodes */
Expand Down
13 changes: 9 additions & 4 deletions fs/bcachefs/btree_gc.c
Original file line number Diff line number Diff line change
Expand Up @@ -368,11 +368,16 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
cur = NULL;
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?:
bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
if (ret)
break;

if (!btree_id_is_alloc(b->c.btree_id)) {
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
if (ret)
break;
}
continue;
}

Expand Down Expand Up @@ -544,12 +549,12 @@ int bch2_check_topology(struct bch_fs *c)
bch2_btree_root_alloc_fake(c, i, 0);
} else {
bch2_btree_root_alloc_fake(c, i, 1);
bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
if (ret)
break;
}

bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
reconstructed_root = true;
}

Expand Down
2 changes: 1 addition & 1 deletion fs/bcachefs/btree_iter.h
Original file line number Diff line number Diff line change
Expand Up @@ -642,7 +642,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *);

static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
return __bch2_btree_trans_too_many_iters(trans);

return 0;
Expand Down
67 changes: 52 additions & 15 deletions fs/bcachefs/btree_journal_iter.c
Original file line number Diff line number Diff line change
Expand Up @@ -130,22 +130,45 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}

/*
 * Sanity-check a journal iterator against the gap buffer backing
 * journal_keys: the iterator must never point into the gap, and the entry
 * it points at (if any) must not sort before the iterator's own
 * (btree_id, level) position.
 */
static void journal_iter_verify(struct journal_iter *iter)
{
	struct journal_keys *keys = iter->keys;
	/* keys is a gap buffer; the unused gap spans [gap, gap + gap_size) */
	size_t gap_size = keys->size - keys->nr;

	/* iterator index must not land inside the gap */
	BUG_ON(iter->idx >= keys->gap &&
	       iter->idx < keys->gap + gap_size);

	if (iter->idx < keys->size) {
		struct journal_key *k = keys->data + iter->idx;

		/* entry must be at or after the iterator's (btree_id, level) */
		int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
			cmp_int(k->level, iter->level);
		BUG_ON(cmp < 0);
	}
}

static void journal_iters_fix(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
/* The key we just inserted is immediately before the gap: */
size_t gap_end = keys->gap + (keys->size - keys->nr);
struct btree_and_journal_iter *iter;
struct journal_key *new_key = &keys->data[keys->gap - 1];
struct journal_iter *iter;

/*
* If an iterator points one after the key we just inserted, decrement
* the iterator so it points at the key we just inserted - if the
* decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
* handle that:
*/
list_for_each_entry(iter, &c->journal_iters, journal.list)
if (iter->journal.idx == gap_end)
iter->journal.idx = keys->gap - 1;
list_for_each_entry(iter, &c->journal_iters, list) {
journal_iter_verify(iter);
if (iter->idx == gap_end &&
new_key->btree_id == iter->btree_id &&
new_key->level == iter->level)
iter->idx = keys->gap - 1;
journal_iter_verify(iter);
}
}

static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
Expand Down Expand Up @@ -192,7 +215,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
if (idx > keys->gap)
idx -= keys->size - keys->nr;

size_t old_gap = keys->gap;

if (keys->nr == keys->size) {
journal_iters_move_gap(c, old_gap, keys->size);
old_gap = keys->size;

struct journal_keys new_keys = {
.nr = keys->nr,
.size = max_t(size_t, keys->size, 8) * 2,
Expand All @@ -216,7 +244,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
keys->gap = keys->nr;
}

journal_iters_move_gap(c, keys->gap, idx);
journal_iters_move_gap(c, old_gap, idx);

move_gap(keys, idx);

Expand Down Expand Up @@ -301,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)

static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
struct journal_key *k = iter->keys->data + iter->idx;
journal_iter_verify(iter);

while (iter->idx < iter->keys->size) {
struct journal_key *k = iter->keys->data + iter->idx;

int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
cmp_int(k->level, iter->level);
if (cmp > 0)
break;
BUG_ON(cmp);

while (k < iter->keys->data + iter->keys->size &&
k->btree_id == iter->btree_id &&
k->level == iter->level) {
if (!k->overwritten)
return bkey_i_to_s_c(k->k);

bch2_journal_iter_advance(iter);
k = iter->keys->data + iter->idx;
}

return bkey_s_c_null;
Expand All @@ -330,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->level = level;
iter->keys = &c->journal_keys;
iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);

journal_iter_verify(iter);
}

static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
Expand Down Expand Up @@ -434,10 +469,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
INIT_LIST_HEAD(&iter->journal.list);
iter->pos = b->data->min_key;
iter->at_end = false;
INIT_LIST_HEAD(&iter->journal.list);

if (trans->journal_replay_not_finished) {
bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
list_add(&iter->journal.list, &trans->c->journal_iters);
}
}

/*
Expand All @@ -452,9 +492,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,

bch2_btree_node_iter_init_from_start(&node_iter, b);
__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
if (trans->journal_replay_not_finished &&
!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
list_add(&iter->journal.list, &trans->c->journal_iters);
}

/* sort and dedup all keys in the journal: */
Expand Down
4 changes: 3 additions & 1 deletion fs/bcachefs/btree_key_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
} else {
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_pcpu);
bc->nr_freed_pcpu++;
mutex_unlock(&bc->lock);
}
}
Expand Down Expand Up @@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_pcpu)) {
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_pcpu--;
}
mutex_unlock(&bc->lock);
}
Expand Down Expand Up @@ -659,7 +661,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
commit_flags |= BCH_WATERMARK_reclaim;

if (ck->journal.seq != journal_last_seq(j) ||
j->watermark == BCH_WATERMARK_stripe)
!test_bit(JOURNAL_SPACE_LOW, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;

ret = bch2_btree_iter_traverse(&b_iter) ?:
Expand Down
28 changes: 1 addition & 27 deletions fs/bcachefs/btree_locking.c
Original file line number Diff line number Diff line change
Expand Up @@ -440,33 +440,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
struct btree_path *linked;
unsigned i, iter;
int ret;

/*
* XXX BIG FAT NOTICE
*
* Drop all read locks before taking a write lock:
*
* This is a hack, because bch2_btree_node_lock_write_nofail() is a
* hack - but by dropping read locks first, this should never fail, and
* we only use this in code paths where whatever read locks we've
* already taken are no longer needed:
*/

trans_for_each_path(trans, linked, iter) {
if (!linked->nodes_locked)
continue;

for (i = 0; i < BTREE_MAX_DEPTH; i++)
if (btree_node_read_locked(linked, i)) {
btree_node_unlock(trans, linked, i);
btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
}
}

ret = __btree_node_lock_write(trans, path, b, true);
int ret = __btree_node_lock_write(trans, path, b, true);
BUG_ON(ret);
}

Expand Down
11 changes: 10 additions & 1 deletion fs/bcachefs/btree_node_scan.c
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
if (le64_to_cpu(bn->magic) != bset_magic(c))
return;

if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
return;

rcu_read_lock();
struct found_btree_node n = {
.btree_id = BTREE_NODE_ID(bn),
Expand Down Expand Up @@ -213,6 +216,9 @@ static int read_btree_nodes(struct find_btree_nodes *f)
closure_init_stack(&cl);

for_each_online_member(c, ca) {
if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
continue;

struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
struct task_struct *t;

Expand Down Expand Up @@ -290,7 +296,7 @@ static int handle_overwrites(struct bch_fs *c,
found_btree_node_to_text(&buf, c, n);
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
return -1;
return -BCH_ERR_fsck_repair_unimplemented;
}
}

Expand Down Expand Up @@ -436,6 +442,9 @@ bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos node_min, struct bpos node_max)
{
if (btree_id_is_alloc(btree))
return 0;

struct find_btree_nodes *f = &c->found_btree_nodes;

int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
Expand Down
14 changes: 14 additions & 0 deletions fs/bcachefs/btree_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -364,7 +364,21 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};

/* Number of btree paths we preallocate, usually enough */
#define BTREE_ITER_INITIAL 64
/*
 * Limit for btree_trans_too_many_iters(); this is enough that almost all code
* paths should run inside this limit, and if they don't it usually indicates a
* bug (leaking/duplicated btree paths).
*
* exception: some fsck paths
*
* bugs with excessive path usage seem to have possibly been eliminated now, so
 * we might consider eliminating this (and btree_trans_too_many_iters()) at some
* point.
*/
#define BTREE_ITER_NORMAL_LIMIT 256
/* never exceed limit */
#define BTREE_ITER_MAX (1U << 10)

struct btree_trans_commit_hook;
Expand Down
Loading

0 comments on commit e1dc191

Please sign in to comment.