1
0
Fork 0
mirror of synced 2025-03-06 20:59:54 +01:00

bcachefs fixes for 6.14-rc3

- More fixes for going read-only: the previous fix was insufficient, but
   with more work on ordering journal reclaim flushing (and a btree node
   accounting fix so we don't split until we have to) the
   tiering_replication test now consistently goes read-only in less than
   a second.
 
 - fix for fsck when we have reflink pointers to missing indirect
   extents
 
 - some transaction restart handling fixes from Alan; the "Pass
   _orig_restart_count to trans_was_restarted" likely fixes some rare
   undefined behaviour heisenbugs.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEKnAFLkS8Qha+jvQrE6szbY3KbnYFAmeuOa8ACgkQE6szbY3K
 bnbCog//SqsVcJKqkkSS1mmqnagy9UV7HGY4gXXzo5a2g/3Nkg1ZLgz18G23txaR
 C9O8UXbSeZ6YWwer6brK++vtD22P7CGK94MnD0JNGsm7uqeMhqkNlUn5Z6dXXRsA
 9lPOBCLugC5nJBZYZ9ITTWB4pGbIvv++yoWA4qluTUZ+NakP1X9+fNDBN0BQAEwz
 JCHPkLcL4L9ifwxgoTRJjyTEQx6zTgCqC5UjWrvtvnZgDrfIt2OTR1Y2GKc3WFcf
 3kbbpTjbyycL7b8pfF0x97ZpilKwqBSvWvBqNt9130lk9/CvaROlctqnzwNwqz36
 ISPTKO1dmS+OX4lQNqvSu6ZnFEaKXUK4pPAifQWKNduPBaLB9zeMAR9aGXie+Ta5
 fn0vXKD9KKs9VWe1bwni278u9oN0TmZya3d1Jo+wy2Z0DLI+5j4jobg7rtimLfnJ
 4LMRzmhooTJENE9Zw3qStpnJxbvKA4HJuNTNyp+qDfpaQnF/u9FcRfSET0Hvt99E
 byCAUVEuOscAaStIaaLugTLecK2r5MCxfdO93pGL8yfUhaiJq6KpvXLieoOYOoOP
 7KWneV3/6/0RRw8WcHlnuXBucRrVDpqq48cTpZhb08ig5/P5kuODV0IeeGBH6qVx
 H0tAMNq9d1XdBNRd+jNH/WkJvtkTI6SPOP2mKoGyfVWYyTEHKOw=
 =qdkE
 -----END PGP SIGNATURE-----

Merge tag 'bcachefs-2025-02-12' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "Just small stuff.

  As a general announcement, the on-disk format is now frozen in my master
  branch - future on-disk format changes will be optional, not required.

   - More fixes for going read-only: the previous fix was insufficient,
     but with more work on ordering journal reclaim flushing (and a
     btree node accounting fix so we don't split until we have to) the
     tiering_replication test now consistently goes read-only in less
     than a second.

   - fix for fsck when we have reflink pointers to missing indirect
     extents

   - some transaction restart handling fixes from Alan; the "Pass
     _orig_restart_count to trans_was_restarted" likely fixes some rare
     undefined behaviour heisenbugs"

* tag 'bcachefs-2025-02-12' of git://evilpiepirate.org/bcachefs:
  bcachefs: Reuse transaction
  bcachefs: Pass _orig_restart_count to trans_was_restarted
  bcachefs: CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
  bcachefs: Fix want_new_bset() so we write until the end of the btree node
  bcachefs: Split out journal pins by btree level
  bcachefs: Fix use after free
  bcachefs: Fix marking reflink pointers to missing indirect extents
This commit is contained in:
Linus Torvalds 2025-02-13 11:58:11 -08:00
commit 1854c7f79d
12 changed files with 100 additions and 27 deletions

View file

@ -61,6 +61,13 @@ config BCACHEFS_DEBUG
The resulting code will be significantly slower than normal; you
probably shouldn't select this option unless you're a developer.
config BCACHEFS_INJECT_TRANSACTION_RESTARTS
bool "Randomly inject transaction restarts"
depends on BCACHEFS_DEBUG
help
Randomly inject transaction restarts in a few core paths - may have a
significant performance penalty
config BCACHEFS_TESTS
bool "bcachefs unit and performance tests"
depends on BCACHEFS_FS

View file

@ -2357,6 +2357,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
goto out_no_locked;
}
if (iter->update_path) {
bch2_path_put_nokeep(trans, iter->update_path,
iter->flags & BTREE_ITER_intent);
@ -2622,6 +2628,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
int ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
goto out_no_locked;
}
while (1) {
k = __bch2_btree_iter_peek_prev(iter, search_key);
if (unlikely(!k.k))
@ -2749,6 +2761,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
bch2_btree_iter_verify_entry_exit(iter);
EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret)) {
k = bkey_s_c_err(ret);
goto out_no_locked;
}
/* extents can't span inode numbers: */
if ((iter->flags & BTREE_ITER_is_extents) &&
unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
@ -3106,6 +3124,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (ret)
return ERR_PTR(ret);
struct btree_transaction_stats *s = btree_trans_stats(trans);
s->max_mem = max(s->max_mem, new_bytes);
@ -3163,7 +3185,8 @@ out_new_mem:
if (old_bytes) {
trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
return ERR_PTR(btree_trans_restart_ip(trans,
BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
}
out_change_top:
p = trans->mem + trans->mem_top;
@ -3271,6 +3294,14 @@ u32 bch2_trans_begin(struct btree_trans *trans)
trans->last_begin_ip = _RET_IP_;
#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
if (trans->restarted) {
trans->restart_count_this_trans++;
} else {
trans->restart_count_this_trans = 0;
}
#endif
trans_set_locked(trans, false);
if (trans->restarted) {

View file

@ -355,6 +355,18 @@ static int btree_trans_restart(struct btree_trans *trans, int err)
return btree_trans_restart_ip(trans, err, _THIS_IP_);
}
/*
 * Fault-injection hook for transaction restarts.
 *
 * When CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS is not set this is a
 * no-op that returns 0.
 *
 * When it is set, the low (10 + trans->restart_count_this_trans) bits of
 * ktime_get_ns() - capped at 63 bits - are tested; if they are all zero a
 * restart is injected, so a larger restart_count_this_trans means more bits
 * have to be zero before a restart is injected.
 *
 * @trans: transaction to (possibly) restart
 * @ip:    caller address, forwarded to the tracepoint and to
 *         btree_trans_restart_ip()
 *
 * Returns 0 if no restart was injected, otherwise whatever
 * btree_trans_restart_ip() returns for
 * BCH_ERR_transaction_restart_fault_inject.
 */
static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip)
{
#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
	/* Number of low timestamp bits that must all be clear to inject. */
	unsigned nr_bits = min(63, (10 + trans->restart_count_this_trans));
	u64 low_mask = ~(~0ULL << nr_bits);

	if (ktime_get_ns() & low_mask)
		return 0;

	trace_and_count(trans->c, trans_restart_injected, trans, ip);
	return btree_trans_restart_ip(trans,
				      BCH_ERR_transaction_restart_fault_inject, ip);
#else
	return 0;
#endif
}
bool bch2_btree_node_upgrade(struct btree_trans *,
struct btree_path *, unsigned);
@ -739,7 +751,7 @@ transaction_restart: \
if (!_ret2) \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
\
_ret2 ?: trans_was_restarted(_trans, _restart_count); \
_ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \
})
#define for_each_btree_key_max_continue(_trans, _iter, \

View file

@ -999,6 +999,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
bch2_trans_verify_not_unlocked_or_in_restart(trans);
ret = trans_maybe_inject_restart(trans, _RET_IP_);
if (unlikely(ret))
goto out_reset;
if (!trans->nr_updates &&
!trans->journal_entries_u64s)
goto out_reset;

View file

@ -509,6 +509,9 @@ struct btree_trans {
bool notrace_relock_fail:1;
enum bch_errcode restarted:16;
u32 restart_count;
#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
u32 restart_count_this_trans;
#endif
u64 last_begin_time;
unsigned long last_begin_ip;

View file

@ -278,12 +278,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt
{
struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
(void *) btree_bkey_last(b, t));
ssize_t remaining_space =
__bch2_btree_u64s_remaining(b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
if (b->written + block_sectors(c) <= btree_sectors(c))
return bne;
} else {
if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&

View file

@ -210,11 +210,13 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *
/*
 * Look up accounting position @p in the in-memory accounting table and read
 * its counters.
 *
 * The key array in c->accounting is searched with eytzinger0_find() (using
 * accounting_pos_cmp), and the resulting index is passed, together with @v
 * and @nr, to bch2_accounting_mem_read_counters(). Both the search and the
 * counter read happen with c->mark_lock held for read.
 *
 * @c:  filesystem whose accounting table is consulted
 * @p:  accounting key position to look up
 * @v:  output buffer handed to bch2_accounting_mem_read_counters()
 * @nr: counter count handed to bch2_accounting_mem_read_counters()
 */
static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
					    u64 *v, unsigned nr)
{
	struct bch_accounting_mem *mem;
	unsigned slot;

	percpu_down_read(&c->mark_lock);

	mem = &c->accounting;
	slot = eytzinger0_find(mem->k.data, mem->k.nr, sizeof(mem->k.data[0]),
			       accounting_pos_cmp, &p);
	bch2_accounting_mem_read_counters(mem, slot, v, nr, false);

	percpu_up_read(&c->mark_lock);
}
static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)

View file

@ -411,6 +411,16 @@ void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
__bch2_write_op_error(out, op, op->pos.offset);
}
/*
 * Start a write-error message in @out.
 *
 * First emits the inode/offset prefix via bch2_inum_offset_err_msg_trans()
 * for (op->subvol, op->pos.inode), then appends "write error: ", with
 * "(internal move)" inserted before the colon when BCH_WRITE_MOVE is set in
 * op->flags.
 *
 * @trans:  transaction passed through to bch2_inum_offset_err_msg_trans()
 * @out:    printbuf the message is appended to
 * @op:     write operation the error refers to
 * @offset: shifted left by 9 before being passed on - presumably a
 *          sector-to-byte conversion; confirm against callers
 */
static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
				      struct bch_write_op *op, u64 offset)
{
	subvol_inum inum = { op->subvol, op->pos.inode, };

	bch2_inum_offset_err_msg_trans(trans, out, inum, offset << 9);

	const char *move_tag = "";

	if (op->flags & BCH_WRITE_MOVE)
		move_tag = "(internal move)";

	prt_printf(out, "write error%s: ", move_tag);
}
void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
enum bch_data_type type,
const struct bkey_i *k,
@ -1193,7 +1203,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
struct printbuf buf = PRINTBUF;
__bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k));
prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
bch_err_ratelimited(c, "%s", buf.buf);
printbuf_exit(&buf);

View file

@ -384,12 +384,16 @@ void bch2_journal_pin_drop(struct journal *j,
spin_unlock(&j->lock);
}
static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
journal_pin_flush_fn fn)
{
if (fn == bch2_btree_node_flush0 ||
fn == bch2_btree_node_flush1)
return JOURNAL_PIN_TYPE_btree;
else if (fn == bch2_btree_key_cache_journal_flush)
fn == bch2_btree_node_flush1) {
unsigned idx = fn == bch2_btree_node_flush1;
struct btree *b = container_of(pin, struct btree, writes[idx].journal);
return JOURNAL_PIN_TYPE_btree0 - b->c.level;
} else if (fn == bch2_btree_key_cache_journal_flush)
return JOURNAL_PIN_TYPE_key_cache;
else
return JOURNAL_PIN_TYPE_other;
@ -441,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j,
bool reclaim = __journal_pin_drop(j, dst);
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@ -465,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
bool reclaim = __journal_pin_drop(j, pin);
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@ -587,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j,
spin_lock(&j->lock);
/* Pin might have been dropped or rearmed: */
if (likely(!err && !j->flush_in_progress_dropped))
list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
j->flush_in_progress = NULL;
j->flush_in_progress_dropped = false;
spin_unlock(&j->lock);
@ -869,18 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
mutex_lock(&j->reclaim_lock);
if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
BIT(JOURNAL_PIN_TYPE_key_cache)|
BIT(JOURNAL_PIN_TYPE_other))) {
*did_work = true;
goto unlock;
}
if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
BIT(JOURNAL_PIN_TYPE_btree))) {
*did_work = true;
goto unlock;
}
for (int type = JOURNAL_PIN_TYPE_NR - 1;
type >= 0;
--type)
if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
*did_work = true;
goto unlock;
}
if (seq_to_flush > journal_cur_seq(j))
bch2_journal_entry_close(j);

View file

@ -53,7 +53,10 @@ struct journal_buf {
*/
enum journal_pin_type {
JOURNAL_PIN_TYPE_btree,
JOURNAL_PIN_TYPE_btree3,
JOURNAL_PIN_TYPE_btree2,
JOURNAL_PIN_TYPE_btree1,
JOURNAL_PIN_TYPE_btree0,
JOURNAL_PIN_TYPE_key_cache,
JOURNAL_PIN_TYPE_other,
JOURNAL_PIN_TYPE_NR,

View file

@ -381,6 +381,8 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
not_found:
if (flags & BTREE_TRIGGER_check_repair) {
ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false);
if (ret == -BCH_ERR_missing_indirect_extent)
ret = 0;
if (ret)
goto err;
}

View file

@ -180,9 +180,9 @@ enum bch_fsck_flags {
x(ptr_crc_nonce_mismatch, 162, 0) \
x(ptr_stripe_redundant, 163, 0) \
x(reservation_key_nr_replicas_invalid, 164, 0) \
x(reflink_v_refcount_wrong, 165, 0) \
x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
x(reflink_v_pos_bad, 292, 0) \
x(reflink_p_to_missing_reflink_v, 166, 0) \
x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \
x(reflink_refcount_underflow, 293, 0) \
x(stripe_pos_bad, 167, 0) \
x(stripe_val_size_bad, 168, 0) \