bcachefs fixes for 6.14-rc3
- More fixes for going read-only: the previous fix was insufficient, but
  with more work on ordering journal reclaim flushing (and a btree node
  accounting fix so we don't split until we have to) the
  tiering_replication test now consistently goes read-only in less than
  a second.

- fix for fsck when we have reflink pointers to missing indirect extents

- some transaction restart handling fixes from Alan; the "Pass
  _orig_restart_count to trans_was_restarted" likely fixes some rare
  undefined behaviour heisenbugs.

-----BEGIN PGP SIGNATURE-----

iQIzBAABCgAdFiEEKnAFLkS8Qha+jvQrE6szbY3KbnYFAmeuOa8ACgkQE6szbY3K
bnbCog//SqsVcJKqkkSS1mmqnagy9UV7HGY4gXXzo5a2g/3Nkg1ZLgz18G23txaR
C9O8UXbSeZ6YWwer6brK++vtD22P7CGK94MnD0JNGsm7uqeMhqkNlUn5Z6dXXRsA
9lPOBCLugC5nJBZYZ9ITTWB4pGbIvv++yoWA4qluTUZ+NakP1X9+fNDBN0BQAEwz
JCHPkLcL4L9ifwxgoTRJjyTEQx6zTgCqC5UjWrvtvnZgDrfIt2OTR1Y2GKc3WFcf
3kbbpTjbyycL7b8pfF0x97ZpilKwqBSvWvBqNt9130lk9/CvaROlctqnzwNwqz36
ISPTKO1dmS+OX4lQNqvSu6ZnFEaKXUK4pPAifQWKNduPBaLB9zeMAR9aGXie+Ta5
fn0vXKD9KKs9VWe1bwni278u9oN0TmZya3d1Jo+wy2Z0DLI+5j4jobg7rtimLfnJ
4LMRzmhooTJENE9Zw3qStpnJxbvKA4HJuNTNyp+qDfpaQnF/u9FcRfSET0Hvt99E
byCAUVEuOscAaStIaaLugTLecK2r5MCxfdO93pGL8yfUhaiJq6KpvXLieoOYOoOP
7KWneV3/6/0RRw8WcHlnuXBucRrVDpqq48cTpZhb08ig5/P5kuODV0IeeGBH6qVx
H0tAMNq9d1XdBNRd+jNH/WkJvtkTI6SPOP2mKoGyfVWYyTEHKOw=
=qdkE
-----END PGP SIGNATURE-----

Merge tag 'bcachefs-2025-02-12' of git://evilpiepirate.org/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "Just small stuff.

  As a general announcement, on disk format is now frozen in my master
  branch - future on disk format changes will be optional, not required.

  - More fixes for going read-only: the previous fix was insufficient,
    but with more work on ordering journal reclaim flushing (and a btree
    node accounting fix so we don't split until we have to) the
    tiering_replication test now consistently goes read-only in less
    than a second.

  - fix for fsck when we have reflink pointers to missing indirect
    extents

  - some transaction restart handling fixes from Alan; the "Pass
    _orig_restart_count to trans_was_restarted" likely fixes some rare
    undefined behaviour heisenbugs"

* tag 'bcachefs-2025-02-12' of git://evilpiepirate.org/bcachefs:
  bcachefs: Reuse transaction
  bcachefs: Pass _orig_restart_count to trans_was_restarted
  bcachefs: CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
  bcachefs: Fix want_new_bset() so we write until the end of the btree node
  bcachefs: Split out journal pins by btree level
  bcachefs: Fix use after free
  bcachefs: Fix marking reflink pointers to missing indirect extents
commit 1854c7f79d
12 changed files with 100 additions and 27 deletions
@@ -61,6 +61,13 @@ config BCACHEFS_DEBUG
 	  The resulting code will be significantly slower than normal; you
 	  probably shouldn't select this option unless you're a developer.
 
+config BCACHEFS_INJECT_TRANSACTION_RESTARTS
+	bool "Randomly inject transaction restarts"
+	depends on BCACHEFS_DEBUG
+	help
+	  Randomly inject transaction restarts in a few core paths - may have a
+	  significant performance penalty
+
 config BCACHEFS_TESTS
 	bool "bcachefs unit and performance tests"
 	depends on BCACHEFS_FS
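For reference, the new option only shows up once debugging support is enabled; a minimal, purely illustrative config fragment to exercise the restart injection (symbol names taken from the hunk above) would be:

# illustrative .config fragment
CONFIG_BCACHEFS_FS=y
CONFIG_BCACHEFS_DEBUG=y
CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS=y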
@@ -2357,6 +2357,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en
 	bch2_btree_iter_verify_entry_exit(iter);
 	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX));
 
+	ret = trans_maybe_inject_restart(trans, _RET_IP_);
+	if (unlikely(ret)) {
+		k = bkey_s_c_err(ret);
+		goto out_no_locked;
+	}
+
 	if (iter->update_path) {
 		bch2_path_put_nokeep(trans, iter->update_path,
 				     iter->flags & BTREE_ITER_intent);
@@ -2622,6 +2628,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp
 	bch2_btree_iter_verify_entry_exit(iter);
 	EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN));
 
+	int ret = trans_maybe_inject_restart(trans, _RET_IP_);
+	if (unlikely(ret)) {
+		k = bkey_s_c_err(ret);
+		goto out_no_locked;
+	}
+
 	while (1) {
 		k = __bch2_btree_iter_peek_prev(iter, search_key);
 		if (unlikely(!k.k))
@@ -2749,6 +2761,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 	bch2_btree_iter_verify_entry_exit(iter);
 	EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache));
 
+	ret = trans_maybe_inject_restart(trans, _RET_IP_);
+	if (unlikely(ret)) {
+		k = bkey_s_c_err(ret);
+		goto out_no_locked;
+	}
+
 	/* extents can't span inode numbers: */
 	if ((iter->flags & BTREE_ITER_is_extents) &&
 	    unlikely(iter->pos.offset == KEY_OFFSET_MAX)) {
@@ -3106,6 +3124,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 
 	WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
 
+	ret = trans_maybe_inject_restart(trans, _RET_IP_);
+	if (ret)
+		return ERR_PTR(ret);
+
 	struct btree_transaction_stats *s = btree_trans_stats(trans);
 	s->max_mem = max(s->max_mem, new_bytes);
 
@@ -3163,7 +3185,8 @@ out_new_mem:
 
 	if (old_bytes) {
 		trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
-		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
+		return ERR_PTR(btree_trans_restart_ip(trans,
+					BCH_ERR_transaction_restart_mem_realloced, _RET_IP_));
 	}
 out_change_top:
 	p = trans->mem + trans->mem_top;
@@ -3271,6 +3294,14 @@ u32 bch2_trans_begin(struct btree_trans *trans)
 
 	trans->last_begin_ip = _RET_IP_;
 
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+	if (trans->restarted) {
+		trans->restart_count_this_trans++;
+	} else {
+		trans->restart_count_this_trans = 0;
+	}
+#endif
+
 	trans_set_locked(trans, false);
 
 	if (trans->restarted) {
@@ -355,6 +355,18 @@ static int btree_trans_restart(struct btree_trans *trans, int err)
 	return btree_trans_restart_ip(trans, err, _THIS_IP_);
 }
 
+static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip)
+{
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+	if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) {
+		trace_and_count(trans->c, trans_restart_injected, trans, ip);
+		return btree_trans_restart_ip(trans,
+					      BCH_ERR_transaction_restart_fault_inject, ip);
+	}
+#endif
+	return 0;
+}
+
 bool bch2_btree_node_upgrade(struct btree_trans *,
 			     struct btree_path *, unsigned);
 
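The injection predicate above fires when the low bits of ktime_get_ns() are all zero, and it requires one more zero bit for every restart the current transaction has already taken. A minimal user-space sketch (illustrative only, not part of the patch) of the resulting injection rate:

#include <stdio.h>

int main(void)
{
	for (unsigned restarts = 0; restarts <= 5; restarts++) {
		/* same mask shape as the patch: low (10 + restarts) bits set */
		unsigned k = 10 + restarts;
		unsigned long long mask = ~(~0ULL << k);

		/* the condition is true when (now & mask) == 0: roughly 1 in 2^k calls */
		printf("restarts=%u -> inject ~1 in %llu calls\n", restarts, mask + 1);
	}
	return 0;
}

So a transaction that keeps restarting is injected into exponentially less often, which keeps the fault injection from livelocking any single transaction.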
@@ -739,7 +751,7 @@ transaction_restart: \
 	if (!_ret2) \
 		bch2_trans_verify_not_restarted(_trans, _restart_count);\
 \
-	_ret2 ?: trans_was_restarted(_trans, _restart_count); \
+	_ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \
 })
 
 #define for_each_btree_key_max_continue(_trans, _iter, \
@@ -999,6 +999,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
 
 	bch2_trans_verify_not_unlocked_or_in_restart(trans);
 
+	ret = trans_maybe_inject_restart(trans, _RET_IP_);
+	if (unlikely(ret))
+		goto out_reset;
+
 	if (!trans->nr_updates &&
 	    !trans->journal_entries_u64s)
 		goto out_reset;
@@ -509,6 +509,9 @@ struct btree_trans {
 	bool			notrace_relock_fail:1;
 	enum bch_errcode	restarted:16;
 	u32			restart_count;
+#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS
+	u32			restart_count_this_trans;
+#endif
 
 	u64			last_begin_time;
 	unsigned long		last_begin_ip;
@@ -278,12 +278,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt
 {
 	struct bset_tree *t = bset_tree_last(b);
 	struct btree_node_entry *bne = max(write_block(b),
-			(void *) btree_bkey_last(b, bset_tree_last(b)));
+					   (void *) btree_bkey_last(b, t));
 	ssize_t remaining_space =
 		__bch2_btree_u64s_remaining(b, bne->keys.start);
 
 	if (unlikely(bset_written(b, bset(b, t)))) {
-		if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
+		if (b->written + block_sectors(c) <= btree_sectors(c))
 			return bne;
 	} else {
 		if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) &&
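The replaced check refused a new bset once less than a block's worth of u64s remained, which could force a node split before the node was actually full; the new check only refuses once another block no longer fits in the node, matching the "write until the end of the btree node" fix in the pull message. A rough worked example of the new condition, assuming 4KiB blocks and 256KiB btree nodes (geometry chosen for illustration, not taken from the patch):

#include <stdio.h>
#include <stdbool.h>

int main(void)
{
	const unsigned btree_sectors = 512;	/* assumed 256KiB node, 512-byte sectors */
	const unsigned block_sectors = 8;	/* assumed 4KiB block */

	for (unsigned written = 480; written <= 512; written += block_sectors) {
		/* the new condition from want_new_bset() above */
		bool want_new = written + block_sectors <= btree_sectors;

		printf("written=%3u sectors -> new bset %s\n",
		       written, want_new ? "allowed" : "refused (node full)");
	}
	return 0;
}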
@@ -210,11 +210,13 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem *
 static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p,
 					    u64 *v, unsigned nr)
 {
+	percpu_down_read(&c->mark_lock);
 	struct bch_accounting_mem *acc = &c->accounting;
 	unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]),
 				       accounting_pos_cmp, &p);
 
 	bch2_accounting_mem_read_counters(acc, idx, v, nr, false);
+	percpu_up_read(&c->mark_lock);
 }
 
 static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset)
@@ -411,6 +411,16 @@ void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op)
 	__bch2_write_op_error(out, op, op->pos.offset);
 }
 
+static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out,
+				      struct bch_write_op *op, u64 offset)
+{
+	bch2_inum_offset_err_msg_trans(trans, out,
+				       (subvol_inum) { op->subvol, op->pos.inode, },
+				       offset << 9);
+	prt_printf(out, "write error%s: ",
+		   op->flags & BCH_WRITE_MOVE ? "(internal move)" : "");
+}
+
 void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
 			       enum bch_data_type type,
 			       const struct bkey_i *k,
@@ -1193,7 +1203,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
 			struct bkey_i *insert = bch2_keylist_front(&op->insert_keys);
 
 			struct printbuf buf = PRINTBUF;
-			__bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k));
+			bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k));
 			prt_printf(&buf, "btree update error: %s", bch2_err_str(ret));
 			bch_err_ratelimited(c, "%s", buf.buf);
 			printbuf_exit(&buf);
@@ -384,12 +384,16 @@ void bch2_journal_pin_drop(struct journal *j,
 	spin_unlock(&j->lock);
 }
 
-static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
+					      journal_pin_flush_fn fn)
 {
 	if (fn == bch2_btree_node_flush0 ||
-	    fn == bch2_btree_node_flush1)
-		return JOURNAL_PIN_TYPE_btree;
-	else if (fn == bch2_btree_key_cache_journal_flush)
+	    fn == bch2_btree_node_flush1) {
+		unsigned idx = fn == bch2_btree_node_flush1;
+		struct btree *b = container_of(pin, struct btree, writes[idx].journal);
+
+		return JOURNAL_PIN_TYPE_btree0 - b->c.level;
+	} else if (fn == bch2_btree_key_cache_journal_flush)
 		return JOURNAL_PIN_TYPE_key_cache;
 	else
 		return JOURNAL_PIN_TYPE_other;
@@ -441,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j,
 
 	bool reclaim = __journal_pin_drop(j, dst);
 
-	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
+	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
 
 	if (reclaim)
 		bch2_journal_reclaim_fast(j);
@@ -465,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
 
 	bool reclaim = __journal_pin_drop(j, pin);
 
-	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
+	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
 
 	if (reclaim)
 		bch2_journal_reclaim_fast(j);
@@ -587,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j,
 		spin_lock(&j->lock);
 		/* Pin might have been dropped or rearmed: */
 		if (likely(!err && !j->flush_in_progress_dropped))
-			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
+			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
 		j->flush_in_progress = NULL;
 		j->flush_in_progress_dropped = false;
 		spin_unlock(&j->lock);
@@ -869,18 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
 	mutex_lock(&j->reclaim_lock);
 
-	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
-				 BIT(JOURNAL_PIN_TYPE_key_cache)|
-				 BIT(JOURNAL_PIN_TYPE_other))) {
-		*did_work = true;
-		goto unlock;
-	}
-
-	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
-				 BIT(JOURNAL_PIN_TYPE_btree))) {
-		*did_work = true;
-		goto unlock;
-	}
+	for (int type = JOURNAL_PIN_TYPE_NR - 1;
+	     type >= 0;
+	     --type)
+		if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
+			*did_work = true;
+			goto unlock;
+		}
 
 	if (seq_to_flush > journal_cur_seq(j))
 		bch2_journal_entry_close(j);
@@ -53,7 +53,10 @@ struct journal_buf {
  */
 
 enum journal_pin_type {
-	JOURNAL_PIN_TYPE_btree,
+	JOURNAL_PIN_TYPE_btree3,
+	JOURNAL_PIN_TYPE_btree2,
+	JOURNAL_PIN_TYPE_btree1,
+	JOURNAL_PIN_TYPE_btree0,
 	JOURNAL_PIN_TYPE_key_cache,
 	JOURNAL_PIN_TYPE_other,
 	JOURNAL_PIN_TYPE_NR,
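With the pin type split out per level, journal_pin_type() maps a node at btree level N to JOURNAL_PIN_TYPE_btree0 - N, and journal_flush_done() walks the pin types from highest to lowest, so "other" and key cache pins are flushed first, then leaf nodes, then each interior level from the bottom up. A small self-contained sketch (illustrative only, mirroring the enum above) of that ordering:

#include <stdio.h>

/* copy of the enum from the hunk above, for illustration */
enum journal_pin_type {
	JOURNAL_PIN_TYPE_btree3,
	JOURNAL_PIN_TYPE_btree2,
	JOURNAL_PIN_TYPE_btree1,
	JOURNAL_PIN_TYPE_btree0,
	JOURNAL_PIN_TYPE_key_cache,
	JOURNAL_PIN_TYPE_other,
	JOURNAL_PIN_TYPE_NR,
};

static const char *name(int t)
{
	switch (t) {
	case JOURNAL_PIN_TYPE_btree3:		return "btree level 3";
	case JOURNAL_PIN_TYPE_btree2:		return "btree level 2";
	case JOURNAL_PIN_TYPE_btree1:		return "btree level 1";
	case JOURNAL_PIN_TYPE_btree0:		return "btree level 0 (leaves)";
	case JOURNAL_PIN_TYPE_key_cache:	return "key cache";
	default:				return "other";
	}
}

int main(void)
{
	/* flush order used by journal_flush_done(): highest type first */
	for (int type = JOURNAL_PIN_TYPE_NR - 1; type >= 0; --type)
		printf("flush: %s\n", name(type));
	return 0;
}

Flushing leaves before interior nodes is the reclaim-ordering change the pull message credits for the tiering_replication test now going read-only quickly.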
@@ -381,6 +381,8 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
 not_found:
 	if (flags & BTREE_TRIGGER_check_repair) {
 		ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false);
+		if (ret == -BCH_ERR_missing_indirect_extent)
+			ret = 0;
 		if (ret)
 			goto err;
 	}
@@ -180,9 +180,9 @@ enum bch_fsck_flags {
 	x(ptr_crc_nonce_mismatch, 162, 0) \
 	x(ptr_stripe_redundant, 163, 0) \
 	x(reservation_key_nr_replicas_invalid, 164, 0) \
-	x(reflink_v_refcount_wrong, 165, 0) \
+	x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \
 	x(reflink_v_pos_bad, 292, 0) \
-	x(reflink_p_to_missing_reflink_v, 166, 0) \
+	x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \
 	x(reflink_refcount_underflow, 293, 0) \
 	x(stripe_pos_bad, 167, 0) \
 	x(stripe_val_size_bad, 168, 0) \