From 595170d4b660640289003d1881a9a6ef8ded6865 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 6 Feb 2025 22:28:41 -0500 Subject: [PATCH 1/7] bcachefs: Fix marking reflink pointers to missing indirect extents reflink pointers to missing indirect extents aren't deleted, they just have an error bit set - in case the indirect extent somehow reappears. fsck/mark and sweep thus needs to ignore these errors. Also, they can be marked AUTOFIX now. Reported-by: Roland Vet Signed-off-by: Kent Overstreet --- fs/bcachefs/reflink.c | 2 ++ fs/bcachefs/sb-errors_format.h | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 93ba4f4e47ca..376fd0a6e868 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -381,6 +381,8 @@ static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans, not_found: if (flags & BTREE_TRIGGER_check_repair) { ret = bch2_indirect_extent_missing_error(trans, p, *idx, next_idx, false); + if (ret == -BCH_ERR_missing_indirect_extent) + ret = 0; if (ret) goto err; } diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index ea0a18364751..b86ec013d7d7 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -180,9 +180,9 @@ enum bch_fsck_flags { x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ - x(reflink_v_refcount_wrong, 165, 0) \ + x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ x(reflink_v_pos_bad, 292, 0) \ - x(reflink_p_to_missing_reflink_v, 166, 0) \ + x(reflink_p_to_missing_reflink_v, 166, FSCK_AUTOFIX) \ x(reflink_refcount_underflow, 293, 0) \ x(stripe_pos_bad, 167, 0) \ x(stripe_val_size_bad, 168, 0) \ From 1c316eb57c11fb3dc447b04ef765459cd61c8647 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Mon, 10 Feb 2025 11:04:22 +0800 Subject: [PATCH 2/7] bcachefs: Fix use after free acc->k.data should be used with the lock hold: 00221 ========= TEST generic/187 00221 run fstests generic/187 at 2025-02-09 21:08:10 00221 spectre-v4 mitigation disabled by command-line option 00222 bcachefs (vdc): starting version 1.20: directory_size opts=errors=ro 00222 bcachefs (vdc): initializing new filesystem 00222 bcachefs (vdc): going read-write 00222 bcachefs (vdc): marking superblocks 00222 bcachefs (vdc): initializing freespace 00222 bcachefs (vdc): done initializing freespace 00222 bcachefs (vdc): reading snapshots table 00222 bcachefs (vdc): reading snapshots done 00222 bcachefs (vdc): done starting filesystem 00222 bcachefs (vdc): shutting down 00222 bcachefs (vdc): going read-only 00222 bcachefs (vdc): finished waiting for writes to stop 00223 bcachefs (vdc): flushing journal and stopping allocators, journal seq 6 00223 bcachefs (vdc): flushing journal and stopping allocators complete, journal seq 8 00223 bcachefs (vdc): clean shutdown complete, journal seq 9 00223 bcachefs (vdc): marking filesystem clean 00223 bcachefs (vdc): shutdown complete 00223 bcachefs (vdc): starting version 1.20: directory_size opts=errors=ro 00223 bcachefs (vdc): initializing new filesystem 00223 bcachefs (vdc): going read-write 00223 bcachefs (vdc): marking superblocks 00223 bcachefs (vdc): initializing freespace 00223 bcachefs (vdc): done initializing freespace 00223 bcachefs (vdc): reading snapshots table 00223 bcachefs (vdc): reading snapshots done 00223 bcachefs (vdc): done starting filesystem 00244 hrtimer: interrupt took 123350440 ns 00264 bcachefs (vdc): shutting down 00264 bcachefs (vdc): going read-only 00264 bcachefs (vdc): finished waiting for writes to stop 00264 bcachefs (vdc): flushing journal and stopping allocators, journal seq 97 00265 bcachefs (vdc): flushing journal and stopping allocators complete, journal seq 101 00265 bcachefs (vdc): clean shutdown complete, journal seq 102 00265 bcachefs (vdc): marking filesystem clean 00265 bcachefs (vdc): shutdown complete 00265 bcachefs (vdc): starting version 1.20: directory_size opts=errors=ro 00265 bcachefs (vdc): recovering from clean shutdown, journal seq 102 00265 bcachefs (vdc): accounting_read... 00265 ================================================================== 00265 done 00265 BUG: KASAN: slab-use-after-free in bch2_fs_to_text+0x12b4/0x1728 00265 bcachefs (vdc): alloc_read... done 00265 bcachefs (vdc): stripes_read... done 00265 Read of size 4 at addr ffffff80c57eac00 by task cat/7531 00265 bcachefs (vdc): snapshots_read... done 00265 00265 CPU: 6 UID: 0 PID: 7531 Comm: cat Not tainted 6.13.0-rc3-ktest-g16fc6fa3819d #14103 00265 Hardware name: linux,dummy-virt (DT) 00265 Call trace: 00265 show_stack+0x1c/0x30 (C) 00265 dump_stack_lvl+0x6c/0x80 00265 print_report+0xf8/0x5d8 00265 kasan_report+0x90/0xd0 00265 __asan_report_load4_noabort+0x1c/0x28 00265 bch2_fs_to_text+0x12b4/0x1728 00265 bch2_fs_show+0x94/0x188 00265 sysfs_kf_seq_show+0x1a4/0x348 00265 kernfs_seq_show+0x12c/0x198 00265 seq_read_iter+0x27c/0xfd0 00265 kernfs_fop_read_iter+0x390/0x4f8 00265 vfs_read+0x480/0x7f0 00265 ksys_read+0xe0/0x1e8 00265 __arm64_sys_read+0x70/0xa8 00265 invoke_syscall.constprop.0+0x74/0x1e8 00265 do_el0_svc+0xc8/0x1c8 00265 el0_svc+0x20/0x60 00265 el0t_64_sync_handler+0x104/0x130 00265 el0t_64_sync+0x154/0x158 00265 00265 Allocated by task 7510: 00265 kasan_save_stack+0x28/0x50 00265 kasan_save_track+0x1c/0x38 00265 kasan_save_alloc_info+0x3c/0x50 00265 __kasan_kmalloc+0xac/0xb0 00265 __kmalloc_node_noprof+0x168/0x348 00265 __kvmalloc_node_noprof+0x20/0x140 00265 __bch2_darray_resize_noprof+0x90/0x1b0 00265 __bch2_accounting_mem_insert+0x76c/0xb08 00265 bch2_accounting_mem_insert+0x224/0x3b8 00265 bch2_accounting_mem_mod_locked+0x480/0xc58 00265 bch2_accounting_read+0xa94/0x3eb8 00265 bch2_run_recovery_pass+0x80/0x178 00265 bch2_run_recovery_passes+0x340/0x698 00265 bch2_fs_recovery+0x1c98/0x2bd8 00265 bch2_fs_start+0x240/0x490 00265 bch2_fs_get_tree+0xe1c/0x1458 00265 vfs_get_tree+0x7c/0x250 00265 path_mount+0xe24/0x1648 00265 __arm64_sys_mount+0x240/0x438 00265 invoke_syscall.constprop.0+0x74/0x1e8 00265 do_el0_svc+0xc8/0x1c8 00265 el0_svc+0x20/0x60 00265 el0t_64_sync_handler+0x104/0x130 00265 el0t_64_sync+0x154/0x158 00265 00265 Freed by task 7510: 00265 kasan_save_stack+0x28/0x50 00265 kasan_save_track+0x1c/0x38 00265 kasan_save_free_info+0x48/0x88 00265 __kasan_slab_free+0x48/0x60 00265 kfree+0x188/0x408 00265 kvfree+0x3c/0x50 00265 __bch2_darray_resize_noprof+0xe0/0x1b0 00265 __bch2_accounting_mem_insert+0x76c/0xb08 00265 bch2_accounting_mem_insert+0x224/0x3b8 00265 bch2_accounting_mem_mod_locked+0x480/0xc58 00265 bch2_accounting_read+0xa94/0x3eb8 00265 bch2_run_recovery_pass+0x80/0x178 00265 bch2_run_recovery_passes+0x340/0x698 00265 bch2_fs_recovery+0x1c98/0x2bd8 00265 bch2_fs_start+0x240/0x490 00265 bch2_fs_get_tree+0xe1c/0x1458 00265 vfs_get_tree+0x7c/0x250 00265 path_mount+0xe24/0x1648 00265 bcachefs (vdc): going read-write 00265 __arm64_sys_mount+0x240/0x438 00265 invoke_syscall.constprop.0+0x74/0x1e8 00265 do_el0_svc+0xc8/0x1c8 00265 el0_svc+0x20/0x60 00265 el0t_64_sync_handler+0x104/0x130 00265 el0t_64_sync+0x154/0x158 00265 00265 The buggy address belongs to the object at ffffff80c57eac00 00265 which belongs to the cache kmalloc-128 of size 128 00265 The buggy address is located 0 bytes inside of 00265 freed 128-byte region [ffffff80c57eac00, ffffff80c57eac80) 00265 00265 The buggy address belongs to the physical page: 00265 page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1057ea 00265 head: order:1 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 00265 flags: 0x8000000000000040(head|zone=2) 00265 page_type: f5(slab) 00265 raw: 8000000000000040 ffffff80c0002800 dead000000000100 dead000000000122 00265 raw: 0000000000000000 0000000000200020 00000001f5000000 ffffff80c57a6400 00265 head: 8000000000000040 ffffff80c0002800 dead000000000100 dead000000000122 00265 head: 0000000000000000 0000000000200020 00000001f5000000 ffffff80c57a6400 00265 head: 8000000000000001 fffffffec315fa81 ffffffffffffffff 0000000000000000 00265 head: 0000000000000002 0000000000000000 00000000ffffffff 0000000000000000 00265 page dumped because: kasan: bad access detected 00265 00265 Memory state around the buggy address: 00265 ffffff80c57eab00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00265 ffffff80c57eab80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc 00265 >ffffff80c57eac00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb 00265 ^ 00265 ffffff80c57eac80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc 00265 ffffff80c57ead00: 00 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc 00265 ================================================================== 00265 Kernel panic - not syncing: kasan.fault=panic set ... 00265 CPU: 6 UID: 0 PID: 7531 Comm: cat Not tainted 6.13.0-rc3-ktest-g16fc6fa3819d #14103 00265 Hardware name: linux,dummy-virt (DT) 00265 Call trace: 00265 show_stack+0x1c/0x30 (C) 00265 dump_stack_lvl+0x30/0x80 00265 dump_stack+0x18/0x20 00265 panic+0x4d4/0x518 00265 start_report.constprop.0+0x0/0x90 00265 kasan_report+0xa0/0xd0 00265 __asan_report_load4_noabort+0x1c/0x28 00265 bch2_fs_to_text+0x12b4/0x1728 00265 bch2_fs_show+0x94/0x188 00265 sysfs_kf_seq_show+0x1a4/0x348 00265 kernfs_seq_show+0x12c/0x198 00265 seq_read_iter+0x27c/0xfd0 00265 kernfs_fop_read_iter+0x390/0x4f8 00265 vfs_read+0x480/0x7f0 00265 ksys_read+0xe0/0x1e8 00265 __arm64_sys_read+0x70/0xa8 00265 invoke_syscall.constprop.0+0x74/0x1e8 00265 do_el0_svc+0xc8/0x1c8 00265 el0_svc+0x20/0x60 00265 el0t_64_sync_handler+0x104/0x130 00265 el0t_64_sync+0x154/0x158 00265 SMP: stopping secondary CPUs 00265 Kernel Offset: disabled 00265 CPU features: 0x000,00000070,00000010,8240500b 00265 Memory Limit: none 00265 ---[ end Kernel panic - not syncing: kasan.fault=panic set ... ]--- 00270 ========= FAILED TIMEOUT generic.187 in 1200s Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/disk_accounting.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index 5360cbb3ec29..f4372cafea2e 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -210,11 +210,13 @@ static inline void bch2_accounting_mem_read_counters(struct bch_accounting_mem * static inline void bch2_accounting_mem_read(struct bch_fs *c, struct bpos p, u64 *v, unsigned nr) { + percpu_down_read(&c->mark_lock); struct bch_accounting_mem *acc = &c->accounting; unsigned idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &p); bch2_accounting_mem_read_counters(acc, idx, v, nr, false); + percpu_up_read(&c->mark_lock); } static inline struct bversion journal_pos_to_bversion(struct journal_res *res, unsigned offset) From 1e690efa72596a1163dc56709707f459221889d2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 11:34:59 -0500 Subject: [PATCH 3/7] bcachefs: Split out journal pins by btree level This lets us flush the journal to go read-only more effectively. Flushing the journal and going read-only requires halting mutually recursive processes, which strictly speaking are not guaranteed to terminate. Flushing btree node journal pins will kick off a btree node write, and btree node writes on completion must do another btree update to the parent node to update the 'sectors_written' field for that node's key. If the parent node is full and requires a split or compaction, that's going to generate a whole bunch of additional btree updates - alloc info, LRU btree, and more - which then have to be flushed, and the cycle repeats. This process will terminate much more effectively if we tweak journal reclaim to flush btree updates leaf to root: i.e., don't flush updates for a given btree node (kicking off a write, and consuming space within that node up to the next block boundary) if there might still be unflushed updates in child nodes. Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 37 +++++++++++++++++------------------ fs/bcachefs/journal_types.h | 5 ++++- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 6a9cefb635d6..d373cd181a7f 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -384,12 +384,16 @@ void bch2_journal_pin_drop(struct journal *j, spin_unlock(&j->lock); } -static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin, + journal_pin_flush_fn fn) { if (fn == bch2_btree_node_flush0 || - fn == bch2_btree_node_flush1) - return JOURNAL_PIN_TYPE_btree; - else if (fn == bch2_btree_key_cache_journal_flush) + fn == bch2_btree_node_flush1) { + unsigned idx = fn == bch2_btree_node_flush1; + struct btree *b = container_of(pin, struct btree, writes[idx].journal); + + return JOURNAL_PIN_TYPE_btree0 - b->c.level; + } else if (fn == bch2_btree_key_cache_journal_flush) return JOURNAL_PIN_TYPE_key_cache; else return JOURNAL_PIN_TYPE_other; @@ -441,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j, bool reclaim = __journal_pin_drop(j, dst); - bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -465,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq, bool reclaim = __journal_pin_drop(j, pin); - bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn)); + bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn)); if (reclaim) bch2_journal_reclaim_fast(j); @@ -587,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j, spin_lock(&j->lock); /* Pin might have been dropped or rearmed: */ if (likely(!err && !j->flush_in_progress_dropped)) - list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]); + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]); j->flush_in_progress = NULL; j->flush_in_progress_dropped = false; spin_unlock(&j->lock); @@ -869,18 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush, mutex_lock(&j->reclaim_lock); - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, - BIT(JOURNAL_PIN_TYPE_key_cache)| - BIT(JOURNAL_PIN_TYPE_other))) { - *did_work = true; - goto unlock; - } - - if (journal_flush_pins_or_still_flushing(j, seq_to_flush, - BIT(JOURNAL_PIN_TYPE_btree))) { - *did_work = true; - goto unlock; - } + for (int type = JOURNAL_PIN_TYPE_NR - 1; + type >= 0; + --type) + if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) { + *did_work = true; + goto unlock; + } if (seq_to_flush > journal_cur_seq(j)) bch2_journal_entry_close(j); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index a198a81d7478..1ef3a28ed6ab 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -53,7 +53,10 @@ struct journal_buf { */ enum journal_pin_type { - JOURNAL_PIN_TYPE_btree, + JOURNAL_PIN_TYPE_btree3, + JOURNAL_PIN_TYPE_btree2, + JOURNAL_PIN_TYPE_btree1, + JOURNAL_PIN_TYPE_btree0, JOURNAL_PIN_TYPE_key_cache, JOURNAL_PIN_TYPE_other, JOURNAL_PIN_TYPE_NR, From 9f734cd076931fa4d7feb5728e5cd95cde0af114 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Feb 2025 17:46:36 -0500 Subject: [PATCH 4/7] bcachefs: Fix want_new_bset() so we write until the end of the btree node want_new_bset() returns the address of a new bset to initialize if we wish to do so in a btree node - either because the previous one is too big, or because it's been written. The case for 'previous bset was written' was wrong: it's only supposed to check for if we have space in the node for one more block, but because it subtracted the header from the space available it would never initialize a new bset if we were down to the last block in a node. Fixing this results in fewer btree node splits/compactions, which fixes a bug with flushing the journal to go read-only sometimes not terminating or taking excessively long. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_update_interior.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 7930ffea3075..26d646e1275c 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -278,12 +278,12 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct bt { struct bset_tree *t = bset_tree_last(b); struct btree_node_entry *bne = max(write_block(b), - (void *) btree_bkey_last(b, bset_tree_last(b))); + (void *) btree_bkey_last(b, t)); ssize_t remaining_space = __bch2_btree_u64s_remaining(b, bne->keys.start); if (unlikely(bset_written(b, bset(b, t)))) { - if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) + if (b->written + block_sectors(c) <= btree_sectors(c)) return bne; } else { if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && From 9cf6b84b71adb97f3c19476ebb5a42228fad89b5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 23 Sep 2024 22:12:31 -0400 Subject: [PATCH 5/7] bcachefs: CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS Incorrectly handled transaction restarts can be a source of heisenbugs; add a mode where we randomly inject them to shake them out. Signed-off-by: Kent Overstreet --- fs/bcachefs/Kconfig | 7 +++++++ fs/bcachefs/btree_iter.c | 33 +++++++++++++++++++++++++++++++- fs/bcachefs/btree_iter.h | 12 ++++++++++++ fs/bcachefs/btree_trans_commit.c | 4 ++++ fs/bcachefs/btree_types.h | 3 +++ 5 files changed, 58 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index 85eea7a4dea3..fc7efd0a7525 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -61,6 +61,13 @@ config BCACHEFS_DEBUG The resulting code will be significantly slower than normal; you probably shouldn't select this option unless you're a developer. +config BCACHEFS_INJECT_TRANSACTION_RESTARTS + bool "Randomly inject transaction restarts" + depends on BCACHEFS_DEBUG + help + Randomly inject transaction restarts in a few core paths - may have a + significant performance penalty + config BCACHEFS_TESTS bool "bcachefs unit and performance tests" depends on BCACHEFS_FS diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 5988219c6908..e32fce4fd258 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2357,6 +2357,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bkey_eq(end, POS_MAX)); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + if (iter->update_path) { bch2_path_put_nokeep(trans, iter->update_path, iter->flags & BTREE_ITER_intent); @@ -2622,6 +2628,12 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp bch2_btree_iter_verify_entry_exit(iter); EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); + int ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + while (1) { k = __bch2_btree_iter_peek_prev(iter, search_key); if (unlikely(!k.k)) @@ -2749,6 +2761,12 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + /* extents can't span inode numbers: */ if ((iter->flags & BTREE_ITER_is_extents) && unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { @@ -3106,6 +3124,10 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (ret) + return ERR_PTR(ret); + struct btree_transaction_stats *s = btree_trans_stats(trans); s->max_mem = max(s->max_mem, new_bytes); @@ -3163,7 +3185,8 @@ out_new_mem: if (old_bytes) { trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes); - return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced)); + return ERR_PTR(btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } out_change_top: p = trans->mem + trans->mem_top; @@ -3271,6 +3294,14 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->last_begin_ip = _RET_IP_; +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (trans->restarted) { + trans->restart_count_this_trans++; + } else { + trans->restart_count_this_trans = 0; + } +#endif + trans_set_locked(trans, false); if (trans->restarted) { diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b9538e6e6d65..8c16d9a3ec1d 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -355,6 +355,18 @@ static int btree_trans_restart(struct btree_trans *trans, int err) return btree_trans_restart_ip(trans, err, _THIS_IP_); } +static inline int trans_maybe_inject_restart(struct btree_trans *trans, unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + if (!(ktime_get_ns() & ~(~0ULL << min(63, (10 + trans->restart_count_this_trans))))) { + trace_and_count(trans->c, trans_restart_injected, trans, ip); + return btree_trans_restart_ip(trans, + BCH_ERR_transaction_restart_fault_inject, ip); + } +#endif + return 0; +} + bool bch2_btree_node_upgrade(struct btree_trans *, struct btree_path *, unsigned); diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 2760dd9569ed..c4f524b2ca9a 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -999,6 +999,10 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) bch2_trans_verify_not_unlocked_or_in_restart(trans); + ret = trans_maybe_inject_restart(trans, _RET_IP_); + if (unlikely(ret)) + goto out_reset; + if (!trans->nr_updates && !trans->journal_entries_u64s) goto out_reset; diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a6f251eb4164..a09cbe9cd94f 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -509,6 +509,9 @@ struct btree_trans { bool notrace_relock_fail:1; enum bch_errcode restarted:16; u32 restart_count; +#ifdef CONFIG_BCACHEFS_INJECT_TRANSACTION_RESTARTS + u32 restart_count_this_trans; +#endif u64 last_begin_time; unsigned long last_begin_ip; From 531323a2efc3fbe20b540e3f41ecc94d68e74b76 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Thu, 13 Feb 2025 02:11:01 +0800 Subject: [PATCH 6/7] bcachefs: Pass _orig_restart_count to trans_was_restarted _orig_restart_count is unused now, according to the logic, trans_was_restarted should be using _orig_restart_count. Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 8c16d9a3ec1d..b96157f3dc9c 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -751,7 +751,7 @@ transaction_restart: \ if (!_ret2) \ bch2_trans_verify_not_restarted(_trans, _restart_count);\ \ - _ret2 ?: trans_was_restarted(_trans, _restart_count); \ + _ret2 ?: trans_was_restarted(_trans, _orig_restart_count); \ }) #define for_each_btree_key_max_continue(_trans, _iter, \ From 406e445b3c6be65ab0ee961145e74bfd7ef6c9e1 Mon Sep 17 00:00:00 2001 From: Alan Huang Date: Wed, 12 Feb 2025 17:27:51 +0800 Subject: [PATCH 7/7] bcachefs: Reuse transaction bch2_nocow_write_convert_unwritten is already in transaction context: 00191 ========= TEST generic/648 00242 kernel BUG at fs/bcachefs/btree_iter.c:3332! 00242 Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP 00242 Modules linked in: 00242 CPU: 4 UID: 0 PID: 2593 Comm: fsstress Not tainted 6.13.0-rc3-ktest-g345af8f855b7 #14403 00242 Hardware name: linux,dummy-virt (DT) 00242 pstate: 60001005 (nZCv daif -PAN -UAO -TCO -DIT +SSBS BTYPE=--) 00242 pc : __bch2_trans_get+0x120/0x410 00242 lr : __bch2_trans_get+0xcc/0x410 00242 sp : ffffff80d89af600 00242 x29: ffffff80d89af600 x28: ffffff80ddb23000 x27: 00000000fffff705 00242 x26: ffffff80ddb23028 x25: ffffff80d8903fe0 x24: ffffff80ebb30168 00242 x23: ffffff80c8aeb500 x22: 000000000000005d x21: ffffff80d8904078 00242 x20: ffffff80d8900000 x19: ffffff80da9e8000 x18: 0000000000000000 00242 x17: 64747568735f6c61 x16: 6e72756f6a20726f x15: 0000000000000028 00242 x14: 0000000000000004 x13: 000000000000f787 x12: ffffffc081bbcdc8 00242 x11: 0000000000000000 x10: 0000000000000003 x9 : ffffffc08094efbc 00242 x8 : 000000001092c111 x7 : 000000000000000c x6 : ffffffc083c31fc4 00242 x5 : ffffffc083c31f28 x4 : ffffff80c8aeb500 x3 : ffffff80ebb30000 00242 x2 : 0000000000000001 x1 : 0000000000000a21 x0 : 000000000000028e 00242 Call trace: 00242 __bch2_trans_get+0x120/0x410 (P) 00242 bch2_inum_offset_err_msg+0x48/0xb0 00242 bch2_nocow_write_convert_unwritten+0x3d0/0x530 00242 bch2_nocow_write+0xeb0/0x1000 00242 __bch2_write+0x330/0x4e8 00242 bch2_write+0x1f0/0x530 00242 bch2_direct_write+0x530/0xc00 00242 bch2_write_iter+0x160/0xbe0 00242 vfs_write+0x1cc/0x360 00242 ksys_write+0x5c/0xf0 00242 __arm64_sys_write+0x20/0x30 00242 invoke_syscall.constprop.0+0x54/0xe8 00242 do_el0_svc+0x44/0xc0 00242 el0_svc+0x34/0xa0 00242 el0t_64_sync_handler+0x104/0x130 00242 el0t_64_sync+0x154/0x158 00242 Code: 6b01001f 54ffff01 79408460 3617fec0 (d4210000) 00242 ---[ end trace 0000000000000000 ]--- 00242 Kernel panic - not syncing: Oops - BUG: Fatal exception Signed-off-by: Alan Huang Signed-off-by: Kent Overstreet --- fs/bcachefs/io_write.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index dd508d93e9fc..03892388832b 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -411,6 +411,16 @@ void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) __bch2_write_op_error(out, op, op->pos.offset); } +static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, + struct bch_write_op *op, u64 offset) +{ + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + prt_printf(out, "write error%s: ", + op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); +} + void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, const struct bkey_i *k, @@ -1193,7 +1203,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); struct printbuf buf = PRINTBUF; - __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); + bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf);