
bcachefs: Split out journal pins by btree level

This lets us flush the journal more effectively when going read-only.

Flushing the journal and going read-only requires halting mutually
recursive processes, which strictly speaking are not guaranteed to
terminate.

Flushing btree node journal pins will kick off a btree node write, and
btree node writes on completion must do another btree update to the
parent node to update the 'sectors_written' field for that node's key.

If the parent node is full and requires a split or compaction, that's
going to generate a whole bunch of additional btree updates - alloc
info, LRU btree, and more - which then have to be flushed, and the cycle
repeats.
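
To see why the ordering matters, here is a toy model (illustrative only, not
bcachefs code): one dirty node per level, where flushing a node at level n
re-dirties its parent at level n + 1. Flushing leaf to root touches each level
once; flushing root-first keeps re-dirtying the upper levels:

#include <stdio.h>
#include <stdbool.h>

#define LEVELS 4

static bool dirty[LEVELS];
static int flushes;

/* Flushing a node at level n writes it out and re-dirties its parent */
static void flush_level(int n)
{
        if (!dirty[n])
                return;
        dirty[n] = false;
        flushes++;
        if (n + 1 < LEVELS)
                dirty[n + 1] = true;
}

static void dirty_all(void)
{
        for (int i = 0; i < LEVELS; i++)
                dirty[i] = true;
        flushes = 0;
}

int main(void)
{
        /* Leaf to root: each level is flushed exactly once */
        dirty_all();
        for (int n = 0; n < LEVELS; n++)
                flush_level(n);
        printf("leaf-to-root: %d flushes\n", flushes);

        /* Root first, repeated until clean: upper levels get re-dirtied */
        dirty_all();
        bool progress = true;
        while (progress) {
                progress = false;
                for (int n = LEVELS - 1; n >= 0; n--)
                        if (dirty[n]) {
                                flush_level(n);
                                progress = true;
                        }
        }
        printf("root-first: %d flushes\n", flushes);
        return 0;
}

With four levels this prints 4 flushes for leaf-to-root versus 10 for the
repeated root-first passes.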

This process will terminate much more effectively if we tweak journal
reclaim to flush btree updates leaf to root: i.e., don't flush updates
for a given btree node (kicking off a write, and consuming space within
that node up to the next block boundary) if there might still be
unflushed updates in child nodes.
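
A minimal sketch of the resulting scheme (illustrative names, not the bcachefs
API): give each btree level its own journal pin class, declared so that deeper
levels sort first, and drain the classes in descending order:

#include <stdio.h>

/* Illustrative pin classes, not the real JOURNAL_PIN_TYPE_* enum */
enum pin_class {
        PIN_BTREE3,     /* btree level 3: closest to the root */
        PIN_BTREE2,
        PIN_BTREE1,
        PIN_BTREE0,     /* btree level 0: leaf nodes */
        PIN_KEY_CACHE,
        PIN_OTHER,
        PIN_NR,
};

/* A node at btree level N gets class PIN_BTREE0 - N */
static enum pin_class pin_class_for_level(unsigned level)
{
        return PIN_BTREE0 - level;
}

/* Hypothetical per-class flush, stubbed out for the sketch */
static void drain_class(int c)
{
        printf("draining pin class %d\n", c);
}

int main(void)
{
        /* Non-btree pins first, then leaves, then each interior level */
        for (int c = PIN_NR - 1; c >= 0; --c)
                drain_class(c);

        printf("level 0 -> class %d\n", pin_class_for_level(0));
        return 0;
}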

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
Kent Overstreet 2025-02-10 11:34:59 -05:00
parent 1c316eb57c
commit 1e690efa72
2 changed files with 22 additions and 20 deletions

fs/bcachefs/journal_reclaim.c

@@ -384,12 +384,16 @@ void bch2_journal_pin_drop(struct journal *j,
 	spin_unlock(&j->lock);
 }
 
-static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
+static enum journal_pin_type journal_pin_type(struct journal_entry_pin *pin,
+					      journal_pin_flush_fn fn)
 {
 	if (fn == bch2_btree_node_flush0 ||
-	    fn == bch2_btree_node_flush1)
-		return JOURNAL_PIN_TYPE_btree;
-	else if (fn == bch2_btree_key_cache_journal_flush)
+	    fn == bch2_btree_node_flush1) {
+		unsigned idx = fn == bch2_btree_node_flush1;
+		struct btree *b = container_of(pin, struct btree, writes[idx].journal);
+
+		return JOURNAL_PIN_TYPE_btree0 - b->c.level;
+	} else if (fn == bch2_btree_key_cache_journal_flush)
 		return JOURNAL_PIN_TYPE_key_cache;
 	else
 		return JOURNAL_PIN_TYPE_other;
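
A note on the two pieces of the new journal_pin_type(): the container_of()
works because each btree node carries two write/pin slots (writes[0] and
writes[1], matching bch2_btree_node_flush0/1), and the subtraction works
because the btree pin types are declared in descending-level order (see the
journal_types.h hunk below). A quick check of the arithmetic, with stand-in
values:

#include <assert.h>

/* Stand-ins declared in the same order as JOURNAL_PIN_TYPE_btree3..btree0 */
enum { T_BTREE3, T_BTREE2, T_BTREE1, T_BTREE0 };

int main(void)
{
        assert(T_BTREE0 - 0 == T_BTREE0);       /* leaf nodes stay at btree0 */
        assert(T_BTREE0 - 3 == T_BTREE3);       /* level 3 maps to btree3 */
        return 0;
}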
@@ -441,7 +445,7 @@ void bch2_journal_pin_copy(struct journal *j,
 
 	bool reclaim = __journal_pin_drop(j, dst);
 
-	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
+	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(dst, flush_fn));
 
 	if (reclaim)
 		bch2_journal_reclaim_fast(j);
@@ -465,7 +469,7 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
 
 	bool reclaim = __journal_pin_drop(j, pin);
 
-	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
+	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(pin, flush_fn));
 
 	if (reclaim)
 		bch2_journal_reclaim_fast(j);
@@ -587,7 +591,7 @@ static size_t journal_flush_pins(struct journal *j,
 		spin_lock(&j->lock);
 		/* Pin might have been dropped or rearmed: */
 		if (likely(!err && !j->flush_in_progress_dropped))
-			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(flush_fn)]);
+			list_move(&pin->list, &journal_seq_pin(j, seq)->flushed[journal_pin_type(pin, flush_fn)]);
 		j->flush_in_progress = NULL;
 		j->flush_in_progress_dropped = false;
 		spin_unlock(&j->lock);
@@ -869,18 +873,13 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
 
 	mutex_lock(&j->reclaim_lock);
 
-	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
-				BIT(JOURNAL_PIN_TYPE_key_cache)|
-				BIT(JOURNAL_PIN_TYPE_other))) {
-		*did_work = true;
-		goto unlock;
-	}
-
-	if (journal_flush_pins_or_still_flushing(j, seq_to_flush,
-				BIT(JOURNAL_PIN_TYPE_btree))) {
-		*did_work = true;
-		goto unlock;
-	}
+	for (int type = JOURNAL_PIN_TYPE_NR - 1;
+	     type >= 0;
+	     --type)
+		if (journal_flush_pins_or_still_flushing(j, seq_to_flush, BIT(type))) {
+			*did_work = true;
+			goto unlock;
+		}
 
 	if (seq_to_flush > journal_cur_seq(j))
 		bch2_journal_entry_close(j);
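
Since JOURNAL_PIN_TYPE_NR - 1 is JOURNAL_PIN_TYPE_other, this descending loop
drains other and key cache pins first, then btree leaves
(JOURNAL_PIN_TYPE_btree0), then each interior level up through
JOURNAL_PIN_TYPE_btree3: the leaf-to-root order described in the commit
message, replacing the old two-phase key_cache/other-then-btree flush.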

fs/bcachefs/journal_types.h

@@ -53,7 +53,10 @@ struct journal_buf {
  */
 
 enum journal_pin_type {
-	JOURNAL_PIN_TYPE_btree,
+	JOURNAL_PIN_TYPE_btree3,
+	JOURNAL_PIN_TYPE_btree2,
+	JOURNAL_PIN_TYPE_btree1,
+	JOURNAL_PIN_TYPE_btree0,
 	JOURNAL_PIN_TYPE_key_cache,
 	JOURNAL_PIN_TYPE_other,
 	JOURNAL_PIN_TYPE_NR,
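
The four btree pin types presumably correspond to the maximum btree depth
(BTREE_MAX_DEPTH is 4 in bcachefs); declaring them in descending-level order
is what lets journal_pin_type() compute a node's type as
JOURNAL_PIN_TYPE_btree0 - level, and lets journal_flush_done() get
leaf-to-root ordering from a simple descending loop.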