
xfs: fix buffer lookup vs release race

Since commit 298f342245 ("xfs: lockless buffer lookup") the buffer
lookup fastpath is done without a hash-wide lock (then pag_buf_lock, now
bc_lock) and only under RCU protection.  But this means that nothing
serializes lookups against the temporary 0 reference count for buffers
that are added to the LRU after dropping the last regular reference,
and a concurrent lookup would fail to find them.

Fix this by doing all b_hold modifications under b_lock.  We're already
doing this for release, so this "only" roughly doubles the b_lock round trips.
We'll later look into the lockref infrastructure to optimize the number
of lock round trips again.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
Christoph Hellwig 2025-01-16 07:01:42 +01:00 committed by Carlos Maiolino
parent 07eae0fa67
commit ee10f6fcdb
3 changed files with 54 additions and 51 deletions
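
The commit message above describes the locking change in prose; below is a minimal, self-contained user-space sketch of the same idea, assuming a hypothetical obj type with obj_try_hold()/obj_release() helpers (these names are illustrative only and are not XFS code; the real implementation is xfs_buf_try_hold() in the diff that follows). Every hold-count change is made under a per-object spinlock, so a lookup can never observe the transient zero count that the old atomic_inc_not_zero() fast path could trip over. Compiles with cc -pthread.

/*
 * Sketch only: per-object lock serialises all reference count changes,
 * so the "last reference moves onto the LRU" transition is never visible
 * to a concurrent lookup as a zero count.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	pthread_spinlock_t	lock;	/* stands in for bp->b_lock */
	unsigned int		hold;	/* stands in for bp->b_hold */
	bool			on_lru;	/* stands in for LRU membership */
};

/* Lookup side: take a reference unless the object is already dead. */
static bool obj_try_hold(struct obj *o)
{
	bool ok;

	pthread_spin_lock(&o->lock);
	ok = (o->hold != 0);
	if (ok)
		o->hold++;
	pthread_spin_unlock(&o->lock);
	return ok;
}

/*
 * Release side: done entirely under the lock, so concurrent lookups see
 * either the old count or the LRU-held count, never an intermediate zero.
 */
static void obj_release(struct obj *o)
{
	pthread_spin_lock(&o->lock);
	if (o->hold > 1)
		o->hold--;		/* not the last reference */
	else
		o->on_lru = true;	/* keep the last reference for the LRU */
	pthread_spin_unlock(&o->lock);
}

int main(void)
{
	struct obj o = { .hold = 1, .on_lru = false };

	pthread_spin_init(&o.lock, PTHREAD_PROCESS_PRIVATE);
	obj_release(&o);	/* parks the object on the "LRU"; hold stays 1 */
	printf("lookup after release succeeds: %d (hold=%u)\n",
	       obj_try_hold(&o), o.hold);
	pthread_spin_destroy(&o.lock);
	return 0;
}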

View file

@@ -127,15 +127,6 @@ __xfs_buf_ioacct_dec(
 	}
 }
 
-static inline void
-xfs_buf_ioacct_dec(
-	struct xfs_buf	*bp)
-{
-	spin_lock(&bp->b_lock);
-	__xfs_buf_ioacct_dec(bp);
-	spin_unlock(&bp->b_lock);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -171,9 +162,9 @@ xfs_buf_stale(
 	atomic_set(&bp->b_lru_ref, 0);
 	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
 	    (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
-		atomic_dec(&bp->b_hold);
+		bp->b_hold--;
 
-	ASSERT(atomic_read(&bp->b_hold) >= 1);
+	ASSERT(bp->b_hold >= 1);
 	spin_unlock(&bp->b_lock);
 }
 
@@ -229,14 +220,14 @@ _xfs_buf_alloc(
 	 */
 	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
 
-	atomic_set(&bp->b_hold, 1);
+	spin_lock_init(&bp->b_lock);
+	bp->b_hold = 1;
 	atomic_set(&bp->b_lru_ref, 1);
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_lru);
 	INIT_LIST_HEAD(&bp->b_list);
 	INIT_LIST_HEAD(&bp->b_li_list);
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
-	spin_lock_init(&bp->b_lock);
 	bp->b_target = target;
 	bp->b_mount = target->bt_mount;
 	bp->b_flags = flags;
@@ -580,6 +571,20 @@ xfs_buf_find_lock(
 	return 0;
 }
 
+static bool
+xfs_buf_try_hold(
+	struct xfs_buf	*bp)
+{
+	spin_lock(&bp->b_lock);
+	if (bp->b_hold == 0) {
+		spin_unlock(&bp->b_lock);
+		return false;
+	}
+	bp->b_hold++;
+	spin_unlock(&bp->b_lock);
+	return true;
+}
+
 static inline int
 xfs_buf_lookup(
 	struct xfs_buf_cache	*bch,
@@ -592,7 +597,7 @@ xfs_buf_lookup(
 
 	rcu_read_lock();
 	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
-	if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+	if (!bp || !xfs_buf_try_hold(bp)) {
 		rcu_read_unlock();
 		return -ENOENT;
 	}
@@ -655,7 +660,7 @@ xfs_buf_find_insert(
 		spin_unlock(&bch->bc_lock);
 		goto out_free_buf;
 	}
-	if (bp && atomic_inc_not_zero(&bp->b_hold)) {
+	if (bp && xfs_buf_try_hold(bp)) {
 		/* found an existing buffer */
 		spin_unlock(&bch->bc_lock);
 		error = xfs_buf_find_lock(bp, flags);
@@ -1037,7 +1042,10 @@ xfs_buf_hold(
 	struct xfs_buf	*bp)
 {
 	trace_xfs_buf_hold(bp, _RET_IP_);
-	atomic_inc(&bp->b_hold);
+
+	spin_lock(&bp->b_lock);
+	bp->b_hold++;
+	spin_unlock(&bp->b_lock);
 }
 
 static void
@@ -1045,10 +1053,15 @@ xfs_buf_rele_uncached(
 	struct xfs_buf	*bp)
 {
 	ASSERT(list_empty(&bp->b_lru));
-	if (atomic_dec_and_test(&bp->b_hold)) {
-		xfs_buf_ioacct_dec(bp);
-		xfs_buf_free(bp);
+
+	spin_lock(&bp->b_lock);
+	if (--bp->b_hold) {
+		spin_unlock(&bp->b_lock);
+		return;
 	}
+	__xfs_buf_ioacct_dec(bp);
+	spin_unlock(&bp->b_lock);
+	xfs_buf_free(bp);
 }
 
 static void
@@ -1058,51 +1071,40 @@ xfs_buf_rele_cached(
 	struct xfs_buftarg	*btp = bp->b_target;
 	struct xfs_perag	*pag = bp->b_pag;
 	struct xfs_buf_cache	*bch = xfs_buftarg_buf_cache(btp, pag);
-	bool			release;
 	bool			freebuf = false;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	ASSERT(atomic_read(&bp->b_hold) > 0);
-
-	/*
-	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
-	 * calls. The pag_buf_lock being taken on the last reference only
-	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
-	 * to last reference we drop here is not serialised against the last
-	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
-	 * first, the last "release" reference can win the race to the lock and
-	 * free the buffer before the second-to-last reference is processed,
-	 * leading to a use-after-free scenario.
-	 */
 	spin_lock(&bp->b_lock);
-	release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
-	if (!release) {
+	ASSERT(bp->b_hold >= 1);
+	if (bp->b_hold > 1) {
 		/*
 		 * Drop the in-flight state if the buffer is already on the LRU
 		 * and it holds the only reference. This is racy because we
 		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
 		 * ensures the decrement occurs only once per-buf.
 		 */
-		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+		if (--bp->b_hold == 1 && !list_empty(&bp->b_lru))
 			__xfs_buf_ioacct_dec(bp);
 		goto out_unlock;
 	}
 
-	/* the last reference has been dropped ... */
+	/* we are asked to drop the last reference */
+	spin_lock(&bch->bc_lock);
 	__xfs_buf_ioacct_dec(bp);
 	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
 		/*
-		 * If the buffer is added to the LRU take a new reference to the
+		 * If the buffer is added to the LRU, keep the reference to the
 		 * buffer for the LRU and clear the (now stale) dispose list
-		 * state flag
+		 * state flag, else drop the reference.
 		 */
-		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
+		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
 			bp->b_state &= ~XFS_BSTATE_DISPOSE;
-			atomic_inc(&bp->b_hold);
-		}
+		else
+			bp->b_hold--;
 		spin_unlock(&bch->bc_lock);
 	} else {
+		bp->b_hold--;
 		/*
 		 * most of the time buffers will already be removed from the
 		 * LRU, so optimise that case by checking for the
@@ -1748,13 +1750,14 @@ xfs_buftarg_drain_rele(
 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
 	struct list_head	*dispose = arg;
 
-	if (atomic_read(&bp->b_hold) > 1) {
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	if (bp->b_hold > 1) {
 		/* need to wait, so skip it this pass */
+		spin_unlock(&bp->b_lock);
 		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
 		return LRU_SKIP;
 	}
-	if (!spin_trylock(&bp->b_lock))
-		return LRU_SKIP;
 
 	/*
 	 * clear the LRU reference count so the buffer doesn't get
@@ -2093,7 +2096,7 @@ xfs_buf_delwri_queue(
 	 */
 	bp->b_flags |= _XBF_DELWRI_Q;
 	if (list_empty(&bp->b_list)) {
-		atomic_inc(&bp->b_hold);
+		xfs_buf_hold(bp);
 		list_add_tail(&bp->b_list, list);
 	}

View file

@@ -168,7 +168,7 @@ struct xfs_buf {
 
 	xfs_daddr_t		b_rhash_key;	/* buffer cache index */
 	int			b_length;	/* size of buffer in BBs */
-	atomic_t		b_hold;		/* reference count */
+	unsigned int		b_hold;		/* reference count */
 	atomic_t		b_lru_ref;	/* lru reclaim ref count */
 	xfs_buf_flags_t		b_flags;	/* status flags */
 	struct semaphore	b_sema;		/* semaphore for lockables */

View file

@@ -498,7 +498,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
 		__entry->dev = bp->b_target->bt_dev;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->nblks = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->flags = bp->b_flags;
@@ -569,7 +569,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->length = bp->b_length;
 		__entry->flags = flags;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->caller_ip = caller_ip;
@@ -612,7 +612,7 @@ TRACE_EVENT(xfs_buf_ioerror,
 		__entry->dev = bp->b_target->bt_dev;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->length = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->error = error;
@@ -656,7 +656,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
 		__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
 		__entry->buf_len = bip->bli_buf->b_length;
 		__entry->buf_flags = bip->bli_buf->b_flags;
-		__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
+		__entry->buf_hold = bip->bli_buf->b_hold;
 		__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
 		__entry->buf_lockval = bip->bli_buf->b_sema.count;
 		__entry->li_flags = bip->bli_item.li_flags;
@@ -4978,7 +4978,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
 		__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->nblks = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->flags = bp->b_flags;