xfs: fix buffer lookup vs release race
Since commit 298f342245 ("xfs: lockless buffer lookup") the buffer
lookup fastpath is done without a hash-wide lock (then pag_buf_lock, now
bc_lock) and only under RCU protection. But this means that nothing
serializes lookups against the temporary 0 reference count for buffers
that are added to the LRU after dropping the last regular reference,
and a concurrent lookup would fail to find them.
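To make that window concrete, here is a minimal userspace sketch (not the XFS code; the struct, the helper name and the two-step release are illustrative assumptions based on the paragraph above) of how an inc-if-not-zero style lookup misses a buffer whose hold count is transiently zero between the last put and the LRU re-add:

#include <stdbool.h>
#include <stdio.h>

struct buf {
	int	hold;		/* models bp->b_hold */
	bool	on_lru;
};

/* Old-style fastpath: only take a reference if the count is non-zero. */
static bool hold_if_not_zero(struct buf *bp)
{
	if (bp->hold == 0)
		return false;
	bp->hold++;
	return true;
}

int main(void)
{
	struct buf bp = { .hold = 1, .on_lru = false };

	/*
	 * Release side: drop the last regular reference first, then move
	 * the buffer to the LRU and take a reference for it.  In between,
	 * the hold count is 0.
	 */
	bp.hold--;

	/* A concurrent lookup landing in that window comes up empty ... */
	if (!hold_if_not_zero(&bp))
		printf("lookup failed: hold count is transiently 0 (-ENOENT)\n");

	/* ... even though the buffer is about to live on in the cache. */
	bp.on_lru = true;
	bp.hold++;
	return 0;
}

Nothing orders the RCU-protected lookup against that transient zero, which is exactly the race this patch closes.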
Fix this by doing all b_hold modifications under b_lock. We're already
doing this for release, so this "only" roughly doubles the b_lock round trips.
We'll later look into the lockref infrastructure to optimize the number
of lock round trips again.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Carlos Maiolino <cem@kernel.org>
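The fix described above amounts to making every b_hold transition happen under b_lock, so a lookup that sees a zero count knows the buffer is being torn down, and everything else serialises behind the lock. A rough userspace sketch of that pattern, with a pthread mutex standing in for the kernel spinlock (names and the simplified LRU handling are illustrative, not the kernel API):

#include <assert.h>
#include <pthread.h>
#include <stdbool.h>

struct buf {
	pthread_mutex_t	lock;	/* stands in for bp->b_lock */
	unsigned int	hold;	/* stands in for bp->b_hold */
	bool		on_lru;
};

/* Lookup side: only take a reference while the buffer is still live. */
static bool buf_try_hold(struct buf *bp)
{
	bool live;

	pthread_mutex_lock(&bp->lock);
	live = bp->hold > 0;	/* zero now means "release in progress" */
	if (live)
		bp->hold++;
	pthread_mutex_unlock(&bp->lock);
	return live;
}

/*
 * Release side: on the last put the reference is either handed to the
 * LRU or the count drops to zero and the buffer is freed, all under
 * the lock, so buf_try_hold() never observes a transient zero.
 */
static bool buf_put(struct buf *bp, bool cache_on_lru)
{
	bool free_it = false;

	pthread_mutex_lock(&bp->lock);
	if (bp->hold == 1 && cache_on_lru) {
		bp->on_lru = true;
	} else {
		bp->hold--;
		free_it = (bp->hold == 0);
	}
	pthread_mutex_unlock(&bp->lock);
	return free_it;		/* caller frees outside the lock */
}

int main(void)
{
	struct buf bp = { .lock = PTHREAD_MUTEX_INITIALIZER, .hold = 1 };

	assert(!buf_put(&bp, true));	/* reference moves to the LRU ... */
	assert(buf_try_hold(&bp));	/* ... so a racing lookup still wins */
	return 0;
}

This is the shape of the new xfs_buf_try_hold() helper in the patch below; the extra cost is one more b_lock round trip per hold, which is what the "roughly doubles" remark above refers to.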
parent 07eae0fa67
commit ee10f6fcdb
3 changed files with 54 additions and 51 deletions
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -127,15 +127,6 @@ __xfs_buf_ioacct_dec(
 	}
 }
 
-static inline void
-xfs_buf_ioacct_dec(
-	struct xfs_buf	*bp)
-{
-	spin_lock(&bp->b_lock);
-	__xfs_buf_ioacct_dec(bp);
-	spin_unlock(&bp->b_lock);
-}
-
 /*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
@@ -171,9 +162,9 @@ xfs_buf_stale(
 	atomic_set(&bp->b_lru_ref, 0);
 	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
 	    (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
-		atomic_dec(&bp->b_hold);
+		bp->b_hold--;
 
-	ASSERT(atomic_read(&bp->b_hold) >= 1);
+	ASSERT(bp->b_hold >= 1);
 	spin_unlock(&bp->b_lock);
 }
 
@@ -229,14 +220,14 @@ _xfs_buf_alloc(
 	 */
 	flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
 
-	atomic_set(&bp->b_hold, 1);
+	spin_lock_init(&bp->b_lock);
+	bp->b_hold = 1;
 	atomic_set(&bp->b_lru_ref, 1);
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_lru);
 	INIT_LIST_HEAD(&bp->b_list);
 	INIT_LIST_HEAD(&bp->b_li_list);
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
-	spin_lock_init(&bp->b_lock);
 	bp->b_target = target;
 	bp->b_mount = target->bt_mount;
 	bp->b_flags = flags;
@@ -580,6 +571,20 @@ xfs_buf_find_lock(
 	return 0;
 }
 
+static bool
+xfs_buf_try_hold(
+	struct xfs_buf	*bp)
+{
+	spin_lock(&bp->b_lock);
+	if (bp->b_hold == 0) {
+		spin_unlock(&bp->b_lock);
+		return false;
+	}
+	bp->b_hold++;
+	spin_unlock(&bp->b_lock);
+	return true;
+}
+
 static inline int
 xfs_buf_lookup(
 	struct xfs_buf_cache	*bch,
@@ -592,7 +597,7 @@ xfs_buf_lookup(
 
 	rcu_read_lock();
 	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
-	if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
+	if (!bp || !xfs_buf_try_hold(bp)) {
 		rcu_read_unlock();
 		return -ENOENT;
 	}
@@ -655,7 +660,7 @@ xfs_buf_find_insert(
 		spin_unlock(&bch->bc_lock);
 		goto out_free_buf;
 	}
-	if (bp && atomic_inc_not_zero(&bp->b_hold)) {
+	if (bp && xfs_buf_try_hold(bp)) {
 		/* found an existing buffer */
 		spin_unlock(&bch->bc_lock);
 		error = xfs_buf_find_lock(bp, flags);
@@ -1037,7 +1042,10 @@ xfs_buf_hold(
 	struct xfs_buf		*bp)
 {
 	trace_xfs_buf_hold(bp, _RET_IP_);
-	atomic_inc(&bp->b_hold);
+
+	spin_lock(&bp->b_lock);
+	bp->b_hold++;
+	spin_unlock(&bp->b_lock);
 }
 
 static void
@@ -1045,10 +1053,15 @@ xfs_buf_rele_uncached(
 	struct xfs_buf		*bp)
 {
 	ASSERT(list_empty(&bp->b_lru));
-	if (atomic_dec_and_test(&bp->b_hold)) {
-		xfs_buf_ioacct_dec(bp);
-		xfs_buf_free(bp);
+
+	spin_lock(&bp->b_lock);
+	if (--bp->b_hold) {
+		spin_unlock(&bp->b_lock);
+		return;
 	}
+	__xfs_buf_ioacct_dec(bp);
+	spin_unlock(&bp->b_lock);
+	xfs_buf_free(bp);
 }
 
 static void
@@ -1058,51 +1071,40 @@ xfs_buf_rele_cached(
 	struct xfs_buftarg	*btp = bp->b_target;
 	struct xfs_perag	*pag = bp->b_pag;
 	struct xfs_buf_cache	*bch = xfs_buftarg_buf_cache(btp, pag);
-	bool			release;
 	bool			freebuf = false;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	ASSERT(atomic_read(&bp->b_hold) > 0);
-
-	/*
-	 * We grab the b_lock here first to serialise racing xfs_buf_rele()
-	 * calls. The pag_buf_lock being taken on the last reference only
-	 * serialises against racing lookups in xfs_buf_find(). IOWs, the second
-	 * to last reference we drop here is not serialised against the last
-	 * reference until we take bp->b_lock. Hence if we don't grab b_lock
-	 * first, the last "release" reference can win the race to the lock and
-	 * free the buffer before the second-to-last reference is processed,
-	 * leading to a use-after-free scenario.
-	 */
 	spin_lock(&bp->b_lock);
-	release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
-	if (!release) {
+	ASSERT(bp->b_hold >= 1);
+	if (bp->b_hold > 1) {
 		/*
 		 * Drop the in-flight state if the buffer is already on the LRU
 		 * and it holds the only reference. This is racy because we
 		 * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT
 		 * ensures the decrement occurs only once per-buf.
 		 */
-		if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
+		if (--bp->b_hold == 1 && !list_empty(&bp->b_lru))
 			__xfs_buf_ioacct_dec(bp);
 		goto out_unlock;
 	}
 
-	/* the last reference has been dropped ... */
+	/* we are asked to drop the last reference */
+	spin_lock(&bch->bc_lock);
 	__xfs_buf_ioacct_dec(bp);
 	if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
 		/*
-		 * If the buffer is added to the LRU take a new reference to the
+		 * If the buffer is added to the LRU, keep the reference to the
 		 * buffer for the LRU and clear the (now stale) dispose list
-		 * state flag
+		 * state flag, else drop the reference.
 		 */
-		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
+		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru))
 			bp->b_state &= ~XFS_BSTATE_DISPOSE;
-			atomic_inc(&bp->b_hold);
-		}
+		else
+			bp->b_hold--;
 		spin_unlock(&bch->bc_lock);
 	} else {
+		bp->b_hold--;
 		/*
 		 * most of the time buffers will already be removed from the
 		 * LRU, so optimise that case by checking for the
@@ -1748,13 +1750,14 @@ xfs_buftarg_drain_rele(
 	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
 	struct list_head	*dispose = arg;
 
-	if (atomic_read(&bp->b_hold) > 1) {
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	if (bp->b_hold > 1) {
 		/* need to wait, so skip it this pass */
+		spin_unlock(&bp->b_lock);
 		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
 		return LRU_SKIP;
 	}
-	if (!spin_trylock(&bp->b_lock))
-		return LRU_SKIP;
 
 	/*
 	 * clear the LRU reference count so the buffer doesn't get
@@ -2093,7 +2096,7 @@ xfs_buf_delwri_queue(
 	 */
 	bp->b_flags |= _XBF_DELWRI_Q;
 	if (list_empty(&bp->b_list)) {
-		atomic_inc(&bp->b_hold);
+		xfs_buf_hold(bp);
 		list_add_tail(&bp->b_list, list);
 	}
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -168,7 +168,7 @@ struct xfs_buf {
 
 	xfs_daddr_t		b_rhash_key;	/* buffer cache index */
 	int			b_length;	/* size of buffer in BBs */
-	atomic_t		b_hold;		/* reference count */
+	unsigned int		b_hold;		/* reference count */
 	atomic_t		b_lru_ref;	/* lru reclaim ref count */
 	xfs_buf_flags_t		b_flags;	/* status flags */
 	struct semaphore	b_sema;		/* semaphore for lockables */
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -498,7 +498,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
 		__entry->dev = bp->b_target->bt_dev;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->nblks = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->flags = bp->b_flags;
@@ -569,7 +569,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class,
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->length = bp->b_length;
 		__entry->flags = flags;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->caller_ip = caller_ip;
@@ -612,7 +612,7 @@ TRACE_EVENT(xfs_buf_ioerror,
 		__entry->dev = bp->b_target->bt_dev;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->length = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->error = error;
@@ -656,7 +656,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class,
 		__entry->buf_bno = xfs_buf_daddr(bip->bli_buf);
 		__entry->buf_len = bip->bli_buf->b_length;
 		__entry->buf_flags = bip->bli_buf->b_flags;
-		__entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
+		__entry->buf_hold = bip->bli_buf->b_hold;
 		__entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
 		__entry->buf_lockval = bip->bli_buf->b_sema.count;
 		__entry->li_flags = bip->bli_item.li_flags;
@@ -4978,7 +4978,7 @@ DECLARE_EVENT_CLASS(xfbtree_buf_class,
 		__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
 		__entry->bno = xfs_buf_daddr(bp);
 		__entry->nblks = bp->b_length;
-		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->hold = bp->b_hold;
 		__entry->pincount = atomic_read(&bp->b_pin_count);
 		__entry->lockval = bp->b_sema.count;
 		__entry->flags = bp->b_flags;