New code for 6.6:
* Allow the kernel to initiate a freeze of a filesystem. The kernel and userspace can both hold a freeze on a filesystem at the same time; the freeze is not lifted until /both/ holders lift it. This will enable us to fix a longstanding bug in XFS online fsck. * Use kernel-initated fsfreeze to fix some longstanding false negatives in onlin fsck of the free space and inode counters. Signed-off-by: Darrick J. Wong <djwong@kernel.org> -----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQQ2qTKExjcn+O1o2YRKO3ySh0YRpgUCZM0XzQAKCRBKO3ySh0YR phSCAQD9hQmd9tngbNGos44XthgHDIfVHLQLWLt6lwcD0WNfIgEAwMWKLzI9hi7G SmX3NWDQBj7kvC96HYizIvdSsdkvHw0= =ulEr -----END PGP SIGNATURE----- gpgsig -----BEGIN PGP SIGNATURE----- iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZOXpMgAKCRCRxhvAZXjc ovFBAP97HEUSf78XXTQehluJgkbSVu208DFC4mCyFA6rRihskQD/Yz0uosr/51zJ FdUPNg8MNkQCRtqx5LQ7yClNSr9Sxg4= =uIAe -----END PGP SIGNATURE----- Merge tag 'vfs-6.6-merge-3' of ssh://gitolite.kernel.org/pub/scm/fs/xfs/xfs-linux Pull xfs online fsck update from Darrick Wong: New code for 6.6: * Allow the kernel to initiate a freeze of a filesystem. The kernel and userspace can both hold a freeze on a filesystem at the same time; the freeze is not lifted until /both/ holders lift it. This will enable us to fix a longstanding bug in XFS online fsck. * Use kernel-initated fsfreeze to fix some longstanding false negatives in online fsck of the free space and inode counters. Signed-off-by: Darrick J. Wong <djwong@kernel.org> Message-Id: <20230822182604.GB11286@frogsfrogsfrogs> Signed-off-by: Christian Brauner <brauner@kernel.org>
This commit is contained in:
commit
cd4284cfd3
4 changed files with 183 additions and 38 deletions
|
@ -1,4 +1,4 @@
|
|||
// SPDX-License-Identifier: GPL-2.0+
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/*
|
||||
* Copyright (C) 2019-2023 Oracle. All Rights Reserved.
|
||||
* Author: Darrick J. Wong <djwong@kernel.org>
|
||||
|
@ -8,6 +8,8 @@
|
|||
#include "xfs_shared.h"
|
||||
#include "xfs_format.h"
|
||||
#include "xfs_trans_resv.h"
|
||||
#include "xfs_log_format.h"
|
||||
#include "xfs_trans.h"
|
||||
#include "xfs_mount.h"
|
||||
#include "xfs_alloc.h"
|
||||
#include "xfs_ialloc.h"
|
||||
|
@ -16,6 +18,7 @@
|
|||
#include "xfs_ag.h"
|
||||
#include "xfs_rtalloc.h"
|
||||
#include "xfs_inode.h"
|
||||
#include "xfs_icache.h"
|
||||
#include "scrub/scrub.h"
|
||||
#include "scrub/common.h"
|
||||
#include "scrub/trace.h"
|
||||
|
@ -53,6 +56,7 @@ struct xchk_fscounters {
|
|||
uint64_t frextents;
|
||||
unsigned long long icount_min;
|
||||
unsigned long long icount_max;
|
||||
bool frozen;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -123,6 +127,82 @@ xchk_fscount_warmup(
|
|||
return error;
|
||||
}
|
||||
|
||||
static inline int
|
||||
xchk_fsfreeze(
|
||||
struct xfs_scrub *sc)
|
||||
{
|
||||
int error;
|
||||
|
||||
error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
|
||||
trace_xchk_fsfreeze(sc, error);
|
||||
return error;
|
||||
}
|
||||
|
||||
static inline int
|
||||
xchk_fsthaw(
|
||||
struct xfs_scrub *sc)
|
||||
{
|
||||
int error;
|
||||
|
||||
/* This should always succeed, we have a kernel freeze */
|
||||
error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL);
|
||||
trace_xchk_fsthaw(sc, error);
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* We couldn't stabilize the filesystem long enough to sample all the variables
|
||||
* that comprise the summary counters and compare them to the percpu counters.
|
||||
* We need to disable all writer threads, which means taking the first two
|
||||
* freeze levels to put userspace to sleep, and the third freeze level to
|
||||
* prevent background threads from starting new transactions. Take one level
|
||||
* more to prevent other callers from unfreezing the filesystem while we run.
|
||||
*/
|
||||
STATIC int
|
||||
xchk_fscounters_freeze(
|
||||
struct xfs_scrub *sc)
|
||||
{
|
||||
struct xchk_fscounters *fsc = sc->buf;
|
||||
int error = 0;
|
||||
|
||||
if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
|
||||
sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
|
||||
mnt_drop_write_file(sc->file);
|
||||
}
|
||||
|
||||
/* Try to grab a kernel freeze. */
|
||||
while ((error = xchk_fsfreeze(sc)) == -EBUSY) {
|
||||
if (xchk_should_terminate(sc, &error))
|
||||
return error;
|
||||
|
||||
delay(HZ / 10);
|
||||
}
|
||||
if (error)
|
||||
return error;
|
||||
|
||||
fsc->frozen = true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Thaw the filesystem after checking or repairing fscounters. */
|
||||
STATIC void
|
||||
xchk_fscounters_cleanup(
|
||||
void *buf)
|
||||
{
|
||||
struct xchk_fscounters *fsc = buf;
|
||||
struct xfs_scrub *sc = fsc->sc;
|
||||
int error;
|
||||
|
||||
if (!fsc->frozen)
|
||||
return;
|
||||
|
||||
error = xchk_fsthaw(sc);
|
||||
if (error)
|
||||
xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error);
|
||||
else
|
||||
fsc->frozen = false;
|
||||
}
|
||||
|
||||
int
|
||||
xchk_setup_fscounters(
|
||||
struct xfs_scrub *sc)
|
||||
|
@ -140,6 +220,7 @@ xchk_setup_fscounters(
|
|||
sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS);
|
||||
if (!sc->buf)
|
||||
return -ENOMEM;
|
||||
sc->buf_cleanup = xchk_fscounters_cleanup;
|
||||
fsc = sc->buf;
|
||||
fsc->sc = sc;
|
||||
|
||||
|
@ -150,7 +231,18 @@ xchk_setup_fscounters(
|
|||
if (error)
|
||||
return error;
|
||||
|
||||
return xchk_trans_alloc(sc, 0);
|
||||
/*
|
||||
* Pause all writer activity in the filesystem while we're scrubbing to
|
||||
* reduce the likelihood of background perturbations to the counters
|
||||
* throwing off our calculations.
|
||||
*/
|
||||
if (sc->flags & XCHK_TRY_HARDER) {
|
||||
error = xchk_fscounters_freeze(sc);
|
||||
if (error)
|
||||
return error;
|
||||
}
|
||||
|
||||
return xfs_trans_alloc_empty(sc->mp, &sc->tp);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -290,8 +382,7 @@ retry:
|
|||
if (fsc->ifree > fsc->icount) {
|
||||
if (tries--)
|
||||
goto retry;
|
||||
xchk_set_incomplete(sc);
|
||||
return 0;
|
||||
return -EDEADLOCK;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
@ -367,6 +458,8 @@ xchk_fscount_count_frextents(
|
|||
* Otherwise, we /might/ have a problem. If the change in the summations is
|
||||
* more than we want to tolerate, the filesystem is probably busy and we should
|
||||
* just send back INCOMPLETE and see if userspace will try again.
|
||||
*
|
||||
* If we're repairing then we require an exact match.
|
||||
*/
|
||||
static inline bool
|
||||
xchk_fscount_within_range(
|
||||
|
@ -396,21 +489,7 @@ xchk_fscount_within_range(
|
|||
if (expected >= min_value && expected <= max_value)
|
||||
return true;
|
||||
|
||||
/*
|
||||
* If the difference between the two summations is too large, the fs
|
||||
* might just be busy and so we'll mark the scrub incomplete. Return
|
||||
* true here so that we don't mark the counter corrupt.
|
||||
*
|
||||
* XXX: In the future when userspace can grant scrub permission to
|
||||
* quiesce the filesystem to solve the outsized variance problem, this
|
||||
* check should be moved up and the return code changed to signal to
|
||||
* userspace that we need quiesce permission.
|
||||
*/
|
||||
if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
|
||||
xchk_set_incomplete(sc);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Everything else is bad. */
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -422,6 +501,7 @@ xchk_fscounters(
|
|||
struct xfs_mount *mp = sc->mp;
|
||||
struct xchk_fscounters *fsc = sc->buf;
|
||||
int64_t icount, ifree, fdblocks, frextents;
|
||||
bool try_again = false;
|
||||
int error;
|
||||
|
||||
/* Snapshot the percpu counters. */
|
||||
|
@ -431,9 +511,26 @@ xchk_fscounters(
|
|||
frextents = percpu_counter_sum(&mp->m_frextents);
|
||||
|
||||
/* No negative values, please! */
|
||||
if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0)
|
||||
if (icount < 0 || ifree < 0)
|
||||
xchk_set_corrupt(sc);
|
||||
|
||||
/*
|
||||
* If the filesystem is not frozen, the counter summation calls above
|
||||
* can race with xfs_mod_freecounter, which subtracts a requested space
|
||||
* reservation from the counter and undoes the subtraction if that made
|
||||
* the counter go negative. Therefore, it's possible to see negative
|
||||
* values here, and we should only flag that as a corruption if we
|
||||
* froze the fs. This is much more likely to happen with frextents
|
||||
* since there are no reserved pools.
|
||||
*/
|
||||
if (fdblocks < 0 || frextents < 0) {
|
||||
if (!fsc->frozen)
|
||||
return -EDEADLOCK;
|
||||
|
||||
xchk_set_corrupt(sc);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* See if icount is obviously wrong. */
|
||||
if (icount < fsc->icount_min || icount > fsc->icount_max)
|
||||
xchk_set_corrupt(sc);
|
||||
|
@ -446,12 +543,6 @@ xchk_fscounters(
|
|||
if (frextents > mp->m_sb.sb_rextents)
|
||||
xchk_set_corrupt(sc);
|
||||
|
||||
/*
|
||||
* XXX: We can't quiesce percpu counter updates, so exit early.
|
||||
* This can be re-enabled when we gain exclusive freeze functionality.
|
||||
*/
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* If ifree exceeds icount by more than the minimum variance then
|
||||
* something's probably wrong with the counters.
|
||||
|
@ -463,8 +554,6 @@ xchk_fscounters(
|
|||
error = xchk_fscount_aggregate_agcounts(sc, fsc);
|
||||
if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
|
||||
return error;
|
||||
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
|
||||
return 0;
|
||||
|
||||
/* Count the free extents counter for rt volumes. */
|
||||
error = xchk_fscount_count_frextents(sc, fsc);
|
||||
|
@ -473,20 +562,45 @@ xchk_fscounters(
|
|||
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
|
||||
return 0;
|
||||
|
||||
/* Compare the in-core counters with whatever we counted. */
|
||||
if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
|
||||
xchk_set_corrupt(sc);
|
||||
/*
|
||||
* Compare the in-core counters with whatever we counted. If the fs is
|
||||
* frozen, we treat the discrepancy as a corruption because the freeze
|
||||
* should have stabilized the counter values. Otherwise, we need
|
||||
* userspace to call us back having granted us freeze permission.
|
||||
*/
|
||||
if (!xchk_fscount_within_range(sc, icount, &mp->m_icount,
|
||||
fsc->icount)) {
|
||||
if (fsc->frozen)
|
||||
xchk_set_corrupt(sc);
|
||||
else
|
||||
try_again = true;
|
||||
}
|
||||
|
||||
if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
|
||||
xchk_set_corrupt(sc);
|
||||
if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) {
|
||||
if (fsc->frozen)
|
||||
xchk_set_corrupt(sc);
|
||||
else
|
||||
try_again = true;
|
||||
}
|
||||
|
||||
if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
|
||||
fsc->fdblocks))
|
||||
xchk_set_corrupt(sc);
|
||||
fsc->fdblocks)) {
|
||||
if (fsc->frozen)
|
||||
xchk_set_corrupt(sc);
|
||||
else
|
||||
try_again = true;
|
||||
}
|
||||
|
||||
if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents,
|
||||
fsc->frextents))
|
||||
xchk_set_corrupt(sc);
|
||||
fsc->frextents)) {
|
||||
if (fsc->frozen)
|
||||
xchk_set_corrupt(sc);
|
||||
else
|
||||
try_again = true;
|
||||
}
|
||||
|
||||
if (try_again)
|
||||
return -EDEADLOCK;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -184,8 +184,10 @@ xchk_teardown(
|
|||
xchk_irele(sc, sc->ip);
|
||||
sc->ip = NULL;
|
||||
}
|
||||
if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
|
||||
if (sc->flags & XCHK_HAVE_FREEZE_PROT) {
|
||||
sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
|
||||
mnt_drop_write_file(sc->file);
|
||||
}
|
||||
if (sc->buf) {
|
||||
if (sc->buf_cleanup)
|
||||
sc->buf_cleanup(sc->buf);
|
||||
|
@ -505,6 +507,8 @@ retry_op:
|
|||
error = mnt_want_write_file(sc->file);
|
||||
if (error)
|
||||
goto out_sc;
|
||||
|
||||
sc->flags |= XCHK_HAVE_FREEZE_PROT;
|
||||
}
|
||||
|
||||
/* Set up for the operation. */
|
||||
|
|
|
@ -106,6 +106,7 @@ struct xfs_scrub {
|
|||
|
||||
/* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */
|
||||
#define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */
|
||||
#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
|
||||
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
|
||||
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
|
||||
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
|
||||
|
|
|
@ -98,6 +98,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
|
|||
|
||||
#define XFS_SCRUB_STATE_STRINGS \
|
||||
{ XCHK_TRY_HARDER, "try_harder" }, \
|
||||
{ XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
|
||||
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
|
||||
{ XCHK_NEED_DRAIN, "need_drain" }, \
|
||||
{ XREP_ALREADY_FIXED, "already_fixed" }
|
||||
|
@ -693,6 +694,31 @@ TRACE_EVENT(xchk_fscounters_within_range,
|
|||
__entry->old_value)
|
||||
)
|
||||
|
||||
DECLARE_EVENT_CLASS(xchk_fsfreeze_class,
|
||||
TP_PROTO(struct xfs_scrub *sc, int error),
|
||||
TP_ARGS(sc, error),
|
||||
TP_STRUCT__entry(
|
||||
__field(dev_t, dev)
|
||||
__field(unsigned int, type)
|
||||
__field(int, error)
|
||||
),
|
||||
TP_fast_assign(
|
||||
__entry->dev = sc->mp->m_super->s_dev;
|
||||
__entry->type = sc->sm->sm_type;
|
||||
__entry->error = error;
|
||||
),
|
||||
TP_printk("dev %d:%d type %s error %d",
|
||||
MAJOR(__entry->dev), MINOR(__entry->dev),
|
||||
__print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
|
||||
__entry->error)
|
||||
);
|
||||
#define DEFINE_XCHK_FSFREEZE_EVENT(name) \
|
||||
DEFINE_EVENT(xchk_fsfreeze_class, name, \
|
||||
TP_PROTO(struct xfs_scrub *sc, int error), \
|
||||
TP_ARGS(sc, error))
|
||||
DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze);
|
||||
DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw);
|
||||
|
||||
TRACE_EVENT(xchk_refcount_incorrect,
|
||||
TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec,
|
||||
xfs_nlink_t seen),
|
||||
|
|
Loading…
Add table
Reference in a new issue