Don't use btrfs_set_item_key_safe() to modify the keys in the RAID
stripe-tree, as this can lead to corruption of the tree, which is caught
by the checks in btrfs_set_item_key_safe():

  BTRFS info (device nvme1n1): leaf 49168384 gen 15 total ptrs 194 free space 8329 owner 12
  BTRFS info (device nvme1n1): refs 2 lock_owner 1030 current 1030
  [ snip ]
          item 105 key (354549760 230 20480) itemoff 14587 itemsize 16
                  stride 0 devid 5 physical 67502080
          item 106 key (354631680 230 4096) itemoff 14571 itemsize 16
                  stride 0 devid 1 physical 88559616
          item 107 key (354631680 230 32768) itemoff 14555 itemsize 16
                  stride 0 devid 1 physical 88555520
          item 108 key (354717696 230 28672) itemoff 14539 itemsize 16
                  stride 0 devid 2 physical 67604480
  [ snip ]
  BTRFS critical (device nvme1n1): slot 106 key (354631680 230 32768) new key (354635776 230 4096)
  ------------[ cut here ]------------
  kernel BUG at fs/btrfs/ctree.c:2602!
  Oops: invalid opcode: 0000 [#1] PREEMPT SMP PTI
  CPU: 1 UID: 0 PID: 1055 Comm: fsstress Not tainted 6.13.0-rc1+ #1464
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014
  RIP: 0010:btrfs_set_item_key_safe+0xf7/0x270
  Code: <snip>
  RSP: 0018:ffffc90001337ab0 EFLAGS: 00010287
  RAX: 0000000000000000 RBX: ffff8881115fd000 RCX: 0000000000000000
  RDX: 0000000000000001 RSI: 0000000000000001 RDI: 00000000ffffffff
  RBP: ffff888110ed6f50 R08: 00000000ffffefff R09: ffffffff8244c500
  R10: 00000000ffffefff R11: 00000000ffffffff R12: ffff888100586000
  R13: 00000000000000c9 R14: ffffc90001337b1f R15: ffff888110f23b58
  FS:  00007f7d75c72740(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007fa811652c60 CR3: 0000000111398001 CR4: 0000000000370eb0
  Call Trace:
   <TASK>
   ? __die_body.cold+0x14/0x1a
   ? die+0x2e/0x50
   ? do_trap+0xca/0x110
   ? do_error_trap+0x65/0x80
   ? btrfs_set_item_key_safe+0xf7/0x270
   ? exc_invalid_op+0x50/0x70
   ? btrfs_set_item_key_safe+0xf7/0x270
   ? asm_exc_invalid_op+0x1a/0x20
   ? btrfs_set_item_key_safe+0xf7/0x270
   btrfs_partially_delete_raid_extent+0xc4/0xe0
   btrfs_delete_raid_extent+0x227/0x240
   __btrfs_free_extent.isra.0+0x57f/0x9c0
   ? exc_coproc_segment_overrun+0x40/0x40
   __btrfs_run_delayed_refs+0x2fa/0xe80
   btrfs_run_delayed_refs+0x81/0xe0
   btrfs_commit_transaction+0x2dd/0xbe0
   ? preempt_count_add+0x52/0xb0
   btrfs_sync_file+0x375/0x4c0
   do_fsync+0x39/0x70
   __x64_sys_fsync+0x13/0x20
   do_syscall_64+0x54/0x110
   entry_SYSCALL_64_after_hwframe+0x76/0x7e
  RIP: 0033:0x7f7d7550ef90
  Code: <snip>
  RSP: 002b:00007ffd70237248 EFLAGS: 00000202 ORIG_RAX: 000000000000004a
  RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007f7d7550ef90
  RDX: 000000000000013a RSI: 000000000040eb28 RDI: 0000000000000004
  RBP: 000000000000001b R08: 0000000000000078 R09: 00007ffd7023725c
  R10: 00007f7d75400390 R11: 0000000000000202 R12: 028f5c28f5c28f5c
  R13: 8f5c28f5c28f5c29 R14: 000000000040b520 R15: 00007f7d75c726c8
   </TASK>

While the root cause of the tree order corruption isn't clear, using
btrfs_duplicate_item() to copy the item and then adjusting both the key
and the per-device physical addresses is a safe way to counter this
problem.

Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
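The shape of the fix is a copy, delete and re-insert instead of an
in-place key rewrite. A minimal sketch of that pattern follows; the
helper name rst_replace_item_key is hypothetical, while the actual
implementation, which also bumps the per-device physical addresses of
the copied item, is btrfs_partially_delete_raid_extent() in the file
below:

  /*
   * Hypothetical sketch, not the actual patch: re-key an item by
   * deleting it and re-inserting an adjusted in-memory copy under the
   * new key, rather than calling btrfs_set_item_key_safe() on the live
   * leaf.
   */
  static int rst_replace_item_key(struct btrfs_trans_handle *trans,
                                  struct btrfs_root *root,
                                  struct btrfs_path *path,
                                  const struct btrfs_key *newkey,
                                  void *newitem, size_t item_size)
  {
          int ret;

          /* Drop the item under its old key. */
          ret = btrfs_del_item(trans, root, path);
          if (ret)
                  return ret;
          btrfs_release_path(path);

          /* Insert the adjusted copy under the new key. */
          return btrfs_insert_item(trans, root, newkey, newitem, item_size);
  }
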
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2023 Western Digital Corporation or its affiliates.
 */

#include <linux/btrfs_tree.h>
#include "ctree.h"
#include "fs.h"
#include "accessors.h"
#include "transaction.h"
#include "disk-io.h"
#include "raid-stripe-tree.h"
#include "volumes.h"
#include "print-tree.h"

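/*
 * Trim the stripe extent at @oldkey to @newlen bytes: @frontpad bytes are cut
 * off the front, so the key's objectid and every per-device physical address
 * move up by @frontpad. The item is rebuilt in memory, the old item is
 * deleted and the copy is re-inserted under the new key.
 */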
static int btrfs_partially_delete_raid_extent(struct btrfs_trans_handle *trans,
					      struct btrfs_path *path,
					      const struct btrfs_key *oldkey,
					      u64 newlen, u64 frontpad)
{
	struct btrfs_root *stripe_root = trans->fs_info->stripe_root;
	struct btrfs_stripe_extent *extent, *newitem;
	struct extent_buffer *leaf;
	int slot;
	size_t item_size;
	struct btrfs_key newkey = {
		.objectid = oldkey->objectid + frontpad,
		.type = BTRFS_RAID_STRIPE_KEY,
		.offset = newlen,
	};
	int ret;

	ASSERT(newlen > 0);
	ASSERT(oldkey->type == BTRFS_RAID_STRIPE_KEY);

	leaf = path->nodes[0];
	slot = path->slots[0];
	item_size = btrfs_item_size(leaf, slot);

	newitem = kzalloc(item_size, GFP_NOFS);
	if (!newitem)
		return -ENOMEM;

	extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);

	for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
		struct btrfs_raid_stride *stride = &extent->strides[i];
		u64 phys;

		phys = btrfs_raid_stride_physical(leaf, stride) + frontpad;
		btrfs_set_stack_raid_stride_physical(&newitem->strides[i], phys);
	}

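	/*
	 * Don't rewrite the key of the existing item in place with
	 * btrfs_set_item_key_safe(); that has been seen to corrupt the tree
	 * order. Delete the old item and insert the adjusted copy instead.
	 */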
	ret = btrfs_del_item(trans, stripe_root, path);
	if (ret)
		goto out;

	btrfs_release_path(path);
	ret = btrfs_insert_item(trans, stripe_root, &newkey, newitem, item_size);

out:
	kfree(newitem);
	return ret;
}

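/*
 * Delete the RAID stripe extents covering [@start, @start + @length) from the
 * stripe tree, trimming or splitting extents that only partially overlap the
 * range. No-op for filesystems that don't use the stripe tree.
 */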
int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *stripe_root = fs_info->stripe_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	u64 found_start;
	u64 found_end;
	u64 end = start + length;
	int slot;
	int ret;

	if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE) || !stripe_root)
		return 0;

	if (!btrfs_is_testing(fs_info)) {
		struct btrfs_chunk_map *map;
		bool use_rst;

		map = btrfs_find_chunk_map(fs_info, start, length);
		if (!map)
			return -EINVAL;
		use_rst = btrfs_need_stripe_tree_update(fs_info, map->type);
		btrfs_free_chunk_map(map);
		if (!use_rst)
			return 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	while (1) {
		key.objectid = start;
		key.type = BTRFS_RAID_STRIPE_KEY;
		key.offset = 0;

		ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1);
		if (ret < 0)
			break;

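		/*
		 * ret > 0 means there was no exact key match and path points
		 * at the slot where the key would be inserted. If that slot
		 * is one past the last item in the leaf, step back onto the
		 * last item.
		 */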
		if (path->slots[0] == btrfs_header_nritems(path->nodes[0]))
			path->slots[0]--;

		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &key, slot);
		found_start = key.objectid;
		found_end = found_start + key.offset;
		ret = 0;

		/*
		 * The stripe extent starts before the range we want to delete,
		 * but the range spans more than one stripe extent:
		 *
		 * |--- RAID Stripe Extent ---||--- RAID Stripe Extent ---|
		 *        |--- keep  ---|--- drop ---|
		 *
		 * This means we have to get the previous item, truncate its
		 * length and then restart the search.
		 */
		if (found_start > start) {
			if (slot == 0) {
				ret = btrfs_previous_item(stripe_root, path, start,
							  BTRFS_RAID_STRIPE_KEY);
				if (ret) {
					if (ret > 0)
						ret = -ENOENT;
					break;
				}
			} else {
				path->slots[0]--;
			}

			leaf = path->nodes[0];
			slot = path->slots[0];
			btrfs_item_key_to_cpu(leaf, &key, slot);
			found_start = key.objectid;
			found_end = found_start + key.offset;
			ASSERT(found_start <= start);
		}

		if (key.type != BTRFS_RAID_STRIPE_KEY)
			break;

		/* That stripe ends before we start, we're done. */
		if (found_end <= start)
			break;

		trace_btrfs_raid_extent_delete(fs_info, start, end,
					       found_start, found_end);

		/*
		 * The stripe extent starts before the range we want to delete
		 * and ends after the range we want to delete, i.e. we're
		 * punching a hole in the stripe extent:
		 *
		 * |--- RAID Stripe Extent ---|
		 * | keep |--- drop ---| keep |
		 *
		 * This means we need to a) truncate the existing item and b)
		 * create a second item for the remaining range.
		 */
		if (found_start < start && found_end > end) {
			size_t item_size;
			u64 diff_start = start - found_start;
			u64 diff_end = found_end - end;
			struct btrfs_stripe_extent *extent;
			struct btrfs_key newkey = {
				.objectid = end,
				.type = BTRFS_RAID_STRIPE_KEY,
				.offset = diff_end,
			};

			/* The "right" item. */
			ret = btrfs_duplicate_item(trans, stripe_root, path, &newkey);
			if (ret)
				break;

			/* The duplication may have COWed or split the leaf. */
			leaf = path->nodes[0];
			item_size = btrfs_item_size(leaf, path->slots[0]);
			extent = btrfs_item_ptr(leaf, path->slots[0],
						struct btrfs_stripe_extent);

			for (int i = 0; i < btrfs_num_raid_stripes(item_size); i++) {
				struct btrfs_raid_stride *stride = &extent->strides[i];
				u64 phys;

				phys = btrfs_raid_stride_physical(leaf, stride);
				phys += diff_start + length;
				btrfs_set_raid_stride_physical(leaf, stride, phys);
			}

			/* The "left" item. */
			path->slots[0]--;
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
			ret = btrfs_partially_delete_raid_extent(trans, path, &key,
								 diff_start, 0);
			break;
		}

		/*
		 * The stripe extent starts before the range we want to delete:
		 *
		 * |--- RAID Stripe Extent ---|
		 * |--- keep  ---|--- drop ---|
		 *
		 * This means we have to duplicate the tree item, truncate the
		 * length to the new size and then re-insert the item.
		 */
		if (found_start < start) {
			u64 diff_start = start - found_start;

			ret = btrfs_partially_delete_raid_extent(trans, path, &key,
								 diff_start, 0);
			if (ret)
				break;

			start += (key.offset - diff_start);
			length -= (key.offset - diff_start);
			if (length == 0)
				break;

			btrfs_release_path(path);
			continue;
		}

		/*
		 * The stripe extent ends after the range we want to delete:
		 *
		 * |--- RAID Stripe Extent ---|
		 * |--- drop ---|--- keep  ---|
		 *
		 * This means we have to duplicate the tree item, truncate the
		 * length to the new size and then re-insert the item.
		 */
		if (found_end > end) {
			u64 diff_end = found_end - end;

			ret = btrfs_partially_delete_raid_extent(trans, path, &key,
								 key.offset - length,
								 length);
			ASSERT(key.offset - diff_end == length);
			break;
		}

		/* Finally we can delete the whole item, no more special cases. */
		ret = btrfs_del_item(trans, stripe_root, path);
		if (ret)
			break;

		start += key.offset;
		length -= key.offset;
		if (length == 0)
			break;

		btrfs_release_path(path);
	}

	btrfs_free_path(path);
	return ret;
}

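/*
 * Overwrite an existing stripe extent item with the contents of
 * @stripe_extent. Used when inserting a new stripe extent returns -EEXIST.
 */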
static int update_raid_extent_item(struct btrfs_trans_handle *trans,
				   struct btrfs_key *key,
				   struct btrfs_stripe_extent *stripe_extent,
				   const size_t item_size)
{
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	int ret;
	int slot;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(trans, trans->fs_info->stripe_root, key, path,
				0, 1);
	if (ret) {
		btrfs_free_path(path);
		return (ret == 1 ? ret : -EINVAL);
	}

	leaf = path->nodes[0];
	slot = path->slots[0];

	write_extent_buffer(leaf, stripe_extent, btrfs_item_ptr_offset(leaf, slot),
			    item_size);
	btrfs_free_path(path);

	return ret;
}

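/*
 * Build a stripe extent item from the physical locations recorded in @bioc
 * and insert it at (bioc->logical, BTRFS_RAID_STRIPE_KEY, bioc->size). If an
 * item already exists at that key, overwrite it in place.
 */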
EXPORT_FOR_TESTS
int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans,
				 struct btrfs_io_context *bioc)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_key stripe_key;
	struct btrfs_root *stripe_root = fs_info->stripe_root;
	const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type);
	struct btrfs_stripe_extent *stripe_extent;
	const size_t item_size = struct_size(stripe_extent, strides, num_stripes);
	int ret;

	stripe_extent = kzalloc(item_size, GFP_NOFS);
	if (!stripe_extent) {
		btrfs_abort_transaction(trans, -ENOMEM);
		btrfs_end_transaction(trans);
		return -ENOMEM;
	}

	trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size,
					   num_stripes);
	for (int i = 0; i < num_stripes; i++) {
		u64 devid = bioc->stripes[i].dev->devid;
		u64 physical = bioc->stripes[i].physical;
		struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i];

		btrfs_set_stack_raid_stride_devid(raid_stride, devid);
		btrfs_set_stack_raid_stride_physical(raid_stride, physical);
	}

	stripe_key.objectid = bioc->logical;
	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
	stripe_key.offset = bioc->size;

	ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent,
				item_size);
	if (ret == -EEXIST)
		ret = update_raid_extent_item(trans, &stripe_key, stripe_extent,
					      item_size);
	if (ret)
		btrfs_abort_transaction(trans, ret);

	kfree(stripe_extent);

	return ret;
}

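/*
 * Write one stripe extent item for each btrfs_io_context queued on the
 * ordered extent and release the io contexts once all of them have been
 * inserted.
 */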
int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans,
			     struct btrfs_ordered_extent *ordered_extent)
{
	struct btrfs_io_context *bioc;
	int ret;

	if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE))
		return 0;

	list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) {
		ret = btrfs_insert_one_raid_extent(trans, bioc);
		if (ret)
			return ret;
	}

	while (!list_empty(&ordered_extent->bioc_list)) {
		bioc = list_first_entry(&ordered_extent->bioc_list,
					typeof(*bioc), rst_ordered_entry);
		list_del(&bioc->rst_ordered_entry);
		btrfs_put_bioc(bioc);
	}

	return 0;
}

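/*
 * Look up the physical address on @stripe->dev for the logical range starting
 * at @logical. On success, @stripe->physical is set and @length is trimmed to
 * the part of the requested range covered by the found stripe extent.
 */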
int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info,
				 u64 logical, u64 *length, u64 map_type,
				 u32 stripe_index, struct btrfs_io_stripe *stripe)
{
	struct btrfs_root *stripe_root = fs_info->stripe_root;
	struct btrfs_stripe_extent *stripe_extent;
	struct btrfs_key stripe_key;
	struct btrfs_key found_key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	const u64 end = logical + *length;
	int num_stripes;
	u64 offset;
	u64 found_logical;
	u64 found_length;
	u64 found_end;
	int slot;
	int ret;

	stripe_key.objectid = logical;
	stripe_key.type = BTRFS_RAID_STRIPE_KEY;
	stripe_key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	if (stripe->rst_search_commit_root) {
		path->skip_locking = 1;
		path->search_commit_root = 1;
	}

	ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0);
	if (ret < 0)
		goto free_path;
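	/*
	 * An exact match is not expected as stripe_key.offset is 0; on
	 * ret > 0 the path points at the insert position, so step back one
	 * slot to the stripe extent that may contain @logical (unless we are
	 * already at the start of the leaf).
	 */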
	if (ret) {
		if (path->slots[0] != 0)
			path->slots[0]--;
	}

	while (1) {
		leaf = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		found_logical = found_key.objectid;
		found_length = found_key.offset;
		found_end = found_logical + found_length;

		if (found_logical > end) {
			ret = -ENODATA;
			goto out;
		}

		if (in_range(logical, found_logical, found_length))
			break;

		ret = btrfs_next_item(stripe_root, path);
		if (ret)
			goto out;
	}

	offset = logical - found_logical;

	/*
	 * If we have a logically contiguous, but physically non-continuous
	 * range, we need to split the bio. Record the length after which we
	 * must split the bio.
	 */
	if (end > found_end)
		*length -= end - found_end;

	num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot));
	stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent);

	for (int i = 0; i < num_stripes; i++) {
		struct btrfs_raid_stride *stride = &stripe_extent->strides[i];
		u64 devid = btrfs_raid_stride_devid(leaf, stride);
		u64 physical = btrfs_raid_stride_physical(leaf, stride);

		if (devid != stripe->dev->devid)
			continue;

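		/*
		 * For DUP both strides live on the same device, so the devid
		 * check above cannot tell them apart; match on the requested
		 * stripe index instead.
		 */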
		if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i)
			continue;

		stripe->physical = physical + offset;

		trace_btrfs_get_raid_extent_offset(fs_info, logical, *length,
						   stripe->physical, devid);

		ret = 0;
		goto free_path;
	}

	/* If we're here, we haven't found the requested devid in the stripe. */
	ret = -ENODATA;
out:
	if (ret > 0)
		ret = -ENODATA;
	if (ret && ret != -EIO && !stripe->rst_search_commit_root) {
		btrfs_debug(fs_info,
	"cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s",
			    logical, logical + *length, stripe->dev->devid,
			    btrfs_bg_type_to_raid_name(map_type));
	}
free_path:
	btrfs_free_path(path);

	return ret;
}