summaryrefslogtreecommitdiff
path: root/cmds-check.c
diff options
context:
space:
mode:
authorFilipe Manana <fdmanana@suse.com>2014-10-17 18:20:08 +0100
committerDavid Sterba <dsterba@suse.cz>2014-10-17 18:24:54 +0200
commit555b7feaebc109bbc184f333779bd664adc34125 (patch)
tree6c1366dfc668186bce1517cb1f94772c1f06169e /cmds-check.c
parenta197726ad770760c28a2595cf24ef5cf40f673e0 (diff)
Btrfs-progs: check, ability to detect and fix outdated snapshot root items
This change adds code to detect and fix the issue introduced in the kernel release 3.17, where creation of read-only snapshots led to a corrupted filesystem if they were created at a moment when the source subvolume/snapshot had orphan items. The issue was that the on-disk root items became incorrect, referring to the pre orphan cleanup root node instead of the post orphan cleanup root node. A test filesystem can be generated with the test case recently submitted for xfstests/fstests, which is essentially the following (bash script): workout() { ops=$1 procs=$2 num_snapshots=$3 _scratch_mkfs >> $seqres.full 2>&1 _scratch_mount snapshot_cmd="$BTRFS_UTIL_PROG subvolume snapshot -r $SCRATCH_MNT" snapshot_cmd="$snapshot_cmd $SCRATCH_MNT/snap_\`date +'%H_%M_%S_%N'\`" run_check $FSSTRESS_PROG -p $procs \ -x "$snapshot_cmd" -X $num_snapshots -d $SCRATCH_MNT -n $ops } ops=10000 procs=4 snapshots=500 workout $ops $procs $snapshots Example of btrfsck's (btrfs check) behaviour against such a filesystem: $ btrfsck /dev/loop0 root item for root 311, current bytenr 44630016, current gen 60, current level 1, new bytenr 44957696, new gen 61, new level 1 root item for root 1480, current bytenr 1003569152, current gen 1271, current level 1, new bytenr 1004175360, new gen 1272, new level 1 root item for root 1509, current bytenr 1037434880, current gen 1300, current level 1, new bytenr 1038467072, new gen 1301, new level 1 root item for root 1562, current bytenr 33636352, current gen 1354, current level 1, new bytenr 34455552, new gen 1355, new level 1 root item for root 3094, current bytenr 1011712000, current gen 2935, current level 1, new bytenr 1008484352, new gen 2936, new level 1 root item for root 3716, current bytenr 80805888, current gen 3578, current level 1, new bytenr 73515008, new gen 3579, new level 1 root item for root 4085, current bytenr 714031104, current gen 3958, current level 1, new bytenr 716816384, new gen 3959, new level 1 Found 7 roots with an outdated root 
item. Please run a filesystem check with the option --repair to fix them. $ echo $? 1 $ btrfsck --repair /dev/loop0 enabling repair mode fixing root item for root 311, current bytenr 44630016, current gen 60, current level 1, new bytenr 44957696, new gen 61, new level 1 fixing root item for root 1480, current bytenr 1003569152, current gen 1271, current level 1, new bytenr 1004175360, new gen 1272, new level 1 fixing root item for root 1509, current bytenr 1037434880, current gen 1300, current level 1, new bytenr 1038467072, new gen 1301, new level 1 fixing root item for root 1562, current bytenr 33636352, current gen 1354, current level 1, new bytenr 34455552, new gen 1355, new level 1 fixing root item for root 3094, current bytenr 1011712000, current gen 2935, current level 1, new bytenr 1008484352, new gen 2936, new level 1 fixing root item for root 3716, current bytenr 80805888, current gen 3578, current level 1, new bytenr 73515008, new gen 3579, new level 1 fixing root item for root 4085, current bytenr 714031104, current gen 3958, current level 1, new bytenr 716816384, new gen 3959, new level 1 Fixed 7 roots. Checking filesystem on /dev/loop0 UUID: 2186e9b9-c977-4a35-9c7b-69c6609d4620 checking extents checking free space cache cache and super generation don't match, space cache will be invalidated checking fs roots checking csums checking root refs found 618537000 bytes used err is 0 total csum bytes: 130824 total tree bytes: 601620480 total fs tree bytes: 580288512 total extent tree bytes: 18464768 btree space waste bytes: 136939144 file data blocks allocated: 34150318080 referenced 27815415808 Btrfs v3.17-rc3-2-gbbe1dd8 $ echo $? 0 Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.cz>
Diffstat (limited to 'cmds-check.c')
-rw-r--r--cmds-check.c356
1 files changed, 356 insertions, 0 deletions
diff --git a/cmds-check.c b/cmds-check.c
index 310eb2a8..2a5f823d 100644
--- a/cmds-check.c
+++ b/cmds-check.c
@@ -7198,6 +7198,345 @@ static int fill_csum_tree(struct btrfs_trans_handle *trans,
return ret;
}
+struct root_item_info { /* candidate root node of one tree, derived from tree block extent items */
+ /* level of the root: highest tree block level recorded so far, (u8)-1 until the first block is recorded */
+ u8 level;
+ /* number of nodes at this level, must be 1 for a root */
+ int node_count;
+ u64 bytenr; /* objectid of the extent item key of the recorded block */
+ u64 gen; /* generation read from the extent item of the recorded block */
+ struct cache_extent cache_extent; /* linkage into roots_info_cache: start is the root id, size is 1 */
+};
+
+static struct cache_tree *roots_info_cache = NULL; /* keyed by root id; allocated in build_roots_info_cache(), released in free_roots_info_cache() */
+
+/*
+ * Drop every root_item_info held in roots_info_cache, then release the cache
+ * tree itself and reset the global pointer. A no-op when the cache was never
+ * allocated.
+ */
+static void free_roots_info_cache(void)
+{
+	struct cache_tree *tree = roots_info_cache;
+	struct cache_extent *node;
+
+	if (tree == NULL)
+		return;
+
+	while (!cache_tree_empty(tree)) {
+		node = first_cache_extent(tree);
+		/* Unlink first, the entry memory is owned by its root_item_info. */
+		remove_cache_extent(tree, node);
+		free(container_of(node, struct root_item_info, cache_extent));
+	}
+
+	free(tree);
+	roots_info_cache = NULL;
+}
+
+/*
+ * Walk the whole extent tree and, for every tree block whose first inline
+ * reference is a BTRFS_TREE_BLOCK_REF_KEY, remember per owning root the
+ * highest level block seen (bytenr, generation) and how many blocks were
+ * seen at that level. The results are stored in roots_info_cache, keyed by
+ * root id.
+ *
+ * Returns 0 on success or a negative errno. The cache is left for the caller
+ * to release, also on failure.
+ */
+static int build_roots_info_cache(struct btrfs_fs_info *info)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int ret = 0;
+
+	if (roots_info_cache == NULL) {
+		roots_info_cache = malloc(sizeof(*roots_info_cache));
+		if (roots_info_cache == NULL)
+			return -ENOMEM;
+		cache_tree_init(roots_info_cache);
+	}
+
+	path = btrfs_alloc_path();
+	if (path == NULL)
+		return -ENOMEM;
+
+	/* Position at the very first item of the extent tree. */
+	key.objectid = 0;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, info->extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	leaf = path->nodes[0];
+
+	/* Every "continue" below advances to the next slot of the leaf. */
+	for (;; path->slots[0]++) {
+		struct btrfs_key found_key;
+		struct btrfs_extent_item *ei;
+		struct btrfs_extent_inline_ref *iref;
+		struct cache_extent *entry;
+		struct root_item_info *rii;
+		int slot = path->slots[0];
+		u64 root_id;
+		u8 level;
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(info->extent_root, path);
+			if (ret < 0)
+				break;
+			if (ret > 0) {
+				/* No more leaves, the walk is complete. */
+				ret = 0;
+				break;
+			}
+			leaf = path->nodes[0];
+			slot = path->slots[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+			/* The level is taken from the key offset here. */
+			ei = btrfs_item_ptr(leaf, slot,
+					    struct btrfs_extent_item);
+			iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+			level = found_key.offset;
+		} else if (found_key.type == BTRFS_EXTENT_ITEM_KEY) {
+			struct btrfs_tree_block_info *bi;
+
+			ei = btrfs_item_ptr(leaf, slot,
+					    struct btrfs_extent_item);
+			/* Only extents flagged as tree blocks are of interest. */
+			if (!(btrfs_extent_flags(leaf, ei) &
+			      BTRFS_EXTENT_FLAG_TREE_BLOCK))
+				continue;
+
+			bi = (struct btrfs_tree_block_info *)(ei + 1);
+			iref = (struct btrfs_extent_inline_ref *)(bi + 1);
+			level = btrfs_tree_block_level(leaf, bi);
+		} else {
+			continue;
+		}
+
+		/*
+		 * For a root extent, it must be of the following type and the
+		 * first (and only one) iref in the item.
+		 */
+		if (btrfs_extent_inline_ref_type(leaf, iref) !=
+		    BTRFS_TREE_BLOCK_REF_KEY)
+			continue;
+
+		root_id = btrfs_extent_inline_ref_offset(leaf, iref);
+		entry = lookup_cache_extent(roots_info_cache, root_id, 1);
+		if (entry != NULL) {
+			rii = container_of(entry, struct root_item_info,
+					   cache_extent);
+		} else {
+			rii = malloc(sizeof(*rii));
+			if (rii == NULL) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			rii->cache_extent.start = root_id;
+			rii->cache_extent.size = 1;
+			/* Sentinel: no block recorded yet for this root. */
+			rii->level = (u8)-1;
+			entry = &rii->cache_extent;
+			ret = insert_cache_extent(roots_info_cache, entry);
+			ASSERT(ret == 0);
+		}
+
+		ASSERT(rii->cache_extent.start == root_id);
+		ASSERT(rii->cache_extent.size == 1);
+
+		if (rii->level == (u8)-1 || level > rii->level) {
+			/* First block, or a higher one: new root candidate. */
+			rii->level = level;
+			rii->bytenr = found_key.objectid;
+			rii->gen = btrfs_extent_generation(leaf, ei);
+			rii->node_count = 1;
+		} else if (level == rii->level) {
+			rii->node_count++;
+		}
+	}
+
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
+/*
+ * Compare the root item that path points at with the root node recorded for
+ * the same root in roots_info_cache.
+ *
+ * When read_only_mode is zero and the item is outdated, its bytenr, level
+ * and generation are rewritten in the leaf.
+ *
+ * Returns 0 if the root item is up to date, 1 if it is outdated (and was
+ * rewritten when not in read only mode), -ENOENT if no single root node is
+ * known for the root, and -EINVAL if the root item is newer than the node
+ * that was found.
+ */
+static int maybe_repair_root_item(struct btrfs_fs_info *info,
+				  struct btrfs_path *path,
+				  const struct btrfs_key *root_key,
+				  const int read_only_mode)
+{
+	const u64 root_id = root_key->objectid;
+	struct extent_buffer *leaf = path->nodes[0];
+	struct cache_extent *entry;
+	struct root_item_info *rii;
+	struct btrfs_root_item ri;
+	unsigned long offset;
+
+	entry = lookup_cache_extent(roots_info_cache, root_id, 1);
+	if (entry == NULL) {
+		fprintf(stderr,
+			"Error: could not find extent items for root %llu\n",
+			root_key->objectid);
+		return -ENOENT;
+	}
+
+	rii = container_of(entry, struct root_item_info, cache_extent);
+	ASSERT(rii->cache_extent.start == root_id);
+	ASSERT(rii->cache_extent.size == 1);
+
+	/* A root has exactly one node at its top level. */
+	if (rii->node_count != 1) {
+		fprintf(stderr,
+			"Error: could not find btree root extent for root %llu\n",
+			root_id);
+		return -ENOENT;
+	}
+
+	offset = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	read_extent_buffer(leaf, &ri, offset, sizeof(ri));
+
+	/* Nothing to do when the item already matches the recorded node. */
+	if (btrfs_root_bytenr(&ri) == rii->bytenr &&
+	    btrfs_root_level(&ri) == rii->level &&
+	    btrfs_root_generation(&ri) == rii->gen)
+		return 0;
+
+	/*
+	 * In repair mode a read only call is only a probe: the caller opens a
+	 * transaction and calls again for the same root item right after, so
+	 * stay quiet on the probe to avoid reporting the item twice.
+	 */
+	if (!read_only_mode || !repair)
+		fprintf(stderr,
+			"%sroot item for root %llu,"
+			" current bytenr %llu, current gen %llu, current level %u,"
+			" new bytenr %llu, new gen %llu, new level %u\n",
+			(read_only_mode ? "" : "fixing "),
+			root_id,
+			btrfs_root_bytenr(&ri), btrfs_root_generation(&ri),
+			btrfs_root_level(&ri),
+			rii->bytenr, rii->gen, rii->level);
+
+	if (btrfs_root_generation(&ri) > rii->gen) {
+		fprintf(stderr,
+			"root %llu has a root item with a more recent gen (%llu) compared to the found root node (%llu)\n",
+			root_id, btrfs_root_generation(&ri), rii->gen);
+		return -EINVAL;
+	}
+
+	if (read_only_mode)
+		return 1;
+
+	btrfs_set_root_bytenr(&ri, rii->bytenr);
+	btrfs_set_root_level(&ri, rii->level);
+	btrfs_set_root_generation(&ri, rii->gen);
+	write_extent_buffer(leaf, &ri, offset, sizeof(ri));
+
+	return 1;
+}
+
+/*
+ * A regression introduced in the 3.17 kernel (more specifically in 3.17-rc2),
+ * caused read-only snapshots to be corrupted if they were created at a moment
+ * when the source subvolume/snapshot had orphan items. The issue was that the
+ * on-disk root items became incorrect, referring to the pre orphan cleanup root
+ * node instead of the post orphan cleanup root node.
+ * So this function, and its callees, just detect and fix those cases. Even
+ * though the regression was for read-only snapshots, this function applies to
+ * any snapshot/subvolume root.
+ * This must be run before any other repair code. Otherwise the other repair
+ * code may delete or modify backrefs in the extent tree, for example, which
+ * results in an inconsistent fs after repairing the root items.
+ *
+ * Returns the number of outdated root items found (and rewritten when the
+ * global repair flag is set), or a negative errno on failure.
+ */
+static int repair_root_items(struct btrfs_fs_info *info)
+{
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_trans_handle *trans = NULL;
+	int ret = 0;
+	int bad_roots = 0;
+	int need_trans = 0;
+
+	ret = build_roots_info_cache(info);
+	if (ret)
+		goto out;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = 0;
+
+again:
+	/*
+	 * Avoid opening and committing transactions if a leaf doesn't have
+	 * any root items that need to be fixed, so that we avoid rotating
+	 * backup roots unnecessarily.
+	 */
+	if (need_trans) {
+		trans = btrfs_start_transaction(info->tree_root, 1);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			/* Not a usable handle, the cleanup must not touch it. */
+			trans = NULL;
+			goto out;
+		}
+	}
+
+	ret = btrfs_search_slot(trans, info->tree_root, &key, path,
+				0, trans ? 1 : 0);
+	if (ret < 0)
+		goto out;
+	leaf = path->nodes[0];
+
+	while (1) {
+		struct btrfs_key found_key;
+
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			/* Remember where to resume before dropping the path. */
+			int no_more_keys = find_next_key(path, &key);
+
+			btrfs_release_path(path);
+			if (trans) {
+				ret = btrfs_commit_transaction(trans,
+							info->tree_root);
+				trans = NULL;
+				if (ret < 0)
+					goto out;
+			}
+			need_trans = 0;
+			if (no_more_keys)
+				break;
+			goto again;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		if (found_key.type != BTRFS_ROOT_ITEM_KEY)
+			goto next;
+
+		ret = maybe_repair_root_item(info, path, &found_key,
+					     trans ? 0 : 1);
+		if (ret < 0)
+			goto out;
+		if (ret) {
+			if (!trans && repair) {
+				/*
+				 * Outdated item found while only probing:
+				 * restart from this key with a transaction
+				 * so that the leaf can be modified.
+				 */
+				need_trans = 1;
+				key = found_key;
+				btrfs_release_path(path);
+				goto again;
+			}
+			bad_roots++;
+		}
+next:
+		path->slots[0]++;
+	}
+	ret = 0;
+out:
+	free_roots_info_cache();
+	if (path)
+		btrfs_free_path(path);
+	/*
+	 * An error path may get here with a transaction still open. Commit it
+	 * so the handle is not leaked and the root items already rewritten in
+	 * this leaf reach the disk. The original error in ret is what gets
+	 * reported, so the commit result is intentionally not propagated.
+	 */
+	if (trans)
+		btrfs_commit_transaction(trans, info->tree_root);
+	if (ret < 0)
+		return ret;
+
+	return bad_roots;
+}
+
static struct option long_options[] = {
{ "super", 1, NULL, 's' },
{ "repair", 0, NULL, 0 },
@@ -7320,6 +7659,23 @@ int cmd_check(int argc, char **argv)
}
root = info->fs_root;
+
+ ret = repair_root_items(info);
+ if (ret < 0)
+ goto close_out;
+ if (repair) {
+ fprintf(stderr, "Fixed %d roots.\n", ret);
+ ret = 0;
+ } else if (ret > 0) {
+ fprintf(stderr,
+ "Found %d roots with an outdated root item.\n",
+ ret);
+ fprintf(stderr,
+ "Please run a filesystem check with the option --repair to fix them.\n");
+ ret = 1;
+ goto close_out;
+ }
+
/*
* repair mode will force us to commit transaction which
* will make us fail to load log tree when mounting.