From 4d48b96b280a76f29eb5d5f1578559932f551b2d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 11 Jul 2009 18:12:37 +0100 Subject: Add basic RAID[56] support David Woodhouse originally contributed this code, and Chris Mason changed it around to reflect the current design goals for raid56. The original code expected all metadata and data writes to be full stripes. This meant metadata block size == stripe size, and had a few other restrictions. This version allows metadata blocks smaller than the stripe size. It implements both raid5 and raid6, although it does not have code to rebuild from parity if one of the drives is missing or incorrect. Signed-off-by: David Woodhouse Signed-off-by: Chris Mason --- disk-io.c | 216 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 194 insertions(+), 22 deletions(-) (limited to 'disk-io.c') diff --git a/disk-io.c b/disk-io.c index f00fe4e7..8b369050 100644 --- a/disk-io.c +++ b/disk-io.c @@ -89,8 +89,8 @@ int csum_tree_block_size(struct extent_buffer *buf, u16 csum_size, if (verify) { if (memcmp_extent_buffer(buf, result, 0, csum_size)) { - printk("checksum verify failed on %llu wanted %X " - "found %X\n", (unsigned long long)buf->start, + printk("checksum verify failed on %llu found %X " + "wanted %X\n", (unsigned long long)buf->start, *((int *)result), *((char *)buf->data)); free(result); return 1; @@ -141,7 +141,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, length = blocksize; ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - bytenr, &length, &multi, 0); + bytenr, &length, &multi, 0, NULL); BUG_ON(ret); device = multi->stripes[0].dev; device->total_ios++; @@ -182,15 +182,52 @@ out: } +static int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, int mirror) +{ + unsigned long offset = 0; + struct btrfs_multi_bio *multi = NULL; + struct btrfs_device *device; + int ret = 0; + u64 read_len; + unsigned long bytes_left = eb->len; + + while (bytes_left) { + read_len = bytes_left; + ret = btrfs_map_block(&info->mapping_tree, READ, + eb->start + offset, &read_len, &multi, + mirror, NULL); + if (ret) { + printk("Couldn't map the block %Lu\n", eb->start + offset); + return -EIO; + } + device = multi->stripes[0].dev; + + if (device->fd == 0) + return -EIO; + + eb->fd = device->fd; + device->total_ios++; + eb->dev_bytenr = multi->stripes[0].physical; + kfree(multi); + + if (read_len > bytes_left) + read_len = bytes_left; + + ret = read_extent_from_disk(eb, offset, read_len); + if (ret) + return -EIO; + offset += read_len; + bytes_left -= read_len; + } + return 0; +} + struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, u64 parent_transid) { int ret; struct extent_buffer *eb; - u64 length; u64 best_transid = 0; - struct btrfs_multi_bio *multi = NULL; - struct btrfs_device *device; int mirror_num = 0; int good_mirror = 0; int num_copies; @@ -203,21 +240,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, if (btrfs_buffer_uptodate(eb, parent_transid)) return eb; - length = blocksize; while (1) { - ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, - eb->start, &length, &multi, mirror_num); - if (ret) { - printk("Couldn't map the block %Lu\n", bytenr); - break; - } - device = multi->stripes[0].dev; - eb->fd = device->fd; - device->total_ios++; - eb->dev_bytenr = multi->stripes[0].physical; - kfree(multi); - ret = read_extent_from_disk(eb); - + ret = read_whole_eb(root->fs_info, eb, mirror_num); if (ret == 0 && check_tree_block(root, eb) == 0 && csum_tree_block(root, eb, 1) == 0 && verify_parent_transid(eb->tree, eb, parent_transid, ignore) @@ -253,12 +277,156 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, return NULL; } +static int rmw_eb(struct btrfs_fs_info *info, + struct extent_buffer *eb, struct extent_buffer *orig_eb) +{ + int ret; + unsigned long orig_off = 0; + unsigned long dest_off = 0; + unsigned long copy_len = eb->len; + + ret = read_whole_eb(info, eb, 0); + if (ret) + return ret; + + if (eb->start + eb->len <= orig_eb->start || + eb->start >= orig_eb->start + orig_eb->len) + return 0; + /* + * | ----- orig_eb ------- | + * | ----- stripe ------- | + * | ----- orig_eb ------- | + * | ----- orig_eb ------- | + */ + if (eb->start > orig_eb->start) + orig_off = eb->start - orig_eb->start; + if (orig_eb->start > eb->start) + dest_off = orig_eb->start - eb->start; + + if (copy_len > orig_eb->len - orig_off) + copy_len = orig_eb->len - orig_off; + if (copy_len > eb->len - dest_off) + copy_len = eb->len - dest_off; + + memcpy(eb->data + dest_off, orig_eb->data + orig_off, copy_len); + return 0; +} + +static void split_eb_for_raid56(struct btrfs_fs_info *info, + struct extent_buffer *orig_eb, + struct extent_buffer **ebs, + u64 stripe_len, u64 *raid_map, + int num_stripes) +{ + struct extent_buffer *eb; + u64 start = orig_eb->start; + u64 this_eb_start; + int i; + int ret; + + for (i = 0; i < num_stripes; i++) { + if (raid_map[i] >= BTRFS_RAID5_P_STRIPE) + break; + + eb = malloc(sizeof(struct extent_buffer) + stripe_len); + if (!eb) + BUG(); + memset(eb, 0, sizeof(struct extent_buffer) + stripe_len); + + eb->start = raid_map[i]; + eb->len = stripe_len; + eb->refs = 1; + eb->flags = 0; + eb->fd = -1; + eb->dev_bytenr = (u64)-1; + + this_eb_start = raid_map[i]; + + if (start > this_eb_start || + start + orig_eb->len < this_eb_start + stripe_len) { + ret = rmw_eb(info, eb, orig_eb); + BUG_ON(ret); + } else { + memcpy(eb->data, orig_eb->data + eb->start - start, stripe_len); + } + ebs[i] = eb; + } +} + +static int write_raid56_with_parity(struct btrfs_fs_info *info, + struct extent_buffer *eb, + struct btrfs_multi_bio *multi, + u64 stripe_len, u64 *raid_map) +{ + struct extent_buffer *ebs[multi->num_stripes], *p_eb = NULL, *q_eb = NULL; + int i; + int j; + int ret; + int alloc_size = eb->len; + + if (stripe_len > alloc_size) + alloc_size = stripe_len; + + split_eb_for_raid56(info, eb, ebs, stripe_len, raid_map, + multi->num_stripes); + + for (i = 0; i < multi->num_stripes; i++) { + struct extent_buffer *new_eb; + if (raid_map[i] < BTRFS_RAID5_P_STRIPE) { + ebs[i]->dev_bytenr = multi->stripes[i].physical; + ebs[i]->fd = multi->stripes[i].dev->fd; + multi->stripes[i].dev->total_ios++; + BUG_ON(ebs[i]->start != raid_map[i]); + continue; + } + new_eb = kmalloc(sizeof(*eb) + alloc_size, GFP_NOFS); + BUG_ON(!new_eb); + new_eb->dev_bytenr = multi->stripes[i].physical; + new_eb->fd = multi->stripes[i].dev->fd; + multi->stripes[i].dev->total_ios++; + new_eb->len = stripe_len; + + if (raid_map[i] == BTRFS_RAID5_P_STRIPE) + p_eb = new_eb; + else if (raid_map[i] == BTRFS_RAID6_Q_STRIPE) + q_eb = new_eb; + } + if (q_eb) { + void *pointers[multi->num_stripes]; + ebs[multi->num_stripes - 2] = p_eb; + ebs[multi->num_stripes - 1] = q_eb; + + for (i = 0; i < multi->num_stripes; i++) + pointers[i] = ebs[i]->data; + + raid6_gen_syndrome(multi->num_stripes, stripe_len, pointers); + } else { + ebs[multi->num_stripes - 1] = p_eb; + memcpy(p_eb->data, ebs[0]->data, stripe_len); + for (j = 1; j < multi->num_stripes - 1; j++) { + for (i = 0; i < stripe_len; i += sizeof(unsigned long)) { + *(unsigned long *)(p_eb->data + i) ^= + *(unsigned long *)(ebs[j]->data + i); + } + } + } + + for (i = 0; i < multi->num_stripes; i++) { + ret = write_extent_to_disk(ebs[i]); + BUG_ON(ret); + if (ebs[i] != eb) + kfree(ebs[i]); + } + return 0; +} + int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) { int ret; int dev_nr; u64 length; + u64 *raid_map = NULL; struct btrfs_multi_bio *multi = NULL; if (check_tree_block(root, eb)) @@ -272,9 +440,13 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, dev_nr = 0; length = eb->len; ret = btrfs_map_block(&root->fs_info->mapping_tree, WRITE, - eb->start, &length, &multi, 0); + eb->start, &length, &multi, 0, &raid_map); - while(dev_nr < multi->num_stripes) { + if (raid_map) { + ret = write_raid56_with_parity(root->fs_info, eb, multi, + length, raid_map); + BUG_ON(ret); + } else while (dev_nr < multi->num_stripes) { BUG_ON(ret); eb->fd = multi->stripes[dev_nr].dev->fd; eb->dev_bytenr = multi->stripes[dev_nr].physical; -- cgit v1.2.3