summaryrefslogtreecommitdiff
path: root/disk-io.c
diff options
context:
space:
mode:
authorDavid Woodhouse <David.Woodhouse@intel.com>2009-07-11 18:12:37 +0100
committerChris Mason <chris.mason@fusionio.com>2013-02-01 14:22:07 -0500
commit4d48b96b280a76f29eb5d5f1578559932f551b2d (patch)
treec67db59d335759896fcb3c05d54f1a3232dd6471 /disk-io.c
parent6f082141d0a608c286c3f1b42ddd558e72ea1d9f (diff)
Add basic RAID[56] support
David Woodhouse originally contributed this code, and Chris Mason changed it around to reflect the current design goals for raid56. The original code expected all metadata and data writes to be full stripes. This meant metadata block size == stripe size, and had a few other restrictions. This version allows metadata blocks smaller than the stripe size. It implements both raid5 and raid6, although it does not have code to rebuild from parity if one of the drives is missing or incorrect. Signed-off-by: David Woodhouse <David.Woodhouse@intel.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
Diffstat (limited to 'disk-io.c')
-rw-r--r--disk-io.c216
1 files changed, 194 insertions, 22 deletions
diff --git a/disk-io.c b/disk-io.c
index f00fe4e7..8b369050 100644
--- a/disk-io.c
+++ b/disk-io.c
@@ -89,8 +89,8 @@ int csum_tree_block_size(struct extent_buffer *buf, u16 csum_size,
if (verify) {
if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
- printk("checksum verify failed on %llu wanted %X "
- "found %X\n", (unsigned long long)buf->start,
+ printk("checksum verify failed on %llu found %X "
+ "wanted %X\n", (unsigned long long)buf->start,
*((int *)result), *((char *)buf->data));
free(result);
return 1;
@@ -141,7 +141,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
length = blocksize;
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
- bytenr, &length, &multi, 0);
+ bytenr, &length, &multi, 0, NULL);
BUG_ON(ret);
device = multi->stripes[0].dev;
device->total_ios++;
@@ -182,15 +182,52 @@ out:
}
+static int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, int mirror)
+{
+ unsigned long offset = 0;
+ struct btrfs_multi_bio *multi = NULL;
+ struct btrfs_device *device;
+ int ret = 0;
+ u64 read_len;
+ unsigned long bytes_left = eb->len;
+
+ while (bytes_left) {
+ read_len = bytes_left;
+ ret = btrfs_map_block(&info->mapping_tree, READ,
+ eb->start + offset, &read_len, &multi,
+ mirror, NULL);
+ if (ret) {
+ printk("Couldn't map the block %Lu\n", eb->start + offset);
+ return -EIO;
+ }
+ device = multi->stripes[0].dev;
+
+ if (device->fd == 0)
+ return -EIO;
+
+ eb->fd = device->fd;
+ device->total_ios++;
+ eb->dev_bytenr = multi->stripes[0].physical;
+ kfree(multi);
+
+ if (read_len > bytes_left)
+ read_len = bytes_left;
+
+ ret = read_extent_from_disk(eb, offset, read_len);
+ if (ret)
+ return -EIO;
+ offset += read_len;
+ bytes_left -= read_len;
+ }
+ return 0;
+}
+
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
u32 blocksize, u64 parent_transid)
{
int ret;
struct extent_buffer *eb;
- u64 length;
u64 best_transid = 0;
- struct btrfs_multi_bio *multi = NULL;
- struct btrfs_device *device;
int mirror_num = 0;
int good_mirror = 0;
int num_copies;
@@ -203,21 +240,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
if (btrfs_buffer_uptodate(eb, parent_transid))
return eb;
- length = blocksize;
while (1) {
- ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
- eb->start, &length, &multi, mirror_num);
- if (ret) {
- printk("Couldn't map the block %Lu\n", bytenr);
- break;
- }
- device = multi->stripes[0].dev;
- eb->fd = device->fd;
- device->total_ios++;
- eb->dev_bytenr = multi->stripes[0].physical;
- kfree(multi);
- ret = read_extent_from_disk(eb);
-
+ ret = read_whole_eb(root->fs_info, eb, mirror_num);
if (ret == 0 && check_tree_block(root, eb) == 0 &&
csum_tree_block(root, eb, 1) == 0 &&
verify_parent_transid(eb->tree, eb, parent_transid, ignore)
@@ -253,12 +277,156 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
return NULL;
}
+static int rmw_eb(struct btrfs_fs_info *info,
+ struct extent_buffer *eb, struct extent_buffer *orig_eb)
+{
+ int ret;
+ unsigned long orig_off = 0;
+ unsigned long dest_off = 0;
+ unsigned long copy_len = eb->len;
+
+ ret = read_whole_eb(info, eb, 0);
+ if (ret)
+ return ret;
+
+ if (eb->start + eb->len <= orig_eb->start ||
+ eb->start >= orig_eb->start + orig_eb->len)
+ return 0;
+ /*
+ * | ----- orig_eb ------- |
+ * | ----- stripe ------- |
+ * | ----- orig_eb ------- |
+ * | ----- orig_eb ------- |
+ */
+ if (eb->start > orig_eb->start)
+ orig_off = eb->start - orig_eb->start;
+ if (orig_eb->start > eb->start)
+ dest_off = orig_eb->start - eb->start;
+
+ if (copy_len > orig_eb->len - orig_off)
+ copy_len = orig_eb->len - orig_off;
+ if (copy_len > eb->len - dest_off)
+ copy_len = eb->len - dest_off;
+
+ memcpy(eb->data + dest_off, orig_eb->data + orig_off, copy_len);
+ return 0;
+}
+
+static void split_eb_for_raid56(struct btrfs_fs_info *info,
+ struct extent_buffer *orig_eb,
+ struct extent_buffer **ebs,
+ u64 stripe_len, u64 *raid_map,
+ int num_stripes)
+{
+ struct extent_buffer *eb;
+ u64 start = orig_eb->start;
+ u64 this_eb_start;
+ int i;
+ int ret;
+
+ for (i = 0; i < num_stripes; i++) {
+ if (raid_map[i] >= BTRFS_RAID5_P_STRIPE)
+ break;
+
+ eb = malloc(sizeof(struct extent_buffer) + stripe_len);
+ if (!eb)
+ BUG();
+ memset(eb, 0, sizeof(struct extent_buffer) + stripe_len);
+
+ eb->start = raid_map[i];
+ eb->len = stripe_len;
+ eb->refs = 1;
+ eb->flags = 0;
+ eb->fd = -1;
+ eb->dev_bytenr = (u64)-1;
+
+ this_eb_start = raid_map[i];
+
+ if (start > this_eb_start ||
+ start + orig_eb->len < this_eb_start + stripe_len) {
+ ret = rmw_eb(info, eb, orig_eb);
+ BUG_ON(ret);
+ } else {
+ memcpy(eb->data, orig_eb->data + eb->start - start, stripe_len);
+ }
+ ebs[i] = eb;
+ }
+}
+
+static int write_raid56_with_parity(struct btrfs_fs_info *info,
+ struct extent_buffer *eb,
+ struct btrfs_multi_bio *multi,
+ u64 stripe_len, u64 *raid_map)
+{
+ struct extent_buffer *ebs[multi->num_stripes], *p_eb = NULL, *q_eb = NULL;
+ int i;
+ int j;
+ int ret;
+ int alloc_size = eb->len;
+
+ if (stripe_len > alloc_size)
+ alloc_size = stripe_len;
+
+ split_eb_for_raid56(info, eb, ebs, stripe_len, raid_map,
+ multi->num_stripes);
+
+ for (i = 0; i < multi->num_stripes; i++) {
+ struct extent_buffer *new_eb;
+ if (raid_map[i] < BTRFS_RAID5_P_STRIPE) {
+ ebs[i]->dev_bytenr = multi->stripes[i].physical;
+ ebs[i]->fd = multi->stripes[i].dev->fd;
+ multi->stripes[i].dev->total_ios++;
+ BUG_ON(ebs[i]->start != raid_map[i]);
+ continue;
+ }
+ new_eb = kmalloc(sizeof(*eb) + alloc_size, GFP_NOFS);
+ BUG_ON(!new_eb);
+ new_eb->dev_bytenr = multi->stripes[i].physical;
+ new_eb->fd = multi->stripes[i].dev->fd;
+ multi->stripes[i].dev->total_ios++;
+ new_eb->len = stripe_len;
+
+ if (raid_map[i] == BTRFS_RAID5_P_STRIPE)
+ p_eb = new_eb;
+ else if (raid_map[i] == BTRFS_RAID6_Q_STRIPE)
+ q_eb = new_eb;
+ }
+ if (q_eb) {
+ void *pointers[multi->num_stripes];
+ ebs[multi->num_stripes - 2] = p_eb;
+ ebs[multi->num_stripes - 1] = q_eb;
+
+ for (i = 0; i < multi->num_stripes; i++)
+ pointers[i] = ebs[i]->data;
+
+ raid6_gen_syndrome(multi->num_stripes, stripe_len, pointers);
+ } else {
+ ebs[multi->num_stripes - 1] = p_eb;
+ memcpy(p_eb->data, ebs[0]->data, stripe_len);
+ for (j = 1; j < multi->num_stripes - 1; j++) {
+ for (i = 0; i < stripe_len; i += sizeof(unsigned long)) {
+ *(unsigned long *)(p_eb->data + i) ^=
+ *(unsigned long *)(ebs[j]->data + i);
+ }
+ }
+ }
+
+ for (i = 0; i < multi->num_stripes; i++) {
+ ret = write_extent_to_disk(ebs[i]);
+ BUG_ON(ret);
+ if (ebs[i] != eb)
+ kfree(ebs[i]);
+ }
+ return 0;
+}
+
int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *eb)
{
int ret;
int dev_nr;
u64 length;
+ u64 *raid_map = NULL;
struct btrfs_multi_bio *multi = NULL;
if (check_tree_block(root, eb))
@@ -272,9 +440,13 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
dev_nr = 0;
length = eb->len;
ret = btrfs_map_block(&root->fs_info->mapping_tree, WRITE,
- eb->start, &length, &multi, 0);
+ eb->start, &length, &multi, 0, &raid_map);
- while(dev_nr < multi->num_stripes) {
+ if (raid_map) {
+ ret = write_raid56_with_parity(root->fs_info, eb, multi,
+ length, raid_map);
+ BUG_ON(ret);
+ } else while (dev_nr < multi->num_stripes) {
BUG_ON(ret);
eb->fd = multi->stripes[dev_nr].dev->fd;
eb->dev_bytenr = multi->stripes[dev_nr].physical;