summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile2
-rw-r--r--btrfs-corrupt-block.c5
-rw-r--r--btrfs-map-logical.c5
-rw-r--r--cmds-balance.c4
-rw-r--r--cmds-filesystem.c6
-rw-r--r--convert.c2
-rw-r--r--ctree.h8
-rw-r--r--disk-io.c223
-rw-r--r--disk-io.h3
-rw-r--r--extent-tree.c6
-rw-r--r--extent_io.c7
-rw-r--r--extent_io.h3
-rw-r--r--find-root.c3
-rw-r--r--kerncompat.h6
-rw-r--r--mkfs.c40
-rw-r--r--raid6.c99
-rw-r--r--restore.c2
-rw-r--r--volumes.c184
-rw-r--r--volumes.h10
19 files changed, 561 insertions, 57 deletions
diff --git a/Makefile b/Makefile
index bef1e13b..596bf93f 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ objects = ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
root-tree.o dir-item.o file-item.o inode-item.o \
inode-map.o crc32c.o rbtree.o extent-cache.o extent_io.o \
volumes.o utils.o btrfs-list.o btrfslabel.o repair.o \
- send-stream.o send-utils.o qgroup.o
+ send-stream.o send-utils.o qgroup.o raid6.o
cmds_objects = cmds-subvolume.o cmds-filesystem.o cmds-device.o cmds-scrub.o \
cmds-inspect.o cmds-balance.o cmds-send.o cmds-receive.o \
cmds-quota.o cmds-qgroup.o cmds-replace.o
diff --git a/btrfs-corrupt-block.c b/btrfs-corrupt-block.c
index b57e7573..8176fad3 100644
--- a/btrfs-corrupt-block.c
+++ b/btrfs-corrupt-block.c
@@ -50,7 +50,8 @@ struct extent_buffer *debug_corrupt_block(struct btrfs_root *root, u64 bytenr,
length = blocksize;
while (1) {
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
- eb->start, &length, &multi, mirror_num);
+ eb->start, &length, &multi,
+ mirror_num, NULL);
BUG_ON(ret);
device = multi->stripes[0].dev;
eb->fd = device->fd;
@@ -63,7 +64,7 @@ struct extent_buffer *debug_corrupt_block(struct btrfs_root *root, u64 bytenr,
kfree(multi);
if (!copy || mirror_num == copy) {
- ret = read_extent_from_disk(eb);
+ ret = read_extent_from_disk(eb, 0, eb->len);
printf("corrupting %llu copy %d\n", eb->start,
mirror_num);
memset(eb->data, 0, eb->len);
diff --git a/btrfs-map-logical.c b/btrfs-map-logical.c
index fa4fb3f5..b9635f77 100644
--- a/btrfs-map-logical.c
+++ b/btrfs-map-logical.c
@@ -55,7 +55,8 @@ struct extent_buffer *debug_read_block(struct btrfs_root *root, u64 bytenr,
length = blocksize;
while (1) {
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
- eb->start, &length, &multi, mirror_num);
+ eb->start, &length, &multi,
+ mirror_num, NULL);
BUG_ON(ret);
device = multi->stripes[0].dev;
eb->fd = device->fd;
@@ -68,7 +69,7 @@ struct extent_buffer *debug_read_block(struct btrfs_root *root, u64 bytenr,
kfree(multi);
if (!copy || mirror_num == copy)
- ret = read_extent_from_disk(eb);
+ ret = read_extent_from_disk(eb, 0, eb->len);
num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
eb->start, eb->len);
diff --git a/cmds-balance.c b/cmds-balance.c
index 03057213..dde7c683 100644
--- a/cmds-balance.c
+++ b/cmds-balance.c
@@ -48,6 +48,10 @@ static int parse_one_profile(const char *profile, u64 *flags)
*flags |= BTRFS_BLOCK_GROUP_RAID1;
} else if (!strcmp(profile, "raid10")) {
*flags |= BTRFS_BLOCK_GROUP_RAID10;
+ } else if (!strcmp(profile, "raid5")) {
+ *flags |= BTRFS_BLOCK_GROUP_RAID5;
+ } else if (!strcmp(profile, "raid6")) {
+ *flags |= BTRFS_BLOCK_GROUP_RAID6;
} else if (!strcmp(profile, "dup")) {
*flags |= BTRFS_BLOCK_GROUP_DUP;
} else if (!strcmp(profile, "single")) {
diff --git a/cmds-filesystem.c b/cmds-filesystem.c
index 507239ad..5332f801 100644
--- a/cmds-filesystem.c
+++ b/cmds-filesystem.c
@@ -148,6 +148,12 @@ static int cmd_df(int argc, char **argv)
} else if (flags & BTRFS_BLOCK_GROUP_RAID10) {
snprintf(description+written, 9, "%s", ", RAID10");
written += 8;
+ } else if (flags & BTRFS_BLOCK_GROUP_RAID5) {
+ snprintf(description+written, 9, "%s", ", RAID5");
+ written += 7;
+ } else if (flags & BTRFS_BLOCK_GROUP_RAID6) {
+ snprintf(description+written, 9, "%s", ", RAID6");
+ written += 7;
}
total_bytes = pretty_sizes(sargs->spaces[i].total_bytes);
diff --git a/convert.c b/convert.c
index 1de2a441..2b3f42f2 100644
--- a/convert.c
+++ b/convert.c
@@ -2430,7 +2430,7 @@ static int may_rollback(struct btrfs_root *root)
while (1) {
ret = btrfs_map_block(&info->mapping_tree, WRITE, bytenr,
- &length, &multi, 0);
+ &length, &multi, 0, NULL);
if (ret)
goto fail;
diff --git a/ctree.h b/ctree.h
index d0f6062f..9bdcf6b6 100644
--- a/ctree.h
+++ b/ctree.h
@@ -445,6 +445,7 @@ struct btrfs_super_block {
* code was pretty buggy. Lets not let them try anymore.
*/
#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
+#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
@@ -455,8 +456,9 @@ struct btrfs_super_block {
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
- BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
- BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
+ BTRFS_FEATURE_INCOMPAT_RAID56 | \
+ BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
/*
* A leaf is full of items. offset and size tell us where to find
@@ -796,6 +798,8 @@ struct btrfs_csum_item {
#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
#define BTRFS_BLOCK_GROUP_RESERVED BTRFS_AVAIL_ALLOC_BIT_SINGLE
/* used in struct btrfs_balance_args fields */
diff --git a/disk-io.c b/disk-io.c
index 0bf73f05..a7d5c583 100644
--- a/disk-io.c
+++ b/disk-io.c
@@ -89,8 +89,8 @@ int csum_tree_block_size(struct extent_buffer *buf, u16 csum_size,
if (verify) {
if (memcmp_extent_buffer(buf, result, 0, csum_size)) {
- printk("checksum verify failed on %llu wanted %X "
- "found %X\n", (unsigned long long)buf->start,
+ printk("checksum verify failed on %llu found %X "
+ "wanted %X\n", (unsigned long long)buf->start,
*((int *)result), *((char *)buf->data));
free(result);
return 1;
@@ -141,7 +141,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
length = blocksize;
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
- bytenr, &length, &multi, 0);
+ bytenr, &length, &multi, 0, NULL);
BUG_ON(ret);
device = multi->stripes[0].dev;
device->total_ios++;
@@ -182,15 +182,52 @@ out:
}
+static int read_whole_eb(struct btrfs_fs_info *info, struct extent_buffer *eb, int mirror)
+{
+ unsigned long offset = 0;
+ struct btrfs_multi_bio *multi = NULL;
+ struct btrfs_device *device;
+ int ret = 0;
+ u64 read_len;
+ unsigned long bytes_left = eb->len;
+
+ while (bytes_left) {
+ read_len = bytes_left;
+ ret = btrfs_map_block(&info->mapping_tree, READ,
+ eb->start + offset, &read_len, &multi,
+ mirror, NULL);
+ if (ret) {
+ printk("Couldn't map the block %Lu\n", eb->start + offset);
+ return -EIO;
+ }
+ device = multi->stripes[0].dev;
+
+ if (device->fd == 0)
+ return -EIO;
+
+ eb->fd = device->fd;
+ device->total_ios++;
+ eb->dev_bytenr = multi->stripes[0].physical;
+ kfree(multi);
+
+ if (read_len > bytes_left)
+ read_len = bytes_left;
+
+ ret = read_extent_from_disk(eb, offset, read_len);
+ if (ret)
+ return -EIO;
+ offset += read_len;
+ bytes_left -= read_len;
+ }
+ return 0;
+}
+
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
u32 blocksize, u64 parent_transid)
{
int ret;
struct extent_buffer *eb;
- u64 length;
u64 best_transid = 0;
- struct btrfs_multi_bio *multi = NULL;
- struct btrfs_device *device;
int mirror_num = 0;
int good_mirror = 0;
int num_copies;
@@ -203,21 +240,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
if (btrfs_buffer_uptodate(eb, parent_transid))
return eb;
- length = blocksize;
while (1) {
- ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
- eb->start, &length, &multi, mirror_num);
- if (ret) {
- printk("Couldn't map the block %Lu\n", bytenr);
- break;
- }
- device = multi->stripes[0].dev;
- eb->fd = device->fd;
- device->total_ios++;
- eb->dev_bytenr = multi->stripes[0].physical;
- kfree(multi);
- ret = read_extent_from_disk(eb);
-
+ ret = read_whole_eb(root->fs_info, eb, mirror_num);
if (ret == 0 && check_tree_block(root, eb) == 0 &&
csum_tree_block(root, eb, 1) == 0 &&
verify_parent_transid(eb->tree, eb, parent_transid, ignore)
@@ -253,12 +277,156 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
return NULL;
}
+static int rmw_eb(struct btrfs_fs_info *info,
+ struct extent_buffer *eb, struct extent_buffer *orig_eb)
+{
+ int ret;
+ unsigned long orig_off = 0;
+ unsigned long dest_off = 0;
+ unsigned long copy_len = eb->len;
+
+ ret = read_whole_eb(info, eb, 0);
+ if (ret)
+ return ret;
+
+ if (eb->start + eb->len <= orig_eb->start ||
+ eb->start >= orig_eb->start + orig_eb->len)
+ return 0;
+ /*
+ * | ----- orig_eb ------- |
+ * | ----- stripe ------- |
+ * | ----- orig_eb ------- |
+ * | ----- orig_eb ------- |
+ */
+ if (eb->start > orig_eb->start)
+ orig_off = eb->start - orig_eb->start;
+ if (orig_eb->start > eb->start)
+ dest_off = orig_eb->start - eb->start;
+
+ if (copy_len > orig_eb->len - orig_off)
+ copy_len = orig_eb->len - orig_off;
+ if (copy_len > eb->len - dest_off)
+ copy_len = eb->len - dest_off;
+
+ memcpy(eb->data + dest_off, orig_eb->data + orig_off, copy_len);
+ return 0;
+}
+
+static void split_eb_for_raid56(struct btrfs_fs_info *info,
+ struct extent_buffer *orig_eb,
+ struct extent_buffer **ebs,
+ u64 stripe_len, u64 *raid_map,
+ int num_stripes)
+{
+ struct extent_buffer *eb;
+ u64 start = orig_eb->start;
+ u64 this_eb_start;
+ int i;
+ int ret;
+
+ for (i = 0; i < num_stripes; i++) {
+ if (raid_map[i] >= BTRFS_RAID5_P_STRIPE)
+ break;
+
+ eb = malloc(sizeof(struct extent_buffer) + stripe_len);
+ if (!eb)
+ BUG();
+ memset(eb, 0, sizeof(struct extent_buffer) + stripe_len);
+
+ eb->start = raid_map[i];
+ eb->len = stripe_len;
+ eb->refs = 1;
+ eb->flags = 0;
+ eb->fd = -1;
+ eb->dev_bytenr = (u64)-1;
+
+ this_eb_start = raid_map[i];
+
+ if (start > this_eb_start ||
+ start + orig_eb->len < this_eb_start + stripe_len) {
+ ret = rmw_eb(info, eb, orig_eb);
+ BUG_ON(ret);
+ } else {
+ memcpy(eb->data, orig_eb->data + eb->start - start, stripe_len);
+ }
+ ebs[i] = eb;
+ }
+}
+
+static int write_raid56_with_parity(struct btrfs_fs_info *info,
+ struct extent_buffer *eb,
+ struct btrfs_multi_bio *multi,
+ u64 stripe_len, u64 *raid_map)
+{
+ struct extent_buffer *ebs[multi->num_stripes], *p_eb = NULL, *q_eb = NULL;
+ int i;
+ int j;
+ int ret;
+ int alloc_size = eb->len;
+
+ if (stripe_len > alloc_size)
+ alloc_size = stripe_len;
+
+ split_eb_for_raid56(info, eb, ebs, stripe_len, raid_map,
+ multi->num_stripes);
+
+ for (i = 0; i < multi->num_stripes; i++) {
+ struct extent_buffer *new_eb;
+ if (raid_map[i] < BTRFS_RAID5_P_STRIPE) {
+ ebs[i]->dev_bytenr = multi->stripes[i].physical;
+ ebs[i]->fd = multi->stripes[i].dev->fd;
+ multi->stripes[i].dev->total_ios++;
+ BUG_ON(ebs[i]->start != raid_map[i]);
+ continue;
+ }
+ new_eb = kmalloc(sizeof(*eb) + alloc_size, GFP_NOFS);
+ BUG_ON(!new_eb);
+ new_eb->dev_bytenr = multi->stripes[i].physical;
+ new_eb->fd = multi->stripes[i].dev->fd;
+ multi->stripes[i].dev->total_ios++;
+ new_eb->len = stripe_len;
+
+ if (raid_map[i] == BTRFS_RAID5_P_STRIPE)
+ p_eb = new_eb;
+ else if (raid_map[i] == BTRFS_RAID6_Q_STRIPE)
+ q_eb = new_eb;
+ }
+ if (q_eb) {
+ void *pointers[multi->num_stripes];
+ ebs[multi->num_stripes - 2] = p_eb;
+ ebs[multi->num_stripes - 1] = q_eb;
+
+ for (i = 0; i < multi->num_stripes; i++)
+ pointers[i] = ebs[i]->data;
+
+ raid6_gen_syndrome(multi->num_stripes, stripe_len, pointers);
+ } else {
+ ebs[multi->num_stripes - 1] = p_eb;
+ memcpy(p_eb->data, ebs[0]->data, stripe_len);
+ for (j = 1; j < multi->num_stripes - 1; j++) {
+ for (i = 0; i < stripe_len; i += sizeof(unsigned long)) {
+ *(unsigned long *)(p_eb->data + i) ^=
+ *(unsigned long *)(ebs[j]->data + i);
+ }
+ }
+ }
+
+ for (i = 0; i < multi->num_stripes; i++) {
+ ret = write_extent_to_disk(ebs[i]);
+ BUG_ON(ret);
+ if (ebs[i] != eb)
+ kfree(ebs[i]);
+ }
+ return 0;
+}
+
int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *eb)
{
int ret;
int dev_nr;
u64 length;
+ u64 *raid_map = NULL;
struct btrfs_multi_bio *multi = NULL;
if (check_tree_block(root, eb))
@@ -272,9 +440,13 @@ int write_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
dev_nr = 0;
length = eb->len;
ret = btrfs_map_block(&root->fs_info->mapping_tree, WRITE,
- eb->start, &length, &multi, 0);
+ eb->start, &length, &multi, 0, &raid_map);
- while(dev_nr < multi->num_stripes) {
+ if (raid_map) {
+ ret = write_raid56_with_parity(root->fs_info, eb, multi,
+ length, raid_map);
+ BUG_ON(ret);
+ } else while (dev_nr < multi->num_stripes) {
BUG_ON(ret);
eb->fd = multi->stripes[dev_nr].dev->fd;
eb->dev_bytenr = multi->stripes[dev_nr].physical;
@@ -641,6 +813,9 @@ static struct btrfs_fs_info *__open_ctree_fd(int fp, const char *path,
if (sb_bytenr == 0)
sb_bytenr = BTRFS_SUPER_INFO_OFFSET;
+ /* try to drop all the caches */
+ posix_fadvise(fp, 0, 0, POSIX_FADV_DONTNEED);
+
ret = btrfs_scan_one_device(fp, path, &fs_devices,
&total_devs, sb_bytenr);
@@ -1091,6 +1266,10 @@ static int close_all_devices(struct btrfs_fs_info *fs_info)
list = &fs_info->fs_devices->devices;
list_for_each(next, list) {
device = list_entry(next, struct btrfs_device, dev_list);
+ if (device->fd) {
+ fsync(device->fd);
+ posix_fadvise(device->fd, 0, 0, POSIX_FADV_DONTNEED);
+ }
close(device->fd);
}
return 0;
diff --git a/disk-io.h b/disk-io.h
index 53e9b17a..53ef0238 100644
--- a/disk-io.h
+++ b/disk-io.h
@@ -82,3 +82,6 @@ int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
int verify);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
#endif
+
+/* raid6.c */
+void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs);
diff --git a/extent-tree.c b/extent-tree.c
index 10cc995b..85f5670d 100644
--- a/extent-tree.c
+++ b/extent-tree.c
@@ -1762,6 +1762,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
+ BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_DUP);
if (extra_flags) {
if (flags & BTRFS_BLOCK_GROUP_DATA)
@@ -2408,6 +2410,8 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
WARN_ON(num_bytes < root->sectorsize);
btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
+ search_start = stripe_align(root, search_start);
+
if (hint_byte) {
block_group = btrfs_lookup_first_block_group(info, hint_byte);
if (!block_group)
@@ -2423,6 +2427,7 @@ static int noinline find_free_extent(struct btrfs_trans_handle *trans,
total_needed += empty_size;
check_failed:
+ search_start = stripe_align(root, search_start);
if (!block_group) {
block_group = btrfs_lookup_first_block_group(info,
search_start);
@@ -2435,7 +2440,6 @@ check_failed:
if (ret)
goto error;
- search_start = stripe_align(root, search_start);
ins->objectid = search_start;
ins->offset = num_bytes;
diff --git a/extent_io.c b/extent_io.c
index ebb35b28..45fa6bf6 100644
--- a/extent_io.c
+++ b/extent_io.c
@@ -663,13 +663,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
return eb;
}
-int read_extent_from_disk(struct extent_buffer *eb)
+int read_extent_from_disk(struct extent_buffer *eb,
+ unsigned long offset, unsigned long len)
{
int ret;
- ret = pread(eb->fd, eb->data, eb->len, eb->dev_bytenr);
+ ret = pread(eb->fd, eb->data + offset, len, eb->dev_bytenr);
if (ret < 0)
goto out;
- if (ret != eb->len) {
+ if (ret != len) {
ret = -EIO;
goto out;
}
diff --git a/extent_io.h b/extent_io.h
index a5d6bf0e..5df44f7d 100644
--- a/extent_io.h
+++ b/extent_io.h
@@ -95,7 +95,8 @@ struct extent_buffer *find_first_extent_buffer(struct extent_io_tree *tree,
struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
u64 bytenr, u32 blocksize);
void free_extent_buffer(struct extent_buffer *eb);
-int read_extent_from_disk(struct extent_buffer *eb);
+int read_extent_from_disk(struct extent_buffer *eb,
+ unsigned long offset, unsigned long len);
int write_extent_to_disk(struct extent_buffer *eb);
int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len);
diff --git a/find-root.c b/find-root.c
index 51e9a233..3d713235 100644
--- a/find-root.c
+++ b/find-root.c
@@ -377,7 +377,8 @@ static int find_root(struct btrfs_root *root)
offset = metadata_offset;
}
err = __btrfs_map_block(&root->fs_info->mapping_tree, READ,
- offset, &map_length, &type, &multi, 0);
+ offset, &map_length, &type,
+ &multi, 0, NULL);
if (err) {
offset += map_length;
continue;
diff --git a/kerncompat.h b/kerncompat.h
index a38a9b06..0ab2baf9 100644
--- a/kerncompat.h
+++ b/kerncompat.h
@@ -36,7 +36,7 @@
#define gfp_t int
#define get_cpu_var(p) (p)
#define __get_cpu_var(p) (p)
-#define BITS_PER_LONG (sizeof(long) * 8)
+#define BITS_PER_LONG (__SIZEOF_LONG__ * 8)
#define __GFP_BITS_SHIFT 20
#define __GFP_BITS_MASK ((int)((1 << __GFP_BITS_SHIFT) - 1))
#define GFP_KERNEL 0
@@ -126,6 +126,10 @@ static inline int mutex_is_locked(struct mutex *m)
#define BITOP_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+#ifndef __attribute_const__
+#define __attribute_const__ __attribute__((__const__))
+#endif
+
/**
* __set_bit - Set a bit in memory
* @nr: the bit to set
diff --git a/mkfs.c b/mkfs.c
index 0e042c7a..9dd10d68 100644
--- a/mkfs.c
+++ b/mkfs.c
@@ -208,7 +208,8 @@ static int create_raid_groups(struct btrfs_trans_handle *trans,
int metadata_profile_opt, int mixed, int ssd)
{
u64 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
- u64 allowed;
+ u64 allowed = 0;
+ u64 devices_for_raid = num_devices;
int ret;
/*
@@ -228,13 +229,22 @@ static int create_raid_groups(struct btrfs_trans_handle *trans,
BTRFS_BLOCK_GROUP_RAID0 : 0; /* raid0 or single */
}
- if (num_devices == 1)
- allowed = BTRFS_BLOCK_GROUP_DUP;
- else if (num_devices >= 4) {
- allowed = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID10;
- } else
- allowed = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1;
+ if (devices_for_raid > 4)
+ devices_for_raid = 4;
+
+ switch (devices_for_raid) {
+ default:
+ case 4:
+ allowed |= BTRFS_BLOCK_GROUP_RAID10;
+ case 3:
+ allowed |= BTRFS_BLOCK_GROUP_RAID6;
+ case 2:
+ allowed |= BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5;
+ break;
+ case 1:
+ allowed |= BTRFS_BLOCK_GROUP_DUP;
+ }
if (metadata_profile & ~allowed) {
fprintf(stderr, "unable to create FS with metadata "
@@ -341,6 +351,10 @@ static u64 parse_profile(char *s)
return BTRFS_BLOCK_GROUP_RAID0;
} else if (strcmp(s, "raid1") == 0) {
return BTRFS_BLOCK_GROUP_RAID1;
+ } else if (strcmp(s, "raid5") == 0) {
+ return BTRFS_BLOCK_GROUP_RAID5;
+ } else if (strcmp(s, "raid6") == 0) {
+ return BTRFS_BLOCK_GROUP_RAID6;
} else if (strcmp(s, "raid10") == 0) {
return BTRFS_BLOCK_GROUP_RAID10;
} else if (strcmp(s, "dup") == 0) {
@@ -1494,6 +1508,16 @@ raid_groups:
btrfs_set_super_incompat_flags(super, flags);
+ if ((data_profile | metadata_profile) &
+ (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ struct btrfs_super_block *super = &root->fs_info->super_copy;
+ u64 flags = btrfs_super_incompat_flags(super);
+
+ flags |= BTRFS_FEATURE_INCOMPAT_RAID56;
+ btrfs_set_super_incompat_flags(super, flags);
+ printf("Setting RAID5/6 feature flag\n");
+ }
+
printf("fs created label %s on %s\n\tnodesize %u leafsize %u "
"sectorsize %u size %s\n",
label, first_file, nodesize, leafsize, sectorsize,
diff --git a/raid6.c b/raid6.c
new file mode 100644
index 00000000..ce0f6557
--- /dev/null
+++ b/raid6.c
@@ -0,0 +1,99 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ * Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ * Boston MA 02111-1307, USA; either version 2 of the License, or
+ * (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * raid6int1.c
+ *
+ * 1-way unrolled portable integer math RAID-6 instruction set
+ *
+ * This file was postprocessed using unroll.pl and then ported to userspace
+ */
+#include <stdint.h>
+#include <unistd.h>
+#include "kerncompat.h"
+
+/*
+ * This is the C data type to use
+ */
+
+/* Change this from BITS_PER_LONG if there is something better... */
+#if BITS_PER_LONG == 64
+# define NBYTES(x) ((x) * 0x0101010101010101UL)
+# define NSIZE 8
+# define NSHIFT 3
+typedef uint64_t unative_t;
+#else
+# define NBYTES(x) ((x) * 0x01010101U)
+# define NSIZE 4
+# define NSHIFT 2
+typedef uint32_t unative_t;
+#endif
+
+/*
+ * These sub-operations are separate inlines since they can sometimes be
+ * specially optimized using architecture-specific hacks.
+ */
+
+/*
+ * The SHLBYTE() operation shifts each byte left by 1, *not*
+ * rolling over into the next byte
+ */
+static inline __attribute_const__ unative_t SHLBYTE(unative_t v)
+{
+ unative_t vv;
+
+ vv = (v << 1) & NBYTES(0xfe);
+ return vv;
+}
+
+/*
+ * The MASK() operation returns 0xFF in any byte for which the high
+ * bit is 1, 0x00 for any byte for which the high bit is 0.
+ */
+static inline __attribute_const__ unative_t MASK(unative_t v)
+{
+ unative_t vv;
+
+ vv = v & NBYTES(0x80);
+ vv = (vv << 1) - (vv >> 7); /* Overflow on the top bit is OK */
+ return vv;
+}
+
+
+void raid6_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+ uint8_t **dptr = (uint8_t **)ptrs;
+ uint8_t *p, *q;
+ int d, z, z0;
+
+ unative_t wd0, wq0, wp0, w10, w20;
+
+ z0 = disks - 3; /* Highest data disk */
+ p = dptr[z0+1]; /* XOR parity */
+ q = dptr[z0+2]; /* RS syndrome */
+
+ for ( d = 0 ; d < bytes ; d += NSIZE*1 ) {
+ wq0 = wp0 = *(unative_t *)&dptr[z0][d+0*NSIZE];
+ for ( z = z0-1 ; z >= 0 ; z-- ) {
+ wd0 = *(unative_t *)&dptr[z][d+0*NSIZE];
+ wp0 ^= wd0;
+ w20 = MASK(wq0);
+ w10 = SHLBYTE(wq0);
+ w20 &= NBYTES(0x1d);
+ w10 ^= w20;
+ wq0 = w10 ^ wd0;
+ }
+ *(unative_t *)&p[d+NSIZE*0] = wp0;
+ *(unative_t *)&q[d+NSIZE*0] = wq0;
+ }
+}
+
diff --git a/restore.c b/restore.c
index 80afb843..cd85092d 100644
--- a/restore.c
+++ b/restore.c
@@ -228,7 +228,7 @@ static int copy_one_extent(struct btrfs_root *root, int fd,
again:
length = size_left;
ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
- bytenr, &length, &multi, 0);
+ bytenr, &length, &multi, 0, NULL);
if (ret) {
free(inbuf);
free(outbuf);
diff --git a/volumes.c b/volumes.c
index 9c527197..de9f2642 100644
--- a/volumes.c
+++ b/volumes.c
@@ -35,6 +35,23 @@ struct stripe {
u64 physical;
};
+static inline int nr_parity_stripes(struct map_lookup *map)
+{
+ if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ return 1;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ return 2;
+ else
+ return 0;
+}
+
+static inline int nr_data_stripes(struct map_lookup *map)
+{
+ return map->num_stripes - nr_parity_stripes(map);
+}
+
+#define is_parity_stripe(x) ( ((x) == BTRFS_RAID5_P_STRIPE) || ((x) == BTRFS_RAID6_Q_STRIPE) )
+
#define map_lookup_size(n) (sizeof(struct map_lookup) + \
(sizeof(struct btrfs_bio_stripe) * (n)))
@@ -176,6 +193,8 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int flags)
goto fail;
}
+ posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
+
if (device->devid == fs_devices->latest_devid)
fs_devices->latest_bdev = fd;
if (device->devid == fs_devices->lowest_devid)
@@ -618,11 +637,21 @@ static u64 chunk_bytes_by_type(u64 type, u64 calc_size, int num_stripes,
return calc_size;
else if (type & BTRFS_BLOCK_GROUP_RAID10)
return calc_size * (num_stripes / sub_stripes);
+ else if (type & BTRFS_BLOCK_GROUP_RAID5)
+ return calc_size * (num_stripes - 1);
+ else if (type & BTRFS_BLOCK_GROUP_RAID6)
+ return calc_size * (num_stripes - 2);
else
return calc_size * num_stripes;
}
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+ /* TODO, add a way to store the preferred stripe size */
+ return 64 * 1024;
+}
+
int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 *start,
u64 *num_bytes, u64 type)
@@ -659,6 +688,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
}
if (type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_DUP)) {
if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
@@ -698,6 +728,22 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
sub_stripes = 2;
min_stripes = 4;
}
+ if (type & (BTRFS_BLOCK_GROUP_RAID5)) {
+ num_stripes = btrfs_super_num_devices(&info->super_copy);
+ if (num_stripes < 2)
+ return -ENOSPC;
+ min_stripes = 2;
+ stripe_len = find_raid56_stripe_len(num_stripes - 1,
+ btrfs_super_stripesize(&info->super_copy));
+ }
+ if (type & (BTRFS_BLOCK_GROUP_RAID6)) {
+ num_stripes = btrfs_super_num_devices(&info->super_copy);
+ if (num_stripes < 3)
+ return -ENOSPC;
+ min_stripes = 3;
+ stripe_len = find_raid56_stripe_len(num_stripes - 2,
+ btrfs_super_stripesize(&info->super_copy));
+ }
/* we don't want a chunk larger than 10% of the FS */
percent_max = div_factor(btrfs_super_total_bytes(&info->super_copy), 1);
@@ -974,6 +1020,10 @@ int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
ret = map->num_stripes;
else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
ret = map->sub_stripes;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+ ret = 2;
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ ret = 3;
else
ret = 1;
return ret;
@@ -1013,6 +1063,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 bytenr;
u64 length;
u64 stripe_nr;
+ u64 rmap_len;
int i, j, nr = 0;
ce = find_first_cache_extent(&map_tree->cache_tree, chunk_start);
@@ -1020,10 +1071,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
map = container_of(ce, struct map_lookup, ce);
length = ce->size;
+ rmap_len = map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID10)
length = ce->size / (map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
length = ce->size / map->num_stripes;
+ else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+ length = ce->size / nr_data_stripes(map);
+ rmap_len = map->stripe_len * nr_data_stripes(map);
+ }
buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
@@ -1042,8 +1099,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
map->sub_stripes;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
stripe_nr = stripe_nr * map->num_stripes + i;
- }
- bytenr = ce->start + stripe_nr * map->stripe_len;
+ } /* else if RAID[56], multiply by nr_data_stripes().
+ * Alternatively, just use rmap_len below instead of
+ * map->stripe_len */
+
+ bytenr = ce->start + stripe_nr * rmap_len;
for (j = 0; j < nr; j++) {
if (buf[j] == bytenr)
break;
@@ -1054,28 +1114,60 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
*logical = buf;
*naddrs = nr;
- *stripe_len = map->stripe_len;
+ *stripe_len = rmap_len;
return 0;
}
+static inline int parity_smaller(u64 a, u64 b)
+{
+ return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_multi_bio *bbio, u64 *raid_map)
+{
+ struct btrfs_bio_stripe s;
+ int i;
+ u64 l;
+ int again = 1;
+
+ while (again) {
+ again = 0;
+ for (i = 0; i < bbio->num_stripes - 1; i++) {
+ if (parity_smaller(raid_map[i], raid_map[i+1])) {
+ s = bbio->stripes[i];
+ l = raid_map[i];
+ bbio->stripes[i] = bbio->stripes[i+1];
+ raid_map[i] = raid_map[i+1];
+ bbio->stripes[i+1] = s;
+ raid_map[i+1] = l;
+ again = 1;
+ }
+ }
+ }
+}
+
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
- struct btrfs_multi_bio **multi_ret, int mirror_num)
+ struct btrfs_multi_bio **multi_ret, int mirror_num,
+ u64 **raid_map_ret)
{
return __btrfs_map_block(map_tree, rw, logical, length, NULL,
- multi_ret, mirror_num);
+ multi_ret, mirror_num, raid_map_ret);
}
int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length, u64 *type,
- struct btrfs_multi_bio **multi_ret, int mirror_num)
+ struct btrfs_multi_bio **multi_ret, int mirror_num,
+ u64 **raid_map_ret)
{
struct cache_extent *ce;
struct map_lookup *map;
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
+ u64 *raid_map = NULL;
int stripes_allocated = 8;
int stripes_required = 1;
int stripe_index;
@@ -1115,10 +1207,24 @@ again:
stripes_required = map->sub_stripes;
}
}
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)
+ && multi_ret && ((rw & WRITE) || mirror_num > 1) && raid_map_ret) {
+ /* RAID[56] write or recovery. Return all stripes */
+ stripes_required = map->num_stripes;
+
+ /* Only allocate the map if we've already got a large enough multi_ret */
+ if (stripes_allocated >= stripes_required) {
+ raid_map = kmalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
+ if (!raid_map) {
+ kfree(multi);
+ return -ENOMEM;
+ }
+ }
+ }
+
/* if our multi bio struct is too small, back off and try again */
- if (multi_ret && rw == WRITE &&
- stripes_allocated < stripes_required) {
- stripes_allocated = map->num_stripes;
+ if (multi_ret && stripes_allocated < stripes_required) {
+ stripes_allocated = stripes_required;
kfree(multi);
goto again;
}
@@ -1136,6 +1242,7 @@ again:
stripe_offset = offset - stripe_offset;
if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 |
+ BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_DUP)) {
/* we limit the length of each bio to what fits in a stripe */
@@ -1174,6 +1281,59 @@ again:
multi->num_stripes = map->num_stripes;
else if (mirror_num)
stripe_index = mirror_num - 1;
+ } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+ BTRFS_BLOCK_GROUP_RAID6)) {
+
+ if (raid_map) {
+ int i, rot;
+ u64 tmp;
+ u64 raid56_full_stripe_start;
+ u64 full_stripe_len = nr_data_stripes(map) * map->stripe_len;
+
+ /*
+ * align the start of our data stripe in the logical
+ * address space
+ */
+ raid56_full_stripe_start = offset / full_stripe_len;
+ raid56_full_stripe_start *= full_stripe_len;
+
+ /* get the data stripe number */
+ stripe_nr = raid56_full_stripe_start / map->stripe_len;
+ stripe_nr = stripe_nr / nr_data_stripes(map);
+
+ /* Work out the disk rotation on this stripe-set */
+ rot = stripe_nr % map->num_stripes;
+
+ /* Fill in the logical address of each stripe */
+ tmp = stripe_nr * nr_data_stripes(map);
+
+ for (i = 0; i < nr_data_stripes(map); i++)
+ raid_map[(i+rot) % map->num_stripes] =
+ ce->start + (tmp + i) * map->stripe_len;
+
+ raid_map[(i+rot) % map->num_stripes] = BTRFS_RAID5_P_STRIPE;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ raid_map[(i+rot+1) % map->num_stripes] = BTRFS_RAID6_Q_STRIPE;
+
+ *length = map->stripe_len;
+ stripe_index = 0;
+ stripe_offset = 0;
+ multi->num_stripes = map->num_stripes;
+ } else {
+ stripe_index = stripe_nr % nr_data_stripes(map);
+ stripe_nr = stripe_nr / nr_data_stripes(map);
+
+ /*
+ * Mirror #0 or #1 means the original data block.
+ * Mirror #2 is RAID5 parity block.
+ * Mirror #3 is RAID6 Q block.
+ */
+ if (mirror_num > 1)
+ stripe_index = nr_data_stripes(map) + mirror_num - 2;
+
+ /* We distribute the parity blocks across stripes */
+ stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
+ }
} else {
/*
* after this do_div call, stripe_nr is the number of stripes
@@ -1193,8 +1353,14 @@ again:
stripe_index++;
}
*multi_ret = multi;
+
if (type)
*type = map->type;
+
+ if (raid_map) {
+ sort_parity_stripes(multi, raid_map);
+ *raid_map_ret = raid_map;
+ }
out:
return 0;
}
diff --git a/volumes.h b/volumes.h
index 9c084061..911f7881 100644
--- a/volumes.h
+++ b/volumes.h
@@ -135,6 +135,10 @@ struct map_lookup {
#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
+#define BTRFS_RAID5_P_STRIPE ((u64)-2)
+#define BTRFS_RAID6_Q_STRIPE ((u64)-1)
+
+
int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
struct btrfs_device *device,
u64 chunk_tree, u64 chunk_objectid,
@@ -142,10 +146,12 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
u64 num_bytes, u64 *start);
int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length, u64 *type,
- struct btrfs_multi_bio **multi_ret, int mirror_num);
+ struct btrfs_multi_bio **multi_ret, int mirror_num,
+ u64 **raid_map);
int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
u64 logical, u64 *length,
- struct btrfs_multi_bio **multi_ret, int mirror_num);
+ struct btrfs_multi_bio **multi_ret, int mirror_num,
+ u64 **raid_map_ret);
int btrfs_next_metadata(struct btrfs_mapping_tree *map_tree, u64 *logical,
u64 *size);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,