From e9e43ec36756c50a5dabf6db52d9bebbccaaa72f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 13 Aug 2009 11:12:54 +1000 Subject: Grow: support restart of new migrations. --- Grow.c | 279 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 251 insertions(+), 28 deletions(-) (limited to 'Grow.c') diff --git a/Grow.c b/Grow.c index 6bc00b88..803f5eb8 100644 --- a/Grow.c +++ b/Grow.c @@ -36,6 +36,10 @@ #include "md_u.h" #include "md_p.h" +#ifndef offsetof +#define offsetof(t,f) ((size_t)&(((t*)0)->f)) +#endif + int Grow_Add_device(char *devname, int fd, char *newdev) { /* Add a device to an active array. @@ -424,6 +428,7 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long blocks, int dests, int *destfd, unsigned long long *destoffsets); static int child_same_size(int afd, struct mdinfo *sra, unsigned long blocks, int *fds, unsigned long long *offsets, + unsigned long long start, int disks, int chunk, int level, int layout, int data, int dests, int *destfd, unsigned long long *destoffsets); @@ -1115,6 +1120,7 @@ int Grow_reshape(char *devname, int fd, int quiet, char *backup_file, else done = child_same_size(fd, sra, stripes, fdlist, offsets, + 0, odisks, ochunk, array.level, olayout, odata, d - odisks, fdlist+odisks, offsets+odisks); if (backup_file && done) @@ -1466,10 +1472,11 @@ static int child_shrink(int afd, struct mdinfo *sra, unsigned long stripes, static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes, int *fds, unsigned long long *offsets, + unsigned long long start, int disks, int chunk, int level, int layout, int data, int dests, int *destfd, unsigned long long *destoffsets) { - unsigned long long start, size; + unsigned long long size; unsigned long tailstripes = stripes; int part; char *buf; @@ -1484,19 +1491,19 @@ static int child_same_size(int afd, struct mdinfo *sra, unsigned long stripes, sysfs_get_ll(sra, NULL, "sync_speed_min", &speed); sysfs_set_num(sra, NULL, "sync_speed_min", 200000); - grow_backup(sra, 0, stripes, + grow_backup(sra, start, stripes, fds, offsets, disks, chunk, level, layout, dests, destfd, destoffsets, 0, buf); - grow_backup(sra, stripes * chunk/512, stripes, + grow_backup(sra, (start + stripes) * chunk/512, stripes, fds, offsets, disks, chunk, level, layout, dests, destfd, destoffsets, 1, buf); validate(afd, destfd[0], destoffsets[0]); part = 0; - start = stripes * 2; /* where to read next */ + start += stripes * 2; /* where to read next */ size = sra->component_size / (chunk/512); while (start < size) { if (wait_backup(sra, (start-stripes*2)*chunk/512, @@ -1545,19 +1552,26 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt unsigned long long nstripe, ostripe, last_block; int ndata, odata; - if (info->delta_disks < 0) - return 1; /* cannot handle a shrink */ - if (info->new_level != info->array.level || - info->new_layout != info->array.layout || - info->new_chunk != info->array.chunk_size) - return 1; /* Can only handle change in disks */ + if (info->new_level != info->array.level) + return 1; /* Cannot handle level changes (they are instantaneous) */ + + odata = info->array.raid_disks - info->delta_disks - 1; + if (info->array.level == 6) odata--; /* number of data disks */ + ndata = info->array.raid_disks - 1; + if (info->new_level == 6) ndata--; old_disks = info->array.raid_disks - info->delta_disks; + if (info->delta_disks <= 0) + /* Didn't grow, so the backup file must have + * been used + */ + old_disks = cnt; for (i=old_disks-(backup_file?1:0); iuuid, 16) != 0) continue; /* Wrong uuid */ @@ -1598,18 +1618,46 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt info->array.utime < __le64_to_cpu(bsb.mtime)) continue; /* time stamp is too bad */ - if (__le64_to_cpu(bsb.arraystart) != 0) - continue; /* Can only handle backup from start of array */ - if (__le64_to_cpu(bsb.length) < - info->reshape_progress) - continue; /* No new data here */ - + if (bsb.magic[15] == '1') { + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) < + info->reshape_progress) + continue; /* No new data here */ + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress) + continue; /* No new data here */ + } + } else { + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if (__le64_to_cpu(bsb.arraystart) + __le64_to_cpu(bsb.length) < + info->reshape_progress && + __le64_to_cpu(bsb.arraystart2) + __le64_to_cpu(bsb.length2) < + info->reshape_progress) + continue; /* No new data here */ + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress && + __le64_to_cpu(bsb.arraystart2) >= + info->reshape_progress) + continue; /* No new data here */ + } + } if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) continue; /* Cannot seek */ /* There should be a duplicate backup superblock 4k before here */ if (lseek64(fd, -4096, 1) < 0 || - read(fd, buf, 4096) != 4096 || - memcmp(buf, &bsb, sizeof(bsb)) != 0) + read(fd, buf, 4096) != 4096) + continue; /* Cannot find leading superblock */ + if (bsb.magic[15] == '1') + bsbsize = offsetof(struct mdp_backup_super, pad1); + else + bsbsize = offsetof(struct mdp_backup_super, pad); + if (memcmp(buf, &bsb, bsbsize) != 0) continue; /* Cannot find leading superblock */ /* Now need the data offsets for all devices. */ @@ -1632,37 +1680,67 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt info->new_level, info->new_layout, fd, __le64_to_cpu(bsb.devstart)*512, - 0, __le64_to_cpu(bsb.length)*512)) { + __le64_to_cpu(bsb.arraystart), + __le64_to_cpu(bsb.length)*512)) { + /* didn't succeed, so giveup */ + return 1; + } + + if (bsb.magic[15] == '2' && + restore_stripes(fdlist, offsets, + info->array.raid_disks, + info->new_chunk, + info->new_level, + info->new_layout, + fd, __le64_to_cpu(bsb.devstart)*512 + + __le64_to_cpu(bsb.devstart2)*512, + __le64_to_cpu(bsb.arraystart2), + __le64_to_cpu(bsb.length2)*512)) { /* didn't succeed, so giveup */ return 1; } + /* Ok, so the data is restored. Let's update those superblocks. */ + if (info->delta_disks >= 0) { + info->reshape_progress = __le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length); + if (bsb.magic[15] == '2') { + unsigned long long p2 = __le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2); + if (p2 > info->reshape_progress) + info->reshape_progress = p2; + } + } else { + info->reshape_progress = __le64_to_cpu(bsb.arraystart); + if (bsb.magic[15] == '2') { + unsigned long long p2 = __le64_to_cpu(bsb.arraystart2); + if (p2 < info->reshape_progress) + info->reshape_progress = p2; + } + } for (j=0; jarray.raid_disks; j++) { if (fdlist[j] < 0) continue; if (st->ss->load_super(st, fdlist[j], NULL)) continue; st->ss->getinfo_super(st, &dinfo); - dinfo.reshape_progress = __le64_to_cpu(bsb.length); + dinfo.reshape_progress = info->reshape_progress; st->ss->update_super(st, &dinfo, "_reshape_progress", NULL,0, 0, NULL); st->ss->store_super(st, fdlist[j]); st->ss->free_super(st); } - - /* And we are done! */ return 0; } /* Didn't find any backup data, try to see if any * was needed. */ + if (info->delta_disks == 0) + /* Alway need backup data when size doesn't change */ + return 1; nstripe = ostripe = 0; - odata = info->array.raid_disks - info->delta_disks - 1; - if (info->array.level == 6) odata--; /* number of data disks */ - ndata = info->array.raid_disks - 1; - if (info->new_level == 6) ndata--; last_block = 0; while (nstripe >= ostripe) { nstripe += info->new_chunk / 512; @@ -1676,3 +1754,148 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt /* needed to recover critical section! */ return 1; } + +int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, + char *backup_file) +{ + /* Array is assembled and ready to be started, but + * monitoring is probably required. + * So: + * - start read-only + * - set upper bound for resync + * - initialise the 'suspend' boundaries + * - switch to read-write + * - fork and continue monitoring + */ + int err; + int backup_list[1]; + unsigned long long backup_offsets[1]; + int odisks, ndisks, ochunk, nchunk,odata,ndata; + unsigned long a,b,blocks,stripes; + int backup_fd; + int *fds; + unsigned long long *offsets; + int d; + struct mdinfo *sra, *sd; + int rv; + int done = 0; + + err = sysfs_set_str(info, NULL, "array_state", "readonly"); + if (err) + return err; + + /* make sure reshape doesn't progress until we are ready */ + sysfs_set_str(info, NULL, "sync_max", "0"); + sysfs_set_str(info, NULL, "array_state", "active"); /* FIXME or clean */ + + /* ndisks is not growing, so raid_disks is old and +delta is new */ + odisks = info->array.raid_disks; + ndisks = odisks + info->delta_disks; + odata = odisks - 1; + ndata = ndisks - 1; + if (info->array.level == 6) { + odata--; + ndata--; + } + ochunk = info->array.chunk_size; + nchunk = info->new_chunk; + + + a = ochunk/512 * odata; + b = nchunk/512 * ndata; + /* Find GCD */ + while (a != b) { + if (a < b) + b -= a; + if (b < a) + a -= b; + } + /* LCM == product / GCD */ + blocks = ochunk/512 * nchunk/512 * odata * ndata / a; + + if (ndata == odata) + blocks *= 16; + stripes = blocks / (info->array.chunk_size/512) / odata; + + + memset(&bsb, 0, 512); + memcpy(bsb.magic, "md_backup_data-1", 16); + memcpy(&bsb.set_uuid, info->uuid, 16); + bsb.mtime = __cpu_to_le64(time(0)); + bsb.devstart2 = blocks; + + backup_fd = open(backup_file, O_RDWR|O_CREAT, S_IRUSR | S_IWUSR); + backup_list[0] = backup_fd; + backup_offsets[0] = 8 * 512; + fds = malloc(odisks * sizeof(fds[0])); + offsets = malloc(odisks * sizeof(offsets[0])); + for (d=0; dsys_name), + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE| + GET_CACHE); + + for (sd = sra->devs; sd; sd = sd->next) { + if (sd->disk.state & (1<disk.state & (1<disk.major, + sd->disk.minor, 1); + fds[sd->disk.raid_disk] + = dev_open(dn, O_RDONLY); + offsets[sd->disk.raid_disk] = sd->data_offset*512; + if (fds[sd->disk.raid_disk] < 0) { + fprintf(stderr, Name ": %s: cannot open component %s\n", + info->sys_name, dn?dn:"-unknown-"); + rv = 1; + goto release; + } + free(dn); + } + } + + switch(fork()) { + case 0: + close(mdfd); + mlockall(MCL_FUTURE); + if (info->delta_disks < 0) + done = child_shrink(-1, info, stripes, + fds, offsets, + info->array.raid_disks, + info->array.chunk_size, + info->array.level, info->array.layout, + odata, + 1, backup_list, backup_offsets); + else if (info->delta_disks == 0) { + /* The 'start' is a per-device stripe number. + * reshape_progress is a per-array sector number. + * So divide by ndata * chunk_size + */ + unsigned long long start = info->reshape_progress / ndata; + start /= (info->array.chunk_size/512); + done = child_same_size(-1, info, stripes, + fds, offsets, + start, + info->array.raid_disks, + info->array.chunk_size, + info->array.level, info->array.layout, + odata, + 1, backup_list, backup_offsets); + } + if (backup_file && done) + unlink(backup_file); + /* FIXME should I intuit a level change */ + exit(0); + case -1: + fprintf(stderr, Name ": Cannot run child to continue monitoring reshape: %s\n", + strerror(errno)); + return 1; + default: + break; + } +release: + return 0; +} + + -- cgit v1.2.3