summaryrefslogtreecommitdiff
path: root/Manage.c
diff options
context:
space:
mode:
Diffstat (limited to 'Manage.c')
-rw-r--r--Manage.c235
1 files changed, 165 insertions, 70 deletions
diff --git a/Manage.c b/Manage.c
index 206f34ef..7e1b94be 100644
--- a/Manage.c
+++ b/Manage.c
@@ -211,15 +211,11 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
if (md_get_version(fd) < 9000) {
if (ioctl(fd, STOP_MD, 0) == 0)
return 0;
- pr_err("stopping device %s "
- "failed: %s\n",
+ pr_err("stopping device %s failed: %s\n",
devname, strerror(errno));
return 1;
}
- /* If this is an mdmon managed array, just write 'inactive'
- * to the array state and let mdmon clear up.
- */
strcpy(devnm, fd2devnm(fd));
/* Get EXCL access first. If this fails, then attempting
* to stop is probably a bad idea.
@@ -236,13 +232,17 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
container[0] = 0;
close(fd);
count = 5;
- while (((fd = ((devnm[0] == '/')
+ while (((fd = ((devname[0] == '/')
?open(devname, O_RDONLY|O_EXCL)
:open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0
|| strcmp(fd2devnm(fd), devnm) != 0)
&& container[0]
&& mdmon_running(container)
&& count) {
+ /* Can't open, so something might be wrong. However it
+ * is a container, so we might be racing with mdmon, so
+ * retry for a bit.
+ */
if (fd >= 0)
close(fd);
flush_mdmon(container);
@@ -252,13 +252,13 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
if (fd >= 0)
close(fd);
if (verbose >= 0)
- pr_err("Cannot get exclusive access to %s:"
- "Perhaps a running "
- "process, mounted filesystem "
- "or active volume group?\n",
+ pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n",
devname);
return 1;
}
+ /* If this is an mdmon managed array, just write 'inactive'
+ * to the array state and let mdmon clear up.
+ */
if (mdi &&
mdi->array.level > 0 &&
is_subarray(mdi->text_version)) {
@@ -266,7 +266,7 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
/* This is mdmon managed. */
close(fd);
- /* As we have an O_EXCL open, any use of the device
+ /* As we had an O_EXCL open, any use of the device
* which blocks STOP_ARRAY is probably a transient use,
* so it is reasonable to retry for a while - 5 seconds.
*/
@@ -293,8 +293,7 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
fd = open_dev_excl(devnm);
if (fd < 0) {
if (verbose >= 0)
- pr_err("failed to completely stop %s"
- ": Device is busy\n",
+ pr_err("failed to completely stop %s: Device is busy\n",
devname);
rv = 1;
goto out;
@@ -320,9 +319,8 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
metadata_container_matches(m->metadata_version+9,
devnm)) {
if (verbose >= 0)
- pr_err("Cannot stop container %s: "
- "member %s still active\n",
- devname, m->dev);
+ pr_err("Cannot stop container %s: member %s still active\n",
+ devname, m->devnm);
free_mdstat(mds);
rv = 1;
goto out;
@@ -346,9 +344,7 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 &&
strcmp(buf, "reshape\n") == 0 &&
- sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2 &&
- sysfs_set_str(mdi, NULL, "sync_action", "frozen") == 0) {
- /* Array is frozen */
+ sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) {
unsigned long long position, curr;
unsigned long long chunk1, chunk2;
unsigned long long rddiv, chunkdiv;
@@ -359,12 +355,28 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
int delay;
int scfd;
+ delay = 40;
+ while (rd1 > rd2 && delay > 0 &&
+ sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) {
+ /* must be in the critical section - wait a bit */
+ delay -= 1;
+ usleep(100000);
+ }
+
+ if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0)
+ goto done;
+ /* Array is frozen */
+
rd1 -= mdi->array.level == 6 ? 2 : 1;
rd2 -= mdi->array.level == 6 ? 2 : 1;
sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
if (strncmp(buf, "back", 4) == 0)
backwards = 1;
- sysfs_get_ll(mdi, NULL, "reshape_position", &position);
+ if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) {
+ /* reshape must have finished now */
+ sysfs_set_str(mdi, NULL, "sync_action", "idle");
+ goto done;
+ }
sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
chunk1 /= 512;
chunk2 /= 512;
@@ -381,9 +393,20 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
size &= ~(chunk1-1);
size &= ~(chunk2-1);
/* rd1 must be smaller */
+ /* Reshape may have progressed further backwards than
+ * recorded, so target even further back (hence "-1")
+ */
position = (position / sectors - 1) * sectors;
+ /* rd1 is always the conversion factor between 'sync'
+ * position and 'reshape' position.
+ * We read 1 "new" stripe worth of data from where-ever,
+ * and when write out that full stripe.
+ */
sync_max = size - position/rd1;
} else {
+ /* Reshape will very likely be beyond position, and it may
+ * be too late to stop at '+1', so aim for '+2'
+ */
position = (position / sectors + 2) * sectors;
sync_max = position/rd1;
}
@@ -406,6 +429,7 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
delay = 3000;
scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed");
while (scfd >= 0 && delay > 0 && old_sync_max > 0) {
+ unsigned long long max_completed;
sysfs_get_ll(mdi, NULL, "reshape_position", &curr);
sysfs_fd_get_str(scfd, buf, sizeof(buf));
if (strncmp(buf, "none", 4) == 0) {
@@ -419,7 +443,10 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
break;
}
- if (sysfs_fd_get_ll(scfd, &completed) == 0 &&
+ if (sysfs_fd_get_two(scfd, &completed,
+ &max_completed) == 2 &&
+ /* 'completed' sometimes reads as max-uulong */
+ completed < max_completed &&
(completed > sync_max ||
(completed == sync_max && curr != position))) {
while (completed > sync_max) {
@@ -443,6 +470,7 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
close(scfd);
}
+done:
/* As we have an O_EXCL open, any use of the device
* which blocks STOP_ARRAY is probably a transient use,
@@ -460,9 +488,7 @@ int Manage_stop(char *devname, int fd, int verbose, int will_retry)
pr_err("failed to stop array %s: %s\n",
devname, strerror(errno));
if (errno == EBUSY)
- cont_err("Perhaps a running "
- "process, mounted filesystem "
- "or active volume group?\n");
+ cont_err("Perhaps a running process, mounted filesystem or active volume group?\n");
}
rv = 1;
goto out;
@@ -643,6 +669,15 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
disc.number = mdi.disk.number;
disc.raid_disk = mdi.disk.raid_disk;
disc.state = mdi.disk.state;
+ if (array->state & (1 << MD_SB_CLUSTERED)) {
+ /* extra flags are needed when adding to a cluster as
+ * there are two cases to distinguish
+ */
+ if (dv->disposition == 'c')
+ disc.state |= (1 << MD_DISK_CANDIDATE);
+ else
+ disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+ }
if (dv->writemostly == 1)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
if (dv->writemostly == 2)
@@ -652,8 +687,7 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
int rv = -1;
tfd = dev_open(dv->devname, O_RDWR);
if (tfd < 0) {
- pr_err("failed to open %s for"
- " superblock update during re-add\n", dv->devname);
+ pr_err("failed to open %s for superblock update during re-add\n", dv->devname);
return -1;
}
@@ -673,8 +707,7 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
rv = dev_st->ss->store_super(dev_st, tfd);
close(tfd);
if (rv != 0) {
- pr_err("failed to update"
- " superblock during re-add\n");
+ pr_err("failed to update superblock during re-add\n");
return -1;
}
}
@@ -700,7 +733,8 @@ skip_re_add:
int Manage_add(int fd, int tfd, struct mddev_dev *dv,
struct supertype *tst, mdu_array_info_t *array,
int force, int verbose, char *devname,
- char *update, unsigned long rdev, unsigned long long array_size)
+ char *update, unsigned long rdev, unsigned long long array_size,
+ int raid_slot)
{
unsigned long long ldsize;
struct supertype *dev_st = NULL;
@@ -717,17 +751,13 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) {
/* More than 4TB is wasted on v0.90 */
if (!force) {
- pr_err("%s is larger than %s can "
- "effectively use.\n"
- " Add --force is you "
- "really want to add this device.\n",
+ pr_err("%s is larger than %s can effectively use.\n"
+ " Add --force is you really want to add this device.\n",
dv->devname, devname);
return -1;
}
- pr_err("%s is larger than %s can "
- "effectively use.\n"
- " Adding anyway as --force "
- "was given.\n",
+ pr_err("%s is larger than %s can effectively use.\n"
+ " Adding anyway as --force was given.\n",
dv->devname, devname);
}
if (!tst->ss->external &&
@@ -795,7 +825,9 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
}
/* Make sure device is large enough */
- if (tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
+ if (dv->disposition != 'j' && /* skip size check for Journal */
+ tst->sb &&
+ tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
array_size) {
if (dv->disposition == 'M')
return 0;
@@ -841,7 +873,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
int d;
int found = 0;
- for (d = 0; d < MAX_DISKS && found < array->active_disks; d++) {
+ for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) {
disc.number = d;
if (ioctl(fd, GET_DISK_INFO, &disc))
continue;
@@ -893,10 +925,36 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
}
disc.major = major(rdev);
disc.minor = minor(rdev);
- disc.number =j;
+ if (raid_slot < 0)
+ disc.number = j;
+ else
+ disc.number = raid_slot;
disc.state = 0;
+
+ /* only add journal to array that supports journaling */
+ if (dv->disposition == 'j') {
+ struct mdinfo mdi;
+ struct mdinfo *mdp;
+
+ mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
+
+ if (strncmp(mdp->sysfs_array_state, "readonly", 8) != 0) {
+ pr_err("%s is not readonly, cannot add journal.\n", devname);
+ return -1;
+ }
+
+ tst->ss->getinfo_super(tst, &mdi, NULL);
+ if (mdi.journal_device_required == 0) {
+ pr_err("%s does not support journal device.\n", devname);
+ return -1;
+ }
+ disc.raid_disk = 0;
+ }
+
if (array->not_persistent==0) {
int dfd;
+ if (dv->disposition == 'j')
+ disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
if (dv->writemostly == 1)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
@@ -934,6 +992,14 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
}
free(used);
}
+
+ if (array->state & (1 << MD_SB_CLUSTERED)) {
+ if (dv->disposition == 'c')
+ disc.state |= (1 << MD_DISK_CANDIDATE);
+ else
+ disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+ }
+
if (dv->writemostly == 1)
disc.state |= (1 << MD_DISK_WRITEMOSTLY);
if (tst->ss->external) {
@@ -949,8 +1015,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
container_fd = open_dev_excl(devnm);
if (container_fd < 0) {
- pr_err("add failed for %s:"
- " could not get exclusive access to container\n",
+ pr_err("add failed for %s: could not get exclusive access to container\n",
dv->devname);
tst->ss->free_super(tst);
return -1;
@@ -989,8 +1054,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
* would block add_disk */
tst->ss->free_super(tst);
if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
- pr_err("add new device to external metadata"
- " failed for %s\n", dv->devname);
+ pr_err("add new device to external metadata failed for %s\n", dv->devname);
close(container_fd);
sysfs_free(sra);
return -1;
@@ -1001,10 +1065,20 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
} else {
tst->ss->free_super(tst);
if (ioctl(fd, ADD_NEW_DISK, &disc)) {
- pr_err("add new device failed for %s as %d: %s\n",
- dv->devname, j, strerror(errno));
+ if (dv->disposition == 'j')
+ pr_err("Failed to hot add %s as journal, "
+ "please try restart %s.\n", dv->devname, devname);
+ else
+ pr_err("add new device failed for %s as %d: %s\n",
+ dv->devname, j, strerror(errno));
return -1;
}
+ if (dv->disposition == 'j') {
+ pr_err("Journal added successfully, making %s read-write\n", devname);
+ if (Manage_ro(devname, fd, -1))
+ pr_err("Failed to make %s read-write\n", devname);
+ }
+
}
if (verbose >= 0)
pr_err("added %s\n", dv->devname);
@@ -1032,8 +1106,7 @@ int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
strcpy(devnm, fd2devnm(fd));
lfd = open_dev_excl(devnm);
if (lfd < 0) {
- pr_err("Cannot get exclusive access "
- " to container - odd\n");
+ pr_err("Cannot get exclusive access to container - odd\n");
return -1;
}
/* We may not be able to check on holders in
@@ -1093,8 +1166,7 @@ int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
}
}
if (err) {
- pr_err("hot remove failed "
- "for %s: %s\n", dv->devname,
+ pr_err("hot remove failed for %s: %s\n", dv->devname,
strerror(errno));
if (lfd >= 0)
close(lfd);
@@ -1239,6 +1311,7 @@ int Manage_subdevs(char *devname, int fd,
* try HOT_ADD_DISK
* If that fails EINVAL, try ADD_NEW_DISK
* 'S' - add the device as a spare - don't try re-add
+ * 'j' - add the device as a journal device
* 'A' - re-add the device
* 'r' - remove the device: HOT_REMOVE_DISK
* device can be 'faulty' or 'detached' in which case all
@@ -1257,6 +1330,7 @@ int Manage_subdevs(char *devname, int fd,
* variant on 'A'
* 'F' - Another variant of 'A', where the device was faulty
* so must be removed from the array first.
+ * 'c' - confirm the device as found (for clustered environments)
*
* For 'f' and 'r', the device can also be a kernel-internal
* name such as 'sdb'.
@@ -1270,8 +1344,10 @@ int Manage_subdevs(char *devname, int fd,
int sysfd = -1;
int count = 0; /* number of actions taken */
struct mdinfo info;
+ struct mdinfo devinfo;
int frozen = 0;
int busy = 0;
+ int raid_slot = -1;
if (ioctl(fd, GET_ARRAY_INFO, &array)) {
pr_err("Cannot get array info for %s\n",
@@ -1300,12 +1376,22 @@ int Manage_subdevs(char *devname, int fd,
int rv;
int mj,mn;
+ raid_slot = -1;
+ if (dv->disposition == 'c') {
+ rv = parse_cluster_confirm_arg(dv->devname,
+ &dv->devname,
+ &raid_slot);
+ if (rv) {
+ pr_err("Could not get the devname of cluster\n");
+ goto abort;
+ }
+ }
+
if (strcmp(dv->devname, "failed") == 0 ||
strcmp(dv->devname, "faulty") == 0) {
if (dv->disposition != 'A'
&& dv->disposition != 'r') {
- pr_err("%s only meaningful "
- "with -r or --re-add, not -%c\n",
+ pr_err("%s only meaningful with -r or --re-add, not -%c\n",
dv->devname, dv->disposition);
goto abort;
}
@@ -1315,8 +1401,7 @@ int Manage_subdevs(char *devname, int fd,
}
if (strcmp(dv->devname, "detached") == 0) {
if (dv->disposition != 'r' && dv->disposition != 'f') {
- pr_err("%s only meaningful "
- "with -r of -f, not -%c\n",
+ pr_err("%s only meaningful with -r of -f, not -%c\n",
dv->devname, dv->disposition);
goto abort;
}
@@ -1327,9 +1412,13 @@ int Manage_subdevs(char *devname, int fd,
if (strcmp(dv->devname, "missing") == 0) {
struct mddev_dev *add_devlist = NULL;
struct mddev_dev **dp;
+ if (dv->disposition == 'c') {
+ rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
+ break;
+ }
+
if (dv->disposition != 'A') {
- pr_err("'missing' only meaningful "
- "with --re-add\n");
+ pr_err("'missing' only meaningful with --re-add\n");
goto abort;
}
add_devlist = conf_get_devs();
@@ -1381,8 +1470,7 @@ int Manage_subdevs(char *devname, int fd,
int found = 0;
char dname[55];
if (dv->disposition != 'r' && dv->disposition != 'f') {
- pr_err("%s only meaningful "
- "with -r or -f, not -%c\n",
+ pr_err("%s only meaningful with -r or -f, not -%c\n",
dv->devname, dv->disposition);
goto abort;
}
@@ -1402,8 +1490,7 @@ int Manage_subdevs(char *devname, int fd,
if (!found) {
sysfd = sysfs_open(fd2devnm(fd), dname, "state");
if (sysfd < 0) {
- pr_err("%s does not appear "
- "to be a component of %s\n",
+ pr_err("%s does not appear to be a component of %s\n",
dv->devname, devname);
goto abort;
}
@@ -1457,16 +1544,28 @@ int Manage_subdevs(char *devname, int fd,
goto abort;
case 'a':
case 'S': /* --add-spare */
+ case 'j': /* --add-journal */
case 'A':
case 'M': /* --re-add missing */
case 'F': /* --re-add faulty */
+ case 'c': /* --cluster-confirm */
/* add the device */
if (subarray) {
- pr_err("Cannot add disks to a"
- " \'member\' array, perform this"
- " operation on the parent container\n");
+ pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
goto abort;
}
+
+ /* Let's first try to write re-add to sysfs */
+ if (rdev != 0 &&
+ (dv->disposition == 'A' || dv->disposition == 'F')) {
+ sysfs_init_dev(&devinfo, rdev);
+ if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) {
+ pr_err("re-add %s to %s succeed\n",
+ dv->devname, info.sys_name);
+ break;
+ }
+ }
+
if (dv->disposition == 'F')
/* Need to remove first */
ioctl(fd, HOT_REMOVE_DISK, rdev);
@@ -1495,7 +1594,7 @@ int Manage_subdevs(char *devname, int fd,
}
rv = Manage_add(fd, tfd, dv, tst, &array,
force, verbose, devname, update,
- rdev, array_size);
+ rdev, array_size, raid_slot);
close(tfd);
tfd = -1;
if (rv < 0)
@@ -1507,9 +1606,7 @@ int Manage_subdevs(char *devname, int fd,
case 'r':
/* hot remove */
if (subarray) {
- pr_err("Cannot remove disks from a"
- " \'member\' array, perform this"
- " operation on the parent container\n");
+ pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n");
rv = -1;
} else
rv = Manage_remove(tst, fd, dv, sysfd,
@@ -1547,9 +1644,7 @@ int Manage_subdevs(char *devname, int fd,
break;
case 'R': /* Mark as replaceable */
if (subarray) {
- pr_err("Cannot replace disks in a"
- " \'member\' array, perform this"
- " operation on the parent container\n");
+ pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n");
rv = -1;
} else {
if (!frozen) {