summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Michael Tokarev <mjt@tls.msk.ru> 2014-09-20 08:50:44 +0400
committer Michael Tokarev <mjt@tls.msk.ru> 2014-09-20 08:50:44 +0400
commit 20d0ba040d767361c1d0c8a825b2a5e8bfdb39a6 (patch)
tree 687a5eda9314ea76da64e234ba269d84345f7efe
parent fef4f4267e21104887eba03b8ddec5e6a3fc9747 (diff)
parent fed12d436b9803ad97d1f11cc8f312ab08c3a659 (diff)
Merge tag 'mdadm-3.3.2' into debian
Release mdadm-3.3.2 Minor bugfix/stability release.
-rw-r--r-- ANNOUNCE-3.3.1 23
-rw-r--r-- ANNOUNCE-3.3.2 16
-rw-r--r-- Assemble.c 333
-rw-r--r-- Create.c 36
-rw-r--r-- Detail.c 6
-rw-r--r-- Grow.c 173
-rw-r--r-- Incremental.c 146
-rw-r--r-- Makefile 48
-rw-r--r-- Manage.c 42
-rw-r--r-- Monitor.c 24
-rw-r--r-- ReadMe.c 9
-rw-r--r-- bitmap.c 38
-rw-r--r-- config.c 6
-rwxr-xr-x inventory 16
-rw-r--r-- managemon.c 5
-rw-r--r-- md.4 355
-rw-r--r-- mdadm.8.in 107
-rw-r--r-- mdadm.c 75
-rw-r--r-- mdadm.conf.5 17
-rw-r--r-- mdadm.h 63
-rw-r--r-- mdadm.spec 2
-rw-r--r-- mdassemble.8 2
-rw-r--r-- mdmon.8 2
-rw-r--r-- mdmon.c 21
-rw-r--r-- misc/mdcheck 158
-rw-r--r-- monitor.c 10
-rw-r--r-- platform-intel.h 1
-rw-r--r-- policy.c 39
-rw-r--r-- probe_roms.c 2
-rw-r--r-- raid6check.c 352
-rw-r--r-- super-ddf.c 1465
-rw-r--r-- super-intel.c 119
-rw-r--r-- super0.c 7
-rw-r--r-- super1.c 18
-rw-r--r-- systemd/SUSE-mdadm_env.sh 45
-rw-r--r-- systemd/mdadm-grow-continue@.service 17
-rw-r--r-- systemd/mdadm-last-resort@.service 7
-rw-r--r-- systemd/mdadm-last-resort@.timer 7
-rw-r--r-- systemd/mdadm.shutdown 4
-rw-r--r-- systemd/mdmon@.service 18
-rw-r--r-- systemd/mdmonitor.service 16
-rw-r--r-- tests/03r5assem-failed 12
-rw-r--r-- tests/10ddf-assemble-missing 61
-rw-r--r-- tests/10ddf-create 10
-rw-r--r-- tests/10ddf-fail-readd 55
-rw-r--r-- tests/10ddf-fail-readd-readonly 71
-rw-r--r-- tests/10ddf-fail-stop-readd 66
-rw-r--r-- tests/10ddf-fail-two-spares 5
-rw-r--r-- tests/10ddf-geometry 24
-rw-r--r-- tests/10ddf-incremental-wrong-order 131
-rw-r--r-- tests/10ddf-sudden-degraded 18
-rw-r--r-- tests/env-ddf-template 18
-rw-r--r-- udev-md-raid-arrays.rules 8
-rw-r--r-- udev-md-raid-assembly.rules 24
-rw-r--r-- util.c 41
55 files changed, 3219 insertions, 1175 deletions
diff --git a/ANNOUNCE-3.3.1 b/ANNOUNCE-3.3.1
new file mode 100644
index 00000000..7d5e666e
--- /dev/null
+++ b/ANNOUNCE-3.3.1
@@ -0,0 +1,23 @@
+Subject: ANNOUNCE: mdadm 3.3.1 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.3.1
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm.git
+
+The main changes are:
+ - lots of work on "DDF" support. Hopefully it will be more stable
+ now. Bug reports are always welcome.
+ - improved interactions with 'systemd'. Where possible, background
+ tasks are run from systemd (if it is present) rather than forking and
+ disassociating from the session. This is important because udev
+ doesn't really let you disassociate.
+
+though there are a number of other little bug fixes too.
+
+NeilBrown 5th June 2014
diff --git a/ANNOUNCE-3.3.2 b/ANNOUNCE-3.3.2
new file mode 100644
index 00000000..6b549611
--- /dev/null
+++ b/ANNOUNCE-3.3.2
@@ -0,0 +1,16 @@
+Subject: ANNOUNCE: mdadm 3.3.2 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.3.2
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm.git
+
+Changes since 3.3.1 are mostly little bugfixes and some man-page
+updates.
+
+NeilBrown 21st August 2014
diff --git a/Assemble.c b/Assemble.c
index bc85603d..cdcdb0f8 100644
--- a/Assemble.c
+++ b/Assemble.c
@@ -366,9 +366,6 @@ static int select_devices(struct mddev_dev *devlist,
tmpdev = NULL;
goto loop;
} else {
- int rv = 0;
- struct mddev_ident *match;
-
content = *contentp;
tst->ss->getinfo_super(tst, content, NULL);
@@ -377,25 +374,33 @@ static int select_devices(struct mddev_dev *devlist,
report_mismatch ? devname : NULL))
goto loop;
- match = conf_match(tst, content, devname,
- report_mismatch ? c->verbose : -1,
- &rv);
- if (!match && rv == 2)
- goto loop;
- if (match && match->devname &&
- strcasecmp(match->devname, "<ignore>") == 0) {
- if (report_mismatch)
- pr_err("%s is a member of an explicitly ignored array\n",
- devname);
- goto loop;
- }
- if (match && !ident_matches(match, content, tst,
- c->homehost, c->update,
- report_mismatch ? devname : NULL))
- /* Array exists in mdadm.conf but some
- * details don't match, so reject it
+ if (auto_assem) {
+ /* Never auto-assemble things that conflict
+ * with mdadm.conf in some way
*/
- goto loop;
+ struct mddev_ident *match;
+ int rv = 0;
+
+ match = conf_match(tst, content, devname,
+ report_mismatch ? c->verbose : -1,
+ &rv);
+ if (!match && rv == 2)
+ goto loop;
+ if (match && match->devname &&
+ strcasecmp(match->devname, "<ignore>") == 0) {
+ if (report_mismatch)
+ pr_err("%s is a member of an explicitly ignored array\n",
+ devname);
+ goto loop;
+ }
+ if (match && !ident_matches(match, content, tst,
+ c->homehost, c->update,
+ report_mismatch ? devname : NULL))
+ /* Array exists in mdadm.conf but some
+ * details don't match, so reject it
+ */
+ goto loop;
+ }
/* should be safe to try an exclusive open now, we
* have rejected anything that some other mdadm might
@@ -551,7 +556,7 @@ struct devs {
};
static int load_devices(struct devs *devices, char *devmap,
- struct mddev_ident *ident, struct supertype *st,
+ struct mddev_ident *ident, struct supertype **stp,
struct mddev_dev *devlist, struct context *c,
struct mdinfo *content,
int mdfd, char *mddev,
@@ -567,10 +572,12 @@ static int load_devices(struct devs *devices, char *devmap,
int most_recent = -1;
int bestcnt = 0;
int *best = *bestp;
+ struct supertype *st = *stp;
for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) {
char *devname = tmpdev->devname;
struct stat stb;
+ struct supertype *tst;
int i;
if (tmpdev->used != 1)
@@ -581,7 +588,6 @@ static int load_devices(struct devs *devices, char *devmap,
int dfd;
/* prepare useful information in info structures */
struct stat stb2;
- struct supertype *tst;
int err;
fstat(mdfd, &stb2);
@@ -610,6 +616,9 @@ static int load_devices(struct devs *devices, char *devmap,
close(mdfd);
free(devices);
free(devmap);
+ tst->ss->free_super(tst);
+ free(tst);
+ *stp = st;
return -1;
}
tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks);
@@ -636,6 +645,7 @@ static int load_devices(struct devs *devices, char *devmap,
close(dfd);
free(devices);
free(devmap);
+ *stp = st;
return -1;
}
if (strcmp(c->update, "uuid")==0 &&
@@ -657,15 +667,13 @@ static int load_devices(struct devs *devices, char *devmap,
else
bitmap_done = 1;
}
- tst->ss->free_super(tst);
} else
#endif
{
- struct supertype *tst = dup_super(st);
- int dfd;
- dfd = dev_open(devname,
- tmpdev->disposition == 'I'
- ? O_RDWR : (O_RDWR|O_EXCL));
+ int dfd = dev_open(devname,
+ tmpdev->disposition == 'I'
+ ? O_RDWR : (O_RDWR|O_EXCL));
+ tst = dup_super(st);
if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) {
pr_err("cannot re-read metadata from %s - aborting\n",
@@ -675,10 +683,12 @@ static int load_devices(struct devs *devices, char *devmap,
close(mdfd);
free(devices);
free(devmap);
+ tst->ss->free_super(tst);
+ free(tst);
+ *stp = st;
return -1;
}
tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks);
- tst->ss->free_super(tst);
close(dfd);
}
@@ -698,9 +708,15 @@ static int load_devices(struct devs *devices, char *devmap,
if (devices[devcnt].i.disk.state == 6) {
if (most_recent < 0 ||
devices[devcnt].i.events
- > devices[most_recent].i.events)
+ > devices[most_recent].i.events) {
+ struct supertype *tmp = tst;
+ tst = st;
+ st = tmp;
most_recent = devcnt;
+ }
}
+ tst->ss->free_super(tst);
+ free(tst);
if (content->array.level == LEVEL_MULTIPATH)
/* with multipath, the raid_disk from the superblock is meaningless */
@@ -759,6 +775,7 @@ static int load_devices(struct devs *devices, char *devmap,
close(mdfd);
free(devices);
free(devmap);
+ *stp = st;
return -1;
}
if (best[i] == -1
@@ -772,6 +789,7 @@ static int load_devices(struct devs *devices, char *devmap,
*most_recentp = most_recent;
*bestcntp = bestcnt;
*bestp = best;
+ *stp = st;
return devcnt;
}
@@ -803,7 +821,9 @@ static int force_array(struct mdinfo *content,
int chosen_drive = -1;
int i;
- for (i = 0; i < content->array.raid_disks && i < bestcnt; i++) {
+ for (i = 0;
+ i < content->array.raid_disks * 2 && i < bestcnt;
+ i += 2) {
int j = best[i];
if (j>=0 &&
!devices[j].uptodate &&
@@ -863,7 +883,9 @@ static int force_array(struct mdinfo *content,
/* If there are any other drives of the same vintage,
* add them in as well. We can't lose and we might gain
*/
- for (i = 0; i < content->array.raid_disks && i < bestcnt ; i++) {
+ for (i = 0;
+ i < content->array.raid_disks * 2 && i < bestcnt ;
+ i += 2) {
int j = best[i];
if (j >= 0 &&
!devices[j].uptodate &&
@@ -984,8 +1006,28 @@ static int start_array(int mdfd,
content->array.raid_disks);
fprintf(stderr, "\n");
}
+
+ if (st->ss->validate_container) {
+ struct mdinfo *devices_list;
+ struct mdinfo *info_devices = xmalloc(sizeof(struct mdinfo)*(okcnt+sparecnt));
+ unsigned int count;
+ devices_list = NULL;
+ for (count = 0; count < okcnt+sparecnt; count++) {
+ info_devices[count] = devices[count].i;
+ info_devices[count].next = devices_list;
+ devices_list = &info_devices[count];
+ }
+ if (st->ss->validate_container(devices_list))
+ pr_err("Mismatch detected!\n");
+ free(info_devices);
+ }
+
st->ss->free_super(st);
sysfs_uevent(content, "change");
+ if (err_ok && okcnt < (unsigned)content->array.raid_disks)
+ /* Was partial, is still partial, so signal an error
+ * to ensure we don't retry */
+ return 1;
return 0;
}
@@ -1023,7 +1065,7 @@ static int start_array(int mdfd,
"array_state", "readonly");
if (rv == 0)
rv = Grow_continue(mdfd, st, content,
- c->backup_file,
+ c->backup_file, 0,
c->freeze_reshape);
} else if (c->readonly &&
sysfs_attribute_available(
@@ -1033,6 +1075,7 @@ static int start_array(int mdfd,
} else
#endif
rv = ioctl(mdfd, RUN_ARRAY, NULL);
+ reopen_mddev(mdfd); /* drop O_EXCL */
if (rv == 0) {
if (c->verbose >= 0) {
pr_err("%s has been started with %d drive%s",
@@ -1415,7 +1458,7 @@ try_again:
/* This is a member of a container. Try starting the array. */
int err;
err = assemble_container_content(st, mdfd, content, c,
- chosen_name);
+ chosen_name, NULL);
close(mdfd);
return err;
}
@@ -1423,7 +1466,7 @@ try_again:
/* Ok, no bad inconsistancy, we can try updating etc */
devices = xcalloc(num_devs, sizeof(*devices));
devmap = xcalloc(num_devs, content->array.raid_disks);
- devcnt = load_devices(devices, devmap, ident, st, devlist,
+ devcnt = load_devices(devices, devmap, ident, &st, devlist,
c, content, mdfd, mddev,
&most_recent, &bestcnt, &best, inargv);
if (devcnt < 0)
@@ -1528,7 +1571,7 @@ try_again:
*/
chosen_drive = -1;
st->ss->free_super(st);
- for (i=0; chosen_drive < 0 && i<bestcnt; i++) {
+ for (i=0; chosen_drive < 0 && i<bestcnt; i+=2) {
int j = best[i];
int fd;
@@ -1658,6 +1701,8 @@ try_again:
pr_err(":%s has an active reshape - checking "
"if critical section needs to be restored\n",
chosen_name);
+ if (!c->backup_file)
+ c->backup_file = locate_backup(content->sys_name);
enable_fds(bestcnt/2);
for (i = 0; i < bestcnt/2; i++) {
int j = best[i*2];
@@ -1767,18 +1812,20 @@ try_again:
#ifndef MDASSEMBLE
int assemble_container_content(struct supertype *st, int mdfd,
struct mdinfo *content, struct context *c,
- char *chosen_name)
+ char *chosen_name, int *result)
{
- struct mdinfo *dev, *sra;
+ struct mdinfo *dev, *sra, *dev2;
int working = 0, preexist = 0;
int expansion = 0;
struct map_ent *map = NULL;
int old_raid_disks;
int start_reshape;
+ char *avail = NULL;
+ int err;
sysfs_init(content, mdfd, NULL);
- sra = sysfs_read(mdfd, NULL, GET_VERSION);
+ sra = sysfs_read(mdfd, NULL, GET_VERSION|GET_DEVS);
if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0) {
if (content->array.major_version == -1 &&
content->array.minor_version == -2 &&
@@ -1805,10 +1852,27 @@ int assemble_container_content(struct supertype *st, int mdfd,
if (st->ss->external && content->recovery_blocked && start_reshape)
block_subarray(content);
- if (sra)
- sysfs_free(sra);
+ for (dev2 = sra->devs; dev2; dev2 = dev2->next) {
+ for (dev = content->devs; dev; dev = dev->next)
+ if (dev2->disk.major == dev->disk.major &&
+ dev2->disk.minor == dev->disk.minor)
+ break;
+ if (dev)
+ continue;
+ /* Don't want this one any more */
+ if (sysfs_set_str(sra, dev2, "slot", "none") < 0 &&
+ errno == EBUSY) {
+ pr_err("Cannot remove old device %s: not updating %s\n", dev2->sys_name, sra->sys_name);
+ sysfs_free(sra);
+ return 1;
+ }
+ sysfs_set_str(sra, dev2, "state", "remove");
+ }
old_raid_disks = content->array.raid_disks - content->delta_disks;
- for (dev = content->devs; dev; dev = dev->next)
+ avail = xcalloc(content->array.raid_disks, 1);
+ for (dev = content->devs; dev; dev = dev->next) {
+ if (dev->disk.raid_disk >= 0)
+ avail[dev->disk.raid_disk] = 1;
if (sysfs_add_disk(content, dev, 1) == 0) {
if (dev->disk.raid_disk >= old_raid_disks &&
content->reshape_active)
@@ -1817,100 +1881,127 @@ int assemble_container_content(struct supertype *st, int mdfd,
working++;
} else if (errno == EEXIST)
preexist++;
- if (working + expansion == 0)
+ }
+ sysfs_free(sra);
+ if (working + expansion == 0 && c->runstop <= 0) {
+ free(avail);
return 1;/* Nothing new, don't try to start */
-
+ }
map_update(&map, fd2devnm(mdfd),
content->text_version,
content->uuid, chosen_name);
- if (c->runstop > 0 ||
- (working + preexist + expansion) >=
- content->array.working_disks) {
- int err;
-
- if (start_reshape) {
- int spare = content->array.raid_disks + expansion;
- if (restore_backup(st, content,
- working,
- spare, c->backup_file, c->verbose) == 1)
- return 1;
-
- err = sysfs_set_str(content, NULL,
- "array_state", "readonly");
- if (err)
- return 1;
- if (st->ss->external) {
- if (!mdmon_running(st->container_devnm))
- start_mdmon(st->container_devnm);
- ping_monitor(st->container_devnm);
- if (mdmon_running(st->container_devnm) &&
- st->update_tail == NULL)
- st->update_tail = &st->updates;
- }
-
- err = Grow_continue(mdfd, st, content, c->backup_file,
- c->freeze_reshape);
- } else switch(content->array.level) {
- case LEVEL_LINEAR:
- case LEVEL_MULTIPATH:
- case 0:
- err = sysfs_set_str(content, NULL, "array_state",
- c->readonly ? "readonly" : "active");
- break;
- default:
- err = sysfs_set_str(content, NULL, "array_state",
- "readonly");
- /* start mdmon if needed. */
- if (!err) {
- if (!mdmon_running(st->container_devnm))
- start_mdmon(st->container_devnm);
- ping_monitor(st->container_devnm);
- }
- break;
- }
- if (!err)
- sysfs_set_safemode(content, content->safe_mode_delay);
-
- /* Block subarray here if it is not reshaped now
- * It has be blocked a little later to allow mdmon to switch in
- * in to R/W state
- */
- if (st->ss->external && content->recovery_blocked &&
- !start_reshape)
- block_subarray(content);
-
- if (c->verbose >= 0) {
- if (err)
- pr_err("array %s now has %d device%s",
- chosen_name, working + preexist,
- working + preexist == 1 ? "":"s");
- else
- pr_err("Started %s with %d device%s",
- chosen_name, working + preexist,
- working + preexist == 1 ? "":"s");
+ if (enough(content->array.level, content->array.raid_disks,
+ content->array.layout, content->array.state & 1, avail) == 0) {
+ if (c->export && result)
+ *result |= INCR_NO;
+ else if (c->verbose >= 0) {
+ pr_err("%s assembled with %d device%s",
+ chosen_name, preexist + working,
+ preexist + working == 1 ? "":"s");
if (preexist)
fprintf(stderr, " (%d new)", working);
- if (expansion)
- fprintf(stderr, " ( + %d for expansion)",
- expansion);
- fprintf(stderr, "\n");
+ fprintf(stderr, " but not started\n");
}
- if (!err)
- wait_for(chosen_name, mdfd);
- return err;
- /* FIXME should have an O_EXCL and wait for read-auto */
- } else {
- if (c->verbose >= 0) {
+ free(avail);
+ return 1;
+ }
+ free(avail);
+
+ if (c->runstop <= 0 &&
+ (working + preexist + expansion) <
+ content->array.working_disks) {
+ if (c->export && result)
+ *result |= INCR_UNSAFE;
+ else if (c->verbose >= 0) {
pr_err("%s assembled with %d device%s",
chosen_name, preexist + working,
preexist + working == 1 ? "":"s");
if (preexist)
fprintf(stderr, " (%d new)", working);
- fprintf(stderr, " but not started\n");
+ fprintf(stderr, " but not safe to start\n");
}
return 1;
}
+
+
+ if (start_reshape) {
+ int spare = content->array.raid_disks + expansion;
+ if (restore_backup(st, content,
+ working,
+ spare, &c->backup_file, c->verbose) == 1)
+ return 1;
+
+ err = sysfs_set_str(content, NULL,
+ "array_state", "readonly");
+ if (err)
+ return 1;
+
+ if (st->ss->external) {
+ if (!mdmon_running(st->container_devnm))
+ start_mdmon(st->container_devnm);
+ ping_monitor(st->container_devnm);
+ if (mdmon_running(st->container_devnm) &&
+ st->update_tail == NULL)
+ st->update_tail = &st->updates;
+ }
+
+ err = Grow_continue(mdfd, st, content, c->backup_file,
+ 0, c->freeze_reshape);
+ } else switch(content->array.level) {
+ case LEVEL_LINEAR:
+ case LEVEL_MULTIPATH:
+ case 0:
+ err = sysfs_set_str(content, NULL, "array_state",
+ c->readonly ? "readonly" : "active");
+ break;
+ default:
+ err = sysfs_set_str(content, NULL, "array_state",
+ "readonly");
+ /* start mdmon if needed. */
+ if (!err) {
+ if (!mdmon_running(st->container_devnm))
+ start_mdmon(st->container_devnm);
+ ping_monitor(st->container_devnm);
+ }
+ break;
+ }
+ if (!err)
+ sysfs_set_safemode(content, content->safe_mode_delay);
+
+ /* Block subarray here if it is not reshaped now
+ * It has be blocked a little later to allow mdmon to switch in
+ * in to R/W state
+ */
+ if (st->ss->external && content->recovery_blocked &&
+ !start_reshape)
+ block_subarray(content);
+
+ if (c->export && result) {
+ if (err)
+ *result |= INCR_NO;
+ else
+ *result |= INCR_YES;
+ } else if (c->verbose >= 0) {
+ if (err)
+ pr_err("array %s now has %d device%s",
+ chosen_name, working + preexist,
+ working + preexist == 1 ? "":"s");
+ else
+ pr_err("Started %s with %d device%s",
+ chosen_name, working + preexist,
+ working + preexist == 1 ? "":"s");
+ if (preexist)
+ fprintf(stderr, " (%d new)", working);
+ if (expansion)
+ fprintf(stderr, " ( + %d for expansion)",
+ expansion);
+ fprintf(stderr, "\n");
+ }
+ if (!err)
+ wait_for(chosen_name, mdfd);
+ return err;
+ /* FIXME should have an O_EXCL and wait for read-auto */
}
#endif
diff --git a/Create.c b/Create.c
index 9247d46b..330c5b42 100644
--- a/Create.c
+++ b/Create.c
@@ -285,6 +285,12 @@ int Create(struct supertype *st, char *mddev,
info.array.active_disks = 0;
info.array.working_disks = 0;
dnum = 0;
+ for (dv = devlist; dv ; dv = dv->next)
+ if (data_offset == VARIABLE_OFFSET)
+ dv->data_offset = INVALID_SECTORS;
+ else
+ dv->data_offset = data_offset;
+
for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) {
char *dname = dv->devname;
unsigned long long freesize;
@@ -509,6 +515,7 @@ int Create(struct supertype *st, char *mddev,
if (!s->bitmap_file &&
s->level >= 1 &&
+ st->ss->add_internal_bitmap &&
(s->write_behind || s->size > 100*1024*1024ULL)) {
if (c->verbose > 0)
pr_err("automatically enabling write-intent bitmap on large array\n");
@@ -740,7 +747,9 @@ int Create(struct supertype *st, char *mddev,
map_update(&map, fd2devnm(mdfd), info.text_version,
info.uuid, chosen_name);
- map_unlock(&map);
+ /* Keep map locked until devices have been added to array
+ * to stop another mdadm from finding and using those devices.
+ */
if (s->bitmap_file && vers < 9003) {
major_num = BITMAP_MAJOR_HOSTENDIAN;
@@ -753,18 +762,18 @@ int Create(struct supertype *st, char *mddev,
if (s->bitmap_file && strcmp(s->bitmap_file, "internal")==0) {
if ((vers%100) < 2) {
pr_err("internal bitmaps not supported by this kernel.\n");
- goto abort;
+ goto abort_locked;
}
if (!st->ss->add_internal_bitmap) {
pr_err("internal bitmaps not supported with %s metadata\n",
st->ss->name);
- goto abort;
+ goto abort_locked;
}
if (!st->ss->add_internal_bitmap(st, &s->bitmap_chunk,
c->delay, s->write_behind,
bitmapsize, 1, major_num)) {
pr_err("Given bitmap chunk size not supported.\n");
- goto abort;
+ goto abort_locked;
}
s->bitmap_file = NULL;
}
@@ -790,7 +799,7 @@ int Create(struct supertype *st, char *mddev,
if (container_fd < 0) {
pr_err("Cannot get exclusive "
"open on container - weird.\n");
- goto abort;
+ goto abort_locked;
}
if (mdmon_running(st->container_devnm)) {
if (c->verbose)
@@ -805,7 +814,7 @@ int Create(struct supertype *st, char *mddev,
if (rv) {
pr_err("failed to set array info for %s: %s\n",
mddev, strerror(errno));
- goto abort;
+ goto abort_locked;
}
if (s->bitmap_file) {
@@ -816,18 +825,18 @@ int Create(struct supertype *st, char *mddev,
c->delay, s->write_behind,
bitmapsize,
major_num)) {
- goto abort;
+ goto abort_locked;
}
bitmap_fd = open(s->bitmap_file, O_RDWR);
if (bitmap_fd < 0) {
pr_err("weird: %s cannot be openned\n",
s->bitmap_file);
- goto abort;
+ goto abort_locked;
}
if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
pr_err("Cannot set bitmap file for %s: %s\n",
mddev, strerror(errno));
- goto abort;
+ goto abort_locked;
}
}
@@ -884,7 +893,7 @@ int Create(struct supertype *st, char *mddev,
pr_err("failed to open %s "
"after earlier success - aborting\n",
dv->devname);
- goto abort;
+ goto abort_locked;
}
fstat(fd, &stb);
inf->disk.major = major(stb.st_rdev);
@@ -896,7 +905,7 @@ int Create(struct supertype *st, char *mddev,
fd, dv->devname,
dv->data_offset)) {
ioctl(mdfd, STOP_ARRAY, NULL);
- goto abort;
+ goto abort_locked;
}
st->ss->getinfo_super(st, inf, NULL);
safe_mode_delay = inf->safe_mode_delay;
@@ -922,7 +931,7 @@ int Create(struct supertype *st, char *mddev,
pr_err("ADD_NEW_DISK for %s "
"failed: %s\n",
dv->devname, strerror(errno));
- goto abort;
+ goto abort_locked;
}
break;
}
@@ -939,7 +948,6 @@ int Create(struct supertype *st, char *mddev,
* the subarray cursor such that ->getinfo_super once
* again returns container info.
*/
- map_lock(&map);
st->ss->getinfo_super(st, &info_new, NULL);
if (st->ss->external && s->level != LEVEL_CONTAINER &&
!same_uuid(info_new.uuid, info.uuid, 0)) {
@@ -964,12 +972,12 @@ int Create(struct supertype *st, char *mddev,
info_new.uuid, path);
free(path);
}
- map_unlock(&map);
flush_metadata_updates(st);
st->ss->free_super(st);
}
}
+ map_unlock(&map);
free(infos);
if (s->level == LEVEL_CONTAINER) {
diff --git a/Detail.c b/Detail.c
index 6d13d3a3..c4fcad96 100644
--- a/Detail.c
+++ b/Detail.c
@@ -109,7 +109,7 @@ int Detail(char *dev, struct context *c)
st = super_by_fd(fd, &subarray);
if (ioctl(fd, GET_ARRAY_INFO, &array) == 0) {
inactive = 0;
- } else if (errno == ENODEV) {
+ } else if (errno == ENODEV && sra) {
array = sra->array;
inactive = 1;
} else {
@@ -465,8 +465,8 @@ int Detail(char *dev, struct context *c)
(!e || (e->percent < 0 && e->percent != RESYNC_PENDING &&
e->percent != RESYNC_DELAYED)) ? "" : sync_action[e->resync],
larray_size ? "": ", Not Started",
- e->percent == RESYNC_DELAYED ? " (DELAYED)": "",
- e->percent == RESYNC_PENDING ? " (PENDING)": "");
+ (e && e->percent == RESYNC_DELAYED) ? " (DELAYED)": "",
+ (e && e->percent == RESYNC_PENDING) ? " (PENDING)": "");
} else if (inactive) {
printf(" State : inactive\n");
}
diff --git a/Grow.c b/Grow.c
index f9f97920..a9c8589c 100644
--- a/Grow.c
+++ b/Grow.c
@@ -24,8 +24,10 @@
#include "mdadm.h"
#include "dlink.h"
#include <sys/mman.h>
+#include <stddef.h>
#include <stdint.h>
#include <signal.h>
+#include <sys/wait.h>
#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN)
#error no endian defined
@@ -33,15 +35,11 @@
#include "md_u.h"
#include "md_p.h"
-#ifndef offsetof
-#define offsetof(t,f) ((size_t)&(((t*)0)->f))
-#endif
-
int restore_backup(struct supertype *st,
struct mdinfo *content,
int working_disks,
int next_spare,
- char *backup_file,
+ char **backup_filep,
int verbose)
{
int i;
@@ -49,6 +47,7 @@ int restore_backup(struct supertype *st,
struct mdinfo *dev;
int err;
int disk_count = next_spare + working_disks;
+ char *backup_file = *backup_filep;
dprintf("Called restore_backup()\n");
fdlist = xmalloc(sizeof(int) * disk_count);
@@ -70,6 +69,11 @@ int restore_backup(struct supertype *st,
fdlist[next_spare++] = fd;
}
+ if (!backup_file) {
+ backup_file = locate_backup(content->sys_name);
+ *backup_filep = backup_file;
+ }
+
if (st->ss->external && st->ss->recover_backup)
err = st->ss->recover_backup(st, content);
else
@@ -612,9 +616,14 @@ static void unfreeze(struct supertype *st)
return unfreeze_container(st);
else {
struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION);
+ char buf[20];
- if (sra)
+ if (sra &&
+ sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0
+ && strcmp(buf, "frozen\n") == 0) {
+ printf("unfreeze\n");
sysfs_set_str(sra, NULL, "sync_action", "idle");
+ }
sysfs_free(sra);
}
}
@@ -881,6 +890,7 @@ int reshape_open_backup_file(char *backup_file,
long blocks,
int *fdlist,
unsigned long long *offsets,
+ char *sys_name,
int restart)
{
/* Return 1 on success, 0 on any form of failure */
@@ -928,6 +938,14 @@ int reshape_open_backup_file(char *backup_file,
return 0;
}
+ if (!restart && strncmp(backup_file, MAP_DIR, strlen(MAP_DIR)) != 0) {
+ char *bu = make_backup(sys_name);
+ if (symlink(backup_file, bu))
+ pr_err("Recording backup file in " MAP_DIR "failed: %s\n",
+ strerror(errno));
+ free(bu);
+ }
+
return 1;
}
@@ -1010,7 +1028,12 @@ char *analyse_change(char *devname, struct mdinfo *info, struct reshape *re)
switch (info->array.level) {
default:
- return "Cannot understand this RAID level";
+ return "No reshape is possibly for this RAID level";
+ case LEVEL_LINEAR:
+ if (info->delta_disks != UnSet)
+ return "Only --add is supported for LINEAR, setting --raid-disks is not needed";
+ else
+ return "Only --add is supported for LINEAR, other --grow options are not meaningful";
case 1:
/* RAID1 can convert to RAID1 with different disks, or
* raid5 with 2 disks, or
@@ -1322,7 +1345,6 @@ char *analyse_change(char *devname, struct mdinfo *info, struct reshape *re)
switch (re->level) {
case 4:
- re->before.layout = 0;
re->after.layout = 0;
break;
case 5:
@@ -1495,8 +1517,8 @@ static int reshape_container(char *container, char *devname,
struct supertype *st,
struct mdinfo *info,
int force,
- char *backup_file,
- int verbose, int restart, int freeze_reshape);
+ char *backup_file, int verbose,
+ int forked, int restart, int freeze_reshape);
int Grow_reshape(char *devname, int fd,
struct mddev_dev *devlist,
@@ -1796,7 +1818,7 @@ int Grow_reshape(char *devname, int fd,
if (s->size == MAX_SIZE)
s->size = 0;
array.size = s->size;
- if ((unsigned)array.size != s->size) {
+ if (array.size != (signed)s->size) {
/* got truncated to 32bit, write to
* component_size instead
*/
@@ -2048,7 +2070,7 @@ size_change_error:
* performed at the level of the container
*/
rv = reshape_container(container, devname, -1, st, &info,
- c->force, c->backup_file, c->verbose, 0, 0);
+ c->force, c->backup_file, c->verbose, 0, 0, 0);
frozen = 0;
} else {
/* get spare devices from external metadata
@@ -2666,7 +2688,7 @@ static int impose_level(int fd, int level, char *devname, int verbose)
for (d = 0, found = 0;
d < MAX_DISKS && found < array.nr_disks;
d++) {
- mdu_disk_info_t disk;
+ mdu_disk_info_t disk;
disk.number = d;
if (ioctl(fd, GET_DISK_INFO, &disk) < 0)
continue;
@@ -2734,6 +2756,48 @@ static void catch_term(int sig)
sigterm = 1;
}
+static int continue_via_systemd(char *devnm)
+{
+ int skipped, i, pid, status;
+ char pathbuf[1024];
+ /* In a systemd/udev world, it is best to get systemd to
+ * run "mdadm --grow --continue" rather than running in the
+ * background.
+ */
+ switch(fork()) {
+ case 0:
+ /* FIXME yuk. CLOSE_EXEC?? */
+ skipped = 0;
+ for (i = 3; skipped < 20; i++)
+ if (close(i) < 0)
+ skipped++;
+ else
+ skipped = 0;
+
+ /* Don't want to see error messages from
+ * systemctl. If the service doesn't exist,
+ * we fork ourselves.
+ */
+ close(2);
+ open("/dev/null", O_WRONLY);
+ snprintf(pathbuf, sizeof(pathbuf), "mdadm-grow-continue@%s.service",
+ devnm);
+ status = execl("/usr/bin/systemctl", "systemctl",
+ "start",
+ pathbuf, NULL);
+ status = execl("/bin/systemctl", "systemctl", "start",
+ pathbuf, NULL);
+ exit(1);
+ case -1: /* Just do it ourselves. */
+ break;
+ default: /* parent - good */
+ pid = wait(&status);
+ if (pid >= 0 && status == 0)
+ return 1;
+ }
+ return 0;
+}
+
static int reshape_array(char *container, int fd, char *devname,
struct supertype *st, struct mdinfo *info,
int force, struct mddev_dev *devlist,
@@ -2763,6 +2827,7 @@ static int reshape_array(char *container, int fd, char *devname,
unsigned long long array_size;
int done;
struct mdinfo *sra = NULL;
+ char buf[20];
/* when reshaping a RAID0, the component_size might be zero.
* So try to fix that up.
@@ -2810,7 +2875,9 @@ static int reshape_array(char *container, int fd, char *devname,
goto release;
}
- if (st->ss->external && restart && (info->reshape_progress == 0)) {
+ if (st->ss->external && restart && (info->reshape_progress == 0) &&
+ !((sysfs_get_str(info, NULL, "sync_action", buf, sizeof(buf)) > 0) &&
+ (strncmp(buf, "reshape", 7) == 0))) {
/* When reshape is restarted from '0', very begin of array
* it is possible that for external metadata reshape and array
* configuration doesn't happen.
@@ -2838,6 +2905,10 @@ static int reshape_array(char *container, int fd, char *devname,
devname);
goto release;
}
+
+ if (!backup_file)
+ backup_file = locate_backup(sra->sys_name);
+
goto started;
}
/* The container is frozen but the array may not be.
@@ -3167,6 +3238,7 @@ started:
if (!reshape_open_backup_file(backup_file, fd, devname,
(signed)blocks,
fdlist+d, offsets+d,
+ sra->sys_name,
restart)) {
goto release;
}
@@ -3208,6 +3280,14 @@ started:
return 1;
}
+ if (!forked && !check_env("MDADM_NO_SYSTEMCTL"))
+ if (continue_via_systemd(container ?: sra->sys_name)) {
+ free(fdlist);
+ free(offsets);
+ sysfs_free(sra);
+ return 0;
+ }
+
/* Now we just need to kick off the reshape and watch, while
* handling backups of the data...
* This is all done by a forked background process.
@@ -3287,8 +3367,21 @@ started:
free(fdlist);
free(offsets);
- if (backup_file && done)
+ if (backup_file && done) {
+ char *bul;
+ bul = make_backup(sra->sys_name);
+ if (bul) {
+ char buf[1024];
+ int l = readlink(bul, buf, sizeof(buf));
+ if (l > 0) {
+ buf[l]=0;
+ unlink(buf);
+ }
+ unlink(bul);
+ free(bul);
+ }
unlink(backup_file);
+ }
if (!done) {
abort_reshape(sra);
goto out;
@@ -3362,8 +3455,8 @@ int reshape_container(char *container, char *devname,
struct supertype *st,
struct mdinfo *info,
int force,
- char *backup_file,
- int verbose, int restart, int freeze_reshape)
+ char *backup_file, int verbose,
+ int forked, int restart, int freeze_reshape)
{
struct mdinfo *cc = NULL;
int rv = restart;
@@ -3388,7 +3481,11 @@ int reshape_container(char *container, char *devname,
*/
ping_monitor(container);
- switch (fork()) {
+ if (!forked && !freeze_reshape && !check_env("MDADM_NO_SYSTEMCTL"))
+ if (continue_via_systemd(container))
+ return 0;
+
+ switch (forked ? 0 : fork()) {
case -1: /* error */
perror("Cannot fork to complete reshape\n");
unfreeze(st);
@@ -4316,7 +4413,12 @@ int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
}
/* FIXME maybe call progress_reshape one more time instead */
- abort_reshape(sra); /* remove any remaining suspension */
+ /* remove any remaining suspension */
+ sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
+ sysfs_set_num(sra, NULL, "suspend_hi", 0);
+ sysfs_set_num(sra, NULL, "suspend_lo", 0);
+ sysfs_set_num(sra, NULL, "sync_min", 0);
+
if (reshape->before.data_disks == reshape->after.data_disks)
sysfs_set_num(sra, NULL, "sync_speed_min", speed);
free(buf);
@@ -4824,7 +4926,7 @@ int Grow_continue_command(char *devname, int fd,
/* continue reshape
*/
- ret_val = Grow_continue(fd, st, content, backup_file, 0);
+ ret_val = Grow_continue(fd, st, content, backup_file, 1, 0);
Grow_continue_command_exit:
if (fd2 > -1)
@@ -4840,7 +4942,7 @@ Grow_continue_command_exit:
}
int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
- char *backup_file, int freeze_reshape)
+ char *backup_file, int forked, int freeze_reshape)
{
int ret_val = 2;
@@ -4857,15 +4959,40 @@ int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
close(cfd);
ret_val = reshape_container(st->container_devnm, NULL, mdfd,
st, info, 0, backup_file,
- 0,
+ 0, forked,
1 | info->reshape_active,
freeze_reshape);
} else
ret_val = reshape_array(NULL, mdfd, "array", st, info, 1,
NULL, INVALID_SECTORS,
- backup_file, 0, 0,
+ backup_file, 0, forked,
1 | info->reshape_active,
freeze_reshape);
return ret_val;
}
+
+/* Build the path MAP_DIR "/backup_file-" name in a buffer obtained
+ * from xmalloc() and return it. */
+char *make_backup(char *name)
+{
+	char *prefix = "backup_file-";
+	int need = strlen(MAP_DIR) + 1 + strlen(prefix) + strlen(name) + 1;
+	char *path = xmalloc(need);
+
+	sprintf(path, "%s/%s%s", MAP_DIR, prefix, name);
+	return path;
+}
+
+/* Return the make_backup() path for 'name' if stat() reports a regular file there, else NULL. */
+char *locate_backup(char *name)
+{
+	struct stat sb;
+	char *path = make_backup(name);
+
+	if (stat(path, &sb) != 0 || !S_ISREG(sb.st_mode)) {
+		free(path);
+		return NULL;
+	}
+	return path;
+}
diff --git a/Incremental.c b/Incremental.c
index f256b48d..c9372587 100644
--- a/Incremental.c
+++ b/Incremental.c
@@ -44,9 +44,9 @@ static int try_spare(char *devname, int *dfdp, struct dev_policy *pol,
struct supertype *st, int verbose);
static int Incremental_container(struct supertype *st, char *devname,
- struct context *c);
+ struct context *c, char *only);
-int Incremental(char *devname, struct context *c,
+int Incremental(struct mddev_dev *devlist, struct context *c,
struct supertype *st)
{
/* Add this device to an array, creating the array if necessary
@@ -91,6 +91,7 @@ int Incremental(char *devname, struct context *c,
struct mdinfo *sra = NULL, *d;
struct mddev_ident *match;
char chosen_name[1024];
+ char *md_devname;
int rv = 1;
struct map_ent *mp, *map = NULL;
int dfd = -1, mdfd = -1;
@@ -102,6 +103,7 @@ int Incremental(char *devname, struct context *c,
struct dev_policy *policy = NULL;
struct map_ent target_array;
int have_target;
+ char *devname = devlist->devname;
struct createinfo *ci = conf_get_create_info();
@@ -138,7 +140,9 @@ int Incremental(char *devname, struct context *c,
if (map_lock(&map))
pr_err("failed to get "
"exclusive lock on mapfile\n");
- rv = Incremental_container(st, devname, c);
+ if (c->export)
+ printf("MD_DEVNAME=%s\n", devname);
+ rv = Incremental_container(st, devname, c, NULL);
map_unlock(&map);
return rv;
}
@@ -150,7 +154,20 @@ int Incremental(char *devname, struct context *c,
/* 1/ Check if device is permitted by mdadm.conf */
- if (!conf_test_dev(devname)) {
+ for (;devlist; devlist = devlist->next)
+ if (conf_test_dev(devlist->devname))
+ break;
+ if (!devlist) {
+ devlist = conf_get_devs();
+ for (;devlist; devlist = devlist->next) {
+ struct stat st2;
+ if (stat(devlist->devname, &st2) == 0 &&
+ (st2.st_mode & S_IFMT) == S_IFBLK &&
+ st2.st_rdev == stb.st_rdev)
+ break;
+ }
+ }
+ if (!devlist) {
if (c->verbose >= 0)
pr_err("%s not permitted by mdadm.conf.\n",
devname);
@@ -459,6 +476,15 @@ int Incremental(char *devname, struct context *c,
info.array.working_disks ++;
}
+ if (strncmp(chosen_name, "/dev/md/", 8) == 0)
+ md_devname = chosen_name+8;
+ else
+ md_devname = chosen_name;
+ if (c->export) {
+ printf("MD_DEVICE=%s\n", fd2devnm(mdfd));
+ printf("MD_DEVNAME=%s\n", md_devname);
+ printf("MD_FOREIGN=%s\n", trustworthy == FOREIGN ? "yes" : "no");
+ }
/* 7/ Is there enough devices to possibly start the array? */
/* 7a/ if not, finish with success. */
@@ -466,7 +492,7 @@ int Incremental(char *devname, struct context *c,
char devnm[32];
/* Try to assemble within the container */
sysfs_uevent(sra, "change");
- if (c->verbose >= 0)
+ if (!c->export && c->verbose >= 0)
pr_err("container %s now has %d device%s\n",
chosen_name, info.array.working_disks,
info.array.working_disks == 1?"":"s");
@@ -478,13 +504,8 @@ int Incremental(char *devname, struct context *c,
close(mdfd);
sysfs_free(sra);
if (!rv)
- rv = Incremental_container(st, chosen_name, c);
+ rv = Incremental_container(st, chosen_name, c, NULL);
map_unlock(&map);
- if (rv == 1)
- /* Don't fail the whole -I if a subarray didn't
- * have enough devices to start yet
- */
- rv = 0;
/* after spare is added, ping monitor for external metadata
* so that it can eg. try to rebuild degraded array */
if (st->ss->external)
@@ -503,7 +524,9 @@ int Incremental(char *devname, struct context *c,
if (enough(info.array.level, info.array.raid_disks,
info.array.layout, info.array.state & 1,
avail) == 0) {
- if (c->verbose >= 0)
+ if (c->export) {
+ printf("MD_STARTED=no\n");
+ } else if (c->verbose >= 0)
pr_err("%s attached to %s, not enough to start (%d).\n",
devname, chosen_name, active_disks);
rv = 0;
@@ -517,7 +540,9 @@ int Incremental(char *devname, struct context *c,
/* + start the array (auto-readonly). */
if (ioctl(mdfd, GET_ARRAY_INFO, &ainf) == 0) {
- if (c->verbose >= 0)
+ if (c->export) {
+ printf("MD_STARTED=already\n");
+ } else if (c->verbose >= 0)
pr_err("%s attached to %s which is already active.\n",
devname, chosen_name);
rv = 0;
@@ -563,8 +588,14 @@ int Incremental(char *devname, struct context *c,
else
rv = sysfs_set_str(sra, NULL,
"array_state", "read-auto");
+ /* Array might be O_EXCL which will interfere with
+ * fsck and mount. So re-open without O_EXCL.
+ */
+ reopen_mddev(mdfd);
if (rv == 0) {
- if (c->verbose >= 0)
+ if (c->export) {
+ printf("MD_STARTED=yes\n");
+ } else if (c->verbose >= 0)
pr_err("%s attached to %s, which has been started.\n",
devname, chosen_name);
rv = 0;
@@ -587,7 +618,9 @@ int Incremental(char *devname, struct context *c,
rv = 1;
}
} else {
- if (c->verbose >= 0)
+ if (c->export) {
+ printf("MD_STARTED=unsafe\n");
+ } else if (c->verbose >= 0)
pr_err("%s attached to %s, not enough to start safely.\n",
devname, chosen_name);
rv = 0;
@@ -1278,7 +1311,7 @@ static int try_spare(char *devname, int *dfdp, struct dev_policy *pol,
return rv;
}
-int IncrementalScan(int verbose, char *devnm)
+int IncrementalScan(struct context *c, char *devnm)
{
/* look at every device listed in the 'map' file.
* If one is found that is not running then:
@@ -1290,10 +1323,13 @@ int IncrementalScan(int verbose, char *devnm)
struct map_ent *me;
struct mddev_ident *devs, *mddev;
int rv = 0;
+ char container[32];
+ char *only = NULL;
map_read(&mapl);
devs = conf_get_ident(NULL);
+restart:
for (me = mapl ; me ; me = me->next) {
mdu_array_info_t array;
mdu_bitmap_file_t bmf;
@@ -1302,10 +1338,42 @@ int IncrementalScan(int verbose, char *devnm)
if (devnm && strcmp(devnm, me->devnm) != 0)
continue;
+ if (devnm && me->metadata[0] == '/') {
+ char *sl;
+ /* member array, need to work on container */
+ strncpy(container, me->metadata+1, 32);
+ container[31] = 0;
+ sl = strchr(container, '/');
+ if (sl)
+ *sl = 0;
+ only = devnm;
+ devnm = container;
+ goto restart;
+ }
mdfd = open_dev(me->devnm);
if (mdfd < 0)
continue;
+ if (!isdigit(me->metadata[0])) {
+ /* must be a container */
+ struct supertype *st = super_by_fd(mdfd, NULL);
+ int ret = 0;
+ struct map_ent *map = NULL;
+ if (st)
+ st->ignore_hw_compat = 1;
+ if (st && st->ss->load_container)
+ ret = st->ss->load_container(st, mdfd, NULL);
+ close(mdfd);
+ if (!ret && st->ss->container_content) {
+ if (map_lock(&map))
+ pr_err("failed to get exclusive lock on mapfile\n");
+ ret = Incremental_container(st, me->path, c, only);
+ map_unlock(&map);
+ }
+ if (ret)
+ rv = 1;
+ continue;
+ }
if (ioctl(mdfd, GET_ARRAY_INFO, &array) == 0 ||
errno != ENODEV) {
close(mdfd);
@@ -1330,7 +1398,7 @@ int IncrementalScan(int verbose, char *devnm)
close(bmfd);
}
}
- if (verbose >= 0) {
+ if (c->verbose >= 0) {
if (added == 0)
pr_err("Added bitmap %s to %s\n",
mddev->bitmap_file, me->path);
@@ -1346,7 +1414,7 @@ int IncrementalScan(int verbose, char *devnm)
if (sra) {
if (sysfs_set_str(sra, NULL,
"array_state", "read-auto") == 0) {
- if (verbose >= 0)
+ if (c->verbose >= 0)
pr_err("started array %s\n",
me->path ?: me->devnm);
} else {
@@ -1387,7 +1455,7 @@ static char *container2devname(char *devname)
}
static int Incremental_container(struct supertype *st, char *devname,
- struct context *c)
+ struct context *c, char *only)
{
/* Collect the contents of this container and for each
* array, choose a device name and assemble the array.
@@ -1406,6 +1474,7 @@ static int Incremental_container(struct supertype *st, char *devname,
int sfd;
int ra_blocked = 0;
int ra_all = 0;
+ int result = 0;
st->ss->getinfo_super(st, &info, NULL);
@@ -1413,7 +1482,9 @@ static int Incremental_container(struct supertype *st, char *devname,
info.container_enough > 0)
/* pass */;
else {
- if (c->verbose)
+ if (c->export) {
+ printf("MD_STARTED=no\n");
+ } else if (c->verbose)
pr_err("not enough devices to start the container\n");
return 0;
}
@@ -1434,8 +1505,12 @@ static int Incremental_container(struct supertype *st, char *devname,
list = st->ss->container_content(st, NULL);
/* when nothing to activate - quit */
- if (list == NULL)
+ if (list == NULL) {
+ if (c->export) {
+ printf("MD_STARTED=nothing\n");
+ }
return 0;
+ }
for (ra = list ; ra ; ra = ra->next) {
int mdfd;
char chosen_name[1024];
@@ -1458,7 +1533,7 @@ static int Incremental_container(struct supertype *st, char *devname,
strcpy(chosen_name, mp->path);
else
strcpy(chosen_name, mp->devnm);
- } else {
+ } else if (!only) {
/* Check in mdadm.conf for container == devname and
* member == ra->text_version after second slash.
@@ -1504,7 +1579,7 @@ static int Incremental_container(struct supertype *st, char *devname,
pr_err("array %s/%s is "
"explicitly ignored by mdadm.conf\n",
match->container, match->member);
- return 2;
+ continue;
}
if (match)
trustworthy = LOCAL;
@@ -1515,6 +1590,8 @@ static int Incremental_container(struct supertype *st, char *devname,
trustworthy,
chosen_name);
}
+ if (only && (!mp || strcmp(mp->devnm, only) != 0))
+ continue;
if (mdfd < 0) {
pr_err("failed to open %s: %s.\n",
@@ -1523,9 +1600,30 @@ static int Incremental_container(struct supertype *st, char *devname,
}
assemble_container_content(st, mdfd, ra, c,
- chosen_name);
+ chosen_name, &result);
close(mdfd);
}
+ if (c->export && result) {
+ char sep = '=';
+ printf("MD_STARTED");
+ if (result & INCR_NO) {
+ printf("%cno", sep);
+ sep = ',';
+ }
+ if (result & INCR_UNSAFE) {
+ printf("%cunsafe", sep);
+ sep = ',';
+ }
+ if (result & INCR_ALREADY) {
+ printf("%calready", sep);
+ sep = ',';
+ }
+ if (result & INCR_YES) {
+ printf("%cyes", sep);
+ sep = ',';
+ }
+ printf("\n");
+ }
/* don't move spares to container with volume being activated
when all volumes are blocked */
diff --git a/Makefile b/Makefile
index e8da3a5d..f058a22e 100644
--- a/Makefile
+++ b/Makefile
@@ -57,6 +57,7 @@ ifdef DEFAULT_OLD_METADATA
else
DEFAULT_METADATA=1.2
endif
+CPPFLAGS += -DBINDIR=\"$(BINDIR)\"
PKG_CONFIG ?= pkg-config
@@ -131,7 +132,7 @@ INCL = mdadm.h part.h bitmap.h
MON_OBJS = mdmon.o monitor.o managemon.o util.o maps.o mdstat.o sysfs.o \
policy.o lib.o \
- Kill.o sg_io.o dlink.o ReadMe.o super0.o super1.o super-intel.o \
+ Kill.o sg_io.o dlink.o ReadMe.o super-intel.o \
super-mbr.o super-gpt.o \
super-ddf.o sha1.o crc32.o msg.o bitmap.o xmalloc.o \
platform-intel.o probe_roms.o
@@ -156,7 +157,7 @@ all : check_rundir mdadm mdmon
man : mdadm.man md.man mdadm.conf.man mdmon.man raid6check.man
check_rundir:
- @if [ ! -d "$(dir $(RUN_DIR))" -a "$(CHECK_RUN_DIR)" == 1 ]; then \
+ @if [ ! -d "$(dir $(RUN_DIR))" -a "$(CHECK_RUN_DIR)" = 1 ]; then \
echo "***** Parent of $(RUN_DIR) does not exist. Maybe set different RUN_DIR="; \
echo "***** e.g. make RUN_DIR=/dev/.mdadm" ; \
echo "***** or set CHECK_RUN_DIR=0"; exit 1; \
@@ -185,13 +186,13 @@ mdadm.klibc : $(SRCS) $(INCL)
$(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS)
mdadm.Os : $(SRCS) $(INCL)
- $(CC) -o mdadm.Os $(CFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS)
+ $(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS)
mdadm.O2 : $(SRCS) $(INCL) mdmon.O2
- $(CC) -o mdadm.O2 $(CFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS)
+ $(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS)
mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h
- $(CC) -o mdmon.O2 $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS)
+ $(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS)
# use '-z now' to guarantee no dynamic linker interactions with the monitor thread
mdmon : check_rundir $(MON_OBJS)
@@ -213,7 +214,7 @@ mdassemble.diet : $(ASSEMBLE_SRCS) $(INCL)
mdassemble.static : $(ASSEMBLE_SRCS) $(INCL)
rm -f $(OBJS)
- $(CC) $(LDFLAGS) $(ASSEMBLE_FLAGS) -static -DHAVE_STDINT_H -o mdassemble.static $(ASSEMBLE_SRCS) $(STATICSRC)
+ $(CC) $(LDFLAGS) $(CPPFLAGS) $(ASSEMBLE_FLAGS) -static -DHAVE_STDINT_H -o mdassemble.static $(ASSEMBLE_SRCS) $(STATICSRC)
mdassemble.auto : $(ASSEMBLE_SRCS) $(INCL) $(ASSEMBLE_AUTO_SRCS)
rm -f mdassemble.static
@@ -234,22 +235,22 @@ mdadm.8 : mdadm.8.in
-e 's,{MAP_PATH},$(MAP_PATH),g' mdadm.8.in > mdadm.8
mdadm.man : mdadm.8
- nroff -man mdadm.8 > mdadm.man
+ man -l mdadm.8 > mdadm.man
mdmon.man : mdmon.8
- nroff -man mdmon.8 > mdmon.man
+ man -l mdmon.8 > mdmon.man
md.man : md.4
- nroff -man md.4 > md.man
+ man -l md.4 > md.man
mdadm.conf.man : mdadm.conf.5
- nroff -man mdadm.conf.5 > mdadm.conf.man
+ man -l mdadm.conf.5 > mdadm.conf.man
mdassemble.man : mdassemble.8
- nroff -man mdassemble.8 > mdassemble.man
+ man -l mdassemble.8 > mdassemble.man
raid6check.man : raid6check.8
- nroff -man raid6check.8 > raid6check.man
+ man -l raid6check.8 > raid6check.man
$(OBJS) : $(INCL) mdmon.h
$(MON_OBJS) : $(INCL) mdmon.h
@@ -280,11 +281,28 @@ install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8
$(INSTALL) -D -m 644 mdadm.conf.5 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5
install-udev: udev-md-raid-arrays.rules udev-md-raid-assembly.rules
- $(INSTALL) -D -m 644 udev-md-raid-arrays.rules $(DESTDIR)$(UDEVDIR)/rules.d/63-md-raid-arrays.rules
- $(INSTALL) -D -m 644 udev-md-raid-assembly.rules $(DESTDIR)$(UDEVDIR)/rules.d/64-md-raid-assembly.rules
+ @for file in 63-md-raid-arrays.rules 64-md-raid-assembly.rules ; \
+ do sed -e 's,BINDIR,$(BINDIR),g' udev-$${file#??-} > .install.tmp.1 && \
+ echo $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
+ $(INSTALL) -D -m 644 .install.tmp.1 $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
+ rm -f .install.tmp.1; \
+ done
install-systemd: systemd/mdmon@.service
- $(INSTALL) -D -m 644 systemd/mdmon@.service $(DESTDIR)$(SYSTEMD_DIR)/mdmon@.service
+ @for file in mdmon@.service mdmonitor.service mdadm-last-resort@.timer \
+ mdadm-last-resort@.service mdadm-grow-continue@.service; \
+ do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.2 && \
+ echo $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
+ $(INSTALL) -D -m 644 .install.tmp.2 $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
+ rm -f .install.tmp.2; \
+ done
+ @for file in mdadm.shutdown ; \
+ do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.3 && \
+ echo $(INSTALL) -D -m 755 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
+ $(INSTALL) -D -m 755 .install.tmp.3 $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
+ rm -f .install.tmp.3; \
+ done
+ if [ -f /etc/SuSE-release -o -n "$(SUSE)" ] ;then $(INSTALL) -D -m 755 systemd/SUSE-mdadm_env.sh $(DESTDIR)$(SYSTEMD_DIR)/../scripts/mdadm_env.sh ;fi
uninstall:
rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm
diff --git a/Manage.c b/Manage.c
index afa9b422..206f34ef 100644
--- a/Manage.c
+++ b/Manage.c
@@ -170,7 +170,7 @@ static void remove_devices(char *devnm, char *path)
free(path2);
}
-int Manage_run(char *devname, int fd, int verbose)
+int Manage_run(char *devname, int fd, struct context *c)
{
/* Run the array. Array must already be configured
* Requires >= 0.90.0
@@ -187,7 +187,7 @@ int Manage_run(char *devname, int fd, int verbose)
return 1;
}
strcpy(nm, nmp);
- return IncrementalScan(verbose, nm);
+ return IncrementalScan(c, nm);
}
int Manage_stop(char *devname, int fd, int verbose, int will_retry)
@@ -783,7 +783,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
break;
}
/* FIXME this is a bad test to be using */
- if (!tst->sb && dv->disposition != 'a') {
+ if (!tst->sb && (dv->disposition != 'a'
+ && dv->disposition != 'S')) {
/* we are re-adding a device to a
* completely dead array - have to depend
* on kernel to check
@@ -813,7 +814,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
dev_st = dup_super(tst);
dev_st->ss->load_super(dev_st, tfd, NULL);
}
- if (dev_st && dev_st->sb) {
+ if (dev_st && dev_st->sb && dv->disposition != 'S') {
int rv = attempt_re_add(fd, tfd, dv,
dev_st, tst,
rdev,
@@ -846,13 +847,14 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
continue;
if (disc.major == 0 && disc.minor == 0)
continue;
+ found++;
if (!(disc.state & (1<<MD_DISK_SYNC)))
continue;
avail[disc.raid_disk] = 1;
- found++;
}
array_failed = !enough(array->level, array->raid_disks,
array->layout, 1, avail);
+ free(avail);
} else
array_failed = 0;
if (array_failed) {
@@ -1236,6 +1238,7 @@ int Manage_subdevs(char *devname, int fd,
* 'a' - add the device
* try HOT_ADD_DISK
* If that fails EINVAL, try ADD_NEW_DISK
+ * 'S' - add the device as a spare - don't try re-add
* 'A' - re-add the device
* 'r' - remove the device: HOT_REMOVE_DISK
* device can be 'faulty' or 'detached' in which case all
@@ -1261,7 +1264,6 @@ int Manage_subdevs(char *devname, int fd,
mdu_array_info_t array;
unsigned long long array_size;
struct mddev_dev *dv;
- struct stat stb;
int tfd = -1;
struct supertype *tst;
char *subarray = NULL;
@@ -1293,9 +1295,10 @@ int Manage_subdevs(char *devname, int fd,
goto abort;
}
- stb.st_rdev = 0;
for (dv = devlist; dv; dv = dv->next) {
+ unsigned long rdev = 0; /* device to add/remove etc */
int rv;
+ int mj,mn;
if (strcmp(dv->devname, "failed") == 0 ||
strcmp(dv->devname, "faulty") == 0) {
@@ -1388,10 +1391,9 @@ int Manage_subdevs(char *devname, int fd,
sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev");
if (sysfd >= 0) {
char dn[20];
- int mj,mn;
if (sysfs_fd_get_str(sysfd, dn, 20) > 0 &&
sscanf(dn, "%d:%d", &mj,&mn) == 2) {
- stb.st_rdev = makedev(mj,mn);
+ rdev = makedev(mj,mn);
found = 1;
}
close(sysfd);
@@ -1406,7 +1408,14 @@ int Manage_subdevs(char *devname, int fd,
goto abort;
}
}
+ } else if ((dv->disposition == 'r' || dv->disposition == 'f')
+ && get_maj_min(dv->devname, &mj, &mn)) {
+ /* for 'fail' and 'remove', the device might
+ * not exist.
+ */
+ rdev = makedev(mj, mn);
} else {
+ struct stat stb;
tfd = dev_open(dv->devname, O_RDONLY);
if (tfd >= 0)
fstat(tfd, &stb);
@@ -1439,6 +1448,7 @@ int Manage_subdevs(char *devname, int fd,
goto abort;
}
}
+ rdev = stb.st_rdev;
}
switch(dv->disposition){
default:
@@ -1446,6 +1456,7 @@ int Manage_subdevs(char *devname, int fd,
dv->devname, dv->disposition);
goto abort;
case 'a':
+ case 'S': /* --add-spare */
case 'A':
case 'M': /* --re-add missing */
case 'F': /* --re-add faulty */
@@ -1458,8 +1469,7 @@ int Manage_subdevs(char *devname, int fd,
}
if (dv->disposition == 'F')
/* Need to remove first */
- ioctl(fd, HOT_REMOVE_DISK,
- (unsigned long)stb.st_rdev);
+ ioctl(fd, HOT_REMOVE_DISK, rdev);
/* Make sure it isn't in use (in 2.6 or later) */
tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
if (tfd >= 0) {
@@ -1485,7 +1495,7 @@ int Manage_subdevs(char *devname, int fd,
}
rv = Manage_add(fd, tfd, dv, tst, &array,
force, verbose, devname, update,
- stb.st_rdev, array_size);
+ rdev, array_size);
close(tfd);
tfd = -1;
if (rv < 0)
@@ -1503,7 +1513,7 @@ int Manage_subdevs(char *devname, int fd,
rv = -1;
} else
rv = Manage_remove(tst, fd, dv, sysfd,
- stb.st_rdev, verbose,
+ rdev, verbose,
devname);
if (sysfd >= 0)
close(sysfd);
@@ -1518,7 +1528,7 @@ int Manage_subdevs(char *devname, int fd,
/* FIXME check current member */
if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
(sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
- (unsigned long) stb.st_rdev))) {
+ rdev))) {
if (errno == EBUSY)
busy = 1;
pr_err("set device faulty failed for %s: %s\n",
@@ -1549,7 +1559,7 @@ int Manage_subdevs(char *devname, int fd,
frozen = -1;
}
rv = Manage_replace(tst, fd, dv,
- stb.st_rdev, verbose,
+ rdev, verbose,
devname);
}
if (rv < 0)
@@ -1563,7 +1573,7 @@ int Manage_subdevs(char *devname, int fd,
goto abort;
case 'w': /* --with device which was matched */
rv = Manage_with(tst, fd, dv,
- stb.st_rdev, verbose, devname);
+ rdev, verbose, devname);
if (rv < 0)
goto abort;
break;
diff --git a/Monitor.c b/Monitor.c
index c1a5d60d..5cb24fab 100644
--- a/Monitor.c
+++ b/Monitor.c
@@ -38,6 +38,7 @@ struct state {
char *spare_group;
int active, working, failed, spare, raid;
int from_config;
+ int from_auto;
int expected_spares;
int devstate[MAX_DISKS];
dev_t devid[MAX_DISKS];
@@ -206,7 +207,7 @@ int Monitor(struct mddev_dev *devlist,
while (! finished) {
int new_found = 0;
- struct state *st;
+ struct state *st, **stp;
int anydegraded = 0;
if (mdstat)
@@ -236,6 +237,16 @@ int Monitor(struct mddev_dev *devlist,
mdstat_wait(c->delay);
}
c->test = 0;
+
+ for (stp = &statelist; (st = *stp) != NULL; ) {
+ if (st->from_auto && st->err > 5) {
+ *stp = st->next;
+ free(st->devname);
+ free(st->spare_group);
+ free(st);
+ } else
+ stp = &st->next;
+ }
}
for (st2 = statelist; st2; st2 = statelist) {
statelist = st2->next;
@@ -461,14 +472,14 @@ static int check_array(struct state *st, struct mdstat_ent *mdstat,
if (fd < 0) {
if (!st->err)
alert("DeviceDisappeared", dev, NULL, ainfo);
- st->err=1;
+ st->err++;
return 0;
}
fcntl(fd, F_SETFD, FD_CLOEXEC);
if (ioctl(fd, GET_ARRAY_INFO, &array)<0) {
if (!st->err)
alert("DeviceDisappeared", dev, NULL, ainfo);
- st->err=1;
+ st->err++;
close(fd);
return 0;
}
@@ -478,7 +489,7 @@ static int check_array(struct state *st, struct mdstat_ent *mdstat,
if (array.level == 0 || array.level == -1) {
if (!st->err && !st->from_config)
alert("DeviceDisappeared", dev, "Wrong-Level", ainfo);
- st->err = 1;
+ st->err++;
close(fd);
return 0;
}
@@ -494,7 +505,7 @@ static int check_array(struct state *st, struct mdstat_ent *mdstat,
if (!mse) {
/* duplicated array in statelist
* or re-created after reading mdstat*/
- st->err = 1;
+ st->err++;
close(fd);
return 0;
}
@@ -505,7 +516,7 @@ static int check_array(struct state *st, struct mdstat_ent *mdstat,
array.utime = st->utime + 1;;
if (st->err) {
- /* New array appeared where previously had and error */
+ /* New array appeared where previously had an error */
st->err = 0;
st->percent = RESYNC_NONE;
new_array = 1;
@@ -684,6 +695,7 @@ static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist,
close(fd);
st->next = *statelist;
st->err = 1;
+ st->from_auto = 1;
strcpy(st->devnm, mse->devnm);
st->percent = RESYNC_UNKNOWN;
st->expected_spares = -1;
diff --git a/ReadMe.c b/ReadMe.c
index d742cc70..445c3882 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -1,7 +1,7 @@
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2014 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
@@ -25,10 +25,10 @@
#include "mdadm.h"
#ifndef VERSION
-#define VERSION "3.3"
+#define VERSION "3.3.2"
#endif
#ifndef VERS_DATE
-#define VERS_DATE "3rd September 2013"
+#define VERS_DATE "21st August 2014"
#endif
char Version[] = Name " - v" VERSION " - " VERS_DATE "\n";
@@ -153,6 +153,7 @@ struct option long_options[] = {
/* Management */
{"add", 0, 0, Add},
+ {"add-spare", 0, 0, AddSpare},
{"remove", 0, 0, Remove},
{"fail", 0, 0, Fail},
{"set-faulty",0, 0, Fail},
@@ -165,6 +166,7 @@ struct option long_options[] = {
{"no-degraded",0,0, NoDegraded },
{"wait", 0, 0, WaitOpt},
{"wait-clean", 0, 0, Waitclean },
+ {"action", 1, 0, Action },
/* For Detail/Examine */
{"brief", 0, 0, Brief},
@@ -505,6 +507,7 @@ char Help_misc[] =
" --readwrite -w : mark array as readwrite\n"
" --test -t : exit status 0 if ok, 1 if degrade, 2 if dead, 4 if missing\n"
" --wait -W : wait for resync/rebuild/recovery to finish\n"
+" --action= : initiate or abort ('idle' or 'frozen') a 'check' or 'repair'.\n"
;
char Help_monitor[] =
diff --git a/bitmap.c b/bitmap.c
index 028225d5..020f10d9 100644
--- a/bitmap.c
+++ b/bitmap.c
@@ -194,24 +194,23 @@ out:
return info;
}
-bitmap_info_t *bitmap_file_read(char *filename, int brief, struct supertype **stp)
+int bitmap_file_open(char *filename, struct supertype **stp)
{
int fd;
- bitmap_info_t *info;
struct stat stb;
struct supertype *st = *stp;
if (stat(filename, &stb) < 0) {
pr_err("failed to find file %s: %s\n",
filename, strerror(errno));
- return NULL;
+ return -1;
}
if ((S_IFMT & stb.st_mode) == S_IFBLK) {
fd = open(filename, O_RDONLY|O_DIRECT);
if (fd < 0) {
pr_err("failed to open bitmap file %s: %s\n",
filename, strerror(errno));
- return NULL;
+ return -1;
}
/* block device, so we are probably after an internal bitmap */
if (!st) st = guess_super(fd);
@@ -221,7 +220,7 @@ bitmap_info_t *bitmap_file_read(char *filename, int brief, struct supertype **st
} else if (!st->ss->locate_bitmap) {
pr_err("No bitmap possible with %s metadata\n",
st->ss->name);
- return NULL;
+ return -1;
} else
st->ss->locate_bitmap(st, fd);
@@ -231,13 +230,11 @@ bitmap_info_t *bitmap_file_read(char *filename, int brief, struct supertype **st
if (fd < 0) {
pr_err("failed to open bitmap file %s: %s\n",
filename, strerror(errno));
- return NULL;
+ return -1;
}
}
- info = bitmap_fd_read(fd, brief);
- close(fd);
- return info;
+ return fd;
}
__u32 swapl(__u32 l)
@@ -263,22 +260,37 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st)
int rv = 1;
char buf[64];
int swap;
+ int fd;
__u32 uuid32[4];
- info = bitmap_file_read(filename, brief, &st);
- if (!info)
+ fd = bitmap_file_open(filename, &st);
+ if (fd < 0)
return rv;
+ info = bitmap_fd_read(fd, brief);
+ if (!info)
+ return rv;
sb = &info->sb;
+ if (sb->magic != BITMAP_MAGIC && md_get_version(fd) > 0) {
+ pr_err("This is an md array. To view a bitmap you need to examine\n");
+ pr_err("a member device, not the array.\n");
+ pr_err("Reporting bitmap that would be used if this array were used\n");
+ pr_err("as a member of some other array\n");
+ }
+ close(fd);
printf(" Filename : %s\n", filename);
printf(" Magic : %08x\n", sb->magic);
if (sb->magic != BITMAP_MAGIC) {
- pr_err("invalid bitmap magic 0x%x, the bitmap file appears to be corrupted\n", sb->magic);
+ pr_err("invalid bitmap magic 0x%x, the bitmap file appears\n",
+ sb->magic);
+ pr_err("to be corrupted or missing.\n");
}
printf(" Version : %d\n", sb->version);
if (sb->version < BITMAP_MAJOR_LO ||
sb->version > BITMAP_MAJOR_HI) {
- pr_err("unknown bitmap version %d, either the bitmap file is corrupted or you need to upgrade your tools\n", sb->version);
+ pr_err("unknown bitmap version %d, either the bitmap file\n",
+ sb->version);
+ pr_err("is corrupted or you need to upgrade your tools\n");
goto free_info;
}
diff --git a/config.c b/config.c
index f1a70c59..909f83f4 100644
--- a/config.c
+++ b/config.c
@@ -187,6 +187,7 @@ struct createinfo createinfo = {
.autof = 2, /* by default, create devices with standard names */
.symlinks = 1,
.names = 0, /* By default, stick with numbered md devices. */
+ .bblist = 1, /* Use a bad block list by default */
#ifdef DEBIAN
.gid = 6, /* disk */
.mode = 0660,
@@ -306,6 +307,10 @@ static void createline(char *line)
createinfo.names = 1;
else if (strncasecmp(w, "names=no", 11) == 0)
createinfo.names = 0;
+ else if (strncasecmp(w, "bbl=no", 11) == 0)
+ createinfo.bblist = 0;
+ else if (strncasecmp(w, "bbl=yes", 11) == 0)
+ createinfo.bblist = 1;
else {
pr_err("unrecognised word on CREATE line: %s\n",
w);
@@ -575,6 +580,7 @@ void autoline(char *line)
if (auto_seen)
return;
+ auto_seen = 1;
/* Parse the 'auto' line creating policy statements for the 'auto' policy.
*
diff --git a/inventory b/inventory
index 7aff3743..40598cee 100755
--- a/inventory
+++ b/inventory
@@ -18,6 +18,8 @@ ANNOUNCE-3.2.4
ANNOUNCE-3.2.5
ANNOUNCE-3.2.6
ANNOUNCE-3.3
+ANNOUNCE-3.3.1
+ANNOUNCE-3.3.2
Assemble.c
Build.c
COPYING
@@ -75,6 +77,7 @@ mdmon.h
mdopen.c
mdstat.c
misc/
+misc/mdcheck
misc/syslog-events
mkinitramfs
monitor.c
@@ -103,7 +106,13 @@ super1.c
swap_super.c
sysfs.c
systemd/
+systemd/SUSE-mdadm_env.sh
+systemd/mdadm-grow-continue@.service
+systemd/mdadm-last-resort@.service
+systemd/mdadm-last-resort@.timer
+systemd/mdadm.shutdown
systemd/mdmon@.service
+systemd/mdmonitor.service
test
tests/
tests/00linear
@@ -128,6 +137,7 @@ tests/02r6grow
tests/03assem-incr
tests/03r0assem
tests/03r5assem
+tests/03r5assem-failed
tests/03r5assemV1
tests/04r0update
tests/04r1update
@@ -173,13 +183,19 @@ tests/07testreshape5
tests/09imsm-assemble
tests/09imsm-create-fail-rebuild
tests/09imsm-overlap
+tests/10ddf-assemble-missing
tests/10ddf-create
tests/10ddf-create-fail-rebuild
tests/10ddf-fail-create-race
+tests/10ddf-fail-readd
+tests/10ddf-fail-readd-readonly
tests/10ddf-fail-spare
+tests/10ddf-fail-stop-readd
tests/10ddf-fail-twice
tests/10ddf-fail-two-spares
tests/10ddf-geometry
+tests/10ddf-incremental-wrong-order
+tests/10ddf-sudden-degraded
tests/11spare-migration
tests/12imsm-r0_2d-grow-r0_3d
tests/12imsm-r0_2d-grow-r0_4d
diff --git a/managemon.c b/managemon.c
index fc8d1fe7..1c9eccc4 100644
--- a/managemon.c
+++ b/managemon.c
@@ -697,7 +697,7 @@ static void manage_new(struct mdstat_ent *mdstat,
new->resync_start_fd = sysfs_open(new->info.sys_name, NULL, "resync_start");
new->metadata_fd = sysfs_open(new->info.sys_name, NULL, "metadata_version");
new->sync_completed_fd = sysfs_open(new->info.sys_name, NULL, "sync_completed");
- dprintf("%s: inst: %d action: %d state: %d\n", __func__, atoi(inst),
+ dprintf("%s: inst: %s action: %d state: %d\n", __func__, inst,
new->action_fd, new->info.state_fd);
if (sigterm)
@@ -819,7 +819,8 @@ static void handle_message(struct supertype *container, struct metadata_update *
mu->space_list = NULL;
mu->next = NULL;
if (container->ss->prepare_update)
- container->ss->prepare_update(container, mu);
+ if (!container->ss->prepare_update(container, mu))
+ free_updates(&mu);
queue_metadata_update(mu);
}
}
diff --git a/md.4 b/md.4
index 5f6c3a7c..e222237d 100644
--- a/md.4
+++ b/md.4
@@ -267,31 +267,326 @@ the resulting collection of datablocks are distributed over multiple
drives.
When configuring a RAID10 array, it is necessary to specify the number
-of replicas of each data block that are required (this will normally
-be 2) and whether the replicas should be 'near', 'offset' or 'far'.
-(Note that the 'offset' layout is only available from 2.6.18).
-
-When 'near' replicas are chosen, the multiple copies of a given chunk
-are laid out consecutively across the stripes of the array, so the two
-copies of a datablock will likely be at the same offset on two
-adjacent devices.
-
-When 'far' replicas are chosen, the multiple copies of a given chunk
-are laid out quite distant from each other. The first copy of all
-data blocks will be striped across the early part of all drives in
-RAID0 fashion, and then the next copy of all blocks will be striped
-across a later section of all drives, always ensuring that all copies
-of any given block are on different drives.
-
-The 'far' arrangement can give sequential read performance equal to
-that of a RAID0 array, but at the cost of reduced write performance.
-
-When 'offset' replicas are chosen, the multiple copies of a given
-chunk are laid out on consecutive drives and at consecutive offsets.
-Effectively each stripe is duplicated and the copies are offset by one
-device. This should give similar read characteristics to 'far' if a
-suitably large chunk size is used, but without as much seeking for
-writes.
+of replicas of each data block that are required (this will usually
+be\ 2) and whether their layout should be "near", "far" or "offset"
+(with "offset" being available since Linux\ 2.6.18).
+
+.B About the RAID10 Layout Examples:
+.br
+The examples below visualise the chunk distribution on the underlying
+devices for the respective layout.
+
+For simplicity it is assumed that the size of the chunks equals the
+size of the blocks of the underlying devices as well as those of the
+RAID10 device exported by the kernel (for example \fB/dev/md/\fPname).
+.br
+Therefore the chunks\ /\ chunk numbers map directly to the blocks\ /\
+block addresses of the exported RAID10 device.
+
+Decimal numbers (0,\ 1, 2,\ ...) are the chunks of the RAID10 and due
+to the above assumption also the blocks and block addresses of the
+exported RAID10 device.
+.br
+Repeated numbers mean copies of a chunk\ /\ block (obviously on
+different underlying devices).
+.br
+Hexadecimal numbers (0x00,\ 0x01, 0x02,\ ...) are the block addresses
+of the underlying devices.
+
+.TP
+\fB "near" Layout\fP
+When "near" replicas are chosen, the multiple copies of a given chunk are laid
+out consecutively ("as close to each other as possible") across the stripes of
+the array.
+
+With an even number of devices, they will likely (unless some misalignment is
+present) lie at the very same offset on the different devices.
+.br
+This is like the "classic" RAID1+0; that is, two groups of mirrored devices (in the
+example below the groups Device\ #1\ /\ #2 and Device\ #3\ /\ #4 are each a
+RAID1) both in turn forming a striped RAID0.
+
+.ne 10
+.B Example with 2\ copies per chunk and an even number\ (4) of devices:
+.TS
+tab(;);
+ C - - - -
+ C | C | C | C | C |
+| - | - | - | - | - |
+| C | C | C | C | C |
+| C | C | C | C | C |
+| C | C | C | C | C |
+| C | C | C | C | C |
+| C | C | C | C | C |
+| C | C | C | C | C |
+| - | - | - | - | - |
+ C C S C S
+ C C S C S
+ C C S S S
+ C C S S S.
+;
+;Device #1;Device #2;Device #3;Device #4
+0x00;0;0;1;1
+0x01;2;2;3;3
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.
+:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.
+0x80;254;254;255;255
+;\\---------v---------/;\\---------v---------/
+;RAID1;RAID1
+;\\---------------------v---------------------/
+;RAID0
+.TE
+
+.ne 10
+.B Example with 2\ copies per chunk and an odd number\ (5) of devices:
+.TS
+tab(;);
+ C - - - - -
+ C | C | C | C | C | C |
+| - | - | - | - | - | - |
+| C | C | C | C | C | C |
+| C | C | C | C | C | C |
+| C | C | C | C | C | C |
+| C | C | C | C | C | C |
+| C | C | C | C | C | C |
+| C | C | C | C | C | C |
+| - | - | - | - | - | - |
+C.
+;
+;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5
+0x00;0;0;1;1;2
+0x01;2;3;3;4;4
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.
+:;:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.
+0x80;317;318;318;319;319
+;
+.TE
+
+.TP
+\fB "far" Layout\fP
+When "far" replicas are chosen, the multiple copies of a given chunk
+are laid out quite distant ("as far as reasonably possible") from each
+other.
+
+First a complete sequence of all data blocks (that is all the data one
+sees on the exported RAID10 block device) is striped over the
+devices. Then another (though "shifted") complete sequence of all data
+blocks; and so on (in the case of more than 2\ copies per chunk).
+
+The "shift" needed to prevent placing copies of the same chunks on the
+same devices is actually a cyclic permutation with offset\ 1 of each
+of the stripes within a complete sequence of chunks.
+.br
+The offset\ 1 is relative to the previous complete sequence of chunks,
+so in case of more than 2\ copies per chunk one gets the following
+offsets:
+.br
+1.\ complete sequence of chunks: offset\ =\ \ 0
+.br
+2.\ complete sequence of chunks: offset\ =\ \ 1
+.br
+3.\ complete sequence of chunks: offset\ =\ \ 2
+.br
+ :
+.br
+n.\ complete sequence of chunks: offset\ =\ n-1
+
+.ne 10
+.B Example with 2\ copies per chunk and an even number\ (4) of devices:
+.TS
+tab(;);
+ C - - - -
+ C | C | C | C | C |
+| - | - | - | - | - |
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| - | - | - | - | - |
+C.
+;
+;Device #1;Device #2;Device #3;Device #4
+;
+0x00;0;1;2;3;\\
+0x01;4;5;6;7;> [#]
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+:;:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+0x40;252;253;254;255;/
+0x41;3;0;1;2;\\
+0x42;7;4;5;6;> [#]~
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+:;:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+0x80;255;252;253;254;/
+;
+.TE
+
+.ne 10
+.B Example with 2\ copies per chunk and an odd number\ (5) of devices:
+.TS
+tab(;);
+ C - - - - -
+ C | C | C | C | C | C |
+| - | - | - | - | - | - |
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| - | - | - | - | - | - |
+C.
+;
+;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5
+;
+0x00;0;1;2;3;4;\\
+0x01;5;6;7;8;9;> [#]
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+:;:;:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+0x40;315;316;317;318;319;/
+0x41;4;0;1;2;3;\\
+0x42;9;5;6;7;8;> [#]~
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+:;:;:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+0x80;319;315;316;317;318;/
+;
+.TE
+
+With [#]\ being the complete sequence of chunks and [#]~\ the cyclic permutation
+with offset\ 1 thereof (in the case of more than 2 copies per chunk there would
+be ([#]~)~,\ (([#]~)~)~,\ ...).
+
+The advantage of this layout is that MD can easily spread sequential reads over
+the devices, making them similar to RAID0 in terms of speed.
+.br
+The cost is more seeking for writes, making them substantially slower.
+
+.TP
+\fB"offset" Layout\fP
+When "offset" replicas are chosen, all the copies of a given chunk are
+striped consecutively ("offset by the stripe length after each other")
+over the devices.
+
+Explained in detail, <number of devices> consecutive chunks are
+striped over the devices, immediately followed by a "shifted" copy of
+these chunks (and by further such "shifted" copies in the case of more
+than 2\ copies per chunk).
+.br
+This pattern repeats for all further consecutive chunks of the
+exported RAID10 device (in other words: all further data blocks).
+
+The "shift" needed to prevent placing copies of the same chunks on the
+same devices is actually a cyclic permutation with offset\ 1 of each
+of the striped copies of <number of devices> consecutive chunks.
+.br
+The offset\ 1 is relative to the previous striped copy of <number of
+devices> consecutive chunks, so in case of more than 2\ copies per
+chunk one gets the following offsets:
+.br
+1.\ <number of devices> consecutive chunks: offset\ =\ \ 0
+.br
+2.\ <number of devices> consecutive chunks: offset\ =\ \ 1
+.br
+3.\ <number of devices> consecutive chunks: offset\ =\ \ 2
+.br
+ :
+.br
+n.\ <number of devices> consecutive chunks: offset\ =\ n-1
+
+.ne 10
+.B Example with 2\ copies per chunk and an even number\ (4) of devices:
+.TS
+tab(;);
+ C - - - -
+ C | C | C | C | C |
+| - | - | - | - | - |
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| - | - | - | - | - |
+C.
+;
+;Device #1;Device #2;Device #3;Device #4
+;
+0x00;0;1;2;3;) AA
+0x01;3;0;1;2;) AA~
+0x02;4;5;6;7;) AB
+0x03;7;4;5;6;) AB~
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\.
+:;:;:;:;:; :
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\.
+0x79;251;252;253;254;) EX
+0x80;254;251;252;253;) EX~
+;
+.TE
+
+.ne 10
+.B Example with 2\ copies per chunk and an odd number\ (5) of devices:
+.TS
+tab(;);
+ C - - - - -
+ C | C | C | C | C | C |
+| - | - | - | - | - | - |
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| - | - | - | - | - | - |
+C.
+;
+;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5
+;
+0x00;0;1;2;3;4;) AA
+0x01;4;0;1;2;3;) AA~
+0x02;5;6;7;8;9;) AB
+0x03;9;5;6;7;8;) AB~
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\.
+:;:;:;:;:;:; :
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\.
+0x79;314;315;316;317;318;) EX
+0x80;318;314;315;316;317;) EX~
+;
+.TE
+
+With AA,\ AB,\ ..., AZ,\ BA,\ ... being the sets of <number of devices> consecutive
+chunks and AA~,\ AB~,\ ..., AZ~,\ BA~,\ ... the cyclic permutations with offset\ 1
+thereof (in the case of more than 2 copies per chunk there would be (AA~)~,\ ...
+as well as ((AA~)~)~,\ ... and so on).
+
+This should give similar read characteristics to "far" if a suitably large chunk
+size is used, but without as much seeking for writes.
+.PP
+
It should be noted that the number of devices in a RAID10 array need
not be a multiple of the number of replicas of each data block; however,
@@ -301,7 +596,7 @@ If, for example, an array is created with 5 devices and 2 replicas,
then space equivalent to 2.5 of the devices will be available, and
every block will be stored on two different devices.
-Finally, it is possible to have an array with both 'near' and 'far'
+Finally, it is possible to have an array with both "near" and "far"
copies. If an array is configured with 2 near copies and 2 far
copies, then there will be a total of 4 copies of each block, each on
a different drive. This is an artifact of the implementation and is
@@ -551,7 +846,7 @@ intent log if one is present.
In 2.6.13, intent bitmaps are only supported with RAID1. Other levels
with redundancy are supported from 2.6.15.
-.SS BAD BLOCK LOG
+.SS BAD BLOCK LIST
From Linux 3.5 each device in an
.I md
@@ -561,7 +856,7 @@ and the data.
When a block cannot be read and cannot be repaired by writing data
recovered from other devices, the address of the block is stored in
-the bad block log. Similarly if an attempt to write a block fails,
+the bad block list. Similarly if an attempt to write a block fails,
the address will be recorded as a bad block. If attempting to record
the bad block fails, the whole device will be marked faulty.
@@ -575,9 +870,9 @@ This allows an array to fail more gracefully - a few blocks on
different devices can be faulty without taking the whole array out of
action.
-The log is particularly useful when recovering to a spare. If a few blocks
+The list is particularly useful when recovering to a spare. If a few blocks
cannot be read from the other devices, the bulk of the recovery can
-complete and those few bad blocks will be recorded in the bad block log.
+complete and those few bad blocks will be recorded in the bad block list.
.SS WRITE-BEHIND
diff --git a/mdadm.8.in b/mdadm.8.in
index 09aff811..a6303107 100644
--- a/mdadm.8.in
+++ b/mdadm.8.in
@@ -5,7 +5,7 @@
.\" the Free Software Foundation; either version 2 of the License, or
.\" (at your option) any later version.
.\" See file COPYING in distribution for details.
-.TH MDADM 8 "" v3.3
+.TH MDADM 8 "" v3.3.2
.SH NAME
mdadm \- manage MD devices
.I aka
@@ -214,7 +214,10 @@ to detect and assemble arrays \(em possibly in an
.P
If a device is given before any options, or if the first option is
+one of
.BR \-\-add ,
+.BR \-\-re\-add ,
+.BR \-\-add\-spare ,
.BR \-\-fail ,
.BR \-\-remove ,
or
@@ -529,7 +532,7 @@ amount of available space is.
.BR \-c ", " \-\-chunk=
Specify chunk size in kibibytes. The default when creating an
array is 512KB. To ensure compatibility with earlier versions, the
-default when Building and array with no persistent metadata is 64KB.
+default when building an array with no persistent metadata is 64KB.
This is only meaningful for RAID0, RAID4, RAID5, RAID6, and RAID10.
RAID4, RAID5, RAID6, and RAID10 require the chunk size to be a power
@@ -1083,7 +1086,7 @@ argument given to this flag can be one of
.BR devicesize ,
.BR no\-bitmap ,
.BR bbl ,
-.BR no-\bbl ,
+.BR no\-bbl ,
.BR metadata ,
or
.BR super\-minor .
@@ -1298,6 +1301,15 @@ useful if you are certain that the reason for failure has been
resolved.
.TP
+.B \-\-add\-spare
+Add a device as a spare. This is similar to
+.B \-\-add
+except that it does not attempt
+.B \-\-re\-add
+first. The device will be added as a spare even if it looks like it
+could be a recent member of the array.
+
+.TP
.BR \-r ", " \-\-remove
remove listed devices. They must not be active. i.e. they should
be failed or spare devices.
@@ -1428,13 +1440,30 @@ absolute filepath or a link, e.g.
.TP
.BR \-Y ", " \-\-export
When used with
-.B \-\-detail , \-\-detail-platform
-or
+.BR \-\-detail ,
+.BR \-\-detail-platform ,
.BR \-\-examine ,
+or
+.B \-\-incremental
output will be formatted as
.B key=value
pairs for easy import into the environment.
+With
+.B \-\-incremental
+the value
+.B MD_STARTED
+indicates whether an array was started
+.RB ( yes )
+or not, which may include a reason
+.RB ( unsafe ", " nothing ", " no ).
+Also the value
+.B MD_FOREIGN
+indicates if the array is expected on this host
+.RB ( no ),
+or seems to be from elsewhere
+.RB ( yes ).
+
.TP
.BR \-E ", " \-\-examine
Print contents of the metadata stored on the named device(s).
@@ -1559,6 +1588,31 @@ successfully waited. For native arrays this returns immediately as the
kernel handles dirty-clean transitions at shutdown. No action is taken
if safe-mode handling is disabled.
+.TP
+.B \-\-action=
+Set the "sync_action" for all md devices given to one of
+.BR idle ,
+.BR frozen ,
+.BR check ,
+.BR repair .
+Setting to
+.B idle
+will abort any currently running action though some actions will
+automatically restart.
+Setting to
+.B frozen
+will abort any current action and ensure no other action starts
+automatically.
+
+Details of
+.B check
+and
+.B repair
+can be found in
+.IR md (4)
+under
+.BR "SCRUBBING AND MISMATCHES" .
+
.SH For Incremental Assembly mode:
.TP
.BR \-\-rebuild\-map ", " \-r
@@ -2334,7 +2388,8 @@ hot-spare and resync operations which are monitored.
.TP
.B RebuildStarted
-An md array started reconstruction. (syslog priority: Warning)
+An md array started reconstruction (e.g. recovery, resync, reshape,
+check, repair). (syslog priority: Warning)
.TP
.BI Rebuild NN
@@ -2647,6 +2702,7 @@ Usage:
.RB [ \-\-run ]
.RB [ \-\-quiet ]
.I component-device
+.RI [ optional-aliases-for-device ]
.HP 12
Usage:
.B mdadm \-\-incremental \-\-fail
@@ -2701,16 +2757,23 @@ That is, is it listed in a
.B DEVICES
line in that file. If
.B DEVICES
-is absent then the default it to allow any device. Similar if
+is absent then the default is to allow any device. Similarly if
.B DEVICES
contains the special word
.B partitions
then any device is allowed. Otherwise the device name given to
-.I mdadm
+.IR mdadm ,
+or one of the aliases given, or an alias found in the filesystem,
must match one of the names or patterns in a
.B DEVICES
line.
+This is the only context where the aliases are used. They are
+usually provided by a
+.I udev
+rule mentioning
+.BR ${DEVLINKS} .
+
.IP +
Does the device have a valid md superblock? If a specific metadata
version is requested with
@@ -2824,6 +2887,20 @@ to '1', the
will create any devices that are needed.
.TP
+.B MDADM_NO_SYSTEMCTL
+If
+.I mdadm
+detects that
+.I systemd
+is in use it will normally request
+.I systemd
+to start various background tasks (particularly
+.IR mdmon )
+rather than forking and running them in the background. This can be
+suppressed by setting
+.BR MDADM_NO_SYSTEMCTL=1 .
+
+.TP
.B IMSM_NO_PLATFORM
A key value of IMSM metadata is that it allows interoperability with
boot ROMs on Intel platforms, and with other major operating systems.
@@ -2840,6 +2917,18 @@ recovery. You should be aware that interoperability may be
compromised by setting this value.
.TP
+.B MDADM_GROW_ALLOW_OLD
+If an array is stopped while it is performing a reshape and that
+reshape was making use of a backup file, then when the array is
+re-assembled
+.I mdadm
+will sometimes complain that the backup file is too old. If this
+happens and you are certain it is the right backup file, you can
+over-ride this check by setting
+.B MDADM_GROW_ALLOW_OLD=1
+in the environment.
+
+.TP
.B MDADM_CONF_AUTO
Any string given in this variable is added to the start of the
.B AUTO
@@ -3075,7 +3164,7 @@ Partition numbers should be indicated by adding "pMM" to these, thus "/dev/md/d1
From kernel version 2.6.28 the "non-partitioned array" can actually
be partitioned. So the "md_d\fBNN\fP"
names are no longer needed, and
-partitions such as "/dev/md\fBNN\fPp\fBXX\fp"
+partitions such as "/dev/md\fBNN\fPp\fBXX\fP"
are possible.
.PP
From kernel version 2.6.29 standard names can be non-numeric following
diff --git a/mdadm.c b/mdadm.c
index 1ada6079..be990b8a 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -187,6 +187,7 @@ int main(int argc, char *argv[])
break;
case 'a':
case Add:
+ case AddSpare:
case 'r':
case Remove:
case Replace:
@@ -229,6 +230,7 @@ int main(int argc, char *argv[])
case ExamineBB:
case Dump:
case Restore:
+ case Action:
newmode = MISC;
break;
@@ -915,6 +917,9 @@ int main(int argc, char *argv[])
case O(MANAGE,Add): /* add a drive */
devmode = 'a';
continue;
+ case O(MANAGE,AddSpare): /* add drive - never re-add */
+ devmode = 'S';
+ continue;
case O(MANAGE,ReAdd):
devmode = 'A';
continue;
@@ -983,6 +988,7 @@ int main(int argc, char *argv[])
case O(MISC, UpdateSubarray):
case O(MISC, Dump):
case O(MISC, Restore):
+ case O(MISC ,Action):
if (opt == KillSubarray || opt == UpdateSubarray) {
if (c.subarray) {
pr_err("subarray can only"
@@ -991,6 +997,21 @@ int main(int argc, char *argv[])
}
c.subarray = optarg;
}
+ if (opt == Action) {
+ if (c.action) {
+ pr_err("Only one --action can be specified\n");
+ exit(2);
+ }
+ if (strcmp(optarg, "idle") == 0 ||
+ strcmp(optarg, "frozen") == 0 ||
+ strcmp(optarg, "check") == 0 ||
+ strcmp(optarg, "repair") == 0)
+ c.action = optarg;
+ else {
+ pr_err("action must be one of idle, frozen, check, repair\n");
+ exit(2);
+ }
+ }
if (devmode && devmode != opt &&
(devmode == 'E' || (opt == 'E' && devmode != 'Q'))) {
pr_err("--examine/-E cannot be given with ");
@@ -1293,7 +1314,7 @@ int main(int argc, char *argv[])
if (!rv && c.readonly < 0)
rv = Manage_ro(devlist->devname, mdfd, c.readonly);
if (!rv && c.runstop > 0)
- rv = Manage_run(devlist->devname, mdfd, c.verbose);
+ rv = Manage_run(devlist->devname, mdfd, &c);
if (!rv && c.runstop < 0)
rv = Manage_stop(devlist->devname, mdfd, c.verbose, 0);
break;
@@ -1527,6 +1548,11 @@ int main(int argc, char *argv[])
RebuildMap();
}
if (c.scan) {
+ rv = 1;
+ if (devlist) {
+ pr_err("In --incremental mode, a device cannot be given with --scan.\n");
+ break;
+ }
if (c.runstop <= 0) {
pr_err("--incremental --scan meaningless without --run.\n");
break;
@@ -1535,7 +1561,7 @@ int main(int argc, char *argv[])
pr_err("--incremental --scan --fail not supported.\n");
break;
}
- rv = IncrementalScan(c.verbose, NULL);
+ rv = IncrementalScan(&c, NULL);
}
if (!devlist) {
if (!rebuild_map && !c.scan) {
@@ -1544,16 +1570,16 @@ int main(int argc, char *argv[])
}
break;
}
- if (devlist->next) {
- pr_err("--incremental can only handle one device.\n");
- rv = 1;
- break;
- }
- if (devmode == 'f')
+ if (devmode == 'f') {
+ if (devlist->next) {
+ pr_err("'--incremental --fail' can only handle one device.\n");
+ rv = 1;
+ break;
+ }
rv = IncrementalRemove(devlist->devname, remove_path,
c.verbose);
- else
- rv = Incremental(devlist->devname, &c, ss);
+ } else
+ rv = Incremental(devlist, &c, ss);
break;
case AUTODETECT:
autodetect();
@@ -1793,6 +1819,9 @@ static int misc_list(struct mddev_dev *devlist,
rv |= Restore_metadata(dv->devname, dump_directory, c, ss,
(dv == devlist && dv->next == NULL));
continue;
+ case Action:
+ rv |= SetAction(dv->devname, c->action);
+ continue;
}
if (dv->devname[0] == '/')
mdfd = open_mddev(dv->devname, 1);
@@ -1804,7 +1833,8 @@ static int misc_list(struct mddev_dev *devlist,
if (mdfd>=0) {
switch(dv->disposition) {
case 'R':
- rv |= Manage_run(dv->devname, mdfd, c->verbose); break;
+ c->runstop = 1;
+ rv |= Manage_run(dv->devname, mdfd, c); break;
case 'S':
rv |= Manage_stop(dv->devname, mdfd, c->verbose, 0); break;
case 'o':
@@ -1818,3 +1848,26 @@ static int misc_list(struct mddev_dev *devlist,
}
return rv;
}
+
+/*
+ * Write 'action' to the "sync_action" sysfs attribute of the md array
+ * named by 'dev'.
+ *
+ * Returns 0 on success.  Returns 1, after printing an error, if 'dev'
+ * cannot be opened, if sysfs_init() leaves mdi.sys_name empty (reported
+ * as "not an md array"), or if the attribute cannot be written.
+ */
+int SetAction(char *dev, char *action)
+{
+	int fd = open(dev, O_RDONLY);
+	struct mdinfo mdi;
+	if (fd < 0) {
+		pr_err("Couldn't open %s: %s\n", dev, strerror(errno));
+		return 1;
+	}
+	/* The fd is only needed to fill in 'mdi'; close it straight away. */
+	sysfs_init(&mdi, fd, NULL);
+	close(fd);
+	if (!mdi.sys_name[0]) {
+		pr_err("%s is not an md array\n", dev);
+		return 1;
+	}
+
+	if (sysfs_set_str(&mdi, NULL, "sync_action", action) < 0) {
+		pr_err("Could not set action for %s to %s: %s\n",
+		       dev, action, strerror(errno));
+		return 1;
+	}
+	return 0;
+}
diff --git a/mdadm.conf.5 b/mdadm.conf.5
index 088e6450..18512cb0 100644
--- a/mdadm.conf.5
+++ b/mdadm.conf.5
@@ -293,8 +293,8 @@ line and it should be give only one program.
.B CREATE
The
.B create
-line gives default values to be used when creating arrays and device entries for
-arrays.
+line gives default values to be used when creating arrays, new members
+of arrays, and device entries for arrays.
These include:
.RS 4
@@ -365,6 +365,16 @@ is given, then non-numeric
device names will not be used even if the default changes in a future
release of
.IR mdadm .
+
+.TP
+.B bbl=no
+By default,
+.I mdadm
+will reserve space for a bad block list (bbl) on all devices
+included in or added to any array that supports them. Setting
+.B bbl=no
+will prevent this, so newly added devices will not have a bad
+block list.
.RE
.TP
@@ -482,7 +492,7 @@ A device may belong to several domains. The domain of an array is a union
of domains of all devices in that array. A spare can be automatically
moved from one array to another if the set of the destination array's
.I domains
-ppcontains all the
+contains all the
.I domains
of the new disk or if both arrays have the same
.IR spare-group .
@@ -515,6 +525,7 @@ or
.TP
.B action=
include, re-add, spare, spare-same-slot, or force-spare
+.TP
.B auto=
yes, no, or homehost.
diff --git a/mdadm.h b/mdadm.h
index 91d27139..fc1fd318 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -129,12 +129,12 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
#if !defined(__KLIBC__)
#if BYTE_ORDER == LITTLE_ENDIAN
-#define __cpu_to_le16(_x) (_x)
-#define __cpu_to_le32(_x) (_x)
-#define __cpu_to_le64(_x) (_x)
-#define __le16_to_cpu(_x) (_x)
-#define __le32_to_cpu(_x) (_x)
-#define __le64_to_cpu(_x) (_x)
+#define __cpu_to_le16(_x) (unsigned int)(_x)
+#define __cpu_to_le32(_x) (unsigned int)(_x)
+#define __cpu_to_le64(_x) (unsigned long long)(_x)
+#define __le16_to_cpu(_x) (unsigned int)(_x)
+#define __le32_to_cpu(_x) (unsigned int)(_x)
+#define __le64_to_cpu(_x) (unsigned long long)(_x)
#define __cpu_to_be16(_x) bswap_16(_x)
#define __cpu_to_be32(_x) bswap_32(_x)
@@ -150,12 +150,12 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
#define __le32_to_cpu(_x) bswap_32(_x)
#define __le64_to_cpu(_x) bswap_64(_x)
-#define __cpu_to_be16(_x) (_x)
-#define __cpu_to_be32(_x) (_x)
-#define __cpu_to_be64(_x) (_x)
-#define __be16_to_cpu(_x) (_x)
-#define __be32_to_cpu(_x) (_x)
-#define __be64_to_cpu(_x) (_x)
+#define __cpu_to_be16(_x) (unsigned int)(_x)
+#define __cpu_to_be32(_x) (unsigned int)(_x)
+#define __cpu_to_be64(_x) (unsigned long long)(_x)
+#define __be16_to_cpu(_x) (unsigned int)(_x)
+#define __be32_to_cpu(_x) (unsigned int)(_x)
+#define __be64_to_cpu(_x) (unsigned long long)(_x)
#else
# error "unknown endianness."
#endif
@@ -178,6 +178,8 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2; })
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
+
/* general information that might be extracted from a superblock */
struct mdinfo {
mdu_array_info_t array;
@@ -256,6 +258,7 @@ struct createinfo {
int mode;
int symlinks;
int names;
+ int bblist;
struct supertype *supertype;
};
@@ -309,6 +312,7 @@ enum special_options {
Brief,
ManageOpt,
Add,
+ AddSpare,
Remove,
Fail,
Replace,
@@ -339,6 +343,7 @@ enum special_options {
ExamineBB,
Dump,
Restore,
+ Action,
};
enum prefix_standard {
@@ -412,6 +417,7 @@ struct context {
int freeze_reshape;
char *backup_file;
int invalid_backup;
+ char *action;
};
struct shape {
@@ -582,9 +588,12 @@ extern int reshape_open_backup_file(char *backup,
long blocks,
int *fdlist,
unsigned long long *offsets,
+ char *sysfs_name,
int restart);
extern unsigned long compute_backup_blocks(int nchunk, int ochunk,
unsigned int ndata, unsigned int odata);
+extern char *locate_backup(char *name);
+extern char *make_backup(char *name);
extern int save_stripes(int *source, unsigned long long *offsets,
int raid_disks, int chunk_size, int level, int layout,
@@ -921,7 +930,10 @@ extern struct superswitch {
void (*sync_metadata)(struct supertype *st);
void (*process_update)(struct supertype *st,
struct metadata_update *update);
- void (*prepare_update)(struct supertype *st,
+ /* Prepare updates allocates extra memory that might be
+ * needed. If the update cannot be understood, return 0.
+ */
+ int (*prepare_update)(struct supertype *st,
struct metadata_update *update);
/* activate_spare will check if the array is degraded and, if it
@@ -952,6 +964,9 @@ extern struct superswitch {
/* for external backup area */
int (*recover_backup)(struct supertype *st, struct mdinfo *info);
+ /* validate container after assemble */
+ int (*validate_container)(struct mdinfo *info);
+
int swapuuid; /* true if uuid is bigending rather than hostendian */
int external;
const char *name; /* canonical metadata name */
@@ -1169,7 +1184,7 @@ struct stat64;
extern int add_dev(const char *name, const struct stat *stb, int flag, struct FTW *s);
extern int Manage_ro(char *devname, int fd, int readonly);
-extern int Manage_run(char *devname, int fd, int quiet);
+extern int Manage_run(char *devname, int fd, struct context *c);
extern int Manage_stop(char *devname, int fd, int quiet,
int will_retry);
extern int Manage_subdevs(char *devname, int fd,
@@ -1187,13 +1202,13 @@ extern int Grow_restart(struct supertype *st, struct mdinfo *info,
int *fdlist, int cnt, char *backup_file, int verbose);
extern int Grow_continue(int mdfd, struct supertype *st,
struct mdinfo *info, char *backup_file,
- int freeze_reshape);
+ int forked, int freeze_reshape);
extern int restore_backup(struct supertype *st,
struct mdinfo *content,
int working_disks,
int spares,
- char *backup_file,
+ char **backup_filep,
int verbose);
extern int Grow_continue_command(char *devname, int fd,
char *backup_file, int verbose);
@@ -1231,11 +1246,12 @@ extern int Kill_subarray(char *dev, char *subarray, int verbose);
extern int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int quiet);
extern int Wait(char *dev);
extern int WaitClean(char *dev, int sock, int verbose);
+extern int SetAction(char *dev, char *action);
-extern int Incremental(char *devname, struct context *c,
+extern int Incremental(struct mddev_dev *devlist, struct context *c,
struct supertype *st);
extern void RebuildMap(void);
-extern int IncrementalScan(int verbose, char *devnm);
+extern int IncrementalScan(struct context *c, char *devnm);
extern int IncrementalRemove(char *devname, char *path, int verbose);
extern int CreateBitmap(char *filename, int force, char uuid[16],
unsigned long chunksize, unsigned long daemon_sleep,
@@ -1267,8 +1283,10 @@ extern int check_partitions(int fd, char *dname,
unsigned long long size);
extern int get_mdp_major(void);
+extern int get_maj_min(char *dev, int *major, int *minor);
extern int dev_open(char *dev, int flags);
extern int open_dev(char *devnm);
+extern void reopen_mddev(int mdfd);
extern int open_dev_flags(char *devnm, int flags);
extern int open_dev_excl(char *devnm);
extern int is_standard(char *dev, int *nump);
@@ -1328,7 +1346,11 @@ extern void append_metadata_update(struct supertype *st, void *buf, int len);
extern int assemble_container_content(struct supertype *st, int mdfd,
struct mdinfo *content,
struct context *c,
- char *chosen_name);
+ char *chosen_name, int *result);
+#define INCR_NO 1
+#define INCR_UNSAFE 2
+#define INCR_ALREADY 4
+#define INCR_YES 8
extern struct mdinfo *container_choose_spares(struct supertype *st,
unsigned long long min_size,
struct domainlist *domlist,
@@ -1451,6 +1473,9 @@ char *xstrdup(const char *str);
#define LEVEL_CONTAINER (-100)
#define LEVEL_UNSUPPORTED (-200)
+/* the kernel does know about this one ... */
+#define LEVEL_NONE (-1000000)
+
/* faulty stuff */
#define WriteTransient 0
diff --git a/mdadm.spec b/mdadm.spec
index 8e6e1062..384a1d89 100644
--- a/mdadm.spec
+++ b/mdadm.spec
@@ -1,6 +1,6 @@
Summary: mdadm is used for controlling Linux md devices (aka RAID arrays)
Name: mdadm
-Version: 3.3
+Version: 3.3.2
Release: 1
Source: http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tar.gz
URL: http://neil.brown.name/blog/mdadm
diff --git a/mdassemble.8 b/mdassemble.8
index 1ec83f5b..ae6f6d45 100644
--- a/mdassemble.8
+++ b/mdassemble.8
@@ -1,5 +1,5 @@
.\" -*- nroff -*-
-.TH MDASSEMBLE 8 "" v3.3
+.TH MDASSEMBLE 8 "" v3.3.2
.SH NAME
mdassemble \- assemble MD devices
.I aka
diff --git a/mdmon.8 b/mdmon.8
index d7d26205..4f9a439a 100644
--- a/mdmon.8
+++ b/mdmon.8
@@ -1,5 +1,5 @@
.\" See file COPYING in distribution for details.
-.TH MDMON 8 "" v3.3
+.TH MDMON 8 "" v3.3.2
.SH NAME
mdmon \- monitor MD external metadata arrays
diff --git a/mdmon.c b/mdmon.c
index f0b06237..27045a12 100644
--- a/mdmon.c
+++ b/mdmon.c
@@ -232,6 +232,7 @@ static int make_control_sock(char *devname)
addr.sun_family = PF_LOCAL;
strcpy(addr.sun_path, path);
+ umask(077); /* ensure no world write access */
if (bind(sfd, &addr, sizeof(addr)) < 0) {
close(sfd);
return -1;
@@ -320,7 +321,7 @@ int main(int argc, char *argv[])
dofork = 0;
break;
case OffRootOpt:
- /* silently ignore old option */
+ argv[0][0] = '@';
break;
case 'h':
default:
@@ -429,6 +430,7 @@ static int mdmon(char *devnm, int must_fork, int takeover)
wait(&status);
status = WEXITSTATUS(status);
}
+ close(pfd[0]);
return status;
}
} else
@@ -516,10 +518,12 @@ static int mdmon(char *devnm, int must_fork, int takeover)
container->sock = make_control_sock(devnm);
status = 0;
- if (write(pfd[1], &status, sizeof(status)) < 0)
- pr_err("failed to notify our parent: %d\n",
- getppid());
- close(pfd[1]);
+ if (pfd[1] >= 0) {
+ if (write(pfd[1], &status, sizeof(status)) < 0)
+ pr_err("failed to notify our parent: %d\n",
+ getppid());
+ close(pfd[1]);
+ }
mlockall(MCL_CURRENT | MCL_FUTURE);
@@ -587,3 +591,10 @@ int save_stripes(int *source, unsigned long long *offsets,
{
return 0;
}
+
+/*
+ * Only .name is initialised for these two objects; every other member of
+ * struct superswitch is left zero-initialised.
+ * NOTE(review): presumably stubs so that shared code linked into mdmon can
+ * reference super0/super1 - confirm against the callers.
+ */
+struct superswitch super0 = {
+ .name = "0.90",
+};
+struct superswitch super1 = {
+ .name = "1.x",
+};
diff --git a/misc/mdcheck b/misc/mdcheck
new file mode 100644
index 00000000..33570b97
--- /dev/null
+++ b/misc/mdcheck
@@ -0,0 +1,158 @@
+#!/bin/bash
+
+# Copyright (C) 2014 Neil Brown <neilb@suse.de>
+#
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# Author: Neil Brown
+# Email: <neilb@suse.de>
+
+# This script should be run periodically to automatically
+# perform a 'check' on any md arrays.
+#
+# It supports a 'time budget' such that any incomplete 'check'
+# will be checkpointed when that time has expired.
+# A subsequent invocation can allow the 'check' to continue.
+#
+# Options are:
+# --continue Don't start new checks, only continue old ones.
+# --duration This is passed to "date --date=$duration" to find out
+# when to finish
+#
+# To support '--continue', arrays are identified by UUID and the 'sync_completed'
+# value is stored in /var/lib/mdcheck/MD_UUID_$MD_UUID
+
+# Convert a /dev/md name into its /sys/.../md equivalent.
+# Prints the resolved /sys/dev/block/MAJOR:MINOR path on stdout.
+# Prints nothing and returns 1 when 'ls' yields too few fields to
+# contain the device numbers (e.g. the device does not exist).
+sysname() {
+	local maj min
+	# "set --": with an empty 'ls' result a bare 'set' would dump every
+	# shell variable to stdout instead of clearing the positional
+	# parameters.
+	set -- `ls -lLd -- "$1"`
+	[ $# -ge 6 ] || return 1
+	# For a device node, field 5 is "MAJOR," and field 6 is MINOR.
+	maj=${5%,}
+	min=$6
+	readlink -f "/sys/dev/block/$maj:$min"
+}
+
+# Normalise the command line with getopt(1); it reports bad options itself.
+args=$(getopt -o hcd: -l help,continue,duration: -n mdcheck -- "$@")
+rv=$?
+if [ $rv -ne 0 ]; then exit $rv; fi
+
+# Quote $args so that getopt's own quoting survives word splitting.
+eval set -- "$args"
+
+cont=
+endtime=
+while [ " $1" != " --" ]
+do
+	case $1 in
+	# The short forms are accepted by getopt above, so they must be
+	# matched here as well or they would be silently ignored.
+	-h | --help )
+		echo >&2 'Usage: mdcheck [--continue] [--duration time-offset]'
+		echo >&2 ' time-offset must be understood by "date --date"'
+		exit 0
+		;;
+	-c | --continue ) cont=yes ;;
+	-d | --duration ) shift; dur=$1
+		# Absolute end time, in seconds since the epoch.
+		endtime=$(date --date "$dur" "+%s")
+		;;
+	esac
+	shift
+done
+# Drop the "--" separator.
+shift
+
+# We need a temp file occasionally...
+tmp=/var/lib/mdcheck/.md-check-$$
+trap 'rm -f "$tmp"' 0
+
+
+# firstly, clean out really old state files
+mkdir -p /var/lib/mdcheck
+find /var/lib/mdcheck -name "MD_UUID*" -type f -mtime +180 -exec rm {} \;
+
+# Now look at each md device.
+# Every array on which a check is started is remembered in the pair of
+# variables MD_<n>_fl (state file) and MD_<n>_sys (sysfs directory).
+cnt=0
+for dev in /dev/md?*
+do
+	sys=`sysname "$dev"`
+	if [ ! -f "$sys/md/sync_action" ]
+	then # cannot check this array
+		continue
+	fi
+	if [ "$(cat "$sys/md/sync_action")" != 'idle' ]
+	then # This array is busy
+		continue
+	fi
+
+	# Forget the previous array's UUID so that an export without
+	# MD_UUID cannot silently reuse another array's state file.
+	MD_UUID=
+	mdadm --detail --export "$dev" > "$tmp" || continue
+	source "$tmp"
+	fl="/var/lib/mdcheck/MD_UUID_$MD_UUID"
+	if [ -z "$cont" ]
+	then
+		start=0
+	elif [ -z "$MD_UUID" -o ! -f "$fl" ]
+	then
+		# Nothing to continue here
+		continue
+	else
+		start=`cat "$fl"`
+	fi
+
+	cnt=$((cnt+1))
+	eval MD_${cnt}_fl=\$fl
+	eval MD_${cnt}_sys=\$sys
+	echo $start > "$fl"
+	echo $start > "$sys/md/sync_min"
+	echo check > "$sys/md/sync_action"
+done
+
+# Without --duration no end time was computed, so there is nothing to
+# enforce: leave the checks that were started running and exit.
+if [ -z "$endtime" ]
+then
+ exit 0
+fi
+
+# Until the end time is reached, poll every 120 seconds and record how far
+# each check has got in that array's state file.
+while [ `date +%s` -lt $endtime ]
+do
+ any=
+ # Brace expansion happens before variable expansion, hence the eval.
+ for i in `eval echo {1..$cnt}`
+ do
+ eval fl=\$MD_${i}_fl
+ eval sys=\$MD_${i}_sys
+
+ # An empty MD_<i>_fl marks an entry that is no longer tracked.
+ if [ -z "$fl" ]; then continue; fi
+
+ if [ "`cat $sys/md/sync_action`" != 'check' ]
+ then
+ # No longer in 'check' state: stop tracking it and remove
+ # its state file.
+ eval MD_${i}_fl=
+ rm -f $fl
+ continue;
+ fi
+ # Only the first word of sync_completed is kept as the checkpoint.
+ read a rest < $sys/md/sync_completed
+ echo $a > $fl
+ any=yes
+ done
+ # Nothing is being checked any more, so there is nothing to wait for.
+ if [ -z "$any" ]; then exit 0; fi
+ sleep 120
+done
+
+# We've waited, and there are still checks running.
+# Time to stop them.
+for i in `eval echo {1..$cnt}`
+do
+ eval fl=\$MD_${i}_fl
+ eval sys=\$MD_${i}_sys
+
+ if [ -z "$fl" ]; then continue; fi
+
+ if [ "`cat $sys/md/sync_action`" != 'check' ]
+ then
+ eval MD_${i}_fl=
+ rm -f $fl
+ continue;
+ fi
+ # Interrupt the check, then save sync_min in the state file; a later
+ # --continue run reads that file back as its starting point.
+ echo idle > $sys/md/sync_action
+ cat $sys/md/sync_min > $fl
+done
diff --git a/monitor.c b/monitor.c
index 742aa196..f81e7075 100644
--- a/monitor.c
+++ b/monitor.c
@@ -270,13 +270,6 @@ static int read_and_act(struct active_array *a)
a->info.resync_start
);
- if (a->curr_state > inactive &&
- a->prev_state == inactive) {
- /* array has been started
- * possible that container operation has to be completed
- */
- a->container->ss->set_array_state(a, 0);
- }
if ((a->curr_state == bad_word || a->curr_state <= inactive) &&
a->prev_state > inactive) {
/* array has been stopped */
@@ -428,8 +421,7 @@ static int read_and_act(struct active_array *a)
if (sync_completed > a->last_checkpoint)
a->last_checkpoint = sync_completed;
- if (deactivate || a->curr_state >= clean)
- a->container->ss->sync_metadata(a->container);
+ a->container->ss->sync_metadata(a->container);
dprintf("%s(%d): state:%s action:%s next(", __func__, a->info.container_member,
array_states[a->curr_state], sync_actions[a->curr_action]);
diff --git a/platform-intel.h b/platform-intel.h
index bcd84b7b..8226be35 100644
--- a/platform-intel.h
+++ b/platform-intel.h
@@ -204,6 +204,7 @@ struct sys_dev *find_intel_devices(void);
const struct imsm_orom *find_imsm_capability(enum sys_dev_type hba_id);
const struct imsm_orom *find_imsm_orom(void);
int disk_attached_to_hba(int fd, const char *hba_path);
+int devt_attached_to_hba(dev_t dev, const char *hba_path);
char *devt_to_devpath(dev_t dev);
int path_attached_to_hba(const char *disk_path, const char *hba_path);
const char *get_sys_dev_type(enum sys_dev_type);
diff --git a/policy.c b/policy.c
index b4f39434..ef83621a 100644
--- a/policy.c
+++ b/policy.c
@@ -200,26 +200,25 @@ static char *disk_path(struct mdinfo *disk)
int rv;
by_path = opendir(symlink);
- if (!by_path)
- return NULL;
- prefix_len = strlen(symlink);
-
- while ((ent = readdir(by_path)) != NULL) {
- if (ent->d_type != DT_LNK)
- continue;
- strncpy(symlink + prefix_len,
- ent->d_name,
- sizeof(symlink) - prefix_len);
- if (stat(symlink, &stb) < 0)
- continue;
- if ((stb.st_mode & S_IFMT) != S_IFBLK)
- continue;
- if (stb.st_rdev != makedev(disk->disk.major, disk->disk.minor))
- continue;
+ if (by_path) {
+ prefix_len = strlen(symlink);
+ while ((ent = readdir(by_path)) != NULL) {
+ if (ent->d_type != DT_LNK)
+ continue;
+ strncpy(symlink + prefix_len,
+ ent->d_name,
+ sizeof(symlink) - prefix_len);
+ if (stat(symlink, &stb) < 0)
+ continue;
+ if ((stb.st_mode & S_IFMT) != S_IFBLK)
+ continue;
+ if (stb.st_rdev != makedev(disk->disk.major, disk->disk.minor))
+ continue;
+ closedir(by_path);
+ return xstrdup(ent->d_name);
+ }
closedir(by_path);
- return xstrdup(ent->d_name);
}
- closedir(by_path);
/* A NULL path isn't really acceptable - use the devname.. */
sprintf(symlink, "/sys/dev/block/%d:%d", disk->disk.major, disk->disk.minor);
rv = readlink(symlink, nm, sizeof(nm)-1);
@@ -800,12 +799,12 @@ char *find_rule(struct rule *rule, char *rule_type)
#define UDEV_RULE_FORMAT \
"ACTION==\"add\", SUBSYSTEM==\"block\", " \
"ENV{DEVTYPE}==\"%s\", ENV{ID_PATH}==\"%s\", " \
-"RUN+=\"/sbin/mdadm --incremental $env{DEVNAME}\"\n"
+"RUN+=\"" BINDIR "/mdadm --incremental $env{DEVNAME}\"\n"
#define UDEV_RULE_FORMAT_NOTYPE \
"ACTION==\"add\", SUBSYSTEM==\"block\", " \
"ENV{ID_PATH}==\"%s\", " \
-"RUN+=\"/sbin/mdadm --incremental $env{DEVNAME}\"\n"
+"RUN+=\"" BINDIR "/mdadm --incremental $env{DEVNAME}\"\n"
/* Write rule in the rule file. Use format from UDEV_RULE_FORMAT */
int write_rule(struct rule *rule, int fd, int force_part)
diff --git a/probe_roms.c b/probe_roms.c
index 61297959..b0b08833 100644
--- a/probe_roms.c
+++ b/probe_roms.c
@@ -35,8 +35,6 @@ static const int rom_len = 0xf0000 - 0xc0000; /* option-rom memory region */
static int _sigbus;
static unsigned long rom_align;
-#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
-
static void sigbus(int sig)
{
_sigbus = 1;
diff --git a/raid6check.c b/raid6check.c
index 805d090a..1d8ac40d 100644
--- a/raid6check.c
+++ b/raid6check.c
@@ -27,6 +27,9 @@
#include <signal.h>
#include <sys/mman.h>
+/*
+ * Granularity at which a chunk is examined and repaired:
+ * CHECK_PAGE_SIZE is 1 << CHECK_PAGE_BITS, i.e. 4096.
+ */
+#define CHECK_PAGE_BITS (12)
+#define CHECK_PAGE_SIZE (1 << CHECK_PAGE_BITS)
+
enum repair {
NO_REPAIR = 0,
MANUAL_REPAIR,
@@ -73,15 +76,15 @@ void raid6_collect(int chunk_size, uint8_t *p, uint8_t *q,
}
}
-/* Try to find out if a specific disk has problems */
-int raid6_stats(int *results, int raid_disks, int chunk_size)
+/* Try to find out if a specific disk has problems within one CHECK_PAGE_SIZE page */
+int raid6_stats_blk(int *results, int raid_disks)
{
int i;
int curr_broken_disk = -255;
int prev_broken_disk = -255;
int broken_status = 0;
- for(i = 0; i < chunk_size; i++) {
+ for(i = 0; i < CHECK_PAGE_SIZE; i++) {
if(results[i] != -255)
curr_broken_disk = results[i];
@@ -112,6 +115,16 @@ int raid6_stats(int *results, int raid_disks, int chunk_size)
return curr_broken_disk;
}
+/*
+ * Evaluate a chunk page by page: for each CHECK_PAGE_SIZE-long slice of
+ * results[], store the verdict of raid6_stats_blk() in the next entry of
+ * disk[].
+ */
+void raid6_stats(int *disk, int *results, int raid_disks, int chunk_size)
+{
+	int offset = 0;
+	int page = 0;
+
+	while (offset < chunk_size) {
+		disk[page] = raid6_stats_blk(results + offset, raid_disks);
+		page++;
+		offset += CHECK_PAGE_SIZE;
+	}
+}
+
int lock_stripe(struct mdinfo *info, unsigned long long start,
int chunk_size, int data_disks, sighandler_t *sig) {
int rv;
@@ -143,6 +156,169 @@ int unlock_all_stripes(struct mdinfo *info, sighandler_t *sig) {
return rv * 256;
}
+/*
+ * Autorepair
+ *
+ * For every page j of the stripe whose disk[j] names a slot (>= 0), rebuild
+ * that slot's page inside stripes[]: the Q page is regenerated with
+ * qsyndrome(), any other page by XOR-ing the P page with the remaining data
+ * pages.  The rebuilt pages are then written back to the component devices.
+ *
+ * Returns 0 on success (including when no page needed repair) and -1 if
+ * seeking to, or completely writing, any repaired page fails.
+ */
+int autorepair(int *disk, int diskP, int diskQ, unsigned long long start, int chunk_size,
+	       char *name[], int raid_disks, int data_disks, char **blocks_page,
+	       char **blocks, uint8_t *p, char **stripes, int *block_index_for_slot,
+	       int *source, unsigned long long *offsets)
+{
+	int i, j;
+	int page_to_write[chunk_size >> CHECK_PAGE_BITS];
+
+	/* Pass 1: rebuild the suspect pages in memory. */
+	for (j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) {
+		if (disk[j] < 0) {
+			page_to_write[j] = 0;
+			continue;
+		}
+
+		printf("Auto-repairing slot %d (%s)\n", disk[j], name[disk[j]]);
+		page_to_write[j] = 1;
+
+		/* Point blocks_page[] at page j of every block. */
+		for (i = 0; i < raid_disks; i++) {
+			blocks_page[i] = blocks[i] + j * CHECK_PAGE_SIZE;
+		}
+
+		if (disk[j] == diskQ) {
+			qsyndrome(p, (uint8_t*)stripes[diskQ] + j * CHECK_PAGE_SIZE, (uint8_t**)blocks_page, data_disks, CHECK_PAGE_SIZE);
+		}
+		else {
+			char *all_but_failed_blocks[data_disks];
+			int failed_block_index = block_index_for_slot[disk[j]];
+
+			/* P takes the place of the failed block in the XOR. */
+			for (i = 0; i < data_disks; i++) {
+				if (failed_block_index == i) {
+					all_but_failed_blocks[i] = stripes[diskP] + j * CHECK_PAGE_SIZE;
+				}
+				else {
+					all_but_failed_blocks[i] = blocks_page[i];
+				}
+			}
+			xor_blocks(stripes[disk[j]] + j * CHECK_PAGE_SIZE,
+				   all_but_failed_blocks, data_disks, CHECK_PAGE_SIZE);
+		}
+	}
+
+	/* Pass 2: write the rebuilt pages back. */
+	for (j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) {
+		int slot;
+		off64_t seek_res;
+
+		if (page_to_write[j] != 1) {
+			continue;
+		}
+		slot = disk[j];
+
+		/* Never write if the seek failed: the data would land at the wrong offset. */
+		seek_res = lseek64(source[slot], offsets[slot] + start * chunk_size + j * CHECK_PAGE_SIZE, SEEK_SET);
+		if (seek_res < 0) {
+			fprintf(stderr, "lseek failed for slot %d (%s)\n", slot, name[slot]);
+			return -1;
+		}
+
+		if (write(source[slot], stripes[slot] + j * CHECK_PAGE_SIZE, CHECK_PAGE_SIZE) != CHECK_PAGE_SIZE) {
+			fprintf(stderr, "Failed to write a full chunk.\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Manual repair
+ *
+ * Rebuild the chunks of slots failed_disk1 and failed_disk2 of stripe 'start'
+ * from the other blocks already held in stripes[]/blocks[], then lock the
+ * stripe, write both rebuilt chunks to their component devices and unlock.
+ *
+ * Returns:
+ *    0  both chunks were rebuilt and written completely
+ *   -1  lock_stripe() failed with a code other than 2, or a seek failed;
+ *       nothing is unlocked on this path
+ *   -2  lock_stripe() returned 2, unlock_all_stripes() failed, or a write
+ *       was incomplete
+ */
+int manual_repair(int diskP, int diskQ, int chunk_size, int raid_disks, int data_disks,
+		  int failed_disk1, int failed_disk2, unsigned long long start, int *block_index_for_slot,
+		  char *name[], char **stripes, char **blocks, uint8_t *p, struct mdinfo *info, sighandler_t *sig,
+		  int *source, unsigned long long *offsets)
+{
+	int err = 0;
+	int i;
+	printf("Repairing stripe %llu\n", start);
+	printf("Assuming slots %d (%s) and %d (%s) are incorrect\n",
+	       failed_disk1, name[failed_disk1],
+	       failed_disk2, name[failed_disk2]);
+
+	if (failed_disk1 == diskQ || failed_disk2 == diskQ) {
+		/* Q plus one other block: XOR recovers the other, then Q is recomputed. */
+		char *all_but_failed_blocks[data_disks];
+		int failed_data_or_p;
+		int failed_block_index;
+
+		if (failed_disk1 == diskQ) {
+			failed_data_or_p = failed_disk2;
+		}
+		else {
+			failed_data_or_p = failed_disk1;
+		}
+		printf("Repairing D/P(%d) and Q\n", failed_data_or_p);
+		failed_block_index = block_index_for_slot[failed_data_or_p];
+		for (i = 0; i < data_disks; i++) {
+			if (failed_block_index == i) {
+				all_but_failed_blocks[i] = stripes[diskP];
+			}
+			else {
+				all_but_failed_blocks[i] = blocks[i];
+			}
+		}
+		xor_blocks(stripes[failed_data_or_p],
+			   all_but_failed_blocks, data_disks, chunk_size);
+		qsyndrome(p, (uint8_t*)stripes[diskQ], (uint8_t**)blocks, data_disks, chunk_size);
+	}
+	else {
+		ensure_zero_has_size(chunk_size);
+		if (failed_disk1 == diskP || failed_disk2 == diskP) {
+			int failed_data, failed_block_index;
+			if (failed_disk1 == diskP) {
+				failed_data = failed_disk2;
+			}
+			else {
+				failed_data = failed_disk1;
+			}
+			failed_block_index = block_index_for_slot[failed_data];
+			printf("Repairing D(%d) and P\n", failed_data);
+			raid6_datap_recov(raid_disks, chunk_size, failed_block_index, (uint8_t**)blocks);
+		}
+		else {
+			printf("Repairing D and D\n");
+			int failed_block_index1 = block_index_for_slot[failed_disk1];
+			int failed_block_index2 = block_index_for_slot[failed_disk2];
+			/* The lower block index is passed first. */
+			if (failed_block_index1 > failed_block_index2) {
+				int t = failed_block_index1;
+				failed_block_index1 = failed_block_index2;
+				failed_block_index2 = t;
+			}
+			raid6_2data_recov(raid_disks, chunk_size, failed_block_index1, failed_block_index2, (uint8_t**)blocks);
+		}
+	}
+
+	err = lock_stripe(info, start, chunk_size, data_disks, sig);
+	if(err != 0) {
+		if (err != 2) {
+			return -1;
+		}
+		return -2;
+	}
+
+	int write_res1, write_res2;
+	off64_t seek_res;
+
+	seek_res = lseek64(source[failed_disk1],
+			   offsets[failed_disk1] + start * chunk_size, SEEK_SET);
+	if (seek_res < 0) {
+		fprintf(stderr, "lseek failed for failed_disk1\n");
+		return -1;
+	}
+	write_res1 = write(source[failed_disk1], stripes[failed_disk1], chunk_size);
+
+	seek_res = lseek64(source[failed_disk2],
+			   offsets[failed_disk2] + start * chunk_size, SEEK_SET);
+	if (seek_res < 0) {
+		fprintf(stderr, "lseek failed for failed_disk2\n");
+		return -1;
+	}
+	write_res2 = write(source[failed_disk2], stripes[failed_disk2], chunk_size);
+
+	err = unlock_all_stripes(info, sig);
+	if(err != 0) {
+		return -2;
+	}
+
+	if (write_res1 != chunk_size || write_res2 != chunk_size) {
+		fprintf(stderr, "Failed to write a complete chunk.\n");
+		return -2;
+	}
+
+	return 0;
+}
+
int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets,
int raid_disks, int chunk_size, int level, int layout,
unsigned long long start, unsigned long long length, char *name[],
@@ -152,13 +328,14 @@ int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets,
char *stripe_buf = xmalloc(raid_disks * chunk_size);
char **stripes = xmalloc(raid_disks * sizeof(char*));
char **blocks = xmalloc(raid_disks * sizeof(char*));
+ char **blocks_page = xmalloc(raid_disks * sizeof(char*));
int *block_index_for_slot = xmalloc(raid_disks * sizeof(int));
uint8_t *p = xmalloc(chunk_size);
uint8_t *q = xmalloc(chunk_size);
int *results = xmalloc(chunk_size * sizeof(int));
sighandler_t *sig = xmalloc(3 * sizeof(sighandler_t));
- int i;
+ int i, j;
int diskP, diskQ;
int data_disks = raid_disks - 2;
int err = 0;
@@ -172,9 +349,7 @@ int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets,
stripes[i] = stripe_buf + i * chunk_size;
while (length > 0) {
- int disk;
-
- printf("pos --> %llu\n", start);
+ int disk[chunk_size >> CHECK_PAGE_BITS];
err = lock_stripe(info, start, chunk_size, data_disks, sig);
if(err != 0) {
@@ -199,15 +374,11 @@ int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets,
goto exitCheck;
}
}
- err = unlock_all_stripes(info, sig);
- if(err != 0)
- goto exitCheck;
for (i = 0 ; i < data_disks ; i++) {
int disk = geo_map(i, start, raid_disks, level, layout);
blocks[i] = stripes[disk];
block_index_for_slot[disk] = i;
- printf("%d->%d\n", i, disk);
}
qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size);
@@ -218,145 +389,45 @@ int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets,
blocks[data_disks+1] = stripes[diskQ];
block_index_for_slot[diskQ] = data_disks+1;
- if (memcmp(p, stripes[diskP], chunk_size) != 0) {
- printf("P(%d) wrong at %llu\n", diskP, start);
- }
- if (memcmp(q, stripes[diskQ], chunk_size) != 0) {
- printf("Q(%d) wrong at %llu\n", diskQ, start);
- }
raid6_collect(chunk_size, p, q, stripes[diskP], stripes[diskQ], results);
- disk = raid6_stats(results, raid_disks, chunk_size);
+ raid6_stats(disk, results, raid_disks, chunk_size);
- if(disk >= -2) {
- disk = geo_map(disk, start, raid_disks, level, layout);
- }
- if(disk >= 0) {
- printf("Error detected at %llu: possible failed disk slot: %d --> %s\n",
- start, disk, name[disk]);
- }
- if(disk == -65535) {
- printf("Error detected at %llu: disk slot unknown\n", start);
- }
- if(repair == MANUAL_REPAIR) {
- printf("Repairing stripe %llu\n", start);
- printf("Assuming slots %d (%s) and %d (%s) are incorrect\n",
- failed_disk1, name[failed_disk1],
- failed_disk2, name[failed_disk2]);
-
- if (failed_disk1 == diskQ || failed_disk2 == diskQ) {
- char *all_but_failed_blocks[data_disks];
- int failed_data_or_p;
- int failed_block_index;
-
- if (failed_disk1 == diskQ)
- failed_data_or_p = failed_disk2;
- else
- failed_data_or_p = failed_disk1;
- printf("Repairing D/P(%d) and Q\n", failed_data_or_p);
- failed_block_index = block_index_for_slot[failed_data_or_p];
- for (i=0; i < data_disks; i++)
- if (failed_block_index == i)
- all_but_failed_blocks[i] = stripes[diskP];
- else
- all_but_failed_blocks[i] = blocks[i];
- xor_blocks(stripes[failed_data_or_p],
- all_but_failed_blocks, data_disks, chunk_size);
- qsyndrome(p, (uint8_t*)stripes[diskQ], (uint8_t**)blocks, data_disks, chunk_size);
- } else {
- ensure_zero_has_size(chunk_size);
- if (failed_disk1 == diskP || failed_disk2 == diskP) {
- int failed_data, failed_block_index;
- if (failed_disk1 == diskP)
- failed_data = failed_disk2;
- else
- failed_data = failed_disk1;
- failed_block_index = block_index_for_slot[failed_data];
- printf("Repairing D(%d) and P\n", failed_data);
- raid6_datap_recov(raid_disks, chunk_size, failed_block_index, (uint8_t**)blocks);
- } else {
- printf("Repairing D and D\n");
- int failed_block_index1 = block_index_for_slot[failed_disk1];
- int failed_block_index2 = block_index_for_slot[failed_disk2];
- if (failed_block_index1 > failed_block_index2) {
- int t = failed_block_index1;
- failed_block_index1 = failed_block_index2;
- failed_block_index2 = t;
- }
- raid6_2data_recov(raid_disks, chunk_size, failed_block_index1, failed_block_index2, (uint8_t**)blocks);
- }
+ for(j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) {
+ if(disk[j] >= -2) {
+ disk[j] = geo_map(disk[j], start, raid_disks, level, layout);
}
-
- err = lock_stripe(info, start, chunk_size, data_disks, sig);
- if(err != 0) {
- if (err != 2)
- unlock_all_stripes(info, sig);
- goto exitCheck;
+ if(disk[j] >= 0) {
+ printf("Error detected at stripe %llu, page %d: possible failed disk slot: %d --> %s\n",
+ start, j, disk[j], name[disk[j]]);
}
-
- int write_res1, write_res2;
- off64_t seek_res;
-
- seek_res = lseek64(source[failed_disk1],
- offsets[failed_disk1] + start * chunk_size, SEEK_SET);
- if (seek_res < 0) {
- fprintf(stderr, "lseek failed for failed_disk1\n");
- unlock_all_stripes(info, sig);
- err = -1;
- goto exitCheck;
- }
- write_res1 = write(source[failed_disk1], stripes[failed_disk1], chunk_size);
-
- seek_res = lseek64(source[failed_disk2],
- offsets[failed_disk2] + start * chunk_size, SEEK_SET);
- if (seek_res < 0) {
- fprintf(stderr, "lseek failed for failed_disk1\n");
- unlock_all_stripes(info, sig);
- err = -1;
- goto exitCheck;
- }
- write_res2 = write(source[failed_disk2], stripes[failed_disk2], chunk_size);
-
- err = unlock_all_stripes(info, sig);
- if(err != 0)
- goto exitCheck;
-
- if (write_res1 != chunk_size || write_res2 != chunk_size) {
- fprintf(stderr, "Failed to write a complete chunk.\n");
- goto exitCheck;
- }
-
- } else if (disk >= 0 && repair == AUTO_REPAIR) {
- printf("Auto-repairing slot %d (%s)\n", disk, name[disk]);
- if (disk == diskQ) {
- qsyndrome(p, (uint8_t*)stripes[diskQ], (uint8_t**)blocks, data_disks, chunk_size);
- } else {
- char *all_but_failed_blocks[data_disks];
- int failed_block_index = block_index_for_slot[disk];
- for (i=0; i < data_disks; i++)
- if (failed_block_index == i)
- all_but_failed_blocks[i] = stripes[diskP];
- else
- all_but_failed_blocks[i] = blocks[i];
- xor_blocks(stripes[disk],
- all_but_failed_blocks, data_disks, chunk_size);
+ if(disk[j] == -65535) {
+ printf("Error detected at stripe %llu, page %d: disk slot unknown\n", start, j);
}
+ }
- err = lock_stripe(info, start, chunk_size, data_disks, sig);
+ if(repair == AUTO_REPAIR) {
+ err = autorepair(disk, diskP, diskQ, start, chunk_size,
+ name, raid_disks, data_disks, blocks_page,
+ blocks, p, stripes, block_index_for_slot,
+ source, offsets);
if(err != 0) {
- if (err != 2)
- unlock_all_stripes(info, sig);
+ unlock_all_stripes(info, sig);
goto exitCheck;
}
+ }
- lseek64(source[disk], offsets[disk] + start * chunk_size, 0);
- int write_res = write(source[disk], stripes[disk], chunk_size);
-
- err = unlock_all_stripes(info, sig);
- if(err != 0 || write_res != chunk_size)
- goto exitCheck;
+ err = unlock_all_stripes(info, sig);
+ if(err != 0) {
+ goto exitCheck;
+ }
- if (write_res != chunk_size) {
- fprintf(stderr, "Failed to write a full chunk.\n");
+ if(repair == MANUAL_REPAIR) {
+ err = manual_repair(diskP, diskQ, chunk_size, raid_disks, data_disks,
+ failed_disk1, failed_disk2, start, block_index_for_slot,
+ name, stripes, blocks, p, info, sig,
+ source, offsets);
+ if(err == -1) {
+ unlock_all_stripes(info, sig);
goto exitCheck;
}
}
@@ -370,6 +441,7 @@ exitCheck:
free(stripe_buf);
free(stripes);
free(blocks);
+ free(blocks_page);
free(block_index_for_slot);
free(p);
free(q);
@@ -554,7 +626,7 @@ int main(int argc, char *argv[])
if(disk_slot >= 0) {
disk_name[disk_slot] = map_dev(comp->disk.major, comp->disk.minor, 0);
offsets[disk_slot] = comp->data_offset * 512;
- fds[disk_slot] = open(disk_name[disk_slot], O_RDWR);
+ fds[disk_slot] = open(disk_name[disk_slot], O_RDWR | O_SYNC);
if (fds[disk_slot] < 0) {
perror(disk_name[disk_slot]);
fprintf(stderr,"%s: cannot open %s\n", prg, disk_name[disk_slot]);
diff --git a/super-ddf.c b/super-ddf.c
index 8bba70a3..bc0ce2c0 100644
--- a/super-ddf.c
+++ b/super-ddf.c
@@ -1,7 +1,7 @@
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2006-2014 Neil Brown <neilb@suse.de>
*
*
* This program is free software; you can redistribute it and/or modify
@@ -21,7 +21,7 @@
* Author: Neil Brown
* Email: <neil@brown.name>
*
- * Specifications for DDF takes from Common RAID DDF Specification Revision 1.2
+ * Specifications for DDF taken from Common RAID DDF Specification Revision 1.2
* (July 28 2006). Reused by permission of SNIA.
*/
@@ -30,6 +30,7 @@
#include "mdmon.h"
#include "sha1.h"
#include <values.h>
+#include <stddef.h>
/* a non-official T10 name for creation GUIDs */
static char T10[] = "Linux-MD";
@@ -229,7 +230,8 @@ struct ddf_controller_data {
struct phys_disk {
be32 magic; /* DDF_PHYS_RECORDS_MAGIC */
be32 crc;
- be16 used_pdes;
+ be16 used_pdes; /* This is a counter, not a max - the list
+ * of used entries may not be dense */
be16 max_pdes;
__u8 pad[52];
struct phys_disk_entry {
@@ -237,8 +239,10 @@ struct phys_disk {
be32 refnum;
be16 type;
be16 state;
- be64 config_size; /* DDF structures must be after here */
- char path[18]; /* another horrible structure really */
+ be64 config_size; /* DDF structures must be after here */
+ char path[18]; /* Another horrible structure really
+ * but is "used for information
+ * purposes only" */
__u8 pad[6];
} entries[0];
};
@@ -340,7 +344,10 @@ struct vd_config {
* for concat I hope) */
be64 array_blocks; /* blocks in array */
__u8 pad1[8];
- be32 spare_refs[8];
+ be32 spare_refs[8]; /* This is used to detect missing spares.
+ * As we don't have an interface for that
+ * the values are ignored.
+ */
__u8 cache_pol[8];
__u8 bg_rate;
__u8 pad2[3];
@@ -432,28 +439,34 @@ struct bad_block_log {
* and reconstructed for writing. This means that we only need
* to make config changes once and they are automatically
* propagated to all devices.
- * Note that the ddf_super has space of the conf and disk data
- * for this disk and also for a list of all such data.
- * The list is only used for the superblock that is being
- * built in Create or Assemble to describe the whole array.
+ * The global (config and disk data) records are each in a list
+ * of separate data structures. When writing we find the entry
+ * or entries applicable to the particular device.
*/
struct ddf_super {
- struct ddf_header anchor, primary, secondary;
+ struct ddf_header anchor, primary, secondary;
struct ddf_controller_data controller;
- struct ddf_header *active;
+ struct ddf_header *active;
struct phys_disk *phys;
struct virtual_disk *virt;
char *conf;
- int pdsize, vdsize;
- unsigned int max_part, mppe, conf_rec_len;
- int currentdev;
- int updates_pending;
+ int pdsize, vdsize;
+ unsigned int max_part, mppe, conf_rec_len;
+ int currentdev;
+ int updates_pending;
struct vcl {
union {
char space[512];
struct {
struct vcl *next;
unsigned int vcnum; /* index into ->virt */
+ /* For an array with a secondary level there are
+ * multiple vd_config structures, all with the same
+ * guid but with different sec_elmnt_seq.
+ * One of these structures is in 'conf' below.
+ * The others are in other_bvds, not in any
+ * particular order.
+ */
struct vd_config **other_bvds;
__u64 *block_sizes; /* NULL if all the same */
};
@@ -479,6 +492,7 @@ struct ddf_super {
/* These fields used by auto-layout */
int raiddisk; /* slot to fill in autolayout */
__u64 esize;
+ int displayed;
};
};
struct disk_data disk;
@@ -486,12 +500,44 @@ struct ddf_super {
} *dlist, *add_list;
};
-#ifndef offsetof
-#define offsetof(t,f) ((size_t)&(((t*)0)->f))
+#ifndef MDASSEMBLE
+static int load_super_ddf_all(struct supertype *st, int fd,
+ void **sbp, char *devname);
+static int get_svd_state(const struct ddf_super *, const struct vcl *);
+static int
+validate_geometry_ddf_container(struct supertype *st,
+ int level, int layout, int raiddisks,
+ int chunk, unsigned long long size,
+ unsigned long long data_offset,
+ char *dev, unsigned long long *freesize,
+ int verbose);
+
+static int validate_geometry_ddf_bvd(struct supertype *st,
+ int level, int layout, int raiddisks,
+ int *chunk, unsigned long long size,
+ unsigned long long data_offset,
+ char *dev, unsigned long long *freesize,
+ int verbose);
#endif
-#if DEBUG
+static void free_super_ddf(struct supertype *st);
static int all_ff(const char *guid);
+static unsigned int get_pd_index_from_refnum(const struct vcl *vc,
+ be32 refnum, unsigned int nmax,
+ const struct vd_config **bvd,
+ unsigned int *idx);
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map);
+static void uuid_from_ddf_guid(const char *guid, int uuid[4]);
+static void uuid_from_super_ddf(struct supertype *st, int uuid[4]);
+static void _ddf_array_name(char *name, const struct ddf_super *ddf, int i);
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map);
+static int init_super_ddf_bvd(struct supertype *st,
+ mdu_array_info_t *info,
+ unsigned long long size,
+ char *name, char *homehost,
+ int *uuid, unsigned long long data_offset);
+
+#if DEBUG
static void pr_state(struct ddf_super *ddf, const char *msg)
{
unsigned int i;
@@ -509,19 +555,21 @@ static void pr_state(struct ddf_super *ddf, const char *msg)
static void pr_state(const struct ddf_super *ddf, const char *msg) {}
#endif
-static void _ddf_set_updates_pending(struct ddf_super *ddf, const char *func)
+static void _ddf_set_updates_pending(struct ddf_super *ddf, struct vd_config *vc,
+ const char *func)
{
+ if (vc) {
+ vc->timestamp = cpu_to_be32(time(0)-DECADE);
+ vc->seqnum = cpu_to_be32(be32_to_cpu(vc->seqnum) + 1);
+ }
+ if (ddf->updates_pending)
+ return;
ddf->updates_pending = 1;
ddf->active->seq = cpu_to_be32((be32_to_cpu(ddf->active->seq)+1));
pr_state(ddf, func);
}
-#define ddf_set_updates_pending(x) _ddf_set_updates_pending((x), __func__)
-
-static unsigned int get_pd_index_from_refnum(const struct vcl *vc,
- be32 refnum, unsigned int nmax,
- const struct vd_config **bvd,
- unsigned int *idx);
+#define ddf_set_updates_pending(x,v) _ddf_set_updates_pending((x), (v), __func__)
static be32 calc_crc(void *buf, int len)
{
@@ -533,7 +581,7 @@ static be32 calc_crc(void *buf, int len)
newcrc = crc32(0, buf, len);
ddf->crc = oldcrc;
- /* The crc is store (like everything) bigendian, so convert
+ /* The crc is stored (like everything) bigendian, so convert
* here for simplicity
*/
return cpu_to_be32(newcrc);
@@ -622,15 +670,23 @@ static int layout_md2ddf(const mdu_array_info_t *array,
rlq = DDF_RAID1_SIMPLE;
prim_elmnt_count = cpu_to_be16(2);
sec_elmnt_count = array->raid_disks / 2;
+ srl = DDF_2SPANNED;
+ prl = DDF_RAID1;
} else if (array->raid_disks % 3 == 0
&& array->layout == 0x103) {
rlq = DDF_RAID1_MULTI;
prim_elmnt_count = cpu_to_be16(3);
sec_elmnt_count = array->raid_disks / 3;
+ srl = DDF_2SPANNED;
+ prl = DDF_RAID1;
+ } else if (array->layout == 0x201) {
+ prl = DDF_RAID1E;
+ rlq = DDF_RAID1E_OFFSET;
+ } else if (array->layout == 0x102) {
+ prl = DDF_RAID1E;
+ rlq = DDF_RAID1E_ADJACENT;
} else
return err_bad_md_layout(array);
- srl = DDF_2SPANNED;
- prl = DDF_RAID1;
break;
default:
return err_bad_md_layout(array);
@@ -691,6 +747,15 @@ static int layout_ddf2md(const struct vd_config *conf,
return err_bad_ddf_layout(conf);
level = 1;
break;
+ case DDF_RAID1E:
+ if (conf->rlq == DDF_RAID1E_ADJACENT)
+ layout = 0x102;
+ else if (conf->rlq == DDF_RAID1E_OFFSET)
+ layout = 0x201;
+ else
+ return err_bad_ddf_layout(conf);
+ level = 10;
+ break;
case DDF_RAID4:
if (conf->rlq != DDF_RAID4_N)
return err_bad_ddf_layout(conf);
@@ -880,7 +945,8 @@ static int load_ddf_headers(int fd, struct ddf_super *super, char *devname)
super->primary.openflag && !super->secondary.openflag)
)
super->active = &super->secondary;
- } else if (devname)
+ } else if (devname &&
+ be64_to_cpu(super->anchor.secondary_lba) != ~(__u64)0)
pr_err("Failed to load secondary DDF header on %s\n",
devname);
if (super->active == NULL)
@@ -981,16 +1047,16 @@ static int load_ddf_local(int fd, struct ddf_super *super,
unsigned int i;
unsigned int confsec;
int vnum;
- unsigned int max_virt_disks = be16_to_cpu
- (super->active->max_vd_entries);
+ unsigned int max_virt_disks =
+ be16_to_cpu(super->active->max_vd_entries);
unsigned long long dsize;
/* First the local disk info */
if (posix_memalign((void**)&dl, 512,
- sizeof(*dl) +
- (super->max_part) * sizeof(dl->vlist[0])) != 0) {
+ sizeof(*dl) +
+ (super->max_part) * sizeof(dl->vlist[0])) != 0) {
pr_err("%s could not allocate disk info buffer\n",
- __func__);
+ __func__);
return 1;
}
@@ -1049,7 +1115,7 @@ static int load_ddf_local(int fd, struct ddf_super *super,
if (dl->spare)
continue;
if (posix_memalign((void**)&dl->spare, 512,
- super->conf_rec_len*512) != 0) {
+ super->conf_rec_len*512) != 0) {
pr_err("%s could not allocate spare info buf\n",
__func__);
return 1;
@@ -1059,7 +1125,9 @@ static int load_ddf_local(int fd, struct ddf_super *super,
continue;
}
if (!be32_eq(vd->magic, DDF_VD_CONF_MAGIC))
+ /* Must be vendor-unique - I cannot handle those */
continue;
+
for (vcl = super->conflist; vcl; vcl = vcl->next) {
if (memcmp(vcl->conf.guid,
vd->guid, DDF_GUID_LEN) == 0)
@@ -1078,8 +1146,8 @@ static int load_ddf_local(int fd, struct ddf_super *super,
continue;
} else {
if (posix_memalign((void**)&vcl, 512,
- (super->conf_rec_len*512 +
- offsetof(struct vcl, conf))) != 0) {
+ (super->conf_rec_len*512 +
+ offsetof(struct vcl, conf))) != 0) {
pr_err("%s could not allocate vcl buf\n",
__func__);
return 1;
@@ -1108,13 +1176,6 @@ static int load_ddf_local(int fd, struct ddf_super *super,
return 0;
}
-#ifndef MDASSEMBLE
-static int load_super_ddf_all(struct supertype *st, int fd,
- void **sbp, char *devname);
-#endif
-
-static void free_super_ddf(struct supertype *st);
-
static int load_super_ddf(struct supertype *st, int fd,
char *devname)
{
@@ -1125,7 +1186,7 @@ static int load_super_ddf(struct supertype *st, int fd,
if (get_dev_size(fd, devname, &dsize) == 0)
return 1;
- if (!st->ignore_hw_compat && test_partition(fd))
+ if (test_partition(fd))
/* DDF is not allowed on partitions */
return 1;
@@ -1239,7 +1300,7 @@ static void free_super_ddf(struct supertype *st)
static struct supertype *match_metadata_desc_ddf(char *arg)
{
- /* 'ddf' only support containers */
+ /* 'ddf' only supports containers */
struct supertype *st;
if (strcmp(arg, "ddf") != 0 &&
strcmp(arg, "default") != 0
@@ -1387,7 +1448,7 @@ static void examine_vd(int n, struct ddf_super *sb, char *guid)
be16_to_cpu(vc->prim_elmnt_count));
for (i = 0; i < be16_to_cpu(vc->prim_elmnt_count); i++) {
int j;
- int cnt = be16_to_cpu(sb->phys->used_pdes);
+ int cnt = be16_to_cpu(sb->phys->max_pdes);
for (j=0; j<cnt; j++)
if (be32_eq(vc->phys_refnum[i],
sb->phys->entries[j].refnum))
@@ -1397,6 +1458,7 @@ static void examine_vd(int n, struct ddf_super *sb, char *guid)
printf("%d", j);
else
printf("--");
+ printf("@%lluK", (unsigned long long) be64_to_cpu(LBA_OFFSET(sb, vc)[i])/2);
}
printf(")\n");
if (vc->chunk_shift != 255)
@@ -1447,17 +1509,24 @@ static void examine_vds(struct ddf_super *sb)
static void examine_pds(struct ddf_super *sb)
{
- int cnt = be16_to_cpu(sb->phys->used_pdes);
+ int cnt = be16_to_cpu(sb->phys->max_pdes);
int i;
struct dl *dl;
+ int unlisted = 0;
printf(" Physical Disks : %d\n", cnt);
printf(" Number RefNo Size Device Type/State\n");
+ for (dl = sb->dlist; dl; dl = dl->next)
+ dl->displayed = 0;
+
for (i=0 ; i<cnt ; i++) {
struct phys_disk_entry *pd = &sb->phys->entries[i];
int type = be16_to_cpu(pd->type);
int state = be16_to_cpu(pd->state);
+ if (be32_to_cpu(pd->refnum) == 0xffffffff)
+ /* Not in use */
+ continue;
//printf(" PD GUID[%d] : ", i); print_guid(pd->guid, 0);
//printf("\n");
printf(" %3d %08x ", i,
@@ -1475,6 +1544,8 @@ static void examine_pds(struct ddf_super *sb)
}
if (!dl)
printf("%15s","");
+ else
+ dl->displayed = 1;
printf(" %s%s%s%s%s",
(type&2) ? "active":"",
(type&4) ? "Global-Spare":"",
@@ -1494,6 +1565,19 @@ static void examine_pds(struct ddf_super *sb)
(state&64)? ", Missing" : "");
printf("\n");
}
+ for (dl = sb->dlist; dl; dl = dl->next) {
+ char *dv;
+ if (dl->displayed)
+ continue;
+ if (!unlisted)
+ printf(" Physical disks not in metadata!:\n");
+ unlisted = 1;
+ dv = map_dev(dl->major, dl->minor, 0);
+ printf(" %08x %s\n", be32_to_cpu(dl->disk.refnum),
+ dv ? dv : "-unknown-");
+ }
+ if (unlisted)
+ printf("\n");
}
static void examine_super_ddf(struct supertype *st, char *homehost)
@@ -1507,18 +1591,13 @@ static void examine_super_ddf(struct supertype *st, char *homehost)
printf(" Container GUID : "); print_guid(sb->anchor.guid, 1);
printf("\n");
printf(" Seq : %08x\n", be32_to_cpu(sb->active->seq));
- printf(" Redundant hdr : %s\n", be32_eq(sb->secondary.magic,
+ printf(" Redundant hdr : %s\n", (be32_eq(sb->secondary.magic,
DDF_HEADER_MAGIC)
- ?"yes" : "no");
+ ?"yes" : "no"));
examine_vds(sb);
examine_pds(sb);
}
-static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map);
-
-static void uuid_from_ddf_guid(const char *guid, int uuid[4]);
-static void uuid_from_super_ddf(struct supertype *st, int uuid[4]);
-
static unsigned int get_vd_num_of_subarray(struct supertype *st)
{
/*
@@ -1564,7 +1643,8 @@ static void brief_examine_super_ddf(struct supertype *st, int verbose)
static void brief_examine_subarrays_ddf(struct supertype *st, int verbose)
{
- /* We just write a generic DDF ARRAY entry
+ /* We write a DDF ARRAY member entry for each vd, identifying container
+ * by uuid and member by unit number and uuid.
*/
struct ddf_super *ddf = st->sb;
struct mdinfo info;
@@ -1577,13 +1657,17 @@ static void brief_examine_subarrays_ddf(struct supertype *st, int verbose)
struct virtual_entry *ve = &ddf->virt->entries[i];
struct vcl vcl;
char nbuf1[64];
+ char namebuf[17];
if (all_ff(ve->guid))
continue;
memcpy(vcl.conf.guid, ve->guid, DDF_GUID_LEN);
ddf->currentconf =&vcl;
+ vcl.vcnum = i;
uuid_from_super_ddf(st, info.uuid);
fname_from_uuid(st, &info, nbuf1, ':');
- printf("ARRAY container=%s member=%d UUID=%s\n",
+ _ddf_array_name(namebuf, ddf, i);
+ printf("ARRAY%s%s container=%s member=%d UUID=%s\n",
+ namebuf[0] == '\0' ? "" : " /dev/md/", namebuf,
nbuf+5, i, nbuf1+5);
}
}
@@ -1614,7 +1698,7 @@ static int copy_metadata_ddf(struct supertype *st, int from, int to)
* So it is easiest to find the earliest of primary and
* secondary, and copy everything from there.
*
- * Anchor is 512 from end It contains primary_lba and secondary_lba
+ * Anchor is 512 from end. It contains primary_lba and secondary_lba
* we choose one of those
*/
@@ -1665,14 +1749,63 @@ err:
static void detail_super_ddf(struct supertype *st, char *homehost)
{
- /* FIXME later
- * Could print DDF GUID
- * Need to find which array
- * If whole, briefly list all arrays
- * If one, give name
+ /* Print container-level details taken from st->sb: the GUID stored
+ * in the anchor header, the sequence number of the active header,
+ * and the populated_vdes count of the virtual disk records.
+ * homehost is not used here.
+ */
+ struct ddf_super *sb = st->sb;
+ int cnt = be16_to_cpu(sb->virt->populated_vdes);
+
+ printf(" Container GUID : "); print_guid(sb->anchor.guid, 1);
+ printf("\n");
+ printf(" Seq : %08x\n", be32_to_cpu(sb->active->seq));
+ printf(" Virtual Disks : %d\n", cnt);
+ printf("\n");
+}
+#endif
+
+/* Controller GUID prefixes for which the virtual disk GUID is not used
+ * directly as the array UUID; volume_id_is_reliable() compares the
+ * first 8 bytes of the controller GUID against each entry.
+ * NOTE(review): because 8 bytes are compared, every entry presumably
+ * has to be at least 8 characters long (space padded) - confirm that
+ * the padding of each literal is intact.
+ */
+static const char *vendors_with_variable_volume_UUID[] = {
+ "LSI ",
+};
+
+/*
+ * Return 0 if the first 8 bytes of the controller GUID match any entry
+ * of vendors_with_variable_volume_UUID, and 1 otherwise.
+ */
+static int volume_id_is_reliable(const struct ddf_super *ddf)
+{
+	unsigned int v;
+
+	for (v = 0; v < ARRAY_SIZE(vendors_with_variable_volume_UUID); v++) {
+		const char *prefix = vendors_with_variable_volume_UUID[v];
+
+		if (memcmp(ddf->controller.guid, prefix, 8) == 0)
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Fill uuid[4] with the identifier of virtual disk 'vcnum' of 'ddf'.
+ *
+ * If volume_id_is_reliable() accepts the controller, the UUID is derived
+ * from the virtual disk entry's GUID via uuid_from_ddf_guid().  Otherwise
+ * a pseudo-UUID is built from the first 16 bytes of the SHA1 digest of
+ * (anchor GUID, 16-byte VD name, vcnum truncated to 16 bits).
+ * vcnum is not range-checked here; callers must pass a valid index.
+ */
+static void uuid_of_ddf_subarray(const struct ddf_super *ddf,
+				 unsigned int vcnum, int uuid[4])
+{
+	char buf[DDF_GUID_LEN+18], sha[20], *p;
+	struct sha1_ctx ctx;
+	__u16 vcnum16 = vcnum;
+
+	if (volume_id_is_reliable(ddf)) {
+		uuid_from_ddf_guid(ddf->virt->entries[vcnum].guid, uuid);
+		return;
+	}
+	/*
+	 * Some fake RAID BIOSes (in particular, LSI ones) change the
+	 * VD GUID at every boot. These GUIDs are not suitable for
+	 * identifying an array. Luckily the header GUID appears to
+	 * remain constant.
+	 * We construct a pseudo-UUID from the header GUID and those
+	 * properties of the subarray that we expect to remain constant.
*/
+	memset(buf, 0, sizeof(buf));
+	p = buf;
+	memcpy(p, ddf->anchor.guid, DDF_GUID_LEN);
+	p += DDF_GUID_LEN;
+	memcpy(p, ddf->virt->entries[vcnum].name, 16);
+	p += 16;
+	/*
+	 * Copy the 16-bit index with memcpy: buf is a char array with no
+	 * alignment guarantee, so storing through a __u16 pointer would be
+	 * a misaligned, type-punned access.  The bytes are still written
+	 * in host byte order, exactly as before.
+	 */
+	memcpy(p, &vcnum16, sizeof(vcnum16));
+	sha1_init_ctx(&ctx);
+	sha1_process_bytes(buf, sizeof(buf), &ctx);
+	sha1_finish_ctx(&ctx, sha);
+	memcpy(uuid, sha, 4*4);
}
+#ifndef MDASSEMBLE
static void brief_detail_super_ddf(struct supertype *st)
{
struct mdinfo info;
@@ -1684,7 +1817,7 @@ static void brief_detail_super_ddf(struct supertype *st)
else if (vcnum == DDF_NOTFOUND)
return;
else
- uuid_from_ddf_guid(ddf->virt->entries[vcnum].guid, info.uuid);
+ uuid_of_ddf_subarray(ddf, vcnum, info.uuid);
fname_from_uuid(st, &info, nbuf,':');
printf(" UUID=%s", nbuf + 5);
}
@@ -1694,7 +1827,8 @@ static int match_home_ddf(struct supertype *st, char *homehost)
{
/* It matches 'this' host if the controller is a
* Linux-MD controller with vendor_data matching
- * the hostname
+ * the hostname. It would be nice if we could
+ * test against controller found in /sys or somewhere...
*/
struct ddf_super *ddf = st->sb;
unsigned int len;
@@ -1715,11 +1849,14 @@ static int find_index_in_bvd(const struct ddf_super *ddf,
unsigned int *n_bvd)
{
/*
- * Find the index of the n-th valid physical disk in this BVD
+ * Find the index of the n-th valid physical disk in this BVD.
+ * Unused entries can be sprinkled in with the used entries,
+ * but don't count.
*/
unsigned int i, j;
- for (i = 0, j = 0; i < ddf->mppe &&
- j < be16_to_cpu(conf->prim_elmnt_count); i++) {
+ for (i = 0, j = 0;
+ i < ddf->mppe && j < be16_to_cpu(conf->prim_elmnt_count);
+ i++) {
if (be32_to_cpu(conf->phys_refnum[i]) != 0xffffffff) {
if (n == j) {
*n_bvd = i;
@@ -1733,6 +1870,13 @@ static int find_index_in_bvd(const struct ddf_super *ddf,
return 0;
}
+/* Given a member array instance number, and a raid disk within that instance,
+ * find the vd_config structure. The offset of the given disk in the phys_refnum
+ * table is returned in n_bvd.
+ * For two-level members with a secondary raid level the vd_config for
+ * the appropriate BVD is returned.
+ * The return value is always &vcl->conf, where vcl is returned via the last pointer argument.
+ */
static struct vd_config *find_vdcr(struct ddf_super *ddf, unsigned int inst,
unsigned int n,
unsigned int *n_bvd, struct vcl **vcl)
@@ -1827,17 +1971,13 @@ static void uuid_from_super_ddf(struct supertype *st, int uuid[4])
*/
struct ddf_super *ddf = st->sb;
struct vcl *vcl = ddf->currentconf;
- char *guid;
if (vcl)
- guid = vcl->conf.guid;
+ uuid_of_ddf_subarray(ddf, vcl->vcnum, uuid);
else
- guid = ddf->anchor.guid;
- uuid_from_ddf_guid(guid, uuid);
+ uuid_from_ddf_guid(ddf->anchor.guid, uuid);
}
-static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map);
-
static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map)
{
struct ddf_super *ddf = st->sb;
@@ -1857,13 +1997,13 @@ static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *m
cptr = (__u32 *)(ddf->anchor.guid + 16);
info->array.ctime = DECADE + __be32_to_cpu(*cptr);
- info->array.utime = 0;
info->array.chunk_size = 0;
info->container_enough = 1;
- info->disk.major = 0;
- info->disk.minor = 0;
+ info->disk.major = 0;
+ info->disk.minor = 0;
if (ddf->dlist) {
+ struct phys_disk_entry *pde = NULL;
info->disk.number = be32_to_cpu(ddf->dlist->disk.refnum);
info->disk.raid_disk = find_phys(ddf, ddf->dlist->disk.refnum);
@@ -1871,12 +2011,23 @@ static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *m
entries[info->disk.raid_disk].
config_size);
info->component_size = ddf->dlist->size - info->data_offset;
+ if (info->disk.raid_disk >= 0)
+ pde = ddf->phys->entries + info->disk.raid_disk;
+ if (pde &&
+ !(be16_to_cpu(pde->state) & DDF_Failed) &&
+ !(be16_to_cpu(pde->state) & DDF_Missing))
+ info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
+ else
+ info->disk.state = 1 << MD_DISK_FAULTY;
+
} else {
+ /* There should always be a dlist, but just in case...*/
info->disk.number = -1;
info->disk.raid_disk = -1;
-// info->disk.raid_disk = find refnum in the table and use index;
+ info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
}
- info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
+ info->events = be32_to_cpu(ddf->active->seq);
+ info->array.utime = DECADE + be32_to_cpu(ddf->active->timestamp);
info->recovery_start = MaxSector;
info->reshape_active = 0;
@@ -1891,12 +2042,14 @@ static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *m
uuid_from_super_ddf(st, info->uuid);
if (map) {
- int i;
- for (i = 0 ; i < map_disks; i++) {
- if (i < info->array.raid_disks &&
- (be16_to_cpu(ddf->phys->entries[i].state)
- & DDF_Online) &&
- !(be16_to_cpu(ddf->phys->entries[i].state)
+ int i, e = 0;
+ int max = be16_to_cpu(ddf->phys->max_pdes);
+ for (i = e = 0 ; i < map_disks ; i++, e++) {
+ while (e < max &&
+ be32_to_cpu(ddf->phys->entries[e].refnum) == 0xffffffff)
+ e++;
+ if (i < info->array.raid_disks && e < max &&
+ !(be16_to_cpu(ddf->phys->entries[e].state)
& DDF_Failed))
map[i] = 1;
else
@@ -1905,6 +2058,17 @@ static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *m
}
}
+/*
+ * Copy the 16-byte name of virtual disk entry 'i' into 'name' and
+ * NUL-terminate it.  Every space within those 16 bytes is replaced by
+ * a NUL, so the resulting C string ends at the first space.
+ * 'name' must have room for at least 17 bytes.
+ */
+static void _ddf_array_name(char *name, const struct ddf_super *ddf, int i)
+{
+	char *end = name + 16;
+	char *p;
+
+	memcpy(name, ddf->virt->entries[i].name, 16);
+	*end = 0;
+	for (p = name; p < end; p++) {
+		if (*p == ' ')
+			*p = 0;
+	}
+}
+
static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map)
{
struct ddf_super *ddf = st->sb;
@@ -1912,7 +2076,7 @@ static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, cha
int cd = ddf->currentdev;
int n_prim;
int j;
- struct dl *dl;
+ struct dl *dl = NULL;
int map_disks = info->array.raid_disks;
__u32 *cptr;
struct vd_config *conf;
@@ -1925,7 +2089,7 @@ static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, cha
info->array.ctime = DECADE + __be32_to_cpu(*cptr);
info->array.utime = DECADE + be32_to_cpu(vc->conf.timestamp);
info->array.chunk_size = 512 << vc->conf.chunk_shift;
- info->custom_array_size = 0;
+ info->custom_array_size = be64_to_cpu(vc->conf.array_blocks);
conf = &vc->conf;
n_prim = be16_to_cpu(conf->prim_elmnt_count);
@@ -1942,22 +2106,27 @@ static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, cha
info->component_size = vc->block_sizes[cd];
else
info->component_size = be64_to_cpu(conf->blocks);
- }
- for (dl = ddf->dlist; dl ; dl = dl->next)
- if (be32_eq(dl->disk.refnum, conf->phys_refnum[cd]))
- break;
+ for (dl = ddf->dlist; dl ; dl = dl->next)
+ if (be32_eq(dl->disk.refnum, conf->phys_refnum[cd]))
+ break;
+ }
info->disk.major = 0;
info->disk.minor = 0;
info->disk.state = 0;
- if (dl) {
+ if (dl && dl->pdnum >= 0) {
info->disk.major = dl->major;
info->disk.minor = dl->minor;
info->disk.raid_disk = cd + conf->sec_elmnt_seq
* be16_to_cpu(conf->prim_elmnt_count);
info->disk.number = dl->pdnum;
- info->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE);
+ info->disk.state = 0;
+ if (info->disk.number >= 0 &&
+ (be16_to_cpu(ddf->phys->entries[info->disk.number].state) & DDF_Online) &&
+ !(be16_to_cpu(ddf->phys->entries[info->disk.number].state) & DDF_Failed))
+ info->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE);
+ info->events = be32_to_cpu(ddf->active->seq);
}
info->container_member = ddf->currentconf->vcnum;
@@ -1982,11 +2151,7 @@ static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, cha
info->container_member);
info->safe_mode_delay = DDF_SAFE_MODE_DELAY;
- memcpy(info->name, ddf->virt->entries[info->container_member].name, 16);
- info->name[16]=0;
- for(j=0; j<16; j++)
- if (info->name[j] == ' ')
- info->name[j] = 0;
+ _ddf_array_name(info->name, ddf, info->container_member);
if (map)
for (j = 0; j < map_disks; j++) {
@@ -2038,7 +2203,7 @@ static int update_super_ddf(struct supertype *st, struct mdinfo *info,
// struct virtual_entry *ve = find_ve(ddf);
/* we don't need to handle "force-*" or "assemble" as
- * there is no need to 'trick' the kernel. We the metadata is
+ * there is no need to 'trick' the kernel. When the metadata is
* first updated to activate the array, all the implied modifications
* will just happen.
*/
@@ -2133,12 +2298,6 @@ static unsigned int find_vde_by_guid(const struct ddf_super *ddf,
}
#endif
-static int init_super_ddf_bvd(struct supertype *st,
- mdu_array_info_t *info,
- unsigned long long size,
- char *name, char *homehost,
- int *uuid, unsigned long long data_offset);
-
static int init_super_ddf(struct supertype *st,
mdu_array_info_t *info,
unsigned long long size, char *name, char *homehost,
@@ -2155,12 +2314,12 @@ static int init_super_ddf(struct supertype *st,
* We need to create the entire 'ddf' structure which includes:
* DDF headers - these are easy.
* Controller data - a Sector describing this controller .. not that
- * this is a controller exactly.
+ * this is a controller exactly.
* Physical Disk Record - one entry per device, so
- * leave plenty of space.
+ * leave plenty of space.
* Virtual Disk Records - again, just leave plenty of space.
- * This just lists VDs, doesn't give details
- * Config records - describes the VDs that use this disk
+ * This just lists VDs, doesn't give details.
+ * Config records - describe the VDs that use this disk
* DiskData - describes 'this' device.
* BadBlockManagement - empty
* Diag Space - empty
@@ -2178,11 +2337,6 @@ static int init_super_ddf(struct supertype *st,
struct phys_disk *pd;
struct virtual_disk *vd;
- if (data_offset != INVALID_SECTORS) {
- pr_err("data-offset not supported by DDF\n");
- return 0;
- }
-
if (st->sb)
return init_super_ddf_bvd(st, info, size, name, homehost, uuid,
data_offset);
@@ -2192,8 +2346,6 @@ static int init_super_ddf(struct supertype *st,
return 0;
}
memset(ddf, 0, sizeof(*ddf));
- ddf->dlist = NULL; /* no physical disks yet */
- ddf->conflist = NULL; /* No virtual disks yet */
st->sb = ddf;
if (info == NULL) {
@@ -2205,8 +2357,7 @@ static int init_super_ddf(struct supertype *st,
* start 32MB from the end, and put the primary header there.
* Don't do secondary for now.
* We don't know exactly where that will be yet as it could be
- * different on each device. To just set up the lengths.
- *
+ * different on each device. So just set up the lengths.
*/
ddf->anchor.magic = DDF_HEADER_MAGIC;
@@ -2228,18 +2379,18 @@ static int init_super_ddf(struct supertype *st,
ddf->anchor.workspace_len = cpu_to_be32(32768); /* Must be reserved */
/* Put this at bottom of 32M reserved.. */
ddf->anchor.workspace_lba = cpu_to_be64(~(__u64)0);
- max_phys_disks = 1023; /* Should be enough */
+ max_phys_disks = 1023; /* Should be enough, 4095 is also allowed */
ddf->anchor.max_pd_entries = cpu_to_be16(max_phys_disks);
- max_virt_disks = 255;
- ddf->anchor.max_vd_entries = cpu_to_be16(max_virt_disks); /* ?? */
- ddf->anchor.max_partitions = cpu_to_be16(64); /* ?? */
+ max_virt_disks = 255; /* 15, 63, 255, 1024, 4095 are all allowed */
+ ddf->anchor.max_vd_entries = cpu_to_be16(max_virt_disks);
ddf->max_part = 64;
- ddf->mppe = 256;
+ ddf->anchor.max_partitions = cpu_to_be16(ddf->max_part);
+ ddf->mppe = 256; /* 16, 64, 256, 1024, 4096 are all allowed */
ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512;
ddf->anchor.config_record_len = cpu_to_be16(ddf->conf_rec_len);
ddf->anchor.max_primary_element_entries = cpu_to_be16(ddf->mppe);
memset(ddf->anchor.pad3, 0xff, 54);
- /* controller sections is one sector long immediately
+ /* Controller section is one sector long immediately
* after the ddf header */
sector = 1;
ddf->anchor.controller_section_offset = cpu_to_be32(sector);
@@ -2357,7 +2508,7 @@ static int init_super_ddf(struct supertype *st,
memset(&vd->entries[i], 0xff, sizeof(struct virtual_entry));
st->sb = ddf;
- ddf_set_updates_pending(ddf);
+ ddf_set_updates_pending(ddf, NULL);
return 1;
}
@@ -2383,17 +2534,18 @@ static int cmp_extent(const void *av, const void *bv)
static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl)
{
- /* find a list of used extents on the give physical device
+ /* Find a list of used extents on the given physical device
* (dnum) of the given ddf.
* Return a malloced array of 'struct extent'
-
- * FIXME ignore DDF_Legacy devices?
-
*/
struct extent *rv;
int n = 0;
unsigned int i;
- __u16 state = be16_to_cpu(ddf->phys->entries[dl->pdnum].state);
+ __u16 state;
+
+ if (dl->pdnum < 0)
+ return NULL;
+ state = be16_to_cpu(ddf->phys->entries[dl->pdnum].state);
if ((state & (DDF_Online|DDF_Failed|DDF_Missing)) != DDF_Online)
return NULL;
@@ -2418,6 +2570,54 @@ static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl)
rv[n].size = 0;
return rv;
}
+
+static unsigned long long find_space(
+	struct ddf_super *ddf, struct dl *dl,
+	unsigned long long data_offset,
+	unsigned long long *size)
+{
+	/* Find if the requested amount of space (*size) is available
+	 * on 'dl'.
+	 * If it is, return the start of the free region; *size is left
+	 * untouched.
+	 * If not, return INVALID_SECTORS and set *size to the largest
+	 * free region that was considered (0 if get_extents() fails).
+	 * If data_offset != INVALID_SECTORS, then the space must start
+	 * at this location, and only the gap containing it is considered.
+	 *
+	 * The extent list from get_extents() is walked until its
+	 * terminating entry (size == 0); the gap in front of every entry,
+	 * including the terminator, is a candidate.
+	 */
+	struct extent *e = get_extents(ddf, dl);
+	int i = 0;
+	unsigned long long pos = 0;
+	unsigned long long max_size = 0;
+
+	if (!e) {
+		*size = 0;
+		return INVALID_SECTORS;
+	}
+	do {
+		unsigned long long esize = e[i].start - pos;
+		if (data_offset != INVALID_SECTORS &&
+		    pos <= data_offset &&
+		    e[i].start > data_offset) {
+			/* data_offset lies in this gap: start there */
+			pos = data_offset;
+			esize = e[i].start - pos;
+		}
+		if (data_offset != INVALID_SECTORS &&
+		    pos != data_offset) {
+			/* This gap does not contain data_offset.  Step
+			 * over the used extent as well, otherwise a
+			 * data_offset that falls inside e[i] would be
+			 * accepted by the next gap and overlap it.
+			 */
+			pos = e[i].start + e[i].size;
+			i++;
+			continue;
+		}
+		if (esize >= *size) {
+			/* Found! */
+			free(e);
+			return pos;
+		}
+		if (esize > max_size)
+			max_size = esize;
+		pos = e[i].start + e[i].size;
+		i++;
+	} while (e[i-1].size);
+	*size = max_size;
+	free(e);
+	return INVALID_SECTORS;
+}
#endif
static int init_super_ddf_bvd(struct supertype *st,
@@ -2534,18 +2734,16 @@ static int init_super_ddf_bvd(struct supertype *st,
vcl->next = ddf->conflist;
ddf->conflist = vcl;
ddf->currentconf = vcl;
- ddf_set_updates_pending(ddf);
+ ddf_set_updates_pending(ddf, NULL);
return 1;
}
-
#ifndef MDASSEMBLE
-static int get_svd_state(const struct ddf_super *, const struct vcl *);
-
static void add_to_super_ddf_bvd(struct supertype *st,
- mdu_disk_info_t *dk, int fd, char *devname)
+ mdu_disk_info_t *dk, int fd, char *devname,
+ unsigned long long data_offset)
{
- /* fd and devname identify a device with-in the ddf container (st).
+ /* fd and devname identify a device within the ddf container (st).
* dk identifies a location in the new BVD.
* We need to find suitable free space in that device and update
* the phys_refnum and lba_offset for the newly created vd_config.
@@ -2559,8 +2757,7 @@ static void add_to_super_ddf_bvd(struct supertype *st,
struct ddf_super *ddf = st->sb;
struct vd_config *vc;
unsigned int i;
- unsigned long long blocks, pos, esize;
- struct extent *ex;
+ unsigned long long blocks, pos;
unsigned int raid_disk = dk->raid_disk;
if (fd == -1) {
@@ -2573,7 +2770,7 @@ static void add_to_super_ddf_bvd(struct supertype *st,
dl->minor == dk->minor)
break;
}
- if (!dl || ! (dk->state & (1<<MD_DISK_SYNC)))
+ if (!dl || dl->pdnum < 0 || ! (dk->state & (1<<MD_DISK_SYNC)))
return;
vc = &ddf->currentconf->conf;
@@ -2584,25 +2781,12 @@ static void add_to_super_ddf_bvd(struct supertype *st,
raid_disk %= n;
}
- ex = get_extents(ddf, dl);
- if (!ex)
- return;
-
- i = 0; pos = 0;
blocks = be64_to_cpu(vc->blocks);
if (ddf->currentconf->block_sizes)
blocks = ddf->currentconf->block_sizes[dk->raid_disk];
- do {
- esize = ex[i].start - pos;
- if (esize >= blocks)
- break;
- pos = ex[i].start + ex[i].size;
- i++;
- } while (ex[i-1].size);
-
- free(ex);
- if (esize < blocks)
+ pos = find_space(ddf, dl, data_offset, &blocks);
+ if (pos == INVALID_SECTORS)
return;
ddf->currentdev = dk->raid_disk;
@@ -2634,7 +2818,7 @@ static void add_to_super_ddf_bvd(struct supertype *st,
__func__, dl->pdnum, be32_to_cpu(dl->disk.refnum),
ddf->currentconf->vcnum, guid_str(vc->guid),
dk->raid_disk);
- ddf_set_updates_pending(ddf);
+ ddf_set_updates_pending(ddf, vc);
}
static unsigned int find_unused_pde(const struct ddf_super *ddf)
@@ -2647,7 +2831,34 @@ static unsigned int find_unused_pde(const struct ddf_super *ddf)
return DDF_NOTFOUND;
}
-/* add a device to a container, either while creating it or while
+/*
+ * Compute pde->config_size for disk 'dl' and store it big-endian.
+ *
+ * The value is the lowest of: dl->size minus 32*1024*2 blocks, the
+ * primary header LBA, the secondary header LBA (when it is not all
+ * ones), and the workspace LBA.  The workspace LBA is ignored, with a
+ * message, when it would reserve more than both 1024*1024*2 blocks
+ * and 1/16 of the device.
+ */
+static void _set_config_size(struct phys_disk_entry *pde, const struct dl *dl)
+{
+	__u64 limit, lba;
+
+	limit = min(dl->size - 32*1024*2ULL, be64_to_cpu(dl->primary_lba));
+
+	lba = be64_to_cpu(dl->secondary_lba);
+	if (lba != ~(__u64)0)
+		limit = min(limit, lba);
+
+	/*
+	 * Some vendor DDF structures interpret workspace_lba
+	 * very differently than we do: Make a sanity check on the value.
+	 */
+	lba = be64_to_cpu(dl->workspace_lba);
+	if (lba < limit) {
+		__u64 wsp = limit - lba;
+
+		if (wsp <= 1024*1024*2ULL || wsp <= dl->size / 16)
+			limit = lba;
+		else
+			pr_err("%s: %x:%x: workspace size 0x%llx too big, ignoring\n",
+			       __func__, dl->major, dl->minor,
+			       (unsigned long long)wsp);
+	}
+
+	pde->config_size = cpu_to_be64(limit);
+	dprintf("%s: %x:%x config_size %llx, DDF structure is %llx blocks\n",
+		__func__, dl->major, dl->minor,
+		(unsigned long long)limit,
+		(unsigned long long)(dl->size-limit));
+}
+
+/* Add a device to a container, either while creating it or while
* expanding a pre-existing container
*/
static int add_to_super_ddf(struct supertype *st,
@@ -2665,7 +2876,7 @@ static int add_to_super_ddf(struct supertype *st,
__u32 *tptr;
if (ddf->currentconf) {
- add_to_super_ddf_bvd(st, dk, fd, devname);
+ add_to_super_ddf_bvd(st, dk, fd, devname, data_offset);
return 0;
}
@@ -2766,8 +2977,10 @@ static int add_to_super_ddf(struct supertype *st,
} while (0)
__calc_lba(dd, ddf->dlist, workspace_lba, 32);
__calc_lba(dd, ddf->dlist, primary_lba, 16);
- __calc_lba(dd, ddf->dlist, secondary_lba, 32);
- pde->config_size = dd->workspace_lba;
+ if (ddf->dlist == NULL ||
+ be64_to_cpu(ddf->dlist->secondary_lba) != ~(__u64)0)
+ __calc_lba(dd, ddf->dlist, secondary_lba, 32);
+ _set_config_size(pde, dd);
sprintf(pde->path, "%17.17s","Information: nil") ;
memset(pde->pad, 0xff, 6);
@@ -2778,7 +2991,7 @@ static int add_to_super_ddf(struct supertype *st,
} else {
dd->next = ddf->dlist;
ddf->dlist = dd;
- ddf_set_updates_pending(ddf);
+ ddf_set_updates_pending(ddf, NULL);
}
return 0;
@@ -2801,7 +3014,7 @@ static int remove_from_super_ddf(struct supertype *st, mdu_disk_info_t *dk)
if (dl->major == dk->major &&
dl->minor == dk->minor)
break;
- if (!dl)
+ if (!dl || dl->pdnum < 0)
return -1;
if (st->update_tail) {
@@ -2824,7 +3037,6 @@ static int remove_from_super_ddf(struct supertype *st, mdu_disk_info_t *dk)
* called when creating a container or adding another device to a
* container.
*/
-#define NULL_CONF_SZ 4096
static int __write_ddf_structure(struct dl *d, struct ddf_super *ddf, __u8 type)
{
@@ -2848,6 +3060,8 @@ static int __write_ddf_structure(struct dl *d, struct ddf_super *ddf, __u8 type)
default:
return 0;
}
+ if (sector == ~(__u64)0)
+ return 0;
header->type = type;
header->openflag = 1;
@@ -2895,12 +3109,11 @@ static int __write_ddf_structure(struct dl *d, struct ddf_super *ddf, __u8 type)
(const struct vd_config **)&vdc,
&dummy);
}
- if (c) {
+ if (vdc) {
dprintf("writing conf record %i on disk %08x for %s/%u\n",
i, be32_to_cpu(d->disk.refnum),
guid_str(vdc->guid),
vdc->sec_elmnt_seq);
- vdc->seqnum = header->seq;
vdc->crc = calc_crc(vdc, conf_size);
memcpy(conf + i*conf_size, vdc, conf_size);
} else
@@ -2940,6 +3153,7 @@ static int _write_super_to_disk(struct ddf_super *ddf, struct dl *d)
*/
get_dev_size(fd, NULL, &size);
size /= 512;
+ memcpy(&ddf->anchor, ddf->active, 512);
if (be64_to_cpu(d->workspace_lba) != 0ULL)
ddf->anchor.workspace_lba = d->workspace_lba;
else
@@ -2955,7 +3169,7 @@ static int _write_super_to_disk(struct ddf_super *ddf, struct dl *d)
else
ddf->anchor.secondary_lba =
cpu_to_be64(size - 32*1024*2);
- ddf->anchor.seq = ddf->active->seq;
+ ddf->anchor.timestamp = cpu_to_be32(time(0) - DECADE);
memcpy(&ddf->primary, &ddf->anchor, 512);
memcpy(&ddf->secondary, &ddf->anchor, 512);
@@ -3002,7 +3216,7 @@ static int write_init_super_ddf(struct supertype *st)
struct ddf_super *ddf = st->sb;
struct vcl *currentconf = ddf->currentconf;
- /* we are done with currentconf reset it to point st at the container */
+ /* We are done with currentconf - reset it so st refers to the container */
ddf->currentconf = NULL;
if (st->update_tail) {
@@ -3013,6 +3227,7 @@ static int write_init_super_ddf(struct supertype *st)
unsigned int i;
if (!currentconf) {
+ /* Must be adding a physical disk to the container */
int len = (sizeof(struct phys_disk) +
sizeof(struct phys_disk_entry));
@@ -3045,13 +3260,15 @@ static int write_init_super_ddf(struct supertype *st)
len);
append_metadata_update(st, vc, tlen);
- /* FIXME I need to close the fds! */
return 0;
} else {
struct dl *d;
if (!currentconf)
for (d = ddf->dlist; d; d=d->next)
while (Kill(d->devname, NULL, 0, -1, 1) == 0);
+ /* Note: we don't close the fd's now, but a subsequent
+ * ->free_super() will
+ */
return __write_init_super_ddf(st);
}
}
@@ -3071,12 +3288,14 @@ static __u64 avail_size_ddf(struct supertype *st, __u64 devsize,
static int reserve_space(struct supertype *st, int raiddisks,
unsigned long long size, int chunk,
+ unsigned long long data_offset,
unsigned long long *freesize)
{
/* Find 'raiddisks' spare extents at least 'size' big (but
* only caring about multiples of 'chunk') and remember
- * them.
- * If the cannot be found, fail.
+ * them. If size==0, find the largest size possible.
+ * Report available size in *freesize
+ * If space cannot be found, fail.
*/
struct dl *dl;
struct ddf_super *ddf = st->sb;
@@ -3088,32 +3307,13 @@ static int reserve_space(struct supertype *st, int raiddisks,
}
/* Now find largest extent on each device */
for (dl = ddf->dlist ; dl ; dl=dl->next) {
- struct extent *e = get_extents(ddf, dl);
- unsigned long long pos = 0;
- int i = 0;
- int found = 0;
- unsigned long long minsize = size;
-
- if (size == 0)
- minsize = chunk;
+ unsigned long long minsize = ULLONG_MAX;
- if (!e)
- continue;
- do {
- unsigned long long esize;
- esize = e[i].start - pos;
- if (esize >= minsize) {
- found = 1;
- minsize = esize;
- }
- pos = e[i].start + e[i].size;
- i++;
- } while (e[i-1].size);
- if (found) {
+ find_space(ddf, dl, data_offset, &minsize);
+ if (minsize >= size && minsize >= (unsigned)chunk) {
cnt++;
dl->esize = minsize;
}
- free(e);
}
if (cnt < raiddisks) {
pr_err("not enough devices with space to create array.\n");
@@ -3156,21 +3356,6 @@ static int reserve_space(struct supertype *st, int raiddisks,
return 1;
}
-static int
-validate_geometry_ddf_container(struct supertype *st,
- int level, int layout, int raiddisks,
- int chunk, unsigned long long size,
- unsigned long long data_offset,
- char *dev, unsigned long long *freesize,
- int verbose);
-
-static int validate_geometry_ddf_bvd(struct supertype *st,
- int level, int layout, int raiddisks,
- int *chunk, unsigned long long size,
- unsigned long long data_offset,
- char *dev, unsigned long long *freesize,
- int verbose);
-
static int validate_geometry_ddf(struct supertype *st,
int level, int layout, int raiddisks,
int *chunk, unsigned long long size,
@@ -3192,7 +3377,8 @@ static int validate_geometry_ddf(struct supertype *st,
if (*chunk == UnSet)
*chunk = DEFAULT_CHUNK;
- if (level == -1000000) level = LEVEL_CONTAINER;
+ if (level == LEVEL_NONE)
+ level = LEVEL_CONTAINER;
if (level == LEVEL_CONTAINER) {
/* Must be a fresh device to add to a container */
return validate_geometry_ddf_container(st, level, layout,
@@ -3204,7 +3390,8 @@ static int validate_geometry_ddf(struct supertype *st,
if (!dev) {
mdu_array_info_t array = {
- .level = level, .layout = layout,
+ .level = level,
+ .layout = layout,
.raid_disks = raiddisks
};
struct vd_config conf;
@@ -3224,7 +3411,8 @@ static int validate_geometry_ddf(struct supertype *st,
* chosen so that add_to_super/getinfo_super
* can return them.
*/
- return reserve_space(st, raiddisks, size, *chunk, freesize);
+ return reserve_space(st, raiddisks, size, *chunk,
+ data_offset, freesize);
}
return 1;
}
@@ -3248,17 +3436,8 @@ static int validate_geometry_ddf(struct supertype *st,
*/
fd = open(dev, O_RDONLY|O_EXCL, 0);
if (fd >= 0) {
- sra = sysfs_read(fd, NULL, GET_VERSION);
close(fd);
- if (sra && sra->array.major_version == -1 &&
- strcmp(sra->text_version, "ddf") == 0) {
-
- /* load super */
- /* find space for 'n' devices. */
- /* remember the devices */
- /* Somehow return the fact that we have enough */
- }
-
+ /* Just a bare device, no good to us */
if (verbose)
pr_err("ddf: Cannot create this array "
"on device %s - a container is required.\n",
@@ -3351,10 +3530,7 @@ static int validate_geometry_ddf_bvd(struct supertype *st,
struct stat stb;
struct ddf_super *ddf = st->sb;
struct dl *dl;
- unsigned long long pos = 0;
unsigned long long maxsize;
- struct extent *e;
- int i;
/* ddf/bvd supports lots of things, but not containers */
if (level == LEVEL_CONTAINER) {
if (verbose)
@@ -3373,25 +3549,10 @@ static int validate_geometry_ddf_bvd(struct supertype *st,
int dcnt = 0;
if (minsize == 0)
minsize = 8;
- for (dl = ddf->dlist; dl ; dl = dl->next)
- {
- int found = 0;
- pos = 0;
-
- i = 0;
- e = get_extents(ddf, dl);
- if (!e) continue;
- do {
- unsigned long long esize;
- esize = e[i].start - pos;
- if (esize >= minsize)
- found = 1;
- pos = e[i].start + e[i].size;
- i++;
- } while (e[i-1].size);
- if (found)
+ for (dl = ddf->dlist; dl ; dl = dl->next) {
+ if (find_space(ddf, dl, data_offset, &minsize)
+ != INVALID_SECTORS)
dcnt++;
- free(e);
}
if (dcnt < raiddisks) {
if (verbose)
@@ -3419,19 +3580,9 @@ static int validate_geometry_ddf_bvd(struct supertype *st,
dev);
return 0;
}
- e = get_extents(ddf, dl);
- maxsize = 0;
- i = 0;
- if (e) do {
- unsigned long long esize;
- esize = e[i].start - pos;
- if (esize >= maxsize)
- maxsize = esize;
- pos = e[i].start + e[i].size;
- i++;
- } while (e[i-1].size);
+ maxsize = ULLONG_MAX;
+ find_space(ddf, dl, data_offset, &maxsize);
*freesize = maxsize;
- // FIXME here I am
return 1;
}
@@ -3580,7 +3731,7 @@ static int check_secondary(const struct vcl *vc)
}
for (i = 0; i < conf->sec_elmnt_count; i++) {
if (!__was_sec_seen(i)) {
- pr_err("BVD %d is missing\n", i);
+ /* pr_err("BVD %d is missing\n", i); */
return -1;
}
}
@@ -3599,13 +3750,13 @@ static unsigned int get_pd_index_from_refnum(const struct vcl *vc,
for (i = 0, j = 0 ; i < nmax ; i++) {
/* j counts valid entries for this BVD */
- if (be32_to_cpu(vc->conf.phys_refnum[i]) != 0xffffffff)
- j++;
if (be32_eq(vc->conf.phys_refnum[i], refnum)) {
*bvd = &vc->conf;
*idx = i;
- return sec * cnt + j - 1;
+ return sec * cnt + j;
}
+ if (be32_to_cpu(vc->conf.phys_refnum[i]) != 0xffffffff)
+ j++;
}
if (vc->other_bvds == NULL)
goto bad;
@@ -3616,13 +3767,13 @@ static unsigned int get_pd_index_from_refnum(const struct vcl *vc,
if (sec == DDF_UNUSED_BVD)
continue;
for (i = 0, j = 0 ; i < nmax ; i++) {
- if (be32_to_cpu(vd->phys_refnum[i]) != 0xffffffff)
- j++;
if (be32_eq(vd->phys_refnum[i], refnum)) {
*bvd = vd;
*idx = i;
- return sec * cnt + j - 1;
+ return sec * cnt + j;
}
+ if (be32_to_cpu(vd->phys_refnum[i]) != 0xffffffff)
+ j++;
}
}
bad:
@@ -3644,10 +3795,8 @@ static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray
struct mdinfo *rest = NULL;
struct vcl *vc;
- for (vc = ddf->conflist ; vc ; vc=vc->next)
- {
+ for (vc = ddf->conflist ; vc ; vc=vc->next) {
unsigned int i;
- unsigned int j;
struct mdinfo *this;
char *ep;
__u32 *cptr;
@@ -3672,7 +3821,7 @@ static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray
this->array.md_minor = -1;
this->array.major_version = -1;
this->array.minor_version = -2;
- this->safe_mode_delay = DDF_SAFE_MODE_DELAY;
+ this->safe_mode_delay = DDF_SAFE_MODE_DELAY;
cptr = (__u32 *)(vc->conf.guid + 16);
this->array.ctime = DECADE + __be32_to_cpu(*cptr);
this->array.utime = DECADE +
@@ -3689,16 +3838,11 @@ static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray
this->array.state = 1;
this->resync_start = MaxSector;
}
- memcpy(this->name, ddf->virt->entries[i].name, 16);
- this->name[16]=0;
- for(j=0; j<16; j++)
- if (this->name[j] == ' ')
- this->name[j] = 0;
-
+ _ddf_array_name(this->name, ddf, i);
memset(this->uuid, 0, sizeof(this->uuid));
- this->component_size = be64_to_cpu(vc->conf.blocks);
- this->array.size = this->component_size / 2;
- this->container_member = i;
+ this->component_size = be64_to_cpu(vc->conf.blocks);
+ this->array.size = this->component_size / 2;
+ this->container_member = i;
ddf->currentconf = vc;
uuid_from_super_ddf(st, this->uuid);
@@ -3708,7 +3852,7 @@ static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray
sprintf(this->text_version, "/%s/%d",
st->container_devnm, this->container_member);
- for (pd = 0; pd < be16_to_cpu(ddf->phys->used_pdes); pd++) {
+ for (pd = 0; pd < be16_to_cpu(ddf->phys->max_pdes); pd++) {
struct mdinfo *dev;
struct dl *d;
const struct vd_config *bvd;
@@ -3741,17 +3885,17 @@ static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray
continue;
dev = xcalloc(1, sizeof(*dev));
- dev->next = this->devs;
- this->devs = dev;
+ dev->next = this->devs;
+ this->devs = dev;
dev->disk.number = be32_to_cpu(d->disk.refnum);
- dev->disk.major = d->major;
- dev->disk.minor = d->minor;
+ dev->disk.major = d->major;
+ dev->disk.minor = d->minor;
dev->disk.raid_disk = i;
- dev->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE);
+ dev->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE);
dev->recovery_start = MaxSector;
- dev->events = be32_to_cpu(ddf->primary.seq);
+ dev->events = be32_to_cpu(ddf->active->seq);
dev->data_offset =
be64_to_cpu(LBA_OFFSET(ddf, bvd)[iphys]);
dev->component_size = be64_to_cpu(bvd->blocks);
@@ -3819,7 +3963,7 @@ static int compare_super_ddf(struct supertype *st, struct supertype *tst)
/*
* return:
* 0 same, or first was empty, and second was copied
- * 1 second had wrong number
+ * 1 second had wrong magic number - but that isn't possible
* 2 wrong uuid
* 3 wrong other info
*/
@@ -3838,53 +3982,25 @@ static int compare_super_ddf(struct supertype *st, struct supertype *tst)
if (memcmp(first->anchor.guid, second->anchor.guid, DDF_GUID_LEN) != 0)
return 2;
- if (!be32_eq(first->anchor.seq, second->anchor.seq)) {
- dprintf("%s: sequence number mismatch %u/%u\n", __func__,
- be32_to_cpu(first->anchor.seq),
- be32_to_cpu(second->anchor.seq));
- return 3;
- }
- if (first->max_part != second->max_part ||
- !be16_eq(first->phys->used_pdes, second->phys->used_pdes) ||
- !be16_eq(first->virt->populated_vdes,
- second->virt->populated_vdes)) {
- dprintf("%s: PD/VD number mismatch\n", __func__);
- return 3;
- }
-
- max_pds = be16_to_cpu(first->phys->used_pdes);
- for (dl2 = second->dlist; dl2; dl2 = dl2->next) {
- for (pd = 0; pd < max_pds; pd++)
- if (be32_eq(first->phys->entries[pd].refnum,
- dl2->disk.refnum))
- break;
- if (pd == max_pds) {
- dprintf("%s: no match for disk %08x\n", __func__,
- be32_to_cpu(dl2->disk.refnum));
- return 3;
- }
- }
+ /* It is only OK to compare info in the anchor. Anything else
+ * could be changing due to a reconfig so must be ignored.
+ * guid really should be enough anyway.
+ */
- max_vds = be16_to_cpu(first->active->max_vd_entries);
- for (vl2 = second->conflist; vl2; vl2 = vl2->next) {
- if (!be32_eq(vl2->conf.magic, DDF_VD_CONF_MAGIC))
- continue;
- for (vd = 0; vd < max_vds; vd++)
- if (!memcmp(first->virt->entries[vd].guid,
- vl2->conf.guid, DDF_GUID_LEN))
- break;
- if (vd == max_vds) {
- dprintf("%s: no match for VD config\n", __func__);
- return 3;
- }
+ if (!be32_eq(first->active->seq, second->active->seq)) {
+ dprintf("%s: sequence number mismatch %u<->%u\n", __func__,
+ be32_to_cpu(first->active->seq),
+ be32_to_cpu(second->active->seq));
+ return 0;
}
- /* FIXME should I look at anything else? */
/*
- At this point we are fairly sure that the meta data matches.
- But the new disk may contain additional local data.
- Add it to the super block.
+ * At this point we are fairly sure that the meta data matches.
+ * But the new disk may contain additional local data.
+ * Add it to the super block.
*/
+ max_vds = be16_to_cpu(first->active->max_vd_entries);
+ max_pds = be16_to_cpu(first->phys->max_pdes);
for (vl2 = second->conflist; vl2; vl2 = vl2->next) {
for (vl1 = first->conflist; vl1; vl1 = vl1->next)
if (!memcmp(vl1->conf.guid, vl2->conf.guid,
@@ -3950,7 +4066,7 @@ static int compare_super_ddf(struct supertype *st, struct supertype *tst)
if (be32_eq(first->phys->entries[pd].refnum,
dl1->disk.refnum))
break;
- dl1->pdnum = pd;
+ dl1->pdnum = pd < max_pds ? (int)pd : -1;
if (dl2->spare) {
if (posix_memalign((void **)&dl1->spare, 512,
first->conf_rec_len*512) != 0) {
@@ -4007,7 +4123,7 @@ static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst)
if (dl->major == dev->disk.major &&
dl->minor == dev->disk.minor)
break;
- if (!dl) {
+ if (!dl || dl->pdnum < 0) {
pr_err("%s: device %d/%d of subarray %d not found in meta data\n",
__func__, dev->disk.major, dev->disk.minor, n);
return -1;
@@ -4028,6 +4144,56 @@ static int ddf_open_new(struct supertype *c, struct active_array *a, char *inst)
return 0;
}
+static void handle_missing(struct ddf_super *ddf, struct active_array *a, int inst)
+{
+ /* This member array (virt entry 'inst') is being activated. If any devices
+ * are missing (refnum in the config, no match on ddf->dlist) they must now be marked as failed.
+ */
+ struct vd_config *vc;
+ unsigned int n_bvd; /* set by find_vdcr(); used as index into vc->phys_refnum */
+ struct vcl *vcl;
+ struct dl *dl;
+ int pd; /* index into ddf->phys->entries, from find_phys() */
+ int n;
+ int state;
+
+ for (n = 0; ; n++) { /* walk slots until find_vdcr() returns no config for slot n */
+ vc = find_vdcr(ddf, inst, n, &n_bvd, &vcl);
+ if (!vc)
+ break;
+ for (dl = ddf->dlist; dl; dl = dl->next)
+ if (be32_eq(dl->disk.refnum, vc->phys_refnum[n_bvd]))
+ break;
+ if (dl)
+ /* Found this disk, so not missing */
+ continue;
+
+ /* Mark the device as failed/missing (only if its phys entry exists and is still Online). */
+ pd = find_phys(ddf, vc->phys_refnum[n_bvd]);
+ if (pd >= 0 && be16_and(ddf->phys->entries[pd].state,
+ cpu_to_be16(DDF_Online))) {
+ be16_clear(ddf->phys->entries[pd].state,
+ cpu_to_be16(DDF_Online));
+ be16_set(ddf->phys->entries[pd].state,
+ cpu_to_be16(DDF_Failed|DDF_Missing));
+ vc->phys_refnum[n_bvd] = cpu_to_be32(0); /* overwrite this slot's refnum with 0 */
+ ddf_set_updates_pending(ddf, vc);
+ }
+
+ /* Mark the array as Degraded: store the state from get_svd_state() if it differs */
+ state = get_svd_state(ddf, vcl);
+ if (ddf->virt->entries[inst].state !=
+ ((ddf->virt->entries[inst].state & ~DDF_state_mask)
+ | state)) {
+ ddf->virt->entries[inst].state =
+ (ddf->virt->entries[inst].state & ~DDF_state_mask)
+ | state;
+ a->check_degraded = 1; /* presumably prompts a later search for a spare — confirm against the consumer of this flag */
+ ddf_set_updates_pending(ddf, vc);
+ }
+ }
+}
+
/*
* The array 'a' is to be marked clean in the metadata.
* If '->resync_start' is not ~(unsigned long long)0, then the array is only
@@ -4043,7 +4209,7 @@ static int ddf_set_array_state(struct active_array *a, int consistent)
int inst = a->info.container_member;
int old = ddf->virt->entries[inst].state;
if (consistent == 2) {
- /* Should check if a recovery should be started FIXME */
+ handle_missing(ddf, a, inst);
consistent = 1;
if (!is_resync_complete(&a->info))
consistent = 0;
@@ -4053,7 +4219,7 @@ static int ddf_set_array_state(struct active_array *a, int consistent)
else
ddf->virt->entries[inst].state |= DDF_state_inconsistent;
if (old != ddf->virt->entries[inst].state)
- ddf_set_updates_pending(ddf);
+ ddf_set_updates_pending(ddf, NULL);
old = ddf->virt->entries[inst].init_state;
ddf->virt->entries[inst].init_state &= ~DDF_initstate_mask;
@@ -4064,7 +4230,7 @@ static int ddf_set_array_state(struct active_array *a, int consistent)
else
ddf->virt->entries[inst].init_state |= DDF_init_quick;
if (old != ddf->virt->entries[inst].init_state)
- ddf_set_updates_pending(ddf);
+ ddf_set_updates_pending(ddf, NULL);
dprintf("ddf mark %d/%s (%d) %s %llu\n", inst,
guid_str(ddf->virt->entries[inst].guid), a->curr_state,
@@ -4079,6 +4245,11 @@ static int get_bvd_state(const struct ddf_super *ddf,
unsigned int i, n_bvd, working = 0;
unsigned int n_prim = be16_to_cpu(vc->prim_elmnt_count);
int pd, st, state;
+ char *avail = xcalloc(1, n_prim);
+ mdu_array_info_t array;
+
+ layout_ddf2md(vc, &array);
+
for (i = 0; i < n_prim; i++) {
if (!find_index_in_bvd(ddf, vc, i, &n_bvd))
continue;
@@ -4087,8 +4258,10 @@ static int get_bvd_state(const struct ddf_super *ddf,
continue;
st = be16_to_cpu(ddf->phys->entries[pd].state);
if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding))
- == DDF_Online)
+ == DDF_Online) {
working++;
+ avail[i] = 1;
+ }
}
state = DDF_state_degraded;
@@ -4107,6 +4280,10 @@ static int get_bvd_state(const struct ddf_super *ddf,
else if (working >= 2)
state = DDF_state_part_optimal;
break;
+ case DDF_RAID1E:
+ if (!enough(10, n_prim, array.layout, 1, avail))
+ state = DDF_state_failed;
+ break;
case DDF_RAID4:
case DDF_RAID5:
if (working < n_prim - 1)
@@ -4178,6 +4355,7 @@ static void ddf_set_disk(struct active_array *a, int n, int state)
int pd;
struct mdinfo *mdi;
struct dl *dl;
+ int update = 0;
dprintf("%s: %d to %x\n", __func__, n, state);
if (vc == NULL) {
@@ -4217,8 +4395,9 @@ static void ddf_set_disk(struct active_array *a, int n, int state)
dprintf("%s: array %u disk %u ref %08x pd %d\n",
__func__, inst, n_bvd,
be32_to_cpu(vc->phys_refnum[n_bvd]), pd);
- if ((state & DS_INSYNC) && ! (state & DS_FAULTY)) {
- pd = dl->pdnum; /* FIXME: is this really correct ? */
+ if ((state & DS_INSYNC) && ! (state & DS_FAULTY) &&
+ dl->pdnum >= 0) {
+ pd = dl->pdnum;
vc->phys_refnum[n_bvd] = dl->disk.refnum;
LBA_OFFSET(ddf, vc)[n_bvd] =
cpu_to_be64(mdi->data_offset);
@@ -4226,7 +4405,7 @@ static void ddf_set_disk(struct active_array *a, int n, int state)
cpu_to_be16(DDF_Global_Spare));
be16_set(ddf->phys->entries[pd].type,
cpu_to_be16(DDF_Active_in_VD));
- ddf_set_updates_pending(ddf);
+ update = 1;
}
} else {
be16 old = ddf->phys->entries[pd].state;
@@ -4240,7 +4419,7 @@ static void ddf_set_disk(struct active_array *a, int n, int state)
cpu_to_be16(DDF_Rebuilding));
}
if (!be16_eq(old, ddf->phys->entries[pd].state))
- ddf_set_updates_pending(ddf);
+ update = 1;
}
dprintf("ddf: set_disk %d (%08x) to %x->%02x\n", n,
@@ -4257,18 +4436,17 @@ static void ddf_set_disk(struct active_array *a, int n, int state)
if (ddf->virt->entries[inst].state !=
((ddf->virt->entries[inst].state & ~DDF_state_mask)
| state)) {
-
ddf->virt->entries[inst].state =
(ddf->virt->entries[inst].state & ~DDF_state_mask)
| state;
- ddf_set_updates_pending(ddf);
+ update = 1;
}
-
+ if (update)
+ ddf_set_updates_pending(ddf, vc);
}
static void ddf_sync_metadata(struct supertype *st)
{
-
/*
* Write all data to all devices.
* Later, we might be able to track whether only local changes
@@ -4331,8 +4509,9 @@ static int kill_subarray_ddf(struct supertype *st)
*/
struct vcl *victim = ddf->currentconf;
struct vd_config *conf;
- ddf->currentconf = NULL;
unsigned int vdnum;
+
+ ddf->currentconf = NULL;
if (!victim) {
pr_err("%s: nothing to kill\n", __func__);
return -1;
@@ -4363,7 +4542,7 @@ static int kill_subarray_ddf(struct supertype *st)
append_metadata_update(st, vd, len);
} else {
_kill_subarray_ddf(ddf, conf->guid);
- ddf_set_updates_pending(ddf);
+ ddf_set_updates_pending(ddf, NULL);
ddf_sync_metadata(st);
}
return 0;
@@ -4390,6 +4569,297 @@ static void copy_matching_bvd(struct ddf_super *ddf,
conf->sec_elmnt_seq, guid_str(conf->guid));
}
+static void ddf_process_phys_update(struct supertype *st,
+ struct metadata_update *update)
+{ /* Apply a one-entry phys_disk update: flag slot 'ent' Missing, or fill slot 'ent' if it is empty. */
+ struct ddf_super *ddf = st->sb;
+ struct phys_disk *pd;
+ unsigned int ent;
+
+ pd = (struct phys_disk*)update->buf;
+ ent = be16_to_cpu(pd->used_pdes); /* in an update record, used_pdes carries the target slot index */
+ if (ent >= be16_to_cpu(ddf->phys->max_pdes))
+ return;
+ if (be16_and(pd->entries[0].state, cpu_to_be16(DDF_Missing))) {
+ struct dl **dlp;
+ /* removing this disk: flag the slot Missing and unlink the matching dl from ddf->dlist. */
+ be16_set(ddf->phys->entries[ent].state,
+ cpu_to_be16(DDF_Missing));
+ for (dlp = &ddf->dlist; *dlp; dlp = &(*dlp)->next) {
+ struct dl *dl = *dlp;
+ if (dl->pdnum == (signed)ent) {
+ close(dl->fd);
+ dl->fd = -1;
+ *dlp = dl->next;
+ update->space = dl->devname; /* hand devname to the update — presumably freed with it; confirm */
+ *(void**)dl = update->space_list; /* first word of dl is reused as the list link */
+ update->space_list = (void**)dl; /* push dl itself onto update->space_list */
+ break;
+ }
+ }
+ ddf_set_updates_pending(ddf, NULL);
+ return;
+ }
+ if (!all_ff(ddf->phys->entries[ent].guid)) /* presumably: slot already in use — confirm all_ff() semantics */
+ return;
+ ddf->phys->entries[ent] = pd->entries[0];
+ ddf->phys->used_pdes = cpu_to_be16
+ (1 + be16_to_cpu(ddf->phys->used_pdes));
+ ddf_set_updates_pending(ddf, NULL);
+ if (ddf->add_list) {
+ struct active_array *a;
+ struct dl *al = ddf->add_list; /* move the head of add_list onto dlist */
+ ddf->add_list = al->next;
+
+ al->next = ddf->dlist;
+ ddf->dlist = al;
+
+ /* As a device has been added, we should check
+ * for any degraded arrays that might make
+ * use of this spare */
+ for (a = st->arrays ; a; a=a->next)
+ a->check_degraded = 1;
+ }
+}
+
+static void ddf_process_virt_update(struct supertype *st,
+ struct metadata_update *update)
+{ /* Apply a one-entry virtual_disk update: delete the VD by guid, or copy it into an unused slot. */
+ struct ddf_super *ddf = st->sb;
+ struct virtual_disk *vd;
+ unsigned int ent;
+
+ vd = (struct virtual_disk*)update->buf;
+
+ if (vd->entries[0].state == DDF_state_deleted) {
+ if (_kill_subarray_ddf(ddf, vd->entries[0].guid)) /* non-zero return: leave without marking updates pending */
+ return;
+ } else {
+ ent = find_vde_by_guid(ddf, vd->entries[0].guid);
+ if (ent != DDF_NOTFOUND) { /* a VD with this guid is already recorded: ignore the update */
+ dprintf("%s: VD %s exists already in slot %d\n",
+ __func__, guid_str(vd->entries[0].guid),
+ ent);
+ return;
+ }
+ ent = find_unused_vde(ddf);
+ if (ent == DDF_NOTFOUND) /* no unused slot: the update is dropped without any message */
+ return;
+ ddf->virt->entries[ent] = vd->entries[0];
+ ddf->virt->populated_vdes =
+ cpu_to_be16(
+ 1 + be16_to_cpu(
+ ddf->virt->populated_vdes));
+ dprintf("%s: added VD %s in slot %d(s=%02x i=%02x)\n",
+ __func__, guid_str(vd->entries[0].guid), ent,
+ ddf->virt->entries[ent].state,
+ ddf->virt->entries[ent].init_state);
+ }
+ ddf_set_updates_pending(ddf, NULL);
+}
+
+static void ddf_remove_failed(struct ddf_super *ddf)
+{
+ /* Now remove any 'Failed' devices that are not part
+ * of any VD. They will have the Transition flag set.
+ * Once done, we need to update all dl->pdnum numbers (done while compacting the table below).
+ */
+ unsigned int pdnum; /* slot being examined */
+ unsigned int pd2 = 0; /* next slot to compact a kept entry into */
+ struct dl *dl;
+
+ for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->max_pdes);
+ pdnum++) {
+ if (be32_to_cpu(ddf->phys->entries[pdnum].refnum) ==
+ 0xFFFFFFFF)
+ continue;
+ if (be16_and(ddf->phys->entries[pdnum].state,
+ cpu_to_be16(DDF_Failed))
+ && be16_and(ddf->phys->entries[pdnum].state,
+ cpu_to_be16(DDF_Transition))) {
+ /* skip this one unless in dlist */
+ for (dl = ddf->dlist; dl; dl = dl->next)
+ if (dl->pdnum == (int)pdnum)
+ break;
+ if (!dl)
+ continue; /* Failed+Transition and not on dlist: not copied forward */
+ }
+ if (pdnum == pd2)
+ pd2++; /* entry is already in place */
+ else {
+ ddf->phys->entries[pd2] =
+ ddf->phys->entries[pdnum];
+ for (dl = ddf->dlist; dl; dl = dl->next)
+ if (dl->pdnum == (int)pdnum)
+ dl->pdnum = pd2; /* keep dl pointing at the moved entry */
+ pd2++;
+ }
+ }
+ ddf->phys->used_pdes = cpu_to_be16(pd2);
+ while (pd2 < pdnum) { /* pdnum == max_pdes here: fill the guid of every slot past the kept entries with 0xff */
+ memset(ddf->phys->entries[pd2].guid, 0xff,
+ DDF_GUID_LEN);
+ pd2++;
+ }
+}
+
+static void ddf_update_vlist(struct ddf_super *ddf, struct dl *dl)
+{ /* Rebuild dl->vlist from ddf->conflist and adjust this disk's phys type/state flags to match. */
+ struct vcl *vcl;
+ unsigned int vn = 0; /* next free index in dl->vlist */
+ int in_degraded = 0; /* set when a VD using this disk is degraded or part_optimal */
+
+ if (dl->pdnum < 0) /* negative pdnum cannot index ddf->phys->entries: nothing to update */
+ return;
+ for (vcl = ddf->conflist; vcl ; vcl = vcl->next) {
+ unsigned int dn, ibvd;
+ const struct vd_config *conf;
+ int vstate;
+ dn = get_pd_index_from_refnum(vcl,
+ dl->disk.refnum,
+ ddf->mppe,
+ &conf, &ibvd);
+ if (dn == DDF_NOTFOUND)
+ continue;
+ dprintf("dev %d/%08x has %s (sec=%u) at %d\n",
+ dl->pdnum,
+ be32_to_cpu(dl->disk.refnum),
+ guid_str(conf->guid),
+ conf->sec_elmnt_seq, vn);
+ /* Clear the Transition flag: a Failed disk that a VD still references */
+ if (be16_and
+ (ddf->phys->entries[dl->pdnum].state,
+ cpu_to_be16(DDF_Failed)))
+ be16_clear(ddf->phys
+ ->entries[dl->pdnum].state,
+ cpu_to_be16(DDF_Transition));
+ dl->vlist[vn++] = vcl; /* NOTE(review): vn is not checked against ddf->max_part here — confirm vlist cannot overflow */
+ vstate = ddf->virt->entries[vcl->vcnum].state
+ & DDF_state_mask;
+ if (vstate == DDF_state_degraded ||
+ vstate == DDF_state_part_optimal)
+ in_degraded = 1;
+ }
+ while (vn < ddf->max_part) /* NULL out the remaining vlist slots */
+ dl->vlist[vn++] = NULL;
+ if (dl->vlist[0]) { /* disk is used by at least one VD */
+ be16_clear(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Global_Spare));
+ if (!be16_and(ddf->phys
+ ->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Active_in_VD))) {
+ be16_set(ddf->phys
+ ->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Active_in_VD));
+ if (in_degraded) /* newly active in a degraded VD: flag the disk Rebuilding */
+ be16_set(ddf->phys
+ ->entries[dl->pdnum]
+ .state,
+ cpu_to_be16
+ (DDF_Rebuilding));
+ }
+ }
+ if (dl->spare) { /* dl->spare is set: typed DDF_Spare rather than DDF_Global_Spare */
+ be16_clear(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Global_Spare));
+ be16_set(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Spare));
+ }
+ if (!dl->vlist[0] && !dl->spare) { /* in no VD and dl->spare unset: typed DDF_Global_Spare only */
+ be16_set(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Global_Spare));
+ be16_clear(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Spare));
+ be16_clear(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Active_in_VD));
+ }
+}
+
+static void ddf_process_conf_update(struct supertype *st,
+ struct metadata_update *update)
+{ /* Apply a vd_config update: refresh a known VD's BVDs or link a new vcl, then refresh per-disk state. */
+ struct ddf_super *ddf = st->sb;
+ struct vd_config *vc;
+ struct vcl *vcl;
+ struct dl *dl;
+ unsigned int ent;
+ unsigned int pdnum, len;
+
+ vc = (struct vd_config*)update->buf;
+ len = ddf->conf_rec_len * 512; /* size in bytes of one config record */
+ if ((unsigned int)update->len != len * vc->sec_elmnt_count) { /* buffer must hold exactly one record per secondary element */
+ pr_err("%s: %s: insufficient data (%d) for %u BVDs\n",
+ __func__, guid_str(vc->guid), update->len,
+ vc->sec_elmnt_count);
+ return;
+ }
+ for (vcl = ddf->conflist; vcl ; vcl = vcl->next)
+ if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0)
+ break;
+ dprintf("%s: conf update for %s (%s)\n", __func__,
+ guid_str(vc->guid), (vcl ? "old" : "new"));
+ if (vcl) {
+ /* An update, just copy the phys_refnum and lba_offset
+ * fields
+ */
+ unsigned int i;
+ unsigned int k;
+ copy_matching_bvd(ddf, &vcl->conf, update);
+ for (k = 0; k < be16_to_cpu(vc->prim_elmnt_count); k++)
+ dprintf("BVD %u has %08x at %llu\n", 0,
+ be32_to_cpu(vcl->conf.phys_refnum[k]),
+ be64_to_cpu(LBA_OFFSET(ddf,
+ &vcl->conf)[k]));
+ for (i = 1; i < vc->sec_elmnt_count; i++) { /* the other BVDs live in vcl->other_bvds[i-1] */
+ copy_matching_bvd(ddf, vcl->other_bvds[i-1],
+ update);
+ for (k = 0; k < be16_to_cpu(
+ vc->prim_elmnt_count); k++)
+ dprintf("BVD %u has %08x at %llu\n", i,
+ be32_to_cpu
+ (vcl->other_bvds[i-1]->
+ phys_refnum[k]),
+ be64_to_cpu
+ (LBA_OFFSET
+ (ddf,
+ vcl->other_bvds[i-1])[k]));
+ }
+ } else {
+ /* A new VD_CONF: take the pre-allocated vcl carried in update->space */
+ unsigned int i;
+ if (!update->space)
+ return;
+ vcl = update->space;
+ update->space = NULL;
+ vcl->next = ddf->conflist;
+ memcpy(&vcl->conf, vc, len);
+ ent = find_vde_by_guid(ddf, vc->guid);
+ if (ent == DDF_NOTFOUND)
+ return; /* NOTE(review): vcl is already detached from update->space and not linked anywhere here — looks like a leak; confirm */
+ vcl->vcnum = ent;
+ ddf->conflist = vcl;
+ for (i = 1; i < vc->sec_elmnt_count; i++)
+ memcpy(vcl->other_bvds[i-1],
+ update->buf + len * i, len);
+ }
+ /* Set DDF_Transition on all Failed devices - to help
+ * us detect those that are no longer in use
+ */
+ for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->max_pdes);
+ pdnum++)
+ if (be16_and(ddf->phys->entries[pdnum].state,
+ cpu_to_be16(DDF_Failed)))
+ be16_set(ddf->phys->entries[pdnum].state,
+ cpu_to_be16(DDF_Transition));
+
+ /* Now make sure vlist is correct for each dl (ddf_update_vlist clears Transition again on disks still in a VD). */
+ for (dl = ddf->dlist; dl; dl = dl->next)
+ ddf_update_vlist(ddf, dl);
+ ddf_remove_failed(ddf);
+
+ ddf_set_updates_pending(ddf, vc);
+}
+
static void ddf_process_update(struct supertype *st,
struct metadata_update *update)
{
@@ -4420,304 +4890,57 @@ static void ddf_process_update(struct supertype *st,
* and offset. This will also mark the spare as active with
* a spare-assignment record.
*/
- struct ddf_super *ddf = st->sb;
be32 *magic = (be32 *)update->buf;
- struct phys_disk *pd;
- struct virtual_disk *vd;
- struct vd_config *vc;
- struct vcl *vcl;
- struct dl *dl;
- unsigned int ent;
- unsigned int pdnum, pd2, len;
dprintf("Process update %x\n", be32_to_cpu(*magic));
if (be32_eq(*magic, DDF_PHYS_RECORDS_MAGIC)) {
-
- if (update->len != (sizeof(struct phys_disk) +
+ if (update->len == (sizeof(struct phys_disk) +
sizeof(struct phys_disk_entry)))
- return;
- pd = (struct phys_disk*)update->buf;
-
- ent = be16_to_cpu(pd->used_pdes);
- if (ent >= be16_to_cpu(ddf->phys->max_pdes))
- return;
- if (be16_and(pd->entries[0].state, cpu_to_be16(DDF_Missing))) {
- struct dl **dlp;
- /* removing this disk. */
- be16_set(ddf->phys->entries[ent].state,
- cpu_to_be16(DDF_Missing));
- for (dlp = &ddf->dlist; *dlp; dlp = &(*dlp)->next) {
- struct dl *dl = *dlp;
- if (dl->pdnum == (signed)ent) {
- close(dl->fd);
- dl->fd = -1;
- /* FIXME this doesn't free
- * dl->devname */
- update->space = dl;
- *dlp = dl->next;
- break;
- }
- }
- ddf_set_updates_pending(ddf);
- return;
- }
- if (!all_ff(ddf->phys->entries[ent].guid))
- return;
- ddf->phys->entries[ent] = pd->entries[0];
- ddf->phys->used_pdes = cpu_to_be16
- (1 + be16_to_cpu(ddf->phys->used_pdes));
- ddf_set_updates_pending(ddf);
- if (ddf->add_list) {
- struct active_array *a;
- struct dl *al = ddf->add_list;
- ddf->add_list = al->next;
-
- al->next = ddf->dlist;
- ddf->dlist = al;
-
- /* As a device has been added, we should check
- * for any degraded devices that might make
- * use of this spare */
- for (a = st->arrays ; a; a=a->next)
- a->check_degraded = 1;
- }
+ ddf_process_phys_update(st, update);
} else if (be32_eq(*magic, DDF_VIRT_RECORDS_MAGIC)) {
-
- if (update->len != (sizeof(struct virtual_disk) +
+ if (update->len == (sizeof(struct virtual_disk) +
sizeof(struct virtual_entry)))
- return;
- vd = (struct virtual_disk*)update->buf;
-
- if (vd->entries[0].state == DDF_state_deleted) {
- if (_kill_subarray_ddf(ddf, vd->entries[0].guid))
- return;
- } else {
-
- ent = find_vde_by_guid(ddf, vd->entries[0].guid);
- if (ent != DDF_NOTFOUND) {
- dprintf("%s: VD %s exists already in slot %d\n",
- __func__, guid_str(vd->entries[0].guid),
- ent);
- return;
- }
- ent = find_unused_vde(ddf);
- if (ent == DDF_NOTFOUND)
- return;
- ddf->virt->entries[ent] = vd->entries[0];
- ddf->virt->populated_vdes =
- cpu_to_be16(
- 1 + be16_to_cpu(
- ddf->virt->populated_vdes));
- dprintf("%s: added VD %s in slot %d(s=%02x i=%02x)\n",
- __func__, guid_str(vd->entries[0].guid), ent,
- ddf->virt->entries[ent].state,
- ddf->virt->entries[ent].init_state);
- }
- ddf_set_updates_pending(ddf);
- }
-
- else if (be32_eq(*magic, DDF_VD_CONF_MAGIC)) {
- vc = (struct vd_config*)update->buf;
- len = ddf->conf_rec_len * 512;
- if ((unsigned int)update->len != len * vc->sec_elmnt_count) {
- pr_err("%s: %s: insufficient data (%d) for %u BVDs\n",
- __func__, guid_str(vc->guid), update->len,
- vc->sec_elmnt_count);
- return;
- }
- for (vcl = ddf->conflist; vcl ; vcl = vcl->next)
- if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0)
- break;
- dprintf("%s: conf update for %s (%s)\n", __func__,
- guid_str(vc->guid), (vcl ? "old" : "new"));
- if (vcl) {
- /* An update, just copy the phys_refnum and lba_offset
- * fields
- */
- unsigned int i;
- unsigned int k;
- copy_matching_bvd(ddf, &vcl->conf, update);
- for (k = 0; k < be16_to_cpu(vc->prim_elmnt_count); k++)
- dprintf("BVD %u has %08x at %llu\n", 0,
- be32_to_cpu(vcl->conf.phys_refnum[k]),
- be64_to_cpu(LBA_OFFSET(ddf,
- &vcl->conf)[k]));
- for (i = 1; i < vc->sec_elmnt_count; i++) {
- copy_matching_bvd(ddf, vcl->other_bvds[i-1],
- update);
- for (k = 0; k < be16_to_cpu(
- vc->prim_elmnt_count); k++)
- dprintf("BVD %u has %08x at %llu\n", i,
- be32_to_cpu
- (vcl->other_bvds[i-1]->
- phys_refnum[k]),
- be64_to_cpu
- (LBA_OFFSET
- (ddf,
- vcl->other_bvds[i-1])[k]));
- }
- } else {
- /* A new VD_CONF */
- unsigned int i;
- if (!update->space)
- return;
- vcl = update->space;
- update->space = NULL;
- vcl->next = ddf->conflist;
- memcpy(&vcl->conf, vc, len);
- ent = find_vde_by_guid(ddf, vc->guid);
- if (ent == DDF_NOTFOUND)
- return;
- vcl->vcnum = ent;
- ddf->conflist = vcl;
- for (i = 1; i < vc->sec_elmnt_count; i++)
- memcpy(vcl->other_bvds[i-1],
- update->buf + len * i, len);
- }
- /* Set DDF_Transition on all Failed devices - to help
- * us detect those that are no longer in use
- */
- for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->used_pdes);
- pdnum++)
- if (be16_and(ddf->phys->entries[pdnum].state,
- cpu_to_be16(DDF_Failed)))
- be16_set(ddf->phys->entries[pdnum].state,
- cpu_to_be16(DDF_Transition));
- /* Now make sure vlist is correct for each dl. */
- for (dl = ddf->dlist; dl; dl = dl->next) {
- unsigned int vn = 0;
- int in_degraded = 0;
- for (vcl = ddf->conflist; vcl ; vcl = vcl->next) {
- unsigned int dn, ibvd;
- const struct vd_config *conf;
- int vstate;
- dn = get_pd_index_from_refnum(vcl,
- dl->disk.refnum,
- ddf->mppe,
- &conf, &ibvd);
- if (dn == DDF_NOTFOUND)
- continue;
- dprintf("dev %d/%08x has %s (sec=%u) at %d\n",
- dl->pdnum,
- be32_to_cpu(dl->disk.refnum),
- guid_str(conf->guid),
- conf->sec_elmnt_seq, vn);
- /* Clear the Transition flag */
- if (be16_and
- (ddf->phys->entries[dl->pdnum].state,
- cpu_to_be16(DDF_Failed)))
- be16_clear(ddf->phys
- ->entries[dl->pdnum].state,
- cpu_to_be16(DDF_Transition));
- dl->vlist[vn++] = vcl;
- vstate = ddf->virt->entries[vcl->vcnum].state
- & DDF_state_mask;
- if (vstate == DDF_state_degraded ||
- vstate == DDF_state_part_optimal)
- in_degraded = 1;
- }
- while (vn < ddf->max_part)
- dl->vlist[vn++] = NULL;
- if (dl->vlist[0]) {
- be16_clear(ddf->phys->entries[dl->pdnum].type,
- cpu_to_be16(DDF_Global_Spare));
- if (!be16_and(ddf->phys
- ->entries[dl->pdnum].type,
- cpu_to_be16(DDF_Active_in_VD))) {
- be16_set(ddf->phys
- ->entries[dl->pdnum].type,
- cpu_to_be16(DDF_Active_in_VD));
- if (in_degraded)
- be16_set(ddf->phys
- ->entries[dl->pdnum]
- .state,
- cpu_to_be16
- (DDF_Rebuilding));
- }
- }
- if (dl->spare) {
- be16_clear(ddf->phys->entries[dl->pdnum].type,
- cpu_to_be16(DDF_Global_Spare));
- be16_set(ddf->phys->entries[dl->pdnum].type,
- cpu_to_be16(DDF_Spare));
- }
- if (!dl->vlist[0] && !dl->spare) {
- be16_set(ddf->phys->entries[dl->pdnum].type,
- cpu_to_be16(DDF_Global_Spare));
- be16_clear(ddf->phys->entries[dl->pdnum].type,
- cpu_to_be16(DDF_Spare));
- be16_clear(ddf->phys->entries[dl->pdnum].type,
- cpu_to_be16(DDF_Active_in_VD));
- }
- }
-
- /* Now remove any 'Failed' devices that are not part
- * of any VD. They will have the Transition flag set.
- * Once done, we need to update all dl->pdnum numbers.
- */
- pd2 = 0;
- for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->used_pdes);
- pdnum++) {
- if (be16_and(ddf->phys->entries[pdnum].state,
- cpu_to_be16(DDF_Failed))
- && be16_and(ddf->phys->entries[pdnum].state,
- cpu_to_be16(DDF_Transition))) {
- /* skip this one unless in dlist*/
- for (dl = ddf->dlist; dl; dl = dl->next)
- if (dl->pdnum == (int)pdnum)
- break;
- if (!dl)
- continue;
- }
- if (pdnum == pd2)
- pd2++;
- else {
- ddf->phys->entries[pd2] =
- ddf->phys->entries[pdnum];
- for (dl = ddf->dlist; dl; dl = dl->next)
- if (dl->pdnum == (int)pdnum)
- dl->pdnum = pd2;
- pd2++;
- }
- }
- ddf->phys->used_pdes = cpu_to_be16(pd2);
- while (pd2 < pdnum) {
- memset(ddf->phys->entries[pd2].guid, 0xff,
- DDF_GUID_LEN);
- pd2++;
- }
-
- ddf_set_updates_pending(ddf);
+ ddf_process_virt_update(st, update);
+ } else if (be32_eq(*magic, DDF_VD_CONF_MAGIC)) {
+ ddf_process_conf_update(st, update);
}
/* case DDF_SPARE_ASSIGN_MAGIC */
}
-static void ddf_prepare_update(struct supertype *st,
- struct metadata_update *update)
+static int ddf_prepare_update(struct supertype *st,
+ struct metadata_update *update)
{
/* This update arrived at managemon.
* We are about to pass it to monitor.
* If a malloc is needed, do it here.
*/
struct ddf_super *ddf = st->sb;
- be32 *magic = (be32 *)update->buf;
+ be32 *magic;
+ if (update->len < 4)
+ return 0;
+ magic = (be32 *)update->buf;
if (be32_eq(*magic, DDF_VD_CONF_MAGIC)) {
struct vcl *vcl;
- struct vd_config *conf = (struct vd_config *) update->buf;
+ struct vd_config *conf;
+ if (update->len < (int)sizeof(*conf))
+ return 0;
+ conf = (struct vd_config *) update->buf;
if (posix_memalign(&update->space, 512,
offsetof(struct vcl, conf)
+ ddf->conf_rec_len * 512) != 0) {
update->space = NULL;
- return;
+ return 0;
}
vcl = update->space;
vcl->conf.sec_elmnt_count = conf->sec_elmnt_count;
if (alloc_other_bvds(ddf, vcl) != 0) {
free(update->space);
update->space = NULL;
+ return 0;
}
}
+ return 1;
}
/*
@@ -4769,7 +4992,7 @@ out:
* arrange for their inclusion.
* We only choose devices which are not already in the array,
* and prefer those with a spare-assignment to this array.
- * otherwise we choose global spares - assuming always that
+ * Otherwise we choose global spares - assuming always that
* there is enough room.
* For each spare that we assign, we return an 'mdinfo' which
* describes the position for the device in the array.
@@ -4849,9 +5072,11 @@ static struct mdinfo *ddf_activate_spare(struct active_array *a,
struct mdinfo *d2;
int is_global = 0;
int is_dedicated = 0;
- struct extent *ex;
- unsigned int j;
- be16 state = ddf->phys->entries[dl->pdnum].state;
+ be16 state;
+
+ if (dl->pdnum < 0)
+ continue;
+ state = ddf->phys->entries[dl->pdnum].state;
if (be16_and(state,
cpu_to_be16(DDF_Failed|DDF_Missing)) ||
!be16_and(state,
@@ -4876,6 +5101,7 @@ static struct mdinfo *ddf_activate_spare(struct active_array *a,
if (dl->spare) {
if (dl->spare->type & DDF_spare_dedicated) {
/* check spare_ents for guid */
+ unsigned int j;
for (j = 0 ;
j < be16_to_cpu
(dl->spare
@@ -4907,23 +5133,9 @@ static struct mdinfo *ddf_activate_spare(struct active_array *a,
/* We are allowed to use this device - is there space?
* We need a->info.component_size sectors */
- ex = get_extents(ddf, dl);
- if (!ex) {
- dprintf("cannot get extents\n");
- continue;
- }
- j = 0; pos = 0;
- esize = 0;
-
- do {
- esize = ex[j].start - pos;
- if (esize >= a->info.component_size)
- break;
- pos = ex[j].start + ex[j].size;
- j++;
- } while (ex[j-1].size);
+ esize = a->info.component_size;
+ pos = find_space(ddf, dl, INVALID_SECTORS, &esize);
- free(ex);
if (esize < a->info.component_size) {
dprintf("%x:%x has no room: %llu %llu\n",
dl->major, dl->minor,
@@ -4942,7 +5154,6 @@ static struct mdinfo *ddf_activate_spare(struct active_array *a,
di->recovery_start = 0;
di->data_offset = pos;
di->component_size = a->info.component_size;
- di->container_member = dl->pdnum;
di->next = rv;
rv = di;
dprintf("%x:%x (%08x) to be %d at %llu\n",
@@ -5000,7 +5211,7 @@ static struct mdinfo *ddf_activate_spare(struct active_array *a,
if (dl->major == di->disk.major
&& dl->minor == di->disk.minor)
break;
- if (!dl) {
+ if (!dl || dl->pdnum < 0) {
pr_err("%s: BUG: can't find disk %d (%d/%d)\n",
__func__, di->disk.raid_disk,
di->disk.major, di->disk.minor);
diff --git a/super-intel.c b/super-intel.c
index 7b240686..e28ac7d3 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -4422,8 +4422,9 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname)
{
struct intel_super *super;
int rv;
+ int retry;
- if (!st->ignore_hw_compat && test_partition(fd))
+ if (test_partition(fd))
/* IMSM not allowed on partitions */
return 1;
@@ -4444,6 +4445,22 @@ static int load_super_imsm(struct supertype *st, int fd, char *devname)
}
rv = load_and_parse_mpb(fd, super, devname, 0);
+ /* retry the load if we might have raced against mdmon */
+ if (rv == 3) {
+ struct mdstat_ent *mdstat = mdstat_by_component(fd2devnm(fd));
+
+ if (mdstat && mdmon_running(mdstat->devnm) && getpid() != mdmon_pid(mdstat->devnm)) {
+ for (retry = 0; retry < 3; retry++) {
+ usleep(3000);
+ rv = load_and_parse_mpb(fd, super, devname, 0);
+ if (rv != 3)
+ break;
+ }
+ }
+
+ free_mdstat(mdstat);
+ }
+
if (rv) {
if (devname)
pr_err("Failed to load all information "
@@ -5210,6 +5227,8 @@ static int create_array(struct supertype *st, int dev_idx)
int idx = get_imsm_disk_idx(dev, i, MAP_X);
disk = get_imsm_disk(super, idx);
+ if (!disk)
+ disk = get_imsm_missing(super, idx);
serialcpy(inf[i].serial, disk->serial);
}
append_metadata_update(st, u, len);
@@ -8568,7 +8587,7 @@ static void imsm_process_update(struct supertype *st,
}
case update_add_remove_disk: {
/* we may be able to repair some arrays if disks are
- * being added, check teh status of add_remove_disk
+ * being added, check the status of add_remove_disk
* if discs has been added.
*/
if (add_remove_disk_update(super)) {
@@ -8588,8 +8607,8 @@ static void imsm_process_update(struct supertype *st,
static struct mdinfo *get_spares_for_grow(struct supertype *st);
-static void imsm_prepare_update(struct supertype *st,
- struct metadata_update *update)
+static int imsm_prepare_update(struct supertype *st,
+ struct metadata_update *update)
{
/**
* Allocate space to hold new disk entries, raid-device entries or a new
@@ -8598,19 +8617,28 @@ static void imsm_prepare_update(struct supertype *st,
* integrated by the monitor thread without worrying about live pointers
* in the manager thread.
*/
- enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
+ enum imsm_update_type type;
struct intel_super *super = st->sb;
struct imsm_super *mpb = super->anchor;
size_t buf_len;
size_t len = 0;
+ if (update->len < (int)sizeof(type))
+ return 0;
+
+ type = *(enum imsm_update_type *) update->buf;
+
switch (type) {
case update_general_migration_checkpoint:
+ if (update->len < (int)sizeof(struct imsm_update_general_migration_checkpoint))
+ return 0;
dprintf("imsm: prepare_update() "
"for update_general_migration_checkpoint called\n");
break;
case update_takeover: {
struct imsm_update_takeover *u = (void *)update->buf;
+ if (update->len < (int)sizeof(*u))
+ return 0;
if (u->direction == R0_TO_R10) {
void **tail = (void **)&update->space_list;
struct imsm_dev *dev = get_imsm_dev(super, u->subarray);
@@ -8651,6 +8679,9 @@ static void imsm_prepare_update(struct supertype *st,
struct intel_dev *dl;
void **space_tail = (void**)&update->space_list;
+ if (update->len < (int)sizeof(*u))
+ return 0;
+
dprintf("imsm: imsm_prepare_update() for update_reshape\n");
for (dl = super->devlist; dl; dl = dl->next) {
@@ -8683,6 +8714,9 @@ static void imsm_prepare_update(struct supertype *st,
void *s;
int current_level = -1;
+ if (update->len < (int)sizeof(*u))
+ return 0;
+
dprintf("imsm: imsm_prepare_update() for update_reshape\n");
/* add space for bigger array in update
@@ -8750,6 +8784,13 @@ static void imsm_prepare_update(struct supertype *st,
break;
}
case update_size_change: {
+ if (update->len < (int)sizeof(struct imsm_update_size_change))
+ return 0;
+ break;
+ }
+ case update_activate_spare: {
+ if (update->len < (int)sizeof(struct imsm_update_activate_spare))
+ return 0;
break;
}
case update_create_array: {
@@ -8762,6 +8803,9 @@ static void imsm_prepare_update(struct supertype *st,
int i;
int activate = 0;
+ if (update->len < (int)sizeof(*u))
+ return 0;
+
inf = get_disk_info(u);
len = sizeof_imsm_dev(dev, 1);
/* allocate a new super->devlist entry */
@@ -8783,9 +8827,22 @@ static void imsm_prepare_update(struct supertype *st,
}
len += activate * sizeof(struct imsm_disk);
break;
- default:
+ }
+ case update_kill_array: {
+ if (update->len < (int)sizeof(struct imsm_update_kill_array))
+ return 0;
break;
}
+ case update_rename_array: {
+ if (update->len < (int)sizeof(struct imsm_update_rename_array))
+ return 0;
+ break;
+ }
+ case update_add_remove_disk:
+ /* no update->len needed */
+ break;
+ default:
+ return 0;
}
/* check if we need a larger metadata buffer */
@@ -8809,6 +8866,7 @@ static void imsm_prepare_update(struct supertype *st,
else
super->next_buf = NULL;
}
+ return 1;
}
/* must be called while manager is quiesced */
@@ -8990,6 +9048,47 @@ int open_backup_targets(struct mdinfo *info, int raid_disks, int *raid_fds,
return 0;
}
+/*******************************************************************************
+ * Function: validate_container_imsm
+ * Description: This routine validates a container after assembly,
+ * e.g. whether devices in the container are under the same controller.
+ *
+ * Parameters:
+ * info : linked list with info about devices used in array
+ * Returns:
+ * 1 : HBA mismatch
+ * 0 : Success
+ ******************************************************************************/
+int validate_container_imsm(struct mdinfo *info)
+{
+ if (!check_env("IMSM_NO_PLATFORM")) {
+ struct sys_dev *idev;
+ struct mdinfo *dev;
+ char *hba_path = NULL;
+ char *dev_path = devt_to_devpath(makedev(info->disk.major,
+ info->disk.minor));
+
+ for (idev = find_intel_devices(); idev; idev = idev->next) {
+ if (strstr(dev_path, idev->path)) {
+ hba_path = idev->path;
+ break;
+ }
+ }
+ free(dev_path);
+
+ if (hba_path) {
+ for (dev = info->next; dev; dev = dev->next) {
+ if (!devt_attached_to_hba(makedev(dev->disk.major,
+ dev->disk.minor), hba_path)) {
+ pr_err("WARNING - IMSM container assembled with disks under different HBAs!\n"
+ " This operation is not supported and can lead to data loss.\n");
+ return 1;
+ }
+ }
+ }
+ }
+ return 0;
+}
#ifndef MDASSEMBLE
/*******************************************************************************
* Function: init_migr_record_imsm
@@ -9338,7 +9437,7 @@ static const char *imsm_get_disk_controller_domain(const char *path)
char *drv=NULL;
struct stat st;
- strncpy(disk_path, disk_by_path, PATH_MAX - 1);
+ strcpy(disk_path, disk_by_path);
strncat(disk_path, path, PATH_MAX - strlen(disk_path) - 1);
if (stat(disk_path, &st) == 0) {
struct sys_dev* hba;
@@ -9697,8 +9796,8 @@ static void imsm_update_metadata_locally(struct supertype *st,
mu.space = NULL;
mu.space_list = NULL;
mu.next = NULL;
- imsm_prepare_update(st, &mu);
- imsm_process_update(st, &mu);
+ if (imsm_prepare_update(st, &mu))
+ imsm_process_update(st, &mu);
while (mu.space_list) {
void **space = mu.space_list;
@@ -10465,6 +10564,7 @@ abort:
return ret_val;
}
+
#endif /* MDASSEMBLE */
struct superswitch super_imsm = {
@@ -10508,6 +10608,7 @@ struct superswitch super_imsm = {
.free_super = free_super_imsm,
.match_metadata_desc = match_metadata_desc_imsm,
.container_content = container_content_imsm,
+ .validate_container = validate_container_imsm,
.external = 1,
.name = "imsm",
diff --git a/super0.c b/super0.c
index 3218377f..1c203927 100644
--- a/super0.c
+++ b/super0.c
@@ -805,9 +805,8 @@ static int add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo,
dk->major = dinfo->major;
dk->minor = dinfo->minor;
dk->raid_disk = dinfo->raid_disk;
- dk->state = dinfo->state;
- /* In case our source disk was writemostly, don't copy that bit */
- dk->state &= ~(1<<MD_DISK_WRITEMOSTLY);
+ dk->state = dinfo->state & ((1<<MD_DISK_ACTIVE) |
+ (1<<MD_DISK_SYNC));
sb->this_disk = sb->disks[dinfo->number];
sb->sb_csum = calc_sb0_csum(sb);
@@ -887,7 +886,7 @@ static int write_init_super0(struct supertype *st)
for (di = st->info ; di && ! rv ; di = di->next) {
- if (di->disk.state == 1)
+ if (di->disk.state & (1 << MD_DISK_FAULTY))
continue;
if (di->fd == -1)
continue;
diff --git a/super1.c b/super1.c
index 1bc5216f..727a08a1 100644
--- a/super1.c
+++ b/super1.c
@@ -22,6 +22,7 @@
* Email: <neilb@suse.de>
*/
+#include <stddef.h>
#include "mdadm.h"
/*
* The version-1 superblock :
@@ -133,9 +134,6 @@ struct misc_dev_info {
|MD_FEATURE_NEW_OFFSET \
)
-#ifndef offsetof
-#define offsetof(t,f) ((size_t)&(((t*)0)->f))
-#endif
static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
unsigned int disk_csum, csum;
@@ -1559,7 +1557,7 @@ static int write_init_super1(struct supertype *st)
unsigned long long data_offset;
for (di = st->info; di; di = di->next) {
- if (di->disk.state == 1)
+ if (di->disk.state & (1 << MD_DISK_FAULTY))
continue;
if (di->fd < 0)
continue;
@@ -1690,6 +1688,10 @@ static int write_init_super1(struct supertype *st)
rv = -EINVAL;
goto out;
}
+ if (conf_get_create_info()->bblist == 0) {
+ sb->bblog_size = 0;
+ sb->bblog_offset = 0;
+ }
sb->sb_csum = calc_sb_1_csum(sb);
rv = store_super1(st, di->fd);
@@ -2046,8 +2048,8 @@ add_internal_bitmap1(struct supertype *st,
* been left.
*/
offset = 0;
- room = choose_bm_space(__le64_to_cpu(sb->size));
bbl_size = 8;
+ room = choose_bm_space(__le64_to_cpu(sb->size)) + bbl_size;
} else {
room = __le64_to_cpu(sb->super_offset)
- __le64_to_cpu(sb->data_offset)
@@ -2073,8 +2075,8 @@ add_internal_bitmap1(struct supertype *st,
case 2: /* between superblock and data */
if (creating) {
offset = 4*2;
- room = choose_bm_space(__le64_to_cpu(sb->size));
bbl_size = 8;
+ room = choose_bm_space(__le64_to_cpu(sb->size)) + bbl_size;
} else {
room = __le64_to_cpu(sb->data_offset)
- __le64_to_cpu(sb->super_offset);
@@ -2103,6 +2105,10 @@ add_internal_bitmap1(struct supertype *st,
/* Limit to 128K of bitmap when chunk size not requested */
room = 128*2;
+ if (room <= 1)
+ /* No room for a bitmap */
+ return 0;
+
max_bits = (room * 512 - sizeof(bitmap_super_t)) * 8;
min_chunk = 4096; /* sub-page chunks don't work yet.. */
diff --git a/systemd/SUSE-mdadm_env.sh b/systemd/SUSE-mdadm_env.sh
new file mode 100644
index 00000000..10b2e749
--- /dev/null
+++ b/systemd/SUSE-mdadm_env.sh
@@ -0,0 +1,45 @@
+#!/bin/sh
+
+# extract configuration from /etc/sysconfig/mdadm and write
+# environment to /run/sysconfig/mdadm to be used by
+# systemd unit files.
+
+MDADM_SCAN="yes"
+
+# Following adapted from /etc/init.d/mdadmd on openSUSE
+
+mdadmd_CONFIG=/etc/sysconfig/mdadm
+if test -r $mdadmd_CONFIG; then
+ . $mdadmd_CONFIG
+fi
+
+if [ x$MDADM_DELAY != x"" ]; then
+ MDADM_DELAY="-d "$MDADM_DELAY;
+fi
+
+if [ x$MDADM_MAIL != x"" ]; then
+ MDADM_MAIL="-m \"$MDADM_MAIL\""
+fi
+
+if [ x$MDADM_PROGRAM != x"" ]; then
+ MDADM_PROGRAM="-p \"$MDADM_PROGRAM\""
+fi
+
+if [ x$MDADM_SCAN = x"yes" ]; then
+ MDADM_SCAN="--scan"
+else
+ MDADM_SCAN=""
+fi
+
+if [ x$MDADM_SEND_MAIL_ON_START = x"yes" ]; then
+ MDADM_SEND_MAIL="-t"
+else
+ MDADM_SEND_MAIL=""
+fi
+
+if [ x$MDADM_CONFIG != x"" ]; then
+ MDADM_CONFIG="-c \"$MDADM_CONFIG\""
+fi
+
+mkdir -p /run/sysconfig
+echo "MDADM_MONITOR_ARGS=$MDADM_RAIDDEVICES $MDADM_DELAY $MDADM_MAIL $MDADM_PROGRAM $MDADM_SCAN $MDADM_SEND_MAIL $MDADM_CONFIG" > /run/sysconfig/mdadm
diff --git a/systemd/mdadm-grow-continue@.service b/systemd/mdadm-grow-continue@.service
new file mode 100644
index 00000000..5c667d2a
--- /dev/null
+++ b/systemd/mdadm-grow-continue@.service
@@ -0,0 +1,17 @@
+# This file is part of mdadm.
+#
+# mdadm is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=Manage MD Reshape on /dev/%I
+DefaultDependencies=no
+
+[Service]
+ExecStart=BINDIR/mdadm --grow --continue /dev/%I
+StandardInput=null
+StandardOutput=null
+StandardError=null
+KillMode=none
diff --git a/systemd/mdadm-last-resort@.service b/systemd/mdadm-last-resort@.service
new file mode 100644
index 00000000..5179f194
--- /dev/null
+++ b/systemd/mdadm-last-resort@.service
@@ -0,0 +1,7 @@
+[Unit]
+Description=Activate md array even though degraded
+DefaultDependencies=no
+
+[Service]
+Type=oneshot
+ExecStart=BINDIR/mdadm --run /dev/%i
diff --git a/systemd/mdadm-last-resort@.timer b/systemd/mdadm-last-resort@.timer
new file mode 100644
index 00000000..52b3f227
--- /dev/null
+++ b/systemd/mdadm-last-resort@.timer
@@ -0,0 +1,7 @@
+[Unit]
+Description=Timer to wait for more drives before activating degraded array.
+DefaultDependencies=no
+Conflicts=sys-devices-virtual-block-%i.device
+
+[Timer]
+OnActiveSec=30
diff --git a/systemd/mdadm.shutdown b/systemd/mdadm.shutdown
new file mode 100644
index 00000000..33f27783
--- /dev/null
+++ b/systemd/mdadm.shutdown
@@ -0,0 +1,4 @@
+#!/bin/sh
+# We need to ensure all md arrays with external metadata
+# (e.g. IMSM, DDF) are clean before completing the shutdown.
+BINDIR/mdadm --wait-clean --scan
diff --git a/systemd/mdmon@.service b/systemd/mdmon@.service
index 809f5273..85a3a7c5 100644
--- a/systemd/mdmon@.service
+++ b/systemd/mdmon@.service
@@ -11,8 +11,18 @@ DefaultDependencies=no
Before=initrd-switch-root.target
[Service]
-ExecStart=/sbin/mdmon --foreground %I
-StandardInput=null
-StandardOutput=null
-StandardError=null
+# mdmon should never complain due to lack of a platform,
+# that is mdadm's job if at all.
+Environment=IMSM_NO_PLATFORM=1
+# The mdmon starting in the initramfs (with dracut at least)
+# cannot see sysfs after root is mounted, so we will have to
+# 'takeover'. As the '--offroot --takeover' don't hurt when
+# not necessary, and are useful with root-on-md in dracut,
+# have them always present.
+ExecStart=BINDIR/mdmon --offroot --takeover %I
+Type=forking
+# Don't set the PIDFile. It isn't necessary (systemd can work
+# it out) and systemd will remove it when transitioning from
+# initramfs to rootfs.
+#PIDFile=/run/mdadm/%I.pid
KillMode=none
diff --git a/systemd/mdmonitor.service b/systemd/mdmonitor.service
new file mode 100644
index 00000000..c7cff3e4
--- /dev/null
+++ b/systemd/mdmonitor.service
@@ -0,0 +1,16 @@
+# This file is part of mdadm.
+#
+# mdadm is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=MD array monitor
+DefaultDependencies=no
+
+[Service]
+Environment= MDADM_MONITOR_ARGS=--scan
+EnvironmentFile=-/run/sysconfig/mdadm
+ExecStartPre=-/usr/lib/systemd/scripts/mdadm_env.sh
+ExecStart=BINDIR/mdadm --monitor $MDADM_MONITOR_ARGS
diff --git a/tests/03r5assem-failed b/tests/03r5assem-failed
new file mode 100644
index 00000000..d38241df
--- /dev/null
+++ b/tests/03r5assem-failed
@@ -0,0 +1,12 @@
+
+# Create an array, fail one device while array is active, stop array,
+# then re-assemble listing the failed device first.
+
+mdadm -CR $md1 -l5 -n4 $dev0 $dev1 $dev2 $dev3
+check wait
+
+echo 2000 > /sys/block/md1/md/safe_mode_delay
+mkfs $md1
+mdadm $md1 -f $dev0
+mdadm -S $md1
+mdadm -A $md1 $dev0 $dev1 $dev2 $dev3 || exit 1
diff --git a/tests/10ddf-assemble-missing b/tests/10ddf-assemble-missing
new file mode 100644
index 00000000..17f17244
--- /dev/null
+++ b/tests/10ddf-assemble-missing
@@ -0,0 +1,61 @@
+# An array is assembled incompletely.
+# The missing disks get marked as missing and are not allowed back in
+
+. tests/env-ddf-template
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+rm -f $tmp /var/tmp/mdmon.log
+ret=0
+
+mdadm -CR $container -e ddf -n 4 $dev8 $dev9 $dev10 $dev11
+ddf_check container 4
+
+mdadm -CR $member1 -n 4 -l 10 $dev8 $dev10 $dev9 $dev11 -z 10000
+mdadm -CR $member0 -n 2 -l 1 $dev8 $dev9 -z 10000
+
+mdadm --wait $member0
+mdadm --wait $member1
+
+mdadm -Ss
+sleep 1
+
+# Add all devices except those for $member0
+mdadm -I $dev10
+mdadm -I $dev11
+
+# Start runnable members
+mdadm -IRs
+mdadm -Ss
+
+#[ -f /var/tmp/mdmon.log ] && cat /var/tmp/mdmon.log
+
+# Now reassemble
+# This should work because BVDs weren't written to
+for d in $dev8 $dev9 $dev10 $dev11; do
+ mdadm -I $d
+done
+mdadm -Ss
+
+# Expect consistent state
+for d in $dev10 $dev11; do
+ mdadm -E $d>$tmp
+ egrep 'state\[0\] : Degraded, Consistent' $tmp || {
+ ret=1
+ echo ERROR: $member0 has unexpected state on $d
+ }
+ egrep 'state\[1\] : Optimal, Consistent' $tmp || {
+ ret=1
+ echo ERROR: $member1 has unexpected state on $d
+ }
+
+ if [ x$(egrep -c 'active/Online$' $tmp) != x2 ]; then
+ ret=1
+ echo ERROR: unexpected number of online disks on $d
+ fi
+done
+
+if [ $ret -ne 0 ]; then
+ mdadm -E $dev10
+ mdadm -E $dev8
+fi
+rm -f $tmp /var/tmp/mdmon.log
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-create b/tests/10ddf-create
index 50c85ae9..2f7747cd 100644
--- a/tests/10ddf-create
+++ b/tests/10ddf-create
@@ -9,6 +9,8 @@
# add some data, tear down the array, reassemble
# and make sure it is still there.
set -e
+. tests/env-ddf-template
+sda=$(get_rootdev) || exit 1
mdadm -CR /dev/md/ddf0 -e ddf -n 5 $dev8 $dev9 $dev10 $dev11 $dev12
mdadm -CR r5 -l5 -n5 /dev/md/ddf0 -z 5000
@@ -23,10 +25,10 @@ testdev /dev/md/r10 2 5000 512
# r0/r10 will use 4608 due to chunk size, so that leaves 23552 for the rest
testdev /dev/md/r1 1 23552 64
testdev /dev/md/r0 3 23552 512
-dd if=/dev/sda of=/dev/md/r0 || true
-dd if=/dev/sda of=/dev/md/r10 || true
-dd if=/dev/sda of=/dev/md/r1 || true
-dd if=/dev/sda of=/dev/md/r5 || true
+dd if=$sda of=/dev/md/r0 || true
+dd if=$sda of=/dev/md/r10 || true
+dd if=$sda of=/dev/md/r1 || true
+dd if=$sda of=/dev/md/r5 || true
s0=`sha1sum /dev/md/r0`
s10=`sha1sum /dev/md/r10`
diff --git a/tests/10ddf-fail-readd b/tests/10ddf-fail-readd
new file mode 100644
index 00000000..746f049e
--- /dev/null
+++ b/tests/10ddf-fail-readd
@@ -0,0 +1,55 @@
+# Simple fail / re-add test
+. tests/env-ddf-template
+
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+rm -f $tmp
+
+mdadm --zero-superblock $dev8 $dev9
+mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9
+
+mdadm -CR $member0 -l raid1 -n 2 $container
+#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1
+
+mke2fs $member0
+check wait
+
+set -- $(get_raiddisks $member0)
+fail0=$1
+mdadm $member0 --fail $fail0
+
+sleep 1
+mdadm $container --remove $fail0
+
+set -- $(get_raiddisks $member0)
+case $1 in MISSING) shift;; esac
+good0=$1
+
+# We re-add the disk now
+mdadm $container --add $fail0
+
+sleep 1
+mdadm --wait $member0
+
+ret=0
+set -- $(get_raiddisks $member0)
+case $1:$2 in
+ $dev8:$dev9|$dev9:$dev8);;
+ *) echo ERROR: bad raid disks "$@"; ret=1;;
+esac
+
+mdadm -Ss
+for x in $@; do
+ mdadm -E $x >$tmp
+ if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then
+ echo ERROR: member 0 should be optimal in meta data on $x
+ ret=1
+ fi
+done
+
+rm -f $tmp
+if [ $ret -ne 0 ]; then
+ mdadm -E $dev8
+ mdadm -E $dev9
+fi
+
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-fail-readd-readonly b/tests/10ddf-fail-readd-readonly
new file mode 100644
index 00000000..ed24585d
--- /dev/null
+++ b/tests/10ddf-fail-readd-readonly
@@ -0,0 +1,71 @@
+# Simple fail / re-add test
+. tests/env-ddf-template
+
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+rm -f $tmp
+
+mdadm --zero-superblock $dev8 $dev9
+mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9
+
+mdadm -CR $member0 -l raid1 -n 2 $container
+#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1
+
+check wait
+
+set -- $(get_raiddisks $member0)
+fail0=$1
+mdadm $member0 --fail $fail0
+
+sleep 1
+set -- $(get_raiddisks $member0)
+case $1 in MISSING) shift;; esac
+good0=$1
+
+# Check that the meta data now show one disk as failed
+ret=0
+for x in $@; do
+ mdadm -E $x >$tmp
+ if ! grep -q 'state\[0\] : Degraded, Consistent' $tmp; then
+ echo ERROR: member 0 should be degraded in meta data on $x
+ ret=1
+ fi
+ phys=$(grep $x $tmp)
+ case $x:$phys in
+ $fail0:*active/Offline,\ Failed);;
+ $good0:*active/Online);;
+ *) echo ERROR: wrong phys disk state for $x
+ ret=1
+ ;;
+ esac
+done
+
+mdadm $container --remove $fail0
+
+# We re-add the disk now
+mdadm $container --add $fail0
+
+sleep 1
+mdadm --wait $member0
+
+set -- $(get_raiddisks $member0)
+case $1:$2 in
+ $dev8:$dev9|$dev9:$dev8);;
+ *) echo ERROR: bad raid disks "$@"; ret=1;;
+esac
+
+mdadm -Ss
+for x in $@; do
+ mdadm -E $x >$tmp
+ if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then
+ echo ERROR: member 0 should be optimal in meta data on $x
+ ret=1
+ fi
+done
+
+rm -f $tmp
+if [ $ret -ne 0 ]; then
+ mdadm -E $dev8
+ mdadm -E $dev9
+fi
+
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-fail-stop-readd b/tests/10ddf-fail-stop-readd
new file mode 100644
index 00000000..4a0511a1
--- /dev/null
+++ b/tests/10ddf-fail-stop-readd
@@ -0,0 +1,66 @@
+# Simple fail / re-add test
+. tests/env-ddf-template
+
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+rm -f $tmp
+
+mdadm --zero-superblock $dev8 $dev9
+mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9
+
+mdadm -CR $member0 -l raid1 -n 2 $container
+#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1
+
+# Write to the array
+mke2fs $member0
+check wait
+
+set -- $(get_raiddisks $member0)
+fail0=$1
+mdadm $member0 --fail $fail0
+
+sleep 1
+mdadm $container --remove $fail0
+
+set -- $(get_raiddisks $member0)
+case $1 in MISSING) shift;; esac
+good0=$1
+
+mdadm -Ss
+
+sleep 1
+# Now simulate incremental assembly
+mdadm -I $good0
+mdadm -IRs
+
+# Write to the array
+mke2fs $member0
+
+# We re-add the disk now
+mdadm $container --add $fail0
+
+sleep 1
+mdadm --wait $member0
+
+ret=0
+set -- $(get_raiddisks $member0)
+case $1:$2 in
+ $dev8:$dev9|$dev9:$dev8);;
+ *) echo ERROR: bad raid disks "$@"; ret=1;;
+esac
+
+mdadm -Ss
+for x in $@; do
+ mdadm -E $x >$tmp
+ if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then
+ echo ERROR: member 0 should be optimal in meta data on $x
+ ret=1
+ fi
+done
+
+rm -f $tmp
+if [ $ret -ne 0 ]; then
+ mdadm -E $dev8
+ mdadm -E $dev9
+fi
+
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-fail-two-spares b/tests/10ddf-fail-two-spares
index cc2cbb41..fa6e2e8c 100644
--- a/tests/10ddf-fail-two-spares
+++ b/tests/10ddf-fail-two-spares
@@ -1,5 +1,6 @@
# Simulate two disks failing shorty after each other
. tests/env-ddf-template
+sda=$(get_rootdev) || exit 1
tmp=$(mktemp /tmp/mdtest-XXXXXX)
mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13
@@ -13,8 +14,8 @@ mdadm -CR $member0 -l raid6 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384
# >/tmp/mdmon.txt 2>&1
mdadm -CR $member1 -l raid10 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384
-dd if=/dev/sda of=$member0 bs=1M
-dd if=/dev/sda of=$member1 bs=1M skip=16
+dd if=$sda of=$member0 bs=1M
+dd if=$sda of=$member1 bs=1M skip=16
check wait
diff --git a/tests/10ddf-geometry b/tests/10ddf-geometry
index f6b40a90..b0cce2f6 100644
--- a/tests/10ddf-geometry
+++ b/tests/10ddf-geometry
@@ -32,14 +32,14 @@ mdadm -CR l1m -l1 -n3 $dev8 $dev9 $dev10 -z 10000
assert_fail -CR badl1 -l1 -n4 /dev/md/ddf0
# RAID10 geometries
-assert_fail -CR badl10 -l10 -n3 /dev/md/ddf0
-assert_fail -CR badl10 -l10 -n5 /dev/md/ddf0
-assert_fail -CR badl10 -l10 -n4 -pn3 /dev/md/ddf0
-mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000
-mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 5000
+mdadm -CR l10_0 -l10 -n3 /dev/md/ddf0 -z 1000
+mdadm -CR l10_1 -l10 -n5 /dev/md/ddf0 -z 1000
+assert_fail mdadm -CR badl10 -l10 -n4 -pn3 /dev/md/ddf0
+mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 4000
+mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 4000
assert_fail -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000
-assert_kill /dev/md/l10_2 2
+assert_kill /dev/md/l10_2 4
# gone now, must be able to create it again
mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000
@@ -50,12 +50,12 @@ mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13
# Same as above, on inactive container
assert_fail -CR l10_3 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000
# Kill subarray without having started anything (no mdmon)
-mdadm --kill-subarray=3 /dev/md/ddf0
+mdadm --kill-subarray=5 /dev/md/ddf0
mdadm -I /dev/md/ddf0
mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 5000
-assert_kill /dev/md/l10_2 2
-assert_kill /dev/md/l10_3 3
+assert_kill /dev/md/l10_2 4
+assert_kill /dev/md/l10_3 5
# RAID5 geometries
mdadm -CR l5la -l5 -n3 --layout=ddf-N-restart /dev/md/ddf0 -z 5000
@@ -68,9 +68,9 @@ mdadm -Ss
mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13
mdadm -I /dev/md/ddf0
-assert_kill /dev/md/l5la 2
-assert_kill /dev/md/l5ls 4
-assert_kill /dev/md/l5ra 3
+assert_kill /dev/md/l5la 4
+assert_kill /dev/md/l5ls 6
+assert_kill /dev/md/l5ra 5
# RAID6 geometries
assert_fail -CR l6la -l6 -n3 -pla /dev/md/ddf0 -z 5000
diff --git a/tests/10ddf-incremental-wrong-order b/tests/10ddf-incremental-wrong-order
new file mode 100644
index 00000000..2324f1d7
--- /dev/null
+++ b/tests/10ddf-incremental-wrong-order
@@ -0,0 +1,131 @@
+# An array is assembled incompletely. Some disks will
+# have later metadata than others.
+# The array is then reassembled in the "wrong" order -
+# older meta data first.
+# This FAILS with mdadm 3.3
+. tests/env-ddf-template
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+rm -f $tmp /var/tmp/mdmon.log
+ret=0
+
+mdadm -CR $container -e ddf -n 4 $dev8 $dev9 $dev10 $dev11
+ddf_check container 4
+
+mdadm -CR $member1 -n 4 -l 10 $dev8 $dev10 $dev9 $dev11 -z 10000
+mdadm -CR $member0 -n 2 -l 1 $dev8 $dev9 -z 10000
+
+mdadm --wait $member0
+mdadm --wait $member1
+
+mke2fs $member0
+mke2fs $member1
+sha_0a=$(sha1_sum $member0)
+sha_1a=$(sha1_sum $member1)
+
+mdadm -Ss
+sleep 1
+
+# Add all devices except those for $member0
+mdadm -I $dev10
+mdadm -I $dev11
+
+# Start runnable members ($member1) and write
+mdadm -IRs
+e2fsck -fy $member1
+sha_1b=$(sha1_sum $member1)
+
+mdadm -Ss
+sleep 1
+
+# Seq number should be different now
+seq8a=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p')
+seq10a=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p')
+
+if [ $seq8a -ge $seq10a ]; then
+ ret=1
+ echo ERROR: sequential number of $dev10 not bigger than $dev8
+fi
+if [ x$sha_1a = x$sha_1b ]; then
+ ret=1
+ echo ERROR: sha1sums equal after write
+fi
+
+#[ -f /var/tmp/mdmon.log ] && cat /var/tmp/mdmon.log
+
+# Now reassemble
+# Note that we add the previously missing disks first.
+# $dev10 should have a higher seq number than $dev8
+for d in $dev8 $dev9 $dev10 $dev11; do
+ mdadm -I $d
+done
+
+mdadm -IRs
+sha_0c=$(sha1_sum $member0)
+sha_1c=$(sha1_sum $member1)
+
+mdadm -Ss
+sleep 1
+
+seq8c=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p')
+seq10c=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p')
+
+if [ x$sha_0a != x$sha_0c ]; then
+ ret=1
+ echo ERROR: sha1sum of $member0 has changed
+fi
+if [ x$sha_1b != x$sha_1c ]; then
+ ret=1
+ echo ERROR: sha1sum of $member1 has changed
+fi
+if [ \( $seq10a -ge $seq10c \) -o \( $seq8c -ne $seq10c \) ]; then
+ ret=1
+ echo ERROR: sequential numbers are wrong
+fi
+
+# Expect consistent state
+for d in $dev10 $dev8; do
+ mdadm -E $d>$tmp
+ for x in 0 1; do
+ egrep 'state\['$x'\] : Optimal, Consistent' $tmp || {
+ ret=1
+ echo ERROR: $member0 has unexpected state on $d
+ }
+ done
+ if [ x$(egrep -c 'active/Online$' $tmp) != x4 ]; then
+ ret=1
+ echo ERROR: unexpected number of online disks on $d
+ fi
+done
+
+# Now try assembly
+if mdadm -A $container $dev8 $dev9 $dev10 $dev11; then
+ mdadm -IR $container
+ sha_0d=$(sha1_sum $member0)
+ sha_1d=$(sha1_sum $member1)
+ mdadm -Ss
+ sleep 1
+ seq8d=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p')
+ seq10d=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p')
+ if [ x$sha_0a != x$sha_0d ]; then
+ ret=1
+ echo ERROR: sha1sum of $member0 has changed
+ fi
+ if [ x$sha_1b != x$sha_1d ]; then
+ ret=1
+ echo ERROR: sha1sum of $member1 has changed
+ fi
+ if [ \( $seq10a -ge $seq10d \) -o \( $seq8d -ne $seq10d \) ]; then
+ ret=1
+ echo ERROR: sequential numbers are wrong
+ fi
+else
+ ret=1
+ echo ERROR: assembly failed
+fi
+
+if [ $ret -ne 0 ]; then
+ mdadm -E $dev10
+ mdadm -E $dev8
+fi
+rm -f $tmp /var/tmp/mdmon.log
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-sudden-degraded b/tests/10ddf-sudden-degraded
new file mode 100644
index 00000000..1eab361a
--- /dev/null
+++ b/tests/10ddf-sudden-degraded
@@ -0,0 +1,18 @@
+#
+# An array is assembled with one device missing.
+# The other device must be marked as Failed in metadata
+
+. tests/env-ddf-template
+
+mdadm -CR $container -e ddf -n 2 $dev8 $dev9
+ddf_check container 2
+
+mdadm -CR $member1 -n 2 -l1 $dev8 $dev9
+mdadm --wait $member1
+mdadm -Ss
+
+mdadm -I $dev8
+mdadm -R $container
+mkfs $member1
+# There must be a missing device recorded
+mdadm --examine $dev8 | grep 'Raid Devices.*--' || exit 1
diff --git a/tests/env-ddf-template b/tests/env-ddf-template
index 1c1ca124..90d7272f 100644
--- a/tests/env-ddf-template
+++ b/tests/env-ddf-template
@@ -1,3 +1,16 @@
+sha1_sum() {
+ sha1sum "$1" | cut -c 1-40
+}
+
+get_rootdev() {
+ local dev=$(stat -c %D /)
+ local maj=$(expr $dev : '\(..*\)..')
+ local min=${dev#$maj}
+ local bd=/dev/$(basename $(readlink /sys/dev/block/$((0x$maj)):$((0x$min))))
+ [ -b $bd ] || exit 1
+ echo $bd
+}
+
get_sysdir() {
local mddev=$1
[ -L $mddev ] && mddev=$(readlink -f $mddev)
@@ -89,9 +102,12 @@ ddf_check() {
esac
}
-container=/dev/md/ddf
+container=/dev/md/ddf0
member0=/dev/md/vol0
member1=/dev/md/vol1
member2=/dev/md/vol2
member3=/dev/md/vol3
member4=/dev/md/vol4
+
+# We don't want systemd to start system mdmon; start our own
+export MDADM_NO_SYSTEMCTL=1
diff --git a/udev-md-raid-arrays.rules b/udev-md-raid-arrays.rules
index 0540ed80..c95ec7b1 100644
--- a/udev-md-raid-arrays.rules
+++ b/udev-md-raid-arrays.rules
@@ -17,7 +17,7 @@ TEST!="md/array_state", ENV{SYSTEMD_READY}="0", GOTO="md_end"
ATTR{md/array_state}=="|clear|inactive", ENV{SYSTEMD_READY}="0", GOTO="md_end"
LABEL="md_ignore_state"
-IMPORT{program}="/sbin/mdadm --detail --export $devnode"
+IMPORT{program}="BINDIR/mdadm --detail --export $devnode"
ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace"
ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}"
ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}"
@@ -32,4 +32,10 @@ OPTIONS+="watch"
ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}"
ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}"
+ENV{MD_LEVEL}=="raid[1-9]*", ENV{SYSTEMD_WANTS}+="mdmonitor.service"
+
+# Tell systemd to run mdmon for our container, if we need it.
+ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/usr/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c"
+ENV{MD_MON_THIS}=="?*", PROGRAM="/usr/bin/basename $env{MD_MON_THIS}", ENV{SYSTEMD_WANTS}+="mdmon@%c.service"
+
LABEL="md_end"
diff --git a/udev-md-raid-assembly.rules b/udev-md-raid-assembly.rules
index b6532658..d0d440a6 100644
--- a/udev-md-raid-assembly.rules
+++ b/udev-md-raid-assembly.rules
@@ -1,19 +1,35 @@
# do not edit this file, it will be overwritten on update
+# Don't process any events if anaconda is running as anaconda brings up
+# raid devices manually
+ENV{ANACONDA}=="?*", GOTO="md_inc_end"
# assemble md arrays
SUBSYSTEM!="block", GOTO="md_inc_end"
# handle potential components of arrays (the ones supported by md)
-ENV{ID_FS_TYPE}=="ddf_raid_member|isw_raid_member|linux_raid_member", GOTO="md_inc"
+ENV{ID_FS_TYPE}=="linux_raid_member", GOTO="md_inc"
+
+# "noiswmd" on kernel command line stops mdadm from handling
+# "isw" (aka IMSM - Intel RAID).
+# "nodmraid" on kernel command line stops mdadm from handling
+# "isw" or "ddf".
+IMPORT{cmdline}="noiswmd"
+IMPORT{cmdline}="nodmraid"
+
+ENV{nodmraid}=="?*", GOTO="md_inc_end"
+ENV{ID_FS_TYPE}=="ddf_raid_member", GOTO="md_inc"
+ENV{noiswmd}=="?*", GOTO="md_inc_end"
+ENV{ID_FS_TYPE}=="isw_raid_member", GOTO="md_inc"
GOTO="md_inc_end"
LABEL="md_inc"
# remember you can limit what gets auto/incrementally assembled by
# mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY'
-ACTION=="add", RUN+="/sbin/mdadm --incremental $devnode --offroot"
-ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="/sbin/mdadm -If $name --path $env{ID_PATH}"
-ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="/sbin/mdadm -If $name"
+ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $devnode --offroot ${DEVLINKS}"
+ACTION=="add|change", ENV{MD_STARTED}=="*unsafe*", ENV{MD_FOREIGN}=="no", ENV{SYSTEMD_WANTS}+="mdadm-last-resort@$env{MD_DEVICE}.timer"
+ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="BINDIR/mdadm -If $name --path $env{ID_PATH}"
+ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="BINDIR/mdadm -If $name"
LABEL="md_inc_end"
diff --git a/util.c b/util.c
index 5f95f1f9..37c6e0d3 100644
--- a/util.c
+++ b/util.c
@@ -307,7 +307,7 @@ int test_partition(int fd)
if (ioctl(fd, BLKPG, &a) == 0)
/* Very unlikely, but not a partition */
return 0;
- if (errno == ENXIO)
+ if (errno == ENXIO || errno == ENOTTY)
/* not a partition */
return 0;
@@ -387,7 +387,6 @@ int enough_fd(int fd)
{
struct mdu_array_info_s array;
struct mdu_disk_info_s disk;
- int avail_disks = 0;
int i, rv;
char *avail;
@@ -407,7 +406,6 @@ int enough_fd(int fd)
continue;
if (disk.raid_disk < 0 || disk.raid_disk >= array.raid_disks)
continue;
- avail_disks++;
avail[disk.raid_disk] = 1;
}
/* This is used on an active array, so assume it is clean */
@@ -873,12 +871,20 @@ void put_md_name(char *name)
}
#endif /* !defined(MDASSEMBLE) || defined(MDASSEMBLE) && defined(MDASSEMBLE_AUTO) */
+int get_maj_min(char *dev, int *major, int *minor) /* parse "MAJOR:MINOR" from dev; returns 1 on a full match, else 0 */
+{
+ char *e; /* end-of-number cursor set by strtoul() */
+ *major = strtoul(dev, &e, 0); /* base 0: decimal, 0x-hex or 0-octal; *major is written even when the match later fails */
+ return (e > dev && *e == ':' && e[1] && /* something was consumed, then ':' and at least one more character */
+ (*minor = strtoul(e+1, &e, 0)) >= 0 && /* *minor is only written once the checks above passed; a result that is negative as int is rejected */
+ *e == 0); /* nothing may follow the minor number */
+}
+
int dev_open(char *dev, int flags)
{
/* like 'open', but if 'dev' matches %d:%d, create a temp
* block device and open that
*/
- char *e;
int fd = -1;
char devname[32];
int major;
@@ -887,10 +893,7 @@ int dev_open(char *dev, int flags)
if (!dev) return -1;
flags |= O_DIRECT;
- major = strtoul(dev, &e, 0);
- if (e > dev && *e == ':' && e[1] &&
- (minor = strtoul(e+1, &e, 0)) >= 0 &&
- *e == 0) {
+ if (get_maj_min(dev, &major, &minor)) {
snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d",
(int)getpid(), major, minor);
if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) {
@@ -1690,7 +1693,7 @@ int start_mdmon(char *devnm)
char pathbuf[1024];
char *paths[4] = {
pathbuf,
- "/sbin/mdmon",
+ BINDIR "/mdmon",
"./mdmon",
NULL
};
@@ -1948,7 +1951,25 @@ int in_initrd(void)
{
/* This is based on similar function in systemd. */
struct statfs s;
+ /* statfs.f_type is signed long on s390x and MIPS, causing all
+ sorts of sign extension problems with RAMFS_MAGIC being
+ defined as 0x858458f6 */
return statfs("/", &s) >= 0 &&
((unsigned long)s.f_type == TMPFS_MAGIC ||
- (unsigned long)s.f_type == RAMFS_MAGIC);
+ ((unsigned long)s.f_type & 0xFFFFFFFFUL) ==
+ ((unsigned long)RAMFS_MAGIC & 0xFFFFFFFFUL));
+}
+
+void reopen_mddev(int mdfd)
+{
+	/* Re-open the md device without any O_EXCL, but keep the same fd
+	 * number.  If the re-open fails, mdfd is simply left closed. */
+	char *devnm = fd2devnm(mdfd);	/* look the name up while mdfd is still open */
+	int fd;
+	close(mdfd);
+	fd = open_dev(devnm);
+	if (fd >= 0 && fd != mdfd) {
+		dup2(fd, mdfd);	/* put the new open file back on the old number */
+		close(fd);	/* only the duplicate is kept; don't leak fd */
+	}
}