summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Dimitri John Ledkov <xnox@ubuntu.com> 2016-02-19 16:18:57 +0000
committer Dimitri John Ledkov <xnox@ubuntu.com> 2016-02-19 16:18:57 +0000
commit 7e15cddececa359fa92a1a58e4a0c360e13058c1 (patch)
tree a4ce38859718496770223da4c9fae331a6d39bb5
parent 5df672d08eb77e64a0f5ed922d2904f418d1758e (diff)
New upstream release.
-rw-r--r-- ANNOUNCE-3.4 24
-rw-r--r-- Assemble.c 59
-rw-r--r-- Create.c 35
-rw-r--r-- Detail.c 27
-rwxr-xr-x[-rw-r--r--] Grow.c 36
-rw-r--r-- Incremental.c 32
-rw-r--r-- Makefile 37
-rw-r--r-- Manage.c 101
-rw-r--r-- ReadMe.c 14
-rw-r--r-- bitmap.c 63
-rw-r--r-- bitmap.h 8
-rw-r--r-- config.c 30
-rw-r--r-- crc32c.c 104
-rw-r--r-- debian/changelog 6
-rwxr-xr-x inventory 3
-rw-r--r-- mapfile.c 2
-rw-r--r-- md.4 20
-rw-r--r-- md_p.h 71
-rw-r--r-- md_u.h 5
-rw-r--r-- mdadm.8.in 66
-rw-r--r-- mdadm.c 114
-rw-r--r-- mdadm.conf.5 2
-rwxr-xr-x[-rw-r--r--] mdadm.h 123
-rw-r--r-- mdadm.spec 2
-rw-r--r-- mdassemble.8 4
-rw-r--r-- mdmon.8 2
-rw-r--r-- mdmon.c 2
-rw-r--r-- msg.c 2
-rw-r--r-- platform-intel.c 86
-rw-r--r-- platform-intel.h 4
-rw-r--r-- raid6check.c 3
-rw-r--r-- restripe.c 6
-rw-r--r-- sha1.h 8
-rw-r--r-- super-intel.c 138
-rw-r--r-- super0.c 14
-rw-r--r-- super1.c 349
-rw-r--r-- sysfs.c 13
-rw-r--r-- systemd/mdadm-last-resort@.service 1
-rw-r--r-- systemd/mdmonitor.service 5
-rwxr-xr-x test 9
-rw-r--r-- tests/19raid6auto-repair 66
-rw-r--r-- tests/19raid6repair 81
-rw-r--r-- tests/20raid5journal 64
-rw-r--r-- udev-md-raid-arrays.rules 8
-rw-r--r-- udev-md-raid-assembly.rules 5
-rw-r--r-- util.c 227
46 files changed, 1799 insertions, 282 deletions
diff --git a/ANNOUNCE-3.4 b/ANNOUNCE-3.4
new file mode 100644
index 0000000..2689732
--- /dev/null
+++ b/ANNOUNCE-3.4
@@ -0,0 +1,24 @@
+Subject: ANNOUNCE: mdadm 3.4 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.4
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm
+
+The new second-level version number reflects significant new
+functionality, in particular support for journalled RAID5/6 and clustered
+RAID1. This new support is probably still buggy. Please report bugs.
+
+There are also a number of fixes for Intel's IMSM metadata support,
+and an assortment of minor bug fixes.
+
+I plan for this to be the last release of mdadm that I provide as I am
+retiring from MD and mdadm maintenance. Jes Sorensen has volunteered
+to oversee mdadm for the next while. Thanks Jes!
+
+NeilBrown 28th January 2016
diff --git a/Assemble.c b/Assemble.c
index 2925733..d199afc 100644
--- a/Assemble.c
+++ b/Assemble.c
@@ -1,7 +1,7 @@
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com>
*
*
* This program is free software; you can redistribute it and/or modify
@@ -637,6 +637,19 @@ static int load_devices(struct devs *devices, char *devmap,
if (strcmp(c->update, "byteorder") == 0)
err = 0;
+ else if (strcmp(c->update, "home-cluster") == 0) {
+ tst->cluster_name = c->homecluster;
+ err = tst->ss->write_bitmap(tst, dfd, NameUpdate);
+ } else if (strcmp(c->update, "nodes") == 0) {
+ tst->nodes = c->nodes;
+ err = tst->ss->write_bitmap(tst, dfd, NodeNumUpdate);
+ } else if (strcmp(c->update, "revert-reshape") == 0 &&
+ c->invalid_backup)
+ err = tst->ss->update_super(tst, content,
+ "revert-reshape-nobackup",
+ devname, c->verbose,
+ ident->uuid_set,
+ c->homehost);
else
err = tst->ss->update_super(tst, content, c->update,
devname, c->verbose,
@@ -729,7 +742,7 @@ static int load_devices(struct devs *devices, char *devmap,
i = devcnt;
else
i = devices[devcnt].i.disk.raid_disk;
- if (i+1 == 0) {
+ if (i+1 == 0 || i == MD_DISK_ROLE_JOURNAL) {
if (nextspare < content->array.raid_disks*2)
nextspare = content->array.raid_disks*2;
i = nextspare++;
@@ -907,7 +920,6 @@ static int force_array(struct mdinfo *content,
avail[chosen_drive] = 1;
okcnt++;
tst->ss->free_super(tst);
-
/* If there are any other drives of the same vintage,
* add them in as well. We can't lose and we might gain
*/
@@ -938,6 +950,7 @@ static int start_array(int mdfd,
unsigned int okcnt,
unsigned int sparecnt,
unsigned int rebuilding_cnt,
+ unsigned int journalcnt,
struct context *c,
int clean, char *avail,
int start_partial_ok,
@@ -949,6 +962,15 @@ static int start_array(int mdfd,
int i;
unsigned int req_cnt;
+ if (content->journal_device_required && (content->journal_clean == 0)) {
+ if (!c->force) {
+ pr_err("Not safe to assemble with missing or stale journal device, consider --force.\n");
+ return 1;
+ }
+ pr_err("Journal is missing or stale, starting array read only.\n");
+ c->readonly = 1;
+ }
+
rv = set_array_info(mdfd, st, content);
if (rv && !err_ok) {
pr_err("failed to set array info for %s: %s\n",
@@ -1026,7 +1048,8 @@ static int start_array(int mdfd,
if (content->array.level == LEVEL_CONTAINER) {
if (c->verbose >= 0) {
pr_err("Container %s has been assembled with %d drive%s",
- mddev, okcnt+sparecnt, okcnt+sparecnt==1?"":"s");
+ mddev, okcnt+sparecnt+journalcnt,
+ okcnt+sparecnt+journalcnt==1?"":"s");
if (okcnt < (unsigned)content->array.raid_disks)
fprintf(stderr, " (out of %d)",
content->array.raid_disks);
@@ -1112,6 +1135,8 @@ static int start_array(int mdfd,
fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt);
if (sparecnt)
fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s");
+ if (content->journal_clean)
+ fprintf(stderr, " and %d journal", journalcnt);
fprintf(stderr, ".\n");
}
if (content->reshape_active &&
@@ -1283,7 +1308,8 @@ int Assemble(struct supertype *st, char *mddev,
int *best = NULL; /* indexed by raid_disk */
int bestcnt = 0;
int devcnt;
- unsigned int okcnt, sparecnt, rebuilding_cnt, replcnt;
+ unsigned int okcnt, sparecnt, rebuilding_cnt, replcnt, journalcnt;
+ int journal_clean = 0;
int i;
int was_forced = 0;
int most_recent = 0;
@@ -1524,6 +1550,7 @@ try_again:
okcnt = 0;
replcnt = 0;
sparecnt=0;
+ journalcnt=0;
rebuilding_cnt=0;
for (i=0; i< bestcnt; i++) {
int j = best[i];
@@ -1534,8 +1561,13 @@ try_again:
/* note: we ignore error flags in multipath arrays
* as they don't make sense
*/
- if (content->array.level != LEVEL_MULTIPATH)
- if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) {
+ if (content->array.level != LEVEL_MULTIPATH) {
+ if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL)) {
+ if (content->journal_device_required)
+ journalcnt++;
+ else /* unexpected journal, mark as faulty */
+ devices[j].i.disk.state |= (1<<MD_DISK_FAULTY);
+ } else if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) {
if (!(devices[j].i.disk.state
& (1<<MD_DISK_FAULTY))) {
devices[j].uptodate = 1;
@@ -1543,6 +1575,7 @@ try_again:
}
continue;
}
+ }
/* If this device thinks that 'most_recent' has failed, then
* we must reject this device.
*/
@@ -1566,6 +1599,8 @@ try_again:
devices[most_recent].i.events
) {
devices[j].uptodate = 1;
+ if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL))
+ journal_clean = 1;
if (i < content->array.raid_disks * 2) {
if (devices[j].i.recovery_start == MaxSector ||
(content->reshape_active &&
@@ -1577,7 +1612,7 @@ try_again:
replcnt++;
} else
rebuilding_cnt++;
- } else
+ } else if (devices[j].i.disk.raid_disk != MD_DISK_ROLE_JOURNAL)
sparecnt++;
}
}
@@ -1637,11 +1672,15 @@ try_again:
#ifndef MDASSEMBLE
sysfs_init(content, mdfd, NULL);
#endif
+ /* after reload context, store journal_clean in context */
+ content->journal_clean = journal_clean;
for (i=0; i<bestcnt; i++) {
int j = best[i];
unsigned int desired_state;
- if (i >= content->array.raid_disks * 2)
+ if (devices[j].i.disk.raid_disk == MD_DISK_ROLE_JOURNAL)
+ desired_state = (1<<MD_DISK_JOURNAL);
+ else if (i >= content->array.raid_disks * 2)
desired_state = 0;
else if (i & 1)
desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_REPLACEMENT);
@@ -1788,7 +1827,7 @@ try_again:
rv = start_array(mdfd, mddev, content,
st, ident, best, bestcnt,
chosen_drive, devices, okcnt, sparecnt,
- rebuilding_cnt,
+ rebuilding_cnt, journalcnt,
c,
clean, avail, start_partial_ok,
pre_exist != NULL,
diff --git a/Create.c b/Create.c
index ef28da0..1e4a6ee 100644
--- a/Create.c
+++ b/Create.c
@@ -87,7 +87,7 @@ int Create(struct supertype *st, char *mddev,
unsigned long long minsize=0, maxsize=0;
char *mindisc = NULL;
char *maxdisc = NULL;
- int dnum;
+ int dnum, raid_disk_num;
struct mddev_dev *dv;
int fail=0, warn=0;
struct stat stb;
@@ -114,6 +114,8 @@ int Create(struct supertype *st, char *mddev,
unsigned long long newsize;
int major_num = BITMAP_MAJOR_HI;
+ if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0)
+ major_num = BITMAP_MAJOR_CLUSTERED;
memset(&info, 0, sizeof(info));
if (s->level == UnSet && st && st->ss->default_geometry)
@@ -180,11 +182,11 @@ int Create(struct supertype *st, char *mddev,
pr_err("This metadata type does not support spare disks at create time\n");
return 1;
}
- if (subdevs > s->raiddisks+s->sparedisks) {
+ if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) {
pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks);
return 1;
}
- if (!have_container && subdevs < s->raiddisks+s->sparedisks) {
+ if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) {
pr_err("You haven't given enough devices (real or missing) to create this array\n");
return 1;
}
@@ -328,7 +330,7 @@ int Create(struct supertype *st, char *mddev,
}
close(dfd);
info.array.working_disks++;
- if (dnum < s->raiddisks)
+ if (dnum < s->raiddisks && dv->disposition != 'j')
info.array.active_disks++;
if (st == NULL) {
struct createinfo *ci = conf_get_create_info();
@@ -397,6 +399,9 @@ int Create(struct supertype *st, char *mddev,
}
}
+ if (dv->disposition == 'j')
+ goto skip_size_check; /* skip write journal for size check */
+
freesize /= 2; /* convert to K */
if (s->chunk && s->chunk != UnSet) {
/* round to chunk size */
@@ -429,6 +434,7 @@ int Create(struct supertype *st, char *mddev,
mindisc = dname;
minsize = freesize;
}
+ skip_size_check:
if (c->runstop != 1 || c->verbose >= 0) {
int fd = open(dname, O_RDONLY);
if (fd <0 ) {
@@ -531,6 +537,8 @@ int Create(struct supertype *st, char *mddev,
st->ss->name);
warn = 1;
}
+ st->nodes = c->nodes;
+ st->cluster_name = c->homecluster;
if (warn) {
if (c->runstop!= 1) {
@@ -750,7 +758,8 @@ int Create(struct supertype *st, char *mddev,
#endif
}
- if (s->bitmap_file && strcmp(s->bitmap_file, "internal")==0) {
+ if (s->bitmap_file && (strcmp(s->bitmap_file, "internal")==0 ||
+ strcmp(s->bitmap_file, "clustered")==0)) {
if ((vers%100) < 2) {
pr_err("internal bitmaps not supported by this kernel.\n");
goto abort_locked;
@@ -834,7 +843,7 @@ int Create(struct supertype *st, char *mddev,
for (pass=1; pass <=2 ; pass++) {
struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */
- for (dnum=0, dv = devlist ; dv ;
+ for (dnum=0, raid_disk_num=0, dv = devlist ; dv ;
dv=(dv->next)?(dv->next):moved_disk, dnum++) {
int fd;
struct stat stb;
@@ -843,11 +852,14 @@ int Create(struct supertype *st, char *mddev,
if (dnum >= total_slots)
abort();
if (dnum == insert_point) {
+ raid_disk_num += 1;
moved_disk = dv;
continue;
}
- if (strcasecmp(dv->devname, "missing")==0)
+ if (strcasecmp(dv->devname, "missing")==0) {
+ raid_disk_num += 1;
continue;
+ }
if (have_container)
moved_disk = NULL;
if (have_container && dnum < info.array.raid_disks - 1)
@@ -859,8 +871,13 @@ int Create(struct supertype *st, char *mddev,
*inf = info;
inf->disk.number = dnum;
- inf->disk.raid_disk = dnum;
- if (inf->disk.raid_disk < s->raiddisks)
+ inf->disk.raid_disk = raid_disk_num++;
+
+ if (dv->disposition == 'j') {
+ inf->disk.raid_disk = MD_DISK_ROLE_JOURNAL;
+ inf->disk.state = (1<<MD_DISK_JOURNAL);
+ raid_disk_num--;
+ } else if (inf->disk.raid_disk < s->raiddisks)
inf->disk.state = (1<<MD_DISK_ACTIVE) |
(1<<MD_DISK_SYNC);
else
diff --git a/Detail.c b/Detail.c
index dd72ede..0cfccad 100644
--- a/Detail.c
+++ b/Detail.c
@@ -299,7 +299,8 @@ int Detail(char *dev, struct context *c)
for (d = 0; d < max_disks * 2; d++) {
disks[d].state = (1<<MD_DISK_REMOVED);
disks[d].major = disks[d].minor = 0;
- disks[d].number = disks[d].raid_disk = d;
+ disks[d].number = -1;
+ disks[d].raid_disk = d/2;
}
next = array.raid_disks*2;
@@ -325,7 +326,8 @@ int Detail(char *dev, struct context *c)
&& disks[disk.raid_disk*2].state == (1<<MD_DISK_REMOVED))
disks[disk.raid_disk*2] = disk;
else if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks
- && disks[disk.raid_disk*2+1].state == (1<<MD_DISK_REMOVED))
+ && disks[disk.raid_disk*2+1].state == (1<<MD_DISK_REMOVED)
+ && !(disk.state & (1<<MD_DISK_JOURNAL)))
disks[disk.raid_disk*2+1] = disk;
else if (next < max_disks*2)
disks[next++] = disk;
@@ -339,7 +341,8 @@ int Detail(char *dev, struct context *c)
(disks[d*2+1].state & (1<<MD_DISK_SYNC))) {
avail_disks ++;
avail[d] = 1;
- }
+ } else
+ rv |= !! c->test;
}
if (c->brief) {
@@ -422,8 +425,9 @@ int Detail(char *dev, struct context *c)
else
printf(" Used Dev Size : unknown\n");
} else
- printf(" Used Dev Size : %d%s\n", array.size,
- human_size((long long)array.size<<10));
+ printf(" Used Dev Size : %lu%s\n",
+ (unsigned long)array.size,
+ human_size((unsigned long long)array.size<<10));
}
if (array.raid_disks)
printf(" Raid Devices : %d\n", array.raid_disks);
@@ -616,12 +620,15 @@ This is pretty boring
continue;
if (!c->brief) {
if (d == array.raid_disks*2) printf("\n");
- if (disk.number < 0)
+ if (disk.number < 0 && disk.raid_disk < 0)
printf(" - %5d %5d - ",
disk.major, disk.minor);
- else if (disk.raid_disk < 0)
+ else if (disk.raid_disk < 0 || disk.state & (1<<MD_DISK_JOURNAL))
printf(" %5d %5d %5d - ",
disk.number, disk.major, disk.minor);
+ else if (disk.number < 0)
+ printf(" - %5d %5d %5d ",
+ disk.major, disk.minor, disk.raid_disk);
else
printf(" %5d %5d %5d %5d ",
disk.number, disk.major, disk.minor, disk.raid_disk);
@@ -650,9 +657,10 @@ This is pretty boring
}
if (disk.state & (1<<MD_DISK_REMOVED)) printf(" removed");
if (disk.state & (1<<MD_DISK_WRITEMOSTLY)) printf(" writemostly");
+ if (disk.state & (1<<MD_DISK_JOURNAL)) printf(" journal");
if ((disk.state &
((1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC)
- |(1<<MD_DISK_REMOVED)|(1<<MD_DISK_FAULTY)))
+ |(1<<MD_DISK_REMOVED)|(1<<MD_DISK_FAULTY)|(1<<MD_DISK_JOURNAL)))
== 0) {
printf(" spare");
if (is_26) {
@@ -671,9 +679,6 @@ This is pretty boring
}
}
if (disk.state == 0) spares++;
- if (c->test && d < array.raid_disks
- && !(disk.state & (1<<MD_DISK_SYNC)))
- rv |= 1;
dv=map_dev_preferred(disk.major, disk.minor, 0, c->prefer);
if (dv != NULL) {
if (c->brief)
diff --git a/Grow.c b/Grow.c
index a336593..bbdd46c 100644..100755
--- a/Grow.c
+++ b/Grow.c
@@ -297,6 +297,9 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
" between different architectures. Consider upgrading the Linux kernel.\n");
}
+ if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0)
+ major = BITMAP_MAJOR_CLUSTERED;
+
if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) {
if (errno == ENOMEM)
pr_err("Memory allocation failure.\n");
@@ -325,13 +328,15 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
if (strcmp(s->bitmap_file, "none")==0) {
array.state &= ~(1<<MD_SB_BITMAP_PRESENT);
if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
- pr_err("failed to remove internal bitmap.\n");
+ if (array.state & (1<<MD_SB_CLUSTERED))
+ pr_err("failed to remove clustered bitmap.\n");
+ else
+ pr_err("failed to remove internal bitmap.\n");
return 1;
}
return 0;
}
- pr_err("Internal bitmap already present on %s\n",
- devname);
+ pr_err("bitmap already present on %s\n", devname);
return 1;
}
@@ -375,7 +380,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
free(st);
return 1;
}
- if (strcmp(s->bitmap_file, "internal") == 0) {
+ if (strcmp(s->bitmap_file, "internal") == 0 ||
+ strcmp(s->bitmap_file, "clustered") == 0) {
int rv;
int d;
int offset_setable = 0;
@@ -384,6 +390,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name);
return 1;
}
+ st->nodes = c->nodes;
+ st->cluster_name = c->homecluster;
mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION);
if (mdi)
offset_setable = 1;
@@ -410,7 +418,7 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
bitmapsize, offset_setable,
major)
)
- st->ss->write_bitmap(st, fd2);
+ st->ss->write_bitmap(st, fd2, NoUpdate);
else {
pr_err("failed to create internal bitmap - chunksize problem.\n");
close(fd2);
@@ -426,6 +434,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location",
mdi->bitmap_offset);
} else {
+ if (strcmp(s->bitmap_file, "clustered") == 0)
+ array.state |= (1<<MD_SB_CLUSTERED);
array.state |= (1<<MD_SB_BITMAP_PRESENT);
rv = ioctl(fd, SET_ARRAY_INFO, &array);
}
@@ -1580,6 +1590,15 @@ int Grow_reshape(char *devname, int fd,
pr_err("Cannot increase raid-disks on this array beyond %d\n", st->max_devs);
return 1;
}
+ if (s->level == 0 &&
+ (array.state & (1<<MD_SB_BITMAP_PRESENT)) &&
+ !(array.state & (1<<MD_SB_CLUSTERED))) {
+ array.state &= ~(1<<MD_SB_BITMAP_PRESENT);
+ if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
+ pr_err("failed to remove internal bitmap.\n");
+ return 1;
+ }
+ }
/* in the external case we need to check that the requested reshape is
* supported, and perform an initial check that the container holds the
@@ -4496,8 +4515,8 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
* sometimes they aren't... So allow considerable flexability in matching, and allow
* this test to be overridden by an environment variable.
*/
- if (info->array.utime > (int)__le64_to_cpu(bsb.mtime) + 2*60*60 ||
- info->array.utime < (int)__le64_to_cpu(bsb.mtime) - 10*60) {
+ if(time_after(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) + 2*60*60) ||
+ time_before(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) - 10*60)) {
if (check_env("MDADM_GROW_ALLOW_OLD")) {
pr_err("accepting backup with timestamp %lu for array with timestamp %lu\n",
(unsigned long)__le64_to_cpu(bsb.mtime),
@@ -4866,6 +4885,9 @@ int Grow_continue_command(char *devname, int fd,
sysfs_init(content, fd2, mdstat->devnm);
+ close(fd2);
+ fd2 = -1;
+
/* start mdmon in case it is not running
*/
if (!mdmon_running(container))
diff --git a/Incremental.c b/Incremental.c
index 41876b9..24fd827 100644
--- a/Incremental.c
+++ b/Incremental.c
@@ -104,6 +104,7 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
struct map_ent target_array;
int have_target;
char *devname = devlist->devname;
+ int journal_device_missing = 0;
struct createinfo *ci = conf_get_create_info();
@@ -312,6 +313,12 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
if (mdfd < 0) {
+ /* Skip the clustered ones. These should be started by
+ * clustering resource agents
+ */
+ if (info.array.state & (1 << MD_SB_CLUSTERED))
+ goto out;
+
/* Couldn't find an existing array, maybe make a new one */
mdfd = create_mddev(match ? match->devname : NULL,
name_to_use, c->autof, trustworthy, chosen_name);
@@ -437,6 +444,10 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
/* add disk needs to know about containers */
if (st->ss->external)
sra->array.level = LEVEL_CONTAINER;
+
+ if (info.array.state & (1 << MD_SB_CLUSTERED))
+ info.disk.state |= (1 << MD_DISK_CLUSTER_ADD);
+
err = add_disk(mdfd, st, sra, &info);
if (err < 0 && errno == EBUSY) {
/* could be another device present with the same
@@ -514,6 +525,9 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE |
GET_OFFSET | GET_SIZE));
active_disks = count_active(st, sra, mdfd, &avail, &info);
+
+ journal_device_missing = (info.journal_device_required) && (info.journal_clean == 0);
+
if (enough(info.array.level, info.array.raid_disks,
info.array.layout, info.array.state & 1,
avail) == 0) {
@@ -543,10 +557,12 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
}
map_unlock(&map);
- if (c->runstop > 0 || active_disks >= info.array.working_disks) {
+ if (c->runstop > 0 || (!journal_device_missing && active_disks >= info.array.working_disks)) {
struct mdinfo *dsk;
/* Let's try to start it */
+ if (journal_device_missing)
+ pr_err("Trying to run with missing journal device\n");
if (info.reshape_active && !(info.reshape_active & RESHAPE_NO_BACKUP)) {
pr_err("%s: This array is being reshaped and cannot be started\n",
chosen_name);
@@ -613,6 +629,8 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
} else {
if (c->export) {
printf("MD_STARTED=unsafe\n");
+ } else if (journal_device_missing) {
+ pr_err("Journal device is missing, not safe to start yet.\n");
} else if (c->verbose >= 0)
pr_err("%s attached to %s, not enough to start safely.\n",
devname, chosen_name);
@@ -649,7 +667,7 @@ static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
* without thinking more */
for (d = sra->devs; d ; d = d->next) {
- char dn[10];
+ char dn[24]; // 2*11 bytes for ints (including sign) + colon + null byte
int dfd;
struct mdinfo info;
sprintf(dn, "%d:%d", d->disk.major, d->disk.minor);
@@ -713,8 +731,11 @@ static int count_active(struct supertype *st, struct mdinfo *sra,
close(dfd);
if (ok != 0)
continue;
+
info.array.raid_disks = raid_disks;
st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum);
+ if (info.disk.raid_disk == MD_DISK_ROLE_JOURNAL)
+ bestinfo->journal_clean = 1;
if (!avail) {
raid_disks = info.array.raid_disks;
avail = xcalloc(raid_disks, 1);
@@ -764,6 +785,7 @@ static int count_active(struct supertype *st, struct mdinfo *sra,
replcnt++;
st->ss->free_super(st);
}
+
if (!avail)
return 0;
/* We need to reject any device that thinks the best device is
@@ -1012,12 +1034,12 @@ static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
int mdfd = open_dev(chosen->sys_name);
if (mdfd >= 0) {
struct mddev_dev devlist;
- char devname[20];
+ char chosen_devname[24]; // 2*11 for int (including signs) + colon + null
devlist.next = NULL;
devlist.used = 0;
devlist.writemostly = 0;
- devlist.devname = devname;
- sprintf(devname, "%d:%d", major(stb.st_rdev),
+ devlist.devname = chosen_devname;
+ sprintf(chosen_devname, "%d:%d", major(stb.st_rdev),
minor(stb.st_rdev));
devlist.disposition = 'a';
close(dfd);
diff --git a/Makefile b/Makefile
index a02a97f..fd79cfb 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIB
CC = $(CROSS_COMPILE)gcc
CXFLAGS ?= -ggdb
-CWFLAGS = -Wall -Wstrict-prototypes -Wextra -Wno-unused-parameter
+CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter
ifdef WARN_UNUSED
CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3
endif
@@ -62,8 +62,8 @@ CPPFLAGS += -DBINDIR=\"$(BINDIR)\"
PKG_CONFIG ?= pkg-config
SYSCONFDIR = /etc
-CONFFILE = $(SYSCONFDIR)/mdadm/mdadm.conf
-CONFFILE2 = $(SYSCONFDIR)/mdadm.conf
+CONFFILE = $(SYSCONFDIR)/mdadm.conf
+CONFFILE2 = $(SYSCONFDIR)/mdadm/mdadm.conf
MAILCMD =/usr/sbin/sendmail -t
CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\"
# Both MAP_DIR and MDMON_DIR should be somewhere that persists across the
@@ -79,10 +79,14 @@ MDMON_DIR = $(RUN_DIR)
# place for autoreplace cookies
FAILED_SLOTS_DIR = $(RUN_DIR)/failed-slots
SYSTEMD_DIR=/lib/systemd/system
+
+COROSYNC:=$(shell [ -d /usr/include/corosync ] || echo -DNO_COROSYNC)
+DLM:=$(shell [ -f /usr/include/libdlm.h ] || echo -DNO_DLM)
+
DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\"
DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\"
DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\"
-CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS)
+CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) $(COROSYNC) $(DLM)
VERSION = $(shell [ -d .git ] && git describe HEAD | sed 's/mdadm-//')
VERS_DATE = $(shell [ -d .git ] && date --date="`git log -n1 --format=format:%cd --date=short`" '+%0dth %B %Y' | sed -e 's/1th/1st/' -e 's/2th/2nd/' -e 's/11st/11th/' -e 's/12nd/12th/')
@@ -101,6 +105,7 @@ endif
# If you want a static binary, you might uncomment these
# LDFLAGS = -static
# STRIP = -s
+LDLIBS=-ldl
INSTALL = /usr/bin/install
DESTDIR =
@@ -115,6 +120,12 @@ ifndef UDEVDIR
UDEVDIR = /lib/udev
endif
+ifeq (,$(findstring s,$(MAKEFLAGS)))
+ ECHO=echo
+else
+ ECHO=:
+endif
+
OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o util.o maps.o lib.o \
Manage.o Assemble.o Build.o \
Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
@@ -122,7 +133,7 @@ OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o util.o maps.o lib.o \
mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \
super-mbr.o super-gpt.o \
restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o \
- platform-intel.o probe_roms.o
+ platform-intel.o probe_roms.o crc32c.o
CHECK_OBJS = restripe.o sysfs.o maps.o lib.o xmalloc.o dlink.o
@@ -176,7 +187,7 @@ mdadm : $(OBJS) | check_rundir
$(CC) $(CFLAGS) $(LDFLAGS) -o mdadm $(OBJS) $(LDLIBS)
mdadm.static : $(OBJS) $(STATICOBJS)
- $(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS)
+ $(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS) $(LDLIBS)
mdadm.tcc : $(SRCS) $(INCL)
$(TCC) -o mdadm.tcc $(SRCS)
@@ -186,13 +197,13 @@ mdadm.klibc : $(SRCS) $(INCL)
$(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS)
mdadm.Os : $(SRCS) $(INCL)
- $(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS)
+ $(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS) $(LDLIBS)
mdadm.O2 : $(SRCS) $(INCL) mdmon.O2
- $(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS)
+ $(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS) $(LDLIBS)
mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h
- $(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS)
+ $(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) $(LDLIBS)
# use '-z now' to guarantee no dynamic linker interactions with the monitor thread
mdmon : $(MON_OBJS) | check_rundir
@@ -200,7 +211,7 @@ mdmon : $(MON_OBJS) | check_rundir
msg.o: msg.c msg.h
test_stripe : restripe.c xmalloc.o mdadm.h
- $(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o -DMAIN restripe.c
+ $(CC) $(CFLAGS) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o -DMAIN restripe.c
raid6check : raid6check.o mdadm.h $(CHECK_OBJS)
$(CC) $(CXFLAGS) $(LDFLAGS) -o raid6check raid6check.o $(CHECK_OBJS)
@@ -283,7 +294,7 @@ install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8
install-udev: udev-md-raid-arrays.rules udev-md-raid-assembly.rules
@for file in 63-md-raid-arrays.rules 64-md-raid-assembly.rules ; \
do sed -e 's,BINDIR,$(BINDIR),g' udev-$${file#??-} > .install.tmp.1 && \
- echo $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
+ $(ECHO) $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
$(INSTALL) -D -m 644 .install.tmp.1 $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
rm -f .install.tmp.1; \
done
@@ -292,13 +303,13 @@ install-systemd: systemd/mdmon@.service
@for file in mdmon@.service mdmonitor.service mdadm-last-resort@.timer \
mdadm-last-resort@.service mdadm-grow-continue@.service; \
do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.2 && \
- echo $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
+ $(ECHO) $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
$(INSTALL) -D -m 644 .install.tmp.2 $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
rm -f .install.tmp.2; \
done
@for file in mdadm.shutdown ; \
do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.3 && \
- echo $(INSTALL) -D -m 755 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
+ $(ECHO) $(INSTALL) -D -m 755 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
$(INSTALL) -D -m 755 .install.tmp.3 $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
rm -f .install.tmp.3; \
done
diff --git a/Manage.c b/Manage.c
index 47faeed..7e1b94b 100644
--- a/Manage.c
+++ b/Manage.c
@@ -669,6 +669,15 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
disc.number = mdi.disk.number;
disc.raid_disk = mdi.disk.raid_disk;
disc.state = mdi.disk.state;
+ if (array->state & (1 << MD_SB_CLUSTERED)) {
+ /* extra flags are needed when adding to a cluster as
+ * there are two cases to distinguish
+ */
+ if (dv->disposition == 'c')
+ disc.state |= (1 << MD_DISK_CANDIDATE);
+ else
+ disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+ }
if (dv->writemostly == 1)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
if (dv->writemostly == 2)
@@ -724,7 +733,8 @@ skip_re_add:
int Manage_add(int fd, int tfd, struct mddev_dev *dv,
struct supertype *tst, mdu_array_info_t *array,
int force, int verbose, char *devname,
- char *update, unsigned long rdev, unsigned long long array_size)
+ char *update, unsigned long rdev, unsigned long long array_size,
+ int raid_slot)
{
unsigned long long ldsize;
struct supertype *dev_st = NULL;
@@ -815,7 +825,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
}
/* Make sure device is large enough */
- if (tst->sb &&
+ if (dv->disposition != 'j' && /* skip size check for Journal */
+ tst->sb &&
tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
array_size) {
if (dv->disposition == 'M')
@@ -914,10 +925,36 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
}
disc.major = major(rdev);
disc.minor = minor(rdev);
- disc.number =j;
+ if (raid_slot < 0)
+ disc.number = j;
+ else
+ disc.number = raid_slot;
disc.state = 0;
+
+ /* only add journal to array that supports journaling */
+ if (dv->disposition == 'j') {
+ struct mdinfo mdi;
+ struct mdinfo *mdp;
+
+ mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
+
+ if (strncmp(mdp->sysfs_array_state, "readonly", 8) != 0) {
+ pr_err("%s is not readonly, cannot add journal.\n", devname);
+ return -1;
+ }
+
+ tst->ss->getinfo_super(tst, &mdi, NULL);
+ if (mdi.journal_device_required == 0) {
+ pr_err("%s does not support journal device.\n", devname);
+ return -1;
+ }
+ disc.raid_disk = 0;
+ }
+
if (array->not_persistent==0) {
int dfd;
+ if (dv->disposition == 'j')
+ disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
if (dv->writemostly == 1)
disc.state |= 1 << MD_DISK_WRITEMOSTLY;
dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
@@ -955,6 +992,14 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
}
free(used);
}
+
+ if (array->state & (1 << MD_SB_CLUSTERED)) {
+ if (dv->disposition == 'c')
+ disc.state |= (1 << MD_DISK_CANDIDATE);
+ else
+ disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+ }
+
if (dv->writemostly == 1)
disc.state |= (1 << MD_DISK_WRITEMOSTLY);
if (tst->ss->external) {
@@ -1020,10 +1065,20 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
} else {
tst->ss->free_super(tst);
if (ioctl(fd, ADD_NEW_DISK, &disc)) {
- pr_err("add new device failed for %s as %d: %s\n",
- dv->devname, j, strerror(errno));
+ if (dv->disposition == 'j')
+ pr_err("Failed to hot add %s as journal, "
+ "please try restart %s.\n", dv->devname, devname);
+ else
+ pr_err("add new device failed for %s as %d: %s\n",
+ dv->devname, j, strerror(errno));
return -1;
}
+ if (dv->disposition == 'j') {
+ pr_err("Journal added successfully, making %s read-write\n", devname);
+ if (Manage_ro(devname, fd, -1))
+ pr_err("Failed to make %s read-write\n", devname);
+ }
+
}
if (verbose >= 0)
pr_err("added %s\n", dv->devname);
@@ -1256,6 +1311,7 @@ int Manage_subdevs(char *devname, int fd,
* try HOT_ADD_DISK
* If that fails EINVAL, try ADD_NEW_DISK
* 'S' - add the device as a spare - don't try re-add
+ * 'j' - add the device as a journal device
* 'A' - re-add the device
* 'r' - remove the device: HOT_REMOVE_DISK
* device can be 'faulty' or 'detached' in which case all
@@ -1274,6 +1330,7 @@ int Manage_subdevs(char *devname, int fd,
* variant on 'A'
* 'F' - Another variant of 'A', where the device was faulty
* so must be removed from the array first.
+ * 'c' - confirm the device as found (for clustered environments)
*
* For 'f' and 'r', the device can also be a kernel-internal
* name such as 'sdb'.
@@ -1287,8 +1344,10 @@ int Manage_subdevs(char *devname, int fd,
int sysfd = -1;
int count = 0; /* number of actions taken */
struct mdinfo info;
+ struct mdinfo devinfo;
int frozen = 0;
int busy = 0;
+ int raid_slot = -1;
if (ioctl(fd, GET_ARRAY_INFO, &array)) {
pr_err("Cannot get array info for %s\n",
@@ -1317,6 +1376,17 @@ int Manage_subdevs(char *devname, int fd,
int rv;
int mj,mn;
+ raid_slot = -1;
+ if (dv->disposition == 'c') {
+ rv = parse_cluster_confirm_arg(dv->devname,
+ &dv->devname,
+ &raid_slot);
+ if (rv) {
+ pr_err("Could not get the devname of cluster\n");
+ goto abort;
+ }
+ }
+
if (strcmp(dv->devname, "failed") == 0 ||
strcmp(dv->devname, "faulty") == 0) {
if (dv->disposition != 'A'
@@ -1342,6 +1412,11 @@ int Manage_subdevs(char *devname, int fd,
if (strcmp(dv->devname, "missing") == 0) {
struct mddev_dev *add_devlist = NULL;
struct mddev_dev **dp;
+ if (dv->disposition == 'c') {
+ rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
+ break;
+ }
+
if (dv->disposition != 'A') {
pr_err("'missing' only meaningful with --re-add\n");
goto abort;
@@ -1469,14 +1544,28 @@ int Manage_subdevs(char *devname, int fd,
goto abort;
case 'a':
case 'S': /* --add-spare */
+ case 'j': /* --add-journal */
case 'A':
case 'M': /* --re-add missing */
case 'F': /* --re-add faulty */
+ case 'c': /* --cluster-confirm */
/* add the device */
if (subarray) {
pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
goto abort;
}
+
+ /* Let's first try to write re-add to sysfs */
+ if (rdev != 0 &&
+ (dv->disposition == 'A' || dv->disposition == 'F')) {
+ sysfs_init_dev(&devinfo, rdev);
+ if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) {
+ pr_err("re-add %s to %s succeed\n",
+ dv->devname, info.sys_name);
+ break;
+ }
+ }
+
if (dv->disposition == 'F')
/* Need to remove first */
ioctl(fd, HOT_REMOVE_DISK, rdev);
@@ -1505,7 +1594,7 @@ int Manage_subdevs(char *devname, int fd,
}
rv = Manage_add(fd, tfd, dv, tst, &array,
force, verbose, devname, update,
- rdev, array_size);
+ rdev, array_size, raid_slot);
close(tfd);
tfd = -1;
if (rv < 0)
diff --git a/ReadMe.c b/ReadMe.c
index a05c74e..d3fcb61 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -1,7 +1,7 @@
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2015 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com>
*
*
* This program is free software; you can redistribute it and/or modify
@@ -25,10 +25,10 @@
#include "mdadm.h"
#ifndef VERSION
-#define VERSION "3.3.4"
+#define VERSION "3.4"
#endif
#ifndef VERS_DATE
-#define VERS_DATE "3rd August 2015"
+#define VERS_DATE "28th January 2016"
#endif
char Version[] = "mdadm - v" VERSION " - " VERS_DATE "\n";
@@ -140,6 +140,9 @@ struct option long_options[] = {
{"homehost", 1, 0, HomeHost},
{"symlinks", 1, 0, Symlinks},
{"data-offset",1, 0, DataOffset},
+ {"nodes",1, 0, Nodes}, /* also for --assemble */
+ {"home-cluster",1, 0, ClusterName},
+ {"write-journal",1, 0, WriteJournal},
/* For assemble */
{"uuid", 1, 0, 'u'},
@@ -154,6 +157,7 @@ struct option long_options[] = {
/* Management */
{"add", 0, 0, Add},
{"add-spare", 0, 0, AddSpare},
+ {"add-journal", 0, 0, AddJournal},
{"remove", 0, 0, Remove},
{"fail", 0, 0, Fail},
{"set-faulty",0, 0, Fail},
@@ -167,6 +171,7 @@ struct option long_options[] = {
{"wait", 0, 0, WaitOpt},
{"wait-clean", 0, 0, Waitclean },
{"action", 1, 0, Action },
+ {"cluster-confirm", 0, 0, ClusterConfirm},
/* For Detail/Examine */
{"brief", 0, 0, Brief},
@@ -372,6 +377,7 @@ char Help_create[] =
" --name= -N : Textual name for array - max 32 characters\n"
" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n"
" --delay= -d : bitmap update delay in seconds.\n"
+" --write-journal= : Specify journal device for RAID-4/5/6 array\n"
"\n"
;
@@ -593,7 +599,7 @@ char Help_incr[] =
;
char Help_config[] =
-"The /etc/mdadm/mdadm.conf config file:\n\n"
+"The /etc/mdadm.conf config file:\n\n"
" The config file contains, apart from blank lines and comment lines that\n"
" start with a hash(#), array lines, device lines, and various\n"
" configuration lines.\n"
diff --git a/bitmap.c b/bitmap.c
index bbe9bae..dab674b 100644
--- a/bitmap.c
+++ b/bitmap.c
@@ -32,6 +32,8 @@ static inline void sb_le_to_cpu(bitmap_super_t *sb)
sb->daemon_sleep = __le32_to_cpu(sb->daemon_sleep);
sb->sync_size = __le64_to_cpu(sb->sync_size);
sb->write_behind = __le32_to_cpu(sb->write_behind);
+ sb->nodes = __le32_to_cpu(sb->nodes);
+ sb->sectors_reserved = __le32_to_cpu(sb->sectors_reserved);
}
static inline void sb_cpu_to_le(bitmap_super_t *sb)
@@ -219,8 +221,12 @@ int bitmap_file_open(char *filename, struct supertype **stp)
pr_err("No bitmap possible with %s metadata\n",
st->ss->name);
return -1;
- } else
- st->ss->locate_bitmap(st, fd);
+ } else {
+ if (st->ss->locate_bitmap(st, fd)) {
+ pr_err("%s doesn't have bitmap\n", filename);
+ fd = -1;
+ }
+ }
*stp = st;
} else {
@@ -258,7 +264,7 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st)
int rv = 1;
char buf[64];
int swap;
- int fd;
+ int fd, i;
__u32 uuid32[4];
fd = bitmap_file_open(filename, &st);
@@ -285,7 +291,7 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st)
}
printf(" Version : %d\n", sb->version);
if (sb->version < BITMAP_MAJOR_LO ||
- sb->version > BITMAP_MAJOR_HI) {
+ sb->version > BITMAP_MAJOR_CLUSTERED) {
pr_err("unknown bitmap version %d, either the bitmap file\n",
sb->version);
pr_err("is corrupted or you need to upgrade your tools\n");
@@ -315,9 +321,13 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st)
uuid32[2],
uuid32[3]);
- printf(" Events : %llu\n", (unsigned long long)sb->events);
- printf(" Events Cleared : %llu\n", (unsigned long long)sb->events_cleared);
- printf(" State : %s\n", bitmap_state(sb->state));
+ if (sb->nodes == 0) {
+ printf(" Events : %llu\n", (unsigned long long)sb->events);
+ printf(" Events Cleared : %llu\n", (unsigned long long)sb->events_cleared);
+ printf(" State : %s\n", bitmap_state(sb->state));
+
+ }
+
printf(" Chunksize : %s\n", human_chunksize(sb->chunksize));
printf(" Daemon : %ds flush period\n", sb->daemon_sleep);
if (sb->write_behind)
@@ -327,11 +337,40 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st)
printf(" Write Mode : %s\n", buf);
printf(" Sync Size : %llu%s\n", (unsigned long long)sb->sync_size/2,
human_size(sb->sync_size * 512));
- if (brief)
- goto free_info;
- printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n",
- info->total_bits, info->dirty_bits,
- 100.0 * info->dirty_bits / (info->total_bits?:1));
+
+ if (sb->nodes == 0) {
+ if (brief)
+ goto free_info;
+ printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n",
+ info->total_bits, info->dirty_bits,
+ 100.0 * info->dirty_bits / (info->total_bits?:1));
+ } else {
+ printf(" Cluster nodes : %d\n", sb->nodes);
+ printf(" Cluster name : %-64s\n", sb->cluster_name);
+ for (i = 0; i < (int)sb->nodes; i++) {
+ if (i) {
+ free(info);
+ info = bitmap_fd_read(fd, brief);
+ sb = &info->sb;
+ }
+ if (sb->magic != BITMAP_MAGIC)
+ pr_err("invalid bitmap magic 0x%x, the bitmap file appears to be corrupted\n", sb->magic);
+
+ printf(" Node Slot : %d\n", i);
+ printf(" Events : %llu\n",
+ (unsigned long long)sb->events);
+ printf(" Events Cleared : %llu\n",
+ (unsigned long long)sb->events_cleared);
+ printf(" State : %s\n", bitmap_state(sb->state));
+ if (brief)
+ continue;
+ printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n",
+ info->total_bits, info->dirty_bits,
+ 100.0 * info->dirty_bits / (info->total_bits?:1));
+
+ }
+ }
+
free_info:
free(info);
return rv;
diff --git a/bitmap.h b/bitmap.h
index c8725a3..b8fb071 100644
--- a/bitmap.h
+++ b/bitmap.h
@@ -12,6 +12,7 @@
*/
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_HOSTENDIAN 3
+#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MINOR 39
@@ -154,8 +155,11 @@ typedef struct bitmap_super_s {
__u32 chunksize; /* 52 the bitmap chunk size in bytes */
__u32 daemon_sleep; /* 56 seconds between disk flushes */
__u32 write_behind; /* 60 number of outstanding write-behind writes */
-
- __u8 pad[256 - 64]; /* set to zero */
+ __u32 sectors_reserved; /* 64 number of 512-byte sectors that are
+ * reserved for the bitmap. */
+ __u32 nodes; /* 68 the maximum number of nodes in cluster. */
+ __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
+ __u8 pad[256 - 136]; /* set to zero */
} bitmap_super_t;
/* notes:
diff --git a/config.c b/config.c
index a882ed3..b308b6c 100644
--- a/config.c
+++ b/config.c
@@ -63,6 +63,9 @@
* but may not wrap over lines
*
*/
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#endif
#ifndef CONFFILE
#define CONFFILE "/etc/mdadm.conf"
@@ -77,7 +80,7 @@ char DefaultAltConfFile[] = CONFFILE2;
char DefaultAltConfDir[] = CONFFILE2 ".d";
enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev,
- Homehost, AutoMode, Policy, PartPolicy, LTEnd };
+ Homehost, HomeCluster, AutoMode, Policy, PartPolicy, LTEnd };
char *keywords[] = {
[Devices] = "devices",
[Array] = "array",
@@ -86,6 +89,7 @@ char *keywords[] = {
[Program] = "program",
[CreateDev]= "create",
[Homehost] = "homehost",
+ [HomeCluster] = "homecluster",
[AutoMode] = "auto",
[Policy] = "policy",
[PartPolicy]="part-policy",
@@ -562,6 +566,21 @@ void homehostline(char *line)
}
}
+static char *home_cluster = NULL;
+void homeclusterline(char *line)
+{
+ char *w;
+
+ for (w=dl_next(line); w != line ; w=dl_next(w)) {
+ if (home_cluster == NULL) {
+ if (strcasecmp(w, "<none>")==0)
+ home_cluster = xstrdup("");
+ else
+ home_cluster = xstrdup(w);
+ }
+ }
+}
+
char auto_yes[] = "yes";
char auto_no[] = "no";
char auto_homehost[] = "homehost";
@@ -724,6 +743,9 @@ void conf_file(FILE *f)
case Homehost:
homehostline(line);
break;
+ case HomeCluster:
+ homeclusterline(line);
+ break;
case AutoMode:
autoline(line);
break;
@@ -884,6 +906,12 @@ char *conf_get_homehost(int *require_homehostp)
return home_host;
}
+char *conf_get_homecluster(void)
+{
+ load_conffile();
+ return home_cluster;
+}
+
struct createinfo *conf_get_create_info(void)
{
load_conffile();
diff --git a/crc32c.c b/crc32c.c
new file mode 100644
index 0000000..156cba1
--- /dev/null
+++ b/crc32c.c
@@ -0,0 +1,104 @@
+/*
+ * Oct 28, 2015 Song Liu simplified the code and ported it to mdadm
+ *
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
+ * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks!
+ * Code was from the public domain, copyright abandoned. Code was
+ * subsequently included in the kernel, thus was re-licensed under the
+ * GNU GPL v2.
+ *
+ * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Same crc32 function was used in 5 other places in the kernel.
+ * I made one version, and deleted the others.
+ * There are various incantations of crc32(). Some use a seed of 0 or ~0.
+ * Some xor at the end with ~0. The generic crc32() function takes
+ * seed as an argument, and doesn't xor at the end. Then individual
+ * users can do whatever they need.
+ * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
+ * fs/jffs2 uses seed 0, doesn't xor with ~0.
+ * fs/partitions/efi.c uses seed ~0, xor's with ~0.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <sys/types.h>
+#include <asm/types.h>
+#include <stdlib.h>
+
+/*
+ * There are multiple 16-bit CRC polynomials in common use, but this is
+ * *the* standard CRC-32 polynomial, first popularized by Ethernet.
+ * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0
+ */
+#define CRCPOLY_LE 0xedb88320
+#define CRCPOLY_BE 0x04c11db7
+
+/*
+ * This is the CRC32c polynomial, as outlined by Castagnoli.
+ * x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+x^19+x^18+x^14+x^13+x^11+x^10+x^9+
+ * x^8+x^6+x^0
+ */
+#define CRC32C_POLY_LE 0x82F63B78
+
+/**
+ * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II
+ * CRC32/CRC32C
+ * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other
+ * uses, or the previous crc32/crc32c value if computing incrementally.
+ * @p: pointer to buffer over which CRC32/CRC32C is run
+ * @len: length of buffer @p
+ * @polynomial: CRC32/CRC32c LE polynomial
+ */
+static inline __u32 crc32_le_generic(__u32 crc, unsigned char const *p,
+ size_t len, __u32 polynomial)
+{
+ int i;
+ while (len--) {
+ crc ^= *p++;
+ for (i = 0; i < 8; i++)
+ crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0);
+ }
+ return crc;
+}
+
+__u32 crc32_le(__u32 crc, unsigned char const *p, size_t len)
+{
+ return crc32_le_generic(crc, p, len, CRCPOLY_LE);
+}
+
+__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len)
+{
+ return crc32_le_generic(crc, p, len, CRC32C_POLY_LE);
+}
+
+/**
+ * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
+ * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for
+ * other uses, or the previous crc32 value if computing incrementally.
+ * @p: pointer to buffer over which CRC32 is run
+ * @len: length of buffer @p
+ * @polynomial: CRC32 BE polynomial
+ */
+static inline __u32 crc32_be_generic(__u32 crc, unsigned char const *p,
+ size_t len, __u32 polynomial)
+{
+ int i;
+ while (len--) {
+ crc ^= *p++ << 24;
+ for (i = 0; i < 8; i++)
+ crc =
+ (crc << 1) ^ ((crc & 0x80000000) ? polynomial :
+ 0);
+ }
+ return crc;
+}
+
+__u32 crc32_be(__u32 crc, unsigned char const *p, size_t len)
+{
+ return crc32_be_generic(crc, p, len, CRCPOLY_BE);
+}
diff --git a/debian/changelog b/debian/changelog
index cd2f9c2..3676228 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+mdadm (3.4-1) unstable; urgency=medium
+
+ * New upstream release.
+
+ -- Dimitri John Ledkov <xnox@ubuntu.com> Fri, 19 Feb 2016 16:18:36 +0000
+
mdadm (3.3.4-1.1) unstable; urgency=medium
* Non-maintainer upload.
diff --git a/inventory b/inventory
index a9fc3c0..ace5df0 100755
--- a/inventory
+++ b/inventory
@@ -22,6 +22,7 @@ ANNOUNCE-3.3.1
ANNOUNCE-3.3.2
ANNOUNCE-3.3.3
ANNOUNCE-3.3.4
+ANNOUNCE-3.4
Assemble.c
Build.c
COPYING
@@ -46,6 +47,7 @@ bitmap.h
config.c
crc32.c
crc32.h
+crc32c.c
dlink.c
dlink.h
external-reshape-design.txt
@@ -239,6 +241,7 @@ tests/19raid6auto-repair
tests/19raid6check
tests/19raid6repair
tests/19repair-does-not-destroy
+tests/20raid5journal
tests/ToTest
tests/check
tests/env-ddf-template
diff --git a/mapfile.c b/mapfile.c
index 41599df..243ded1 100644
--- a/mapfile.c
+++ b/mapfile.c
@@ -176,7 +176,7 @@ void map_read(struct map_ent **melp)
{
FILE *f;
char buf[8192];
- char path[200];
+ char path[201];
int uuid[4];
char devnm[32];
char metadata[30];
diff --git a/md.4 b/md.4
index e955c3b..f1b88ee 100644
--- a/md.4
+++ b/md.4
@@ -874,6 +874,26 @@ The list is particularly useful when recovering to a spare. If a few blocks
cannot be read from the other devices, the bulk of the recovery can
complete and those few bad blocks will be recorded in the bad block list.
+.SS RAID456 WRITE JOURNAL
+
+Due to the non-atomic nature of RAID write operations, interruption of
+write operations (system crash, etc.) to a RAID456 array can lead to
+inconsistent parity and data loss (the so-called RAID-5 write hole).
+
+To plug the write hole, from Linux 4.4 (to be confirmed),
+.I md
+supports a write-ahead journal for RAID456. When the array is created,
+an additional journal device can be added to the array through the
+.IR write-journal
+option. The RAID write journal works similarly to file system journals.
+Before writing to the data disks, md persists data AND parity of the
+stripe to the journal device. After crashes, md searches the journal
+device for incomplete write operations, and replays them to the data
+disks.
+
+When the journal device fails, the RAID array is forced to run in
+read-only mode.
+
.SS WRITE-BEHIND
From Linux 2.6.14,
diff --git a/md_p.h b/md_p.h
index c4846ba..0d691fb 100644
--- a/md_p.h
+++ b/md_p.h
@@ -78,6 +78,12 @@
#define MD_DISK_ACTIVE 1 /* disk is running but may not be in sync */
#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
#define MD_DISK_REMOVED 3 /* disk is in sync with the raid set */
+#define MD_DISK_CLUSTER_ADD 4 /* Initiate a disk add across the cluster
+ * For clustered environments only.
+ */
+#define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed
+ * For clustered environments only.
+ */
#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" is RAID1 config.
* read requests will only be sent here in
@@ -85,6 +91,12 @@
*/
#define MD_DISK_REPLACEMENT 17
+#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */
+
+#define MD_DISK_ROLE_SPARE 0xffff
+#define MD_DISK_ROLE_FAULTY 0xfffe
+#define MD_DISK_ROLE_JOURNAL 0xfffd
+#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */
typedef struct mdp_device_descriptor_s {
__u32 number; /* 0 Device number in the entire set */
@@ -106,6 +118,7 @@ typedef struct mdp_device_descriptor_s {
#define MD_SB_BLOCK_CONTAINER_RESHAPE 3 /* block container wide reshapes */
#define MD_SB_BLOCK_VOLUME 4 /* block activation of array, other arrays
* in container can be activated */
+#define MD_SB_CLUSTERED 5 /* MD is clustered */
#define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */
typedef struct mdp_superblock_s {
@@ -195,4 +208,62 @@ static inline __u64 md_event(mdp_super_t *sb) {
return (ev<<32)| sb->events_lo;
}
+struct r5l_payload_header {
+ __u16 type;
+ __u16 flags;
+} __attribute__ ((__packed__));
+
+enum r5l_payload_type {
+ R5LOG_PAYLOAD_DATA = 0,
+ R5LOG_PAYLOAD_PARITY = 1,
+ R5LOG_PAYLOAD_FLUSH = 2,
+};
+
+struct r5l_payload_data_parity {
+ struct r5l_payload_header header;
+ __u32 size; /* sector. data/parity size. each 4k has a checksum */
+ __u64 location; /* sector. For data, it's raid sector. For
+ parity, it's stripe sector */
+ __u32 checksum[];
+} __attribute__ ((__packed__));
+
+enum r5l_payload_data_parity_flag {
+ R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */
+ /*
+ * RESHAPED/RESHAPING is only set when there is reshape activity. Note,
+ * both data/parity of a stripe should have the same flag set
+ *
+ * RESHAPED: reshape is running, and this stripe finished reshape
+ * RESHAPING: reshape is running, and this stripe isn't reshaped
+ * */
+ R5LOG_PAYLOAD_FLAG_RESHAPED = 2,
+ R5LOG_PAYLOAD_FLAG_RESHAPING = 3,
+};
+
+struct r5l_payload_flush {
+ struct r5l_payload_header header;
+ __u32 size; /* flush_stripes size, bytes */
+ __u64 flush_stripes[];
+} __attribute__ ((__packed__));
+
+enum r5l_payload_flush_flag {
+ R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */
+};
+
+struct r5l_meta_block {
+ __u32 magic;
+ __u32 checksum;
+ __u8 version;
+ __u8 __zero_pading_1;
+ __u16 __zero_pading_2;
+ __u32 meta_size; /* whole size of the block */
+
+ __u64 seq;
+ __u64 position; /* sector, start from rdev->data_offset, current position */
+ struct r5l_payload_header payloads[];
+} __attribute__ ((__packed__));
+
+#define R5LOG_VERSION 0x1
+#define R5LOG_MAGIC 0x6433c509
+
#endif
diff --git a/md_u.h b/md_u.h
index be9868a..f570a34 100644
--- a/md_u.h
+++ b/md_u.h
@@ -44,6 +44,7 @@
#define STOP_ARRAY _IO (MD_MAJOR, 0x32)
#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
+#define CLUSTERED_DISK_NACK _IO (MD_MAJOR, 0x35)
typedef struct mdu_version_s {
int major;
@@ -58,7 +59,7 @@ typedef struct mdu_array_info_s {
int major_version;
int minor_version;
int patch_version;
- int ctime;
+ unsigned int ctime;
int level;
int size;
int nr_disks;
@@ -69,7 +70,7 @@ typedef struct mdu_array_info_s {
/*
* Generic state information
*/
- int utime; /* 0 Superblock update time */
+ unsigned int utime; /* 0 Superblock update time */
int state; /* 1 State bits (clean, ...) */
int active_disks; /* 2 Number of currently active disks */
int working_disks; /* 3 Number of working disks */
diff --git a/mdadm.8.in b/mdadm.8.in
index 14bd8b9..50be1aa 100644
--- a/mdadm.8.in
+++ b/mdadm.8.in
@@ -5,7 +5,7 @@
.\" the Free Software Foundation; either version 2 of the License, or
.\" (at your option) any later version.
.\" See file COPYING in distribution for details.
-.TH MDADM 8 "" v3.3.4
+.TH MDADM 8 "" v3.4
.SH NAME
mdadm \- manage MD devices
.I aka
@@ -267,13 +267,13 @@ the exact meaning of this option in different contexts.
.TP
.BR \-c ", " \-\-config=
Specify the config file or directory. Default is to use
-.B /etc/mdadm/mdadm.conf
+.B /etc/mdadm.conf
and
-.BR /etc/mdadm/mdadm.conf.d ,
+.BR /etc/mdadm.conf.d ,
or if those are missing then
-.B /etc/mdadm.conf
+.B /etc/mdadm/mdadm.conf
and
-.BR /etc/mdadm.conf.d .
+.BR /etc/mdadm/mdadm.conf.d .
If the config file given is
.B "partitions"
then nothing will be read, but
@@ -422,6 +422,12 @@ This functionality is currently only provided by
and
.BR \-\-monitor .
+.TP
+.B \-\-home\-cluster=
+specifies the cluster name for the md device. The md device can be assembled
+only on the cluster which matches the name specified. If this option is not
+provided, mdadm tries to detect the cluster name automatically.
+
.SH For create, build, or grow:
.TP
@@ -701,7 +707,12 @@ and so is replicated on all devices. If the word
.B "none"
is given with
.B \-\-grow
-mode, then any bitmap that is present is removed.
+mode, then any bitmap that is present is removed. If the word
+.B "clustered"
+is given, the array is created for a clustered environment. One bitmap
+is created for each node as defined by the
+.B \-\-nodes
+parameter, and these bitmaps are stored internally.
To help catch typing errors, the filename must contain at least one
slash ('/') if it is a real file (not 'internal' or 'none').
@@ -973,6 +984,18 @@ However for RAID0, it is not possible to add spares. So to increase
the number of devices in a RAID0, it is necessary to set the new
number of devices, and to add the new devices, in the same command.
+.TP
+.BR \-\-nodes
+Only works when the array is for a clustered environment. It specifies
+the maximum number of nodes in the cluster that will use this device
+simultaneously. If not specified, this defaults to 4.
+
+.TP
+.BR \-\-write-journal
+Specify journal device for the RAID-4/5/6 array. The journal device
+should be an SSD with a reasonable lifetime.
+
+
.SH For assemble:
.TP
@@ -1087,7 +1110,9 @@ argument given to this flag can be one of
.BR summaries ,
.BR uuid ,
.BR name ,
+.BR nodes ,
.BR homehost ,
+.BR home-cluster ,
.BR resync ,
.BR byteorder ,
.BR devicesize ,
@@ -1142,6 +1167,13 @@ of the array as stored in the superblock. This is only supported for
version-1 superblocks.
The
+.B nodes
+option will change the
+.I nodes
+of the array as stored in the bitmap superblock. This option only
+works for a clustered environment.
+
+The
.B homehost
option will change the
.I homehost
@@ -1150,6 +1182,11 @@ same as updating the UUID.
For version-1 superblocks, this involves updating the name.
The
+.B home\-cluster
+option will change the cluster name as recorded in the superblock and
+bitmap. This option only works for a clustered environment.
+
+The
.B resync
option will cause the array to be marked
.I dirty
@@ -1396,6 +1433,15 @@ will avoid reading from these devices if possible.
.BR \-\-readwrite
Subsequent devices that are added or re\-added will have the 'write-mostly'
flag cleared.
+.TP
+.BR \-\-cluster\-confirm
+Confirm the existence of the device. This is issued in response to an \-\-add
+request by a node in a cluster. When a node adds a device it sends a message
+to all nodes in the cluster to look for a device with a UUID. This translates
+to a udev notification with the UUID of the device to be added and the slot
+number. The receiving node must acknowledge this message
+with \-\-cluster\-confirm. Valid arguments are <slot>:<devicename> in case
+the device is found or <slot>:missing in case the device is not found.
.P
Each of these options requires that the first device listed is the array
@@ -1803,9 +1849,9 @@ The config file is only used if explicitly named with
or requested with (a possibly implicit)
.BR \-\-scan .
In the later case,
-.B /etc/mdadm/mdadm.conf
-or
.B /etc/mdadm.conf
+or
+.B /etc/mdadm/mdadm.conf
is used.
If
@@ -3099,7 +3145,7 @@ uses this to find arrays when
is given in Misc mode, and to monitor array reconstruction
on Monitor mode.
-.SS /etc/mdadm/mdadm.conf (or /etc/mdadm.conf)
+.SS /etc/mdadm.conf
The config file lists which devices may be scanned to see if
they contain MD super block, and gives identifying information
@@ -3107,7 +3153,7 @@ they contain MD super block, and gives identifying information
.BR mdadm.conf (5)
for more details.
-.SS /etc/mdadm/mdadm.conf.d (or /etc/mdadm.conf.d)
+.SS /etc/mdadm.conf.d
A directory containing configuration files which are read in lexical
order.
diff --git a/mdadm.c b/mdadm.c
index 93732a8..51e16f3 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -74,6 +74,7 @@ int main(int argc, char *argv[])
.require_homehost = 1,
};
struct shape s = {
+ .journaldisks = 0,
.level = UnSet,
.layout = UnSet,
.bitmap_chunk = UnSet,
@@ -189,6 +190,7 @@ int main(int argc, char *argv[])
case 'a':
case Add:
case AddSpare:
+ case AddJournal:
case 'r':
case Remove:
case Replace:
@@ -196,6 +198,7 @@ int main(int argc, char *argv[])
case 'f':
case Fail:
case ReAdd: /* re-add */
+ case ClusterConfirm:
if (!mode) {
newmode = MANAGE;
shortopt = short_bitmap_options;
@@ -588,7 +591,23 @@ int main(int argc, char *argv[])
}
ident.raid_disks = s.raiddisks;
continue;
-
+ case O(ASSEMBLE, Nodes):
+ case O(CREATE, Nodes):
+ c.nodes = parse_num(optarg);
+ if (c.nodes <= 0) {
+ pr_err("invalid number for the number of cluster nodes: %s\n",
+ optarg);
+ exit(2);
+ }
+ continue;
+ case O(CREATE, ClusterName):
+ case O(ASSEMBLE, ClusterName):
+ c.homecluster = optarg;
+ if (strlen(c.homecluster) > 64) {
+ pr_err("Cluster name too big.\n");
+ exit(ERANGE);
+ }
+ continue;
case O(CREATE,'x'): /* number of spare (eXtra) disks */
if (s.sparedisks) {
pr_err("spare-devices set twice: %d and %s\n",
@@ -726,6 +745,10 @@ int main(int argc, char *argv[])
continue;
if (strcmp(c.update, "homehost")==0)
continue;
+ if (strcmp(c.update, "home-cluster")==0)
+ continue;
+ if (strcmp(c.update, "nodes")==0)
+ continue;
if (strcmp(c.update, "devicesize")==0)
continue;
if (strcmp(c.update, "no-bitmap")==0)
@@ -734,6 +757,8 @@ int main(int argc, char *argv[])
continue;
if (strcmp(c.update, "no-bbl") == 0)
continue;
+ if (strcmp(c.update, "force-no-bbl") == 0)
+ continue;
if (strcmp(c.update, "metadata") == 0)
continue;
if (strcmp(c.update, "revert-reshape") == 0)
@@ -764,10 +789,10 @@ int main(int argc, char *argv[])
Name, c.update);
}
fprintf(outf, "Valid --update options are:\n"
- " 'sparc2.2', 'super-minor', 'uuid', 'name', 'resync',\n"
- " 'summaries', 'homehost', 'byteorder', 'devicesize',\n"
+ " 'sparc2.2', 'super-minor', 'uuid', 'name', 'nodes', 'resync',\n"
+ " 'summaries', 'homehost', 'home-cluster', 'byteorder', 'devicesize',\n"
" 'no-bitmap', 'metadata', 'revert-reshape'\n"
- " 'bbl', 'no-bbl'\n"
+ " 'bbl', 'no-bbl', 'force-no-bbl'\n"
);
exit(outf == stdout ? 0 : 2);
@@ -785,8 +810,9 @@ int main(int argc, char *argv[])
c.update = optarg;
if (strcmp(c.update, "devicesize") != 0 &&
strcmp(c.update, "bbl") != 0 &&
+ strcmp(c.update, "force-no-bbl") != 0 &&
strcmp(c.update, "no-bbl") != 0) {
- pr_err("only 'devicesize', 'bbl' and 'no-bbl' can be updated with --re-add\n");
+ pr_err("only 'devicesize', 'bbl', 'no-bbl', and 'force-no-bbl' can be updated with --re-add\n");
exit(2);
}
continue;
@@ -903,6 +929,13 @@ int main(int argc, char *argv[])
case O(MANAGE,AddSpare): /* add drive - never re-add */
devmode = 'S';
continue;
+ case O(MANAGE,AddJournal): /* add journal */
+ if (s.journaldisks && (s.level < 4 || s.level > 6)) {
+ pr_err("--add-journal is only supported for RAID level 4/5/6.\n");
+ exit(2);
+ }
+ devmode = 'j';
+ continue;
case O(MANAGE,ReAdd):
devmode = 'A';
continue;
@@ -919,6 +952,9 @@ int main(int argc, char *argv[])
* remove the device */
devmode = 'f';
continue;
+ case O(MANAGE, ClusterConfirm):
+ devmode = 'c';
+ continue;
case O(MANAGE,Replace):
/* Mark these devices for replacement */
devmode = 'R';
@@ -1097,6 +1133,15 @@ int main(int argc, char *argv[])
s.bitmap_file = optarg;
continue;
}
+ if (strcmp(optarg, "clustered")== 0) {
+ s.bitmap_file = optarg;
+ /* Set the default number of cluster nodes
+ * to 4 if not already set by user
+ */
+ if (c.nodes < 1)
+ c.nodes = 4;
+ continue;
+ }
/* probable typo */
pr_err("bitmap file must contain a '/', or be 'internal', or 'none'\n"
" not '%s'\n", optarg);
@@ -1137,6 +1182,23 @@ int main(int argc, char *argv[])
case O(INCREMENTAL, IncrementalPath):
remove_path = optarg;
continue;
+ case O(CREATE, WriteJournal):
+ if (s.journaldisks) {
+ pr_err("Please specify only one journal device for the array.\n");
+ pr_err("Ignoring --write-journal %s...\n", optarg);
+ continue;
+ }
+ dv = xmalloc(sizeof(*dv));
+ dv->devname = optarg;
+ dv->disposition = 'j'; /* WriteJournal */
+ dv->used = 0;
+ dv->next = NULL;
+ *devlistend = dv;
+ devlistend = &dv->next;
+ devs_found++;
+
+ s.journaldisks = 1;
+ continue;
}
/* We have now processed all the valid options. Anything else is
* an error
@@ -1164,6 +1226,11 @@ int main(int argc, char *argv[])
exit(0);
}
+ if (s.journaldisks && (s.level < 4 || s.level > 6)) {
+ pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
+ exit(2);
+ }
+
if (!mode && devs_found) {
mode = MISC;
devmode = 'Q';
@@ -1260,6 +1327,20 @@ int main(int argc, char *argv[])
c.require_homehost = 0;
}
+ rv = 0;
+
+ set_hooks(); /* set hooks from libs */
+
+ if (c.homecluster == NULL && (c.nodes > 0)) {
+ c.homecluster = conf_get_homecluster();
+ if (c.homecluster == NULL)
+ rv = get_cluster_name(&c.homecluster);
+ if (rv) {
+ pr_err("The md can't get cluster name\n");
+ exit(1);
+ }
+ }
+
if (c.backup_file && data_offset != INVALID_SECTORS) {
pr_err("--backup-file and --data-offset are incompatible\n");
exit(2);
@@ -1279,7 +1360,6 @@ int main(int argc, char *argv[])
/* --scan implied --brief unless -vv */
c.brief = 1;
- rv = 0;
switch(mode) {
case MANAGE:
/* readonly, add/remove, readwrite, runstop */
@@ -1366,8 +1446,9 @@ int main(int argc, char *argv[])
}
if (s.bitmap_file) {
- if (strcmp(s.bitmap_file, "internal")==0) {
- pr_err("'internal' bitmaps not supported with --build\n");
+ if (strcmp(s.bitmap_file, "internal")==0 ||
+ strcmp(s.bitmap_file, "clustered") == 0) {
+ pr_err("'internal' and 'clustered' bitmaps not supported with --build\n");
rv |= 1;
break;
}
@@ -1377,6 +1458,21 @@ int main(int argc, char *argv[])
case CREATE:
if (c.delay == 0)
c.delay = DEFAULT_BITMAP_DELAY;
+
+ if (c.nodes) {
+ if (!s.bitmap_file || strcmp(s.bitmap_file, "clustered") != 0) {
+ pr_err("--nodes argument only compatible with --bitmap=clustered\n");
+ rv = 1;
+ break;
+ }
+
+ if (s.level != 1) {
+ pr_err("--bitmap=clustered is currently supported with RAID mirror only\n");
+ rv = 1;
+ break;
+ }
+ }
+
if (s.write_behind && !s.bitmap_file) {
pr_err("write-behind mode requires a bitmap.\n");
rv = 1;
@@ -1442,8 +1538,6 @@ int main(int argc, char *argv[])
else
c.delay = 60;
}
- if (c.delay == 0)
- c.delay = 60;
rv= Monitor(devlist, mailaddr, program,
&c, daemonise, oneshot,
dosyslog, pidfile, increments,
diff --git a/mdadm.conf.5 b/mdadm.conf.5
index 542e263..18512cb 100644
--- a/mdadm.conf.5
+++ b/mdadm.conf.5
@@ -8,7 +8,7 @@
.SH NAME
mdadm.conf \- configuration for management of Software RAID with mdadm
.SH SYNOPSIS
-/etc/mdadm/mdadm.conf
+/etc/mdadm.conf
.SH DESCRIPTION
.PP
.I mdadm
diff --git a/mdadm.h b/mdadm.h
index b597658..dd02be7 100644..100755
--- a/mdadm.h
+++ b/mdadm.h
@@ -35,6 +35,7 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
#include <sys/types.h>
#include <sys/stat.h>
+#include <stdint.h>
#include <stdlib.h>
#include <time.h>
#include <sys/time.h>
@@ -51,6 +52,32 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
#define srandom srand
#endif
+#ifdef NO_COROSYNC
+#define CS_OK 1
+typedef uint64_t cmap_handle_t;
+#else
+#include <corosync/cmap.h>
+#endif
+
+#ifndef NO_DLM
+#include <libdlm.h>
+#include <errno.h>
+#else
+#define LKF_NOQUEUE 0x00000001
+#define LKF_CONVERT 0x00000004
+#define LKM_PWMODE 4
+#define EUNLOCK 0x10002
+
+typedef void *dlm_lshandle_t;
+
+struct dlm_lksb {
+ int sb_status;
+ uint32_t sb_lkid;
+ char sb_flags;
+ char *sb_lvbptr;
+};
+#endif
+
#include <linux/kdev_t.h>
/*#include <linux/fs.h> */
#include <sys/mount.h>
@@ -162,6 +189,31 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
#endif /* __KLIBC__ */
/*
+ * Check at compile time that something is of a particular type.
+ * Always evaluates to 1 so you may use it easily in comparisons.
+*/
+
+#define typecheck(type,x) \
+({ type __dummy; \
+ typeof(x) __dummy2; \
+ (void)(&__dummy == &__dummy2); \
+ 1; \
+})
+
+/*
+ * These inlines deal with timer wrapping correctly.
+ *
+ * time_after(a,b) returns true if the time a is after time b.
+*/
+
+#define time_after(a,b) \
+ (typecheck(unsigned int, a) && \
+ typecheck(unsigned int, b) && \
+ ((int)((b) - (a)) < 0))
+
+#define time_before(a,b) time_after(b,a)
+
+/*
* min()/max()/clamp() macros that also do
* strict type-checking.. See the
* "unnecessary" pointer comparison.
@@ -210,6 +262,9 @@ struct mdinfo {
* for native metadata it is
* reshape_active field mirror
*/
+ int journal_device_required;
+ int journal_clean;
+
/* During reshape we can sometimes change the data_offset to avoid
* over-writing still-valid data. We need to know if there is space.
* So getinfo_super will fill in space_before and space_after in sectors.
@@ -251,6 +306,8 @@ struct mdinfo {
#define DS_UNBLOCK 2048
int prev_state, curr_state, next_state;
+ /* info read from sysfs */
+ char sysfs_array_state[20];
};
struct createinfo {
@@ -313,6 +370,7 @@ enum special_options {
ManageOpt,
Add,
AddSpare,
+ AddJournal,
Remove,
Fail,
Replace,
@@ -344,6 +402,10 @@ enum special_options {
Dump,
Restore,
Action,
+ Nodes,
+ ClusterName,
+ ClusterConfirm,
+ WriteJournal,
};
enum prefix_standard {
@@ -351,6 +413,12 @@ enum prefix_standard {
IEC
};
+/*
+ * Value passed as the 'update' argument of the write_bitmap() method.
+ * NOTE(review): NameUpdate and NodeNumUpdate presumably select rewriting
+ * the cluster name and the node count of a clustered bitmap - confirm
+ * against the write_bitmap implementations.
+ */
+enum bitmap_update {
+ NoUpdate,
+ NameUpdate,
+ NodeNumUpdate,
+};
+
/* structures read from config file */
/* List of mddevice names and identifiers
* Identifiers can be:
@@ -418,11 +486,14 @@ struct context {
char *backup_file;
int invalid_backup;
char *action;
+ int nodes;
+ char *homecluster;
};
struct shape {
int raiddisks;
int sparedisks;
+ int journaldisks;
int level;
int layout;
char *layout_str;
@@ -521,6 +592,7 @@ enum sysfs_read_flags {
GET_SIZE = (1 << 22),
GET_STATE = (1 << 23),
GET_ERROR = (1 << 24),
+ GET_ARRAY_STATE = (1 << 25),
};
/* If fd >= 0, get the array it is open on,
@@ -528,6 +600,7 @@ enum sysfs_read_flags {
*/
extern int sysfs_open(char *devnm, char *devname, char *attr);
extern void sysfs_init(struct mdinfo *mdi, int fd, char *devnm);
+extern void sysfs_init_dev(struct mdinfo *mdi, unsigned long devid);
extern void sysfs_free(struct mdinfo *sra);
extern struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options);
extern int sysfs_attr_match(const char *attr, const char *str);
@@ -747,7 +820,8 @@ extern struct superswitch {
* readwrite - clear the WriteMostly1 bit in the superblock devflags
* no-bitmap - clear any record that a bitmap is present.
* bbl - add a bad-block-log if possible
- * no-bbl - remove and bad-block-log is it is empty.
+ * no-bbl - remove any bad-block-log if it is empty.
+ * force-no-bbl - remove any bad-block-log even if it is not empty.
* revert-reshape - If a reshape is in progress, modify metadata so
* it will resume going in the opposite direction.
*/
@@ -830,11 +904,11 @@ extern struct superswitch {
/* Seek 'fd' to start of write-intent-bitmap. Must be an
* md-native format bitmap
*/
- void (*locate_bitmap)(struct supertype *st, int fd);
+ int (*locate_bitmap)(struct supertype *st, int fd);
/* if add_internal_bitmap succeeded for existing array, this
* writes it out.
*/
- int (*write_bitmap)(struct supertype *st, int fd);
+ int (*write_bitmap)(struct supertype *st, int fd, enum bitmap_update update);
/* Free the superblock and any other allocated data */
void (*free_super)(struct supertype *st);
@@ -1018,6 +1092,8 @@ struct supertype {
*/
int devcnt;
int retry_soon;
+ int nodes;
+ char *cluster_name;
struct mdinfo *devs;
@@ -1264,6 +1340,7 @@ extern int parse_uuid(char *str, int uuid[4]);
extern int parse_layout_10(char *layout);
extern int parse_layout_faulty(char *layout);
extern long parse_num(char *num);
+extern int parse_cluster_confirm_arg(char *inp, char **devname, int *slot);
extern int check_ext2(int fd, char *name);
extern int check_reiser(int fd, char *name);
extern int check_raid(int fd, char *name);
@@ -1294,6 +1371,7 @@ extern char *conf_get_mailaddr(void);
extern char *conf_get_mailfrom(void);
extern char *conf_get_program(void);
extern char *conf_get_homehost(int *require_homehostp);
+extern char *conf_get_homecluster(void);
extern char *conf_line(FILE *file);
extern char *conf_word(FILE *file, int allow_key);
extern void print_quoted(char *str);
@@ -1403,6 +1481,45 @@ extern char *fd2devnm(int fd);
extern int in_initrd(void);
+/*
+ * Table of cmap entry points held as function pointers, together with
+ * an opaque library handle.
+ * NOTE(review): presumably filled in by set_cmap_hooks() when the
+ * corosync library can be loaded at run time - confirm at the definition.
+ */
+struct cmap_hooks {
+ void *cmap_handle; /* corosync lib related */
+
+ int (*initialize)(cmap_handle_t *handle);
+ int (*get_string)(cmap_handle_t handle,
+ const char *string,
+ char **name);
+ int (*finalize)(cmap_handle_t handle);
+};
+
+extern void set_cmap_hooks(void);
+extern void set_hooks(void);
+
+/*
+ * Table of dlm entry points held as function pointers, together with
+ * an opaque library handle.
+ * NOTE(review): presumably filled in by set_dlm_hooks() when libdlm can
+ * be loaded at run time - confirm at the definition.
+ */
+struct dlm_hooks {
+ void *dlm_handle; /* dlm lib related */
+
+ dlm_lshandle_t (*create_lockspace)(const char *name,
+ unsigned int mode);
+ int (*release_lockspace)(const char *name, dlm_lshandle_t ls,
+ int force);
+ int (*ls_lock)(dlm_lshandle_t lockspace, uint32_t mode,
+ struct dlm_lksb *lksb, uint32_t flags,
+ const void *name, unsigned int namelen,
+ uint32_t parent, void (*astaddr) (void *astarg),
+ void *astarg, void (*bastaddr) (void *astarg),
+ void *range);
+ int (*ls_unlock)(dlm_lshandle_t lockspace, uint32_t lkid,
+ uint32_t flags, struct dlm_lksb *lksb,
+ void *astarg);
+ int (*ls_get_fd)(dlm_lshandle_t ls);
+ int (*dispatch)(int fd);
+};
+
+extern int get_cluster_name(char **name);
+extern int dlm_funs_ready(void);
+extern int cluster_get_dlmlock(int *lockid);
+extern int cluster_release_dlmlock(int lockid);
+extern void set_dlm_hooks(void);
+
#define _ROUND_UP(val, base) (((val) + (base) - 1) & ~(base - 1))
#define ROUND_UP(val, base) _ROUND_UP(val, (typeof(val))(base))
#define ROUND_UP_PTR(ptr, base) ((typeof(ptr)) \
diff --git a/mdadm.spec b/mdadm.spec
index 293cb19..685a564 100644
--- a/mdadm.spec
+++ b/mdadm.spec
@@ -1,6 +1,6 @@
Summary: mdadm is used for controlling Linux md devices (aka RAID arrays)
Name: mdadm
-Version: 3.3.4
+Version: 3.4
Release: 1
Source: http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tar.gz
URL: http://neil.brown.name/blog/mdadm
diff --git a/mdassemble.8 b/mdassemble.8
index 601c1d1..d0c83c3 100644
--- a/mdassemble.8
+++ b/mdassemble.8
@@ -1,5 +1,5 @@
.\" -*- nroff -*-
-.TH MDASSEMBLE 8 "" v3.3.4
+.TH MDASSEMBLE 8 "" v3.4
.SH NAME
mdassemble \- assemble MD devices
.I aka
@@ -40,7 +40,7 @@ There are no options to
.SH FILES
-.SS /etc/mdadm/mdadm.conf
+.SS /etc/mdadm.conf
The config file lists which devices may be scanned to see if
they contain MD super block, and gives identifying information
diff --git a/mdmon.8 b/mdmon.8
index beb82e0..cc6add8 100644
--- a/mdmon.8
+++ b/mdmon.8
@@ -1,5 +1,5 @@
.\" See file COPYING in distribution for details.
-.TH MDMON 8 "" v3.3.4
+.TH MDMON 8 "" v3.4
.SH NAME
mdmon \- monitor MD external metadata arrays
diff --git a/mdmon.c b/mdmon.c
index ee12b7c..e4b73d9 100644
--- a/mdmon.c
+++ b/mdmon.c
@@ -235,7 +235,7 @@ static int make_control_sock(char *devname)
addr.sun_family = PF_LOCAL;
strcpy(addr.sun_path, path);
umask(077); /* ensure no world write access */
- if (bind(sfd, &addr, sizeof(addr)) < 0) {
+ if (bind(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
close(sfd);
return -1;
}
diff --git a/msg.c b/msg.c
index 754630b..45cd450 100644
--- a/msg.c
+++ b/msg.c
@@ -170,7 +170,7 @@ int connect_monitor(char *devname)
addr.sun_family = PF_LOCAL;
strcpy(addr.sun_path, path);
- if (connect(sfd, &addr, sizeof(addr)) < 0) {
+ if (connect(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
close(sfd);
return -1;
}
diff --git a/platform-intel.c b/platform-intel.c
index edb8679..88818f3 100644
--- a/platform-intel.c
+++ b/platform-intel.c
@@ -33,8 +33,6 @@
static int devpath_to_ll(const char *dev_path, const char *entry,
unsigned long long *val);
-static __u16 devpath_to_vendor(const char *dev_path);
-
static void free_sys_dev(struct sys_dev **list)
{
while (*list) {
@@ -57,6 +55,7 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver)
struct dirent *de;
struct sys_dev *head = NULL;
struct sys_dev *list = NULL;
+ struct sys_dev *vmd = NULL;
enum sys_dev_type type;
unsigned long long dev_id;
unsigned long long class;
@@ -65,17 +64,25 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver)
type = SYS_DEV_SAS;
else if (strcmp(driver, "ahci") == 0)
type = SYS_DEV_SATA;
- else if (strcmp(driver, "nvme") == 0)
+ else if (strcmp(driver, "nvme") == 0) {
+ /* if looking for nvme devs, first look for vmd */
+ vmd = find_driver_devices("pci", "vmd");
type = SYS_DEV_NVME;
+ } else if (strcmp(driver, "vmd") == 0)
+ type = SYS_DEV_VMD;
else
type = SYS_DEV_UNKNOWN;
sprintf(path, "/sys/bus/%s/drivers/%s", bus, driver);
driver_dir = opendir(path);
- if (!driver_dir)
+ if (!driver_dir) {
+ if (vmd)
+ free_sys_dev(&vmd);
return NULL;
+ }
for (de = readdir(driver_dir); de; de = readdir(driver_dir)) {
int n;
+ int skip = 0;
/* is 'de' a device? check that the 'subsystem' link exists and
* that its target matches 'bus'
@@ -95,8 +102,19 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver)
sprintf(path, "/sys/bus/%s/drivers/%s/%s",
bus, driver, de->d_name);
- /* if it's not Intel device skip it. */
- if (devpath_to_vendor(path) != 0x8086)
+ /* if searching for nvme - skip vmd connected one */
+ if (type == SYS_DEV_NVME) {
+ struct sys_dev *dev;
+ char *rp = realpath(path, NULL);
+ for (dev = vmd; dev; dev = dev->next) {
+ if ((strncmp(dev->path, rp, strlen(dev->path)) == 0))
+ skip = 1;
+ }
+ free(rp);
+ }
+
+ /* if it's not Intel device or mark as VMD connected - skip it. */
+ if (devpath_to_vendor(path) != 0x8086 || skip == 1)
continue;
if (devpath_to_ll(path, "device", &dev_id) != 0)
@@ -122,12 +140,28 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver)
list->dev_id = (__u16) dev_id;
list->class = (__u32) class;
list->type = type;
+ /* Each VMD device (domain) adds separate PCI bus, it is better to
+ * store path as a path to that bus (easier further determination which
+ * NVMe dev is connected to this particular VMD domain).
+ */
+ if (type == SYS_DEV_VMD) {
+ sprintf(path, "/sys/bus/%s/drivers/%s/%s/domain/device",
+ bus, driver, de->d_name);
+ }
list->path = realpath(path, NULL);
list->next = NULL;
if ((list->pci_id = strrchr(list->path, '/')) != NULL)
list->pci_id++;
}
closedir(driver_dir);
+
+ if (vmd) {
+ if (list)
+ list->next = vmd;
+ else
+ head = vmd;
+ }
+
return head;
}
@@ -160,7 +194,7 @@ static int devpath_to_ll(const char *dev_path, const char *entry, unsigned long
return n;
}
-static __u16 devpath_to_vendor(const char *dev_path)
+__u16 devpath_to_vendor(const char *dev_path)
{
char path[strlen(dev_path) + strlen("/vendor") + 1];
char vendor[7];
@@ -196,6 +230,7 @@ struct sys_dev *find_intel_devices(void)
isci = find_driver_devices("pci", "isci");
ahci = find_driver_devices("pci", "ahci");
+ /* Searching for NVMe will return list of NVMe and VMD controllers */
nvme = find_driver_devices("pci", "nvme");
if (!isci && !ahci) {
@@ -430,6 +465,7 @@ static const struct imsm_orom *find_imsm_hba_orom(struct sys_dev *hba)
#define AHCI_PROP "RstSataV"
#define AHCI_SSATA_PROP "RstsSatV"
#define AHCI_CSATA_PROP "RstCSatV"
+#define VMD_PROP "RstUefiV"
#define VENDOR_GUID \
EFI_GUID(0x193dfefa, 0xa445, 0x4302, 0x99, 0xd8, 0xef, 0x3a, 0xad, 0x1a, 0x04, 0xc6)
@@ -545,15 +581,21 @@ const struct imsm_orom *find_imsm_efi(struct sys_dev *hba)
if (!csata)
csata = add_orom(&orom);
add_orom_device_id(csata, hba->dev_id);
+ csata->type = hba->type;
return &csata->orom;
}
}
+ if (hba->type == SYS_DEV_VMD) {
+ err = read_efi_variable(&orom, sizeof(orom), VMD_PROP, VENDOR_GUID);
+ }
+
if (err)
return NULL;
ret = add_orom(&orom);
add_orom_device_id(ret, hba->dev_id);
+ ret->type = hba->type;
return &ret->orom;
}
@@ -583,6 +625,7 @@ const struct imsm_orom *find_imsm_nvme(struct sys_dev *hba)
nvme_orom = add_orom(&nvme_orom_compat);
}
add_orom_device_id(nvme_orom, hba->dev_id);
+ nvme_orom->type = SYS_DEV_NVME;
return &nvme_orom->orom;
}
@@ -667,3 +710,32 @@ int disk_attached_to_hba(int fd, const char *hba_path)
return rc;
}
+
+/*
+ * Map a VMD domain back to the VMD controller entry that owns it.
+ *
+ * Walks /sys/bus/pci/drivers/vmd, resolves each entry's "domain/device"
+ * link into buf and compares the result against hba->path.  On the first
+ * match buf is overwritten with the resolved path of the controller
+ * entry itself.
+ *
+ * hba: device to look up; only SYS_DEV_VMD devices are handled.
+ * buf: caller-supplied output buffer handed to realpath(), so it must
+ *      be able to hold PATH_MAX bytes.
+ *
+ * Returns buf on success.  Returns NULL if hba is NULL, is not a VMD
+ * device, the driver directory cannot be opened, no entry matches, or
+ * the final realpath() fails.
+ */
+char *vmd_domain_to_controller(struct sys_dev *hba, char *buf)
+{
+	struct dirent *ent;
+	DIR *dir;
+	char path[PATH_MAX];
+	char *controller = NULL;
+
+	if (!hba)
+		return NULL;
+
+	if (hba->type != SYS_DEV_VMD)
+		return NULL;
+
+	dir = opendir("/sys/bus/pci/drivers/vmd");
+	if (!dir)
+		return NULL;
+
+	while ((ent = readdir(dir)) != NULL) {
+		sprintf(path, "/sys/bus/pci/drivers/vmd/%s/domain/device",
+			ent->d_name);
+
+		/* entries without a resolvable domain link are not domains */
+		if (!realpath(path, buf))
+			continue;
+
+		if (strncmp(buf, hba->path, strlen(buf)) == 0) {
+			sprintf(path, "/sys/bus/pci/drivers/vmd/%s", ent->d_name);
+			controller = realpath(path, buf);
+			break;
+		}
+	}
+
+	/* release the directory stream whichever way the scan ended */
+	closedir(dir);
+	return controller;
+}
diff --git a/platform-intel.h b/platform-intel.h
index 695d6c6..a8ae85f 100644
--- a/platform-intel.h
+++ b/platform-intel.h
@@ -189,6 +189,7 @@ enum sys_dev_type {
SYS_DEV_SAS,
SYS_DEV_SATA,
SYS_DEV_NVME,
+ SYS_DEV_VMD,
SYS_DEV_MAX
};
@@ -213,6 +214,7 @@ struct devid_list {
struct orom_entry {
struct imsm_orom orom;
struct devid_list *devid_list;
+ enum sys_dev_type type;
struct orom_entry *next;
};
@@ -229,6 +231,7 @@ static inline char *guid_str(char *buf, struct efi_guid guid)
}
char *diskfd_to_devpath(int fd);
+__u16 devpath_to_vendor(const char *dev_path);
struct sys_dev *find_driver_devices(const char *bus, const char *driver);
struct sys_dev *find_intel_devices(void);
const struct imsm_orom *find_imsm_capability(struct sys_dev *hba);
@@ -241,3 +244,4 @@ const char *get_sys_dev_type(enum sys_dev_type);
const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id);
const struct imsm_orom *get_orom_by_device_id(__u16 device_id);
struct sys_dev *device_by_id(__u16 device_id);
+char *vmd_domain_to_controller(struct sys_dev *hba, char *buf);
diff --git a/raid6check.c b/raid6check.c
index cb8522e..ad7ffe7 100644
--- a/raid6check.c
+++ b/raid6check.c
@@ -349,7 +349,8 @@ int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets,
if (!tables_ready)
make_tables();
- posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size);
+ if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size) != 0)
+ exit(4);
block_index_for_slot += 2;
blocks += 2;
blocks_page += 2;
diff --git a/restripe.c b/restripe.c
index 4d92190..56dca73 100644
--- a/restripe.c
+++ b/restripe.c
@@ -434,7 +434,7 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs,
/* Try to find out if a specific disk has a problem */
int raid6_check_disks(int data_disks, int start, int chunk_size,
int level, int layout, int diskP, int diskQ,
- char *p, char *q, char **stripes)
+ uint8_t *p, uint8_t *q, char **stripes)
{
int i;
int data_id, diskD;
@@ -827,8 +827,8 @@ int test_stripes(int *source, unsigned long long *offsets,
char *stripe_buf = xmalloc(raid_disks * chunk_size);
char **stripes = xmalloc(raid_disks * sizeof(char*));
char **blocks = xmalloc(raid_disks * sizeof(char*));
- char *p = xmalloc(chunk_size);
- char *q = xmalloc(chunk_size);
+ uint8_t *p = xmalloc(chunk_size);
+ uint8_t *q = xmalloc(chunk_size);
int i;
int diskP, diskQ;
diff --git a/sha1.h b/sha1.h
index 0f98658..999fc6a 100644
--- a/sha1.h
+++ b/sha1.h
@@ -22,7 +22,7 @@
#include <stdio.h>
-#if 1 /* defined HAVE_LIMITS_H || _LIBC */
+#if defined HAVE_LIMITS_H || _LIBC
# include <limits.h>
#endif
@@ -33,9 +33,9 @@
the resulting executable. Locally running cross-compiled executables
is usually not possible. */
-#if 1 /* def _LIBC */
-# include <stdint.h>
-typedef uint32_t sha1_uint32;
+#ifdef _LIBC
+# include <sys/types.h>
+typedef u_int32_t sha1_uint32;
typedef uintptr_t sha1_uintptr;
#else
# define INT_MAX_32_BITS 2147483647
diff --git a/super-intel.c b/super-intel.c
index 95a72b6..90b7b6d 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -510,7 +510,8 @@ static const char *_sys_dev_type[] = {
[SYS_DEV_UNKNOWN] = "Unknown",
[SYS_DEV_SAS] = "SAS",
[SYS_DEV_SATA] = "SATA",
- [SYS_DEV_NVME] = "NVMe"
+ [SYS_DEV_NVME] = "NVMe",
+ [SYS_DEV_VMD] = "VMD"
};
const char *get_sys_dev_type(enum sys_dev_type type)
@@ -565,6 +566,10 @@ static int attach_hba_to_super(struct intel_super *super, struct sys_dev *device
if (device->type != hba->type)
return 2;
+ /* Always forbid spanning between VMD domains (seen as different controllers by mdadm) */
+ if (device->type == SYS_DEV_VMD && !path_attached_to_hba(device->path, hba->path))
+ return 2;
+
/* Multiple same type HBAs can be used if they share the same OROM */
const struct imsm_orom *device_orom = get_orom_by_device_id(device->dev_id);
@@ -1761,6 +1766,57 @@ static int ahci_enumerate_ports(const char *hba_path, int port_count, int host_b
return err;
}
+/*
+ * Print every Intel NVMe device that is attached below a VMD domain.
+ *
+ * Walks /sys/bus/pci/drivers/nvme.  An entry is reported on stdout when
+ * its "subsystem" link ends in a component starting with "pci", its
+ * vendor id is 0x8086 and its resolved path is attached to hba->path.
+ *
+ * Returns 1 if hba is not a SYS_DEV_VMD device, 0 otherwise (including
+ * when the nvme driver directory cannot be opened).
+ */
+static int print_vmd_attached_devs(struct sys_dev *hba)
+{
+	struct dirent *ent;
+	DIR *dir;
+	char path[292];
+	char link[256];
+	char *c, *rp;
+
+	if (hba->type != SYS_DEV_VMD)
+		return 1;
+
+	/* scroll through the devices bound to the nvme driver looking
+	 * for ones attached to this hba
+	 */
+	dir = opendir("/sys/bus/pci/drivers/nvme");
+	if (!dir)
+		return 0;
+
+	while ((ent = readdir(dir)) != NULL) {
+		int n;
+
+		/* is 'ent' a device? check that the 'subsystem' link exists and
+		 * that its target matches 'bus'
+		 */
+		sprintf(path, "/sys/bus/pci/drivers/nvme/%s/subsystem",
+			ent->d_name);
+		n = readlink(path, link, sizeof(link));
+		if (n < 0 || n >= (int)sizeof(link))
+			continue;
+		link[n] = '\0';
+		c = strrchr(link, '/');
+		if (!c)
+			continue;
+		if (strncmp("pci", c+1, strlen("pci")) != 0)
+			continue;
+
+		sprintf(path, "/sys/bus/pci/drivers/nvme/%s", ent->d_name);
+		/* if not an Intel NVMe - skip it */
+		if (devpath_to_vendor(path) != 0x8086)
+			continue;
+
+		rp = realpath(path, NULL);
+		if (!rp)
+			continue;
+
+		if (path_attached_to_hba(rp, hba->path)) {
+			printf(" NVMe under VMD : %s\n", rp);
+		}
+		free(rp);
+	}
+
+	/* release the directory stream once the scan is done */
+	closedir(dir);
+
+	return 0;
+}
+
static void print_found_intel_controllers(struct sys_dev *elem)
{
for (; elem; elem = elem->next) {
@@ -1771,7 +1827,12 @@ static void print_found_intel_controllers(struct sys_dev *elem)
fprintf(stderr, "SAS ");
else if (elem->type == SYS_DEV_NVME)
fprintf(stderr, "NVMe ");
- fprintf(stderr, "RAID controller");
+
+ if (elem->type == SYS_DEV_VMD)
+ fprintf(stderr, "VMD domain");
+ else
+ fprintf(stderr, "RAID controller");
+
if (elem->pci_id)
fprintf(stderr, " at %s", elem->pci_id);
fprintf(stderr, ".\n");
@@ -1935,8 +1996,10 @@ static int detail_platform_imsm(int verbose, int enumerate_only, char *controlle
if (controller_path && (compare_paths(hba->path, controller_path) != 0))
continue;
if (!find_imsm_capability(hba)) {
+ char buf[PATH_MAX];
pr_err("imsm capabilities not found for controller: %s (type %s)\n",
- hba->path, get_sys_dev_type(hba->type));
+ hba->type == SYS_DEV_VMD ? vmd_domain_to_controller(hba, buf) : hba->path,
+ get_sys_dev_type(hba->type));
continue;
}
result = 0;
@@ -1951,13 +2014,27 @@ static int detail_platform_imsm(int verbose, int enumerate_only, char *controlle
const struct orom_entry *entry;
for (entry = orom_entries; entry; entry = entry->next) {
- print_imsm_capability(&entry->orom);
+ if (entry->type == SYS_DEV_VMD) {
+ for (hba = list; hba; hba = hba->next) {
+ if (hba->type == SYS_DEV_VMD) {
+ char buf[PATH_MAX];
+ print_imsm_capability(&entry->orom);
+ printf(" I/O Controller : %s (%s)\n",
+ vmd_domain_to_controller(hba, buf), get_sys_dev_type(hba->type));
+ print_vmd_attached_devs(hba);
+ printf("\n");
+ }
+ }
+ continue;
+ }
- if (imsm_orom_is_nvme(&entry->orom)) {
+ print_imsm_capability(&entry->orom);
+ if (entry->type == SYS_DEV_NVME) {
for (hba = list; hba; hba = hba->next) {
if (hba->type == SYS_DEV_NVME)
printf(" NVMe Device : %s\n", hba->path);
}
+ printf("\n");
continue;
}
@@ -2000,16 +2077,25 @@ static int export_detail_platform_imsm(int verbose, char *controller_path)
for (hba = list; hba; hba = hba->next) {
if (controller_path && (compare_paths(hba->path,controller_path) != 0))
continue;
- if (!find_imsm_capability(hba) && verbose > 0)
- pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n", hba->path);
+ if (!find_imsm_capability(hba) && verbose > 0) {
+ char buf[PATH_MAX];
+ pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n",
+ hba->type == SYS_DEV_VMD ? vmd_domain_to_controller(hba, buf) : hba->path);
+ }
else
result = 0;
}
const struct orom_entry *entry;
- for (entry = orom_entries; entry; entry = entry->next)
+ for (entry = orom_entries; entry; entry = entry->next) {
+ if (entry->type == SYS_DEV_VMD) {
+ for (hba = list; hba; hba = hba->next)
+ print_imsm_capability_export(&entry->orom);
+ continue;
+ }
print_imsm_capability_export(&entry->orom);
+ }
return result;
}
@@ -3862,12 +3948,14 @@ static int find_intel_hba_capability(int fd, struct intel_super *super, char *de
if (devname) {
struct intel_hba *hba = super->hba;
- pr_err("%s is attached to Intel(R) %s RAID controller (%s),\n"
- " but the container is assigned to Intel(R) %s RAID controller (",
+ pr_err("%s is attached to Intel(R) %s %s (%s),\n"
+ " but the container is assigned to Intel(R) %s %s (",
devname,
get_sys_dev_type(hba_name->type),
+ hba_name->type == SYS_DEV_VMD ? "domain" : "RAID controller",
hba_name->pci_id ? : "Err!",
- get_sys_dev_type(super->hba->type));
+ get_sys_dev_type(super->hba->type),
+ hba->type == SYS_DEV_VMD ? "domain" : "RAID controller");
while (hba) {
fprintf(stderr, "%s", hba->pci_id ? : "Err!");
@@ -3876,7 +3964,8 @@ static int find_intel_hba_capability(int fd, struct intel_super *super, char *de
hba = hba->next;
}
fprintf(stderr, ").\n"
- " Mixing devices attached to different controllers is not allowed.\n");
+ " Mixing devices attached to different %s is not allowed.\n",
+ hba_name->type == SYS_DEV_VMD ? "VMD domains" : "controllers");
}
return 2;
}
@@ -5878,7 +5967,6 @@ count_volumes(struct intel_hba *hba, int dpa, int verbose)
devid_list = entry->devid_list;
for (dv = devid_list; dv; dv = dv->next) {
-
struct md_list *devlist = NULL;
struct sys_dev *device = device_by_id(dv->devid);
char *hba_path;
@@ -5889,6 +5977,14 @@ count_volumes(struct intel_hba *hba, int dpa, int verbose)
else
return 0;
+ /* VMD has one orom entry for all domain, but spanning is not allowed.
+ * VMD arrays should be counted per domain (controller), so skip
+ * domains that are not the given one.
+ */
+ if ((hba->type == SYS_DEV_VMD) &&
+ (strncmp(device->path, hba->path, strlen(device->path)) != 0))
+ continue;
+
devlist = get_devices(hba_path);
/* if no intel devices return zero volumes */
if (devlist == NULL)
@@ -9150,7 +9246,7 @@ int validate_container_imsm(struct mdinfo *info)
return 1;
}
- if (orom != orom2) {
+ if ((orom != orom2) || ((hba->type == SYS_DEV_VMD) && (hba != hba2))) {
pr_err("WARNING - IMSM container assembled with disks under different HBAs!\n"
" This operation is not supported and can lead to data loss.\n");
return 1;
@@ -10277,7 +10373,7 @@ int wait_for_reshape_imsm(struct mdinfo *sra, int ndata)
if (sysfs_fd_get_ll(fd, &completed) < 0) {
dprintf("cannot read reshape_position (no reshape in progres)\n");
close(fd);
- return 0;
+ return 1;
}
if (completed > position_to_set) {
@@ -10297,11 +10393,14 @@ int wait_for_reshape_imsm(struct mdinfo *sra, int ndata)
do {
char action[20];
- sysfs_wait(fd, NULL);
+ int timeout = 3000;
+ sysfs_wait(fd, &timeout);
if (sysfs_get_str(sra, NULL, "sync_action",
action, 20) > 0 &&
- strncmp(action, "reshape", 7) != 0)
- break;
+ strncmp(action, "reshape", 7) != 0) {
+ close(fd);
+ return -1;
+ }
if (sysfs_fd_get_ll(fd, &completed) < 0) {
dprintf("cannot read reshape_position (in loop)\n");
close(fd);
@@ -10563,7 +10662,7 @@ static int imsm_manage_reshape(
sra->reshape_progress = next_step;
/* wait until reshape finish */
- if (wait_for_reshape_imsm(sra, ndata) < 0) {
+ if (wait_for_reshape_imsm(sra, ndata)) {
dprintf("wait_for_reshape_imsm returned error!\n");
goto abort;
}
@@ -10601,7 +10700,6 @@ static int imsm_manage_reshape(
ret_val = 1;
abort:
free(buf);
- abort_reshape(sra);
return ret_val;
}
diff --git a/super0.c b/super0.c
index deb5999..59a6a03 100644
--- a/super0.c
+++ b/super0.c
@@ -405,7 +405,8 @@ static void getinfo_super0(struct supertype *st, struct mdinfo *info, char *map)
info->array.utime = sb->utime;
info->array.chunk_size = sb->chunk_size;
info->array.state = sb->state;
- info->component_size = sb->size*2;
+ info->component_size = sb->size;
+ info->component_size *= 2;
if (sb->state & (1<<MD_SB_BITMAP_PRESENT))
info->bitmap_offset = 8;
@@ -900,7 +901,7 @@ static int write_init_super0(struct supertype *st)
rv = store_super0(st, di->fd);
if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT)))
- rv = st->ss->write_bitmap(st, di->fd);
+ rv = st->ss->write_bitmap(st, di->fd, NoUpdate);
if (rv)
pr_err("failed to write superblock to %s\n",
@@ -1155,16 +1156,16 @@ static int add_internal_bitmap0(struct supertype *st, int *chunkp,
return 1;
}
-static void locate_bitmap0(struct supertype *st, int fd)
+static int locate_bitmap0(struct supertype *st, int fd)
{
unsigned long long dsize;
unsigned long long offset;
if (!get_dev_size(fd, NULL, &dsize))
- return;
+ return -1;
if (dsize < MD_RESERVED_SECTORS*512)
- return;
+ return -1;
offset = MD_NEW_SIZE_SECTORS(dsize>>9);
@@ -1173,9 +1174,10 @@ static void locate_bitmap0(struct supertype *st, int fd)
offset += MD_SB_BYTES;
lseek64(fd, offset, 0);
+ return 0;
}
-static int write_bitmap0(struct supertype *st, int fd)
+static int write_bitmap0(struct supertype *st, int fd, enum bitmap_update update)
{
unsigned long long dsize;
unsigned long long offset;
diff --git a/super1.c b/super1.c
index f0508fe..8bcaa2f 100644
--- a/super1.c
+++ b/super1.c
@@ -1,7 +1,7 @@
/*
* mdadm - manage Linux "md" devices aka RAID arrays.
*
- * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com>
*
*
* This program is free software; you can redistribute it and/or modify
@@ -68,7 +68,10 @@ struct mdp_superblock_1 {
__u64 data_offset; /* sector start of data, often 0 */
__u64 data_size; /* sectors in this device that can be used for data */
__u64 super_offset; /* sector start of this superblock */
- __u64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
+ union {
+ __u64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
+ __u64 journal_tail;/* journal tail of journal device (from data_offset) */
+ };
__u32 dev_number; /* permanent identifier of this device - not role in raid */
__u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */
@@ -125,6 +128,8 @@ struct misc_dev_info {
* backwards anyway.
*/
#define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */
+#define MD_FEATURE_BITMAP_VERSIONED 256 /* bitmap version number checked properly */
+#define MD_FEATURE_JOURNAL 512 /* support write journal */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
@@ -132,8 +137,39 @@ struct misc_dev_info {
|MD_FEATURE_REPLACEMENT \
|MD_FEATURE_RESHAPE_BACKWARDS \
|MD_FEATURE_NEW_OFFSET \
+ |MD_FEATURE_BITMAP_VERSIONED \
+ |MD_FEATURE_JOURNAL \
)
+#ifndef MDASSEMBLE
+/*
+ * Return the role stored in sb->dev_roles[] for this device's
+ * dev_number, or MD_DISK_ROLE_SPARE when dev_number is not below
+ * max_dev.
+ */
+static int role_from_sb(struct mdp_superblock_1 *sb)
+{
+	unsigned int slot = __le32_to_cpu(sb->dev_number);
+
+	if (slot >= __le32_to_cpu(sb->max_dev))
+		return MD_DISK_ROLE_SPARE;
+
+	return __le16_to_cpu(sb->dev_roles[slot]);
+}
+#endif
+
+/*
+ * Return how many bytes are needed for the bitmap described by 'bms':
+ * one bit per (chunksize >> 9) units of sync_size, packed into bytes,
+ * plus the bitmap superblock itself, rounded up to 'boundary'.
+ * For cluster-md each node should have its own bitmap.
+ */
+static unsigned int calc_bitmap_size(bitmap_super_t *bms, unsigned int boundary)
+{
+	/* chunksize >> 9, i.e. the chunk size divided by 512 */
+	unsigned long long chunk_sectors = __le32_to_cpu(bms->chunksize) >> 9;
+	unsigned long long nbits = __le64_to_cpu(bms->sync_size) / chunk_sectors;
+	unsigned long long nbytes = ((nbits + 7) >> 3) + sizeof(bitmap_super_t);
+
+	nbytes = ROUND_UP(nbytes, boundary);
+
+	return nbytes;
+}
+
static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
{
unsigned int disk_csum, csum;
@@ -256,6 +292,7 @@ static int awrite(struct align_fd *afd, void *buf, int len)
static void examine_super1(struct supertype *st, char *homehost)
{
struct mdp_superblock_1 *sb = st->sb;
+ bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE);
time_t atime;
unsigned int d;
int role;
@@ -289,6 +326,8 @@ static void examine_super1(struct supertype *st, char *homehost)
strncmp(sb->set_name, homehost, l) == 0)
printf(" (local to host %s)", homehost);
printf("\n");
+ if (bms->nodes > 0 && (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET))
+ printf(" Cluster Name : %-64s\n", bms->cluster_name);
atime = __le64_to_cpu(sb->ctime) & 0xFFFFFFFFFFULL;
printf(" Creation Time : %.24s\n", ctime(&atime));
c=map_num(pers, __le32_to_cpu(sb->level));
@@ -446,25 +485,23 @@ static void examine_super1(struct supertype *st, char *homehost)
/* This turns out to just be confusing */
printf(" Array Slot : %d (", __le32_to_cpu(sb->dev_number));
for (i= __le32_to_cpu(sb->max_dev); i> 0 ; i--)
- if (__le16_to_cpu(sb->dev_roles[i-1]) != 0xffff)
+ if (__le16_to_cpu(sb->dev_roles[i-1]) != MD_DISK_ROLE_SPARE)
break;
for (d=0; d < i; d++) {
int role = __le16_to_cpu(sb->dev_roles[d]);
if (d) printf(", ");
- if (role == 0xffff) printf("empty");
- else if(role == 0xfffe) printf("failed");
+ if (role == MD_DISK_ROLE_SPARE) printf("empty");
+ else if(role == MD_DISK_ROLE_FAULTY) printf("failed");
else printf("%d", role);
}
printf(")\n");
#endif
printf(" Device Role : ");
- d = __le32_to_cpu(sb->dev_number);
- if (d < __le32_to_cpu(sb->max_dev))
- role = __le16_to_cpu(sb->dev_roles[d]);
- else
- role = 0xFFFF;
- if (role >= 0xFFFE)
+ role = role_from_sb(sb);
+ if (role >= MD_DISK_ROLE_FAULTY)
printf("spare\n");
+ else if (role == MD_DISK_ROLE_JOURNAL)
+ printf("Journal\n");
else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_REPLACEMENT))
printf("Replacement device %d\n", role);
else
@@ -493,7 +530,7 @@ static void examine_super1(struct supertype *st, char *homehost)
faulty = 0;
for (i=0; i< __le32_to_cpu(sb->max_dev); i++) {
int role = __le16_to_cpu(sb->dev_roles[i]);
- if (role == 0xFFFE)
+ if (role == MD_DISK_ROLE_FAULTY)
faulty++;
}
if (faulty) printf(" %d failed", faulty);
@@ -681,12 +718,8 @@ static int copy_metadata1(struct supertype *st, int from, int to)
/* have the header, can calculate
* correct bitmap bytes */
bitmap_super_t *bms;
- int bits;
bms = (void*)buf;
- bits = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9);
- bytes = (bits+7) >> 3;
- bytes += sizeof(bitmap_super_t);
- bytes = ROUND_UP(bytes, 512);
+ bytes = calc_bitmap_size(bms, 512);
if (n > bytes)
n = bytes;
}
@@ -740,6 +773,7 @@ err:
static void detail_super1(struct supertype *st, char *homehost)
{
struct mdp_superblock_1 *sb = st->sb;
+ bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
int i;
int l = homehost ? strlen(homehost) : 0;
@@ -748,6 +782,8 @@ static void detail_super1(struct supertype *st, char *homehost)
sb->set_name[l] == ':' &&
strncmp(sb->set_name, homehost, l) == 0)
printf(" (local to host %s)", homehost);
+ if (bms->nodes > 0 && (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET))
+ printf("\n Cluster Name : %-64s", bms->cluster_name);
printf("\n UUID : ");
for (i=0; i<16; i++) {
if ((i&3)==0 && i != 0) printf(":");
@@ -891,6 +927,8 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
info->array.state =
(__le64_to_cpu(sb->resync_offset) == MaxSector)
? 1 : 0;
+ if (__le32_to_cpu(bsb->nodes) > 1)
+ info->array.state |= (1 << MD_SB_CLUSTERED);
info->data_offset = __le64_to_cpu(sb->data_offset);
info->component_size = __le64_to_cpu(sb->size);
@@ -902,7 +940,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
info->disk.number = __le32_to_cpu(sb->dev_number);
if (__le32_to_cpu(sb->dev_number) >= __le32_to_cpu(sb->max_dev) ||
__le32_to_cpu(sb->dev_number) >= MAX_DEVS)
- role = 0xfffe;
+ role = MD_DISK_ROLE_FAULTY;
else
role = __le16_to_cpu(sb->dev_roles[__le32_to_cpu(sb->dev_number)]);
@@ -943,7 +981,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
size /= 512;
bmend += size;
if (bmend > earliest)
- bmend = earliest;
+ earliest = bmend;
}
if (sb->bblog_offset && sb->bblog_size) {
unsigned long long bbend = super_offset;
@@ -969,12 +1007,17 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
info->disk.raid_disk = -1;
switch(role) {
- case 0xFFFF:
+ case MD_DISK_ROLE_SPARE:
info->disk.state = 0; /* spare: not active, not sync, not faulty */
break;
- case 0xFFFE:
+ case MD_DISK_ROLE_FAULTY:
info->disk.state = 1; /* faulty */
break;
+ case MD_DISK_ROLE_JOURNAL:
+ info->disk.state = (1 << MD_DISK_JOURNAL);
+ info->disk.raid_disk = role;
+ info->space_after = (misc->device_size - info->data_offset) % 8; /* journal uses all 4kB blocks*/
+ break;
default:
info->disk.state = 6; /* active and in sync */
info->disk.raid_disk = role;
@@ -1022,7 +1065,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
map[i] = 0;
for (i = 0; i < __le32_to_cpu(sb->max_dev); i++) {
role = __le16_to_cpu(sb->dev_roles[i]);
- if (/*role == 0xFFFF || */role < (unsigned) info->array.raid_disks) {
+ if (/*role == MD_DISK_ROLE_SPARE || */role < (unsigned) info->array.raid_disks) {
working++;
if (map && role < map_disks)
map[role] = 1;
@@ -1030,6 +1073,9 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
}
info->array.working_disks = working;
+ if (sb->feature_map & __cpu_to_le32(MD_FEATURE_JOURNAL)) /* flag converted to on-disk (little-endian) order */
+ info->journal_device_required = 1;
+ info->journal_clean = 0;
}
static struct mdinfo *container_content1(struct supertype *st, char *subarray)
@@ -1054,7 +1100,18 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
* ignored.
*/
int rv = 0;
+ int lockid;
struct mdp_superblock_1 *sb = st->sb;
+ bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
+
+ if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) {
+ rv = cluster_get_dlmlock(&lockid);
+ if (rv) {
+ pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv);
+ cluster_release_dlmlock(lockid);
+ return rv;
+ }
+ }
if (strcmp(update, "homehost") == 0 &&
homehost) {
@@ -1094,8 +1151,10 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
int want;
if (info->disk.state & (1<<MD_DISK_ACTIVE))
want = info->disk.raid_disk;
+ else if (info->disk.state & (1<<MD_DISK_JOURNAL))
+ want = MD_DISK_ROLE_JOURNAL;
else
- want = 0xFFFF;
+ want = MD_DISK_ROLE_SPARE;
if (sb->dev_roles[d] != __cpu_to_le16(want)) {
sb->dev_roles[d] = __cpu_to_le16(want);
rv = 1;
@@ -1120,7 +1179,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
unsigned int max = __le32_to_cpu(sb->max_dev);
for (i=0 ; i < max ; i++)
- if (__le16_to_cpu(sb->dev_roles[i]) >= 0xfffe)
+ if (__le16_to_cpu(sb->dev_roles[i]) >= MD_DISK_ROLE_FAULTY)
break;
sb->dev_number = __cpu_to_le32(i);
info->disk.number = i;
@@ -1225,6 +1284,11 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
sb->bblog_shift = 0;
sb->bblog_offset = 0;
}
+ } else if (strcmp(update, "force-no-bbl") == 0) {
+ sb->feature_map &= ~ __cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
+ sb->bblog_size = 0;
+ sb->bblog_shift = 0;
+ sb->bblog_offset = 0;
} else if (strcmp(update, "name") == 0) {
if (info->name[0] == 0)
sprintf(info->name, "%d", info->array.md_minor);
@@ -1245,7 +1309,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
(st->sb + MAX_SB_SIZE + BM_SUPER_SIZE);
sb->data_size = __cpu_to_le64(
misc->device_size - __le64_to_cpu(sb->data_offset));
- } else if (strcmp(update, "revert-reshape") == 0) {
+ } else if (strncmp(update, "revert-reshape", 14) == 0) {
rv = -2;
if (!(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE)))
pr_err("No active reshape to revert on %s\n",
@@ -1255,6 +1319,24 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
unsigned long long reshape_sectors;
long reshape_chunk;
rv = 0;
+ /* If the reshape hasn't started, just stop it.
+ * It is conceivable that a stripe was modified but
+ * the metadata not updated. In that case the backup
+ * should have been used to get passed the critical stage.
+ * If that couldn't happen, the "-nobackup" version
+ * will be used.
+ */
+ if (strcmp(update, "revert-reshape-nobackup") == 0 &&
+ sb->reshape_position == 0 &&
+ ((int32_t)__le32_to_cpu(sb->delta_disks) > 0 || /* signed compare: a negative delta (shrink) must not match */
+ (__le32_to_cpu(sb->delta_disks) == 0 &&
+ !(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS))))) {
+ sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
+ sb->raid_disks = __cpu_to_le32(__le32_to_cpu(sb->raid_disks) -
+ __le32_to_cpu(sb->delta_disks));
+ sb->delta_disks = 0;
+ goto done;
+ }
/* reshape_position is a little messy.
* Its value must be a multiple of the larger
* chunk size, and of the "after" data disks.
@@ -1301,6 +1383,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
sb->new_offset = __cpu_to_le32(-offset_delta);
sb->data_size = __cpu_to_le64(__le64_to_cpu(sb->data_size) - offset_delta);
}
+ done:;
}
} else if (strcmp(update, "_reshape_progress")==0)
sb->reshape_position = __cpu_to_le64(info->reshape_progress);
@@ -1312,6 +1395,9 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
rv = -1;
sb->sb_csum = calc_sb_1_csum(sb);
+ if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready())
+ cluster_release_dlmlock(lockid);
+
return rv;
}
@@ -1415,13 +1501,26 @@ static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
struct mdp_superblock_1 *sb = st->sb;
__u16 *rp = sb->dev_roles + dk->number;
struct devinfo *di, **dip;
+ bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
+ int rv, lockid;
+
+ if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) {
+ rv = cluster_get_dlmlock(&lockid);
+ if (rv) {
+ pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv);
+ cluster_release_dlmlock(lockid);
+ return rv;
+ }
+ }
if ((dk->state & 6) == 6) /* active, sync */
*rp = __cpu_to_le16(dk->raid_disk);
+ else if (dk->state & (1<<MD_DISK_JOURNAL))
+ *rp = __cpu_to_le16(MD_DISK_ROLE_JOURNAL); /* dev_roles[] is little-endian on disk */
else if ((dk->state & ~2) == 0) /* active or idle -> spare */
- *rp = 0xffff;
+ *rp = __cpu_to_le16(MD_DISK_ROLE_SPARE);
else
- *rp = 0xfffe;
+ *rp = __cpu_to_le16(MD_DISK_ROLE_FAULTY);
if (dk->number >= (int)__le32_to_cpu(sb->max_dev) &&
__le32_to_cpu(sb->max_dev) < MAX_DEVS)
@@ -1442,11 +1541,14 @@ static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
di->next = NULL;
*dip = di;
+ if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready())
+ cluster_release_dlmlock(lockid);
+
return 0;
}
#endif
-static void locate_bitmap1(struct supertype *st, int fd);
+static int locate_bitmap1(struct supertype *st, int fd);
static int store_super1(struct supertype *st, int fd)
{
@@ -1455,6 +1557,17 @@ static int store_super1(struct supertype *st, int fd)
struct align_fd afd;
int sbsize;
unsigned long long dsize;
+ bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
+ int rv, lockid;
+
+ if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) {
+ rv = cluster_get_dlmlock(&lockid);
+ if (rv) {
+ pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv);
+ cluster_release_dlmlock(lockid);
+ return rv;
+ }
+ }
if (!get_dev_size(fd, NULL, &dsize))
return 1;
@@ -1515,6 +1628,9 @@ static int store_super1(struct supertype *st, int fd)
}
}
fsync(fd);
+ if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready())
+ cluster_release_dlmlock(lockid);
+
return 0;
}
@@ -1537,7 +1653,55 @@ static unsigned long choose_bm_space(unsigned long devsize)
static void free_super1(struct supertype *st);
+#define META_BLOCK_SIZE 4096
+__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len);
+
#ifndef MDASSEMBLE
+/*
+ * Write one zero-filled r5l meta block (header fields and checksum only)
+ * at sb->data_offset of the device open on fd.
+ * Returns 0 on success, 1 if allocation, seek or write fails.
+ */
+static int write_empty_r5l_meta_block(struct supertype *st, int fd)
+{
+ struct r5l_meta_block *mb;
+ struct mdp_superblock_1 *sb = st->sb;
+ struct align_fd afd;
+ __u32 crc;
+
+ init_afd(&afd, fd);
+
+ if (posix_memalign((void**)&mb, 4096, META_BLOCK_SIZE) != 0) {
+ pr_err("Could not allocate memory for the meta block.\n");
+ return 1;
+ }
+
+ memset(mb, 0, META_BLOCK_SIZE);
+
+ mb->magic = __cpu_to_le32(R5LOG_MAGIC);
+ mb->version = R5LOG_VERSION;
+ mb->meta_size = __cpu_to_le32(sizeof(struct r5l_meta_block));
+ mb->seq = __cpu_to_le64(random32());
+ mb->position = __cpu_to_le64(0);
+
+ /* checksum is seeded with the array uuid and computed over the whole
+ * block while the checksum field itself is still zero */
+ crc = crc32c_le(0xffffffff, sb->set_uuid, sizeof(sb->set_uuid));
+ crc = crc32c_le(crc, (void *)mb, META_BLOCK_SIZE);
+ /* store little-endian, like the other multi-byte fields above */
+ mb->checksum = __cpu_to_le32(crc);
+
+ /* data_offset is kept little-endian in the superblock; convert
+ * before turning it into a byte offset */
+ if (lseek64(fd, __le64_to_cpu(sb->data_offset) * 512, 0) < 0LL) {
+ pr_err("cannot seek to offset of the meta block\n");
+ goto fail_to_write;
+ }
+
+ if (awrite(&afd, mb, META_BLOCK_SIZE) != META_BLOCK_SIZE) {
+ pr_err("failed to store write the meta block \n");
+ goto fail_to_write;
+ }
+ fsync(fd);
+
+ free(mb);
+ return 0;
+
+fail_to_write:
+ free(mb);
+ return 1;
+}
+
static int write_init_super1(struct supertype *st)
{
struct mdp_superblock_1 *sb = st->sb;
@@ -1551,6 +1715,11 @@ static int write_init_super1(struct supertype *st)
unsigned long long data_offset;
for (di = st->info; di; di = di->next) {
+ if (di->disk.state & (1 << MD_DISK_JOURNAL))
+ sb->feature_map |= __cpu_to_le32(MD_FEATURE_JOURNAL); /* feature_map is little-endian on disk */
+ }
+
+ for (di = st->info; di; di = di->next) {
if (di->disk.state & (1 << MD_DISK_FAULTY))
continue;
if (di->fd < 0)
@@ -1573,7 +1742,8 @@ static int write_init_super1(struct supertype *st)
if (rfd >= 0)
close(rfd);
- sb->events = 0;
+ if (!(di->disk.state & (1<<MD_DISK_JOURNAL)))
+ sb->events = 0;
refst = dup_super(st);
if (load_super1(refst, di->fd, NULL)==0) {
@@ -1681,15 +1851,23 @@ static int write_init_super1(struct supertype *st)
rv = -EINVAL;
goto out;
}
- if (conf_get_create_info()->bblist == 0) {
+ /* Disable badblock log on clusters, or when explicitly requested */
+ if (st->nodes > 0 || conf_get_create_info()->bblist == 0) {
sb->bblog_size = 0;
sb->bblog_offset = 0;
}
sb->sb_csum = calc_sb_1_csum(sb);
rv = store_super1(st, di->fd);
+
+ if (rv == 0 && (di->disk.state & (1 << MD_DISK_JOURNAL))) {
+ rv = write_empty_r5l_meta_block(st, di->fd);
+ if (rv)
+ goto error_out;
+ }
+
if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
- rv = st->ss->write_bitmap(st, di->fd);
+ rv = st->ss->write_bitmap(st, di->fd, NoUpdate);
close(di->fd);
di->fd = -1;
if (rv)
@@ -2054,7 +2232,7 @@ add_internal_bitmap1(struct supertype *st,
bbl_size = -bbl_offset;
if (!may_change || (room < 3*2 &&
- __le32_to_cpu(sb->max_dev) <= 384)) {
+ __le32_to_cpu(sb->max_dev) <= 384)) {
room = 3*2;
offset = 1*2;
bbl_size = 0;
@@ -2144,32 +2322,45 @@ add_internal_bitmap1(struct supertype *st,
bms->daemon_sleep = __cpu_to_le32(delay);
bms->sync_size = __cpu_to_le64(size);
bms->write_behind = __cpu_to_le32(write_behind);
+ bms->nodes = __cpu_to_le32(st->nodes);
+ if (st->nodes)
+ sb->feature_map = __cpu_to_le32(__le32_to_cpu(sb->feature_map)
+ | MD_FEATURE_BITMAP_VERSIONED);
+ if (st->cluster_name)
+ strncpy((char *)bms->cluster_name, /* bounded by the destination field, not by the source length */
+ st->cluster_name, sizeof(bms->cluster_name));
*chunkp = chunk;
return 1;
}
-static void locate_bitmap1(struct supertype *st, int fd)
+static int locate_bitmap1(struct supertype *st, int fd)
{
unsigned long long offset;
struct mdp_superblock_1 *sb;
int mustfree = 0;
+ int ret;
if (!st->sb) {
if (st->ss->load_super(st, fd, NULL))
- return; /* no error I hope... */
+ return -1; /* no error I hope... */
mustfree = 1;
}
sb = st->sb;
+ if ((__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET))
+ ret = 0;
+ else
+ ret = -1;
offset = __le64_to_cpu(sb->super_offset);
offset += (int32_t) __le32_to_cpu(sb->bitmap_offset);
if (mustfree)
free(sb);
lseek64(fd, offset<<9, 0);
+ return ret;
}
-static int write_bitmap1(struct supertype *st, int fd)
+static int write_bitmap1(struct supertype *st, int fd, enum bitmap_update update)
{
struct mdp_superblock_1 *sb = st->sb;
bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE);
@@ -2177,6 +2368,43 @@ static int write_bitmap1(struct supertype *st, int fd)
void *buf;
int towrite, n;
struct align_fd afd;
+ unsigned int i = 0;
+ unsigned long long total_bm_space, bm_space_per_node;
+
+ switch (update) {
+ case NameUpdate:
+ /* update cluster name */
+ if (st->cluster_name) {
+ memset((char *)bms->cluster_name, 0, sizeof(bms->cluster_name));
+ strncpy((char *)bms->cluster_name, st->cluster_name, 64);
+ }
+ break;
+ case NodeNumUpdate:
+ /* cluster md only supports superblock 1.2 now */
+ if (st->minor_version != 2) {
+ pr_err("Warning: cluster md only works with superblock 1.2\n");
+ return -EINVAL;
+ }
+
+ /* Each node has an independent bitmap, it is necessary to calculate the
+ * space is enough or not, first get how many bytes for the total bitmap */
+ bm_space_per_node = calc_bitmap_size(bms, 4096);
+
+ total_bm_space = 512 * (__le64_to_cpu(sb->data_offset) - __le64_to_cpu(sb->super_offset));
+ total_bm_space = total_bm_space - 4096; /* leave another 4k for superblock */
+
+ if (bm_space_per_node * st->nodes > total_bm_space) {
+ pr_err("Warning: The max num of nodes can't exceed %llu\n",
+ total_bm_space / bm_space_per_node);
+ return -ENOMEM;
+ }
+
+ bms->nodes = __cpu_to_le32(st->nodes);
+ break;
+ case NoUpdate:
+ default:
+ break;
+ }
init_afd(&afd, fd);
@@ -2185,27 +2413,37 @@ static int write_bitmap1(struct supertype *st, int fd)
if (posix_memalign(&buf, 4096, 4096))
return -ENOMEM;
- memset(buf, 0xff, 4096);
- memcpy(buf, (char *)bms, sizeof(bitmap_super_t));
-
- towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9);
- towrite = (towrite+7) >> 3; /* bits to bytes */
- towrite += sizeof(bitmap_super_t);
- towrite = ROUND_UP(towrite, 512);
- while (towrite > 0) {
- n = towrite;
- if (n > 4096)
- n = 4096;
- n = awrite(&afd, buf, n);
- if (n > 0)
- towrite -= n;
+ do {
+ /* Only the bitmap[0] should resync
+ * whole device on initial assembly
+ */
+ if (i)
+ memset(buf, 0x00, 4096);
else
+ memset(buf, 0xff, 4096);
+ memcpy(buf, (char *)bms, sizeof(bitmap_super_t));
+
+ towrite = calc_bitmap_size(bms, 4096);
+ while (towrite > 0) {
+ n = towrite;
+ if (n > 4096)
+ n = 4096;
+ n = awrite(&afd, buf, n);
+ if (n > 0)
+ towrite -= n;
+ else
+ break;
+ if (i)
+ memset(buf, 0x00, 4096);
+ else
+ memset(buf, 0xff, 4096);
+ }
+ fsync(fd);
+ if (towrite) {
+ rv = -2;
break;
- memset(buf, 0xff, 4096);
- }
- fsync(fd);
- if (towrite)
- rv = -2;
+ }
+ } while (++i < __le32_to_cpu(bms->nodes));
free(buf);
return rv;
@@ -2213,6 +2451,7 @@ static int write_bitmap1(struct supertype *st, int fd)
static void free_super1(struct supertype *st)
{
+
if (st->sb)
free(st->sb);
while (st->info) {
@@ -2370,7 +2609,7 @@ void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0
for (i = 0; i < MD_SB_DISKS; i++) {
int state = sb0->disks[i].state;
- sb->dev_roles[i] = 0xFFFF;
+ sb->dev_roles[i] = MD_DISK_ROLE_SPARE;
if ((state & (1<<MD_DISK_SYNC)) &&
!(state & (1<<MD_DISK_FAULTY)))
sb->dev_roles[i] = __cpu_to_le16(sb0->disks[i].raid_disk);
diff --git a/sysfs.c b/sysfs.c
index 7268470..2600343 100644
--- a/sysfs.c
+++ b/sysfs.c
@@ -74,6 +74,12 @@ int sysfs_open(char *devnm, char *devname, char *attr)
return fd;
}
+/* Set mdi->sys_name to "dev-<name>", where <name> is the string
+ * devid2kname() returns for devid.  Output is truncated to
+ * sizeof(mdi->sys_name).
+ * NOTE(review): the devid2kname() result is handed to "%s" unchecked;
+ * confirm it cannot be NULL.
+ */
+void sysfs_init_dev(struct mdinfo *mdi, unsigned long devid)
+{
+ snprintf(mdi->sys_name,
+ sizeof(mdi->sys_name), "dev-%s", devid2kname(devid));
+}
+
void sysfs_init(struct mdinfo *mdi, int fd, char *devnm)
{
mdi->sys_name[0] = 0;
@@ -224,6 +230,13 @@ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options)
goto abort;
}
+ if (options & GET_ARRAY_STATE) {
+ strcpy(base, "array_state");
+ if (load_sys(fname, sra->sysfs_array_state))
+ goto abort;
+ } else
+ sra->sysfs_array_state[0] = 0;
+
if (! (options & GET_DEVS))
return sra;
diff --git a/systemd/mdadm-last-resort@.service b/systemd/mdadm-last-resort@.service
index 5179f19..e93d72b 100644
--- a/systemd/mdadm-last-resort@.service
+++ b/systemd/mdadm-last-resort@.service
@@ -1,6 +1,7 @@
[Unit]
Description=Activate md array even though degraded
DefaultDependencies=no
+Conflicts=sys-devices-virtual-block-%i.device
[Service]
Type=oneshot
diff --git a/systemd/mdmonitor.service b/systemd/mdmonitor.service
index 9aff2f5..c7cff3e 100644
--- a/systemd/mdmonitor.service
+++ b/systemd/mdmonitor.service
@@ -10,4 +10,7 @@ Description=MD array monitor
DefaultDependencies=no
[Service]
-ExecStart=BINDIR/mdadm --monitor --scan
+Environment= MDADM_MONITOR_ARGS=--scan
+EnvironmentFile=-/run/sysconfig/mdadm
+ExecStartPre=-/usr/lib/systemd/scripts/mdadm_env.sh
+ExecStart=BINDIR/mdadm --monitor $MDADM_MONITOR_ARGS
diff --git a/test b/test
index d0a6cb8..13f1bda 100755
--- a/test
+++ b/test
@@ -246,6 +246,15 @@ check() {
fi
;;
+ readonly )
+ grep -s "read-only" > /dev/null /proc/mdstat || {
+ echo >&2 "ERROR array is not read-only!"; cat /proc/mdstat ; exit 1; }
+ ;;
+
+ inactive )
+ grep -s "inactive" > /dev/null /proc/mdstat || {
+ echo >&2 "ERROR array is not inactive!"; cat /proc/mdstat ; exit 1; }
+ ;;
* ) echo >&2 ERROR unknown check $1 ; exit 1;
esac
}
diff --git a/tests/19raid6auto-repair b/tests/19raid6auto-repair
index 7fb1c72..ce4a7c0 100644
--- a/tests/19raid6auto-repair
+++ b/tests/19raid6auto-repair
@@ -10,32 +10,40 @@ data_offset_in_kib=$[2048/2]
# make a raid5 from a file
dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib
-mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs
-dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib
-blockdev --flushbufs $md0; sync
-check wait
-blockdev --flushbufs $devs; sync
-echo 3 > /proc/sys/vm/drop_caches
-cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; }
-
-# wipe out 5 chunks on each device
-dd if=/dev/urandom of=$dev0 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*0]
-dd if=/dev/urandom of=$dev1 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*5]
-dd if=/dev/urandom of=$dev2 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*10]
-dd if=/dev/urandom of=$dev3 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*15]
-dd if=/dev/urandom of=$dev4 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*20]
-
-blockdev --flushbufs $devs; sync
-echo 3 > /proc/sys/vm/drop_caches
-
-$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; }
-
-$dir/raid6check $md0 0 0 autorepair > /dev/null || { echo repair failed; exit 2; }
-blockdev --flushbufs $md0 $devs; sync
-echo 3 > /proc/sys/vm/drop_caches
-
-$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; }
-cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
-
-mdadm -S $md0
-udevadm settle
+
+# perform test for every layout
+layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \
+ left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \
+ right-symmetric-6 parity-first-6"
+
+for layout in $layouts
+do
+ mdadm -CR $md0 -l6 --layout=$layout -n$number_of_disks -c $chunksize_in_kib $devs
+ dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib
+ blockdev --flushbufs $md0; sync
+ check wait
+ blockdev --flushbufs $devs; sync
+ echo 3 > /proc/sys/vm/drop_caches
+ cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; }
+
+ # wipe out 5 chunks on each device
+ dd if=/dev/urandom of=$dev0 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*0]
+ dd if=/dev/urandom of=$dev1 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*5]
+ dd if=/dev/urandom of=$dev2 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*10]
+ dd if=/dev/urandom of=$dev3 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*15]
+ dd if=/dev/urandom of=$dev4 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*20]
+
+ blockdev --flushbufs $devs; sync
+ echo 3 > /proc/sys/vm/drop_caches
+
+ $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; }
+
+ $dir/raid6check $md0 0 0 autorepair > /dev/null || { echo repair failed; exit 2; }
+ blockdev --flushbufs $md0 $devs; sync
+ echo 3 > /proc/sys/vm/drop_caches
+
+ $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; }
+ cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
+
+ mdadm -S $md0
+done
diff --git a/tests/19raid6repair b/tests/19raid6repair
index 1159bd3..26846cc 100644
--- a/tests/19raid6repair
+++ b/tests/19raid6repair
@@ -8,40 +8,49 @@ devs="$dev1 $dev2 $dev3 $dev4"
# default 2048 sectors
data_offset_in_kib=$[2048/2]
-for failure in "$dev3 3 3 2" "$dev3 3 2 3" "$dev3 3 2 1" "$dev3 3 2 0" "$dev4 3 3 0" "$dev4 3 3 1" "$dev4 3 3 2" \
- "$dev1 3 0 1" "$dev1 3 0 2" "$dev1 3 0 3" "$dev2 3 1 0" "$dev2 3 1 2" "$dev2 3 1 3" ; do
- failure_split=( $failure )
- device_with_error=${failure_split[0]}
- stripe_with_error=${failure_split[1]}
- repair_params="$stripe_with_error ${failure_split[2]} ${failure_split[3]}"
- start_of_errors_in_kib=$[data_offset_in_kib+chunksize_in_kib*stripe_with_error]
-
- # make a raid5 from a file
- dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib
- mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs
- dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib
- blockdev --flushbufs $md0; sync
-
- check wait
- blockdev --flushbufs $devs; sync
- echo 3 > /proc/sys/vm/drop_caches
- cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; }
-
- dd if=/dev/urandom of=$device_with_error bs=1024 count=$chunksize_in_kib seek=$start_of_errors_in_kib
- blockdev --flushbufs $device_with_error; sync
- echo 3 > /proc/sys/vm/drop_caches
-
- $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; }
-
- $dir/raid6check $md0 repair $repair_params > /dev/null || { echo repair failed; exit 2; }
- blockdev --flushbufs $md0 $devs; sync
- echo 3 > /proc/sys/vm/drop_caches
-
- $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; }
- cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
-
- mdadm -S $md0
- udevadm settle
- sync
- echo 3 > /proc/sys/vm/drop_caches
+layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \
+ left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \
+ right-symmetric-6 parity-first-6"
+
+for layout in $layouts
+do
+ for failure in "$dev3 3 3 2" "$dev3 3 2 3" "$dev3 3 2 1" "$dev3 3 2 0" \
+ "$dev4 3 3 0" "$dev4 3 3 1" "$dev4 3 3 2" \
+ "$dev1 3 0 1" "$dev1 3 0 2" "$dev1 3 0 3" \
+ "$dev2 3 1 0" "$dev2 3 1 2" "$dev2 3 1 3" ; do
+ failure_split=( $failure )
+ device_with_error=${failure_split[0]}
+ stripe_with_error=${failure_split[1]}
+ repair_params="$stripe_with_error ${failure_split[2]} ${failure_split[3]}"
+ start_of_errors_in_kib=$[data_offset_in_kib+chunksize_in_kib*stripe_with_error]
+
+ # make a raid5 from a file
+ dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib
+ mdadm -CR $md0 -l6 --layout=$layout -n$number_of_disks -c $chunksize_in_kib $devs
+ dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib
+ blockdev --flushbufs $md0; sync
+
+ check wait
+ blockdev --flushbufs $devs; sync
+ echo 3 > /proc/sys/vm/drop_caches
+ cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; }
+
+ dd if=/dev/urandom of=$device_with_error bs=1024 count=$chunksize_in_kib seek=$start_of_errors_in_kib
+ blockdev --flushbufs $device_with_error; sync
+ echo 3 > /proc/sys/vm/drop_caches
+
+ $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; }
+
+ $dir/raid6check $md0 repair $repair_params > /dev/null || { echo repair failed; exit 2; }
+ blockdev --flushbufs $md0 $devs; sync
+ echo 3 > /proc/sys/vm/drop_caches
+
+ $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; }
+ cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
+
+ mdadm -S $md0
+ udevadm settle
+ sync
+ echo 3 > /proc/sys/vm/drop_caches
+ done
done
diff --git a/tests/20raid5journal b/tests/20raid5journal
new file mode 100644
index 0000000..f751ace
--- /dev/null
+++ b/tests/20raid5journal
@@ -0,0 +1,64 @@
+# check write journal of raid456
+
+# test --detail
+test_detail_shows_journal() {
+ # fail (exit 1) when "journal" is absent from the --detail output of $1
+ mdadm -D $1 | grep journal || {
+ echo >&2 "ERROR --detail does not show journal device!"; mdadm -D $1 ; exit 1; }
+}
+
+# test --examine
+test_examine_shows_journal() {
+ # fail (exit 1) when "Journal" is absent from the --examine output of $1
+ mdadm -E $1 | grep Journal || {
+ echo >&2 "ERROR --examine does not show Journal device!"; mdadm -E $1 ; exit 1; }
+}
+
+# test --create
+create_with_journal_and_stop() {
+ # create a 4-disk raid5 with $dev4 as write journal, write some data,
+ # verify parity and the journal reporting, then stop the array
+ mdadm -CR $md0 -l5 -n4 $dev0 $dev1 $dev2 $dev3 --write-journal $dev4
+ check wait
+ tar cf - /etc > $md0
+ # use $dir like the other raid6check tests do
+ $dir/raid6check $md0 0 0 | grep 'Error detected' && exit 1
+ test_detail_shows_journal $md0
+ test_examine_shows_journal $dev4
+ mdadm -S $md0
+}
+
+# test --assemble
+test_assemble() {
+ create_with_journal_and_stop
+ if mdadm -A $md0 $dev0 $dev1 $dev2 $dev3
+ then
+ echo >&2 "ERROR should return 1 when journal is missing!"; cat /proc/mdstat ; exit 1;
+ fi
+ mdadm -S $md0
+
+ mdadm -A $md0 $dev0 $dev1 $dev2 $dev3 --force
+ check readonly
+ mdadm -S $md0
+}
+
+# test --incremental
+test_incremental() {
+ create_with_journal_and_stop
+ for d in $dev0 $dev1 $dev2 $dev3
+ do
+ mdadm -I $d
+ done
+ check inactive
+ mdadm -I $dev4
+ check raid5
+ mdadm -S $md0
+
+ # test --incremental with journal missing
+ for d in $dev0 $dev1 $dev2 $dev3
+ do
+ mdadm -I $d
+ done
+ mdadm -R $md0
+ check readonly
+ mdadm -S $md0
+}
+
+create_with_journal_and_stop
+test_assemble
+test_incremental
diff --git a/udev-md-raid-arrays.rules b/udev-md-raid-arrays.rules
index a32b6d2..c95ec7b 100644
--- a/udev-md-raid-arrays.rules
+++ b/udev-md-raid-arrays.rules
@@ -17,7 +17,7 @@ TEST!="md/array_state", ENV{SYSTEMD_READY}="0", GOTO="md_end"
ATTR{md/array_state}=="|clear|inactive", ENV{SYSTEMD_READY}="0", GOTO="md_end"
LABEL="md_ignore_state"
-IMPORT{program}="BINDIR/mdadm --detail --export $tempnode"
+IMPORT{program}="BINDIR/mdadm --detail --export $devnode"
ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace"
ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}"
ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}"
@@ -26,14 +26,16 @@ ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env
ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n"
ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n"
-IMPORT{program}="/sbin/blkid -o udev -p -u noraid $tempnode"
+IMPORT{builtin}="blkid"
+OPTIONS+="link_priority=100"
+OPTIONS+="watch"
ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}"
ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}"
ENV{MD_LEVEL}=="raid[1-9]*", ENV{SYSTEMD_WANTS}+="mdmonitor.service"
# Tell systemd to run mdmon for our container, if we need it.
-ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c"
+ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/usr/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c"
ENV{MD_MON_THIS}=="?*", PROGRAM="/usr/bin/basename $env{MD_MON_THIS}", ENV{SYSTEMD_WANTS}+="mdmon@%c.service"
LABEL="md_end"
diff --git a/udev-md-raid-assembly.rules b/udev-md-raid-assembly.rules
index 5bde607..d0d440a 100644
--- a/udev-md-raid-assembly.rules
+++ b/udev-md-raid-assembly.rules
@@ -25,12 +25,9 @@ GOTO="md_inc_end"
LABEL="md_inc"
-# Disable incremental assembly to fix Debian bug #784070
-GOTO="md_inc_end"
-
# remember you can limit what gets auto/incrementally assembled by
# mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY'
-ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $tempnode --offroot ${DEVLINKS}"
+ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $devnode --offroot ${DEVLINKS}"
ACTION=="add|change", ENV{MD_STARTED}=="*unsafe*", ENV{MD_FOREIGN}=="no", ENV{SYSTEMD_WANTS}+="mdadm-last-resort@$env{MD_DEVICE}.timer"
ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="BINDIR/mdadm -If $name --path $env{ID_PATH}"
ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="BINDIR/mdadm -If $name"
diff --git a/util.c b/util.c
index cc98d3b..970d484 100644
--- a/util.c
+++ b/util.c
@@ -24,6 +24,7 @@
#include "mdadm.h"
#include "md_p.h"
+#include <sys/poll.h>
#include <sys/socket.h>
#include <sys/utsname.h>
#include <sys/wait.h>
@@ -34,6 +35,8 @@
#include <ctype.h>
#include <dirent.h>
#include <signal.h>
+#include <dlfcn.h>
+
/*
* following taken from linux/blkpg.h because they aren't
@@ -79,6 +82,143 @@ struct blkpg_partition {
aren't permitted). */
#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+/* Non-zero once the dlm hooks are usable; nothing in this part of the
+ * file sets it (presumably done by set_dlm_hooks() -- confirm). */
+static int is_dlm_hooks_ready = 0;
+
+/* Returns 1 if is_dlm_hooks_ready is non-zero, otherwise 0. */
+int dlm_funs_ready(void)
+{
+ return is_dlm_hooks_ready ? 1 : 0;
+}
+
+#ifndef MDASSEMBLE
+static struct dlm_hooks *dlm_hooks = NULL;
+struct dlm_lock_resource *dlm_lock_res = NULL;
+static int ast_called = 0;
+
+struct dlm_lock_resource {
+ dlm_lshandle_t *ls;
+ struct dlm_lksb lksb;
+};
+
+/* Using poll(2) to wait for and dispatch ASTs.
+ *
+ * Loops until dlm_ast() has set ast_called, calling the dispatch hook on
+ * the lockspace fd on every iteration, then clears the flag again.
+ * Returns 0 on completion, -1 if poll() itself fails.
+ * NOTE(review): poll() is given a timeout of 0, so it never blocks; any
+ * waiting presumably happens inside the dispatch hook.  If that hook
+ * does not block this loop spins -- confirm.
+ */
+static int poll_for_ast(dlm_lshandle_t ls)
+{
+ struct pollfd pfd;
+
+ pfd.fd = dlm_hooks->ls_get_fd(ls);
+ pfd.events = POLLIN;
+
+ while (!ast_called)
+ {
+ if (poll(&pfd, 1, 0) < 0)
+ {
+ perror("poll");
+ return -1;
+ }
+ dlm_hooks->dispatch(dlm_hooks->ls_get_fd(ls));
+ }
+ ast_called = 0;
+
+ return 0;
+}
+
+/* Completion callback: sets ast_called, the flag poll_for_ast() waits on.
+ * arg is not used. */
+static void dlm_ast(void *arg)
+{
+ ast_called = 1;
+}
+
+static char *cluster_name = NULL;
+/* Create the lockspace, take bitmapXXX locks on all the bitmaps.
+ *
+ * Once the lock request has completed, stores the lock id in *lockid and
+ * returns the sb_status of the lock status block.
+ * If the cluster name, the lockspace or the lock request itself cannot be
+ * obtained, returns a non-zero error, frees the resource allocated here,
+ * resets dlm_lock_res to NULL and sets *lockid to 0, so that callers which
+ * still call cluster_release_dlmlock(lockid) afterwards neither read an
+ * uninitialised id nor touch a dead lockspace.
+ */
+int cluster_get_dlmlock(int *lockid)
+{
+ int ret = -1;
+ char str[64];
+ int flags = LKF_NOQUEUE;
+
+ ret = get_cluster_name(&cluster_name);
+ if (ret) {
+ pr_err("The md can't get cluster name\n");
+ ret = -1;
+ goto fail;
+ }
+
+ dlm_lock_res = xmalloc(sizeof(struct dlm_lock_resource));
+ dlm_lock_res->ls = dlm_hooks->create_lockspace(cluster_name, O_RDWR);
+ if (!dlm_lock_res->ls) {
+ pr_err("%s failed to create lockspace\n", cluster_name);
+ ret = -ENOMEM;
+ goto fail_free;
+ }
+
+ /* Conversions need the lockid in the LKSB */
+ if (flags & LKF_CONVERT)
+ dlm_lock_res->lksb.sb_lkid = *lockid;
+
+ snprintf(str, 64, "bitmap%s", cluster_name);
+ /* if flags with LKF_CONVERT causes below return ENOENT which means
+ * "No such file or directory" */
+ ret = dlm_hooks->ls_lock(dlm_lock_res->ls, LKM_PWMODE, &dlm_lock_res->lksb,
+ flags, str, strlen(str), 0, dlm_ast,
+ dlm_lock_res, NULL, NULL);
+ if (ret) {
+ pr_err("error %d when get PW mode on lock %s\n", errno, str);
+ dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1);
+ goto fail_free;
+ }
+
+ /* Wait for it to complete */
+ poll_for_ast(dlm_lock_res->ls);
+ *lockid = dlm_lock_res->lksb.sb_lkid;
+
+ return dlm_lock_res->lksb.sb_status;
+
+fail_free:
+ /* the lockspace (if any) is already gone; drop our bookkeeping too */
+ free(dlm_lock_res);
+ dlm_lock_res = NULL;
+fail:
+ *lockid = 0;
+ return ret;
+}
+
+/* Unlock lockid, wait for the unlock to complete and release the
+ * lockspace created by cluster_get_dlmlock().
+ * Returns 0 on success.  Returns -1 when there is nothing to release
+ * (no cluster name or no lock resource) or when the unlock completes
+ * with a status other than EUNLOCK; otherwise the failing hook's
+ * return value.
+ */
+int cluster_release_dlmlock(int lockid)
+{
+ int ret = -1;
+
+ /* nothing was set up, or cluster_get_dlmlock() already cleaned up */
+ if (!cluster_name || !dlm_lock_res)
+ return -1;
+
+ /* if flags with LKF_CONVERT causes below return EINVAL which means
+ * "Invalid argument" */
+ ret = dlm_hooks->ls_unlock(dlm_lock_res->ls, lockid, 0,
+ &dlm_lock_res->lksb, dlm_lock_res);
+ if (ret) {
+ pr_err("error %d happened when unlock\n", errno);
+ /* XXX make sure the lock is unlocked eventually */
+ goto out;
+ }
+
+ /* Wait for it to complete */
+ poll_for_ast(dlm_lock_res->ls);
+
+ errno = dlm_lock_res->lksb.sb_status;
+ if (errno != EUNLOCK) {
+ pr_err("error %d happened in ast when unlock lockspace\n", errno);
+ /* XXX make sure the lockspace is unlocked eventually */
+ ret = -1; /* ls_unlock() returned 0, but the unlock did not succeed */
+ goto out;
+ }
+
+ ret = dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1);
+ if (ret) {
+ pr_err("error %d happened when release lockspace\n", errno);
+ /* XXX make sure the lockspace is released eventually */
+ goto out;
+ }
+ free(dlm_lock_res);
+ dlm_lock_res = NULL; /* guard against a second release using freed memory */
+
+out:
+ return ret;
+}
+#else
+/* MDASSEMBLE builds: stub versions that always report failure (-1). */
+int cluster_get_dlmlock(int *lockid)
+{
+ return -1;
+}
+int cluster_release_dlmlock(int lockid)
+{
+ return -1;
+}
+#endif
+
/*
* Parse a 128 bit uuid in 4 integers
* format is 32 hexx nibbles with options :.<space> separator
@@ -271,6 +411,16 @@ long parse_num(char *num)
}
#endif
+/*
+ * Split an argument of the form "<slot>:<devname>".
+ *
+ * @input:   string to parse; it is not modified.
+ * @devname: on success, set to point just past the ':' inside @input
+ *           (no copy is made).
+ * @slot:    on success, set to the decimal slot number.
+ *
+ * The slot must start with a digit and fit in a non-negative int, so
+ * leading blanks, a sign, or an out-of-range number are rejected rather
+ * than silently wrapped.  Returns 0 on success, -1 on malformed input;
+ * the outputs are only written on success.
+ */
+int parse_cluster_confirm_arg(char *input, char **devname, int *slot)
+{
+	char *end;
+	unsigned long val;
+
+	if (!input || !devname || !slot)
+		return -1;
+
+	/* strtoul() skips blanks and accepts a sign; insist on a digit. */
+	if (input[0] < '0' || input[0] > '9')
+		return -1;
+
+	errno = 0;
+	val = strtoul(input, &end, 10);
+	/* (~0U >> 1) is the largest value an int can hold. */
+	if (errno == ERANGE || val > (unsigned long)(~0U >> 1))
+		return -1;
+	if (end[0] != ':')
+		return -1;
+
+	*slot = (int)val;
+	*devname = end + 1;
+	return 0;
+}
+
void remove_partitions(int fd)
{
/* remove partitions from this block devices.
@@ -1976,3 +2126,80 @@ void reopen_mddev(int mdfd)
if (fd >= 0 && fd != mdfd)
dup2(fd, mdfd);
}
+
+#ifndef MDASSEMBLE
+static struct cmap_hooks *cmap_hooks = NULL;	/* libcmap handle and entry points, filled in by set_cmap_hooks() */
+static int is_cmap_hooks_ready = 0;	/* set to 1 only once every libcmap symbol has been resolved */
+
+/*
+ * Load libcmap.so.4 at run time and look up the cmap_initialize,
+ * cmap_get_string and cmap_finalize entry points.
+ *
+ * is_cmap_hooks_ready is set to 1 only when the library opened and all
+ * three symbols resolved.  If any symbol is missing the library is
+ * closed again and the flag is left unchanged.
+ */
+void set_cmap_hooks(void)
+{
+	void *lib;
+
+	cmap_hooks = xmalloc(sizeof(struct cmap_hooks));
+
+	lib = dlopen("libcmap.so.4", RTLD_NOW | RTLD_LOCAL);
+	cmap_hooks->cmap_handle = lib;
+	if (lib == NULL)
+		return;
+
+	cmap_hooks->initialize = dlsym(lib, "cmap_initialize");
+	cmap_hooks->get_string = dlsym(lib, "cmap_get_string");
+	cmap_hooks->finalize = dlsym(lib, "cmap_finalize");
+
+	if (cmap_hooks->initialize && cmap_hooks->get_string &&
+	    cmap_hooks->finalize) {
+		is_cmap_hooks_ready = 1;
+		return;
+	}
+
+	/* At least one entry point is missing: the library is unusable. */
+	dlclose(lib);
+}
+
+/*
+ * Look up the "totem.cluster_name" key through the cmap hooks.
+ *
+ * @cluster_name: out-parameter handed to the get_string hook.  On success
+ *                it refers to the returned name (presumably allocated by
+ *                libcmap and owned by the caller -- confirm against the
+ *                libcmap documentation).  When the lookup fails, whatever
+ *                it pointed to is freed and it is reset to NULL so the
+ *                caller is never left holding a dangling pointer.
+ *
+ * Returns 0 on success and -1 when @cluster_name is NULL, the hooks are
+ * not ready, or the lookup fails.  If the initialize hook fails, its
+ * non-CS_OK result is returned as is.
+ */
+int get_cluster_name(char **cluster_name)
+{
+	int rv = -1;
+	cmap_handle_t handle;
+
+	if (!cluster_name || !is_cmap_hooks_ready)
+		return rv;
+
+	rv = cmap_hooks->initialize(&handle);
+	if (rv != CS_OK)
+		goto out;
+
+	rv = cmap_hooks->get_string(handle, "totem.cluster_name", cluster_name);
+	if (rv != CS_OK) {
+		free(*cluster_name);
+		/* Do not leave the caller with a pointer to freed memory. */
+		*cluster_name = NULL;
+		rv = -1;
+		goto name_err;
+	}
+
+	rv = 0;
+name_err:
+	/* The handle was initialised above, so always finalize it. */
+	cmap_hooks->finalize(handle);
+out:
+	return rv;
+}
+
+/*
+ * Load libdlm_lt.so.3 at run time and look up the six DLM entry points
+ * stored in dlm_hooks.
+ *
+ * is_dlm_hooks_ready is set to 1 only when the library opened and every
+ * symbol resolved.  If any symbol is missing the library is closed again
+ * and the flag is left unchanged.
+ */
+void set_dlm_hooks(void)
+{
+	void *lib;
+
+	dlm_hooks = xmalloc(sizeof(struct dlm_hooks));
+
+	lib = dlopen("libdlm_lt.so.3", RTLD_NOW | RTLD_LOCAL);
+	dlm_hooks->dlm_handle = lib;
+	if (lib == NULL)
+		return;
+
+	dlm_hooks->create_lockspace = dlsym(lib, "dlm_create_lockspace");
+	dlm_hooks->release_lockspace = dlsym(lib, "dlm_release_lockspace");
+	dlm_hooks->ls_lock = dlsym(lib, "dlm_ls_lock");
+	dlm_hooks->ls_unlock = dlsym(lib, "dlm_ls_unlock");
+	dlm_hooks->ls_get_fd = dlsym(lib, "dlm_ls_get_fd");
+	dlm_hooks->dispatch = dlsym(lib, "dlm_dispatch");
+
+	if (dlm_hooks->create_lockspace && dlm_hooks->ls_lock &&
+	    dlm_hooks->ls_unlock && dlm_hooks->release_lockspace &&
+	    dlm_hooks->ls_get_fd && dlm_hooks->dispatch) {
+		is_dlm_hooks_ready = 1;
+		return;
+	}
+
+	/* At least one entry point is missing: the library is unusable. */
+	dlclose(lib);
+}
+
+/*
+ * Set up both groups of run-time hooks: the DLM hooks first, then the
+ * cmap hooks.  Each helper records its own success in its ready flag.
+ */
+void set_hooks(void)
+{
+	set_dlm_hooks();
+	set_cmap_hooks();
+}
+#endif