46 files changed, 1799 insertions, 282 deletions
diff --git a/ANNOUNCE-3.4 b/ANNOUNCE-3.4
new file mode 100644
index 00000000..2689732d
--- /dev/null
+++ b/ANNOUNCE-3.4
@@ -0,0 +1,24 @@
+Subject: ANNOUNCE: mdadm 3.4 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+   mdadm version 3.4
+
+It is available at the usual places:
+   http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+   git://github.com/neilbrown/mdadm
+   git://neil.brown.name/mdadm
+   http://git.neil.brown.name/git/mdadm
+
+The new second-level version number reflects significant new
+functionality, in particular support for journalled RAID5/6 and clustered
+RAID1.  This new support is probably still buggy.  Please report bugs.
+
+There are also a number of fixes for Intel's IMSM metadata support,
+and an assortment of minor bug fixes.
+
+I plan for this to be the last release of mdadm that I provide as I am
+retiring from MD and mdadm maintenance.  Jes Sorensen has volunteered
+to oversee mdadm for the next while.  Thanks Jes!
+
+NeilBrown 28th January 2016
diff --git a/Assemble.c b/Assemble.c
index 29257330..d199afc9 100644
--- a/Assemble.c
+++ b/Assemble.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
@@ -637,6 +637,19 @@ static int load_devices(struct devs *devices, char *devmap,
 
 			if (strcmp(c->update, "byteorder") == 0)
 				err = 0;
+			else if (strcmp(c->update, "home-cluster") == 0) {
+				tst->cluster_name = c->homecluster;
+				err = tst->ss->write_bitmap(tst, dfd, NameUpdate);
+			} else if (strcmp(c->update, "nodes") == 0) {
+				tst->nodes = c->nodes;
+				err = tst->ss->write_bitmap(tst, dfd, NodeNumUpdate);
+			} else if (strcmp(c->update, "revert-reshape") == 0 &&
+				   c->invalid_backup)
+				err = tst->ss->update_super(tst, content,
+							    "revert-reshape-nobackup",
+							    devname, c->verbose,
+							    ident->uuid_set,
+							    c->homehost);
 			else
 				err = tst->ss->update_super(tst, content, c->update,
 							    devname, c->verbose,
@@ -729,7 +742,7 @@ static int load_devices(struct devs *devices, char *devmap,
 			i = devcnt;
 		else
 			i = devices[devcnt].i.disk.raid_disk;
-		if (i+1 == 0) {
+		if (i+1 == 0 || i == MD_DISK_ROLE_JOURNAL) {
 			if (nextspare < content->array.raid_disks*2)
 				nextspare = content->array.raid_disks*2;
 			i = nextspare++;
@@ -907,7 +920,6 @@ static int force_array(struct mdinfo *content,
 		avail[chosen_drive] = 1;
 		okcnt++;
 		tst->ss->free_super(tst);
-
 		/* If there are any other drives of the same vintage,
 		 * add them in as well.  We can't lose and we might gain
 		 */
@@ -938,6 +950,7 @@ static int start_array(int mdfd,
 		       unsigned int okcnt,
 		       unsigned int sparecnt,
 		       unsigned int rebuilding_cnt,
+		       unsigned int journalcnt,
 		       struct context *c,
 		       int clean, char *avail,
 		       int start_partial_ok,
@@ -949,6 +962,15 @@ static int start_array(int mdfd,
 	int i;
 	unsigned int req_cnt;
 
+	if (content->journal_device_required && (content->journal_clean == 0)) {
+		if (!c->force) {
+			pr_err("Not safe to assemble with missing or stale journal device, consider --force.\n");
+			return 1;
+		}
+		pr_err("Journal is missing or stale, starting array read only.\n");
+		c->readonly = 1;
+	}
+
 	rv = set_array_info(mdfd, st, content);
 	if (rv && !err_ok) {
 		pr_err("failed to set array info for %s: %s\n",
@@ -1026,7 +1048,8 @@ static int start_array(int mdfd,
 	if (content->array.level == LEVEL_CONTAINER) {
 		if (c->verbose >= 0) {
 			pr_err("Container %s has been assembled with %d drive%s",
-			       mddev, okcnt+sparecnt, okcnt+sparecnt==1?"":"s");
+			       mddev, okcnt+sparecnt+journalcnt,
+			       okcnt+sparecnt+journalcnt==1?"":"s");
 			if (okcnt < (unsigned)content->array.raid_disks)
 				fprintf(stderr, " (out of %d)",
 					content->array.raid_disks);
@@ -1112,6 +1135,8 @@ static int start_array(int mdfd,
 					fprintf(stderr, "%s %d rebuilding", sparecnt?",":" and", rebuilding_cnt);
 				if (sparecnt)
 					fprintf(stderr, " and %d spare%s", sparecnt, sparecnt==1?"":"s");
+				if (content->journal_clean)
+					fprintf(stderr, " and %d journal", journalcnt);
 				fprintf(stderr, ".\n");
 			}
 			if (content->reshape_active &&
@@ -1283,7 +1308,8 @@ int Assemble(struct supertype *st, char *mddev,
 	int *best = NULL; /* indexed by raid_disk */
 	int bestcnt = 0;
 	int devcnt;
-	unsigned int okcnt, sparecnt, rebuilding_cnt, replcnt;
+	unsigned int okcnt, sparecnt, rebuilding_cnt, replcnt, journalcnt;
+	int journal_clean = 0;
 	int i;
 	int was_forced = 0;
 	int most_recent = 0;
@@ -1524,6 +1550,7 @@ try_again:
 	okcnt = 0;
 	replcnt = 0;
 	sparecnt=0;
+	journalcnt=0;
 	rebuilding_cnt=0;
 	for (i=0; i< bestcnt; i++) {
 		int j = best[i];
@@ -1534,8 +1561,13 @@ try_again:
 		/* note: we ignore error flags in multipath arrays
 		 * as they don't make sense
 		 */
-		if (content->array.level != LEVEL_MULTIPATH)
-			if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) {
+		if (content->array.level != LEVEL_MULTIPATH) {
+			if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL)) {
+				if (content->journal_device_required)
+					journalcnt++;
+				else	/* unexpected journal, mark as faulty */
+					devices[j].i.disk.state |= (1<<MD_DISK_FAULTY);
+			} else if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) {
 				if (!(devices[j].i.disk.state
 				      & (1<<MD_DISK_FAULTY))) {
 					devices[j].uptodate = 1;
@@ -1543,6 +1575,7 @@ try_again:
 				}
 				continue;
 			}
+		}
 		/* If this device thinks that 'most_recent' has failed, then
 		 * we must reject this device.
 		 */
@@ -1566,6 +1599,8 @@ try_again:
 		    devices[most_recent].i.events
 			) {
 			devices[j].uptodate = 1;
+			if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL))
+				journal_clean = 1;
 			if (i < content->array.raid_disks * 2) {
 				if (devices[j].i.recovery_start == MaxSector ||
 				    (content->reshape_active &&
@@ -1577,7 +1612,7 @@ try_again:
 						replcnt++;
 				} else
 					rebuilding_cnt++;
-			} else
+			} else if (devices[j].i.disk.raid_disk != MD_DISK_ROLE_JOURNAL)
 				sparecnt++;
 		}
 	}
@@ -1637,11 +1672,15 @@ try_again:
 #ifndef MDASSEMBLE
 	sysfs_init(content, mdfd, NULL);
 #endif
+	/* after reload context, store journal_clean in context */
+	content->journal_clean = journal_clean;
 	for (i=0; i<bestcnt; i++) {
 		int j = best[i];
 		unsigned int desired_state;
 
-		if (i >= content->array.raid_disks * 2)
+		if (devices[j].i.disk.raid_disk == MD_DISK_ROLE_JOURNAL)
+			desired_state = (1<<MD_DISK_JOURNAL);
+		else if (i >= content->array.raid_disks * 2)
 			desired_state = 0;
 		else if (i & 1)
 			desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_REPLACEMENT);
@@ -1788,7 +1827,7 @@ try_again:
 	rv = start_array(mdfd, mddev, content,
 			 st, ident, best, bestcnt,
 			 chosen_drive, devices, okcnt, sparecnt,
-			 rebuilding_cnt,
+			 rebuilding_cnt, journalcnt,
 			 c,
 			 clean, avail, start_partial_ok,
 			 pre_exist != NULL,
diff --git a/Create.c b/Create.c
index ef28da0c..1e4a6ee0 100644
--- a/Create.c
+++ b/Create.c
@@ -87,7 +87,7 @@ int Create(struct supertype *st, char *mddev,
 	unsigned long long minsize=0, maxsize=0;
 	char *mindisc = NULL;
 	char *maxdisc = NULL;
-	int dnum;
+	int dnum, raid_disk_num;
 	struct mddev_dev *dv;
 	int fail=0, warn=0;
 	struct stat stb;
@@ -114,6 +114,8 @@ int Create(struct supertype *st, char *mddev,
 	unsigned long long newsize;
 
 	int major_num = BITMAP_MAJOR_HI;
+	if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0)
+		major_num = BITMAP_MAJOR_CLUSTERED;
 
 	memset(&info, 0, sizeof(info));
 	if (s->level == UnSet && st && st->ss->default_geometry)
@@ -180,11 +182,11 @@ int Create(struct supertype *st, char *mddev,
 		pr_err("This metadata type does not support spare disks at create time\n");
 		return 1;
 	}
-	if (subdevs > s->raiddisks+s->sparedisks) {
+	if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) {
 		pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks);
 		return 1;
 	}
-	if (!have_container && subdevs < s->raiddisks+s->sparedisks) {
+	if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) {
 		pr_err("You haven't given enough devices (real or missing) to create this array\n");
 		return 1;
 	}
@@ -328,7 +330,7 @@ int Create(struct supertype *st, char *mddev,
 		}
 		close(dfd);
 		info.array.working_disks++;
-		if (dnum < s->raiddisks)
+		if (dnum < s->raiddisks && dv->disposition != 'j')
 			info.array.active_disks++;
 		if (st == NULL) {
 			struct createinfo *ci = conf_get_create_info();
@@ -397,6 +399,9 @@ int Create(struct supertype *st, char *mddev,
 			}
 		}
 
+		if (dv->disposition == 'j')
+			goto skip_size_check;  /* skip write journal for size check */
+
 		freesize /= 2; /* convert to K */
 		if (s->chunk && s->chunk != UnSet) {
 			/* round to chunk size */
@@ -429,6 +434,7 @@ int Create(struct supertype *st, char *mddev,
 			mindisc = dname;
 			minsize = freesize;
 		}
+	skip_size_check:
 		if (c->runstop != 1 || c->verbose >= 0) {
 			int fd = open(dname, O_RDONLY);
 			if (fd <0 ) {
@@ -531,6 +537,8 @@ int Create(struct supertype *st, char *mddev,
 				st->ss->name);
 		warn = 1;
 	}
+	st->nodes = c->nodes;
+	st->cluster_name = c->homecluster;
 
 	if (warn) {
 		if (c->runstop!= 1) {
@@ -750,7 +758,8 @@ int Create(struct supertype *st, char *mddev,
 #endif
 	}
 
-	if (s->bitmap_file && strcmp(s->bitmap_file, "internal")==0) {
+	if (s->bitmap_file && (strcmp(s->bitmap_file, "internal")==0 ||
+			       strcmp(s->bitmap_file, "clustered")==0)) {
 		if ((vers%100) < 2) {
 			pr_err("internal bitmaps not supported by this kernel.\n");
 			goto abort_locked;
@@ -834,7 +843,7 @@ int Create(struct supertype *st, char *mddev,
 	for (pass=1; pass <=2 ; pass++) {
 		struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */
 
-		for (dnum=0, dv = devlist ; dv ;
+		for (dnum=0, raid_disk_num=0, dv = devlist ; dv ;
 		     dv=(dv->next)?(dv->next):moved_disk, dnum++) {
 			int fd;
 			struct stat stb;
@@ -843,11 +852,14 @@ int Create(struct supertype *st, char *mddev,
 			if (dnum >= total_slots)
 				abort();
 			if (dnum == insert_point) {
+				raid_disk_num += 1;
 				moved_disk = dv;
 				continue;
 			}
-			if (strcasecmp(dv->devname, "missing")==0)
+			if (strcasecmp(dv->devname, "missing")==0) {
+				raid_disk_num += 1;
 				continue;
+			}
 			if (have_container)
 				moved_disk = NULL;
 			if (have_container && dnum < info.array.raid_disks - 1)
@@ -859,8 +871,13 @@ int Create(struct supertype *st, char *mddev,
 				*inf = info;
 
 				inf->disk.number = dnum;
-				inf->disk.raid_disk = dnum;
-				if (inf->disk.raid_disk < s->raiddisks)
+				inf->disk.raid_disk = raid_disk_num++;
+
+				if (dv->disposition == 'j') {
+					inf->disk.raid_disk = MD_DISK_ROLE_JOURNAL;
+					inf->disk.state = (1<<MD_DISK_JOURNAL);
+					raid_disk_num--;
+				} else if (inf->disk.raid_disk < s->raiddisks)
 					inf->disk.state = (1<<MD_DISK_ACTIVE) |
 						(1<<MD_DISK_SYNC);
 				else
diff --git a/Detail.c b/Detail.c
index dd72eded..0cfccadb 100644
--- a/Detail.c
+++ b/Detail.c
@@ -299,7 +299,8 @@ int Detail(char *dev, struct context *c)
 	for (d = 0; d < max_disks * 2; d++) {
 		disks[d].state = (1<<MD_DISK_REMOVED);
 		disks[d].major = disks[d].minor = 0;
-		disks[d].number = disks[d].raid_disk = d;
+		disks[d].number = -1;
+		disks[d].raid_disk = d/2;
 	}
 
 	next = array.raid_disks*2;
@@ -325,7 +326,8 @@ int Detail(char *dev, struct context *c)
 		    && disks[disk.raid_disk*2].state == (1<<MD_DISK_REMOVED))
 			disks[disk.raid_disk*2] = disk;
 		else if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks
-			 && disks[disk.raid_disk*2+1].state == (1<<MD_DISK_REMOVED))
+			 && disks[disk.raid_disk*2+1].state == (1<<MD_DISK_REMOVED)
+			 && !(disk.state & (1<<MD_DISK_JOURNAL)))
 			disks[disk.raid_disk*2+1] = disk;
 		else if (next < max_disks*2)
 			disks[next++] = disk;
@@ -339,7 +341,8 @@ int Detail(char *dev, struct context *c)
 		    (disks[d*2+1].state & (1<<MD_DISK_SYNC))) {
 			avail_disks ++;
 			avail[d] = 1;
-		}
+		} else
+			rv |= !! c->test;
 	}
 
 	if (c->brief) {
@@ -422,8 +425,9 @@ int Detail(char *dev, struct context *c)
 				else
 					printf("  Used Dev Size : unknown\n");
 			} else
-				printf("  Used Dev Size : %d%s\n", array.size,
-				       human_size((long long)array.size<<10));
+				printf("  Used Dev Size : %lu%s\n",
+				       (unsigned long)array.size,
+				       human_size((unsigned long long)array.size<<10));
 		}
 		if (array.raid_disks)
 			printf("   Raid Devices : %d\n", array.raid_disks);
@@ -616,12 +620,15 @@ This is pretty boring
 			continue;
 		if (!c->brief) {
 			if (d == array.raid_disks*2) printf("\n");
-			if (disk.number < 0)
+			if (disk.number < 0 && disk.raid_disk < 0)
 				printf("       -   %5d    %5d        -     ",
 				       disk.major, disk.minor);
-			else if (disk.raid_disk < 0)
+			else if (disk.raid_disk < 0 || disk.state & (1<<MD_DISK_JOURNAL))
 				printf("   %5d   %5d    %5d        -     ",
 				       disk.number, disk.major, disk.minor);
+			else if (disk.number < 0)
+				printf("       -   %5d    %5d    %5d     ",
+				       disk.major, disk.minor, disk.raid_disk);
 			else
 				printf("   %5d   %5d    %5d    %5d     ",
 				       disk.number, disk.major, disk.minor, disk.raid_disk);
@@ -650,9 +657,10 @@ This is pretty boring
 			}
 			if (disk.state & (1<<MD_DISK_REMOVED)) printf(" removed");
 			if (disk.state & (1<<MD_DISK_WRITEMOSTLY)) printf(" writemostly");
+			if (disk.state & (1<<MD_DISK_JOURNAL)) printf(" journal");
 			if ((disk.state &
 			     ((1<<MD_DISK_ACTIVE)|(1<<MD_DISK_SYNC)
-			      |(1<<MD_DISK_REMOVED)|(1<<MD_DISK_FAULTY)))
+			      |(1<<MD_DISK_REMOVED)|(1<<MD_DISK_FAULTY)|(1<<MD_DISK_JOURNAL)))
 			    == 0) {
 				printf(" spare");
 				if (is_26) {
@@ -671,9 +679,6 @@ This is pretty boring
 			}
 		}
 		if (disk.state == 0) spares++;
-		if (c->test && d < array.raid_disks
-		    && !(disk.state & (1<<MD_DISK_SYNC)))
-			rv |= 1;
 		dv=map_dev_preferred(disk.major, disk.minor, 0, c->prefer);
 		if (dv != NULL) {
 			if (c->brief)
diff --git a/Grow.c b/Grow.c
index a336593d..bbdd46c0 100644..100755
--- a/Grow.c
+++ b/Grow.c
@@ -297,6 +297,9 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 			"  between different architectures.  Consider upgrading the Linux kernel.\n");
 	}
 
+	if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0)
+		major = BITMAP_MAJOR_CLUSTERED;
+
 	if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) {
 		if (errno == ENOMEM)
 			pr_err("Memory allocation failure.\n");
@@ -325,13 +328,15 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 		if (strcmp(s->bitmap_file, "none")==0) {
 			array.state &= ~(1<<MD_SB_BITMAP_PRESENT);
 			if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
-				pr_err("failed to remove internal bitmap.\n");
+				if (array.state & (1<<MD_SB_CLUSTERED))
+					pr_err("failed to remove clustered bitmap.\n");
+				else
+					pr_err("failed to remove internal bitmap.\n");
 				return 1;
 			}
 			return 0;
 		}
-		pr_err("Internal bitmap already present on %s\n",
-			devname);
+		pr_err("bitmap already present on %s\n", devname);
 		return 1;
 	}
 
@@ -375,7 +380,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 		free(st);
 		return 1;
 	}
-	if (strcmp(s->bitmap_file, "internal") == 0) {
+	if (strcmp(s->bitmap_file, "internal") == 0 ||
+	    strcmp(s->bitmap_file, "clustered") == 0) {
 		int rv;
 		int d;
 		int offset_setable = 0;
@@ -384,6 +390,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 			pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name);
 			return 1;
 		}
+		st->nodes = c->nodes;
+		st->cluster_name = c->homecluster;
 		mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION);
 		if (mdi)
 			offset_setable = 1;
@@ -410,7 +418,7 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 						    bitmapsize, offset_setable,
 						    major)
 						)
-						st->ss->write_bitmap(st, fd2);
+						st->ss->write_bitmap(st, fd2, NoUpdate);
 					else {
 						pr_err("failed to create internal bitmap - chunksize problem.\n");
 						close(fd2);
@@ -426,6 +434,8 @@ int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
 			rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location",
 						  mdi->bitmap_offset);
 		} else {
+			if (strcmp(s->bitmap_file, "clustered") == 0)
+				array.state |= (1<<MD_SB_CLUSTERED);
 			array.state |= (1<<MD_SB_BITMAP_PRESENT);
 			rv = ioctl(fd, SET_ARRAY_INFO, &array);
 		}
@@ -1580,6 +1590,15 @@ int Grow_reshape(char *devname, int fd,
 		pr_err("Cannot increase raid-disks on this array beyond %d\n", st->max_devs);
 		return 1;
 	}
+	if (s->level == 0 &&
+	    (array.state & (1<<MD_SB_BITMAP_PRESENT)) &&
+	    !(array.state & (1<<MD_SB_CLUSTERED))) {
+                array.state &= ~(1<<MD_SB_BITMAP_PRESENT);
+                if (ioctl(fd, SET_ARRAY_INFO, &array)!= 0) {
+                        pr_err("failed to remove internal bitmap.\n");
+                        return 1;
+                }
+        }
 
 	/* in the external case we need to check that the requested reshape is
 	 * supported, and perform an initial check that the container holds the
@@ -4496,8 +4515,8 @@ int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, int cnt
 		 * sometimes they aren't... So allow considerable flexability in matching, and allow
 		 * this test to be overridden by an environment variable.
 		 */
-		if (info->array.utime > (int)__le64_to_cpu(bsb.mtime) + 2*60*60 ||
-		    info->array.utime < (int)__le64_to_cpu(bsb.mtime) - 10*60) {
+		if(time_after(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) + 2*60*60) ||
+		   time_before(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) - 10*60)) {
 			if (check_env("MDADM_GROW_ALLOW_OLD")) {
 				pr_err("accepting backup with timestamp %lu for array with timestamp %lu\n",
 					(unsigned long)__le64_to_cpu(bsb.mtime),
@@ -4866,6 +4885,9 @@ int Grow_continue_command(char *devname, int fd,
 
 		sysfs_init(content, fd2, mdstat->devnm);
 
+		close(fd2);
+		fd2 = -1;
+
 		/* start mdmon in case it is not running
 		 */
 		if (!mdmon_running(container))
diff --git a/Incremental.c b/Incremental.c
index 41876b9e..24fd8276 100644
--- a/Incremental.c
+++ b/Incremental.c
@@ -104,6 +104,7 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
 	struct map_ent target_array;
 	int have_target;
 	char *devname = devlist->devname;
+	int journal_device_missing = 0;
 
 	struct createinfo *ci = conf_get_create_info();
 
@@ -312,6 +313,12 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
 
 	if (mdfd < 0) {
 
+		/* Skip the clustered ones. This should be started by
+		 * clustering resource agents
+		 */
+		if (info.array.state & (1 << MD_SB_CLUSTERED))
+			goto out;
+
 		/* Couldn't find an existing array, maybe make a new one */
 		mdfd = create_mddev(match ? match->devname : NULL,
 				    name_to_use, c->autof, trustworthy, chosen_name);
@@ -437,6 +444,10 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
 		/* add disk needs to know about containers */
 		if (st->ss->external)
 			sra->array.level = LEVEL_CONTAINER;
+
+		if (info.array.state & (1 << MD_SB_CLUSTERED))
+			info.disk.state |= (1 << MD_DISK_CLUSTER_ADD);
+
 		err = add_disk(mdfd, st, sra, &info);
 		if (err < 0 && errno == EBUSY) {
 			/* could be another device present with the same
@@ -514,6 +525,9 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
 	sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE |
 				    GET_OFFSET | GET_SIZE));
 	active_disks = count_active(st, sra, mdfd, &avail, &info);
+
+	journal_device_missing = (info.journal_device_required) && (info.journal_clean == 0);
+
 	if (enough(info.array.level, info.array.raid_disks,
 		   info.array.layout, info.array.state & 1,
 		   avail) == 0) {
@@ -543,10 +557,12 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
 	}
 
 	map_unlock(&map);
-	if (c->runstop > 0 || active_disks >= info.array.working_disks) {
+	if (c->runstop > 0 || (!journal_device_missing && active_disks >= info.array.working_disks)) {
 		struct mdinfo *dsk;
 		/* Let's try to start it */
 
+		if (journal_device_missing)
+			pr_err("Trying to run with missing journal device\n");
 		if (info.reshape_active && !(info.reshape_active & RESHAPE_NO_BACKUP)) {
 			pr_err("%s: This array is being reshaped and cannot be started\n",
 			       chosen_name);
@@ -613,6 +629,8 @@ int Incremental(struct mddev_dev *devlist, struct context *c,
 	} else {
 		if (c->export) {
 			printf("MD_STARTED=unsafe\n");
+		} else if (journal_device_missing) {
+			pr_err("Journal device is missing, not safe to start yet.\n");
 		} else if (c->verbose >= 0)
 			pr_err("%s attached to %s, not enough to start safely.\n",
 			       devname, chosen_name);
@@ -649,7 +667,7 @@ static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
 			 * without thinking more */
 
 	for (d = sra->devs; d ; d = d->next) {
-		char dn[10];
+		char dn[24]; // 2*11 bytes for ints (including sign) + colon + null byte
 		int dfd;
 		struct mdinfo info;
 		sprintf(dn, "%d:%d", d->disk.major, d->disk.minor);
@@ -713,8 +731,11 @@ static int count_active(struct supertype *st, struct mdinfo *sra,
 		close(dfd);
 		if (ok != 0)
 			continue;
+
 		info.array.raid_disks = raid_disks;
 		st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum);
+		if (info.disk.raid_disk == MD_DISK_ROLE_JOURNAL)
+			bestinfo->journal_clean = 1;
 		if (!avail) {
 			raid_disks = info.array.raid_disks;
 			avail = xcalloc(raid_disks, 1);
@@ -764,6 +785,7 @@ static int count_active(struct supertype *st, struct mdinfo *sra,
 			replcnt++;
 		st->ss->free_super(st);
 	}
+
 	if (!avail)
 		return 0;
 	/* We need to reject any device that thinks the best device is
@@ -1012,12 +1034,12 @@ static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
 		int mdfd = open_dev(chosen->sys_name);
 		if (mdfd >= 0) {
 			struct mddev_dev devlist;
-			char devname[20];
+			char chosen_devname[24]; // 2*11 for int (including signs) + colon + null
 			devlist.next = NULL;
 			devlist.used = 0;
 			devlist.writemostly = 0;
-			devlist.devname = devname;
-			sprintf(devname, "%d:%d", major(stb.st_rdev),
+			devlist.devname = chosen_devname;
+			sprintf(chosen_devname, "%d:%d", major(stb.st_rdev),
 				minor(stb.st_rdev));
 			devlist.disposition = 'a';
 			close(dfd);
diff --git a/Makefile b/Makefile
index a02a97f3..fd79cfbc 100644
--- a/Makefile
+++ b/Makefile
@@ -43,7 +43,7 @@ KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIB
 
 CC = $(CROSS_COMPILE)gcc
 CXFLAGS ?= -ggdb
-CWFLAGS = -Wall -Wstrict-prototypes -Wextra -Wno-unused-parameter
+CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter
 ifdef WARN_UNUSED
 CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3
 endif
@@ -62,8 +62,8 @@ CPPFLAGS += -DBINDIR=\"$(BINDIR)\"
 PKG_CONFIG ?= pkg-config
 
 SYSCONFDIR = /etc
-CONFFILE = $(SYSCONFDIR)/mdadm/mdadm.conf
-CONFFILE2 = $(SYSCONFDIR)/mdadm.conf
+CONFFILE = $(SYSCONFDIR)/mdadm.conf
+CONFFILE2 = $(SYSCONFDIR)/mdadm/mdadm.conf
 MAILCMD =/usr/sbin/sendmail -t
 CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\"
 # Both MAP_DIR and MDMON_DIR should be somewhere that persists across the
@@ -79,10 +79,14 @@ MDMON_DIR = $(RUN_DIR)
 # place for autoreplace cookies
 FAILED_SLOTS_DIR = $(RUN_DIR)/failed-slots
 SYSTEMD_DIR=/lib/systemd/system
+
+COROSYNC:=$(shell [ -d /usr/include/corosync ] || echo -DNO_COROSYNC)
+DLM:=$(shell [ -f /usr/include/libdlm.h ] || echo -DNO_DLM)
+
 DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\"
 DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\"
 DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\"
-CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS)
+CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) $(COROSYNC) $(DLM)
 
 VERSION = $(shell [ -d .git ] && git describe HEAD | sed 's/mdadm-//')
 VERS_DATE = $(shell [ -d .git ] && date --date="`git log -n1 --format=format:%cd --date=short`" '+%0dth %B %Y' | sed -e 's/1th/1st/' -e 's/2th/2nd/' -e 's/11st/11th/' -e 's/12nd/12th/')
@@ -101,6 +105,7 @@ endif
 # If you want a static binary, you might uncomment these
 # LDFLAGS = -static
 # STRIP = -s
+LDLIBS=-ldl
 
 INSTALL = /usr/bin/install
 DESTDIR =
@@ -115,6 +120,12 @@ ifndef UDEVDIR
  UDEVDIR = /lib/udev
 endif
 
+ifeq (,$(findstring s,$(MAKEFLAGS)))
+	ECHO=echo
+else
+	ECHO=:
+endif
+
 OBJS =  mdadm.o config.o policy.o mdstat.o  ReadMe.o util.o maps.o lib.o \
 	Manage.o Assemble.o Build.o \
 	Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
@@ -122,7 +133,7 @@ OBJS =  mdadm.o config.o policy.o mdstat.o  ReadMe.o util.o maps.o lib.o \
 	mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \
 	super-mbr.o super-gpt.o \
 	restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o \
-	platform-intel.o probe_roms.o
+	platform-intel.o probe_roms.o crc32c.o
 
 CHECK_OBJS = restripe.o sysfs.o maps.o lib.o xmalloc.o dlink.o
 
@@ -176,7 +187,7 @@ mdadm : $(OBJS) | check_rundir
 	$(CC) $(CFLAGS) $(LDFLAGS) -o mdadm $(OBJS) $(LDLIBS)
 
 mdadm.static : $(OBJS) $(STATICOBJS)
-	$(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS) $(LDLIBS)
 
 mdadm.tcc : $(SRCS) $(INCL)
 	$(TCC) -o mdadm.tcc $(SRCS)
@@ -186,13 +197,13 @@ mdadm.klibc : $(SRCS) $(INCL)
 	$(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS)
 
 mdadm.Os : $(SRCS) $(INCL)
-	$(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS)
+	$(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS) $(LDLIBS)
 
 mdadm.O2 : $(SRCS) $(INCL) mdmon.O2
-	$(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS)
+	$(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS) $(LDLIBS)
 
 mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h
-	$(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS)
+	$(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) $(LDLIBS)
 
 # use '-z now' to guarantee no dynamic linker interactions with the monitor thread
 mdmon : $(MON_OBJS) | check_rundir
@@ -200,7 +211,7 @@ mdmon : $(MON_OBJS) | check_rundir
 msg.o: msg.c msg.h
 
 test_stripe : restripe.c xmalloc.o mdadm.h
-	$(CC) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o  -DMAIN restripe.c
+	$(CC) $(CFLAGS) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o  -DMAIN restripe.c
 
 raid6check : raid6check.o mdadm.h $(CHECK_OBJS)
 	$(CC) $(CXFLAGS) $(LDFLAGS) -o raid6check raid6check.o $(CHECK_OBJS)
@@ -283,7 +294,7 @@ install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8
 install-udev: udev-md-raid-arrays.rules udev-md-raid-assembly.rules
 	@for file in 63-md-raid-arrays.rules 64-md-raid-assembly.rules ; \
 	do sed -e 's,BINDIR,$(BINDIR),g' udev-$${file#??-} > .install.tmp.1 && \
-	   echo $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
+	   $(ECHO) $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
 	   $(INSTALL) -D -m 644 .install.tmp.1 $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
 	   rm -f .install.tmp.1; \
 	done
@@ -292,13 +303,13 @@ install-systemd: systemd/mdmon@.service
 	@for file in mdmon@.service mdmonitor.service mdadm-last-resort@.timer \
 		mdadm-last-resort@.service mdadm-grow-continue@.service; \
 	do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.2 && \
-	   echo $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
+	   $(ECHO) $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
 	   $(INSTALL) -D -m 644 .install.tmp.2 $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
 	   rm -f .install.tmp.2; \
 	done
 	@for file in mdadm.shutdown ; \
 	do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.3 && \
-	   echo $(INSTALL) -D -m 755  systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
+	   $(ECHO) $(INSTALL) -D -m 755  systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
 	   $(INSTALL) -D -m 755  .install.tmp.3 $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
 	   rm -f .install.tmp.3; \
 	done
diff --git a/Manage.c b/Manage.c
index 47faeedb..7e1b94be 100644
--- a/Manage.c
+++ b/Manage.c
@@ -669,6 +669,15 @@ int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
 		disc.number = mdi.disk.number;
 		disc.raid_disk = mdi.disk.raid_disk;
 		disc.state = mdi.disk.state;
+		if (array->state & (1 << MD_SB_CLUSTERED)) {
+			/* extra flags are needed when adding to a cluster as
+			 * there are two cases to distinguish
+			 */
+			if (dv->disposition == 'c')
+				disc.state |= (1 << MD_DISK_CANDIDATE);
+			else
+				disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+		}
 		if (dv->writemostly == 1)
 			disc.state |= 1 << MD_DISK_WRITEMOSTLY;
 		if (dv->writemostly == 2)
@@ -724,7 +733,8 @@ skip_re_add:
 int Manage_add(int fd, int tfd, struct mddev_dev *dv,
 	       struct supertype *tst, mdu_array_info_t *array,
 	       int force, int verbose, char *devname,
-	       char *update, unsigned long rdev, unsigned long long array_size)
+	       char *update, unsigned long rdev, unsigned long long array_size,
+	       int raid_slot)
 {
 	unsigned long long ldsize;
 	struct supertype *dev_st = NULL;
@@ -815,7 +825,8 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
 		}
 
 		/* Make sure device is large enough */
-		if (tst->sb &&
+		if (dv->disposition != 'j' &&  /* skip size check for Journal */
+		    tst->sb &&
 		    tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
 		    array_size) {
 			if (dv->disposition == 'M')
@@ -914,10 +925,36 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
 	}
 	disc.major = major(rdev);
 	disc.minor = minor(rdev);
-	disc.number =j;
+	if (raid_slot < 0)
+		disc.number = j;
+	else
+		disc.number = raid_slot;
 	disc.state = 0;
+
+	/* only add journal to array that supports journaling */
+	if (dv->disposition == 'j') {
+		struct mdinfo mdi;
+		struct mdinfo *mdp;
+
+		mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
+
+		if (strncmp(mdp->sysfs_array_state, "readonly", 8) != 0) {
+			pr_err("%s is not readonly, cannot add journal.\n", devname);
+			return -1;
+		}
+
+		tst->ss->getinfo_super(tst, &mdi, NULL);
+		if (mdi.journal_device_required == 0) {
+			pr_err("%s does not support journal device.\n", devname);
+			return -1;
+		}
+		disc.raid_disk = 0;
+	}
+
 	if (array->not_persistent==0) {
 		int dfd;
+		if (dv->disposition == 'j')
+			disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
 		if (dv->writemostly == 1)
 			disc.state |= 1 << MD_DISK_WRITEMOSTLY;
 		dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
@@ -955,6 +992,14 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
 			}
 		free(used);
 	}
+
+	if (array->state & (1 << MD_SB_CLUSTERED)) {
+		if (dv->disposition == 'c')
+			disc.state |= (1 << MD_DISK_CANDIDATE);
+		else
+			disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+	}
+
 	if (dv->writemostly == 1)
 		disc.state |= (1 << MD_DISK_WRITEMOSTLY);
 	if (tst->ss->external) {
@@ -1020,10 +1065,20 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv,
 	} else {
 		tst->ss->free_super(tst);
 		if (ioctl(fd, ADD_NEW_DISK, &disc)) {
-			pr_err("add new device failed for %s as %d: %s\n",
-			       dv->devname, j, strerror(errno));
+			if (dv->disposition == 'j')
+				pr_err("Failed to hot add %s as journal, "
+				       "please try restart %s.\n", dv->devname, devname);
+			else
+				pr_err("add new device failed for %s as %d: %s\n",
+				       dv->devname, j, strerror(errno));
 			return -1;
 		}
+		if (dv->disposition == 'j') {
+			pr_err("Journal added successfully, making %s read-write\n", devname);
+			if (Manage_ro(devname, fd, -1))
+				pr_err("Failed to make %s read-write\n", devname);
+		}
+
 	}
 	if (verbose >= 0)
 		pr_err("added %s\n", dv->devname);
@@ -1256,6 +1311,7 @@ int Manage_subdevs(char *devname, int fd,
 	 *	   try HOT_ADD_DISK
 	 *         If that fails EINVAL, try ADD_NEW_DISK
 	 *  'S' - add the device as a spare - don't try re-add
+	 *  'j' - add the device as a journal device
 	 *  'A' - re-add the device
 	 *  'r' - remove the device: HOT_REMOVE_DISK
 	 *        device can be 'faulty' or 'detached' in which case all
@@ -1274,6 +1330,7 @@ int Manage_subdevs(char *devname, int fd,
 	 *        variant on 'A'
 	 *  'F' - Another variant of 'A', where the device was faulty
 	 *        so must be removed from the array first.
+	 *  'c' - confirm the device as found (for clustered environments)
 	 *
 	 * For 'f' and 'r', the device can also be a kernel-internal
 	 * name such as 'sdb'.
@@ -1287,8 +1344,10 @@ int Manage_subdevs(char *devname, int fd,
 	int sysfd = -1;
 	int count = 0; /* number of actions taken */
 	struct mdinfo info;
+	struct mdinfo devinfo;
 	int frozen = 0;
 	int busy = 0;
+	int raid_slot = -1;
 
 	if (ioctl(fd, GET_ARRAY_INFO, &array)) {
 		pr_err("Cannot get array info for %s\n",
@@ -1317,6 +1376,17 @@ int Manage_subdevs(char *devname, int fd,
 		int rv;
 		int mj,mn;
 
+		raid_slot = -1;
+		if (dv->disposition == 'c') {
+			rv = parse_cluster_confirm_arg(dv->devname,
+						       &dv->devname,
+						       &raid_slot);
+			if (rv) {
+				pr_err("Could not get the devname of cluster\n");
+				goto abort;
+			}
+		}
+
 		if (strcmp(dv->devname, "failed") == 0 ||
 		    strcmp(dv->devname, "faulty") == 0) {
 			if (dv->disposition != 'A'
@@ -1342,6 +1412,11 @@ int Manage_subdevs(char *devname, int fd,
 		if (strcmp(dv->devname, "missing") == 0) {
 			struct mddev_dev *add_devlist = NULL;
 			struct mddev_dev **dp;
+			if (dv->disposition == 'c') {
+				rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
+				break;
+			}
+
 			if (dv->disposition != 'A') {
 				pr_err("'missing' only meaningful with --re-add\n");
 				goto abort;
@@ -1469,14 +1544,28 @@ int Manage_subdevs(char *devname, int fd,
 			goto abort;
 		case 'a':
 		case 'S': /* --add-spare */
+		case 'j': /* --add-journal */
 		case 'A':
 		case 'M': /* --re-add missing */
 		case 'F': /* --re-add faulty  */
+		case 'c': /* --cluster-confirm */
 			/* add the device */
 			if (subarray) {
 				pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
 				goto abort;
 			}
+
+			/* Let's first try to write re-add to sysfs */
+			if (rdev != 0 &&
+			    (dv->disposition == 'A' || dv->disposition == 'F')) {
+				sysfs_init_dev(&devinfo, rdev);
+				if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) {
+					pr_err("re-add %s to %s succeed\n",
+						dv->devname, info.sys_name);
+					break;
+				}
+			}
+
 			if (dv->disposition == 'F')
 				/* Need to remove first */
 				ioctl(fd, HOT_REMOVE_DISK, rdev);
@@ -1505,7 +1594,7 @@ int Manage_subdevs(char *devname, int fd,
 			}
 			rv = Manage_add(fd, tfd, dv, tst, &array,
 					force, verbose, devname, update,
-					rdev, array_size);
+					rdev, array_size, raid_slot);
 			close(tfd);
 			tfd = -1;
 			if (rv < 0)
diff --git a/ReadMe.c b/ReadMe.c
index a05c74ec..d3fcb613 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2015 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
@@ -25,10 +25,10 @@
 #include "mdadm.h"
 
 #ifndef VERSION
-#define VERSION "3.3.4"
+#define VERSION "3.4"
 #endif
 #ifndef VERS_DATE
-#define VERS_DATE "3rd August 2015"
+#define VERS_DATE "28th January 2016"
 #endif
 char Version[] = "mdadm - v" VERSION " - " VERS_DATE "\n";
 
@@ -140,6 +140,9 @@ struct option long_options[] = {
     {"homehost",  1, 0,  HomeHost},
     {"symlinks",  1, 0,  Symlinks},
     {"data-offset",1, 0, DataOffset},
+    {"nodes",1, 0, Nodes}, /* also for --assemble */
+    {"home-cluster",1, 0, ClusterName},
+    {"write-journal",1, 0, WriteJournal},
 
     /* For assemble */
     {"uuid",      1, 0, 'u'},
@@ -154,6 +157,7 @@ struct option long_options[] = {
     /* Management */
     {"add",       0, 0, Add},
     {"add-spare", 0, 0, AddSpare},
+    {"add-journal", 0, 0, AddJournal},
     {"remove",    0, 0, Remove},
     {"fail",      0, 0, Fail},
     {"set-faulty",0, 0, Fail},
@@ -167,6 +171,7 @@ struct option long_options[] = {
     {"wait",	  0, 0,  WaitOpt},
     {"wait-clean", 0, 0, Waitclean },
     {"action",    1, 0, Action },
+    {"cluster-confirm", 0, 0, ClusterConfirm},
 
     /* For Detail/Examine */
     {"brief",	  0, 0, Brief},
@@ -372,6 +377,7 @@ char Help_create[] =
 "  --name=       -N   : Textual name for array - max 32 characters\n"
 "  --bitmap-chunk=    : bitmap chunksize in Kilobytes.\n"
 "  --delay=      -d   : bitmap update delay in seconds.\n"
+"  --write-journal=   : Specify journal device for RAID-4/5/6 array\n"
 "\n"
 ;
 
@@ -593,7 +599,7 @@ char Help_incr[] =
 ;
 
 char Help_config[] =
-"The /etc/mdadm/mdadm.conf config file:\n\n"
+"The /etc/mdadm.conf config file:\n\n"
 " The config file contains, apart from blank lines and comment lines that\n"
 " start with a hash(#), array lines, device lines, and various\n"
 " configuration lines.\n"
diff --git a/bitmap.c b/bitmap.c
index bbe9baec..dab674b4 100644
--- a/bitmap.c
+++ b/bitmap.c
@@ -32,6 +32,8 @@ static inline void sb_le_to_cpu(bitmap_super_t *sb)
 	sb->daemon_sleep = __le32_to_cpu(sb->daemon_sleep);
 	sb->sync_size = __le64_to_cpu(sb->sync_size);
 	sb->write_behind = __le32_to_cpu(sb->write_behind);
+	sb->nodes = __le32_to_cpu(sb->nodes);
+	sb->sectors_reserved = __le32_to_cpu(sb->sectors_reserved);
 }
 
 static inline void sb_cpu_to_le(bitmap_super_t *sb)
@@ -219,8 +221,12 @@ int bitmap_file_open(char *filename, struct supertype **stp)
 			pr_err("No bitmap possible with %s metadata\n",
 				st->ss->name);
 			return -1;
-		} else
-			st->ss->locate_bitmap(st, fd);
+		} else {
+			if (st->ss->locate_bitmap(st, fd)) {
+				pr_err("%s doesn't have bitmap\n", filename);
+				fd = -1;
+			}
+		}
 
 		*stp = st;
 	} else {
@@ -258,7 +264,7 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st)
 	int rv = 1;
 	char buf[64];
 	int swap;
-	int fd;
+	int fd, i;
 	__u32 uuid32[4];
 
 	fd = bitmap_file_open(filename, &st);
@@ -285,7 +291,7 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st)
 	}
 	printf("         Version : %d\n", sb->version);
 	if (sb->version < BITMAP_MAJOR_LO ||
-	    sb->version > BITMAP_MAJOR_HI) {
+	    sb->version > BITMAP_MAJOR_CLUSTERED) {
 		pr_err("unknown bitmap version %d, either the bitmap file\n",
 		       sb->version);
 		pr_err("is corrupted or you need to upgrade your tools\n");
@@ -315,9 +321,13 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st)
 		       uuid32[2],
 		       uuid32[3]);
 
-	printf("          Events : %llu\n", (unsigned long long)sb->events);
-	printf("  Events Cleared : %llu\n", (unsigned long long)sb->events_cleared);
-	printf("           State : %s\n", bitmap_state(sb->state));
+	if (sb->nodes == 0) {
+		printf("          Events : %llu\n", (unsigned long long)sb->events);
+		printf("  Events Cleared : %llu\n", (unsigned long long)sb->events_cleared);
+		printf("           State : %s\n", bitmap_state(sb->state));
+
+	}
+
 	printf("       Chunksize : %s\n", human_chunksize(sb->chunksize));
 	printf("          Daemon : %ds flush period\n", sb->daemon_sleep);
 	if (sb->write_behind)
@@ -327,11 +337,40 @@ int ExamineBitmap(char *filename, int brief, struct supertype *st)
 	printf("      Write Mode : %s\n", buf);
 	printf("       Sync Size : %llu%s\n", (unsigned long long)sb->sync_size/2,
 					human_size(sb->sync_size * 512));
-	if (brief)
-		goto free_info;
-	printf("          Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n",
-			info->total_bits, info->dirty_bits,
-			100.0 * info->dirty_bits / (info->total_bits?:1));
+
+	if (sb->nodes == 0) {
+		if (brief)
+			goto free_info;
+		printf("          Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n",
+		       info->total_bits, info->dirty_bits,
+		       100.0 * info->dirty_bits / (info->total_bits?:1));
+	} else {
+		printf("   Cluster nodes : %d\n", sb->nodes);
+		printf("    Cluster name : %-64s\n", sb->cluster_name);
+		for (i = 0; i < (int)sb->nodes; i++) {
+			if (i) {
+				free(info);
+				info = bitmap_fd_read(fd, brief);
+				sb = &info->sb;
+			}
+			if (sb->magic != BITMAP_MAGIC)
+				pr_err("invalid bitmap magic 0x%x, the bitmap file appears to be corrupted\n", sb->magic);
+
+			printf("       Node Slot : %d\n", i);
+			printf("          Events : %llu\n",
+			       (unsigned long long)sb->events);
+			printf("  Events Cleared : %llu\n",
+			       (unsigned long long)sb->events_cleared);
+			printf("           State : %s\n", bitmap_state(sb->state));
+			if (brief)
+				continue;
+			printf("          Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n",
+			       info->total_bits, info->dirty_bits,
+			       100.0 * info->dirty_bits / (info->total_bits?:1));
+
+		}
+	}
+
 free_info:
 	free(info);
 	return rv;
diff --git a/bitmap.h b/bitmap.h
index c8725a30..b8fb0714 100644
--- a/bitmap.h
+++ b/bitmap.h
@@ -12,6 +12,7 @@
  */
 #define BITMAP_MAJOR_HI 4
 #define	BITMAP_MAJOR_HOSTENDIAN 3
+#define	BITMAP_MAJOR_CLUSTERED 5
 
 #define BITMAP_MINOR 39
 
@@ -154,8 +155,11 @@ typedef struct bitmap_super_s {
 	__u32 chunksize;    /* 52  the bitmap chunk size in bytes */
 	__u32 daemon_sleep; /* 56  seconds between disk flushes */
 	__u32 write_behind; /* 60  number of outstanding write-behind writes */
-
-	__u8  pad[256 - 64]; /* set to zero */
+	__u32 sectors_reserved; /* 64 number of 512-byte sectors that are
+				 * reserved for the bitmap. */
+	__u32 nodes;        /* 68 the maximum number of nodes in cluster. */
+	__u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
+	__u8  pad[256 - 136]; /* set to zero */
 } bitmap_super_t;
 
 /* notes:
diff --git a/config.c b/config.c
index a882ed33..b308b6cc 100644
--- a/config.c
+++ b/config.c
@@ -63,6 +63,9 @@
  * but may not wrap over lines
  *
  */
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#endif
 
 #ifndef CONFFILE
 #define CONFFILE "/etc/mdadm.conf"
@@ -77,7 +80,7 @@ char DefaultAltConfFile[] = CONFFILE2;
 char DefaultAltConfDir[] = CONFFILE2 ".d";
 
 enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev,
-		Homehost, AutoMode, Policy, PartPolicy, LTEnd };
+		Homehost, HomeCluster, AutoMode, Policy, PartPolicy, LTEnd };
 char *keywords[] = {
 	[Devices]  = "devices",
 	[Array]    = "array",
@@ -86,6 +89,7 @@ char *keywords[] = {
 	[Program]  = "program",
 	[CreateDev]= "create",
 	[Homehost] = "homehost",
+	[HomeCluster] = "homecluster",
 	[AutoMode] = "auto",
 	[Policy]   = "policy",
 	[PartPolicy]="part-policy",
@@ -562,6 +566,21 @@ void homehostline(char *line)
 	}
 }
 
+static char *home_cluster = NULL;
+void homeclusterline(char *line)
+{
+	char *w;
+
+	for (w=dl_next(line); w != line ; w=dl_next(w)) {
+		if (home_cluster == NULL) {
+			if (strcasecmp(w, "<none>")==0)
+				home_cluster = xstrdup("");
+			else
+				home_cluster = xstrdup(w);
+		}
+	}
+}
+
 char auto_yes[] = "yes";
 char auto_no[] = "no";
 char auto_homehost[] = "homehost";
@@ -724,6 +743,9 @@ void conf_file(FILE *f)
 		case Homehost:
 			homehostline(line);
 			break;
+		case HomeCluster:
+			homeclusterline(line);
+			break;
 		case AutoMode:
 			autoline(line);
 			break;
@@ -884,6 +906,12 @@ char *conf_get_homehost(int *require_homehostp)
 	return home_host;
 }
 
+char *conf_get_homecluster(void)
+{
+	load_conffile();
+	return home_cluster;
+}
+
 struct createinfo *conf_get_create_info(void)
 {
 	load_conffile();
diff --git a/crc32c.c b/crc32c.c
new file mode 100644
index 00000000..156cba19
--- /dev/null
+++ b/crc32c.c
@@ -0,0 +1,104 @@
+/*
+ * Oct 28, 2015 Song Liu simplified the code and ported it to mdadm
+ *
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
+ * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Nicer crc32 functions/docs submitted by linux@horizon.com.  Thanks!
+ * Code was from the public domain, copyright abandoned.  Code was
+ * subsequently included in the kernel, thus was re-licensed under the
+ * GNU GPL v2.
+ *
+ * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Same crc32 function was used in 5 other places in the kernel.
+ * I made one version, and deleted the others.
+ * There are various incantations of crc32().  Some use a seed of 0 or ~0.
+ * Some xor at the end with ~0.  The generic crc32() function takes
+ * seed as an argument, and doesn't xor at the end.  Then individual
+ * users can do whatever they need.
+ *   drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
+ *   fs/jffs2 uses seed 0, doesn't xor with ~0.
+ *   fs/partitions/efi.c uses seed ~0, xor's with ~0.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2.  See the file COPYING for more details.
+ */
+
+#include <sys/types.h>
+#include <asm/types.h>
+#include <stdlib.h>
+
+/*
+ * There are multiple 16-bit CRC polynomials in common use, but this is
+ * *the* standard CRC-32 polynomial, first popularized by Ethernet.
+ * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0
+ */
+#define CRCPOLY_LE 0xedb88320
+#define CRCPOLY_BE 0x04c11db7
+
+/*
+ * This is the CRC32c polynomial, as outlined by Castagnoli.
+ * x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+x^19+x^18+x^14+x^13+x^11+x^10+x^9+
+ * x^8+x^6+x^0
+ */
+#define CRC32C_POLY_LE 0x82F63B78
+
+/**
+ * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II
+ *			CRC32/CRC32C
+ * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for other
+ *	 uses, or the previous crc32/crc32c value if computing incrementally.
+ * @p: pointer to buffer over which CRC32/CRC32C is run
+ * @len: length of buffer @p
+ * @polynomial: CRC32/CRC32c LE polynomial
+ */
+static inline __u32 crc32_le_generic(__u32 crc, unsigned char const *p,
+				     size_t len, __u32 polynomial)
+{
+	int i;
+	while (len--) {
+		crc ^= *p++;
+		for (i = 0; i < 8; i++)
+			crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0);
+	}
+	return crc;
+}
+
+__u32 crc32_le(__u32 crc, unsigned char const *p, size_t len)
+{
+	return crc32_le_generic(crc, p, len, CRCPOLY_LE);
+}
+
+__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len)
+{
+	return crc32_le_generic(crc, p, len, CRC32C_POLY_LE);
+}
+
+/**
+ * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
+ * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
+ *	other uses, or the previous crc32 value if computing incrementally.
+ * @p: pointer to buffer over which CRC32 is run
+ * @len: length of buffer @p
+ * @polynomial: CRC32 BE polynomial
+ */
+static inline __u32 crc32_be_generic(__u32 crc, unsigned char const *p,
+				     size_t len, __u32 polynomial)
+{
+	int i;
+	while (len--) {
+		crc ^= *p++ << 24;
+		for (i = 0; i < 8; i++)
+			crc =
+			    (crc << 1) ^ ((crc & 0x80000000) ? polynomial :
+					  0);
+	}
+	return crc;
+}
+
+__u32 crc32_be(__u32 crc, unsigned char const *p, size_t len)
+{
+	return crc32_be_generic(crc, p, len, CRCPOLY_BE);
+}
diff --git a/debian/changelog b/debian/changelog
index cd2f9c2b..36762287 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+mdadm (3.4-1) unstable; urgency=medium
+
+  * New upstream release.
+
+ -- Dimitri John Ledkov <xnox@ubuntu.com>  Fri, 19 Feb 2016 16:18:36 +0000
+
 mdadm (3.3.4-1.1) unstable; urgency=medium
 
   * Non-maintainer upload.
diff --git a/inventory b/inventory
index a9fc3c01..ace5df04 100755
--- a/inventory
+++ b/inventory
@@ -22,6 +22,7 @@ ANNOUNCE-3.3.1
 ANNOUNCE-3.3.2
 ANNOUNCE-3.3.3
 ANNOUNCE-3.3.4
+ANNOUNCE-3.4
 Assemble.c
 Build.c
 COPYING
@@ -46,6 +47,7 @@ bitmap.h
 config.c
 crc32.c
 crc32.h
+crc32c.c
 dlink.c
 dlink.h
 external-reshape-design.txt
@@ -239,6 +241,7 @@ tests/19raid6auto-repair
 tests/19raid6check
 tests/19raid6repair
 tests/19repair-does-not-destroy
+tests/20raid5journal
 tests/ToTest
 tests/check
 tests/env-ddf-template
diff --git a/mapfile.c b/mapfile.c
index 41599df0..243ded18 100644
--- a/mapfile.c
+++ b/mapfile.c
@@ -176,7 +176,7 @@ void map_read(struct map_ent **melp)
 {
 	FILE *f;
 	char buf[8192];
-	char path[200];
+	char path[201];
 	int uuid[4];
 	char devnm[32];
 	char metadata[30];
diff --git a/md.4 b/md.4
index e955c3b4..f1b88ee6 100644
--- a/md.4
+++ b/md.4
@@ -874,6 +874,26 @@ The list is particularly useful when recovering to a spare.  If a few blocks
 cannot be read from the other devices, the bulk of the recovery can
 complete and those few bad blocks will be recorded in the bad block list.
 
+.SS RAID456 WRITE JOURNAL
+
+Due to the non-atomic nature of RAID write operations, interruption of
+write operations (system crash, etc.) to a RAID456 array can lead to
+inconsistent parity and data loss (the so-called RAID-5 write hole).
+
+To plug the write hole, from Linux 4.4 (to be confirmed),
+.I md
+supports a write-ahead journal for RAID456. When the array is created,
+an additional journal device can be added to the array through the
+.IR write-journal
+option. The RAID write journal works similarly to file system journals.
+Before writing to the data disks, md persists data AND parity of the
+stripe to the journal device. After a crash, md searches the journal
+device for incomplete write operations, and replays them to the data
+disks.
+
+When the journal device fails, the RAID array is forced to run in
+read-only mode.
+
 .SS WRITE-BEHIND
 
 From Linux 2.6.14,
diff --git a/md_p.h b/md_p.h
index c4846bab..0d691fbc 100644
--- a/md_p.h
+++ b/md_p.h
@@ -78,6 +78,12 @@
 #define MD_DISK_ACTIVE		1 /* disk is running but may not be in sync */
 #define MD_DISK_SYNC		2 /* disk is in sync with the raid set */
 #define MD_DISK_REMOVED		3 /* disk is in sync with the raid set */
+#define MD_DISK_CLUSTER_ADD	4 /* Initiate a disk add across the cluster
+				   * For clustered environments only.
+				   */
+#define MD_DISK_CANDIDATE	5 /* disk is added as spare (local) until confirmed
+				   * For clustered environments only.
+				   */
 
 #define	MD_DISK_WRITEMOSTLY	9 /* disk is "write-mostly" is RAID1 config.
 				   * read requests will only be sent here in
@@ -85,6 +91,12 @@
 				   */
 
 #define MD_DISK_REPLACEMENT	17
+#define MD_DISK_JOURNAL		18 /* disk is used as the write journal in RAID-5/6 */
+
+#define MD_DISK_ROLE_SPARE	0xffff
+#define MD_DISK_ROLE_FAULTY	0xfffe
+#define MD_DISK_ROLE_JOURNAL	0xfffd
+#define MD_DISK_ROLE_MAX	0xff00 /* max value of regular disk role */
 
 typedef struct mdp_device_descriptor_s {
 	__u32 number;		/* 0 Device number in the entire set	      */
@@ -106,6 +118,7 @@ typedef struct mdp_device_descriptor_s {
 #define MD_SB_BLOCK_CONTAINER_RESHAPE 3 /* block container wide reshapes */
 #define MD_SB_BLOCK_VOLUME	4 /* block activation of array, other arrays
 				   * in container can be activated */
+#define MD_SB_CLUSTERED		5 /* MD is clustered  */
 #define	MD_SB_BITMAP_PRESENT	8 /* bitmap may be present nearby */
 
 typedef struct mdp_superblock_s {
@@ -195,4 +208,62 @@ static inline __u64 md_event(mdp_super_t *sb) {
 	return (ev<<32)| sb->events_lo;
 }
 
+struct r5l_payload_header {
+	__u16 type;
+	__u16 flags;
+} __attribute__ ((__packed__));
+
+enum r5l_payload_type {
+	R5LOG_PAYLOAD_DATA = 0,
+	R5LOG_PAYLOAD_PARITY = 1,
+	R5LOG_PAYLOAD_FLUSH = 2,
+};
+
+struct r5l_payload_data_parity {
+	struct r5l_payload_header header;
+	__u32 size; /* sector. data/parity size. each 4k has a checksum */
+	__u64 location; /* sector. For data, it's raid sector. For
+				parity, it's stripe sector */
+	__u32 checksum[];
+} __attribute__ ((__packed__));
+
+enum r5l_payload_data_parity_flag {
+	R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */
+	/*
+	 * RESHAPED/RESHAPING is only set when there is reshape activity. Note,
+	 * both data/parity of a stripe should have the same flag set
+	 *
+	 * RESHAPED: reshape is running, and this stripe finished reshape
+	 * RESHAPING: reshape is running, and this stripe isn't reshaped
+	 * */
+	R5LOG_PAYLOAD_FLAG_RESHAPED = 2,
+	R5LOG_PAYLOAD_FLAG_RESHAPING = 3,
+};
+
+struct r5l_payload_flush {
+	struct r5l_payload_header header;
+	__u32 size; /* flush_stripes size, bytes */
+	__u64 flush_stripes[];
+} __attribute__ ((__packed__));
+
+enum r5l_payload_flush_flag {
+	R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */
+};
+
+struct r5l_meta_block {
+	__u32 magic;
+	__u32 checksum;
+	__u8 version;
+	__u8 __zero_pading_1;
+	__u16 __zero_pading_2;
+	__u32 meta_size; /* whole size of the block */
+
+	__u64 seq;
+	__u64 position; /* sector, start from rdev->data_offset, current position */
+	struct r5l_payload_header payloads[];
+} __attribute__ ((__packed__));
+
+#define R5LOG_VERSION 0x1
+#define R5LOG_MAGIC 0x6433c509
+
 #endif
diff --git a/md_u.h b/md_u.h
index be9868a7..f570a346 100644
--- a/md_u.h
+++ b/md_u.h
@@ -44,6 +44,7 @@
 #define STOP_ARRAY		_IO (MD_MAJOR, 0x32)
 #define STOP_ARRAY_RO		_IO (MD_MAJOR, 0x33)
 #define RESTART_ARRAY_RW	_IO (MD_MAJOR, 0x34)
+#define CLUSTERED_DISK_NACK	_IO (MD_MAJOR, 0x35)
 
 typedef struct mdu_version_s {
 	int major;
@@ -58,7 +59,7 @@ typedef struct mdu_array_info_s {
 	int major_version;
 	int minor_version;
 	int patch_version;
-	int ctime;
+	unsigned int ctime;
 	int level;
 	int size;
 	int nr_disks;
@@ -69,7 +70,7 @@ typedef struct mdu_array_info_s {
 	/*
 	 * Generic state information
 	 */
-	int utime;		/*  0 Superblock update time		      */
+	unsigned int utime;	/*  0 Superblock update time		      */
 	int state;		/*  1 State bits (clean, ...)		      */
 	int active_disks;	/*  2 Number of currently active disks	      */
 	int working_disks;	/*  3 Number of working disks		      */
diff --git a/mdadm.8.in b/mdadm.8.in
index 14bd8b99..50be1aa8 100644
--- a/mdadm.8.in
+++ b/mdadm.8.in
@@ -5,7 +5,7 @@
 .\"   the Free Software Foundation; either version 2 of the License, or
 .\"   (at your option) any later version.
 .\" See file COPYING in distribution for details.
-.TH MDADM 8 "" v3.3.4
+.TH MDADM 8 "" v3.4
 .SH NAME
 mdadm \- manage MD devices
 .I aka
@@ -267,13 +267,13 @@ the exact meaning of this option in different contexts.
 .TP
 .BR \-c ", " \-\-config=
 Specify the config file or directory.  Default is to use
-.B /etc/mdadm/mdadm.conf
+.B /etc/mdadm.conf
 and
-.BR /etc/mdadm/mdadm.conf.d ,
+.BR /etc/mdadm.conf.d ,
 or if those are missing then
-.B /etc/mdadm.conf
+.B /etc/mdadm/mdadm.conf
 and
-.BR /etc/mdadm.conf.d .
+.BR /etc/mdadm/mdadm.conf.d .
 If the config file given is
 .B "partitions"
 then nothing will be read, but
@@ -422,6 +422,12 @@ This functionality is currently only provided by
 and
 .BR \-\-monitor .
 
+.TP
+.B \-\-home\-cluster=
+specifies the cluster name for the md device. The md device can be assembled
+only on the cluster which matches the name specified. If this option is not
+provided, mdadm tries to detect the cluster name automatically.
+
 .SH For create, build, or grow:
 
 .TP
@@ -701,7 +707,12 @@ and so is replicated on all devices.  If the word
 .B "none"
 is given with
 .B \-\-grow
-mode, then any bitmap that is present is removed.
+mode, then any bitmap that is present is removed. If the word
+.B "clustered"
+is given, the array is created for a clustered environment. One bitmap
+is created for each node as defined by the
+.B \-\-nodes
+parameter, and the bitmaps are stored internally.
 
 To help catch typing errors, the filename must contain at least one
 slash ('/') if it is a real file (not 'internal' or 'none').
@@ -973,6 +984,18 @@ However for RAID0, it is not possible to add spares.  So to increase
 the number of devices in a RAID0, it is necessary to set the new
 number of devices, and to add the new devices, in the same command.
 
+.TP
+.BR \-\-nodes
+Only works when the array is for a clustered environment. It specifies
+the maximum number of nodes in the cluster that will use this device
+simultaneously. If not specified, this defaults to 4.
+
+.TP
+.BR \-\-write-journal
+Specify journal device for the RAID-4/5/6 array. The journal device
+should be an SSD with a reasonable lifetime.
+
+
 .SH For assemble:
 
 .TP
@@ -1087,7 +1110,9 @@ argument given to this flag can be one of
 .BR summaries ,
 .BR uuid ,
 .BR name ,
+.BR nodes ,
 .BR homehost ,
+.BR home-cluster ,
 .BR resync ,
 .BR byteorder ,
 .BR devicesize ,
@@ -1142,6 +1167,13 @@ of the array as stored in the superblock.  This is only supported for
 version-1 superblocks.
 
 The
+.B nodes
+option will change the
+.I nodes
+of the array as stored in the bitmap superblock. This option only
+works for a clustered environment.
+
+The
 .B homehost
 option will change the
 .I homehost
@@ -1150,6 +1182,11 @@ same as updating the UUID.
 For version-1 superblocks, this involves updating the name.
 
 The
+.B home\-cluster
+option will change the cluster name as recorded in the superblock and
+bitmap. This option only works for a clustered environment.
+
+The
 .B resync
 option will cause the array to be marked
 .I dirty
@@ -1396,6 +1433,15 @@ will avoid reading from these devices if possible.
 .BR \-\-readwrite
 Subsequent devices that are added or re\-added will have the 'write-mostly'
 flag cleared.
+.TP
+.BR \-\-cluster\-confirm
+Confirm the existence of the device. This is issued in response to an \-\-add
+request by a node in a cluster. When a node adds a device it sends a message
+to all nodes in the cluster to look for a device with a UUID. This translates
+to a udev notification with the UUID of the device to be added and the slot
+number. The receiving node must acknowledge this message
+with \-\-cluster\-confirm. Valid arguments are <slot>:<devicename> in case
+the device is found or <slot>:missing in case the device is not found.
 
 .P
 Each of these options requires that the first device listed is the array
@@ -1803,9 +1849,9 @@ The config file is only used if explicitly named with
 or requested with (a possibly implicit)
 .BR \-\-scan .
 In the later case,
-.B /etc/mdadm/mdadm.conf
-or
 .B /etc/mdadm.conf
+or
+.B /etc/mdadm/mdadm.conf
 is used.
 
 If
@@ -3099,7 +3145,7 @@ uses this to find arrays when
 is given in Misc mode, and to monitor array reconstruction
 on Monitor mode.
 
-.SS /etc/mdadm/mdadm.conf (or /etc/mdadm.conf)
+.SS /etc/mdadm.conf
 
 The config file lists which devices may be scanned to see if
 they contain MD super block, and gives identifying information
@@ -3107,7 +3153,7 @@ they contain MD super block, and gives identifying information
 .BR mdadm.conf (5)
 for more details.
 
-.SS /etc/mdadm/mdadm.conf.d (or /etc/mdadm.conf.d)
+.SS /etc/mdadm.conf.d
 
 A directory containing configuration files which are read in lexical
 order.
diff --git a/mdadm.c b/mdadm.c
index 93732a8f..51e16f3f 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -74,6 +74,7 @@ int main(int argc, char *argv[])
 		.require_homehost = 1,
 	};
 	struct shape s = {
+		.journaldisks	= 0,
 		.level		= UnSet,
 		.layout		= UnSet,
 		.bitmap_chunk	= UnSet,
@@ -189,6 +190,7 @@ int main(int argc, char *argv[])
 		case 'a':
 		case Add:
 		case AddSpare:
+		case AddJournal:
 		case 'r':
 		case Remove:
 		case Replace:
@@ -196,6 +198,7 @@ int main(int argc, char *argv[])
 		case 'f':
 		case Fail:
 		case ReAdd: /* re-add */
+		case ClusterConfirm:
 			if (!mode) {
 				newmode = MANAGE;
 				shortopt = short_bitmap_options;
@@ -588,7 +591,23 @@ int main(int argc, char *argv[])
 			}
 			ident.raid_disks = s.raiddisks;
 			continue;
-
+		case O(ASSEMBLE, Nodes):
+		case O(CREATE, Nodes):
+			c.nodes = parse_num(optarg);
+			if (c.nodes <= 0) {
+				pr_err("invalid number for the number of cluster nodes: %s\n",
+					optarg);
+				exit(2);
+			}
+			continue;
+		case O(CREATE, ClusterName):
+		case O(ASSEMBLE, ClusterName):
+			c.homecluster = optarg;
+			if (strlen(c.homecluster) > 64) {
+				pr_err("Cluster name too big.\n");
+				exit(ERANGE);
+			}
+			continue;
 		case O(CREATE,'x'): /* number of spare (eXtra) disks */
 			if (s.sparedisks) {
 				pr_err("spare-devices set twice: %d and %s\n",
@@ -726,6 +745,10 @@ int main(int argc, char *argv[])
 				continue;
 			if (strcmp(c.update, "homehost")==0)
 				continue;
+			if (strcmp(c.update, "home-cluster")==0)
+				continue;
+			if (strcmp(c.update, "nodes")==0)
+				continue;
 			if (strcmp(c.update, "devicesize")==0)
 				continue;
 			if (strcmp(c.update, "no-bitmap")==0)
@@ -734,6 +757,8 @@ int main(int argc, char *argv[])
 				continue;
 			if (strcmp(c.update, "no-bbl") == 0)
 				continue;
+			if (strcmp(c.update, "force-no-bbl") == 0)
+				continue;
 			if (strcmp(c.update, "metadata") == 0)
 				continue;
 			if (strcmp(c.update, "revert-reshape") == 0)
@@ -764,10 +789,10 @@ int main(int argc, char *argv[])
 					Name, c.update);
 			}
 			fprintf(outf, "Valid --update options are:\n"
-		"     'sparc2.2', 'super-minor', 'uuid', 'name', 'resync',\n"
-		"     'summaries', 'homehost', 'byteorder', 'devicesize',\n"
+		"     'sparc2.2', 'super-minor', 'uuid', 'name', 'nodes', 'resync',\n"
+		"     'summaries', 'homehost', 'home-cluster', 'byteorder', 'devicesize',\n"
 		"     'no-bitmap', 'metadata', 'revert-reshape'\n"
-		"     'bbl', 'no-bbl'\n"
+		"     'bbl', 'no-bbl', 'force-no-bbl'\n"
 				);
 			exit(outf == stdout ? 0 : 2);
 
@@ -785,8 +810,9 @@ int main(int argc, char *argv[])
 			c.update = optarg;
 			if (strcmp(c.update, "devicesize") != 0 &&
 			    strcmp(c.update, "bbl") != 0 &&
+			    strcmp(c.update, "force-no-bbl") != 0 &&
 			    strcmp(c.update, "no-bbl") != 0) {
-				pr_err("only 'devicesize', 'bbl' and 'no-bbl' can be updated with --re-add\n");
+				pr_err("only 'devicesize', 'bbl', 'no-bbl', and 'force-no-bbl' can be updated with --re-add\n");
 				exit(2);
 			}
 			continue;
@@ -903,6 +929,13 @@ int main(int argc, char *argv[])
 		case O(MANAGE,AddSpare): /* add drive - never re-add */
 			devmode = 'S';
 			continue;
+		case O(MANAGE,AddJournal): /* add journal */
+			if (s.journaldisks && (s.level < 4 || s.level > 6)) {
+				pr_err("--add-journal is only supported for RAID level 4/5/6.\n");
+				exit(2);
+			}
+			devmode = 'j';
+			continue;
 		case O(MANAGE,ReAdd):
 			devmode = 'A';
 			continue;
@@ -919,6 +952,9 @@ int main(int argc, char *argv[])
 					   * remove the device */
 			devmode = 'f';
 			continue;
+		case O(MANAGE, ClusterConfirm):
+			devmode = 'c';
+			continue;
 		case O(MANAGE,Replace):
 			/* Mark these devices for replacement */
 			devmode = 'R';
@@ -1097,6 +1133,15 @@ int main(int argc, char *argv[])
 				s.bitmap_file = optarg;
 				continue;
 			}
+			if (strcmp(optarg, "clustered")== 0) {
+				s.bitmap_file = optarg;
+				/* Set the default number of cluster nodes
+				 * to 4 if not already set by user
+				 */
+				if (c.nodes < 1)
+					c.nodes = 4;
+				continue;
+			}
 			/* probable typo */
 			pr_err("bitmap file must contain a '/', or be 'internal', or 'none'\n"
 				"       not '%s'\n", optarg);
@@ -1137,6 +1182,23 @@ int main(int argc, char *argv[])
 		case O(INCREMENTAL, IncrementalPath):
 			remove_path = optarg;
 			continue;
+		case O(CREATE, WriteJournal):
+			if (s.journaldisks) {
+				pr_err("Please specify only one journal device for the array.\n");
+				pr_err("Ignoring --write-journal %s...\n", optarg);
+				continue;
+			}
+			dv = xmalloc(sizeof(*dv));
+			dv->devname = optarg;
+			dv->disposition = 'j';  /* WriteJournal */
+			dv->used = 0;
+			dv->next = NULL;
+			*devlistend = dv;
+			devlistend = &dv->next;
+			devs_found++;
+
+			s.journaldisks = 1;
+			continue;
 		}
 		/* We have now processed all the valid options. Anything else is
 		 * an error
@@ -1164,6 +1226,11 @@ int main(int argc, char *argv[])
 		exit(0);
 	}
 
+	if (s.journaldisks && (s.level < 4 || s.level > 6)) {
+		pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
+		exit(2);
+	}
+
 	if (!mode && devs_found) {
 		mode = MISC;
 		devmode = 'Q';
@@ -1260,6 +1327,20 @@ int main(int argc, char *argv[])
 		c.require_homehost = 0;
 	}
 
+	rv = 0;
+
+	set_hooks(); /* set hooks from libs */
+
+	if (c.homecluster == NULL && (c.nodes > 0)) {
+		c.homecluster = conf_get_homecluster();
+		if (c.homecluster == NULL)
+			rv = get_cluster_name(&c.homecluster);
+		if (rv) {
+			pr_err("The md can't get cluster name\n");
+			exit(1);
+		}
+	}
+
 	if (c.backup_file && data_offset != INVALID_SECTORS) {
 		pr_err("--backup-file and --data-offset are incompatible\n");
 		exit(2);
@@ -1279,7 +1360,6 @@ int main(int argc, char *argv[])
 		/* --scan implied --brief unless -vv */
 		c.brief = 1;
 
-	rv = 0;
 	switch(mode) {
 	case MANAGE:
 		/* readonly, add/remove, readwrite, runstop */
@@ -1366,8 +1446,9 @@ int main(int argc, char *argv[])
 		}
 
 		if (s.bitmap_file) {
-			if (strcmp(s.bitmap_file, "internal")==0) {
-				pr_err("'internal' bitmaps not supported with --build\n");
+			if (strcmp(s.bitmap_file, "internal")==0 ||
+			    strcmp(s.bitmap_file, "clustered") == 0) {
+				pr_err("'internal' and 'clustered' bitmaps not supported with --build\n");
 				rv |= 1;
 				break;
 			}
@@ -1377,6 +1458,21 @@ int main(int argc, char *argv[])
 	case CREATE:
 		if (c.delay == 0)
 			c.delay = DEFAULT_BITMAP_DELAY;
+
+		if (c.nodes) {
+			if (!s.bitmap_file || strcmp(s.bitmap_file, "clustered") != 0) {
+				pr_err("--nodes argument only compatible with --bitmap=clustered\n");
+				rv = 1;
+				break;
+			}
+
+			if (s.level != 1) {
+				pr_err("--bitmap=clustered is currently supported with RAID mirror only\n");
+				rv = 1;
+				break;
+			}
+		}
+
 		if (s.write_behind && !s.bitmap_file) {
 			pr_err("write-behind mode requires a bitmap.\n");
 			rv = 1;
@@ -1442,8 +1538,6 @@ int main(int argc, char *argv[])
 			else
 				c.delay = 60;
 		}
-		if (c.delay == 0)
-			c.delay = 60;
 		rv= Monitor(devlist, mailaddr, program,
 			    &c, daemonise, oneshot,
 			    dosyslog, pidfile, increments,
diff --git a/mdadm.conf.5 b/mdadm.conf.5
index 542e2635..18512cb0 100644
--- a/mdadm.conf.5
+++ b/mdadm.conf.5
@@ -8,7 +8,7 @@
 .SH NAME
 mdadm.conf \- configuration for management of Software RAID with mdadm
 .SH SYNOPSIS
-/etc/mdadm/mdadm.conf
+/etc/mdadm.conf
 .SH DESCRIPTION
 .PP
 .I mdadm
diff --git a/mdadm.h b/mdadm.h
index b5976582..dd02be71 100644..100755
--- a/mdadm.h
+++ b/mdadm.h
@@ -35,6 +35,7 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 
 #include	<sys/types.h>
 #include	<sys/stat.h>
+#include	<stdint.h>
 #include	<stdlib.h>
 #include	<time.h>
 #include	<sys/time.h>
@@ -51,6 +52,32 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #define srandom srand
 #endif
 
+#ifdef NO_COROSYNC
+#define CS_OK 1
+typedef uint64_t cmap_handle_t;
+#else
+#include	<corosync/cmap.h>
+#endif
+
+#ifndef NO_DLM
+#include	<libdlm.h>
+#include	<errno.h>
+#else
+#define LKF_NOQUEUE	0x00000001
+#define LKF_CONVERT	0x00000004
+#define LKM_PWMODE	4
+#define EUNLOCK		0x10002
+
+typedef void *dlm_lshandle_t;
+
+struct dlm_lksb {
+	int sb_status;
+	uint32_t sb_lkid;
+	char sb_flags;
+	char *sb_lvbptr;
+};
+#endif
+
 #include	<linux/kdev_t.h>
 /*#include	<linux/fs.h> */
 #include	<sys/mount.h>
@@ -162,6 +189,31 @@ extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
 #endif /* __KLIBC__ */
 
 /*
+  * Check at compile time that something is of a particular type.
+  * Always evaluates to 1 so you may use it easily in comparisons.
+*/
+
+#define typecheck(type,x) \
+({	   type __dummy; \
+	   typeof(x) __dummy2; \
+	   (void)(&__dummy == &__dummy2); \
+	   1; \
+})
+
+/*
+ *  These macros deal with timer wrapping correctly.
+ *
+ * time_after(a,b) returns true if the time a is after time b.
+*/
+
+#define time_after(a,b)	\
+        (typecheck(unsigned int, a) && \
+         typecheck(unsigned int, b) && \
+         ((int)((b) - (a)) < 0))
+
+#define time_before(a,b)        time_after(b,a)
+
+/*
  * min()/max()/clamp() macros that also do
  * strict type-checking.. See the
  * "unnecessary" pointer comparison.
@@ -210,6 +262,9 @@ struct mdinfo {
 						   * for native metadata it is
 						   * reshape_active field mirror
 						   */
+	int journal_device_required;
+	int journal_clean;
+
 	/* During reshape we can sometimes change the data_offset to avoid
 	 * over-writing still-valid data.  We need to know if there is space.
 	 * So getinfo_super will fill in space_before and space_after in sectors.
@@ -251,6 +306,8 @@ struct mdinfo {
 	#define	DS_UNBLOCK	2048
 	int prev_state, curr_state, next_state;
 
+	/* info read from sysfs */
+	char		sysfs_array_state[20];
 };
 
 struct createinfo {
@@ -313,6 +370,7 @@ enum special_options {
 	ManageOpt,
 	Add,
 	AddSpare,
+	AddJournal,
 	Remove,
 	Fail,
 	Replace,
@@ -344,6 +402,10 @@ enum special_options {
 	Dump,
 	Restore,
 	Action,
+	Nodes,
+	ClusterName,
+	ClusterConfirm,
+	WriteJournal,
 };
 
 enum prefix_standard {
@@ -351,6 +413,12 @@ enum prefix_standard {
 	IEC
 };
 
+enum bitmap_update {
+    NoUpdate,
+    NameUpdate,
+    NodeNumUpdate,
+};
+
 /* structures read from config file */
 /* List of mddevice names and identifiers
  * Identifiers can be:
@@ -418,11 +486,14 @@ struct context {
 	char	*backup_file;
 	int	invalid_backup;
 	char	*action;
+	int	nodes;
+	char	*homecluster;
 };
 
 struct shape {
 	int	raiddisks;
 	int	sparedisks;
+	int	journaldisks;
 	int	level;
 	int	layout;
 	char	*layout_str;
@@ -521,6 +592,7 @@ enum sysfs_read_flags {
 	GET_SIZE	= (1 << 22),
 	GET_STATE	= (1 << 23),
 	GET_ERROR	= (1 << 24),
+	GET_ARRAY_STATE = (1 << 25),
 };
 
 /* If fd >= 0, get the array it is open on,
@@ -528,6 +600,7 @@ enum sysfs_read_flags {
  */
 extern int sysfs_open(char *devnm, char *devname, char *attr);
 extern void sysfs_init(struct mdinfo *mdi, int fd, char *devnm);
+extern void sysfs_init_dev(struct mdinfo *mdi, unsigned long devid);
 extern void sysfs_free(struct mdinfo *sra);
 extern struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options);
 extern int sysfs_attr_match(const char *attr, const char *str);
@@ -747,7 +820,8 @@ extern struct superswitch {
 	 *   readwrite - clear the WriteMostly1 bit in the superblock devflags
 	 *   no-bitmap - clear any record that a bitmap is present.
 	 *   bbl       - add a bad-block-log if possible
-	 *   no-bbl    - remove and bad-block-log is it is empty.
+	 *   no-bbl    - remove any bad-block-log if it is empty.
+	 *   force-no-bbl - remove any bad-block-log even if it is not empty.
 	 *   revert-reshape - If a reshape is in progress, modify metadata so
 	 *                    it will resume going in the opposite direction.
 	 */
@@ -830,11 +904,11 @@ extern struct superswitch {
 	/* Seek 'fd' to start of write-intent-bitmap.  Must be an
 	 * md-native format bitmap
 	 */
-	void (*locate_bitmap)(struct supertype *st, int fd);
+	int (*locate_bitmap)(struct supertype *st, int fd);
 	/* if add_internal_bitmap succeeded for existing array, this
 	 * writes it out.
 	 */
-	int (*write_bitmap)(struct supertype *st, int fd);
+	int (*write_bitmap)(struct supertype *st, int fd, enum bitmap_update update);
 	/* Free the superblock and any other allocated data */
 	void (*free_super)(struct supertype *st);
 
@@ -1018,6 +1092,8 @@ struct supertype {
 			 */
 	int devcnt;
 	int retry_soon;
+	int nodes;
+	char *cluster_name;
 
 	struct mdinfo *devs;
 
@@ -1264,6 +1340,7 @@ extern int parse_uuid(char *str, int uuid[4]);
 extern int parse_layout_10(char *layout);
 extern int parse_layout_faulty(char *layout);
 extern long parse_num(char *num);
+extern int parse_cluster_confirm_arg(char *inp, char **devname, int *slot);
 extern int check_ext2(int fd, char *name);
 extern int check_reiser(int fd, char *name);
 extern int check_raid(int fd, char *name);
@@ -1294,6 +1371,7 @@ extern char *conf_get_mailaddr(void);
 extern char *conf_get_mailfrom(void);
 extern char *conf_get_program(void);
 extern char *conf_get_homehost(int *require_homehostp);
+extern char *conf_get_homecluster(void);
 extern char *conf_line(FILE *file);
 extern char *conf_word(FILE *file, int allow_key);
 extern void print_quoted(char *str);
@@ -1403,6 +1481,45 @@ extern char *fd2devnm(int fd);
 
 extern int in_initrd(void);
 
+struct cmap_hooks {
+	void *cmap_handle;      /* corosync lib related */
+
+	int (*initialize)(cmap_handle_t *handle);
+	int (*get_string)(cmap_handle_t handle,
+			  const char *string,
+			  char **name);
+	int (*finalize)(cmap_handle_t handle);
+};
+
+extern void set_cmap_hooks(void);
+extern void set_hooks(void);
+
+struct dlm_hooks {
+	void *dlm_handle;	/* dlm lib related */
+
+	dlm_lshandle_t (*create_lockspace)(const char *name,
+					   unsigned int mode);
+	int (*release_lockspace)(const char *name, dlm_lshandle_t ls,
+				 int force);
+	int (*ls_lock)(dlm_lshandle_t lockspace, uint32_t mode,
+		       struct dlm_lksb *lksb, uint32_t flags,
+		       const void *name, unsigned int namelen,
+		       uint32_t parent, void (*astaddr) (void *astarg),
+		       void *astarg, void (*bastaddr) (void *astarg),
+		       void *range);
+	int (*ls_unlock)(dlm_lshandle_t lockspace, uint32_t lkid,
+			 uint32_t flags, struct dlm_lksb *lksb,
+			 void *astarg);
+	int (*ls_get_fd)(dlm_lshandle_t ls);
+	int (*dispatch)(int fd);
+};
+
+extern int get_cluster_name(char **name);
+extern int dlm_funs_ready(void);
+extern int cluster_get_dlmlock(int *lockid);
+extern int cluster_release_dlmlock(int lockid);
+extern void set_dlm_hooks(void);
+
 #define _ROUND_UP(val, base)	(((val) + (base) - 1) & ~(base - 1))
 #define ROUND_UP(val, base)	_ROUND_UP(val, (typeof(val))(base))
 #define ROUND_UP_PTR(ptr, base)	((typeof(ptr)) \
diff --git a/mdadm.spec b/mdadm.spec
index 293cb190..685a5642 100644
--- a/mdadm.spec
+++ b/mdadm.spec
@@ -1,6 +1,6 @@
 Summary:     mdadm is used for controlling Linux md devices (aka RAID arrays)
 Name:        mdadm
-Version:     3.3.4
+Version:     3.4
 Release:     1
 Source:      http://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tar.gz
 URL:         http://neil.brown.name/blog/mdadm
diff --git a/mdassemble.8 b/mdassemble.8
index 601c1d10..d0c83c39 100644
--- a/mdassemble.8
+++ b/mdassemble.8
@@ -1,5 +1,5 @@
 .\" -*- nroff -*-
-.TH MDASSEMBLE 8 "" v3.3.4
+.TH MDASSEMBLE 8 "" v3.4
 .SH NAME
 mdassemble \- assemble MD devices
 .I aka
@@ -40,7 +40,7 @@ There are no options to
 
 .SH FILES
 
-.SS /etc/mdadm/mdadm.conf
+.SS /etc/mdadm.conf
 
 The config file lists which devices may be scanned to see if
 they contain MD super block, and gives identifying information
diff --git a/mdmon.8 b/mdmon.8
index beb82e03..cc6add8f 100644
--- a/mdmon.8
+++ b/mdmon.8
@@ -1,5 +1,5 @@
 .\" See file COPYING in distribution for details.
-.TH MDMON 8 "" v3.3.4
+.TH MDMON 8 "" v3.4
 .SH NAME
 mdmon \- monitor MD external metadata arrays
 
diff --git a/mdmon.c b/mdmon.c
index ee12b7c7..e4b73d96 100644
--- a/mdmon.c
+++ b/mdmon.c
@@ -235,7 +235,7 @@ static int make_control_sock(char *devname)
 	addr.sun_family = PF_LOCAL;
 	strcpy(addr.sun_path, path);
 	umask(077); /* ensure no world write access */
-	if (bind(sfd, &addr, sizeof(addr)) < 0) {
+	if (bind(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
 		close(sfd);
 		return -1;
 	}
diff --git a/msg.c b/msg.c
index 754630b9..45cd4504 100644
--- a/msg.c
+++ b/msg.c
@@ -170,7 +170,7 @@ int connect_monitor(char *devname)
 
 	addr.sun_family = PF_LOCAL;
 	strcpy(addr.sun_path, path);
-	if (connect(sfd, &addr, sizeof(addr)) < 0) {
+	if (connect(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
 		close(sfd);
 		return -1;
 	}
diff --git a/platform-intel.c b/platform-intel.c
index edb86795..88818f34 100644
--- a/platform-intel.c
+++ b/platform-intel.c
@@ -33,8 +33,6 @@
 static int devpath_to_ll(const char *dev_path, const char *entry,
 			 unsigned long long *val);
 
-static __u16 devpath_to_vendor(const char *dev_path);
-
 static void free_sys_dev(struct sys_dev **list)
 {
 	while (*list) {
@@ -57,6 +55,7 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver)
 	struct dirent *de;
 	struct sys_dev *head = NULL;
 	struct sys_dev *list = NULL;
+	struct sys_dev *vmd = NULL;
 	enum sys_dev_type type;
 	unsigned long long dev_id;
 	unsigned long long class;
@@ -65,17 +64,25 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver)
 		type = SYS_DEV_SAS;
 	else if (strcmp(driver, "ahci") == 0)
 		type = SYS_DEV_SATA;
-	else if (strcmp(driver, "nvme") == 0)
+	else if (strcmp(driver, "nvme") == 0) {
+		/* if looking for nvme devs, first look for vmd */
+		vmd = find_driver_devices("pci", "vmd");
 		type = SYS_DEV_NVME;
+	} else if (strcmp(driver, "vmd") == 0)
+		type = SYS_DEV_VMD;
 	else
 		type = SYS_DEV_UNKNOWN;
 
 	sprintf(path, "/sys/bus/%s/drivers/%s", bus, driver);
 	driver_dir = opendir(path);
-	if (!driver_dir)
+	if (!driver_dir) {
+		if (vmd)
+			free_sys_dev(&vmd);
 		return NULL;
+	}
 	for (de = readdir(driver_dir); de; de = readdir(driver_dir)) {
 		int n;
+		int skip = 0;
 
 		/* is 'de' a device? check that the 'subsystem' link exists and
 		 * that its target matches 'bus'
@@ -95,8 +102,19 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver)
 		sprintf(path, "/sys/bus/%s/drivers/%s/%s",
 			bus, driver, de->d_name);
 
-		/* if it's not Intel device skip it. */
-		if (devpath_to_vendor(path) != 0x8086)
+		/* if searching for nvme - skip vmd connected one */
+		if (type == SYS_DEV_NVME) {
+			struct sys_dev *dev;
+			char *rp = realpath(path, NULL); /* NULL on failure */
+			for (dev = vmd; rp && dev; dev = dev->next) {
+				if ((strncmp(dev->path, rp, strlen(dev->path)) == 0))
+					skip = 1;
+			}
+			free(rp);
+		}
+
+		/* if it's not an Intel device, or it is marked as VMD connected - skip it. */
+		if (devpath_to_vendor(path) != 0x8086 || skip == 1)
 			continue;
 
 		if (devpath_to_ll(path, "device", &dev_id) != 0)
@@ -122,12 +140,28 @@ struct sys_dev *find_driver_devices(const char *bus, const char *driver)
 		list->dev_id = (__u16) dev_id;
 		list->class = (__u32) class;
 		list->type = type;
+		/* Each VMD device (domain) adds separate PCI bus, it is better to
+		 * store path as a path to that bus (easier further determination which
+		 * NVMe dev is connected to this particular VMD domain).
+		 */
+		if (type == SYS_DEV_VMD) {
+			sprintf(path, "/sys/bus/%s/drivers/%s/%s/domain/device",
+			bus, driver, de->d_name);
+		}
 		list->path = realpath(path, NULL);
 		list->next = NULL;
 		if ((list->pci_id = strrchr(list->path, '/')) != NULL)
 			list->pci_id++;
 	}
 	closedir(driver_dir);
+
+	if (vmd) {
+		if (list)
+			list->next = vmd;
+		else
+			head = vmd;
+	}
+
 	return head;
 }
 
@@ -160,7 +194,7 @@ static int devpath_to_ll(const char *dev_path, const char *entry, unsigned long
 	return n;
 }
 
-static __u16 devpath_to_vendor(const char *dev_path)
+__u16 devpath_to_vendor(const char *dev_path)
 {
 	char path[strlen(dev_path) + strlen("/vendor") + 1];
 	char vendor[7];
@@ -196,6 +230,7 @@ struct sys_dev *find_intel_devices(void)
 
 	isci = find_driver_devices("pci", "isci");
 	ahci = find_driver_devices("pci", "ahci");
+	/* Searching for NVMe will return list of NVMe and VMD controllers */
 	nvme = find_driver_devices("pci", "nvme");
 
 	if (!isci && !ahci) {
@@ -430,6 +465,7 @@ static const struct imsm_orom *find_imsm_hba_orom(struct sys_dev *hba)
 #define AHCI_PROP "RstSataV"
 #define AHCI_SSATA_PROP "RstsSatV"
 #define AHCI_CSATA_PROP "RstCSatV"
+#define VMD_PROP "RstUefiV"
 
 #define VENDOR_GUID \
 	EFI_GUID(0x193dfefa, 0xa445, 0x4302, 0x99, 0xd8, 0xef, 0x3a, 0xad, 0x1a, 0x04, 0xc6)
@@ -545,15 +581,21 @@ const struct imsm_orom *find_imsm_efi(struct sys_dev *hba)
 			if (!csata)
 				csata = add_orom(&orom);
 			add_orom_device_id(csata, hba->dev_id);
+			csata->type = hba->type;
 			return &csata->orom;
 		}
 	}
 
+	if (hba->type == SYS_DEV_VMD) {
+		err = read_efi_variable(&orom, sizeof(orom), VMD_PROP, VENDOR_GUID);
+	}
+
 	if (err)
 		return NULL;
 
 	ret = add_orom(&orom);
 	add_orom_device_id(ret, hba->dev_id);
+	ret->type = hba->type;
 
 	return &ret->orom;
 }
@@ -583,6 +625,7 @@ const struct imsm_orom *find_imsm_nvme(struct sys_dev *hba)
 		nvme_orom = add_orom(&nvme_orom_compat);
 	}
 	add_orom_device_id(nvme_orom, hba->dev_id);
+	nvme_orom->type = SYS_DEV_NVME;
 	return &nvme_orom->orom;
 }
 
@@ -667,3 +710,48 @@ int disk_attached_to_hba(int fd, const char *hba_path)
 
 	return rc;
 }
+
+/* Translate a VMD domain (as stored in hba->path) into the sysfs path of
+ * the VMD controller that exposes it.
+ *
+ * hba: device to look up; NULL or a non-VMD device yields NULL.
+ * buf: caller-supplied buffer handed to realpath(); it is used as scratch
+ *      space during the search and receives the result on success.
+ *
+ * Returns the resolved controller path (realpath()'s return value for buf),
+ * or NULL when no entry under /sys/bus/pci/drivers/vmd matches.
+ */
+char *vmd_domain_to_controller(struct sys_dev *hba, char *buf)
+{
+	struct dirent *ent;
+	DIR *dir;
+	char path[PATH_MAX];
+	char *ctrl = NULL;
+
+	if (!hba)
+		return NULL;
+
+	if (hba->type != SYS_DEV_VMD)
+		return NULL;
+
+	dir = opendir("/sys/bus/pci/drivers/vmd");
+	if (!dir)
+		return NULL;
+
+	for (ent = readdir(dir); ent; ent = readdir(dir)) {
+		sprintf(path, "/sys/bus/pci/drivers/vmd/%s/domain/device",
+			ent->d_name);
+
+		if (!realpath(path, buf))
+			continue;
+
+		if (strncmp(buf, hba->path, strlen(buf)) == 0) {
+			sprintf(path, "/sys/bus/pci/drivers/vmd/%s", ent->d_name);
+			ctrl = realpath(path, buf);
+			break;
+		}
+	}
+	/* release the directory stream on every exit from the scan */
+	closedir(dir);
+	return ctrl;
+}
diff --git a/platform-intel.h b/platform-intel.h
index 695d6c66..a8ae85f4 100644
--- a/platform-intel.h
+++ b/platform-intel.h
@@ -189,6 +189,7 @@ enum sys_dev_type {
 	SYS_DEV_SAS,
 	SYS_DEV_SATA,
 	SYS_DEV_NVME,
+	SYS_DEV_VMD,
 	SYS_DEV_MAX
 };
 
@@ -213,6 +214,7 @@ struct devid_list {
 struct orom_entry {
 	struct imsm_orom orom;
 	struct devid_list *devid_list;
+	enum sys_dev_type type;
 	struct orom_entry *next;
 };
 
@@ -229,6 +231,7 @@ static inline char *guid_str(char *buf, struct efi_guid guid)
 }
 
 char *diskfd_to_devpath(int fd);
+__u16 devpath_to_vendor(const char *dev_path);
 struct sys_dev *find_driver_devices(const char *bus, const char *driver);
 struct sys_dev *find_intel_devices(void);
 const struct imsm_orom *find_imsm_capability(struct sys_dev *hba);
@@ -241,3 +244,4 @@ const char *get_sys_dev_type(enum sys_dev_type);
 const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id);
 const struct imsm_orom *get_orom_by_device_id(__u16 device_id);
 struct sys_dev *device_by_id(__u16 device_id);
+char *vmd_domain_to_controller(struct sys_dev *hba, char *buf);
diff --git a/raid6check.c b/raid6check.c
index cb8522e5..ad7ffe7e 100644
--- a/raid6check.c
+++ b/raid6check.c
@@ -349,7 +349,8 @@ int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets,
 	if (!tables_ready)
 		make_tables();
 
-	posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size);
+	if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size) != 0)
+		exit(4);
 	block_index_for_slot += 2;
 	blocks += 2;
 	blocks_page += 2;
diff --git a/restripe.c b/restripe.c
index 4d921904..56dca73e 100644
--- a/restripe.c
+++ b/restripe.c
@@ -434,7 +434,7 @@ void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs,
 /* Try to find out if a specific disk has a problem */
 int raid6_check_disks(int data_disks, int start, int chunk_size,
 		      int level, int layout, int diskP, int diskQ,
-		      char *p, char *q, char **stripes)
+		      uint8_t *p, uint8_t *q, char **stripes)
 {
 	int i;
 	int data_id, diskD;
@@ -827,8 +827,8 @@ int test_stripes(int *source, unsigned long long *offsets,
 	char *stripe_buf = xmalloc(raid_disks * chunk_size);
 	char **stripes = xmalloc(raid_disks * sizeof(char*));
 	char **blocks = xmalloc(raid_disks * sizeof(char*));
-	char *p = xmalloc(chunk_size);
-	char *q = xmalloc(chunk_size);
+	uint8_t *p = xmalloc(chunk_size);
+	uint8_t *q = xmalloc(chunk_size);
 
 	int i;
 	int diskP, diskQ;
diff --git a/sha1.h b/sha1.h
index 0f986585..999fc6a3 100644
--- a/sha1.h
+++ b/sha1.h
@@ -22,7 +22,7 @@
 
 #include <stdio.h>
 
-#if 1 /* defined HAVE_LIMITS_H || _LIBC */
+#if defined HAVE_LIMITS_H || _LIBC
 # include <limits.h>
 #endif
 
@@ -33,9 +33,9 @@
    the resulting executable.  Locally running cross-compiled executables
    is usually not possible.  */
 
-#if 1 /* def _LIBC */
-# include <stdint.h>
-typedef uint32_t sha1_uint32;
+#ifdef _LIBC
+# include <sys/types.h>
+typedef u_int32_t sha1_uint32;
 typedef uintptr_t sha1_uintptr;
 #else
 #  define INT_MAX_32_BITS 2147483647
diff --git a/super-intel.c b/super-intel.c
index 95a72b6a..90b7b6de 100644
--- a/super-intel.c
+++ b/super-intel.c
@@ -510,7 +510,8 @@ static const char *_sys_dev_type[] = {
 	[SYS_DEV_UNKNOWN] = "Unknown",
 	[SYS_DEV_SAS] = "SAS",
 	[SYS_DEV_SATA] = "SATA",
-	[SYS_DEV_NVME] = "NVMe"
+	[SYS_DEV_NVME] = "NVMe",
+	[SYS_DEV_VMD] = "VMD"
 };
 
 const char *get_sys_dev_type(enum sys_dev_type type)
@@ -565,6 +566,10 @@ static int attach_hba_to_super(struct intel_super *super, struct sys_dev *device
 	if (device->type != hba->type)
 		return 2;
 
+	/* Always forbid spanning between VMD domains (seen as different controllers by mdadm) */
+	if (device->type == SYS_DEV_VMD && !path_attached_to_hba(device->path, hba->path))
+		return 2;
+
 	/* Multiple same type HBAs can be used if they share the same OROM */
 	const struct imsm_orom *device_orom = get_orom_by_device_id(device->dev_id);
 
@@ -1761,6 +1766,66 @@ static int ahci_enumerate_ports(const char *hba_path, int port_count, int host_b
 	return err;
 }
 
+/* Print every Intel NVMe device that sits under the given VMD domain.
+ *
+ * Returns 1 without printing if hba is not a VMD device, 0 otherwise
+ * (including when the nvme driver directory cannot be opened).
+ */
+static int print_vmd_attached_devs(struct sys_dev *hba)
+{
+	struct dirent *ent;
+	DIR *dir;
+	char path[292];
+	char link[256];
+	char *c, *rp;
+
+	if (hba->type != SYS_DEV_VMD)
+		return 1;
+
+	/* scroll through /sys/bus/pci/drivers/nvme looking for devices
+	 * attached to this hba
+	 */
+	dir = opendir("/sys/bus/pci/drivers/nvme");
+	if (!dir)
+		return 0;
+
+	for (ent = readdir(dir); ent; ent = readdir(dir)) {
+		int n;
+
+		/* is 'ent' a device? check that the 'subsystem' link exists and
+		 * that its target names the pci bus
+		 */
+		sprintf(path, "/sys/bus/pci/drivers/nvme/%s/subsystem",
+			ent->d_name);
+		n = readlink(path, link, sizeof(link));
+		if (n < 0 || n >= (int)sizeof(link))
+			continue;
+		link[n] = '\0';
+		c = strrchr(link, '/');
+		if (!c)
+			continue;
+		if (strncmp("pci", c+1, strlen("pci")) != 0)
+			continue;
+
+		sprintf(path, "/sys/bus/pci/drivers/nvme/%s", ent->d_name);
+		/* if not a intel NVMe - skip it*/
+		if (devpath_to_vendor(path) != 0x8086)
+			continue;
+
+		rp = realpath(path, NULL);
+		if (!rp)
+			continue;
+
+		if (path_attached_to_hba(rp, hba->path)) {
+			printf(" NVMe under VMD : %s\n", rp);
+		}
+		free(rp);
+	}
+
+	closedir(dir);
+	return 0;
+}
+
 static void print_found_intel_controllers(struct sys_dev *elem)
 {
 	for (; elem; elem = elem->next) {
@@ -1771,7 +1827,12 @@ static void print_found_intel_controllers(struct sys_dev *elem)
 			fprintf(stderr, "SAS ");
 		else if (elem->type == SYS_DEV_NVME)
 			fprintf(stderr, "NVMe ");
-		fprintf(stderr, "RAID controller");
+
+		if (elem->type == SYS_DEV_VMD)
+			fprintf(stderr, "VMD domain");
+		else
+			fprintf(stderr, "RAID controller");
+
 		if (elem->pci_id)
 			fprintf(stderr, " at %s", elem->pci_id);
 		fprintf(stderr, ".\n");
@@ -1935,8 +1996,10 @@ static int detail_platform_imsm(int verbose, int enumerate_only, char *controlle
 		if (controller_path && (compare_paths(hba->path, controller_path) != 0))
 			continue;
 		if (!find_imsm_capability(hba)) {
+			char buf[PATH_MAX];
 			pr_err("imsm capabilities not found for controller: %s (type %s)\n",
-				hba->path, get_sys_dev_type(hba->type));
+				  hba->type == SYS_DEV_VMD ? vmd_domain_to_controller(hba, buf) : hba->path,
+				  get_sys_dev_type(hba->type));
 			continue;
 		}
 		result = 0;
@@ -1951,13 +2014,27 @@ static int detail_platform_imsm(int verbose, int enumerate_only, char *controlle
 	const struct orom_entry *entry;
 
 	for (entry = orom_entries; entry; entry = entry->next) {
-		print_imsm_capability(&entry->orom);
+		if (entry->type == SYS_DEV_VMD) {
+			for (hba = list; hba; hba = hba->next) {
+				if (hba->type == SYS_DEV_VMD) {
+					char buf[PATH_MAX];
+					print_imsm_capability(&entry->orom);
+					printf(" I/O Controller : %s (%s)\n",
+						vmd_domain_to_controller(hba, buf), get_sys_dev_type(hba->type));
+					print_vmd_attached_devs(hba);
+					printf("\n");
+				}
+			}
+			continue;
+		}
 
-		if (imsm_orom_is_nvme(&entry->orom)) {
+		print_imsm_capability(&entry->orom);
+		if (entry->type == SYS_DEV_NVME) {
 			for (hba = list; hba; hba = hba->next) {
 				if (hba->type == SYS_DEV_NVME)
 					printf("    NVMe Device : %s\n", hba->path);
 			}
+			printf("\n");
 			continue;
 		}
 
@@ -2000,16 +2077,25 @@ static int export_detail_platform_imsm(int verbose, char *controller_path)
 	for (hba = list; hba; hba = hba->next) {
 		if (controller_path && (compare_paths(hba->path,controller_path) != 0))
 			continue;
-		if (!find_imsm_capability(hba) && verbose > 0)
-			pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n", hba->path);
+		if (!find_imsm_capability(hba) && verbose > 0) {
+			char buf[PATH_MAX];
+			pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n",
+			hba->type == SYS_DEV_VMD ? vmd_domain_to_controller(hba, buf) : hba->path);
+		}
 		else
 			result = 0;
 	}
 
 	const struct orom_entry *entry;
 
-	for (entry = orom_entries; entry; entry = entry->next)
+	for (entry = orom_entries; entry; entry = entry->next) {
+		if (entry->type == SYS_DEV_VMD) {
+			for (hba = list; hba; hba = hba->next)
+				print_imsm_capability_export(&entry->orom);
+			continue;
+		}
 		print_imsm_capability_export(&entry->orom);
+	}
 
 	return result;
 }
@@ -3862,12 +3948,14 @@ static int find_intel_hba_capability(int fd, struct intel_super *super, char *de
 		if (devname) {
 			struct intel_hba *hba = super->hba;
 
-			pr_err("%s is attached to Intel(R) %s RAID controller (%s),\n"
-				"    but the container is assigned to Intel(R) %s RAID controller (",
+			pr_err("%s is attached to Intel(R) %s %s (%s),\n"
+				"    but the container is assigned to Intel(R) %s %s (",
 				devname,
 				get_sys_dev_type(hba_name->type),
+				hba_name->type == SYS_DEV_VMD ? "domain" : "RAID controller",
 				hba_name->pci_id ? : "Err!",
-				get_sys_dev_type(super->hba->type));
+				get_sys_dev_type(super->hba->type),
+				hba->type == SYS_DEV_VMD ? "domain" : "RAID controller");
 
 			while (hba) {
 				fprintf(stderr, "%s", hba->pci_id ? : "Err!");
@@ -3876,7 +3964,8 @@ static int find_intel_hba_capability(int fd, struct intel_super *super, char *de
 				hba = hba->next;
 			}
 			fprintf(stderr, ").\n"
-				"    Mixing devices attached to different controllers is not allowed.\n");
+				"    Mixing devices attached to different %s is not allowed.\n",
+				hba_name->type == SYS_DEV_VMD ? "VMD domains" : "controllers");
 		}
 		return 2;
 	}
@@ -5878,7 +5967,6 @@ count_volumes(struct intel_hba *hba, int dpa, int verbose)
 
 	devid_list = entry->devid_list;
 	for (dv = devid_list; dv; dv = dv->next) {
-
 		struct md_list *devlist = NULL;
 		struct sys_dev *device = device_by_id(dv->devid);
 		char *hba_path;
@@ -5889,6 +5977,14 @@ count_volumes(struct intel_hba *hba, int dpa, int verbose)
 		else
 			return 0;
 
+		/* VMD has one orom entry for all domains, but spanning is not allowed.
+		 * VMD arrays should be counted per domain (controller), so skip
+		 * domains that are not the given one.
+		 */
+		if ((hba->type == SYS_DEV_VMD) &&
+		   (strncmp(device->path, hba->path, strlen(device->path)) != 0))
+			continue;
+
 		devlist = get_devices(hba_path);
 		/* if no intel devices return zero volumes */
 		if (devlist == NULL)
@@ -9150,7 +9246,7 @@ int validate_container_imsm(struct mdinfo *info)
 			return 1;
 		}
 
-		if (orom != orom2) {
+		if ((orom != orom2) || ((hba->type == SYS_DEV_VMD) && (hba != hba2))) {
 			pr_err("WARNING - IMSM container assembled with disks under different HBAs!\n"
 				"       This operation is not supported and can lead to data loss.\n");
 			return 1;
@@ -10277,7 +10373,7 @@ int wait_for_reshape_imsm(struct mdinfo *sra, int ndata)
 	if (sysfs_fd_get_ll(fd, &completed) < 0) {
 		dprintf("cannot read reshape_position (no reshape in progres)\n");
 		close(fd);
-		return 0;
+		return 1;
 	}
 
 	if (completed > position_to_set) {
@@ -10297,11 +10393,14 @@ int wait_for_reshape_imsm(struct mdinfo *sra, int ndata)
 
 	do {
 		char action[20];
-		sysfs_wait(fd, NULL);
+		int timeout = 3000;
+		sysfs_wait(fd, &timeout);
 		if (sysfs_get_str(sra, NULL, "sync_action",
 				  action, 20) > 0 &&
-				strncmp(action, "reshape", 7) != 0)
-			break;
+				strncmp(action, "reshape", 7) != 0) {
+			close(fd);
+			return -1;
+		}
 		if (sysfs_fd_get_ll(fd, &completed) < 0) {
 			dprintf("cannot read reshape_position (in loop)\n");
 			close(fd);
@@ -10563,7 +10662,7 @@ static int imsm_manage_reshape(
 		sra->reshape_progress = next_step;
 
 		/* wait until reshape finish */
-		if (wait_for_reshape_imsm(sra, ndata) < 0) {
+		if (wait_for_reshape_imsm(sra, ndata)) {
 			dprintf("wait_for_reshape_imsm returned error!\n");
 			goto abort;
 		}
@@ -10601,7 +10700,6 @@ static int imsm_manage_reshape(
 	ret_val = 1;
 abort:
 	free(buf);
-	abort_reshape(sra);
 
 	return ret_val;
 }
diff --git a/super0.c b/super0.c
index deb59994..59a6a034 100644
--- a/super0.c
+++ b/super0.c
@@ -405,7 +405,8 @@ static void getinfo_super0(struct supertype *st, struct mdinfo *info, char *map)
 	info->array.utime = sb->utime;
 	info->array.chunk_size = sb->chunk_size;
 	info->array.state = sb->state;
-	info->component_size = sb->size*2;
+	info->component_size = sb->size;
+	info->component_size *= 2;
 
 	if (sb->state & (1<<MD_SB_BITMAP_PRESENT))
 		info->bitmap_offset = 8;
@@ -900,7 +901,7 @@ static int write_init_super0(struct supertype *st)
 		rv = store_super0(st, di->fd);
 
 		if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT)))
-			rv = st->ss->write_bitmap(st, di->fd);
+			rv = st->ss->write_bitmap(st, di->fd, NoUpdate);
 
 		if (rv)
 			pr_err("failed to write superblock to %s\n",
@@ -1155,16 +1156,16 @@ static int add_internal_bitmap0(struct supertype *st, int *chunkp,
 	return 1;
 }
 
-static void locate_bitmap0(struct supertype *st, int fd)
+static int locate_bitmap0(struct supertype *st, int fd)
 {
 	unsigned long long dsize;
 	unsigned long long offset;
 
 	if (!get_dev_size(fd, NULL, &dsize))
-		return;
+		return -1;
 
 	if (dsize < MD_RESERVED_SECTORS*512)
-		return;
+		return -1;
 
 	offset = MD_NEW_SIZE_SECTORS(dsize>>9);
 
@@ -1173,9 +1174,10 @@ static void locate_bitmap0(struct supertype *st, int fd)
 	offset += MD_SB_BYTES;
 
 	lseek64(fd, offset, 0);
+	return 0;
 }
 
-static int write_bitmap0(struct supertype *st, int fd)
+static int write_bitmap0(struct supertype *st, int fd, enum bitmap_update update)
 {
 	unsigned long long dsize;
 	unsigned long long offset;
diff --git a/super1.c b/super1.c
index f0508fe7..8bcaa2fe 100644
--- a/super1.c
+++ b/super1.c
@@ -1,7 +1,7 @@
 /*
  * mdadm - manage Linux "md" devices aka RAID arrays.
  *
- * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com>
  *
  *
  *    This program is free software; you can redistribute it and/or modify
@@ -68,7 +68,10 @@ struct mdp_superblock_1 {
 	__u64	data_offset;	/* sector start of data, often 0 */
 	__u64	data_size;	/* sectors in this device that can be used for data */
 	__u64	super_offset;	/* sector start of this superblock */
-	__u64	recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
+	union {
+		__u64	recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
+		__u64	journal_tail;/* journal tail of journal device (from data_offset) */
+	};
 	__u32	dev_number;	/* permanent identifier of this  device - not role in raid */
 	__u32	cnt_corrected_read; /* number of read errors that were corrected by re-writing */
 	__u8	device_uuid[16]; /* user-space setable, ignored by kernel */
@@ -125,6 +128,8 @@ struct misc_dev_info {
 					    * backwards anyway.
 					    */
 #define	MD_FEATURE_NEW_OFFSET		64 /* new_offset must be honoured */
+#define	MD_FEATURE_BITMAP_VERSIONED	256 /* bitmap version number checked properly */
+#define	MD_FEATURE_JOURNAL		512 /* support write journal */
 #define	MD_FEATURE_ALL			(MD_FEATURE_BITMAP_OFFSET	\
 					|MD_FEATURE_RECOVERY_OFFSET	\
 					|MD_FEATURE_RESHAPE_ACTIVE	\
@@ -132,8 +137,39 @@ struct misc_dev_info {
 					|MD_FEATURE_REPLACEMENT		\
 					|MD_FEATURE_RESHAPE_BACKWARDS	\
 					|MD_FEATURE_NEW_OFFSET		\
+					|MD_FEATURE_BITMAP_VERSIONED	\
+					|MD_FEATURE_JOURNAL		\
 					)
 
+#ifndef MDASSEMBLE
+/* Look up the role recorded for this device in the dev_roles table.
+ * dev_number indexes the table; a dev_number at or beyond max_dev has
+ * no entry, and is reported as MD_DISK_ROLE_SPARE.
+ */
+static int role_from_sb(struct mdp_superblock_1 *sb)
+{
+	unsigned int slot = __le32_to_cpu(sb->dev_number);
+
+	if (slot >= __le32_to_cpu(sb->max_dev))
+		return MD_DISK_ROLE_SPARE;
+	return __le16_to_cpu(sb->dev_roles[slot]);
+}
+#endif
+
+/* Bytes needed for one bitmap: the bitmap superblock plus one bit per
+ * chunk, rounded up to 'boundary'.  For cluster-md every node has its own. */
+static unsigned int calc_bitmap_size(bitmap_super_t *bms, unsigned int boundary)
+{
+	unsigned long long chunk_sectors = __le32_to_cpu(bms->chunksize) >> 9;
+	unsigned long long nbits, nbytes;
+
+	nbits = __le64_to_cpu(bms->sync_size) / chunk_sectors;
+	nbytes = ((nbits + 7) >> 3) + sizeof(bitmap_super_t);
+	nbytes = ROUND_UP(nbytes, boundary);
+
+	return nbytes;
+}
+
 static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
 {
 	unsigned int disk_csum, csum;
@@ -256,6 +292,7 @@ static int awrite(struct align_fd *afd, void *buf, int len)
 static void examine_super1(struct supertype *st, char *homehost)
 {
 	struct mdp_superblock_1 *sb = st->sb;
+	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE);
 	time_t atime;
 	unsigned int d;
 	int role;
@@ -289,6 +326,8 @@ static void examine_super1(struct supertype *st, char *homehost)
 	    strncmp(sb->set_name, homehost, l) == 0)
 		printf("  (local to host %s)", homehost);
 	printf("\n");
+	if (bms->nodes > 0 && (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET))
+		printf("   Cluster Name : %-64s\n", bms->cluster_name);
 	atime = __le64_to_cpu(sb->ctime) & 0xFFFFFFFFFFULL;
 	printf("  Creation Time : %.24s\n", ctime(&atime));
 	c=map_num(pers, __le32_to_cpu(sb->level));
@@ -446,25 +485,23 @@ static void examine_super1(struct supertype *st, char *homehost)
 	/* This turns out to just be confusing */
 	printf("    Array Slot : %d (", __le32_to_cpu(sb->dev_number));
 	for (i= __le32_to_cpu(sb->max_dev); i> 0 ; i--)
-		if (__le16_to_cpu(sb->dev_roles[i-1]) != 0xffff)
+		if (__le16_to_cpu(sb->dev_roles[i-1]) != MD_DISK_ROLE_SPARE)
 			break;
 	for (d=0; d < i; d++) {
 		int role = __le16_to_cpu(sb->dev_roles[d]);
 		if (d) printf(", ");
-		if (role == 0xffff) printf("empty");
-		else if(role == 0xfffe) printf("failed");
+		if (role == MD_DISK_ROLE_SPARE) printf("empty");
+		else if(role == MD_DISK_ROLE_FAULTY) printf("failed");
 		else printf("%d", role);
 	}
 	printf(")\n");
 #endif
 	printf("   Device Role : ");
-	d = __le32_to_cpu(sb->dev_number);
-	if (d < __le32_to_cpu(sb->max_dev))
-		role = __le16_to_cpu(sb->dev_roles[d]);
-	else
-		role = 0xFFFF;
-	if (role >= 0xFFFE)
+	role = role_from_sb(sb);
+	if (role >= MD_DISK_ROLE_FAULTY)
 		printf("spare\n");
+	else if (role == MD_DISK_ROLE_JOURNAL)
+		printf("Journal\n");
 	else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_REPLACEMENT))
 		printf("Replacement device %d\n", role);
 	else
@@ -493,7 +530,7 @@ static void examine_super1(struct supertype *st, char *homehost)
 	faulty = 0;
 	for (i=0; i< __le32_to_cpu(sb->max_dev); i++) {
 		int role = __le16_to_cpu(sb->dev_roles[i]);
-		if (role == 0xFFFE)
+		if (role == MD_DISK_ROLE_FAULTY)
 			faulty++;
 	}
 	if (faulty) printf(" %d failed", faulty);
@@ -681,12 +718,8 @@ static int copy_metadata1(struct supertype *st, int from, int to)
 				/* have the header, can calculate
 				 * correct bitmap bytes */
 				bitmap_super_t *bms;
-				int bits;
 				bms = (void*)buf;
-				bits = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9);
-				bytes = (bits+7) >> 3;
-				bytes += sizeof(bitmap_super_t);
-				bytes = ROUND_UP(bytes, 512);
+				bytes = calc_bitmap_size(bms, 512);
 				if (n > bytes)
 					n =  bytes;
 			}
@@ -740,6 +773,7 @@ err:
 static void detail_super1(struct supertype *st, char *homehost)
 {
 	struct mdp_superblock_1 *sb = st->sb;
+	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
 	int i;
 	int l = homehost ? strlen(homehost) : 0;
 
@@ -748,6 +782,8 @@ static void detail_super1(struct supertype *st, char *homehost)
 	    sb->set_name[l] == ':' &&
 	    strncmp(sb->set_name, homehost, l) == 0)
 		printf("  (local to host %s)", homehost);
+	if (bms->nodes > 0 && (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET))
+	    printf("\n   Cluster Name : %-64s", bms->cluster_name);
 	printf("\n           UUID : ");
 	for (i=0; i<16; i++) {
 		if ((i&3)==0 && i != 0) printf(":");
@@ -891,6 +927,8 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 	info->array.state =
 		(__le64_to_cpu(sb->resync_offset) == MaxSector)
 		? 1 : 0;
+	if (__le32_to_cpu(bsb->nodes) > 1)
+		info->array.state |= (1 << MD_SB_CLUSTERED);
 
 	info->data_offset = __le64_to_cpu(sb->data_offset);
 	info->component_size = __le64_to_cpu(sb->size);
@@ -902,7 +940,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 	info->disk.number = __le32_to_cpu(sb->dev_number);
 	if (__le32_to_cpu(sb->dev_number) >= __le32_to_cpu(sb->max_dev) ||
 	    __le32_to_cpu(sb->dev_number) >= MAX_DEVS)
-		role = 0xfffe;
+		role = MD_DISK_ROLE_FAULTY;
 	else
 		role = __le16_to_cpu(sb->dev_roles[__le32_to_cpu(sb->dev_number)]);
 
@@ -943,7 +981,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 			size /= 512;
 			bmend += size;
 			if (bmend > earliest)
-				bmend = earliest;
+				earliest = bmend;
 		}
 		if (sb->bblog_offset && sb->bblog_size) {
 			unsigned long long bbend = super_offset;
@@ -969,12 +1007,17 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 
 	info->disk.raid_disk = -1;
 	switch(role) {
-	case 0xFFFF:
+	case MD_DISK_ROLE_SPARE:
 		info->disk.state = 0; /* spare: not active, not sync, not faulty */
 		break;
-	case 0xFFFE:
+	case MD_DISK_ROLE_FAULTY:
 		info->disk.state = 1; /* faulty */
 		break;
+	case MD_DISK_ROLE_JOURNAL:
+		info->disk.state = (1 << MD_DISK_JOURNAL);
+		info->disk.raid_disk = role;
+		info->space_after = (misc->device_size - info->data_offset) % 8; /* journal uses all 4kB blocks*/
+		break;
 	default:
 		info->disk.state = 6; /* active and in sync */
 		info->disk.raid_disk = role;
@@ -1022,7 +1065,7 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 			map[i] = 0;
 	for (i = 0; i < __le32_to_cpu(sb->max_dev); i++) {
 		role = __le16_to_cpu(sb->dev_roles[i]);
-		if (/*role == 0xFFFF || */role < (unsigned) info->array.raid_disks) {
+		if (/*role == MD_DISK_ROLE_SPARE || */role < (unsigned) info->array.raid_disks) {
 			working++;
 			if (map && role < map_disks)
 				map[role] = 1;
@@ -1030,6 +1073,9 @@ static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
 	}
 
 	info->array.working_disks = working;
+	if (sb->feature_map & __le32_to_cpu(MD_FEATURE_JOURNAL))
+		info->journal_device_required = 1;
+	info->journal_clean = 0;
 }
 
 static struct mdinfo *container_content1(struct supertype *st, char *subarray)
@@ -1054,7 +1100,18 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 	 * ignored.
 	 */
 	int rv = 0;
+	int lockid;
 	struct mdp_superblock_1 *sb = st->sb;
+	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
+
+	if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) {
+		rv = cluster_get_dlmlock(&lockid);
+		if (rv) {
+			pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv);
+			cluster_release_dlmlock(lockid);
+			return rv;
+		}
+	}
 
 	if (strcmp(update, "homehost") == 0 &&
 	    homehost) {
@@ -1094,8 +1151,10 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 		int want;
 		if (info->disk.state & (1<<MD_DISK_ACTIVE))
 			want = info->disk.raid_disk;
+		else if (info->disk.state & (1<<MD_DISK_JOURNAL))
+			want = MD_DISK_ROLE_JOURNAL;
 		else
-			want = 0xFFFF;
+			want = MD_DISK_ROLE_SPARE;
 		if (sb->dev_roles[d] != __cpu_to_le16(want)) {
 			sb->dev_roles[d] = __cpu_to_le16(want);
 			rv = 1;
@@ -1120,7 +1179,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 		unsigned int max = __le32_to_cpu(sb->max_dev);
 
 		for (i=0 ; i < max ; i++)
-			if (__le16_to_cpu(sb->dev_roles[i]) >= 0xfffe)
+			if (__le16_to_cpu(sb->dev_roles[i]) >= MD_DISK_ROLE_FAULTY)
 				break;
 		sb->dev_number = __cpu_to_le32(i);
 		info->disk.number = i;
@@ -1225,6 +1284,11 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 			sb->bblog_shift = 0;
 			sb->bblog_offset = 0;
 		}
+	} else if (strcmp(update, "force-no-bbl") == 0) {
+		sb->feature_map &= ~ __cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
+		sb->bblog_size = 0;
+		sb->bblog_shift = 0;
+		sb->bblog_offset = 0;
 	} else if (strcmp(update, "name") == 0) {
 		if (info->name[0] == 0)
 			sprintf(info->name, "%d", info->array.md_minor);
@@ -1245,7 +1309,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 			(st->sb + MAX_SB_SIZE + BM_SUPER_SIZE);
 		sb->data_size = __cpu_to_le64(
 			misc->device_size - __le64_to_cpu(sb->data_offset));
-	} else if (strcmp(update, "revert-reshape") == 0) {
+	} else if (strncmp(update, "revert-reshape", 14) == 0) {
 		rv = -2;
 		if (!(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE)))
 			pr_err("No active reshape to revert on %s\n",
@@ -1255,6 +1319,24 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 			unsigned long long reshape_sectors;
 			long reshape_chunk;
 			rv = 0;
+			/* If the reshape hasn't started, just stop it.
+			 * It is conceivable that a stripe was modified but
+			 * the metadata not updated.  In that case the backup
+			 * should have been used to get past the critical stage.
+			 * If that couldn't happen, the "-nobackup" version
+			 * will be used.
+			 */
+			if (strcmp(update, "revert-reshape-nobackup") == 0 &&
+			    sb->reshape_position == 0 &&
+			    (__le32_to_cpu(sb->delta_disks) > 0 ||
+			     (__le32_to_cpu(sb->delta_disks) == 0 &&
+			      !(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS))))) {
+				sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
+				sb->raid_disks = __cpu_to_le32(__le32_to_cpu(sb->raid_disks) -
+							       __le32_to_cpu(sb->delta_disks));
+				sb->delta_disks = 0;
+				goto done;
+			}
 			/* reshape_position is a little messy.
 			 * Its value must be a multiple of the larger
 			 * chunk size, and of the "after" data disks.
@@ -1301,6 +1383,7 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 				sb->new_offset = __cpu_to_le32(-offset_delta);
 				sb->data_size = __cpu_to_le64(__le64_to_cpu(sb->data_size) - offset_delta);
 			}
+		done:;
 		}
 	} else if (strcmp(update, "_reshape_progress")==0)
 		sb->reshape_position = __cpu_to_le64(info->reshape_progress);
@@ -1312,6 +1395,9 @@ static int update_super1(struct supertype *st, struct mdinfo *info,
 		rv = -1;
 
 	sb->sb_csum = calc_sb_1_csum(sb);
+	if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready())
+		cluster_release_dlmlock(lockid);
+
 	return rv;
 }
 
@@ -1415,13 +1501,26 @@ static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
 	struct mdp_superblock_1 *sb = st->sb;
 	__u16 *rp = sb->dev_roles + dk->number;
 	struct devinfo *di, **dip;
+	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
+	int rv, lockid;
+
+	if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) {
+		rv = cluster_get_dlmlock(&lockid);
+		if (rv) {
+			pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv);
+			cluster_release_dlmlock(lockid);
+			return rv;
+		}
+	}
 
 	if ((dk->state & 6) == 6) /* active, sync */
 		*rp = __cpu_to_le16(dk->raid_disk);
+	else if (dk->state & (1<<MD_DISK_JOURNAL))
+		*rp = __cpu_to_le16(MD_DISK_ROLE_JOURNAL); /* dev_roles[] is little-endian */
 	else if ((dk->state & ~2) == 0) /* active or idle -> spare */
-		*rp = 0xffff;
+		*rp = __cpu_to_le16(MD_DISK_ROLE_SPARE);
 	else
-		*rp = 0xfffe;
+		*rp = __cpu_to_le16(MD_DISK_ROLE_FAULTY);
 
 	if (dk->number >= (int)__le32_to_cpu(sb->max_dev) &&
 	    __le32_to_cpu(sb->max_dev) < MAX_DEVS)
@@ -1442,11 +1541,14 @@ static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
 	di->next = NULL;
 	*dip = di;
 
+	if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready())
+		cluster_release_dlmlock(lockid);
+
 	return 0;
 }
 #endif
 
-static void locate_bitmap1(struct supertype *st, int fd);
+static int locate_bitmap1(struct supertype *st, int fd);
 
 static int store_super1(struct supertype *st, int fd)
 {
@@ -1455,6 +1557,17 @@ static int store_super1(struct supertype *st, int fd)
 	struct align_fd afd;
 	int sbsize;
 	unsigned long long dsize;
+	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
+	int rv, lockid;
+
+	if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready()) {
+		rv = cluster_get_dlmlock(&lockid);
+		if (rv) {
+			pr_err("Cannot get dlmlock in %s return %d\n", __func__, rv);
+			cluster_release_dlmlock(lockid);
+			return rv;
+		}
+	}
 
 	if (!get_dev_size(fd, NULL, &dsize))
 		return 1;
@@ -1515,6 +1628,9 @@ static int store_super1(struct supertype *st, int fd)
 		}
 	}
 	fsync(fd);
+	if (bms->version == BITMAP_MAJOR_CLUSTERED && dlm_funs_ready())
+		cluster_release_dlmlock(lockid);
+
 	return 0;
 }
 
@@ -1537,7 +1653,55 @@ static unsigned long choose_bm_space(unsigned long devsize)
 
 static void free_super1(struct supertype *st);
 
+#define META_BLOCK_SIZE 4096
+__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len);
+
 #ifndef MDASSEMBLE
+/* Write an empty r5l meta block at the data_offset of the journal device
+ * open on 'fd'.  Returns 0 on success, 1 if allocation, seek or write fails.
+ */
+static int write_empty_r5l_meta_block(struct supertype *st, int fd)
+{
+	struct r5l_meta_block *mb;
+	struct mdp_superblock_1 *sb = st->sb;
+	struct align_fd afd;
+	__u32 crc;
+	int rv = 1;
+
+	init_afd(&afd, fd);
+	if (posix_memalign((void**)&mb, 4096, META_BLOCK_SIZE) != 0) {
+		pr_err("Could not allocate memory for the meta block.\n");
+		return 1;
+	}
+	memset(mb, 0, META_BLOCK_SIZE);
+
+	mb->magic = __cpu_to_le32(R5LOG_MAGIC);
+	mb->version = R5LOG_VERSION;
+	mb->meta_size = __cpu_to_le32(sizeof(struct r5l_meta_block));
+	mb->seq = __cpu_to_le64(random32());
+	mb->position = __cpu_to_le64(0);
+
+	/* checksum is seeded with the array uuid and covers the whole block */
+	crc = crc32c_le(0xffffffff, sb->set_uuid, sizeof(sb->set_uuid));
+	crc = crc32c_le(crc, (void *)mb, META_BLOCK_SIZE);
+	mb->checksum = crc;
+
+	/* data_offset is little-endian on disk and counts 512-byte sectors */
+	if (lseek64(fd, __le64_to_cpu(sb->data_offset) * 512, 0) < 0LL) {
+		pr_err("cannot seek to offset of the meta block\n");
+		goto out;
+	}
+	if (awrite(&afd, mb, META_BLOCK_SIZE) != META_BLOCK_SIZE) {
+		pr_err("failed to store write the meta block \n");
+		goto out;
+	}
+	fsync(fd);
+	rv = 0;
+out:
+	free(mb);
+	return rv;
+}
+
 static int write_init_super1(struct supertype *st)
 {
 	struct mdp_superblock_1 *sb = st->sb;
@@ -1551,6 +1715,11 @@ static int write_init_super1(struct supertype *st)
 	unsigned long long data_offset;
 
 	for (di = st->info; di; di = di->next) {
+		if (di->disk.state & (1 << MD_DISK_JOURNAL))
+			sb->feature_map |= __cpu_to_le32(MD_FEATURE_JOURNAL); /* field is little-endian on disk */
+	}
+
+	for (di = st->info; di; di = di->next) {
 		if (di->disk.state & (1 << MD_DISK_FAULTY))
 			continue;
 		if (di->fd < 0)
@@ -1573,7 +1742,8 @@ static int write_init_super1(struct supertype *st)
 		if (rfd >= 0)
 			close(rfd);
 
-		sb->events = 0;
+		if (!(di->disk.state & (1<<MD_DISK_JOURNAL)))
+			sb->events = 0;
 
 		refst = dup_super(st);
 		if (load_super1(refst, di->fd, NULL)==0) {
@@ -1681,15 +1851,23 @@ static int write_init_super1(struct supertype *st)
 			rv = -EINVAL;
 			goto out;
 		}
-		if (conf_get_create_info()->bblist == 0) {
+		/* Disable badblock log on clusters, or when explicitly requested */
+		if (st->nodes > 0 || conf_get_create_info()->bblist == 0) {
 			sb->bblog_size = 0;
 			sb->bblog_offset = 0;
 		}
 
 		sb->sb_csum = calc_sb_1_csum(sb);
 		rv = store_super1(st, di->fd);
+
+		if (rv == 0 && (di->disk.state & (1 << MD_DISK_JOURNAL))) {
+			rv = write_empty_r5l_meta_block(st, di->fd);
+			if (rv)
+				goto error_out;
+		}
+
 		if (rv == 0 && (__le32_to_cpu(sb->feature_map) & 1))
-			rv = st->ss->write_bitmap(st, di->fd);
+			rv = st->ss->write_bitmap(st, di->fd, NoUpdate);
 		close(di->fd);
 		di->fd = -1;
 		if (rv)
@@ -2054,7 +2232,7 @@ add_internal_bitmap1(struct supertype *st,
 				bbl_size = -bbl_offset;
 
 			if (!may_change || (room < 3*2 &&
-				  __le32_to_cpu(sb->max_dev) <= 384)) {
+					    __le32_to_cpu(sb->max_dev) <= 384)) {
 				room = 3*2;
 				offset = 1*2;
 				bbl_size = 0;
@@ -2144,32 +2322,45 @@ add_internal_bitmap1(struct supertype *st,
 	bms->daemon_sleep = __cpu_to_le32(delay);
 	bms->sync_size = __cpu_to_le64(size);
 	bms->write_behind = __cpu_to_le32(write_behind);
+	bms->nodes = __cpu_to_le32(st->nodes);
+	if (st->nodes)
+		sb->feature_map = __cpu_to_le32(__le32_to_cpu(sb->feature_map)
+						| MD_FEATURE_BITMAP_VERSIONED);
+	if (st->cluster_name)
+		strncpy((char *)bms->cluster_name,
+			st->cluster_name, strlen(st->cluster_name));
 
 	*chunkp = chunk;
 	return 1;
 }
 
-static void locate_bitmap1(struct supertype *st, int fd)
+static int locate_bitmap1(struct supertype *st, int fd)
 {
 	unsigned long long offset;
 	struct mdp_superblock_1 *sb;
 	int mustfree = 0;
+	int ret;	/* 0 when feature_map has BITMAP_OFFSET set, else -1 */
 
 	if (!st->sb) {
 		if (st->ss->load_super(st, fd, NULL))
-			return; /* no error I hope... */
+			return -1; /* superblock could not be loaded */
 		mustfree = 1;
 	}
 	sb = st->sb;
 
+	if ((__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET))
+		ret = 0;
+	else
+		ret = -1; /* fd is still seeked to the computed offset below */
 	offset = __le64_to_cpu(sb->super_offset);
 	offset += (int32_t) __le32_to_cpu(sb->bitmap_offset);
 	if (mustfree)
 		free(sb);
 	lseek64(fd, offset<<9, 0);
+	return ret;
 }
 
-static int write_bitmap1(struct supertype *st, int fd)
+static int write_bitmap1(struct supertype *st, int fd, enum bitmap_update update)
 {
 	struct mdp_superblock_1 *sb = st->sb;
 	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE);
@@ -2177,6 +2368,43 @@ static int write_bitmap1(struct supertype *st, int fd)
 	void *buf;
 	int towrite, n;
 	struct align_fd afd;
+	unsigned int i = 0;
+	unsigned long long total_bm_space, bm_space_per_node;
+
+	switch (update) {
+	case NameUpdate:
+		/* update cluster name */
+		if (st->cluster_name) {
+			memset((char *)bms->cluster_name, 0, sizeof(bms->cluster_name));
+			strncpy((char *)bms->cluster_name, st->cluster_name, 64);
+		}
+		break;
+	case NodeNumUpdate:
+		/* cluster md only supports superblock 1.2 now */
+		if (st->minor_version != 2) {
+			pr_err("Warning: cluster md only works with superblock 1.2\n");
+			return -EINVAL;
+		}
+
+		/* Each node has an independent bitmap, it is necessary to calculate the
+		 * space is enough or not, first get how many bytes for the total bitmap */
+		bm_space_per_node = calc_bitmap_size(bms, 4096);
+
+		total_bm_space = 512 * (__le64_to_cpu(sb->data_offset) - __le64_to_cpu(sb->super_offset));
+		total_bm_space = total_bm_space - 4096; /* leave another 4k for superblock */
+
+		if (bm_space_per_node * st->nodes > total_bm_space) {
+			pr_err("Warning: The max num of nodes can't exceed %llu\n",
+				total_bm_space / bm_space_per_node);
+			return -ENOMEM;
+		}
+
+		bms->nodes = __cpu_to_le32(st->nodes);
+		break;
+	case NoUpdate:
+	default:
+		break;
+	}
 
 	init_afd(&afd, fd);
 
@@ -2185,27 +2413,37 @@ static int write_bitmap1(struct supertype *st, int fd)
 	if (posix_memalign(&buf, 4096, 4096))
 		return -ENOMEM;
 
-	memset(buf, 0xff, 4096);
-	memcpy(buf, (char *)bms, sizeof(bitmap_super_t));
-
-	towrite = __le64_to_cpu(bms->sync_size) / (__le32_to_cpu(bms->chunksize)>>9);
-	towrite = (towrite+7) >> 3; /* bits to bytes */
-	towrite += sizeof(bitmap_super_t);
-	towrite = ROUND_UP(towrite, 512);
-	while (towrite > 0) {
-		n = towrite;
-		if (n > 4096)
-			n = 4096;
-		n = awrite(&afd, buf, n);
-		if (n > 0)
-			towrite -= n;
+	do {
+		/* Only the bitmap[0] should resync
+		 * whole device on initial assembly
+		 */
+		if (i)
+			memset(buf, 0x00, 4096);
 		else
+			memset(buf, 0xff, 4096);
+		memcpy(buf, (char *)bms, sizeof(bitmap_super_t));
+
+		towrite = calc_bitmap_size(bms, 4096);
+		while (towrite > 0) {
+			n = towrite;
+			if (n > 4096)
+				n = 4096;
+			n = awrite(&afd, buf, n);
+			if (n > 0)
+				towrite -= n;
+			else
+				break;
+			if (i)
+				memset(buf, 0x00, 4096);
+			else
+				memset(buf, 0xff, 4096);
+		}
+		fsync(fd);
+		if (towrite) {
+			rv = -2;
 			break;
-		memset(buf, 0xff, 4096);
-	}
-	fsync(fd);
-	if (towrite)
-		rv = -2;
+		}
+	} while (++i < __le32_to_cpu(bms->nodes));
 
 	free(buf);
 	return rv;
@@ -2213,6 +2451,7 @@ static int write_bitmap1(struct supertype *st, int fd)
 
 static void free_super1(struct supertype *st)
 {
+
 	if (st->sb)
 		free(st->sb);
 	while (st->info) {
@@ -2370,7 +2609,7 @@ void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0
 
 	for (i = 0; i < MD_SB_DISKS; i++) {
 		int state = sb0->disks[i].state;
-		sb->dev_roles[i] = 0xFFFF;
+		sb->dev_roles[i] = MD_DISK_ROLE_SPARE;
 		if ((state & (1<<MD_DISK_SYNC)) &&
 		    !(state & (1<<MD_DISK_FAULTY)))
 			sb->dev_roles[i] = __cpu_to_le16(sb0->disks[i].raid_disk);
diff --git a/sysfs.c b/sysfs.c
index 72684702..26003432 100644
--- a/sysfs.c
+++ b/sysfs.c
@@ -74,6 +74,12 @@ int sysfs_open(char *devnm, char *devname, char *attr)
 	return fd;
 }
 
+/* Set mdi->sys_name to "dev-" + the kernel name devid2kname() gives devid. */
+void sysfs_init_dev(struct mdinfo *mdi, unsigned long devid)
+{
+	snprintf(mdi->sys_name, sizeof(mdi->sys_name), "dev-%s", devid2kname(devid));
+}
+
 void sysfs_init(struct mdinfo *mdi, int fd, char *devnm)
 {
 	mdi->sys_name[0] = 0;
@@ -224,6 +230,13 @@ struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options)
 			goto abort;
 	}
 
+	if (options & GET_ARRAY_STATE) {
+		strcpy(base, "array_state");
+		if (load_sys(fname, sra->sysfs_array_state))
+			goto abort;
+	} else
+		sra->sysfs_array_state[0] = 0;
+
 	if (! (options & GET_DEVS))
 		return sra;
 
diff --git a/systemd/mdadm-last-resort@.service b/systemd/mdadm-last-resort@.service
index 5179f194..e93d72b2 100644
--- a/systemd/mdadm-last-resort@.service
+++ b/systemd/mdadm-last-resort@.service
@@ -1,6 +1,7 @@
 [Unit]
 Description=Activate md array even though degraded
 DefaultDependencies=no
+Conflicts=sys-devices-virtual-block-%i.device
 
 [Service]
 Type=oneshot
diff --git a/systemd/mdmonitor.service b/systemd/mdmonitor.service
index 9aff2f56..c7cff3e4 100644
--- a/systemd/mdmonitor.service
+++ b/systemd/mdmonitor.service
@@ -10,4 +10,7 @@ Description=MD array monitor
 DefaultDependencies=no
 
 [Service]
-ExecStart=BINDIR/mdadm --monitor --scan
+Environment=  MDADM_MONITOR_ARGS=--scan
+EnvironmentFile=-/run/sysconfig/mdadm
+ExecStartPre=-/usr/lib/systemd/scripts/mdadm_env.sh
+ExecStart=BINDIR/mdadm --monitor $MDADM_MONITOR_ARGS
diff --git a/test b/test
index d0a6cb85..13f1bda7 100755
--- a/test
+++ b/test
@@ -246,6 +246,15 @@ check() {
        fi
       ;;
 
+    readonly )
+       grep -s "read-only" > /dev/null /proc/mdstat || {
+                echo >&2 "ERROR array is not read-only!"; cat /proc/mdstat ; exit 1; }
+      ;;
+
+    inactive )
+       grep -s "inactive" > /dev/null /proc/mdstat || {
+                echo >&2 "ERROR array is not inactive!"; cat /proc/mdstat ; exit 1; }
+      ;;
     * ) echo >&2 ERROR unknown check $1 ; exit 1;
    esac
 }
diff --git a/tests/19raid6auto-repair b/tests/19raid6auto-repair
index 7fb1c72f..ce4a7c08 100644
--- a/tests/19raid6auto-repair
+++ b/tests/19raid6auto-repair
@@ -10,32 +10,40 @@ data_offset_in_kib=$[2048/2]
 
 # make a raid5 from a file
 dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib
-mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs
-dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib
-blockdev --flushbufs $md0; sync
-check wait
-blockdev --flushbufs $devs; sync
-echo 3 > /proc/sys/vm/drop_caches
-cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; }
-
-# wipe out 5 chunks on each device
-dd if=/dev/urandom of=$dev0 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*0]
-dd if=/dev/urandom of=$dev1 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*5]
-dd if=/dev/urandom of=$dev2 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*10]
-dd if=/dev/urandom of=$dev3 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*15]
-dd if=/dev/urandom of=$dev4 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*20]
-
-blockdev --flushbufs $devs; sync
-echo 3 > /proc/sys/vm/drop_caches
-
-$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; }
-
-$dir/raid6check $md0 0 0 autorepair > /dev/null || { echo repair failed; exit 2; }
-blockdev --flushbufs $md0 $devs; sync
-echo 3 > /proc/sys/vm/drop_caches
-
-$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; }
-cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
-
-mdadm -S $md0
-udevadm settle
+
+# perform test for every layout
+layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \
+	left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \
+	right-symmetric-6 parity-first-6"
+
+for layout in $layouts
+do
+    mdadm -CR $md0 -l6 --layout=$layout -n$number_of_disks -c $chunksize_in_kib $devs
+    dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib
+    blockdev --flushbufs $md0; sync
+    check wait
+    blockdev --flushbufs $devs; sync
+    echo 3 > /proc/sys/vm/drop_caches
+    cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; }
+
+    # wipe out 5 chunks on each device
+    dd if=/dev/urandom of=$dev0 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*0]
+    dd if=/dev/urandom of=$dev1 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*5]
+    dd if=/dev/urandom of=$dev2 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*10]
+    dd if=/dev/urandom of=$dev3 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*15]
+    dd if=/dev/urandom of=$dev4 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*20]
+
+    blockdev --flushbufs $devs; sync
+    echo 3 > /proc/sys/vm/drop_caches
+
+    $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; }
+
+    $dir/raid6check $md0 0 0 autorepair > /dev/null || { echo repair failed; exit 2; }
+    blockdev --flushbufs $md0 $devs; sync
+    echo 3 > /proc/sys/vm/drop_caches
+
+    $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; }
+    cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
+
+    mdadm -S $md0
+done
diff --git a/tests/19raid6repair b/tests/19raid6repair
index 1159bd3e..26846cc9 100644
--- a/tests/19raid6repair
+++ b/tests/19raid6repair
@@ -8,40 +8,49 @@ devs="$dev1 $dev2 $dev3 $dev4"
 # default 2048 sectors
 data_offset_in_kib=$[2048/2]
 
-for failure in "$dev3 3 3 2" "$dev3 3 2 3" "$dev3 3 2 1" "$dev3 3 2 0" "$dev4 3 3 0" "$dev4 3 3 1" "$dev4 3 3 2" \
-		"$dev1 3 0 1" "$dev1 3 0 2" "$dev1 3 0 3" "$dev2 3 1 0" "$dev2 3 1 2" "$dev2 3 1 3" ; do
-	failure_split=( $failure )
-	device_with_error=${failure_split[0]}
-	stripe_with_error=${failure_split[1]}
-	repair_params="$stripe_with_error ${failure_split[2]} ${failure_split[3]}"
-	start_of_errors_in_kib=$[data_offset_in_kib+chunksize_in_kib*stripe_with_error]
-
-	# make a raid5 from a file
-	dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib
-	mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs
-	dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib
-	blockdev --flushbufs $md0; sync
-
-	check wait
-	blockdev --flushbufs $devs; sync
-	echo 3 > /proc/sys/vm/drop_caches
-	cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; }
-
-	dd if=/dev/urandom of=$device_with_error bs=1024 count=$chunksize_in_kib seek=$start_of_errors_in_kib
-	blockdev --flushbufs $device_with_error; sync
-	echo 3 > /proc/sys/vm/drop_caches
-
-	$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; }
-
-	$dir/raid6check $md0 repair $repair_params > /dev/null || { echo repair failed; exit 2; }
-	blockdev --flushbufs $md0 $devs; sync
-	echo 3 > /proc/sys/vm/drop_caches
-
-	$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; }
-	cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
-
-	mdadm -S $md0
-	udevadm settle
-	sync
-	echo 3 > /proc/sys/vm/drop_caches
+layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \
+	left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \
+	right-symmetric-6 parity-first-6"
+
+for layout in $layouts
+do
+    for failure in "$dev3 3 3 2" "$dev3 3 2 3" "$dev3 3 2 1" "$dev3 3 2 0" \
+		"$dev4 3 3 0" "$dev4 3 3 1" "$dev4 3 3 2" \
+		"$dev1 3 0 1" "$dev1 3 0 2" "$dev1 3 0 3" \
+		"$dev2 3 1 0" "$dev2 3 1 2" "$dev2 3 1 3" ; do
+	    failure_split=( $failure )
+	    device_with_error=${failure_split[0]}
+	    stripe_with_error=${failure_split[1]}
+	    repair_params="$stripe_with_error ${failure_split[2]} ${failure_split[3]}"
+	    start_of_errors_in_kib=$[data_offset_in_kib+chunksize_in_kib*stripe_with_error]
+
+	    # make a raid6 from a file
+	    dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib
+	    mdadm -CR $md0 -l6 --layout=$layout -n$number_of_disks -c $chunksize_in_kib $devs
+	    dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib
+	    blockdev --flushbufs $md0; sync
+
+	    check wait
+	    blockdev --flushbufs $devs; sync
+	    echo 3 > /proc/sys/vm/drop_caches
+	    cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; }
+
+	    dd if=/dev/urandom of=$device_with_error bs=1024 count=$chunksize_in_kib seek=$start_of_errors_in_kib
+	    blockdev --flushbufs $device_with_error; sync
+	    echo 3 > /proc/sys/vm/drop_caches
+
+	    $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; }
+
+	    $dir/raid6check $md0 repair $repair_params > /dev/null || { echo repair failed; exit 2; }
+	    blockdev --flushbufs $md0 $devs; sync
+	    echo 3 > /proc/sys/vm/drop_caches
+
+	    $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; }
+	    cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
+
+	    mdadm -S $md0
+	    udevadm settle
+	    sync
+	    echo 3 > /proc/sys/vm/drop_caches
+    done
 done
diff --git a/tests/20raid5journal b/tests/20raid5journal
new file mode 100644
index 00000000..f751aceb
--- /dev/null
+++ b/tests/20raid5journal
@@ -0,0 +1,64 @@
+# check write journal of raid456
+
+# test --detail: fail the test unless "mdadm -D $1" output mentions a journal
+test_detail_shows_journal() {
+    mdadm -D $1 | grep journal || {
+        echo >&2 "ERROR --detail does not show journal device!"; mdadm -D $1 ; exit 1; }
+}
+
+# test --examine: fail the test unless "mdadm -E $1" output mentions "Journal"
+test_examine_shows_journal() {
+    mdadm -E $1 | grep Journal || {
+        echo >&2 "ERROR --examine does not show Journal device!"; mdadm -E $1 ; exit 1; }
+}
+
+# test --create: build a raid5 with a write journal, fill and check it, then stop it
+create_with_journal_and_stop() {
+    mdadm -CR $md0 -l5 -n4 $dev0 $dev1 $dev2 $dev3 --write-journal $dev4
+    check wait
+    tar cf - /etc > $md0    # write some real data onto the array
+    ./raid6check $md0 0 0 | grep 'Error detected' && exit 1    # abort if raid6check reports an error
+    test_detail_shows_journal $md0
+    test_examine_shows_journal $dev4    # $dev4 is the journal device given above
+    mdadm -S $md0
+}
+
+# test --assemble: with the journal missing, -A must fail unless --force is given
+test_assemble() {
+    create_with_journal_and_stop
+    if mdadm -A $md0 $dev0 $dev1 $dev2 $dev3    # journal ($dev4) deliberately left out
+    then
+        echo >&2 "ERROR should return 1 when journal is missing!"; cat /proc/mdstat ; exit 1;
+    fi
+    mdadm -S $md0
+
+    mdadm -A $md0 $dev0 $dev1 $dev2 $dev3 --force
+    check readonly    # presumably the harness verifies the array came up read-only
+    mdadm -S $md0
+}
+
+# test --incremental: expect "inactive" until the journal is added, or -R is used
+test_incremental() {
+    create_with_journal_and_stop
+    for d in $dev0 $dev1 $dev2 $dev3    # add every data disk, but not the journal
+    do
+        mdadm -I $d
+    done
+    check inactive
+    mdadm -I $dev4    # now supply the journal device
+    check raid5
+    mdadm -S $md0
+
+    # test --incremental with journal missing
+    for d in $dev0 $dev1 $dev2 $dev3
+    do
+        mdadm -I $d
+    done
+    mdadm -R $md0    # start the array even though the journal was never added
+    check readonly
+    mdadm -S $md0
+}
+
+create_with_journal_and_stop    # exercise --create on its own first
+test_assemble                   # each test below re-creates the array itself
+test_incremental
diff --git a/udev-md-raid-arrays.rules b/udev-md-raid-arrays.rules
index a32b6d2d..c95ec7b1 100644
--- a/udev-md-raid-arrays.rules
+++ b/udev-md-raid-arrays.rules
@@ -17,7 +17,7 @@ TEST!="md/array_state", ENV{SYSTEMD_READY}="0", GOTO="md_end"
 ATTR{md/array_state}=="|clear|inactive", ENV{SYSTEMD_READY}="0", GOTO="md_end"
 LABEL="md_ignore_state"
 
-IMPORT{program}="BINDIR/mdadm --detail --export $tempnode"
+IMPORT{program}="BINDIR/mdadm --detail --export $devnode"
 ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace"
 ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}"
 ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}"
@@ -26,14 +26,16 @@ ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env
 ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n"
 ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n"
 
-IMPORT{program}="/sbin/blkid -o udev -p -u noraid $tempnode"
+IMPORT{builtin}="blkid"
+OPTIONS+="link_priority=100"
+OPTIONS+="watch"
 ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}"
 ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}"
 
 ENV{MD_LEVEL}=="raid[1-9]*", ENV{SYSTEMD_WANTS}+="mdmonitor.service"
 
 # Tell systemd to run mdmon for our container, if we need it.
-ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c"
+ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/usr/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c"
 ENV{MD_MON_THIS}=="?*", PROGRAM="/usr/bin/basename $env{MD_MON_THIS}", ENV{SYSTEMD_WANTS}+="mdmon@%c.service"
 
 LABEL="md_end"
diff --git a/udev-md-raid-assembly.rules b/udev-md-raid-assembly.rules
index 5bde607f..d0d440a6 100644
--- a/udev-md-raid-assembly.rules
+++ b/udev-md-raid-assembly.rules
@@ -25,12 +25,9 @@ GOTO="md_inc_end"
 
 LABEL="md_inc"
 
-# Disable incremental assembly to fix Debian bug #784070
-GOTO="md_inc_end"
-
 # remember you can limit what gets auto/incrementally assembled by
 # mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY'
-ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $tempnode --offroot ${DEVLINKS}"
+ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $devnode --offroot ${DEVLINKS}"
 ACTION=="add|change", ENV{MD_STARTED}=="*unsafe*", ENV{MD_FOREIGN}=="no", ENV{SYSTEMD_WANTS}+="mdadm-last-resort@$env{MD_DEVICE}.timer"
 ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="BINDIR/mdadm -If $name --path $env{ID_PATH}"
 ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="BINDIR/mdadm -If $name"
diff --git a/util.c b/util.c
index cc98d3ba..970d4847 100644
--- a/util.c
+++ b/util.c
@@ -24,6 +24,7 @@
 
 #include	"mdadm.h"
 #include	"md_p.h"
+#include	<sys/poll.h>
 #include	<sys/socket.h>
 #include	<sys/utsname.h>
 #include	<sys/wait.h>
@@ -34,6 +35,8 @@
 #include	<ctype.h>
 #include	<dirent.h>
 #include	<signal.h>
+#include	<dlfcn.h>
+
 
 /*
  * following taken from linux/blkpg.h because they aren't
@@ -79,6 +82,143 @@ struct blkpg_partition {
    aren't permitted). */
 #define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
 
+static int is_dlm_hooks_ready = 0;
+/* Returns 1 once set_dlm_hooks() has resolved all libdlm symbols, else 0. */
+int dlm_funs_ready(void)
+{
+	return is_dlm_hooks_ready != 0;
+}
+
+#ifndef MDASSEMBLE
+static struct dlm_hooks *dlm_hooks = NULL;	/* libdlm entry points, filled by set_dlm_hooks() */
+struct dlm_lock_resource *dlm_lock_res = NULL;	/* allocated in cluster_get_dlmlock() */
+static int ast_called = 0;	/* set by dlm_ast(), cleared by poll_for_ast() */
+
+struct dlm_lock_resource {
+	dlm_lshandle_t *ls;	/* value returned by create_lockspace(); NOTE(review): used as the handle itself */
+	struct dlm_lksb lksb;	/* status block handed to ls_lock()/ls_unlock() */
+};
+
+/* Using poll(2) to wait for and dispatch ASTs: keep dispatching on the
+ * lockspace fd until dlm_ast() sets ast_called, then clear the flag.
+ * Returns 0, or -1 if poll() fails.  poll() gets a zero timeout. */
+static int poll_for_ast(dlm_lshandle_t ls)
+{
+	struct pollfd pfd;
+
+	pfd.events = POLLIN;
+	pfd.fd = dlm_hooks->ls_get_fd(ls);
+
+	while (!ast_called) {
+		int rc = poll(&pfd, 1, 0);
+		if (rc < 0) {
+			perror("poll");
+			return -1;
+		}
+		dlm_hooks->dispatch(dlm_hooks->ls_get_fd(ls));
+	}
+	ast_called = 0;
+	return 0;
+}
+/* AST callback passed to ls_lock(); only flags completion for poll_for_ast(). */
+static void dlm_ast(void *arg)
+{
+	ast_called = 1;
+}
+
+static char *cluster_name = NULL;
+/* Create the lockspace, take bitmapXXX locks on all the bitmaps.  Stores the
+ * lock id in *lockid and returns lksb.sb_status; non-zero if a step fails. */
+int cluster_get_dlmlock(int *lockid)
+{
+	int ret = -1;
+	char str[64];
+	int flags = LKF_NOQUEUE;
+
+	ret = get_cluster_name(&cluster_name);
+	if (ret) {
+		pr_err("The md can't get cluster name\n");
+		return -1;
+	}
+
+	dlm_lock_res = xmalloc(sizeof(struct dlm_lock_resource));
+	dlm_lock_res->ls = dlm_hooks->create_lockspace(cluster_name, O_RDWR);
+	if (!dlm_lock_res->ls) {
+		pr_err("%s failed to create lockspace\n", cluster_name);
+		return -ENOMEM;
+	}
+
+	/* Conversions need the lockid in the LKSB */
+	if (flags & LKF_CONVERT)
+		dlm_lock_res->lksb.sb_lkid = *lockid;
+
+	snprintf(str, sizeof(str), "bitmap%s", cluster_name);
+	/* with LKF_CONVERT the call below returns ENOENT ("No such file or directory") */
+	ret = dlm_hooks->ls_lock(dlm_lock_res->ls, LKM_PWMODE, &dlm_lock_res->lksb,
+			  flags, str, strlen(str), 0, dlm_ast,
+			  dlm_lock_res, NULL, NULL);
+	if (ret) {
+		pr_err("error %d when get PW mode on lock %s\n", errno, str);
+		dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1);
+		return ret;
+	}
+
+	/* Wait for it to complete; if the wait fails the lksb was never filled in */
+	if (poll_for_ast(dlm_lock_res->ls) < 0)
+		return -1;
+	*lockid = dlm_lock_res->lksb.sb_lkid;
+	return dlm_lock_res->lksb.sb_status;
+}
+/* Drop lock @lockid and release the lockspace set up by cluster_get_dlmlock().
+ * Returns 0 on success and non-zero if any step fails. */
+int cluster_release_dlmlock(int lockid)
+{
+	int ret = -1;
+
+	if (!cluster_name || !dlm_lock_res)
+		return -1;
+
+	/* with LKF_CONVERT the call below returns EINVAL ("Invalid argument") */
+	ret = dlm_hooks->ls_unlock(dlm_lock_res->ls, lockid, 0,
+				     &dlm_lock_res->lksb, dlm_lock_res);
+	if (ret) {
+		pr_err("error %d happened when unlock\n", errno);
+		/* XXX make sure the lock is unlocked eventually */
+		goto out;
+	}
+
+	/* Wait for it to complete */
+	poll_for_ast(dlm_lock_res->ls);
+	errno = dlm_lock_res->lksb.sb_status;
+	if (errno != EUNLOCK) {
+		pr_err("error %d happened in ast when unlock lockspace\n", errno);
+		/* XXX make sure the lockspace is unlocked eventually */
+		ret = -1;	/* ls_unlock() left ret at 0: don't report success */
+		goto out;
+	}
+
+	ret = dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1);
+	if (ret) {
+		pr_err("error %d happened when release lockspace\n", errno);
+		/* XXX make sure the lockspace is released eventually */
+		goto out;
+	}
+	free(dlm_lock_res);
+	dlm_lock_res = NULL;	/* the global must not dangle after the free */
+out:
+	return ret;
+}
+#else
+int cluster_get_dlmlock(int *lockid)
+{
+	return -1;	/* MDASSEMBLE build: the DLM code above is compiled out */
+}
+int cluster_release_dlmlock(int lockid)
+{
+	return -1;	/* MDASSEMBLE build: always reports failure */
+}
+#endif
+
 /*
  * Parse a 128 bit uuid in 4 integers
  * format is 32 hexx nibbles with options :.<space> separator
@@ -271,6 +411,16 @@ long parse_num(char *num)
 }
 #endif
 
+int parse_cluster_confirm_arg(char *input, char **devname, int *slot)
+{
+	char *rest;	/* first character strtoul() did not consume */
+	*slot = strtoul(input, &rest, 10);
+	if (rest == input || *rest != ':')
+		return -1;	/* not of the form "<slot>:<devname>" */
+	*devname = rest + 1;	/* points into input; nothing is copied */
+	return 0;
+}
+
 void remove_partitions(int fd)
 {
 	/* remove partitions from this block devices.
@@ -1976,3 +2126,80 @@ void reopen_mddev(int mdfd)
 	if (fd >= 0 && fd != mdfd)
 		dup2(fd, mdfd);
 }
+
+#ifndef MDASSEMBLE
+static struct cmap_hooks *cmap_hooks = NULL;
+static int is_cmap_hooks_ready = 0;
+/* dlopen libcmap.so.4 and resolve its calls; sets is_cmap_hooks_ready on success. */
+void set_cmap_hooks(void)
+{
+	struct cmap_hooks *h = xmalloc(sizeof(*h));
+
+	cmap_hooks = h;
+	h->cmap_handle = dlopen("libcmap.so.4", RTLD_NOW | RTLD_LOCAL);
+	if (!h->cmap_handle)
+		return;
+
+	h->initialize = dlsym(h->cmap_handle, "cmap_initialize");
+	h->get_string = dlsym(h->cmap_handle, "cmap_get_string");
+	h->finalize = dlsym(h->cmap_handle, "cmap_finalize");
+	if (h->initialize && h->get_string && h->finalize)
+		is_cmap_hooks_ready = 1;
+	else
+		dlclose(h->cmap_handle);
+}
+/* Look up the "totem.cluster_name" cmap key via the libcmap hooks.  Returns 0
+ * with the string from get_string() in *cluster_name, or non-zero when the
+ * hooks are not loaded or a cmap call fails. */
+int get_cluster_name(char **cluster_name)
+{
+	int rv = -1;
+	cmap_handle_t handle;
+
+	if (!is_cmap_hooks_ready)
+		return rv;
+
+	rv = cmap_hooks->initialize(&handle);
+	if (rv != CS_OK)
+		goto out;
+
+	rv = cmap_hooks->get_string(handle, "totem.cluster_name", cluster_name);
+	if (rv != CS_OK) {
+		/* nothing of ours to free on failure; just drop any stale pointer */
+		*cluster_name = NULL;
+		rv = -1;
+	} else
+		rv = 0;
+	cmap_hooks->finalize(handle);
+out:
+	return rv;
+}
+/* dlopen libdlm_lt.so.3 and resolve its calls; sets is_dlm_hooks_ready on success. */
+void set_dlm_hooks(void)
+{
+	struct dlm_hooks *h = xmalloc(sizeof(*h));
+
+	dlm_hooks = h;
+	h->dlm_handle = dlopen("libdlm_lt.so.3", RTLD_NOW | RTLD_LOCAL);
+	if (!h->dlm_handle)
+		return;
+
+	h->create_lockspace = dlsym(h->dlm_handle, "dlm_create_lockspace");
+	h->release_lockspace = dlsym(h->dlm_handle, "dlm_release_lockspace");
+	h->ls_lock = dlsym(h->dlm_handle, "dlm_ls_lock");
+	h->ls_unlock = dlsym(h->dlm_handle, "dlm_ls_unlock");
+	h->ls_get_fd = dlsym(h->dlm_handle, "dlm_ls_get_fd");
+	h->dispatch = dlsym(h->dlm_handle, "dlm_dispatch");
+	if (h->create_lockspace && h->release_lockspace && h->ls_lock &&
+	    h->ls_unlock && h->ls_get_fd && h->dispatch)
+		is_dlm_hooks_ready = 1;
+	else
+		dlclose(h->dlm_handle);
+}
+/* Try to load both cluster helper libraries: libdlm first, then libcmap. */
+void set_hooks(void)
+{
+	set_dlm_hooks();
+	set_cmap_hooks();
+}
+#endif