block monitor: freeze spare assignment for external arrays

In order to support reshape and atomic removal of spares from containers we need to prevent mdmon from activating spares. In the reshape case we additionally need to freeze sync_action while the reshape transaction is initiated with the kernel and recorded in the metadata. When reshaping a raid0 array we need to freeze the array *before* it is transitioned to a redundant raid level. Since sync_action does not exist at this point we extend the '-' prefix of a subarray string to flag mdmon not to activate spares. Mdadm needs to be reasonably certain that the version of mdmon in the system honors this 'freeze' indication. If mdmon is not already active then we assume the version that gets started is the same as the mdadm version. Otherwise, we check the version of mdmon as returned by the extended ping_monitor() operation. This is to catch cases where mdadm is upgraded in the filesystem, but mdmon started in the initramfs is from a previous release. Signed-off-by: Dan Williams <dan.j.williams@intel.com> Signed-off-by: NeilBrown <neilb@suse.de>
author: Dan Williams <dan.j.williams@intel.com> 2010-11-23 15:00:54 +1100
committer: NeilBrown <neilb@suse.de> 2010-11-23 15:00:54 +1100
commit: bc77ed535db05fc4b5b46f20dc4c27893f01610d (patch)
tree: 92a86d4a598897adc3815935120a39ab1d26bc1b /msg.c
parent: e5408a3202c09267f780d53334c369793534a7e8 (diff)
1 files changed, 193 insertions, 2 deletions
diff --git a/msg.c b/msg.c
index aabfa8f5..bfeadb34 100644
--- a/msg.c
+++ b/msg.c
@@ -135,7 +135,15 @@ int ack(int fd, int tmo)
 int wait_reply(int fd, int tmo)
 {
 	struct metadata_update msg;
-	return receive_message(fd, &msg, tmo);
+	int err = receive_message(fd, &msg, tmo);
+
+	/* mdmon sent extra data, but caller only cares that we got a
+	 * successful reply
+	 */
+	if (err == 0 && msg.len > 0)
+		free(msg.buf);
+
+	return err;
 }
 
 int connect_monitor(char *devname)
@@ -195,7 +203,6 @@ int fping_monitor(int sfd)
 	return err;
 }
 
-
 /* give the monitor a chance to update the metadata */
 int ping_monitor(char *devname)
 {
@@ -206,6 +213,190 @@ int ping_monitor(char *devname)
 	return err;
 }
 
+static char *ping_monitor_version(char *devname)
+{
+	int sfd = connect_monitor(devname);
+	struct metadata_update msg;
+	int err = 0;
+
+	if (sfd < 0)
+		return NULL;
+
+	if (ack(sfd, 20) != 0)
+		err = -1;
+
+	if (!err && receive_message(sfd, &msg, 20) != 0)
+		err = -1;
+
+	close(sfd);
+
+	if (err || !msg.len || !msg.buf)
+		return NULL;
+	return msg.buf;
+}
+
+static int unblock_subarray(struct mdinfo *sra, const int unfreeze)
+{
+	char buf[64];
+	int rc = 0;
+
+	if (sra) {
+		sprintf(buf, "external:%s\n", sra->text_version);
+		buf[9] = '/';
+	} else
+		buf[9] = '-';
+
+	if (buf[9] == '-' ||
+	    sysfs_set_str(sra, NULL, "metadata_version", buf) ||
+	    (unfreeze &&
+	     sysfs_attribute_available(sra, NULL, "sync_action") &&
+	     sysfs_set_str(sra, NULL, "sync_action", "idle")))
+		rc = -1;
+	return rc;
+}
+
+/**
+ * block_monitor - prevent mdmon spare assignment
+ * @container - container to block
+ * @freeze - flag to additionally freeze sync_action
+ *
+ * This is used by the reshape code to freeze the container, and the
+ * auto-rebuild implementation to atomically move spares.  For reshape
+ * we need to freeze sync_action in the auto-rebuild we only need to
+ * block new spare assignment, existing rebuilds can continue
+ */
+int block_monitor(char *container, const int freeze)
+{
+	int devnum = devname2devnum(container);
+	struct mdstat_ent *ent, *e, *e2;
+	struct mdinfo *sra = NULL;
+	char *version = NULL;
+	char buf[64];
+	int rv = 0;
+
+	if (!mdmon_running(devnum)) {
+		/* if mdmon is not active we assume that any instance that is
+		 * later started will match the current mdadm version, if this
+		 * assumption is violated we may inadvertantly rebuild an array
+		 * that was meant for reshape, or start rebuild on a spare that
+		 * was to be moved to another container
+		 */
+		/* pass */;
+	} else {
+		int ver;
+
+		version = ping_monitor_version(container);
+		ver = version ? mdadm_version(version) : -1;
+		free(version);
+		if (ver < 3002000) {
+			fprintf(stderr, Name
+				": mdmon instance for %s cannot be disabled\n",
+				container);
+			return -1;
+		}
+	}
+
+	ent = mdstat_read(0, 0);
+	if (!ent) {
+		fprintf(stderr, Name
+			": failed to read /proc/mdstat while disabling mdmon\n");
+		return -1;
+	}
+
+	/* freeze container contents */
+	for (e = ent; e; e = e->next) {
+		if (!is_container_member(e, container))
+			continue;
+		sysfs_free(sra);
+		sra = sysfs_read(-1, e->devnum, GET_VERSION);
+		if (!sra) {
+			fprintf(stderr, Name
+				": failed to read sysfs for subarray%s\n",
+				to_subarray(e, container));
+			break;
+		}
+		/* can't reshape an array that we can't monitor */
+		if (sra->text_version[0] == '-')
+			break;
+
+		if (freeze && sysfs_freeze_array(sra) < 1)
+			break;
+		/* flag this array to not be modified by mdmon (close race with
+		 * takeover in reshape case and spare reassignment in the
+		 * auto-rebuild case)
+		 */
+		sprintf(buf, "external:%s\n", sra->text_version);
+		buf[9] = '-';
+		if (sysfs_set_str(sra, NULL, "metadata_version", buf))
+			break;
+		ping_monitor(container);
+
+		/* check that we did not race with recovery */
+		if ((freeze &&
+		     !sysfs_attribute_available(sra, NULL, "sync_action")) ||
+		    (freeze &&
+		     sysfs_attribute_available(sra, NULL, "sync_action") &&
+		     sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 &&
+		     strcmp(buf, "frozen\n") == 0))
+			/* pass */;
+		else
+			break;
+	}
+
+	if (e) {
+		fprintf(stderr, Name ": failed to freeze subarray%s\n",
+			to_subarray(e, container));
+
+		/* thaw the partially frozen container */
+		for (e2 = ent; e2 && e2 != e; e2 = e2->next) {
+			if (!is_container_member(e2, container))
+				continue;
+			sysfs_free(sra);
+			sra = sysfs_read(-1, e2->devnum, GET_VERSION);
+			if (unblock_subarray(sra, freeze))
+				fprintf(stderr, Name ": Failed to unfreeze %s\n", e2->dev);
+		}
+
+		ping_monitor(container); /* cleared frozen */
+		rv = -1;
+	}
+
+	sysfs_free(sra);
+	free_mdstat(ent);
+	free(container);
+
+	return rv;
+}
+
+void unblock_monitor(char *container, const int unfreeze)
+{
+	struct mdstat_ent *ent, *e;
+	struct mdinfo *sra = NULL;
+
+	ent = mdstat_read(0, 0);
+	if (!ent) {
+		fprintf(stderr, Name
+			": failed to read /proc/mdstat while unblocking container\n");
+		return;
+	}
+
+	/* unfreeze container contents */
+	for (e = ent; e; e = e->next) {
+		if (!is_container_member(e, container))
+			continue;
+		sysfs_free(sra);
+		sra = sysfs_read(-1, e->devnum, GET_VERSION);
+		if (unblock_subarray(sra, unfreeze))
+			fprintf(stderr, Name ": Failed to unfreeze %s\n", e->dev);
+	}
+	ping_monitor(container);
+
+	sysfs_free(sra);
+	free_mdstat(ent);
+}
+
+
+
 /* give the manager a chance to view the updated container state.  This
  * would naturally happen due to the manager noticing a change in
  * /proc/mdstat; however, pinging encourages this detection to happen
author	Dan Williams <dan.j.williams@intel.com>	2010-11-23 15:00:54 +1100
committer	NeilBrown <neilb@suse.de>	2010-11-23 15:00:54 +1100
commit	bc77ed535db05fc4b5b46f20dc4c27893f01610d (patch)
tree	92a86d4a598897adc3815935120a39ab1d26bc1b /msg.c
parent	e5408a3202c09267f780d53334c369793534a7e8 (diff)